// SPDX-License-Identifier: GPL-2.0-only
/*
 * Common code for Intel Running Average Power Limit (RAPL) support.
 * Copyright (c) 2019, Intel Corporation.
 */
#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt

#include <linux/bitmap.h>
#include <linux/cleanup.h>
#include <linux/cpu.h>
#include <linux/delay.h>
#include <linux/device.h>
#include <linux/intel_rapl.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/log2.h>
#include <linux/module.h>
#include <linux/nospec.h>
#include <linux/perf_event.h>
#include <linux/platform_device.h>
#include <linux/powercap.h>
#include <linux/processor.h>
#include <linux/slab.h>
#include <linux/suspend.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/units.h>

#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>
#include <asm/msr.h>

#define ENERGY_STATUS_MASK	GENMASK(31, 0)

#define POWER_UNIT_OFFSET	0x00
#define POWER_UNIT_MASK		GENMASK(3, 0)

#define ENERGY_UNIT_OFFSET	0x08
#define ENERGY_UNIT_MASK	GENMASK(12, 8)

#define TIME_UNIT_OFFSET	0x10
#define TIME_UNIT_MASK		GENMASK(19, 16)

/* Non HW constants */
#define RAPL_PRIMITIVE_DUMMY	BIT(2)

#define ENERGY_UNIT_SCALE	1000 /* scale from driver unit to powercap unit */

/* per domain data, some are optional */
#define NR_RAW_PRIMITIVES	(NR_RAPL_PRIMITIVES - 2)

#define PACKAGE_PLN_INT_SAVED	BIT(0)

#define RAPL_EVENT_MASK		GENMASK(7, 0)

static const char *pl_names[NR_POWER_LIMITS] = {
	[POWER_LIMIT1] = "long_term",
	[POWER_LIMIT2] = "short_term",
	[POWER_LIMIT4] = "peak_power",
};

enum pl_prims {
	PL_ENABLE,
	PL_CLAMP,
	PL_LIMIT,
	PL_TIME_WINDOW,
	PL_MAX_POWER,
	PL_LOCK,
};

static bool is_pl_valid(struct rapl_domain *rd, int pl)
{
	if (pl < POWER_LIMIT1 || pl > POWER_LIMIT4)
		return false;
	return rd->rpl[pl].name ? true : false;
}

static int get_pl_lock_prim(struct rapl_domain *rd, int pl)
{
	if (rd->rp->priv->type == RAPL_IF_TPMI) {
		if (pl == POWER_LIMIT1)
			return PL1_LOCK;
		if (pl == POWER_LIMIT2)
			return PL2_LOCK;
		if (pl == POWER_LIMIT4)
			return PL4_LOCK;
	}

	/* MSR/MMIO Interface doesn't have Lock bit for PL4 */
	if (pl == POWER_LIMIT4)
		return -EINVAL;

	/*
	 * Power Limit register that supports two power limits has a different
	 * bit position for the Lock bit.
	 */
	if (rd->rp->priv->limits[rd->id] & BIT(POWER_LIMIT2))
		return FW_HIGH_LOCK;
	return FW_LOCK;
}

static int get_pl_prim(struct rapl_domain *rd, int pl, enum pl_prims prim)
{
	switch (pl) {
	case POWER_LIMIT1:
		if (prim == PL_ENABLE)
			return PL1_ENABLE;
		if (prim == PL_CLAMP && rd->rp->priv->type != RAPL_IF_TPMI)
			return PL1_CLAMP;
		if (prim == PL_LIMIT)
			return POWER_LIMIT1;
		if (prim == PL_TIME_WINDOW)
			return TIME_WINDOW1;
		if (prim == PL_MAX_POWER)
			return THERMAL_SPEC_POWER;
		if (prim == PL_LOCK)
			return get_pl_lock_prim(rd, pl);
		return -EINVAL;
	case POWER_LIMIT2:
		if (prim == PL_ENABLE)
			return PL2_ENABLE;
		if (prim == PL_CLAMP && rd->rp->priv->type != RAPL_IF_TPMI)
			return PL2_CLAMP;
		if (prim == PL_LIMIT)
			return POWER_LIMIT2;
		if (prim == PL_TIME_WINDOW)
			return TIME_WINDOW2;
		if (prim == PL_MAX_POWER)
			return MAX_POWER;
		if (prim == PL_LOCK)
			return get_pl_lock_prim(rd, pl);
		return -EINVAL;
	case POWER_LIMIT4:
		if (prim == PL_LIMIT)
			return POWER_LIMIT4;
		if (prim == PL_ENABLE)
			return PL4_ENABLE;
		/* PL4 would be around two times PL2, use same prim as PL2. */
		if (prim == PL_MAX_POWER)
			return MAX_POWER;
		if (prim == PL_LOCK)
			return get_pl_lock_prim(rd, pl);
		return -EINVAL;
	default:
		return -EINVAL;
	}
}

#define power_zone_to_rapl_domain(_zone) \
	container_of(_zone, struct rapl_domain, power_zone)

static const struct rapl_defaults *get_defaults(struct rapl_package *rp)
{
	return rp->priv->defaults;
}

static void rapl_init_domains(struct rapl_package *rp);
static int rapl_read_data_raw(struct rapl_domain *rd,
			      enum rapl_primitives prim,
			      bool xlate, u64 *data,
			      bool pmu_ctx);
static int rapl_write_data_raw(struct rapl_domain *rd,
			       enum rapl_primitives prim,
			       unsigned long long value);
static int rapl_read_pl_data(struct rapl_domain *rd, int pl,
			     enum pl_prims pl_prim,
			     bool xlate, u64 *data);
static int rapl_write_pl_data(struct rapl_domain *rd, int pl,
			      enum pl_prims pl_prim,
			      unsigned long long value);
static u64 rapl_unit_xlate(struct rapl_domain *rd,
			   enum unit_type type, u64 value, int to_raw);
static void package_power_limit_irq_save(struct rapl_package *rp);

static LIST_HEAD(rapl_packages);	/* guarded by CPU hotplug lock */

static const char *const rapl_domain_names[] = {
	"package",
	"core",
	"uncore",
	"dram",
	"psys",
};

static int get_energy_counter(struct powercap_zone *power_zone,
			      u64 *energy_raw)
{
	struct rapl_domain *rd;
	u64 energy_now;

	/* prevent CPU hotplug, make sure the RAPL domain does not go
	 * away while reading the counter.
	 */
	cpus_read_lock();
	rd = power_zone_to_rapl_domain(power_zone);

	if (!rapl_read_data_raw(rd, ENERGY_COUNTER, true, &energy_now, false)) {
		*energy_raw = energy_now;
		cpus_read_unlock();

		return 0;
	}
	cpus_read_unlock();

	return -EIO;
}

static int get_max_energy_counter(struct powercap_zone *pcd_dev, u64 *energy)
{
	struct rapl_domain *rd = power_zone_to_rapl_domain(pcd_dev);

	*energy = rapl_unit_xlate(rd, ENERGY_UNIT, ENERGY_STATUS_MASK, 0);
	return 0;
}

static int release_zone(struct powercap_zone *power_zone)
{
	struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
	struct rapl_package *rp = rd->rp;

	/* package zone is the last zone of a package, we can free
	 * memory here since all children have been unregistered.
	 */
	if (rd->id == RAPL_DOMAIN_PACKAGE) {
		kfree(rd);
		rp->domains = NULL;
	}

	return 0;

}

static int find_nr_power_limit(struct rapl_domain *rd)
{
	int i, nr_pl = 0;

	for (i = 0; i < NR_POWER_LIMITS; i++) {
		if (is_pl_valid(rd, i))
			nr_pl++;
	}

	return nr_pl;
}

static int set_domain_enable(struct powercap_zone *power_zone, bool mode)
{
	struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
	const struct rapl_defaults *defaults = get_defaults(rd->rp);
	u64 val;
	int ret;

	cpus_read_lock();
	ret = rapl_write_pl_data(rd, POWER_LIMIT1, PL_ENABLE, mode);
	if (ret)
		goto end;

	ret = rapl_read_pl_data(rd, POWER_LIMIT1, PL_ENABLE, false, &val);
	if (ret)
		goto end;

	if (mode != val) {
		pr_debug("%s cannot be %s\n", power_zone->name,
			 str_enabled_disabled(mode));
		goto end;
	}

	if (defaults->set_floor_freq)
		defaults->set_floor_freq(rd, mode);

end:
	cpus_read_unlock();

	return ret;
}

static int get_domain_enable(struct powercap_zone *power_zone, bool *mode)
{
	struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
	u64 val;
	int ret;

	if (rd->rpl[POWER_LIMIT1].locked) {
		*mode = false;
		return 0;
	}
	cpus_read_lock();
	ret = rapl_read_pl_data(rd, POWER_LIMIT1, PL_ENABLE, true, &val);
	if (!ret)
		*mode = val;
	cpus_read_unlock();

	return ret;
}

/* per RAPL domain ops, in the order of rapl_domain_type */
static const struct powercap_zone_ops zone_ops[] = {
	/* RAPL_DOMAIN_PACKAGE */
	{
		.get_energy_uj = get_energy_counter,
		.get_max_energy_range_uj = get_max_energy_counter,
		.release = release_zone,
		.set_enable = set_domain_enable,
		.get_enable = get_domain_enable,
	},
	/* RAPL_DOMAIN_PP0 */
	{
		.get_energy_uj = get_energy_counter,
		.get_max_energy_range_uj = get_max_energy_counter,
		.release = release_zone,
		.set_enable = set_domain_enable,
		.get_enable = get_domain_enable,
	},
	/* RAPL_DOMAIN_PP1 */
	{
		.get_energy_uj = get_energy_counter,
		.get_max_energy_range_uj = get_max_energy_counter,
		.release = release_zone,
		.set_enable = set_domain_enable,
		.get_enable = get_domain_enable,
	},
	/* RAPL_DOMAIN_DRAM */
	{
		.get_energy_uj = get_energy_counter,
		.get_max_energy_range_uj = get_max_energy_counter,
		.release = release_zone,
		.set_enable = set_domain_enable,
		.get_enable = get_domain_enable,
	},
	/* RAPL_DOMAIN_PLATFORM */
	{
		.get_energy_uj = get_energy_counter,
		.get_max_energy_range_uj = get_max_energy_counter,
		.release = release_zone,
		.set_enable = set_domain_enable,
		.get_enable = get_domain_enable,
	},
};

/*
 * Constraint index used by powercap can be different from power limit (PL)
 * index in that some PLs may be missing due to non-existent MSRs. So we
 * need to convert here by finding the valid PLs only (name populated).
 */
static int contraint_to_pl(struct rapl_domain *rd, int cid)
{
	int i, j;

	for (i = POWER_LIMIT1, j = 0; i < NR_POWER_LIMITS; i++) {
		if (is_pl_valid(rd, i) && j++ == cid) {
			pr_debug("%s: index %d\n", __func__, i);
			return i;
		}
	}
	pr_err("Cannot find matching power limit for constraint %d\n", cid);

	return -EINVAL;
}

static int set_power_limit(struct powercap_zone *power_zone, int cid,
			   u64 power_limit)
{
	struct rapl_domain *rd;
	struct rapl_package *rp;
	int ret = 0;
	int id;

	cpus_read_lock();
	rd = power_zone_to_rapl_domain(power_zone);
	id = contraint_to_pl(rd, cid);
	rp = rd->rp;

	ret = rapl_write_pl_data(rd, id, PL_LIMIT, power_limit);
	if (!ret)
		package_power_limit_irq_save(rp);
	cpus_read_unlock();
	return ret;
}

static int get_current_power_limit(struct powercap_zone *power_zone, int cid,
				   u64 *data)
{
	struct rapl_domain *rd;
	u64 val;
	int ret = 0;
	int id;

	cpus_read_lock();
	rd = power_zone_to_rapl_domain(power_zone);
	id = contraint_to_pl(rd, cid);

	ret = rapl_read_pl_data(rd, id, PL_LIMIT, true, &val);
	if (!ret)
		*data = val;

	cpus_read_unlock();

	return ret;
}

static int set_time_window(struct powercap_zone *power_zone, int cid,
			   u64 window)
{
	struct rapl_domain *rd;
	int ret = 0;
	int id;

	cpus_read_lock();
	rd = power_zone_to_rapl_domain(power_zone);
	id = contraint_to_pl(rd, cid);

	ret = rapl_write_pl_data(rd, id, PL_TIME_WINDOW, window);

	cpus_read_unlock();
	return ret;
}

static int get_time_window(struct powercap_zone *power_zone, int cid,
			   u64 *data)
{
	struct rapl_domain *rd;
	u64 val;
	int ret = 0;
	int id;

	cpus_read_lock();
	rd = power_zone_to_rapl_domain(power_zone);
	id = contraint_to_pl(rd, cid);

	ret = rapl_read_pl_data(rd, id, PL_TIME_WINDOW, true, &val);
	if (!ret)
		*data = val;

	cpus_read_unlock();

	return ret;
}

static const char *get_constraint_name(struct powercap_zone *power_zone,
				       int cid)
{
	struct rapl_domain *rd;
	int id;

	rd = power_zone_to_rapl_domain(power_zone);
	id = contraint_to_pl(rd, cid);
	if (id >= 0)
		return rd->rpl[id].name;

	return NULL;
}
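
/*
 * Illustrative note, not derived from this file: the constraint callbacks
 * above are typically exercised through the powercap sysfs interface, e.g.
 * writing to a zone's constraint_0_power_limit_uw attribute under
 * /sys/class/powercap/ ends up in set_power_limit() with cid == 0, which
 * contraint_to_pl() maps to the first valid power limit (usually PL1).
 * The exact sysfs paths depend on the control type registered by the
 * interface driver.
 */
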
static int get_max_power(struct powercap_zone *power_zone, int cid, u64 *data)
{
	struct rapl_domain *rd;
	u64 val;
	int ret = 0;
	int id;

	cpus_read_lock();
	rd = power_zone_to_rapl_domain(power_zone);
	id = contraint_to_pl(rd, cid);

	ret = rapl_read_pl_data(rd, id, PL_MAX_POWER, true, &val);
	if (!ret)
		*data = val;

	/* As a generalization rule, PL4 would be around two times PL2. */
	if (id == POWER_LIMIT4)
		*data = *data * 2;

	cpus_read_unlock();

	return ret;
}

static const struct powercap_zone_constraint_ops constraint_ops = {
	.set_power_limit_uw = set_power_limit,
	.get_power_limit_uw = get_current_power_limit,
	.set_time_window_us = set_time_window,
	.get_time_window_us = get_time_window,
	.get_max_power_uw = get_max_power,
	.get_name = get_constraint_name,
};

/* Return the id used for read_raw/write_raw callback */
static int get_rid(struct rapl_package *rp)
{
	return rp->lead_cpu >= 0 ? rp->lead_cpu : rp->id;
}

/* called after domain detection and package level data are set */
static void rapl_init_domains(struct rapl_package *rp)
{
	enum rapl_domain_type i;
	enum rapl_domain_reg_id j;
	struct rapl_domain *rd = rp->domains;

	for (i = 0; i < RAPL_DOMAIN_MAX; i++) {
		unsigned int mask = rp->domain_map & (1 << i);
		int t;

		if (!mask)
			continue;

		rd->rp = rp;

		if (i == RAPL_DOMAIN_PLATFORM && rp->id > 0) {
			snprintf(rd->name, RAPL_DOMAIN_NAME_LENGTH, "psys-%d",
				 rp->lead_cpu >= 0 ? topology_physical_package_id(rp->lead_cpu) :
				 rp->id);
		} else {
			snprintf(rd->name, RAPL_DOMAIN_NAME_LENGTH, "%s",
				 rapl_domain_names[i]);
		}

		rd->id = i;

		/* PL1 is supported by default */
		rp->priv->limits[i] |= BIT(POWER_LIMIT1);

		for (t = POWER_LIMIT1; t < NR_POWER_LIMITS; t++) {
			if (rp->priv->limits[i] & BIT(t))
				rd->rpl[t].name = pl_names[t];
		}

		for (j = 0; j < RAPL_DOMAIN_REG_MAX; j++)
			rd->regs[j] = rp->priv->regs[i][j];

		rd++;
	}
}

static u64 rapl_unit_xlate(struct rapl_domain *rd, enum unit_type type,
			   u64 value, int to_raw)
{
	u64 units = 1;
	const struct rapl_defaults *defaults = get_defaults(rd->rp);
	u64 scale = 1;

	switch (type) {
	case POWER_UNIT:
		units = rd->power_unit;
		break;
	case ENERGY_UNIT:
		scale = ENERGY_UNIT_SCALE;
		units = rd->energy_unit;
		break;
	case TIME_UNIT:
		return defaults->compute_time_window(rd, value, to_raw);
	case ARBITRARY_UNIT:
	default:
		return value;
	}

	if (to_raw)
		return div64_u64(value, units) * scale;

	value *= units;

	return div64_u64(value, scale);
}

static struct rapl_primitive_info *get_rpi(struct rapl_package *rp, int prim)
{
	struct rapl_primitive_info *rpi = rp->priv->rpi;

	if (prim < 0 || prim >= NR_RAPL_PRIMITIVES || !rpi)
		return NULL;

	return &rpi[prim];
}

static int rapl_config(struct rapl_package *rp)
{
	/* defaults_msr can be NULL on unsupported platforms */
	if (!rp->priv->defaults || !rp->priv->rpi)
		return -ENODEV;

	return 0;
}

static enum rapl_primitives
prim_fixups(struct rapl_domain *rd, enum rapl_primitives prim)
{
	const struct rapl_defaults *defaults = get_defaults(rd->rp);

	if (!defaults->spr_psys_bits)
		return prim;

	if (rd->id != RAPL_DOMAIN_PLATFORM)
		return prim;

	switch (prim) {
	case POWER_LIMIT1:
		return PSYS_POWER_LIMIT1;
	case POWER_LIMIT2:
		return PSYS_POWER_LIMIT2;
	case PL1_ENABLE:
		return PSYS_PL1_ENABLE;
	case PL2_ENABLE:
		return PSYS_PL2_ENABLE;
	case TIME_WINDOW1:
		return PSYS_TIME_WINDOW1;
	case TIME_WINDOW2:
		return PSYS_TIME_WINDOW2;
	default:
		return prim;
	}
}

/* Read primitive data based on its related struct rapl_primitive_info.
 * If the xlate flag is set, return translated data based on data units, i.e.
 * time, energy, and power.
 * RAPL MSRs are non-architectural and are not laid out consistently across
 * domains. Here we use primitive info to allow writing consolidated access
 * functions.
 * For a given primitive, it is processed by MSR mask and shift. Unit conversion
 * is pre-assigned based on RAPL unit MSRs read at init time.
 * 63-------------------------- 31--------------------------- 0
 * |                           xxxxx (mask)                   |
 * |                                |<- shift ----------------|
 * 63-------------------------- 31--------------------------- 0
 */
static int rapl_read_data_raw(struct rapl_domain *rd,
			      enum rapl_primitives prim, bool xlate, u64 *data,
			      bool pmu_ctx)
{
	u64 value;
	enum rapl_primitives prim_fixed = prim_fixups(rd, prim);
	struct rapl_primitive_info *rpi = get_rpi(rd->rp, prim_fixed);
	struct reg_action ra;

	if (!rpi || !rpi->name || rpi->flag & RAPL_PRIMITIVE_DUMMY)
		return -EINVAL;

	ra.reg = rd->regs[rpi->id];
	if (!ra.reg.val)
		return -EINVAL;

	ra.mask = rpi->mask;

	if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra, pmu_ctx)) {
		pr_debug("failed to read reg 0x%llx for %s:%s\n", ra.reg.val, rd->rp->name, rd->name);
		return -EIO;
	}

	value = ra.value >> rpi->shift;

	if (xlate)
		*data = rapl_unit_xlate(rd, rpi->unit, value, 0);
	else
		*data = value;

	return 0;
}

/* Similar use of primitive info in the read counterpart */
static int rapl_write_data_raw(struct rapl_domain *rd,
			       enum rapl_primitives prim,
			       unsigned long long value)
{
	enum rapl_primitives prim_fixed = prim_fixups(rd, prim);
	struct rapl_primitive_info *rpi = get_rpi(rd->rp, prim_fixed);
	u64 bits;
	struct reg_action ra;
	int ret;

	if (!rpi || !rpi->name || rpi->flag & RAPL_PRIMITIVE_DUMMY)
		return -EINVAL;

	bits = rapl_unit_xlate(rd, rpi->unit, value, 1);
	bits <<= rpi->shift;
	bits &= rpi->mask;

	memset(&ra, 0, sizeof(ra));

	ra.reg = rd->regs[rpi->id];
	ra.mask = rpi->mask;
	ra.value = bits;

	ret = rd->rp->priv->write_raw(get_rid(rd->rp), &ra);

	return ret;
}

static int rapl_read_pl_data(struct rapl_domain *rd, int pl,
			     enum pl_prims pl_prim, bool xlate, u64 *data)
{
	enum rapl_primitives prim = get_pl_prim(rd, pl, pl_prim);

	if (!is_pl_valid(rd, pl))
		return -EINVAL;

	return rapl_read_data_raw(rd, prim, xlate, data, false);
}

static int rapl_write_pl_data(struct rapl_domain *rd, int pl,
			      enum pl_prims pl_prim,
			      unsigned long long value)
{
	enum rapl_primitives prim = get_pl_prim(rd, pl, pl_prim);

	if (!is_pl_valid(rd, pl))
		return -EINVAL;

	if (rd->rpl[pl].locked) {
		pr_debug("%s:%s:%s locked by BIOS\n", rd->rp->name, rd->name, pl_names[pl]);
		return -EACCES;
	}

	return rapl_write_data_raw(rd, prim, value);
}
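
/*
 * Worked example (illustrative; the mask and shift values are assumptions,
 * not taken from this file): for a power-limit primitive with mask
 * GENMASK(14, 0) and shift 0, rapl_read_data_raw() masks the register value,
 * shifts it down, and rapl_unit_xlate() then multiplies by rd->power_unit to
 * report microwatts. Writes follow the same primitive info in reverse: the
 * value is converted to raw units, shifted up and masked before being handed
 * to the interface's write_raw() callback.
 */
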
/*
 * Raw RAPL data stored in MSRs are in certain scales. We need to
 * convert them into standard units based on the units reported in
 * the RAPL unit MSRs. This is specific to CPUs as the method to
 * calculate units differs on different CPUs.
 * We convert the units to the following format based on the CPU.
 * i.e.
 * energy unit: picoJoules  : Represented in picoJoules by default
 * power unit : microWatts  : Represented in milliWatts by default
 * time unit  : microseconds: Represented in seconds by default
 */
int rapl_default_check_unit(struct rapl_domain *rd)
{
	struct reg_action ra;
	u32 value;

	ra.reg = rd->regs[RAPL_DOMAIN_REG_UNIT];
	ra.mask = ~0;
	if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra, false)) {
		pr_err("Failed to read power unit REG 0x%llx on %s:%s, exit.\n",
		       ra.reg.val, rd->rp->name, rd->name);
		return -ENODEV;
	}

	value = (ra.value & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET;
	rd->energy_unit = (ENERGY_UNIT_SCALE * MICROJOULE_PER_JOULE) >> value;

	value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET;
	rd->power_unit = MICROWATT_PER_WATT >> value;

	value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET;
	rd->time_unit = USEC_PER_SEC >> value;

	pr_debug("Core CPU %s:%s energy=%dpJ, time=%dus, power=%duW\n",
		 rd->rp->name, rd->name, rd->energy_unit, rd->time_unit, rd->power_unit);

	return 0;
}
EXPORT_SYMBOL_NS_GPL(rapl_default_check_unit, "INTEL_RAPL");
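
/*
 * Illustrative example (typical unit MSR field values, not read from this
 * file): an energy unit field of 14 means 2^-14 Joules (~61 uJ) per counter
 * increment, a power unit field of 3 means 1/8 Watt, and a time unit field
 * of 10 means ~976 us. With the scaling above these become
 * energy_unit ~= 61035, power_unit = 125000 and time_unit = 976.
 */
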
static void power_limit_irq_save_cpu(void *info)
{
	u32 l, h = 0;
	struct rapl_package *rp = (struct rapl_package *)info;

	/* save the state of PLN irq mask bit before disabling it */
	rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h);
	if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED)) {
		rp->power_limit_irq = l & PACKAGE_THERM_INT_PLN_ENABLE;
		rp->power_limit_irq |= PACKAGE_PLN_INT_SAVED;
	}
	l &= ~PACKAGE_THERM_INT_PLN_ENABLE;
	wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
}

/* REVISIT:
 * When package power limit is set artificially low by RAPL, LVT
 * thermal interrupt for package power limit should be ignored
 * since we are not really exceeding the real limit. The intention
 * is to avoid excessive interrupts while we are trying to save power.
 * A useful feature might be routing the package_power_limit interrupt
 * to userspace via eventfd. Once we have a use case, this is simple
 * to do by adding an atomic notifier.
 */

static void package_power_limit_irq_save(struct rapl_package *rp)
{
	if (rp->lead_cpu < 0)
		return;

	if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN))
		return;

	smp_call_function_single(rp->lead_cpu, power_limit_irq_save_cpu, rp, 1);
}

/*
 * Restore per package power limit interrupt enable state. Called from cpu
 * hotplug code on package removal.
 */
static void package_power_limit_irq_restore(struct rapl_package *rp)
{
	u32 l, h;

	if (rp->lead_cpu < 0)
		return;

	if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN))
		return;

	/* irq enable state not saved, nothing to restore */
	if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED))
		return;

	rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h);

	if (rp->power_limit_irq & PACKAGE_THERM_INT_PLN_ENABLE)
		l |= PACKAGE_THERM_INT_PLN_ENABLE;
	else
		l &= ~PACKAGE_THERM_INT_PLN_ENABLE;

	wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
}

void rapl_default_set_floor_freq(struct rapl_domain *rd, bool mode)
{
	int i;

	/* always enable clamp such that p-state can go below OS requested
	 * range. power capping priority over guaranteed frequency.
	 */
	rapl_write_pl_data(rd, POWER_LIMIT1, PL_CLAMP, mode);

	for (i = POWER_LIMIT2; i < NR_POWER_LIMITS; i++) {
		rapl_write_pl_data(rd, i, PL_ENABLE, mode);
		rapl_write_pl_data(rd, i, PL_CLAMP, mode);
	}
}
EXPORT_SYMBOL_NS_GPL(rapl_default_set_floor_freq, "INTEL_RAPL");

u64 rapl_default_compute_time_window(struct rapl_domain *rd, u64 value, bool to_raw)
{
	u64 f, y; /* fraction and exp. used for time unit */

	/*
	 * Special processing based on 2^Y*(1+F/4), refer
	 * to Intel Software Developer's manual Vol.3B: CH 14.9.3.
	 */
	if (!to_raw) {
		f = (value & 0x60) >> 5;
		y = value & 0x1f;
		value = (1ULL << y) * (4 + f) * rd->time_unit / 4;
	} else {
		if (value < rd->time_unit)
			return 0;

		do_div(value, rd->time_unit);
		y = ilog2(value);

		/*
		 * The target hardware field is 7 bits wide, so return all ones
		 * if the exponent is too large.
		 */
		if (y > 0x1f)
			return 0x7f;

		f = div64_u64(4 * (value - BIT_ULL(y)), BIT_ULL(y));
		value = (y & 0x1f) | ((f & 0x3) << 5);
	}
	return value;
}
EXPORT_SYMBOL_NS_GPL(rapl_default_compute_time_window, "INTEL_RAPL");
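
/*
 * Illustrative example (assuming a typical time_unit of 976 us): encoding a
 * 1 second window gives value / time_unit ~= 1024, so y = ilog2(1024) = 10
 * and f = 0, i.e. a raw field of 0x0a. Decoding 0x0a reverses this:
 * 2^10 * (4 + 0) / 4 * 976 us ~= 999424 us, roughly 1 second again.
 */
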
/* Read once for all raw primitive data for domains */
static void rapl_update_domain_data(struct rapl_package *rp)
{
	int dmn, prim;
	u64 val;

	for (dmn = 0; dmn < rp->nr_domains; dmn++) {
		pr_debug("update %s domain %s data\n", rp->name,
			 rp->domains[dmn].name);
		/* exclude non-raw primitives */
		for (prim = 0; prim < NR_RAW_PRIMITIVES; prim++) {
			struct rapl_primitive_info *rpi = get_rpi(rp, prim);

			if (!rapl_read_data_raw(&rp->domains[dmn], prim,
						rpi->unit, &val, false))
				rp->domains[dmn].rdd.primitives[prim] = val;
		}
	}

}

static int rapl_package_register_powercap(struct rapl_package *rp)
{
	struct rapl_domain *rd;
	struct powercap_zone *power_zone = NULL;
	int nr_pl, ret;

	/* Update the domain data of the new package */
	rapl_update_domain_data(rp);

	/* first we register package domain as the parent zone */
	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
		if (rd->id == RAPL_DOMAIN_PACKAGE) {
			nr_pl = find_nr_power_limit(rd);
			pr_debug("register package domain %s\n", rp->name);
			power_zone = powercap_register_zone(&rd->power_zone,
					    rp->priv->control_type, rp->name,
					    NULL, &zone_ops[rd->id], nr_pl,
					    &constraint_ops);
			if (IS_ERR(power_zone)) {
				pr_debug("failed to register power zone %s\n",
					 rp->name);
				return PTR_ERR(power_zone);
			}
			/* track parent zone in per package/socket data */
			rp->power_zone = power_zone;
			/* done, only one package domain per socket */
			break;
		}
	}
	if (!power_zone) {
		pr_err("no package domain found, unknown topology!\n");
		return -ENODEV;
	}
	/* now register domains as children of the socket/package */
	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
		struct powercap_zone *parent = rp->power_zone;

		if (rd->id == RAPL_DOMAIN_PACKAGE)
			continue;
		if (rd->id == RAPL_DOMAIN_PLATFORM)
			parent = NULL;
		/* number of power limits per domain varies */
		nr_pl = find_nr_power_limit(rd);
		power_zone = powercap_register_zone(&rd->power_zone,
						    rp->priv->control_type,
						    rd->name, parent,
						    &zone_ops[rd->id], nr_pl,
						    &constraint_ops);

		if (IS_ERR(power_zone)) {
			pr_debug("failed to register power_zone, %s:%s\n",
				 rp->name, rd->name);
			ret = PTR_ERR(power_zone);
			goto err_cleanup;
		}
	}
	return 0;

err_cleanup:
	/*
	 * Clean up previously initialized domains within the package if we
	 * failed after the first domain setup.
	 */
	while (--rd >= rp->domains) {
		pr_debug("unregister %s domain %s\n", rp->name, rd->name);
		powercap_unregister_zone(rp->priv->control_type,
					 &rd->power_zone);
	}

	return ret;
}

static int rapl_check_domain(int domain, struct rapl_package *rp)
{
	struct reg_action ra;

	switch (domain) {
	case RAPL_DOMAIN_PACKAGE:
	case RAPL_DOMAIN_PP0:
	case RAPL_DOMAIN_PP1:
	case RAPL_DOMAIN_DRAM:
	case RAPL_DOMAIN_PLATFORM:
		ra.reg = rp->priv->regs[domain][RAPL_DOMAIN_REG_STATUS];
		break;
	default:
		pr_err("invalid domain id %d\n", domain);
		return -EINVAL;
	}
	/* make sure domain counters are available and contain non-zero
	 * values, otherwise skip it.
	 */

	ra.mask = ENERGY_STATUS_MASK;
	if (rp->priv->read_raw(get_rid(rp), &ra, false) || !ra.value)
		return -ENODEV;

	return 0;
}

/*
 * Get per domain energy/power/time unit.
 * RAPL Interfaces without per domain unit register will use the package
 * scope unit register to set per domain units.
 */
static int rapl_get_domain_unit(struct rapl_domain *rd)
{
	const struct rapl_defaults *defaults = get_defaults(rd->rp);
	int ret;

	if (!rd->regs[RAPL_DOMAIN_REG_UNIT].val) {
		if (!rd->rp->priv->reg_unit.val) {
			pr_err("No valid Unit register found\n");
			return -ENODEV;
		}
		rd->regs[RAPL_DOMAIN_REG_UNIT] = rd->rp->priv->reg_unit;
	}

	if (!defaults->check_unit) {
		pr_err("missing .check_unit() callback\n");
		return -ENODEV;
	}

	ret = defaults->check_unit(rd);
	if (ret)
		return ret;

	if (rd->id == RAPL_DOMAIN_DRAM && defaults->dram_domain_energy_unit)
		rd->energy_unit = defaults->dram_domain_energy_unit;
	if (rd->id == RAPL_DOMAIN_PLATFORM && defaults->psys_domain_energy_unit)
		rd->energy_unit = defaults->psys_domain_energy_unit;
	return 0;
}

/*
 * Check if power limits are available. Two cases when they are not available:
 * 1. Locked by BIOS, in this case we still provide read-only access so that
 *    users can see what limit is set by the BIOS.
 * 2. Some CPUs make some domains monitoring-only, which means PLx MSRs may not
 *    exist at all. In this case, we do not show the constraints in powercap.
 *
 * Called after domains are detected and initialized.
 */
static void rapl_detect_powerlimit(struct rapl_domain *rd)
{
	u64 val64;
	int i;

	for (i = POWER_LIMIT1; i < NR_POWER_LIMITS; i++) {
		if (!rapl_read_pl_data(rd, i, PL_LOCK, false, &val64)) {
			if (val64) {
				rd->rpl[i].locked = true;
				pr_info("%s:%s:%s locked by BIOS\n",
					rd->rp->name, rd->name, pl_names[i]);
			}
		}

		if (rapl_read_pl_data(rd, i, PL_LIMIT, false, &val64))
			rd->rpl[i].name = NULL;
	}
}

/* Detect active and valid domains for the given CPU, caller must
 * ensure the CPU belongs to the targeted package and CPU hotplug is disabled.
 */
static int rapl_detect_domains(struct rapl_package *rp)
{
	struct rapl_domain *rd;
	int i;

	for (i = 0; i < RAPL_DOMAIN_MAX; i++) {
		/* use physical package id to read counters */
		if (!rapl_check_domain(i, rp)) {
			rp->domain_map |= 1 << i;
			pr_info("Found RAPL domain %s\n", rapl_domain_names[i]);
		}
	}
	rp->nr_domains = bitmap_weight(&rp->domain_map, RAPL_DOMAIN_MAX);
	if (!rp->nr_domains) {
		pr_debug("no valid rapl domains found in %s\n", rp->name);
		return -ENODEV;
	}
	pr_debug("found %d domains on %s\n", rp->nr_domains, rp->name);

	rp->domains = kzalloc_objs(struct rapl_domain, rp->nr_domains);
	if (!rp->domains)
		return -ENOMEM;

	rapl_init_domains(rp);

	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
		rapl_get_domain_unit(rd);
		rapl_detect_powerlimit(rd);
	}

	return 0;
}

#ifdef CONFIG_PERF_EVENTS

/*
 * Support for RAPL PMU
 *
 * Register a PMU if any of the registered RAPL Packages have the requirement
 * of exposing its energy counters via Perf PMU.
 *
 * PMU Name:
 *	power
 *
 * Events:
 *	Name		Event id	RAPL Domain
 *	energy_cores	0x01		RAPL_DOMAIN_PP0
 *	energy_pkg	0x02		RAPL_DOMAIN_PACKAGE
 *	energy_ram	0x03		RAPL_DOMAIN_DRAM
 *	energy_gpu	0x04		RAPL_DOMAIN_PP1
 *	energy_psys	0x05		RAPL_DOMAIN_PLATFORM
 *
 * Unit:
 *	Joules
 *
 * Scale:
 *	2.3283064365386962890625e-10
 *	The same RAPL domain in different RAPL Packages may have different
 *	energy units. Use 2.3283064365386962890625e-10 (2^-32) Joules as
 *	the fixed unit for all energy counters, and convert each hardware
 *	counter increase to N times of PMU event counter increases.
 *
 * This is fully compatible with the current MSR RAPL PMU. This means that
 * userspace programs like turbostat can use the same code to handle RAPL Perf
 * PMU, no matter what RAPL Interface driver (MSR/TPMI, etc) is running
 * underneath on the platform.
 *
 * Note that RAPL Packages can be probed/removed dynamically, and the events
 * supported by each TPMI RAPL device can be different. Thus the RAPL PMU
 * support is done on demand, which means
 * 1. PMU is registered only if it is needed by a RAPL Package. PMU events for
 *    unsupported counters are not exposed.
 * 2. PMU is unregistered and registered when a new RAPL Package is probed and
 *    supports new counters that are not supported by current PMU.
 * 3. PMU is unregistered when all registered RAPL Packages don't need PMU.
 */
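
/*
 * Illustrative usage (not part of this file): once the "power" PMU is
 * registered, the counters can be read with standard perf tooling, e.g.
 *
 *	perf stat -e power/energy-pkg/ -a sleep 1
 *
 * perf multiplies the raw count by the advertised scale (2^-32) to report
 * Joules; the exact event names available depend on the domains detected
 * on the platform.
 */
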
struct rapl_pmu {
	struct pmu pmu;			/* Perf PMU structure */
	u64 timer_ms;			/* Maximum expiration time to avoid counter overflow */
	unsigned long domain_map;	/* Events supported by current registered PMU */
	bool registered;		/* Whether the PMU has been registered or not */
};

static struct rapl_pmu rapl_pmu;

/* PMU helpers */

static void set_pmu_cpumask(struct rapl_package *rp, cpumask_var_t mask)
{
	int cpu;

	if (!rp->has_pmu)
		return;

	/* Only TPMI & MSR RAPL are supported for now */
	if (rp->priv->type != RAPL_IF_TPMI && rp->priv->type != RAPL_IF_MSR)
		return;

	/* TPMI/MSR RAPL uses any CPU in the package for PMU */
	for_each_online_cpu(cpu)
		if (topology_physical_package_id(cpu) == rp->id)
			cpumask_set_cpu(cpu, mask);
}

static bool is_rp_pmu_cpu(struct rapl_package *rp, int cpu)
{
	if (!rp->has_pmu)
		return false;

	/* Only TPMI & MSR RAPL are supported for now */
	if (rp->priv->type != RAPL_IF_TPMI && rp->priv->type != RAPL_IF_MSR)
		return false;

	/* TPMI/MSR RAPL uses any CPU in the package for PMU */
	return topology_physical_package_id(cpu) == rp->id;
}

static struct rapl_package_pmu_data *event_to_pmu_data(struct perf_event *event)
{
	struct rapl_package *rp = event->pmu_private;

	return &rp->pmu_data;
}

/* PMU event callbacks */

static u64 event_read_counter(struct perf_event *event)
{
	struct rapl_package *rp = event->pmu_private;
	u64 val;
	int ret;

	/* Return 0 for unsupported events */
	if (event->hw.idx < 0)
		return 0;

	ret = rapl_read_data_raw(&rp->domains[event->hw.idx], ENERGY_COUNTER, false, &val, true);

	/* Return 0 for failed read */
	if (ret)
		return 0;

	return val;
}

static void __rapl_pmu_event_start(struct perf_event *event)
{
	struct rapl_package_pmu_data *data = event_to_pmu_data(event);

	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
		return;

	event->hw.state = 0;

	list_add_tail(&event->active_entry, &data->active_list);

	local64_set(&event->hw.prev_count, event_read_counter(event));
	if (++data->n_active == 1)
		hrtimer_start(&data->hrtimer, data->timer_interval,
			      HRTIMER_MODE_REL_PINNED);
}

static void rapl_pmu_event_start(struct perf_event *event, int mode)
{
	struct rapl_package_pmu_data *data = event_to_pmu_data(event);
	unsigned long flags;

	raw_spin_lock_irqsave(&data->lock, flags);
	__rapl_pmu_event_start(event);
	raw_spin_unlock_irqrestore(&data->lock, flags);
}

static u64 rapl_event_update(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	struct rapl_package_pmu_data *data = event_to_pmu_data(event);
	u64 prev_raw_count, new_raw_count;
	s64 delta, sdelta;

	/*
	 * Follow the generic code to drain hwc->prev_count.
	 * The loop is not expected to run multiple times.
	 */
	prev_raw_count = local64_read(&hwc->prev_count);
	do {
		new_raw_count = event_read_counter(event);
	} while (!local64_try_cmpxchg(&hwc->prev_count,
				      &prev_raw_count, new_raw_count));

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (event-)time and add that to the generic event.
	 */
	delta = new_raw_count - prev_raw_count;

	/*
	 * Scale delta to smallest unit (2^-32)
	 * users must then scale back: count * 1/(1e9*2^32) to get Joules
	 * or use ldexp(count, -32).
	 * Watts = Joules/Time delta
	 */
	sdelta = delta * data->scale[event->hw.flags];

	local64_add(sdelta, &event->count);

	return new_raw_count;
}

static void rapl_pmu_event_stop(struct perf_event *event, int mode)
{
	struct rapl_package_pmu_data *data = event_to_pmu_data(event);
	struct hw_perf_event *hwc = &event->hw;
	unsigned long flags;

	raw_spin_lock_irqsave(&data->lock, flags);

	/* Mark event as deactivated and stopped */
	if (!(hwc->state & PERF_HES_STOPPED)) {
		WARN_ON_ONCE(data->n_active <= 0);
		if (--data->n_active == 0)
			hrtimer_cancel(&data->hrtimer);

		list_del(&event->active_entry);

		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
		hwc->state |= PERF_HES_STOPPED;
	}

	/* Check if update of sw counter is necessary */
	if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
		/*
		 * Drain the remaining delta count out of an event
		 * that we are disabling:
		 */
		rapl_event_update(event);
		hwc->state |= PERF_HES_UPTODATE;
	}

	raw_spin_unlock_irqrestore(&data->lock, flags);
}

static int rapl_pmu_event_add(struct perf_event *event, int mode)
{
	struct rapl_package_pmu_data *data = event_to_pmu_data(event);
	struct hw_perf_event *hwc = &event->hw;
	unsigned long flags;

	raw_spin_lock_irqsave(&data->lock, flags);

	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;

	if (mode & PERF_EF_START)
		__rapl_pmu_event_start(event);

	raw_spin_unlock_irqrestore(&data->lock, flags);

	return 0;
}

static void rapl_pmu_event_del(struct perf_event *event, int flags)
{
	rapl_pmu_event_stop(event, PERF_EF_UPDATE);
}

/* RAPL PMU event ids, same as shown in sysfs */
enum perf_rapl_events {
	PERF_RAPL_PP0 = 1,	/* all cores */
	PERF_RAPL_PKG,		/* entire package */
	PERF_RAPL_RAM,		/* DRAM */
	PERF_RAPL_PP1,		/* gpu */
	PERF_RAPL_PSYS,		/* psys */
	PERF_RAPL_MAX
};

static const int event_to_domain[PERF_RAPL_MAX] = {
	[PERF_RAPL_PP0] = RAPL_DOMAIN_PP0,
	[PERF_RAPL_PKG] = RAPL_DOMAIN_PACKAGE,
	[PERF_RAPL_RAM] = RAPL_DOMAIN_DRAM,
	[PERF_RAPL_PP1] = RAPL_DOMAIN_PP1,
	[PERF_RAPL_PSYS] = RAPL_DOMAIN_PLATFORM,
};

static int rapl_pmu_event_init(struct perf_event *event)
{
	struct rapl_package *pos, *rp = NULL;
	u64 cfg = event->attr.config & RAPL_EVENT_MASK;
	int domain, idx;

	/* Only look at RAPL events */
	if (event->attr.type != event->pmu->type)
		return -ENOENT;

	/* Check for supported events only */
	if (!cfg || cfg >= PERF_RAPL_MAX)
		return -EINVAL;

	if (event->cpu < 0)
		return -EINVAL;

	/* Find out which Package the event belongs to */
	list_for_each_entry(pos, &rapl_packages, plist) {
		if (is_rp_pmu_cpu(pos, event->cpu)) {
			rp = pos;
			break;
		}
	}
	if (!rp)
		return -ENODEV;

	/* Find out which RAPL Domain the event belongs to */
	domain = event_to_domain[cfg];

	event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
	event->pmu_private = rp;	/* Which package */
	event->hw.flags = domain;	/* Which domain */

	event->hw.idx = -1;
	/* Find out the index in rp->domains[] to get domain pointer */
	for (idx = 0; idx < rp->nr_domains; idx++) {
		if (rp->domains[idx].id == domain) {
			event->hw.idx = idx;
			break;
		}
	}

	return 0;
}

static void rapl_pmu_event_read(struct perf_event *event)
{
	rapl_event_update(event);
}

static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
{
	struct rapl_package_pmu_data *data =
		container_of(hrtimer, struct rapl_package_pmu_data, hrtimer);
	struct perf_event *event;
	unsigned long flags;

	if (!data->n_active)
		return HRTIMER_NORESTART;

	raw_spin_lock_irqsave(&data->lock, flags);

	list_for_each_entry(event, &data->active_list, active_entry)
		rapl_event_update(event);

	raw_spin_unlock_irqrestore(&data->lock, flags);

	hrtimer_forward_now(hrtimer, data->timer_interval);

	return HRTIMER_RESTART;
}

/* PMU sysfs attributes */

/*
 * There are no default events, but we need to create an "events" group (with
 * empty attrs) before updating it with detected events.
 */
static struct attribute *attrs_empty[] = {
	NULL,
};

static struct attribute_group pmu_events_group = {
	.name = "events",
	.attrs = attrs_empty,
};

static ssize_t cpumask_show(struct device *dev,
			    struct device_attribute *attr, char *buf)
{
	struct rapl_package *rp;
	cpumask_var_t cpu_mask;
	int ret;

	if (!alloc_cpumask_var(&cpu_mask, GFP_KERNEL))
		return -ENOMEM;

	cpus_read_lock();

	cpumask_clear(cpu_mask);

	/* Choose a cpu for each RAPL Package */
	list_for_each_entry(rp, &rapl_packages, plist) {
		set_pmu_cpumask(rp, cpu_mask);
	}
	cpus_read_unlock();

	ret = cpumap_print_to_pagebuf(true, buf, cpu_mask);

	free_cpumask_var(cpu_mask);

	return ret;
}

static DEVICE_ATTR_RO(cpumask);

static struct attribute *pmu_cpumask_attrs[] = {
	&dev_attr_cpumask.attr,
	NULL
};

static struct attribute_group pmu_cpumask_group = {
	.attrs = pmu_cpumask_attrs,
};

PMU_FORMAT_ATTR(event, "config:0-7");
static struct attribute *pmu_format_attr[] = {
	&format_attr_event.attr,
	NULL
};

static struct attribute_group pmu_format_group = {
	.name = "format",
	.attrs = pmu_format_attr,
};

static const struct attribute_group *pmu_attr_groups[] = {
	&pmu_events_group,
	&pmu_cpumask_group,
	&pmu_format_group,
	NULL
};

#define RAPL_EVENT_ATTR_STR(_name, v, str)				\
static struct perf_pmu_events_attr event_attr_##v = {			\
	.attr = __ATTR(_name, 0444, perf_event_sysfs_show, NULL),	\
	.event_str = str,						\
}

RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
RAPL_EVENT_ATTR_STR(energy-pkg, rapl_pkg, "event=0x02");
RAPL_EVENT_ATTR_STR(energy-ram, rapl_ram, "event=0x03");
RAPL_EVENT_ATTR_STR(energy-gpu, rapl_gpu, "event=0x04");
RAPL_EVENT_ATTR_STR(energy-psys, rapl_psys, "event=0x05");

RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_unit_cores, "Joules");
RAPL_EVENT_ATTR_STR(energy-pkg.unit, rapl_unit_pkg, "Joules");
RAPL_EVENT_ATTR_STR(energy-ram.unit, rapl_unit_ram, "Joules");
RAPL_EVENT_ATTR_STR(energy-gpu.unit, rapl_unit_gpu, "Joules");
RAPL_EVENT_ATTR_STR(energy-psys.unit, rapl_unit_psys, "Joules");

RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_scale_cores, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_scale_pkg, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_scale_ram, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_scale_gpu, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-psys.scale, rapl_scale_psys, "2.3283064365386962890625e-10");

#define RAPL_EVENT_GROUP(_name, domain)					\
static struct attribute *pmu_attr_##_name[] = {				\
	&event_attr_rapl_##_name.attr.attr,				\
	&event_attr_rapl_unit_##_name.attr.attr,			\
	&event_attr_rapl_scale_##_name.attr.attr,			\
	NULL								\
};									\
static umode_t is_visible_##_name(struct kobject *kobj, struct attribute *attr, int event)	\
{									\
	return rapl_pmu.domain_map & BIT(domain) ? attr->mode : 0;	\
}									\
static struct attribute_group pmu_group_##_name = {			\
	.name = "events",						\
	.attrs = pmu_attr_##_name,					\
	.is_visible = is_visible_##_name,				\
}

RAPL_EVENT_GROUP(cores, RAPL_DOMAIN_PP0);
RAPL_EVENT_GROUP(pkg, RAPL_DOMAIN_PACKAGE);
RAPL_EVENT_GROUP(ram, RAPL_DOMAIN_DRAM);
RAPL_EVENT_GROUP(gpu, RAPL_DOMAIN_PP1);
RAPL_EVENT_GROUP(psys, RAPL_DOMAIN_PLATFORM);

static const struct attribute_group *pmu_attr_update[] = {
	&pmu_group_cores,
	&pmu_group_pkg,
	&pmu_group_ram,
	&pmu_group_gpu,
	&pmu_group_psys,
	NULL
};

static int rapl_pmu_update(struct rapl_package *rp)
{
	int ret = 0;

	/* Return if PMU already covers all events supported by current RAPL Package */
	if (rapl_pmu.registered && !(rp->domain_map & (~rapl_pmu.domain_map)))
		goto end;

	/* Unregister previous registered PMU */
	if (rapl_pmu.registered)
		perf_pmu_unregister(&rapl_pmu.pmu);

	rapl_pmu.registered = false;
	rapl_pmu.domain_map |= rp->domain_map;

	memset(&rapl_pmu.pmu, 0, sizeof(struct pmu));
	rapl_pmu.pmu.attr_groups = pmu_attr_groups;
	rapl_pmu.pmu.attr_update = pmu_attr_update;
	rapl_pmu.pmu.task_ctx_nr = perf_invalid_context;
	rapl_pmu.pmu.event_init = rapl_pmu_event_init;
	rapl_pmu.pmu.add = rapl_pmu_event_add;
	rapl_pmu.pmu.del = rapl_pmu_event_del;
	rapl_pmu.pmu.start = rapl_pmu_event_start;
	rapl_pmu.pmu.stop = rapl_pmu_event_stop;
	rapl_pmu.pmu.read = rapl_pmu_event_read;
	rapl_pmu.pmu.module = THIS_MODULE;
	rapl_pmu.pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT;
	ret = perf_pmu_register(&rapl_pmu.pmu, "power", -1);
	if (ret) {
		pr_info("Failed to register PMU\n");
		return ret;
	}

	rapl_pmu.registered = true;
end:
	rp->has_pmu = true;
	return ret;
}

int rapl_package_add_pmu_locked(struct rapl_package *rp)
{
	struct rapl_package_pmu_data *data = &rp->pmu_data;
	int idx;

	if (rp->has_pmu)
		return -EEXIST;

	for (idx = 0; idx < rp->nr_domains; idx++) {
		struct rapl_domain *rd = &rp->domains[idx];
		int domain = rd->id;
		u64 val;

		if (!test_bit(domain, &rp->domain_map))
			continue;

		/*
		 * The RAPL PMU granularity is 2^-32 Joules
		 * data->scale[]: times of 2^-32 Joules for each ENERGY COUNTER increase
		 */
		val = rd->energy_unit * (1ULL << 32);
		do_div(val, ENERGY_UNIT_SCALE * 1000000);
		data->scale[domain] = val;
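		/*
		 * Illustrative example (assuming a typical energy_unit of
		 * 61035, i.e. 2^-14 Joules per hardware count): scale becomes
		 * 61035 * 2^32 / 10^9, roughly 2^18, so every hardware counter
		 * increment adds about 262144 ticks of 2^-32 Joules to the
		 * perf event count.
		 */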

		if (!rapl_pmu.timer_ms) {
			struct rapl_primitive_info *rpi = get_rpi(rp, ENERGY_COUNTER);

			/*
			 * Calculate the timer rate:
			 * Use reference of 200W for scaling the timeout to avoid counter
			 * overflows.
			 *
			 * max_count = rpi->mask >> rpi->shift + 1
			 * max_energy_pj = max_count * rd->energy_unit
			 * max_time_sec = (max_energy_pj / 1000000000) / 200w
			 *
			 * rapl_pmu.timer_ms = max_time_sec * 1000 / 2
			 */
			val = (rpi->mask >> rpi->shift) + 1;
			val *= rd->energy_unit;
			do_div(val, 1000000 * 200 * 2);
			rapl_pmu.timer_ms = val;

			pr_debug("%llu ms overflow timer\n", rapl_pmu.timer_ms);
		}

		pr_debug("Domain %s: hw unit %lld * 2^-32 Joules\n", rd->name, data->scale[domain]);
	}

	/* Initialize per package PMU data */
	raw_spin_lock_init(&data->lock);
	INIT_LIST_HEAD(&data->active_list);
	data->timer_interval = ms_to_ktime(rapl_pmu.timer_ms);
	hrtimer_setup(&data->hrtimer, rapl_hrtimer_handle, CLOCK_MONOTONIC, HRTIMER_MODE_REL);

	return rapl_pmu_update(rp);
}
EXPORT_SYMBOL_NS_GPL(rapl_package_add_pmu_locked, "INTEL_RAPL");

int rapl_package_add_pmu(struct rapl_package *rp)
{
	guard(cpus_read_lock)();

	return rapl_package_add_pmu_locked(rp);
}
EXPORT_SYMBOL_NS_GPL(rapl_package_add_pmu, "INTEL_RAPL");

void rapl_package_remove_pmu_locked(struct rapl_package *rp)
{
	struct rapl_package *pos;

	if (!rp->has_pmu)
		return;

	list_for_each_entry(pos, &rapl_packages, plist) {
		/* PMU is still needed */
		if (pos->has_pmu && pos != rp)
			return;
	}

	perf_pmu_unregister(&rapl_pmu.pmu);
	memset(&rapl_pmu, 0, sizeof(struct rapl_pmu));
}
EXPORT_SYMBOL_NS_GPL(rapl_package_remove_pmu_locked, "INTEL_RAPL");

void rapl_package_remove_pmu(struct rapl_package *rp)
{
	guard(cpus_read_lock)();

	rapl_package_remove_pmu_locked(rp);
}
EXPORT_SYMBOL_NS_GPL(rapl_package_remove_pmu, "INTEL_RAPL");
#endif

/* called from CPU hotplug notifier, hotplug lock held */
void rapl_remove_package_cpuslocked(struct rapl_package *rp)
{
	struct rapl_domain *rd, *rd_package = NULL;

	package_power_limit_irq_restore(rp);

	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
		int i;

		for (i = POWER_LIMIT1; i < NR_POWER_LIMITS; i++) {
			rapl_write_pl_data(rd, i, PL_ENABLE, 0);
			rapl_write_pl_data(rd, i, PL_CLAMP, 0);
		}

		if (rd->id == RAPL_DOMAIN_PACKAGE) {
			rd_package = rd;
			continue;
		}
		pr_debug("remove package, undo power limit on %s: %s\n",
			 rp->name, rd->name);
		powercap_unregister_zone(rp->priv->control_type,
					 &rd->power_zone);
	}
	/* do parent zone last */
	powercap_unregister_zone(rp->priv->control_type,
				 &rd_package->power_zone);
	list_del(&rp->plist);
	kfree(rp);
}
EXPORT_SYMBOL_NS_GPL(rapl_remove_package_cpuslocked, "INTEL_RAPL");

void rapl_remove_package(struct rapl_package *rp)
{
	guard(cpus_read_lock)();
	rapl_remove_package_cpuslocked(rp);
}
EXPORT_SYMBOL_NS_GPL(rapl_remove_package, "INTEL_RAPL");

/*
 * RAPL Package energy counter scope:
 * 1. AMD/HYGON platforms use per-PKG package energy counter
 * 2. For Intel platforms
 *    2.1 CLX-AP platform has per-DIE package energy counter
 *    2.2 Other platforms that use MSR RAPL are single-die systems, so the
 *        package energy counter can be considered as per-PKG/per-DIE,
 *        here it is considered as per-DIE.
 *    2.3 New platforms that use TPMI RAPL don't care about the
 *        scope because they are not MSR/CPU based.
 */
#define rapl_msrs_are_pkg_scope()				\
	(boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||		\
	 boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)

/* caller to ensure CPU hotplug lock is held */
struct rapl_package *rapl_find_package_domain_cpuslocked(int id, struct rapl_if_priv *priv,
							  bool id_is_cpu)
{
	struct rapl_package *rp;
	int uid;

	if (id_is_cpu) {
		uid = rapl_msrs_are_pkg_scope() ?
		      topology_physical_package_id(id) : topology_logical_die_id(id);
		if (uid < 0) {
			pr_err("topology_logical_(package/die)_id() returned a negative value");
			return NULL;
		}
	} else {
		uid = id;
	}

	list_for_each_entry(rp, &rapl_packages, plist) {
		if (rp->id == uid
		    && rp->priv->control_type == priv->control_type)
			return rp;
	}

	return NULL;
}
EXPORT_SYMBOL_NS_GPL(rapl_find_package_domain_cpuslocked, "INTEL_RAPL");

struct rapl_package *rapl_find_package_domain(int id, struct rapl_if_priv *priv, bool id_is_cpu)
{
	guard(cpus_read_lock)();
	return rapl_find_package_domain_cpuslocked(id, priv, id_is_cpu);
}
EXPORT_SYMBOL_NS_GPL(rapl_find_package_domain, "INTEL_RAPL");

/* called from CPU hotplug notifier, hotplug lock held */
struct rapl_package *rapl_add_package_cpuslocked(int id, struct rapl_if_priv *priv, bool id_is_cpu)
{
	struct rapl_package *rp;
	int ret;

	rp = kzalloc_obj(struct rapl_package);
	if (!rp)
		return ERR_PTR(-ENOMEM);

	if (id_is_cpu) {
		rp->id = rapl_msrs_are_pkg_scope() ?
			 topology_physical_package_id(id) : topology_logical_die_id(id);
		if ((int)(rp->id) < 0) {
			pr_err("topology_logical_(package/die)_id() returned a negative value");
			return ERR_PTR(-EINVAL);
		}
		rp->lead_cpu = id;
		if (!rapl_msrs_are_pkg_scope() && topology_max_dies_per_package() > 1)
			snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH, "package-%d-die-%d",
				 topology_physical_package_id(id), topology_die_id(id));
		else
			snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH, "package-%d",
				 topology_physical_package_id(id));
	} else {
		rp->id = id;
		rp->lead_cpu = -1;
		snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH, "package-%d", id);
	}

	rp->priv = priv;
	ret = rapl_config(rp);
	if (ret)
		goto err_free_package;

	/* check if the package contains valid domains */
	if (rapl_detect_domains(rp)) {
		ret = -ENODEV;
		goto err_free_package;
	}
	ret = rapl_package_register_powercap(rp);
	if (!ret) {
		INIT_LIST_HEAD(&rp->plist);
		list_add(&rp->plist, &rapl_packages);
		return rp;
	}

err_free_package:
	kfree(rp->domains);
	kfree(rp);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL_NS_GPL(rapl_add_package_cpuslocked, "INTEL_RAPL");

struct rapl_package *rapl_add_package(int id, struct rapl_if_priv *priv, bool id_is_cpu)
{
	guard(cpus_read_lock)();
	return rapl_add_package_cpuslocked(id, priv, id_is_cpu);
}
EXPORT_SYMBOL_NS_GPL(rapl_add_package, "INTEL_RAPL");

static void power_limit_state_save(void)
{
	struct rapl_package *rp;
	struct rapl_domain *rd;
	int ret, i;

	cpus_read_lock();
	list_for_each_entry(rp, &rapl_packages, plist) {
		if (!rp->power_zone)
			continue;
		rd = power_zone_to_rapl_domain(rp->power_zone);
		for (i = POWER_LIMIT1; i < NR_POWER_LIMITS; i++) {
			ret = rapl_read_pl_data(rd, i, PL_LIMIT, true,
						&rd->rpl[i].last_power_limit);
			if (ret)
				rd->rpl[i].last_power_limit = 0;
		}
	}
	cpus_read_unlock();
}

static void power_limit_state_restore(void)
{
	struct rapl_package *rp;
	struct rapl_domain *rd;
	int i;

	cpus_read_lock();
	list_for_each_entry(rp, &rapl_packages, plist) {
		if (!rp->power_zone)
			continue;
		rd = power_zone_to_rapl_domain(rp->power_zone);
		for (i = POWER_LIMIT1; i < NR_POWER_LIMITS; i++)
			if (rd->rpl[i].last_power_limit)
				rapl_write_pl_data(rd, i, PL_LIMIT,
						   rd->rpl[i].last_power_limit);
	}
	cpus_read_unlock();
}

static int rapl_pm_callback(struct notifier_block *nb,
			    unsigned long mode, void *_unused)
{
	switch (mode) {
	case PM_SUSPEND_PREPARE:
		power_limit_state_save();
		break;
	case PM_POST_SUSPEND:
		power_limit_state_restore();
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block rapl_pm_notifier = {
	.notifier_call = rapl_pm_callback,
};

static int __init rapl_init(void)
{
	return register_pm_notifier(&rapl_pm_notifier);
}

static void __exit rapl_exit(void)
{
	unregister_pm_notifier(&rapl_pm_notifier);
}

fs_initcall(rapl_init);
module_exit(rapl_exit);

MODULE_DESCRIPTION("Intel Runtime Average Power Limit (RAPL) common code");
MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@intel.com>");
MODULE_LICENSE("GPL v2");