// SPDX-License-Identifier: MIT
/*
 * Copyright © 2020 Intel Corporation
 */

#include <linux/pm_qos.h>
#include <linux/sort.h>

#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"
#include "intel_gpu_commands.h"
#include "intel_gt_clock_utils.h"
#include "intel_gt_pm.h"
#include "intel_rc6.h"
#include "selftest_engine_heartbeat.h"
#include "selftest_rps.h"
#include "selftests/igt_flush_test.h"
#include "selftests/igt_spinner.h"
#include "selftests/librapl.h"

/* Try to isolate the impact of cstates from determining frequency response */
#define CPU_LATENCY 0 /* -1 to disable pm_qos, 0 to disable cstates */

static void dummy_rps_work(struct work_struct *wrk)
{
}

static int cmp_u64(const void *A, const void *B)
{
	const u64 *a = A, *b = B;

	if (*a < *b)
		return -1;
	else if (*a > *b)
		return 1;
	else
		return 0;
}

static int cmp_u32(const void *A, const void *B)
{
	const u32 *a = A, *b = B;

	if (*a < *b)
		return -1;
	else if (*a > *b)
		return 1;
	else
		return 0;
}

static struct i915_vma *
create_spin_counter(struct intel_engine_cs *engine,
		    struct i915_address_space *vm,
		    bool srm,
		    u32 **cancel,
		    u32 **counter)
{
	enum {
		COUNT,
		INC,
		__NGPR__,
	};
#define CS_GPR(x) GEN8_RING_CS_GPR(engine->mmio_base, x)
	struct drm_i915_gem_object *obj;
	struct i915_vma *vma;
	unsigned long end;
	u32 *base, *cs;
	int loop, i;
	int err;

	obj = i915_gem_object_create_internal(vm->i915, 64 << 10);
	if (IS_ERR(obj))
		return ERR_CAST(obj);

	end = obj->base.size / sizeof(u32) - 1;

	vma = i915_vma_instance(obj, vm, NULL);
	if (IS_ERR(vma)) {
		i915_gem_object_put(obj);
		return vma;
	}

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err) {
		i915_vma_put(vma);
		return ERR_PTR(err);
	}

	base = i915_gem_object_pin_map(obj, I915_MAP_WC);
	if (IS_ERR(base)) {
		i915_gem_object_put(obj);
		return ERR_CAST(base);
	}
	cs = base;

	*cs++ = MI_LOAD_REGISTER_IMM(__NGPR__ * 2);
	for (i = 0; i < __NGPR__; i++) {
		*cs++ = i915_mmio_reg_offset(CS_GPR(i));
		*cs++ = 0;
		*cs++ = i915_mmio_reg_offset(CS_GPR(i)) + 4;
		*cs++ = 0;
	}

	*cs++ = MI_LOAD_REGISTER_IMM(1);
	*cs++ = i915_mmio_reg_offset(CS_GPR(INC));
	*cs++ = 1;

	loop = cs - base;

	/* Unroll the loop to avoid MI_BB_START stalls impacting measurements */
	for (i = 0; i < 1024; i++) {
		*cs++ = MI_MATH(4);
		*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(COUNT));
		*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(INC));
		*cs++ = MI_MATH_ADD;
		*cs++ = MI_MATH_STORE(MI_MATH_REG(COUNT), MI_MATH_REG_ACCU);

		if (srm) {
			*cs++ = MI_STORE_REGISTER_MEM_GEN8;
			*cs++ = i915_mmio_reg_offset(CS_GPR(COUNT));
			*cs++ = lower_32_bits(vma->node.start + end * sizeof(*cs));
			*cs++ = upper_32_bits(vma->node.start + end * sizeof(*cs));
		}
	}

	*cs++ = MI_BATCH_BUFFER_START_GEN8;
	*cs++ = lower_32_bits(vma->node.start + loop * sizeof(*cs));
	*cs++ = upper_32_bits(vma->node.start + loop * sizeof(*cs));
	GEM_BUG_ON(cs - base > end);

	i915_gem_object_flush_map(obj);

	*cancel = base + loop;
	*counter = srm ? memset32(base + end, 0, 1) : NULL;
	return vma;
}
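
/*
 * Poll the CAGF until it reports the requested frequency, we hit the
 * timeout, or the readings stop changing (the last N samples are all
 * identical), doubling the sleep interval on each iteration. Returns
 * the last frequency observed.
 */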
static u8 wait_for_freq(struct intel_rps *rps, u8 freq, int timeout_ms)
{
	u8 history[64], i;
	unsigned long end;
	int sleep;

	i = 0;
	memset(history, freq, sizeof(history));
	sleep = 20;

	/* The PCU does not change instantly, but drifts towards the goal? */
	end = jiffies + msecs_to_jiffies(timeout_ms);
	do {
		u8 act;

		act = read_cagf(rps);
		if (time_after(jiffies, end))
			return act;

		/* Target acquired */
		if (act == freq)
			return act;

		/* Any change within the last N samples? */
		if (!memchr_inv(history, act, sizeof(history)))
			return act;

		history[i] = act;
		i = (i + 1) % ARRAY_SIZE(history);

		usleep_range(sleep, 2 * sleep);
		sleep *= 2;
		if (sleep > timeout_ms * 20)
			sleep = timeout_ms * 20;
	} while (1);
}

static u8 rps_set_check(struct intel_rps *rps, u8 freq)
{
	mutex_lock(&rps->lock);
	GEM_BUG_ON(!intel_rps_is_active(rps));
	intel_rps_set(rps, freq);
	GEM_BUG_ON(rps->last_freq != freq);
	mutex_unlock(&rps->lock);

	return wait_for_freq(rps, freq, 50);
}

static void show_pstate_limits(struct intel_rps *rps)
{
	struct drm_i915_private *i915 = rps_to_i915(rps);

	if (IS_BROXTON(i915)) {
		pr_info("P_STATE_CAP[%x]: 0x%08x\n",
			i915_mmio_reg_offset(BXT_RP_STATE_CAP),
			intel_uncore_read(rps_to_uncore(rps),
					  BXT_RP_STATE_CAP));
	} else if (IS_GEN(i915, 9)) {
		pr_info("P_STATE_LIMITS[%x]: 0x%08x\n",
			i915_mmio_reg_offset(GEN9_RP_STATE_LIMITS),
			intel_uncore_read(rps_to_uncore(rps),
					  GEN9_RP_STATE_LIMITS));
	}
}
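
/*
 * Check that the RPS evaluation interval counter ticks at the GT clock
 * frequency we derived: while an engine is kept busy with a spinner,
 * program an effectively infinite up-EI, sample GEN6_RP_CUR_UP_EI
 * against walltime, and require the two to agree within the 4:5
 * tolerance applied below.
 */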
int live_rps_clock_interval(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_rps *rps = &gt->rps;
	void (*saved_work)(struct work_struct *wrk);
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct igt_spinner spin;
	int err = 0;

	if (!intel_rps_is_enabled(rps))
		return 0;

	if (igt_spinner_init(&spin, gt))
		return -ENOMEM;

	intel_gt_pm_wait_for_idle(gt);
	saved_work = rps->work.func;
	rps->work.func = dummy_rps_work;

	intel_gt_pm_get(gt);
	intel_rps_disable(&gt->rps);

	intel_gt_check_clock_frequency(gt);

	for_each_engine(engine, gt, id) {
		struct i915_request *rq;
		u32 cycles;
		u64 dt;

		if (!intel_engine_can_store_dword(engine))
			continue;

		st_engine_heartbeat_disable(engine);

		rq = igt_spinner_create_request(&spin,
						engine->kernel_context,
						MI_NOOP);
		if (IS_ERR(rq)) {
			st_engine_heartbeat_enable(engine);
			err = PTR_ERR(rq);
			break;
		}

		i915_request_add(rq);

		if (!igt_wait_for_spinner(&spin, rq)) {
			pr_err("%s: RPS spinner did not start\n",
			       engine->name);
			igt_spinner_end(&spin);
			st_engine_heartbeat_enable(engine);
			intel_gt_set_wedged(engine->gt);
			err = -EIO;
			break;
		}

		intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);

		intel_uncore_write_fw(gt->uncore, GEN6_RP_CUR_UP_EI, 0);

		/* Set the evaluation interval to infinity! */
		intel_uncore_write_fw(gt->uncore,
				      GEN6_RP_UP_EI, 0xffffffff);
		intel_uncore_write_fw(gt->uncore,
				      GEN6_RP_UP_THRESHOLD, 0xffffffff);

		intel_uncore_write_fw(gt->uncore, GEN6_RP_CONTROL,
				      GEN6_RP_ENABLE | GEN6_RP_UP_BUSY_AVG);

		if (wait_for(intel_uncore_read_fw(gt->uncore,
						  GEN6_RP_CUR_UP_EI),
			     10)) {
			/* Just skip the test; assume lack of HW support */
			pr_notice("%s: rps evaluation interval not ticking\n",
				  engine->name);
			err = -ENODEV;
		} else {
			ktime_t dt_[5];
			u32 cycles_[5];
			int i;

			for (i = 0; i < 5; i++) {
				preempt_disable();

				dt_[i] = ktime_get();
				cycles_[i] = -intel_uncore_read_fw(gt->uncore, GEN6_RP_CUR_UP_EI);

				udelay(1000);

				dt_[i] = ktime_sub(ktime_get(), dt_[i]);
				cycles_[i] += intel_uncore_read_fw(gt->uncore, GEN6_RP_CUR_UP_EI);

				preempt_enable();
			}

			/* Use the median of both cycle/dt; close enough */
			sort(cycles_, 5, sizeof(*cycles_), cmp_u32, NULL);
			cycles = (cycles_[1] + 2 * cycles_[2] + cycles_[3]) / 4;
			sort(dt_, 5, sizeof(*dt_), cmp_u64, NULL);
			dt = div_u64(dt_[1] + 2 * dt_[2] + dt_[3], 4);
		}

		intel_uncore_write_fw(gt->uncore, GEN6_RP_CONTROL, 0);
		intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);

		igt_spinner_end(&spin);
		st_engine_heartbeat_enable(engine);

		if (err == 0) {
			u64 time = intel_gt_pm_interval_to_ns(gt, cycles);
			u32 expected =
				intel_gt_ns_to_pm_interval(gt, dt);

			pr_info("%s: rps counted %d C0 cycles [%lldns] in %lldns [%d cycles], using GT clock frequency of %uKHz\n",
				engine->name, cycles, time, dt, expected,
				gt->clock_frequency / 1000);

			if (10 * time < 8 * dt ||
			    8 * time > 10 * dt) {
				pr_err("%s: rps clock time does not match walltime!\n",
				       engine->name);
				err = -EINVAL;
			}

			if (10 * expected < 8 * cycles ||
			    8 * expected > 10 * cycles) {
				pr_err("%s: walltime does not match rps clock ticks!\n",
				       engine->name);
				err = -EINVAL;
			}
		}

		if (igt_flush_test(gt->i915))
			err = -EIO;

		break; /* once is enough */
	}

	intel_rps_enable(&gt->rps);
	intel_gt_pm_put(gt);

	igt_spinner_fini(&spin);

	intel_gt_pm_wait_for_idle(gt);
	rps->work.func = saved_work;

	if (err == -ENODEV) /* skipped, don't report a fail */
		err = 0;

	return err;
}

int live_rps_control(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_rps *rps = &gt->rps;
	void (*saved_work)(struct work_struct *wrk);
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct igt_spinner spin;
	int err = 0;

	/*
	 * Check that the actual frequency matches our requested frequency,
	 * to verify our control mechanism. We have to be careful that the
	 * PCU may throttle the GPU in which case the actual frequency used
	 * will be lower than requested.
	 */

	if (!intel_rps_is_enabled(rps))
		return 0;

	if (IS_CHERRYVIEW(gt->i915)) /* XXX fragile PCU */
		return 0;

	if (igt_spinner_init(&spin, gt))
		return -ENOMEM;

	intel_gt_pm_wait_for_idle(gt);
	saved_work = rps->work.func;
	rps->work.func = dummy_rps_work;

	intel_gt_pm_get(gt);
	for_each_engine(engine, gt, id) {
		struct i915_request *rq;
		ktime_t min_dt, max_dt;
		int f, limit;
		int min, max;

		if (!intel_engine_can_store_dword(engine))
			continue;

		st_engine_heartbeat_disable(engine);

		rq = igt_spinner_create_request(&spin,
						engine->kernel_context,
						MI_NOOP);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			break;
		}

		i915_request_add(rq);

		if (!igt_wait_for_spinner(&spin, rq)) {
			pr_err("%s: RPS spinner did not start\n",
			       engine->name);
			igt_spinner_end(&spin);
			st_engine_heartbeat_enable(engine);
			intel_gt_set_wedged(engine->gt);
			err = -EIO;
			break;
		}

		if (rps_set_check(rps, rps->min_freq) != rps->min_freq) {
			pr_err("%s: could not set minimum frequency [%x], only %x!\n",
			       engine->name, rps->min_freq, read_cagf(rps));
			igt_spinner_end(&spin);
			st_engine_heartbeat_enable(engine);
			show_pstate_limits(rps);
			err = -EINVAL;
			break;
		}

		for (f = rps->min_freq + 1; f < rps->max_freq; f++) {
			if (rps_set_check(rps, f) < f)
				break;
		}

		limit = rps_set_check(rps, f);

		if (rps_set_check(rps, rps->min_freq) != rps->min_freq) {
			pr_err("%s: could not restore minimum frequency [%x], only %x!\n",
			       engine->name, rps->min_freq, read_cagf(rps));
			igt_spinner_end(&spin);
			st_engine_heartbeat_enable(engine);
			show_pstate_limits(rps);
			err = -EINVAL;
			break;
		}

		max_dt = ktime_get();
		max = rps_set_check(rps, limit);
		max_dt = ktime_sub(ktime_get(), max_dt);

		min_dt = ktime_get();
		min = rps_set_check(rps, rps->min_freq);
		min_dt = ktime_sub(ktime_get(), min_dt);

		igt_spinner_end(&spin);
		st_engine_heartbeat_enable(engine);

		pr_info("%s: range:[%x:%uMHz, %x:%uMHz] limit:[%x:%uMHz], %x:%x response %lluns:%lluns\n",
			engine->name,
			rps->min_freq, intel_gpu_freq(rps, rps->min_freq),
			rps->max_freq, intel_gpu_freq(rps, rps->max_freq),
			limit, intel_gpu_freq(rps, limit),
			min, max, ktime_to_ns(min_dt), ktime_to_ns(max_dt));

		if (limit == rps->min_freq) {
			pr_err("%s: GPU throttled to minimum!\n",
			       engine->name);
			show_pstate_limits(rps);
			err = -ENODEV;
			break;
		}

		if (igt_flush_test(gt->i915)) {
			err = -EIO;
			break;
		}
	}
	intel_gt_pm_put(gt);

	igt_spinner_fini(&spin);

	intel_gt_pm_wait_for_idle(gt);
	rps->work.func = saved_work;

	return err;
}
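
/*
 * Dump the PCODE minimum frequency table (requested GPU frequency vs
 * the effective CPU and ring frequencies) to help diagnose scaling
 * failures. Only LLC platforms expose this table.
 */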
static void show_pcu_config(struct intel_rps *rps)
{
	struct drm_i915_private *i915 = rps_to_i915(rps);
	unsigned int max_gpu_freq, min_gpu_freq;
	intel_wakeref_t wakeref;
	int gpu_freq;

	if (!HAS_LLC(i915))
		return;

	min_gpu_freq = rps->min_freq;
	max_gpu_freq = rps->max_freq;
	if (INTEL_GEN(i915) >= 9) {
		/* Convert GT frequency to 50 HZ units */
		min_gpu_freq /= GEN9_FREQ_SCALER;
		max_gpu_freq /= GEN9_FREQ_SCALER;
	}

	wakeref = intel_runtime_pm_get(rps_to_uncore(rps)->rpm);

	pr_info("%5s %5s %5s\n", "GPU", "eCPU", "eRing");
	for (gpu_freq = min_gpu_freq; gpu_freq <= max_gpu_freq; gpu_freq++) {
		int ia_freq = gpu_freq;

		sandybridge_pcode_read(i915,
				       GEN6_PCODE_READ_MIN_FREQ_TABLE,
				       &ia_freq, NULL);

		pr_info("%5d %5d %5d\n",
			gpu_freq * 50,
			((ia_freq >> 0) & 0xff) * 100,
			((ia_freq >> 8) & 0xff) * 100);
	}

	intel_runtime_pm_put(rps_to_uncore(rps)->rpm, wakeref);
}

static u64 __measure_frequency(u32 *cntr, int duration_ms)
{
	u64 dc, dt;

	dt = ktime_get();
	dc = READ_ONCE(*cntr);
	usleep_range(1000 * duration_ms, 2000 * duration_ms);
	dc = READ_ONCE(*cntr) - dc;
	dt = ktime_get() - dt;

	return div64_u64(1000 * 1000 * dc, dt);
}

static u64 measure_frequency_at(struct intel_rps *rps, u32 *cntr, int *freq)
{
	u64 x[5];
	int i;

	*freq = rps_set_check(rps, *freq);
	for (i = 0; i < 5; i++)
		x[i] = __measure_frequency(cntr, 2);
	*freq = (*freq + read_cagf(rps)) / 2;

	/* A simple triangle filter for better result stability */
	sort(x, 5, sizeof(*x), cmp_u64, NULL);
	return div_u64(x[1] + 2 * x[2] + x[3], 4);
}

static u64 __measure_cs_frequency(struct intel_engine_cs *engine,
				  int duration_ms)
{
	u64 dc, dt;

	dt = ktime_get();
	dc = intel_uncore_read_fw(engine->uncore, CS_GPR(0));
	usleep_range(1000 * duration_ms, 2000 * duration_ms);
	dc = intel_uncore_read_fw(engine->uncore, CS_GPR(0)) - dc;
	dt = ktime_get() - dt;

	return div64_u64(1000 * 1000 * dc, dt);
}

static u64 measure_cs_frequency_at(struct intel_rps *rps,
				   struct intel_engine_cs *engine,
				   int *freq)
{
	u64 x[5];
	int i;

	*freq = rps_set_check(rps, *freq);
	for (i = 0; i < 5; i++)
		x[i] = __measure_cs_frequency(engine, 2);
	*freq = (*freq + read_cagf(rps)) / 2;

	/* A simple triangle filter for better result stability */
	sort(x, 5, sizeof(*x), cmp_u64, NULL);
	return div_u64(x[1] + 2 * x[2] + x[3], 4);
}

static bool scaled_within(u64 x, u64 y, u32 f_n, u32 f_d)
{
	return f_d * x > f_n * y && f_n * x < f_d * y;
}

int live_rps_frequency_cs(void *arg)
{
	void (*saved_work)(struct work_struct *wrk);
	struct intel_gt *gt = arg;
	struct intel_rps *rps = &gt->rps;
	struct intel_engine_cs *engine;
	struct pm_qos_request qos;
	enum intel_engine_id id;
	int err = 0;

	/*
	 * The premise is that the GPU does change frequency at our behest.
	 * Let's check there is a correspondence between the requested
	 * frequency, the actual frequency, and the observed clock rate.
	 */

	if (!intel_rps_is_enabled(rps))
		return 0;

	if (INTEL_GEN(gt->i915) < 8) /* for CS simplicity */
		return 0;

	if (CPU_LATENCY >= 0)
		cpu_latency_qos_add_request(&qos, CPU_LATENCY);

	intel_gt_pm_wait_for_idle(gt);
	saved_work = rps->work.func;
	rps->work.func = dummy_rps_work;

	for_each_engine(engine, gt, id) {
		struct i915_request *rq;
		struct i915_vma *vma;
		u32 *cancel, *cntr;
		struct {
			u64 count;
			int freq;
		} min, max;

		st_engine_heartbeat_disable(engine);

		vma = create_spin_counter(engine,
					  engine->kernel_context->vm, false,
					  &cancel, &cntr);
		if (IS_ERR(vma)) {
			err = PTR_ERR(vma);
			st_engine_heartbeat_enable(engine);
			break;
		}

		rq = intel_engine_create_kernel_request(engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto err_vma;
		}

		i915_vma_lock(vma);
		err = i915_request_await_object(rq, vma->obj, false);
		if (!err)
			err = i915_vma_move_to_active(vma, rq, 0);
		if (!err)
			err = rq->engine->emit_bb_start(rq,
							vma->node.start,
							PAGE_SIZE, 0);
		i915_vma_unlock(vma);
		i915_request_add(rq);
		if (err)
			goto err_vma;

		if (wait_for(intel_uncore_read(engine->uncore, CS_GPR(0)),
			     10)) {
			pr_err("%s: timed loop did not start\n",
			       engine->name);
			goto err_vma;
		}

		min.freq = rps->min_freq;
		min.count = measure_cs_frequency_at(rps, engine, &min.freq);

		max.freq = rps->max_freq;
		max.count = measure_cs_frequency_at(rps, engine, &max.freq);

		pr_info("%s: min:%lluKHz @ %uMHz, max:%lluKHz @ %uMHz [%d%%]\n",
			engine->name,
			min.count, intel_gpu_freq(rps, min.freq),
			max.count, intel_gpu_freq(rps, max.freq),
			(int)DIV64_U64_ROUND_CLOSEST(100 * min.freq * max.count,
						     max.freq * min.count));

		if (!scaled_within(max.freq * min.count,
				   min.freq * max.count,
				   2, 3)) {
			int f;

			pr_err("%s: CS did not scale with frequency! scaled min:%llu, max:%llu\n",
			       engine->name,
			       max.freq * min.count,
			       min.freq * max.count);
			show_pcu_config(rps);

			for (f = min.freq + 1; f <= rps->max_freq; f++) {
				int act = f;
				u64 count;

				count = measure_cs_frequency_at(rps, engine, &act);
				if (act < f)
					break;

				pr_info("%s: %x:%uMHz: %lluKHz [%d%%]\n",
					engine->name,
					act, intel_gpu_freq(rps, act), count,
					(int)DIV64_U64_ROUND_CLOSEST(100 * min.freq * count,
								     act * min.count));

				f = act; /* may skip ahead [pcu granularity] */
			}

			err = -EINVAL;
		}

err_vma:
		*cancel = MI_BATCH_BUFFER_END;
		i915_gem_object_flush_map(vma->obj);
		i915_gem_object_unpin_map(vma->obj);
		i915_vma_unpin(vma);
		i915_vma_put(vma);

		st_engine_heartbeat_enable(engine);
		if (igt_flush_test(gt->i915))
			err = -EIO;
		if (err)
			break;
	}

	intel_gt_pm_wait_for_idle(gt);
	rps->work.func = saved_work;

	if (CPU_LATENCY >= 0)
		cpu_latency_qos_remove_request(&qos);

	return err;
}

int live_rps_frequency_srm(void *arg)
{
	void (*saved_work)(struct work_struct *wrk);
	struct intel_gt *gt = arg;
	struct intel_rps *rps = &gt->rps;
	struct intel_engine_cs *engine;
	struct pm_qos_request qos;
	enum intel_engine_id id;
	int err = 0;

	/*
	 * The premise is that the GPU does change frequency at our behest.
	 * Let's check there is a correspondence between the requested
	 * frequency, the actual frequency, and the observed clock rate.
	 */

	if (!intel_rps_is_enabled(rps))
		return 0;

	if (INTEL_GEN(gt->i915) < 8) /* for CS simplicity */
		return 0;

	if (CPU_LATENCY >= 0)
		cpu_latency_qos_add_request(&qos, CPU_LATENCY);

	intel_gt_pm_wait_for_idle(gt);
	saved_work = rps->work.func;
	rps->work.func = dummy_rps_work;

	for_each_engine(engine, gt, id) {
		struct i915_request *rq;
		struct i915_vma *vma;
		u32 *cancel, *cntr;
		struct {
			u64 count;
			int freq;
		} min, max;

		st_engine_heartbeat_disable(engine);

		vma = create_spin_counter(engine,
					  engine->kernel_context->vm, true,
					  &cancel, &cntr);
		if (IS_ERR(vma)) {
			err = PTR_ERR(vma);
			st_engine_heartbeat_enable(engine);
			break;
		}

		rq = intel_engine_create_kernel_request(engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto err_vma;
		}

		i915_vma_lock(vma);
		err = i915_request_await_object(rq, vma->obj, false);
		if (!err)
			err = i915_vma_move_to_active(vma, rq, 0);
		if (!err)
			err = rq->engine->emit_bb_start(rq,
							vma->node.start,
							PAGE_SIZE, 0);
		i915_vma_unlock(vma);
		i915_request_add(rq);
		if (err)
			goto err_vma;

		if (wait_for(READ_ONCE(*cntr), 10)) {
			pr_err("%s: timed loop did not start\n",
			       engine->name);
			goto err_vma;
		}

		min.freq = rps->min_freq;
		min.count = measure_frequency_at(rps, cntr, &min.freq);

		max.freq = rps->max_freq;
		max.count = measure_frequency_at(rps, cntr, &max.freq);

		pr_info("%s: min:%lluKHz @ %uMHz, max:%lluKHz @ %uMHz [%d%%]\n",
			engine->name,
			min.count, intel_gpu_freq(rps, min.freq),
			max.count, intel_gpu_freq(rps, max.freq),
			(int)DIV64_U64_ROUND_CLOSEST(100 * min.freq * max.count,
						     max.freq * min.count));

		if (!scaled_within(max.freq * min.count,
				   min.freq * max.count,
				   1, 2)) {
			int f;

			pr_err("%s: CS did not scale with frequency! scaled min:%llu, max:%llu\n",
			       engine->name,
			       max.freq * min.count,
			       min.freq * max.count);
			show_pcu_config(rps);

			for (f = min.freq + 1; f <= rps->max_freq; f++) {
				int act = f;
				u64 count;

				count = measure_frequency_at(rps, cntr, &act);
				if (act < f)
					break;

				pr_info("%s: %x:%uMHz: %lluKHz [%d%%]\n",
					engine->name,
					act, intel_gpu_freq(rps, act), count,
					(int)DIV64_U64_ROUND_CLOSEST(100 * min.freq * count,
								     act * min.count));

				f = act; /* may skip ahead [pcu granularity] */
			}

			err = -EINVAL;
		}

err_vma:
		*cancel = MI_BATCH_BUFFER_END;
		i915_gem_object_flush_map(vma->obj);
		i915_gem_object_unpin_map(vma->obj);
		i915_vma_unpin(vma);
		i915_vma_put(vma);

		st_engine_heartbeat_enable(engine);
		if (igt_flush_test(gt->i915))
			err = -EIO;
		if (err)
			break;
	}

	intel_gt_pm_wait_for_idle(gt);
	rps->work.func = saved_work;

	if (CPU_LATENCY >= 0)
		cpu_latency_qos_remove_request(&qos);

	return err;
}
static void sleep_for_ei(struct intel_rps *rps, int timeout_us)
{
	/* Flush any previous EI */
	usleep_range(timeout_us, 2 * timeout_us);

	/* Reset the interrupt status */
	rps_disable_interrupts(rps);
	GEM_BUG_ON(rps->pm_iir);
	rps_enable_interrupts(rps);

	/* And then wait for the timeout, for real this time */
	usleep_range(2 * timeout_us, 3 * timeout_us);
}

static int __rps_up_interrupt(struct intel_rps *rps,
			      struct intel_engine_cs *engine,
			      struct igt_spinner *spin)
{
	struct intel_uncore *uncore = engine->uncore;
	struct i915_request *rq;
	u32 timeout;

	if (!intel_engine_can_store_dword(engine))
		return 0;

	rps_set_check(rps, rps->min_freq);

	rq = igt_spinner_create_request(spin, engine->kernel_context, MI_NOOP);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	i915_request_get(rq);
	i915_request_add(rq);

	if (!igt_wait_for_spinner(spin, rq)) {
		pr_err("%s: RPS spinner did not start\n",
		       engine->name);
		i915_request_put(rq);
		intel_gt_set_wedged(engine->gt);
		return -EIO;
	}

	if (!intel_rps_is_active(rps)) {
		pr_err("%s: RPS not enabled on starting spinner\n",
		       engine->name);
		igt_spinner_end(spin);
		i915_request_put(rq);
		return -EINVAL;
	}

	if (!(rps->pm_events & GEN6_PM_RP_UP_THRESHOLD)) {
		pr_err("%s: RPS did not register UP interrupt\n",
		       engine->name);
		i915_request_put(rq);
		return -EINVAL;
	}

	if (rps->last_freq != rps->min_freq) {
		pr_err("%s: RPS did not program min frequency\n",
		       engine->name);
		i915_request_put(rq);
		return -EINVAL;
	}

	timeout = intel_uncore_read(uncore, GEN6_RP_UP_EI);
	timeout = intel_gt_pm_interval_to_ns(engine->gt, timeout);
	timeout = DIV_ROUND_UP(timeout, 1000);

	sleep_for_ei(rps, timeout);
	GEM_BUG_ON(i915_request_completed(rq));

	igt_spinner_end(spin);
	i915_request_put(rq);

	if (rps->cur_freq != rps->min_freq) {
		pr_err("%s: Frequency unexpectedly changed [up], now %d!\n",
		       engine->name, intel_rps_read_actual_frequency(rps));
		return -EINVAL;
	}

	if (!(rps->pm_iir & GEN6_PM_RP_UP_THRESHOLD)) {
		pr_err("%s: UP interrupt not recorded for spinner, pm_iir:%x, prev_up:%x, up_threshold:%x, up_ei:%x\n",
		       engine->name, rps->pm_iir,
		       intel_uncore_read(uncore, GEN6_RP_PREV_UP),
		       intel_uncore_read(uncore, GEN6_RP_UP_THRESHOLD),
		       intel_uncore_read(uncore, GEN6_RP_UP_EI));
		return -EINVAL;
	}

	return 0;
}
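
/*
 * Mirror image of __rps_up_interrupt(): request the maximum frequency,
 * then while the engine sits awake but idle for an evaluation interval
 * check that a DOWN threshold (or timeout) interrupt is raised and that
 * the frequency was not changed behind our back.
 */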
static int __rps_down_interrupt(struct intel_rps *rps,
				struct intel_engine_cs *engine)
{
	struct intel_uncore *uncore = engine->uncore;
	u32 timeout;

	rps_set_check(rps, rps->max_freq);

	if (!(rps->pm_events & GEN6_PM_RP_DOWN_THRESHOLD)) {
		pr_err("%s: RPS did not register DOWN interrupt\n",
		       engine->name);
		return -EINVAL;
	}

	if (rps->last_freq != rps->max_freq) {
		pr_err("%s: RPS did not program max frequency\n",
		       engine->name);
		return -EINVAL;
	}

	timeout = intel_uncore_read(uncore, GEN6_RP_DOWN_EI);
	timeout = intel_gt_pm_interval_to_ns(engine->gt, timeout);
	timeout = DIV_ROUND_UP(timeout, 1000);

	sleep_for_ei(rps, timeout);

	if (rps->cur_freq != rps->max_freq) {
		pr_err("%s: Frequency unexpectedly changed [down], now %d!\n",
		       engine->name,
		       intel_rps_read_actual_frequency(rps));
		return -EINVAL;
	}

	if (!(rps->pm_iir & (GEN6_PM_RP_DOWN_THRESHOLD | GEN6_PM_RP_DOWN_TIMEOUT))) {
		pr_err("%s: DOWN interrupt not recorded for idle, pm_iir:%x, prev_down:%x, down_threshold:%x, down_ei:%x [prev_up:%x, up_threshold:%x, up_ei:%x]\n",
		       engine->name, rps->pm_iir,
		       intel_uncore_read(uncore, GEN6_RP_PREV_DOWN),
		       intel_uncore_read(uncore, GEN6_RP_DOWN_THRESHOLD),
		       intel_uncore_read(uncore, GEN6_RP_DOWN_EI),
		       intel_uncore_read(uncore, GEN6_RP_PREV_UP),
		       intel_uncore_read(uncore, GEN6_RP_UP_THRESHOLD),
		       intel_uncore_read(uncore, GEN6_RP_UP_EI));
		return -EINVAL;
	}

	return 0;
}

int live_rps_interrupt(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_rps *rps = &gt->rps;
	void (*saved_work)(struct work_struct *wrk);
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct igt_spinner spin;
	u32 pm_events;
	int err = 0;

	/*
	 * First, let's check whether or not we are receiving interrupts.
	 */

	if (!intel_rps_has_interrupts(rps))
		return 0;

	intel_gt_pm_get(gt);
	pm_events = rps->pm_events;
	intel_gt_pm_put(gt);
	if (!pm_events) {
		pr_err("No RPS PM events registered, but RPS is enabled?\n");
		return -ENODEV;
	}

	if (igt_spinner_init(&spin, gt))
		return -ENOMEM;

	intel_gt_pm_wait_for_idle(gt);
	saved_work = rps->work.func;
	rps->work.func = dummy_rps_work;

	for_each_engine(engine, gt, id) {
		/* Keep the engine busy with a spinner; expect an UP! */
		if (pm_events & GEN6_PM_RP_UP_THRESHOLD) {
			intel_gt_pm_wait_for_idle(engine->gt);
			GEM_BUG_ON(intel_rps_is_active(rps));

			st_engine_heartbeat_disable(engine);

			err = __rps_up_interrupt(rps, engine, &spin);

			st_engine_heartbeat_enable(engine);
			if (err)
				goto out;

			intel_gt_pm_wait_for_idle(engine->gt);
		}

		/* Keep the engine awake but idle and check for DOWN */
		if (pm_events & GEN6_PM_RP_DOWN_THRESHOLD) {
			st_engine_heartbeat_disable(engine);
			intel_rc6_disable(&gt->rc6);

			err = __rps_down_interrupt(rps, engine);

			intel_rc6_enable(&gt->rc6);
			st_engine_heartbeat_enable(engine);
			if (err)
				goto out;
		}
	}

out:
	if (igt_flush_test(gt->i915))
		err = -EIO;

	igt_spinner_fini(&spin);

	intel_gt_pm_wait_for_idle(gt);
	rps->work.func = saved_work;

	return err;
}

static u64 __measure_power(int duration_ms)
{
	u64 dE, dt;

	dt = ktime_get();
	dE = librapl_energy_uJ();
	usleep_range(1000 * duration_ms, 2000 * duration_ms);
	dE = librapl_energy_uJ() - dE;
	dt = ktime_get() - dt;

	return div64_u64(1000 * 1000 * dE, dt);
}

static u64 measure_power_at(struct intel_rps *rps, int *freq)
{
	u64 x[5];
	int i;

	*freq = rps_set_check(rps, *freq);
	for (i = 0; i < 5; i++)
		x[i] = __measure_power(5);
	*freq = (*freq + read_cagf(rps)) / 2;

	/* A simple triangle filter for better result stability */
	sort(x, 5, sizeof(*x), cmp_u64, NULL);
	return div_u64(x[1] + 2 * x[2] + x[3], 4);
}

int live_rps_power(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_rps *rps = &gt->rps;
	void (*saved_work)(struct work_struct *wrk);
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct igt_spinner spin;
	int err = 0;

	/*
	 * Our fundamental assumption is that running at lower frequency
	 * actually saves power. Let's see if our RAPL measurement supports
	 * that theory.
	 */

	if (!intel_rps_is_enabled(rps))
		return 0;

	if (!librapl_energy_uJ())
		return 0;

	if (igt_spinner_init(&spin, gt))
		return -ENOMEM;

	intel_gt_pm_wait_for_idle(gt);
	saved_work = rps->work.func;
	rps->work.func = dummy_rps_work;

	for_each_engine(engine, gt, id) {
		struct i915_request *rq;
		struct {
			u64 power;
			int freq;
		} min, max;

		if (!intel_engine_can_store_dword(engine))
			continue;

		st_engine_heartbeat_disable(engine);

		rq = igt_spinner_create_request(&spin,
						engine->kernel_context,
						MI_NOOP);
		if (IS_ERR(rq)) {
			st_engine_heartbeat_enable(engine);
			err = PTR_ERR(rq);
			break;
		}

		i915_request_add(rq);

		if (!igt_wait_for_spinner(&spin, rq)) {
			pr_err("%s: RPS spinner did not start\n",
			       engine->name);
			igt_spinner_end(&spin);
			st_engine_heartbeat_enable(engine);
			intel_gt_set_wedged(engine->gt);
			err = -EIO;
			break;
		}

		max.freq = rps->max_freq;
		max.power = measure_power_at(rps, &max.freq);

		min.freq = rps->min_freq;
		min.power = measure_power_at(rps, &min.freq);

		igt_spinner_end(&spin);
		st_engine_heartbeat_enable(engine);

		pr_info("%s: min:%llumW @ %uMHz, max:%llumW @ %uMHz\n",
			engine->name,
			min.power, intel_gpu_freq(rps, min.freq),
			max.power, intel_gpu_freq(rps, max.freq));

		if (10 * min.freq >= 9 * max.freq) {
			pr_notice("Could not control frequency, ran at [%d:%uMHz, %d:%uMHz]\n",
				  min.freq, intel_gpu_freq(rps, min.freq),
				  max.freq, intel_gpu_freq(rps, max.freq));
			continue;
		}

		if (11 * min.power > 10 * max.power) {
			pr_err("%s: did not conserve power when setting lower frequency!\n",
			       engine->name);
			err = -EINVAL;
			break;
		}

		if (igt_flush_test(gt->i915)) {
			err = -EIO;
			break;
		}
	}

	igt_spinner_fini(&spin);

	intel_gt_pm_wait_for_idle(gt);
	rps->work.func = saved_work;

	return err;
}

int live_rps_dynamic(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_rps *rps = &gt->rps;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct igt_spinner spin;
	int err = 0;

	/*
	 * We've looked at the basics, and have established that we
	 * can change the clock frequency and that the HW will generate
	 * interrupts based on load. Now we check how we integrate those
	 * moving parts into dynamic reclocking based on load.
	 */

	if (!intel_rps_is_enabled(rps))
		return 0;

	if (igt_spinner_init(&spin, gt))
		return -ENOMEM;

	if (intel_rps_has_interrupts(rps))
		pr_info("RPS has interrupt support\n");
	if (intel_rps_uses_timer(rps))
		pr_info("RPS has timer support\n");

	for_each_engine(engine, gt, id) {
		struct i915_request *rq;
		struct {
			ktime_t dt;
			u8 freq;
		} min, max;

		if (!intel_engine_can_store_dword(engine))
			continue;

		intel_gt_pm_wait_for_idle(gt);
		GEM_BUG_ON(intel_rps_is_active(rps));
		rps->cur_freq = rps->min_freq;

		intel_engine_pm_get(engine);
		intel_rc6_disable(&gt->rc6);
		GEM_BUG_ON(rps->last_freq != rps->min_freq);

		rq = igt_spinner_create_request(&spin,
						engine->kernel_context,
						MI_NOOP);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto err;
		}

		i915_request_add(rq);

		max.dt = ktime_get();
		max.freq = wait_for_freq(rps, rps->max_freq, 500);
		max.dt = ktime_sub(ktime_get(), max.dt);

		igt_spinner_end(&spin);

		min.dt = ktime_get();
		min.freq = wait_for_freq(rps, rps->min_freq, 2000);
		min.dt = ktime_sub(ktime_get(), min.dt);

		pr_info("%s: dynamically reclocked to %u:%uMHz while busy in %lluns, and %u:%uMHz while idle in %lluns\n",
			engine->name,
			max.freq, intel_gpu_freq(rps, max.freq),
			ktime_to_ns(max.dt),
			min.freq, intel_gpu_freq(rps, min.freq),
			ktime_to_ns(min.dt));
		if (min.freq >= max.freq) {
			pr_err("%s: dynamic reclocking of spinner failed!\n",
			       engine->name);
			err = -EINVAL;
		}

err:
		intel_rc6_enable(&gt->rc6);
		intel_engine_pm_put(engine);

		if (igt_flush_test(gt->i915))
			err = -EIO;
		if (err)
			break;
	}

	igt_spinner_fini(&spin);

	return err;
}