xref: /linux/drivers/gpu/drm/i915/gt/selftest_rps.c (revision dec1c62e91ba268ab2a6e339d4d7a59287d5eba1)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2020 Intel Corporation
4  */
5 
6 #include <linux/pm_qos.h>
7 #include <linux/sort.h>
8 
9 #include "gem/i915_gem_internal.h"
10 
11 #include "intel_engine_heartbeat.h"
12 #include "intel_engine_pm.h"
13 #include "intel_engine_regs.h"
14 #include "intel_gpu_commands.h"
15 #include "intel_gt_clock_utils.h"
16 #include "intel_gt_pm.h"
17 #include "intel_rc6.h"
18 #include "selftest_engine_heartbeat.h"
19 #include "selftest_rps.h"
20 #include "selftests/igt_flush_test.h"
21 #include "selftests/igt_spinner.h"
22 #include "selftests/librapl.h"
23 
/* Try to isolate the impact of cstates from determining frequency response */
25 #define CPU_LATENCY 0 /* -1 to disable pm_qos, 0 to disable cstates */
26 
/*
 * Work handler that does nothing. The live tests in this file install it
 * as rps->work.func for their duration and restore the saved handler
 * before returning.
 */
static void dummy_rps_work(struct work_struct *unused)
{
	/* intentionally empty */
}
30 
31 static int cmp_u64(const void *A, const void *B)
32 {
33 	const u64 *a = A, *b = B;
34 
35 	if (*a < *b)
36 		return -1;
37 	else if (*a > *b)
38 		return 1;
39 	else
40 		return 0;
41 }
42 
43 static int cmp_u32(const void *A, const void *B)
44 {
45 	const u32 *a = A, *b = B;
46 
47 	if (*a < *b)
48 		return -1;
49 	else if (*a > *b)
50 		return 1;
51 	else
52 		return 0;
53 }
54 
55 static struct i915_vma *
56 create_spin_counter(struct intel_engine_cs *engine,
57 		    struct i915_address_space *vm,
58 		    bool srm,
59 		    u32 **cancel,
60 		    u32 **counter)
61 {
62 	enum {
63 		COUNT,
64 		INC,
65 		__NGPR__,
66 	};
67 #define CS_GPR(x) GEN8_RING_CS_GPR(engine->mmio_base, x)
68 	struct drm_i915_gem_object *obj;
69 	struct i915_vma *vma;
70 	unsigned long end;
71 	u32 *base, *cs;
72 	int loop, i;
73 	int err;
74 
75 	obj = i915_gem_object_create_internal(vm->i915, 64 << 10);
76 	if (IS_ERR(obj))
77 		return ERR_CAST(obj);
78 
79 	end = obj->base.size / sizeof(u32) - 1;
80 
81 	vma = i915_vma_instance(obj, vm, NULL);
82 	if (IS_ERR(vma)) {
83 		err = PTR_ERR(vma);
84 		goto err_put;
85 	}
86 
87 	err = i915_vma_pin(vma, 0, 0, PIN_USER);
88 	if (err)
89 		goto err_unlock;
90 
91 	i915_vma_lock(vma);
92 
93 	base = i915_gem_object_pin_map(obj, I915_MAP_WC);
94 	if (IS_ERR(base)) {
95 		err = PTR_ERR(base);
96 		goto err_unpin;
97 	}
98 	cs = base;
99 
100 	*cs++ = MI_LOAD_REGISTER_IMM(__NGPR__ * 2);
101 	for (i = 0; i < __NGPR__; i++) {
102 		*cs++ = i915_mmio_reg_offset(CS_GPR(i));
103 		*cs++ = 0;
104 		*cs++ = i915_mmio_reg_offset(CS_GPR(i)) + 4;
105 		*cs++ = 0;
106 	}
107 
108 	*cs++ = MI_LOAD_REGISTER_IMM(1);
109 	*cs++ = i915_mmio_reg_offset(CS_GPR(INC));
110 	*cs++ = 1;
111 
112 	loop = cs - base;
113 
114 	/* Unroll the loop to avoid MI_BB_START stalls impacting measurements */
115 	for (i = 0; i < 1024; i++) {
116 		*cs++ = MI_MATH(4);
117 		*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(COUNT));
118 		*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(INC));
119 		*cs++ = MI_MATH_ADD;
120 		*cs++ = MI_MATH_STORE(MI_MATH_REG(COUNT), MI_MATH_REG_ACCU);
121 
122 		if (srm) {
123 			*cs++ = MI_STORE_REGISTER_MEM_GEN8;
124 			*cs++ = i915_mmio_reg_offset(CS_GPR(COUNT));
125 			*cs++ = lower_32_bits(vma->node.start + end * sizeof(*cs));
126 			*cs++ = upper_32_bits(vma->node.start + end * sizeof(*cs));
127 		}
128 	}
129 
130 	*cs++ = MI_BATCH_BUFFER_START_GEN8;
131 	*cs++ = lower_32_bits(vma->node.start + loop * sizeof(*cs));
132 	*cs++ = upper_32_bits(vma->node.start + loop * sizeof(*cs));
133 	GEM_BUG_ON(cs - base > end);
134 
135 	i915_gem_object_flush_map(obj);
136 
137 	*cancel = base + loop;
138 	*counter = srm ? memset32(base + end, 0, 1) : NULL;
139 	return vma;
140 
141 err_unpin:
142 	i915_vma_unpin(vma);
143 err_unlock:
144 	i915_vma_unlock(vma);
145 err_put:
146 	i915_gem_object_put(obj);
147 	return ERR_PTR(err);
148 }
149 
150 static u8 wait_for_freq(struct intel_rps *rps, u8 freq, int timeout_ms)
151 {
152 	u8 history[64], i;
153 	unsigned long end;
154 	int sleep;
155 
156 	i = 0;
157 	memset(history, freq, sizeof(history));
158 	sleep = 20;
159 
160 	/* The PCU does not change instantly, but drifts towards the goal? */
161 	end = jiffies + msecs_to_jiffies(timeout_ms);
162 	do {
163 		u8 act;
164 
165 		act = read_cagf(rps);
166 		if (time_after(jiffies, end))
167 			return act;
168 
169 		/* Target acquired */
170 		if (act == freq)
171 			return act;
172 
173 		/* Any change within the last N samples? */
174 		if (!memchr_inv(history, act, sizeof(history)))
175 			return act;
176 
177 		history[i] = act;
178 		i = (i + 1) % ARRAY_SIZE(history);
179 
180 		usleep_range(sleep, 2 * sleep);
181 		sleep *= 2;
182 		if (sleep > timeout_ms * 20)
183 			sleep = timeout_ms * 20;
184 	} while (1);
185 }
186 
187 static u8 rps_set_check(struct intel_rps *rps, u8 freq)
188 {
189 	mutex_lock(&rps->lock);
190 	GEM_BUG_ON(!intel_rps_is_active(rps));
191 	if (wait_for(!intel_rps_set(rps, freq), 50)) {
192 		mutex_unlock(&rps->lock);
193 		return 0;
194 	}
195 	GEM_BUG_ON(rps->last_freq != freq);
196 	mutex_unlock(&rps->lock);
197 
198 	return wait_for_freq(rps, freq, 50);
199 }
200 
201 static void show_pstate_limits(struct intel_rps *rps)
202 {
203 	struct drm_i915_private *i915 = rps_to_i915(rps);
204 
205 	if (IS_BROXTON(i915)) {
206 		pr_info("P_STATE_CAP[%x]: 0x%08x\n",
207 			i915_mmio_reg_offset(BXT_RP_STATE_CAP),
208 			intel_uncore_read(rps_to_uncore(rps),
209 					  BXT_RP_STATE_CAP));
210 	} else if (GRAPHICS_VER(i915) == 9) {
211 		pr_info("P_STATE_LIMITS[%x]: 0x%08x\n",
212 			i915_mmio_reg_offset(GEN9_RP_STATE_LIMITS),
213 			intel_uncore_read(rps_to_uncore(rps),
214 					  GEN9_RP_STATE_LIMITS));
215 	}
216 }
217 
218 int live_rps_clock_interval(void *arg)
219 {
220 	struct intel_gt *gt = arg;
221 	struct intel_rps *rps = &gt->rps;
222 	void (*saved_work)(struct work_struct *wrk);
223 	struct intel_engine_cs *engine;
224 	enum intel_engine_id id;
225 	struct igt_spinner spin;
226 	int err = 0;
227 
228 	if (!intel_rps_is_enabled(rps) || GRAPHICS_VER(gt->i915) < 6)
229 		return 0;
230 
231 	if (igt_spinner_init(&spin, gt))
232 		return -ENOMEM;
233 
234 	intel_gt_pm_wait_for_idle(gt);
235 	saved_work = rps->work.func;
236 	rps->work.func = dummy_rps_work;
237 
238 	intel_gt_pm_get(gt);
239 	intel_rps_disable(&gt->rps);
240 
241 	intel_gt_check_clock_frequency(gt);
242 
243 	for_each_engine(engine, gt, id) {
244 		struct i915_request *rq;
245 		u32 cycles;
246 		u64 dt;
247 
248 		if (!intel_engine_can_store_dword(engine))
249 			continue;
250 
251 		st_engine_heartbeat_disable(engine);
252 
253 		rq = igt_spinner_create_request(&spin,
254 						engine->kernel_context,
255 						MI_NOOP);
256 		if (IS_ERR(rq)) {
257 			st_engine_heartbeat_enable(engine);
258 			err = PTR_ERR(rq);
259 			break;
260 		}
261 
262 		i915_request_add(rq);
263 
264 		if (!igt_wait_for_spinner(&spin, rq)) {
265 			pr_err("%s: RPS spinner did not start\n",
266 			       engine->name);
267 			igt_spinner_end(&spin);
268 			st_engine_heartbeat_enable(engine);
269 			intel_gt_set_wedged(engine->gt);
270 			err = -EIO;
271 			break;
272 		}
273 
274 		intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
275 
276 		intel_uncore_write_fw(gt->uncore, GEN6_RP_CUR_UP_EI, 0);
277 
278 		/* Set the evaluation interval to infinity! */
279 		intel_uncore_write_fw(gt->uncore,
280 				      GEN6_RP_UP_EI, 0xffffffff);
281 		intel_uncore_write_fw(gt->uncore,
282 				      GEN6_RP_UP_THRESHOLD, 0xffffffff);
283 
284 		intel_uncore_write_fw(gt->uncore, GEN6_RP_CONTROL,
285 				      GEN6_RP_ENABLE | GEN6_RP_UP_BUSY_AVG);
286 
287 		if (wait_for(intel_uncore_read_fw(gt->uncore,
288 						  GEN6_RP_CUR_UP_EI),
289 			     10)) {
290 			/* Just skip the test; assume lack of HW support */
291 			pr_notice("%s: rps evaluation interval not ticking\n",
292 				  engine->name);
293 			err = -ENODEV;
294 		} else {
295 			ktime_t dt_[5];
296 			u32 cycles_[5];
297 			int i;
298 
299 			for (i = 0; i < 5; i++) {
300 				preempt_disable();
301 
302 				dt_[i] = ktime_get();
303 				cycles_[i] = -intel_uncore_read_fw(gt->uncore, GEN6_RP_CUR_UP_EI);
304 
305 				udelay(1000);
306 
307 				dt_[i] = ktime_sub(ktime_get(), dt_[i]);
308 				cycles_[i] += intel_uncore_read_fw(gt->uncore, GEN6_RP_CUR_UP_EI);
309 
310 				preempt_enable();
311 			}
312 
313 			/* Use the median of both cycle/dt; close enough */
314 			sort(cycles_, 5, sizeof(*cycles_), cmp_u32, NULL);
315 			cycles = (cycles_[1] + 2 * cycles_[2] + cycles_[3]) / 4;
316 			sort(dt_, 5, sizeof(*dt_), cmp_u64, NULL);
317 			dt = div_u64(dt_[1] + 2 * dt_[2] + dt_[3], 4);
318 		}
319 
320 		intel_uncore_write_fw(gt->uncore, GEN6_RP_CONTROL, 0);
321 		intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
322 
323 		igt_spinner_end(&spin);
324 		st_engine_heartbeat_enable(engine);
325 
326 		if (err == 0) {
327 			u64 time = intel_gt_pm_interval_to_ns(gt, cycles);
328 			u32 expected =
329 				intel_gt_ns_to_pm_interval(gt, dt);
330 
331 			pr_info("%s: rps counted %d C0 cycles [%lldns] in %lldns [%d cycles], using GT clock frequency of %uKHz\n",
332 				engine->name, cycles, time, dt, expected,
333 				gt->clock_frequency / 1000);
334 
335 			if (10 * time < 8 * dt ||
336 			    8 * time > 10 * dt) {
337 				pr_err("%s: rps clock time does not match walltime!\n",
338 				       engine->name);
339 				err = -EINVAL;
340 			}
341 
342 			if (10 * expected < 8 * cycles ||
343 			    8 * expected > 10 * cycles) {
344 				pr_err("%s: walltime does not match rps clock ticks!\n",
345 				       engine->name);
346 				err = -EINVAL;
347 			}
348 		}
349 
350 		if (igt_flush_test(gt->i915))
351 			err = -EIO;
352 
353 		break; /* once is enough */
354 	}
355 
356 	intel_rps_enable(&gt->rps);
357 	intel_gt_pm_put(gt);
358 
359 	igt_spinner_fini(&spin);
360 
361 	intel_gt_pm_wait_for_idle(gt);
362 	rps->work.func = saved_work;
363 
364 	if (err == -ENODEV) /* skipped, don't report a fail */
365 		err = 0;
366 
367 	return err;
368 }
369 
370 int live_rps_control(void *arg)
371 {
372 	struct intel_gt *gt = arg;
373 	struct intel_rps *rps = &gt->rps;
374 	void (*saved_work)(struct work_struct *wrk);
375 	struct intel_engine_cs *engine;
376 	enum intel_engine_id id;
377 	struct igt_spinner spin;
378 	int err = 0;
379 
380 	/*
381 	 * Check that the actual frequency matches our requested frequency,
382 	 * to verify our control mechanism. We have to be careful that the
383 	 * PCU may throttle the GPU in which case the actual frequency used
384 	 * will be lowered than requested.
385 	 */
386 
387 	if (!intel_rps_is_enabled(rps))
388 		return 0;
389 
390 	if (IS_CHERRYVIEW(gt->i915)) /* XXX fragile PCU */
391 		return 0;
392 
393 	if (igt_spinner_init(&spin, gt))
394 		return -ENOMEM;
395 
396 	intel_gt_pm_wait_for_idle(gt);
397 	saved_work = rps->work.func;
398 	rps->work.func = dummy_rps_work;
399 
400 	intel_gt_pm_get(gt);
401 	for_each_engine(engine, gt, id) {
402 		struct i915_request *rq;
403 		ktime_t min_dt, max_dt;
404 		int f, limit;
405 		int min, max;
406 
407 		if (!intel_engine_can_store_dword(engine))
408 			continue;
409 
410 		st_engine_heartbeat_disable(engine);
411 
412 		rq = igt_spinner_create_request(&spin,
413 						engine->kernel_context,
414 						MI_NOOP);
415 		if (IS_ERR(rq)) {
416 			err = PTR_ERR(rq);
417 			break;
418 		}
419 
420 		i915_request_add(rq);
421 
422 		if (!igt_wait_for_spinner(&spin, rq)) {
423 			pr_err("%s: RPS spinner did not start\n",
424 			       engine->name);
425 			igt_spinner_end(&spin);
426 			st_engine_heartbeat_enable(engine);
427 			intel_gt_set_wedged(engine->gt);
428 			err = -EIO;
429 			break;
430 		}
431 
432 		if (rps_set_check(rps, rps->min_freq) != rps->min_freq) {
433 			pr_err("%s: could not set minimum frequency [%x], only %x!\n",
434 			       engine->name, rps->min_freq, read_cagf(rps));
435 			igt_spinner_end(&spin);
436 			st_engine_heartbeat_enable(engine);
437 			show_pstate_limits(rps);
438 			err = -EINVAL;
439 			break;
440 		}
441 
442 		for (f = rps->min_freq + 1; f < rps->max_freq; f++) {
443 			if (rps_set_check(rps, f) < f)
444 				break;
445 		}
446 
447 		limit = rps_set_check(rps, f);
448 
449 		if (rps_set_check(rps, rps->min_freq) != rps->min_freq) {
450 			pr_err("%s: could not restore minimum frequency [%x], only %x!\n",
451 			       engine->name, rps->min_freq, read_cagf(rps));
452 			igt_spinner_end(&spin);
453 			st_engine_heartbeat_enable(engine);
454 			show_pstate_limits(rps);
455 			err = -EINVAL;
456 			break;
457 		}
458 
459 		max_dt = ktime_get();
460 		max = rps_set_check(rps, limit);
461 		max_dt = ktime_sub(ktime_get(), max_dt);
462 
463 		min_dt = ktime_get();
464 		min = rps_set_check(rps, rps->min_freq);
465 		min_dt = ktime_sub(ktime_get(), min_dt);
466 
467 		igt_spinner_end(&spin);
468 		st_engine_heartbeat_enable(engine);
469 
470 		pr_info("%s: range:[%x:%uMHz, %x:%uMHz] limit:[%x:%uMHz], %x:%x response %lluns:%lluns\n",
471 			engine->name,
472 			rps->min_freq, intel_gpu_freq(rps, rps->min_freq),
473 			rps->max_freq, intel_gpu_freq(rps, rps->max_freq),
474 			limit, intel_gpu_freq(rps, limit),
475 			min, max, ktime_to_ns(min_dt), ktime_to_ns(max_dt));
476 
477 		if (limit == rps->min_freq) {
478 			pr_err("%s: GPU throttled to minimum!\n",
479 			       engine->name);
480 			show_pstate_limits(rps);
481 			err = -ENODEV;
482 			break;
483 		}
484 
485 		if (igt_flush_test(gt->i915)) {
486 			err = -EIO;
487 			break;
488 		}
489 	}
490 	intel_gt_pm_put(gt);
491 
492 	igt_spinner_fini(&spin);
493 
494 	intel_gt_pm_wait_for_idle(gt);
495 	rps->work.func = saved_work;
496 
497 	return err;
498 }
499 
500 static void show_pcu_config(struct intel_rps *rps)
501 {
502 	struct drm_i915_private *i915 = rps_to_i915(rps);
503 	unsigned int max_gpu_freq, min_gpu_freq;
504 	intel_wakeref_t wakeref;
505 	int gpu_freq;
506 
507 	if (!HAS_LLC(i915))
508 		return;
509 
510 	min_gpu_freq = rps->min_freq;
511 	max_gpu_freq = rps->max_freq;
512 	if (GRAPHICS_VER(i915) >= 9) {
513 		/* Convert GT frequency to 50 HZ units */
514 		min_gpu_freq /= GEN9_FREQ_SCALER;
515 		max_gpu_freq /= GEN9_FREQ_SCALER;
516 	}
517 
518 	wakeref = intel_runtime_pm_get(rps_to_uncore(rps)->rpm);
519 
520 	pr_info("%5s  %5s  %5s\n", "GPU", "eCPU", "eRing");
521 	for (gpu_freq = min_gpu_freq; gpu_freq <= max_gpu_freq; gpu_freq++) {
522 		int ia_freq = gpu_freq;
523 
524 		snb_pcode_read(i915, GEN6_PCODE_READ_MIN_FREQ_TABLE,
525 			       &ia_freq, NULL);
526 
527 		pr_info("%5d  %5d  %5d\n",
528 			gpu_freq * 50,
529 			((ia_freq >> 0) & 0xff) * 100,
530 			((ia_freq >> 8) & 0xff) * 100);
531 	}
532 
533 	intel_runtime_pm_put(rps_to_uncore(rps)->rpm, wakeref);
534 }
535 
536 static u64 __measure_frequency(u32 *cntr, int duration_ms)
537 {
538 	u64 dc, dt;
539 
540 	dt = ktime_get();
541 	dc = READ_ONCE(*cntr);
542 	usleep_range(1000 * duration_ms, 2000 * duration_ms);
543 	dc = READ_ONCE(*cntr) - dc;
544 	dt = ktime_get() - dt;
545 
546 	return div64_u64(1000 * 1000 * dc, dt);
547 }
548 
549 static u64 measure_frequency_at(struct intel_rps *rps, u32 *cntr, int *freq)
550 {
551 	u64 x[5];
552 	int i;
553 
554 	*freq = rps_set_check(rps, *freq);
555 	for (i = 0; i < 5; i++)
556 		x[i] = __measure_frequency(cntr, 2);
557 	*freq = (*freq + read_cagf(rps)) / 2;
558 
559 	/* A simple triangle filter for better result stability */
560 	sort(x, 5, sizeof(*x), cmp_u64, NULL);
561 	return div_u64(x[1] + 2 * x[2] + x[3], 4);
562 }
563 
564 static u64 __measure_cs_frequency(struct intel_engine_cs *engine,
565 				  int duration_ms)
566 {
567 	u64 dc, dt;
568 
569 	dt = ktime_get();
570 	dc = intel_uncore_read_fw(engine->uncore, CS_GPR(0));
571 	usleep_range(1000 * duration_ms, 2000 * duration_ms);
572 	dc = intel_uncore_read_fw(engine->uncore, CS_GPR(0)) - dc;
573 	dt = ktime_get() - dt;
574 
575 	return div64_u64(1000 * 1000 * dc, dt);
576 }
577 
578 static u64 measure_cs_frequency_at(struct intel_rps *rps,
579 				   struct intel_engine_cs *engine,
580 				   int *freq)
581 {
582 	u64 x[5];
583 	int i;
584 
585 	*freq = rps_set_check(rps, *freq);
586 	for (i = 0; i < 5; i++)
587 		x[i] = __measure_cs_frequency(engine, 2);
588 	*freq = (*freq + read_cagf(rps)) / 2;
589 
590 	/* A simple triangle filter for better result stability */
591 	sort(x, 5, sizeof(*x), cmp_u64, NULL);
592 	return div_u64(x[1] + 2 * x[2] + x[3], 4);
593 }
594 
595 static bool scaled_within(u64 x, u64 y, u32 f_n, u32 f_d)
596 {
597 	return f_d * x > f_n * y && f_n * x < f_d * y;
598 }
599 
600 int live_rps_frequency_cs(void *arg)
601 {
602 	void (*saved_work)(struct work_struct *wrk);
603 	struct intel_gt *gt = arg;
604 	struct intel_rps *rps = &gt->rps;
605 	struct intel_engine_cs *engine;
606 	struct pm_qos_request qos;
607 	enum intel_engine_id id;
608 	int err = 0;
609 
610 	/*
611 	 * The premise is that the GPU does change frequency at our behest.
612 	 * Let's check there is a correspondence between the requested
613 	 * frequency, the actual frequency, and the observed clock rate.
614 	 */
615 
616 	if (!intel_rps_is_enabled(rps))
617 		return 0;
618 
619 	if (GRAPHICS_VER(gt->i915) < 8) /* for CS simplicity */
620 		return 0;
621 
622 	if (CPU_LATENCY >= 0)
623 		cpu_latency_qos_add_request(&qos, CPU_LATENCY);
624 
625 	intel_gt_pm_wait_for_idle(gt);
626 	saved_work = rps->work.func;
627 	rps->work.func = dummy_rps_work;
628 
629 	for_each_engine(engine, gt, id) {
630 		struct i915_request *rq;
631 		struct i915_vma *vma;
632 		u32 *cancel, *cntr;
633 		struct {
634 			u64 count;
635 			int freq;
636 		} min, max;
637 
638 		st_engine_heartbeat_disable(engine);
639 
640 		vma = create_spin_counter(engine,
641 					  engine->kernel_context->vm, false,
642 					  &cancel, &cntr);
643 		if (IS_ERR(vma)) {
644 			err = PTR_ERR(vma);
645 			st_engine_heartbeat_enable(engine);
646 			break;
647 		}
648 
649 		rq = intel_engine_create_kernel_request(engine);
650 		if (IS_ERR(rq)) {
651 			err = PTR_ERR(rq);
652 			goto err_vma;
653 		}
654 
655 		err = i915_request_await_object(rq, vma->obj, false);
656 		if (!err)
657 			err = i915_vma_move_to_active(vma, rq, 0);
658 		if (!err)
659 			err = rq->engine->emit_bb_start(rq,
660 							vma->node.start,
661 							PAGE_SIZE, 0);
662 		i915_request_add(rq);
663 		if (err)
664 			goto err_vma;
665 
666 		if (wait_for(intel_uncore_read(engine->uncore, CS_GPR(0)),
667 			     10)) {
668 			pr_err("%s: timed loop did not start\n",
669 			       engine->name);
670 			goto err_vma;
671 		}
672 
673 		min.freq = rps->min_freq;
674 		min.count = measure_cs_frequency_at(rps, engine, &min.freq);
675 
676 		max.freq = rps->max_freq;
677 		max.count = measure_cs_frequency_at(rps, engine, &max.freq);
678 
679 		pr_info("%s: min:%lluKHz @ %uMHz, max:%lluKHz @ %uMHz [%d%%]\n",
680 			engine->name,
681 			min.count, intel_gpu_freq(rps, min.freq),
682 			max.count, intel_gpu_freq(rps, max.freq),
683 			(int)DIV64_U64_ROUND_CLOSEST(100 * min.freq * max.count,
684 						     max.freq * min.count));
685 
686 		if (!scaled_within(max.freq * min.count,
687 				   min.freq * max.count,
688 				   2, 3)) {
689 			int f;
690 
691 			pr_err("%s: CS did not scale with frequency! scaled min:%llu, max:%llu\n",
692 			       engine->name,
693 			       max.freq * min.count,
694 			       min.freq * max.count);
695 			show_pcu_config(rps);
696 
697 			for (f = min.freq + 1; f <= rps->max_freq; f++) {
698 				int act = f;
699 				u64 count;
700 
701 				count = measure_cs_frequency_at(rps, engine, &act);
702 				if (act < f)
703 					break;
704 
705 				pr_info("%s: %x:%uMHz: %lluKHz [%d%%]\n",
706 					engine->name,
707 					act, intel_gpu_freq(rps, act), count,
708 					(int)DIV64_U64_ROUND_CLOSEST(100 * min.freq * count,
709 								     act * min.count));
710 
711 				f = act; /* may skip ahead [pcu granularity] */
712 			}
713 
714 			err = -EINTR; /* ignore error, continue on with test */
715 		}
716 
717 err_vma:
718 		*cancel = MI_BATCH_BUFFER_END;
719 		i915_gem_object_flush_map(vma->obj);
720 		i915_gem_object_unpin_map(vma->obj);
721 		i915_vma_unpin(vma);
722 		i915_vma_unlock(vma);
723 		i915_vma_put(vma);
724 
725 		st_engine_heartbeat_enable(engine);
726 		if (igt_flush_test(gt->i915))
727 			err = -EIO;
728 		if (err)
729 			break;
730 	}
731 
732 	intel_gt_pm_wait_for_idle(gt);
733 	rps->work.func = saved_work;
734 
735 	if (CPU_LATENCY >= 0)
736 		cpu_latency_qos_remove_request(&qos);
737 
738 	return err;
739 }
740 
741 int live_rps_frequency_srm(void *arg)
742 {
743 	void (*saved_work)(struct work_struct *wrk);
744 	struct intel_gt *gt = arg;
745 	struct intel_rps *rps = &gt->rps;
746 	struct intel_engine_cs *engine;
747 	struct pm_qos_request qos;
748 	enum intel_engine_id id;
749 	int err = 0;
750 
751 	/*
752 	 * The premise is that the GPU does change frequency at our behest.
753 	 * Let's check there is a correspondence between the requested
754 	 * frequency, the actual frequency, and the observed clock rate.
755 	 */
756 
757 	if (!intel_rps_is_enabled(rps))
758 		return 0;
759 
760 	if (GRAPHICS_VER(gt->i915) < 8) /* for CS simplicity */
761 		return 0;
762 
763 	if (CPU_LATENCY >= 0)
764 		cpu_latency_qos_add_request(&qos, CPU_LATENCY);
765 
766 	intel_gt_pm_wait_for_idle(gt);
767 	saved_work = rps->work.func;
768 	rps->work.func = dummy_rps_work;
769 
770 	for_each_engine(engine, gt, id) {
771 		struct i915_request *rq;
772 		struct i915_vma *vma;
773 		u32 *cancel, *cntr;
774 		struct {
775 			u64 count;
776 			int freq;
777 		} min, max;
778 
779 		st_engine_heartbeat_disable(engine);
780 
781 		vma = create_spin_counter(engine,
782 					  engine->kernel_context->vm, true,
783 					  &cancel, &cntr);
784 		if (IS_ERR(vma)) {
785 			err = PTR_ERR(vma);
786 			st_engine_heartbeat_enable(engine);
787 			break;
788 		}
789 
790 		rq = intel_engine_create_kernel_request(engine);
791 		if (IS_ERR(rq)) {
792 			err = PTR_ERR(rq);
793 			goto err_vma;
794 		}
795 
796 		err = i915_request_await_object(rq, vma->obj, false);
797 		if (!err)
798 			err = i915_vma_move_to_active(vma, rq, 0);
799 		if (!err)
800 			err = rq->engine->emit_bb_start(rq,
801 							vma->node.start,
802 							PAGE_SIZE, 0);
803 		i915_request_add(rq);
804 		if (err)
805 			goto err_vma;
806 
807 		if (wait_for(READ_ONCE(*cntr), 10)) {
808 			pr_err("%s: timed loop did not start\n",
809 			       engine->name);
810 			goto err_vma;
811 		}
812 
813 		min.freq = rps->min_freq;
814 		min.count = measure_frequency_at(rps, cntr, &min.freq);
815 
816 		max.freq = rps->max_freq;
817 		max.count = measure_frequency_at(rps, cntr, &max.freq);
818 
819 		pr_info("%s: min:%lluKHz @ %uMHz, max:%lluKHz @ %uMHz [%d%%]\n",
820 			engine->name,
821 			min.count, intel_gpu_freq(rps, min.freq),
822 			max.count, intel_gpu_freq(rps, max.freq),
823 			(int)DIV64_U64_ROUND_CLOSEST(100 * min.freq * max.count,
824 						     max.freq * min.count));
825 
826 		if (!scaled_within(max.freq * min.count,
827 				   min.freq * max.count,
828 				   1, 2)) {
829 			int f;
830 
831 			pr_err("%s: CS did not scale with frequency! scaled min:%llu, max:%llu\n",
832 			       engine->name,
833 			       max.freq * min.count,
834 			       min.freq * max.count);
835 			show_pcu_config(rps);
836 
837 			for (f = min.freq + 1; f <= rps->max_freq; f++) {
838 				int act = f;
839 				u64 count;
840 
841 				count = measure_frequency_at(rps, cntr, &act);
842 				if (act < f)
843 					break;
844 
845 				pr_info("%s: %x:%uMHz: %lluKHz [%d%%]\n",
846 					engine->name,
847 					act, intel_gpu_freq(rps, act), count,
848 					(int)DIV64_U64_ROUND_CLOSEST(100 * min.freq * count,
849 								     act * min.count));
850 
851 				f = act; /* may skip ahead [pcu granularity] */
852 			}
853 
854 			err = -EINTR; /* ignore error, continue on with test */
855 		}
856 
857 err_vma:
858 		*cancel = MI_BATCH_BUFFER_END;
859 		i915_gem_object_flush_map(vma->obj);
860 		i915_gem_object_unpin_map(vma->obj);
861 		i915_vma_unpin(vma);
862 		i915_vma_unlock(vma);
863 		i915_vma_put(vma);
864 
865 		st_engine_heartbeat_enable(engine);
866 		if (igt_flush_test(gt->i915))
867 			err = -EIO;
868 		if (err)
869 			break;
870 	}
871 
872 	intel_gt_pm_wait_for_idle(gt);
873 	rps->work.func = saved_work;
874 
875 	if (CPU_LATENCY >= 0)
876 		cpu_latency_qos_remove_request(&qos);
877 
878 	return err;
879 }
880 
881 static void sleep_for_ei(struct intel_rps *rps, int timeout_us)
882 {
883 	/* Flush any previous EI */
884 	usleep_range(timeout_us, 2 * timeout_us);
885 
886 	/* Reset the interrupt status */
887 	rps_disable_interrupts(rps);
888 	GEM_BUG_ON(rps->pm_iir);
889 	rps_enable_interrupts(rps);
890 
891 	/* And then wait for the timeout, for real this time */
892 	usleep_range(2 * timeout_us, 3 * timeout_us);
893 }
894 
895 static int __rps_up_interrupt(struct intel_rps *rps,
896 			      struct intel_engine_cs *engine,
897 			      struct igt_spinner *spin)
898 {
899 	struct intel_uncore *uncore = engine->uncore;
900 	struct i915_request *rq;
901 	u32 timeout;
902 
903 	if (!intel_engine_can_store_dword(engine))
904 		return 0;
905 
906 	rps_set_check(rps, rps->min_freq);
907 
908 	rq = igt_spinner_create_request(spin, engine->kernel_context, MI_NOOP);
909 	if (IS_ERR(rq))
910 		return PTR_ERR(rq);
911 
912 	i915_request_get(rq);
913 	i915_request_add(rq);
914 
915 	if (!igt_wait_for_spinner(spin, rq)) {
916 		pr_err("%s: RPS spinner did not start\n",
917 		       engine->name);
918 		i915_request_put(rq);
919 		intel_gt_set_wedged(engine->gt);
920 		return -EIO;
921 	}
922 
923 	if (!intel_rps_is_active(rps)) {
924 		pr_err("%s: RPS not enabled on starting spinner\n",
925 		       engine->name);
926 		igt_spinner_end(spin);
927 		i915_request_put(rq);
928 		return -EINVAL;
929 	}
930 
931 	if (!(rps->pm_events & GEN6_PM_RP_UP_THRESHOLD)) {
932 		pr_err("%s: RPS did not register UP interrupt\n",
933 		       engine->name);
934 		i915_request_put(rq);
935 		return -EINVAL;
936 	}
937 
938 	if (rps->last_freq != rps->min_freq) {
939 		pr_err("%s: RPS did not program min frequency\n",
940 		       engine->name);
941 		i915_request_put(rq);
942 		return -EINVAL;
943 	}
944 
945 	timeout = intel_uncore_read(uncore, GEN6_RP_UP_EI);
946 	timeout = intel_gt_pm_interval_to_ns(engine->gt, timeout);
947 	timeout = DIV_ROUND_UP(timeout, 1000);
948 
949 	sleep_for_ei(rps, timeout);
950 	GEM_BUG_ON(i915_request_completed(rq));
951 
952 	igt_spinner_end(spin);
953 	i915_request_put(rq);
954 
955 	if (rps->cur_freq != rps->min_freq) {
956 		pr_err("%s: Frequency unexpectedly changed [up], now %d!\n",
957 		       engine->name, intel_rps_read_actual_frequency(rps));
958 		return -EINVAL;
959 	}
960 
961 	if (!(rps->pm_iir & GEN6_PM_RP_UP_THRESHOLD)) {
962 		pr_err("%s: UP interrupt not recorded for spinner, pm_iir:%x, prev_up:%x, up_threshold:%x, up_ei:%x\n",
963 		       engine->name, rps->pm_iir,
964 		       intel_uncore_read(uncore, GEN6_RP_PREV_UP),
965 		       intel_uncore_read(uncore, GEN6_RP_UP_THRESHOLD),
966 		       intel_uncore_read(uncore, GEN6_RP_UP_EI));
967 		return -EINVAL;
968 	}
969 
970 	return 0;
971 }
972 
973 static int __rps_down_interrupt(struct intel_rps *rps,
974 				struct intel_engine_cs *engine)
975 {
976 	struct intel_uncore *uncore = engine->uncore;
977 	u32 timeout;
978 
979 	rps_set_check(rps, rps->max_freq);
980 
981 	if (!(rps->pm_events & GEN6_PM_RP_DOWN_THRESHOLD)) {
982 		pr_err("%s: RPS did not register DOWN interrupt\n",
983 		       engine->name);
984 		return -EINVAL;
985 	}
986 
987 	if (rps->last_freq != rps->max_freq) {
988 		pr_err("%s: RPS did not program max frequency\n",
989 		       engine->name);
990 		return -EINVAL;
991 	}
992 
993 	timeout = intel_uncore_read(uncore, GEN6_RP_DOWN_EI);
994 	timeout = intel_gt_pm_interval_to_ns(engine->gt, timeout);
995 	timeout = DIV_ROUND_UP(timeout, 1000);
996 
997 	sleep_for_ei(rps, timeout);
998 
999 	if (rps->cur_freq != rps->max_freq) {
1000 		pr_err("%s: Frequency unexpectedly changed [down], now %d!\n",
1001 		       engine->name,
1002 		       intel_rps_read_actual_frequency(rps));
1003 		return -EINVAL;
1004 	}
1005 
1006 	if (!(rps->pm_iir & (GEN6_PM_RP_DOWN_THRESHOLD | GEN6_PM_RP_DOWN_TIMEOUT))) {
1007 		pr_err("%s: DOWN interrupt not recorded for idle, pm_iir:%x, prev_down:%x, down_threshold:%x, down_ei:%x [prev_up:%x, up_threshold:%x, up_ei:%x]\n",
1008 		       engine->name, rps->pm_iir,
1009 		       intel_uncore_read(uncore, GEN6_RP_PREV_DOWN),
1010 		       intel_uncore_read(uncore, GEN6_RP_DOWN_THRESHOLD),
1011 		       intel_uncore_read(uncore, GEN6_RP_DOWN_EI),
1012 		       intel_uncore_read(uncore, GEN6_RP_PREV_UP),
1013 		       intel_uncore_read(uncore, GEN6_RP_UP_THRESHOLD),
1014 		       intel_uncore_read(uncore, GEN6_RP_UP_EI));
1015 		return -EINVAL;
1016 	}
1017 
1018 	return 0;
1019 }
1020 
1021 int live_rps_interrupt(void *arg)
1022 {
1023 	struct intel_gt *gt = arg;
1024 	struct intel_rps *rps = &gt->rps;
1025 	void (*saved_work)(struct work_struct *wrk);
1026 	struct intel_engine_cs *engine;
1027 	enum intel_engine_id id;
1028 	struct igt_spinner spin;
1029 	u32 pm_events;
1030 	int err = 0;
1031 
1032 	/*
1033 	 * First, let's check whether or not we are receiving interrupts.
1034 	 */
1035 
1036 	if (!intel_rps_has_interrupts(rps) || GRAPHICS_VER(gt->i915) < 6)
1037 		return 0;
1038 
1039 	intel_gt_pm_get(gt);
1040 	pm_events = rps->pm_events;
1041 	intel_gt_pm_put(gt);
1042 	if (!pm_events) {
1043 		pr_err("No RPS PM events registered, but RPS is enabled?\n");
1044 		return -ENODEV;
1045 	}
1046 
1047 	if (igt_spinner_init(&spin, gt))
1048 		return -ENOMEM;
1049 
1050 	intel_gt_pm_wait_for_idle(gt);
1051 	saved_work = rps->work.func;
1052 	rps->work.func = dummy_rps_work;
1053 
1054 	for_each_engine(engine, gt, id) {
1055 		/* Keep the engine busy with a spinner; expect an UP! */
1056 		if (pm_events & GEN6_PM_RP_UP_THRESHOLD) {
1057 			intel_gt_pm_wait_for_idle(engine->gt);
1058 			GEM_BUG_ON(intel_rps_is_active(rps));
1059 
1060 			st_engine_heartbeat_disable(engine);
1061 
1062 			err = __rps_up_interrupt(rps, engine, &spin);
1063 
1064 			st_engine_heartbeat_enable(engine);
1065 			if (err)
1066 				goto out;
1067 
1068 			intel_gt_pm_wait_for_idle(engine->gt);
1069 		}
1070 
1071 		/* Keep the engine awake but idle and check for DOWN */
1072 		if (pm_events & GEN6_PM_RP_DOWN_THRESHOLD) {
1073 			st_engine_heartbeat_disable(engine);
1074 			intel_rc6_disable(&gt->rc6);
1075 
1076 			err = __rps_down_interrupt(rps, engine);
1077 
1078 			intel_rc6_enable(&gt->rc6);
1079 			st_engine_heartbeat_enable(engine);
1080 			if (err)
1081 				goto out;
1082 		}
1083 	}
1084 
1085 out:
1086 	if (igt_flush_test(gt->i915))
1087 		err = -EIO;
1088 
1089 	igt_spinner_fini(&spin);
1090 
1091 	intel_gt_pm_wait_for_idle(gt);
1092 	rps->work.func = saved_work;
1093 
1094 	return err;
1095 }
1096 
1097 static u64 __measure_power(int duration_ms)
1098 {
1099 	u64 dE, dt;
1100 
1101 	dt = ktime_get();
1102 	dE = librapl_energy_uJ();
1103 	usleep_range(1000 * duration_ms, 2000 * duration_ms);
1104 	dE = librapl_energy_uJ() - dE;
1105 	dt = ktime_get() - dt;
1106 
1107 	return div64_u64(1000 * 1000 * dE, dt);
1108 }
1109 
1110 static u64 measure_power_at(struct intel_rps *rps, int *freq)
1111 {
1112 	u64 x[5];
1113 	int i;
1114 
1115 	*freq = rps_set_check(rps, *freq);
1116 	for (i = 0; i < 5; i++)
1117 		x[i] = __measure_power(5);
1118 	*freq = (*freq + read_cagf(rps)) / 2;
1119 
1120 	/* A simple triangle filter for better result stability */
1121 	sort(x, 5, sizeof(*x), cmp_u64, NULL);
1122 	return div_u64(x[1] + 2 * x[2] + x[3], 4);
1123 }
1124 
1125 int live_rps_power(void *arg)
1126 {
1127 	struct intel_gt *gt = arg;
1128 	struct intel_rps *rps = &gt->rps;
1129 	void (*saved_work)(struct work_struct *wrk);
1130 	struct intel_engine_cs *engine;
1131 	enum intel_engine_id id;
1132 	struct igt_spinner spin;
1133 	int err = 0;
1134 
1135 	/*
1136 	 * Our fundamental assumption is that running at lower frequency
1137 	 * actually saves power. Let's see if our RAPL measurement support
1138 	 * that theory.
1139 	 */
1140 
1141 	if (!intel_rps_is_enabled(rps) || GRAPHICS_VER(gt->i915) < 6)
1142 		return 0;
1143 
1144 	if (!librapl_supported(gt->i915))
1145 		return 0;
1146 
1147 	if (igt_spinner_init(&spin, gt))
1148 		return -ENOMEM;
1149 
1150 	intel_gt_pm_wait_for_idle(gt);
1151 	saved_work = rps->work.func;
1152 	rps->work.func = dummy_rps_work;
1153 
1154 	for_each_engine(engine, gt, id) {
1155 		struct i915_request *rq;
1156 		struct {
1157 			u64 power;
1158 			int freq;
1159 		} min, max;
1160 
1161 		if (!intel_engine_can_store_dword(engine))
1162 			continue;
1163 
1164 		st_engine_heartbeat_disable(engine);
1165 
1166 		rq = igt_spinner_create_request(&spin,
1167 						engine->kernel_context,
1168 						MI_NOOP);
1169 		if (IS_ERR(rq)) {
1170 			st_engine_heartbeat_enable(engine);
1171 			err = PTR_ERR(rq);
1172 			break;
1173 		}
1174 
1175 		i915_request_add(rq);
1176 
1177 		if (!igt_wait_for_spinner(&spin, rq)) {
1178 			pr_err("%s: RPS spinner did not start\n",
1179 			       engine->name);
1180 			igt_spinner_end(&spin);
1181 			st_engine_heartbeat_enable(engine);
1182 			intel_gt_set_wedged(engine->gt);
1183 			err = -EIO;
1184 			break;
1185 		}
1186 
1187 		max.freq = rps->max_freq;
1188 		max.power = measure_power_at(rps, &max.freq);
1189 
1190 		min.freq = rps->min_freq;
1191 		min.power = measure_power_at(rps, &min.freq);
1192 
1193 		igt_spinner_end(&spin);
1194 		st_engine_heartbeat_enable(engine);
1195 
1196 		pr_info("%s: min:%llumW @ %uMHz, max:%llumW @ %uMHz\n",
1197 			engine->name,
1198 			min.power, intel_gpu_freq(rps, min.freq),
1199 			max.power, intel_gpu_freq(rps, max.freq));
1200 
1201 		if (10 * min.freq >= 9 * max.freq) {
1202 			pr_notice("Could not control frequency, ran at [%d:%uMHz, %d:%uMhz]\n",
1203 				  min.freq, intel_gpu_freq(rps, min.freq),
1204 				  max.freq, intel_gpu_freq(rps, max.freq));
1205 			continue;
1206 		}
1207 
1208 		if (11 * min.power > 10 * max.power) {
1209 			pr_err("%s: did not conserve power when setting lower frequency!\n",
1210 			       engine->name);
1211 			err = -EINVAL;
1212 			break;
1213 		}
1214 
1215 		if (igt_flush_test(gt->i915)) {
1216 			err = -EIO;
1217 			break;
1218 		}
1219 	}
1220 
1221 	igt_spinner_fini(&spin);
1222 
1223 	intel_gt_pm_wait_for_idle(gt);
1224 	rps->work.func = saved_work;
1225 
1226 	return err;
1227 }
1228 
1229 int live_rps_dynamic(void *arg)
1230 {
1231 	struct intel_gt *gt = arg;
1232 	struct intel_rps *rps = &gt->rps;
1233 	struct intel_engine_cs *engine;
1234 	enum intel_engine_id id;
1235 	struct igt_spinner spin;
1236 	int err = 0;
1237 
1238 	/*
1239 	 * We've looked at the bascs, and have established that we
1240 	 * can change the clock frequency and that the HW will generate
1241 	 * interrupts based on load. Now we check how we integrate those
1242 	 * moving parts into dynamic reclocking based on load.
1243 	 */
1244 
1245 	if (!intel_rps_is_enabled(rps) || GRAPHICS_VER(gt->i915) < 6)
1246 		return 0;
1247 
1248 	if (igt_spinner_init(&spin, gt))
1249 		return -ENOMEM;
1250 
1251 	if (intel_rps_has_interrupts(rps))
1252 		pr_info("RPS has interrupt support\n");
1253 	if (intel_rps_uses_timer(rps))
1254 		pr_info("RPS has timer support\n");
1255 
1256 	for_each_engine(engine, gt, id) {
1257 		struct i915_request *rq;
1258 		struct {
1259 			ktime_t dt;
1260 			u8 freq;
1261 		} min, max;
1262 
1263 		if (!intel_engine_can_store_dword(engine))
1264 			continue;
1265 
1266 		intel_gt_pm_wait_for_idle(gt);
1267 		GEM_BUG_ON(intel_rps_is_active(rps));
1268 		rps->cur_freq = rps->min_freq;
1269 
1270 		intel_engine_pm_get(engine);
1271 		intel_rc6_disable(&gt->rc6);
1272 		GEM_BUG_ON(rps->last_freq != rps->min_freq);
1273 
1274 		rq = igt_spinner_create_request(&spin,
1275 						engine->kernel_context,
1276 						MI_NOOP);
1277 		if (IS_ERR(rq)) {
1278 			err = PTR_ERR(rq);
1279 			goto err;
1280 		}
1281 
1282 		i915_request_add(rq);
1283 
1284 		max.dt = ktime_get();
1285 		max.freq = wait_for_freq(rps, rps->max_freq, 500);
1286 		max.dt = ktime_sub(ktime_get(), max.dt);
1287 
1288 		igt_spinner_end(&spin);
1289 
1290 		min.dt = ktime_get();
1291 		min.freq = wait_for_freq(rps, rps->min_freq, 2000);
1292 		min.dt = ktime_sub(ktime_get(), min.dt);
1293 
1294 		pr_info("%s: dynamically reclocked to %u:%uMHz while busy in %lluns, and %u:%uMHz while idle in %lluns\n",
1295 			engine->name,
1296 			max.freq, intel_gpu_freq(rps, max.freq),
1297 			ktime_to_ns(max.dt),
1298 			min.freq, intel_gpu_freq(rps, min.freq),
1299 			ktime_to_ns(min.dt));
1300 		if (min.freq >= max.freq) {
1301 			pr_err("%s: dynamic reclocking of spinner failed\n!",
1302 			       engine->name);
1303 			err = -EINVAL;
1304 		}
1305 
1306 err:
1307 		intel_rc6_enable(&gt->rc6);
1308 		intel_engine_pm_put(engine);
1309 
1310 		if (igt_flush_test(gt->i915))
1311 			err = -EIO;
1312 		if (err)
1313 			break;
1314 	}
1315 
1316 	igt_spinner_fini(&spin);
1317 
1318 	return err;
1319 }
1320