xref: /linux/drivers/gpu/drm/i915/gt/selftest_rps.c (revision 8c994eff8fcfe8ecb1f1dbebed25b4d7bb75be12)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2020 Intel Corporation
4  */
5 
6 #include <linux/pm_qos.h>
7 #include <linux/sort.h>
8 
9 #include "gem/i915_gem_internal.h"
10 
11 #include "i915_reg.h"
12 #include "intel_engine_heartbeat.h"
13 #include "intel_engine_pm.h"
14 #include "intel_engine_regs.h"
15 #include "intel_gpu_commands.h"
16 #include "intel_gt_clock_utils.h"
17 #include "intel_gt_pm.h"
18 #include "intel_rc6.h"
19 #include "selftest_engine_heartbeat.h"
20 #include "selftest_rps.h"
21 #include "selftests/igt_flush_test.h"
22 #include "selftests/igt_spinner.h"
23 #include "selftests/librapl.h"
24 
/* Try to isolate the impact of cstates from determining frequency response */
26 #define CPU_LATENCY 0 /* -1 to disable pm_qos, 0 to disable cstates */
27 
/*
 * No-op work handler. The live tests in this file install it as
 * rps->work.func for their duration and put the saved handler back
 * before returning.
 */
static void dummy_rps_work(struct work_struct *wrk)
{
	/* Deliberately does nothing */
}
31 
/*
 * sort() comparator ordering two u64 values ascending.
 *
 * Returns -1, 0 or 1 when *A is less than, equal to or greater than *B.
 */
static int cmp_u64(const void *A, const void *B)
{
	const u64 lhs = *(const u64 *)A;
	const u64 rhs = *(const u64 *)B;

	/* Compare rather than subtract: a 64-bit difference won't fit an int */
	return (lhs > rhs) - (lhs < rhs);
}
43 
/*
 * sort() comparator ordering two u32 values ascending.
 *
 * Returns -1, 0 or 1 when *A is less than, equal to or greater than *B.
 */
static int cmp_u32(const void *A, const void *B)
{
	const u32 lhs = *(const u32 *)A;
	const u32 rhs = *(const u32 *)B;

	return (lhs > rhs) - (lhs < rhs);
}
55 
56 static struct i915_vma *
57 create_spin_counter(struct intel_engine_cs *engine,
58 		    struct i915_address_space *vm,
59 		    bool srm,
60 		    u32 **cancel,
61 		    u32 **counter)
62 {
63 	enum {
64 		COUNT,
65 		INC,
66 		__NGPR__,
67 	};
68 #define CS_GPR(x) GEN8_RING_CS_GPR(engine->mmio_base, x)
69 	struct drm_i915_gem_object *obj;
70 	struct i915_vma *vma;
71 	unsigned long end;
72 	u32 *base, *cs;
73 	int loop, i;
74 	int err;
75 
76 	obj = i915_gem_object_create_internal(vm->i915, 64 << 10);
77 	if (IS_ERR(obj))
78 		return ERR_CAST(obj);
79 
80 	end = obj->base.size / sizeof(u32) - 1;
81 
82 	vma = i915_vma_instance(obj, vm, NULL);
83 	if (IS_ERR(vma)) {
84 		err = PTR_ERR(vma);
85 		goto err_put;
86 	}
87 
88 	err = i915_vma_pin(vma, 0, 0, PIN_USER);
89 	if (err)
90 		goto err_unlock;
91 
92 	i915_vma_lock(vma);
93 
94 	base = i915_gem_object_pin_map(obj, I915_MAP_WC);
95 	if (IS_ERR(base)) {
96 		err = PTR_ERR(base);
97 		goto err_unpin;
98 	}
99 	cs = base;
100 
101 	*cs++ = MI_LOAD_REGISTER_IMM(__NGPR__ * 2);
102 	for (i = 0; i < __NGPR__; i++) {
103 		*cs++ = i915_mmio_reg_offset(CS_GPR(i));
104 		*cs++ = 0;
105 		*cs++ = i915_mmio_reg_offset(CS_GPR(i)) + 4;
106 		*cs++ = 0;
107 	}
108 
109 	*cs++ = MI_LOAD_REGISTER_IMM(1);
110 	*cs++ = i915_mmio_reg_offset(CS_GPR(INC));
111 	*cs++ = 1;
112 
113 	loop = cs - base;
114 
115 	/* Unroll the loop to avoid MI_BB_START stalls impacting measurements */
116 	for (i = 0; i < 1024; i++) {
117 		*cs++ = MI_MATH(4);
118 		*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(COUNT));
119 		*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(INC));
120 		*cs++ = MI_MATH_ADD;
121 		*cs++ = MI_MATH_STORE(MI_MATH_REG(COUNT), MI_MATH_REG_ACCU);
122 
123 		if (srm) {
124 			*cs++ = MI_STORE_REGISTER_MEM_GEN8;
125 			*cs++ = i915_mmio_reg_offset(CS_GPR(COUNT));
126 			*cs++ = lower_32_bits(i915_vma_offset(vma) + end * sizeof(*cs));
127 			*cs++ = upper_32_bits(i915_vma_offset(vma) + end * sizeof(*cs));
128 		}
129 	}
130 
131 	*cs++ = MI_BATCH_BUFFER_START_GEN8;
132 	*cs++ = lower_32_bits(i915_vma_offset(vma) + loop * sizeof(*cs));
133 	*cs++ = upper_32_bits(i915_vma_offset(vma) + loop * sizeof(*cs));
134 	GEM_BUG_ON(cs - base > end);
135 
136 	i915_gem_object_flush_map(obj);
137 
138 	*cancel = base + loop;
139 	*counter = srm ? memset32(base + end, 0, 1) : NULL;
140 	return vma;
141 
142 err_unpin:
143 	i915_vma_unpin(vma);
144 err_unlock:
145 	i915_vma_unlock(vma);
146 err_put:
147 	i915_gem_object_put(obj);
148 	return ERR_PTR(err);
149 }
150 
151 static u8 wait_for_freq(struct intel_rps *rps, u8 freq, int timeout_ms)
152 {
153 	u8 history[64], i;
154 	unsigned long end;
155 	int sleep;
156 
157 	i = 0;
158 	memset(history, freq, sizeof(history));
159 	sleep = 20;
160 
161 	/* The PCU does not change instantly, but drifts towards the goal? */
162 	end = jiffies + msecs_to_jiffies(timeout_ms);
163 	do {
164 		u8 act;
165 
166 		act = read_cagf(rps);
167 		if (time_after(jiffies, end))
168 			return act;
169 
170 		/* Target acquired */
171 		if (act == freq)
172 			return act;
173 
174 		/* Any change within the last N samples? */
175 		if (!memchr_inv(history, act, sizeof(history)))
176 			return act;
177 
178 		history[i] = act;
179 		i = (i + 1) % ARRAY_SIZE(history);
180 
181 		usleep_range(sleep, 2 * sleep);
182 		sleep *= 2;
183 		if (sleep > timeout_ms * 20)
184 			sleep = timeout_ms * 20;
185 	} while (1);
186 }
187 
188 static u8 rps_set_check(struct intel_rps *rps, u8 freq)
189 {
190 	mutex_lock(&rps->lock);
191 	GEM_BUG_ON(!intel_rps_is_active(rps));
192 	if (wait_for(!intel_rps_set(rps, freq), 50)) {
193 		mutex_unlock(&rps->lock);
194 		return 0;
195 	}
196 	GEM_BUG_ON(rps->last_freq != freq);
197 	mutex_unlock(&rps->lock);
198 
199 	return wait_for_freq(rps, freq, 50);
200 }
201 
202 static void show_pstate_limits(struct intel_rps *rps)
203 {
204 	struct drm_i915_private *i915 = rps_to_i915(rps);
205 
206 	if (IS_BROXTON(i915)) {
207 		pr_info("P_STATE_CAP[%x]: 0x%08x\n",
208 			i915_mmio_reg_offset(BXT_RP_STATE_CAP),
209 			intel_uncore_read(rps_to_uncore(rps),
210 					  BXT_RP_STATE_CAP));
211 	} else if (GRAPHICS_VER(i915) == 9) {
212 		pr_info("P_STATE_LIMITS[%x]: 0x%08x\n",
213 			i915_mmio_reg_offset(GEN9_RP_STATE_LIMITS),
214 			intel_uncore_read(rps_to_uncore(rps),
215 					  GEN9_RP_STATE_LIMITS));
216 	}
217 }
218 
219 int live_rps_clock_interval(void *arg)
220 {
221 	struct intel_gt *gt = arg;
222 	struct intel_rps *rps = &gt->rps;
223 	void (*saved_work)(struct work_struct *wrk);
224 	struct intel_engine_cs *engine;
225 	enum intel_engine_id id;
226 	struct igt_spinner spin;
227 	int err = 0;
228 
229 	if (!intel_rps_is_enabled(rps) || GRAPHICS_VER(gt->i915) < 6)
230 		return 0;
231 
232 	if (igt_spinner_init(&spin, gt))
233 		return -ENOMEM;
234 
235 	intel_gt_pm_wait_for_idle(gt);
236 	saved_work = rps->work.func;
237 	rps->work.func = dummy_rps_work;
238 
239 	intel_gt_pm_get(gt);
240 	intel_rps_disable(&gt->rps);
241 
242 	intel_gt_check_clock_frequency(gt);
243 
244 	for_each_engine(engine, gt, id) {
245 		struct i915_request *rq;
246 		u32 cycles;
247 		u64 dt;
248 
249 		if (!intel_engine_can_store_dword(engine))
250 			continue;
251 
252 		st_engine_heartbeat_disable(engine);
253 
254 		rq = igt_spinner_create_request(&spin,
255 						engine->kernel_context,
256 						MI_NOOP);
257 		if (IS_ERR(rq)) {
258 			st_engine_heartbeat_enable(engine);
259 			err = PTR_ERR(rq);
260 			break;
261 		}
262 
263 		i915_request_add(rq);
264 
265 		if (!igt_wait_for_spinner(&spin, rq)) {
266 			pr_err("%s: RPS spinner did not start\n",
267 			       engine->name);
268 			igt_spinner_end(&spin);
269 			st_engine_heartbeat_enable(engine);
270 			intel_gt_set_wedged(engine->gt);
271 			err = -EIO;
272 			break;
273 		}
274 
275 		intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
276 
277 		intel_uncore_write_fw(gt->uncore, GEN6_RP_CUR_UP_EI, 0);
278 
279 		/* Set the evaluation interval to infinity! */
280 		intel_uncore_write_fw(gt->uncore,
281 				      GEN6_RP_UP_EI, 0xffffffff);
282 		intel_uncore_write_fw(gt->uncore,
283 				      GEN6_RP_UP_THRESHOLD, 0xffffffff);
284 
285 		intel_uncore_write_fw(gt->uncore, GEN6_RP_CONTROL,
286 				      GEN6_RP_ENABLE | GEN6_RP_UP_BUSY_AVG);
287 
288 		if (wait_for(intel_uncore_read_fw(gt->uncore,
289 						  GEN6_RP_CUR_UP_EI),
290 			     10)) {
291 			/* Just skip the test; assume lack of HW support */
292 			pr_notice("%s: rps evaluation interval not ticking\n",
293 				  engine->name);
294 			err = -ENODEV;
295 		} else {
296 			ktime_t dt_[5];
297 			u32 cycles_[5];
298 			int i;
299 
300 			for (i = 0; i < 5; i++) {
301 				preempt_disable();
302 
303 				cycles_[i] = -intel_uncore_read_fw(gt->uncore, GEN6_RP_CUR_UP_EI);
304 				dt_[i] = ktime_get();
305 
306 				udelay(1000);
307 
308 				cycles_[i] += intel_uncore_read_fw(gt->uncore, GEN6_RP_CUR_UP_EI);
309 				dt_[i] = ktime_sub(ktime_get(), dt_[i]);
310 
311 				preempt_enable();
312 			}
313 
314 			/* Use the median of both cycle/dt; close enough */
315 			sort(cycles_, 5, sizeof(*cycles_), cmp_u32, NULL);
316 			cycles = (cycles_[1] + 2 * cycles_[2] + cycles_[3]) / 4;
317 			sort(dt_, 5, sizeof(*dt_), cmp_u64, NULL);
318 			dt = div_u64(dt_[1] + 2 * dt_[2] + dt_[3], 4);
319 		}
320 
321 		intel_uncore_write_fw(gt->uncore, GEN6_RP_CONTROL, 0);
322 		intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
323 
324 		igt_spinner_end(&spin);
325 		st_engine_heartbeat_enable(engine);
326 
327 		if (err == 0) {
328 			u64 time = intel_gt_pm_interval_to_ns(gt, cycles);
329 			u32 expected =
330 				intel_gt_ns_to_pm_interval(gt, dt);
331 
332 			pr_info("%s: rps counted %d C0 cycles [%lldns] in %lldns [%d cycles], using GT clock frequency of %uKHz\n",
333 				engine->name, cycles, time, dt, expected,
334 				gt->clock_frequency / 1000);
335 
336 			if (10 * time < 8 * dt ||
337 			    8 * time > 10 * dt) {
338 				pr_err("%s: rps clock time does not match walltime!\n",
339 				       engine->name);
340 				err = -EINVAL;
341 			}
342 
343 			if (10 * expected < 8 * cycles ||
344 			    8 * expected > 10 * cycles) {
345 				pr_err("%s: walltime does not match rps clock ticks!\n",
346 				       engine->name);
347 				err = -EINVAL;
348 			}
349 		}
350 
351 		if (igt_flush_test(gt->i915))
352 			err = -EIO;
353 
354 		break; /* once is enough */
355 	}
356 
357 	intel_rps_enable(&gt->rps);
358 	intel_gt_pm_put(gt);
359 
360 	igt_spinner_fini(&spin);
361 
362 	intel_gt_pm_wait_for_idle(gt);
363 	rps->work.func = saved_work;
364 
365 	if (err == -ENODEV) /* skipped, don't report a fail */
366 		err = 0;
367 
368 	return err;
369 }
370 
371 int live_rps_control(void *arg)
372 {
373 	struct intel_gt *gt = arg;
374 	struct intel_rps *rps = &gt->rps;
375 	void (*saved_work)(struct work_struct *wrk);
376 	struct intel_engine_cs *engine;
377 	enum intel_engine_id id;
378 	struct igt_spinner spin;
379 	int err = 0;
380 
381 	/*
382 	 * Check that the actual frequency matches our requested frequency,
383 	 * to verify our control mechanism. We have to be careful that the
384 	 * PCU may throttle the GPU in which case the actual frequency used
385 	 * will be lowered than requested.
386 	 */
387 
388 	if (!intel_rps_is_enabled(rps))
389 		return 0;
390 
391 	if (IS_CHERRYVIEW(gt->i915)) /* XXX fragile PCU */
392 		return 0;
393 
394 	if (igt_spinner_init(&spin, gt))
395 		return -ENOMEM;
396 
397 	intel_gt_pm_wait_for_idle(gt);
398 	saved_work = rps->work.func;
399 	rps->work.func = dummy_rps_work;
400 
401 	intel_gt_pm_get(gt);
402 	for_each_engine(engine, gt, id) {
403 		struct i915_request *rq;
404 		ktime_t min_dt, max_dt;
405 		int f, limit;
406 		int min, max;
407 
408 		if (!intel_engine_can_store_dword(engine))
409 			continue;
410 
411 		st_engine_heartbeat_disable(engine);
412 
413 		rq = igt_spinner_create_request(&spin,
414 						engine->kernel_context,
415 						MI_NOOP);
416 		if (IS_ERR(rq)) {
417 			err = PTR_ERR(rq);
418 			break;
419 		}
420 
421 		i915_request_add(rq);
422 
423 		if (!igt_wait_for_spinner(&spin, rq)) {
424 			pr_err("%s: RPS spinner did not start\n",
425 			       engine->name);
426 			igt_spinner_end(&spin);
427 			st_engine_heartbeat_enable(engine);
428 			intel_gt_set_wedged(engine->gt);
429 			err = -EIO;
430 			break;
431 		}
432 
433 		if (rps_set_check(rps, rps->min_freq) != rps->min_freq) {
434 			pr_err("%s: could not set minimum frequency [%x], only %x!\n",
435 			       engine->name, rps->min_freq, read_cagf(rps));
436 			igt_spinner_end(&spin);
437 			st_engine_heartbeat_enable(engine);
438 			show_pstate_limits(rps);
439 			err = -EINVAL;
440 			break;
441 		}
442 
443 		for (f = rps->min_freq + 1; f < rps->max_freq; f++) {
444 			if (rps_set_check(rps, f) < f)
445 				break;
446 		}
447 
448 		limit = rps_set_check(rps, f);
449 
450 		if (rps_set_check(rps, rps->min_freq) != rps->min_freq) {
451 			pr_err("%s: could not restore minimum frequency [%x], only %x!\n",
452 			       engine->name, rps->min_freq, read_cagf(rps));
453 			igt_spinner_end(&spin);
454 			st_engine_heartbeat_enable(engine);
455 			show_pstate_limits(rps);
456 			err = -EINVAL;
457 			break;
458 		}
459 
460 		max_dt = ktime_get();
461 		max = rps_set_check(rps, limit);
462 		max_dt = ktime_sub(ktime_get(), max_dt);
463 
464 		min_dt = ktime_get();
465 		min = rps_set_check(rps, rps->min_freq);
466 		min_dt = ktime_sub(ktime_get(), min_dt);
467 
468 		igt_spinner_end(&spin);
469 		st_engine_heartbeat_enable(engine);
470 
471 		pr_info("%s: range:[%x:%uMHz, %x:%uMHz] limit:[%x:%uMHz], %x:%x response %lluns:%lluns\n",
472 			engine->name,
473 			rps->min_freq, intel_gpu_freq(rps, rps->min_freq),
474 			rps->max_freq, intel_gpu_freq(rps, rps->max_freq),
475 			limit, intel_gpu_freq(rps, limit),
476 			min, max, ktime_to_ns(min_dt), ktime_to_ns(max_dt));
477 
478 		if (limit == rps->min_freq) {
479 			pr_err("%s: GPU throttled to minimum!\n",
480 			       engine->name);
481 			show_pstate_limits(rps);
482 			err = -ENODEV;
483 			break;
484 		}
485 
486 		if (igt_flush_test(gt->i915)) {
487 			err = -EIO;
488 			break;
489 		}
490 	}
491 	intel_gt_pm_put(gt);
492 
493 	igt_spinner_fini(&spin);
494 
495 	intel_gt_pm_wait_for_idle(gt);
496 	rps->work.func = saved_work;
497 
498 	return err;
499 }
500 
501 static void show_pcu_config(struct intel_rps *rps)
502 {
503 	struct drm_i915_private *i915 = rps_to_i915(rps);
504 	unsigned int max_gpu_freq, min_gpu_freq;
505 	intel_wakeref_t wakeref;
506 	int gpu_freq;
507 
508 	if (!HAS_LLC(i915))
509 		return;
510 
511 	min_gpu_freq = rps->min_freq;
512 	max_gpu_freq = rps->max_freq;
513 	if (GRAPHICS_VER(i915) >= 9) {
514 		/* Convert GT frequency to 50 HZ units */
515 		min_gpu_freq /= GEN9_FREQ_SCALER;
516 		max_gpu_freq /= GEN9_FREQ_SCALER;
517 	}
518 
519 	wakeref = intel_runtime_pm_get(rps_to_uncore(rps)->rpm);
520 
521 	pr_info("%5s  %5s  %5s\n", "GPU", "eCPU", "eRing");
522 	for (gpu_freq = min_gpu_freq; gpu_freq <= max_gpu_freq; gpu_freq++) {
523 		int ia_freq = gpu_freq;
524 
525 		snb_pcode_read(rps_to_gt(rps)->uncore, GEN6_PCODE_READ_MIN_FREQ_TABLE,
526 			       &ia_freq, NULL);
527 
528 		pr_info("%5d  %5d  %5d\n",
529 			gpu_freq * 50,
530 			((ia_freq >> 0) & 0xff) * 100,
531 			((ia_freq >> 8) & 0xff) * 100);
532 	}
533 
534 	intel_runtime_pm_put(rps_to_uncore(rps)->rpm, wakeref);
535 }
536 
537 static u64 __measure_frequency(u32 *cntr, int duration_ms)
538 {
539 	u64 dc, dt;
540 
541 	dc = READ_ONCE(*cntr);
542 	dt = ktime_get();
543 	usleep_range(1000 * duration_ms, 2000 * duration_ms);
544 	dc = READ_ONCE(*cntr) - dc;
545 	dt = ktime_get() - dt;
546 
547 	return div64_u64(1000 * 1000 * dc, dt);
548 }
549 
550 static u64 measure_frequency_at(struct intel_rps *rps, u32 *cntr, int *freq)
551 {
552 	u64 x[5];
553 	int i;
554 
555 	*freq = rps_set_check(rps, *freq);
556 	for (i = 0; i < 5; i++)
557 		x[i] = __measure_frequency(cntr, 2);
558 	*freq = (*freq + read_cagf(rps)) / 2;
559 
560 	/* A simple triangle filter for better result stability */
561 	sort(x, 5, sizeof(*x), cmp_u64, NULL);
562 	return div_u64(x[1] + 2 * x[2] + x[3], 4);
563 }
564 
565 static u64 __measure_cs_frequency(struct intel_engine_cs *engine,
566 				  int duration_ms)
567 {
568 	u64 dc, dt;
569 
570 	dc = intel_uncore_read_fw(engine->uncore, CS_GPR(0));
571 	dt = ktime_get();
572 	usleep_range(1000 * duration_ms, 2000 * duration_ms);
573 	dc = intel_uncore_read_fw(engine->uncore, CS_GPR(0)) - dc;
574 	dt = ktime_get() - dt;
575 
576 	return div64_u64(1000 * 1000 * dc, dt);
577 }
578 
579 static u64 measure_cs_frequency_at(struct intel_rps *rps,
580 				   struct intel_engine_cs *engine,
581 				   int *freq)
582 {
583 	u64 x[5];
584 	int i;
585 
586 	*freq = rps_set_check(rps, *freq);
587 	for (i = 0; i < 5; i++)
588 		x[i] = __measure_cs_frequency(engine, 2);
589 	*freq = (*freq + read_cagf(rps)) / 2;
590 
591 	/* A simple triangle filter for better result stability */
592 	sort(x, 5, sizeof(*x), cmp_u64, NULL);
593 	return div_u64(x[1] + 2 * x[2] + x[3], 4);
594 }
595 
/*
 * scaled_within - is x strictly between (f_n/f_d) * y and (f_d/f_n) * y?
 *
 * Evaluated with 64-bit integer multiplications only (no division), as
 * f_d * x > f_n * y && f_n * x < f_d * y.
 */
static bool scaled_within(u64 x, u64 y, u32 f_n, u32 f_d)
{
	const u64 num = f_n;
	const u64 den = f_d;

	/* x too small relative to y */
	if (den * x <= num * y)
		return false;

	/* otherwise accept unless x is too large relative to y */
	return num * x < den * y;
}
600 
601 int live_rps_frequency_cs(void *arg)
602 {
603 	void (*saved_work)(struct work_struct *wrk);
604 	struct intel_gt *gt = arg;
605 	struct intel_rps *rps = &gt->rps;
606 	struct intel_engine_cs *engine;
607 	struct pm_qos_request qos;
608 	enum intel_engine_id id;
609 	int err = 0;
610 
611 	/*
612 	 * The premise is that the GPU does change frequency at our behest.
613 	 * Let's check there is a correspondence between the requested
614 	 * frequency, the actual frequency, and the observed clock rate.
615 	 */
616 
617 	if (!intel_rps_is_enabled(rps))
618 		return 0;
619 
620 	if (GRAPHICS_VER(gt->i915) < 8) /* for CS simplicity */
621 		return 0;
622 
623 	if (CPU_LATENCY >= 0)
624 		cpu_latency_qos_add_request(&qos, CPU_LATENCY);
625 
626 	intel_gt_pm_wait_for_idle(gt);
627 	saved_work = rps->work.func;
628 	rps->work.func = dummy_rps_work;
629 
630 	for_each_engine(engine, gt, id) {
631 		struct i915_request *rq;
632 		struct i915_vma *vma;
633 		u32 *cancel, *cntr;
634 		struct {
635 			u64 count;
636 			int freq;
637 		} min, max;
638 
639 		st_engine_heartbeat_disable(engine);
640 
641 		vma = create_spin_counter(engine,
642 					  engine->kernel_context->vm, false,
643 					  &cancel, &cntr);
644 		if (IS_ERR(vma)) {
645 			err = PTR_ERR(vma);
646 			st_engine_heartbeat_enable(engine);
647 			break;
648 		}
649 
650 		rq = intel_engine_create_kernel_request(engine);
651 		if (IS_ERR(rq)) {
652 			err = PTR_ERR(rq);
653 			goto err_vma;
654 		}
655 
656 		err = i915_vma_move_to_active(vma, rq, 0);
657 		if (!err)
658 			err = rq->engine->emit_bb_start(rq,
659 							i915_vma_offset(vma),
660 							PAGE_SIZE, 0);
661 		i915_request_add(rq);
662 		if (err)
663 			goto err_vma;
664 
665 		if (wait_for(intel_uncore_read(engine->uncore, CS_GPR(0)),
666 			     10)) {
667 			pr_err("%s: timed loop did not start\n",
668 			       engine->name);
669 			goto err_vma;
670 		}
671 
672 		min.freq = rps->min_freq;
673 		min.count = measure_cs_frequency_at(rps, engine, &min.freq);
674 
675 		max.freq = rps->max_freq;
676 		max.count = measure_cs_frequency_at(rps, engine, &max.freq);
677 
678 		pr_info("%s: min:%lluKHz @ %uMHz, max:%lluKHz @ %uMHz [%d%%]\n",
679 			engine->name,
680 			min.count, intel_gpu_freq(rps, min.freq),
681 			max.count, intel_gpu_freq(rps, max.freq),
682 			(int)DIV64_U64_ROUND_CLOSEST(100 * min.freq * max.count,
683 						     max.freq * min.count));
684 
685 		if (!scaled_within(max.freq * min.count,
686 				   min.freq * max.count,
687 				   2, 3)) {
688 			int f;
689 
690 			pr_err("%s: CS did not scale with frequency! scaled min:%llu, max:%llu\n",
691 			       engine->name,
692 			       max.freq * min.count,
693 			       min.freq * max.count);
694 			show_pcu_config(rps);
695 
696 			for (f = min.freq + 1; f <= rps->max_freq; f++) {
697 				int act = f;
698 				u64 count;
699 
700 				count = measure_cs_frequency_at(rps, engine, &act);
701 				if (act < f)
702 					break;
703 
704 				pr_info("%s: %x:%uMHz: %lluKHz [%d%%]\n",
705 					engine->name,
706 					act, intel_gpu_freq(rps, act), count,
707 					(int)DIV64_U64_ROUND_CLOSEST(100 * min.freq * count,
708 								     act * min.count));
709 
710 				f = act; /* may skip ahead [pcu granularity] */
711 			}
712 
713 			err = -EINTR; /* ignore error, continue on with test */
714 		}
715 
716 err_vma:
717 		*cancel = MI_BATCH_BUFFER_END;
718 		i915_gem_object_flush_map(vma->obj);
719 		i915_gem_object_unpin_map(vma->obj);
720 		i915_vma_unpin(vma);
721 		i915_vma_unlock(vma);
722 		i915_vma_put(vma);
723 
724 		st_engine_heartbeat_enable(engine);
725 		if (igt_flush_test(gt->i915))
726 			err = -EIO;
727 		if (err)
728 			break;
729 	}
730 
731 	intel_gt_pm_wait_for_idle(gt);
732 	rps->work.func = saved_work;
733 
734 	if (CPU_LATENCY >= 0)
735 		cpu_latency_qos_remove_request(&qos);
736 
737 	return err;
738 }
739 
740 int live_rps_frequency_srm(void *arg)
741 {
742 	void (*saved_work)(struct work_struct *wrk);
743 	struct intel_gt *gt = arg;
744 	struct intel_rps *rps = &gt->rps;
745 	struct intel_engine_cs *engine;
746 	struct pm_qos_request qos;
747 	enum intel_engine_id id;
748 	int err = 0;
749 
750 	/*
751 	 * The premise is that the GPU does change frequency at our behest.
752 	 * Let's check there is a correspondence between the requested
753 	 * frequency, the actual frequency, and the observed clock rate.
754 	 */
755 
756 	if (!intel_rps_is_enabled(rps))
757 		return 0;
758 
759 	if (GRAPHICS_VER(gt->i915) < 8) /* for CS simplicity */
760 		return 0;
761 
762 	if (CPU_LATENCY >= 0)
763 		cpu_latency_qos_add_request(&qos, CPU_LATENCY);
764 
765 	intel_gt_pm_wait_for_idle(gt);
766 	saved_work = rps->work.func;
767 	rps->work.func = dummy_rps_work;
768 
769 	for_each_engine(engine, gt, id) {
770 		struct i915_request *rq;
771 		struct i915_vma *vma;
772 		u32 *cancel, *cntr;
773 		struct {
774 			u64 count;
775 			int freq;
776 		} min, max;
777 
778 		st_engine_heartbeat_disable(engine);
779 
780 		vma = create_spin_counter(engine,
781 					  engine->kernel_context->vm, true,
782 					  &cancel, &cntr);
783 		if (IS_ERR(vma)) {
784 			err = PTR_ERR(vma);
785 			st_engine_heartbeat_enable(engine);
786 			break;
787 		}
788 
789 		rq = intel_engine_create_kernel_request(engine);
790 		if (IS_ERR(rq)) {
791 			err = PTR_ERR(rq);
792 			goto err_vma;
793 		}
794 
795 		err = i915_vma_move_to_active(vma, rq, 0);
796 		if (!err)
797 			err = rq->engine->emit_bb_start(rq,
798 							i915_vma_offset(vma),
799 							PAGE_SIZE, 0);
800 		i915_request_add(rq);
801 		if (err)
802 			goto err_vma;
803 
804 		if (wait_for(READ_ONCE(*cntr), 10)) {
805 			pr_err("%s: timed loop did not start\n",
806 			       engine->name);
807 			goto err_vma;
808 		}
809 
810 		min.freq = rps->min_freq;
811 		min.count = measure_frequency_at(rps, cntr, &min.freq);
812 
813 		max.freq = rps->max_freq;
814 		max.count = measure_frequency_at(rps, cntr, &max.freq);
815 
816 		pr_info("%s: min:%lluKHz @ %uMHz, max:%lluKHz @ %uMHz [%d%%]\n",
817 			engine->name,
818 			min.count, intel_gpu_freq(rps, min.freq),
819 			max.count, intel_gpu_freq(rps, max.freq),
820 			(int)DIV64_U64_ROUND_CLOSEST(100 * min.freq * max.count,
821 						     max.freq * min.count));
822 
823 		if (!scaled_within(max.freq * min.count,
824 				   min.freq * max.count,
825 				   1, 2)) {
826 			int f;
827 
828 			pr_err("%s: CS did not scale with frequency! scaled min:%llu, max:%llu\n",
829 			       engine->name,
830 			       max.freq * min.count,
831 			       min.freq * max.count);
832 			show_pcu_config(rps);
833 
834 			for (f = min.freq + 1; f <= rps->max_freq; f++) {
835 				int act = f;
836 				u64 count;
837 
838 				count = measure_frequency_at(rps, cntr, &act);
839 				if (act < f)
840 					break;
841 
842 				pr_info("%s: %x:%uMHz: %lluKHz [%d%%]\n",
843 					engine->name,
844 					act, intel_gpu_freq(rps, act), count,
845 					(int)DIV64_U64_ROUND_CLOSEST(100 * min.freq * count,
846 								     act * min.count));
847 
848 				f = act; /* may skip ahead [pcu granularity] */
849 			}
850 
851 			err = -EINTR; /* ignore error, continue on with test */
852 		}
853 
854 err_vma:
855 		*cancel = MI_BATCH_BUFFER_END;
856 		i915_gem_object_flush_map(vma->obj);
857 		i915_gem_object_unpin_map(vma->obj);
858 		i915_vma_unpin(vma);
859 		i915_vma_unlock(vma);
860 		i915_vma_put(vma);
861 
862 		st_engine_heartbeat_enable(engine);
863 		if (igt_flush_test(gt->i915))
864 			err = -EIO;
865 		if (err)
866 			break;
867 	}
868 
869 	intel_gt_pm_wait_for_idle(gt);
870 	rps->work.func = saved_work;
871 
872 	if (CPU_LATENCY >= 0)
873 		cpu_latency_qos_remove_request(&qos);
874 
875 	return err;
876 }
877 
878 static void sleep_for_ei(struct intel_rps *rps, int timeout_us)
879 {
880 	/* Flush any previous EI */
881 	usleep_range(timeout_us, 2 * timeout_us);
882 
883 	/* Reset the interrupt status */
884 	rps_disable_interrupts(rps);
885 	GEM_BUG_ON(rps->pm_iir);
886 	rps_enable_interrupts(rps);
887 
888 	/* And then wait for the timeout, for real this time */
889 	usleep_range(2 * timeout_us, 3 * timeout_us);
890 }
891 
892 static int __rps_up_interrupt(struct intel_rps *rps,
893 			      struct intel_engine_cs *engine,
894 			      struct igt_spinner *spin)
895 {
896 	struct intel_uncore *uncore = engine->uncore;
897 	struct i915_request *rq;
898 	u32 timeout;
899 
900 	if (!intel_engine_can_store_dword(engine))
901 		return 0;
902 
903 	rps_set_check(rps, rps->min_freq);
904 
905 	rq = igt_spinner_create_request(spin, engine->kernel_context, MI_NOOP);
906 	if (IS_ERR(rq))
907 		return PTR_ERR(rq);
908 
909 	i915_request_get(rq);
910 	i915_request_add(rq);
911 
912 	if (!igt_wait_for_spinner(spin, rq)) {
913 		pr_err("%s: RPS spinner did not start\n",
914 		       engine->name);
915 		i915_request_put(rq);
916 		intel_gt_set_wedged(engine->gt);
917 		return -EIO;
918 	}
919 
920 	if (!intel_rps_is_active(rps)) {
921 		pr_err("%s: RPS not enabled on starting spinner\n",
922 		       engine->name);
923 		igt_spinner_end(spin);
924 		i915_request_put(rq);
925 		return -EINVAL;
926 	}
927 
928 	if (!(rps->pm_events & GEN6_PM_RP_UP_THRESHOLD)) {
929 		pr_err("%s: RPS did not register UP interrupt\n",
930 		       engine->name);
931 		i915_request_put(rq);
932 		return -EINVAL;
933 	}
934 
935 	if (rps->last_freq != rps->min_freq) {
936 		pr_err("%s: RPS did not program min frequency\n",
937 		       engine->name);
938 		i915_request_put(rq);
939 		return -EINVAL;
940 	}
941 
942 	timeout = intel_uncore_read(uncore, GEN6_RP_UP_EI);
943 	timeout = intel_gt_pm_interval_to_ns(engine->gt, timeout);
944 	timeout = DIV_ROUND_UP(timeout, 1000);
945 
946 	sleep_for_ei(rps, timeout);
947 	GEM_BUG_ON(i915_request_completed(rq));
948 
949 	igt_spinner_end(spin);
950 	i915_request_put(rq);
951 
952 	if (rps->cur_freq != rps->min_freq) {
953 		pr_err("%s: Frequency unexpectedly changed [up], now %d!\n",
954 		       engine->name, intel_rps_read_actual_frequency(rps));
955 		return -EINVAL;
956 	}
957 
958 	if (!(rps->pm_iir & GEN6_PM_RP_UP_THRESHOLD)) {
959 		pr_err("%s: UP interrupt not recorded for spinner, pm_iir:%x, prev_up:%x, up_threshold:%x, up_ei:%x\n",
960 		       engine->name, rps->pm_iir,
961 		       intel_uncore_read(uncore, GEN6_RP_PREV_UP),
962 		       intel_uncore_read(uncore, GEN6_RP_UP_THRESHOLD),
963 		       intel_uncore_read(uncore, GEN6_RP_UP_EI));
964 		return -EINVAL;
965 	}
966 
967 	return 0;
968 }
969 
970 static int __rps_down_interrupt(struct intel_rps *rps,
971 				struct intel_engine_cs *engine)
972 {
973 	struct intel_uncore *uncore = engine->uncore;
974 	u32 timeout;
975 
976 	rps_set_check(rps, rps->max_freq);
977 
978 	if (!(rps->pm_events & GEN6_PM_RP_DOWN_THRESHOLD)) {
979 		pr_err("%s: RPS did not register DOWN interrupt\n",
980 		       engine->name);
981 		return -EINVAL;
982 	}
983 
984 	if (rps->last_freq != rps->max_freq) {
985 		pr_err("%s: RPS did not program max frequency\n",
986 		       engine->name);
987 		return -EINVAL;
988 	}
989 
990 	timeout = intel_uncore_read(uncore, GEN6_RP_DOWN_EI);
991 	timeout = intel_gt_pm_interval_to_ns(engine->gt, timeout);
992 	timeout = DIV_ROUND_UP(timeout, 1000);
993 
994 	sleep_for_ei(rps, timeout);
995 
996 	if (rps->cur_freq != rps->max_freq) {
997 		pr_err("%s: Frequency unexpectedly changed [down], now %d!\n",
998 		       engine->name,
999 		       intel_rps_read_actual_frequency(rps));
1000 		return -EINVAL;
1001 	}
1002 
1003 	if (!(rps->pm_iir & (GEN6_PM_RP_DOWN_THRESHOLD | GEN6_PM_RP_DOWN_TIMEOUT))) {
1004 		pr_err("%s: DOWN interrupt not recorded for idle, pm_iir:%x, prev_down:%x, down_threshold:%x, down_ei:%x [prev_up:%x, up_threshold:%x, up_ei:%x]\n",
1005 		       engine->name, rps->pm_iir,
1006 		       intel_uncore_read(uncore, GEN6_RP_PREV_DOWN),
1007 		       intel_uncore_read(uncore, GEN6_RP_DOWN_THRESHOLD),
1008 		       intel_uncore_read(uncore, GEN6_RP_DOWN_EI),
1009 		       intel_uncore_read(uncore, GEN6_RP_PREV_UP),
1010 		       intel_uncore_read(uncore, GEN6_RP_UP_THRESHOLD),
1011 		       intel_uncore_read(uncore, GEN6_RP_UP_EI));
1012 		return -EINVAL;
1013 	}
1014 
1015 	return 0;
1016 }
1017 
1018 int live_rps_interrupt(void *arg)
1019 {
1020 	struct intel_gt *gt = arg;
1021 	struct intel_rps *rps = &gt->rps;
1022 	void (*saved_work)(struct work_struct *wrk);
1023 	struct intel_engine_cs *engine;
1024 	enum intel_engine_id id;
1025 	struct igt_spinner spin;
1026 	u32 pm_events;
1027 	int err = 0;
1028 
1029 	/*
1030 	 * First, let's check whether or not we are receiving interrupts.
1031 	 */
1032 
1033 	if (!intel_rps_has_interrupts(rps) || GRAPHICS_VER(gt->i915) < 6)
1034 		return 0;
1035 
1036 	intel_gt_pm_get(gt);
1037 	pm_events = rps->pm_events;
1038 	intel_gt_pm_put(gt);
1039 	if (!pm_events) {
1040 		pr_err("No RPS PM events registered, but RPS is enabled?\n");
1041 		return -ENODEV;
1042 	}
1043 
1044 	if (igt_spinner_init(&spin, gt))
1045 		return -ENOMEM;
1046 
1047 	intel_gt_pm_wait_for_idle(gt);
1048 	saved_work = rps->work.func;
1049 	rps->work.func = dummy_rps_work;
1050 
1051 	for_each_engine(engine, gt, id) {
1052 		/* Keep the engine busy with a spinner; expect an UP! */
1053 		if (pm_events & GEN6_PM_RP_UP_THRESHOLD) {
1054 			intel_gt_pm_wait_for_idle(engine->gt);
1055 			GEM_BUG_ON(intel_rps_is_active(rps));
1056 
1057 			st_engine_heartbeat_disable(engine);
1058 
1059 			err = __rps_up_interrupt(rps, engine, &spin);
1060 
1061 			st_engine_heartbeat_enable(engine);
1062 			if (err)
1063 				goto out;
1064 
1065 			intel_gt_pm_wait_for_idle(engine->gt);
1066 		}
1067 
1068 		/* Keep the engine awake but idle and check for DOWN */
1069 		if (pm_events & GEN6_PM_RP_DOWN_THRESHOLD) {
1070 			st_engine_heartbeat_disable(engine);
1071 			intel_rc6_disable(&gt->rc6);
1072 
1073 			err = __rps_down_interrupt(rps, engine);
1074 
1075 			intel_rc6_enable(&gt->rc6);
1076 			st_engine_heartbeat_enable(engine);
1077 			if (err)
1078 				goto out;
1079 		}
1080 	}
1081 
1082 out:
1083 	if (igt_flush_test(gt->i915))
1084 		err = -EIO;
1085 
1086 	igt_spinner_fini(&spin);
1087 
1088 	intel_gt_pm_wait_for_idle(gt);
1089 	rps->work.func = saved_work;
1090 
1091 	return err;
1092 }
1093 
1094 static u64 __measure_power(int duration_ms)
1095 {
1096 	u64 dE, dt;
1097 
1098 	dE = librapl_energy_uJ();
1099 	dt = ktime_get();
1100 	usleep_range(1000 * duration_ms, 2000 * duration_ms);
1101 	dE = librapl_energy_uJ() - dE;
1102 	dt = ktime_get() - dt;
1103 
1104 	return div64_u64(1000 * 1000 * dE, dt);
1105 }
1106 
1107 static u64 measure_power(struct intel_rps *rps, int *freq)
1108 {
1109 	u64 x[5];
1110 	int i;
1111 
1112 	for (i = 0; i < 5; i++)
1113 		x[i] = __measure_power(5);
1114 
1115 	*freq = (*freq + intel_rps_read_actual_frequency(rps)) / 2;
1116 
1117 	/* A simple triangle filter for better result stability */
1118 	sort(x, 5, sizeof(*x), cmp_u64, NULL);
1119 	return div_u64(x[1] + 2 * x[2] + x[3], 4);
1120 }
1121 
1122 static u64 measure_power_at(struct intel_rps *rps, int *freq)
1123 {
1124 	*freq = rps_set_check(rps, *freq);
1125 	return measure_power(rps, freq);
1126 }
1127 
1128 int live_rps_power(void *arg)
1129 {
1130 	struct intel_gt *gt = arg;
1131 	struct intel_rps *rps = &gt->rps;
1132 	void (*saved_work)(struct work_struct *wrk);
1133 	struct intel_engine_cs *engine;
1134 	enum intel_engine_id id;
1135 	struct igt_spinner spin;
1136 	int err = 0;
1137 
1138 	/*
1139 	 * Our fundamental assumption is that running at lower frequency
1140 	 * actually saves power. Let's see if our RAPL measurement support
1141 	 * that theory.
1142 	 */
1143 
1144 	if (!intel_rps_is_enabled(rps) || GRAPHICS_VER(gt->i915) < 6)
1145 		return 0;
1146 
1147 	if (!librapl_supported(gt->i915))
1148 		return 0;
1149 
1150 	if (igt_spinner_init(&spin, gt))
1151 		return -ENOMEM;
1152 
1153 	intel_gt_pm_wait_for_idle(gt);
1154 	saved_work = rps->work.func;
1155 	rps->work.func = dummy_rps_work;
1156 
1157 	for_each_engine(engine, gt, id) {
1158 		struct i915_request *rq;
1159 		struct {
1160 			u64 power;
1161 			int freq;
1162 		} min, max;
1163 
1164 		if (!intel_engine_can_store_dword(engine))
1165 			continue;
1166 
1167 		st_engine_heartbeat_disable(engine);
1168 
1169 		rq = igt_spinner_create_request(&spin,
1170 						engine->kernel_context,
1171 						MI_NOOP);
1172 		if (IS_ERR(rq)) {
1173 			st_engine_heartbeat_enable(engine);
1174 			err = PTR_ERR(rq);
1175 			break;
1176 		}
1177 
1178 		i915_request_add(rq);
1179 
1180 		if (!igt_wait_for_spinner(&spin, rq)) {
1181 			pr_err("%s: RPS spinner did not start\n",
1182 			       engine->name);
1183 			igt_spinner_end(&spin);
1184 			st_engine_heartbeat_enable(engine);
1185 			intel_gt_set_wedged(engine->gt);
1186 			err = -EIO;
1187 			break;
1188 		}
1189 
1190 		max.freq = rps->max_freq;
1191 		max.power = measure_power_at(rps, &max.freq);
1192 
1193 		min.freq = rps->min_freq;
1194 		min.power = measure_power_at(rps, &min.freq);
1195 
1196 		igt_spinner_end(&spin);
1197 		st_engine_heartbeat_enable(engine);
1198 
1199 		pr_info("%s: min:%llumW @ %uMHz, max:%llumW @ %uMHz\n",
1200 			engine->name,
1201 			min.power, intel_gpu_freq(rps, min.freq),
1202 			max.power, intel_gpu_freq(rps, max.freq));
1203 
1204 		if (10 * min.freq >= 9 * max.freq) {
1205 			pr_notice("Could not control frequency, ran at [%d:%uMHz, %d:%uMhz]\n",
1206 				  min.freq, intel_gpu_freq(rps, min.freq),
1207 				  max.freq, intel_gpu_freq(rps, max.freq));
1208 			continue;
1209 		}
1210 
1211 		if (11 * min.power > 10 * max.power) {
1212 			pr_err("%s: did not conserve power when setting lower frequency!\n",
1213 			       engine->name);
1214 			err = -EINVAL;
1215 			break;
1216 		}
1217 
1218 		if (igt_flush_test(gt->i915)) {
1219 			err = -EIO;
1220 			break;
1221 		}
1222 	}
1223 
1224 	igt_spinner_fini(&spin);
1225 
1226 	intel_gt_pm_wait_for_idle(gt);
1227 	rps->work.func = saved_work;
1228 
1229 	return err;
1230 }
1231 
1232 int live_rps_dynamic(void *arg)
1233 {
1234 	struct intel_gt *gt = arg;
1235 	struct intel_rps *rps = &gt->rps;
1236 	struct intel_engine_cs *engine;
1237 	enum intel_engine_id id;
1238 	struct igt_spinner spin;
1239 	int err = 0;
1240 
1241 	/*
1242 	 * We've looked at the bascs, and have established that we
1243 	 * can change the clock frequency and that the HW will generate
1244 	 * interrupts based on load. Now we check how we integrate those
1245 	 * moving parts into dynamic reclocking based on load.
1246 	 */
1247 
1248 	if (!intel_rps_is_enabled(rps) || GRAPHICS_VER(gt->i915) < 6)
1249 		return 0;
1250 
1251 	if (igt_spinner_init(&spin, gt))
1252 		return -ENOMEM;
1253 
1254 	if (intel_rps_has_interrupts(rps))
1255 		pr_info("RPS has interrupt support\n");
1256 	if (intel_rps_uses_timer(rps))
1257 		pr_info("RPS has timer support\n");
1258 
1259 	for_each_engine(engine, gt, id) {
1260 		struct i915_request *rq;
1261 		struct {
1262 			ktime_t dt;
1263 			u8 freq;
1264 		} min, max;
1265 
1266 		if (!intel_engine_can_store_dword(engine))
1267 			continue;
1268 
1269 		intel_gt_pm_wait_for_idle(gt);
1270 		GEM_BUG_ON(intel_rps_is_active(rps));
1271 		rps->cur_freq = rps->min_freq;
1272 
1273 		intel_engine_pm_get(engine);
1274 		intel_rc6_disable(&gt->rc6);
1275 		GEM_BUG_ON(rps->last_freq != rps->min_freq);
1276 
1277 		rq = igt_spinner_create_request(&spin,
1278 						engine->kernel_context,
1279 						MI_NOOP);
1280 		if (IS_ERR(rq)) {
1281 			err = PTR_ERR(rq);
1282 			goto err;
1283 		}
1284 
1285 		i915_request_add(rq);
1286 
1287 		max.dt = ktime_get();
1288 		max.freq = wait_for_freq(rps, rps->max_freq, 500);
1289 		max.dt = ktime_sub(ktime_get(), max.dt);
1290 
1291 		igt_spinner_end(&spin);
1292 
1293 		min.dt = ktime_get();
1294 		min.freq = wait_for_freq(rps, rps->min_freq, 2000);
1295 		min.dt = ktime_sub(ktime_get(), min.dt);
1296 
1297 		pr_info("%s: dynamically reclocked to %u:%uMHz while busy in %lluns, and %u:%uMHz while idle in %lluns\n",
1298 			engine->name,
1299 			max.freq, intel_gpu_freq(rps, max.freq),
1300 			ktime_to_ns(max.dt),
1301 			min.freq, intel_gpu_freq(rps, min.freq),
1302 			ktime_to_ns(min.dt));
1303 		if (min.freq >= max.freq) {
1304 			pr_err("%s: dynamic reclocking of spinner failed\n!",
1305 			       engine->name);
1306 			err = -EINVAL;
1307 		}
1308 
1309 err:
1310 		intel_rc6_enable(&gt->rc6);
1311 		intel_engine_pm_put(engine);
1312 
1313 		if (igt_flush_test(gt->i915))
1314 			err = -EIO;
1315 		if (err)
1316 			break;
1317 	}
1318 
1319 	igt_spinner_fini(&spin);
1320 
1321 	return err;
1322 }
1323