xref: /linux/drivers/gpu/drm/xe/xe_gt_sriov_vf.c (revision 2c142b63c8ee982cdfdba49a616027c266294838)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2023-2024 Intel Corporation
4  */
5 
6 #include <linux/bitfield.h>
7 #include <linux/bsearch.h>
8 #include <linux/delay.h>
9 
10 #include <drm/drm_managed.h>
11 #include <drm/drm_print.h>
12 
13 #include "abi/guc_actions_sriov_abi.h"
14 #include "abi/guc_communication_mmio_abi.h"
15 #include "abi/guc_klvs_abi.h"
16 #include "abi/guc_relay_actions_abi.h"
17 #include "regs/xe_gt_regs.h"
18 
19 #include "xe_assert.h"
20 #include "xe_device.h"
21 #include "xe_ggtt.h"
22 #include "xe_gt_sriov_printk.h"
23 #include "xe_gt_sriov_vf.h"
24 #include "xe_gt_sriov_vf_types.h"
25 #include "xe_guc.h"
26 #include "xe_guc_ct.h"
27 #include "xe_guc_hxg_helpers.h"
28 #include "xe_guc_relay.h"
29 #include "xe_guc_submit.h"
30 #include "xe_irq.h"
31 #include "xe_lrc.h"
32 #include "xe_memirq.h"
33 #include "xe_mmio.h"
34 #include "xe_sriov.h"
35 #include "xe_sriov_vf.h"
36 #include "xe_sriov_vf_ccs.h"
37 #include "xe_tile_sriov_vf.h"
38 #include "xe_tlb_inval.h"
39 #include "xe_uc_fw.h"
40 #include "xe_wopcm.h"
41 
/* Build a u64 from two 32-bit halves: @hi lands in bits 63:32, @lo in bits 31:0 */
#define make_u64_from_u32(hi, lo) ((u64)((u64)(u32)(hi) << 32 | (u32)(lo)))
43 
44 #ifdef CONFIG_DRM_XE_DEBUG
/*
 * Bit flags naming the points at which a debug delay can be injected.
 * vf_post_migration_inject_wait() stalls for as long as the flag passed to
 * it is still set in gt->sriov.vf.migration.debug.resfix_stoppers.
 */
enum VF_MIGRATION_WAIT_POINTS {
	VF_MIGRATION_WAIT_RESFIX_START	= BIT(0),
	VF_MIGRATION_WAIT_FIXUPS	= BIT(1),
	VF_MIGRATION_WAIT_RESTART_JOBS	= BIT(2),
	VF_MIGRATION_WAIT_RESFIX_DONE	= BIT(3),
};
51 
52 #define VF_MIGRATION_WAIT_DELAY_IN_MS	1000
vf_post_migration_inject_wait(struct xe_gt * gt,enum VF_MIGRATION_WAIT_POINTS wait)53 static void vf_post_migration_inject_wait(struct xe_gt *gt,
54 					  enum VF_MIGRATION_WAIT_POINTS wait)
55 {
56 	while (gt->sriov.vf.migration.debug.resfix_stoppers & wait) {
57 		xe_gt_dbg(gt,
58 			  "*TESTING* injecting %u ms delay due to resfix_stoppers=%#x, to continue clear %#x\n",
59 			  VF_MIGRATION_WAIT_DELAY_IN_MS,
60 			  gt->sriov.vf.migration.debug.resfix_stoppers, wait);
61 
62 		msleep(VF_MIGRATION_WAIT_DELAY_IN_MS);
63 	}
64 }
65 
/*
 * VF_MIGRATION_INJECT_WAIT(gt, POS) calls vf_post_migration_inject_wait()
 * with the VF_MIGRATION_WAIT_<POS> flag. @gt is evaluated exactly once.
 */
#define VF_MIGRATION_INJECT_WAIT(gt, _POS) ({					\
	struct xe_gt *__gt = (gt);						\
	vf_post_migration_inject_wait(__gt, VF_MIGRATION_WAIT_##_POS);		\
	})

#else
/* Without CONFIG_DRM_XE_DEBUG only the type of @_gt is checked; nothing is delayed */
#define VF_MIGRATION_INJECT_WAIT(_gt, ...)	typecheck(struct xe_gt *, (_gt))
73 #endif
74 
guc_action_vf_reset(struct xe_guc * guc)75 static int guc_action_vf_reset(struct xe_guc *guc)
76 {
77 	u32 request[GUC_HXG_REQUEST_MSG_MIN_LEN] = {
78 		FIELD_PREP(GUC_HXG_MSG_0_ORIGIN, GUC_HXG_ORIGIN_HOST) |
79 		FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_REQUEST) |
80 		FIELD_PREP(GUC_HXG_REQUEST_MSG_0_ACTION, GUC_ACTION_VF2GUC_VF_RESET),
81 	};
82 	int ret;
83 
84 	ret = xe_guc_mmio_send(guc, request, ARRAY_SIZE(request));
85 
86 	return ret > 0 ? -EPROTO : ret;
87 }
88 
89 #define GUC_RESET_VF_STATE_RETRY_MAX	10
vf_reset_guc_state(struct xe_gt * gt)90 static int vf_reset_guc_state(struct xe_gt *gt)
91 {
92 	unsigned int retry = GUC_RESET_VF_STATE_RETRY_MAX;
93 	struct xe_guc *guc = &gt->uc.guc;
94 	int err;
95 
96 	do {
97 		err = guc_action_vf_reset(guc);
98 		if (!err || err != -ETIMEDOUT)
99 			break;
100 	} while (--retry);
101 
102 	if (unlikely(err))
103 		xe_gt_sriov_err(gt, "Failed to reset GuC state (%pe)\n", ERR_PTR(err));
104 	return err;
105 }
106 
107 /**
108  * xe_gt_sriov_vf_reset - Reset GuC VF internal state.
109  * @gt: the &xe_gt
110  *
111  * It requires functional `GuC MMIO based communication`_.
112  *
113  * Return: 0 on success or a negative error code on failure.
114  */
xe_gt_sriov_vf_reset(struct xe_gt * gt)115 int xe_gt_sriov_vf_reset(struct xe_gt *gt)
116 {
117 	if (!xe_device_uc_enabled(gt_to_xe(gt)))
118 		return -ENODEV;
119 
120 	return vf_reset_guc_state(gt);
121 }
122 
guc_action_match_version(struct xe_guc * guc,struct xe_uc_fw_version * wanted,struct xe_uc_fw_version * found)123 static int guc_action_match_version(struct xe_guc *guc,
124 				    struct xe_uc_fw_version *wanted,
125 				    struct xe_uc_fw_version *found)
126 {
127 	u32 request[VF2GUC_MATCH_VERSION_REQUEST_MSG_LEN] = {
128 		FIELD_PREP(GUC_HXG_MSG_0_ORIGIN, GUC_HXG_ORIGIN_HOST) |
129 		FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_REQUEST) |
130 		FIELD_PREP(GUC_HXG_REQUEST_MSG_0_ACTION,
131 			   GUC_ACTION_VF2GUC_MATCH_VERSION),
132 		FIELD_PREP(VF2GUC_MATCH_VERSION_REQUEST_MSG_1_BRANCH, wanted->branch) |
133 		FIELD_PREP(VF2GUC_MATCH_VERSION_REQUEST_MSG_1_MAJOR, wanted->major) |
134 		FIELD_PREP(VF2GUC_MATCH_VERSION_REQUEST_MSG_1_MINOR, wanted->minor),
135 	};
136 	u32 response[GUC_MAX_MMIO_MSG_LEN];
137 	int ret;
138 
139 	BUILD_BUG_ON(VF2GUC_MATCH_VERSION_RESPONSE_MSG_LEN > GUC_MAX_MMIO_MSG_LEN);
140 
141 	ret = xe_guc_mmio_send_recv(guc, request, ARRAY_SIZE(request), response);
142 	if (unlikely(ret < 0))
143 		return ret;
144 
145 	if (unlikely(FIELD_GET(VF2GUC_MATCH_VERSION_RESPONSE_MSG_0_MBZ, response[0])))
146 		return -EPROTO;
147 
148 	memset(found, 0, sizeof(struct xe_uc_fw_version));
149 	found->branch = FIELD_GET(VF2GUC_MATCH_VERSION_RESPONSE_MSG_1_BRANCH, response[1]);
150 	found->major = FIELD_GET(VF2GUC_MATCH_VERSION_RESPONSE_MSG_1_MAJOR, response[1]);
151 	found->minor = FIELD_GET(VF2GUC_MATCH_VERSION_RESPONSE_MSG_1_MINOR, response[1]);
152 	found->patch = FIELD_GET(VF2GUC_MATCH_VERSION_RESPONSE_MSG_1_PATCH, response[1]);
153 
154 	return 0;
155 }
156 
guc_action_match_version_any(struct xe_guc * guc,struct xe_uc_fw_version * found)157 static int guc_action_match_version_any(struct xe_guc *guc,
158 					struct xe_uc_fw_version *found)
159 {
160 	struct xe_uc_fw_version wanted = {
161 		.branch = GUC_VERSION_BRANCH_ANY,
162 		.major = GUC_VERSION_MAJOR_ANY,
163 		.minor = GUC_VERSION_MINOR_ANY,
164 		.patch = 0
165 	};
166 
167 	return guc_action_match_version(guc, &wanted, found);
168 }
169 
vf_minimum_guc_version(struct xe_gt * gt,struct xe_uc_fw_version * ver)170 static void vf_minimum_guc_version(struct xe_gt *gt, struct xe_uc_fw_version *ver)
171 {
172 	struct xe_device *xe = gt_to_xe(gt);
173 
174 	memset(ver, 0, sizeof(struct xe_uc_fw_version));
175 
176 	switch (xe->info.platform) {
177 	case XE_TIGERLAKE ... XE_PVC:
178 		/* 1.1 this is current baseline for Xe driver */
179 		ver->branch = 0;
180 		ver->major = 1;
181 		ver->minor = 1;
182 		break;
183 	default:
184 		/* 1.2 has support for the GMD_ID KLV */
185 		ver->branch = 0;
186 		ver->major = 1;
187 		ver->minor = 2;
188 		break;
189 	}
190 }
191 
/*
 * Fill @ver with the GuC interface version the VF asks for on its first
 * handshake.
 */
static void vf_wanted_guc_version(struct xe_gt *gt, struct xe_uc_fw_version *ver)
{
	/*
	 * For now it's the same as minimum. Called as a plain statement:
	 * ISO C does not allow "return <expr>;" in a function returning void.
	 */
	vf_minimum_guc_version(gt, ver);
}
197 
/*
 * Negotiate the GuC interface version for this VF.
 *
 * On the first call (no version recorded yet) the version from
 * vf_wanted_guc_version() is requested and also saved as
 * gt->sriov.vf.wanted_guc_version. On later calls the previously recorded
 * version is requested again and any change in the GuC reply is rejected
 * with -EREMCHG.
 *
 * Note that a successful match request writes the GuC reply straight into
 * gt->sriov.vf.guc_version, including on the paths below that go on to
 * reject that version.
 *
 * Return: 0 on success or a negative error code on failure.
 */
static int vf_handshake_with_guc(struct xe_gt *gt)
{
	struct xe_uc_fw_version *guc_version = &gt->sriov.vf.guc_version;
	struct xe_uc_fw_version wanted = {0};
	struct xe_guc *guc = &gt->uc.guc;
	bool old = false;
	int err;

	xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));

	/* select wanted version - prefer previous (if any) */
	if (guc_version->major || guc_version->minor) {
		wanted = *guc_version;
		old = true;
	} else {
		vf_wanted_guc_version(gt, &wanted);
		xe_gt_assert(gt, wanted.major != GUC_VERSION_MAJOR_ANY);

		/* First time we handshake, so record the minimum wanted */
		gt->sriov.vf.wanted_guc_version = wanted;
	}

	/* on success this overwrites *guc_version with what the GuC reported */
	err = guc_action_match_version(guc, &wanted, guc_version);
	if (unlikely(err))
		goto fail;

	if (old) {
		/* we don't support interface version change */
		if (MAKE_GUC_VER_STRUCT(*guc_version) != MAKE_GUC_VER_STRUCT(wanted)) {
			xe_gt_sriov_err(gt, "New GuC interface version detected: %u.%u.%u.%u\n",
					guc_version->branch, guc_version->major,
					guc_version->minor, guc_version->patch);
			xe_gt_sriov_info(gt, "Previously used version was: %u.%u.%u.%u\n",
					 wanted.branch, wanted.major,
					 wanted.minor, wanted.patch);
			err = -EREMCHG;
			goto fail;
		} else {
			/* version is unchanged, no need to re-verify it */
			return 0;
		}
	}

	/* illegal */
	if (guc_version->major > wanted.major) {
		err = -EPROTO;
		goto unsupported;
	}

	/* there's no fallback on major version. */
	if (guc_version->major != wanted.major) {
		err = -ENOPKG;
		goto unsupported;
	}

	/* check against minimum version supported by us */
	vf_minimum_guc_version(gt, &wanted);
	xe_gt_assert(gt, wanted.major != GUC_VERSION_MAJOR_ANY);
	if (MAKE_GUC_VER_STRUCT(*guc_version) < MAKE_GUC_VER_STRUCT(wanted)) {
		err = -ENOKEY;
		goto unsupported;
	}

	xe_gt_sriov_dbg(gt, "using GuC interface version %u.%u.%u.%u\n",
			guc_version->branch, guc_version->major,
			guc_version->minor, guc_version->patch);

	return 0;

unsupported:
	xe_gt_sriov_err(gt, "Unsupported GuC version %u.%u.%u.%u (%pe)\n",
			guc_version->branch, guc_version->major,
			guc_version->minor, guc_version->patch,
			ERR_PTR(err));
	/* deliberately falls through: the "unsupported" path logs both messages */
fail:
	xe_gt_sriov_err(gt, "Unable to confirm GuC version %u.%u (%pe)\n",
			wanted.major, wanted.minor, ERR_PTR(err));

	/* try again with *any* just to query which version is supported */
	/* 'wanted' is reused as scratch for that reply; 'err' is not changed by it */
	if (!guc_action_match_version_any(guc, &wanted))
		xe_gt_sriov_notice(gt, "GuC reports interface version %u.%u.%u.%u\n",
				   wanted.branch, wanted.major, wanted.minor, wanted.patch);
	return err;
}
282 
283 /**
284  * xe_gt_sriov_vf_bootstrap - Query and setup GuC ABI interface version.
285  * @gt: the &xe_gt
286  *
287  * This function is for VF use only.
288  * It requires functional `GuC MMIO based communication`_.
289  *
290  * Return: 0 on success or a negative error code on failure.
291  */
xe_gt_sriov_vf_bootstrap(struct xe_gt * gt)292 int xe_gt_sriov_vf_bootstrap(struct xe_gt *gt)
293 {
294 	int err;
295 
296 	if (!xe_device_uc_enabled(gt_to_xe(gt)))
297 		return -ENODEV;
298 
299 	err = vf_reset_guc_state(gt);
300 	if (unlikely(err))
301 		return err;
302 
303 	err = vf_handshake_with_guc(gt);
304 	if (unlikely(err))
305 		return err;
306 
307 	return 0;
308 }
309 
310 /**
311  * xe_gt_sriov_vf_guc_versions - Minimum required and found GuC ABI versions
312  * @gt: the &xe_gt
313  * @wanted: pointer to the xe_uc_fw_version to be filled with the wanted version
314  * @found: pointer to the xe_uc_fw_version to be filled with the found version
315  *
316  * This function is for VF use only and it can only be used after successful
317  * version handshake with the GuC.
318  */
xe_gt_sriov_vf_guc_versions(struct xe_gt * gt,struct xe_uc_fw_version * wanted,struct xe_uc_fw_version * found)319 void xe_gt_sriov_vf_guc_versions(struct xe_gt *gt,
320 				 struct xe_uc_fw_version *wanted,
321 				 struct xe_uc_fw_version *found)
322 {
323 	xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
324 	xe_gt_assert(gt, gt->sriov.vf.guc_version.major);
325 
326 	if (wanted)
327 		*wanted = gt->sriov.vf.wanted_guc_version;
328 
329 	if (found)
330 		*found = gt->sriov.vf.guc_version;
331 }
332 
/*
 * Send the VF2GUC_RESFIX_START request, tagged with @marker, over MMIO.
 *
 * Return: 0 on success or a negative error code; a positive result from
 * xe_guc_mmio_send() is reported as -EPROTO.
 */
static int guc_action_vf_resfix_start(struct xe_guc *guc, u16 marker)
{
	u32 request[GUC_HXG_REQUEST_MSG_MIN_LEN] = {
		FIELD_PREP(GUC_HXG_MSG_0_ORIGIN, GUC_HXG_ORIGIN_HOST) |
		FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_REQUEST) |
		FIELD_PREP(GUC_HXG_REQUEST_MSG_0_ACTION, GUC_ACTION_VF2GUC_RESFIX_START) |
		FIELD_PREP(VF2GUC_RESFIX_START_REQUEST_MSG_0_MARKER, marker),
	};
	int ret = xe_guc_mmio_send(guc, request, ARRAY_SIZE(request));

	if (ret > 0)
		return -EPROTO;

	return ret;
}
347 
/*
 * Tell the GuC that resource fixups are starting, using @marker.
 * In debug builds the RESFIX_START wait point is honoured before sending.
 *
 * Return: 0 on success or a negative error code on failure.
 */
static int vf_resfix_start(struct xe_gt *gt, u16 marker)
{
	xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));

	VF_MIGRATION_INJECT_WAIT(gt, RESFIX_START);

	xe_gt_sriov_dbg_verbose(gt, "Sending resfix start marker %u\n", marker);

	return guc_action_vf_resfix_start(&gt->uc.guc, marker);
}
360 
/*
 * Send the VF2GUC_RESFIX_DONE request, tagged with @marker, over MMIO.
 *
 * Return: 0 on success or a negative error code; a positive result from
 * xe_guc_mmio_send() is reported as -EPROTO.
 */
static int guc_action_vf_resfix_done(struct xe_guc *guc, u16 marker)
{
	u32 request[GUC_HXG_REQUEST_MSG_MIN_LEN] = {
		FIELD_PREP(GUC_HXG_MSG_0_ORIGIN, GUC_HXG_ORIGIN_HOST) |
		FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_REQUEST) |
		FIELD_PREP(GUC_HXG_REQUEST_MSG_0_ACTION, GUC_ACTION_VF2GUC_RESFIX_DONE) |
		FIELD_PREP(VF2GUC_RESFIX_DONE_REQUEST_MSG_0_MARKER, marker),
	};
	int ret = xe_guc_mmio_send(guc, request, ARRAY_SIZE(request));

	if (ret > 0)
		return -EPROTO;

	return ret;
}
375 
/*
 * Tell the GuC that resource fixups are finished, using @marker.
 *
 * Return: 0 on success or a negative error code on failure.
 */
static int vf_resfix_done(struct xe_gt *gt, u16 marker)
{
	xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));

	xe_gt_sriov_dbg_verbose(gt, "Sending resfix done marker %u\n", marker);

	return guc_action_vf_resfix_done(&gt->uc.guc, marker);
}
386 
/*
 * Query one KLV from the GuC over MMIO and copy its value to @value.
 *
 * @value_len is the expected value length in dwords; the length reported in
 * the reply must match it exactly. At most three dwords are copied, and a
 * warning is raised if the caller asks for more than that.
 *
 * Return: 0 on success, -EPROTO on a malformed reply, -EOVERFLOW when the
 * reply is longer than expected, -ENODATA when it is shorter, or a negative
 * error code from xe_guc_mmio_send_recv().
 */
static int guc_action_query_single_klv(struct xe_guc *guc, u32 key,
				       u32 *value, u32 value_len)
{
	u32 request[VF2GUC_QUERY_SINGLE_KLV_REQUEST_MSG_LEN] = {
		FIELD_PREP(GUC_HXG_MSG_0_ORIGIN, GUC_HXG_ORIGIN_HOST) |
		FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_REQUEST) |
		FIELD_PREP(GUC_HXG_REQUEST_MSG_0_ACTION,
			   GUC_ACTION_VF2GUC_QUERY_SINGLE_KLV),
		FIELD_PREP(VF2GUC_QUERY_SINGLE_KLV_REQUEST_MSG_1_KEY, key),
	};
	u32 reply[GUC_MAX_MMIO_MSG_LEN];
	u32 reported_len;
	int ret;

	BUILD_BUG_ON(VF2GUC_QUERY_SINGLE_KLV_RESPONSE_MSG_MAX_LEN > GUC_MAX_MMIO_MSG_LEN);

	ret = xe_guc_mmio_send_recv(guc, request, ARRAY_SIZE(request), reply);
	if (unlikely(ret < 0))
		return ret;

	if (unlikely(FIELD_GET(VF2GUC_QUERY_SINGLE_KLV_RESPONSE_MSG_0_MBZ, reply[0])))
		return -EPROTO;

	reported_len = FIELD_GET(VF2GUC_QUERY_SINGLE_KLV_RESPONSE_MSG_0_LENGTH, reply[0]);
	if (unlikely(reported_len != value_len))
		return reported_len > value_len ? -EOVERFLOW : -ENODATA;

	/* only up to three value dwords can be taken from the reply */
	xe_gt_WARN_ON(guc_to_gt(guc), value_len > 3);

	if (value_len >= 3)
		value[2] = FIELD_GET(VF2GUC_QUERY_SINGLE_KLV_RESPONSE_MSG_3_VALUE96, reply[3]);
	if (value_len >= 2)
		value[1] = FIELD_GET(VF2GUC_QUERY_SINGLE_KLV_RESPONSE_MSG_2_VALUE64, reply[2]);
	if (value_len >= 1)
		value[0] = FIELD_GET(VF2GUC_QUERY_SINGLE_KLV_RESPONSE_MSG_1_VALUE32, reply[1]);

	return 0;
}
434 
/*
 * Query a KLV whose value is exactly one dword and store it in @value32.
 *
 * Return: same as guc_action_query_single_klv().
 */
static int guc_action_query_single_klv32(struct xe_guc *guc, u32 key, u32 *value32)
{
	const u32 num_dwords = hxg_sizeof(u32);

	return guc_action_query_single_klv(guc, key, value32, num_dwords);
}
439 
/*
 * Query a KLV whose value is exactly two dwords and store it in @value64.
 * The first value dword is the low half, the second one the high half.
 * @value64 is not written on failure.
 *
 * Return: same as guc_action_query_single_klv().
 */
static int guc_action_query_single_klv64(struct xe_guc *guc, u32 key, u64 *value64)
{
	u32 dwords[2];
	int err;

	err = guc_action_query_single_klv(guc, key, dwords, hxg_sizeof(dwords));
	if (likely(!err))
		*value64 = make_u64_from_u32(dwords[1], dwords[0]);

	return err;
}
452 
has_gmdid(struct xe_device * xe)453 static bool has_gmdid(struct xe_device *xe)
454 {
455 	return GRAPHICS_VERx100(xe) >= 1270;
456 }
457 
458 /**
459  * xe_gt_sriov_vf_gmdid - Query GMDID over MMIO.
460  * @gt: the &xe_gt
461  *
462  * This function is for VF use only.
463  *
464  * Return: value of GMDID KLV on success or 0 on failure.
465  */
xe_gt_sriov_vf_gmdid(struct xe_gt * gt)466 u32 xe_gt_sriov_vf_gmdid(struct xe_gt *gt)
467 {
468 	const char *type = xe_gt_is_media_type(gt) ? "media" : "graphics";
469 	struct xe_guc *guc = &gt->uc.guc;
470 	u32 value;
471 	int err;
472 
473 	xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
474 	xe_gt_assert(gt, !GRAPHICS_VERx100(gt_to_xe(gt)) || has_gmdid(gt_to_xe(gt)));
475 	xe_gt_assert(gt, gt->sriov.vf.guc_version.major > 1 || gt->sriov.vf.guc_version.minor >= 2);
476 
477 	err = guc_action_query_single_klv32(guc, GUC_KLV_GLOBAL_CFG_GMD_ID_KEY, &value);
478 	if (unlikely(err)) {
479 		xe_gt_sriov_err(gt, "Failed to obtain %s GMDID (%pe)\n",
480 				type, ERR_PTR(err));
481 		return 0;
482 	}
483 
484 	xe_gt_sriov_dbg(gt, "%s GMDID = %#x\n", type, value);
485 	return value;
486 }
487 
vf_get_ggtt_info(struct xe_gt * gt)488 static int vf_get_ggtt_info(struct xe_gt *gt)
489 {
490 	struct xe_tile *tile = gt_to_tile(gt);
491 	struct xe_guc *guc = &gt->uc.guc;
492 	u64 start, size, ggtt_size;
493 	int err;
494 
495 	xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
496 
497 	err = guc_action_query_single_klv64(guc, GUC_KLV_VF_CFG_GGTT_START_KEY, &start);
498 	if (unlikely(err))
499 		return err;
500 
501 	err = guc_action_query_single_klv64(guc, GUC_KLV_VF_CFG_GGTT_SIZE_KEY, &size);
502 	if (unlikely(err))
503 		return err;
504 
505 	if (!size)
506 		return -ENODATA;
507 
508 	xe_tile_sriov_vf_ggtt_base_store(tile, start);
509 	ggtt_size = xe_tile_sriov_vf_ggtt(tile);
510 	if (!ggtt_size) {
511 		/*
512 		 * This function is called once during xe_guc_init_noalloc(),
513 		 * at which point ggtt_size = 0 and we have to initialize everything,
514 		 * and GGTT is not yet initialized.
515 		 *
516 		 * Return early as there's nothing to fixup.
517 		 */
518 		xe_tile_sriov_vf_ggtt_store(tile, size);
519 		return 0;
520 	}
521 
522 	if (ggtt_size != size) {
523 		xe_gt_sriov_err(gt, "Unexpected GGTT reassignment: %lluK != %lluK\n",
524 				size / SZ_1K, ggtt_size / SZ_1K);
525 		return -EREMCHG;
526 	}
527 
528 	xe_gt_sriov_dbg_verbose(gt, "GGTT %#llx-%#llx = %lluK\n",
529 				start, start + size - 1, size / SZ_1K);
530 
531 	/*
532 	 * This function can be called repeatedly from post migration fixups,
533 	 * at which point we inform the GGTT of the new base address.
534 	 * xe_ggtt_shift_nodes() may be called multiple times for each migration,
535 	 * but will be a noop if the base is unchanged.
536 	 */
537 	xe_ggtt_shift_nodes(tile->mem.ggtt, start);
538 
539 	return 0;
540 }
541 
vf_get_lmem_info(struct xe_gt * gt)542 static int vf_get_lmem_info(struct xe_gt *gt)
543 {
544 	struct xe_tile *tile = gt_to_tile(gt);
545 	struct xe_guc *guc = &gt->uc.guc;
546 	char size_str[10];
547 	u64 size, lmem_size;
548 	int err;
549 
550 	xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
551 
552 	err = guc_action_query_single_klv64(guc, GUC_KLV_VF_CFG_LMEM_SIZE_KEY, &size);
553 	if (unlikely(err))
554 		return err;
555 
556 	lmem_size = xe_tile_sriov_vf_lmem(tile);
557 	if (lmem_size && lmem_size != size) {
558 		xe_gt_sriov_err(gt, "Unexpected LMEM reassignment: %lluM != %lluM\n",
559 				size / SZ_1M, lmem_size / SZ_1M);
560 		return -EREMCHG;
561 	}
562 
563 	string_get_size(size, 1, STRING_UNITS_2, size_str, sizeof(size_str));
564 	xe_gt_sriov_dbg_verbose(gt, "LMEM %lluM %s\n", size / SZ_1M, size_str);
565 
566 	xe_tile_sriov_vf_lmem_store(tile, size);
567 
568 	return size ? 0 : -ENODATA;
569 }
570 
vf_get_submission_cfg(struct xe_gt * gt)571 static int vf_get_submission_cfg(struct xe_gt *gt)
572 {
573 	struct xe_gt_sriov_vf_selfconfig *config = &gt->sriov.vf.self_config;
574 	struct xe_guc *guc = &gt->uc.guc;
575 	u32 num_ctxs, num_dbs;
576 	int err;
577 
578 	xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
579 
580 	err = guc_action_query_single_klv32(guc, GUC_KLV_VF_CFG_NUM_CONTEXTS_KEY, &num_ctxs);
581 	if (unlikely(err))
582 		return err;
583 
584 	err = guc_action_query_single_klv32(guc, GUC_KLV_VF_CFG_NUM_DOORBELLS_KEY, &num_dbs);
585 	if (unlikely(err))
586 		return err;
587 
588 	if (config->num_ctxs && config->num_ctxs != num_ctxs) {
589 		xe_gt_sriov_err(gt, "Unexpected CTXs reassignment: %u != %u\n",
590 				num_ctxs, config->num_ctxs);
591 		return -EREMCHG;
592 	}
593 	if (config->num_dbs && config->num_dbs != num_dbs) {
594 		xe_gt_sriov_err(gt, "Unexpected DBs reassignment: %u != %u\n",
595 				num_dbs, config->num_dbs);
596 		return -EREMCHG;
597 	}
598 
599 	xe_gt_sriov_dbg_verbose(gt, "CTXs %u DBs %u\n", num_ctxs, num_dbs);
600 
601 	config->num_ctxs = num_ctxs;
602 	config->num_dbs = num_dbs;
603 
604 	return config->num_ctxs ? 0 : -ENODATA;
605 }
606 
vf_cache_gmdid(struct xe_gt * gt)607 static void vf_cache_gmdid(struct xe_gt *gt)
608 {
609 	xe_gt_assert(gt, has_gmdid(gt_to_xe(gt)));
610 	xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
611 
612 	gt->sriov.vf.runtime.gmdid = xe_gt_sriov_vf_gmdid(gt);
613 }
614 
vf_query_sched_groups(struct xe_gt * gt)615 static int vf_query_sched_groups(struct xe_gt *gt)
616 {
617 	struct xe_guc *guc = &gt->uc.guc;
618 	struct xe_uc_fw_version guc_version;
619 	u32 value = 0;
620 	int err;
621 
622 	xe_gt_sriov_vf_guc_versions(gt, NULL, &guc_version);
623 
624 	if (MAKE_GUC_VER_STRUCT(guc_version) < MAKE_GUC_VER(1, 26, 0))
625 		return 0;
626 
627 	err = guc_action_query_single_klv32(guc,
628 					    GUC_KLV_GLOBAL_CFG_GROUP_SCHEDULING_AVAILABLE_KEY,
629 					    &value);
630 	if (unlikely(err)) {
631 		xe_gt_sriov_err(gt, "Failed to obtain sched groups status (%pe)\n",
632 				ERR_PTR(err));
633 		return err;
634 	}
635 
636 	/* valid values are 0 (disabled) and 1 (enabled) */
637 	if (value > 1) {
638 		xe_gt_sriov_err(gt, "Invalid sched groups status %u\n", value);
639 		return -EPROTO;
640 	}
641 
642 	xe_gt_sriov_dbg(gt, "sched groups %s\n", str_enabled_disabled(value));
643 	return value;
644 }
645 
vf_cache_sched_groups_status(struct xe_gt * gt)646 static int vf_cache_sched_groups_status(struct xe_gt *gt)
647 {
648 	int ret;
649 
650 	xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
651 
652 	ret = vf_query_sched_groups(gt);
653 	if (ret < 0)
654 		return ret;
655 
656 	gt->sriov.vf.runtime.uses_sched_groups = ret;
657 
658 	return 0;
659 }
660 
/**
 * xe_gt_sriov_vf_query_config - Query SR-IOV config data over MMIO.
 * @gt: the &xe_gt
 *
 * Queries, in this order: GGTT range, LMEM size (discrete devices, main GT
 * only), submission resources and sched groups status. The first failure
 * stops the sequence. The GMDID is cached last, where available.
 *
 * This function is for VF use only. This function may shift the GGTT, making
 * this step visible to all GTs that share a GGTT.
 *
 * NOTE(review): the shift is expected to happen under the GGTT lock; no lock
 * is taken in this function itself - confirm it is taken by the callee.
 *
 * Return: 0 on success or a negative error code on failure.
 */
int xe_gt_sriov_vf_query_config(struct xe_gt *gt)
{
	struct xe_device *xe = gt_to_xe(gt);
	int err;

	err = vf_get_ggtt_info(gt);

	if (!err && IS_DGFX(xe) && xe_gt_is_main_type(gt))
		err = vf_get_lmem_info(gt);

	if (!err)
		err = vf_get_submission_cfg(gt);

	if (!err)
		err = vf_cache_sched_groups_status(gt);

	if (unlikely(err))
		return err;

	if (has_gmdid(xe))
		vf_cache_gmdid(gt);

	return 0;
}
699 
700 /**
701  * xe_gt_sriov_vf_sched_groups_enabled() - Check if PF has enabled multiple
702  * scheduler groups
703  * @gt: the &xe_gt
704  *
705  * This function is for VF use only.
706  *
707  * Return: true if shed groups were enabled, false otherwise.
708  */
xe_gt_sriov_vf_sched_groups_enabled(struct xe_gt * gt)709 bool xe_gt_sriov_vf_sched_groups_enabled(struct xe_gt *gt)
710 {
711 	xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
712 	xe_gt_assert(gt, gt->sriov.vf.guc_version.major);
713 
714 	return gt->sriov.vf.runtime.uses_sched_groups;
715 }
716 
717 /**
718  * xe_gt_sriov_vf_guc_ids - VF GuC context IDs configuration.
719  * @gt: the &xe_gt
720  *
721  * This function is for VF use only.
722  *
723  * Return: number of GuC context IDs assigned to VF.
724  */
xe_gt_sriov_vf_guc_ids(struct xe_gt * gt)725 u16 xe_gt_sriov_vf_guc_ids(struct xe_gt *gt)
726 {
727 	xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
728 	xe_gt_assert(gt, gt->sriov.vf.guc_version.major);
729 	xe_gt_assert(gt, gt->sriov.vf.self_config.num_ctxs);
730 
731 	return gt->sriov.vf.self_config.num_ctxs;
732 }
733 
/*
 * Send the VF2PF_HANDSHAKE relay request to the PF.
 *
 * On entry @major and @minor hold the version to request; on success they
 * are overwritten with the version from the PF reply. They are left
 * untouched on failure.
 *
 * Return: 0 on success, -EPROTO on a reply of unexpected length or with MBZ
 * bits set, or a negative error code from xe_guc_relay_send_to_pf().
 */
static int relay_action_handshake(struct xe_gt *gt, u32 *major, u32 *minor)
{
	u32 request[VF2PF_HANDSHAKE_REQUEST_MSG_LEN] = {
		FIELD_PREP(GUC_HXG_MSG_0_ORIGIN, GUC_HXG_ORIGIN_HOST) |
		FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_REQUEST) |
		FIELD_PREP(GUC_HXG_REQUEST_MSG_0_ACTION, GUC_RELAY_ACTION_VF2PF_HANDSHAKE),
		FIELD_PREP(VF2PF_HANDSHAKE_REQUEST_MSG_1_MAJOR, *major) |
		FIELD_PREP(VF2PF_HANDSHAKE_REQUEST_MSG_1_MINOR, *minor),
	};
	u32 reply[VF2PF_HANDSHAKE_RESPONSE_MSG_LEN];
	int len;

	xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));

	len = xe_guc_relay_send_to_pf(&gt->uc.guc.relay,
				      request, ARRAY_SIZE(request),
				      reply, ARRAY_SIZE(reply));
	if (unlikely(len < 0))
		return len;

	/* the reply is only inspected once its length is known to be right */
	if (unlikely(len != VF2PF_HANDSHAKE_RESPONSE_MSG_LEN ||
		     FIELD_GET(VF2PF_HANDSHAKE_RESPONSE_MSG_0_MBZ, reply[0])))
		return -EPROTO;

	*major = FIELD_GET(VF2PF_HANDSHAKE_RESPONSE_MSG_1_MAJOR, reply[1]);
	*minor = FIELD_GET(VF2PF_HANDSHAKE_RESPONSE_MSG_1_MINOR, reply[1]);

	return 0;
}
765 
/* Record @major.@minor as the VF/PF ABI version in use on device @xe */
static void vf_connect_pf(struct xe_device *xe, u16 major, u16 minor)
{
	xe_assert(xe, IS_SRIOV_VF(xe));

	xe->sriov.vf.pf_version.minor = minor;
	xe->sriov.vf.pf_version.major = major;
}
773 
vf_disconnect_pf(struct xe_device * xe)774 static void vf_disconnect_pf(struct xe_device *xe)
775 {
776 	vf_connect_pf(xe, 0, 0);
777 }
778 
vf_handshake_with_pf(struct xe_gt * gt)779 static int vf_handshake_with_pf(struct xe_gt *gt)
780 {
781 	struct xe_device *xe = gt_to_xe(gt);
782 	u32 major_wanted = GUC_RELAY_VERSION_LATEST_MAJOR;
783 	u32 minor_wanted = GUC_RELAY_VERSION_LATEST_MINOR;
784 	u32 major = major_wanted, minor = minor_wanted;
785 	int err;
786 
787 	err = relay_action_handshake(gt, &major, &minor);
788 	if (unlikely(err))
789 		goto failed;
790 
791 	if (!major && !minor) {
792 		err = -ENODATA;
793 		goto failed;
794 	}
795 
796 	xe_gt_sriov_dbg(gt, "using VF/PF ABI %u.%u\n", major, minor);
797 	vf_connect_pf(xe, major, minor);
798 	return 0;
799 
800 failed:
801 	xe_gt_sriov_err(gt, "Unable to confirm VF/PF ABI version %u.%u (%pe)\n",
802 			major, minor, ERR_PTR(err));
803 	vf_disconnect_pf(xe);
804 	return err;
805 }
806 
/**
 * xe_gt_sriov_vf_connect - Establish connection with the PF driver.
 * @gt: the &xe_gt
 *
 * Performs the VF/PF ABI version handshake and logs an error if it fails.
 *
 * This function is for VF use only.
 *
 * Return: 0 on success or a negative error code on failure.
 */
int xe_gt_sriov_vf_connect(struct xe_gt *gt)
{
	int err = vf_handshake_with_pf(gt);

	if (unlikely(err))
		xe_gt_sriov_err(gt, "Failed to get version info (%pe)\n", ERR_PTR(err));

	return err;
}
829 
830 /**
831  * xe_gt_sriov_vf_default_lrcs_hwsp_rebase - Update GGTT references in HWSP of default LRCs.
832  * @gt: the &xe_gt struct instance
833  */
xe_gt_sriov_vf_default_lrcs_hwsp_rebase(struct xe_gt * gt)834 static void xe_gt_sriov_vf_default_lrcs_hwsp_rebase(struct xe_gt *gt)
835 {
836 	struct xe_hw_engine *hwe;
837 	enum xe_hw_engine_id id;
838 
839 	for_each_hw_engine(hwe, gt, id)
840 		xe_default_lrc_update_memirq_regs_with_address(hwe);
841 }
842 
/*
 * Clear the ggtt_need_fixes flag and then wake everything sleeping on the
 * migration wait queue of @gt.
 */
static void vf_post_migration_mark_fixups_done(struct xe_gt *gt)
{
	WRITE_ONCE(gt->sriov.vf.migration.ggtt_need_fixes, false);
	smp_wmb();	/* Ensure above write visible before wake */
	wake_up_all(&gt->sriov.vf.migration.wq);
}
849 
vf_start_migration_recovery(struct xe_gt * gt)850 static void vf_start_migration_recovery(struct xe_gt *gt)
851 {
852 	bool started;
853 
854 	xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
855 
856 	spin_lock(&gt->sriov.vf.migration.lock);
857 
858 	if (!gt->sriov.vf.migration.recovery_queued &&
859 	    !gt->sriov.vf.migration.recovery_teardown) {
860 		gt->sriov.vf.migration.recovery_queued = true;
861 		WRITE_ONCE(gt->sriov.vf.migration.recovery_inprogress, true);
862 		WRITE_ONCE(gt->sriov.vf.migration.ggtt_need_fixes, true);
863 		smp_wmb();	/* Ensure above writes visible before wake */
864 
865 		xe_guc_ct_wake_waiters(&gt->uc.guc.ct);
866 
867 		started = queue_work(gt->ordered_wq, &gt->sriov.vf.migration.worker);
868 		xe_gt_sriov_info(gt, "VF migration recovery %s\n", started ?
869 				 "scheduled" : "already in progress");
870 	}
871 
872 	spin_unlock(&gt->sriov.vf.migration.lock);
873 }
874 
/**
 * xe_gt_sriov_vf_migrated_event_handler - Start a VF migration recovery,
 *   or just mark that a GuC is ready for it.
 * @gt: the &xe_gt struct instance linked to target GuC
 *
 * When VF migration is not supported on the device, only an error is logged
 * and no recovery is started.
 *
 * This function shall be called only by VF.
 */
void xe_gt_sriov_vf_migrated_event_handler(struct xe_gt *gt)
{
	struct xe_device *xe = gt_to_xe(gt);

	xe_gt_assert(gt, IS_SRIOV_VF(xe));
	xe_gt_assert(gt, xe_gt_sriov_vf_recovery_pending(gt));

	if (xe_sriov_vf_migration_supported(xe)) {
		xe_gt_sriov_info(gt, "ready for recovery after migration\n");
		vf_start_migration_recovery(gt);
	} else {
		xe_gt_sriov_err(gt, "migration not supported\n");
	}
}
897 
vf_is_negotiated(struct xe_gt * gt,u16 major,u16 minor)898 static bool vf_is_negotiated(struct xe_gt *gt, u16 major, u16 minor)
899 {
900 	struct xe_device *xe = gt_to_xe(gt);
901 
902 	xe_gt_assert(gt, IS_SRIOV_VF(xe));
903 
904 	return major == xe->sriov.vf.pf_version.major &&
905 	       minor <= xe->sriov.vf.pf_version.minor;
906 }
907 
vf_prepare_runtime_info(struct xe_gt * gt,unsigned int num_regs)908 static int vf_prepare_runtime_info(struct xe_gt *gt, unsigned int num_regs)
909 {
910 	struct vf_runtime_reg *regs = gt->sriov.vf.runtime.regs;
911 	unsigned int regs_size = round_up(num_regs, 4);
912 	struct xe_device *xe = gt_to_xe(gt);
913 
914 	xe_gt_assert(gt, IS_SRIOV_VF(xe));
915 
916 	if (regs) {
917 		if (num_regs <= gt->sriov.vf.runtime.regs_size) {
918 			memset(regs, 0, num_regs * sizeof(*regs));
919 			gt->sriov.vf.runtime.num_regs = num_regs;
920 			return 0;
921 		}
922 
923 		drmm_kfree(&xe->drm, regs);
924 		gt->sriov.vf.runtime.regs = NULL;
925 		gt->sriov.vf.runtime.num_regs = 0;
926 		gt->sriov.vf.runtime.regs_size = 0;
927 	}
928 
929 	regs = drmm_kcalloc(&xe->drm, regs_size, sizeof(*regs), GFP_KERNEL);
930 	if (unlikely(!regs))
931 		return -ENOMEM;
932 
933 	gt->sriov.vf.runtime.regs = regs;
934 	gt->sriov.vf.runtime.num_regs = num_regs;
935 	gt->sriov.vf.runtime.regs_size = regs_size;
936 	return 0;
937 }
938 
vf_query_runtime_info(struct xe_gt * gt)939 static int vf_query_runtime_info(struct xe_gt *gt)
940 {
941 	u32 request[VF2PF_QUERY_RUNTIME_REQUEST_MSG_LEN];
942 	u32 response[VF2PF_QUERY_RUNTIME_RESPONSE_MSG_MIN_LEN + 32]; /* up to 16 regs */
943 	u32 limit = (ARRAY_SIZE(response) - VF2PF_QUERY_RUNTIME_RESPONSE_MSG_MIN_LEN) / 2;
944 	u32 count, remaining, num, i;
945 	u32 start = 0;
946 	int ret;
947 
948 	xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
949 	xe_gt_assert(gt, limit);
950 
951 	/* this is part of the 1.0 PF/VF ABI */
952 	if (!vf_is_negotiated(gt, 1, 0))
953 		return -ENOPKG;
954 
955 	request[0] = FIELD_PREP(GUC_HXG_MSG_0_ORIGIN, GUC_HXG_ORIGIN_HOST) |
956 		     FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_REQUEST) |
957 		     FIELD_PREP(GUC_HXG_REQUEST_MSG_0_ACTION,
958 				GUC_RELAY_ACTION_VF2PF_QUERY_RUNTIME) |
959 		     FIELD_PREP(VF2PF_QUERY_RUNTIME_REQUEST_MSG_0_LIMIT, limit);
960 
961 repeat:
962 	request[1] = FIELD_PREP(VF2PF_QUERY_RUNTIME_REQUEST_MSG_1_START, start);
963 	ret = xe_guc_relay_send_to_pf(&gt->uc.guc.relay,
964 				      request, ARRAY_SIZE(request),
965 				      response, ARRAY_SIZE(response));
966 	if (unlikely(ret < 0))
967 		goto failed;
968 
969 	if (unlikely(ret < VF2PF_QUERY_RUNTIME_RESPONSE_MSG_MIN_LEN)) {
970 		ret = -EPROTO;
971 		goto failed;
972 	}
973 	if (unlikely((ret - VF2PF_QUERY_RUNTIME_RESPONSE_MSG_MIN_LEN) % 2)) {
974 		ret = -EPROTO;
975 		goto failed;
976 	}
977 
978 	num = (ret - VF2PF_QUERY_RUNTIME_RESPONSE_MSG_MIN_LEN) / 2;
979 	count = FIELD_GET(VF2PF_QUERY_RUNTIME_RESPONSE_MSG_0_COUNT, response[0]);
980 	remaining = FIELD_GET(VF2PF_QUERY_RUNTIME_RESPONSE_MSG_1_REMAINING, response[1]);
981 
982 	xe_gt_sriov_dbg_verbose(gt, "count=%u num=%u ret=%d start=%u remaining=%u\n",
983 				count, num, ret, start, remaining);
984 
985 	if (unlikely(count != num)) {
986 		ret = -EPROTO;
987 		goto failed;
988 	}
989 
990 	if (start == 0) {
991 		ret = vf_prepare_runtime_info(gt, num + remaining);
992 		if (unlikely(ret < 0))
993 			goto failed;
994 	} else if (unlikely(start + num > gt->sriov.vf.runtime.num_regs)) {
995 		ret = -EPROTO;
996 		goto failed;
997 	}
998 
999 	for (i = 0; i < num; ++i) {
1000 		struct vf_runtime_reg *reg = &gt->sriov.vf.runtime.regs[start + i];
1001 
1002 		reg->offset = response[VF2PF_QUERY_RUNTIME_RESPONSE_MSG_MIN_LEN + 2 * i];
1003 		reg->value = response[VF2PF_QUERY_RUNTIME_RESPONSE_MSG_MIN_LEN + 2 * i + 1];
1004 	}
1005 
1006 	if (remaining) {
1007 		start += num;
1008 		goto repeat;
1009 	}
1010 
1011 	return 0;
1012 
1013 failed:
1014 	vf_prepare_runtime_info(gt, 0);
1015 	return ret;
1016 }
1017 
vf_show_runtime_info(struct xe_gt * gt)1018 static void vf_show_runtime_info(struct xe_gt *gt)
1019 {
1020 	struct vf_runtime_reg *vf_regs = gt->sriov.vf.runtime.regs;
1021 	unsigned int size = gt->sriov.vf.runtime.num_regs;
1022 
1023 	xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
1024 
1025 	for (; size--; vf_regs++)
1026 		xe_gt_sriov_dbg(gt, "runtime(%#x) = %#x\n",
1027 				vf_regs->offset, vf_regs->value);
1028 }
1029 
1030 /**
1031  * xe_gt_sriov_vf_query_runtime - Query SR-IOV runtime data.
1032  * @gt: the &xe_gt
1033  *
1034  * This function is for VF use only.
1035  *
1036  * Return: 0 on success or a negative error code on failure.
1037  */
xe_gt_sriov_vf_query_runtime(struct xe_gt * gt)1038 int xe_gt_sriov_vf_query_runtime(struct xe_gt *gt)
1039 {
1040 	int err;
1041 
1042 	err = vf_query_runtime_info(gt);
1043 	if (unlikely(err))
1044 		goto failed;
1045 
1046 	if (IS_ENABLED(CONFIG_DRM_XE_DEBUG))
1047 		vf_show_runtime_info(gt);
1048 
1049 	return 0;
1050 
1051 failed:
1052 	xe_gt_sriov_err(gt, "Failed to get runtime info (%pe)\n",
1053 			ERR_PTR(err));
1054 	return err;
1055 }
1056 
vf_runtime_reg_cmp(const void * a,const void * b)1057 static int vf_runtime_reg_cmp(const void *a, const void *b)
1058 {
1059 	const struct vf_runtime_reg *ra = a;
1060 	const struct vf_runtime_reg *rb = b;
1061 
1062 	return (int)ra->offset - (int)rb->offset;
1063 }
1064 
/*
 * Find the runtime table entry describing the register at @addr.
 *
 * The lookup is a binary search, so it relies on the table being ordered
 * consistently with vf_runtime_reg_cmp() (presumably the PF provides the
 * registers sorted by offset - to be confirmed against the PF side).
 *
 * Return: pointer to the matching entry or NULL if there is none.
 */
static struct vf_runtime_reg *vf_lookup_reg(struct xe_gt *gt, u32 addr)
{
	const struct vf_runtime_reg needle = { .offset = addr };
	struct xe_gt_sriov_vf_runtime *runtime = &gt->sriov.vf.runtime;

	xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));

	return bsearch(&needle, runtime->regs, runtime->num_regs,
		       sizeof(needle), vf_runtime_reg_cmp);
}
1075 
1076 /**
1077  * xe_gt_sriov_vf_read32 - Get a register value from the runtime data.
1078  * @gt: the &xe_gt
1079  * @reg: the register to read
1080  *
1081  * This function is for VF use only.
1082  * This function shall be called after VF has connected to PF.
1083  * This function is dedicated for registers that VFs can't read directly.
1084  *
1085  * Return: register value obtained from the PF or 0 if not found.
1086  */
xe_gt_sriov_vf_read32(struct xe_gt * gt,struct xe_reg reg)1087 u32 xe_gt_sriov_vf_read32(struct xe_gt *gt, struct xe_reg reg)
1088 {
1089 	u32 addr = xe_mmio_adjusted_addr(&gt->mmio, reg.addr);
1090 	struct vf_runtime_reg *rr;
1091 
1092 	xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
1093 	xe_gt_assert(gt, !reg.vf);
1094 
1095 	if (reg.addr == GMD_ID.addr) {
1096 		xe_gt_sriov_dbg_verbose(gt, "gmdid(%#x) = %#x\n",
1097 					addr, gt->sriov.vf.runtime.gmdid);
1098 		return gt->sriov.vf.runtime.gmdid;
1099 	}
1100 
1101 	rr = vf_lookup_reg(gt, addr);
1102 	if (!rr) {
1103 		xe_gt_WARN(gt, IS_ENABLED(CONFIG_DRM_XE_DEBUG),
1104 			   "VF is trying to read an inaccessible register %#x+%#x\n",
1105 			   reg.addr, addr - reg.addr);
1106 		return 0;
1107 	}
1108 
1109 	xe_gt_sriov_dbg_verbose(gt, "runtime[%#x] = %#x\n", addr, rr->value);
1110 	return rr->value;
1111 }
1112 
1113 /**
1114  * xe_gt_sriov_vf_write32 - Handle a write to an inaccessible register.
1115  * @gt: the &xe_gt
1116  * @reg: the register to write
1117  * @val: value to write
1118  *
1119  * This function is for VF use only.
1120  * Currently it will trigger a WARN if running on debug build.
1121  */
xe_gt_sriov_vf_write32(struct xe_gt * gt,struct xe_reg reg,u32 val)1122 void xe_gt_sriov_vf_write32(struct xe_gt *gt, struct xe_reg reg, u32 val)
1123 {
1124 	u32 addr = xe_mmio_adjusted_addr(&gt->mmio, reg.addr);
1125 
1126 	xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
1127 	xe_gt_assert(gt, !reg.vf);
1128 
1129 	/*
1130 	 * In the future, we may want to handle selected writes to inaccessible
1131 	 * registers in some custom way, but for now let's just log a warning
1132 	 * about such attempt, as likely we might be doing something wrong.
1133 	 */
1134 	xe_gt_WARN(gt, IS_ENABLED(CONFIG_DRM_XE_DEBUG),
1135 		   "VF is trying to write %#x to an inaccessible register %#x+%#x\n",
1136 		   val, reg.addr, addr - reg.addr);
1137 }
1138 
1139 /**
1140  * xe_gt_sriov_vf_print_config() - Print VF self config.
1141  * @gt: the &xe_gt
1142  * @p: the &drm_printer
1143  *
1144  * This function is for VF use only.
1145  *
1146  * Return: always 0.
1147  */
xe_gt_sriov_vf_print_config(struct xe_gt * gt,struct drm_printer * p)1148 int xe_gt_sriov_vf_print_config(struct xe_gt *gt, struct drm_printer *p)
1149 {
1150 	struct xe_gt_sriov_vf_selfconfig *config = &gt->sriov.vf.self_config;
1151 	struct xe_device *xe = gt_to_xe(gt);
1152 	u64 lmem_size;
1153 	char buf[10];
1154 
1155 	xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
1156 
1157 	if (xe_gt_is_main_type(gt)) {
1158 		u64 ggtt_size = xe_tile_sriov_vf_ggtt(gt_to_tile(gt));
1159 		u64 ggtt_base = xe_tile_sriov_vf_ggtt_base(gt_to_tile(gt));
1160 
1161 		drm_printf(p, "GGTT range:\t%#llx-%#llx\n",
1162 			   ggtt_base, ggtt_base + ggtt_size - 1);
1163 		string_get_size(ggtt_size, 1, STRING_UNITS_2, buf, sizeof(buf));
1164 		drm_printf(p, "GGTT size:\t%llu (%s)\n", ggtt_size, buf);
1165 
1166 		if (IS_DGFX(xe)) {
1167 			lmem_size = xe_tile_sriov_vf_lmem(gt_to_tile(gt));
1168 			string_get_size(lmem_size, 1, STRING_UNITS_2, buf, sizeof(buf));
1169 			drm_printf(p, "LMEM size:\t%llu (%s)\n", lmem_size, buf);
1170 		}
1171 	}
1172 
1173 	drm_printf(p, "GuC contexts:\t%u\n", config->num_ctxs);
1174 	drm_printf(p, "GuC doorbells:\t%u\n", config->num_dbs);
1175 
1176 	return 0;
1177 }
1178 
1179 /**
1180  * xe_gt_sriov_vf_print_runtime() - Print VF's runtime regs received from PF.
1181  * @gt: the &xe_gt
1182  * @p: the &drm_printer
1183  *
1184  * This function is for VF use only.
1185  *
1186  * Return: always 0.
1187  */
xe_gt_sriov_vf_print_runtime(struct xe_gt * gt,struct drm_printer * p)1188 int xe_gt_sriov_vf_print_runtime(struct xe_gt *gt, struct drm_printer *p)
1189 {
1190 	struct vf_runtime_reg *vf_regs = gt->sriov.vf.runtime.regs;
1191 	unsigned int size = gt->sriov.vf.runtime.num_regs;
1192 
1193 	xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
1194 
1195 	for (; size--; vf_regs++)
1196 		drm_printf(p, "%#x = %#x\n", vf_regs->offset, vf_regs->value);
1197 
1198 	return 0;
1199 }
1200 
1201 /**
1202  * xe_gt_sriov_vf_print_version() - Print VF ABI versions.
1203  * @gt: the &xe_gt
1204  * @p: the &drm_printer
1205  *
1206  * This function is for VF use only.
1207  *
1208  * Return: always 0.
1209  */
xe_gt_sriov_vf_print_version(struct xe_gt * gt,struct drm_printer * p)1210 int xe_gt_sriov_vf_print_version(struct xe_gt *gt, struct drm_printer *p)
1211 {
1212 	struct xe_device *xe = gt_to_xe(gt);
1213 	struct xe_uc_fw_version *guc_version = &gt->sriov.vf.guc_version;
1214 	struct xe_uc_fw_version *wanted = &gt->sriov.vf.wanted_guc_version;
1215 	struct xe_sriov_vf_relay_version *pf_version = &xe->sriov.vf.pf_version;
1216 	struct xe_uc_fw_version ver;
1217 
1218 	xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
1219 
1220 	drm_printf(p, "GuC ABI:\n");
1221 
1222 	vf_minimum_guc_version(gt, &ver);
1223 	drm_printf(p, "\tbase:\t%u.%u.%u.*\n", ver.branch, ver.major, ver.minor);
1224 
1225 	drm_printf(p, "\twanted:\t%u.%u.%u.*\n",
1226 		   wanted->branch, wanted->major, wanted->minor);
1227 
1228 	drm_printf(p, "\thandshake:\t%u.%u.%u.%u\n",
1229 		   guc_version->branch, guc_version->major,
1230 		   guc_version->minor, guc_version->patch);
1231 
1232 	drm_printf(p, "PF ABI:\n");
1233 
1234 	drm_printf(p, "\tbase:\t%u.%u\n",
1235 		   GUC_RELAY_VERSION_BASE_MAJOR, GUC_RELAY_VERSION_BASE_MINOR);
1236 	drm_printf(p, "\twanted:\t%u.%u\n",
1237 		   GUC_RELAY_VERSION_LATEST_MAJOR, GUC_RELAY_VERSION_LATEST_MINOR);
1238 	drm_printf(p, "\thandshake:\t%u.%u\n",
1239 		   pf_version->major, pf_version->minor);
1240 
1241 	return 0;
1242 }
1243 
vf_post_migration_shutdown(struct xe_gt * gt)1244 static bool vf_post_migration_shutdown(struct xe_gt *gt)
1245 {
1246 	struct xe_device *xe = gt_to_xe(gt);
1247 
1248 	/*
1249 	 * On platforms where CCS must be restored by the primary GT, the media
1250 	 * GT's VF post-migration recovery must run afterward. Detect this case
1251 	 * and re-queue the media GT's restore work item if necessary.
1252 	 */
1253 	if (xe->info.needs_shared_vf_gt_wq && xe_gt_is_media_type(gt)) {
1254 		struct xe_gt *primary_gt = gt_to_tile(gt)->primary_gt;
1255 
1256 		if (xe_gt_sriov_vf_recovery_pending(primary_gt))
1257 			return true;
1258 	}
1259 
1260 	xe_guc_ct_flush_and_stop(&gt->uc.guc.ct);
1261 	xe_guc_submit_pause_vf(&gt->uc.guc);
1262 	xe_tlb_inval_reset(&gt->tlb_inval);
1263 
1264 	return false;
1265 }
1266 
post_migration_scratch_size(struct xe_device * xe)1267 static size_t post_migration_scratch_size(struct xe_device *xe)
1268 {
1269 	return max(xe_lrc_reg_size(xe), LRC_WA_BB_SIZE);
1270 }
1271 
vf_post_migration_fixups(struct xe_gt * gt)1272 static int vf_post_migration_fixups(struct xe_gt *gt)
1273 {
1274 	void *buf = gt->sriov.vf.migration.scratch;
1275 	int err;
1276 
1277 	VF_MIGRATION_INJECT_WAIT(gt, FIXUPS);
1278 
1279 	/* xe_gt_sriov_vf_query_config will fixup the GGTT addresses */
1280 	err = xe_gt_sriov_vf_query_config(gt);
1281 	if (err)
1282 		return err;
1283 
1284 	if (xe_gt_is_main_type(gt))
1285 		xe_sriov_vf_ccs_rebase(gt_to_xe(gt));
1286 
1287 	xe_gt_sriov_vf_default_lrcs_hwsp_rebase(gt);
1288 	err = xe_guc_contexts_hwsp_rebase(&gt->uc.guc, buf);
1289 	if (err)
1290 		return err;
1291 
1292 	atomic_inc(&gt->sriov.vf.migration.fixups_complete_count);
1293 
1294 	return 0;
1295 }
1296 
vf_post_migration_rearm(struct xe_gt * gt)1297 static void vf_post_migration_rearm(struct xe_gt *gt)
1298 {
1299 	VF_MIGRATION_INJECT_WAIT(gt, RESTART_JOBS);
1300 
1301 	/*
1302 	 * Make sure interrupts on the new HW are properly set. The GuC IRQ
1303 	 * must be working at this point, since the recovery did started,
1304 	 * but the rest was not enabled using the procedure from spec.
1305 	 */
1306 	xe_irq_resume(gt_to_xe(gt));
1307 
1308 	xe_guc_ct_restart(&gt->uc.guc.ct);
1309 	xe_guc_submit_unpause_prepare_vf(&gt->uc.guc);
1310 }
1311 
vf_post_migration_kickstart(struct xe_gt * gt)1312 static void vf_post_migration_kickstart(struct xe_gt *gt)
1313 {
1314 	xe_guc_submit_unpause_vf(&gt->uc.guc);
1315 }
1316 
vf_post_migration_abort(struct xe_gt * gt)1317 static void vf_post_migration_abort(struct xe_gt *gt)
1318 {
1319 	spin_lock_irq(&gt->sriov.vf.migration.lock);
1320 	WRITE_ONCE(gt->sriov.vf.migration.recovery_inprogress, false);
1321 	WRITE_ONCE(gt->sriov.vf.migration.ggtt_need_fixes, false);
1322 	spin_unlock_irq(&gt->sriov.vf.migration.lock);
1323 
1324 	wake_up_all(&gt->sriov.vf.migration.wq);
1325 
1326 	xe_guc_submit_pause_abort(&gt->uc.guc);
1327 }
1328 
/*
 * Finish the recovery identified by @marker.
 *
 * Unless another recovery was queued in the meantime, the in-progress flag
 * is cleared (under the migration lock) before vf_resfix_done() is called.
 *
 * Return: the result of vf_resfix_done().
 */
static int vf_post_migration_resfix_done(struct xe_gt *gt, u16 marker)
{
	VF_MIGRATION_INJECT_WAIT(gt, RESFIX_DONE);

	scoped_guard(spinlock_irq, &gt->sriov.vf.migration.lock) {
		if (!gt->sriov.vf.migration.recovery_queued)
			WRITE_ONCE(gt->sriov.vf.migration.recovery_inprogress, false);
		else
			xe_gt_sriov_dbg(gt, "another recovery imminent\n");
	}

	return vf_resfix_done(gt, marker);
}
1342 
/*
 * Begin the recovery identified by @marker.
 *
 * The queued flag is cleared (under the migration lock) after the call to
 * vf_resfix_start(), regardless of its outcome.
 *
 * Return: the result of vf_resfix_start().
 */
static int vf_post_migration_resfix_start(struct xe_gt *gt, u16 marker)
{
	int err = vf_resfix_start(gt, marker);

	spin_lock_irq(&gt->sriov.vf.migration.lock);
	gt->sriov.vf.migration.recovery_queued = false;
	spin_unlock_irq(&gt->sriov.vf.migration.lock);

	return err;
}
1354 
vf_post_migration_next_resfix_marker(struct xe_gt * gt)1355 static u16 vf_post_migration_next_resfix_marker(struct xe_gt *gt)
1356 {
1357 	xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
1358 
1359 	BUILD_BUG_ON(1 + ((typeof(gt->sriov.vf.migration.resfix_marker))~0) >
1360 		     FIELD_MAX(VF2GUC_RESFIX_START_REQUEST_MSG_0_MARKER));
1361 
1362 	/* add 1 to avoid zero-marker */
1363 	return 1 + gt->sriov.vf.migration.resfix_marker++;
1364 }
1365 
vf_post_migration_recovery(struct xe_gt * gt)1366 static void vf_post_migration_recovery(struct xe_gt *gt)
1367 {
1368 	struct xe_device *xe = gt_to_xe(gt);
1369 	u16 marker;
1370 	bool retry;
1371 	int err;
1372 
1373 	xe_gt_sriov_dbg(gt, "migration recovery in progress\n");
1374 
1375 	retry = vf_post_migration_shutdown(gt);
1376 	if (retry)
1377 		goto queue;
1378 
1379 	if (!xe_sriov_vf_migration_supported(xe)) {
1380 		xe_gt_sriov_err(gt, "migration is not supported\n");
1381 		err = -ENOTRECOVERABLE;
1382 		goto fail;
1383 	}
1384 
1385 	marker = vf_post_migration_next_resfix_marker(gt);
1386 
1387 	err = vf_post_migration_resfix_start(gt, marker);
1388 	if (unlikely(err)) {
1389 		xe_gt_sriov_err(gt, "Recovery failed at GuC RESFIX_START step (%pe)\n",
1390 				ERR_PTR(err));
1391 		goto fail;
1392 	}
1393 
1394 	err = vf_post_migration_fixups(gt);
1395 	if (err)
1396 		goto fail;
1397 
1398 	vf_post_migration_mark_fixups_done(gt);
1399 	vf_post_migration_rearm(gt);
1400 
1401 	err = vf_post_migration_resfix_done(gt, marker);
1402 	if (err) {
1403 		if (err == -EREMCHG)
1404 			goto queue;
1405 
1406 		xe_gt_sriov_err(gt, "Recovery failed at GuC RESFIX_DONE step (%pe)\n",
1407 				ERR_PTR(err));
1408 		goto fail;
1409 	}
1410 
1411 	vf_post_migration_kickstart(gt);
1412 
1413 	xe_gt_sriov_notice(gt, "migration recovery ended\n");
1414 	return;
1415 fail:
1416 	vf_post_migration_abort(gt);
1417 	xe_gt_sriov_err(gt, "migration recovery failed (%pe)\n", ERR_PTR(err));
1418 	xe_device_declare_wedged(xe);
1419 	return;
1420 
1421 queue:
1422 	xe_gt_sriov_info(gt, "Re-queuing migration recovery\n");
1423 	queue_work(gt->ordered_wq, &gt->sriov.vf.migration.worker);
1424 }
1425 
migration_worker_func(struct work_struct * w)1426 static void migration_worker_func(struct work_struct *w)
1427 {
1428 	struct xe_gt *gt = container_of(w, struct xe_gt,
1429 					sriov.vf.migration.worker);
1430 
1431 	vf_post_migration_recovery(gt);
1432 }
1433 
vf_migration_fini(void * arg)1434 static void vf_migration_fini(void *arg)
1435 {
1436 	struct xe_gt *gt = arg;
1437 
1438 	spin_lock_irq(&gt->sriov.vf.migration.lock);
1439 	gt->sriov.vf.migration.recovery_teardown = true;
1440 	spin_unlock_irq(&gt->sriov.vf.migration.lock);
1441 
1442 	cancel_work_sync(&gt->sriov.vf.migration.worker);
1443 }
1444 
1445 /**
1446  * xe_gt_sriov_vf_init_early() - GT VF init early
1447  * @gt: the &xe_gt
1448  *
1449  * Return 0 on success, errno on failure
1450  */
xe_gt_sriov_vf_init_early(struct xe_gt * gt)1451 int xe_gt_sriov_vf_init_early(struct xe_gt *gt)
1452 {
1453 	void *buf;
1454 
1455 	if (!xe_sriov_vf_migration_supported(gt_to_xe(gt)))
1456 		return 0;
1457 
1458 	buf = drmm_kmalloc(&gt_to_xe(gt)->drm,
1459 			   post_migration_scratch_size(gt_to_xe(gt)),
1460 			   GFP_KERNEL);
1461 	if (!buf)
1462 		return -ENOMEM;
1463 
1464 	gt->sriov.vf.migration.scratch = buf;
1465 	spin_lock_init(&gt->sriov.vf.migration.lock);
1466 	INIT_WORK(&gt->sriov.vf.migration.worker, migration_worker_func);
1467 	init_waitqueue_head(&gt->sriov.vf.migration.wq);
1468 
1469 	return 0;
1470 }
1471 
1472 /**
1473  * xe_gt_sriov_vf_init() - GT VF init
1474  * @gt: the &xe_gt
1475  *
1476  * Return 0 on success, errno on failure
1477  */
xe_gt_sriov_vf_init(struct xe_gt * gt)1478 int xe_gt_sriov_vf_init(struct xe_gt *gt)
1479 {
1480 	if (!xe_sriov_vf_migration_supported(gt_to_xe(gt)))
1481 		return 0;
1482 
1483 	/*
1484 	 * We want to tear down the VF post-migration early during driver
1485 	 * unload; therefore, we add this finalization action later during
1486 	 * driver load.
1487 	 */
1488 	return devm_add_action_or_reset(gt_to_xe(gt)->drm.dev,
1489 					vf_migration_fini, gt);
1490 }
1491 
1492 /**
1493  * xe_gt_sriov_vf_recovery_pending() - VF post migration recovery pending
1494  * @gt: the &xe_gt
1495  *
1496  * The return value of this function must be immediately visible upon vCPU
1497  * unhalt and must persist until RESFIX_DONE is issued. This guarantee is
1498  * currently implemented only for platforms that support memirq. If non-memirq
1499  * platforms begin to support VF migration, this function will need to be
1500  * updated accordingly.
1501  *
1502  * Return: True if VF post migration recovery is pending, False otherwise
1503  */
xe_gt_sriov_vf_recovery_pending(struct xe_gt * gt)1504 bool xe_gt_sriov_vf_recovery_pending(struct xe_gt *gt)
1505 {
1506 	struct xe_memirq *memirq = &gt_to_tile(gt)->memirq;
1507 
1508 	xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
1509 
1510 	/* early detection until recovery starts */
1511 	if (xe_device_uses_memirq(gt_to_xe(gt)) &&
1512 	    xe_memirq_guc_sw_int_0_irq_pending(memirq, &gt->uc.guc))
1513 		return true;
1514 
1515 	return READ_ONCE(gt->sriov.vf.migration.recovery_inprogress);
1516 }
1517 
vf_valid_ggtt(struct xe_gt * gt)1518 static bool vf_valid_ggtt(struct xe_gt *gt)
1519 {
1520 	struct xe_memirq *memirq = &gt_to_tile(gt)->memirq;
1521 	bool irq_pending = xe_device_uses_memirq(gt_to_xe(gt)) &&
1522 		xe_memirq_guc_sw_int_0_irq_pending(memirq, &gt->uc.guc);
1523 
1524 	xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
1525 
1526 	if (irq_pending || READ_ONCE(gt->sriov.vf.migration.ggtt_need_fixes))
1527 		return false;
1528 
1529 	return true;
1530 }
1531 
1532 /**
1533  * xe_vf_migration_fixups_complete_count() - Get count of VF fixups completions.
1534  * @gt: the &xe_gt instance which contains affected Global GTT
1535  *
1536  * Return: number of times VF fixups were completed since driver
1537  * probe, or 0 if migration is not available, or -1 if fixups are
1538  * pending or being applied right now.
1539  */
xe_vf_migration_fixups_complete_count(struct xe_gt * gt)1540 int xe_vf_migration_fixups_complete_count(struct xe_gt *gt)
1541 {
1542 	if (!IS_SRIOV_VF(gt_to_xe(gt)) ||
1543 	    !xe_sriov_vf_migration_supported(gt_to_xe(gt)))
1544 		return 0;
1545 
1546 	/* should never match fixups_complete_count value */
1547 	if (!vf_valid_ggtt(gt))
1548 		return -1;
1549 
1550 	return atomic_read(&gt->sriov.vf.migration.fixups_complete_count);
1551 }
1552 
1553 /**
1554  * xe_gt_sriov_vf_wait_valid_ggtt() - wait for valid GGTT nodes and address refs
1555  * @gt: the &xe_gt instance which contains affected Global GTT
1556  *
1557  * Return: number of times VF fixups were completed since driver
1558  * probe, or 0 if migration is not available.
1559  */
xe_gt_sriov_vf_wait_valid_ggtt(struct xe_gt * gt)1560 int xe_gt_sriov_vf_wait_valid_ggtt(struct xe_gt *gt)
1561 {
1562 	int ret;
1563 
1564 	/*
1565 	 * this condition needs to be identical to one in
1566 	 * xe_vf_migration_fixups_complete_count()
1567 	 */
1568 	if (!IS_SRIOV_VF(gt_to_xe(gt)) ||
1569 	    !xe_sriov_vf_migration_supported(gt_to_xe(gt)))
1570 		return 0;
1571 
1572 	ret = wait_event_interruptible_timeout(gt->sriov.vf.migration.wq,
1573 					       vf_valid_ggtt(gt),
1574 					       HZ * 5);
1575 	xe_gt_WARN_ON(gt, !ret);
1576 
1577 	return atomic_read(&gt->sriov.vf.migration.fixups_complete_count);
1578 }
1579