1 // SPDX-License-Identifier: MIT
2 /*
3 * Copyright © 2023-2024 Intel Corporation
4 */
5
6 #include <linux/bitfield.h>
7 #include <linux/bsearch.h>
8 #include <linux/delay.h>
9
10 #include <drm/drm_managed.h>
11 #include <drm/drm_print.h>
12
13 #include "abi/guc_actions_sriov_abi.h"
14 #include "abi/guc_communication_mmio_abi.h"
15 #include "abi/guc_klvs_abi.h"
16 #include "abi/guc_relay_actions_abi.h"
17 #include "regs/xe_gt_regs.h"
18
19 #include "xe_assert.h"
20 #include "xe_device.h"
21 #include "xe_ggtt.h"
22 #include "xe_gt_sriov_printk.h"
23 #include "xe_gt_sriov_vf.h"
24 #include "xe_gt_sriov_vf_types.h"
25 #include "xe_guc.h"
26 #include "xe_guc_ct.h"
27 #include "xe_guc_hxg_helpers.h"
28 #include "xe_guc_relay.h"
29 #include "xe_guc_submit.h"
30 #include "xe_irq.h"
31 #include "xe_lrc.h"
32 #include "xe_memirq.h"
33 #include "xe_mmio.h"
34 #include "xe_sriov.h"
35 #include "xe_sriov_vf.h"
36 #include "xe_sriov_vf_ccs.h"
37 #include "xe_tile_sriov_vf.h"
38 #include "xe_tlb_inval.h"
39 #include "xe_uc_fw.h"
40 #include "xe_wopcm.h"
41
/* Combine two 32-bit halves into a 64-bit value: @hi lands in bits 63:32, @lo in bits 31:0. */
#define make_u64_from_u32(hi, lo) \
	((u64)(((u64)(u32)(hi) << 32) | (u64)(u32)(lo)))
43
#ifdef CONFIG_DRM_XE_DEBUG
/*
 * Debug-only stop points. Each value is a single bit that is tested against
 * gt->sriov.vf.migration.debug.resfix_stoppers.
 */
enum VF_MIGRATION_WAIT_POINTS {
	VF_MIGRATION_WAIT_RESFIX_START	= BIT(0),
	VF_MIGRATION_WAIT_FIXUPS	= BIT(1),
	VF_MIGRATION_WAIT_RESTART_JOBS	= BIT(2),
	VF_MIGRATION_WAIT_RESFIX_DONE	= BIT(3),
};

/* Sleep time between two consecutive checks of the stopper bits. */
#define VF_MIGRATION_WAIT_DELAY_IN_MS	1000

/*
 * Block the caller for as long as any bit of @wait is set in resfix_stoppers,
 * logging and sleeping VF_MIGRATION_WAIT_DELAY_IN_MS on every iteration.
 */
static void vf_post_migration_inject_wait(struct xe_gt *gt,
					  enum VF_MIGRATION_WAIT_POINTS wait)
{
	for (;;) {
		if (!(gt->sriov.vf.migration.debug.resfix_stoppers & wait))
			return;

		xe_gt_dbg(gt,
			  "*TESTING* injecting %u ms delay due to resfix_stoppers=%#x, to continue clear %#x\n",
			  VF_MIGRATION_WAIT_DELAY_IN_MS,
			  gt->sriov.vf.migration.debug.resfix_stoppers, wait);

		msleep(VF_MIGRATION_WAIT_DELAY_IN_MS);
	}
}

/* Wait at stop point VF_MIGRATION_WAIT_<_POS>; @gt is evaluated exactly once. */
#define VF_MIGRATION_INJECT_WAIT(gt, _POS) \
	vf_post_migration_inject_wait((gt), VF_MIGRATION_WAIT_##_POS)

#else
/* Non-debug builds inject no delay; only the type of @_gt is checked. */
#define VF_MIGRATION_INJECT_WAIT(_gt, ...) typecheck(struct xe_gt *, (_gt))
#endif
74
guc_action_vf_reset(struct xe_guc * guc)75 static int guc_action_vf_reset(struct xe_guc *guc)
76 {
77 u32 request[GUC_HXG_REQUEST_MSG_MIN_LEN] = {
78 FIELD_PREP(GUC_HXG_MSG_0_ORIGIN, GUC_HXG_ORIGIN_HOST) |
79 FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_REQUEST) |
80 FIELD_PREP(GUC_HXG_REQUEST_MSG_0_ACTION, GUC_ACTION_VF2GUC_VF_RESET),
81 };
82 int ret;
83
84 ret = xe_guc_mmio_send(guc, request, ARRAY_SIZE(request));
85
86 return ret > 0 ? -EPROTO : ret;
87 }
88
89 #define GUC_RESET_VF_STATE_RETRY_MAX 10
vf_reset_guc_state(struct xe_gt * gt)90 static int vf_reset_guc_state(struct xe_gt *gt)
91 {
92 unsigned int retry = GUC_RESET_VF_STATE_RETRY_MAX;
93 struct xe_guc *guc = >->uc.guc;
94 int err;
95
96 do {
97 err = guc_action_vf_reset(guc);
98 if (!err || err != -ETIMEDOUT)
99 break;
100 } while (--retry);
101
102 if (unlikely(err))
103 xe_gt_sriov_err(gt, "Failed to reset GuC state (%pe)\n", ERR_PTR(err));
104 return err;
105 }
106
107 /**
108 * xe_gt_sriov_vf_reset - Reset GuC VF internal state.
109 * @gt: the &xe_gt
110 *
111 * It requires functional `GuC MMIO based communication`_.
112 *
113 * Return: 0 on success or a negative error code on failure.
114 */
xe_gt_sriov_vf_reset(struct xe_gt * gt)115 int xe_gt_sriov_vf_reset(struct xe_gt *gt)
116 {
117 if (!xe_device_uc_enabled(gt_to_xe(gt)))
118 return -ENODEV;
119
120 return vf_reset_guc_state(gt);
121 }
122
guc_action_match_version(struct xe_guc * guc,struct xe_uc_fw_version * wanted,struct xe_uc_fw_version * found)123 static int guc_action_match_version(struct xe_guc *guc,
124 struct xe_uc_fw_version *wanted,
125 struct xe_uc_fw_version *found)
126 {
127 u32 request[VF2GUC_MATCH_VERSION_REQUEST_MSG_LEN] = {
128 FIELD_PREP(GUC_HXG_MSG_0_ORIGIN, GUC_HXG_ORIGIN_HOST) |
129 FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_REQUEST) |
130 FIELD_PREP(GUC_HXG_REQUEST_MSG_0_ACTION,
131 GUC_ACTION_VF2GUC_MATCH_VERSION),
132 FIELD_PREP(VF2GUC_MATCH_VERSION_REQUEST_MSG_1_BRANCH, wanted->branch) |
133 FIELD_PREP(VF2GUC_MATCH_VERSION_REQUEST_MSG_1_MAJOR, wanted->major) |
134 FIELD_PREP(VF2GUC_MATCH_VERSION_REQUEST_MSG_1_MINOR, wanted->minor),
135 };
136 u32 response[GUC_MAX_MMIO_MSG_LEN];
137 int ret;
138
139 BUILD_BUG_ON(VF2GUC_MATCH_VERSION_RESPONSE_MSG_LEN > GUC_MAX_MMIO_MSG_LEN);
140
141 ret = xe_guc_mmio_send_recv(guc, request, ARRAY_SIZE(request), response);
142 if (unlikely(ret < 0))
143 return ret;
144
145 if (unlikely(FIELD_GET(VF2GUC_MATCH_VERSION_RESPONSE_MSG_0_MBZ, response[0])))
146 return -EPROTO;
147
148 memset(found, 0, sizeof(struct xe_uc_fw_version));
149 found->branch = FIELD_GET(VF2GUC_MATCH_VERSION_RESPONSE_MSG_1_BRANCH, response[1]);
150 found->major = FIELD_GET(VF2GUC_MATCH_VERSION_RESPONSE_MSG_1_MAJOR, response[1]);
151 found->minor = FIELD_GET(VF2GUC_MATCH_VERSION_RESPONSE_MSG_1_MINOR, response[1]);
152 found->patch = FIELD_GET(VF2GUC_MATCH_VERSION_RESPONSE_MSG_1_PATCH, response[1]);
153
154 return 0;
155 }
156
guc_action_match_version_any(struct xe_guc * guc,struct xe_uc_fw_version * found)157 static int guc_action_match_version_any(struct xe_guc *guc,
158 struct xe_uc_fw_version *found)
159 {
160 struct xe_uc_fw_version wanted = {
161 .branch = GUC_VERSION_BRANCH_ANY,
162 .major = GUC_VERSION_MAJOR_ANY,
163 .minor = GUC_VERSION_MINOR_ANY,
164 .patch = 0
165 };
166
167 return guc_action_match_version(guc, &wanted, found);
168 }
169
vf_minimum_guc_version(struct xe_gt * gt,struct xe_uc_fw_version * ver)170 static void vf_minimum_guc_version(struct xe_gt *gt, struct xe_uc_fw_version *ver)
171 {
172 struct xe_device *xe = gt_to_xe(gt);
173
174 memset(ver, 0, sizeof(struct xe_uc_fw_version));
175
176 switch (xe->info.platform) {
177 case XE_TIGERLAKE ... XE_PVC:
178 /* 1.1 this is current baseline for Xe driver */
179 ver->branch = 0;
180 ver->major = 1;
181 ver->minor = 1;
182 break;
183 default:
184 /* 1.2 has support for the GMD_ID KLV */
185 ver->branch = 0;
186 ver->major = 1;
187 ver->minor = 2;
188 break;
189 }
190 }
191
/*
 * Fill @ver with the GuC interface version the VF asks for on its first
 * handshake.
 */
static void vf_wanted_guc_version(struct xe_gt *gt, struct xe_uc_fw_version *ver)
{
	/*
	 * For now it's the same as minimum. Called as a plain statement:
	 * ISO C does not allow "return <expression>;" in a void function.
	 */
	vf_minimum_guc_version(gt, ver);
}
197
vf_handshake_with_guc(struct xe_gt * gt)198 static int vf_handshake_with_guc(struct xe_gt *gt)
199 {
200 struct xe_uc_fw_version *guc_version = >->sriov.vf.guc_version;
201 struct xe_uc_fw_version wanted = {0};
202 struct xe_guc *guc = >->uc.guc;
203 bool old = false;
204 int err;
205
206 xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
207
208 /* select wanted version - prefer previous (if any) */
209 if (guc_version->major || guc_version->minor) {
210 wanted = *guc_version;
211 old = true;
212 } else {
213 vf_wanted_guc_version(gt, &wanted);
214 xe_gt_assert(gt, wanted.major != GUC_VERSION_MAJOR_ANY);
215
216 /* First time we handshake, so record the minimum wanted */
217 gt->sriov.vf.wanted_guc_version = wanted;
218 }
219
220 err = guc_action_match_version(guc, &wanted, guc_version);
221 if (unlikely(err))
222 goto fail;
223
224 if (old) {
225 /* we don't support interface version change */
226 if (MAKE_GUC_VER_STRUCT(*guc_version) != MAKE_GUC_VER_STRUCT(wanted)) {
227 xe_gt_sriov_err(gt, "New GuC interface version detected: %u.%u.%u.%u\n",
228 guc_version->branch, guc_version->major,
229 guc_version->minor, guc_version->patch);
230 xe_gt_sriov_info(gt, "Previously used version was: %u.%u.%u.%u\n",
231 wanted.branch, wanted.major,
232 wanted.minor, wanted.patch);
233 err = -EREMCHG;
234 goto fail;
235 } else {
236 /* version is unchanged, no need to re-verify it */
237 return 0;
238 }
239 }
240
241 /* illegal */
242 if (guc_version->major > wanted.major) {
243 err = -EPROTO;
244 goto unsupported;
245 }
246
247 /* there's no fallback on major version. */
248 if (guc_version->major != wanted.major) {
249 err = -ENOPKG;
250 goto unsupported;
251 }
252
253 /* check against minimum version supported by us */
254 vf_minimum_guc_version(gt, &wanted);
255 xe_gt_assert(gt, wanted.major != GUC_VERSION_MAJOR_ANY);
256 if (MAKE_GUC_VER_STRUCT(*guc_version) < MAKE_GUC_VER_STRUCT(wanted)) {
257 err = -ENOKEY;
258 goto unsupported;
259 }
260
261 xe_gt_sriov_dbg(gt, "using GuC interface version %u.%u.%u.%u\n",
262 guc_version->branch, guc_version->major,
263 guc_version->minor, guc_version->patch);
264
265 return 0;
266
267 unsupported:
268 xe_gt_sriov_err(gt, "Unsupported GuC version %u.%u.%u.%u (%pe)\n",
269 guc_version->branch, guc_version->major,
270 guc_version->minor, guc_version->patch,
271 ERR_PTR(err));
272 fail:
273 xe_gt_sriov_err(gt, "Unable to confirm GuC version %u.%u (%pe)\n",
274 wanted.major, wanted.minor, ERR_PTR(err));
275
276 /* try again with *any* just to query which version is supported */
277 if (!guc_action_match_version_any(guc, &wanted))
278 xe_gt_sriov_notice(gt, "GuC reports interface version %u.%u.%u.%u\n",
279 wanted.branch, wanted.major, wanted.minor, wanted.patch);
280 return err;
281 }
282
283 /**
284 * xe_gt_sriov_vf_bootstrap - Query and setup GuC ABI interface version.
285 * @gt: the &xe_gt
286 *
287 * This function is for VF use only.
288 * It requires functional `GuC MMIO based communication`_.
289 *
290 * Return: 0 on success or a negative error code on failure.
291 */
xe_gt_sriov_vf_bootstrap(struct xe_gt * gt)292 int xe_gt_sriov_vf_bootstrap(struct xe_gt *gt)
293 {
294 int err;
295
296 if (!xe_device_uc_enabled(gt_to_xe(gt)))
297 return -ENODEV;
298
299 err = vf_reset_guc_state(gt);
300 if (unlikely(err))
301 return err;
302
303 err = vf_handshake_with_guc(gt);
304 if (unlikely(err))
305 return err;
306
307 return 0;
308 }
309
310 /**
311 * xe_gt_sriov_vf_guc_versions - Minimum required and found GuC ABI versions
312 * @gt: the &xe_gt
313 * @wanted: pointer to the xe_uc_fw_version to be filled with the wanted version
314 * @found: pointer to the xe_uc_fw_version to be filled with the found version
315 *
316 * This function is for VF use only and it can only be used after successful
317 * version handshake with the GuC.
318 */
xe_gt_sriov_vf_guc_versions(struct xe_gt * gt,struct xe_uc_fw_version * wanted,struct xe_uc_fw_version * found)319 void xe_gt_sriov_vf_guc_versions(struct xe_gt *gt,
320 struct xe_uc_fw_version *wanted,
321 struct xe_uc_fw_version *found)
322 {
323 xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
324 xe_gt_assert(gt, gt->sriov.vf.guc_version.major);
325
326 if (wanted)
327 *wanted = gt->sriov.vf.wanted_guc_version;
328
329 if (found)
330 *found = gt->sriov.vf.guc_version;
331 }
332
/*
 * Send a VF2GUC_RESFIX_START request carrying @marker through
 * xe_guc_mmio_send().
 *
 * Return: 0 or the negative error code reported by the send; any positive
 * result is mapped to -EPROTO.
 */
static int guc_action_vf_resfix_start(struct xe_guc *guc, u16 marker)
{
	u32 request[GUC_HXG_REQUEST_MSG_MIN_LEN] = {
		[0] = FIELD_PREP(GUC_HXG_MSG_0_ORIGIN, GUC_HXG_ORIGIN_HOST) |
		      FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_REQUEST) |
		      FIELD_PREP(GUC_HXG_REQUEST_MSG_0_ACTION, GUC_ACTION_VF2GUC_RESFIX_START) |
		      FIELD_PREP(VF2GUC_RESFIX_START_REQUEST_MSG_0_MARKER, marker),
	};
	int ret = xe_guc_mmio_send(guc, request, ARRAY_SIZE(request));

	if (ret > 0)
		return -EPROTO;

	return ret;
}
347
/*
 * Tell the GuC that resource fixups are about to start, tagging the
 * notification with @marker.
 *
 * In CONFIG_DRM_XE_DEBUG builds the call may first stall at the RESFIX_START
 * wait point.
 *
 * Return: the result of guc_action_vf_resfix_start().
 */
static int vf_resfix_start(struct xe_gt *gt, u16 marker)
{
	struct xe_guc *guc = &gt->uc.guc;

	xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));

	VF_MIGRATION_INJECT_WAIT(gt, RESFIX_START);

	xe_gt_sriov_dbg_verbose(gt, "Sending resfix start marker %u\n", marker);

	return guc_action_vf_resfix_start(guc, marker);
}
360
/*
 * Send a VF2GUC_RESFIX_DONE request carrying @marker through
 * xe_guc_mmio_send().
 *
 * Return: 0 or the negative error code reported by the send; any positive
 * result is mapped to -EPROTO.
 */
static int guc_action_vf_resfix_done(struct xe_guc *guc, u16 marker)
{
	u32 request[GUC_HXG_REQUEST_MSG_MIN_LEN] = {
		[0] = FIELD_PREP(GUC_HXG_MSG_0_ORIGIN, GUC_HXG_ORIGIN_HOST) |
		      FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_REQUEST) |
		      FIELD_PREP(GUC_HXG_REQUEST_MSG_0_ACTION, GUC_ACTION_VF2GUC_RESFIX_DONE) |
		      FIELD_PREP(VF2GUC_RESFIX_DONE_REQUEST_MSG_0_MARKER, marker),
	};
	int ret = xe_guc_mmio_send(guc, request, ARRAY_SIZE(request));

	if (ret > 0)
		return -EPROTO;

	return ret;
}
375
/*
 * Tell the GuC that resource fixups are finished, tagging the notification
 * with @marker.
 *
 * Return: the result of guc_action_vf_resfix_done().
 */
static int vf_resfix_done(struct xe_gt *gt, u16 marker)
{
	struct xe_guc *guc = &gt->uc.guc;

	xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));

	xe_gt_sriov_dbg_verbose(gt, "Sending resfix done marker %u\n", marker);

	return guc_action_vf_resfix_done(guc, marker);
}
386
/*
 * Query the value of a single KLV identified by @key.
 *
 * @value receives up to three dwords; @value_len is the number of dwords the
 * caller expects and must match the length reported in the reply exactly.
 * A @value_len above 3 triggers a warning and only three dwords are stored.
 *
 * Return: 0 on success, -EPROTO if the MBZ field of the reply is non-zero,
 * -EOVERFLOW if the reply is longer than @value_len, -ENODATA if it is
 * shorter, or the negative error code from xe_guc_mmio_send_recv().
 */
static int guc_action_query_single_klv(struct xe_guc *guc, u32 key,
				       u32 *value, u32 value_len)
{
	u32 request[VF2GUC_QUERY_SINGLE_KLV_REQUEST_MSG_LEN] = {
		[0] = FIELD_PREP(GUC_HXG_MSG_0_ORIGIN, GUC_HXG_ORIGIN_HOST) |
		      FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_REQUEST) |
		      FIELD_PREP(GUC_HXG_REQUEST_MSG_0_ACTION,
				 GUC_ACTION_VF2GUC_QUERY_SINGLE_KLV),
		[1] = FIELD_PREP(VF2GUC_QUERY_SINGLE_KLV_REQUEST_MSG_1_KEY, key),
	};
	u32 response[GUC_MAX_MMIO_MSG_LEN];
	u32 reply_len;
	int ret;

	BUILD_BUG_ON(VF2GUC_QUERY_SINGLE_KLV_RESPONSE_MSG_MAX_LEN > GUC_MAX_MMIO_MSG_LEN);

	ret = xe_guc_mmio_send_recv(guc, request, ARRAY_SIZE(request), response);
	if (unlikely(ret < 0))
		return ret;

	if (unlikely(FIELD_GET(VF2GUC_QUERY_SINGLE_KLV_RESPONSE_MSG_0_MBZ, response[0])))
		return -EPROTO;

	reply_len = FIELD_GET(VF2GUC_QUERY_SINGLE_KLV_RESPONSE_MSG_0_LENGTH, response[0]);
	if (unlikely(reply_len > value_len))
		return -EOVERFLOW;
	if (unlikely(reply_len < value_len))
		return -ENODATA;

	/* more than three dwords can't be delivered, complain but copy three */
	if (value_len > 3)
		xe_gt_WARN_ON(guc_to_gt(guc), value_len > 3);

	/* copy from the highest dword down to the lowest */
	if (value_len >= 3)
		value[2] = FIELD_GET(VF2GUC_QUERY_SINGLE_KLV_RESPONSE_MSG_3_VALUE96, response[3]);
	if (value_len >= 2)
		value[1] = FIELD_GET(VF2GUC_QUERY_SINGLE_KLV_RESPONSE_MSG_2_VALUE64, response[2]);
	if (value_len >= 1)
		value[0] = FIELD_GET(VF2GUC_QUERY_SINGLE_KLV_RESPONSE_MSG_1_VALUE32, response[1]);

	return 0;
}
434
/*
 * Query a KLV whose value is a single u32 and store it in @value32.
 *
 * Return: the result of guc_action_query_single_klv().
 */
static int guc_action_query_single_klv32(struct xe_guc *guc, u32 key, u32 *value32)
{
	const u32 num_dwords = hxg_sizeof(u32);

	return guc_action_query_single_klv(guc, key, value32, num_dwords);
}
439
/*
 * Query a KLV whose value spans two dwords and store it in @value64, with the
 * first dword as the low half and the second as the high half.
 *
 * @value64 is left untouched on failure.
 *
 * Return: 0 on success or the error from guc_action_query_single_klv().
 */
static int guc_action_query_single_klv64(struct xe_guc *guc, u32 key, u64 *value64)
{
	u32 dwords[2];
	int err = guc_action_query_single_klv(guc, key, dwords, hxg_sizeof(dwords));

	if (unlikely(err))
		return err;

	*value64 = make_u64_from_u32(dwords[1], dwords[0]);

	return 0;
}
452
has_gmdid(struct xe_device * xe)453 static bool has_gmdid(struct xe_device *xe)
454 {
455 return GRAPHICS_VERx100(xe) >= 1270;
456 }
457
458 /**
459 * xe_gt_sriov_vf_gmdid - Query GMDID over MMIO.
460 * @gt: the &xe_gt
461 *
462 * This function is for VF use only.
463 *
464 * Return: value of GMDID KLV on success or 0 on failure.
465 */
xe_gt_sriov_vf_gmdid(struct xe_gt * gt)466 u32 xe_gt_sriov_vf_gmdid(struct xe_gt *gt)
467 {
468 const char *type = xe_gt_is_media_type(gt) ? "media" : "graphics";
469 struct xe_guc *guc = >->uc.guc;
470 u32 value;
471 int err;
472
473 xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
474 xe_gt_assert(gt, !GRAPHICS_VERx100(gt_to_xe(gt)) || has_gmdid(gt_to_xe(gt)));
475 xe_gt_assert(gt, gt->sriov.vf.guc_version.major > 1 || gt->sriov.vf.guc_version.minor >= 2);
476
477 err = guc_action_query_single_klv32(guc, GUC_KLV_GLOBAL_CFG_GMD_ID_KEY, &value);
478 if (unlikely(err)) {
479 xe_gt_sriov_err(gt, "Failed to obtain %s GMDID (%pe)\n",
480 type, ERR_PTR(err));
481 return 0;
482 }
483
484 xe_gt_sriov_dbg(gt, "%s GMDID = %#x\n", type, value);
485 return value;
486 }
487
vf_get_ggtt_info(struct xe_gt * gt)488 static int vf_get_ggtt_info(struct xe_gt *gt)
489 {
490 struct xe_tile *tile = gt_to_tile(gt);
491 struct xe_ggtt *ggtt = tile->mem.ggtt;
492 struct xe_guc *guc = >->uc.guc;
493 u64 start, size, ggtt_size;
494 s64 shift;
495 int err;
496
497 xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
498
499 guard(mutex)(&ggtt->lock);
500
501 err = guc_action_query_single_klv64(guc, GUC_KLV_VF_CFG_GGTT_START_KEY, &start);
502 if (unlikely(err))
503 return err;
504
505 err = guc_action_query_single_klv64(guc, GUC_KLV_VF_CFG_GGTT_SIZE_KEY, &size);
506 if (unlikely(err))
507 return err;
508
509 if (!size)
510 return -ENODATA;
511
512 ggtt_size = xe_tile_sriov_vf_ggtt(tile);
513 if (ggtt_size && ggtt_size != size) {
514 xe_gt_sriov_err(gt, "Unexpected GGTT reassignment: %lluK != %lluK\n",
515 size / SZ_1K, ggtt_size / SZ_1K);
516 return -EREMCHG;
517 }
518
519 xe_gt_sriov_dbg_verbose(gt, "GGTT %#llx-%#llx = %lluK\n",
520 start, start + size - 1, size / SZ_1K);
521
522 shift = start - (s64)xe_tile_sriov_vf_ggtt_base(tile);
523 xe_tile_sriov_vf_ggtt_base_store(tile, start);
524 xe_tile_sriov_vf_ggtt_store(tile, size);
525
526 if (shift && shift != start) {
527 xe_gt_sriov_info(gt, "Shifting GGTT base by %lld to 0x%016llx\n",
528 shift, start);
529 xe_tile_sriov_vf_fixup_ggtt_nodes_locked(gt_to_tile(gt), shift);
530 }
531
532 if (xe_sriov_vf_migration_supported(gt_to_xe(gt))) {
533 WRITE_ONCE(gt->sriov.vf.migration.ggtt_need_fixes, false);
534 smp_wmb(); /* Ensure above write visible before wake */
535 wake_up_all(>->sriov.vf.migration.wq);
536 }
537
538 return 0;
539 }
540
vf_get_lmem_info(struct xe_gt * gt)541 static int vf_get_lmem_info(struct xe_gt *gt)
542 {
543 struct xe_tile *tile = gt_to_tile(gt);
544 struct xe_guc *guc = >->uc.guc;
545 char size_str[10];
546 u64 size, lmem_size;
547 int err;
548
549 xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
550
551 err = guc_action_query_single_klv64(guc, GUC_KLV_VF_CFG_LMEM_SIZE_KEY, &size);
552 if (unlikely(err))
553 return err;
554
555 lmem_size = xe_tile_sriov_vf_lmem(tile);
556 if (lmem_size && lmem_size != size) {
557 xe_gt_sriov_err(gt, "Unexpected LMEM reassignment: %lluM != %lluM\n",
558 size / SZ_1M, lmem_size / SZ_1M);
559 return -EREMCHG;
560 }
561
562 string_get_size(size, 1, STRING_UNITS_2, size_str, sizeof(size_str));
563 xe_gt_sriov_dbg_verbose(gt, "LMEM %lluM %s\n", size / SZ_1M, size_str);
564
565 xe_tile_sriov_vf_lmem_store(tile, size);
566
567 return size ? 0 : -ENODATA;
568 }
569
vf_get_submission_cfg(struct xe_gt * gt)570 static int vf_get_submission_cfg(struct xe_gt *gt)
571 {
572 struct xe_gt_sriov_vf_selfconfig *config = >->sriov.vf.self_config;
573 struct xe_guc *guc = >->uc.guc;
574 u32 num_ctxs, num_dbs;
575 int err;
576
577 xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
578
579 err = guc_action_query_single_klv32(guc, GUC_KLV_VF_CFG_NUM_CONTEXTS_KEY, &num_ctxs);
580 if (unlikely(err))
581 return err;
582
583 err = guc_action_query_single_klv32(guc, GUC_KLV_VF_CFG_NUM_DOORBELLS_KEY, &num_dbs);
584 if (unlikely(err))
585 return err;
586
587 if (config->num_ctxs && config->num_ctxs != num_ctxs) {
588 xe_gt_sriov_err(gt, "Unexpected CTXs reassignment: %u != %u\n",
589 num_ctxs, config->num_ctxs);
590 return -EREMCHG;
591 }
592 if (config->num_dbs && config->num_dbs != num_dbs) {
593 xe_gt_sriov_err(gt, "Unexpected DBs reassignment: %u != %u\n",
594 num_dbs, config->num_dbs);
595 return -EREMCHG;
596 }
597
598 xe_gt_sriov_dbg_verbose(gt, "CTXs %u DBs %u\n", num_ctxs, num_dbs);
599
600 config->num_ctxs = num_ctxs;
601 config->num_dbs = num_dbs;
602
603 return config->num_ctxs ? 0 : -ENODATA;
604 }
605
vf_cache_gmdid(struct xe_gt * gt)606 static void vf_cache_gmdid(struct xe_gt *gt)
607 {
608 xe_gt_assert(gt, has_gmdid(gt_to_xe(gt)));
609 xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
610
611 gt->sriov.vf.runtime.gmdid = xe_gt_sriov_vf_gmdid(gt);
612 }
613
vf_query_sched_groups(struct xe_gt * gt)614 static int vf_query_sched_groups(struct xe_gt *gt)
615 {
616 struct xe_guc *guc = >->uc.guc;
617 struct xe_uc_fw_version guc_version;
618 u32 value = 0;
619 int err;
620
621 xe_gt_sriov_vf_guc_versions(gt, NULL, &guc_version);
622
623 if (MAKE_GUC_VER_STRUCT(guc_version) < MAKE_GUC_VER(1, 26, 0))
624 return 0;
625
626 err = guc_action_query_single_klv32(guc,
627 GUC_KLV_GLOBAL_CFG_GROUP_SCHEDULING_AVAILABLE_KEY,
628 &value);
629 if (unlikely(err)) {
630 xe_gt_sriov_err(gt, "Failed to obtain sched groups status (%pe)\n",
631 ERR_PTR(err));
632 return err;
633 }
634
635 /* valid values are 0 (disabled) and 1 (enabled) */
636 if (value > 1) {
637 xe_gt_sriov_err(gt, "Invalid sched groups status %u\n", value);
638 return -EPROTO;
639 }
640
641 xe_gt_sriov_dbg(gt, "sched groups %s\n", str_enabled_disabled(value));
642 return value;
643 }
644
vf_cache_sched_groups_status(struct xe_gt * gt)645 static int vf_cache_sched_groups_status(struct xe_gt *gt)
646 {
647 int ret;
648
649 xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
650
651 ret = vf_query_sched_groups(gt);
652 if (ret < 0)
653 return ret;
654
655 gt->sriov.vf.runtime.uses_sched_groups = ret;
656
657 return 0;
658 }
659
/**
 * xe_gt_sriov_vf_query_config - Query SR-IOV config data over MMIO.
 * @gt: the &xe_gt
 *
 * This function is for VF use only. This function may shift the GGTT and is
 * performed under GGTT lock, making this step visible to all GTs that share a
 * GGTT.
 *
 * The queries run in a fixed order (GGTT, LMEM on the main GT of discrete
 * devices, submission config, sched groups status) and stop at the first
 * failure. The GMDID is cached last, on platforms that have one.
 *
 * Return: 0 on success or a negative error code on failure.
 */
int xe_gt_sriov_vf_query_config(struct xe_gt *gt)
{
	struct xe_device *xe = gt_to_xe(gt);
	int err;

	err = vf_get_ggtt_info(gt);

	if (!err && IS_DGFX(xe) && xe_gt_is_main_type(gt))
		err = vf_get_lmem_info(gt);

	if (!err)
		err = vf_get_submission_cfg(gt);

	if (!err)
		err = vf_cache_sched_groups_status(gt);

	if (unlikely(err))
		return err;

	if (has_gmdid(xe))
		vf_cache_gmdid(gt);

	return 0;
}
698
699 /**
700 * xe_gt_sriov_vf_sched_groups_enabled() - Check if PF has enabled multiple
701 * scheduler groups
702 * @gt: the &xe_gt
703 *
704 * This function is for VF use only.
705 *
706 * Return: true if shed groups were enabled, false otherwise.
707 */
xe_gt_sriov_vf_sched_groups_enabled(struct xe_gt * gt)708 bool xe_gt_sriov_vf_sched_groups_enabled(struct xe_gt *gt)
709 {
710 xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
711 xe_gt_assert(gt, gt->sriov.vf.guc_version.major);
712
713 return gt->sriov.vf.runtime.uses_sched_groups;
714 }
715
716 /**
717 * xe_gt_sriov_vf_guc_ids - VF GuC context IDs configuration.
718 * @gt: the &xe_gt
719 *
720 * This function is for VF use only.
721 *
722 * Return: number of GuC context IDs assigned to VF.
723 */
xe_gt_sriov_vf_guc_ids(struct xe_gt * gt)724 u16 xe_gt_sriov_vf_guc_ids(struct xe_gt *gt)
725 {
726 xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
727 xe_gt_assert(gt, gt->sriov.vf.guc_version.major);
728 xe_gt_assert(gt, gt->sriov.vf.self_config.num_ctxs);
729
730 return gt->sriov.vf.self_config.num_ctxs;
731 }
732
/*
 * Send a VF2PF_HANDSHAKE relay request to the PF.
 *
 * @major and @minor are in/out: on entry they hold the version to request,
 * on success they are overwritten with the version from the PF's reply. They
 * are left untouched on failure.
 *
 * Return: 0 on success, -EPROTO if the reply has an unexpected length or a
 * non-zero MBZ field, or the negative error from xe_guc_relay_send_to_pf().
 */
static int relay_action_handshake(struct xe_gt *gt, u32 *major, u32 *minor)
{
	u32 request[VF2PF_HANDSHAKE_REQUEST_MSG_LEN] = {
		FIELD_PREP(GUC_HXG_MSG_0_ORIGIN, GUC_HXG_ORIGIN_HOST) |
		FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_REQUEST) |
		FIELD_PREP(GUC_HXG_REQUEST_MSG_0_ACTION, GUC_RELAY_ACTION_VF2PF_HANDSHAKE),
		FIELD_PREP(VF2PF_HANDSHAKE_REQUEST_MSG_1_MAJOR, *major) |
		FIELD_PREP(VF2PF_HANDSHAKE_REQUEST_MSG_1_MINOR, *minor),
	};
	u32 response[VF2PF_HANDSHAKE_RESPONSE_MSG_LEN];
	int ret;

	xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));

	ret = xe_guc_relay_send_to_pf(&gt->uc.guc.relay,
				      request, ARRAY_SIZE(request),
				      response, ARRAY_SIZE(response));
	if (unlikely(ret < 0))
		return ret;

	/* a non-negative result is compared against the fixed reply length */
	if (unlikely(ret != VF2PF_HANDSHAKE_RESPONSE_MSG_LEN))
		return -EPROTO;

	if (unlikely(FIELD_GET(VF2PF_HANDSHAKE_RESPONSE_MSG_0_MBZ, response[0])))
		return -EPROTO;

	*major = FIELD_GET(VF2PF_HANDSHAKE_RESPONSE_MSG_1_MAJOR, response[1]);
	*minor = FIELD_GET(VF2PF_HANDSHAKE_RESPONSE_MSG_1_MINOR, response[1]);

	return 0;
}
764
/* Record @major.@minor as the VF/PF ABI version in xe->sriov.vf.pf_version. */
static void vf_connect_pf(struct xe_device *xe, u16 major, u16 minor)
{
	xe_assert(xe, IS_SRIOV_VF(xe));

	/* the two fields are independent, so the store order is irrelevant */
	xe->sriov.vf.pf_version.minor = minor;
	xe->sriov.vf.pf_version.major = major;
}
772
vf_disconnect_pf(struct xe_device * xe)773 static void vf_disconnect_pf(struct xe_device *xe)
774 {
775 vf_connect_pf(xe, 0, 0);
776 }
777
vf_handshake_with_pf(struct xe_gt * gt)778 static int vf_handshake_with_pf(struct xe_gt *gt)
779 {
780 struct xe_device *xe = gt_to_xe(gt);
781 u32 major_wanted = GUC_RELAY_VERSION_LATEST_MAJOR;
782 u32 minor_wanted = GUC_RELAY_VERSION_LATEST_MINOR;
783 u32 major = major_wanted, minor = minor_wanted;
784 int err;
785
786 err = relay_action_handshake(gt, &major, &minor);
787 if (unlikely(err))
788 goto failed;
789
790 if (!major && !minor) {
791 err = -ENODATA;
792 goto failed;
793 }
794
795 xe_gt_sriov_dbg(gt, "using VF/PF ABI %u.%u\n", major, minor);
796 vf_connect_pf(xe, major, minor);
797 return 0;
798
799 failed:
800 xe_gt_sriov_err(gt, "Unable to confirm VF/PF ABI version %u.%u (%pe)\n",
801 major, minor, ERR_PTR(err));
802 vf_disconnect_pf(xe);
803 return err;
804 }
805
/**
 * xe_gt_sriov_vf_connect - Establish connection with the PF driver.
 * @gt: the &xe_gt
 *
 * This function is for VF use only.
 *
 * Return: 0 on success or a negative error code on failure.
 */
int xe_gt_sriov_vf_connect(struct xe_gt *gt)
{
	int err = vf_handshake_with_pf(gt);

	if (unlikely(err))
		xe_gt_sriov_err(gt, "Failed to get version info (%pe)\n", ERR_PTR(err));

	return err;
}
828
829 /**
830 * xe_gt_sriov_vf_default_lrcs_hwsp_rebase - Update GGTT references in HWSP of default LRCs.
831 * @gt: the &xe_gt struct instance
832 */
xe_gt_sriov_vf_default_lrcs_hwsp_rebase(struct xe_gt * gt)833 static void xe_gt_sriov_vf_default_lrcs_hwsp_rebase(struct xe_gt *gt)
834 {
835 struct xe_hw_engine *hwe;
836 enum xe_hw_engine_id id;
837
838 for_each_hw_engine(hwe, gt, id)
839 xe_default_lrc_update_memirq_regs_with_address(hwe);
840 }
841
vf_start_migration_recovery(struct xe_gt * gt)842 static void vf_start_migration_recovery(struct xe_gt *gt)
843 {
844 bool started;
845
846 xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
847
848 spin_lock(>->sriov.vf.migration.lock);
849
850 if (!gt->sriov.vf.migration.recovery_queued &&
851 !gt->sriov.vf.migration.recovery_teardown) {
852 gt->sriov.vf.migration.recovery_queued = true;
853 WRITE_ONCE(gt->sriov.vf.migration.recovery_inprogress, true);
854 WRITE_ONCE(gt->sriov.vf.migration.ggtt_need_fixes, true);
855 smp_wmb(); /* Ensure above writes visible before wake */
856
857 xe_guc_ct_wake_waiters(>->uc.guc.ct);
858
859 started = queue_work(gt->ordered_wq, >->sriov.vf.migration.worker);
860 xe_gt_sriov_info(gt, "VF migration recovery %s\n", started ?
861 "scheduled" : "already in progress");
862 }
863
864 spin_unlock(>->sriov.vf.migration.lock);
865 }
866
/**
 * xe_gt_sriov_vf_migrated_event_handler - Start a VF migration recovery,
 *   or just mark that a GuC is ready for it.
 * @gt: the &xe_gt struct instance linked to target GuC
 *
 * This function shall be called only by VF.
 *
 * If VF migration is not supported on the device, an error is logged and no
 * recovery is started.
 */
void xe_gt_sriov_vf_migrated_event_handler(struct xe_gt *gt)
{
	struct xe_device *xe = gt_to_xe(gt);

	xe_gt_assert(gt, IS_SRIOV_VF(xe));
	xe_gt_assert(gt, xe_gt_sriov_vf_recovery_pending(gt));

	if (xe_sriov_vf_migration_supported(xe)) {
		xe_gt_sriov_info(gt, "ready for recovery after migration\n");
		vf_start_migration_recovery(gt);
	} else {
		xe_gt_sriov_err(gt, "migration not supported\n");
	}
}
889
vf_is_negotiated(struct xe_gt * gt,u16 major,u16 minor)890 static bool vf_is_negotiated(struct xe_gt *gt, u16 major, u16 minor)
891 {
892 struct xe_device *xe = gt_to_xe(gt);
893
894 xe_gt_assert(gt, IS_SRIOV_VF(xe));
895
896 return major == xe->sriov.vf.pf_version.major &&
897 minor <= xe->sriov.vf.pf_version.minor;
898 }
899
vf_prepare_runtime_info(struct xe_gt * gt,unsigned int num_regs)900 static int vf_prepare_runtime_info(struct xe_gt *gt, unsigned int num_regs)
901 {
902 struct vf_runtime_reg *regs = gt->sriov.vf.runtime.regs;
903 unsigned int regs_size = round_up(num_regs, 4);
904 struct xe_device *xe = gt_to_xe(gt);
905
906 xe_gt_assert(gt, IS_SRIOV_VF(xe));
907
908 if (regs) {
909 if (num_regs <= gt->sriov.vf.runtime.regs_size) {
910 memset(regs, 0, num_regs * sizeof(*regs));
911 gt->sriov.vf.runtime.num_regs = num_regs;
912 return 0;
913 }
914
915 drmm_kfree(&xe->drm, regs);
916 gt->sriov.vf.runtime.regs = NULL;
917 gt->sriov.vf.runtime.num_regs = 0;
918 gt->sriov.vf.runtime.regs_size = 0;
919 }
920
921 regs = drmm_kcalloc(&xe->drm, regs_size, sizeof(*regs), GFP_KERNEL);
922 if (unlikely(!regs))
923 return -ENOMEM;
924
925 gt->sriov.vf.runtime.regs = regs;
926 gt->sriov.vf.runtime.num_regs = num_regs;
927 gt->sriov.vf.runtime.regs_size = regs_size;
928 return 0;
929 }
930
vf_query_runtime_info(struct xe_gt * gt)931 static int vf_query_runtime_info(struct xe_gt *gt)
932 {
933 u32 request[VF2PF_QUERY_RUNTIME_REQUEST_MSG_LEN];
934 u32 response[VF2PF_QUERY_RUNTIME_RESPONSE_MSG_MIN_LEN + 32]; /* up to 16 regs */
935 u32 limit = (ARRAY_SIZE(response) - VF2PF_QUERY_RUNTIME_RESPONSE_MSG_MIN_LEN) / 2;
936 u32 count, remaining, num, i;
937 u32 start = 0;
938 int ret;
939
940 xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
941 xe_gt_assert(gt, limit);
942
943 /* this is part of the 1.0 PF/VF ABI */
944 if (!vf_is_negotiated(gt, 1, 0))
945 return -ENOPKG;
946
947 request[0] = FIELD_PREP(GUC_HXG_MSG_0_ORIGIN, GUC_HXG_ORIGIN_HOST) |
948 FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_REQUEST) |
949 FIELD_PREP(GUC_HXG_REQUEST_MSG_0_ACTION,
950 GUC_RELAY_ACTION_VF2PF_QUERY_RUNTIME) |
951 FIELD_PREP(VF2PF_QUERY_RUNTIME_REQUEST_MSG_0_LIMIT, limit);
952
953 repeat:
954 request[1] = FIELD_PREP(VF2PF_QUERY_RUNTIME_REQUEST_MSG_1_START, start);
955 ret = xe_guc_relay_send_to_pf(>->uc.guc.relay,
956 request, ARRAY_SIZE(request),
957 response, ARRAY_SIZE(response));
958 if (unlikely(ret < 0))
959 goto failed;
960
961 if (unlikely(ret < VF2PF_QUERY_RUNTIME_RESPONSE_MSG_MIN_LEN)) {
962 ret = -EPROTO;
963 goto failed;
964 }
965 if (unlikely((ret - VF2PF_QUERY_RUNTIME_RESPONSE_MSG_MIN_LEN) % 2)) {
966 ret = -EPROTO;
967 goto failed;
968 }
969
970 num = (ret - VF2PF_QUERY_RUNTIME_RESPONSE_MSG_MIN_LEN) / 2;
971 count = FIELD_GET(VF2PF_QUERY_RUNTIME_RESPONSE_MSG_0_COUNT, response[0]);
972 remaining = FIELD_GET(VF2PF_QUERY_RUNTIME_RESPONSE_MSG_1_REMAINING, response[1]);
973
974 xe_gt_sriov_dbg_verbose(gt, "count=%u num=%u ret=%d start=%u remaining=%u\n",
975 count, num, ret, start, remaining);
976
977 if (unlikely(count != num)) {
978 ret = -EPROTO;
979 goto failed;
980 }
981
982 if (start == 0) {
983 ret = vf_prepare_runtime_info(gt, num + remaining);
984 if (unlikely(ret < 0))
985 goto failed;
986 } else if (unlikely(start + num > gt->sriov.vf.runtime.num_regs)) {
987 ret = -EPROTO;
988 goto failed;
989 }
990
991 for (i = 0; i < num; ++i) {
992 struct vf_runtime_reg *reg = >->sriov.vf.runtime.regs[start + i];
993
994 reg->offset = response[VF2PF_QUERY_RUNTIME_RESPONSE_MSG_MIN_LEN + 2 * i];
995 reg->value = response[VF2PF_QUERY_RUNTIME_RESPONSE_MSG_MIN_LEN + 2 * i + 1];
996 }
997
998 if (remaining) {
999 start += num;
1000 goto repeat;
1001 }
1002
1003 return 0;
1004
1005 failed:
1006 vf_prepare_runtime_info(gt, 0);
1007 return ret;
1008 }
1009
vf_show_runtime_info(struct xe_gt * gt)1010 static void vf_show_runtime_info(struct xe_gt *gt)
1011 {
1012 struct vf_runtime_reg *vf_regs = gt->sriov.vf.runtime.regs;
1013 unsigned int size = gt->sriov.vf.runtime.num_regs;
1014
1015 xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
1016
1017 for (; size--; vf_regs++)
1018 xe_gt_sriov_dbg(gt, "runtime(%#x) = %#x\n",
1019 vf_regs->offset, vf_regs->value);
1020 }
1021
1022 /**
1023 * xe_gt_sriov_vf_query_runtime - Query SR-IOV runtime data.
1024 * @gt: the &xe_gt
1025 *
1026 * This function is for VF use only.
1027 *
1028 * Return: 0 on success or a negative error code on failure.
1029 */
xe_gt_sriov_vf_query_runtime(struct xe_gt * gt)1030 int xe_gt_sriov_vf_query_runtime(struct xe_gt *gt)
1031 {
1032 int err;
1033
1034 err = vf_query_runtime_info(gt);
1035 if (unlikely(err))
1036 goto failed;
1037
1038 if (IS_ENABLED(CONFIG_DRM_XE_DEBUG))
1039 vf_show_runtime_info(gt);
1040
1041 return 0;
1042
1043 failed:
1044 xe_gt_sriov_err(gt, "Failed to get runtime info (%pe)\n",
1045 ERR_PTR(err));
1046 return err;
1047 }
1048
vf_runtime_reg_cmp(const void * a,const void * b)1049 static int vf_runtime_reg_cmp(const void *a, const void *b)
1050 {
1051 const struct vf_runtime_reg *ra = a;
1052 const struct vf_runtime_reg *rb = b;
1053
1054 return (int)ra->offset - (int)rb->offset;
1055 }
1056
/*
 * Binary-search the cached runtime registers for an entry whose offset
 * equals @addr. Returns whatever bsearch() yields: the matching entry, or
 * NULL when there is none.
 */
static struct vf_runtime_reg *vf_lookup_reg(struct xe_gt *gt, u32 addr)
{
	const struct vf_runtime_reg needle = { .offset = addr };
	struct vf_runtime_reg *table = gt->sriov.vf.runtime.regs;
	size_t entries = gt->sriov.vf.runtime.num_regs;

	xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));

	return bsearch(&needle, table, entries, sizeof(needle),
		       vf_runtime_reg_cmp);
}
1067
1068 /**
1069 * xe_gt_sriov_vf_read32 - Get a register value from the runtime data.
1070 * @gt: the &xe_gt
1071 * @reg: the register to read
1072 *
1073 * This function is for VF use only.
1074 * This function shall be called after VF has connected to PF.
1075 * This function is dedicated for registers that VFs can't read directly.
1076 *
1077 * Return: register value obtained from the PF or 0 if not found.
1078 */
xe_gt_sriov_vf_read32(struct xe_gt * gt,struct xe_reg reg)1079 u32 xe_gt_sriov_vf_read32(struct xe_gt *gt, struct xe_reg reg)
1080 {
1081 u32 addr = xe_mmio_adjusted_addr(>->mmio, reg.addr);
1082 struct vf_runtime_reg *rr;
1083
1084 xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
1085 xe_gt_assert(gt, !reg.vf);
1086
1087 if (reg.addr == GMD_ID.addr) {
1088 xe_gt_sriov_dbg_verbose(gt, "gmdid(%#x) = %#x\n",
1089 addr, gt->sriov.vf.runtime.gmdid);
1090 return gt->sriov.vf.runtime.gmdid;
1091 }
1092
1093 rr = vf_lookup_reg(gt, addr);
1094 if (!rr) {
1095 xe_gt_WARN(gt, IS_ENABLED(CONFIG_DRM_XE_DEBUG),
1096 "VF is trying to read an inaccessible register %#x+%#x\n",
1097 reg.addr, addr - reg.addr);
1098 return 0;
1099 }
1100
1101 xe_gt_sriov_dbg_verbose(gt, "runtime[%#x] = %#x\n", addr, rr->value);
1102 return rr->value;
1103 }
1104
1105 /**
1106 * xe_gt_sriov_vf_write32 - Handle a write to an inaccessible register.
1107 * @gt: the &xe_gt
1108 * @reg: the register to write
1109 * @val: value to write
1110 *
1111 * This function is for VF use only.
1112 * Currently it will trigger a WARN if running on debug build.
1113 */
xe_gt_sriov_vf_write32(struct xe_gt * gt,struct xe_reg reg,u32 val)1114 void xe_gt_sriov_vf_write32(struct xe_gt *gt, struct xe_reg reg, u32 val)
1115 {
1116 u32 addr = xe_mmio_adjusted_addr(>->mmio, reg.addr);
1117
1118 xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
1119 xe_gt_assert(gt, !reg.vf);
1120
1121 /*
1122 * In the future, we may want to handle selected writes to inaccessible
1123 * registers in some custom way, but for now let's just log a warning
1124 * about such attempt, as likely we might be doing something wrong.
1125 */
1126 xe_gt_WARN(gt, IS_ENABLED(CONFIG_DRM_XE_DEBUG),
1127 "VF is trying to write %#x to an inaccessible register %#x+%#x\n",
1128 val, reg.addr, addr - reg.addr);
1129 }
1130
1131 /**
1132 * xe_gt_sriov_vf_print_config - Print VF self config.
1133 * @gt: the &xe_gt
1134 * @p: the &drm_printer
1135 *
1136 * This function is for VF use only.
1137 */
xe_gt_sriov_vf_print_config(struct xe_gt * gt,struct drm_printer * p)1138 void xe_gt_sriov_vf_print_config(struct xe_gt *gt, struct drm_printer *p)
1139 {
1140 struct xe_gt_sriov_vf_selfconfig *config = >->sriov.vf.self_config;
1141 struct xe_device *xe = gt_to_xe(gt);
1142 u64 lmem_size;
1143 char buf[10];
1144
1145 xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
1146
1147 if (xe_gt_is_main_type(gt)) {
1148 u64 ggtt_size = xe_tile_sriov_vf_ggtt(gt_to_tile(gt));
1149 u64 ggtt_base = xe_tile_sriov_vf_ggtt_base(gt_to_tile(gt));
1150
1151 drm_printf(p, "GGTT range:\t%#llx-%#llx\n",
1152 ggtt_base, ggtt_base + ggtt_size - 1);
1153 string_get_size(ggtt_size, 1, STRING_UNITS_2, buf, sizeof(buf));
1154 drm_printf(p, "GGTT size:\t%llu (%s)\n", ggtt_size, buf);
1155
1156 if (IS_DGFX(xe)) {
1157 lmem_size = xe_tile_sriov_vf_lmem(gt_to_tile(gt));
1158 string_get_size(lmem_size, 1, STRING_UNITS_2, buf, sizeof(buf));
1159 drm_printf(p, "LMEM size:\t%llu (%s)\n", lmem_size, buf);
1160 }
1161 }
1162
1163 drm_printf(p, "GuC contexts:\t%u\n", config->num_ctxs);
1164 drm_printf(p, "GuC doorbells:\t%u\n", config->num_dbs);
1165 }
1166
1167 /**
1168 * xe_gt_sriov_vf_print_runtime - Print VF's runtime regs received from PF.
1169 * @gt: the &xe_gt
1170 * @p: the &drm_printer
1171 *
1172 * This function is for VF use only.
1173 */
xe_gt_sriov_vf_print_runtime(struct xe_gt * gt,struct drm_printer * p)1174 void xe_gt_sriov_vf_print_runtime(struct xe_gt *gt, struct drm_printer *p)
1175 {
1176 struct vf_runtime_reg *vf_regs = gt->sriov.vf.runtime.regs;
1177 unsigned int size = gt->sriov.vf.runtime.num_regs;
1178
1179 xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
1180
1181 for (; size--; vf_regs++)
1182 drm_printf(p, "%#x = %#x\n", vf_regs->offset, vf_regs->value);
1183 }
1184
1185 /**
1186 * xe_gt_sriov_vf_print_version - Print VF ABI versions.
1187 * @gt: the &xe_gt
1188 * @p: the &drm_printer
1189 *
1190 * This function is for VF use only.
1191 */
xe_gt_sriov_vf_print_version(struct xe_gt * gt,struct drm_printer * p)1192 void xe_gt_sriov_vf_print_version(struct xe_gt *gt, struct drm_printer *p)
1193 {
1194 struct xe_device *xe = gt_to_xe(gt);
1195 struct xe_uc_fw_version *guc_version = >->sriov.vf.guc_version;
1196 struct xe_uc_fw_version *wanted = >->sriov.vf.wanted_guc_version;
1197 struct xe_sriov_vf_relay_version *pf_version = &xe->sriov.vf.pf_version;
1198 struct xe_uc_fw_version ver;
1199
1200 xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
1201
1202 drm_printf(p, "GuC ABI:\n");
1203
1204 vf_minimum_guc_version(gt, &ver);
1205 drm_printf(p, "\tbase:\t%u.%u.%u.*\n", ver.branch, ver.major, ver.minor);
1206
1207 drm_printf(p, "\twanted:\t%u.%u.%u.*\n",
1208 wanted->branch, wanted->major, wanted->minor);
1209
1210 drm_printf(p, "\thandshake:\t%u.%u.%u.%u\n",
1211 guc_version->branch, guc_version->major,
1212 guc_version->minor, guc_version->patch);
1213
1214 drm_printf(p, "PF ABI:\n");
1215
1216 drm_printf(p, "\tbase:\t%u.%u\n",
1217 GUC_RELAY_VERSION_BASE_MAJOR, GUC_RELAY_VERSION_BASE_MINOR);
1218 drm_printf(p, "\twanted:\t%u.%u\n",
1219 GUC_RELAY_VERSION_LATEST_MAJOR, GUC_RELAY_VERSION_LATEST_MINOR);
1220 drm_printf(p, "\thandshake:\t%u.%u\n",
1221 pf_version->major, pf_version->minor);
1222 }
1223
vf_post_migration_shutdown(struct xe_gt * gt)1224 static bool vf_post_migration_shutdown(struct xe_gt *gt)
1225 {
1226 struct xe_device *xe = gt_to_xe(gt);
1227
1228 /*
1229 * On platforms where CCS must be restored by the primary GT, the media
1230 * GT's VF post-migration recovery must run afterward. Detect this case
1231 * and re-queue the media GT's restore work item if necessary.
1232 */
1233 if (xe->info.needs_shared_vf_gt_wq && xe_gt_is_media_type(gt)) {
1234 struct xe_gt *primary_gt = gt_to_tile(gt)->primary_gt;
1235
1236 if (xe_gt_sriov_vf_recovery_pending(primary_gt))
1237 return true;
1238 }
1239
1240 xe_guc_ct_flush_and_stop(>->uc.guc.ct);
1241 xe_guc_submit_pause_vf(>->uc.guc);
1242 xe_tlb_inval_reset(>->tlb_inval);
1243
1244 return false;
1245 }
1246
post_migration_scratch_size(struct xe_device * xe)1247 static size_t post_migration_scratch_size(struct xe_device *xe)
1248 {
1249 return max(xe_lrc_reg_size(xe), LRC_WA_BB_SIZE);
1250 }
1251
vf_post_migration_fixups(struct xe_gt * gt)1252 static int vf_post_migration_fixups(struct xe_gt *gt)
1253 {
1254 void *buf = gt->sriov.vf.migration.scratch;
1255 int err;
1256
1257 VF_MIGRATION_INJECT_WAIT(gt, FIXUPS);
1258
1259 /* xe_gt_sriov_vf_query_config will fixup the GGTT addresses */
1260 err = xe_gt_sriov_vf_query_config(gt);
1261 if (err)
1262 return err;
1263
1264 if (xe_gt_is_main_type(gt))
1265 xe_sriov_vf_ccs_rebase(gt_to_xe(gt));
1266
1267 xe_gt_sriov_vf_default_lrcs_hwsp_rebase(gt);
1268 err = xe_guc_contexts_hwsp_rebase(>->uc.guc, buf);
1269 if (err)
1270 return err;
1271
1272 return 0;
1273 }
1274
vf_post_migration_rearm(struct xe_gt * gt)1275 static void vf_post_migration_rearm(struct xe_gt *gt)
1276 {
1277 VF_MIGRATION_INJECT_WAIT(gt, RESTART_JOBS);
1278
1279 /*
1280 * Make sure interrupts on the new HW are properly set. The GuC IRQ
1281 * must be working at this point, since the recovery did started,
1282 * but the rest was not enabled using the procedure from spec.
1283 */
1284 xe_irq_resume(gt_to_xe(gt));
1285
1286 xe_guc_ct_restart(>->uc.guc.ct);
1287 xe_guc_submit_unpause_prepare_vf(>->uc.guc);
1288 }
1289
vf_post_migration_kickstart(struct xe_gt * gt)1290 static void vf_post_migration_kickstart(struct xe_gt *gt)
1291 {
1292 xe_guc_submit_unpause_vf(>->uc.guc);
1293 }
1294
vf_post_migration_abort(struct xe_gt * gt)1295 static void vf_post_migration_abort(struct xe_gt *gt)
1296 {
1297 spin_lock_irq(>->sriov.vf.migration.lock);
1298 WRITE_ONCE(gt->sriov.vf.migration.recovery_inprogress, false);
1299 WRITE_ONCE(gt->sriov.vf.migration.ggtt_need_fixes, false);
1300 spin_unlock_irq(>->sriov.vf.migration.lock);
1301
1302 wake_up_all(>->sriov.vf.migration.wq);
1303
1304 xe_guc_submit_pause_abort(>->uc.guc);
1305 }
1306
/*
 * Finish the fixups phase: unless another recovery has been queued in the
 * meantime, clear the in-progress flag, then run the RESFIX_DONE step with
 * @marker.
 *
 * Returns the result of vf_resfix_done().
 */
static int vf_post_migration_resfix_done(struct xe_gt *gt, u16 marker)
{
	VF_MIGRATION_INJECT_WAIT(gt, RESFIX_DONE);

	scoped_guard(spinlock_irq, &gt->sriov.vf.migration.lock) {
		/* keep the flag set when a new recovery is already queued */
		if (!gt->sriov.vf.migration.recovery_queued)
			WRITE_ONCE(gt->sriov.vf.migration.recovery_inprogress, false);
		else
			xe_gt_sriov_dbg(gt, "another recovery imminent\n");
	}

	return vf_resfix_done(gt, marker);
}
1320
/*
 * Run the RESFIX_START step with @marker and then, regardless of its result,
 * clear the recovery-queued flag under the migration lock.
 *
 * Returns the result of vf_resfix_start().
 */
static int vf_post_migration_resfix_start(struct xe_gt *gt, u16 marker)
{
	int ret = vf_resfix_start(gt, marker);

	spin_lock_irq(&gt->sriov.vf.migration.lock);
	gt->sriov.vf.migration.recovery_queued = false;
	spin_unlock_irq(&gt->sriov.vf.migration.lock);

	return ret;
}
1332
vf_post_migration_next_resfix_marker(struct xe_gt * gt)1333 static u16 vf_post_migration_next_resfix_marker(struct xe_gt *gt)
1334 {
1335 xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
1336
1337 BUILD_BUG_ON(1 + ((typeof(gt->sriov.vf.migration.resfix_marker))~0) >
1338 FIELD_MAX(VF2GUC_RESFIX_START_REQUEST_MSG_0_MARKER));
1339
1340 /* add 1 to avoid zero-marker */
1341 return 1 + gt->sriov.vf.migration.resfix_marker++;
1342 }
1343
vf_post_migration_recovery(struct xe_gt * gt)1344 static void vf_post_migration_recovery(struct xe_gt *gt)
1345 {
1346 struct xe_device *xe = gt_to_xe(gt);
1347 u16 marker;
1348 bool retry;
1349 int err;
1350
1351 xe_gt_sriov_dbg(gt, "migration recovery in progress\n");
1352
1353 retry = vf_post_migration_shutdown(gt);
1354 if (retry)
1355 goto queue;
1356
1357 if (!xe_sriov_vf_migration_supported(xe)) {
1358 xe_gt_sriov_err(gt, "migration is not supported\n");
1359 err = -ENOTRECOVERABLE;
1360 goto fail;
1361 }
1362
1363 marker = vf_post_migration_next_resfix_marker(gt);
1364
1365 err = vf_post_migration_resfix_start(gt, marker);
1366 if (unlikely(err)) {
1367 xe_gt_sriov_err(gt, "Recovery failed at GuC RESFIX_START step (%pe)\n",
1368 ERR_PTR(err));
1369 goto fail;
1370 }
1371
1372 err = vf_post_migration_fixups(gt);
1373 if (err)
1374 goto fail;
1375
1376 vf_post_migration_rearm(gt);
1377
1378 err = vf_post_migration_resfix_done(gt, marker);
1379 if (err) {
1380 if (err == -EREMCHG)
1381 goto queue;
1382
1383 xe_gt_sriov_err(gt, "Recovery failed at GuC RESFIX_DONE step (%pe)\n",
1384 ERR_PTR(err));
1385 goto fail;
1386 }
1387
1388 vf_post_migration_kickstart(gt);
1389
1390 xe_gt_sriov_notice(gt, "migration recovery ended\n");
1391 return;
1392 fail:
1393 vf_post_migration_abort(gt);
1394 xe_gt_sriov_err(gt, "migration recovery failed (%pe)\n", ERR_PTR(err));
1395 xe_device_declare_wedged(xe);
1396 return;
1397
1398 queue:
1399 xe_gt_sriov_info(gt, "Re-queuing migration recovery\n");
1400 queue_work(gt->ordered_wq, >->sriov.vf.migration.worker);
1401 }
1402
migration_worker_func(struct work_struct * w)1403 static void migration_worker_func(struct work_struct *w)
1404 {
1405 struct xe_gt *gt = container_of(w, struct xe_gt,
1406 sriov.vf.migration.worker);
1407
1408 vf_post_migration_recovery(gt);
1409 }
1410
vf_migration_fini(void * arg)1411 static void vf_migration_fini(void *arg)
1412 {
1413 struct xe_gt *gt = arg;
1414
1415 spin_lock_irq(>->sriov.vf.migration.lock);
1416 gt->sriov.vf.migration.recovery_teardown = true;
1417 spin_unlock_irq(>->sriov.vf.migration.lock);
1418
1419 cancel_work_sync(>->sriov.vf.migration.worker);
1420 }
1421
1422 /**
1423 * xe_gt_sriov_vf_init_early() - GT VF init early
1424 * @gt: the &xe_gt
1425 *
1426 * Return 0 on success, errno on failure
1427 */
xe_gt_sriov_vf_init_early(struct xe_gt * gt)1428 int xe_gt_sriov_vf_init_early(struct xe_gt *gt)
1429 {
1430 void *buf;
1431
1432 if (!xe_sriov_vf_migration_supported(gt_to_xe(gt)))
1433 return 0;
1434
1435 buf = drmm_kmalloc(>_to_xe(gt)->drm,
1436 post_migration_scratch_size(gt_to_xe(gt)),
1437 GFP_KERNEL);
1438 if (!buf)
1439 return -ENOMEM;
1440
1441 gt->sriov.vf.migration.scratch = buf;
1442 spin_lock_init(>->sriov.vf.migration.lock);
1443 INIT_WORK(>->sriov.vf.migration.worker, migration_worker_func);
1444 init_waitqueue_head(>->sriov.vf.migration.wq);
1445
1446 return 0;
1447 }
1448
1449 /**
1450 * xe_gt_sriov_vf_init() - GT VF init
1451 * @gt: the &xe_gt
1452 *
1453 * Return 0 on success, errno on failure
1454 */
xe_gt_sriov_vf_init(struct xe_gt * gt)1455 int xe_gt_sriov_vf_init(struct xe_gt *gt)
1456 {
1457 if (!xe_sriov_vf_migration_supported(gt_to_xe(gt)))
1458 return 0;
1459
1460 /*
1461 * We want to tear down the VF post-migration early during driver
1462 * unload; therefore, we add this finalization action later during
1463 * driver load.
1464 */
1465 return devm_add_action_or_reset(gt_to_xe(gt)->drm.dev,
1466 vf_migration_fini, gt);
1467 }
1468
1469 /**
1470 * xe_gt_sriov_vf_recovery_pending() - VF post migration recovery pending
1471 * @gt: the &xe_gt
1472 *
1473 * The return value of this function must be immediately visible upon vCPU
1474 * unhalt and must persist until RESFIX_DONE is issued. This guarantee is
1475 * currently implemented only for platforms that support memirq. If non-memirq
1476 * platforms begin to support VF migration, this function will need to be
1477 * updated accordingly.
1478 *
1479 * Return: True if VF post migration recovery is pending, False otherwise
1480 */
xe_gt_sriov_vf_recovery_pending(struct xe_gt * gt)1481 bool xe_gt_sriov_vf_recovery_pending(struct xe_gt *gt)
1482 {
1483 struct xe_memirq *memirq = >_to_tile(gt)->memirq;
1484
1485 xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
1486
1487 /* early detection until recovery starts */
1488 if (xe_device_uses_memirq(gt_to_xe(gt)) &&
1489 xe_memirq_guc_sw_int_0_irq_pending(memirq, >->uc.guc))
1490 return true;
1491
1492 return READ_ONCE(gt->sriov.vf.migration.recovery_inprogress);
1493 }
1494
vf_valid_ggtt(struct xe_gt * gt)1495 static bool vf_valid_ggtt(struct xe_gt *gt)
1496 {
1497 struct xe_memirq *memirq = >_to_tile(gt)->memirq;
1498 bool irq_pending = xe_device_uses_memirq(gt_to_xe(gt)) &&
1499 xe_memirq_guc_sw_int_0_irq_pending(memirq, >->uc.guc);
1500
1501 xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));
1502
1503 if (irq_pending || READ_ONCE(gt->sriov.vf.migration.ggtt_need_fixes))
1504 return false;
1505
1506 return true;
1507 }
1508
1509 /**
1510 * xe_gt_sriov_vf_wait_valid_ggtt() - VF wait for valid GGTT addresses
1511 * @gt: the &xe_gt
1512 */
xe_gt_sriov_vf_wait_valid_ggtt(struct xe_gt * gt)1513 void xe_gt_sriov_vf_wait_valid_ggtt(struct xe_gt *gt)
1514 {
1515 int ret;
1516
1517 if (!IS_SRIOV_VF(gt_to_xe(gt)) ||
1518 !xe_sriov_vf_migration_supported(gt_to_xe(gt)))
1519 return;
1520
1521 ret = wait_event_interruptible_timeout(gt->sriov.vf.migration.wq,
1522 vf_valid_ggtt(gt),
1523 HZ * 5);
1524 xe_gt_WARN_ON(gt, !ret);
1525 }
1526