1 // SPDX-License-Identifier: MIT
2 /*
3 * Copyright © 2022 Intel Corporation
4 */
5
6 #include "xe_guc_ct.h"
7
8 #include <linux/bitfield.h>
9 #include <linux/circ_buf.h>
10 #include <linux/delay.h>
11 #include <linux/fault-inject.h>
12
13 #include <kunit/static_stub.h>
14
15 #include <drm/drm_managed.h>
16
17 #include "abi/guc_actions_abi.h"
18 #include "abi/guc_actions_sriov_abi.h"
19 #include "abi/guc_klvs_abi.h"
20 #include "xe_bo.h"
21 #include "xe_devcoredump.h"
22 #include "xe_device.h"
23 #include "xe_gt.h"
24 #include "xe_gt_pagefault.h"
25 #include "xe_gt_printk.h"
26 #include "xe_gt_sriov_pf_control.h"
27 #include "xe_gt_sriov_pf_monitor.h"
28 #include "xe_gt_sriov_printk.h"
29 #include "xe_guc.h"
30 #include "xe_guc_log.h"
31 #include "xe_guc_relay.h"
32 #include "xe_guc_submit.h"
33 #include "xe_guc_tlb_inval.h"
34 #include "xe_map.h"
35 #include "xe_pm.h"
36 #include "xe_trace_guc.h"
37
38 static void receive_g2h(struct xe_guc_ct *ct);
39 static void g2h_worker_func(struct work_struct *w);
40 static void safe_mode_worker_func(struct work_struct *w);
41 static void ct_exit_safe_mode(struct xe_guc_ct *ct);
42 static void guc_ct_change_state(struct xe_guc_ct *ct,
43 enum xe_guc_ct_state state);
44
45 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
46 enum {
47 /* Internal states, not error conditions */
48 CT_DEAD_STATE_REARM, /* 0x0001 */
49 CT_DEAD_STATE_CAPTURE, /* 0x0002 */
50
51 /* Error conditions */
52 CT_DEAD_SETUP, /* 0x0004 */
53 CT_DEAD_H2G_WRITE, /* 0x0008 */
54 CT_DEAD_H2G_HAS_ROOM, /* 0x0010 */
55 CT_DEAD_G2H_READ, /* 0x0020 */
56 CT_DEAD_G2H_RECV, /* 0x0040 */
57 CT_DEAD_G2H_RELEASE, /* 0x0080 */
58 CT_DEAD_DEADLOCK, /* 0x0100 */
59 CT_DEAD_PROCESS_FAILED, /* 0x0200 */
60 CT_DEAD_FAST_G2H, /* 0x0400 */
61 CT_DEAD_PARSE_G2H_RESPONSE, /* 0x0800 */
62 CT_DEAD_PARSE_G2H_UNKNOWN, /* 0x1000 */
63 CT_DEAD_PARSE_G2H_ORIGIN, /* 0x2000 */
64 CT_DEAD_PARSE_G2H_TYPE, /* 0x4000 */
65 CT_DEAD_CRASH, /* 0x8000 */
66 };
67
68 static void ct_dead_worker_func(struct work_struct *w);
69 static void ct_dead_capture(struct xe_guc_ct *ct, struct guc_ctb *ctb, u32 reason_code);
70
71 #define CT_DEAD(ct, ctb, reason_code) ct_dead_capture((ct), (ctb), CT_DEAD_##reason_code)
72 #else
73 #define CT_DEAD(ct, ctb, reason) \
74 do { \
75 struct guc_ctb *_ctb = (ctb); \
76 if (_ctb) \
77 _ctb->info.broken = true; \
78 } while (0)
79 #endif
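
/*
 * Example: on CONFIG_DRM_XE_DEBUG builds CT_DEAD(ct, &ct->ctbs.h2g, H2G_WRITE)
 * expands to ct_dead_capture(ct, &ct->ctbs.h2g, CT_DEAD_H2G_WRITE), i.e. the
 * 0x0008 reason listed above; on non-debug builds it simply marks the given
 * CTB as broken.
 */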
80
81 /* Used when a CT send wants to block and / or receive data */
82 struct g2h_fence {
83 u32 *response_buffer;
84 u32 seqno;
85 u32 response_data;
86 u16 response_len;
87 u16 error;
88 u16 hint;
89 u16 reason;
90 bool cancel;
91 bool retry;
92 bool fail;
93 bool done;
94 };
95
96 #define make_u64(hi, lo) ((u64)((u64)(u32)(hi) << 32 | (u32)(lo)))
97
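/*
 * A g2h_fence starts out with seqno == ~0, which g2h_fence_needs_alloc() uses
 * as a "not yet in ct->fence_lookup" sentinel until a real seqno is assigned
 * in __guc_ct_send_locked().
 */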
static void g2h_fence_init(struct g2h_fence *g2h_fence, u32 *response_buffer)
99 {
100 memset(g2h_fence, 0, sizeof(*g2h_fence));
101 g2h_fence->response_buffer = response_buffer;
102 g2h_fence->seqno = ~0x0;
103 }
104
static void g2h_fence_cancel(struct g2h_fence *g2h_fence)
106 {
107 g2h_fence->cancel = true;
108 g2h_fence->fail = true;
109 g2h_fence->done = true;
110 }
111
static bool g2h_fence_needs_alloc(struct g2h_fence *g2h_fence)
113 {
114 return g2h_fence->seqno == ~0x0;
115 }
116
117 static struct xe_guc *
ct_to_guc(struct xe_guc_ct *ct)
119 {
120 return container_of(ct, struct xe_guc, ct);
121 }
122
123 static struct xe_gt *
ct_to_gt(struct xe_guc_ct *ct)
125 {
126 return container_of(ct, struct xe_gt, uc.guc.ct);
127 }
128
129 static struct xe_device *
ct_to_xe(struct xe_guc_ct *ct)
131 {
132 return gt_to_xe(ct_to_gt(ct));
133 }
134
135 /**
136 * DOC: GuC CTB Blob
137 *
138 * We allocate single blob to hold both CTB descriptors and buffers:
139 *
140 * +--------+-----------------------------------------------+------+
141 * | offset | contents | size |
142 * +========+===============================================+======+
143 * | 0x0000 | H2G CTB Descriptor (send) | |
144 * +--------+-----------------------------------------------+ 4K |
145 * | 0x0800 | G2H CTB Descriptor (g2h) | |
146 * +--------+-----------------------------------------------+------+
147 * | 0x1000 | H2G CT Buffer (send) | n*4K |
148 * | | | |
149 * +--------+-----------------------------------------------+------+
150 * | 0x1000 | G2H CT Buffer (g2h) | m*4K |
151 * | + n*4K | | |
152 * +--------+-----------------------------------------------+------+
153 *
 * Size of each ``CT Buffer`` must be a multiple of 4K.
 * We don't expect too many messages in flight at any time, unless we are
 * using GuC submission. In that case each request requires a minimum of
 * 2 dwords, which gives us a maximum of 256 queued requests. Hopefully this
 * is enough space to avoid backpressure on the driver. We increase the size
159 * of the receive buffer (relative to the send) to ensure a G2H response
160 * CTB has a landing spot.
161 *
162 * In addition to submissions, the G2H buffer needs to be able to hold
163 * enough space for recoverable page fault notifications. The number of
164 * page faults is interrupt driven and can be as much as the number of
165 * compute resources available. However, most of the actual work for these
166 * is in a separate page fault worker thread. Therefore we only need to
167 * make sure the queue has enough space to handle all of the submissions
168 * and responses and an extra buffer for incoming page faults.
169 */
170
171 #define CTB_DESC_SIZE ALIGN(sizeof(struct guc_ct_buffer_desc), SZ_2K)
172 #define CTB_H2G_BUFFER_SIZE (SZ_4K)
173 #define CTB_G2H_BUFFER_SIZE (SZ_128K)
174 #define G2H_ROOM_BUFFER_SIZE (CTB_G2H_BUFFER_SIZE / 2)
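
/*
 * For illustration, with the current values the single blob allocated by
 * guc_ct_size() works out to (assuming the descriptor struct fits within its
 * 2K-aligned slot, which it does today):
 *
 *   0x00000: H2G CTB descriptor
 *   0x00800: G2H CTB descriptor
 *   0x01000: H2G CT buffer (4K)
 *   0x02000: G2H CT buffer (128K, half of which is reserved G2H room)
 *
 * for a total allocation of 136K.
 */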
175
176 /**
177 * xe_guc_ct_queue_proc_time_jiffies - Return maximum time to process a full
178 * CT command queue
179 * @ct: the &xe_guc_ct. Unused at this moment but will be used in the future.
180 *
181 * Observation is that a 4KiB buffer full of commands takes a little over a
182 * second to process. Use that to calculate maximum time to process a full CT
183 * command queue.
184 *
185 * Return: Maximum time to process a full CT queue in jiffies.
186 */
long xe_guc_ct_queue_proc_time_jiffies(struct xe_guc_ct *ct)
188 {
189 BUILD_BUG_ON(!IS_ALIGNED(CTB_H2G_BUFFER_SIZE, SZ_4));
190 return (CTB_H2G_BUFFER_SIZE / SZ_4K) * HZ;
191 }
192
static size_t guc_ct_size(void)
194 {
195 return 2 * CTB_DESC_SIZE + CTB_H2G_BUFFER_SIZE +
196 CTB_G2H_BUFFER_SIZE;
197 }
198
static void guc_ct_fini(struct drm_device *drm, void *arg)
200 {
201 struct xe_guc_ct *ct = arg;
202
203 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
204 cancel_work_sync(&ct->dead.worker);
205 #endif
206 ct_exit_safe_mode(ct);
207 destroy_workqueue(ct->g2h_wq);
208 xa_destroy(&ct->fence_lookup);
209 }
210
static void primelockdep(struct xe_guc_ct *ct)
212 {
213 if (!IS_ENABLED(CONFIG_LOCKDEP))
214 return;
215
216 fs_reclaim_acquire(GFP_KERNEL);
217 might_lock(&ct->lock);
218 fs_reclaim_release(GFP_KERNEL);
219 }
220
int xe_guc_ct_init_noalloc(struct xe_guc_ct *ct)
222 {
223 struct xe_device *xe = ct_to_xe(ct);
224 struct xe_gt *gt = ct_to_gt(ct);
225 int err;
226
227 xe_gt_assert(gt, !(guc_ct_size() % PAGE_SIZE));
228
229 err = drmm_mutex_init(&xe->drm, &ct->lock);
230 if (err)
231 return err;
232
233 primelockdep(ct);
234
235 ct->g2h_wq = alloc_ordered_workqueue("xe-g2h-wq", WQ_MEM_RECLAIM);
236 if (!ct->g2h_wq)
237 return -ENOMEM;
238
239 spin_lock_init(&ct->fast_lock);
240 xa_init(&ct->fence_lookup);
241 INIT_WORK(&ct->g2h_worker, g2h_worker_func);
242 INIT_DELAYED_WORK(&ct->safe_mode_worker, safe_mode_worker_func);
243 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
244 spin_lock_init(&ct->dead.lock);
245 INIT_WORK(&ct->dead.worker, ct_dead_worker_func);
246 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_GUC)
247 stack_depot_init();
248 #endif
249 #endif
250 init_waitqueue_head(&ct->wq);
251 init_waitqueue_head(&ct->g2h_fence_wq);
252
253 err = drmm_add_action_or_reset(&xe->drm, guc_ct_fini, ct);
254 if (err)
255 return err;
256
257 xe_gt_assert(gt, ct->state == XE_GUC_CT_STATE_NOT_INITIALIZED);
258 ct->state = XE_GUC_CT_STATE_DISABLED;
259 return 0;
260 }
261 ALLOW_ERROR_INJECTION(xe_guc_ct_init_noalloc, ERRNO); /* See xe_pci_probe() */
262
static void guc_action_disable_ct(void *arg)
264 {
265 struct xe_guc_ct *ct = arg;
266
267 guc_ct_change_state(ct, XE_GUC_CT_STATE_DISABLED);
268 }
269
int xe_guc_ct_init(struct xe_guc_ct *ct)
271 {
272 struct xe_device *xe = ct_to_xe(ct);
273 struct xe_gt *gt = ct_to_gt(ct);
274 struct xe_tile *tile = gt_to_tile(gt);
275 struct xe_bo *bo;
276
277 bo = xe_managed_bo_create_pin_map(xe, tile, guc_ct_size(),
278 XE_BO_FLAG_SYSTEM |
279 XE_BO_FLAG_GGTT |
280 XE_BO_FLAG_GGTT_INVALIDATE |
281 XE_BO_FLAG_PINNED_NORESTORE);
282 if (IS_ERR(bo))
283 return PTR_ERR(bo);
284
285 ct->bo = bo;
286
287 return devm_add_action_or_reset(xe->drm.dev, guc_action_disable_ct, ct);
288 }
289 ALLOW_ERROR_INJECTION(xe_guc_ct_init, ERRNO); /* See xe_pci_probe() */
290
291 /**
292 * xe_guc_ct_init_post_hwconfig - Reinitialize the GuC CTB in VRAM
293 * @ct: the &xe_guc_ct
294 *
295 * Allocate a new BO in VRAM and free the previous BO that was allocated
296 * in system memory (SMEM). Applicable only for DGFX products.
297 *
298 * Return: 0 on success, or a negative errno on failure.
299 */
int xe_guc_ct_init_post_hwconfig(struct xe_guc_ct *ct)
301 {
302 struct xe_device *xe = ct_to_xe(ct);
303 struct xe_gt *gt = ct_to_gt(ct);
304 struct xe_tile *tile = gt_to_tile(gt);
305 int ret;
306
307 xe_assert(xe, !xe_guc_ct_enabled(ct));
308
309 if (IS_DGFX(xe)) {
310 ret = xe_managed_bo_reinit_in_vram(xe, tile, &ct->bo);
311 if (ret)
312 return ret;
313 }
314
315 devm_remove_action(xe->drm.dev, guc_action_disable_ct, ct);
316 return devm_add_action_or_reset(xe->drm.dev, guc_action_disable_ct, ct);
317 }
318
319 #define desc_read(xe_, guc_ctb__, field_) \
320 xe_map_rd_field(xe_, &guc_ctb__->desc, 0, \
321 struct guc_ct_buffer_desc, field_)
322
323 #define desc_write(xe_, guc_ctb__, field_, val_) \
324 xe_map_wr_field(xe_, &guc_ctb__->desc, 0, \
325 struct guc_ct_buffer_desc, field_, val_)
326
static void guc_ct_ctb_h2g_init(struct xe_device *xe, struct guc_ctb *h2g,
328 struct iosys_map *map)
329 {
330 h2g->info.size = CTB_H2G_BUFFER_SIZE / sizeof(u32);
331 h2g->info.resv_space = 0;
332 h2g->info.tail = 0;
333 h2g->info.head = 0;
334 h2g->info.space = CIRC_SPACE(h2g->info.tail, h2g->info.head,
335 h2g->info.size) -
336 h2g->info.resv_space;
337 h2g->info.broken = false;
338
339 h2g->desc = *map;
340 xe_map_memset(xe, &h2g->desc, 0, 0, sizeof(struct guc_ct_buffer_desc));
341
342 h2g->cmds = IOSYS_MAP_INIT_OFFSET(map, CTB_DESC_SIZE * 2);
343 }
344
static void guc_ct_ctb_g2h_init(struct xe_device *xe, struct guc_ctb *g2h,
346 struct iosys_map *map)
347 {
348 g2h->info.size = CTB_G2H_BUFFER_SIZE / sizeof(u32);
349 g2h->info.resv_space = G2H_ROOM_BUFFER_SIZE / sizeof(u32);
350 g2h->info.head = 0;
351 g2h->info.tail = 0;
352 g2h->info.space = CIRC_SPACE(g2h->info.tail, g2h->info.head,
353 g2h->info.size) -
354 g2h->info.resv_space;
355 g2h->info.broken = false;
356
357 g2h->desc = IOSYS_MAP_INIT_OFFSET(map, CTB_DESC_SIZE);
358 xe_map_memset(xe, &g2h->desc, 0, 0, sizeof(struct guc_ct_buffer_desc));
359
360 g2h->cmds = IOSYS_MAP_INIT_OFFSET(map, CTB_DESC_SIZE * 2 +
361 CTB_H2G_BUFFER_SIZE);
362 }
363
static int guc_ct_ctb_h2g_register(struct xe_guc_ct *ct)
365 {
366 struct xe_guc *guc = ct_to_guc(ct);
367 u32 desc_addr, ctb_addr, size;
368 int err;
369
370 desc_addr = xe_bo_ggtt_addr(ct->bo);
371 ctb_addr = xe_bo_ggtt_addr(ct->bo) + CTB_DESC_SIZE * 2;
372 size = ct->ctbs.h2g.info.size * sizeof(u32);
373
374 err = xe_guc_self_cfg64(guc,
375 GUC_KLV_SELF_CFG_H2G_CTB_DESCRIPTOR_ADDR_KEY,
376 desc_addr);
377 if (err)
378 return err;
379
380 err = xe_guc_self_cfg64(guc,
381 GUC_KLV_SELF_CFG_H2G_CTB_ADDR_KEY,
382 ctb_addr);
383 if (err)
384 return err;
385
386 return xe_guc_self_cfg32(guc,
387 GUC_KLV_SELF_CFG_H2G_CTB_SIZE_KEY,
388 size);
389 }
390
static int guc_ct_ctb_g2h_register(struct xe_guc_ct *ct)
392 {
393 struct xe_guc *guc = ct_to_guc(ct);
394 u32 desc_addr, ctb_addr, size;
395 int err;
396
397 desc_addr = xe_bo_ggtt_addr(ct->bo) + CTB_DESC_SIZE;
398 ctb_addr = xe_bo_ggtt_addr(ct->bo) + CTB_DESC_SIZE * 2 +
399 CTB_H2G_BUFFER_SIZE;
400 size = ct->ctbs.g2h.info.size * sizeof(u32);
401
402 err = xe_guc_self_cfg64(guc,
403 GUC_KLV_SELF_CFG_G2H_CTB_DESCRIPTOR_ADDR_KEY,
404 desc_addr);
405 if (err)
406 return err;
407
408 err = xe_guc_self_cfg64(guc,
409 GUC_KLV_SELF_CFG_G2H_CTB_ADDR_KEY,
410 ctb_addr);
411 if (err)
412 return err;
413
414 return xe_guc_self_cfg32(guc,
415 GUC_KLV_SELF_CFG_G2H_CTB_SIZE_KEY,
416 size);
417 }
418
static int guc_ct_control_toggle(struct xe_guc_ct *ct, bool enable)
420 {
421 u32 request[HOST2GUC_CONTROL_CTB_REQUEST_MSG_LEN] = {
422 FIELD_PREP(GUC_HXG_MSG_0_ORIGIN, GUC_HXG_ORIGIN_HOST) |
423 FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_REQUEST) |
424 FIELD_PREP(GUC_HXG_REQUEST_MSG_0_ACTION,
425 GUC_ACTION_HOST2GUC_CONTROL_CTB),
426 FIELD_PREP(HOST2GUC_CONTROL_CTB_REQUEST_MSG_1_CONTROL,
427 enable ? GUC_CTB_CONTROL_ENABLE :
428 GUC_CTB_CONTROL_DISABLE),
429 };
430 int ret = xe_guc_mmio_send(ct_to_guc(ct), request, ARRAY_SIZE(request));
431
432 return ret > 0 ? -EPROTO : ret;
433 }
434
static void guc_ct_change_state(struct xe_guc_ct *ct,
436 enum xe_guc_ct_state state)
437 {
438 struct xe_gt *gt = ct_to_gt(ct);
439 struct g2h_fence *g2h_fence;
440 unsigned long idx;
441
442 mutex_lock(&ct->lock); /* Serialise dequeue_one_g2h() */
443 spin_lock_irq(&ct->fast_lock); /* Serialise CT fast-path */
444
445 xe_gt_assert(ct_to_gt(ct), ct->g2h_outstanding == 0 ||
446 state == XE_GUC_CT_STATE_STOPPED);
447
448 if (ct->g2h_outstanding)
449 xe_pm_runtime_put(ct_to_xe(ct));
450 ct->g2h_outstanding = 0;
451 ct->state = state;
452
453 xe_gt_dbg(gt, "GuC CT communication channel %s\n",
454 state == XE_GUC_CT_STATE_STOPPED ? "stopped" :
455 str_enabled_disabled(state == XE_GUC_CT_STATE_ENABLED));
456
457 spin_unlock_irq(&ct->fast_lock);
458
459 /* cancel all in-flight send-recv requests */
460 xa_for_each(&ct->fence_lookup, idx, g2h_fence)
461 g2h_fence_cancel(g2h_fence);
462
463 /* make sure guc_ct_send_recv() will see g2h_fence changes */
464 smp_mb();
465 wake_up_all(&ct->g2h_fence_wq);
466
467 /*
 * Lockdep doesn't like this under the fast lock and the destroy only
 * needs to be serialized with the send path, which the ct lock provides.
470 */
471 xa_destroy(&ct->fence_lookup);
472
473 mutex_unlock(&ct->lock);
474 }
475
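/*
 * "Safe mode": without MSI the GuC-to-host interrupt is not available, so a
 * delayed worker polls receive_g2h() every HZ / 10 for as long as
 * ct_needs_safe_mode() keeps reporting true (see the helpers below).
 */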
static bool ct_needs_safe_mode(struct xe_guc_ct *ct)
477 {
478 return !pci_dev_msi_enabled(to_pci_dev(ct_to_xe(ct)->drm.dev));
479 }
480
static bool ct_restart_safe_mode_worker(struct xe_guc_ct *ct)
482 {
483 if (!ct_needs_safe_mode(ct))
484 return false;
485
486 queue_delayed_work(ct->g2h_wq, &ct->safe_mode_worker, HZ / 10);
487 return true;
488 }
489
static void safe_mode_worker_func(struct work_struct *w)
491 {
492 struct xe_guc_ct *ct = container_of(w, struct xe_guc_ct, safe_mode_worker.work);
493
494 receive_g2h(ct);
495
496 if (!ct_restart_safe_mode_worker(ct))
497 xe_gt_dbg(ct_to_gt(ct), "GuC CT safe-mode canceled\n");
498 }
499
static void ct_enter_safe_mode(struct xe_guc_ct *ct)
501 {
502 if (ct_restart_safe_mode_worker(ct))
503 xe_gt_dbg(ct_to_gt(ct), "GuC CT safe-mode enabled\n");
504 }
505
static void ct_exit_safe_mode(struct xe_guc_ct *ct)
507 {
508 if (cancel_delayed_work_sync(&ct->safe_mode_worker))
509 xe_gt_dbg(ct_to_gt(ct), "GuC CT safe-mode disabled\n");
510 }
511
int xe_guc_ct_enable(struct xe_guc_ct *ct)
513 {
514 struct xe_device *xe = ct_to_xe(ct);
515 struct xe_gt *gt = ct_to_gt(ct);
516 int err;
517
518 xe_gt_assert(gt, !xe_guc_ct_enabled(ct));
519
520 xe_map_memset(xe, &ct->bo->vmap, 0, 0, xe_bo_size(ct->bo));
521 guc_ct_ctb_h2g_init(xe, &ct->ctbs.h2g, &ct->bo->vmap);
522 guc_ct_ctb_g2h_init(xe, &ct->ctbs.g2h, &ct->bo->vmap);
523
524 err = guc_ct_ctb_h2g_register(ct);
525 if (err)
526 goto err_out;
527
528 err = guc_ct_ctb_g2h_register(ct);
529 if (err)
530 goto err_out;
531
532 err = guc_ct_control_toggle(ct, true);
533 if (err)
534 goto err_out;
535
536 guc_ct_change_state(ct, XE_GUC_CT_STATE_ENABLED);
537
538 smp_mb();
539 wake_up_all(&ct->wq);
540
541 if (ct_needs_safe_mode(ct))
542 ct_enter_safe_mode(ct);
543
544 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
545 /*
546 * The CT has now been reset so the dumper can be re-armed
547 * after any existing dead state has been dumped.
548 */
549 spin_lock_irq(&ct->dead.lock);
550 if (ct->dead.reason) {
551 ct->dead.reason |= (1 << CT_DEAD_STATE_REARM);
552 queue_work(system_unbound_wq, &ct->dead.worker);
553 }
554 spin_unlock_irq(&ct->dead.lock);
555 #endif
556
557 return 0;
558
559 err_out:
560 xe_gt_err(gt, "Failed to enable GuC CT (%pe)\n", ERR_PTR(err));
561 CT_DEAD(ct, NULL, SETUP);
562
563 return err;
564 }
565
static void stop_g2h_handler(struct xe_guc_ct *ct)
567 {
568 cancel_work_sync(&ct->g2h_worker);
569 }
570
571 /**
572 * xe_guc_ct_disable - Set GuC to disabled state
573 * @ct: the &xe_guc_ct
574 *
575 * Set GuC CT to disabled state and stop g2h handler. No outstanding g2h expected
576 * in this transition.
577 */
void xe_guc_ct_disable(struct xe_guc_ct *ct)
579 {
580 guc_ct_change_state(ct, XE_GUC_CT_STATE_DISABLED);
581 ct_exit_safe_mode(ct);
582 stop_g2h_handler(ct);
583 }
584
585 /**
586 * xe_guc_ct_stop - Set GuC to stopped state
587 * @ct: the &xe_guc_ct
588 *
589 * Set GuC CT to stopped state, stop g2h handler, and clear any outstanding g2h
590 */
void xe_guc_ct_stop(struct xe_guc_ct *ct)
592 {
593 if (!xe_guc_ct_initialized(ct))
594 return;
595
596 guc_ct_change_state(ct, XE_GUC_CT_STATE_STOPPED);
597 stop_g2h_handler(ct);
598 }
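
/*
 * Rough state flow implemented by the functions above: NOT_INITIALIZED ->
 * DISABLED in xe_guc_ct_init_noalloc(), DISABLED -> ENABLED in
 * xe_guc_ct_enable(), and from ENABLED either back to DISABLED via
 * xe_guc_ct_disable() or to STOPPED via xe_guc_ct_stop().
 */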
599
static bool h2g_has_room(struct xe_guc_ct *ct, u32 cmd_len)
601 {
602 struct guc_ctb *h2g = &ct->ctbs.h2g;
603
604 lockdep_assert_held(&ct->lock);
605
606 if (cmd_len > h2g->info.space) {
607 h2g->info.head = desc_read(ct_to_xe(ct), h2g, head);
608
609 if (h2g->info.head > h2g->info.size) {
610 struct xe_device *xe = ct_to_xe(ct);
611 u32 desc_status = desc_read(xe, h2g, status);
612
613 desc_write(xe, h2g, status, desc_status | GUC_CTB_STATUS_OVERFLOW);
614
xe_gt_err(ct_to_gt(ct), "CT: invalid head offset %u >= %u\n",
616 h2g->info.head, h2g->info.size);
617 CT_DEAD(ct, h2g, H2G_HAS_ROOM);
618 return false;
619 }
620
621 h2g->info.space = CIRC_SPACE(h2g->info.tail, h2g->info.head,
622 h2g->info.size) -
623 h2g->info.resv_space;
624 if (cmd_len > h2g->info.space)
625 return false;
626 }
627
628 return true;
629 }
630
static bool g2h_has_room(struct xe_guc_ct *ct, u32 g2h_len)
632 {
633 if (!g2h_len)
634 return true;
635
636 lockdep_assert_held(&ct->fast_lock);
637
638 return ct->ctbs.g2h.info.space > g2h_len;
639 }
640
static int has_room(struct xe_guc_ct *ct, u32 cmd_len, u32 g2h_len)
642 {
643 lockdep_assert_held(&ct->lock);
644
645 if (!g2h_has_room(ct, g2h_len) || !h2g_has_room(ct, cmd_len))
646 return -EBUSY;
647
648 return 0;
649 }
650
static void h2g_reserve_space(struct xe_guc_ct *ct, u32 cmd_len)
652 {
653 lockdep_assert_held(&ct->lock);
654 ct->ctbs.h2g.info.space -= cmd_len;
655 }
656
static void __g2h_reserve_space(struct xe_guc_ct *ct, u32 g2h_len, u32 num_g2h)
658 {
659 xe_gt_assert(ct_to_gt(ct), g2h_len <= ct->ctbs.g2h.info.space);
660 xe_gt_assert(ct_to_gt(ct), (!g2h_len && !num_g2h) ||
661 (g2h_len && num_g2h));
662
663 if (g2h_len) {
664 lockdep_assert_held(&ct->fast_lock);
665
666 if (!ct->g2h_outstanding)
667 xe_pm_runtime_get_noresume(ct_to_xe(ct));
668
669 ct->ctbs.g2h.info.space -= g2h_len;
670 ct->g2h_outstanding += num_g2h;
671 }
672 }
673
static void __g2h_release_space(struct xe_guc_ct *ct, u32 g2h_len)
675 {
676 bool bad = false;
677
678 lockdep_assert_held(&ct->fast_lock);
679
680 bad = ct->ctbs.g2h.info.space + g2h_len >
681 ct->ctbs.g2h.info.size - ct->ctbs.g2h.info.resv_space;
682 bad |= !ct->g2h_outstanding;
683
684 if (bad) {
685 xe_gt_err(ct_to_gt(ct), "Invalid G2H release: %d + %d vs %d - %d -> %d vs %d, outstanding = %d!\n",
686 ct->ctbs.g2h.info.space, g2h_len,
687 ct->ctbs.g2h.info.size, ct->ctbs.g2h.info.resv_space,
688 ct->ctbs.g2h.info.space + g2h_len,
689 ct->ctbs.g2h.info.size - ct->ctbs.g2h.info.resv_space,
690 ct->g2h_outstanding);
691 CT_DEAD(ct, &ct->ctbs.g2h, G2H_RELEASE);
692 return;
693 }
694
695 ct->ctbs.g2h.info.space += g2h_len;
696 if (!--ct->g2h_outstanding)
697 xe_pm_runtime_put(ct_to_xe(ct));
698 }
699
static void g2h_release_space(struct xe_guc_ct *ct, u32 g2h_len)
701 {
702 spin_lock_irq(&ct->fast_lock);
703 __g2h_release_space(ct, g2h_len);
704 spin_unlock_irq(&ct->fast_lock);
705 }
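
/*
 * Credit flow in brief: __guc_ct_send_locked() reserves G2H space for an
 * expected reply via __g2h_reserve_space(); the space is returned either here
 * (events handled in parse_g2h_event()) or in parse_g2h_response(), which
 * always releases GUC_CTB_HXG_MSG_MAX_LEN.
 */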
706
707 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
static void fast_req_track(struct xe_guc_ct *ct, u16 fence, u16 action)
709 {
710 unsigned int slot = fence % ARRAY_SIZE(ct->fast_req);
711 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_GUC)
712 unsigned long entries[SZ_32];
713 unsigned int n;
714
715 n = stack_trace_save(entries, ARRAY_SIZE(entries), 1);
716
717 /* May be called under spinlock, so avoid sleeping */
718 ct->fast_req[slot].stack = stack_depot_save(entries, n, GFP_NOWAIT);
719 #endif
720 ct->fast_req[slot].fence = fence;
721 ct->fast_req[slot].action = action;
722 }
723 #else
static void fast_req_track(struct xe_guc_ct *ct, u16 fence, u16 action)
725 {
726 }
727 #endif
728
729 /*
 * The CT protocol accepts a 16-bit fence. This field is fully owned by the
731 * driver, the GuC will just copy it to the reply message. Since we need to
732 * be able to distinguish between replies to REQUEST and FAST_REQUEST messages,
733 * we use one bit of the seqno as an indicator for that and a rolling counter
734 * for the remaining 15 bits.
735 */
736 #define CT_SEQNO_MASK GENMASK(14, 0)
737 #define CT_SEQNO_UNTRACKED BIT(15)
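/*
 * For example, under this encoding a blocking REQUEST whose rolling counter is
 * 0x0005 goes out with fence 0x0005 and is tracked in ct->fence_lookup, while
 * a FAST_REQUEST with the same counter value goes out with fence 0x8005
 * (CT_SEQNO_UNTRACKED set) and is only recorded via fast_req_track().
 */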
static u16 next_ct_seqno(struct xe_guc_ct *ct, bool is_g2h_fence)
739 {
740 u32 seqno = ct->fence_seqno++ & CT_SEQNO_MASK;
741
742 if (!is_g2h_fence)
743 seqno |= CT_SEQNO_UNTRACKED;
744
745 return seqno;
746 }
747
748 #define H2G_CT_HEADERS (GUC_CTB_HDR_LEN + 1) /* one DW CTB header and one DW HxG header */
749
static int h2g_write(struct xe_guc_ct *ct, const u32 *action, u32 len,
751 u32 ct_fence_value, bool want_response)
752 {
753 struct xe_device *xe = ct_to_xe(ct);
754 struct xe_gt *gt = ct_to_gt(ct);
755 struct guc_ctb *h2g = &ct->ctbs.h2g;
756 u32 cmd[H2G_CT_HEADERS];
757 u32 tail = h2g->info.tail;
758 u32 full_len;
759 struct iosys_map map = IOSYS_MAP_INIT_OFFSET(&h2g->cmds,
760 tail * sizeof(u32));
761 u32 desc_status;
762
763 full_len = len + GUC_CTB_HDR_LEN;
764
765 lockdep_assert_held(&ct->lock);
766 xe_gt_assert(gt, full_len <= GUC_CTB_MSG_MAX_LEN);
767
768 desc_status = desc_read(xe, h2g, status);
769 if (desc_status) {
770 xe_gt_err(gt, "CT write: non-zero status: %u\n", desc_status);
771 goto corrupted;
772 }
773
774 if (IS_ENABLED(CONFIG_DRM_XE_DEBUG)) {
775 u32 desc_tail = desc_read(xe, h2g, tail);
776 u32 desc_head = desc_read(xe, h2g, head);
777
778 if (tail != desc_tail) {
779 desc_write(xe, h2g, status, desc_status | GUC_CTB_STATUS_MISMATCH);
780 xe_gt_err(gt, "CT write: tail was modified %u != %u\n", desc_tail, tail);
781 goto corrupted;
782 }
783
784 if (tail > h2g->info.size) {
785 desc_write(xe, h2g, status, desc_status | GUC_CTB_STATUS_OVERFLOW);
786 xe_gt_err(gt, "CT write: tail out of range: %u vs %u\n",
787 tail, h2g->info.size);
788 goto corrupted;
789 }
790
791 if (desc_head >= h2g->info.size) {
792 desc_write(xe, h2g, status, desc_status | GUC_CTB_STATUS_OVERFLOW);
xe_gt_err(gt, "CT write: invalid head offset %u >= %u\n",
794 desc_head, h2g->info.size);
795 goto corrupted;
796 }
797 }
798
799 /* Command will wrap, zero fill (NOPs), return and check credits again */
800 if (tail + full_len > h2g->info.size) {
801 xe_map_memset(xe, &map, 0, 0,
802 (h2g->info.size - tail) * sizeof(u32));
803 h2g_reserve_space(ct, (h2g->info.size - tail));
804 h2g->info.tail = 0;
805 desc_write(xe, h2g, tail, h2g->info.tail);
806
807 return -EAGAIN;
808 }
809
810 /*
811 * dw0: CT header (including fence)
812 * dw1: HXG header (including action code)
813 * dw2+: action data
814 */
815 cmd[0] = FIELD_PREP(GUC_CTB_MSG_0_FORMAT, GUC_CTB_FORMAT_HXG) |
816 FIELD_PREP(GUC_CTB_MSG_0_NUM_DWORDS, len) |
817 FIELD_PREP(GUC_CTB_MSG_0_FENCE, ct_fence_value);
818 if (want_response) {
819 cmd[1] =
820 FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_REQUEST) |
821 FIELD_PREP(GUC_HXG_EVENT_MSG_0_ACTION |
822 GUC_HXG_EVENT_MSG_0_DATA0, action[0]);
823 } else {
824 fast_req_track(ct, ct_fence_value,
825 FIELD_GET(GUC_HXG_EVENT_MSG_0_ACTION, action[0]));
826
827 cmd[1] =
828 FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_FAST_REQUEST) |
829 FIELD_PREP(GUC_HXG_EVENT_MSG_0_ACTION |
830 GUC_HXG_EVENT_MSG_0_DATA0, action[0]);
831 }
832
833 /* H2G header in cmd[1] replaces action[0] so: */
834 --len;
835 ++action;
836
837 /* Write H2G ensuring visible before descriptor update */
838 xe_map_memcpy_to(xe, &map, 0, cmd, H2G_CT_HEADERS * sizeof(u32));
839 xe_map_memcpy_to(xe, &map, H2G_CT_HEADERS * sizeof(u32), action, len * sizeof(u32));
840 xe_device_wmb(xe);
841
842 /* Update local copies */
843 h2g->info.tail = (tail + full_len) % h2g->info.size;
844 h2g_reserve_space(ct, full_len);
845
846 /* Update descriptor */
847 desc_write(xe, h2g, tail, h2g->info.tail);
848
849 trace_xe_guc_ctb_h2g(xe, gt->info.id, *(action - 1), full_len,
850 desc_read(xe, h2g, head), h2g->info.tail);
851
852 return 0;
853
854 corrupted:
855 CT_DEAD(ct, &ct->ctbs.h2g, H2G_WRITE);
856 return -EPIPE;
857 }
858
static int __guc_ct_send_locked(struct xe_guc_ct *ct, const u32 *action,
860 u32 len, u32 g2h_len, u32 num_g2h,
861 struct g2h_fence *g2h_fence)
862 {
863 struct xe_gt *gt __maybe_unused = ct_to_gt(ct);
864 u16 seqno;
865 int ret;
866
867 xe_gt_assert(gt, xe_guc_ct_initialized(ct));
868 xe_gt_assert(gt, !g2h_len || !g2h_fence);
869 xe_gt_assert(gt, !num_g2h || !g2h_fence);
870 xe_gt_assert(gt, !g2h_len || num_g2h);
871 xe_gt_assert(gt, g2h_len || !num_g2h);
872 lockdep_assert_held(&ct->lock);
873
874 if (unlikely(ct->ctbs.h2g.info.broken)) {
875 ret = -EPIPE;
876 goto out;
877 }
878
879 if (ct->state == XE_GUC_CT_STATE_DISABLED) {
880 ret = -ENODEV;
881 goto out;
882 }
883
884 if (ct->state == XE_GUC_CT_STATE_STOPPED) {
885 ret = -ECANCELED;
886 goto out;
887 }
888
889 xe_gt_assert(gt, xe_guc_ct_enabled(ct));
890
891 if (g2h_fence) {
892 g2h_len = GUC_CTB_HXG_MSG_MAX_LEN;
893 num_g2h = 1;
894
895 if (g2h_fence_needs_alloc(g2h_fence)) {
896 g2h_fence->seqno = next_ct_seqno(ct, true);
897 ret = xa_err(xa_store(&ct->fence_lookup,
898 g2h_fence->seqno, g2h_fence,
899 GFP_ATOMIC));
900 if (ret)
901 goto out;
902 }
903
904 seqno = g2h_fence->seqno;
905 } else {
906 seqno = next_ct_seqno(ct, false);
907 }
908
909 if (g2h_len)
910 spin_lock_irq(&ct->fast_lock);
911 retry:
912 ret = has_room(ct, len + GUC_CTB_HDR_LEN, g2h_len);
913 if (unlikely(ret))
914 goto out_unlock;
915
916 ret = h2g_write(ct, action, len, seqno, !!g2h_fence);
917 if (unlikely(ret)) {
918 if (ret == -EAGAIN)
919 goto retry;
920 goto out_unlock;
921 }
922
923 __g2h_reserve_space(ct, g2h_len, num_g2h);
924 xe_guc_notify(ct_to_guc(ct));
925 out_unlock:
926 if (g2h_len)
927 spin_unlock_irq(&ct->fast_lock);
928 out:
929 return ret;
930 }
931
static void kick_reset(struct xe_guc_ct *ct)
933 {
934 xe_gt_reset_async(ct_to_gt(ct));
935 }
936
937 static int dequeue_one_g2h(struct xe_guc_ct *ct);
938
static int guc_ct_send_locked(struct xe_guc_ct *ct, const u32 *action, u32 len,
940 u32 g2h_len, u32 num_g2h,
941 struct g2h_fence *g2h_fence)
942 {
943 struct xe_device *xe = ct_to_xe(ct);
944 struct xe_gt *gt = ct_to_gt(ct);
945 unsigned int sleep_period_ms = 1;
946 int ret;
947
948 xe_gt_assert(gt, !g2h_len || !g2h_fence);
949 lockdep_assert_held(&ct->lock);
950 xe_device_assert_mem_access(ct_to_xe(ct));
951
952 try_again:
953 ret = __guc_ct_send_locked(ct, action, len, g2h_len, num_g2h,
954 g2h_fence);
955
956 /*
957 * We wait to try to restore credits for about 1 second before bailing.
958 * In the case of H2G credits we have no choice but just to wait for the
959 * GuC to consume H2Gs in the channel so we use a wait / sleep loop. In
960 * the case of G2H we process any G2H in the channel, hopefully freeing
961 * credits as we consume the G2H messages.
962 */
963 if (unlikely(ret == -EBUSY &&
964 !h2g_has_room(ct, len + GUC_CTB_HDR_LEN))) {
965 struct guc_ctb *h2g = &ct->ctbs.h2g;
966
967 if (sleep_period_ms == 1024)
968 goto broken;
969
970 trace_xe_guc_ct_h2g_flow_control(xe, h2g->info.head, h2g->info.tail,
971 h2g->info.size,
972 h2g->info.space,
973 len + GUC_CTB_HDR_LEN);
974 msleep(sleep_period_ms);
975 sleep_period_ms <<= 1;
976
977 goto try_again;
978 } else if (unlikely(ret == -EBUSY)) {
979 struct xe_device *xe = ct_to_xe(ct);
980 struct guc_ctb *g2h = &ct->ctbs.g2h;
981
982 trace_xe_guc_ct_g2h_flow_control(xe, g2h->info.head,
983 desc_read(xe, g2h, tail),
984 g2h->info.size,
985 g2h->info.space,
986 g2h_fence ?
987 GUC_CTB_HXG_MSG_MAX_LEN :
988 g2h_len);
989
990 #define g2h_avail(ct) \
991 (desc_read(ct_to_xe(ct), (&ct->ctbs.g2h), tail) != ct->ctbs.g2h.info.head)
992 if (!wait_event_timeout(ct->wq, !ct->g2h_outstanding ||
993 g2h_avail(ct), HZ))
994 goto broken;
995 #undef g2h_avail
996
997 ret = dequeue_one_g2h(ct);
998 if (ret < 0) {
999 if (ret != -ECANCELED)
1000 xe_gt_err(ct_to_gt(ct), "CTB receive failed (%pe)",
1001 ERR_PTR(ret));
1002 goto broken;
1003 }
1004
1005 goto try_again;
1006 }
1007
1008 return ret;
1009
1010 broken:
xe_gt_err(gt, "No forward progress on H2G, reset required\n");
1012 CT_DEAD(ct, &ct->ctbs.h2g, DEADLOCK);
1013
1014 return -EDEADLK;
1015 }
1016
static int guc_ct_send(struct xe_guc_ct *ct, const u32 *action, u32 len,
1018 u32 g2h_len, u32 num_g2h, struct g2h_fence *g2h_fence)
1019 {
1020 int ret;
1021
1022 xe_gt_assert(ct_to_gt(ct), !g2h_len || !g2h_fence);
1023
1024 mutex_lock(&ct->lock);
1025 ret = guc_ct_send_locked(ct, action, len, g2h_len, num_g2h, g2h_fence);
1026 mutex_unlock(&ct->lock);
1027
1028 return ret;
1029 }
1030
int xe_guc_ct_send(struct xe_guc_ct *ct, const u32 *action, u32 len,
1032 u32 g2h_len, u32 num_g2h)
1033 {
1034 int ret;
1035
1036 ret = guc_ct_send(ct, action, len, g2h_len, num_g2h, NULL);
1037 if (ret == -EDEADLK)
1038 kick_reset(ct);
1039
1040 return ret;
1041 }
1042
int xe_guc_ct_send_locked(struct xe_guc_ct *ct, const u32 *action, u32 len,
1044 u32 g2h_len, u32 num_g2h)
1045 {
1046 int ret;
1047
1048 ret = guc_ct_send_locked(ct, action, len, g2h_len, num_g2h, NULL);
1049 if (ret == -EDEADLK)
1050 kick_reset(ct);
1051
1052 return ret;
1053 }
1054
int xe_guc_ct_send_g2h_handler(struct xe_guc_ct *ct, const u32 *action, u32 len)
1056 {
1057 int ret;
1058
1059 lockdep_assert_held(&ct->lock);
1060
1061 ret = guc_ct_send_locked(ct, action, len, 0, 0, NULL);
1062 if (ret == -EDEADLK)
1063 kick_reset(ct);
1064
1065 return ret;
1066 }
1067
1068 /*
 * Check if a GT reset is in progress or will occur and if the GT reset brought
 * the CT back up. Randomly picking 5 seconds for an upper limit to do a GT reset.
1071 */
static bool retry_failure(struct xe_guc_ct *ct, int ret)
1073 {
1074 if (!(ret == -EDEADLK || ret == -EPIPE || ret == -ENODEV))
1075 return false;
1076
1077 #define ct_alive(ct) \
1078 (xe_guc_ct_enabled(ct) && !ct->ctbs.h2g.info.broken && \
1079 !ct->ctbs.g2h.info.broken)
1080 if (!wait_event_interruptible_timeout(ct->wq, ct_alive(ct), HZ * 5))
1081 return false;
1082 #undef ct_alive
1083
1084 return true;
1085 }
1086
1087 #define GUC_SEND_RETRY_LIMIT 50
1088 #define GUC_SEND_RETRY_MSLEEP 5
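
/*
 * With the values above, a request that the GuC keeps answering with RETRY
 * backs off linearly (5 ms, 10 ms, ... 250 ms) and is abandoned with -ELOOP
 * after GUC_SEND_RETRY_LIMIT attempts, i.e. roughly 6.4 seconds of
 * accumulated sleep.
 */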
1089
static int guc_ct_send_recv(struct xe_guc_ct *ct, const u32 *action, u32 len,
1091 u32 *response_buffer, bool no_fail)
1092 {
1093 struct xe_gt *gt = ct_to_gt(ct);
1094 struct g2h_fence g2h_fence;
1095 unsigned int retries = 0;
1096 int ret = 0;
1097
1098 /*
1099 * We use a fence to implement blocking sends / receiving response data.
1100 * The seqno of the fence is sent in the H2G, returned in the G2H, and
 * an xarray is used as the storage medium with the seqno being the key.
1102 * Fields in the fence hold success, failure, retry status and the
1103 * response data. Safe to allocate on the stack as the xarray is the
1104 * only reference and it cannot be present after this function exits.
1105 */
1106 retry:
1107 g2h_fence_init(&g2h_fence, response_buffer);
1108 retry_same_fence:
1109 ret = guc_ct_send(ct, action, len, 0, 0, &g2h_fence);
1110 if (unlikely(ret == -ENOMEM)) {
/* Retry allocation with GFP_KERNEL */
1112 ret = xa_err(xa_store(&ct->fence_lookup, g2h_fence.seqno,
1113 &g2h_fence, GFP_KERNEL));
1114 if (ret)
1115 return ret;
1116
1117 goto retry_same_fence;
1118 } else if (unlikely(ret)) {
1119 if (ret == -EDEADLK)
1120 kick_reset(ct);
1121
1122 if (no_fail && retry_failure(ct, ret))
1123 goto retry_same_fence;
1124
1125 if (!g2h_fence_needs_alloc(&g2h_fence))
1126 xa_erase(&ct->fence_lookup, g2h_fence.seqno);
1127
1128 return ret;
1129 }
1130
1131 ret = wait_event_timeout(ct->g2h_fence_wq, g2h_fence.done, HZ);
1132 if (!ret) {
1133 LNL_FLUSH_WORK(&ct->g2h_worker);
1134 if (g2h_fence.done) {
1135 xe_gt_warn(gt, "G2H fence %u, action %04x, done\n",
1136 g2h_fence.seqno, action[0]);
1137 ret = 1;
1138 }
1139 }
1140
1141 /*
 * Ensure we serialize with the completion side to prevent a UAF with the fence going out of
 * scope on the stack, since we have no clue if it will fire after the timeout but before we
 * can erase it from the xa. Also we have some dependent loads and stores below for which we
 * need the correct ordering, and we lack the needed barriers.
1146 */
1147 mutex_lock(&ct->lock);
1148 if (!ret) {
1149 xe_gt_err(gt, "Timed out wait for G2H, fence %u, action %04x, done %s",
1150 g2h_fence.seqno, action[0], str_yes_no(g2h_fence.done));
1151 xa_erase(&ct->fence_lookup, g2h_fence.seqno);
1152 mutex_unlock(&ct->lock);
1153 return -ETIME;
1154 }
1155
1156 if (g2h_fence.retry) {
1157 xe_gt_dbg(gt, "H2G action %#x retrying: reason %#x\n",
1158 action[0], g2h_fence.reason);
1159 mutex_unlock(&ct->lock);
1160 if (++retries > GUC_SEND_RETRY_LIMIT) {
1161 xe_gt_err(gt, "H2G action %#x reached retry limit=%u, aborting\n",
1162 action[0], GUC_SEND_RETRY_LIMIT);
1163 return -ELOOP;
1164 }
1165 msleep(GUC_SEND_RETRY_MSLEEP * retries);
1166 goto retry;
1167 }
1168 if (g2h_fence.fail) {
1169 if (g2h_fence.cancel) {
1170 xe_gt_dbg(gt, "H2G request %#x canceled!\n", action[0]);
1171 ret = -ECANCELED;
1172 goto unlock;
1173 }
1174 xe_gt_err(gt, "H2G request %#x failed: error %#x hint %#x\n",
1175 action[0], g2h_fence.error, g2h_fence.hint);
1176 ret = -EIO;
1177 }
1178
1179 if (ret > 0)
1180 ret = response_buffer ? g2h_fence.response_len : g2h_fence.response_data;
1181
1182 unlock:
1183 mutex_unlock(&ct->lock);
1184
1185 return ret;
1186 }
1187
1188 /**
1189 * xe_guc_ct_send_recv - Send and receive HXG to the GuC
1190 * @ct: the &xe_guc_ct
1191 * @action: the dword array with `HXG Request`_ message (can't be NULL)
1192 * @len: length of the `HXG Request`_ message (in dwords, can't be 0)
1193 * @response_buffer: placeholder for the `HXG Response`_ message (can be NULL)
1194 *
1195 * Send a `HXG Request`_ message to the GuC over CT communication channel and
1196 * blocks until GuC replies with a `HXG Response`_ message.
1197 *
1198 * For non-blocking communication with GuC use xe_guc_ct_send().
1199 *
1200 * Note: The size of &response_buffer must be at least GUC_CTB_MAX_DWORDS_.
1201 *
1202 * Return: response length (in dwords) if &response_buffer was not NULL, or
1203 * DATA0 from `HXG Response`_ if &response_buffer was NULL, or
1204 * a negative error code on failure.
1205 */
int xe_guc_ct_send_recv(struct xe_guc_ct *ct, const u32 *action, u32 len,
1207 u32 *response_buffer)
1208 {
1209 KUNIT_STATIC_STUB_REDIRECT(xe_guc_ct_send_recv, ct, action, len, response_buffer);
1210 return guc_ct_send_recv(ct, action, len, response_buffer, false);
1211 }
1212 ALLOW_ERROR_INJECTION(xe_guc_ct_send_recv, ERRNO);
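
/*
 * Typical usage (sketch only; XE_GUC_ACTION_FOO is a placeholder, not a real
 * action code):
 *
 *	u32 action[] = { XE_GUC_ACTION_FOO, arg0, arg1 };
 *	int ret = xe_guc_ct_send_recv(ct, action, ARRAY_SIZE(action), NULL);
 *
 * A negative return is an error; otherwise ret carries DATA0 of the response
 * (or the response length in dwords if a response buffer was supplied).
 */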
1213
int xe_guc_ct_send_recv_no_fail(struct xe_guc_ct *ct, const u32 *action,
1215 u32 len, u32 *response_buffer)
1216 {
1217 return guc_ct_send_recv(ct, action, len, response_buffer, true);
1218 }
1219
static u32 *msg_to_hxg(u32 *msg)
1221 {
1222 return msg + GUC_CTB_MSG_MIN_LEN;
1223 }
1224
static u32 msg_len_to_hxg_len(u32 len)
1226 {
1227 return len - GUC_CTB_MSG_MIN_LEN;
1228 }
1229
static int parse_g2h_event(struct xe_guc_ct *ct, u32 *msg, u32 len)
1231 {
1232 u32 *hxg = msg_to_hxg(msg);
1233 u32 action = FIELD_GET(GUC_HXG_EVENT_MSG_0_ACTION, hxg[0]);
1234
1235 lockdep_assert_held(&ct->lock);
1236
1237 switch (action) {
1238 case XE_GUC_ACTION_SCHED_CONTEXT_MODE_DONE:
1239 case XE_GUC_ACTION_DEREGISTER_CONTEXT_DONE:
1240 case XE_GUC_ACTION_SCHED_ENGINE_MODE_DONE:
1241 case XE_GUC_ACTION_TLB_INVALIDATION_DONE:
1242 g2h_release_space(ct, len);
1243 }
1244
1245 return 0;
1246 }
1247
static int guc_crash_process_msg(struct xe_guc_ct *ct, u32 action)
1249 {
1250 struct xe_gt *gt = ct_to_gt(ct);
1251
1252 if (action == XE_GUC_ACTION_NOTIFY_CRASH_DUMP_POSTED)
1253 xe_gt_err(gt, "GuC Crash dump notification\n");
1254 else if (action == XE_GUC_ACTION_NOTIFY_EXCEPTION)
1255 xe_gt_err(gt, "GuC Exception notification\n");
1256 else
1257 xe_gt_err(gt, "Unknown GuC crash notification: 0x%04X\n", action);
1258
1259 CT_DEAD(ct, NULL, CRASH);
1260
1261 kick_reset(ct);
1262
1263 return 0;
1264 }
1265
1266 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
static void fast_req_report(struct xe_guc_ct *ct, u16 fence)
1268 {
1269 u16 fence_min = U16_MAX, fence_max = 0;
1270 struct xe_gt *gt = ct_to_gt(ct);
1271 bool found = false;
1272 unsigned int n;
1273 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_GUC)
1274 char *buf;
1275 #endif
1276
1277 lockdep_assert_held(&ct->lock);
1278
1279 for (n = 0; n < ARRAY_SIZE(ct->fast_req); n++) {
1280 if (ct->fast_req[n].fence < fence_min)
1281 fence_min = ct->fast_req[n].fence;
1282 if (ct->fast_req[n].fence > fence_max)
1283 fence_max = ct->fast_req[n].fence;
1284
1285 if (ct->fast_req[n].fence != fence)
1286 continue;
1287 found = true;
1288
1289 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG_GUC)
1290 buf = kmalloc(SZ_4K, GFP_NOWAIT);
1291 if (buf && stack_depot_snprint(ct->fast_req[n].stack, buf, SZ_4K, 0))
1292 xe_gt_err(gt, "Fence 0x%x was used by action %#04x sent at:\n%s",
1293 fence, ct->fast_req[n].action, buf);
1294 else
1295 xe_gt_err(gt, "Fence 0x%x was used by action %#04x [failed to retrieve stack]\n",
1296 fence, ct->fast_req[n].action);
1297 kfree(buf);
1298 #else
1299 xe_gt_err(gt, "Fence 0x%x was used by action %#04x\n",
1300 fence, ct->fast_req[n].action);
1301 #endif
1302 break;
1303 }
1304
1305 if (!found)
1306 xe_gt_warn(gt, "Fence 0x%x not found - tracking buffer wrapped? [range = 0x%x -> 0x%x, next = 0x%X]\n",
1307 fence, fence_min, fence_max, ct->fence_seqno);
1308 }
1309 #else
static void fast_req_report(struct xe_guc_ct *ct, u16 fence)
1311 {
1312 }
1313 #endif
1314
static int parse_g2h_response(struct xe_guc_ct *ct, u32 *msg, u32 len)
1316 {
1317 struct xe_gt *gt = ct_to_gt(ct);
1318 u32 *hxg = msg_to_hxg(msg);
1319 u32 hxg_len = msg_len_to_hxg_len(len);
1320 u32 fence = FIELD_GET(GUC_CTB_MSG_0_FENCE, msg[0]);
1321 u32 type = FIELD_GET(GUC_HXG_MSG_0_TYPE, hxg[0]);
1322 struct g2h_fence *g2h_fence;
1323
1324 lockdep_assert_held(&ct->lock);
1325
1326 /*
1327 * Fences for FAST_REQUEST messages are not tracked in ct->fence_lookup.
1328 * Those messages should never fail, so if we do get an error back it
1329 * means we're likely doing an illegal operation and the GuC is
1330 * rejecting it. We have no way to inform the code that submitted the
1331 * H2G that the message was rejected, so we need to escalate the
1332 * failure to trigger a reset.
1333 */
1334 if (fence & CT_SEQNO_UNTRACKED) {
1335 if (type == GUC_HXG_TYPE_RESPONSE_FAILURE)
1336 xe_gt_err(gt, "FAST_REQ H2G fence 0x%x failed! e=0x%x, h=%u\n",
1337 fence,
1338 FIELD_GET(GUC_HXG_FAILURE_MSG_0_ERROR, hxg[0]),
1339 FIELD_GET(GUC_HXG_FAILURE_MSG_0_HINT, hxg[0]));
1340 else
1341 xe_gt_err(gt, "unexpected response %u for FAST_REQ H2G fence 0x%x!\n",
1342 type, fence);
1343
1344 fast_req_report(ct, fence);
1345
1346 CT_DEAD(ct, NULL, PARSE_G2H_RESPONSE);
1347
1348 return -EPROTO;
1349 }
1350
1351 g2h_fence = xa_erase(&ct->fence_lookup, fence);
1352 if (unlikely(!g2h_fence)) {
1353 /* Don't tear down channel, as send could've timed out */
1354 /* CT_DEAD(ct, NULL, PARSE_G2H_UNKNOWN); */
1355 xe_gt_warn(gt, "G2H fence (%u) not found!\n", fence);
1356 g2h_release_space(ct, GUC_CTB_HXG_MSG_MAX_LEN);
1357 return 0;
1358 }
1359
1360 xe_gt_assert(gt, fence == g2h_fence->seqno);
1361
1362 if (type == GUC_HXG_TYPE_RESPONSE_FAILURE) {
1363 g2h_fence->fail = true;
1364 g2h_fence->error = FIELD_GET(GUC_HXG_FAILURE_MSG_0_ERROR, hxg[0]);
1365 g2h_fence->hint = FIELD_GET(GUC_HXG_FAILURE_MSG_0_HINT, hxg[0]);
1366 } else if (type == GUC_HXG_TYPE_NO_RESPONSE_RETRY) {
1367 g2h_fence->retry = true;
1368 g2h_fence->reason = FIELD_GET(GUC_HXG_RETRY_MSG_0_REASON, hxg[0]);
1369 } else if (g2h_fence->response_buffer) {
1370 g2h_fence->response_len = hxg_len;
1371 memcpy(g2h_fence->response_buffer, hxg, hxg_len * sizeof(u32));
1372 } else {
1373 g2h_fence->response_data = FIELD_GET(GUC_HXG_RESPONSE_MSG_0_DATA0, hxg[0]);
1374 }
1375
1376 g2h_release_space(ct, GUC_CTB_HXG_MSG_MAX_LEN);
1377
1378 g2h_fence->done = true;
1379 smp_mb();
1380
1381 wake_up_all(&ct->g2h_fence_wq);
1382
1383 return 0;
1384 }
1385
static int parse_g2h_msg(struct xe_guc_ct *ct, u32 *msg, u32 len)
1387 {
1388 struct xe_gt *gt = ct_to_gt(ct);
1389 u32 *hxg = msg_to_hxg(msg);
1390 u32 origin, type;
1391 int ret;
1392
1393 lockdep_assert_held(&ct->lock);
1394
1395 origin = FIELD_GET(GUC_HXG_MSG_0_ORIGIN, hxg[0]);
1396 if (unlikely(origin != GUC_HXG_ORIGIN_GUC)) {
1397 xe_gt_err(gt, "G2H channel broken on read, origin=%u, reset required\n",
1398 origin);
1399 CT_DEAD(ct, &ct->ctbs.g2h, PARSE_G2H_ORIGIN);
1400
1401 return -EPROTO;
1402 }
1403
1404 type = FIELD_GET(GUC_HXG_MSG_0_TYPE, hxg[0]);
1405 switch (type) {
1406 case GUC_HXG_TYPE_EVENT:
1407 ret = parse_g2h_event(ct, msg, len);
1408 break;
1409 case GUC_HXG_TYPE_RESPONSE_SUCCESS:
1410 case GUC_HXG_TYPE_RESPONSE_FAILURE:
1411 case GUC_HXG_TYPE_NO_RESPONSE_RETRY:
1412 ret = parse_g2h_response(ct, msg, len);
1413 break;
1414 default:
1415 xe_gt_err(gt, "G2H channel broken on read, type=%u, reset required\n",
1416 type);
1417 CT_DEAD(ct, &ct->ctbs.g2h, PARSE_G2H_TYPE);
1418
1419 ret = -EOPNOTSUPP;
1420 }
1421
1422 return ret;
1423 }
1424
static int process_g2h_msg(struct xe_guc_ct *ct, u32 *msg, u32 len)
1426 {
1427 struct xe_guc *guc = ct_to_guc(ct);
1428 struct xe_gt *gt = ct_to_gt(ct);
1429 u32 hxg_len = msg_len_to_hxg_len(len);
1430 u32 *hxg = msg_to_hxg(msg);
1431 u32 action, adj_len;
1432 u32 *payload;
1433 int ret = 0;
1434
1435 if (FIELD_GET(GUC_HXG_MSG_0_TYPE, hxg[0]) != GUC_HXG_TYPE_EVENT)
1436 return 0;
1437
1438 action = FIELD_GET(GUC_HXG_EVENT_MSG_0_ACTION, hxg[0]);
1439 payload = hxg + GUC_HXG_EVENT_MSG_MIN_LEN;
1440 adj_len = hxg_len - GUC_HXG_EVENT_MSG_MIN_LEN;
1441
1442 switch (action) {
1443 case XE_GUC_ACTION_SCHED_CONTEXT_MODE_DONE:
1444 ret = xe_guc_sched_done_handler(guc, payload, adj_len);
1445 break;
1446 case XE_GUC_ACTION_DEREGISTER_CONTEXT_DONE:
1447 ret = xe_guc_deregister_done_handler(guc, payload, adj_len);
1448 break;
1449 case XE_GUC_ACTION_CONTEXT_RESET_NOTIFICATION:
1450 ret = xe_guc_exec_queue_reset_handler(guc, payload, adj_len);
1451 break;
1452 case XE_GUC_ACTION_ENGINE_FAILURE_NOTIFICATION:
1453 ret = xe_guc_exec_queue_reset_failure_handler(guc, payload,
1454 adj_len);
1455 break;
1456 case XE_GUC_ACTION_SCHED_ENGINE_MODE_DONE:
1457 /* Selftest only at the moment */
1458 break;
1459 case XE_GUC_ACTION_STATE_CAPTURE_NOTIFICATION:
1460 ret = xe_guc_error_capture_handler(guc, payload, adj_len);
1461 break;
1462 case XE_GUC_ACTION_NOTIFY_FLUSH_LOG_BUFFER_TO_FILE:
1463 /* FIXME: Handle this */
1464 break;
1465 case XE_GUC_ACTION_NOTIFY_MEMORY_CAT_ERROR:
1466 ret = xe_guc_exec_queue_memory_cat_error_handler(guc, payload,
1467 adj_len);
1468 break;
1469 case XE_GUC_ACTION_REPORT_PAGE_FAULT_REQ_DESC:
1470 ret = xe_guc_pagefault_handler(guc, payload, adj_len);
1471 break;
1472 case XE_GUC_ACTION_TLB_INVALIDATION_DONE:
1473 ret = xe_guc_tlb_inval_done_handler(guc, payload, adj_len);
1474 break;
1475 case XE_GUC_ACTION_ACCESS_COUNTER_NOTIFY:
1476 ret = xe_guc_access_counter_notify_handler(guc, payload,
1477 adj_len);
1478 break;
1479 case XE_GUC_ACTION_GUC2PF_RELAY_FROM_VF:
1480 ret = xe_guc_relay_process_guc2pf(&guc->relay, hxg, hxg_len);
1481 break;
1482 case XE_GUC_ACTION_GUC2VF_RELAY_FROM_PF:
1483 ret = xe_guc_relay_process_guc2vf(&guc->relay, hxg, hxg_len);
1484 break;
1485 case GUC_ACTION_GUC2PF_VF_STATE_NOTIFY:
1486 ret = xe_gt_sriov_pf_control_process_guc2pf(gt, hxg, hxg_len);
1487 break;
1488 case GUC_ACTION_GUC2PF_ADVERSE_EVENT:
1489 ret = xe_gt_sriov_pf_monitor_process_guc2pf(gt, hxg, hxg_len);
1490 break;
1491 case XE_GUC_ACTION_NOTIFY_CRASH_DUMP_POSTED:
1492 case XE_GUC_ACTION_NOTIFY_EXCEPTION:
1493 ret = guc_crash_process_msg(ct, action);
1494 break;
1495 #if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST)
1496 case XE_GUC_ACTION_TEST_G2G_RECV:
1497 ret = xe_guc_g2g_test_notification(guc, payload, adj_len);
1498 break;
1499 #endif
1500 default:
1501 xe_gt_err(gt, "unexpected G2H action 0x%04x\n", action);
1502 }
1503
1504 if (ret) {
1505 xe_gt_err(gt, "G2H action %#04x failed (%pe) len %u msg %*ph\n",
1506 action, ERR_PTR(ret), hxg_len, (int)sizeof(u32) * hxg_len, hxg);
1507 CT_DEAD(ct, NULL, PROCESS_FAILED);
1508 }
1509
1510 return 0;
1511 }
1512
static int g2h_read(struct xe_guc_ct *ct, u32 *msg, bool fast_path)
1514 {
1515 struct xe_device *xe = ct_to_xe(ct);
1516 struct xe_gt *gt = ct_to_gt(ct);
1517 struct guc_ctb *g2h = &ct->ctbs.g2h;
1518 u32 tail, head, len, desc_status;
1519 s32 avail;
1520 u32 action;
1521 u32 *hxg;
1522
1523 xe_gt_assert(gt, xe_guc_ct_initialized(ct));
1524 lockdep_assert_held(&ct->fast_lock);
1525
1526 if (ct->state == XE_GUC_CT_STATE_DISABLED)
1527 return -ENODEV;
1528
1529 if (ct->state == XE_GUC_CT_STATE_STOPPED)
1530 return -ECANCELED;
1531
1532 if (g2h->info.broken)
1533 return -EPIPE;
1534
1535 xe_gt_assert(gt, xe_guc_ct_enabled(ct));
1536
1537 desc_status = desc_read(xe, g2h, status);
1538 if (desc_status) {
1539 if (desc_status & GUC_CTB_STATUS_DISABLED) {
1540 /*
1541 * Potentially valid if a CLIENT_RESET request resulted in
1542 * contexts/engines being reset. But should never happen as
1543 * no contexts should be active when CLIENT_RESET is sent.
1544 */
1545 xe_gt_err(gt, "CT read: unexpected G2H after GuC has stopped!\n");
1546 desc_status &= ~GUC_CTB_STATUS_DISABLED;
1547 }
1548
1549 if (desc_status) {
1550 xe_gt_err(gt, "CT read: non-zero status: %u\n", desc_status);
1551 goto corrupted;
1552 }
1553 }
1554
1555 if (IS_ENABLED(CONFIG_DRM_XE_DEBUG)) {
1556 u32 desc_tail = desc_read(xe, g2h, tail);
1557 /*
1558 u32 desc_head = desc_read(xe, g2h, head);
1559
1560 * info.head and desc_head are updated back-to-back at the end of
1561 * this function and nowhere else. Hence, they cannot be different
1562 * unless two g2h_read calls are running concurrently. Which is not
1563 * possible because it is guarded by ct->fast_lock. And yet, some
1564 * discrete platforms are regularly hitting this error :(.
1565 *
1566 * desc_head rolling backwards shouldn't cause any noticeable
1567 * problems - just a delay in GuC being allowed to proceed past that
1568 * point in the queue. So for now, just disable the error until it
1569 * can be root caused.
1570 *
1571 if (g2h->info.head != desc_head) {
1572 desc_write(xe, g2h, status, desc_status | GUC_CTB_STATUS_MISMATCH);
1573 xe_gt_err(gt, "CT read: head was modified %u != %u\n",
1574 desc_head, g2h->info.head);
1575 goto corrupted;
1576 }
1577 */
1578
1579 if (g2h->info.head > g2h->info.size) {
1580 desc_write(xe, g2h, status, desc_status | GUC_CTB_STATUS_OVERFLOW);
1581 xe_gt_err(gt, "CT read: head out of range: %u vs %u\n",
1582 g2h->info.head, g2h->info.size);
1583 goto corrupted;
1584 }
1585
1586 if (desc_tail >= g2h->info.size) {
1587 desc_write(xe, g2h, status, desc_status | GUC_CTB_STATUS_OVERFLOW);
xe_gt_err(gt, "CT read: invalid tail offset %u >= %u\n",
1589 desc_tail, g2h->info.size);
1590 goto corrupted;
1591 }
1592 }
1593
1594 /* Calculate DW available to read */
1595 tail = desc_read(xe, g2h, tail);
1596 avail = tail - g2h->info.head;
1597 if (unlikely(avail == 0))
1598 return 0;
1599
1600 if (avail < 0)
1601 avail += g2h->info.size;
1602
1603 /* Read header */
1604 xe_map_memcpy_from(xe, msg, &g2h->cmds, sizeof(u32) * g2h->info.head,
1605 sizeof(u32));
1606 len = FIELD_GET(GUC_CTB_MSG_0_NUM_DWORDS, msg[0]) + GUC_CTB_MSG_MIN_LEN;
1607 if (len > avail) {
1608 xe_gt_err(gt, "G2H channel broken on read, avail=%d, len=%d, reset required\n",
1609 avail, len);
1610 goto corrupted;
1611 }
1612
1613 head = (g2h->info.head + 1) % g2h->info.size;
1614 avail = len - 1;
1615
1616 /* Read G2H message */
1617 if (avail + head > g2h->info.size) {
1618 u32 avail_til_wrap = g2h->info.size - head;
1619
1620 xe_map_memcpy_from(xe, msg + 1,
1621 &g2h->cmds, sizeof(u32) * head,
1622 avail_til_wrap * sizeof(u32));
1623 xe_map_memcpy_from(xe, msg + 1 + avail_til_wrap,
1624 &g2h->cmds, 0,
1625 (avail - avail_til_wrap) * sizeof(u32));
1626 } else {
1627 xe_map_memcpy_from(xe, msg + 1,
1628 &g2h->cmds, sizeof(u32) * head,
1629 avail * sizeof(u32));
1630 }
1631
1632 hxg = msg_to_hxg(msg);
1633 action = FIELD_GET(GUC_HXG_EVENT_MSG_0_ACTION, hxg[0]);
1634
1635 if (fast_path) {
1636 if (FIELD_GET(GUC_HXG_MSG_0_TYPE, hxg[0]) != GUC_HXG_TYPE_EVENT)
1637 return 0;
1638
1639 switch (action) {
1640 case XE_GUC_ACTION_REPORT_PAGE_FAULT_REQ_DESC:
1641 case XE_GUC_ACTION_TLB_INVALIDATION_DONE:
1642 break; /* Process these in fast-path */
1643 default:
1644 return 0;
1645 }
1646 }
1647
1648 /* Update local / descriptor header */
1649 g2h->info.head = (head + avail) % g2h->info.size;
1650 desc_write(xe, g2h, head, g2h->info.head);
1651
1652 trace_xe_guc_ctb_g2h(xe, ct_to_gt(ct)->info.id,
1653 action, len, g2h->info.head, tail);
1654
1655 return len;
1656
1657 corrupted:
1658 CT_DEAD(ct, &ct->ctbs.g2h, G2H_READ);
1659 return -EPROTO;
1660 }
1661
static void g2h_fast_path(struct xe_guc_ct *ct, u32 *msg, u32 len)
1663 {
1664 struct xe_gt *gt = ct_to_gt(ct);
1665 struct xe_guc *guc = ct_to_guc(ct);
1666 u32 hxg_len = msg_len_to_hxg_len(len);
1667 u32 *hxg = msg_to_hxg(msg);
1668 u32 action = FIELD_GET(GUC_HXG_EVENT_MSG_0_ACTION, hxg[0]);
1669 u32 *payload = hxg + GUC_HXG_MSG_MIN_LEN;
1670 u32 adj_len = hxg_len - GUC_HXG_MSG_MIN_LEN;
1671 int ret = 0;
1672
1673 switch (action) {
1674 case XE_GUC_ACTION_REPORT_PAGE_FAULT_REQ_DESC:
1675 ret = xe_guc_pagefault_handler(guc, payload, adj_len);
1676 break;
1677 case XE_GUC_ACTION_TLB_INVALIDATION_DONE:
1678 __g2h_release_space(ct, len);
1679 ret = xe_guc_tlb_inval_done_handler(guc, payload, adj_len);
1680 break;
1681 default:
1682 xe_gt_warn(gt, "NOT_POSSIBLE");
1683 }
1684
1685 if (ret) {
1686 xe_gt_err(gt, "G2H action 0x%04x failed (%pe)\n",
1687 action, ERR_PTR(ret));
1688 CT_DEAD(ct, NULL, FAST_G2H);
1689 }
1690 }
1691
1692 /**
1693 * xe_guc_ct_fast_path - process critical G2H in the IRQ handler
1694 * @ct: GuC CT object
1695 *
1696 * Anything related to page faults is critical for performance, process these
1697 * critical G2H in the IRQ. This is safe as these handlers either just wake up
1698 * waiters or queue another worker.
1699 */
void xe_guc_ct_fast_path(struct xe_guc_ct *ct)
1701 {
1702 struct xe_device *xe = ct_to_xe(ct);
1703 bool ongoing;
1704 int len;
1705
1706 ongoing = xe_pm_runtime_get_if_active(ct_to_xe(ct));
1707 if (!ongoing && xe_pm_read_callback_task(ct_to_xe(ct)) == NULL)
1708 return;
1709
1710 spin_lock(&ct->fast_lock);
1711 do {
1712 len = g2h_read(ct, ct->fast_msg, true);
1713 if (len > 0)
1714 g2h_fast_path(ct, ct->fast_msg, len);
1715 } while (len > 0);
1716 spin_unlock(&ct->fast_lock);
1717
1718 if (ongoing)
1719 xe_pm_runtime_put(xe);
1720 }
1721
1722 /* Returns less than zero on error, 0 on done, 1 on more available */
static int dequeue_one_g2h(struct xe_guc_ct *ct)
1724 {
1725 int len;
1726 int ret;
1727
1728 lockdep_assert_held(&ct->lock);
1729
1730 spin_lock_irq(&ct->fast_lock);
1731 len = g2h_read(ct, ct->msg, false);
1732 spin_unlock_irq(&ct->fast_lock);
1733 if (len <= 0)
1734 return len;
1735
1736 ret = parse_g2h_msg(ct, ct->msg, len);
1737 if (unlikely(ret < 0))
1738 return ret;
1739
1740 ret = process_g2h_msg(ct, ct->msg, len);
1741 if (unlikely(ret < 0))
1742 return ret;
1743
1744 return 1;
1745 }
1746
static void receive_g2h(struct xe_guc_ct *ct)
1748 {
1749 bool ongoing;
1750 int ret;
1751
1752 /*
1753 * Normal users must always hold mem_access.ref around CT calls. However
1754 * during the runtime pm callbacks we rely on CT to talk to the GuC, but
1755 * at this stage we can't rely on mem_access.ref and even the
1756 * callback_task will be different than current. For such cases we just
1757 * need to ensure we always process the responses from any blocking
1758 * ct_send requests or where we otherwise expect some response when
1759 * initiated from those callbacks (which will need to wait for the below
1760 * dequeue_one_g2h()). The dequeue_one_g2h() will gracefully fail if
1761 * the device has suspended to the point that the CT communication has
1762 * been disabled.
1763 *
1764 * If we are inside the runtime pm callback, we can be the only task
1765 * still issuing CT requests (since that requires having the
1766 * mem_access.ref). It seems like it might in theory be possible to
1767 * receive unsolicited events from the GuC just as we are
1768  * suspending-resuming, but those will currently be lost anyway when
1769 * eventually exiting from suspend, hence no need to wake up the device
1770 * here. If we ever need something stronger than get_if_ongoing() then
1771 * we need to be careful with blocking the pm callbacks from getting CT
1772 * responses, if the worker here is blocked on those callbacks
1773 * completing, creating a deadlock.
1774 */
1775 ongoing = xe_pm_runtime_get_if_active(ct_to_xe(ct));
1776 if (!ongoing && xe_pm_read_callback_task(ct_to_xe(ct)) == NULL)
1777 return;
1778
1779 do {
1780 mutex_lock(&ct->lock);
1781 ret = dequeue_one_g2h(ct);
1782 mutex_unlock(&ct->lock);
1783
1784 if (unlikely(ret == -EPROTO || ret == -EOPNOTSUPP)) {
1785 xe_gt_err(ct_to_gt(ct), "CT dequeue failed: %d", ret);
1786 CT_DEAD(ct, NULL, G2H_RECV);
1787 kick_reset(ct);
1788 }
1789 } while (ret == 1);
1790
1791 if (ongoing)
1792 xe_pm_runtime_put(ct_to_xe(ct));
1793 }
1794
1795 static void g2h_worker_func(struct work_struct *w)
1796 {
1797 struct xe_guc_ct *ct = container_of(w, struct xe_guc_ct, g2h_worker);
1798
1799 receive_g2h(ct);
1800 }
1801
1802 static void xe_fixup_u64_in_cmds(struct xe_device *xe, struct iosys_map *cmds,
1803 u32 size, u32 idx, s64 shift)
1804 {
1805 u32 hi, lo;
1806 u64 offset;
1807
1808 lo = xe_map_rd_ring_u32(xe, cmds, idx, size);
1809 hi = xe_map_rd_ring_u32(xe, cmds, idx + 1, size);
1810 offset = make_u64(hi, lo);
1811 offset += shift;
1812 lo = lower_32_bits(offset);
1813 hi = upper_32_bits(offset);
1814 xe_map_wr_ring_u32(xe, cmds, idx, size, lo);
1815 xe_map_wr_ring_u32(xe, cmds, idx + 1, size, hi);
1816 }
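
/*
 * Illustrative example (not from the original source, values assumed): a GGTT
 * offset of 0x1_0000_1000 stored in the ring as lo = 0x00001000 and
 * hi = 0x00000001, combined with a shift of +0x100000, is rewritten by the
 * helper above to lo = 0x00101000 / hi = 0x00000001, i.e. the combined 64-bit
 * value becomes 0x1_0010_1000.
 */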
1817
1818 /*
1819  * Shift any GGTT addresses within a single message that was left in the CTB
1820  * from before post-migration recovery.
1821 * @ct: pointer to CT struct of the target GuC
1822 * @cmds: iomap buffer containing CT messages
1823 * @head: start of the target message within the buffer
1824 * @len: length of the target message
1825 * @size: size of the commands buffer
1826 * @shift: the address shift to be added to each GGTT reference
1827 * Return: true if the message was fixed or needed no fixups, false on failure
1828 */
1829 static bool ct_fixup_ggtt_in_message(struct xe_guc_ct *ct,
1830 struct iosys_map *cmds, u32 head,
1831 u32 len, u32 size, s64 shift)
1832 {
1833 struct xe_gt *gt = ct_to_gt(ct);
1834 struct xe_device *xe = ct_to_xe(ct);
1835 u32 msg[GUC_HXG_MSG_MIN_LEN];
1836 u32 action, i, n;
1837
1838 xe_gt_assert(gt, len >= GUC_HXG_MSG_MIN_LEN);
1839
1840 msg[0] = xe_map_rd_ring_u32(xe, cmds, head, size);
1841 action = FIELD_GET(GUC_HXG_REQUEST_MSG_0_ACTION, msg[0]);
1842
1843 xe_gt_sriov_dbg_verbose(gt, "fixing H2G %#x\n", action);
1844
1845 switch (action) {
1846 case XE_GUC_ACTION_REGISTER_CONTEXT:
1847 if (len != XE_GUC_REGISTER_CONTEXT_MSG_LEN)
1848 goto err_len;
1849 xe_fixup_u64_in_cmds(xe, cmds, size, head +
1850 XE_GUC_REGISTER_CONTEXT_DATA_5_WQ_DESC_ADDR_LOWER,
1851 shift);
1852 xe_fixup_u64_in_cmds(xe, cmds, size, head +
1853 XE_GUC_REGISTER_CONTEXT_DATA_7_WQ_BUF_BASE_LOWER,
1854 shift);
1855 xe_fixup_u64_in_cmds(xe, cmds, size, head +
1856 XE_GUC_REGISTER_CONTEXT_DATA_10_HW_LRC_ADDR, shift);
1857 break;
1858 case XE_GUC_ACTION_REGISTER_CONTEXT_MULTI_LRC:
1859 if (len < XE_GUC_REGISTER_CONTEXT_MULTI_LRC_MSG_MIN_LEN)
1860 goto err_len;
1861 n = xe_map_rd_ring_u32(xe, cmds, head +
1862 XE_GUC_REGISTER_CONTEXT_MULTI_LRC_DATA_10_NUM_CTXS, size);
1863 if (len != XE_GUC_REGISTER_CONTEXT_MULTI_LRC_MSG_MIN_LEN + 2 * n)
1864 goto err_len;
1865 xe_fixup_u64_in_cmds(xe, cmds, size, head +
1866 XE_GUC_REGISTER_CONTEXT_MULTI_LRC_DATA_5_WQ_DESC_ADDR_LOWER,
1867 shift);
1868 xe_fixup_u64_in_cmds(xe, cmds, size, head +
1869 XE_GUC_REGISTER_CONTEXT_MULTI_LRC_DATA_7_WQ_BUF_BASE_LOWER,
1870 shift);
1871 for (i = 0; i < n; i++)
1872 xe_fixup_u64_in_cmds(xe, cmds, size, head +
1873 XE_GUC_REGISTER_CONTEXT_MULTI_LRC_DATA_11_HW_LRC_ADDR
1874 + 2 * i, shift);
1875 break;
1876 default:
1877 break;
1878 }
1879 return true;
1880
1881 err_len:
1882 	xe_gt_err(gt, "Skipped H2G %#x message fixups, unexpected length (%u)\n", action, len);
1883 return false;
1884 }
1885
1886 /*
1887 * Apply fixups to the next outgoing CT message within given CTB
1888 * @ct: the &xe_guc_ct struct instance representing the target GuC
1889 * @h2g: the &guc_ctb struct instance of the target buffer
1890 * @shift: shift to be added to all GGTT addresses within the CTB
1891  * @mhead: pointer to an integer storing the message start position; the
1892  * position is advanced to the next message before this function returns
1893  * @avail: size of the area available for parsing, that is the combined
1894  * length of all remaining messages stored within the CTB
1895  * Return: size of the area available for parsing after one message
1896  * has been parsed, that is the length remaining from the updated mhead
1897 */
1898 static int ct_fixup_ggtt_in_buffer(struct xe_guc_ct *ct, struct guc_ctb *h2g,
1899 s64 shift, u32 *mhead, s32 avail)
1900 {
1901 struct xe_gt *gt = ct_to_gt(ct);
1902 struct xe_device *xe = ct_to_xe(ct);
1903 u32 msg[GUC_HXG_MSG_MIN_LEN];
1904 u32 size = h2g->info.size;
1905 u32 head = *mhead;
1906 u32 len;
1907
1908 xe_gt_assert(gt, avail >= (s32)GUC_CTB_MSG_MIN_LEN);
1909
1910 /* Read header */
1911 msg[0] = xe_map_rd_ring_u32(xe, &h2g->cmds, head, size);
1912 len = FIELD_GET(GUC_CTB_MSG_0_NUM_DWORDS, msg[0]) + GUC_CTB_MSG_MIN_LEN;
1913
1914 if (unlikely(len > (u32)avail)) {
1915 xe_gt_err(gt, "H2G channel broken on read, avail=%d, len=%d, fixups skipped\n",
1916 avail, len);
1917 return 0;
1918 }
1919
1920 head = (head + GUC_CTB_MSG_MIN_LEN) % size;
1921 if (!ct_fixup_ggtt_in_message(ct, &h2g->cmds, head, msg_len_to_hxg_len(len), size, shift))
1922 return 0;
1923 *mhead = (head + msg_len_to_hxg_len(len)) % size;
1924
1925 return avail - len;
1926 }
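
/*
 * Illustrative example (not from the original source): if the header DW0 read
 * above encodes NUM_DWORDS = 11, then len evaluates to 11 + GUC_CTB_MSG_MIN_LEN
 * dwords. When head sits near the end of the ring, the "% size" arithmetic
 * wraps the HXG payload back to the start of the buffer, which is why every
 * per-dword access goes through the xe_map_*_ring_u32() helpers.
 */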
1927
1928 /**
1929 * xe_guc_ct_fixup_messages_with_ggtt - Fixup any pending H2G CTB messages
1930 * @ct: pointer to CT struct of the target GuC
1931 * @ggtt_shift: shift to be added to all GGTT addresses within the CTB
1932 *
1933  * Messages in the GuC to Host CTB are owned by the GuC and any fixups in
1934  * them are made by the GuC. But the content of the Host to GuC CTB is owned
1935  * by the KMD, so fixups to GGTT references in any pending messages need to
1936  * be applied here.
1937 * This function updates GGTT offsets in payloads of pending H2G CTB
1938 * messages (messages which were not consumed by GuC before the VF got
1939 * paused).
1940 */
1941 void xe_guc_ct_fixup_messages_with_ggtt(struct xe_guc_ct *ct, s64 ggtt_shift)
1942 {
1943 struct guc_ctb *h2g = &ct->ctbs.h2g;
1944 struct xe_guc *guc = ct_to_guc(ct);
1945 struct xe_gt *gt = guc_to_gt(guc);
1946 u32 head, tail, size;
1947 s32 avail;
1948
1949 if (unlikely(h2g->info.broken))
1950 return;
1951
1952 h2g->info.head = desc_read(ct_to_xe(ct), h2g, head);
1953 head = h2g->info.head;
1954 tail = READ_ONCE(h2g->info.tail);
1955 size = h2g->info.size;
1956
1957 if (unlikely(head > size))
1958 goto corrupted;
1959
1960 if (unlikely(tail >= size))
1961 goto corrupted;
1962
1963 avail = tail - head;
1964
1965 /* beware of buffer wrap case */
1966 if (unlikely(avail < 0))
1967 avail += size;
1968 xe_gt_dbg(gt, "available %d (%u:%u:%u)\n", avail, head, tail, size);
1969 xe_gt_assert(gt, avail >= 0);
1970
1971 while (avail > 0)
1972 avail = ct_fixup_ggtt_in_buffer(ct, h2g, ggtt_shift, &head, avail);
1973
1974 return;
1975
1976 corrupted:
1977 xe_gt_err(gt, "Corrupted H2G descriptor head=%u tail=%u size=%u, fixups not applied\n",
1978 head, tail, size);
1979 h2g->info.broken = true;
1980 }
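
/*
 * Usage sketch (hypothetical, caller context assumed): a VF post-migration
 * recovery path that has computed the GGTT relocation delta would call
 *
 *	xe_guc_ct_fixup_messages_with_ggtt(&gt->uc.guc.ct, ggtt_shift);
 *
 * before resuming submission, so that any H2G messages queued prior to the
 * migration reference the new GGTT placement.
 */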
1981
1982 static struct xe_guc_ct_snapshot *guc_ct_snapshot_alloc(struct xe_guc_ct *ct, bool atomic,
1983 bool want_ctb)
1984 {
1985 struct xe_guc_ct_snapshot *snapshot;
1986
1987 snapshot = kzalloc(sizeof(*snapshot), atomic ? GFP_ATOMIC : GFP_KERNEL);
1988 if (!snapshot)
1989 return NULL;
1990
1991 if (ct->bo && want_ctb) {
1992 snapshot->ctb_size = xe_bo_size(ct->bo);
1993 snapshot->ctb = kmalloc(snapshot->ctb_size, atomic ? GFP_ATOMIC : GFP_KERNEL);
1994 }
1995
1996 return snapshot;
1997 }
1998
1999 static void guc_ctb_snapshot_capture(struct xe_device *xe, struct guc_ctb *ctb,
2000 struct guc_ctb_snapshot *snapshot)
2001 {
2002 xe_map_memcpy_from(xe, &snapshot->desc, &ctb->desc, 0,
2003 sizeof(struct guc_ct_buffer_desc));
2004 memcpy(&snapshot->info, &ctb->info, sizeof(struct guc_ctb_info));
2005 }
2006
2007 static void guc_ctb_snapshot_print(struct guc_ctb_snapshot *snapshot,
2008 struct drm_printer *p)
2009 {
2010 drm_printf(p, "\tsize: %d\n", snapshot->info.size);
2011 drm_printf(p, "\tresv_space: %d\n", snapshot->info.resv_space);
2012 drm_printf(p, "\thead: %d\n", snapshot->info.head);
2013 drm_printf(p, "\ttail: %d\n", snapshot->info.tail);
2014 drm_printf(p, "\tspace: %d\n", snapshot->info.space);
2015 drm_printf(p, "\tbroken: %d\n", snapshot->info.broken);
2016 drm_printf(p, "\thead (memory): %d\n", snapshot->desc.head);
2017 drm_printf(p, "\ttail (memory): %d\n", snapshot->desc.tail);
2018 drm_printf(p, "\tstatus (memory): 0x%x\n", snapshot->desc.status);
2019 }
2020
2021 static struct xe_guc_ct_snapshot *guc_ct_snapshot_capture(struct xe_guc_ct *ct, bool atomic,
2022 bool want_ctb)
2023 {
2024 struct xe_device *xe = ct_to_xe(ct);
2025 struct xe_guc_ct_snapshot *snapshot;
2026
2027 snapshot = guc_ct_snapshot_alloc(ct, atomic, want_ctb);
2028 if (!snapshot) {
2029 xe_gt_err(ct_to_gt(ct), "Skipping CTB snapshot entirely.\n");
2030 return NULL;
2031 }
2032
2033 if (xe_guc_ct_enabled(ct) || ct->state == XE_GUC_CT_STATE_STOPPED) {
2034 snapshot->ct_enabled = true;
2035 snapshot->g2h_outstanding = READ_ONCE(ct->g2h_outstanding);
2036 guc_ctb_snapshot_capture(xe, &ct->ctbs.h2g, &snapshot->h2g);
2037 guc_ctb_snapshot_capture(xe, &ct->ctbs.g2h, &snapshot->g2h);
2038 }
2039
2040 if (ct->bo && snapshot->ctb)
2041 xe_map_memcpy_from(xe, snapshot->ctb, &ct->bo->vmap, 0, snapshot->ctb_size);
2042
2043 return snapshot;
2044 }
2045
2046 /**
2047 * xe_guc_ct_snapshot_capture - Take a quick snapshot of the CT state.
2048 * @ct: GuC CT object.
2049 *
2050  * The snapshot can be printed out at a later stage, e.g. during dev_coredump
2051  * analysis. This function is safe to call from atomic context.
2052 *
2053 * Returns: a GuC CT snapshot object that must be freed by the caller
2054 * by using `xe_guc_ct_snapshot_free`.
2055 */
2056 struct xe_guc_ct_snapshot *xe_guc_ct_snapshot_capture(struct xe_guc_ct *ct)
2057 {
2058 return guc_ct_snapshot_capture(ct, true, true);
2059 }
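
/*
 * Usage sketch (hypothetical caller, mirroring what xe_guc_ct_print() does
 * below; a struct drm_printer p obtained elsewhere is assumed). Capture,
 * print and free must be paired by the caller:
 *
 *	struct xe_guc_ct_snapshot *snapshot = xe_guc_ct_snapshot_capture(ct);
 *
 *	xe_guc_ct_snapshot_print(snapshot, &p);
 *	xe_guc_ct_snapshot_free(snapshot);
 */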
2060
2061 /**
2062 * xe_guc_ct_snapshot_print - Print out a given GuC CT snapshot.
2063 * @snapshot: GuC CT snapshot object.
2064 * @p: drm_printer where it will be printed out.
2065 *
2066 * This function prints out a given GuC CT snapshot object.
2067 */
2068 void xe_guc_ct_snapshot_print(struct xe_guc_ct_snapshot *snapshot,
2069 struct drm_printer *p)
2070 {
2071 if (!snapshot)
2072 return;
2073
2074 if (snapshot->ct_enabled) {
2075 drm_puts(p, "H2G CTB (all sizes in DW):\n");
2076 guc_ctb_snapshot_print(&snapshot->h2g, p);
2077
2078 drm_puts(p, "G2H CTB (all sizes in DW):\n");
2079 guc_ctb_snapshot_print(&snapshot->g2h, p);
2080 drm_printf(p, "\tg2h outstanding: %d\n",
2081 snapshot->g2h_outstanding);
2082
2083 if (snapshot->ctb) {
2084 drm_printf(p, "[CTB].length: 0x%zx\n", snapshot->ctb_size);
2085 xe_print_blob_ascii85(p, "[CTB].data", '\n',
2086 snapshot->ctb, 0, snapshot->ctb_size);
2087 }
2088 } else {
2089 drm_puts(p, "CT disabled\n");
2090 }
2091 }
2092
2093 /**
2094 * xe_guc_ct_snapshot_free - Free all allocated objects for a given snapshot.
2095 * @snapshot: GuC CT snapshot object.
2096 *
2097  * This function frees all the memory that was allocated at capture
2098  * time.
2099 */
2100 void xe_guc_ct_snapshot_free(struct xe_guc_ct_snapshot *snapshot)
2101 {
2102 if (!snapshot)
2103 return;
2104
2105 kfree(snapshot->ctb);
2106 kfree(snapshot);
2107 }
2108
2109 /**
2110 * xe_guc_ct_print - GuC CT Print.
2111 * @ct: GuC CT.
2112 * @p: drm_printer where it will be printed out.
2113 * @want_ctb: Should the full CTB content be dumped (vs just the headers)
2114 *
2115 * This function will quickly capture a snapshot of the CT state
2116 * and immediately print it out.
2117 */
2118 void xe_guc_ct_print(struct xe_guc_ct *ct, struct drm_printer *p, bool want_ctb)
2119 {
2120 struct xe_guc_ct_snapshot *snapshot;
2121
2122 snapshot = guc_ct_snapshot_capture(ct, false, want_ctb);
2123 xe_guc_ct_snapshot_print(snapshot, p);
2124 xe_guc_ct_snapshot_free(snapshot);
2125 }
2126
2127 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
2128
2129 #ifdef CONFIG_FUNCTION_ERROR_INJECTION
2130 /*
2131  * This helper lets the driver detect whether a fault injection test is
2132  * currently active, so it can reduce unnecessary debug output. Typically,
2133  * the function returns zero, but the fault injection framework can alter
2134  * this to return an error. Since faults are injected through this function,
2135  * it is important to ensure the compiler doesn't optimize it into an inline
2136  * function. To avoid such optimization, the 'noinline' attribute is applied,
2137  * as the compiler would otherwise be free to inline a trivial function
2138  * like this one.
2139 */
2140 noinline int xe_is_injection_active(void) { return 0; }
2141 ALLOW_ERROR_INJECTION(xe_is_injection_active, ERRNO);
2142 #else
2143 int xe_is_injection_active(void) { return 0; }
2144 #endif
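
/*
 * Example (sketch, assuming CONFIG_FAULT_INJECTION_DEBUG_FS is enabled): a test
 * can make xe_is_injection_active() report an active fault via the
 * fail_function framework, e.g. by writing the function name to
 * /sys/kernel/debug/fail_function/inject and a negative errno to
 * /sys/kernel/debug/fail_function/xe_is_injection_active/retval, which causes
 * ct_dead_capture() below to skip generating the (very large) dead-CT dump.
 */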
2145
2146 static void ct_dead_capture(struct xe_guc_ct *ct, struct guc_ctb *ctb, u32 reason_code)
2147 {
2148 struct xe_guc_log_snapshot *snapshot_log;
2149 struct xe_guc_ct_snapshot *snapshot_ct;
2150 struct xe_guc *guc = ct_to_guc(ct);
2151 unsigned long flags;
2152 bool have_capture;
2153
2154 if (ctb)
2155 ctb->info.broken = true;
2156 /*
2157 	 * A huge dump gets generated when injecting errors into GuC CT/MMIO
2158 	 * functions, so suppress the dump while a fault is being injected.
2159 */
2160 if (xe_is_injection_active())
2161 return;
2162
2163 /* Ignore further errors after the first dump until a reset */
2164 if (ct->dead.reported)
2165 return;
2166
2167 spin_lock_irqsave(&ct->dead.lock, flags);
2168
2169 /* And only capture one dump at a time */
2170 have_capture = ct->dead.reason & (1 << CT_DEAD_STATE_CAPTURE);
2171 ct->dead.reason |= (1 << reason_code) |
2172 (1 << CT_DEAD_STATE_CAPTURE);
2173
2174 spin_unlock_irqrestore(&ct->dead.lock, flags);
2175
2176 if (have_capture)
2177 return;
2178
2179 snapshot_log = xe_guc_log_snapshot_capture(&guc->log, true);
2180 snapshot_ct = xe_guc_ct_snapshot_capture((ct));
2181
2182 spin_lock_irqsave(&ct->dead.lock, flags);
2183
2184 if (ct->dead.snapshot_log || ct->dead.snapshot_ct) {
2185 xe_gt_err(ct_to_gt(ct), "Got unexpected dead CT capture!\n");
2186 xe_guc_log_snapshot_free(snapshot_log);
2187 xe_guc_ct_snapshot_free(snapshot_ct);
2188 } else {
2189 ct->dead.snapshot_log = snapshot_log;
2190 ct->dead.snapshot_ct = snapshot_ct;
2191 }
2192
2193 spin_unlock_irqrestore(&ct->dead.lock, flags);
2194
2195 queue_work(system_unbound_wq, &(ct)->dead.worker);
2196 }
2197
2198 static void ct_dead_print(struct xe_dead_ct *dead)
2199 {
2200 struct xe_guc_ct *ct = container_of(dead, struct xe_guc_ct, dead);
2201 struct xe_device *xe = ct_to_xe(ct);
2202 struct xe_gt *gt = ct_to_gt(ct);
2203 static int g_count;
2204 struct drm_printer ip = xe_gt_info_printer(gt);
2205 struct drm_printer lp = drm_line_printer(&ip, "Capture", ++g_count);
2206
2207 if (!dead->reason) {
2208 xe_gt_err(gt, "CTB is dead for no reason!?\n");
2209 return;
2210 }
2211
2212 /* Can't generate a genuine core dump at this point, so just do the good bits */
2213 drm_puts(&lp, "**** Xe Device Coredump ****\n");
2214 drm_printf(&lp, "Reason: CTB is dead - 0x%X\n", dead->reason);
2215 xe_device_snapshot_print(xe, &lp);
2216
2217 drm_printf(&lp, "**** GT #%d ****\n", gt->info.id);
2218 drm_printf(&lp, "\tTile: %d\n", gt->tile->id);
2219
2220 drm_puts(&lp, "**** GuC Log ****\n");
2221 xe_guc_log_snapshot_print(dead->snapshot_log, &lp);
2222
2223 drm_puts(&lp, "**** GuC CT ****\n");
2224 xe_guc_ct_snapshot_print(dead->snapshot_ct, &lp);
2225
2226 drm_puts(&lp, "Done.\n");
2227 }
2228
2229 static void ct_dead_worker_func(struct work_struct *w)
2230 {
2231 struct xe_guc_ct *ct = container_of(w, struct xe_guc_ct, dead.worker);
2232
2233 if (!ct->dead.reported) {
2234 ct->dead.reported = true;
2235 ct_dead_print(&ct->dead);
2236 }
2237
2238 spin_lock_irq(&ct->dead.lock);
2239
2240 xe_guc_log_snapshot_free(ct->dead.snapshot_log);
2241 ct->dead.snapshot_log = NULL;
2242 xe_guc_ct_snapshot_free(ct->dead.snapshot_ct);
2243 ct->dead.snapshot_ct = NULL;
2244
2245 if (ct->dead.reason & (1 << CT_DEAD_STATE_REARM)) {
2246 /* A reset has occurred so re-arm the error reporting */
2247 ct->dead.reason = 0;
2248 ct->dead.reported = false;
2249 }
2250
2251 spin_unlock_irq(&ct->dead.lock);
2252 }
2253 #endif
2254