1 // SPDX-License-Identifier: MIT
2 /*
3 * Copyright © 2022 Intel Corporation
4 */
5
6 #include "xe_guc_ct.h"
7
8 #include <linux/bitfield.h>
9 #include <linux/circ_buf.h>
10 #include <linux/delay.h>
11 #include <linux/fault-inject.h>
12
13 #include <kunit/static_stub.h>
14
15 #include <drm/drm_managed.h>
16
17 #include "abi/guc_actions_abi.h"
18 #include "abi/guc_actions_sriov_abi.h"
19 #include "abi/guc_klvs_abi.h"
20 #include "xe_bo.h"
21 #include "xe_devcoredump.h"
22 #include "xe_device.h"
23 #include "xe_gt.h"
24 #include "xe_gt_pagefault.h"
25 #include "xe_gt_printk.h"
26 #include "xe_gt_sriov_pf_control.h"
27 #include "xe_gt_sriov_pf_monitor.h"
28 #include "xe_gt_tlb_invalidation.h"
29 #include "xe_guc.h"
30 #include "xe_guc_log.h"
31 #include "xe_guc_relay.h"
32 #include "xe_guc_submit.h"
33 #include "xe_map.h"
34 #include "xe_pm.h"
35 #include "xe_trace_guc.h"
36
37 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
38 enum {
39 /* Internal states, not error conditions */
40 CT_DEAD_STATE_REARM, /* 0x0001 */
41 CT_DEAD_STATE_CAPTURE, /* 0x0002 */
42
43 /* Error conditions */
44 CT_DEAD_SETUP, /* 0x0004 */
45 CT_DEAD_H2G_WRITE, /* 0x0008 */
46 CT_DEAD_H2G_HAS_ROOM, /* 0x0010 */
47 CT_DEAD_G2H_READ, /* 0x0020 */
48 CT_DEAD_G2H_RECV, /* 0x0040 */
49 CT_DEAD_G2H_RELEASE, /* 0x0080 */
50 CT_DEAD_DEADLOCK, /* 0x0100 */
51 CT_DEAD_PROCESS_FAILED, /* 0x0200 */
52 CT_DEAD_FAST_G2H, /* 0x0400 */
53 CT_DEAD_PARSE_G2H_RESPONSE, /* 0x0800 */
54 CT_DEAD_PARSE_G2H_UNKNOWN, /* 0x1000 */
55 CT_DEAD_PARSE_G2H_ORIGIN, /* 0x2000 */
56 CT_DEAD_PARSE_G2H_TYPE, /* 0x4000 */
57 };
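/*
 * These enum values are bit positions: ct_dead_capture() records
 * (1 << reason_code), so the hex values in the comments above are the masks
 * that appear in the "CTB is dead - reason=0x..." message. For example
 * (hypothetical value, not from a real log), reason=0x0104 decodes to
 * CT_DEAD_DEADLOCK (0x0100) plus CT_DEAD_SETUP (0x0004).
 */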
58
59 static void ct_dead_worker_func(struct work_struct *w);
60 static void ct_dead_capture(struct xe_guc_ct *ct, struct guc_ctb *ctb, u32 reason_code);
61
62 #define CT_DEAD(ct, ctb, reason_code) ct_dead_capture((ct), (ctb), CT_DEAD_##reason_code)
63 #else
64 #define CT_DEAD(ct, ctb, reason) \
65 do { \
66 struct guc_ctb *_ctb = (ctb); \
67 if (_ctb) \
68 _ctb->info.broken = true; \
69 } while (0)
70 #endif
71
72 /* Used when a CT send wants to block and / or receive data */
73 struct g2h_fence {
74 u32 *response_buffer;
75 u32 seqno;
76 u32 response_data;
77 u16 response_len;
78 u16 error;
79 u16 hint;
80 u16 reason;
81 bool retry;
82 bool fail;
83 bool done;
84 };
85
86 static void g2h_fence_init(struct g2h_fence *g2h_fence, u32 *response_buffer)
87 {
88 g2h_fence->response_buffer = response_buffer;
89 g2h_fence->response_data = 0;
90 g2h_fence->response_len = 0;
91 g2h_fence->fail = false;
92 g2h_fence->retry = false;
93 g2h_fence->done = false;
94 g2h_fence->seqno = ~0x0;
95 }
96
97 static bool g2h_fence_needs_alloc(struct g2h_fence *g2h_fence)
98 {
99 return g2h_fence->seqno == ~0x0;
100 }
101
102 static struct xe_guc *
103 ct_to_guc(struct xe_guc_ct *ct)
104 {
105 return container_of(ct, struct xe_guc, ct);
106 }
107
108 static struct xe_gt *
109 ct_to_gt(struct xe_guc_ct *ct)
110 {
111 return container_of(ct, struct xe_gt, uc.guc.ct);
112 }
113
114 static struct xe_device *
115 ct_to_xe(struct xe_guc_ct *ct)
116 {
117 return gt_to_xe(ct_to_gt(ct));
118 }
119
120 /**
121 * DOC: GuC CTB Blob
122 *
123 * We allocate single blob to hold both CTB descriptors and buffers:
124 *
125 * +--------+-----------------------------------------------+------+
126 * | offset | contents | size |
127 * +========+===============================================+======+
128 * | 0x0000 | H2G CTB Descriptor (send) | |
129 * +--------+-----------------------------------------------+ 4K |
130 * | 0x0800 | G2H CTB Descriptor (g2h) | |
131 * +--------+-----------------------------------------------+------+
132 * | 0x1000 | H2G CT Buffer (send) | n*4K |
133 * | | | |
134 * +--------+-----------------------------------------------+------+
135 * | 0x1000 | G2H CT Buffer (g2h) | m*4K |
136 * | + n*4K | | |
137 * +--------+-----------------------------------------------+------+
138 *
139 * The size of each ``CT Buffer`` must be a multiple of 4K.
140 * We don't expect too many messages in flight at any time, unless we are
141 * using GuC submission. In that case each request requires a minimum of
142 * 2 dwords, which gives us a maximum of 256 queued requests. Hopefully this is
143 * enough space to avoid backpressure on the driver. We increase the size
144 * of the receive buffer (relative to the send) to ensure a G2H response
145 * CTB has a landing spot.
146 *
147 * In addition to submissions, the G2H buffer needs to be able to hold
148 * enough space for recoverable page fault notifications. The number of
149 * page faults is interrupt driven and can be as much as the number of
150 * compute resources available. However, most of the actual work for these
151 * is in a separate page fault worker thread. Therefore we only need to
152 * make sure the queue has enough space to handle all of the submissions
153 * and responses and an extra buffer for incoming page faults.
154 */
155
156 #define CTB_DESC_SIZE ALIGN(sizeof(struct guc_ct_buffer_desc), SZ_2K)
157 #define CTB_H2G_BUFFER_SIZE (SZ_4K)
158 #define CTB_G2H_BUFFER_SIZE (SZ_128K)
159 #define G2H_ROOM_BUFFER_SIZE (CTB_G2H_BUFFER_SIZE / 2)
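/*
 * Illustrative sketch (not part of the driver): how the offsets of the four
 * regions in the "GuC CTB Blob" layout documented above follow from the sizes
 * here. With CTB_DESC_SIZE == 2K these reproduce the 0x0000 / 0x0800 / 0x1000
 * offsets in the table. The helper names are local to this example.
 */
static inline u32 __maybe_unused example_h2g_desc_offset(void) { return 0; }
static inline u32 __maybe_unused example_g2h_desc_offset(void) { return CTB_DESC_SIZE; }
static inline u32 __maybe_unused example_h2g_buffer_offset(void) { return CTB_DESC_SIZE * 2; }
static inline u32 __maybe_unused example_g2h_buffer_offset(void)
{
	return CTB_DESC_SIZE * 2 + CTB_H2G_BUFFER_SIZE;
}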
160
161 /**
162 * xe_guc_ct_queue_proc_time_jiffies - Return maximum time to process a full
163 * CT command queue
164 * @ct: the &xe_guc_ct. Unused at this moment but will be used in the future.
165 *
166 * Observation is that a 4KiB buffer full of commands takes a little over a
167 * second to process. Use that to calculate maximum time to process a full CT
168 * command queue.
169 *
170 * Return: Maximum time to process a full CT queue in jiffies.
171 */
172 long xe_guc_ct_queue_proc_time_jiffies(struct xe_guc_ct *ct)
173 {
174 BUILD_BUG_ON(!IS_ALIGNED(CTB_H2G_BUFFER_SIZE, SZ_4));
175 return (CTB_H2G_BUFFER_SIZE / SZ_4K) * HZ;
176 }
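/*
 * Worked example: with CTB_H2G_BUFFER_SIZE == SZ_4K this evaluates to
 * (SZ_4K / SZ_4K) * HZ == HZ, i.e. roughly the one second per 4KiB of
 * commands quoted in the kernel-doc above.
 */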
177
178 static size_t guc_ct_size(void)
179 {
180 return 2 * CTB_DESC_SIZE + CTB_H2G_BUFFER_SIZE +
181 CTB_G2H_BUFFER_SIZE;
182 }
183
184 static void guc_ct_fini(struct drm_device *drm, void *arg)
185 {
186 struct xe_guc_ct *ct = arg;
187
188 destroy_workqueue(ct->g2h_wq);
189 xa_destroy(&ct->fence_lookup);
190 }
191
192 static void receive_g2h(struct xe_guc_ct *ct);
193 static void g2h_worker_func(struct work_struct *w);
194 static void safe_mode_worker_func(struct work_struct *w);
195
196 static void primelockdep(struct xe_guc_ct *ct)
197 {
198 if (!IS_ENABLED(CONFIG_LOCKDEP))
199 return;
200
201 fs_reclaim_acquire(GFP_KERNEL);
202 might_lock(&ct->lock);
203 fs_reclaim_release(GFP_KERNEL);
204 }
205
206 int xe_guc_ct_init(struct xe_guc_ct *ct)
207 {
208 struct xe_device *xe = ct_to_xe(ct);
209 struct xe_gt *gt = ct_to_gt(ct);
210 struct xe_tile *tile = gt_to_tile(gt);
211 struct xe_bo *bo;
212 int err;
213
214 xe_gt_assert(gt, !(guc_ct_size() % PAGE_SIZE));
215
216 ct->g2h_wq = alloc_ordered_workqueue("xe-g2h-wq", WQ_MEM_RECLAIM);
217 if (!ct->g2h_wq)
218 return -ENOMEM;
219
220 spin_lock_init(&ct->fast_lock);
221 xa_init(&ct->fence_lookup);
222 INIT_WORK(&ct->g2h_worker, g2h_worker_func);
223 INIT_DELAYED_WORK(&ct->safe_mode_worker, safe_mode_worker_func);
224 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
225 spin_lock_init(&ct->dead.lock);
226 INIT_WORK(&ct->dead.worker, ct_dead_worker_func);
227 #endif
228 init_waitqueue_head(&ct->wq);
229 init_waitqueue_head(&ct->g2h_fence_wq);
230
231 err = drmm_mutex_init(&xe->drm, &ct->lock);
232 if (err)
233 return err;
234
235 primelockdep(ct);
236
237 bo = xe_managed_bo_create_pin_map(xe, tile, guc_ct_size(),
238 XE_BO_FLAG_SYSTEM |
239 XE_BO_FLAG_GGTT |
240 XE_BO_FLAG_GGTT_INVALIDATE);
241 if (IS_ERR(bo))
242 return PTR_ERR(bo);
243
244 ct->bo = bo;
245
246 err = drmm_add_action_or_reset(&xe->drm, guc_ct_fini, ct);
247 if (err)
248 return err;
249
250 xe_gt_assert(gt, ct->state == XE_GUC_CT_STATE_NOT_INITIALIZED);
251 ct->state = XE_GUC_CT_STATE_DISABLED;
252 return 0;
253 }
254 ALLOW_ERROR_INJECTION(xe_guc_ct_init, ERRNO); /* See xe_pci_probe() */
255
256 #define desc_read(xe_, guc_ctb__, field_) \
257 xe_map_rd_field(xe_, &guc_ctb__->desc, 0, \
258 struct guc_ct_buffer_desc, field_)
259
260 #define desc_write(xe_, guc_ctb__, field_, val_) \
261 xe_map_wr_field(xe_, &guc_ctb__->desc, 0, \
262 struct guc_ct_buffer_desc, field_, val_)
263
264 static void guc_ct_ctb_h2g_init(struct xe_device *xe, struct guc_ctb *h2g,
265 struct iosys_map *map)
266 {
267 h2g->info.size = CTB_H2G_BUFFER_SIZE / sizeof(u32);
268 h2g->info.resv_space = 0;
269 h2g->info.tail = 0;
270 h2g->info.head = 0;
271 h2g->info.space = CIRC_SPACE(h2g->info.tail, h2g->info.head,
272 h2g->info.size) -
273 h2g->info.resv_space;
274 h2g->info.broken = false;
275
276 h2g->desc = *map;
277 xe_map_memset(xe, &h2g->desc, 0, 0, sizeof(struct guc_ct_buffer_desc));
278
279 h2g->cmds = IOSYS_MAP_INIT_OFFSET(map, CTB_DESC_SIZE * 2);
280 }
281
282 static void guc_ct_ctb_g2h_init(struct xe_device *xe, struct guc_ctb *g2h,
283 struct iosys_map *map)
284 {
285 g2h->info.size = CTB_G2H_BUFFER_SIZE / sizeof(u32);
286 g2h->info.resv_space = G2H_ROOM_BUFFER_SIZE / sizeof(u32);
287 g2h->info.head = 0;
288 g2h->info.tail = 0;
289 g2h->info.space = CIRC_SPACE(g2h->info.tail, g2h->info.head,
290 g2h->info.size) -
291 g2h->info.resv_space;
292 g2h->info.broken = false;
293
294 g2h->desc = IOSYS_MAP_INIT_OFFSET(map, CTB_DESC_SIZE);
295 xe_map_memset(xe, &g2h->desc, 0, 0, sizeof(struct guc_ct_buffer_desc));
296
297 g2h->cmds = IOSYS_MAP_INIT_OFFSET(map, CTB_DESC_SIZE * 2 +
298 CTB_H2G_BUFFER_SIZE);
299 }
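/*
 * Sketch of the space accounting used by both init functions above, assuming
 * the linux/circ_buf.h semantics: the writer index is passed first, the
 * reader index second, and CIRC_SPACE() always keeps one slot free, so a
 * freshly initialised ring reports size - 1 - resv_space usable dwords.
 * The helper below is illustration only, not used by the driver.
 */
static u32 __maybe_unused example_ctb_space(u32 wr, u32 rd, u32 size, u32 resv_space)
{
	/* Dwords that can be written without catching up to the reader */
	return CIRC_SPACE(wr, rd, size) - resv_space;
}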
300
301 static int guc_ct_ctb_h2g_register(struct xe_guc_ct *ct)
302 {
303 struct xe_guc *guc = ct_to_guc(ct);
304 u32 desc_addr, ctb_addr, size;
305 int err;
306
307 desc_addr = xe_bo_ggtt_addr(ct->bo);
308 ctb_addr = xe_bo_ggtt_addr(ct->bo) + CTB_DESC_SIZE * 2;
309 size = ct->ctbs.h2g.info.size * sizeof(u32);
310
311 err = xe_guc_self_cfg64(guc,
312 GUC_KLV_SELF_CFG_H2G_CTB_DESCRIPTOR_ADDR_KEY,
313 desc_addr);
314 if (err)
315 return err;
316
317 err = xe_guc_self_cfg64(guc,
318 GUC_KLV_SELF_CFG_H2G_CTB_ADDR_KEY,
319 ctb_addr);
320 if (err)
321 return err;
322
323 return xe_guc_self_cfg32(guc,
324 GUC_KLV_SELF_CFG_H2G_CTB_SIZE_KEY,
325 size);
326 }
327
328 static int guc_ct_ctb_g2h_register(struct xe_guc_ct *ct)
329 {
330 struct xe_guc *guc = ct_to_guc(ct);
331 u32 desc_addr, ctb_addr, size;
332 int err;
333
334 desc_addr = xe_bo_ggtt_addr(ct->bo) + CTB_DESC_SIZE;
335 ctb_addr = xe_bo_ggtt_addr(ct->bo) + CTB_DESC_SIZE * 2 +
336 CTB_H2G_BUFFER_SIZE;
337 size = ct->ctbs.g2h.info.size * sizeof(u32);
338
339 err = xe_guc_self_cfg64(guc,
340 GUC_KLV_SELF_CFG_G2H_CTB_DESCRIPTOR_ADDR_KEY,
341 desc_addr);
342 if (err)
343 return err;
344
345 err = xe_guc_self_cfg64(guc,
346 GUC_KLV_SELF_CFG_G2H_CTB_ADDR_KEY,
347 ctb_addr);
348 if (err)
349 return err;
350
351 return xe_guc_self_cfg32(guc,
352 GUC_KLV_SELF_CFG_G2H_CTB_SIZE_KEY,
353 size);
354 }
355
356 static int guc_ct_control_toggle(struct xe_guc_ct *ct, bool enable)
357 {
358 u32 request[HOST2GUC_CONTROL_CTB_REQUEST_MSG_LEN] = {
359 FIELD_PREP(GUC_HXG_MSG_0_ORIGIN, GUC_HXG_ORIGIN_HOST) |
360 FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_REQUEST) |
361 FIELD_PREP(GUC_HXG_REQUEST_MSG_0_ACTION,
362 GUC_ACTION_HOST2GUC_CONTROL_CTB),
363 FIELD_PREP(HOST2GUC_CONTROL_CTB_REQUEST_MSG_1_CONTROL,
364 enable ? GUC_CTB_CONTROL_ENABLE :
365 GUC_CTB_CONTROL_DISABLE),
366 };
367 int ret = xe_guc_mmio_send(ct_to_guc(ct), request, ARRAY_SIZE(request));
368
369 return ret > 0 ? -EPROTO : ret;
370 }
371
372 static void xe_guc_ct_set_state(struct xe_guc_ct *ct,
373 enum xe_guc_ct_state state)
374 {
375 mutex_lock(&ct->lock); /* Serialise dequeue_one_g2h() */
376 spin_lock_irq(&ct->fast_lock); /* Serialise CT fast-path */
377
378 xe_gt_assert(ct_to_gt(ct), ct->g2h_outstanding == 0 ||
379 state == XE_GUC_CT_STATE_STOPPED);
380
381 if (ct->g2h_outstanding)
382 xe_pm_runtime_put(ct_to_xe(ct));
383 ct->g2h_outstanding = 0;
384 ct->state = state;
385
386 spin_unlock_irq(&ct->fast_lock);
387
388 /*
389 * Lockdep doesn't like this under the fast lock, and the destroy only
390 * needs to be serialized with the send path, which the CT lock provides.
391 */
392 xa_destroy(&ct->fence_lookup);
393
394 mutex_unlock(&ct->lock);
395 }
396
397 static bool ct_needs_safe_mode(struct xe_guc_ct *ct)
398 {
399 return !pci_dev_msi_enabled(to_pci_dev(ct_to_xe(ct)->drm.dev));
400 }
401
402 static bool ct_restart_safe_mode_worker(struct xe_guc_ct *ct)
403 {
404 if (!ct_needs_safe_mode(ct))
405 return false;
406
407 queue_delayed_work(ct->g2h_wq, &ct->safe_mode_worker, HZ / 10);
408 return true;
409 }
410
411 static void safe_mode_worker_func(struct work_struct *w)
412 {
413 struct xe_guc_ct *ct = container_of(w, struct xe_guc_ct, safe_mode_worker.work);
414
415 receive_g2h(ct);
416
417 if (!ct_restart_safe_mode_worker(ct))
418 xe_gt_dbg(ct_to_gt(ct), "GuC CT safe-mode canceled\n");
419 }
420
421 static void ct_enter_safe_mode(struct xe_guc_ct *ct)
422 {
423 if (ct_restart_safe_mode_worker(ct))
424 xe_gt_dbg(ct_to_gt(ct), "GuC CT safe-mode enabled\n");
425 }
426
427 static void ct_exit_safe_mode(struct xe_guc_ct *ct)
428 {
429 if (cancel_delayed_work_sync(&ct->safe_mode_worker))
430 xe_gt_dbg(ct_to_gt(ct), "GuC CT safe-mode disabled\n");
431 }
432
433 int xe_guc_ct_enable(struct xe_guc_ct *ct)
434 {
435 struct xe_device *xe = ct_to_xe(ct);
436 struct xe_gt *gt = ct_to_gt(ct);
437 int err;
438
439 xe_gt_assert(gt, !xe_guc_ct_enabled(ct));
440
441 xe_map_memset(xe, &ct->bo->vmap, 0, 0, ct->bo->size);
442 guc_ct_ctb_h2g_init(xe, &ct->ctbs.h2g, &ct->bo->vmap);
443 guc_ct_ctb_g2h_init(xe, &ct->ctbs.g2h, &ct->bo->vmap);
444
445 err = guc_ct_ctb_h2g_register(ct);
446 if (err)
447 goto err_out;
448
449 err = guc_ct_ctb_g2h_register(ct);
450 if (err)
451 goto err_out;
452
453 err = guc_ct_control_toggle(ct, true);
454 if (err)
455 goto err_out;
456
457 xe_guc_ct_set_state(ct, XE_GUC_CT_STATE_ENABLED);
458
459 smp_mb();
460 wake_up_all(&ct->wq);
461 xe_gt_dbg(gt, "GuC CT communication channel enabled\n");
462
463 if (ct_needs_safe_mode(ct))
464 ct_enter_safe_mode(ct);
465
466 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
467 /*
468 * The CT has now been reset so the dumper can be re-armed
469 * after any existing dead state has been dumped.
470 */
471 spin_lock_irq(&ct->dead.lock);
472 if (ct->dead.reason)
473 ct->dead.reason |= (1 << CT_DEAD_STATE_REARM);
474 spin_unlock_irq(&ct->dead.lock);
475 #endif
476
477 return 0;
478
479 err_out:
480 xe_gt_err(gt, "Failed to enable GuC CT (%pe)\n", ERR_PTR(err));
481 CT_DEAD(ct, NULL, SETUP);
482
483 return err;
484 }
485
486 static void stop_g2h_handler(struct xe_guc_ct *ct)
487 {
488 cancel_work_sync(&ct->g2h_worker);
489 }
490
491 /**
492 * xe_guc_ct_disable - Set GuC to disabled state
493 * @ct: the &xe_guc_ct
494 *
495 * Set GuC CT to disabled state and stop g2h handler. No outstanding g2h expected
496 * in this transition.
497 */
498 void xe_guc_ct_disable(struct xe_guc_ct *ct)
499 {
500 xe_guc_ct_set_state(ct, XE_GUC_CT_STATE_DISABLED);
501 ct_exit_safe_mode(ct);
502 stop_g2h_handler(ct);
503 }
504
505 /**
506 * xe_guc_ct_stop - Set GuC to stopped state
507 * @ct: the &xe_guc_ct
508 *
509 * Set GuC CT to stopped state, stop g2h handler, and clear any outstanding g2h
510 */
511 void xe_guc_ct_stop(struct xe_guc_ct *ct)
512 {
513 xe_guc_ct_set_state(ct, XE_GUC_CT_STATE_STOPPED);
514 stop_g2h_handler(ct);
515 }
516
517 static bool h2g_has_room(struct xe_guc_ct *ct, u32 cmd_len)
518 {
519 struct guc_ctb *h2g = &ct->ctbs.h2g;
520
521 lockdep_assert_held(&ct->lock);
522
523 if (cmd_len > h2g->info.space) {
524 h2g->info.head = desc_read(ct_to_xe(ct), h2g, head);
525
526 if (h2g->info.head > h2g->info.size) {
527 struct xe_device *xe = ct_to_xe(ct);
528 u32 desc_status = desc_read(xe, h2g, status);
529
530 desc_write(xe, h2g, status, desc_status | GUC_CTB_STATUS_OVERFLOW);
531
532 xe_gt_err(ct_to_gt(ct), "CT: invalid head offset %u >= %u\n",
533 h2g->info.head, h2g->info.size);
534 CT_DEAD(ct, h2g, H2G_HAS_ROOM);
535 return false;
536 }
537
538 h2g->info.space = CIRC_SPACE(h2g->info.tail, h2g->info.head,
539 h2g->info.size) -
540 h2g->info.resv_space;
541 if (cmd_len > h2g->info.space)
542 return false;
543 }
544
545 return true;
546 }
547
548 static bool g2h_has_room(struct xe_guc_ct *ct, u32 g2h_len)
549 {
550 if (!g2h_len)
551 return true;
552
553 lockdep_assert_held(&ct->fast_lock);
554
555 return ct->ctbs.g2h.info.space > g2h_len;
556 }
557
558 static int has_room(struct xe_guc_ct *ct, u32 cmd_len, u32 g2h_len)
559 {
560 lockdep_assert_held(&ct->lock);
561
562 if (!g2h_has_room(ct, g2h_len) || !h2g_has_room(ct, cmd_len))
563 return -EBUSY;
564
565 return 0;
566 }
567
568 static void h2g_reserve_space(struct xe_guc_ct *ct, u32 cmd_len)
569 {
570 lockdep_assert_held(&ct->lock);
571 ct->ctbs.h2g.info.space -= cmd_len;
572 }
573
574 static void __g2h_reserve_space(struct xe_guc_ct *ct, u32 g2h_len, u32 num_g2h)
575 {
576 xe_gt_assert(ct_to_gt(ct), g2h_len <= ct->ctbs.g2h.info.space);
577 xe_gt_assert(ct_to_gt(ct), (!g2h_len && !num_g2h) ||
578 (g2h_len && num_g2h));
579
580 if (g2h_len) {
581 lockdep_assert_held(&ct->fast_lock);
582
583 if (!ct->g2h_outstanding)
584 xe_pm_runtime_get_noresume(ct_to_xe(ct));
585
586 ct->ctbs.g2h.info.space -= g2h_len;
587 ct->g2h_outstanding += num_g2h;
588 }
589 }
590
591 static void __g2h_release_space(struct xe_guc_ct *ct, u32 g2h_len)
592 {
593 bool bad = false;
594
595 lockdep_assert_held(&ct->fast_lock);
596
597 bad = ct->ctbs.g2h.info.space + g2h_len >
598 ct->ctbs.g2h.info.size - ct->ctbs.g2h.info.resv_space;
599 bad |= !ct->g2h_outstanding;
600
601 if (bad) {
602 xe_gt_err(ct_to_gt(ct), "Invalid G2H release: %d + %d vs %d - %d -> %d vs %d, outstanding = %d!\n",
603 ct->ctbs.g2h.info.space, g2h_len,
604 ct->ctbs.g2h.info.size, ct->ctbs.g2h.info.resv_space,
605 ct->ctbs.g2h.info.space + g2h_len,
606 ct->ctbs.g2h.info.size - ct->ctbs.g2h.info.resv_space,
607 ct->g2h_outstanding);
608 CT_DEAD(ct, &ct->ctbs.g2h, G2H_RELEASE);
609 return;
610 }
611
612 ct->ctbs.g2h.info.space += g2h_len;
613 if (!--ct->g2h_outstanding)
614 xe_pm_runtime_put(ct_to_xe(ct));
615 }
616
617 static void g2h_release_space(struct xe_guc_ct *ct, u32 g2h_len)
618 {
619 spin_lock_irq(&ct->fast_lock);
620 __g2h_release_space(ct, g2h_len);
621 spin_unlock_irq(&ct->fast_lock);
622 }
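/*
 * G2H credit lifecycle (summary of the helpers above): a send that expects
 * G2H traffic reserves g2h_len dwords and bumps g2h_outstanding via
 * __g2h_reserve_space(), taking a runtime PM reference for the first
 * outstanding message; the matching handler returns the dwords through
 * g2h_release_space(), and the PM reference is dropped when the last
 * outstanding G2H is released.
 */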
623
624 #define H2G_CT_HEADERS (GUC_CTB_HDR_LEN + 1) /* one DW CTB header and one DW HxG header */
625
626 static int h2g_write(struct xe_guc_ct *ct, const u32 *action, u32 len,
627 u32 ct_fence_value, bool want_response)
628 {
629 struct xe_device *xe = ct_to_xe(ct);
630 struct xe_gt *gt = ct_to_gt(ct);
631 struct guc_ctb *h2g = &ct->ctbs.h2g;
632 u32 cmd[H2G_CT_HEADERS];
633 u32 tail = h2g->info.tail;
634 u32 full_len;
635 struct iosys_map map = IOSYS_MAP_INIT_OFFSET(&h2g->cmds,
636 tail * sizeof(u32));
637 u32 desc_status;
638
639 full_len = len + GUC_CTB_HDR_LEN;
640
641 lockdep_assert_held(&ct->lock);
642 xe_gt_assert(gt, full_len <= GUC_CTB_MSG_MAX_LEN);
643
644 desc_status = desc_read(xe, h2g, status);
645 if (desc_status) {
646 xe_gt_err(gt, "CT write: non-zero status: %u\n", desc_status);
647 goto corrupted;
648 }
649
650 if (IS_ENABLED(CONFIG_DRM_XE_DEBUG)) {
651 u32 desc_tail = desc_read(xe, h2g, tail);
652 u32 desc_head = desc_read(xe, h2g, head);
653
654 if (tail != desc_tail) {
655 desc_write(xe, h2g, status, desc_status | GUC_CTB_STATUS_MISMATCH);
656 xe_gt_err(gt, "CT write: tail was modified %u != %u\n", desc_tail, tail);
657 goto corrupted;
658 }
659
660 if (tail > h2g->info.size) {
661 desc_write(xe, h2g, status, desc_status | GUC_CTB_STATUS_OVERFLOW);
662 xe_gt_err(gt, "CT write: tail out of range: %u vs %u\n",
663 tail, h2g->info.size);
664 goto corrupted;
665 }
666
667 if (desc_head >= h2g->info.size) {
668 desc_write(xe, h2g, status, desc_status | GUC_CTB_STATUS_OVERFLOW);
669 xe_gt_err(gt, "CT write: invalid head offset %u >= %u)\n",
670 desc_head, h2g->info.size);
671 goto corrupted;
672 }
673 }
674
675 /* Command will wrap, zero fill (NOPs), return and check credits again */
676 if (tail + full_len > h2g->info.size) {
677 xe_map_memset(xe, &map, 0, 0,
678 (h2g->info.size - tail) * sizeof(u32));
679 h2g_reserve_space(ct, (h2g->info.size - tail));
680 h2g->info.tail = 0;
681 desc_write(xe, h2g, tail, h2g->info.tail);
682
683 return -EAGAIN;
684 }
685
686 /*
687 * dw0: CT header (including fence)
688 * dw1: HXG header (including action code)
689 * dw2+: action data
690 */
691 cmd[0] = FIELD_PREP(GUC_CTB_MSG_0_FORMAT, GUC_CTB_FORMAT_HXG) |
692 FIELD_PREP(GUC_CTB_MSG_0_NUM_DWORDS, len) |
693 FIELD_PREP(GUC_CTB_MSG_0_FENCE, ct_fence_value);
694 if (want_response) {
695 cmd[1] =
696 FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_REQUEST) |
697 FIELD_PREP(GUC_HXG_EVENT_MSG_0_ACTION |
698 GUC_HXG_EVENT_MSG_0_DATA0, action[0]);
699 } else {
700 cmd[1] =
701 FIELD_PREP(GUC_HXG_MSG_0_TYPE, GUC_HXG_TYPE_FAST_REQUEST) |
702 FIELD_PREP(GUC_HXG_EVENT_MSG_0_ACTION |
703 GUC_HXG_EVENT_MSG_0_DATA0, action[0]);
704 }
705
706 /* H2G header in cmd[1] replaces action[0] so: */
707 --len;
708 ++action;
709
710 /* Write H2G ensuring visible before descriptor update */
711 xe_map_memcpy_to(xe, &map, 0, cmd, H2G_CT_HEADERS * sizeof(u32));
712 xe_map_memcpy_to(xe, &map, H2G_CT_HEADERS * sizeof(u32), action, len * sizeof(u32));
713 xe_device_wmb(xe);
714
715 /* Update local copies */
716 h2g->info.tail = (tail + full_len) % h2g->info.size;
717 h2g_reserve_space(ct, full_len);
718
719 /* Update descriptor */
720 desc_write(xe, h2g, tail, h2g->info.tail);
721
722 trace_xe_guc_ctb_h2g(xe, gt->info.id, *(action - 1), full_len,
723 desc_read(xe, h2g, head), h2g->info.tail);
724
725 return 0;
726
727 corrupted:
728 CT_DEAD(ct, &ct->ctbs.h2g, H2G_WRITE);
729 return -EPIPE;
730 }
731
732 /*
733 * The CT protocol accepts a 16-bit fence. This field is fully owned by the
734 * driver, the GuC will just copy it to the reply message. Since we need to
735 * be able to distinguish between replies to REQUEST and FAST_REQUEST messages,
736 * we use one bit of the seqno as an indicator for that and a rolling counter
737 * for the remaining 15 bits.
738 */
739 #define CT_SEQNO_MASK GENMASK(14, 0)
740 #define CT_SEQNO_UNTRACKED BIT(15)
741 static u16 next_ct_seqno(struct xe_guc_ct *ct, bool is_g2h_fence)
742 {
743 u32 seqno = ct->fence_seqno++ & CT_SEQNO_MASK;
744
745 if (!is_g2h_fence)
746 seqno |= CT_SEQNO_UNTRACKED;
747
748 return seqno;
749 }
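/*
 * Illustrative helper (not part of the driver): how a received fence is
 * classified under the scheme described above; parse_g2h_response() open
 * codes the same test.
 */
static bool __maybe_unused example_fence_is_tracked(u16 fence)
{
	/* Tracked fences (blocking sends with a g2h_fence) have the bit clear */
	return !(fence & CT_SEQNO_UNTRACKED);
}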
750
751 static int __guc_ct_send_locked(struct xe_guc_ct *ct, const u32 *action,
752 u32 len, u32 g2h_len, u32 num_g2h,
753 struct g2h_fence *g2h_fence)
754 {
755 struct xe_gt *gt __maybe_unused = ct_to_gt(ct);
756 u16 seqno;
757 int ret;
758
759 xe_gt_assert(gt, ct->state != XE_GUC_CT_STATE_NOT_INITIALIZED);
760 xe_gt_assert(gt, !g2h_len || !g2h_fence);
761 xe_gt_assert(gt, !num_g2h || !g2h_fence);
762 xe_gt_assert(gt, !g2h_len || num_g2h);
763 xe_gt_assert(gt, g2h_len || !num_g2h);
764 lockdep_assert_held(&ct->lock);
765
766 if (unlikely(ct->ctbs.h2g.info.broken)) {
767 ret = -EPIPE;
768 goto out;
769 }
770
771 if (ct->state == XE_GUC_CT_STATE_DISABLED) {
772 ret = -ENODEV;
773 goto out;
774 }
775
776 if (ct->state == XE_GUC_CT_STATE_STOPPED) {
777 ret = -ECANCELED;
778 goto out;
779 }
780
781 xe_gt_assert(gt, xe_guc_ct_enabled(ct));
782
783 if (g2h_fence) {
784 g2h_len = GUC_CTB_HXG_MSG_MAX_LEN;
785 num_g2h = 1;
786
787 if (g2h_fence_needs_alloc(g2h_fence)) {
788 g2h_fence->seqno = next_ct_seqno(ct, true);
789 ret = xa_err(xa_store(&ct->fence_lookup,
790 g2h_fence->seqno, g2h_fence,
791 GFP_ATOMIC));
792 if (ret)
793 goto out;
794 }
795
796 seqno = g2h_fence->seqno;
797 } else {
798 seqno = next_ct_seqno(ct, false);
799 }
800
801 if (g2h_len)
802 spin_lock_irq(&ct->fast_lock);
803 retry:
804 ret = has_room(ct, len + GUC_CTB_HDR_LEN, g2h_len);
805 if (unlikely(ret))
806 goto out_unlock;
807
808 ret = h2g_write(ct, action, len, seqno, !!g2h_fence);
809 if (unlikely(ret)) {
810 if (ret == -EAGAIN)
811 goto retry;
812 goto out_unlock;
813 }
814
815 __g2h_reserve_space(ct, g2h_len, num_g2h);
816 xe_guc_notify(ct_to_guc(ct));
817 out_unlock:
818 if (g2h_len)
819 spin_unlock_irq(&ct->fast_lock);
820 out:
821 return ret;
822 }
823
824 static void kick_reset(struct xe_guc_ct *ct)
825 {
826 xe_gt_reset_async(ct_to_gt(ct));
827 }
828
829 static int dequeue_one_g2h(struct xe_guc_ct *ct);
830
831 static int guc_ct_send_locked(struct xe_guc_ct *ct, const u32 *action, u32 len,
832 u32 g2h_len, u32 num_g2h,
833 struct g2h_fence *g2h_fence)
834 {
835 struct xe_device *xe = ct_to_xe(ct);
836 struct xe_gt *gt = ct_to_gt(ct);
837 unsigned int sleep_period_ms = 1;
838 int ret;
839
840 xe_gt_assert(gt, !g2h_len || !g2h_fence);
841 lockdep_assert_held(&ct->lock);
842 xe_device_assert_mem_access(ct_to_xe(ct));
843
844 try_again:
845 ret = __guc_ct_send_locked(ct, action, len, g2h_len, num_g2h,
846 g2h_fence);
847
848 /*
849 * We wait to try to restore credits for about 1 second before bailing.
850 * In the case of H2G credits we have no choice but to wait for the
851 * GuC to consume H2Gs in the channel, so we use a wait / sleep loop. In
852 * the case of G2H we process any G2H in the channel, hopefully freeing
853 * credits as we consume the G2H messages.
854 */
855 if (unlikely(ret == -EBUSY &&
856 !h2g_has_room(ct, len + GUC_CTB_HDR_LEN))) {
857 struct guc_ctb *h2g = &ct->ctbs.h2g;
858
859 if (sleep_period_ms == 1024)
860 goto broken;
861
862 trace_xe_guc_ct_h2g_flow_control(xe, h2g->info.head, h2g->info.tail,
863 h2g->info.size,
864 h2g->info.space,
865 len + GUC_CTB_HDR_LEN);
866 msleep(sleep_period_ms);
867 sleep_period_ms <<= 1;
868
869 goto try_again;
870 } else if (unlikely(ret == -EBUSY)) {
871 struct xe_device *xe = ct_to_xe(ct);
872 struct guc_ctb *g2h = &ct->ctbs.g2h;
873
874 trace_xe_guc_ct_g2h_flow_control(xe, g2h->info.head,
875 desc_read(xe, g2h, tail),
876 g2h->info.size,
877 g2h->info.space,
878 g2h_fence ?
879 GUC_CTB_HXG_MSG_MAX_LEN :
880 g2h_len);
881
882 #define g2h_avail(ct) \
883 (desc_read(ct_to_xe(ct), (&ct->ctbs.g2h), tail) != ct->ctbs.g2h.info.head)
884 if (!wait_event_timeout(ct->wq, !ct->g2h_outstanding ||
885 g2h_avail(ct), HZ))
886 goto broken;
887 #undef g2h_avail
888
889 ret = dequeue_one_g2h(ct);
890 if (ret < 0) {
891 if (ret != -ECANCELED)
892 xe_gt_err(ct_to_gt(ct), "CTB receive failed (%pe)",
893 ERR_PTR(ret));
894 goto broken;
895 }
896
897 goto try_again;
898 }
899
900 return ret;
901
902 broken:
903 xe_gt_err(gt, "No forward process on H2G, reset required\n");
904 CT_DEAD(ct, &ct->ctbs.h2g, DEADLOCK);
905
906 return -EDEADLK;
907 }
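/*
 * Back-off bound for the H2G wait above: sleep_period_ms doubles from 1 and
 * the loop bails once it reaches 1024, so the worst case sleeps are
 * 1 + 2 + ... + 512 = 1023 ms, matching the "about 1 second" noted in the
 * comment inside guc_ct_send_locked().
 */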
908
909 static int guc_ct_send(struct xe_guc_ct *ct, const u32 *action, u32 len,
910 u32 g2h_len, u32 num_g2h, struct g2h_fence *g2h_fence)
911 {
912 int ret;
913
914 xe_gt_assert(ct_to_gt(ct), !g2h_len || !g2h_fence);
915
916 mutex_lock(&ct->lock);
917 ret = guc_ct_send_locked(ct, action, len, g2h_len, num_g2h, g2h_fence);
918 mutex_unlock(&ct->lock);
919
920 return ret;
921 }
922
923 int xe_guc_ct_send(struct xe_guc_ct *ct, const u32 *action, u32 len,
924 u32 g2h_len, u32 num_g2h)
925 {
926 int ret;
927
928 ret = guc_ct_send(ct, action, len, g2h_len, num_g2h, NULL);
929 if (ret == -EDEADLK)
930 kick_reset(ct);
931
932 return ret;
933 }
934
935 int xe_guc_ct_send_locked(struct xe_guc_ct *ct, const u32 *action, u32 len,
936 u32 g2h_len, u32 num_g2h)
937 {
938 int ret;
939
940 ret = guc_ct_send_locked(ct, action, len, g2h_len, num_g2h, NULL);
941 if (ret == -EDEADLK)
942 kick_reset(ct);
943
944 return ret;
945 }
946
947 int xe_guc_ct_send_g2h_handler(struct xe_guc_ct *ct, const u32 *action, u32 len)
948 {
949 int ret;
950
951 lockdep_assert_held(&ct->lock);
952
953 ret = guc_ct_send_locked(ct, action, len, 0, 0, NULL);
954 if (ret == -EDEADLK)
955 kick_reset(ct);
956
957 return ret;
958 }
959
960 /*
961 * Check if a GT reset is in progress or will occur and if GT reset brought the
962 * CT back up. Randomly picking 5 seconds as an upper limit for a GT reset.
963 */
964 static bool retry_failure(struct xe_guc_ct *ct, int ret)
965 {
966 if (!(ret == -EDEADLK || ret == -EPIPE || ret == -ENODEV))
967 return false;
968
969 #define ct_alive(ct) \
970 (xe_guc_ct_enabled(ct) && !ct->ctbs.h2g.info.broken && \
971 !ct->ctbs.g2h.info.broken)
972 if (!wait_event_interruptible_timeout(ct->wq, ct_alive(ct), HZ * 5))
973 return false;
974 #undef ct_alive
975
976 return true;
977 }
978
979 static int guc_ct_send_recv(struct xe_guc_ct *ct, const u32 *action, u32 len,
980 u32 *response_buffer, bool no_fail)
981 {
982 struct xe_gt *gt = ct_to_gt(ct);
983 struct g2h_fence g2h_fence;
984 int ret = 0;
985
986 /*
987 * We use a fence to implement blocking sends / receiving response data.
988 * The seqno of the fence is sent in the H2G, returned in the G2H, and
989 * an xarray is used as the storage medium with the seqno being the key.
990 * Fields in the fence hold success, failure, retry status and the
991 * response data. Safe to allocate on the stack as the xarray is the
992 * only reference and it cannot be present after this function exits.
993 */
994 retry:
995 g2h_fence_init(&g2h_fence, response_buffer);
996 retry_same_fence:
997 ret = guc_ct_send(ct, action, len, 0, 0, &g2h_fence);
998 if (unlikely(ret == -ENOMEM)) {
999 /* Retry allocation with GFP_KERNEL */
1000 ret = xa_err(xa_store(&ct->fence_lookup, g2h_fence.seqno,
1001 &g2h_fence, GFP_KERNEL));
1002 if (ret)
1003 return ret;
1004
1005 goto retry_same_fence;
1006 } else if (unlikely(ret)) {
1007 if (ret == -EDEADLK)
1008 kick_reset(ct);
1009
1010 if (no_fail && retry_failure(ct, ret))
1011 goto retry_same_fence;
1012
1013 if (!g2h_fence_needs_alloc(&g2h_fence))
1014 xa_erase(&ct->fence_lookup, g2h_fence.seqno);
1015
1016 return ret;
1017 }
1018
1019 ret = wait_event_timeout(ct->g2h_fence_wq, g2h_fence.done, HZ);
1020
1021 if (!ret) {
1022 LNL_FLUSH_WORK(&ct->g2h_worker);
1023 if (g2h_fence.done) {
1024 xe_gt_warn(gt, "G2H fence %u, action %04x, done\n",
1025 g2h_fence.seqno, action[0]);
1026 ret = 1;
1027 }
1028 }
1029
1030 /*
1031 * Ensure we serialize with completion side to prevent UAF with fence going out of scope on
1032 * the stack, since we have no clue if it will fire after the timeout before we can erase
1033 * from the xa. Also we have some dependent loads and stores below for which we need the
1034 * correct ordering, and we lack the needed barriers.
1035 */
1036 mutex_lock(&ct->lock);
1037 if (!ret) {
1038 xe_gt_err(gt, "Timed out wait for G2H, fence %u, action %04x, done %s",
1039 g2h_fence.seqno, action[0], str_yes_no(g2h_fence.done));
1040 xa_erase(&ct->fence_lookup, g2h_fence.seqno);
1041 mutex_unlock(&ct->lock);
1042 return -ETIME;
1043 }
1044
1045 if (g2h_fence.retry) {
1046 xe_gt_dbg(gt, "H2G action %#x retrying: reason %#x\n",
1047 action[0], g2h_fence.reason);
1048 mutex_unlock(&ct->lock);
1049 goto retry;
1050 }
1051 if (g2h_fence.fail) {
1052 xe_gt_err(gt, "H2G request %#x failed: error %#x hint %#x\n",
1053 action[0], g2h_fence.error, g2h_fence.hint);
1054 ret = -EIO;
1055 }
1056
1057 if (ret > 0)
1058 ret = response_buffer ? g2h_fence.response_len : g2h_fence.response_data;
1059
1060 mutex_unlock(&ct->lock);
1061
1062 return ret;
1063 }
1064
1065 /**
1066 * xe_guc_ct_send_recv - Send and receive HXG to the GuC
1067 * @ct: the &xe_guc_ct
1068 * @action: the dword array with `HXG Request`_ message (can't be NULL)
1069 * @len: length of the `HXG Request`_ message (in dwords, can't be 0)
1070 * @response_buffer: placeholder for the `HXG Response`_ message (can be NULL)
1071 *
1072 * Send a `HXG Request`_ message to the GuC over the CT communication channel
1073 * and block until the GuC replies with a `HXG Response`_ message.
1074 *
1075 * For non-blocking communication with GuC use xe_guc_ct_send().
1076 *
1077 * Note: The size of &response_buffer must be at least GUC_CTB_MAX_DWORDS_.
1078 *
1079 * Return: response length (in dwords) if &response_buffer was not NULL, or
1080 * DATA0 from `HXG Response`_ if &response_buffer was NULL, or
1081 * a negative error code on failure.
1082 */
1083 int xe_guc_ct_send_recv(struct xe_guc_ct *ct, const u32 *action, u32 len,
1084 u32 *response_buffer)
1085 {
1086 KUNIT_STATIC_STUB_REDIRECT(xe_guc_ct_send_recv, ct, action, len, response_buffer);
1087 return guc_ct_send_recv(ct, action, len, response_buffer, false);
1088 }
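/*
 * Usage sketch (hypothetical action and payload values, not a real caller):
 *
 *	u32 request[] = { SOME_H2G_ACTION, param };
 *	u32 response[GUC_CTB_MAX_DWORDS];
 *	int ret = xe_guc_ct_send_recv(ct, request, ARRAY_SIZE(request), response);
 *
 * A negative ret is an error; otherwise ret is the response length in dwords
 * because a response buffer was supplied (it would be DATA0 if it were NULL).
 */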
1089
1090 int xe_guc_ct_send_recv_no_fail(struct xe_guc_ct *ct, const u32 *action,
1091 u32 len, u32 *response_buffer)
1092 {
1093 return guc_ct_send_recv(ct, action, len, response_buffer, true);
1094 }
1095
1096 static u32 *msg_to_hxg(u32 *msg)
1097 {
1098 return msg + GUC_CTB_MSG_MIN_LEN;
1099 }
1100
1101 static u32 msg_len_to_hxg_len(u32 len)
1102 {
1103 return len - GUC_CTB_MSG_MIN_LEN;
1104 }
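/*
 * Message framing assumed by the parsers below (sketch): a G2H message is one
 * CTB header dword followed by the HXG message, so
 *
 *	len   = FIELD_GET(GUC_CTB_MSG_0_NUM_DWORDS, msg[0]) + GUC_CTB_MSG_MIN_LEN;
 *	fence = FIELD_GET(GUC_CTB_MSG_0_FENCE, msg[0]);
 *	hxg   = msg_to_hxg(msg);	(i.e. msg + GUC_CTB_MSG_MIN_LEN)
 */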
1105
1106 static int parse_g2h_event(struct xe_guc_ct *ct, u32 *msg, u32 len)
1107 {
1108 u32 *hxg = msg_to_hxg(msg);
1109 u32 action = FIELD_GET(GUC_HXG_EVENT_MSG_0_ACTION, hxg[0]);
1110
1111 lockdep_assert_held(&ct->lock);
1112
1113 switch (action) {
1114 case XE_GUC_ACTION_SCHED_CONTEXT_MODE_DONE:
1115 case XE_GUC_ACTION_DEREGISTER_CONTEXT_DONE:
1116 case XE_GUC_ACTION_SCHED_ENGINE_MODE_DONE:
1117 case XE_GUC_ACTION_TLB_INVALIDATION_DONE:
1118 g2h_release_space(ct, len);
1119 }
1120
1121 return 0;
1122 }
1123
1124 static int parse_g2h_response(struct xe_guc_ct *ct, u32 *msg, u32 len)
1125 {
1126 struct xe_gt *gt = ct_to_gt(ct);
1127 u32 *hxg = msg_to_hxg(msg);
1128 u32 hxg_len = msg_len_to_hxg_len(len);
1129 u32 fence = FIELD_GET(GUC_CTB_MSG_0_FENCE, msg[0]);
1130 u32 type = FIELD_GET(GUC_HXG_MSG_0_TYPE, hxg[0]);
1131 struct g2h_fence *g2h_fence;
1132
1133 lockdep_assert_held(&ct->lock);
1134
1135 /*
1136 * Fences for FAST_REQUEST messages are not tracked in ct->fence_lookup.
1137 * Those messages should never fail, so if we do get an error back it
1138 * means we're likely doing an illegal operation and the GuC is
1139 * rejecting it. We have no way to inform the code that submitted the
1140 * H2G that the message was rejected, so we need to escalate the
1141 * failure to trigger a reset.
1142 */
1143 if (fence & CT_SEQNO_UNTRACKED) {
1144 if (type == GUC_HXG_TYPE_RESPONSE_FAILURE)
1145 xe_gt_err(gt, "FAST_REQ H2G fence 0x%x failed! e=0x%x, h=%u\n",
1146 fence,
1147 FIELD_GET(GUC_HXG_FAILURE_MSG_0_ERROR, hxg[0]),
1148 FIELD_GET(GUC_HXG_FAILURE_MSG_0_HINT, hxg[0]));
1149 else
1150 xe_gt_err(gt, "unexpected response %u for FAST_REQ H2G fence 0x%x!\n",
1151 type, fence);
1152 CT_DEAD(ct, NULL, PARSE_G2H_RESPONSE);
1153
1154 return -EPROTO;
1155 }
1156
1157 g2h_fence = xa_erase(&ct->fence_lookup, fence);
1158 if (unlikely(!g2h_fence)) {
1159 /* Don't tear down channel, as send could've timed out */
1160 /* CT_DEAD(ct, NULL, PARSE_G2H_UNKNOWN); */
1161 xe_gt_warn(gt, "G2H fence (%u) not found!\n", fence);
1162 g2h_release_space(ct, GUC_CTB_HXG_MSG_MAX_LEN);
1163 return 0;
1164 }
1165
1166 xe_gt_assert(gt, fence == g2h_fence->seqno);
1167
1168 if (type == GUC_HXG_TYPE_RESPONSE_FAILURE) {
1169 g2h_fence->fail = true;
1170 g2h_fence->error = FIELD_GET(GUC_HXG_FAILURE_MSG_0_ERROR, hxg[0]);
1171 g2h_fence->hint = FIELD_GET(GUC_HXG_FAILURE_MSG_0_HINT, hxg[0]);
1172 } else if (type == GUC_HXG_TYPE_NO_RESPONSE_RETRY) {
1173 g2h_fence->retry = true;
1174 g2h_fence->reason = FIELD_GET(GUC_HXG_RETRY_MSG_0_REASON, hxg[0]);
1175 } else if (g2h_fence->response_buffer) {
1176 g2h_fence->response_len = hxg_len;
1177 memcpy(g2h_fence->response_buffer, hxg, hxg_len * sizeof(u32));
1178 } else {
1179 g2h_fence->response_data = FIELD_GET(GUC_HXG_RESPONSE_MSG_0_DATA0, hxg[0]);
1180 }
1181
1182 g2h_release_space(ct, GUC_CTB_HXG_MSG_MAX_LEN);
1183
1184 g2h_fence->done = true;
1185 smp_mb();
1186
1187 wake_up_all(&ct->g2h_fence_wq);
1188
1189 return 0;
1190 }
1191
1192 static int parse_g2h_msg(struct xe_guc_ct *ct, u32 *msg, u32 len)
1193 {
1194 struct xe_gt *gt = ct_to_gt(ct);
1195 u32 *hxg = msg_to_hxg(msg);
1196 u32 origin, type;
1197 int ret;
1198
1199 lockdep_assert_held(&ct->lock);
1200
1201 origin = FIELD_GET(GUC_HXG_MSG_0_ORIGIN, hxg[0]);
1202 if (unlikely(origin != GUC_HXG_ORIGIN_GUC)) {
1203 xe_gt_err(gt, "G2H channel broken on read, origin=%u, reset required\n",
1204 origin);
1205 CT_DEAD(ct, &ct->ctbs.g2h, PARSE_G2H_ORIGIN);
1206
1207 return -EPROTO;
1208 }
1209
1210 type = FIELD_GET(GUC_HXG_MSG_0_TYPE, hxg[0]);
1211 switch (type) {
1212 case GUC_HXG_TYPE_EVENT:
1213 ret = parse_g2h_event(ct, msg, len);
1214 break;
1215 case GUC_HXG_TYPE_RESPONSE_SUCCESS:
1216 case GUC_HXG_TYPE_RESPONSE_FAILURE:
1217 case GUC_HXG_TYPE_NO_RESPONSE_RETRY:
1218 ret = parse_g2h_response(ct, msg, len);
1219 break;
1220 default:
1221 xe_gt_err(gt, "G2H channel broken on read, type=%u, reset required\n",
1222 type);
1223 CT_DEAD(ct, &ct->ctbs.g2h, PARSE_G2H_TYPE);
1224
1225 ret = -EOPNOTSUPP;
1226 }
1227
1228 return ret;
1229 }
1230
1231 static int process_g2h_msg(struct xe_guc_ct *ct, u32 *msg, u32 len)
1232 {
1233 struct xe_guc *guc = ct_to_guc(ct);
1234 struct xe_gt *gt = ct_to_gt(ct);
1235 u32 hxg_len = msg_len_to_hxg_len(len);
1236 u32 *hxg = msg_to_hxg(msg);
1237 u32 action, adj_len;
1238 u32 *payload;
1239 int ret = 0;
1240
1241 if (FIELD_GET(GUC_HXG_MSG_0_TYPE, hxg[0]) != GUC_HXG_TYPE_EVENT)
1242 return 0;
1243
1244 action = FIELD_GET(GUC_HXG_EVENT_MSG_0_ACTION, hxg[0]);
1245 payload = hxg + GUC_HXG_EVENT_MSG_MIN_LEN;
1246 adj_len = hxg_len - GUC_HXG_EVENT_MSG_MIN_LEN;
1247
1248 switch (action) {
1249 case XE_GUC_ACTION_SCHED_CONTEXT_MODE_DONE:
1250 ret = xe_guc_sched_done_handler(guc, payload, adj_len);
1251 break;
1252 case XE_GUC_ACTION_DEREGISTER_CONTEXT_DONE:
1253 ret = xe_guc_deregister_done_handler(guc, payload, adj_len);
1254 break;
1255 case XE_GUC_ACTION_CONTEXT_RESET_NOTIFICATION:
1256 ret = xe_guc_exec_queue_reset_handler(guc, payload, adj_len);
1257 break;
1258 case XE_GUC_ACTION_ENGINE_FAILURE_NOTIFICATION:
1259 ret = xe_guc_exec_queue_reset_failure_handler(guc, payload,
1260 adj_len);
1261 break;
1262 case XE_GUC_ACTION_SCHED_ENGINE_MODE_DONE:
1263 /* Selftest only at the moment */
1264 break;
1265 case XE_GUC_ACTION_STATE_CAPTURE_NOTIFICATION:
1266 ret = xe_guc_error_capture_handler(guc, payload, adj_len);
1267 break;
1268 case XE_GUC_ACTION_NOTIFY_FLUSH_LOG_BUFFER_TO_FILE:
1269 /* FIXME: Handle this */
1270 break;
1271 case XE_GUC_ACTION_NOTIFY_MEMORY_CAT_ERROR:
1272 ret = xe_guc_exec_queue_memory_cat_error_handler(guc, payload,
1273 adj_len);
1274 break;
1275 case XE_GUC_ACTION_REPORT_PAGE_FAULT_REQ_DESC:
1276 ret = xe_guc_pagefault_handler(guc, payload, adj_len);
1277 break;
1278 case XE_GUC_ACTION_TLB_INVALIDATION_DONE:
1279 ret = xe_guc_tlb_invalidation_done_handler(guc, payload,
1280 adj_len);
1281 break;
1282 case XE_GUC_ACTION_ACCESS_COUNTER_NOTIFY:
1283 ret = xe_guc_access_counter_notify_handler(guc, payload,
1284 adj_len);
1285 break;
1286 case XE_GUC_ACTION_GUC2PF_RELAY_FROM_VF:
1287 ret = xe_guc_relay_process_guc2pf(&guc->relay, hxg, hxg_len);
1288 break;
1289 case XE_GUC_ACTION_GUC2VF_RELAY_FROM_PF:
1290 ret = xe_guc_relay_process_guc2vf(&guc->relay, hxg, hxg_len);
1291 break;
1292 case GUC_ACTION_GUC2PF_VF_STATE_NOTIFY:
1293 ret = xe_gt_sriov_pf_control_process_guc2pf(gt, hxg, hxg_len);
1294 break;
1295 case GUC_ACTION_GUC2PF_ADVERSE_EVENT:
1296 ret = xe_gt_sriov_pf_monitor_process_guc2pf(gt, hxg, hxg_len);
1297 break;
1298 default:
1299 xe_gt_err(gt, "unexpected G2H action 0x%04x\n", action);
1300 }
1301
1302 if (ret) {
1303 xe_gt_err(gt, "G2H action 0x%04x failed (%pe)\n",
1304 action, ERR_PTR(ret));
1305 CT_DEAD(ct, NULL, PROCESS_FAILED);
1306 }
1307
1308 return 0;
1309 }
1310
1311 static int g2h_read(struct xe_guc_ct *ct, u32 *msg, bool fast_path)
1312 {
1313 struct xe_device *xe = ct_to_xe(ct);
1314 struct xe_gt *gt = ct_to_gt(ct);
1315 struct guc_ctb *g2h = &ct->ctbs.g2h;
1316 u32 tail, head, len, desc_status;
1317 s32 avail;
1318 u32 action;
1319 u32 *hxg;
1320
1321 xe_gt_assert(gt, ct->state != XE_GUC_CT_STATE_NOT_INITIALIZED);
1322 lockdep_assert_held(&ct->fast_lock);
1323
1324 if (ct->state == XE_GUC_CT_STATE_DISABLED)
1325 return -ENODEV;
1326
1327 if (ct->state == XE_GUC_CT_STATE_STOPPED)
1328 return -ECANCELED;
1329
1330 if (g2h->info.broken)
1331 return -EPIPE;
1332
1333 xe_gt_assert(gt, xe_guc_ct_enabled(ct));
1334
1335 desc_status = desc_read(xe, g2h, status);
1336 if (desc_status) {
1337 if (desc_status & GUC_CTB_STATUS_DISABLED) {
1338 /*
1339 * Potentially valid if a CLIENT_RESET request resulted in
1340 * contexts/engines being reset. But should never happen as
1341 * no contexts should be active when CLIENT_RESET is sent.
1342 */
1343 xe_gt_err(gt, "CT read: unexpected G2H after GuC has stopped!\n");
1344 desc_status &= ~GUC_CTB_STATUS_DISABLED;
1345 }
1346
1347 if (desc_status) {
1348 xe_gt_err(gt, "CT read: non-zero status: %u\n", desc_status);
1349 goto corrupted;
1350 }
1351 }
1352
1353 if (IS_ENABLED(CONFIG_DRM_XE_DEBUG)) {
1354 u32 desc_tail = desc_read(xe, g2h, tail);
1355 /*
1356 u32 desc_head = desc_read(xe, g2h, head);
1357
1358 * info.head and desc_head are updated back-to-back at the end of
1359 * this function and nowhere else. Hence, they cannot be different
1360 * unless two g2h_read calls are running concurrently. Which is not
1361 * possible because it is guarded by ct->fast_lock. And yet, some
1362 * discrete platforms are regularly hitting this error :(.
1363 *
1364 * desc_head rolling backwards shouldn't cause any noticeable
1365 * problems - just a delay in GuC being allowed to proceed past that
1366 * point in the queue. So for now, just disable the error until it
1367 * can be root caused.
1368 *
1369 if (g2h->info.head != desc_head) {
1370 desc_write(xe, g2h, status, desc_status | GUC_CTB_STATUS_MISMATCH);
1371 xe_gt_err(gt, "CT read: head was modified %u != %u\n",
1372 desc_head, g2h->info.head);
1373 goto corrupted;
1374 }
1375 */
1376
1377 if (g2h->info.head > g2h->info.size) {
1378 desc_write(xe, g2h, status, desc_status | GUC_CTB_STATUS_OVERFLOW);
1379 xe_gt_err(gt, "CT read: head out of range: %u vs %u\n",
1380 g2h->info.head, g2h->info.size);
1381 goto corrupted;
1382 }
1383
1384 if (desc_tail >= g2h->info.size) {
1385 desc_write(xe, g2h, status, desc_status | GUC_CTB_STATUS_OVERFLOW);
1386 xe_gt_err(gt, "CT read: invalid tail offset %u >= %u)\n",
1387 desc_tail, g2h->info.size);
1388 goto corrupted;
1389 }
1390 }
1391
1392 /* Calculate DW available to read */
1393 tail = desc_read(xe, g2h, tail);
1394 avail = tail - g2h->info.head;
1395 if (unlikely(avail == 0))
1396 return 0;
1397
1398 if (avail < 0)
1399 avail += g2h->info.size;
1400
1401 /* Read header */
1402 xe_map_memcpy_from(xe, msg, &g2h->cmds, sizeof(u32) * g2h->info.head,
1403 sizeof(u32));
1404 len = FIELD_GET(GUC_CTB_MSG_0_NUM_DWORDS, msg[0]) + GUC_CTB_MSG_MIN_LEN;
1405 if (len > avail) {
1406 xe_gt_err(gt, "G2H channel broken on read, avail=%d, len=%d, reset required\n",
1407 avail, len);
1408 goto corrupted;
1409 }
1410
1411 head = (g2h->info.head + 1) % g2h->info.size;
1412 avail = len - 1;
1413
1414 /* Read G2H message */
1415 if (avail + head > g2h->info.size) {
1416 u32 avail_til_wrap = g2h->info.size - head;
1417
1418 xe_map_memcpy_from(xe, msg + 1,
1419 &g2h->cmds, sizeof(u32) * head,
1420 avail_til_wrap * sizeof(u32));
1421 xe_map_memcpy_from(xe, msg + 1 + avail_til_wrap,
1422 &g2h->cmds, 0,
1423 (avail - avail_til_wrap) * sizeof(u32));
1424 } else {
1425 xe_map_memcpy_from(xe, msg + 1,
1426 &g2h->cmds, sizeof(u32) * head,
1427 avail * sizeof(u32));
1428 }
1429
1430 hxg = msg_to_hxg(msg);
1431 action = FIELD_GET(GUC_HXG_EVENT_MSG_0_ACTION, hxg[0]);
1432
1433 if (fast_path) {
1434 if (FIELD_GET(GUC_HXG_MSG_0_TYPE, hxg[0]) != GUC_HXG_TYPE_EVENT)
1435 return 0;
1436
1437 switch (action) {
1438 case XE_GUC_ACTION_REPORT_PAGE_FAULT_REQ_DESC:
1439 case XE_GUC_ACTION_TLB_INVALIDATION_DONE:
1440 break; /* Process these in fast-path */
1441 default:
1442 return 0;
1443 }
1444 }
1445
1446 /* Update local / descriptor header */
1447 g2h->info.head = (head + avail) % g2h->info.size;
1448 desc_write(xe, g2h, head, g2h->info.head);
1449
1450 trace_xe_guc_ctb_g2h(xe, ct_to_gt(ct)->info.id,
1451 action, len, g2h->info.head, tail);
1452
1453 return len;
1454
1455 corrupted:
1456 CT_DEAD(ct, &ct->ctbs.g2h, G2H_READ);
1457 return -EPROTO;
1458 }
1459
1460 static void g2h_fast_path(struct xe_guc_ct *ct, u32 *msg, u32 len)
1461 {
1462 struct xe_gt *gt = ct_to_gt(ct);
1463 struct xe_guc *guc = ct_to_guc(ct);
1464 u32 hxg_len = msg_len_to_hxg_len(len);
1465 u32 *hxg = msg_to_hxg(msg);
1466 u32 action = FIELD_GET(GUC_HXG_EVENT_MSG_0_ACTION, hxg[0]);
1467 u32 *payload = hxg + GUC_HXG_MSG_MIN_LEN;
1468 u32 adj_len = hxg_len - GUC_HXG_MSG_MIN_LEN;
1469 int ret = 0;
1470
1471 switch (action) {
1472 case XE_GUC_ACTION_REPORT_PAGE_FAULT_REQ_DESC:
1473 ret = xe_guc_pagefault_handler(guc, payload, adj_len);
1474 break;
1475 case XE_GUC_ACTION_TLB_INVALIDATION_DONE:
1476 __g2h_release_space(ct, len);
1477 ret = xe_guc_tlb_invalidation_done_handler(guc, payload,
1478 adj_len);
1479 break;
1480 default:
1481 xe_gt_warn(gt, "NOT_POSSIBLE");
1482 }
1483
1484 if (ret) {
1485 xe_gt_err(gt, "G2H action 0x%04x failed (%pe)\n",
1486 action, ERR_PTR(ret));
1487 CT_DEAD(ct, NULL, FAST_G2H);
1488 }
1489 }
1490
1491 /**
1492 * xe_guc_ct_fast_path - process critical G2H in the IRQ handler
1493 * @ct: GuC CT object
1494 *
1495 * Anything related to page faults is critical for performance, process these
1496 * critical G2H in the IRQ. This is safe as these handlers either just wake up
1497 * waiters or queue another worker.
1498 */
1499 void xe_guc_ct_fast_path(struct xe_guc_ct *ct)
1500 {
1501 struct xe_device *xe = ct_to_xe(ct);
1502 bool ongoing;
1503 int len;
1504
1505 ongoing = xe_pm_runtime_get_if_active(ct_to_xe(ct));
1506 if (!ongoing && xe_pm_read_callback_task(ct_to_xe(ct)) == NULL)
1507 return;
1508
1509 spin_lock(&ct->fast_lock);
1510 do {
1511 len = g2h_read(ct, ct->fast_msg, true);
1512 if (len > 0)
1513 g2h_fast_path(ct, ct->fast_msg, len);
1514 } while (len > 0);
1515 spin_unlock(&ct->fast_lock);
1516
1517 if (ongoing)
1518 xe_pm_runtime_put(xe);
1519 }
1520
1521 /* Returns less than zero on error, 0 on done, 1 on more available */
1522 static int dequeue_one_g2h(struct xe_guc_ct *ct)
1523 {
1524 int len;
1525 int ret;
1526
1527 lockdep_assert_held(&ct->lock);
1528
1529 spin_lock_irq(&ct->fast_lock);
1530 len = g2h_read(ct, ct->msg, false);
1531 spin_unlock_irq(&ct->fast_lock);
1532 if (len <= 0)
1533 return len;
1534
1535 ret = parse_g2h_msg(ct, ct->msg, len);
1536 if (unlikely(ret < 0))
1537 return ret;
1538
1539 ret = process_g2h_msg(ct, ct->msg, len);
1540 if (unlikely(ret < 0))
1541 return ret;
1542
1543 return 1;
1544 }
1545
1546 static void receive_g2h(struct xe_guc_ct *ct)
1547 {
1548 bool ongoing;
1549 int ret;
1550
1551 /*
1552 * Normal users must always hold mem_access.ref around CT calls. However
1553 * during the runtime pm callbacks we rely on CT to talk to the GuC, but
1554 * at this stage we can't rely on mem_access.ref and even the
1555 * callback_task will be different than current. For such cases we just
1556 * need to ensure we always process the responses from any blocking
1557 * ct_send requests or where we otherwise expect some response when
1558 * initiated from those callbacks (which will need to wait for the below
1559 * dequeue_one_g2h()). The dequeue_one_g2h() will gracefully fail if
1560 * the device has suspended to the point that the CT communication has
1561 * been disabled.
1562 *
1563 * If we are inside the runtime pm callback, we can be the only task
1564 * still issuing CT requests (since that requires having the
1565 * mem_access.ref). It seems like it might in theory be possible to
1566 * receive unsolicited events from the GuC just as we are
1567 * suspending-resuming, but those will currently anyway be lost when
1568 * eventually exiting from suspend, hence no need to wake up the device
1569 * here. If we ever need something stronger than get_if_active() then
1570 * we need to be careful with blocking the pm callbacks from getting CT
1571 * responses, if the worker here is blocked on those callbacks
1572 * completing, creating a deadlock.
1573 */
1574 ongoing = xe_pm_runtime_get_if_active(ct_to_xe(ct));
1575 if (!ongoing && xe_pm_read_callback_task(ct_to_xe(ct)) == NULL)
1576 return;
1577
1578 do {
1579 mutex_lock(&ct->lock);
1580 ret = dequeue_one_g2h(ct);
1581 mutex_unlock(&ct->lock);
1582
1583 if (unlikely(ret == -EPROTO || ret == -EOPNOTSUPP)) {
1584 xe_gt_err(ct_to_gt(ct), "CT dequeue failed: %d", ret);
1585 CT_DEAD(ct, NULL, G2H_RECV);
1586 kick_reset(ct);
1587 }
1588 } while (ret == 1);
1589
1590 if (ongoing)
1591 xe_pm_runtime_put(ct_to_xe(ct));
1592 }
1593
1594 static void g2h_worker_func(struct work_struct *w)
1595 {
1596 struct xe_guc_ct *ct = container_of(w, struct xe_guc_ct, g2h_worker);
1597
1598 receive_g2h(ct);
1599 }
1600
1601 static struct xe_guc_ct_snapshot *guc_ct_snapshot_alloc(struct xe_guc_ct *ct, bool atomic,
1602 bool want_ctb)
1603 {
1604 struct xe_guc_ct_snapshot *snapshot;
1605
1606 snapshot = kzalloc(sizeof(*snapshot), atomic ? GFP_ATOMIC : GFP_KERNEL);
1607 if (!snapshot)
1608 return NULL;
1609
1610 if (ct->bo && want_ctb) {
1611 snapshot->ctb_size = ct->bo->size;
1612 snapshot->ctb = kmalloc(snapshot->ctb_size, atomic ? GFP_ATOMIC : GFP_KERNEL);
1613 }
1614
1615 return snapshot;
1616 }
1617
1618 static void guc_ctb_snapshot_capture(struct xe_device *xe, struct guc_ctb *ctb,
1619 struct guc_ctb_snapshot *snapshot)
1620 {
1621 xe_map_memcpy_from(xe, &snapshot->desc, &ctb->desc, 0,
1622 sizeof(struct guc_ct_buffer_desc));
1623 memcpy(&snapshot->info, &ctb->info, sizeof(struct guc_ctb_info));
1624 }
1625
1626 static void guc_ctb_snapshot_print(struct guc_ctb_snapshot *snapshot,
1627 struct drm_printer *p)
1628 {
1629 drm_printf(p, "\tsize: %d\n", snapshot->info.size);
1630 drm_printf(p, "\tresv_space: %d\n", snapshot->info.resv_space);
1631 drm_printf(p, "\thead: %d\n", snapshot->info.head);
1632 drm_printf(p, "\ttail: %d\n", snapshot->info.tail);
1633 drm_printf(p, "\tspace: %d\n", snapshot->info.space);
1634 drm_printf(p, "\tbroken: %d\n", snapshot->info.broken);
1635 drm_printf(p, "\thead (memory): %d\n", snapshot->desc.head);
1636 drm_printf(p, "\ttail (memory): %d\n", snapshot->desc.tail);
1637 drm_printf(p, "\tstatus (memory): 0x%x\n", snapshot->desc.status);
1638 }
1639
1640 static struct xe_guc_ct_snapshot *guc_ct_snapshot_capture(struct xe_guc_ct *ct, bool atomic,
1641 bool want_ctb)
1642 {
1643 struct xe_device *xe = ct_to_xe(ct);
1644 struct xe_guc_ct_snapshot *snapshot;
1645
1646 snapshot = guc_ct_snapshot_alloc(ct, atomic, want_ctb);
1647 if (!snapshot) {
1648 xe_gt_err(ct_to_gt(ct), "Skipping CTB snapshot entirely.\n");
1649 return NULL;
1650 }
1651
1652 if (xe_guc_ct_enabled(ct) || ct->state == XE_GUC_CT_STATE_STOPPED) {
1653 snapshot->ct_enabled = true;
1654 snapshot->g2h_outstanding = READ_ONCE(ct->g2h_outstanding);
1655 guc_ctb_snapshot_capture(xe, &ct->ctbs.h2g, &snapshot->h2g);
1656 guc_ctb_snapshot_capture(xe, &ct->ctbs.g2h, &snapshot->g2h);
1657 }
1658
1659 if (ct->bo && snapshot->ctb)
1660 xe_map_memcpy_from(xe, snapshot->ctb, &ct->bo->vmap, 0, snapshot->ctb_size);
1661
1662 return snapshot;
1663 }
1664
1665 /**
1666 * xe_guc_ct_snapshot_capture - Take a quick snapshot of the CT state.
1667 * @ct: GuC CT object.
1668 *
1669 * This can be printed out in a later stage like during dev_coredump
1670 * analysis. This is safe to be called during atomic context.
1671 *
1672 * Returns: a GuC CT snapshot object that must be freed by the caller
1673 * by using `xe_guc_ct_snapshot_free`.
1674 */
1675 struct xe_guc_ct_snapshot *xe_guc_ct_snapshot_capture(struct xe_guc_ct *ct)
1676 {
1677 return guc_ct_snapshot_capture(ct, true, true);
1678 }
1679
1680 /**
1681 * xe_guc_ct_snapshot_print - Print out a given GuC CT snapshot.
1682 * @snapshot: GuC CT snapshot object.
1683 * @p: drm_printer where it will be printed out.
1684 *
1685 * This function prints out a given GuC CT snapshot object.
1686 */
1687 void xe_guc_ct_snapshot_print(struct xe_guc_ct_snapshot *snapshot,
1688 struct drm_printer *p)
1689 {
1690 if (!snapshot)
1691 return;
1692
1693 if (snapshot->ct_enabled) {
1694 drm_puts(p, "H2G CTB (all sizes in DW):\n");
1695 guc_ctb_snapshot_print(&snapshot->h2g, p);
1696
1697 drm_puts(p, "G2H CTB (all sizes in DW):\n");
1698 guc_ctb_snapshot_print(&snapshot->g2h, p);
1699 drm_printf(p, "\tg2h outstanding: %d\n",
1700 snapshot->g2h_outstanding);
1701
1702 if (snapshot->ctb)
1703 xe_print_blob_ascii85(p, "CTB data", snapshot->ctb, 0, snapshot->ctb_size);
1704 } else {
1705 drm_puts(p, "CT disabled\n");
1706 }
1707 }
1708
1709 /**
1710 * xe_guc_ct_snapshot_free - Free all allocated objects for a given snapshot.
1711 * @snapshot: GuC CT snapshot object.
1712 *
1713 * This function free all the memory that needed to be allocated at capture
1714 * time.
1715 */
1716 void xe_guc_ct_snapshot_free(struct xe_guc_ct_snapshot *snapshot)
1717 {
1718 if (!snapshot)
1719 return;
1720
1721 kfree(snapshot->ctb);
1722 kfree(snapshot);
1723 }
1724
1725 /**
1726 * xe_guc_ct_print - GuC CT Print.
1727 * @ct: GuC CT.
1728 * @p: drm_printer where it will be printed out.
1729 * @want_ctb: Should the full CTB content be dumped (vs just the headers)
1730 *
1731 * This function will quickly capture a snapshot of the CT state
1732 * and immediately print it out.
1733 */
1734 void xe_guc_ct_print(struct xe_guc_ct *ct, struct drm_printer *p, bool want_ctb)
1735 {
1736 struct xe_guc_ct_snapshot *snapshot;
1737
1738 snapshot = guc_ct_snapshot_capture(ct, false, want_ctb);
1739 xe_guc_ct_snapshot_print(snapshot, p);
1740 xe_guc_ct_snapshot_free(snapshot);
1741 }
1742
1743 #if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
1744 static void ct_dead_capture(struct xe_guc_ct *ct, struct guc_ctb *ctb, u32 reason_code)
1745 {
1746 struct xe_guc_log_snapshot *snapshot_log;
1747 struct xe_guc_ct_snapshot *snapshot_ct;
1748 struct xe_guc *guc = ct_to_guc(ct);
1749 unsigned long flags;
1750 bool have_capture;
1751
1752 if (ctb)
1753 ctb->info.broken = true;
1754
1755 /* Ignore further errors after the first dump until a reset */
1756 if (ct->dead.reported)
1757 return;
1758
1759 spin_lock_irqsave(&ct->dead.lock, flags);
1760
1761 /* And only capture one dump at a time */
1762 have_capture = ct->dead.reason & (1 << CT_DEAD_STATE_CAPTURE);
1763 ct->dead.reason |= (1 << reason_code) |
1764 (1 << CT_DEAD_STATE_CAPTURE);
1765
1766 spin_unlock_irqrestore(&ct->dead.lock, flags);
1767
1768 if (have_capture)
1769 return;
1770
1771 snapshot_log = xe_guc_log_snapshot_capture(&guc->log, true);
1772 snapshot_ct = xe_guc_ct_snapshot_capture((ct));
1773
1774 spin_lock_irqsave(&ct->dead.lock, flags);
1775
1776 if (ct->dead.snapshot_log || ct->dead.snapshot_ct) {
1777 xe_gt_err(ct_to_gt(ct), "Got unexpected dead CT capture!\n");
1778 xe_guc_log_snapshot_free(snapshot_log);
1779 xe_guc_ct_snapshot_free(snapshot_ct);
1780 } else {
1781 ct->dead.snapshot_log = snapshot_log;
1782 ct->dead.snapshot_ct = snapshot_ct;
1783 }
1784
1785 spin_unlock_irqrestore(&ct->dead.lock, flags);
1786
1787 queue_work(system_unbound_wq, &(ct)->dead.worker);
1788 }
1789
1790 static void ct_dead_print(struct xe_dead_ct *dead)
1791 {
1792 struct xe_guc_ct *ct = container_of(dead, struct xe_guc_ct, dead);
1793 struct xe_device *xe = ct_to_xe(ct);
1794 struct xe_gt *gt = ct_to_gt(ct);
1795 static int g_count;
1796 struct drm_printer ip = xe_gt_info_printer(gt);
1797 struct drm_printer lp = drm_line_printer(&ip, "Capture", ++g_count);
1798
1799 if (!dead->reason) {
1800 xe_gt_err(gt, "CTB is dead for no reason!?\n");
1801 return;
1802 }
1803
1804 drm_printf(&lp, "CTB is dead - reason=0x%X\n", dead->reason);
1805
1806 /* Can't generate a genuine core dump at this point, so just do the good bits */
1807 drm_puts(&lp, "**** Xe Device Coredump ****\n");
1808 xe_device_snapshot_print(xe, &lp);
1809
1810 drm_printf(&lp, "**** GT #%d ****\n", gt->info.id);
1811 drm_printf(&lp, "\tTile: %d\n", gt->tile->id);
1812
1813 drm_puts(&lp, "**** GuC Log ****\n");
1814 xe_guc_log_snapshot_print(dead->snapshot_log, &lp);
1815
1816 drm_puts(&lp, "**** GuC CT ****\n");
1817 xe_guc_ct_snapshot_print(dead->snapshot_ct, &lp);
1818
1819 drm_puts(&lp, "Done.\n");
1820 }
1821
1822 static void ct_dead_worker_func(struct work_struct *w)
1823 {
1824 struct xe_guc_ct *ct = container_of(w, struct xe_guc_ct, dead.worker);
1825
1826 if (!ct->dead.reported) {
1827 ct->dead.reported = true;
1828 ct_dead_print(&ct->dead);
1829 }
1830
1831 spin_lock_irq(&ct->dead.lock);
1832
1833 xe_guc_log_snapshot_free(ct->dead.snapshot_log);
1834 ct->dead.snapshot_log = NULL;
1835 xe_guc_ct_snapshot_free(ct->dead.snapshot_ct);
1836 ct->dead.snapshot_ct = NULL;
1837
1838 if (ct->dead.reason & (1 << CT_DEAD_STATE_REARM)) {
1839 /* A reset has occurred so re-arm the error reporting */
1840 ct->dead.reason = 0;
1841 ct->dead.reported = false;
1842 }
1843
1844 spin_unlock_irq(&ct->dead.lock);
1845 }
1846 #endif
1847