/*
 * Copyright 2023 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "kfd_debug.h"
#include "kfd_device_queue_manager.h"
#include "kfd_topology.h"
#include <linux/file.h>
#include <uapi/linux/kfd_ioctl.h>
#include <uapi/linux/kfd_sysfs.h>

#define MAX_WATCH_ADDRESSES	4

int kfd_dbg_ev_query_debug_event(struct kfd_process *process,
			unsigned int *queue_id,
			unsigned int *gpu_id,
			uint64_t exception_clear_mask,
			uint64_t *event_status)
{
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	int i;

	if (!(process && process->debug_trap_enabled))
		return -ENODATA;

	mutex_lock(&process->event_mutex);
	*event_status = 0;
	*queue_id = 0;
	*gpu_id = 0;

	/* find and report queue events */
	pqm = &process->pqm;
	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		uint64_t tmp = process->exception_enable_mask;

		if (!pqn->q)
			continue;

		tmp &= pqn->q->properties.exception_status;

		if (!tmp)
			continue;

		*event_status = pqn->q->properties.exception_status;
		*queue_id = pqn->q->properties.queue_id;
		*gpu_id = pqn->q->device->id;
		pqn->q->properties.exception_status &= ~exception_clear_mask;
		goto out;
	}

	/* find and report device events */
	for (i = 0; i < process->n_pdds; i++) {
		struct kfd_process_device *pdd = process->pdds[i];
		uint64_t tmp = process->exception_enable_mask
						& pdd->exception_status;

		if (!tmp)
			continue;

		*event_status = pdd->exception_status;
		*gpu_id = pdd->dev->id;
		pdd->exception_status &= ~exception_clear_mask;
		goto out;
	}

	/* report process events */
	if (process->exception_enable_mask & process->exception_status) {
		*event_status = process->exception_status;
		process->exception_status &= ~exception_clear_mask;
	}

out:
	mutex_unlock(&process->event_mutex);
	return *event_status ? 0 : -EAGAIN;
}

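/*
 * Debugger-side consumption of the query above, as a rough sketch.  The
 * real uapi layout lives in uapi/linux/kfd_ioctl.h and should be checked
 * there; the op and field names below are illustrative assumptions:
 *
 *	struct pollfd pfd = { .fd = dbg_ev_fd, .events = POLLIN };
 *
 *	poll(&pfd, 1, -1);	// woken by the '.' byte the kernel writes
 *	for (;;) {
 *		args.op = KFD_IOC_DBG_TRAP_QUERY_DEBUG_EVENT;
 *		args.query_debug_event.exception_mask = wanted_mask;
 *		if (ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args))
 *			break;		// -EAGAIN once nothing is pending
 *		handle_event(&args);	// gpu_id/queue_id filled in above
 *	}
 */
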
void debug_event_write_work_handler(struct work_struct *work)
{
	struct kfd_process *process;

	static const char write_data = '.';
	loff_t pos = 0;

	process = container_of(work,
			struct kfd_process,
			debug_event_workarea);

	if (process->debug_trap_enabled && process->dbg_ev_file)
		kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
}

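/*
 * The single '.' byte written by the handler above only serves to wake
 * pollers of the event file descriptor.  Contexts that cannot safely write
 * a file directly (e.g. the interrupt path in kfd_set_dbg_ev_from_interrupt())
 * raise events with use_worker = true so the write is deferred to this work
 * handler instead.
 */
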
/* Update process/device/queue exception status; write to the event file
 * descriptor only if the raised exception is subscribed to in the process
 * exception_enable_mask.
 */
bool kfd_dbg_ev_raise(uint64_t event_mask,
			struct kfd_process *process, struct kfd_node *dev,
			unsigned int source_id, bool use_worker,
			void *exception_data, size_t exception_data_size)
{
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	int i;
	static const char write_data = '.';
	loff_t pos = 0;
	bool is_subscribed = true;

	if (!(process && process->debug_trap_enabled))
		return false;

	mutex_lock(&process->event_mutex);

	if (event_mask & KFD_EC_MASK_DEVICE) {
		for (i = 0; i < process->n_pdds; i++) {
			struct kfd_process_device *pdd = process->pdds[i];

			if (pdd->dev != dev)
				continue;

			pdd->exception_status |= event_mask & KFD_EC_MASK_DEVICE;

			if (event_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
				if (!pdd->vm_fault_exc_data) {
					pdd->vm_fault_exc_data = kmemdup(
							exception_data,
							exception_data_size,
							GFP_KERNEL);
					if (!pdd->vm_fault_exc_data)
						pr_debug("Failed to allocate exception data memory\n");
				} else {
					pr_debug("Debugger exception data not saved\n");
					print_hex_dump_bytes("exception data: ",
							DUMP_PREFIX_OFFSET,
							exception_data,
							exception_data_size);
				}
			}
			break;
		}
	} else if (event_mask & KFD_EC_MASK_PROCESS) {
		process->exception_status |= event_mask & KFD_EC_MASK_PROCESS;
	} else {
		pqm = &process->pqm;
		list_for_each_entry(pqn, &pqm->queues,
				process_queue_list) {
			int target_id;

			if (!pqn->q)
				continue;

			target_id = event_mask & KFD_EC_MASK(EC_QUEUE_NEW) ?
					pqn->q->properties.queue_id :
					pqn->q->doorbell_id;

			if (pqn->q->device != dev || target_id != source_id)
				continue;

			pqn->q->properties.exception_status |= event_mask;
			break;
		}
	}

	if (process->exception_enable_mask & event_mask) {
		if (use_worker)
			schedule_work(&process->debug_event_workarea);
		else
			kernel_write(process->dbg_ev_file,
					&write_data,
					1,
					&pos);
	} else {
		is_subscribed = false;
	}

	mutex_unlock(&process->event_mutex);

	return is_subscribed;
}

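/*
 * Routing summary for kfd_dbg_ev_raise() above: KFD_EC_MASK_DEVICE bits land
 * on the matching device (pdd), KFD_EC_MASK_PROCESS bits on the process
 * itself, and all remaining bits are treated as queue events, matched by
 * queue_id for EC_QUEUE_NEW and by doorbell_id otherwise.  The return value
 * tells the caller whether the debugger had subscribed to the event at all.
 */
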
/* set pending event queue entry from ring entry */
bool kfd_set_dbg_ev_from_interrupt(struct kfd_node *dev,
				   unsigned int pasid,
				   uint32_t doorbell_id,
				   uint64_t trap_mask,
				   void *exception_data,
				   size_t exception_data_size)
{
	struct kfd_process *p;
	struct kfd_process_device *pdd = NULL;
	bool signaled_to_debugger_or_runtime = false;

	p = kfd_lookup_process_by_pasid(pasid, &pdd);

	if (!pdd)
		return false;

	if (!kfd_dbg_ev_raise(trap_mask, p, dev, doorbell_id, true,
			      exception_data, exception_data_size)) {
		struct process_queue_manager *pqm;
		struct process_queue_node *pqn;

		if (!!(trap_mask & KFD_EC_MASK_QUEUE) &&
		    p->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED) {
			mutex_lock(&p->mutex);

			pqm = &p->pqm;
			list_for_each_entry(pqn, &pqm->queues,
					    process_queue_list) {

				if (!(pqn->q && pqn->q->device == dev &&
				      pqn->q->doorbell_id == doorbell_id))
					continue;

				kfd_send_exception_to_runtime(p, pqn->q->properties.queue_id,
							      trap_mask);

				signaled_to_debugger_or_runtime = true;

				break;
			}

			mutex_unlock(&p->mutex);
		} else if (trap_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
			kfd_evict_process_device(pdd);
			kfd_signal_vm_fault_event(pdd, NULL, exception_data);

			signaled_to_debugger_or_runtime = true;
		}
	} else {
		signaled_to_debugger_or_runtime = true;
	}

	kfd_unref_process(p);

	return signaled_to_debugger_or_runtime;
}

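/*
 * Delivery order in the interrupt path above: the debugger gets first
 * refusal via kfd_dbg_ev_raise().  If it is absent or unsubscribed, queue
 * exceptions are forwarded to an enabled runtime, and an unclaimed memory
 * violation falls back to evicting the offending device and signalling a
 * VM fault event to the process.
 */
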
int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
				      unsigned int dev_id,
				      unsigned int queue_id,
				      uint64_t error_reason)
{
	if (error_reason & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
		struct kfd_process_device *pdd = NULL;
		struct kfd_hsa_memory_exception_data *data;
		int i;

		for (i = 0; i < p->n_pdds; i++) {
			if (p->pdds[i]->dev->id == dev_id) {
				pdd = p->pdds[i];
				break;
			}
		}

		if (!pdd)
			return -ENODEV;

		data = (struct kfd_hsa_memory_exception_data *)
				pdd->vm_fault_exc_data;

		kfd_evict_process_device(pdd);
		kfd_signal_vm_fault_event(pdd, NULL, data);
		error_reason &= ~KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION);
	}

	if (error_reason & (KFD_EC_MASK(EC_PROCESS_RUNTIME))) {
		/*
		 * The blocked runtime-enable call should only be released
		 * after the debugger has received the runtime enable notice.
		 */
		up(&p->runtime_enable_sema);
		error_reason &= ~KFD_EC_MASK(EC_PROCESS_RUNTIME);
	}

	if (error_reason)
		return kfd_send_exception_to_runtime(p, queue_id, error_reason);

	return 0;
}

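/*
 * The EC_PROCESS_RUNTIME case above pairs with the deferred runtime-enable
 * handshake: a debugged process blocks on runtime_enable_sema until the
 * debugger acknowledges the runtime enable notice by sending this exception
 * back, at which point the up() lets the runtime continue.
 */
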
static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable)
{
	struct mqd_update_info minfo = {0};
	int err;

	if (!q)
		return 0;

	if (!kfd_dbg_has_cwsr_workaround(q->device))
		return 0;

	if (enable && q->properties.is_user_cu_masked)
		return -EBUSY;

	minfo.update_flag = enable ? UPDATE_FLAG_DBG_WA_ENABLE : UPDATE_FLAG_DBG_WA_DISABLE;

	q->properties.is_dbg_wa = enable;
	err = q->device->dqm->ops.update_queue(q->device->dqm, q, &minfo);
	if (err)
		q->properties.is_dbg_wa = false;

	return err;
}

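/*
 * On the -EBUSY above: enabling the CWSR workaround on a queue that already
 * carries a user-set CU mask is refused rather than silently overriding it;
 * the presumption here is that the workaround and the user mask program the
 * same per-queue CU-mask state.
 */
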
static int kfd_dbg_set_workaround(struct kfd_process *target, bool enable)
{
	struct process_queue_manager *pqm = &target->pqm;
	struct process_queue_node *pqn;
	int r = 0;

	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		r = kfd_dbg_set_queue_workaround(pqn->q, enable);
		if (enable && r)
			goto unwind;
	}

	return 0;

unwind:
	list_for_each_entry(pqn, &pqm->queues, process_queue_list)
		kfd_dbg_set_queue_workaround(pqn->q, false);

	if (enable)
		target->runtime_info.runtime_state = r == -EBUSY ?
				DEBUG_RUNTIME_STATE_ENABLED_BUSY :
				DEBUG_RUNTIME_STATE_ENABLED_ERROR;

	return r;
}

int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd, bool sq_trap_en)
{
	uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd->spi_dbg_launch_mode;
	uint32_t flags = pdd->process->dbg_flags;
	struct amdgpu_device *adev = pdd->dev->adev;
	int r;

	if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
		return 0;

	if (!pdd->proc_ctx_cpu_ptr) {
		r = amdgpu_amdkfd_alloc_kernel_mem(adev,
				AMDGPU_MES_PROC_CTX_SIZE,
				AMDGPU_GEM_DOMAIN_GTT,
				&pdd->proc_ctx_bo,
				&pdd->proc_ctx_gpu_addr,
				&pdd->proc_ctx_cpu_ptr,
				false);
		if (r) {
			dev_err(adev->dev,
				"failed to allocate process context bo\n");
			return r;
		}
		memset(pdd->proc_ctx_cpu_ptr, 0, AMDGPU_MES_PROC_CTX_SIZE);
	}

	return amdgpu_mes_set_shader_debugger(pdd->dev->adev,
			pdd->proc_ctx_gpu_addr, spi_dbg_cntl,
			pdd->watch_points, flags, sq_trap_en,
			ffs(pdd->dev->xcc_mask) - 1);
}

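/*
 * For the MES path above: spi_dbg_cntl merges the per-device trap override
 * and wave launch mode cached in the pdd, and flags carries the process-wide
 * KFD_DBG_TRAP_FLAG_* bits from kfd_dbg_trap_set_flags().  MES programs this
 * state per VMID, hence the early return on devices without per-VMID debug
 * support.
 */
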
#define KFD_DEBUGGER_INVALID_WATCH_POINT_ID -1
static int kfd_dbg_get_dev_watch_id(struct kfd_process_device *pdd, int *watch_id)
{
	int i;

	*watch_id = KFD_DEBUGGER_INVALID_WATCH_POINT_ID;

	spin_lock(&pdd->dev->watch_points_lock);

	for (i = 0; i < MAX_WATCH_ADDRESSES; i++) {
		/* device watchpoint in use so skip */
		if ((pdd->dev->alloc_watch_ids >> i) & 0x1)
			continue;

		pdd->alloc_watch_ids |= 0x1 << i;
		pdd->dev->alloc_watch_ids |= 0x1 << i;
		*watch_id = i;
		spin_unlock(&pdd->dev->watch_points_lock);
		return 0;
	}

	spin_unlock(&pdd->dev->watch_points_lock);

	return -ENOMEM;
}

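/*
 * Worked example for the allocator above: with MAX_WATCH_ADDRESSES = 4 the
 * device tracks slots in bits 0..3 of alloc_watch_ids.  If the device mask
 * is 0b0101, bit 1 is the first free slot, so watch_id 1 is claimed in both
 * the process and device masks, leaving 0b0111; a full mask (0b1111) makes
 * the search fall through and return -ENOMEM.
 */
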
static void kfd_dbg_clear_dev_watch_id(struct kfd_process_device *pdd, u32 watch_id)
{
	spin_lock(&pdd->dev->watch_points_lock);

	/* process owns device watch point so safe to clear */
	if (pdd->alloc_watch_ids & BIT(watch_id)) {
		pdd->alloc_watch_ids &= ~BIT(watch_id);
		pdd->dev->alloc_watch_ids &= ~BIT(watch_id);
	}

	spin_unlock(&pdd->dev->watch_points_lock);
}

static bool kfd_dbg_owns_dev_watch_id(struct kfd_process_device *pdd, u32 watch_id)
{
	bool owns_watch_id = false;

	spin_lock(&pdd->dev->watch_points_lock);
	owns_watch_id = pdd->alloc_watch_ids & BIT(watch_id);
	spin_unlock(&pdd->dev->watch_points_lock);

	return owns_watch_id;
}

int kfd_dbg_trap_clear_dev_address_watch(struct kfd_process_device *pdd,
					 uint32_t watch_id)
{
	int r;

	if (watch_id >= MAX_WATCH_ADDRESSES)
		return -EINVAL;

	if (!kfd_dbg_owns_dev_watch_id(pdd, watch_id))
		return -EINVAL;

	if (!pdd->dev->kfd->shared_resources.enable_mes) {
		r = debug_lock_and_unmap(pdd->dev->dqm);
		if (r)
			return r;
	}

	amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
	pdd->watch_points[watch_id] = pdd->dev->kfd2kgd->clear_address_watch(
			pdd->dev->adev,
			watch_id);
	amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

	if (!pdd->dev->kfd->shared_resources.enable_mes)
		r = debug_map_and_unlock(pdd->dev->dqm);
	else
		r = kfd_dbg_set_mes_debug_mode(pdd, true);

	kfd_dbg_clear_dev_watch_id(pdd, watch_id);

	return r;
}

int kfd_dbg_trap_set_dev_address_watch(struct kfd_process_device *pdd,
				       uint64_t watch_address,
				       uint32_t watch_address_mask,
				       uint32_t *watch_id,
				       uint32_t watch_mode)
{
	int xcc_id, r = kfd_dbg_get_dev_watch_id(pdd, watch_id);
	uint32_t xcc_mask = pdd->dev->xcc_mask;

	if (r)
		return r;

	if (*watch_id >= MAX_WATCH_ADDRESSES)
		return -EINVAL;

	if (!pdd->dev->kfd->shared_resources.enable_mes) {
		r = debug_lock_and_unmap(pdd->dev->dqm);
		if (r) {
			kfd_dbg_clear_dev_watch_id(pdd, *watch_id);
			return r;
		}
	}

	amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
	for_each_inst(xcc_id, xcc_mask)
		pdd->watch_points[*watch_id] = pdd->dev->kfd2kgd->set_address_watch(
				pdd->dev->adev,
				watch_address,
				watch_address_mask,
				*watch_id,
				watch_mode,
				pdd->dev->vm_info.last_vmid_kfd,
				xcc_id);
	amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

	if (!pdd->dev->kfd->shared_resources.enable_mes)
		r = debug_map_and_unlock(pdd->dev->dqm);
	else
		r = kfd_dbg_set_mes_debug_mode(pdd, true);

	/* HWS is broken so there is no point in a HW rollback, but release
	 * the watch point anyway.
	 */
	if (r)
		kfd_dbg_clear_dev_watch_id(pdd, *watch_id);

	return 0;
}

static void kfd_dbg_clear_process_address_watch(struct kfd_process *target)
{
	int i, j;

	for (i = 0; i < target->n_pdds; i++)
		for (j = 0; j < MAX_WATCH_ADDRESSES; j++)
			kfd_dbg_trap_clear_dev_address_watch(target->pdds[i], j);
}

int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags)
{
	uint32_t prev_flags = target->dbg_flags;
	int i, r = 0, rewind_count = 0;

	for (i = 0; i < target->n_pdds; i++) {
		uint32_t caps;
		uint32_t caps2;
		struct kfd_topology_device *topo_dev =
				kfd_topology_device_by_id(target->pdds[i]->dev->id);

		if (!topo_dev)
			return -EINVAL;

		caps = topo_dev->node_props.capability;
		caps2 = topo_dev->node_props.capability2;

		if (!(caps & HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED) &&
		    (*flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP)) {
			*flags = prev_flags;
			return -EACCES;
		}

		if (!(caps & HSA_CAP_TRAP_DEBUG_PRECISE_ALU_OPERATIONS_SUPPORTED) &&
		    (*flags & KFD_DBG_TRAP_FLAG_SINGLE_ALU_OP)) {
			*flags = prev_flags;
			return -EACCES;
		}

		if (!(caps2 & HSA_CAP2_TRAP_DEBUG_LDS_OUT_OF_ADDR_RANGE_SUPPORTED) &&
		    (*flags & KFD_DBG_TRAP_FLAG_LDS_OUT_OF_ADDR_RANGE)) {
			*flags = prev_flags;
			return -EACCES;
		}
	}

	target->dbg_flags = *flags;
	*flags = prev_flags;
	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
			continue;

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd, true);

		if (r) {
			target->dbg_flags = prev_flags;
			break;
		}

		rewind_count++;
	}

	/* Rewind flags */
	if (r) {
		target->dbg_flags = prev_flags;

		for (i = 0; i < rewind_count; i++) {
			struct kfd_process_device *pdd = target->pdds[i];

			if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
				continue;

			if (!pdd->dev->kfd->shared_resources.enable_mes)
				(void)debug_refresh_runlist(pdd->dev->dqm);
			else
				(void)kfd_dbg_set_mes_debug_mode(pdd, true);
		}
	}

	return r;
}

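/*
 * Failure-path detail for kfd_dbg_trap_set_flags() above: rewind_count only
 * counts devices whose runlist/MES state was successfully refreshed with the
 * new flags, so after restoring prev_flags the rewind loop re-refreshes
 * exactly those devices.  Note the old-value swap: on success the previous
 * flags are handed back to the caller through *flags.
 */
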
/* kfd_dbg_trap_deactivate:
 *	target: target process
 *	unwind: if this is unwinding a failed kfd_dbg_trap_enable()
 *	unwind_count:
 *		If unwind == true, how far down the pdd list we need
 *		to unwind
 *		else: ignored
 */
void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count)
{
	int i;

	if (!unwind) {
		uint32_t flags = 0;
		int resume_count = resume_queues(target, 0, NULL);

		if (resume_count)
			pr_debug("Resumed %d queues\n", resume_count);

		cancel_work_sync(&target->debug_event_workarea);
		kfd_dbg_clear_process_address_watch(target);
		kfd_dbg_trap_set_wave_launch_mode(target, 0);

		kfd_dbg_trap_set_flags(target, &flags);
	}

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		/* If this is an unwind, and we have unwound the required
		 * enable calls on the pdd list, we need to stop now
		 * otherwise we may mess up another debugger session.
		 */
		if (unwind && i == unwind_count)
			break;

		kfd_process_set_trap_debug_flag(&pdd->qpd, false);

		/* GFX off was already disabled by debug activate if RLC
		 * restore is not supported.
		 */
		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
			amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_override =
				pdd->dev->kfd2kgd->disable_debug_trap(
				pdd->dev->adev,
				target->runtime_info.ttmp_setup,
				pdd->dev->vm_info.last_vmid_kfd);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev) &&
		    release_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd))
			pr_err("Failed to release debug vmid on [%i]\n", pdd->dev->id);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			(void)debug_refresh_runlist(pdd->dev->dqm);
		else
			(void)kfd_dbg_set_mes_debug_mode(pdd,
					!kfd_dbg_has_cwsr_workaround(pdd->dev));
	}

	kfd_dbg_set_workaround(target, false);
}

static void kfd_dbg_clean_exception_status(struct kfd_process *target)
{
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	int i;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		kfd_process_drain_interrupts(pdd);

		pdd->exception_status = 0;
	}

	pqm = &target->pqm;
	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		if (!pqn->q)
			continue;

		pqn->q->properties.exception_status = 0;
	}

	target->exception_status = 0;
}

int kfd_dbg_trap_disable(struct kfd_process *target)
{
	if (!target->debug_trap_enabled)
		return 0;

	/*
	 * If the runtime is not enabled, deactivation was deferred to the
	 * runtime-enable path and there is nothing to tear down here.  For
	 * an attached, running target, deactivate now; if activation had
	 * left the runtime state in a busy/error variant, reset it to plain
	 * "enabled" so a debugger can re-attach later.
	 */
	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
		kfd_dbg_trap_deactivate(target, false, 0);
	else if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED)
		target->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED;

	cancel_work_sync(&target->debug_event_workarea);
	fput(target->dbg_ev_file);
	target->dbg_ev_file = NULL;

	if (target->debugger_process) {
		atomic_dec(&target->debugger_process->debugged_process_count);
		target->debugger_process = NULL;
	}

	target->debug_trap_enabled = false;
	kfd_dbg_clean_exception_status(target);
	kfd_unref_process(target);

	return 0;
}

int kfd_dbg_trap_activate(struct kfd_process *target)
{
	int i, r = 0;

	r = kfd_dbg_set_workaround(target, true);
	if (r)
		return r;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) {
			r = reserve_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd);

			if (r) {
				target->runtime_info.runtime_state = (r == -EBUSY) ?
					DEBUG_RUNTIME_STATE_ENABLED_BUSY :
					DEBUG_RUNTIME_STATE_ENABLED_ERROR;

				goto unwind_err;
			}
		}

		/* Disable GFX OFF to prevent garbage reads/writes of debug
		 * registers.  If RLC restore of debug registers is not
		 * supported and runtime enable hasn't done so already on
		 * ttmp setup request, restore the trap config registers.
		 *
		 * If RLC restore of debug registers is not supported, keep
		 * GFX OFF disabled for the debug session.
		 */
		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		if (!(kfd_dbg_is_rlc_restore_supported(pdd->dev) ||
		      target->runtime_info.ttmp_setup))
			pdd->dev->kfd2kgd->enable_debug_trap(pdd->dev->adev, true,
					pdd->dev->vm_info.last_vmid_kfd);

		pdd->spi_dbg_override = pdd->dev->kfd2kgd->enable_debug_trap(
				pdd->dev->adev,
				false,
				pdd->dev->vm_info.last_vmid_kfd);

		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
			amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		/*
		 * Setting the debug flag in the trap handler requires that the TMA has
		 * been allocated, which occurs during CWSR initialization.
		 * In the event that CWSR has not been initialized at this point, setting
		 * the flag will be called again during CWSR initialization if the target
		 * process is still debug enabled.
		 */
		kfd_process_set_trap_debug_flag(&pdd->qpd, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd, true);

		if (r) {
			target->runtime_info.runtime_state =
					DEBUG_RUNTIME_STATE_ENABLED_ERROR;
			goto unwind_err;
		}
	}

	return 0;

unwind_err:
	/* Enabling debug failed, we need to disable on
	 * all GPUs so the enable is all or nothing.
	 */
	kfd_dbg_trap_deactivate(target, true, i);
	return r;
}

int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
			void __user *runtime_info, uint32_t *runtime_size)
{
	struct file *f;
	uint32_t copy_size;
	int i, r = 0;

	if (target->debug_trap_enabled)
		return -EALREADY;

	/* Enable pre-checks */
	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!KFD_IS_SOC15(pdd->dev))
			return -ENODEV;

		if (pdd->qpd.num_gws && (!kfd_dbg_has_gws_support(pdd->dev) ||
					 kfd_dbg_has_cwsr_workaround(pdd->dev)))
			return -EBUSY;
	}

	copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info));

	f = fget(fd);
	if (!f) {
		pr_err("Failed to get file for (%i)\n", fd);
		return -EBADF;
	}

	target->dbg_ev_file = f;

	/* defer activation to runtime if not runtime enabled */
	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
		kfd_dbg_trap_activate(target);

	/* We already hold the process reference but hold another one for the
	 * debug session.
	 */
	kref_get(&target->ref);
	target->debug_trap_enabled = true;

	if (target->debugger_process)
		atomic_inc(&target->debugger_process->debugged_process_count);

	if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size)) {
		kfd_dbg_trap_deactivate(target, false, 0);
		r = -EFAULT;
	}

	*runtime_size = sizeof(target->runtime_info);

	return r;
}

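/*
 * Hedged attach sketch for kfd_dbg_trap_enable() above: the fd passed in
 * only needs to be writable by the kernel, since event delivery is a single
 * '.' byte, so a pipe whose read end the debugger polls is a natural fit
 * (illustrative only, not a mandated pattern):
 *
 *	int ev[2];
 *
 *	pipe(ev);	// ev[0]: debugger polls, ev[1]: kernel writes
 *	// pass ev[1] as the fd argument of the enable op, then poll ev[0]
 *	// for POLLIN before querying debug events.
 */
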
static int kfd_dbg_validate_trap_override_request(struct kfd_process *p,
						  uint32_t trap_override,
						  uint32_t trap_mask_request,
						  uint32_t *trap_mask_supported)
{
	int i = 0;

	*trap_mask_supported = 0xffffffff;

	for (i = 0; i < p->n_pdds; i++) {
		struct kfd_process_device *pdd = p->pdds[i];
		int err = pdd->dev->kfd2kgd->validate_trap_override_request(
				pdd->dev->adev,
				trap_override,
				trap_mask_supported);

		if (err)
			return err;
	}

	if (trap_mask_request & ~*trap_mask_supported)
		return -EACCES;

	return 0;
}

int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target,
					  uint32_t trap_override,
					  uint32_t trap_mask_bits,
					  uint32_t trap_mask_request,
					  uint32_t *trap_mask_prev,
					  uint32_t *trap_mask_supported)
{
	int r = 0, i;

	r = kfd_dbg_validate_trap_override_request(target,
						   trap_override,
						   trap_mask_request,
						   trap_mask_supported);

	if (r)
		return r;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_override = pdd->dev->kfd2kgd->set_wave_launch_trap_override(
				pdd->dev->adev,
				pdd->dev->vm_info.last_vmid_kfd,
				trap_override,
				trap_mask_bits,
				trap_mask_request,
				trap_mask_prev,
				pdd->spi_dbg_override);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd, true);

		if (r)
			break;
	}

	return r;
}

int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process *target,
				      uint8_t wave_launch_mode)
{
	int r = 0, i;

	if (wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_NORMAL &&
	    wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_HALT &&
	    wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_DEBUG)
		return -EINVAL;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_launch_mode = pdd->dev->kfd2kgd->set_wave_launch_mode(
				pdd->dev->adev,
				wave_launch_mode,
				pdd->dev->vm_info.last_vmid_kfd);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd, true);

		if (r)
			break;
	}

	return r;
}

int kfd_dbg_trap_query_exception_info(struct kfd_process *target,
				      uint32_t source_id,
				      uint32_t exception_code,
				      bool clear_exception,
				      void __user *info,
				      uint32_t *info_size)
{
	bool found = false;
	int r = 0;
	uint32_t copy_size, actual_info_size = 0;
	uint64_t *exception_status_ptr = NULL;

	if (!target)
		return -EINVAL;

	if (!info || !info_size)
		return -EINVAL;

	mutex_lock(&target->event_mutex);

	if (KFD_DBG_EC_TYPE_IS_QUEUE(exception_code)) {
		/* Per queue exceptions */
		struct queue *queue = NULL;
		int i;

		for (i = 0; i < target->n_pdds; i++) {
			struct kfd_process_device *pdd = target->pdds[i];
			struct qcm_process_device *qpd = &pdd->qpd;

			list_for_each_entry(queue, &qpd->queues_list, list) {
				if (!found && queue->properties.queue_id == source_id) {
					found = true;
					break;
				}
			}
			if (found)
				break;
		}

		if (!found) {
			r = -EINVAL;
			goto out;
		}

		if (!(queue->properties.exception_status & KFD_EC_MASK(exception_code))) {
			r = -ENODATA;
			goto out;
		}
		exception_status_ptr = &queue->properties.exception_status;
	} else if (KFD_DBG_EC_TYPE_IS_DEVICE(exception_code)) {
		/* Per device exceptions */
		struct kfd_process_device *pdd = NULL;
		int i;

		for (i = 0; i < target->n_pdds; i++) {
			pdd = target->pdds[i];
			if (pdd->dev->id == source_id) {
				found = true;
				break;
			}
		}

		if (!found) {
			r = -EINVAL;
			goto out;
		}

		if (!(pdd->exception_status & KFD_EC_MASK(exception_code))) {
			r = -ENODATA;
			goto out;
		}

		if (exception_code == EC_DEVICE_MEMORY_VIOLATION) {
			copy_size = min((size_t)(*info_size), pdd->vm_fault_exc_data_size);

			if (copy_to_user(info, pdd->vm_fault_exc_data, copy_size)) {
				r = -EFAULT;
				goto out;
			}
			actual_info_size = pdd->vm_fault_exc_data_size;
			if (clear_exception) {
				kfree(pdd->vm_fault_exc_data);
				pdd->vm_fault_exc_data = NULL;
				pdd->vm_fault_exc_data_size = 0;
			}
		}
		exception_status_ptr = &pdd->exception_status;
	} else if (KFD_DBG_EC_TYPE_IS_PROCESS(exception_code)) {
		/* Per process exceptions */
		if (!(target->exception_status & KFD_EC_MASK(exception_code))) {
			r = -ENODATA;
			goto out;
		}

		if (exception_code == EC_PROCESS_RUNTIME) {
			copy_size = min((size_t)(*info_size), sizeof(target->runtime_info));

			if (copy_to_user(info, (void *)&target->runtime_info, copy_size)) {
				r = -EFAULT;
				goto out;
			}

			actual_info_size = sizeof(target->runtime_info);
		}

		exception_status_ptr = &target->exception_status;
	} else {
		pr_debug("Bad exception type [%i]\n", exception_code);
		r = -EINVAL;
		goto out;
	}

	*info_size = actual_info_size;
	if (clear_exception)
		*exception_status_ptr &= ~KFD_EC_MASK(exception_code);
out:
	mutex_unlock(&target->event_mutex);
	return r;
}

int kfd_dbg_trap_device_snapshot(struct kfd_process *target,
		uint64_t exception_clear_mask,
		void __user *user_info,
		uint32_t *number_of_device_infos,
		uint32_t *entry_size)
{
	struct kfd_dbg_device_info_entry device_info;
	uint32_t tmp_entry_size, tmp_num_devices;
	int i, r = 0;

	if (!(target && user_info && number_of_device_infos && entry_size))
		return -EINVAL;

	tmp_entry_size = *entry_size;

	tmp_num_devices = min_t(size_t, *number_of_device_infos, target->n_pdds);
	*number_of_device_infos = target->n_pdds;
	*entry_size = min_t(size_t, *entry_size, sizeof(device_info));

	if (!tmp_num_devices)
		return 0;

	memset(&device_info, 0, sizeof(device_info));

	mutex_lock(&target->event_mutex);

	/* Run over all pdds of the process */
	for (i = 0; i < tmp_num_devices; i++) {
		struct kfd_process_device *pdd = target->pdds[i];
		struct kfd_topology_device *topo_dev = kfd_topology_device_by_id(pdd->dev->id);

		if (!topo_dev) {
			r = -EINVAL;
			break;
		}

		device_info.gpu_id = pdd->dev->id;
		device_info.exception_status = pdd->exception_status;
		device_info.lds_base = pdd->lds_base;
		device_info.lds_limit = pdd->lds_limit;
		device_info.scratch_base = pdd->scratch_base;
		device_info.scratch_limit = pdd->scratch_limit;
		device_info.gpuvm_base = pdd->gpuvm_base;
		device_info.gpuvm_limit = pdd->gpuvm_limit;
		device_info.location_id = topo_dev->node_props.location_id;
		device_info.vendor_id = topo_dev->node_props.vendor_id;
		device_info.device_id = topo_dev->node_props.device_id;
		device_info.revision_id = pdd->dev->adev->pdev->revision;
		device_info.subsystem_vendor_id = pdd->dev->adev->pdev->subsystem_vendor;
		device_info.subsystem_device_id = pdd->dev->adev->pdev->subsystem_device;
		device_info.fw_version = pdd->dev->kfd->mec_fw_version;
		device_info.gfx_target_version =
			topo_dev->node_props.gfx_target_version;
		device_info.simd_count = topo_dev->node_props.simd_count;
		device_info.max_waves_per_simd =
			topo_dev->node_props.max_waves_per_simd;
		device_info.array_count = topo_dev->node_props.array_count;
		device_info.simd_arrays_per_engine =
			topo_dev->node_props.simd_arrays_per_engine;
		device_info.num_xcc = NUM_XCC(pdd->dev->xcc_mask);
		device_info.capability = topo_dev->node_props.capability;
		device_info.debug_prop = topo_dev->node_props.debug_prop;
		device_info.capability2 = topo_dev->node_props.capability2;

		if (exception_clear_mask)
			pdd->exception_status &= ~exception_clear_mask;

		if (copy_to_user(user_info, &device_info, *entry_size)) {
			r = -EFAULT;
			break;
		}

		user_info += tmp_entry_size;
	}

	mutex_unlock(&target->event_mutex);

	return r;
}

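/*
 * Two-call sizing pattern for the snapshot above: calling it first with
 * *number_of_device_infos = 0 returns the device count and the entry size
 * the kernel supports, so the caller can size its buffer and call again.
 * Entries are then written at the caller's original stride (tmp_entry_size)
 * while only min(caller, kernel) bytes of each entry are copied, keeping
 * older, smaller user structs compatible.
 */
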
void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
					uint64_t exception_set_mask)
{
	uint64_t found_mask = 0;
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	static const char write_data = '.';
	loff_t pos = 0;
	int i;

	mutex_lock(&target->event_mutex);

	found_mask |= target->exception_status;

	pqm = &target->pqm;
	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		if (!pqn->q)
			continue;

		found_mask |= pqn->q->properties.exception_status;
	}

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		found_mask |= pdd->exception_status;
	}

	if (exception_set_mask & found_mask)
		kernel_write(target->dbg_ev_file, &write_data, 1, &pos);

	target->exception_enable_mask = exception_set_mask;

	mutex_unlock(&target->event_mutex);
}