xref: /linux/drivers/gpu/drm/amd/amdkfd/kfd_debug.c (revision 3fa7187eceee11998f756481e45ce8c4f9d9dc48)
1 /*
2  * Copyright 2023 Advanced Micro Devices, Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included in
12  * all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20  * OTHER DEALINGS IN THE SOFTWARE.
21  */
22 
23 #include "kfd_debug.h"
24 #include "kfd_device_queue_manager.h"
25 #include "kfd_topology.h"
26 #include <linux/file.h>
27 #include <uapi/linux/kfd_ioctl.h>
28 
29 #define MAX_WATCH_ADDRESSES	4
30 
31 int kfd_dbg_ev_query_debug_event(struct kfd_process *process,
32 		      unsigned int *queue_id,
33 		      unsigned int *gpu_id,
34 		      uint64_t exception_clear_mask,
35 		      uint64_t *event_status)
36 {
37 	struct process_queue_manager *pqm;
38 	struct process_queue_node *pqn;
39 	int i;
40 
41 	if (!(process && process->debug_trap_enabled))
42 		return -ENODATA;
43 
44 	mutex_lock(&process->event_mutex);
45 	*event_status = 0;
46 	*queue_id = 0;
47 	*gpu_id = 0;
48 
49 	/* find and report queue events */
50 	pqm = &process->pqm;
51 	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
52 		uint64_t tmp = process->exception_enable_mask;
53 
54 		if (!pqn->q)
55 			continue;
56 
57 		tmp &= pqn->q->properties.exception_status;
58 
59 		if (!tmp)
60 			continue;
61 
62 		*event_status = pqn->q->properties.exception_status;
63 		*queue_id = pqn->q->properties.queue_id;
64 		*gpu_id = pqn->q->device->id;
65 		pqn->q->properties.exception_status &= ~exception_clear_mask;
66 		goto out;
67 	}
68 
69 	/* find and report device events */
70 	for (i = 0; i < process->n_pdds; i++) {
71 		struct kfd_process_device *pdd = process->pdds[i];
72 		uint64_t tmp = process->exception_enable_mask
73 						& pdd->exception_status;
74 
75 		if (!tmp)
76 			continue;
77 
78 		*event_status = pdd->exception_status;
79 		*gpu_id = pdd->dev->id;
80 		pdd->exception_status &= ~exception_clear_mask;
81 		goto out;
82 	}
83 
84 	/* report process events */
85 	if (process->exception_enable_mask & process->exception_status) {
86 		*event_status = process->exception_status;
87 		process->exception_status &= ~exception_clear_mask;
88 	}
89 
90 out:
91 	mutex_unlock(&process->event_mutex);
92 	return *event_status ? 0 : -EAGAIN;
93 }
94 
95 void debug_event_write_work_handler(struct work_struct *work)
96 {
97 	struct kfd_process *process;
98 
99 	static const char write_data = '.';
100 	loff_t pos = 0;
101 
102 	process = container_of(work,
103 			struct kfd_process,
104 			debug_event_workarea);
105 
106 	kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
107 }
108 
109 /* update process/device/queue exception status, write to descriptor
110  * only if exception_status is enabled.
111  */
112 bool kfd_dbg_ev_raise(uint64_t event_mask,
113 			struct kfd_process *process, struct kfd_node *dev,
114 			unsigned int source_id, bool use_worker,
115 			void *exception_data, size_t exception_data_size)
116 {
117 	struct process_queue_manager *pqm;
118 	struct process_queue_node *pqn;
119 	int i;
120 	static const char write_data = '.';
121 	loff_t pos = 0;
122 	bool is_subscribed = true;
123 
124 	if (!(process && process->debug_trap_enabled))
125 		return false;
126 
127 	mutex_lock(&process->event_mutex);
128 
129 	if (event_mask & KFD_EC_MASK_DEVICE) {
130 		for (i = 0; i < process->n_pdds; i++) {
131 			struct kfd_process_device *pdd = process->pdds[i];
132 
133 			if (pdd->dev != dev)
134 				continue;
135 
136 			pdd->exception_status |= event_mask & KFD_EC_MASK_DEVICE;
137 
138 			if (event_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
139 				if (!pdd->vm_fault_exc_data) {
140 					pdd->vm_fault_exc_data = kmemdup(
141 							exception_data,
142 							exception_data_size,
143 							GFP_KERNEL);
144 					if (!pdd->vm_fault_exc_data)
145 						pr_debug("Failed to allocate exception data memory");
146 				} else {
147 					pr_debug("Debugger exception data not saved\n");
148 					print_hex_dump_bytes("exception data: ",
149 							DUMP_PREFIX_OFFSET,
150 							exception_data,
151 							exception_data_size);
152 				}
153 			}
154 			break;
155 		}
156 	} else if (event_mask & KFD_EC_MASK_PROCESS) {
157 		process->exception_status |= event_mask & KFD_EC_MASK_PROCESS;
158 	} else {
159 		pqm = &process->pqm;
160 		list_for_each_entry(pqn, &pqm->queues,
161 				process_queue_list) {
162 			int target_id;
163 
164 			if (!pqn->q)
165 				continue;
166 
167 			target_id = event_mask & KFD_EC_MASK(EC_QUEUE_NEW) ?
168 					pqn->q->properties.queue_id :
169 							pqn->q->doorbell_id;
170 
171 			if (pqn->q->device != dev || target_id != source_id)
172 				continue;
173 
174 			pqn->q->properties.exception_status |= event_mask;
175 			break;
176 		}
177 	}
178 
179 	if (process->exception_enable_mask & event_mask) {
180 		if (use_worker)
181 			schedule_work(&process->debug_event_workarea);
182 		else
183 			kernel_write(process->dbg_ev_file,
184 					&write_data,
185 					1,
186 					&pos);
187 	} else {
188 		is_subscribed = false;
189 	}
190 
191 	mutex_unlock(&process->event_mutex);
192 
193 	return is_subscribed;
194 }
195 
196 /* set pending event queue entry from ring entry  */
197 bool kfd_set_dbg_ev_from_interrupt(struct kfd_node *dev,
198 				   unsigned int pasid,
199 				   uint32_t doorbell_id,
200 				   uint64_t trap_mask,
201 				   void *exception_data,
202 				   size_t exception_data_size)
203 {
204 	struct kfd_process *p;
205 	bool signaled_to_debugger_or_runtime = false;
206 
207 	p = kfd_lookup_process_by_pasid(pasid);
208 
209 	if (!p)
210 		return false;
211 
212 	if (!kfd_dbg_ev_raise(trap_mask, p, dev, doorbell_id, true,
213 			      exception_data, exception_data_size)) {
214 		struct process_queue_manager *pqm;
215 		struct process_queue_node *pqn;
216 
217 		if (!!(trap_mask & KFD_EC_MASK_QUEUE) &&
218 		       p->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED) {
219 			mutex_lock(&p->mutex);
220 
221 			pqm = &p->pqm;
222 			list_for_each_entry(pqn, &pqm->queues,
223 							process_queue_list) {
224 
225 				if (!(pqn->q && pqn->q->device == dev &&
226 				      pqn->q->doorbell_id == doorbell_id))
227 					continue;
228 
229 				kfd_send_exception_to_runtime(p, pqn->q->properties.queue_id,
230 							      trap_mask);
231 
232 				signaled_to_debugger_or_runtime = true;
233 
234 				break;
235 			}
236 
237 			mutex_unlock(&p->mutex);
238 		} else if (trap_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
239 			kfd_dqm_evict_pasid(dev->dqm, p->pasid);
240 			kfd_signal_vm_fault_event(dev, p->pasid, NULL,
241 							exception_data);
242 
243 			signaled_to_debugger_or_runtime = true;
244 		}
245 	} else {
246 		signaled_to_debugger_or_runtime = true;
247 	}
248 
249 	kfd_unref_process(p);
250 
251 	return signaled_to_debugger_or_runtime;
252 }
253 
254 int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
255 					unsigned int dev_id,
256 					unsigned int queue_id,
257 					uint64_t error_reason)
258 {
259 	if (error_reason & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
260 		struct kfd_process_device *pdd = NULL;
261 		struct kfd_hsa_memory_exception_data *data;
262 		int i;
263 
264 		for (i = 0; i < p->n_pdds; i++) {
265 			if (p->pdds[i]->dev->id == dev_id) {
266 				pdd = p->pdds[i];
267 				break;
268 			}
269 		}
270 
271 		if (!pdd)
272 			return -ENODEV;
273 
274 		data = (struct kfd_hsa_memory_exception_data *)
275 						pdd->vm_fault_exc_data;
276 
277 		kfd_dqm_evict_pasid(pdd->dev->dqm, p->pasid);
278 		kfd_signal_vm_fault_event(pdd->dev, p->pasid, NULL, data);
279 		error_reason &= ~KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION);
280 	}
281 
282 	if (error_reason & (KFD_EC_MASK(EC_PROCESS_RUNTIME))) {
283 		/*
284 		 * block should only happen after the debugger receives runtime
285 		 * enable notice.
286 		 */
287 		up(&p->runtime_enable_sema);
288 		error_reason &= ~KFD_EC_MASK(EC_PROCESS_RUNTIME);
289 	}
290 
291 	if (error_reason)
292 		return kfd_send_exception_to_runtime(p, queue_id, error_reason);
293 
294 	return 0;
295 }
296 
297 static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable)
298 {
299 	struct mqd_update_info minfo = {0};
300 	int err;
301 
302 	if (!q)
303 		return 0;
304 
305 	if (!kfd_dbg_has_cwsr_workaround(q->device))
306 		return 0;
307 
308 	if (enable && q->properties.is_user_cu_masked)
309 		return -EBUSY;
310 
311 	minfo.update_flag = enable ? UPDATE_FLAG_DBG_WA_ENABLE : UPDATE_FLAG_DBG_WA_DISABLE;
312 
313 	q->properties.is_dbg_wa = enable;
314 	err = q->device->dqm->ops.update_queue(q->device->dqm, q, &minfo);
315 	if (err)
316 		q->properties.is_dbg_wa = false;
317 
318 	return err;
319 }
320 
321 static int kfd_dbg_set_workaround(struct kfd_process *target, bool enable)
322 {
323 	struct process_queue_manager *pqm = &target->pqm;
324 	struct process_queue_node *pqn;
325 	int r = 0;
326 
327 	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
328 		r = kfd_dbg_set_queue_workaround(pqn->q, enable);
329 		if (enable && r)
330 			goto unwind;
331 	}
332 
333 	return 0;
334 
335 unwind:
336 	list_for_each_entry(pqn, &pqm->queues, process_queue_list)
337 		kfd_dbg_set_queue_workaround(pqn->q, false);
338 
339 	if (enable)
340 		target->runtime_info.runtime_state = r == -EBUSY ?
341 				DEBUG_RUNTIME_STATE_ENABLED_BUSY :
342 				DEBUG_RUNTIME_STATE_ENABLED_ERROR;
343 
344 	return r;
345 }
346 
347 int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd)
348 {
349 	uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd->spi_dbg_launch_mode;
350 	uint32_t flags = pdd->process->dbg_flags;
351 	bool sq_trap_en = !!spi_dbg_cntl || !kfd_dbg_has_cwsr_workaround(pdd->dev);
352 
353 	if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
354 		return 0;
355 
356 	return amdgpu_mes_set_shader_debugger(pdd->dev->adev, pdd->proc_ctx_gpu_addr, spi_dbg_cntl,
357 						pdd->watch_points, flags, sq_trap_en);
358 }
359 
360 #define KFD_DEBUGGER_INVALID_WATCH_POINT_ID -1
361 static int kfd_dbg_get_dev_watch_id(struct kfd_process_device *pdd, int *watch_id)
362 {
363 	int i;
364 
365 	*watch_id = KFD_DEBUGGER_INVALID_WATCH_POINT_ID;
366 
367 	spin_lock(&pdd->dev->kfd->watch_points_lock);
368 
369 	for (i = 0; i < MAX_WATCH_ADDRESSES; i++) {
370 		/* device watchpoint in use so skip */
371 		if ((pdd->dev->kfd->alloc_watch_ids >> i) & 0x1)
372 			continue;
373 
374 		pdd->alloc_watch_ids |= 0x1 << i;
375 		pdd->dev->kfd->alloc_watch_ids |= 0x1 << i;
376 		*watch_id = i;
377 		spin_unlock(&pdd->dev->kfd->watch_points_lock);
378 		return 0;
379 	}
380 
381 	spin_unlock(&pdd->dev->kfd->watch_points_lock);
382 
383 	return -ENOMEM;
384 }
385 
386 static void kfd_dbg_clear_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
387 {
388 	spin_lock(&pdd->dev->kfd->watch_points_lock);
389 
390 	/* process owns device watch point so safe to clear */
391 	if ((pdd->alloc_watch_ids >> watch_id) & 0x1) {
392 		pdd->alloc_watch_ids &= ~(0x1 << watch_id);
393 		pdd->dev->kfd->alloc_watch_ids &= ~(0x1 << watch_id);
394 	}
395 
396 	spin_unlock(&pdd->dev->kfd->watch_points_lock);
397 }
398 
399 static bool kfd_dbg_owns_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
400 {
401 	bool owns_watch_id = false;
402 
403 	spin_lock(&pdd->dev->kfd->watch_points_lock);
404 	owns_watch_id = watch_id < MAX_WATCH_ADDRESSES &&
405 			((pdd->alloc_watch_ids >> watch_id) & 0x1);
406 
407 	spin_unlock(&pdd->dev->kfd->watch_points_lock);
408 
409 	return owns_watch_id;
410 }
411 
412 int kfd_dbg_trap_clear_dev_address_watch(struct kfd_process_device *pdd,
413 					uint32_t watch_id)
414 {
415 	int r;
416 
417 	if (!kfd_dbg_owns_dev_watch_id(pdd, watch_id))
418 		return -EINVAL;
419 
420 	if (!pdd->dev->kfd->shared_resources.enable_mes) {
421 		r = debug_lock_and_unmap(pdd->dev->dqm);
422 		if (r)
423 			return r;
424 	}
425 
426 	amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
427 	pdd->watch_points[watch_id] = pdd->dev->kfd2kgd->clear_address_watch(
428 							pdd->dev->adev,
429 							watch_id);
430 	amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
431 
432 	if (!pdd->dev->kfd->shared_resources.enable_mes)
433 		r = debug_map_and_unlock(pdd->dev->dqm);
434 	else
435 		r = kfd_dbg_set_mes_debug_mode(pdd);
436 
437 	kfd_dbg_clear_dev_watch_id(pdd, watch_id);
438 
439 	return r;
440 }
441 
442 int kfd_dbg_trap_set_dev_address_watch(struct kfd_process_device *pdd,
443 					uint64_t watch_address,
444 					uint32_t watch_address_mask,
445 					uint32_t *watch_id,
446 					uint32_t watch_mode)
447 {
448 	int r = kfd_dbg_get_dev_watch_id(pdd, watch_id);
449 
450 	if (r)
451 		return r;
452 
453 	if (!pdd->dev->kfd->shared_resources.enable_mes) {
454 		r = debug_lock_and_unmap(pdd->dev->dqm);
455 		if (r) {
456 			kfd_dbg_clear_dev_watch_id(pdd, *watch_id);
457 			return r;
458 		}
459 	}
460 
461 	amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
462 	pdd->watch_points[*watch_id] = pdd->dev->kfd2kgd->set_address_watch(
463 				pdd->dev->adev,
464 				watch_address,
465 				watch_address_mask,
466 				*watch_id,
467 				watch_mode,
468 				pdd->dev->vm_info.last_vmid_kfd);
469 	amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
470 
471 	if (!pdd->dev->kfd->shared_resources.enable_mes)
472 		r = debug_map_and_unlock(pdd->dev->dqm);
473 	else
474 		r = kfd_dbg_set_mes_debug_mode(pdd);
475 
476 	/* HWS is broken so no point in HW rollback but release the watchpoint anyways */
477 	if (r)
478 		kfd_dbg_clear_dev_watch_id(pdd, *watch_id);
479 
480 	return 0;
481 }
482 
483 static void kfd_dbg_clear_process_address_watch(struct kfd_process *target)
484 {
485 	int i, j;
486 
487 	for (i = 0; i < target->n_pdds; i++)
488 		for (j = 0; j < MAX_WATCH_ADDRESSES; j++)
489 			kfd_dbg_trap_clear_dev_address_watch(target->pdds[i], j);
490 }
491 
492 int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags)
493 {
494 	uint32_t prev_flags = target->dbg_flags;
495 	int i, r = 0, rewind_count = 0;
496 
497 	for (i = 0; i < target->n_pdds; i++) {
498 		if (!kfd_dbg_is_per_vmid_supported(target->pdds[i]->dev) &&
499 			(*flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP)) {
500 			*flags = prev_flags;
501 			return -EACCES;
502 		}
503 	}
504 
505 	target->dbg_flags = *flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP;
506 	*flags = prev_flags;
507 	for (i = 0; i < target->n_pdds; i++) {
508 		struct kfd_process_device *pdd = target->pdds[i];
509 
510 		if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
511 			continue;
512 
513 		if (!pdd->dev->kfd->shared_resources.enable_mes)
514 			r = debug_refresh_runlist(pdd->dev->dqm);
515 		else
516 			r = kfd_dbg_set_mes_debug_mode(pdd);
517 
518 		if (r) {
519 			target->dbg_flags = prev_flags;
520 			break;
521 		}
522 
523 		rewind_count++;
524 	}
525 
526 	/* Rewind flags */
527 	if (r) {
528 		target->dbg_flags = prev_flags;
529 
530 		for (i = 0; i < rewind_count; i++) {
531 			struct kfd_process_device *pdd = target->pdds[i];
532 
533 			if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
534 				continue;
535 
536 			if (!pdd->dev->kfd->shared_resources.enable_mes)
537 				debug_refresh_runlist(pdd->dev->dqm);
538 			else
539 				kfd_dbg_set_mes_debug_mode(pdd);
540 		}
541 	}
542 
543 	return r;
544 }
545 
546 /* kfd_dbg_trap_deactivate:
547  *	target: target process
548  *	unwind: If this is unwinding a failed kfd_dbg_trap_enable()
549  *	unwind_count:
550  *		If unwind == true, how far down the pdd list we need
551  *				to unwind
552  *		else: ignored
553  */
554 void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count)
555 {
556 	int i;
557 
558 	if (!unwind) {
559 		uint32_t flags = 0;
560 		int resume_count = resume_queues(target, 0, NULL);
561 
562 		if (resume_count)
563 			pr_debug("Resumed %d queues\n", resume_count);
564 
565 		cancel_work_sync(&target->debug_event_workarea);
566 		kfd_dbg_clear_process_address_watch(target);
567 		kfd_dbg_trap_set_wave_launch_mode(target, 0);
568 
569 		kfd_dbg_trap_set_flags(target, &flags);
570 	}
571 
572 	for (i = 0; i < target->n_pdds; i++) {
573 		struct kfd_process_device *pdd = target->pdds[i];
574 
575 		/* If this is an unwind, and we have unwound the required
576 		 * enable calls on the pdd list, we need to stop now
577 		 * otherwise we may mess up another debugger session.
578 		 */
579 		if (unwind && i == unwind_count)
580 			break;
581 
582 		kfd_process_set_trap_debug_flag(&pdd->qpd, false);
583 
584 		/* GFX off is already disabled by debug activate if not RLC restore supported. */
585 		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
586 			amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
587 		pdd->spi_dbg_override =
588 				pdd->dev->kfd2kgd->disable_debug_trap(
589 				pdd->dev->adev,
590 				target->runtime_info.ttmp_setup,
591 				pdd->dev->vm_info.last_vmid_kfd);
592 		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
593 
594 		if (!kfd_dbg_is_per_vmid_supported(pdd->dev) &&
595 				release_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd))
596 			pr_err("Failed to release debug vmid on [%i]\n", pdd->dev->id);
597 
598 		if (!pdd->dev->kfd->shared_resources.enable_mes)
599 			debug_refresh_runlist(pdd->dev->dqm);
600 		else
601 			kfd_dbg_set_mes_debug_mode(pdd);
602 	}
603 
604 	kfd_dbg_set_workaround(target, false);
605 }
606 
607 static void kfd_dbg_clean_exception_status(struct kfd_process *target)
608 {
609 	struct process_queue_manager *pqm;
610 	struct process_queue_node *pqn;
611 	int i;
612 
613 	for (i = 0; i < target->n_pdds; i++) {
614 		struct kfd_process_device *pdd = target->pdds[i];
615 
616 		kfd_process_drain_interrupts(pdd);
617 
618 		pdd->exception_status = 0;
619 	}
620 
621 	pqm = &target->pqm;
622 	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
623 		if (!pqn->q)
624 			continue;
625 
626 		pqn->q->properties.exception_status = 0;
627 	}
628 
629 	target->exception_status = 0;
630 }
631 
632 int kfd_dbg_trap_disable(struct kfd_process *target)
633 {
634 	if (!target->debug_trap_enabled)
635 		return 0;
636 
637 	/*
638 	 * Defer deactivation to runtime if runtime not enabled otherwise reset
639 	 * attached running target runtime state to enable for re-attach.
640 	 */
641 	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
642 		kfd_dbg_trap_deactivate(target, false, 0);
643 	else if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED)
644 		target->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED;
645 
646 	fput(target->dbg_ev_file);
647 	target->dbg_ev_file = NULL;
648 
649 	if (target->debugger_process) {
650 		atomic_dec(&target->debugger_process->debugged_process_count);
651 		target->debugger_process = NULL;
652 	}
653 
654 	target->debug_trap_enabled = false;
655 	kfd_dbg_clean_exception_status(target);
656 	kfd_unref_process(target);
657 
658 	return 0;
659 }
660 
661 int kfd_dbg_trap_activate(struct kfd_process *target)
662 {
663 	int i, r = 0;
664 
665 	r = kfd_dbg_set_workaround(target, true);
666 	if (r)
667 		return r;
668 
669 	for (i = 0; i < target->n_pdds; i++) {
670 		struct kfd_process_device *pdd = target->pdds[i];
671 
672 		if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) {
673 			r = reserve_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd);
674 
675 			if (r) {
676 				target->runtime_info.runtime_state = (r == -EBUSY) ?
677 							DEBUG_RUNTIME_STATE_ENABLED_BUSY :
678 							DEBUG_RUNTIME_STATE_ENABLED_ERROR;
679 
680 				goto unwind_err;
681 			}
682 		}
683 
684 		/* Disable GFX OFF to prevent garbage read/writes to debug registers.
685 		 * If RLC restore of debug registers is not supported and runtime enable
686 		 * hasn't done so already on ttmp setup request, restore the trap config registers.
687 		 *
688 		 * If RLC restore of debug registers is not supported, keep gfx off disabled for
689 		 * the debug session.
690 		 */
691 		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
692 		if (!(kfd_dbg_is_rlc_restore_supported(pdd->dev) ||
693 						target->runtime_info.ttmp_setup))
694 			pdd->dev->kfd2kgd->enable_debug_trap(pdd->dev->adev, true,
695 								pdd->dev->vm_info.last_vmid_kfd);
696 
697 		pdd->spi_dbg_override = pdd->dev->kfd2kgd->enable_debug_trap(
698 					pdd->dev->adev,
699 					false,
700 					pdd->dev->vm_info.last_vmid_kfd);
701 
702 		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
703 			amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
704 
705 		/*
706 		 * Setting the debug flag in the trap handler requires that the TMA has been
707 		 * allocated, which occurs during CWSR initialization.
708 		 * In the event that CWSR has not been initialized at this point, setting the
709 		 * flag will be called again during CWSR initialization if the target process
710 		 * is still debug enabled.
711 		 */
712 		kfd_process_set_trap_debug_flag(&pdd->qpd, true);
713 
714 		if (!pdd->dev->kfd->shared_resources.enable_mes)
715 			r = debug_refresh_runlist(pdd->dev->dqm);
716 		else
717 			r = kfd_dbg_set_mes_debug_mode(pdd);
718 
719 		if (r) {
720 			target->runtime_info.runtime_state =
721 					DEBUG_RUNTIME_STATE_ENABLED_ERROR;
722 			goto unwind_err;
723 		}
724 	}
725 
726 	return 0;
727 
728 unwind_err:
729 	/* Enabling debug failed, we need to disable on
730 	 * all GPUs so the enable is all or nothing.
731 	 */
732 	kfd_dbg_trap_deactivate(target, true, i);
733 	return r;
734 }
735 
736 int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
737 			void __user *runtime_info, uint32_t *runtime_size)
738 {
739 	struct file *f;
740 	uint32_t copy_size;
741 	int i, r = 0;
742 
743 	if (target->debug_trap_enabled)
744 		return -EALREADY;
745 
746 	/* Enable pre-checks */
747 	for (i = 0; i < target->n_pdds; i++) {
748 		struct kfd_process_device *pdd = target->pdds[i];
749 
750 		if (!KFD_IS_SOC15(pdd->dev))
751 			return -ENODEV;
752 
753 		if (!kfd_dbg_has_gws_support(pdd->dev) && pdd->qpd.num_gws)
754 			return -EBUSY;
755 	}
756 
757 	copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info));
758 
759 	f = fget(fd);
760 	if (!f) {
761 		pr_err("Failed to get file for (%i)\n", fd);
762 		return -EBADF;
763 	}
764 
765 	target->dbg_ev_file = f;
766 
767 	/* defer activation to runtime if not runtime enabled */
768 	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
769 		kfd_dbg_trap_activate(target);
770 
771 	/* We already hold the process reference but hold another one for the
772 	 * debug session.
773 	 */
774 	kref_get(&target->ref);
775 	target->debug_trap_enabled = true;
776 
777 	if (target->debugger_process)
778 		atomic_inc(&target->debugger_process->debugged_process_count);
779 
780 	if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size)) {
781 		kfd_dbg_trap_deactivate(target, false, 0);
782 		r = -EFAULT;
783 	}
784 
785 	*runtime_size = sizeof(target->runtime_info);
786 
787 	return r;
788 }
789 
790 static int kfd_dbg_validate_trap_override_request(struct kfd_process *p,
791 						uint32_t trap_override,
792 						uint32_t trap_mask_request,
793 						uint32_t *trap_mask_supported)
794 {
795 	int i = 0;
796 
797 	*trap_mask_supported = 0xffffffff;
798 
799 	for (i = 0; i < p->n_pdds; i++) {
800 		struct kfd_process_device *pdd = p->pdds[i];
801 		int err = pdd->dev->kfd2kgd->validate_trap_override_request(
802 								pdd->dev->adev,
803 								trap_override,
804 								trap_mask_supported);
805 
806 		if (err)
807 			return err;
808 	}
809 
810 	if (trap_mask_request & ~*trap_mask_supported)
811 		return -EACCES;
812 
813 	return 0;
814 }
815 
816 int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target,
817 					uint32_t trap_override,
818 					uint32_t trap_mask_bits,
819 					uint32_t trap_mask_request,
820 					uint32_t *trap_mask_prev,
821 					uint32_t *trap_mask_supported)
822 {
823 	int r = 0, i;
824 
825 	r = kfd_dbg_validate_trap_override_request(target,
826 						trap_override,
827 						trap_mask_request,
828 						trap_mask_supported);
829 
830 	if (r)
831 		return r;
832 
833 	for (i = 0; i < target->n_pdds; i++) {
834 		struct kfd_process_device *pdd = target->pdds[i];
835 
836 		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
837 		pdd->spi_dbg_override = pdd->dev->kfd2kgd->set_wave_launch_trap_override(
838 				pdd->dev->adev,
839 				pdd->dev->vm_info.last_vmid_kfd,
840 				trap_override,
841 				trap_mask_bits,
842 				trap_mask_request,
843 				trap_mask_prev,
844 				pdd->spi_dbg_override);
845 		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
846 
847 		if (!pdd->dev->kfd->shared_resources.enable_mes)
848 			r = debug_refresh_runlist(pdd->dev->dqm);
849 		else
850 			r = kfd_dbg_set_mes_debug_mode(pdd);
851 
852 		if (r)
853 			break;
854 	}
855 
856 	return r;
857 }
858 
859 int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process *target,
860 					uint8_t wave_launch_mode)
861 {
862 	int r = 0, i;
863 
864 	if (wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_NORMAL &&
865 			wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_HALT &&
866 			wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_DEBUG)
867 		return -EINVAL;
868 
869 	for (i = 0; i < target->n_pdds; i++) {
870 		struct kfd_process_device *pdd = target->pdds[i];
871 
872 		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
873 		pdd->spi_dbg_launch_mode = pdd->dev->kfd2kgd->set_wave_launch_mode(
874 				pdd->dev->adev,
875 				wave_launch_mode,
876 				pdd->dev->vm_info.last_vmid_kfd);
877 		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
878 
879 		if (!pdd->dev->kfd->shared_resources.enable_mes)
880 			r = debug_refresh_runlist(pdd->dev->dqm);
881 		else
882 			r = kfd_dbg_set_mes_debug_mode(pdd);
883 
884 		if (r)
885 			break;
886 	}
887 
888 	return r;
889 }
890 
891 int kfd_dbg_trap_query_exception_info(struct kfd_process *target,
892 		uint32_t source_id,
893 		uint32_t exception_code,
894 		bool clear_exception,
895 		void __user *info,
896 		uint32_t *info_size)
897 {
898 	bool found = false;
899 	int r = 0;
900 	uint32_t copy_size, actual_info_size = 0;
901 	uint64_t *exception_status_ptr = NULL;
902 
903 	if (!target)
904 		return -EINVAL;
905 
906 	if (!info || !info_size)
907 		return -EINVAL;
908 
909 	mutex_lock(&target->event_mutex);
910 
911 	if (KFD_DBG_EC_TYPE_IS_QUEUE(exception_code)) {
912 		/* Per queue exceptions */
913 		struct queue *queue = NULL;
914 		int i;
915 
916 		for (i = 0; i < target->n_pdds; i++) {
917 			struct kfd_process_device *pdd = target->pdds[i];
918 			struct qcm_process_device *qpd = &pdd->qpd;
919 
920 			list_for_each_entry(queue, &qpd->queues_list, list) {
921 				if (!found && queue->properties.queue_id == source_id) {
922 					found = true;
923 					break;
924 				}
925 			}
926 			if (found)
927 				break;
928 		}
929 
930 		if (!found) {
931 			r = -EINVAL;
932 			goto out;
933 		}
934 
935 		if (!(queue->properties.exception_status & KFD_EC_MASK(exception_code))) {
936 			r = -ENODATA;
937 			goto out;
938 		}
939 		exception_status_ptr = &queue->properties.exception_status;
940 	} else if (KFD_DBG_EC_TYPE_IS_DEVICE(exception_code)) {
941 		/* Per device exceptions */
942 		struct kfd_process_device *pdd = NULL;
943 		int i;
944 
945 		for (i = 0; i < target->n_pdds; i++) {
946 			pdd = target->pdds[i];
947 			if (pdd->dev->id == source_id) {
948 				found = true;
949 				break;
950 			}
951 		}
952 
953 		if (!found) {
954 			r = -EINVAL;
955 			goto out;
956 		}
957 
958 		if (!(pdd->exception_status & KFD_EC_MASK(exception_code))) {
959 			r = -ENODATA;
960 			goto out;
961 		}
962 
963 		if (exception_code == EC_DEVICE_MEMORY_VIOLATION) {
964 			copy_size = min((size_t)(*info_size), pdd->vm_fault_exc_data_size);
965 
966 			if (copy_to_user(info, pdd->vm_fault_exc_data, copy_size)) {
967 				r = -EFAULT;
968 				goto out;
969 			}
970 			actual_info_size = pdd->vm_fault_exc_data_size;
971 			if (clear_exception) {
972 				kfree(pdd->vm_fault_exc_data);
973 				pdd->vm_fault_exc_data = NULL;
974 				pdd->vm_fault_exc_data_size = 0;
975 			}
976 		}
977 		exception_status_ptr = &pdd->exception_status;
978 	} else if (KFD_DBG_EC_TYPE_IS_PROCESS(exception_code)) {
979 		/* Per process exceptions */
980 		if (!(target->exception_status & KFD_EC_MASK(exception_code))) {
981 			r = -ENODATA;
982 			goto out;
983 		}
984 
985 		if (exception_code == EC_PROCESS_RUNTIME) {
986 			copy_size = min((size_t)(*info_size), sizeof(target->runtime_info));
987 
988 			if (copy_to_user(info, (void *)&target->runtime_info, copy_size)) {
989 				r = -EFAULT;
990 				goto out;
991 			}
992 
993 			actual_info_size = sizeof(target->runtime_info);
994 		}
995 
996 		exception_status_ptr = &target->exception_status;
997 	} else {
998 		pr_debug("Bad exception type [%i]\n", exception_code);
999 		r = -EINVAL;
1000 		goto out;
1001 	}
1002 
1003 	*info_size = actual_info_size;
1004 	if (clear_exception)
1005 		*exception_status_ptr &= ~KFD_EC_MASK(exception_code);
1006 out:
1007 	mutex_unlock(&target->event_mutex);
1008 	return r;
1009 }
1010 
1011 int kfd_dbg_trap_device_snapshot(struct kfd_process *target,
1012 		uint64_t exception_clear_mask,
1013 		void __user *user_info,
1014 		uint32_t *number_of_device_infos,
1015 		uint32_t *entry_size)
1016 {
1017 	struct kfd_dbg_device_info_entry device_info;
1018 	uint32_t tmp_entry_size = *entry_size, tmp_num_devices;
1019 	int i, r = 0;
1020 
1021 	if (!(target && user_info && number_of_device_infos && entry_size))
1022 		return -EINVAL;
1023 
1024 	tmp_num_devices = min_t(size_t, *number_of_device_infos, target->n_pdds);
1025 	*number_of_device_infos = target->n_pdds;
1026 	*entry_size = min_t(size_t, *entry_size, sizeof(device_info));
1027 
1028 	if (!tmp_num_devices)
1029 		return 0;
1030 
1031 	memset(&device_info, 0, sizeof(device_info));
1032 
1033 	mutex_lock(&target->event_mutex);
1034 
1035 	/* Run over all pdd of the process */
1036 	for (i = 0; i < tmp_num_devices; i++) {
1037 		struct kfd_process_device *pdd = target->pdds[i];
1038 		struct kfd_topology_device *topo_dev = kfd_topology_device_by_id(pdd->dev->id);
1039 
1040 		device_info.gpu_id = pdd->dev->id;
1041 		device_info.exception_status = pdd->exception_status;
1042 		device_info.lds_base = pdd->lds_base;
1043 		device_info.lds_limit = pdd->lds_limit;
1044 		device_info.scratch_base = pdd->scratch_base;
1045 		device_info.scratch_limit = pdd->scratch_limit;
1046 		device_info.gpuvm_base = pdd->gpuvm_base;
1047 		device_info.gpuvm_limit = pdd->gpuvm_limit;
1048 		device_info.location_id = topo_dev->node_props.location_id;
1049 		device_info.vendor_id = topo_dev->node_props.vendor_id;
1050 		device_info.device_id = topo_dev->node_props.device_id;
1051 		device_info.revision_id = pdd->dev->adev->pdev->revision;
1052 		device_info.subsystem_vendor_id = pdd->dev->adev->pdev->subsystem_vendor;
1053 		device_info.subsystem_device_id = pdd->dev->adev->pdev->subsystem_device;
1054 		device_info.fw_version = pdd->dev->kfd->mec_fw_version;
1055 		device_info.gfx_target_version =
1056 			topo_dev->node_props.gfx_target_version;
1057 		device_info.simd_count = topo_dev->node_props.simd_count;
1058 		device_info.max_waves_per_simd =
1059 			topo_dev->node_props.max_waves_per_simd;
1060 		device_info.array_count = topo_dev->node_props.array_count;
1061 		device_info.simd_arrays_per_engine =
1062 			topo_dev->node_props.simd_arrays_per_engine;
1063 		device_info.num_xcc = NUM_XCC(pdd->dev->xcc_mask);
1064 		device_info.capability = topo_dev->node_props.capability;
1065 		device_info.debug_prop = topo_dev->node_props.debug_prop;
1066 
1067 		if (exception_clear_mask)
1068 			pdd->exception_status &= ~exception_clear_mask;
1069 
1070 		if (copy_to_user(user_info, &device_info, *entry_size)) {
1071 			r = -EFAULT;
1072 			break;
1073 		}
1074 
1075 		user_info += tmp_entry_size;
1076 	}
1077 
1078 	mutex_unlock(&target->event_mutex);
1079 
1080 	return r;
1081 }
1082 
1083 void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
1084 					uint64_t exception_set_mask)
1085 {
1086 	uint64_t found_mask = 0;
1087 	struct process_queue_manager *pqm;
1088 	struct process_queue_node *pqn;
1089 	static const char write_data = '.';
1090 	loff_t pos = 0;
1091 	int i;
1092 
1093 	mutex_lock(&target->event_mutex);
1094 
1095 	found_mask |= target->exception_status;
1096 
1097 	pqm = &target->pqm;
1098 	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
1099 		if (!pqn->q)
1100 			continue;
1101 
1102 		found_mask |= pqn->q->properties.exception_status;
1103 	}
1104 
1105 	for (i = 0; i < target->n_pdds; i++) {
1106 		struct kfd_process_device *pdd = target->pdds[i];
1107 
1108 		found_mask |= pdd->exception_status;
1109 	}
1110 
1111 	if (exception_set_mask & found_mask)
1112 		kernel_write(target->dbg_ev_file, &write_data, 1, &pos);
1113 
1114 	target->exception_enable_mask = exception_set_mask;
1115 
1116 	mutex_unlock(&target->event_mutex);
1117 }
1118