/*
 * Copyright 2023 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "kfd_debug.h"
#include "kfd_device_queue_manager.h"
#include "kfd_topology.h"
#include <linux/file.h>
#include <uapi/linux/kfd_ioctl.h>
#include <uapi/linux/kfd_sysfs.h>

#define MAX_WATCH_ADDRESSES	4

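/* kfd_dbg_ev_query_debug_event:
 *	Report the first pending exception that the debugger is subscribed to,
 *	checking queue events first, then per-device events, then process
 *	events.  Status bits in exception_clear_mask are cleared on the
 *	reported source.  Returns 0 if an event was found, -EAGAIN if nothing
 *	is pending and -ENODATA if debug trap is not enabled on the process.
 */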
int kfd_dbg_ev_query_debug_event(struct kfd_process *process,
		      unsigned int *queue_id,
		      unsigned int *gpu_id,
		      uint64_t exception_clear_mask,
		      uint64_t *event_status)
{
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	int i;

	if (!(process && process->debug_trap_enabled))
		return -ENODATA;

	mutex_lock(&process->event_mutex);
	*event_status = 0;
	*queue_id = 0;
	*gpu_id = 0;

	/* find and report queue events */
	pqm = &process->pqm;
	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		uint64_t tmp = process->exception_enable_mask;

		if (!pqn->q)
			continue;

		tmp &= pqn->q->properties.exception_status;

		if (!tmp)
			continue;

		*event_status = pqn->q->properties.exception_status;
		*queue_id = pqn->q->properties.queue_id;
		*gpu_id = pqn->q->device->id;
		pqn->q->properties.exception_status &= ~exception_clear_mask;
		goto out;
	}

	/* find and report device events */
	for (i = 0; i < process->n_pdds; i++) {
		struct kfd_process_device *pdd = process->pdds[i];
		uint64_t tmp = process->exception_enable_mask
						& pdd->exception_status;

		if (!tmp)
			continue;

		*event_status = pdd->exception_status;
		*gpu_id = pdd->dev->id;
		pdd->exception_status &= ~exception_clear_mask;
		goto out;
	}

	/* report process events */
	if (process->exception_enable_mask & process->exception_status) {
		*event_status = process->exception_status;
		process->exception_status &= ~exception_clear_mask;
	}

out:
	mutex_unlock(&process->event_mutex);
	return *event_status ? 0 : -EAGAIN;
}

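/* debug_event_write_work_handler:
 *	Deferred work item that wakes the debugger by writing a single byte to
 *	the polled debug event file descriptor.  Used when an exception is
 *	raised with use_worker set, e.g. from the interrupt handling path in
 *	kfd_set_dbg_ev_from_interrupt().
 */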
void debug_event_write_work_handler(struct work_struct *work)
{
	struct kfd_process *process;

	static const char write_data = '.';
	loff_t pos = 0;

	process = container_of(work,
			struct kfd_process,
			debug_event_workarea);

	if (process->debug_trap_enabled && process->dbg_ev_file)
		kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
}

/* Update process/device/queue exception status; write to the event file
 * descriptor only if the raised exception is enabled in the process's
 * exception_enable_mask.
 */
bool kfd_dbg_ev_raise(uint64_t event_mask,
			struct kfd_process *process, struct kfd_node *dev,
			unsigned int source_id, bool use_worker,
			void *exception_data, size_t exception_data_size)
{
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	int i;
	static const char write_data = '.';
	loff_t pos = 0;
	bool is_subscribed = true;

	if (!(process && process->debug_trap_enabled))
		return false;

	mutex_lock(&process->event_mutex);

	if (event_mask & KFD_EC_MASK_DEVICE) {
		for (i = 0; i < process->n_pdds; i++) {
			struct kfd_process_device *pdd = process->pdds[i];

			if (pdd->dev != dev)
				continue;

			pdd->exception_status |= event_mask & KFD_EC_MASK_DEVICE;

			if (event_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
				if (!pdd->vm_fault_exc_data) {
					pdd->vm_fault_exc_data = kmemdup(
							exception_data,
							exception_data_size,
							GFP_KERNEL);
					if (!pdd->vm_fault_exc_data)
						pr_debug("Failed to allocate exception data memory\n");
				} else {
					pr_debug("Debugger exception data not saved\n");
					print_hex_dump_bytes("exception data: ",
							DUMP_PREFIX_OFFSET,
							exception_data,
							exception_data_size);
				}
			}
			break;
		}
	} else if (event_mask & KFD_EC_MASK_PROCESS) {
		process->exception_status |= event_mask & KFD_EC_MASK_PROCESS;
	} else {
		pqm = &process->pqm;
		list_for_each_entry(pqn, &pqm->queues,
				process_queue_list) {
			int target_id;

			if (!pqn->q)
				continue;

			target_id = event_mask & KFD_EC_MASK(EC_QUEUE_NEW) ?
					pqn->q->properties.queue_id :
							pqn->q->doorbell_id;

			if (pqn->q->device != dev || target_id != source_id)
				continue;

			pqn->q->properties.exception_status |= event_mask;
			break;
		}
	}

	if (process->exception_enable_mask & event_mask) {
		if (use_worker)
			schedule_work(&process->debug_event_workarea);
		else
			kernel_write(process->dbg_ev_file,
					&write_data,
					1,
					&pos);
	} else {
		is_subscribed = false;
	}

	mutex_unlock(&process->event_mutex);

	return is_subscribed;
}

/* set pending event queue entry from ring entry */
bool kfd_set_dbg_ev_from_interrupt(struct kfd_node *dev,
				   unsigned int pasid,
				   uint32_t doorbell_id,
				   uint64_t trap_mask,
				   void *exception_data,
				   size_t exception_data_size)
{
	struct kfd_process *p;
	bool signaled_to_debugger_or_runtime = false;

	p = kfd_lookup_process_by_pasid(pasid);

	if (!p)
		return false;

	if (!kfd_dbg_ev_raise(trap_mask, p, dev, doorbell_id, true,
			      exception_data, exception_data_size)) {
		struct process_queue_manager *pqm;
		struct process_queue_node *pqn;

		if (!!(trap_mask & KFD_EC_MASK_QUEUE) &&
		       p->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED) {
			mutex_lock(&p->mutex);

			pqm = &p->pqm;
			list_for_each_entry(pqn, &pqm->queues,
							process_queue_list) {

				if (!(pqn->q && pqn->q->device == dev &&
				      pqn->q->doorbell_id == doorbell_id))
					continue;

				kfd_send_exception_to_runtime(p, pqn->q->properties.queue_id,
							      trap_mask);

				signaled_to_debugger_or_runtime = true;

				break;
			}

			mutex_unlock(&p->mutex);
		} else if (trap_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
			kfd_dqm_evict_pasid(dev->dqm, p->pasid);
			kfd_signal_vm_fault_event(dev, p->pasid, NULL,
							exception_data);

			signaled_to_debugger_or_runtime = true;
		}
	} else {
		signaled_to_debugger_or_runtime = true;
	}

	kfd_unref_process(p);

	return signaled_to_debugger_or_runtime;
}

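/* kfd_dbg_send_exception_to_runtime:
 *	Dispatch the given exception reasons for runtime handling: a saved
 *	device memory violation is replayed as a VM fault event (after
 *	evicting the PASID's queues), EC_PROCESS_RUNTIME releases the runtime
 *	enable semaphore, and any remaining reasons are forwarded through
 *	kfd_send_exception_to_runtime().
 */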
int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
					unsigned int dev_id,
					unsigned int queue_id,
					uint64_t error_reason)
{
	if (error_reason & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
		struct kfd_process_device *pdd = NULL;
		struct kfd_hsa_memory_exception_data *data;
		int i;

		for (i = 0; i < p->n_pdds; i++) {
			if (p->pdds[i]->dev->id == dev_id) {
				pdd = p->pdds[i];
				break;
			}
		}

		if (!pdd)
			return -ENODEV;

		data = (struct kfd_hsa_memory_exception_data *)
						pdd->vm_fault_exc_data;

		kfd_dqm_evict_pasid(pdd->dev->dqm, p->pasid);
		kfd_signal_vm_fault_event(pdd->dev, p->pasid, NULL, data);
		error_reason &= ~KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION);
	}

	if (error_reason & (KFD_EC_MASK(EC_PROCESS_RUNTIME))) {
		/*
		 * Blocking on the runtime enable semaphore should only happen
		 * after the debugger has received the runtime enable notice.
		 */
		up(&p->runtime_enable_sema);
		error_reason &= ~KFD_EC_MASK(EC_PROCESS_RUNTIME);
	}

	if (error_reason)
		return kfd_send_exception_to_runtime(p, queue_id, error_reason);

	return 0;
}

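/* kfd_dbg_set_queue_workaround:
 *	Toggle the CWSR debug workaround on a single queue by updating its MQD.
 *	No-op for queues on devices that do not need the workaround; enabling
 *	fails with -EBUSY if the user has already applied a CU mask.
 */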
static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable)
{
	struct mqd_update_info minfo = {0};
	int err;

	if (!q)
		return 0;

	if (!kfd_dbg_has_cwsr_workaround(q->device))
		return 0;

	if (enable && q->properties.is_user_cu_masked)
		return -EBUSY;

	minfo.update_flag = enable ? UPDATE_FLAG_DBG_WA_ENABLE : UPDATE_FLAG_DBG_WA_DISABLE;

	q->properties.is_dbg_wa = enable;
	err = q->device->dqm->ops.update_queue(q->device->dqm, q, &minfo);
	if (err)
		q->properties.is_dbg_wa = false;

	return err;
}

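/* kfd_dbg_set_workaround:
 *	Apply or clear the CWSR debug workaround on all queues of the target
 *	process.  If enabling fails part way, already updated queues are
 *	rolled back and the runtime state is marked busy or errored.
 */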
static int kfd_dbg_set_workaround(struct kfd_process *target, bool enable)
{
	struct process_queue_manager *pqm = &target->pqm;
	struct process_queue_node *pqn;
	int r = 0;

	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		r = kfd_dbg_set_queue_workaround(pqn->q, enable);
		if (enable && r)
			goto unwind;
	}

	return 0;

unwind:
	list_for_each_entry(pqn, &pqm->queues, process_queue_list)
		kfd_dbg_set_queue_workaround(pqn->q, false);

	if (enable)
		target->runtime_info.runtime_state = r == -EBUSY ?
				DEBUG_RUNTIME_STATE_ENABLED_BUSY :
				DEBUG_RUNTIME_STATE_ENABLED_ERROR;

	return r;
}

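/* kfd_dbg_set_mes_debug_mode:
 *	Push the per-VMID debug configuration (SPI debug control, watch points
 *	and debug flags) to the MES firmware scheduler.  No-op on devices
 *	without per-VMID debug support.
 */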
int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd, bool sq_trap_en)
{
	uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd->spi_dbg_launch_mode;
	uint32_t flags = pdd->process->dbg_flags;

	if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
		return 0;

	return amdgpu_mes_set_shader_debugger(pdd->dev->adev, pdd->proc_ctx_gpu_addr, spi_dbg_cntl,
						pdd->watch_points, flags, sq_trap_en);
}

#define KFD_DEBUGGER_INVALID_WATCH_POINT_ID -1
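/* Address watch point IDs are allocated from a bitmap shared by all processes
 * on the device (dev->alloc_watch_ids); pdd->alloc_watch_ids tracks which of
 * those IDs the current process owns.
 */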
static int kfd_dbg_get_dev_watch_id(struct kfd_process_device *pdd, int *watch_id)
{
	int i;

	*watch_id = KFD_DEBUGGER_INVALID_WATCH_POINT_ID;

	spin_lock(&pdd->dev->watch_points_lock);

	for (i = 0; i < MAX_WATCH_ADDRESSES; i++) {
		/* device watchpoint in use so skip */
		if ((pdd->dev->alloc_watch_ids >> i) & 0x1)
			continue;

		pdd->alloc_watch_ids |= 0x1 << i;
		pdd->dev->alloc_watch_ids |= 0x1 << i;
		*watch_id = i;
		spin_unlock(&pdd->dev->watch_points_lock);
		return 0;
	}

	spin_unlock(&pdd->dev->watch_points_lock);

	return -ENOMEM;
}

static void kfd_dbg_clear_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
{
	spin_lock(&pdd->dev->watch_points_lock);

	/* process owns device watch point so safe to clear */
	if ((pdd->alloc_watch_ids >> watch_id) & 0x1) {
		pdd->alloc_watch_ids &= ~(0x1 << watch_id);
		pdd->dev->alloc_watch_ids &= ~(0x1 << watch_id);
	}

	spin_unlock(&pdd->dev->watch_points_lock);
}

static bool kfd_dbg_owns_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
{
	bool owns_watch_id = false;

	spin_lock(&pdd->dev->watch_points_lock);
	owns_watch_id = watch_id < MAX_WATCH_ADDRESSES &&
			((pdd->alloc_watch_ids >> watch_id) & 0x1);

	spin_unlock(&pdd->dev->watch_points_lock);

	return owns_watch_id;
}

int kfd_dbg_trap_clear_dev_address_watch(struct kfd_process_device *pdd,
					uint32_t watch_id)
{
	int r;

	if (!kfd_dbg_owns_dev_watch_id(pdd, watch_id))
		return -EINVAL;

	if (!pdd->dev->kfd->shared_resources.enable_mes) {
		r = debug_lock_and_unmap(pdd->dev->dqm);
		if (r)
			return r;
	}

	amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
	pdd->watch_points[watch_id] = pdd->dev->kfd2kgd->clear_address_watch(
							pdd->dev->adev,
							watch_id);
	amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

	if (!pdd->dev->kfd->shared_resources.enable_mes)
		r = debug_map_and_unlock(pdd->dev->dqm);
	else
		r = kfd_dbg_set_mes_debug_mode(pdd, true);

	kfd_dbg_clear_dev_watch_id(pdd, watch_id);

	return r;
}

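/* kfd_dbg_trap_set_dev_address_watch:
 *	Allocate a watch point ID and program the address watch registers on
 *	every XCC instance of the device, suspending the HWS runlist (or
 *	refreshing the MES debug state) around the register writes.  A failure
 *	after programming only releases the watch ID; the function still
 *	returns 0 (see the comment near the end).
 */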
int kfd_dbg_trap_set_dev_address_watch(struct kfd_process_device *pdd,
					uint64_t watch_address,
					uint32_t watch_address_mask,
					uint32_t *watch_id,
					uint32_t watch_mode)
{
	int xcc_id, r = kfd_dbg_get_dev_watch_id(pdd, watch_id);
	uint32_t xcc_mask = pdd->dev->xcc_mask;

	if (r)
		return r;

	if (!pdd->dev->kfd->shared_resources.enable_mes) {
		r = debug_lock_and_unmap(pdd->dev->dqm);
		if (r) {
			kfd_dbg_clear_dev_watch_id(pdd, *watch_id);
			return r;
		}
	}

	amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
	for_each_inst(xcc_id, xcc_mask)
		pdd->watch_points[*watch_id] = pdd->dev->kfd2kgd->set_address_watch(
				pdd->dev->adev,
				watch_address,
				watch_address_mask,
				*watch_id,
				watch_mode,
				pdd->dev->vm_info.last_vmid_kfd,
				xcc_id);
	amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

	if (!pdd->dev->kfd->shared_resources.enable_mes)
		r = debug_map_and_unlock(pdd->dev->dqm);
	else
		r = kfd_dbg_set_mes_debug_mode(pdd, true);

	/* HWS is broken so no point in HW rollback but release the watchpoint anyways */
	if (r)
		kfd_dbg_clear_dev_watch_id(pdd, *watch_id);

	return 0;
}

static void kfd_dbg_clear_process_address_watch(struct kfd_process *target)
{
	int i, j;

	for (i = 0; i < target->n_pdds; i++)
		for (j = 0; j < MAX_WATCH_ADDRESSES; j++)
			kfd_dbg_trap_clear_dev_address_watch(target->pdds[i], j);
}

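/* kfd_dbg_trap_set_flags:
 *	Validate the requested debug flags against each GPU's precise memory
 *	and ALU operation capabilities, apply them to all per-VMID capable
 *	devices and return the previous flags through *flags.  On failure the
 *	previous flags are restored on the devices already updated.
 */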
int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags)
{
	uint32_t prev_flags = target->dbg_flags;
	int i, r = 0, rewind_count = 0;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_topology_device *topo_dev =
				kfd_topology_device_by_id(target->pdds[i]->dev->id);
		uint32_t caps = topo_dev->node_props.capability;

		if (!(caps & HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED) &&
			(*flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP)) {
			*flags = prev_flags;
			return -EACCES;
		}

		if (!(caps & HSA_CAP_TRAP_DEBUG_PRECISE_ALU_OPERATIONS_SUPPORTED) &&
		    (*flags & KFD_DBG_TRAP_FLAG_SINGLE_ALU_OP)) {
			*flags = prev_flags;
			return -EACCES;
		}
	}

	target->dbg_flags = *flags;
	*flags = prev_flags;
	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
			continue;

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd, true);

		if (r) {
			target->dbg_flags = prev_flags;
			break;
		}

		rewind_count++;
	}

	/* Rewind flags */
	if (r) {
		target->dbg_flags = prev_flags;

		for (i = 0; i < rewind_count; i++) {
			struct kfd_process_device *pdd = target->pdds[i];

			if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
				continue;

			if (!pdd->dev->kfd->shared_resources.enable_mes)
				debug_refresh_runlist(pdd->dev->dqm);
			else
				kfd_dbg_set_mes_debug_mode(pdd, true);
		}
	}

	return r;
}

/* kfd_dbg_trap_deactivate:
 *	target: target process
 *	unwind: If this is unwinding a failed kfd_dbg_trap_enable()
 *	unwind_count:
 *		If unwind == true, how far down the pdd list we need
 *				to unwind
 *		else: ignored
 */
void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count)
{
	int i;

	if (!unwind) {
		uint32_t flags = 0;
		int resume_count = resume_queues(target, 0, NULL);

		if (resume_count)
			pr_debug("Resumed %d queues\n", resume_count);

		cancel_work_sync(&target->debug_event_workarea);
		kfd_dbg_clear_process_address_watch(target);
		kfd_dbg_trap_set_wave_launch_mode(target, 0);

		kfd_dbg_trap_set_flags(target, &flags);
	}

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		/* If this is an unwind, and we have unwound the required
		 * enable calls on the pdd list, we need to stop now
		 * otherwise we may mess up another debugger session.
		 */
		if (unwind && i == unwind_count)
			break;

		kfd_process_set_trap_debug_flag(&pdd->qpd, false);

		/* GFX off is already disabled by debug activate if not RLC restore supported. */
		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
			amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_override =
				pdd->dev->kfd2kgd->disable_debug_trap(
				pdd->dev->adev,
				target->runtime_info.ttmp_setup,
				pdd->dev->vm_info.last_vmid_kfd);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev) &&
				release_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd))
			pr_err("Failed to release debug vmid on [%i]\n", pdd->dev->id);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			debug_refresh_runlist(pdd->dev->dqm);
		else
			kfd_dbg_set_mes_debug_mode(pdd, !kfd_dbg_has_cwsr_workaround(pdd->dev));
	}

	kfd_dbg_set_workaround(target, false);
}

static void kfd_dbg_clean_exception_status(struct kfd_process *target)
{
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	int i;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		kfd_process_drain_interrupts(pdd);

		pdd->exception_status = 0;
	}

	pqm = &target->pqm;
	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		if (!pqn->q)
			continue;

		pqn->q->properties.exception_status = 0;
	}

	target->exception_status = 0;
}

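/* kfd_dbg_trap_disable:
 *	Tear down the debug session: deactivate trapping if the runtime is
 *	enabled, drop the event file descriptor and the extra process
 *	reference, and clear any latched exception status.
 */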
int kfd_dbg_trap_disable(struct kfd_process *target)
{
	if (!target->debug_trap_enabled)
		return 0;

	/*
	 * If the runtime is not yet enabled, defer deactivation to the runtime
	 * enable path; otherwise reset the attached, running target's runtime
	 * state back to enabled so that it can be re-attached later.
	 */
	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
		kfd_dbg_trap_deactivate(target, false, 0);
	else if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED)
		target->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED;

	cancel_work_sync(&target->debug_event_workarea);
	fput(target->dbg_ev_file);
	target->dbg_ev_file = NULL;

	if (target->debugger_process) {
		atomic_dec(&target->debugger_process->debugged_process_count);
		target->debugger_process = NULL;
	}

	target->debug_trap_enabled = false;
	kfd_dbg_clean_exception_status(target);
	kfd_unref_process(target);

	return 0;
}

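/* kfd_dbg_trap_activate:
 *	Enable the debug trap on every device of the target process, reserving
 *	a debug VMID where per-VMID debugging is not supported.  Activation is
 *	all or nothing: any failure unwinds the devices enabled so far.
 */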
int kfd_dbg_trap_activate(struct kfd_process *target)
{
	int i, r = 0;

	r = kfd_dbg_set_workaround(target, true);
	if (r)
		return r;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) {
			r = reserve_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd);

			if (r) {
				target->runtime_info.runtime_state = (r == -EBUSY) ?
							DEBUG_RUNTIME_STATE_ENABLED_BUSY :
							DEBUG_RUNTIME_STATE_ENABLED_ERROR;

				goto unwind_err;
			}
		}

		/* Disable GFX OFF to prevent garbage read/writes to debug registers.
		 * If RLC restore of debug registers is not supported and runtime enable
		 * hasn't done so already on ttmp setup request, restore the trap config registers.
		 *
		 * If RLC restore of debug registers is not supported, keep gfx off disabled for
		 * the debug session.
		 */
		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		if (!(kfd_dbg_is_rlc_restore_supported(pdd->dev) ||
						target->runtime_info.ttmp_setup))
			pdd->dev->kfd2kgd->enable_debug_trap(pdd->dev->adev, true,
								pdd->dev->vm_info.last_vmid_kfd);

		pdd->spi_dbg_override = pdd->dev->kfd2kgd->enable_debug_trap(
					pdd->dev->adev,
					false,
					pdd->dev->vm_info.last_vmid_kfd);

		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
			amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		/*
		 * Setting the debug flag in the trap handler requires that the TMA has been
		 * allocated, which occurs during CWSR initialization.
		 * In the event that CWSR has not been initialized at this point, setting the
		 * flag will be called again during CWSR initialization if the target process
		 * is still debug enabled.
		 */
		kfd_process_set_trap_debug_flag(&pdd->qpd, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd, true);

		if (r) {
			target->runtime_info.runtime_state =
					DEBUG_RUNTIME_STATE_ENABLED_ERROR;
			goto unwind_err;
		}
	}

	return 0;

unwind_err:
	/* Enabling debug failed, we need to disable on
	 * all GPUs so the enable is all or nothing.
	 */
	kfd_dbg_trap_deactivate(target, true, i);
	return r;
}

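/* kfd_dbg_trap_enable:
 *	Enable debugging on the target process: take a reference on the polled
 *	event file descriptor, activate trapping immediately if the runtime is
 *	already enabled (otherwise activation is deferred to runtime enable)
 *	and copy the runtime info snapshot back to the debugger.
 */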
int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
			void __user *runtime_info, uint32_t *runtime_size)
{
	struct file *f;
	uint32_t copy_size;
	int i, r = 0;

	if (target->debug_trap_enabled)
		return -EALREADY;

	/* Enable pre-checks */
	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!KFD_IS_SOC15(pdd->dev))
			return -ENODEV;

		if (pdd->qpd.num_gws && (!kfd_dbg_has_gws_support(pdd->dev) ||
					 kfd_dbg_has_cwsr_workaround(pdd->dev)))
			return -EBUSY;
	}

	copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info));

	f = fget(fd);
	if (!f) {
		pr_err("Failed to get file for (%i)\n", fd);
		return -EBADF;
	}

	target->dbg_ev_file = f;

	/* defer activation to runtime if not runtime enabled */
	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
		kfd_dbg_trap_activate(target);

	/* We already hold the process reference but hold another one for the
	 * debug session.
	 */
	kref_get(&target->ref);
	target->debug_trap_enabled = true;

	if (target->debugger_process)
		atomic_inc(&target->debugger_process->debugged_process_count);

	if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size)) {
		kfd_dbg_trap_deactivate(target, false, 0);
		r = -EFAULT;
	}

	*runtime_size = sizeof(target->runtime_info);

	return r;
}

static int kfd_dbg_validate_trap_override_request(struct kfd_process *p,
						uint32_t trap_override,
						uint32_t trap_mask_request,
						uint32_t *trap_mask_supported)
{
	int i = 0;

	*trap_mask_supported = 0xffffffff;

	for (i = 0; i < p->n_pdds; i++) {
		struct kfd_process_device *pdd = p->pdds[i];
		int err = pdd->dev->kfd2kgd->validate_trap_override_request(
								pdd->dev->adev,
								trap_override,
								trap_mask_supported);

		if (err)
			return err;
	}

	if (trap_mask_request & ~*trap_mask_supported)
		return -EACCES;

	return 0;
}

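/* kfd_dbg_trap_set_wave_launch_override:
 *	Validate and apply a wave launch trap override on every device of the
 *	target process, returning the previous and supported trap masks.
 */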
int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target,
					uint32_t trap_override,
					uint32_t trap_mask_bits,
					uint32_t trap_mask_request,
					uint32_t *trap_mask_prev,
					uint32_t *trap_mask_supported)
{
	int r = 0, i;

	r = kfd_dbg_validate_trap_override_request(target,
						trap_override,
						trap_mask_request,
						trap_mask_supported);

	if (r)
		return r;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_override = pdd->dev->kfd2kgd->set_wave_launch_trap_override(
				pdd->dev->adev,
				pdd->dev->vm_info.last_vmid_kfd,
				trap_override,
				trap_mask_bits,
				trap_mask_request,
				trap_mask_prev,
				pdd->spi_dbg_override);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd, true);

		if (r)
			break;
	}

	return r;
}

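/* kfd_dbg_trap_set_wave_launch_mode:
 *	Set the wave launch mode (normal, halt or debug) on every device of
 *	the target process and refresh the runlist or MES debug state so the
 *	new mode takes effect.
 */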
int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process *target,
					uint8_t wave_launch_mode)
{
	int r = 0, i;

	if (wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_NORMAL &&
			wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_HALT &&
			wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_DEBUG)
		return -EINVAL;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_launch_mode = pdd->dev->kfd2kgd->set_wave_launch_mode(
				pdd->dev->adev,
				wave_launch_mode,
				pdd->dev->vm_info.last_vmid_kfd);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd, true);

		if (r)
			break;
	}

	return r;
}

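/* kfd_dbg_trap_query_exception_info:
 *	Copy the payload of a pending queue, device or process exception to
 *	user space (VM fault data or runtime info where applicable) and
 *	optionally clear the exception status bit.
 */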
int kfd_dbg_trap_query_exception_info(struct kfd_process *target,
		uint32_t source_id,
		uint32_t exception_code,
		bool clear_exception,
		void __user *info,
		uint32_t *info_size)
{
	bool found = false;
	int r = 0;
	uint32_t copy_size, actual_info_size = 0;
	uint64_t *exception_status_ptr = NULL;

	if (!target)
		return -EINVAL;

	if (!info || !info_size)
		return -EINVAL;

	mutex_lock(&target->event_mutex);

	if (KFD_DBG_EC_TYPE_IS_QUEUE(exception_code)) {
		/* Per queue exceptions */
		struct queue *queue = NULL;
		int i;

		for (i = 0; i < target->n_pdds; i++) {
			struct kfd_process_device *pdd = target->pdds[i];
			struct qcm_process_device *qpd = &pdd->qpd;

			list_for_each_entry(queue, &qpd->queues_list, list) {
				if (!found && queue->properties.queue_id == source_id) {
					found = true;
					break;
				}
			}
			if (found)
				break;
		}

		if (!found) {
			r = -EINVAL;
			goto out;
		}

		if (!(queue->properties.exception_status & KFD_EC_MASK(exception_code))) {
			r = -ENODATA;
			goto out;
		}
		exception_status_ptr = &queue->properties.exception_status;
	} else if (KFD_DBG_EC_TYPE_IS_DEVICE(exception_code)) {
		/* Per device exceptions */
		struct kfd_process_device *pdd = NULL;
		int i;

		for (i = 0; i < target->n_pdds; i++) {
			pdd = target->pdds[i];
			if (pdd->dev->id == source_id) {
				found = true;
				break;
			}
		}

		if (!found) {
			r = -EINVAL;
			goto out;
		}

		if (!(pdd->exception_status & KFD_EC_MASK(exception_code))) {
			r = -ENODATA;
			goto out;
		}

		if (exception_code == EC_DEVICE_MEMORY_VIOLATION) {
			copy_size = min((size_t)(*info_size), pdd->vm_fault_exc_data_size);

			if (copy_to_user(info, pdd->vm_fault_exc_data, copy_size)) {
				r = -EFAULT;
				goto out;
			}
			actual_info_size = pdd->vm_fault_exc_data_size;
			if (clear_exception) {
				kfree(pdd->vm_fault_exc_data);
				pdd->vm_fault_exc_data = NULL;
				pdd->vm_fault_exc_data_size = 0;
			}
		}
		exception_status_ptr = &pdd->exception_status;
	} else if (KFD_DBG_EC_TYPE_IS_PROCESS(exception_code)) {
		/* Per process exceptions */
		if (!(target->exception_status & KFD_EC_MASK(exception_code))) {
			r = -ENODATA;
			goto out;
		}

		if (exception_code == EC_PROCESS_RUNTIME) {
			copy_size = min((size_t)(*info_size), sizeof(target->runtime_info));

			if (copy_to_user(info, (void *)&target->runtime_info, copy_size)) {
				r = -EFAULT;
				goto out;
			}

			actual_info_size = sizeof(target->runtime_info);
		}

		exception_status_ptr = &target->exception_status;
	} else {
		pr_debug("Bad exception type [%i]\n", exception_code);
		r = -EINVAL;
		goto out;
	}

	*info_size = actual_info_size;
	if (clear_exception)
		*exception_status_ptr &= ~KFD_EC_MASK(exception_code);
out:
	mutex_unlock(&target->event_mutex);
	return r;
}

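/* kfd_dbg_trap_device_snapshot:
 *	Fill one kfd_dbg_device_info_entry per GPU (apertures, PCI IDs,
 *	firmware version and topology properties) into the user buffer,
 *	clamped to the caller's entry size and device count, and optionally
 *	clear per-device exception status.
 */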
int kfd_dbg_trap_device_snapshot(struct kfd_process *target,
		uint64_t exception_clear_mask,
		void __user *user_info,
		uint32_t *number_of_device_infos,
		uint32_t *entry_size)
{
	struct kfd_dbg_device_info_entry device_info;
	uint32_t tmp_entry_size, tmp_num_devices;
	int i, r = 0;

	if (!(target && user_info && number_of_device_infos && entry_size))
		return -EINVAL;

	tmp_entry_size = *entry_size;

	tmp_num_devices = min_t(size_t, *number_of_device_infos, target->n_pdds);
	*number_of_device_infos = target->n_pdds;
	*entry_size = min_t(size_t, *entry_size, sizeof(device_info));

	if (!tmp_num_devices)
		return 0;

	memset(&device_info, 0, sizeof(device_info));

	mutex_lock(&target->event_mutex);

	/* Run over all pdd of the process */
	for (i = 0; i < tmp_num_devices; i++) {
		struct kfd_process_device *pdd = target->pdds[i];
		struct kfd_topology_device *topo_dev = kfd_topology_device_by_id(pdd->dev->id);

		device_info.gpu_id = pdd->dev->id;
		device_info.exception_status = pdd->exception_status;
		device_info.lds_base = pdd->lds_base;
		device_info.lds_limit = pdd->lds_limit;
		device_info.scratch_base = pdd->scratch_base;
		device_info.scratch_limit = pdd->scratch_limit;
		device_info.gpuvm_base = pdd->gpuvm_base;
		device_info.gpuvm_limit = pdd->gpuvm_limit;
		device_info.location_id = topo_dev->node_props.location_id;
		device_info.vendor_id = topo_dev->node_props.vendor_id;
		device_info.device_id = topo_dev->node_props.device_id;
		device_info.revision_id = pdd->dev->adev->pdev->revision;
		device_info.subsystem_vendor_id = pdd->dev->adev->pdev->subsystem_vendor;
		device_info.subsystem_device_id = pdd->dev->adev->pdev->subsystem_device;
		device_info.fw_version = pdd->dev->kfd->mec_fw_version;
		device_info.gfx_target_version =
			topo_dev->node_props.gfx_target_version;
		device_info.simd_count = topo_dev->node_props.simd_count;
		device_info.max_waves_per_simd =
			topo_dev->node_props.max_waves_per_simd;
		device_info.array_count = topo_dev->node_props.array_count;
		device_info.simd_arrays_per_engine =
			topo_dev->node_props.simd_arrays_per_engine;
		device_info.num_xcc = NUM_XCC(pdd->dev->xcc_mask);
		device_info.capability = topo_dev->node_props.capability;
		device_info.debug_prop = topo_dev->node_props.debug_prop;

		if (exception_clear_mask)
			pdd->exception_status &= ~exception_clear_mask;

		if (copy_to_user(user_info, &device_info, *entry_size)) {
			r = -EFAULT;
			break;
		}

		user_info += tmp_entry_size;
	}

	mutex_unlock(&target->event_mutex);

	return r;
}

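/* kfd_dbg_set_enabled_debug_exception_mask:
 *	Update the set of exceptions the debugger is subscribed to and poke
 *	the event file descriptor if any already pending exception matches the
 *	new mask.
 */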
void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
					uint64_t exception_set_mask)
{
	uint64_t found_mask = 0;
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	static const char write_data = '.';
	loff_t pos = 0;
	int i;

	mutex_lock(&target->event_mutex);

	found_mask |= target->exception_status;

	pqm = &target->pqm;
	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		if (!pqn->q)
			continue;

		found_mask |= pqn->q->properties.exception_status;
	}

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		found_mask |= pdd->exception_status;
	}

	if (exception_set_mask & found_mask)
		kernel_write(target->dbg_ev_file, &write_data, 1, &pos);

	target->exception_enable_mask = exception_set_mask;

	mutex_unlock(&target->event_mutex);
}