xref: /linux/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c (revision 5946dbe1c802efef3b12a4eecab1471f725f4ca9)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright 2023 Advanced Micro Devices, Inc.
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be included in
13  * all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
19  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21  * OTHER DEALINGS IN THE SOFTWARE.
22  *
23  */
24 
25 #include <drm/drm_auth.h>
26 #include <drm/drm_exec.h>
27 #include <linux/pm_runtime.h>
28 #include <drm/drm_drv.h>
29 
30 #include "amdgpu.h"
31 #include "amdgpu_reset.h"
32 #include "amdgpu_vm.h"
33 #include "amdgpu_userq.h"
34 #include "amdgpu_hmm.h"
35 #include "amdgpu_userq_fence.h"
36 
37 u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev)
38 {
39 	int i;
40 	u32 userq_ip_mask = 0;
41 
42 	for (i = 0; i < AMDGPU_HW_IP_NUM; i++) {
43 		if (adev->userq_funcs[i])
44 			userq_ip_mask |= (1 << i);
45 	}
46 
47 	return userq_ip_mask;
48 }
49 
50 static bool amdgpu_userq_is_reset_type_supported(struct amdgpu_device *adev,
51 				enum amdgpu_ring_type ring_type, int reset_type)
52 {
53 
54 	if (ring_type < 0 || ring_type >= AMDGPU_RING_TYPE_MAX)
55 		return false;
56 
57 	switch (ring_type) {
58 	case AMDGPU_RING_TYPE_GFX:
59 		if (adev->gfx.gfx_supported_reset & reset_type)
60 			return true;
61 		break;
62 	case AMDGPU_RING_TYPE_COMPUTE:
63 		if (adev->gfx.compute_supported_reset & reset_type)
64 			return true;
65 		break;
66 	case AMDGPU_RING_TYPE_SDMA:
67 		if (adev->sdma.supported_reset & reset_type)
68 			return true;
69 		break;
70 	case AMDGPU_RING_TYPE_VCN_DEC:
71 	case AMDGPU_RING_TYPE_VCN_ENC:
72 		if (adev->vcn.supported_reset & reset_type)
73 			return true;
74 		break;
75 	case AMDGPU_RING_TYPE_VCN_JPEG:
76 		if (adev->jpeg.supported_reset & reset_type)
77 			return true;
78 		break;
79 	default:
80 		break;
81 	}
82 	return false;
83 }
84 
85 static void amdgpu_userq_gpu_reset(struct amdgpu_device *adev)
86 {
87 	if (amdgpu_device_should_recover_gpu(adev)) {
88 		amdgpu_reset_domain_schedule(adev->reset_domain,
89 					     &adev->userq_reset_work);
90 		/* Wait for the reset job to complete */
91 		flush_work(&adev->userq_reset_work);
92 	}
93 }
94 
95 static int
96 amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
97 {
98 	struct amdgpu_device *adev = uq_mgr->adev;
99 	const int queue_types[] = {
100 		AMDGPU_RING_TYPE_COMPUTE,
101 		AMDGPU_RING_TYPE_GFX,
102 		AMDGPU_RING_TYPE_SDMA
103 	};
104 	const int num_queue_types = ARRAY_SIZE(queue_types);
105 	bool gpu_reset = false;
106 	int r = 0;
107 	int i;
108 
109 	/* Warning if current process mutex is not held */
110 	WARN_ON(!mutex_is_locked(&uq_mgr->userq_mutex));
111 
112 	if (unlikely(adev->debug_disable_gpu_ring_reset)) {
113 		dev_err(adev->dev, "userq reset disabled by debug mask\n");
114 		return 0;
115 	}
116 
117 	/*
118 	 * If GPU recovery feature is disabled system-wide,
119 	 * skip all reset detection logic
120 	 */
121 	if (!amdgpu_gpu_recovery)
122 		return 0;
123 
124 	/*
125 	 * Iterate through all queue types to detect and reset problematic queues
126 	 * Process each queue type in the defined order
127 	 */
128 	for (i = 0; i < num_queue_types; i++) {
129 		int ring_type = queue_types[i];
130 		const struct amdgpu_userq_funcs *funcs = adev->userq_funcs[ring_type];
131 
132 		if (!amdgpu_userq_is_reset_type_supported(adev, ring_type, AMDGPU_RESET_TYPE_PER_QUEUE))
133 				continue;
134 
135 		if (atomic_read(&uq_mgr->userq_count[ring_type]) > 0 &&
136 		    funcs && funcs->detect_and_reset) {
137 			r = funcs->detect_and_reset(adev, ring_type);
138 			if (r) {
139 				gpu_reset = true;
140 				break;
141 			}
142 		}
143 	}
144 
145 	if (gpu_reset)
146 		amdgpu_userq_gpu_reset(adev);
147 
148 	return r;
149 }
150 
151 static int amdgpu_userq_buffer_va_list_add(struct amdgpu_usermode_queue *queue,
152 					   struct amdgpu_bo_va_mapping *va_map, u64 addr)
153 {
154 	struct amdgpu_userq_va_cursor *va_cursor;
155 	struct userq_va_list;
156 
157 	va_cursor = kzalloc(sizeof(*va_cursor), GFP_KERNEL);
158 	if (!va_cursor)
159 		return -ENOMEM;
160 
161 	INIT_LIST_HEAD(&va_cursor->list);
162 	va_cursor->gpu_addr = addr;
163 	atomic_set(&va_map->bo_va->userq_va_mapped, 1);
164 	list_add(&va_cursor->list, &queue->userq_va_list);
165 
166 	return 0;
167 }
168 
169 int amdgpu_userq_input_va_validate(struct amdgpu_device *adev,
170 				   struct amdgpu_usermode_queue *queue,
171 				   u64 addr, u64 expected_size)
172 {
173 	struct amdgpu_bo_va_mapping *va_map;
174 	struct amdgpu_vm *vm = queue->vm;
175 	u64 user_addr;
176 	u64 size;
177 	int r = 0;
178 
179 	user_addr = (addr & AMDGPU_GMC_HOLE_MASK) >> AMDGPU_GPU_PAGE_SHIFT;
180 	size = expected_size >> AMDGPU_GPU_PAGE_SHIFT;
181 
182 	r = amdgpu_bo_reserve(vm->root.bo, false);
183 	if (r)
184 		return r;
185 
186 	va_map = amdgpu_vm_bo_lookup_mapping(vm, user_addr);
187 	if (!va_map) {
188 		r = -EINVAL;
189 		goto out_err;
190 	}
191 	/* Only validate the userq whether resident in the VM mapping range */
192 	if (user_addr >= va_map->start  &&
193 	    va_map->last - user_addr + 1 >= size) {
194 		amdgpu_userq_buffer_va_list_add(queue, va_map, user_addr);
195 		amdgpu_bo_unreserve(vm->root.bo);
196 		return 0;
197 	}
198 
199 	r = -EINVAL;
200 out_err:
201 	amdgpu_bo_unreserve(vm->root.bo);
202 	return r;
203 }
204 
205 static bool amdgpu_userq_buffer_va_mapped(struct amdgpu_vm *vm, u64 addr)
206 {
207 	struct amdgpu_bo_va_mapping *mapping;
208 	bool r;
209 
210 	if (amdgpu_bo_reserve(vm->root.bo, false))
211 		return false;
212 
213 	mapping = amdgpu_vm_bo_lookup_mapping(vm, addr);
214 	if (!IS_ERR_OR_NULL(mapping) && atomic_read(&mapping->bo_va->userq_va_mapped))
215 		r = true;
216 	else
217 		r = false;
218 	amdgpu_bo_unreserve(vm->root.bo);
219 
220 	return r;
221 }
222 
223 static bool amdgpu_userq_buffer_vas_mapped(struct amdgpu_usermode_queue *queue)
224 {
225 	struct amdgpu_userq_va_cursor *va_cursor, *tmp;
226 	int r = 0;
227 
228 	list_for_each_entry_safe(va_cursor, tmp, &queue->userq_va_list, list) {
229 		r += amdgpu_userq_buffer_va_mapped(queue->vm, va_cursor->gpu_addr);
230 		dev_dbg(queue->userq_mgr->adev->dev,
231 			"validate the userq mapping:%p va:%llx r:%d\n",
232 			queue, va_cursor->gpu_addr, r);
233 	}
234 
235 	if (r != 0)
236 		return true;
237 
238 	return false;
239 }
240 
241 static void amdgpu_userq_buffer_va_list_del(struct amdgpu_bo_va_mapping *mapping,
242 					    struct amdgpu_userq_va_cursor *va_cursor)
243 {
244 	atomic_set(&mapping->bo_va->userq_va_mapped, 0);
245 	list_del(&va_cursor->list);
246 	kfree(va_cursor);
247 }
248 
249 static int amdgpu_userq_buffer_vas_list_cleanup(struct amdgpu_device *adev,
250 						struct amdgpu_usermode_queue *queue)
251 {
252 	struct amdgpu_userq_va_cursor *va_cursor, *tmp;
253 	struct amdgpu_bo_va_mapping *mapping;
254 	int r;
255 
256 	r = amdgpu_bo_reserve(queue->vm->root.bo, false);
257 	if (r)
258 		return r;
259 
260 	list_for_each_entry_safe(va_cursor, tmp, &queue->userq_va_list, list) {
261 		mapping = amdgpu_vm_bo_lookup_mapping(queue->vm, va_cursor->gpu_addr);
262 		if (!mapping) {
263 			r = -EINVAL;
264 			goto err;
265 		}
266 		dev_dbg(adev->dev, "delete the userq:%p va:%llx\n",
267 			queue, va_cursor->gpu_addr);
268 		amdgpu_userq_buffer_va_list_del(mapping, va_cursor);
269 	}
270 err:
271 	amdgpu_bo_unreserve(queue->vm->root.bo);
272 	return r;
273 }
274 
275 static int amdgpu_userq_preempt_helper(struct amdgpu_usermode_queue *queue)
276 {
277 	struct amdgpu_userq_mgr *uq_mgr = queue->userq_mgr;
278 	struct amdgpu_device *adev = uq_mgr->adev;
279 	const struct amdgpu_userq_funcs *userq_funcs =
280 		adev->userq_funcs[queue->queue_type];
281 	bool found_hung_queue = false;
282 	int r = 0;
283 
284 	if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
285 		r = userq_funcs->preempt(queue);
286 		if (r) {
287 			queue->state = AMDGPU_USERQ_STATE_HUNG;
288 			found_hung_queue = true;
289 		} else {
290 			queue->state = AMDGPU_USERQ_STATE_PREEMPTED;
291 		}
292 	}
293 
294 	if (found_hung_queue)
295 		amdgpu_userq_detect_and_reset_queues(uq_mgr);
296 
297 	return r;
298 }
299 
300 static int amdgpu_userq_restore_helper(struct amdgpu_usermode_queue *queue)
301 {
302 	struct amdgpu_userq_mgr *uq_mgr = queue->userq_mgr;
303 	struct amdgpu_device *adev = uq_mgr->adev;
304 	const struct amdgpu_userq_funcs *userq_funcs =
305 		adev->userq_funcs[queue->queue_type];
306 	int r = 0;
307 
308 	if (queue->state == AMDGPU_USERQ_STATE_PREEMPTED) {
309 		r = userq_funcs->restore(queue);
310 		if (r) {
311 			queue->state = AMDGPU_USERQ_STATE_HUNG;
312 		} else {
313 			queue->state = AMDGPU_USERQ_STATE_MAPPED;
314 		}
315 	}
316 
317 	return r;
318 }
319 
320 static int amdgpu_userq_unmap_helper(struct amdgpu_usermode_queue *queue)
321 {
322 	struct amdgpu_userq_mgr *uq_mgr = queue->userq_mgr;
323 	struct amdgpu_device *adev = uq_mgr->adev;
324 	const struct amdgpu_userq_funcs *userq_funcs =
325 		adev->userq_funcs[queue->queue_type];
326 	bool found_hung_queue = false;
327 	int r = 0;
328 
329 	if ((queue->state == AMDGPU_USERQ_STATE_MAPPED) ||
330 		(queue->state == AMDGPU_USERQ_STATE_PREEMPTED)) {
331 		r = userq_funcs->unmap(queue);
332 		if (r) {
333 			queue->state = AMDGPU_USERQ_STATE_HUNG;
334 			found_hung_queue = true;
335 		} else {
336 			queue->state = AMDGPU_USERQ_STATE_UNMAPPED;
337 		}
338 	}
339 
340 	if (found_hung_queue)
341 		amdgpu_userq_detect_and_reset_queues(uq_mgr);
342 
343 	return r;
344 }
345 
346 static int amdgpu_userq_map_helper(struct amdgpu_usermode_queue *queue)
347 {
348 	struct amdgpu_userq_mgr *uq_mgr = queue->userq_mgr;
349 	struct amdgpu_device *adev = uq_mgr->adev;
350 	const struct amdgpu_userq_funcs *userq_funcs =
351 		adev->userq_funcs[queue->queue_type];
352 	int r = 0;
353 
354 	if (queue->state == AMDGPU_USERQ_STATE_UNMAPPED) {
355 		r = userq_funcs->map(queue);
356 		if (r) {
357 			queue->state = AMDGPU_USERQ_STATE_HUNG;
358 			amdgpu_userq_detect_and_reset_queues(uq_mgr);
359 		} else {
360 			queue->state = AMDGPU_USERQ_STATE_MAPPED;
361 		}
362 	}
363 
364 	return r;
365 }
366 
367 static int amdgpu_userq_wait_for_last_fence(struct amdgpu_usermode_queue *queue)
368 {
369 	struct amdgpu_userq_mgr *uq_mgr = queue->userq_mgr;
370 	struct dma_fence *f = queue->last_fence;
371 	int ret = 0;
372 
373 	if (f && !dma_fence_is_signaled(f)) {
374 		ret = dma_fence_wait_timeout(f, true, MAX_SCHEDULE_TIMEOUT);
375 		if (ret <= 0) {
376 			drm_file_err(uq_mgr->file, "Timed out waiting for fence=%llu:%llu\n",
377 				     f->context, f->seqno);
378 			queue->state = AMDGPU_USERQ_STATE_HUNG;
379 			return -ETIME;
380 		}
381 	}
382 
383 	return ret;
384 }
385 
386 static void amdgpu_userq_cleanup(struct amdgpu_usermode_queue *queue,
387 				 int queue_id)
388 {
389 	struct amdgpu_userq_mgr *uq_mgr = queue->userq_mgr;
390 	struct amdgpu_device *adev = uq_mgr->adev;
391 	const struct amdgpu_userq_funcs *uq_funcs = adev->userq_funcs[queue->queue_type];
392 
393 	/* Wait for mode-1 reset to complete */
394 	down_read(&adev->reset_domain->sem);
395 
396 	/* Drop the userq reference. */
397 	amdgpu_userq_buffer_vas_list_cleanup(adev, queue);
398 	uq_funcs->mqd_destroy(queue);
399 	amdgpu_userq_fence_driver_free(queue);
400 	/* Use interrupt-safe locking since IRQ handlers may access these XArrays */
401 	xa_erase_irq(&uq_mgr->userq_xa, (unsigned long)queue_id);
402 	xa_erase_irq(&adev->userq_doorbell_xa, queue->doorbell_index);
403 	queue->userq_mgr = NULL;
404 	list_del(&queue->userq_va_list);
405 	kfree(queue);
406 
407 	up_read(&adev->reset_domain->sem);
408 }
409 
410 static struct amdgpu_usermode_queue *
411 amdgpu_userq_find(struct amdgpu_userq_mgr *uq_mgr, int qid)
412 {
413 	return xa_load(&uq_mgr->userq_xa, qid);
414 }
415 
416 void
417 amdgpu_userq_ensure_ev_fence(struct amdgpu_userq_mgr *uq_mgr,
418 			     struct amdgpu_eviction_fence_mgr *evf_mgr)
419 {
420 	struct amdgpu_eviction_fence *ev_fence;
421 
422 retry:
423 	/* Flush any pending resume work to create ev_fence */
424 	flush_delayed_work(&uq_mgr->resume_work);
425 
426 	mutex_lock(&uq_mgr->userq_mutex);
427 	spin_lock(&evf_mgr->ev_fence_lock);
428 	ev_fence = evf_mgr->ev_fence;
429 	spin_unlock(&evf_mgr->ev_fence_lock);
430 	if (!ev_fence || dma_fence_is_signaled(&ev_fence->base)) {
431 		mutex_unlock(&uq_mgr->userq_mutex);
432 		/*
433 		 * Looks like there was no pending resume work,
434 		 * add one now to create a valid eviction fence
435 		 */
436 		schedule_delayed_work(&uq_mgr->resume_work, 0);
437 		goto retry;
438 	}
439 }
440 
441 int amdgpu_userq_create_object(struct amdgpu_userq_mgr *uq_mgr,
442 			       struct amdgpu_userq_obj *userq_obj,
443 			       int size)
444 {
445 	struct amdgpu_device *adev = uq_mgr->adev;
446 	struct amdgpu_bo_param bp;
447 	int r;
448 
449 	memset(&bp, 0, sizeof(bp));
450 	bp.byte_align = PAGE_SIZE;
451 	bp.domain = AMDGPU_GEM_DOMAIN_GTT;
452 	bp.flags = AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS |
453 		   AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;
454 	bp.type = ttm_bo_type_kernel;
455 	bp.size = size;
456 	bp.resv = NULL;
457 	bp.bo_ptr_size = sizeof(struct amdgpu_bo);
458 
459 	r = amdgpu_bo_create(adev, &bp, &userq_obj->obj);
460 	if (r) {
461 		drm_file_err(uq_mgr->file, "Failed to allocate BO for userqueue (%d)", r);
462 		return r;
463 	}
464 
465 	r = amdgpu_bo_reserve(userq_obj->obj, true);
466 	if (r) {
467 		drm_file_err(uq_mgr->file, "Failed to reserve BO to map (%d)", r);
468 		goto free_obj;
469 	}
470 
471 	r = amdgpu_ttm_alloc_gart(&(userq_obj->obj)->tbo);
472 	if (r) {
473 		drm_file_err(uq_mgr->file, "Failed to alloc GART for userqueue object (%d)", r);
474 		goto unresv;
475 	}
476 
477 	r = amdgpu_bo_kmap(userq_obj->obj, &userq_obj->cpu_ptr);
478 	if (r) {
479 		drm_file_err(uq_mgr->file, "Failed to map BO for userqueue (%d)", r);
480 		goto unresv;
481 	}
482 
483 	userq_obj->gpu_addr = amdgpu_bo_gpu_offset(userq_obj->obj);
484 	amdgpu_bo_unreserve(userq_obj->obj);
485 	memset(userq_obj->cpu_ptr, 0, size);
486 	return 0;
487 
488 unresv:
489 	amdgpu_bo_unreserve(userq_obj->obj);
490 
491 free_obj:
492 	amdgpu_bo_unref(&userq_obj->obj);
493 	return r;
494 }
495 
496 void amdgpu_userq_destroy_object(struct amdgpu_userq_mgr *uq_mgr,
497 				 struct amdgpu_userq_obj *userq_obj)
498 {
499 	amdgpu_bo_kunmap(userq_obj->obj);
500 	amdgpu_bo_unref(&userq_obj->obj);
501 }
502 
503 uint64_t
504 amdgpu_userq_get_doorbell_index(struct amdgpu_userq_mgr *uq_mgr,
505 				struct amdgpu_db_info *db_info,
506 				struct drm_file *filp)
507 {
508 	uint64_t index;
509 	struct drm_gem_object *gobj;
510 	struct amdgpu_userq_obj *db_obj = db_info->db_obj;
511 	int r, db_size;
512 
513 	gobj = drm_gem_object_lookup(filp, db_info->doorbell_handle);
514 	if (gobj == NULL) {
515 		drm_file_err(uq_mgr->file, "Can't find GEM object for doorbell\n");
516 		return -EINVAL;
517 	}
518 
519 	db_obj->obj = amdgpu_bo_ref(gem_to_amdgpu_bo(gobj));
520 	drm_gem_object_put(gobj);
521 
522 	r = amdgpu_bo_reserve(db_obj->obj, true);
523 	if (r) {
524 		drm_file_err(uq_mgr->file, "[Usermode queues] Failed to pin doorbell object\n");
525 		goto unref_bo;
526 	}
527 
528 	/* Pin the BO before generating the index, unpin in queue destroy */
529 	r = amdgpu_bo_pin(db_obj->obj, AMDGPU_GEM_DOMAIN_DOORBELL);
530 	if (r) {
531 		drm_file_err(uq_mgr->file, "[Usermode queues] Failed to pin doorbell object\n");
532 		goto unresv_bo;
533 	}
534 
535 	switch (db_info->queue_type) {
536 	case AMDGPU_HW_IP_GFX:
537 	case AMDGPU_HW_IP_COMPUTE:
538 	case AMDGPU_HW_IP_DMA:
539 		db_size = sizeof(u64);
540 		break;
541 	default:
542 		drm_file_err(uq_mgr->file, "[Usermode queues] IP %d not support\n",
543 			     db_info->queue_type);
544 		r = -EINVAL;
545 		goto unpin_bo;
546 	}
547 
548 	index = amdgpu_doorbell_index_on_bar(uq_mgr->adev, db_obj->obj,
549 					     db_info->doorbell_offset, db_size);
550 	drm_dbg_driver(adev_to_drm(uq_mgr->adev),
551 		       "[Usermode queues] doorbell index=%lld\n", index);
552 	amdgpu_bo_unreserve(db_obj->obj);
553 	return index;
554 
555 unpin_bo:
556 	amdgpu_bo_unpin(db_obj->obj);
557 unresv_bo:
558 	amdgpu_bo_unreserve(db_obj->obj);
559 unref_bo:
560 	amdgpu_bo_unref(&db_obj->obj);
561 	return r;
562 }
563 
564 static int
565 amdgpu_userq_destroy(struct drm_file *filp, int queue_id)
566 {
567 	struct amdgpu_fpriv *fpriv = filp->driver_priv;
568 	struct amdgpu_userq_mgr *uq_mgr = &fpriv->userq_mgr;
569 	struct amdgpu_device *adev = uq_mgr->adev;
570 	struct amdgpu_usermode_queue *queue;
571 	int r = 0;
572 
573 	cancel_delayed_work_sync(&uq_mgr->resume_work);
574 	mutex_lock(&uq_mgr->userq_mutex);
575 
576 	queue = amdgpu_userq_find(uq_mgr, queue_id);
577 	if (!queue) {
578 		drm_dbg_driver(adev_to_drm(uq_mgr->adev), "Invalid queue id to destroy\n");
579 		mutex_unlock(&uq_mgr->userq_mutex);
580 		return -EINVAL;
581 	}
582 	amdgpu_userq_wait_for_last_fence(queue);
583 	r = amdgpu_bo_reserve(queue->db_obj.obj, true);
584 	if (!r) {
585 		amdgpu_bo_unpin(queue->db_obj.obj);
586 		amdgpu_bo_unreserve(queue->db_obj.obj);
587 	}
588 	amdgpu_bo_unref(&queue->db_obj.obj);
589 
590 	r = amdgpu_bo_reserve(queue->wptr_obj.obj, true);
591 	if (!r) {
592 		amdgpu_bo_unpin(queue->wptr_obj.obj);
593 		amdgpu_bo_unreserve(queue->wptr_obj.obj);
594 	}
595 	amdgpu_bo_unref(&queue->wptr_obj.obj);
596 
597 	atomic_dec(&uq_mgr->userq_count[queue->queue_type]);
598 #if defined(CONFIG_DEBUG_FS)
599 	debugfs_remove_recursive(queue->debugfs_queue);
600 #endif
601 	amdgpu_userq_detect_and_reset_queues(uq_mgr);
602 	r = amdgpu_userq_unmap_helper(queue);
603 	/*TODO: It requires a reset for userq hw unmap error*/
604 	if (unlikely(r != AMDGPU_USERQ_STATE_UNMAPPED)) {
605 		drm_warn(adev_to_drm(uq_mgr->adev), "trying to destroy a HW mapping userq\n");
606 		queue->state = AMDGPU_USERQ_STATE_HUNG;
607 	}
608 	amdgpu_userq_cleanup(queue, queue_id);
609 	mutex_unlock(&uq_mgr->userq_mutex);
610 
611 	pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
612 
613 	return r;
614 }
615 
616 static int amdgpu_userq_priority_permit(struct drm_file *filp,
617 					int priority)
618 {
619 	if (priority < AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_HIGH)
620 		return 0;
621 
622 	if (capable(CAP_SYS_NICE))
623 		return 0;
624 
625 	if (drm_is_current_master(filp))
626 		return 0;
627 
628 	return -EACCES;
629 }
630 
631 #if defined(CONFIG_DEBUG_FS)
632 static int amdgpu_mqd_info_read(struct seq_file *m, void *unused)
633 {
634 	struct amdgpu_usermode_queue *queue = m->private;
635 	struct amdgpu_bo *bo;
636 	int r;
637 
638 	if (!queue || !queue->mqd.obj)
639 		return -EINVAL;
640 
641 	bo = amdgpu_bo_ref(queue->mqd.obj);
642 	r = amdgpu_bo_reserve(bo, true);
643 	if (r) {
644 		amdgpu_bo_unref(&bo);
645 		return -EINVAL;
646 	}
647 
648 	seq_printf(m, "queue_type: %d\n", queue->queue_type);
649 	seq_printf(m, "mqd_gpu_address: 0x%llx\n", amdgpu_bo_gpu_offset(queue->mqd.obj));
650 
651 	amdgpu_bo_unreserve(bo);
652 	amdgpu_bo_unref(&bo);
653 
654 	return 0;
655 }
656 
657 static int amdgpu_mqd_info_open(struct inode *inode, struct file *file)
658 {
659 	return single_open(file, amdgpu_mqd_info_read, inode->i_private);
660 }
661 
662 static const struct file_operations amdgpu_mqd_info_fops = {
663 	.owner = THIS_MODULE,
664 	.open = amdgpu_mqd_info_open,
665 	.read = seq_read,
666 	.llseek = seq_lseek,
667 	.release = single_release,
668 };
669 #endif
670 
671 static int
672 amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
673 {
674 	struct amdgpu_fpriv *fpriv = filp->driver_priv;
675 	struct amdgpu_userq_mgr *uq_mgr = &fpriv->userq_mgr;
676 	struct amdgpu_device *adev = uq_mgr->adev;
677 	const struct amdgpu_userq_funcs *uq_funcs;
678 	struct amdgpu_usermode_queue *queue;
679 	struct amdgpu_db_info db_info;
680 	char *queue_name;
681 	bool skip_map_queue;
682 	u32 qid;
683 	uint64_t index;
684 	int r = 0;
685 	int priority =
686 		(args->in.flags & AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_MASK) >>
687 		AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_SHIFT;
688 
689 	r = amdgpu_userq_priority_permit(filp, priority);
690 	if (r)
691 		return r;
692 
693 	r = pm_runtime_get_sync(adev_to_drm(adev)->dev);
694 	if (r < 0) {
695 		drm_file_err(uq_mgr->file, "pm_runtime_get_sync() failed for userqueue create\n");
696 		pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
697 		return r;
698 	}
699 
700 	/*
701 	 * There could be a situation that we are creating a new queue while
702 	 * the other queues under this UQ_mgr are suspended. So if there is any
703 	 * resume work pending, wait for it to get done.
704 	 *
705 	 * This will also make sure we have a valid eviction fence ready to be used.
706 	 */
707 	amdgpu_userq_ensure_ev_fence(&fpriv->userq_mgr, &fpriv->evf_mgr);
708 
709 	uq_funcs = adev->userq_funcs[args->in.ip_type];
710 	if (!uq_funcs) {
711 		drm_file_err(uq_mgr->file, "Usermode queue is not supported for this IP (%u)\n",
712 			     args->in.ip_type);
713 		r = -EINVAL;
714 		goto unlock;
715 	}
716 
717 	queue = kzalloc(sizeof(struct amdgpu_usermode_queue), GFP_KERNEL);
718 	if (!queue) {
719 		drm_file_err(uq_mgr->file, "Failed to allocate memory for queue\n");
720 		r = -ENOMEM;
721 		goto unlock;
722 	}
723 
724 	INIT_LIST_HEAD(&queue->userq_va_list);
725 	queue->doorbell_handle = args->in.doorbell_handle;
726 	queue->queue_type = args->in.ip_type;
727 	queue->vm = &fpriv->vm;
728 	queue->priority = priority;
729 
730 	db_info.queue_type = queue->queue_type;
731 	db_info.doorbell_handle = queue->doorbell_handle;
732 	db_info.db_obj = &queue->db_obj;
733 	db_info.doorbell_offset = args->in.doorbell_offset;
734 
735 	queue->userq_mgr = uq_mgr;
736 	/* Validate the userq virtual address.*/
737 	if (amdgpu_userq_input_va_validate(adev, queue, args->in.queue_va, args->in.queue_size) ||
738 	    amdgpu_userq_input_va_validate(adev, queue, args->in.rptr_va, AMDGPU_GPU_PAGE_SIZE) ||
739 	    amdgpu_userq_input_va_validate(adev, queue, args->in.wptr_va, AMDGPU_GPU_PAGE_SIZE)) {
740 		r = -EINVAL;
741 		kfree(queue);
742 		goto unlock;
743 	}
744 
745 	/* Convert relative doorbell offset into absolute doorbell index */
746 	index = amdgpu_userq_get_doorbell_index(uq_mgr, &db_info, filp);
747 	if (index == (uint64_t)-EINVAL) {
748 		drm_file_err(uq_mgr->file, "Failed to get doorbell for queue\n");
749 		kfree(queue);
750 		r = -EINVAL;
751 		goto unlock;
752 	}
753 
754 	queue->doorbell_index = index;
755 	xa_init_flags(&queue->fence_drv_xa, XA_FLAGS_ALLOC);
756 	r = amdgpu_userq_fence_driver_alloc(adev, queue);
757 	if (r) {
758 		drm_file_err(uq_mgr->file, "Failed to alloc fence driver\n");
759 		goto unlock;
760 	}
761 
762 	r = uq_funcs->mqd_create(queue, &args->in);
763 	if (r) {
764 		drm_file_err(uq_mgr->file, "Failed to create Queue\n");
765 		amdgpu_userq_fence_driver_free(queue);
766 		kfree(queue);
767 		goto unlock;
768 	}
769 
770 	/* Wait for mode-1 reset to complete */
771 	down_read(&adev->reset_domain->sem);
772 	r = xa_err(xa_store_irq(&adev->userq_doorbell_xa, index, queue, GFP_KERNEL));
773 	if (r) {
774 		kfree(queue);
775 		up_read(&adev->reset_domain->sem);
776 		goto unlock;
777 	}
778 
779 	r = xa_alloc(&uq_mgr->userq_xa, &qid, queue,
780 		     XA_LIMIT(1, AMDGPU_MAX_USERQ_COUNT), GFP_KERNEL);
781 	if (r) {
782 		drm_file_err(uq_mgr->file, "Failed to allocate a queue id\n");
783 		amdgpu_userq_fence_driver_free(queue);
784 		uq_funcs->mqd_destroy(queue);
785 		kfree(queue);
786 		r = -ENOMEM;
787 		up_read(&adev->reset_domain->sem);
788 		goto unlock;
789 	}
790 	up_read(&adev->reset_domain->sem);
791 
792 	/* don't map the queue if scheduling is halted */
793 	if (adev->userq_halt_for_enforce_isolation &&
794 	    ((queue->queue_type == AMDGPU_HW_IP_GFX) ||
795 	     (queue->queue_type == AMDGPU_HW_IP_COMPUTE)))
796 		skip_map_queue = true;
797 	else
798 		skip_map_queue = false;
799 	if (!skip_map_queue) {
800 		r = amdgpu_userq_map_helper(queue);
801 		if (r) {
802 			drm_file_err(uq_mgr->file, "Failed to map Queue\n");
803 			xa_erase(&uq_mgr->userq_xa, qid);
804 			amdgpu_userq_fence_driver_free(queue);
805 			uq_funcs->mqd_destroy(queue);
806 			kfree(queue);
807 			goto unlock;
808 		}
809 	}
810 
811 	queue_name = kasprintf(GFP_KERNEL, "queue-%d", qid);
812 	if (!queue_name) {
813 		r = -ENOMEM;
814 		goto unlock;
815 	}
816 
817 #if defined(CONFIG_DEBUG_FS)
818 	/* Queue dentry per client to hold MQD information   */
819 	queue->debugfs_queue = debugfs_create_dir(queue_name, filp->debugfs_client);
820 	debugfs_create_file("mqd_info", 0444, queue->debugfs_queue, queue, &amdgpu_mqd_info_fops);
821 #endif
822 	kfree(queue_name);
823 
824 	args->out.queue_id = qid;
825 	atomic_inc(&uq_mgr->userq_count[queue->queue_type]);
826 
827 unlock:
828 	mutex_unlock(&uq_mgr->userq_mutex);
829 
830 	return r;
831 }
832 
833 static int amdgpu_userq_input_args_validate(struct drm_device *dev,
834 					union drm_amdgpu_userq *args,
835 					struct drm_file *filp)
836 {
837 	struct amdgpu_device *adev = drm_to_adev(dev);
838 
839 	switch (args->in.op) {
840 	case AMDGPU_USERQ_OP_CREATE:
841 		if (args->in.flags & ~(AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_MASK |
842 				       AMDGPU_USERQ_CREATE_FLAGS_QUEUE_SECURE))
843 			return -EINVAL;
844 		/* Usermode queues are only supported for GFX IP as of now */
845 		if (args->in.ip_type != AMDGPU_HW_IP_GFX &&
846 		    args->in.ip_type != AMDGPU_HW_IP_DMA &&
847 		    args->in.ip_type != AMDGPU_HW_IP_COMPUTE) {
848 			drm_file_err(filp, "Usermode queue doesn't support IP type %u\n",
849 				     args->in.ip_type);
850 			return -EINVAL;
851 		}
852 
853 		if ((args->in.flags & AMDGPU_USERQ_CREATE_FLAGS_QUEUE_SECURE) &&
854 		    (args->in.ip_type != AMDGPU_HW_IP_GFX) &&
855 		    (args->in.ip_type != AMDGPU_HW_IP_COMPUTE) &&
856 		    !amdgpu_is_tmz(adev)) {
857 			drm_file_err(filp, "Secure only supported on GFX/Compute queues\n");
858 			return -EINVAL;
859 		}
860 
861 		if (args->in.queue_va == AMDGPU_BO_INVALID_OFFSET ||
862 		    args->in.queue_va == 0 ||
863 		    args->in.queue_size == 0) {
864 			drm_file_err(filp, "invalidate userq queue va or size\n");
865 			return -EINVAL;
866 		}
867 		if (!args->in.wptr_va || !args->in.rptr_va) {
868 			drm_file_err(filp, "invalidate userq queue rptr or wptr\n");
869 			return -EINVAL;
870 		}
871 		break;
872 	case AMDGPU_USERQ_OP_FREE:
873 		if (args->in.ip_type ||
874 		    args->in.doorbell_handle ||
875 		    args->in.doorbell_offset ||
876 		    args->in.flags ||
877 		    args->in.queue_va ||
878 		    args->in.queue_size ||
879 		    args->in.rptr_va ||
880 		    args->in.wptr_va ||
881 		    args->in.mqd ||
882 		    args->in.mqd_size)
883 			return -EINVAL;
884 		break;
885 	default:
886 		return -EINVAL;
887 	}
888 
889 	return 0;
890 }
891 
892 int amdgpu_userq_ioctl(struct drm_device *dev, void *data,
893 		       struct drm_file *filp)
894 {
895 	union drm_amdgpu_userq *args = data;
896 	int r;
897 
898 	if (amdgpu_userq_input_args_validate(dev, args, filp) < 0)
899 		return -EINVAL;
900 
901 	switch (args->in.op) {
902 	case AMDGPU_USERQ_OP_CREATE:
903 		r = amdgpu_userq_create(filp, args);
904 		if (r)
905 			drm_file_err(filp, "Failed to create usermode queue\n");
906 		break;
907 
908 	case AMDGPU_USERQ_OP_FREE:
909 		r = amdgpu_userq_destroy(filp, args->in.queue_id);
910 		if (r)
911 			drm_file_err(filp, "Failed to destroy usermode queue\n");
912 		break;
913 
914 	default:
915 		drm_dbg_driver(dev, "Invalid user queue op specified: %d\n", args->in.op);
916 		return -EINVAL;
917 	}
918 
919 	return r;
920 }
921 
922 static int
923 amdgpu_userq_restore_all(struct amdgpu_userq_mgr *uq_mgr)
924 {
925 	struct amdgpu_usermode_queue *queue;
926 	unsigned long queue_id;
927 	int ret = 0, r;
928 
929 	/* Resume all the queues for this process */
930 	xa_for_each(&uq_mgr->userq_xa, queue_id, queue) {
931 		if (!amdgpu_userq_buffer_vas_mapped(queue)) {
932 			drm_file_err(uq_mgr->file,
933 				     "trying restore queue without va mapping\n");
934 			queue->state = AMDGPU_USERQ_STATE_INVALID_VA;
935 			continue;
936 		}
937 
938 		r = amdgpu_userq_restore_helper(queue);
939 		if (r)
940 			ret = r;
941 	}
942 
943 	if (ret)
944 		drm_file_err(uq_mgr->file, "Failed to map all the queues\n");
945 	return ret;
946 }
947 
948 static int amdgpu_userq_validate_vm(void *param, struct amdgpu_bo *bo)
949 {
950 	struct ttm_operation_ctx ctx = { false, false };
951 
952 	amdgpu_bo_placement_from_domain(bo, bo->allowed_domains);
953 	return ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
954 }
955 
956 /* Handle all BOs on the invalidated list, validate them and update the PTs */
957 static int
958 amdgpu_userq_bo_validate(struct amdgpu_device *adev, struct drm_exec *exec,
959 			 struct amdgpu_vm *vm)
960 {
961 	struct ttm_operation_ctx ctx = { false, false };
962 	struct amdgpu_bo_va *bo_va;
963 	struct amdgpu_bo *bo;
964 	int ret;
965 
966 	spin_lock(&vm->status_lock);
967 	while (!list_empty(&vm->invalidated)) {
968 		bo_va = list_first_entry(&vm->invalidated,
969 					 struct amdgpu_bo_va,
970 					 base.vm_status);
971 		spin_unlock(&vm->status_lock);
972 
973 		bo = bo_va->base.bo;
974 		ret = drm_exec_prepare_obj(exec, &bo->tbo.base, 2);
975 		if (unlikely(ret))
976 			return ret;
977 
978 		amdgpu_bo_placement_from_domain(bo, bo->allowed_domains);
979 		ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
980 		if (ret)
981 			return ret;
982 
983 		/* This moves the bo_va to the done list */
984 		ret = amdgpu_vm_bo_update(adev, bo_va, false);
985 		if (ret)
986 			return ret;
987 
988 		spin_lock(&vm->status_lock);
989 	}
990 	spin_unlock(&vm->status_lock);
991 
992 	return 0;
993 }
994 
995 /* Make sure the whole VM is ready to be used */
996 static int
997 amdgpu_userq_vm_validate(struct amdgpu_userq_mgr *uq_mgr)
998 {
999 	struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
1000 	bool invalidated = false, new_addition = false;
1001 	struct ttm_operation_ctx ctx = { true, false };
1002 	struct amdgpu_device *adev = uq_mgr->adev;
1003 	struct amdgpu_hmm_range *range;
1004 	struct amdgpu_vm *vm = &fpriv->vm;
1005 	unsigned long key, tmp_key;
1006 	struct amdgpu_bo_va *bo_va;
1007 	struct amdgpu_bo *bo;
1008 	struct drm_exec exec;
1009 	struct xarray xa;
1010 	int ret;
1011 
1012 	xa_init(&xa);
1013 
1014 retry_lock:
1015 	drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 0);
1016 	drm_exec_until_all_locked(&exec) {
1017 		ret = amdgpu_vm_lock_pd(vm, &exec, 1);
1018 		drm_exec_retry_on_contention(&exec);
1019 		if (unlikely(ret))
1020 			goto unlock_all;
1021 
1022 		ret = amdgpu_vm_lock_done_list(vm, &exec, 1);
1023 		drm_exec_retry_on_contention(&exec);
1024 		if (unlikely(ret))
1025 			goto unlock_all;
1026 
1027 		/* This validates PDs, PTs and per VM BOs */
1028 		ret = amdgpu_vm_validate(adev, vm, NULL,
1029 					 amdgpu_userq_validate_vm,
1030 					 NULL);
1031 		if (unlikely(ret))
1032 			goto unlock_all;
1033 
1034 		/* This locks and validates the remaining evicted BOs */
1035 		ret = amdgpu_userq_bo_validate(adev, &exec, vm);
1036 		drm_exec_retry_on_contention(&exec);
1037 		if (unlikely(ret))
1038 			goto unlock_all;
1039 	}
1040 
1041 	if (invalidated) {
1042 		xa_for_each(&xa, tmp_key, range) {
1043 			bo = range->bo;
1044 			amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_CPU);
1045 			ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
1046 			if (ret)
1047 				goto unlock_all;
1048 
1049 			amdgpu_ttm_tt_set_user_pages(bo->tbo.ttm, range);
1050 
1051 			amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_GTT);
1052 			ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
1053 			if (ret)
1054 				goto unlock_all;
1055 		}
1056 		invalidated = false;
1057 	}
1058 
1059 	ret = amdgpu_vm_handle_moved(adev, vm, NULL);
1060 	if (ret)
1061 		goto unlock_all;
1062 
1063 	key = 0;
1064 	/* Validate User Ptr BOs */
1065 	list_for_each_entry(bo_va, &vm->done, base.vm_status) {
1066 		bo = bo_va->base.bo;
1067 		if (!bo)
1068 			continue;
1069 
1070 		if (!amdgpu_ttm_tt_is_userptr(bo->tbo.ttm))
1071 			continue;
1072 
1073 		range = xa_load(&xa, key);
1074 		if (range && range->bo != bo) {
1075 			xa_erase(&xa, key);
1076 			amdgpu_hmm_range_free(range);
1077 			range = NULL;
1078 		}
1079 
1080 		if (!range) {
1081 			range = amdgpu_hmm_range_alloc(bo);
1082 			if (!range) {
1083 				ret = -ENOMEM;
1084 				goto unlock_all;
1085 			}
1086 
1087 			xa_store(&xa, key, range, GFP_KERNEL);
1088 			new_addition = true;
1089 		}
1090 		key++;
1091 	}
1092 
1093 	if (new_addition) {
1094 		drm_exec_fini(&exec);
1095 		xa_for_each(&xa, tmp_key, range) {
1096 			if (!range)
1097 				continue;
1098 			bo = range->bo;
1099 			ret = amdgpu_ttm_tt_get_user_pages(bo, range);
1100 			if (ret)
1101 				goto unlock_all;
1102 		}
1103 
1104 		invalidated = true;
1105 		new_addition = false;
1106 		goto retry_lock;
1107 	}
1108 
1109 	ret = amdgpu_vm_update_pdes(adev, vm, false);
1110 	if (ret)
1111 		goto unlock_all;
1112 
1113 	/*
1114 	 * We need to wait for all VM updates to finish before restarting the
1115 	 * queues. Using the done list like that is now ok since everything is
1116 	 * locked in place.
1117 	 */
1118 	list_for_each_entry(bo_va, &vm->done, base.vm_status)
1119 		dma_fence_wait(bo_va->last_pt_update, false);
1120 	dma_fence_wait(vm->last_update, false);
1121 
1122 	ret = amdgpu_eviction_fence_replace_fence(&fpriv->evf_mgr, &exec);
1123 	if (ret)
1124 		drm_file_err(uq_mgr->file, "Failed to replace eviction fence\n");
1125 
1126 unlock_all:
1127 	drm_exec_fini(&exec);
1128 	xa_for_each(&xa, tmp_key, range) {
1129 		if (!range)
1130 			continue;
1131 		bo = range->bo;
1132 		amdgpu_hmm_range_free(range);
1133 	}
1134 	xa_destroy(&xa);
1135 	return ret;
1136 }
1137 
1138 static void amdgpu_userq_restore_worker(struct work_struct *work)
1139 {
1140 	struct amdgpu_userq_mgr *uq_mgr = work_to_uq_mgr(work, resume_work.work);
1141 	struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
1142 	int ret;
1143 
1144 	flush_delayed_work(&fpriv->evf_mgr.suspend_work);
1145 
1146 	mutex_lock(&uq_mgr->userq_mutex);
1147 
1148 	ret = amdgpu_userq_vm_validate(uq_mgr);
1149 	if (ret) {
1150 		drm_file_err(uq_mgr->file, "Failed to validate BOs to restore\n");
1151 		goto unlock;
1152 	}
1153 
1154 	ret = amdgpu_userq_restore_all(uq_mgr);
1155 	if (ret) {
1156 		drm_file_err(uq_mgr->file, "Failed to restore all queues\n");
1157 		goto unlock;
1158 	}
1159 
1160 unlock:
1161 	mutex_unlock(&uq_mgr->userq_mutex);
1162 }
1163 
1164 static int
1165 amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
1166 {
1167 	struct amdgpu_usermode_queue *queue;
1168 	unsigned long queue_id;
1169 	int ret = 0, r;
1170 
1171 	amdgpu_userq_detect_and_reset_queues(uq_mgr);
1172 	/* Try to unmap all the queues in this process ctx */
1173 	xa_for_each(&uq_mgr->userq_xa, queue_id, queue) {
1174 		r = amdgpu_userq_preempt_helper(queue);
1175 		if (r)
1176 			ret = r;
1177 	}
1178 
1179 	if (ret)
1180 		drm_file_err(uq_mgr->file, "Couldn't unmap all the queues\n");
1181 	return ret;
1182 }
1183 
1184 void amdgpu_userq_reset_work(struct work_struct *work)
1185 {
1186 	struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
1187 						  userq_reset_work);
1188 	struct amdgpu_reset_context reset_context;
1189 
1190 	memset(&reset_context, 0, sizeof(reset_context));
1191 
1192 	reset_context.method = AMD_RESET_METHOD_NONE;
1193 	reset_context.reset_req_dev = adev;
1194 	reset_context.src = AMDGPU_RESET_SRC_USERQ;
1195 	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
1196 	/*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/
1197 
1198 	amdgpu_device_gpu_recover(adev, NULL, &reset_context);
1199 }
1200 
1201 static int
1202 amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
1203 {
1204 	struct amdgpu_usermode_queue *queue;
1205 	unsigned long queue_id;
1206 	int ret;
1207 
1208 	xa_for_each(&uq_mgr->userq_xa, queue_id, queue) {
1209 		struct dma_fence *f = queue->last_fence;
1210 
1211 		if (!f || dma_fence_is_signaled(f))
1212 			continue;
1213 		ret = dma_fence_wait_timeout(f, true, msecs_to_jiffies(100));
1214 		if (ret <= 0) {
1215 			drm_file_err(uq_mgr->file, "Timed out waiting for fence=%llu:%llu\n",
1216 				     f->context, f->seqno);
1217 			return -ETIMEDOUT;
1218 		}
1219 	}
1220 
1221 	return 0;
1222 }
1223 
1224 void
1225 amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr,
1226 		   struct amdgpu_eviction_fence *ev_fence)
1227 {
1228 	struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
1229 	struct amdgpu_eviction_fence_mgr *evf_mgr = &fpriv->evf_mgr;
1230 	struct amdgpu_device *adev = uq_mgr->adev;
1231 	int ret;
1232 
1233 	/* Wait for any pending userqueue fence work to finish */
1234 	ret = amdgpu_userq_wait_for_signal(uq_mgr);
1235 	if (ret)
1236 		dev_err(adev->dev, "Not evicting userqueue, timeout waiting for work\n");
1237 
1238 	ret = amdgpu_userq_evict_all(uq_mgr);
1239 	if (ret)
1240 		dev_err(adev->dev, "Failed to evict userqueue\n");
1241 
1242 	/* Signal current eviction fence */
1243 	amdgpu_eviction_fence_signal(evf_mgr, ev_fence);
1244 
1245 	if (evf_mgr->fd_closing) {
1246 		cancel_delayed_work_sync(&uq_mgr->resume_work);
1247 		return;
1248 	}
1249 
1250 	/* Schedule a resume work */
1251 	schedule_delayed_work(&uq_mgr->resume_work, 0);
1252 }
1253 
1254 int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct drm_file *file_priv,
1255 			  struct amdgpu_device *adev)
1256 {
1257 	mutex_init(&userq_mgr->userq_mutex);
1258 	xa_init_flags(&userq_mgr->userq_xa, XA_FLAGS_ALLOC);
1259 	userq_mgr->adev = adev;
1260 	userq_mgr->file = file_priv;
1261 
1262 	INIT_DELAYED_WORK(&userq_mgr->resume_work, amdgpu_userq_restore_worker);
1263 	return 0;
1264 }
1265 
1266 void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr)
1267 {
1268 	struct amdgpu_usermode_queue *queue;
1269 	unsigned long queue_id;
1270 
1271 	cancel_delayed_work_sync(&userq_mgr->resume_work);
1272 
1273 	mutex_lock(&userq_mgr->userq_mutex);
1274 	amdgpu_userq_detect_and_reset_queues(userq_mgr);
1275 	xa_for_each(&userq_mgr->userq_xa, queue_id, queue) {
1276 		amdgpu_userq_wait_for_last_fence(queue);
1277 		amdgpu_userq_unmap_helper(queue);
1278 		amdgpu_userq_cleanup(queue, queue_id);
1279 	}
1280 
1281 	xa_destroy(&userq_mgr->userq_xa);
1282 	mutex_unlock(&userq_mgr->userq_mutex);
1283 	mutex_destroy(&userq_mgr->userq_mutex);
1284 }
1285 
1286 int amdgpu_userq_suspend(struct amdgpu_device *adev)
1287 {
1288 	u32 ip_mask = amdgpu_userq_get_supported_ip_mask(adev);
1289 	struct amdgpu_usermode_queue *queue;
1290 	struct amdgpu_userq_mgr *uqm;
1291 	unsigned long queue_id;
1292 	int r;
1293 
1294 	if (!ip_mask)
1295 		return 0;
1296 
1297 	xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
1298 		uqm = queue->userq_mgr;
1299 		cancel_delayed_work_sync(&uqm->resume_work);
1300 		guard(mutex)(&uqm->userq_mutex);
1301 		amdgpu_userq_detect_and_reset_queues(uqm);
1302 		if (adev->in_s0ix)
1303 			r = amdgpu_userq_preempt_helper(queue);
1304 		else
1305 			r = amdgpu_userq_unmap_helper(queue);
1306 		if (r)
1307 			return r;
1308 	}
1309 	return 0;
1310 }
1311 
1312 int amdgpu_userq_resume(struct amdgpu_device *adev)
1313 {
1314 	u32 ip_mask = amdgpu_userq_get_supported_ip_mask(adev);
1315 	struct amdgpu_usermode_queue *queue;
1316 	struct amdgpu_userq_mgr *uqm;
1317 	unsigned long queue_id;
1318 	int r;
1319 
1320 	if (!ip_mask)
1321 		return 0;
1322 
1323 	xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
1324 		uqm = queue->userq_mgr;
1325 		guard(mutex)(&uqm->userq_mutex);
1326 		if (adev->in_s0ix)
1327 			r = amdgpu_userq_restore_helper(queue);
1328 		else
1329 			r = amdgpu_userq_map_helper(queue);
1330 		if (r)
1331 			return r;
1332 	}
1333 
1334 	return 0;
1335 }
1336 
1337 int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
1338 						  u32 idx)
1339 {
1340 	u32 ip_mask = amdgpu_userq_get_supported_ip_mask(adev);
1341 	struct amdgpu_usermode_queue *queue;
1342 	struct amdgpu_userq_mgr *uqm;
1343 	unsigned long queue_id;
1344 	int ret = 0, r;
1345 
1346 	/* only need to stop gfx/compute */
1347 	if (!(ip_mask & ((1 << AMDGPU_HW_IP_GFX) | (1 << AMDGPU_HW_IP_COMPUTE))))
1348 		return 0;
1349 
1350 	if (adev->userq_halt_for_enforce_isolation)
1351 		dev_warn(adev->dev, "userq scheduling already stopped!\n");
1352 	adev->userq_halt_for_enforce_isolation = true;
1353 	xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
1354 		uqm = queue->userq_mgr;
1355 		cancel_delayed_work_sync(&uqm->resume_work);
1356 		mutex_lock(&uqm->userq_mutex);
1357 		if (((queue->queue_type == AMDGPU_HW_IP_GFX) ||
1358 		     (queue->queue_type == AMDGPU_HW_IP_COMPUTE)) &&
1359 		    (queue->xcp_id == idx)) {
1360 			amdgpu_userq_detect_and_reset_queues(uqm);
1361 			r = amdgpu_userq_preempt_helper(queue);
1362 			if (r)
1363 				ret = r;
1364 		}
1365 		mutex_unlock(&uqm->userq_mutex);
1366 	}
1367 
1368 	return ret;
1369 }
1370 
1371 int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
1372 						   u32 idx)
1373 {
1374 	u32 ip_mask = amdgpu_userq_get_supported_ip_mask(adev);
1375 	struct amdgpu_usermode_queue *queue;
1376 	struct amdgpu_userq_mgr *uqm;
1377 	unsigned long queue_id;
1378 	int ret = 0, r;
1379 
1380 	/* only need to stop gfx/compute */
1381 	if (!(ip_mask & ((1 << AMDGPU_HW_IP_GFX) | (1 << AMDGPU_HW_IP_COMPUTE))))
1382 		return 0;
1383 
1384 	if (!adev->userq_halt_for_enforce_isolation)
1385 		dev_warn(adev->dev, "userq scheduling already started!\n");
1386 	adev->userq_halt_for_enforce_isolation = false;
1387 	xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
1388 		uqm = queue->userq_mgr;
1389 		mutex_lock(&uqm->userq_mutex);
1390 			if (((queue->queue_type == AMDGPU_HW_IP_GFX) ||
1391 			     (queue->queue_type == AMDGPU_HW_IP_COMPUTE)) &&
1392 			    (queue->xcp_id == idx)) {
1393 			r = amdgpu_userq_restore_helper(queue);
1394 			if (r)
1395 				ret = r;
1396 			}
1397 		mutex_unlock(&uqm->userq_mutex);
1398 	}
1399 
1400 	return ret;
1401 }
1402 
1403 int amdgpu_userq_gem_va_unmap_validate(struct amdgpu_device *adev,
1404 				       struct amdgpu_bo_va_mapping *mapping,
1405 				       uint64_t saddr)
1406 {
1407 	u32 ip_mask = amdgpu_userq_get_supported_ip_mask(adev);
1408 	struct amdgpu_bo_va *bo_va = mapping->bo_va;
1409 	struct dma_resv *resv = bo_va->base.bo->tbo.base.resv;
1410 	int ret = 0;
1411 
1412 	if (!ip_mask)
1413 		return 0;
1414 
1415 	dev_warn_once(adev->dev, "now unmapping a vital queue va:%llx\n", saddr);
1416 	/**
1417 	 * The userq VA mapping reservation should include the eviction fence,
1418 	 * if the eviction fence can't signal successfully during unmapping,
1419 	 * then driver will warn to flag this improper unmap of the userq VA.
1420 	 * Note: The eviction fence may be attached to different BOs, and this
1421 	 * unmap is only for one kind of userq VAs, so at this point suppose
1422 	 * the eviction fence is always unsignaled.
1423 	 */
1424 	if (!dma_resv_test_signaled(resv, DMA_RESV_USAGE_BOOKKEEP)) {
1425 		ret = dma_resv_wait_timeout(resv, DMA_RESV_USAGE_BOOKKEEP, true,
1426 					    MAX_SCHEDULE_TIMEOUT);
1427 		if (ret <= 0)
1428 			return -EBUSY;
1429 	}
1430 
1431 	return 0;
1432 }
1433 
1434 void amdgpu_userq_pre_reset(struct amdgpu_device *adev)
1435 {
1436 	const struct amdgpu_userq_funcs *userq_funcs;
1437 	struct amdgpu_usermode_queue *queue;
1438 	struct amdgpu_userq_mgr *uqm;
1439 	unsigned long queue_id;
1440 
1441 	xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
1442 		uqm = queue->userq_mgr;
1443 		cancel_delayed_work_sync(&uqm->resume_work);
1444 		if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
1445 			amdgpu_userq_wait_for_last_fence(queue);
1446 			userq_funcs = adev->userq_funcs[queue->queue_type];
1447 			userq_funcs->unmap(queue);
1448 			/* just mark all queues as hung at this point.
1449 			 * if unmap succeeds, we could map again
1450 			 * in amdgpu_userq_post_reset() if vram is not lost
1451 			 */
1452 			queue->state = AMDGPU_USERQ_STATE_HUNG;
1453 			amdgpu_userq_fence_driver_force_completion(queue);
1454 		}
1455 	}
1456 }
1457 
1458 int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost)
1459 {
1460 	/* if any queue state is AMDGPU_USERQ_STATE_UNMAPPED
1461 	 * at this point, we should be able to map it again
1462 	 * and continue if vram is not lost.
1463 	 */
1464 	struct amdgpu_usermode_queue *queue;
1465 	const struct amdgpu_userq_funcs *userq_funcs;
1466 	unsigned long queue_id;
1467 	int r = 0;
1468 
1469 	xa_for_each(&adev->userq_doorbell_xa, queue_id, queue) {
1470 		if (queue->state == AMDGPU_USERQ_STATE_HUNG && !vram_lost) {
1471 			userq_funcs = adev->userq_funcs[queue->queue_type];
1472 			/* Re-map queue */
1473 			r = userq_funcs->map(queue);
1474 			if (r) {
1475 				dev_err(adev->dev, "Failed to remap queue %ld\n", queue_id);
1476 				continue;
1477 			}
1478 			queue->state = AMDGPU_USERQ_STATE_MAPPED;
1479 		}
1480 	}
1481 
1482 	return r;
1483 }
1484