// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
 * Copyright 2014-2022 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#include <linux/slab.h>
#include "kfd_priv.h"
#include "kfd_topology.h"
#include "kfd_svm.h"

void print_queue_properties(struct queue_properties *q)
{
	if (!q)
		return;

	pr_debug("Printing queue properties:\n");
	pr_debug("Queue Type: %u\n", q->type);
	pr_debug("Queue Size: %llu\n", q->queue_size);
	pr_debug("Queue percent: %u\n", q->queue_percent);
	pr_debug("Queue Address: 0x%llX\n", q->queue_address);
	pr_debug("Queue Id: %u\n", q->queue_id);
	pr_debug("Queue Process Vmid: %u\n", q->vmid);
	pr_debug("Queue Read Pointer: 0x%px\n", q->read_ptr);
	pr_debug("Queue Write Pointer: 0x%px\n", q->write_ptr);
	pr_debug("Queue Doorbell Pointer: 0x%p\n", q->doorbell_ptr);
	pr_debug("Queue Doorbell Offset: %u\n", q->doorbell_off);
}

void print_queue(struct queue *q)
{
	if (!q)
		return;
	pr_debug("Printing queue:\n");
	pr_debug("Queue Type: %u\n", q->properties.type);
	pr_debug("Queue Size: %llu\n", q->properties.queue_size);
	pr_debug("Queue percent: %u\n", q->properties.queue_percent);
	pr_debug("Queue Address: 0x%llX\n", q->properties.queue_address);
	pr_debug("Queue Id: %u\n", q->properties.queue_id);
	pr_debug("Queue Process Vmid: %u\n", q->properties.vmid);
	pr_debug("Queue Read Pointer: 0x%px\n", q->properties.read_ptr);
	pr_debug("Queue Write Pointer: 0x%px\n", q->properties.write_ptr);
	pr_debug("Queue Doorbell Pointer: 0x%p\n", q->properties.doorbell_ptr);
	pr_debug("Queue Doorbell Offset: %u\n", q->properties.doorbell_off);
	pr_debug("Queue MQD Address: 0x%p\n", q->mqd);
	pr_debug("Queue MQD Gart: 0x%llX\n", q->gart_mqd_addr);
	pr_debug("Queue Process Address: 0x%p\n", q->process);
	pr_debug("Queue Device Address: 0x%p\n", q->device);
}

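/**
 * init_queue - allocate a queue and take a copy of the requested properties
 * @q: returns the newly allocated queue
 * @properties: properties requested by the caller, copied verbatim
 *
 * Only the bookkeeping structure is created here; MQD, doorbell and HQD setup
 * happen later in the queue manager paths. A typical caller (sketch, error
 * handling trimmed) pairs it with uninit_queue() below:
 *
 *	struct queue *q;
 *
 *	if (init_queue(&q, &properties))
 *		return -ENOMEM;
 *	...
 *	uninit_queue(q);
 *
 * Return: 0 on success, -ENOMEM if the allocation fails.
 */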
int init_queue(struct queue **q, const struct queue_properties *properties)
{
	struct queue *tmp_q;

	tmp_q = kzalloc(sizeof(*tmp_q), GFP_KERNEL);
	if (!tmp_q)
		return -ENOMEM;

	memcpy(&tmp_q->properties, properties, sizeof(*properties));

	*q = tmp_q;
	return 0;
}

void uninit_queue(struct queue *q)
{
	kfree(q);
}

#if IS_ENABLED(CONFIG_HSA_AMD_SVM)

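/**
 * kfd_queue_buffer_svm_get - take queue references on the SVM ranges backing a buffer
 * @pdd: process device data of the GPU the queue is created on
 * @addr: start address of the buffer, in bytes
 * @size: size of the buffer, in bytes
 *
 * The buffer may be covered by several pranges because registered ranges are
 * split at granularity boundaries. Every prange in [@addr, @addr + @size) must
 * be registered, mapped with KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED and
 * accessible from @pdd's GPU. On success the queue_refcount of each covered
 * prange is raised so the SVM code can tell that a user queue still depends
 * on it; kfd_queue_buffer_svm_put() drops the references again.
 *
 * Context: takes and releases p->svms.lock.
 *
 * Return: 0 on success, -EINVAL if any part of the buffer is not eligible.
 */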
static int kfd_queue_buffer_svm_get(struct kfd_process_device *pdd, u64 addr, u64 size)
{
	struct kfd_process *p = pdd->process;
	struct list_head update_list;
	struct svm_range *prange;
	int ret = -EINVAL;

	INIT_LIST_HEAD(&update_list);
	addr >>= PAGE_SHIFT;
	size >>= PAGE_SHIFT;

	mutex_lock(&p->svms.lock);

	/*
	 * The range may be split into multiple svm pranges aligned to the
	 * granularity boundary.
	 */
	while (size) {
		uint32_t gpuid, gpuidx;
		int r;

		prange = svm_range_from_addr(&p->svms, addr, NULL);
		if (!prange)
			break;

		if (!prange->mapped_to_gpu)
			break;

		r = kfd_process_gpuid_from_node(p, pdd->dev, &gpuid, &gpuidx);
		if (r < 0)
			break;
		if (!test_bit(gpuidx, prange->bitmap_access) &&
		    !test_bit(gpuidx, prange->bitmap_aip))
			break;

		if (!(prange->flags & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED))
			break;

		list_add(&prange->update_list, &update_list);

		if (prange->last - prange->start + 1 >= size) {
			size = 0;
			break;
		}

		size -= prange->last - prange->start + 1;
		addr += prange->last - prange->start + 1;
	}
	if (size) {
		pr_debug("[0x%llx 0x%llx] not registered\n", addr, addr + size - 1);
		goto out_unlock;
	}

	list_for_each_entry(prange, &update_list, update_list)
		atomic_inc(&prange->queue_refcount);
	ret = 0;

out_unlock:
	mutex_unlock(&p->svms.lock);
	return ret;
}

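/**
 * kfd_queue_buffer_svm_put - drop the queue references taken by kfd_queue_buffer_svm_get
 * @pdd: process device data the references were taken for
 * @addr: start address of the buffer, in bytes
 * @size: size of the buffer, in bytes
 *
 * Walks all pranges intersecting [@addr, @addr + @size) and decrements their
 * queue_refcount, including any child pranges created by later splits, never
 * letting the count drop below zero. That makes it safe to call even when the
 * corresponding get never took a reference.
 *
 * Context: takes and releases p->svms.lock.
 */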
static void kfd_queue_buffer_svm_put(struct kfd_process_device *pdd, u64 addr, u64 size)
{
	struct kfd_process *p = pdd->process;
	struct svm_range *prange, *pchild;
	struct interval_tree_node *node;
	unsigned long last;

	addr >>= PAGE_SHIFT;
	last = addr + (size >> PAGE_SHIFT) - 1;

	mutex_lock(&p->svms.lock);

	node = interval_tree_iter_first(&p->svms.objects, addr, last);
	while (node) {
		struct interval_tree_node *next_node;
		unsigned long next_start;

		prange = container_of(node, struct svm_range, it_node);
		next_node = interval_tree_iter_next(node, addr, last);
		next_start = min(node->last, last) + 1;

		if (atomic_add_unless(&prange->queue_refcount, -1, 0)) {
			list_for_each_entry(pchild, &prange->child_list, child_list)
				atomic_add_unless(&pchild->queue_refcount, -1, 0);
		}

		node = next_node;
		addr = next_start;
	}

	mutex_unlock(&p->svms.lock);
}
#else

static int kfd_queue_buffer_svm_get(struct kfd_process_device *pdd, u64 addr, u64 size)
{
	return -EINVAL;
}

static void kfd_queue_buffer_svm_put(struct kfd_process_device *pdd, u64 addr, u64 size)
{
}

#endif

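/**
 * kfd_queue_buffer_get - take a reference on the GEM BO backing a queue buffer
 * @vm: GPUVM of the process the queue belongs to
 * @addr: user virtual address of the buffer
 * @pbo: returns a reference to the backing amdgpu_bo, NULL on failure
 * @expected_size: required size of the mapping in bytes, 0 to skip the size check
 *
 * @addr must be exactly the start of a GPUVM mapping and, unless
 * @expected_size is 0, the mapping must be exactly @expected_size bytes long.
 * On success a BO reference is taken and the bo_va queue_refcount is raised so
 * the mapping is not released while the queue still uses it.
 *
 * Context: the caller must hold the reservation of the VM root BO.
 *
 * Return: 0 on success, -EINVAL if no suitable mapping is found.
 */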
int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user *addr, struct amdgpu_bo **pbo,
			 u64 expected_size)
{
	struct amdgpu_bo_va_mapping *mapping;
	u64 user_addr;
	u64 size;

	user_addr = (u64)addr >> AMDGPU_GPU_PAGE_SHIFT;
	size = expected_size >> AMDGPU_GPU_PAGE_SHIFT;

	mapping = amdgpu_vm_bo_lookup_mapping(vm, user_addr);
	if (!mapping)
		goto out_err;

	if (user_addr != mapping->start ||
	    (size != 0 && user_addr + size - 1 != mapping->last)) {
		pr_debug("expected size 0x%llx not equal to mapping addr 0x%llx size 0x%llx\n",
			expected_size, mapping->start << AMDGPU_GPU_PAGE_SHIFT,
			(mapping->last - mapping->start + 1) << AMDGPU_GPU_PAGE_SHIFT);
		goto out_err;
	}

	*pbo = amdgpu_bo_ref(mapping->bo_va->base.bo);
	mapping->bo_va->queue_refcount++;
	return 0;

out_err:
	*pbo = NULL;
	return -EINVAL;
}

/* FIXME: remove this function, just call amdgpu_bo_unref directly */
void kfd_queue_buffer_put(struct amdgpu_bo **bo)
{
	amdgpu_bo_unref(bo);
}

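/**
 * kfd_queue_acquire_buffers - validate and take references on all buffers of a user queue
 * @pdd: process device data of the GPU the queue is created on
 * @properties: queue properties supplied by the creating process
 *
 * Checks that the read/write pointers, the ring buffer and (for compute
 * queues) the optional EOP buffer are GPUVM mappings of the sizes the target
 * GPU expects, that the control stack and CWSR area sizes match the node
 * properties, and that the CWSR area is backed either by a GPUVM mapping or
 * by always-mapped SVM ranges. References are taken so the buffers cannot go
 * away while the queue exists; kfd_queue_release_buffers() and
 * kfd_queue_unref_bo_vas() undo this.
 *
 * Return: 0 on success, negative errno on failure.
 */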
int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_properties *properties)
{
	struct kfd_topology_device *topo_dev;
	u64 expected_queue_size;
	struct amdgpu_vm *vm;
	u32 total_cwsr_size;
	int err;

	topo_dev = kfd_topology_device_by_id(pdd->dev->id);
	if (!topo_dev)
		return -EINVAL;

	/*
	 * AQL queues on GFX7 and GFX8 appear at twice their actual size, so
	 * only half of the reported queue_size is expected to be mapped
	 * (e.g. a queue_size of 128 KB corresponds to a 64 KB ring buffer).
	 */
	if (properties->type == KFD_QUEUE_TYPE_COMPUTE &&
	    properties->format == KFD_QUEUE_FORMAT_AQL &&
	    topo_dev->node_props.gfx_target_version >= 70000 &&
	    topo_dev->node_props.gfx_target_version < 90000)
		/* metadata_queue_size is not supported on GFX7/GFX8 */
		expected_queue_size =
			properties->queue_size / 2;
	else
		expected_queue_size =
			properties->queue_size + properties->metadata_queue_size;

	vm = drm_priv_to_vm(pdd->drm_priv);
	err = amdgpu_bo_reserve(vm->root.bo, false);
	if (err)
		return err;

	err = kfd_queue_buffer_get(vm, properties->write_ptr, &properties->wptr_bo, PAGE_SIZE);
	if (err)
		goto out_err_unreserve;

	err = kfd_queue_buffer_get(vm, properties->read_ptr, &properties->rptr_bo, PAGE_SIZE);
	if (err)
		goto out_err_unreserve;

	err = kfd_queue_buffer_get(vm, (void *)properties->queue_address,
				   &properties->ring_bo, expected_queue_size);
	if (err)
		goto out_err_unreserve;

	/* Only compute queues require an EOP buffer and a CWSR area */
	if (properties->type != KFD_QUEUE_TYPE_COMPUTE)
		goto out_unreserve;

	/* The EOP buffer is not required on all ASICs */
	if (properties->eop_ring_buffer_address) {
		if (properties->eop_ring_buffer_size != topo_dev->node_props.eop_buffer_size) {
			pr_debug("queue eop bo size 0x%x not equal to node eop buf size 0x%x\n",
				properties->eop_ring_buffer_size,
				topo_dev->node_props.eop_buffer_size);
			err = -EINVAL;
			goto out_err_unreserve;
		}
		err = kfd_queue_buffer_get(vm, (void *)properties->eop_ring_buffer_address,
					   &properties->eop_buf_bo,
					   properties->eop_ring_buffer_size);
		if (err)
			goto out_err_unreserve;
	}

	if (properties->ctl_stack_size != topo_dev->node_props.ctl_stack_size) {
		pr_debug("queue ctl stack size 0x%x not equal to node ctl stack size 0x%x\n",
			properties->ctl_stack_size,
			topo_dev->node_props.ctl_stack_size);
		err = -EINVAL;
		goto out_err_unreserve;
	}

	if (properties->ctx_save_restore_area_size < topo_dev->node_props.cwsr_size) {
		pr_debug("queue cwsr size 0x%x not sufficient for node cwsr size 0x%x\n",
			properties->ctx_save_restore_area_size,
			topo_dev->node_props.cwsr_size);
		err = -EINVAL;
		goto out_err_unreserve;
	}

	total_cwsr_size = (properties->ctx_save_restore_area_size +
			   topo_dev->node_props.debug_memory_size) * NUM_XCC(pdd->dev->xcc_mask);
	total_cwsr_size = ALIGN(total_cwsr_size, PAGE_SIZE);

	err = kfd_queue_buffer_get(vm, (void *)properties->ctx_save_restore_area_address,
				   &properties->cwsr_bo, total_cwsr_size);
	if (!err)
		goto out_unreserve;

	amdgpu_bo_unreserve(vm->root.bo);

	err = kfd_queue_buffer_svm_get(pdd, properties->ctx_save_restore_area_address,
				       total_cwsr_size);
	if (err)
		goto out_err_release;

	return 0;

out_unreserve:
	amdgpu_bo_unreserve(vm->root.bo);
	return 0;

out_err_unreserve:
	amdgpu_bo_unreserve(vm->root.bo);
out_err_release:
	/* FIXME: make a _locked version of this that can be called before
	 * dropping the VM reservation.
	 */
	kfd_queue_unref_bo_vas(pdd, properties);
	kfd_queue_release_buffers(pdd, properties);
	return err;
}

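/**
 * kfd_queue_release_buffers - drop the buffer references taken at queue creation
 * @pdd: process device data of the GPU the queue was created on
 * @properties: queue properties holding the BO references
 *
 * Counterpart of kfd_queue_acquire_buffers(). Drops the amdgpu_bo references
 * and the SVM queue references on the CWSR area. The bo_va queue_refcounts
 * are dropped separately by kfd_queue_unref_bo_vas(), which needs the VM
 * reservation.
 *
 * Return: 0 on success, -EINVAL if the topology device can no longer be found.
 */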
int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct queue_properties *properties)
{
	struct kfd_topology_device *topo_dev;
	u32 total_cwsr_size;

	kfd_queue_buffer_put(&properties->wptr_bo);
	kfd_queue_buffer_put(&properties->rptr_bo);
	kfd_queue_buffer_put(&properties->ring_bo);
	kfd_queue_buffer_put(&properties->eop_buf_bo);
	kfd_queue_buffer_put(&properties->cwsr_bo);

	topo_dev = kfd_topology_device_by_id(pdd->dev->id);
	if (!topo_dev)
		return -EINVAL;
	total_cwsr_size = (properties->ctx_save_restore_area_size +
			   topo_dev->node_props.debug_memory_size) * NUM_XCC(pdd->dev->xcc_mask);
	total_cwsr_size = ALIGN(total_cwsr_size, PAGE_SIZE);

	kfd_queue_buffer_svm_put(pdd, properties->ctx_save_restore_area_address, total_cwsr_size);
	return 0;
}

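/*
 * Drop the bo_va queue_refcount taken in kfd_queue_buffer_get(). The caller
 * must hold the reservation of the VM root BO so the bo_va list cannot
 * change under amdgpu_vm_bo_find().
 */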
void kfd_queue_unref_bo_va(struct amdgpu_vm *vm, struct amdgpu_bo **bo)
{
	if (*bo) {
		struct amdgpu_bo_va *bo_va;

		bo_va = amdgpu_vm_bo_find(vm, *bo);
		if (bo_va && bo_va->queue_refcount)
			bo_va->queue_refcount--;
	}
}

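/**
 * kfd_queue_unref_bo_vas - drop the bo_va queue references of all queue buffers
 * @pdd: process device data of the GPU the queue was created on
 * @properties: queue properties holding the BO references
 *
 * Reserves the VM root BO and releases the per-mapping queue_refcounts taken
 * by kfd_queue_buffer_get(), so the mappings can be released again once the
 * queue is gone.
 *
 * Return: 0 on success, negative errno if the VM reservation fails.
 */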
int kfd_queue_unref_bo_vas(struct kfd_process_device *pdd,
			   struct queue_properties *properties)
{
	struct amdgpu_vm *vm;
	int err;

	vm = drm_priv_to_vm(pdd->drm_priv);
	err = amdgpu_bo_reserve(vm->root.bo, false);
	if (err)
		return err;

	kfd_queue_unref_bo_va(vm, &properties->wptr_bo);
	kfd_queue_unref_bo_va(vm, &properties->rptr_bo);
	kfd_queue_unref_bo_va(vm, &properties->ring_bo);
	kfd_queue_unref_bo_va(vm, &properties->eop_buf_bo);
	kfd_queue_unref_bo_va(vm, &properties->cwsr_bo);

	amdgpu_bo_unreserve(vm->root.bo);
	return 0;
}

#define DEBUGGER_BYTES_ALIGN	64
#define DEBUGGER_BYTES_PER_WAVE	32

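/*
 * Per-CU sizes, in bytes, of the state that has to be saved on a context
 * switch: SGPRs, VGPRs, LDS and hardware registers. The values depend on the
 * GFX IP version and feed into the CWSR work group data size below.
 */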
static u32 kfd_get_sgpr_size_per_cu(u32 gfxv)
{
	u32 sgpr_size = 0x4000;

	if (gfxv == 120500 ||
	    gfxv == 120501)
		sgpr_size = 0x8000;

	return sgpr_size;
}

static u32 kfd_get_vgpr_size_per_cu(u32 gfxv)
{
	u32 vgpr_size = 0x40000;

	if (gfxv == 90402 ||			/* GFX_VERSION_AQUA_VANJARAM */
	    gfxv == 90010 ||			/* GFX_VERSION_ALDEBARAN */
	    gfxv == 90008 ||			/* GFX_VERSION_ARCTURUS */
	    gfxv == 90500)
		vgpr_size = 0x80000;
	else if (gfxv == 110000 ||		/* GFX_VERSION_PLUM_BONITO */
		 gfxv == 110001 ||		/* GFX_VERSION_WHEAT_NAS */
		 gfxv == 110501 ||		/* GFX_VERSION_GFX1151 */
		 gfxv == 120000 ||		/* GFX_VERSION_GFX1200 */
		 gfxv == 120001)		/* GFX_VERSION_GFX1201 */
		vgpr_size = 0x60000;
	else if (gfxv == 120500 ||		/* GFX_VERSION_GFX1250 */
		 gfxv == 120501)		/* GFX_VERSION_GFX1251 */
		vgpr_size = 0x80000;

	return vgpr_size;
}

static u32 kfd_get_hwreg_size_per_cu(u32 gfxv)
{
	u32 hwreg_size = 0x1000;

	if (gfxv == 120500 || gfxv == 120501)
		hwreg_size = 0x8000;

	return hwreg_size;
}

static u32 kfd_get_lds_size_per_cu(u32 gfxv, struct kfd_node_properties *props)
{
	u32 lds_size = 0x10000;

	if (gfxv == 90500 || gfxv == 120500 || gfxv == 120501)
		lds_size = props->lds_size_in_kb << 10;

	return lds_size;
}

static u32 get_num_waves(struct kfd_node_properties *props, u32 gfxv, u32 cu_num)
{
	u32 wave_num = 0;

	if (gfxv < 100100)
		wave_num = min(cu_num * 40,
				props->array_count / props->simd_arrays_per_engine * 512);
	else if (gfxv < 120500)
		wave_num = cu_num * 32;
	else if (gfxv <= 120501)
		wave_num = cu_num * 64;

	WARN_ON(wave_num == 0);

	return wave_num;
}

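/*
 * Workgroup context data that has to be saved per CU when a wave is preempted
 * with CWSR, plus the number of control stack bytes needed per wave. Together
 * with the save area header these determine the CWSR buffer sizes computed in
 * kfd_queue_ctx_save_restore_size() below.
 */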
#define WG_CONTEXT_DATA_SIZE_PER_CU(gfxv, props)	\
	(kfd_get_vgpr_size_per_cu(gfxv) + kfd_get_sgpr_size_per_cu(gfxv) +\
	 kfd_get_lds_size_per_cu(gfxv, props) + kfd_get_hwreg_size_per_cu(gfxv))

#define CNTL_STACK_BYTES_PER_WAVE(gfxv)	\
	((gfxv) >= 100100 ? 12 : 8)	/* GFX_VERSION_NAVI10 */

#define SIZEOF_HSA_USER_CONTEXT_SAVE_AREA_HEADER 40

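/**
 * kfd_queue_ctx_save_restore_size - compute the per-queue CWSR sizes for a node
 * @dev: topology device whose node properties are filled in
 *
 * Derives from the GFX IP version and the CU configuration how much memory a
 * user queue needs for compute wave save restore:
 *
 *	wg_data_size   = ALIGN(cu_num * (VGPRs + SGPRs + LDS + HWREGs per CU), PAGE_SIZE)
 *	ctl_stack_size = ALIGN(header + wave_num * stack bytes per wave + 8, PAGE_SIZE)
 *	cwsr_size      = ctl_stack_size + wg_data_size
 *
 * It also sets debug_memory_size for the debugger save area and, where
 * needed, the EOP buffer size. ASICs older than Carrizo (GFX8) do not support
 * CWSR, so nothing is set for them.
 */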
void kfd_queue_ctx_save_restore_size(struct kfd_topology_device *dev)
{
	struct kfd_node_properties *props = &dev->node_props;
	u32 gfxv = props->gfx_target_version;
	u32 ctl_stack_size;
	u32 wg_data_size;
	u32 wave_num;
	u32 cu_num;

	if (gfxv < 80001)	/* GFX_VERSION_CARRIZO */
		return;

	cu_num = props->simd_count / props->simd_per_cu / NUM_XCC(dev->gpu->xcc_mask);
	wave_num = get_num_waves(props, gfxv, cu_num);

	wg_data_size = ALIGN(cu_num * WG_CONTEXT_DATA_SIZE_PER_CU(gfxv, props), PAGE_SIZE);
	ctl_stack_size = wave_num * CNTL_STACK_BYTES_PER_WAVE(gfxv) + 8;
	ctl_stack_size = ALIGN(SIZEOF_HSA_USER_CONTEXT_SAVE_AREA_HEADER + ctl_stack_size,
			       PAGE_SIZE);

	if ((gfxv / 10000 * 10000) == 100000) {
		/* HW design limits control stack size to 0x7000.
		 * This is insufficient for theoretical PM4 cases
		 * but sufficient for AQL, limited by SPI events.
		 */
		ctl_stack_size = min(ctl_stack_size, 0x7000);
	}

	props->ctl_stack_size = ctl_stack_size;
	props->debug_memory_size = ALIGN(wave_num * DEBUGGER_BYTES_PER_WAVE, DEBUGGER_BYTES_ALIGN);
	props->cwsr_size = ctl_stack_size + wg_data_size;

	if (gfxv == 80002)	/* GFX_VERSION_TONGA */
		props->eop_buffer_size = 0x8000;
	else if (gfxv == 90402)	/* GFX_VERSION_AQUA_VANJARAM */
		props->eop_buffer_size = 4096;
	else if (gfxv >= 80000)
		props->eop_buffer_size = 4096;
}
519