xref: /linux/drivers/gpu/drm/amd/amdkfd/kfd_queue.c (revision c3f15273721f2ee60d32fc7d4f2c233a1eff47a8)
// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
 * Copyright 2014-2022 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#include <linux/slab.h>
#include "kfd_priv.h"
#include "kfd_topology.h"
#include "kfd_svm.h"

void print_queue_properties(struct queue_properties *q)
{
	if (!q)
		return;

	pr_debug("Printing queue properties:\n");
	pr_debug("Queue Type: %u\n", q->type);
	pr_debug("Queue Size: %llu\n", q->queue_size);
	pr_debug("Queue percent: %u\n", q->queue_percent);
	pr_debug("Queue Address: 0x%llX\n", q->queue_address);
	pr_debug("Queue Id: %u\n", q->queue_id);
	pr_debug("Queue Process Vmid: %u\n", q->vmid);
	pr_debug("Queue Read Pointer: 0x%px\n", q->read_ptr);
	pr_debug("Queue Write Pointer: 0x%px\n", q->write_ptr);
	pr_debug("Queue Doorbell Pointer: 0x%p\n", q->doorbell_ptr);
	pr_debug("Queue Doorbell Offset: %u\n", q->doorbell_off);
}

void print_queue(struct queue *q)
{
	if (!q)
		return;
	pr_debug("Printing queue:\n");
	pr_debug("Queue Type: %u\n", q->properties.type);
	pr_debug("Queue Size: %llu\n", q->properties.queue_size);
	pr_debug("Queue percent: %u\n", q->properties.queue_percent);
	pr_debug("Queue Address: 0x%llX\n", q->properties.queue_address);
	pr_debug("Queue Id: %u\n", q->properties.queue_id);
	pr_debug("Queue Process Vmid: %u\n", q->properties.vmid);
	pr_debug("Queue Read Pointer: 0x%px\n", q->properties.read_ptr);
	pr_debug("Queue Write Pointer: 0x%px\n", q->properties.write_ptr);
	pr_debug("Queue Doorbell Pointer: 0x%p\n", q->properties.doorbell_ptr);
	pr_debug("Queue Doorbell Offset: %u\n", q->properties.doorbell_off);
	pr_debug("Queue MQD Address: 0x%p\n", q->mqd);
	pr_debug("Queue MQD Gart: 0x%llX\n", q->gart_mqd_addr);
	pr_debug("Queue Process Address: 0x%p\n", q->process);
	pr_debug("Queue Device Address: 0x%p\n", q->device);
}

int init_queue(struct queue **q, const struct queue_properties *properties)
{
	struct queue *tmp_q;

	tmp_q = kzalloc(sizeof(*tmp_q), GFP_KERNEL);
	if (!tmp_q)
		return -ENOMEM;

	memcpy(&tmp_q->properties, properties, sizeof(*properties));

	*q = tmp_q;
	return 0;
}

void uninit_queue(struct queue *q)
{
	kfree(q);
}

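/*
 * kfd_queue_buffer_svm_get - take queue references on the SVM ranges backing
 * a buffer
 *
 * The buffer [addr, addr + size) may be covered by multiple SVM ranges. Every
 * page must belong to a registered range that is mapped on this GPU with the
 * KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED flag set, otherwise -EINVAL is
 * returned. On success the queue_refcount of each covering range is raised to
 * mark it as in use by a user queue.
 */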
static int kfd_queue_buffer_svm_get(struct kfd_process_device *pdd, u64 addr, u64 size)
{
	struct kfd_process *p = pdd->process;
	struct list_head update_list;
	struct svm_range *prange;
	int ret = -EINVAL;

	INIT_LIST_HEAD(&update_list);
	addr >>= PAGE_SHIFT;
	size >>= PAGE_SHIFT;

	mutex_lock(&p->svms.lock);

	/*
	 * The range may be split into multiple svm ranges aligned to the
	 * granularity boundary.
	 */
	while (size) {
		uint32_t gpuid, gpuidx;
		int r;

		prange = svm_range_from_addr(&p->svms, addr, NULL);
		if (!prange)
			break;

		if (!prange->mapped_to_gpu)
			break;

		r = kfd_process_gpuid_from_node(p, pdd->dev, &gpuid, &gpuidx);
		if (r < 0)
			break;
		if (!test_bit(gpuidx, prange->bitmap_access) &&
		    !test_bit(gpuidx, prange->bitmap_aip))
			break;

		if (!(prange->flags & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED))
			break;

		list_add(&prange->update_list, &update_list);

		if (prange->last - prange->start + 1 >= size) {
			size = 0;
			break;
		}

		size -= prange->last - prange->start + 1;
		addr += prange->last - prange->start + 1;
	}
	if (size) {
		pr_debug("[0x%llx 0x%llx] not registered\n", addr, addr + size - 1);
		goto out_unlock;
	}

	list_for_each_entry(prange, &update_list, update_list)
		atomic_inc(&prange->queue_refcount);
	ret = 0;

out_unlock:
	mutex_unlock(&p->svms.lock);
	return ret;
}

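/*
 * kfd_queue_buffer_svm_put - drop the queue references taken by
 * kfd_queue_buffer_svm_get
 *
 * Walk all SVM ranges overlapping [addr, addr + size) and decrement their
 * queue_refcount, including child ranges created if a range was split after
 * the reference was taken. atomic_add_unless keeps counts that were never
 * raised from going negative.
 */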
static void kfd_queue_buffer_svm_put(struct kfd_process_device *pdd, u64 addr, u64 size)
{
	struct kfd_process *p = pdd->process;
	struct svm_range *prange, *pchild;
	struct interval_tree_node *node;
	unsigned long last;

	addr >>= PAGE_SHIFT;
	last = addr + (size >> PAGE_SHIFT) - 1;

	mutex_lock(&p->svms.lock);

	node = interval_tree_iter_first(&p->svms.objects, addr, last);
	while (node) {
		struct interval_tree_node *next_node;
		unsigned long next_start;

		prange = container_of(node, struct svm_range, it_node);
		next_node = interval_tree_iter_next(node, addr, last);
		next_start = min(node->last, last) + 1;

		if (atomic_add_unless(&prange->queue_refcount, -1, 0)) {
			list_for_each_entry(pchild, &prange->child_list, child_list)
				atomic_add_unless(&pchild->queue_refcount, -1, 0);
		}

		node = next_node;
		addr = next_start;
	}

	mutex_unlock(&p->svms.lock);
}

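/*
 * kfd_queue_buffer_get - validate a queue buffer that is backed by a GPU VM
 * mapping and take a reference on its BO
 *
 * The user address must be the start of a mapping and, unless expected_size
 * is zero, the mapping must cover exactly expected_size bytes. On success the
 * BO is returned in *pbo with an extra reference and the bo_va queue_refcount
 * is raised to mark the mapping as in use by a user queue; on failure *pbo is
 * set to NULL and -EINVAL is returned.
 */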
int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user *addr, struct amdgpu_bo **pbo,
			 u64 expected_size)
{
	struct amdgpu_bo_va_mapping *mapping;
	u64 user_addr;
	u64 size;

	user_addr = (u64)addr >> AMDGPU_GPU_PAGE_SHIFT;
	size = expected_size >> AMDGPU_GPU_PAGE_SHIFT;

	mapping = amdgpu_vm_bo_lookup_mapping(vm, user_addr);
	if (!mapping)
		goto out_err;

	if (user_addr != mapping->start ||
	    (size != 0 && user_addr + size - 1 != mapping->last)) {
		pr_debug("expected size 0x%llx not equal to mapping addr 0x%llx size 0x%llx\n",
			expected_size, mapping->start << AMDGPU_GPU_PAGE_SHIFT,
			(mapping->last - mapping->start + 1) << AMDGPU_GPU_PAGE_SHIFT);
		goto out_err;
	}

	*pbo = amdgpu_bo_ref(mapping->bo_va->base.bo);
	mapping->bo_va->queue_refcount++;
	return 0;

out_err:
	*pbo = NULL;
	return -EINVAL;
}

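/*
 * kfd_queue_buffer_put - undo kfd_queue_buffer_get
 *
 * Drop the bo_va queue_refcount if the buffer is still mapped in the VM, then
 * release the BO reference. amdgpu_bo_unref clears *bo, so calling this again
 * for the same buffer is harmless.
 */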
void kfd_queue_buffer_put(struct amdgpu_vm *vm, struct amdgpu_bo **bo)
{
	if (*bo) {
		struct amdgpu_bo_va *bo_va;

		bo_va = amdgpu_vm_bo_find(vm, *bo);
		if (bo_va)
			bo_va->queue_refcount--;
	}

	amdgpu_bo_unref(bo);
}

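/*
 * kfd_queue_acquire_buffers - validate the buffers of a new user queue and
 * take references on them
 *
 * The read/write pointers and the ring buffer must be GPU VM mappings of the
 * expected size. For compute queues the optional EOP buffer, the control
 * stack size and the CWSR area size are additionally checked against the node
 * properties. On error, all references taken so far are released again.
 */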
int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_properties *properties)
{
	struct kfd_topology_device *topo_dev;
	struct amdgpu_vm *vm;
	u32 total_cwsr_size;
	int err;

	topo_dev = kfd_topology_device_by_id(pdd->dev->id);
	if (!topo_dev)
		return -EINVAL;

	vm = drm_priv_to_vm(pdd->drm_priv);
	err = amdgpu_bo_reserve(vm->root.bo, false);
	if (err)
		return err;

	err = kfd_queue_buffer_get(vm, properties->write_ptr, &properties->wptr_bo, PAGE_SIZE);
	if (err)
		goto out_err_unreserve;

	err = kfd_queue_buffer_get(vm, properties->read_ptr, &properties->rptr_bo, PAGE_SIZE);
	if (err)
		goto out_err_unreserve;

	err = kfd_queue_buffer_get(vm, (void *)properties->queue_address,
				   &properties->ring_bo, properties->queue_size);
	if (err)
		goto out_err_unreserve;

	/* only compute queues require an EOP buffer and CWSR area */
	if (properties->type != KFD_QUEUE_TYPE_COMPUTE)
		goto out_unreserve;

	/* EOP buffer is not required for all ASICs */
	if (properties->eop_ring_buffer_address) {
		if (properties->eop_ring_buffer_size != topo_dev->node_props.eop_buffer_size) {
			pr_debug("queue eop buf size 0x%x not equal to node eop buf size 0x%x\n",
				properties->eop_ring_buffer_size,
				topo_dev->node_props.eop_buffer_size);
			err = -EINVAL;
			goto out_err_unreserve;
		}
		err = kfd_queue_buffer_get(vm, (void *)properties->eop_ring_buffer_address,
					   &properties->eop_buf_bo,
					   properties->eop_ring_buffer_size);
		if (err)
			goto out_err_unreserve;
	}

	if (properties->ctl_stack_size != topo_dev->node_props.ctl_stack_size) {
		pr_debug("queue ctl stack size 0x%x not equal to node ctl stack size 0x%x\n",
			properties->ctl_stack_size,
			topo_dev->node_props.ctl_stack_size);
		err = -EINVAL;
		goto out_err_unreserve;
	}

	if (properties->ctx_save_restore_area_size != topo_dev->node_props.cwsr_size) {
		pr_debug("queue cwsr size 0x%x not equal to node cwsr size 0x%x\n",
			properties->ctx_save_restore_area_size,
			topo_dev->node_props.cwsr_size);
		err = -EINVAL;
		goto out_err_unreserve;
	}

	total_cwsr_size = (topo_dev->node_props.cwsr_size + topo_dev->node_props.debug_memory_size)
			  * NUM_XCC(pdd->dev->xcc_mask);
	total_cwsr_size = ALIGN(total_cwsr_size, PAGE_SIZE);

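	/* The CWSR area may be a GPU VM allocation or SVM registered memory;
	 * try the VM mapping first and fall back to the SVM ranges.
	 */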
	err = kfd_queue_buffer_get(vm, (void *)properties->ctx_save_restore_area_address,
				   &properties->cwsr_bo, total_cwsr_size);
	if (!err)
		goto out_unreserve;

	amdgpu_bo_unreserve(vm->root.bo);

	err = kfd_queue_buffer_svm_get(pdd, properties->ctx_save_restore_area_address,
				       total_cwsr_size);
	if (err)
		goto out_err_release;

	return 0;

out_unreserve:
	amdgpu_bo_unreserve(vm->root.bo);
	return 0;

out_err_unreserve:
	amdgpu_bo_unreserve(vm->root.bo);
out_err_release:
	kfd_queue_release_buffers(pdd, properties);
	return err;
}

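/*
 * kfd_queue_release_buffers - release the buffer references taken by
 * kfd_queue_acquire_buffers
 *
 * Safe to call for partially acquired queues: buffers that were never
 * acquired are NULL and skipped by kfd_queue_buffer_put, and the SVM put only
 * drops references that were actually taken.
 */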
int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct queue_properties *properties)
{
	struct kfd_topology_device *topo_dev;
	struct amdgpu_vm *vm;
	u32 total_cwsr_size;
	int err;

	vm = drm_priv_to_vm(pdd->drm_priv);
	err = amdgpu_bo_reserve(vm->root.bo, false);
	if (err)
		return err;

	kfd_queue_buffer_put(vm, &properties->wptr_bo);
	kfd_queue_buffer_put(vm, &properties->rptr_bo);
	kfd_queue_buffer_put(vm, &properties->ring_bo);
	kfd_queue_buffer_put(vm, &properties->eop_buf_bo);
	kfd_queue_buffer_put(vm, &properties->cwsr_bo);

	amdgpu_bo_unreserve(vm->root.bo);

	topo_dev = kfd_topology_device_by_id(pdd->dev->id);
	if (!topo_dev)
		return -EINVAL;
	total_cwsr_size = (topo_dev->node_props.cwsr_size + topo_dev->node_props.debug_memory_size)
			  * NUM_XCC(pdd->dev->xcc_mask);
	total_cwsr_size = ALIGN(total_cwsr_size, PAGE_SIZE);

	kfd_queue_buffer_svm_put(pdd, properties->ctx_save_restore_area_address, total_cwsr_size);
	return 0;
}

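/*
 * Per-CU register and LDS save sizes and per-wave debugger space used to
 * size the CWSR area below.
 */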
#define SGPR_SIZE_PER_CU	0x4000
#define LDS_SIZE_PER_CU		0x10000
#define HWREG_SIZE_PER_CU	0x1000
#define DEBUGGER_BYTES_ALIGN	64
#define DEBUGGER_BYTES_PER_WAVE	32

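/*
 * VGPR save space per CU depends on the GFX version: 256KB by default, 512KB
 * on Arcturus, Aldebaran and Aqua Vanjaram, 384KB on Plum Bonito, Wheat Nas,
 * GFX1200 and GFX1201.
 */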
static u32 kfd_get_vgpr_size_per_cu(u32 gfxv)
{
	u32 vgpr_size = 0x40000;

	if ((gfxv / 100 * 100) == 90400 ||	/* GFX_VERSION_AQUA_VANJARAM */
	    gfxv == 90010 ||			/* GFX_VERSION_ALDEBARAN */
	    gfxv == 90008)			/* GFX_VERSION_ARCTURUS */
		vgpr_size = 0x80000;
	else if (gfxv == 110000 ||		/* GFX_VERSION_PLUM_BONITO */
		 gfxv == 110001 ||		/* GFX_VERSION_WHEAT_NAS */
		 gfxv == 120000 ||		/* GFX_VERSION_GFX1200 */
		 gfxv == 120001)		/* GFX_VERSION_GFX1201 */
		vgpr_size = 0x60000;

	return vgpr_size;
}

#define WG_CONTEXT_DATA_SIZE_PER_CU(gfxv)	\
	(kfd_get_vgpr_size_per_cu(gfxv) + SGPR_SIZE_PER_CU +\
	 LDS_SIZE_PER_CU + HWREG_SIZE_PER_CU)

#define CNTL_STACK_BYTES_PER_WAVE(gfxv)	\
	((gfxv) >= 100100 ? 12 : 8)	/* GFX_VERSION_NAVI10 */

#define SIZEOF_HSA_USER_CONTEXT_SAVE_AREA_HEADER 40

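/*
 * kfd_queue_ctx_save_restore_size - compute the expected per-queue buffer
 * sizes for a node
 *
 * Derive the control stack, workgroup context save and debugger memory sizes
 * from the CU and wave counts of the node, and the EOP buffer size from the
 * GFX version, then store them in the node properties where
 * kfd_queue_acquire_buffers checks user-provided buffer sizes against them.
 * GPUs older than Carrizo do not support CWSR and are skipped.
 */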
void kfd_queue_ctx_save_restore_size(struct kfd_topology_device *dev)
{
	struct kfd_node_properties *props = &dev->node_props;
	u32 gfxv = props->gfx_target_version;
	u32 ctl_stack_size;
	u32 wg_data_size;
	u32 wave_num;
	u32 cu_num;

	if (gfxv < 80001)	/* GFX_VERSION_CARRIZO */
		return;

	cu_num = props->simd_count / props->simd_per_cu / NUM_XCC(dev->gpu->xcc_mask);
	wave_num = (gfxv < 100100) ?	/* GFX_VERSION_NAVI10 */
		    min(cu_num * 40, props->array_count / props->simd_arrays_per_engine * 512)
		    : cu_num * 32;

	wg_data_size = ALIGN(cu_num * WG_CONTEXT_DATA_SIZE_PER_CU(gfxv), PAGE_SIZE);
	ctl_stack_size = wave_num * CNTL_STACK_BYTES_PER_WAVE(gfxv) + 8;
	ctl_stack_size = ALIGN(SIZEOF_HSA_USER_CONTEXT_SAVE_AREA_HEADER + ctl_stack_size,
			       PAGE_SIZE);

	if ((gfxv / 10000 * 10000) == 100000) {
		/* HW design limits control stack size to 0x7000.
		 * This is insufficient for theoretical PM4 cases
		 * but sufficient for AQL, limited by SPI events.
		 */
		ctl_stack_size = min(ctl_stack_size, 0x7000);
	}

	props->ctl_stack_size = ctl_stack_size;
	props->debug_memory_size = ALIGN(wave_num * DEBUGGER_BYTES_PER_WAVE, DEBUGGER_BYTES_ALIGN);
	props->cwsr_size = ctl_stack_size + wg_data_size;

	if (gfxv == 80002)	/* GFX_VERSION_TONGA */
		props->eop_buffer_size = 0x8000;
	else if ((gfxv / 100 * 100) == 90400)	/* GFX_VERSION_AQUA_VANJARAM */
		props->eop_buffer_size = 4096;
	else if (gfxv >= 80000)
		props->eop_buffer_size = 4096;
}