xref: /linux/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c (revision bfb4a6c721517a11b277e8841f8a7a64b1b14b72)
1 // SPDX-License-Identifier: GPL-2.0 OR MIT
2 /*
3  * Copyright 2014-2022 Advanced Micro Devices, Inc.
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be included in
13  * all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
19  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21  * OTHER DEALINGS IN THE SOFTWARE.
22  *
23  */
24 
25 #include <linux/slab.h>
26 #include <linux/mutex.h>
27 #include "kfd_device_queue_manager.h"
28 #include "kfd_kernel_queue.h"
29 #include "kfd_priv.h"
30 
31 #define OVER_SUBSCRIPTION_PROCESS_COUNT (1 << 0)
32 #define OVER_SUBSCRIPTION_COMPUTE_QUEUE_COUNT (1 << 1)
33 #define OVER_SUBSCRIPTION_GWS_QUEUE_COUNT (1 << 2)
34 #define OVER_SUBSCRIPTION_XNACK_CONFLICT (1 << 3)
35 
/*
 * Advance a runlist IB write index.
 *
 * @wptr: write index, counted in dwords; updated in place
 * @increment_bytes: size of the packet just written, in bytes
 * @buffer_size_bytes: total size of the IB, in bytes
 *
 * Emits a WARN when the advanced index, converted back to bytes, lies
 * beyond the buffer. The index is stored either way.
 */
static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes,
				unsigned int buffer_size_bytes)
{
	unsigned int advanced;

	advanced = *wptr + increment_bytes / sizeof(uint32_t);
	WARN(advanced * sizeof(uint32_t) > buffer_size_bytes,
	     "Runlist IB overflow");

	*wptr = advanced;
}
45 
46 static void pm_calc_rlib_size(struct packet_manager *pm,
47 				unsigned int *rlib_size,
48 				int *over_subscription,
49 				int xnack_conflict)
50 {
51 	unsigned int process_count, queue_count, compute_queue_count, gws_queue_count;
52 	unsigned int map_queue_size;
53 	unsigned int max_proc_per_quantum = 1;
54 	struct kfd_node *node = pm->dqm->dev;
55 	struct device *dev = node->adev->dev;
56 
57 	process_count = pm->dqm->processes_count;
58 	queue_count = pm->dqm->active_queue_count;
59 	compute_queue_count = pm->dqm->active_cp_queue_count;
60 	gws_queue_count = pm->dqm->gws_queue_count;
61 
62 	/* check if there is over subscription
63 	 * Note: the arbitration between the number of VMIDs and
64 	 * hws_max_conc_proc has been done in
65 	 * kgd2kfd_device_init().
66 	 */
67 	*over_subscription = 0;
68 
69 	if (node->max_proc_per_quantum > 1)
70 		max_proc_per_quantum = node->max_proc_per_quantum;
71 
72 	if (process_count > max_proc_per_quantum)
73 		*over_subscription |= OVER_SUBSCRIPTION_PROCESS_COUNT;
74 	if (compute_queue_count > get_cp_queues_num(pm->dqm))
75 		*over_subscription |= OVER_SUBSCRIPTION_COMPUTE_QUEUE_COUNT;
76 	if (gws_queue_count > 1)
77 		*over_subscription |= OVER_SUBSCRIPTION_GWS_QUEUE_COUNT;
78 	if (xnack_conflict && (node->adev->gmc.xnack_flags & AMDGPU_GMC_XNACK_FLAG_CHAIN))
79 		*over_subscription |= OVER_SUBSCRIPTION_XNACK_CONFLICT;
80 
81 	if (*over_subscription)
82 		dev_dbg(dev, "Over subscribed runlist\n");
83 
84 	map_queue_size = pm->pmf->map_queues_size;
85 	/* calculate run list ib allocation size */
86 	*rlib_size = process_count * pm->pmf->map_process_size +
87 		     queue_count * map_queue_size;
88 
89 	/*
90 	 * Increase the allocation size in case we need a chained run list
91 	 * when over subscription
92 	 */
93 	if (*over_subscription)
94 		*rlib_size += pm->pmf->runlist_size;
95 
96 	dev_dbg(dev, "runlist ib size %d\n", *rlib_size);
97 }
98 
99 static int pm_allocate_runlist_ib(struct packet_manager *pm,
100 				unsigned int **rl_buffer,
101 				uint64_t *rl_gpu_buffer,
102 				unsigned int *rl_buffer_size,
103 				int *is_over_subscription,
104 				int xnack_conflict)
105 {
106 	struct kfd_node *node = pm->dqm->dev;
107 	struct device *dev = node->adev->dev;
108 	int retval;
109 
110 	if (WARN_ON(pm->allocated))
111 		return -EINVAL;
112 
113 	pm_calc_rlib_size(pm, rl_buffer_size, is_over_subscription,
114 				xnack_conflict);
115 
116 	mutex_lock(&pm->lock);
117 
118 	retval = kfd_gtt_sa_allocate(node, *rl_buffer_size, &pm->ib_buffer_obj);
119 
120 	if (retval) {
121 		dev_err(dev, "Failed to allocate runlist IB\n");
122 		goto out;
123 	}
124 
125 	*(void **)rl_buffer = pm->ib_buffer_obj->cpu_ptr;
126 	*rl_gpu_buffer = pm->ib_buffer_obj->gpu_addr;
127 
128 	memset(*rl_buffer, 0, *rl_buffer_size);
129 	pm->allocated = true;
130 
131 out:
132 	mutex_unlock(&pm->lock);
133 	return retval;
134 }
135 
136 static int pm_create_runlist_ib(struct packet_manager *pm,
137 				struct list_head *queues,
138 				uint64_t *rl_gpu_addr,
139 				size_t *rl_size_bytes)
140 {
141 	unsigned int alloc_size_bytes;
142 	unsigned int *rl_buffer, rl_wptr, i;
143 	struct kfd_node *node = pm->dqm->dev;
144 	struct device *dev = node->adev->dev;
145 	int retval, processes_mapped;
146 	struct device_process_node *cur;
147 	struct qcm_process_device *qpd;
148 	struct queue *q;
149 	struct kernel_queue *kq;
150 	int is_over_subscription;
151 	int xnack_enabled = -1;
152 	bool xnack_conflict = 0;
153 
154 	rl_wptr = retval = processes_mapped = 0;
155 
156 	/* Check if processes set different xnack modes */
157 	list_for_each_entry(cur, queues, list) {
158 		qpd = cur->qpd;
159 		if (xnack_enabled < 0)
160 			/* First process */
161 			xnack_enabled = qpd->pqm->process->xnack_enabled;
162 		else if (qpd->pqm->process->xnack_enabled != xnack_enabled) {
163 			/* Found a process with a different xnack mode */
164 			xnack_conflict = 1;
165 			break;
166 		}
167 	}
168 
169 	retval = pm_allocate_runlist_ib(pm, &rl_buffer, rl_gpu_addr,
170 				&alloc_size_bytes, &is_over_subscription,
171 				xnack_conflict);
172 	if (retval)
173 		return retval;
174 
175 	*rl_size_bytes = alloc_size_bytes;
176 	pm->ib_size_bytes = alloc_size_bytes;
177 
178 	dev_dbg(dev, "Building runlist ib process count: %d queues count %d\n",
179 		pm->dqm->processes_count, pm->dqm->active_queue_count);
180 
181 build_runlist_ib:
182 	/* build the run list ib packet */
183 	list_for_each_entry(cur, queues, list) {
184 		qpd = cur->qpd;
185 		/* group processes with the same xnack mode together */
186 		if (qpd->pqm->process->xnack_enabled != xnack_enabled)
187 			continue;
188 		/* build map process packet */
189 		if (processes_mapped >= pm->dqm->processes_count) {
190 			dev_dbg(dev, "Not enough space left in runlist IB\n");
191 			pm_release_ib(pm);
192 			return -ENOMEM;
193 		}
194 
195 		retval = pm->pmf->map_process(pm, &rl_buffer[rl_wptr], qpd);
196 		if (retval)
197 			return retval;
198 
199 		processes_mapped++;
200 		inc_wptr(&rl_wptr, pm->pmf->map_process_size,
201 				alloc_size_bytes);
202 
203 		list_for_each_entry(kq, &qpd->priv_queue_list, list) {
204 			if (!kq->queue->properties.is_active)
205 				continue;
206 
207 			dev_dbg(dev,
208 				"static_queue, mapping kernel q %d, is debug status %d\n",
209 				kq->queue->queue, qpd->is_debug);
210 
211 			retval = pm->pmf->map_queues(pm,
212 						&rl_buffer[rl_wptr],
213 						kq->queue,
214 						qpd->is_debug);
215 			if (retval)
216 				return retval;
217 
218 			inc_wptr(&rl_wptr,
219 				pm->pmf->map_queues_size,
220 				alloc_size_bytes);
221 		}
222 
223 		list_for_each_entry(q, &qpd->queues_list, list) {
224 			if (!q->properties.is_active)
225 				continue;
226 
227 			dev_dbg(dev,
228 				"static_queue, mapping user queue %d, is debug status %d\n",
229 				q->queue, qpd->is_debug);
230 
231 			retval = pm->pmf->map_queues(pm,
232 						&rl_buffer[rl_wptr],
233 						q,
234 						qpd->is_debug);
235 
236 			if (retval)
237 				return retval;
238 
239 			inc_wptr(&rl_wptr,
240 				pm->pmf->map_queues_size,
241 				alloc_size_bytes);
242 		}
243 	}
244 	if (xnack_conflict) {
245 		/* pick up processes with the other xnack mode */
246 		xnack_enabled = !xnack_enabled;
247 		xnack_conflict = 0;
248 		goto build_runlist_ib;
249 	}
250 
251 	dev_dbg(dev, "Finished map process and queues to runlist\n");
252 
253 	if (is_over_subscription) {
254 		if (!pm->is_over_subscription)
255 			dev_warn(dev, "Runlist is getting oversubscribed due to%s%s%s%s. Expect reduced ROCm performance.\n",
256 				is_over_subscription & OVER_SUBSCRIPTION_PROCESS_COUNT ?
257 				" too many processes" : "",
258 				is_over_subscription & OVER_SUBSCRIPTION_COMPUTE_QUEUE_COUNT ?
259 				" too many queues" : "",
260 				is_over_subscription & OVER_SUBSCRIPTION_GWS_QUEUE_COUNT ?
261 				" multiple processes using cooperative launch" : "",
262 				is_over_subscription & OVER_SUBSCRIPTION_XNACK_CONFLICT ?
263 				" xnack on/off processes mixed on gfx9" : "");
264 
265 		retval = pm->pmf->runlist(pm, &rl_buffer[rl_wptr],
266 					*rl_gpu_addr,
267 					alloc_size_bytes / sizeof(uint32_t),
268 					true);
269 	}
270 	pm->is_over_subscription = !!is_over_subscription;
271 
272 	for (i = 0; i < alloc_size_bytes / sizeof(uint32_t); i++)
273 		pr_debug("0x%2X ", rl_buffer[i]);
274 	pr_debug("\n");
275 
276 	return retval;
277 }
278 
279 int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm)
280 {
281 	switch (dqm->dev->adev->asic_type) {
282 	case CHIP_KAVERI:
283 	case CHIP_HAWAII:
284 		/* PM4 packet structures on CIK are the same as on VI */
285 	case CHIP_CARRIZO:
286 	case CHIP_TONGA:
287 	case CHIP_FIJI:
288 	case CHIP_POLARIS10:
289 	case CHIP_POLARIS11:
290 	case CHIP_POLARIS12:
291 	case CHIP_VEGAM:
292 		pm->pmf = &kfd_vi_pm_funcs;
293 		break;
294 	default:
295 		if (KFD_GC_VERSION(dqm->dev) == IP_VERSION(9, 4, 2) ||
296 		    KFD_GC_VERSION(dqm->dev) == IP_VERSION(9, 4, 3) ||
297 		    KFD_GC_VERSION(dqm->dev) == IP_VERSION(9, 4, 4) ||
298 		    KFD_GC_VERSION(dqm->dev) == IP_VERSION(9, 5, 0))
299 			pm->pmf = &kfd_aldebaran_pm_funcs;
300 		else if (KFD_GC_VERSION(dqm->dev) >= IP_VERSION(9, 0, 1))
301 			pm->pmf = &kfd_v9_pm_funcs;
302 		else {
303 			WARN(1, "Unexpected ASIC family %u",
304 			     dqm->dev->adev->asic_type);
305 			return -EINVAL;
306 		}
307 	}
308 
309 	pm->dqm = dqm;
310 	mutex_init(&pm->lock);
311 	pm->priv_queue = kernel_queue_init(dqm->dev, KFD_QUEUE_TYPE_HIQ);
312 	if (!pm->priv_queue) {
313 		mutex_destroy(&pm->lock);
314 		return -ENOMEM;
315 	}
316 	pm->allocated = false;
317 
318 	return 0;
319 }
320 
321 void pm_uninit(struct packet_manager *pm)
322 {
323 	mutex_destroy(&pm->lock);
324 	kernel_queue_uninit(pm->priv_queue);
325 	pm->priv_queue = NULL;
326 }
327 
328 int pm_send_set_resources(struct packet_manager *pm,
329 				struct scheduling_resources *res)
330 {
331 	struct kfd_node *node = pm->dqm->dev;
332 	struct device *dev = node->adev->dev;
333 	uint32_t *buffer, size;
334 	int retval = 0;
335 
336 	size = pm->pmf->set_resources_size;
337 	mutex_lock(&pm->lock);
338 	kq_acquire_packet_buffer(pm->priv_queue,
339 					size / sizeof(uint32_t),
340 					(unsigned int **)&buffer);
341 	if (!buffer) {
342 		dev_err(dev, "Failed to allocate buffer on kernel queue\n");
343 		retval = -ENOMEM;
344 		goto out;
345 	}
346 
347 	retval = pm->pmf->set_resources(pm, buffer, res);
348 	if (!retval)
349 		retval = kq_submit_packet(pm->priv_queue);
350 	else
351 		kq_rollback_packet(pm->priv_queue);
352 
353 out:
354 	mutex_unlock(&pm->lock);
355 
356 	return retval;
357 }
358 
359 int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues)
360 {
361 	uint64_t rl_gpu_ib_addr;
362 	uint32_t *rl_buffer;
363 	size_t rl_ib_size, packet_size_dwords;
364 	int retval;
365 
366 	retval = pm_create_runlist_ib(pm, dqm_queues, &rl_gpu_ib_addr,
367 					&rl_ib_size);
368 	if (retval)
369 		goto fail_create_runlist_ib;
370 
371 	pr_debug("runlist IB address: 0x%llX\n", rl_gpu_ib_addr);
372 
373 	packet_size_dwords = pm->pmf->runlist_size / sizeof(uint32_t);
374 	mutex_lock(&pm->lock);
375 
376 	retval = kq_acquire_packet_buffer(pm->priv_queue,
377 					packet_size_dwords, &rl_buffer);
378 	if (retval)
379 		goto fail_acquire_packet_buffer;
380 
381 	retval = pm->pmf->runlist(pm, rl_buffer, rl_gpu_ib_addr,
382 					rl_ib_size / sizeof(uint32_t), false);
383 	if (retval)
384 		goto fail_create_runlist;
385 
386 	retval = kq_submit_packet(pm->priv_queue);
387 
388 	mutex_unlock(&pm->lock);
389 
390 	return retval;
391 
392 fail_create_runlist:
393 	kq_rollback_packet(pm->priv_queue);
394 fail_acquire_packet_buffer:
395 	mutex_unlock(&pm->lock);
396 fail_create_runlist_ib:
397 	pm_release_ib(pm);
398 	return retval;
399 }
400 
401 int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address,
402 			uint64_t fence_value)
403 {
404 	struct kfd_node *node = pm->dqm->dev;
405 	struct device *dev = node->adev->dev;
406 	uint32_t *buffer, size;
407 	int retval = 0;
408 
409 	if (WARN_ON(!fence_address))
410 		return -EFAULT;
411 
412 	size = pm->pmf->query_status_size;
413 	mutex_lock(&pm->lock);
414 	kq_acquire_packet_buffer(pm->priv_queue,
415 			size / sizeof(uint32_t), (unsigned int **)&buffer);
416 	if (!buffer) {
417 		dev_err(dev, "Failed to allocate buffer on kernel queue\n");
418 		retval = -ENOMEM;
419 		goto out;
420 	}
421 
422 	retval = pm->pmf->query_status(pm, buffer, fence_address, fence_value);
423 	if (!retval)
424 		retval = kq_submit_packet(pm->priv_queue);
425 	else
426 		kq_rollback_packet(pm->priv_queue);
427 
428 out:
429 	mutex_unlock(&pm->lock);
430 	return retval;
431 }
432 
433 /* pm_config_dequeue_wait_counts: Configure dequeue timer Wait Counts
434  *  by writing to CP_IQ_WAIT_TIME2 registers.
435  *
436  *  @cmd: See emum kfd_config_dequeue_wait_counts_cmd definition
437  *  @value: Depends on the cmd. This parameter is unused for
438  *    KFD_DEQUEUE_WAIT_INIT and KFD_DEQUEUE_WAIT_RESET. For
439  *    KFD_DEQUEUE_WAIT_SET_SCH_WAVE it holds value to be set
440  *
441  */
442 int pm_config_dequeue_wait_counts(struct packet_manager *pm,
443 		enum kfd_config_dequeue_wait_counts_cmd cmd,
444 		uint32_t value)
445 {
446 	struct kfd_node *node = pm->dqm->dev;
447 	struct device *dev = node->adev->dev;
448 	int retval = 0;
449 	uint32_t *buffer, size;
450 
451 	if (!pm->pmf->config_dequeue_wait_counts ||
452 	    !pm->pmf->config_dequeue_wait_counts_size)
453 		return 0;
454 
455 	if (cmd == KFD_DEQUEUE_WAIT_INIT && (KFD_GC_VERSION(pm->dqm->dev) < IP_VERSION(9, 4, 1) ||
456 	   KFD_GC_VERSION(pm->dqm->dev) >= IP_VERSION(10, 0, 0)))
457 		return 0;
458 
459 	size = pm->pmf->config_dequeue_wait_counts_size;
460 
461 	mutex_lock(&pm->lock);
462 
463 	if (size) {
464 		kq_acquire_packet_buffer(pm->priv_queue,
465 			size / sizeof(uint32_t),
466 			(unsigned int **)&buffer);
467 
468 		if (!buffer) {
469 			dev_err(dev,
470 				"Failed to allocate buffer on kernel queue\n");
471 			retval = -ENOMEM;
472 			goto out;
473 		}
474 
475 		retval = pm->pmf->config_dequeue_wait_counts(pm, buffer,
476 							     cmd, value);
477 		if (!retval) {
478 			retval = kq_submit_packet(pm->priv_queue);
479 
480 			/* If default value is modified, cache that in dqm->wait_times */
481 			if (!retval && cmd == KFD_DEQUEUE_WAIT_INIT)
482 				update_dqm_wait_times(pm->dqm);
483 		} else {
484 			kq_rollback_packet(pm->priv_queue);
485 		}
486 	}
487 out:
488 	mutex_unlock(&pm->lock);
489 	return retval;
490 }
491 
492 int pm_send_unmap_queue(struct packet_manager *pm,
493 			enum kfd_unmap_queues_filter filter,
494 			uint32_t filter_param, bool reset)
495 {
496 	struct kfd_node *node = pm->dqm->dev;
497 	struct device *dev = node->adev->dev;
498 	uint32_t *buffer, size;
499 	int retval = 0;
500 
501 	size = pm->pmf->unmap_queues_size;
502 	mutex_lock(&pm->lock);
503 	kq_acquire_packet_buffer(pm->priv_queue,
504 			size / sizeof(uint32_t), (unsigned int **)&buffer);
505 	if (!buffer) {
506 		dev_err(dev, "Failed to allocate buffer on kernel queue\n");
507 		retval = -ENOMEM;
508 		goto out;
509 	}
510 
511 	retval = pm->pmf->unmap_queues(pm, buffer, filter, filter_param, reset);
512 	if (!retval)
513 		retval = kq_submit_packet(pm->priv_queue);
514 	else
515 		kq_rollback_packet(pm->priv_queue);
516 
517 out:
518 	mutex_unlock(&pm->lock);
519 	return retval;
520 }
521 
522 void pm_release_ib(struct packet_manager *pm)
523 {
524 	mutex_lock(&pm->lock);
525 	if (pm->allocated) {
526 		kfd_gtt_sa_free(pm->dqm->dev, pm->ib_buffer_obj);
527 		pm->allocated = false;
528 	}
529 	mutex_unlock(&pm->lock);
530 }
531 
532 #if defined(CONFIG_DEBUG_FS)
533 
534 int pm_debugfs_runlist(struct seq_file *m, void *data)
535 {
536 	struct packet_manager *pm = data;
537 
538 	mutex_lock(&pm->lock);
539 
540 	if (!pm->allocated) {
541 		seq_puts(m, "  No active runlist\n");
542 		goto out;
543 	}
544 
545 	seq_hex_dump(m, "  ", DUMP_PREFIX_OFFSET, 32, 4,
546 		     pm->ib_buffer_obj->cpu_ptr, pm->ib_size_bytes, false);
547 
548 out:
549 	mutex_unlock(&pm->lock);
550 	return 0;
551 }
552 
553 int pm_debugfs_hang_hws(struct packet_manager *pm)
554 {
555 	struct kfd_node *node = pm->dqm->dev;
556 	struct device *dev = node->adev->dev;
557 	uint32_t *buffer, size;
558 	int r = 0;
559 
560 	if (!pm->priv_queue)
561 		return -EAGAIN;
562 
563 	size = pm->pmf->query_status_size;
564 	mutex_lock(&pm->lock);
565 	kq_acquire_packet_buffer(pm->priv_queue,
566 			size / sizeof(uint32_t), (unsigned int **)&buffer);
567 	if (!buffer) {
568 		dev_err(dev, "Failed to allocate buffer on kernel queue\n");
569 		r = -ENOMEM;
570 		goto out;
571 	}
572 	memset(buffer, 0x55, size);
573 	kq_submit_packet(pm->priv_queue);
574 
575 	dev_info(dev, "Submitting %x %x %x %x %x %x %x to HIQ to hang the HWS.",
576 		 buffer[0], buffer[1], buffer[2], buffer[3], buffer[4],
577 		 buffer[5], buffer[6]);
578 out:
579 	mutex_unlock(&pm->lock);
580 	return r;
581 }
582 
583 
584 #endif
585