/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <linux/module.h>
#include <linux/platform_device.h>
#include <linux/pm_runtime.h>
#include <linux/device.h>
#include <linux/io.h>
#include <linux/sched/signal.h>
#include <linux/dma-fence-array.h>

#include <drm/drm_exec.h>
#include <drm/drm_syncobj.h>

#include "uapi/drm/vc4_drm.h"
#include "vc4_drv.h"
#include "vc4_regs.h"
#include "vc4_trace.h"

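/* Arms the hangcheck timer to fire roughly 100ms from now. */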
static void
vc4_queue_hangcheck(struct drm_device *dev)
{
	struct vc4_dev *vc4 = to_vc4_dev(dev);

	mod_timer(&vc4->hangcheck.timer,
		  round_jiffies_up(jiffies + msecs_to_jiffies(100)));
}

struct vc4_hang_state {
	struct drm_vc4_get_hang_state user_state;

	u32 bo_count;
	struct drm_gem_object **bo;
};

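/* Drops the BO references held by a hang state and frees it. */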
static void
vc4_free_hang_state(struct drm_device *dev, struct vc4_hang_state *state)
{
	unsigned int i;

	for (i = 0; i < state->user_state.bo_count; i++)
		drm_gem_object_put(state->bo[i]);

	kfree(state);
}

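/* Implementation of the VC4_GET_HANG_STATE ioctl: copies the saved hang
 * state (register values plus handles to the BOs of the hung jobs) out to
 * userspace, or just reports the required BO array size.
 */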
int
vc4_get_hang_state_ioctl(struct drm_device *dev, void *data,
			 struct drm_file *file_priv)
{
	struct drm_vc4_get_hang_state *get_state = data;
	struct drm_vc4_get_hang_state_bo *bo_state;
	struct vc4_hang_state *kernel_state;
	struct drm_vc4_get_hang_state *state;
	struct vc4_dev *vc4 = to_vc4_dev(dev);
	unsigned long irqflags;
	u32 i;
	int ret = 0;

	if (WARN_ON_ONCE(vc4->gen > VC4_GEN_4))
		return -ENODEV;

	if (!vc4->v3d) {
		DRM_DEBUG("VC4_GET_HANG_STATE with no VC4 V3D probed\n");
		return -ENODEV;
	}

	spin_lock_irqsave(&vc4->job_lock, irqflags);
	kernel_state = vc4->hang_state;
	if (!kernel_state) {
		spin_unlock_irqrestore(&vc4->job_lock, irqflags);
		return -ENOENT;
	}
	state = &kernel_state->user_state;

	/* If the user's array isn't big enough, just return the
	 * required array size.
	 */
	if (get_state->bo_count < state->bo_count) {
		get_state->bo_count = state->bo_count;
		spin_unlock_irqrestore(&vc4->job_lock, irqflags);
		return 0;
	}

	vc4->hang_state = NULL;
	spin_unlock_irqrestore(&vc4->job_lock, irqflags);

	/* Save the user's BO pointer, so we don't stomp it with the memcpy. */
	state->bo = get_state->bo;
	memcpy(get_state, state, sizeof(*state));

	bo_state = kcalloc(state->bo_count, sizeof(*bo_state), GFP_KERNEL);
	if (!bo_state) {
		ret = -ENOMEM;
		goto err_free;
	}

	for (i = 0; i < state->bo_count; i++) {
		struct vc4_bo *vc4_bo = to_vc4_bo(kernel_state->bo[i]);
		u32 handle;

		ret = drm_gem_handle_create(file_priv, kernel_state->bo[i],
					    &handle);

		if (ret) {
			state->bo_count = i;
			goto err_delete_handle;
		}
		bo_state[i].handle = handle;
		bo_state[i].paddr = vc4_bo->base.dma_addr;
		bo_state[i].size = vc4_bo->base.base.size;
	}

	if (copy_to_user(u64_to_user_ptr(get_state->bo),
			 bo_state,
			 state->bo_count * sizeof(*bo_state)))
		ret = -EFAULT;

err_delete_handle:
	if (ret) {
		for (i = 0; i < state->bo_count; i++)
			drm_gem_handle_delete(file_priv, bo_state[i].handle);
	}

err_free:
	vc4_free_hang_state(dev, kernel_state);
	kfree(bo_state);

	return ret;
}

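/* Captures the state of the currently hung bin and render jobs (V3D
 * registers plus references to their BOs) so that userspace can fetch it
 * later through VC4_GET_HANG_STATE.
 */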
static void
vc4_save_hang_state(struct drm_device *dev)
{
	struct vc4_dev *vc4 = to_vc4_dev(dev);
	struct drm_vc4_get_hang_state *state;
	struct vc4_hang_state *kernel_state;
	struct vc4_exec_info *exec[2];
	struct vc4_bo *bo;
	unsigned long irqflags;
	unsigned int i, j, k, unref_list_count;

	kernel_state = kcalloc(1, sizeof(*kernel_state), GFP_KERNEL);
	if (!kernel_state)
		return;

	state = &kernel_state->user_state;

	spin_lock_irqsave(&vc4->job_lock, irqflags);
	exec[0] = vc4_first_bin_job(vc4);
	exec[1] = vc4_first_render_job(vc4);
	if (!exec[0] && !exec[1]) {
		spin_unlock_irqrestore(&vc4->job_lock, irqflags);
		return;
	}

	/* Get the bos from both binner and renderer into hang state. */
	state->bo_count = 0;
	for (i = 0; i < 2; i++) {
		if (!exec[i])
			continue;

		unref_list_count = 0;
		list_for_each_entry(bo, &exec[i]->unref_list, unref_head)
			unref_list_count++;
		state->bo_count += exec[i]->bo_count + unref_list_count;
	}

	kernel_state->bo = kcalloc(state->bo_count,
				   sizeof(*kernel_state->bo), GFP_ATOMIC);

	if (!kernel_state->bo) {
		spin_unlock_irqrestore(&vc4->job_lock, irqflags);
		return;
	}

	k = 0;
	for (i = 0; i < 2; i++) {
		if (!exec[i])
			continue;

		for (j = 0; j < exec[i]->bo_count; j++) {
			bo = to_vc4_bo(exec[i]->bo[j]);

			/* Retain BOs just in case they were marked purgeable.
			 * This prevents the BO from being purged before
			 * someone had a chance to dump the hang state.
			 */
			WARN_ON(!refcount_read(&bo->usecnt));
			refcount_inc(&bo->usecnt);
			drm_gem_object_get(exec[i]->bo[j]);
			kernel_state->bo[k++] = exec[i]->bo[j];
		}

		list_for_each_entry(bo, &exec[i]->unref_list, unref_head) {
			/* No need to retain BOs coming from the ->unref_list
			 * because they are naturally unpurgeable.
			 */
			drm_gem_object_get(&bo->base.base);
			kernel_state->bo[k++] = &bo->base.base;
		}
	}

	WARN_ON_ONCE(k != state->bo_count);

	if (exec[0])
		state->start_bin = exec[0]->ct0ca;
	if (exec[1])
		state->start_render = exec[1]->ct1ca;

	spin_unlock_irqrestore(&vc4->job_lock, irqflags);

	state->ct0ca = V3D_READ(V3D_CTNCA(0));
	state->ct0ea = V3D_READ(V3D_CTNEA(0));

	state->ct1ca = V3D_READ(V3D_CTNCA(1));
	state->ct1ea = V3D_READ(V3D_CTNEA(1));

	state->ct0cs = V3D_READ(V3D_CTNCS(0));
	state->ct1cs = V3D_READ(V3D_CTNCS(1));

	state->ct0ra0 = V3D_READ(V3D_CT00RA0);
	state->ct1ra0 = V3D_READ(V3D_CT01RA0);

	state->bpca = V3D_READ(V3D_BPCA);
	state->bpcs = V3D_READ(V3D_BPCS);
	state->bpoa = V3D_READ(V3D_BPOA);
	state->bpos = V3D_READ(V3D_BPOS);

	state->vpmbase = V3D_READ(V3D_VPMBASE);

	state->dbge = V3D_READ(V3D_DBGE);
	state->fdbgo = V3D_READ(V3D_FDBGO);
	state->fdbgb = V3D_READ(V3D_FDBGB);
	state->fdbgr = V3D_READ(V3D_FDBGR);
	state->fdbgs = V3D_READ(V3D_FDBGS);
	state->errstat = V3D_READ(V3D_ERRSTAT);

	/* We need to turn purgeable BOs into unpurgeable ones so that
	 * userspace has a chance to dump the hang state before the kernel
	 * decides to purge those BOs.
	 * Note that BO consistency at dump time cannot be guaranteed. For
	 * example, if the owner of these BOs decides to re-use them or mark
	 * them purgeable again there's nothing we can do to prevent it.
	 */
	for (i = 0; i < kernel_state->user_state.bo_count; i++) {
		struct vc4_bo *bo = to_vc4_bo(kernel_state->bo[i]);

		if (bo->madv == __VC4_MADV_NOTSUPP)
			continue;

		mutex_lock(&bo->madv_lock);
		if (!WARN_ON(bo->madv == __VC4_MADV_PURGED))
			bo->madv = VC4_MADV_WILLNEED;
		refcount_dec(&bo->usecnt);
		mutex_unlock(&bo->madv_lock);
	}

	spin_lock_irqsave(&vc4->job_lock, irqflags);
	if (vc4->hang_state) {
		spin_unlock_irqrestore(&vc4->job_lock, irqflags);
		vc4_free_hang_state(dev, kernel_state);
	} else {
		vc4->hang_state = kernel_state;
		spin_unlock_irqrestore(&vc4->job_lock, irqflags);
	}
}

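/* Resets the V3D core by power-cycling it through runtime PM (when it is
 * powered) and re-initializing its interrupt state, then re-arms the
 * hangcheck timer.
 */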
static void
vc4_reset(struct drm_device *dev)
{
	struct vc4_dev *vc4 = to_vc4_dev(dev);

	DRM_INFO("Resetting GPU.\n");

	mutex_lock(&vc4->power_lock);
	if (vc4->power_refcount) {
		/* Power the device off and back on by dropping the
		 * reference on runtime PM.
		 */
		pm_runtime_put_sync_suspend(&vc4->v3d->pdev->dev);
		pm_runtime_get_sync(&vc4->v3d->pdev->dev);
	}
	mutex_unlock(&vc4->power_lock);

	vc4_irq_reset(dev);

	/* Rearm the hangcheck -- another job might have been waiting
	 * for our hung one to get kicked off, and vc4_irq_reset()
	 * would have started it.
	 */
	vc4_queue_hangcheck(dev);
}

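/* Work item scheduled by the hangcheck timer: saves the hang state for
 * later debugging, then resets the GPU.
 */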
static void
vc4_reset_work(struct work_struct *work)
{
	struct vc4_dev *vc4 =
		container_of(work, struct vc4_dev, hangcheck.reset_work);

	vc4_save_hang_state(&vc4->base);

	vc4_reset(&vc4->base);
}

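/* Timer callback that checks whether the binner or renderer has made any
 * progress since the last tick; if neither has, it schedules a GPU reset
 * from process context.
 */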
static void
vc4_hangcheck_elapsed(struct timer_list *t)
{
	struct vc4_dev *vc4 = timer_container_of(vc4, t, hangcheck.timer);
	struct drm_device *dev = &vc4->base;
	uint32_t ct0ca, ct1ca;
	unsigned long irqflags;
	struct vc4_exec_info *bin_exec, *render_exec;

	spin_lock_irqsave(&vc4->job_lock, irqflags);

	bin_exec = vc4_first_bin_job(vc4);
	render_exec = vc4_first_render_job(vc4);

	/* If idle, we can stop watching for hangs. */
	if (!bin_exec && !render_exec) {
		spin_unlock_irqrestore(&vc4->job_lock, irqflags);
		return;
	}

	ct0ca = V3D_READ(V3D_CTNCA(0));
	ct1ca = V3D_READ(V3D_CTNCA(1));

	/* If we've made any progress in execution, rearm the timer
	 * and wait.
	 */
	if ((bin_exec && ct0ca != bin_exec->last_ct0ca) ||
	    (render_exec && ct1ca != render_exec->last_ct1ca)) {
		if (bin_exec)
			bin_exec->last_ct0ca = ct0ca;
		if (render_exec)
			render_exec->last_ct1ca = ct1ca;
		spin_unlock_irqrestore(&vc4->job_lock, irqflags);
		vc4_queue_hangcheck(dev);
		return;
	}

	spin_unlock_irqrestore(&vc4->job_lock, irqflags);

	/* We've gone too long with no progress, reset. This has to
	 * be done from a work struct, since resetting can sleep and
	 * this timer hook isn't allowed to.
	 */
	schedule_work(&vc4->hangcheck.reset_work);
}

static void
submit_cl(struct drm_device *dev, uint32_t thread, uint32_t start, uint32_t end)
{
	struct vc4_dev *vc4 = to_vc4_dev(dev);

	/* Set the current and end address of the control list.
	 * Writing the end register is what starts the job.
	 */
	V3D_WRITE(V3D_CTNCA(thread), start);
	V3D_WRITE(V3D_CTNEA(thread), end);
}

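/* Blocks until the given seqno has been signaled as finished, the timeout
 * expires, or (if interruptible) a signal is pending. A timeout_ns of
 * ~0ull means wait forever.
 */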
int
vc4_wait_for_seqno(struct drm_device *dev, uint64_t seqno, uint64_t timeout_ns,
		   bool interruptible)
{
	struct vc4_dev *vc4 = to_vc4_dev(dev);
	int ret = 0;
	unsigned long timeout_expire;
	DEFINE_WAIT(wait);

	if (WARN_ON_ONCE(vc4->gen > VC4_GEN_4))
		return -ENODEV;

	if (vc4->finished_seqno >= seqno)
		return 0;

	if (timeout_ns == 0)
		return -ETIME;

	timeout_expire = jiffies + nsecs_to_jiffies(timeout_ns);

	trace_vc4_wait_for_seqno_begin(dev, seqno, timeout_ns);
	for (;;) {
		prepare_to_wait(&vc4->job_wait_queue, &wait,
				interruptible ? TASK_INTERRUPTIBLE :
				TASK_UNINTERRUPTIBLE);

		if (interruptible && signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}

		if (vc4->finished_seqno >= seqno)
			break;

		if (timeout_ns != ~0ull) {
			if (time_after_eq(jiffies, timeout_expire)) {
				ret = -ETIME;
				break;
			}
			schedule_timeout(timeout_expire - jiffies);
		} else {
			schedule();
		}
	}

	finish_wait(&vc4->job_wait_queue, &wait);
	trace_vc4_wait_for_seqno_end(dev, seqno);

	return ret;
}

static void
vc4_flush_caches(struct drm_device *dev)
{
	struct vc4_dev *vc4 = to_vc4_dev(dev);

	/* Flush the GPU L2 caches. These caches sit on top of system
	 * L3 (the 128kb or so shared with the CPU), and are
	 * non-allocating in the L3.
	 */
	V3D_WRITE(V3D_L2CACTL,
		  V3D_L2CACTL_L2CCLR);

	V3D_WRITE(V3D_SLCACTL,
		  VC4_SET_FIELD(0xf, V3D_SLCACTL_T1CC) |
		  VC4_SET_FIELD(0xf, V3D_SLCACTL_T0CC) |
		  VC4_SET_FIELD(0xf, V3D_SLCACTL_UCC) |
		  VC4_SET_FIELD(0xf, V3D_SLCACTL_ICC));
}

static void
vc4_flush_texture_caches(struct drm_device *dev)
{
	struct vc4_dev *vc4 = to_vc4_dev(dev);

	V3D_WRITE(V3D_L2CACTL,
		  V3D_L2CACTL_L2CCLR);

	V3D_WRITE(V3D_SLCACTL,
		  VC4_SET_FIELD(0xf, V3D_SLCACTL_T1CC) |
		  VC4_SET_FIELD(0xf, V3D_SLCACTL_T0CC));
}

/* Sets the registers for the next job to actually be executed in
 * the hardware.
 *
 * The job_lock should be held during this.
 */
void
vc4_submit_next_bin_job(struct drm_device *dev)
{
	struct vc4_dev *vc4 = to_vc4_dev(dev);
	struct vc4_exec_info *exec;

	if (WARN_ON_ONCE(vc4->gen > VC4_GEN_4))
		return;

again:
	exec = vc4_first_bin_job(vc4);
	if (!exec)
		return;

	vc4_flush_caches(dev);

	/* Only start the perfmon if it was not already started by a previous
	 * job.
	 */
	if (exec->perfmon && vc4->active_perfmon != exec->perfmon)
		vc4_perfmon_start(vc4, exec->perfmon);

	/* Either put the job in the binner if it uses the binner, or
	 * immediately move it to the to-be-rendered queue.
	 */
	if (exec->ct0ca != exec->ct0ea) {
		trace_vc4_submit_cl(dev, false, exec->seqno, exec->ct0ca,
				    exec->ct0ea);
		submit_cl(dev, 0, exec->ct0ca, exec->ct0ea);
	} else {
		struct vc4_exec_info *next;

		vc4_move_job_to_render(dev, exec);
		next = vc4_first_bin_job(vc4);

		/* We can't start the next bin job if the previous job had a
		 * different perfmon instance attached to it. The same goes
		 * if one of them had a perfmon attached to it and the other
		 * one doesn't.
		 */
		if (next && next->perfmon == exec->perfmon)
			goto again;
	}
}

void
vc4_submit_next_render_job(struct drm_device *dev)
{
	struct vc4_dev *vc4 = to_vc4_dev(dev);
	struct vc4_exec_info *exec = vc4_first_render_job(vc4);

	if (!exec)
		return;

	if (WARN_ON_ONCE(vc4->gen > VC4_GEN_4))
		return;

	/* A previous RCL may have written to one of our textures, and
	 * our full cache flush at bin time may have occurred before
	 * that RCL completed. Flush the texture cache now, but not
	 * the instructions or uniforms (since we don't write those
	 * from an RCL).
	 */
	vc4_flush_texture_caches(dev);

	trace_vc4_submit_cl(dev, true, exec->seqno, exec->ct1ca, exec->ct1ea);
	submit_cl(dev, 1, exec->ct1ca, exec->ct1ea);
}

void
vc4_move_job_to_render(struct drm_device *dev, struct vc4_exec_info *exec)
{
	struct vc4_dev *vc4 = to_vc4_dev(dev);
	bool was_empty = list_empty(&vc4->render_job_list);

	if (WARN_ON_ONCE(vc4->gen > VC4_GEN_4))
		return;

	list_move_tail(&exec->head, &vc4->render_job_list);
	if (was_empty)
		vc4_submit_next_render_job(dev);
}

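/* Attaches the job's fence to the reservation objects of its BOs: as a
 * read fence on every BO referenced by the job, and as a write fence on
 * the BOs the RCL will write to.
 */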
static void
vc4_attach_fences(struct vc4_exec_info *exec)
{
	struct vc4_bo *bo;
	unsigned i;

	for (i = 0; i < exec->bo_count; i++) {
		bo = to_vc4_bo(exec->bo[i]);
		dma_resv_add_fence(bo->base.base.resv, exec->fence,
				   DMA_RESV_USAGE_READ);
	}

	for (i = 0; i < exec->rcl_write_bo_count; i++) {
		bo = to_vc4_bo(&exec->rcl_write_bo[i]->base);
		dma_resv_add_fence(bo->base.base.resv, exec->fence,
				   DMA_RESV_USAGE_WRITE);
	}
}

/* Takes the reservation lock on all the BOs being referenced, so that
 * at queue submit time we can update the reservations.
 *
 * We don't lock the RCL, the tile alloc/state BOs, or overflow memory
 * (all of which are on exec->unref_list). They're entirely private
 * to vc4, so we don't attach dma-buf fences to them.
 */
static int
vc4_lock_bo_reservations(struct vc4_exec_info *exec,
			 struct drm_exec *exec_ctx)
{
	int ret;

	/* Reserve space for our shared (read-only) fence references,
	 * before we commit the CL to the hardware.
	 */
	drm_exec_init(exec_ctx, DRM_EXEC_INTERRUPTIBLE_WAIT, exec->bo_count);
	drm_exec_until_all_locked(exec_ctx) {
		ret = drm_exec_prepare_array(exec_ctx, exec->bo,
					     exec->bo_count, 1);
	}

	if (ret) {
		drm_exec_fini(exec_ctx);
		return ret;
	}

	return 0;
}

/* Queues a struct vc4_exec_info for execution. If no job is
 * currently executing, then submits it.
 *
 * Unlike most GPUs, our hardware only handles one command list at a
 * time. To queue multiple jobs at once, we'd need to edit the
 * previous command list to have a jump to the new one at the end, and
 * then bump the end address. That's a change for a later date,
 * though.
 */
static int
vc4_queue_submit(struct drm_device *dev, struct vc4_exec_info *exec,
		 struct drm_exec *exec_ctx,
		 struct drm_syncobj *out_sync)
{
	struct vc4_dev *vc4 = to_vc4_dev(dev);
	struct vc4_exec_info *renderjob;
	uint64_t seqno;
	unsigned long irqflags;
	struct vc4_fence *fence;

	fence = kzalloc(sizeof(*fence), GFP_KERNEL);
	if (!fence)
		return -ENOMEM;
	fence->dev = dev;

	spin_lock_irqsave(&vc4->job_lock, irqflags);

	seqno = ++vc4->emit_seqno;
	exec->seqno = seqno;

	dma_fence_init(&fence->base, &vc4_fence_ops, &vc4->job_lock,
		       vc4->dma_fence_context, exec->seqno);
	fence->seqno = exec->seqno;
	exec->fence = &fence->base;

	if (out_sync)
		drm_syncobj_replace_fence(out_sync, exec->fence);

	vc4_attach_fences(exec);

	drm_exec_fini(exec_ctx);

	list_add_tail(&exec->head, &vc4->bin_job_list);

	/* If no bin job was executing and if the render job (if any) has the
	 * same perfmon as our job attached to it (or if both jobs don't have
	 * perfmon activated), then kick ours off. Otherwise, it'll get
	 * started when the previous job's flush/render done interrupt occurs.
	 */
	renderjob = vc4_first_render_job(vc4);
	if (vc4_first_bin_job(vc4) == exec &&
	    (!renderjob || renderjob->perfmon == exec->perfmon)) {
		vc4_submit_next_bin_job(dev);
		vc4_queue_hangcheck(dev);
	}

	spin_unlock_irqrestore(&vc4->job_lock, irqflags);

	return 0;
}

/**
 * vc4_cl_lookup_bos() - Sets up exec->bo[] with the GEM objects
 * referenced by the job.
 * @dev: DRM device
 * @file_priv: DRM file for this fd
 * @exec: V3D job being set up
 *
 * The command validator needs to reference BOs by their index within
 * the submitted job's BO list. This does the validation of the job's
 * BO list and reference counting for the lifetime of the job.
 */
static int
vc4_cl_lookup_bos(struct drm_device *dev,
		  struct drm_file *file_priv,
		  struct vc4_exec_info *exec)
{
	struct drm_vc4_submit_cl *args = exec->args;
	int ret = 0;
	int i;

	exec->bo_count = args->bo_handle_count;

	if (!exec->bo_count) {
		/* See comment on bo_index for why we have to check
		 * this.
		 */
		DRM_DEBUG("Rendering requires BOs to validate\n");
		return -EINVAL;
	}

	ret = drm_gem_objects_lookup(file_priv, u64_to_user_ptr(args->bo_handles),
				     exec->bo_count, &exec->bo);

	if (ret)
		goto fail_put_bo;

	for (i = 0; i < exec->bo_count; i++) {
		ret = vc4_bo_inc_usecnt(to_vc4_bo(exec->bo[i]));
		if (ret)
			goto fail_dec_usecnt;
	}

	return 0;

fail_dec_usecnt:
	/* Decrease usecnt on acquired objects.
	 * We cannot rely on vc4_complete_exec() to release resources here,
	 * because vc4_complete_exec() has no information about which BO has
	 * had its ->usecnt incremented.
	 * To make things easier we just free everything explicitly and set
	 * exec->bo to NULL so that vc4_complete_exec() skips the 'BO release'
	 * step.
	 */
	for (i--; i >= 0; i--)
		vc4_bo_dec_usecnt(to_vc4_bo(exec->bo[i]));

fail_put_bo:
	/* Release any reference to acquired objects. */
	for (i = 0; i < exec->bo_count && exec->bo[i]; i++)
		drm_gem_object_put(exec->bo[i]);

	kvfree(exec->bo);
	exec->bo = NULL;
	return ret;
}

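/* Copies the binner command list, shader records and uniforms in from
 * userspace, validates them, and stores the validated copies in a BO the
 * hardware can execute from.
 */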
static int
vc4_get_bcl(struct drm_device *dev, struct vc4_exec_info *exec)
{
	struct drm_vc4_submit_cl *args = exec->args;
	struct vc4_dev *vc4 = to_vc4_dev(dev);
	void *temp = NULL;
	void *bin;
	int ret = 0;
	uint32_t bin_offset = 0;
	uint32_t shader_rec_offset = roundup(bin_offset + args->bin_cl_size,
					     16);
	uint32_t uniforms_offset = shader_rec_offset + args->shader_rec_size;
	uint32_t exec_size = uniforms_offset + args->uniforms_size;
	uint32_t temp_size = exec_size + (sizeof(struct vc4_shader_state) *
					  args->shader_rec_count);
	struct vc4_bo *bo;

	if (shader_rec_offset < args->bin_cl_size ||
	    uniforms_offset < shader_rec_offset ||
	    exec_size < uniforms_offset ||
	    args->shader_rec_count >= (UINT_MAX /
				       sizeof(struct vc4_shader_state)) ||
	    temp_size < exec_size) {
		DRM_DEBUG("overflow in exec arguments\n");
		ret = -EINVAL;
		goto fail;
	}

	/* Allocate space where we'll store the copied in user command lists
	 * and shader records.
	 *
	 * We don't just copy directly into the BOs because we need to
	 * read the contents back for validation, and I think access
	 * through bo->vaddr is uncached.
	 */
	temp = kvmalloc_array(temp_size, 1, GFP_KERNEL);
	if (!temp) {
		drm_err(dev, "Failed to allocate storage for copying "
			"in bin/render CLs.\n");
		ret = -ENOMEM;
		goto fail;
	}
	bin = temp + bin_offset;
	exec->shader_rec_u = temp + shader_rec_offset;
	exec->uniforms_u = temp + uniforms_offset;
	exec->shader_state = temp + exec_size;
	exec->shader_state_size = args->shader_rec_count;

	if (copy_from_user(bin,
			   u64_to_user_ptr(args->bin_cl),
			   args->bin_cl_size)) {
		ret = -EFAULT;
		goto fail;
	}

	if (copy_from_user(exec->shader_rec_u,
			   u64_to_user_ptr(args->shader_rec),
			   args->shader_rec_size)) {
		ret = -EFAULT;
		goto fail;
	}

	if (copy_from_user(exec->uniforms_u,
			   u64_to_user_ptr(args->uniforms),
			   args->uniforms_size)) {
		ret = -EFAULT;
		goto fail;
	}

	bo = vc4_bo_create(dev, exec_size, true, VC4_BO_TYPE_BCL);
	if (IS_ERR(bo)) {
		drm_err(dev, "Couldn't allocate BO for binning\n");
		ret = PTR_ERR(bo);
		goto fail;
	}
	exec->exec_bo = &bo->base;

	list_add_tail(&to_vc4_bo(&exec->exec_bo->base)->unref_head,
		      &exec->unref_list);

	exec->ct0ca = exec->exec_bo->dma_addr + bin_offset;

	exec->bin_u = bin;

	exec->shader_rec_v = exec->exec_bo->vaddr + shader_rec_offset;
	exec->shader_rec_p = exec->exec_bo->dma_addr + shader_rec_offset;
	exec->shader_rec_size = args->shader_rec_size;

	exec->uniforms_v = exec->exec_bo->vaddr + uniforms_offset;
	exec->uniforms_p = exec->exec_bo->dma_addr + uniforms_offset;
	exec->uniforms_size = args->uniforms_size;

	ret = vc4_validate_bin_cl(dev,
				  exec->exec_bo->vaddr + bin_offset,
				  bin,
				  exec);
	if (ret)
		goto fail;

	ret = vc4_validate_shader_recs(dev, exec);
	if (ret)
		goto fail;

	if (exec->found_tile_binning_mode_config_packet) {
		ret = vc4_v3d_bin_bo_get(vc4, &exec->bin_bo_used);
		if (ret)
			goto fail;
	}

fail:
	kvfree(temp);
	return ret;
}

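/* Tears down a finished (or failed) job: signals its fence if the IRQ
 * handler hasn't already, drops the references and usecnts held on its
 * BOs, releases the bin slots and binner BO it used, and puts the perfmon
 * and power references.
 */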
static void
vc4_complete_exec(struct drm_device *dev, struct vc4_exec_info *exec)
{
	struct vc4_dev *vc4 = to_vc4_dev(dev);
	unsigned long irqflags;
	unsigned i;

	/* If we got force-completed because of GPU reset rather than
	 * through our IRQ handler, signal the fence now.
	 */
	if (exec->fence) {
		dma_fence_signal(exec->fence);
		dma_fence_put(exec->fence);
	}

	if (exec->bo) {
		for (i = 0; i < exec->bo_count; i++) {
			struct vc4_bo *bo = to_vc4_bo(exec->bo[i]);

			vc4_bo_dec_usecnt(bo);
			drm_gem_object_put(exec->bo[i]);
		}
		kvfree(exec->bo);
	}

	while (!list_empty(&exec->unref_list)) {
		struct vc4_bo *bo = list_first_entry(&exec->unref_list,
						     struct vc4_bo, unref_head);
		list_del(&bo->unref_head);
		drm_gem_object_put(&bo->base.base);
	}

	/* Free up the allocation of any bin slots we used. */
	spin_lock_irqsave(&vc4->job_lock, irqflags);
	vc4->bin_alloc_used &= ~exec->bin_slots;
	spin_unlock_irqrestore(&vc4->job_lock, irqflags);

	/* Release the reference on the binner BO if needed. */
	if (exec->bin_bo_used)
		vc4_v3d_bin_bo_put(vc4);

	/* Release the reference we had on the perf monitor. */
	vc4_perfmon_put(exec->perfmon);

	vc4_v3d_pm_put(vc4);

	kfree(exec);
}

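/* Reaps the jobs on job_done_list, dropping job_lock around each
 * vc4_complete_exec() call since completion may sleep.
 */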
void
vc4_job_handle_completed(struct vc4_dev *vc4)
{
	unsigned long irqflags;

	if (WARN_ON_ONCE(vc4->gen > VC4_GEN_4))
		return;

	spin_lock_irqsave(&vc4->job_lock, irqflags);
	while (!list_empty(&vc4->job_done_list)) {
		struct vc4_exec_info *exec =
			list_first_entry(&vc4->job_done_list,
					 struct vc4_exec_info, head);
		list_del(&exec->head);

		spin_unlock_irqrestore(&vc4->job_lock, irqflags);
		vc4_complete_exec(&vc4->base, exec);
		spin_lock_irqsave(&vc4->job_lock, irqflags);
	}

	spin_unlock_irqrestore(&vc4->job_lock, irqflags);
}

/* Scheduled when any job has been completed, this walks the list of
 * jobs that had completed and unrefs their BOs and frees their exec
 * structs.
 */
static void
vc4_job_done_work(struct work_struct *work)
{
	struct vc4_dev *vc4 =
		container_of(work, struct vc4_dev, job_done_work);

	vc4_job_handle_completed(vc4);
}

static int
vc4_wait_for_seqno_ioctl_helper(struct drm_device *dev,
				uint64_t seqno,
				uint64_t *timeout_ns)
{
	unsigned long start = jiffies;
	int ret = vc4_wait_for_seqno(dev, seqno, *timeout_ns, true);

	if ((ret == -EINTR || ret == -ERESTARTSYS) && *timeout_ns != ~0ull) {
		uint64_t delta = jiffies_to_nsecs(jiffies - start);

		if (*timeout_ns >= delta)
			*timeout_ns -= delta;
	}

	return ret;
}

int
vc4_wait_seqno_ioctl(struct drm_device *dev, void *data,
		     struct drm_file *file_priv)
{
	struct vc4_dev *vc4 = to_vc4_dev(dev);
	struct drm_vc4_wait_seqno *args = data;

	if (WARN_ON_ONCE(vc4->gen > VC4_GEN_4))
		return -ENODEV;

	return vc4_wait_for_seqno_ioctl_helper(dev, args->seqno,
					       &args->timeout_ns);
}

int
vc4_wait_bo_ioctl(struct drm_device *dev, void *data,
		  struct drm_file *file_priv)
{
	struct vc4_dev *vc4 = to_vc4_dev(dev);
	int ret;
	struct drm_vc4_wait_bo *args = data;
	unsigned long timeout_jiffies =
		usecs_to_jiffies(div_u64(args->timeout_ns, 1000));
	ktime_t start = ktime_get();
	u64 delta_ns;

	if (WARN_ON_ONCE(vc4->gen > VC4_GEN_4))
		return -ENODEV;

	if (args->pad != 0)
		return -EINVAL;

	ret = drm_gem_dma_resv_wait(file_priv, args->handle,
				    true, timeout_jiffies);

	/* Decrement the user's timeout, in case we got interrupted
	 * such that the ioctl will be restarted.
	 */
	delta_ns = ktime_to_ns(ktime_sub(ktime_get(), start));
	if (delta_ns < args->timeout_ns)
		args->timeout_ns -= delta_ns;
	else
		args->timeout_ns = 0;

	return ret;
}

/**
 * vc4_submit_cl_ioctl() - Submits a job (frame) to the VC4.
 * @dev: DRM device
 * @data: ioctl argument
 * @file_priv: DRM file for this fd
 *
 * This is the main entrypoint for userspace to submit a 3D frame to
 * the GPU. Userspace provides the binner command list (if
 * applicable), and the kernel sets up the render command list to draw
 * to the framebuffer described in the ioctl, using the command lists
 * that the 3D engine's binner will produce.
 */
int
vc4_submit_cl_ioctl(struct drm_device *dev, void *data,
		    struct drm_file *file_priv)
{
	struct vc4_dev *vc4 = to_vc4_dev(dev);
	struct vc4_file *vc4file = file_priv->driver_priv;
	struct drm_vc4_submit_cl *args = data;
	struct drm_syncobj *out_sync = NULL;
	struct vc4_exec_info *exec;
	struct drm_exec exec_ctx;
	struct dma_fence *in_fence;
	int ret = 0;

	trace_vc4_submit_cl_ioctl(dev, args->bin_cl_size,
				  args->shader_rec_size,
				  args->bo_handle_count);

	if (WARN_ON_ONCE(vc4->gen > VC4_GEN_4))
		return -ENODEV;

	if (!vc4->v3d) {
		DRM_DEBUG("VC4_SUBMIT_CL with no VC4 V3D probed\n");
		return -ENODEV;
	}

	if ((args->flags & ~(VC4_SUBMIT_CL_USE_CLEAR_COLOR |
			     VC4_SUBMIT_CL_FIXED_RCL_ORDER |
			     VC4_SUBMIT_CL_RCL_ORDER_INCREASING_X |
			     VC4_SUBMIT_CL_RCL_ORDER_INCREASING_Y)) != 0) {
		DRM_DEBUG("Unknown flags: 0x%02x\n", args->flags);
		return -EINVAL;
	}

	if (args->pad2 != 0) {
		DRM_DEBUG("Invalid pad: 0x%08x\n", args->pad2);
		return -EINVAL;
	}

	exec = kcalloc(1, sizeof(*exec), GFP_KERNEL);
	if (!exec)
		return -ENOMEM;

	exec->dev = vc4;

	ret = vc4_v3d_pm_get(vc4);
	if (ret) {
		kfree(exec);
		return ret;
	}

	exec->args = args;
	INIT_LIST_HEAD(&exec->unref_list);

	ret = vc4_cl_lookup_bos(dev, file_priv, exec);
	if (ret)
		goto fail;

	if (args->perfmonid) {
		exec->perfmon = vc4_perfmon_find(vc4file,
						 args->perfmonid);
		if (!exec->perfmon) {
			ret = -ENOENT;
			goto fail;
		}
	}

	if (args->in_sync) {
		ret = drm_syncobj_find_fence(file_priv, args->in_sync,
					     0, 0, &in_fence);
		if (ret)
			goto fail;

		/* When the fence (or fence array) is exclusively from our
		 * context we can skip the wait since jobs are executed in
		 * order of their submission through this ioctl and this can
		 * only have fences from a prior job.
		 */
		if (!dma_fence_match_context(in_fence,
					     vc4->dma_fence_context)) {
			ret = dma_fence_wait(in_fence, true);
			if (ret) {
				dma_fence_put(in_fence);
				goto fail;
			}
		}

		dma_fence_put(in_fence);
	}

	if (exec->args->bin_cl_size != 0) {
		ret = vc4_get_bcl(dev, exec);
		if (ret)
			goto fail;
	} else {
		exec->ct0ca = 0;
		exec->ct0ea = 0;
	}

	ret = vc4_get_rcl(dev, exec);
	if (ret)
		goto fail;

	ret = vc4_lock_bo_reservations(exec, &exec_ctx);
	if (ret)
		goto fail;

	if (args->out_sync) {
		out_sync = drm_syncobj_find(file_priv, args->out_sync);
		if (!out_sync) {
			ret = -EINVAL;
			goto fail_unreserve;
		}

		/* We replace the fence in out_sync in vc4_queue_submit since
		 * the render job could execute immediately after that call.
		 * If it finishes before our ioctl processing resumes the
		 * render job fence could already have been freed.
		 */
	}

	/* Clear this out of the struct we'll be putting in the queue,
	 * since it's part of our stack.
	 */
	exec->args = NULL;

	ret = vc4_queue_submit(dev, exec, &exec_ctx, out_sync);

	/* The syncobj isn't part of the exec data and we need to free our
	 * reference even if job submission failed.
	 */
	if (out_sync)
		drm_syncobj_put(out_sync);

	if (ret)
		goto fail_unreserve;

	/* Return the seqno for our job. */
	args->seqno = vc4->emit_seqno;

	return 0;

fail_unreserve:
	drm_exec_fini(&exec_ctx);
fail:
	vc4_complete_exec(&vc4->base, exec);

	return ret;
}

static void vc4_gem_destroy(struct drm_device *dev, void *unused);
int vc4_gem_init(struct drm_device *dev)
{
	struct vc4_dev *vc4 = to_vc4_dev(dev);
	int ret;

	if (WARN_ON_ONCE(vc4->gen > VC4_GEN_4))
		return -ENODEV;

	vc4->dma_fence_context = dma_fence_context_alloc(1);

	INIT_LIST_HEAD(&vc4->bin_job_list);
	INIT_LIST_HEAD(&vc4->render_job_list);
	INIT_LIST_HEAD(&vc4->job_done_list);
	spin_lock_init(&vc4->job_lock);

	INIT_WORK(&vc4->hangcheck.reset_work, vc4_reset_work);
	timer_setup(&vc4->hangcheck.timer, vc4_hangcheck_elapsed, 0);

	INIT_WORK(&vc4->job_done_work, vc4_job_done_work);

	ret = drmm_mutex_init(dev, &vc4->power_lock);
	if (ret)
		return ret;

	INIT_LIST_HEAD(&vc4->purgeable.list);

	ret = drmm_mutex_init(dev, &vc4->purgeable.lock);
	if (ret)
		return ret;

	return drmm_add_action_or_reset(dev, vc4_gem_destroy, NULL);
}

static void vc4_gem_destroy(struct drm_device *dev, void *unused)
{
	struct vc4_dev *vc4 = to_vc4_dev(dev);

	/* Waiting for exec to finish would need to be done before
	 * unregistering V3D.
	 */
	WARN_ON(vc4->emit_seqno != vc4->finished_seqno);

	/* V3D should already have disabled its interrupt and cleared
	 * the overflow allocation registers. Now free the object.
	 */
	if (vc4->bin_bo) {
		drm_gem_object_put(&vc4->bin_bo->base.base);
		vc4->bin_bo = NULL;
	}

	if (vc4->hang_state)
		vc4_free_hang_state(dev, vc4->hang_state);
}

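/* Lets userspace mark a BO as purgeable (DONTNEED) or needed again
 * (WILLNEED), maintaining the driver's purgeable pool accordingly, and
 * reports back whether the BO's contents were retained.
 */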
int vc4_gem_madvise_ioctl(struct drm_device *dev, void *data,
			  struct drm_file *file_priv)
{
	struct vc4_dev *vc4 = to_vc4_dev(dev);
	struct drm_vc4_gem_madvise *args = data;
	struct drm_gem_object *gem_obj;
	struct vc4_bo *bo;
	int ret;

	if (WARN_ON_ONCE(vc4->gen > VC4_GEN_4))
		return -ENODEV;

	switch (args->madv) {
	case VC4_MADV_DONTNEED:
	case VC4_MADV_WILLNEED:
		break;
	default:
		return -EINVAL;
	}

	if (args->pad != 0)
		return -EINVAL;

	gem_obj = drm_gem_object_lookup(file_priv, args->handle);
	if (!gem_obj) {
		DRM_DEBUG("Failed to look up GEM BO %d\n", args->handle);
		return -ENOENT;
	}

	bo = to_vc4_bo(gem_obj);

	/* Only BOs exposed to userspace can be purged. */
	if (bo->madv == __VC4_MADV_NOTSUPP) {
		DRM_DEBUG("madvise not supported on this BO\n");
		ret = -EINVAL;
		goto out_put_gem;
	}

	/* Not sure it's safe to purge imported BOs. Let's just assume it's
	 * not until proven otherwise.
	 */
	if (gem_obj->import_attach) {
		DRM_DEBUG("madvise not supported on imported BOs\n");
		ret = -EINVAL;
		goto out_put_gem;
	}

	mutex_lock(&bo->madv_lock);

	if (args->madv == VC4_MADV_DONTNEED && bo->madv == VC4_MADV_WILLNEED &&
	    !refcount_read(&bo->usecnt)) {
		/* If the BO is about to be marked as purgeable, is not used
		 * and is not already purgeable or purged, add it to the
		 * purgeable list.
		 */
		vc4_bo_add_to_purgeable_pool(bo);
	} else if (args->madv == VC4_MADV_WILLNEED &&
		   bo->madv == VC4_MADV_DONTNEED &&
		   !refcount_read(&bo->usecnt)) {
		/* The BO has not been purged yet, just remove it from
		 * the purgeable list.
		 */
		vc4_bo_remove_from_purgeable_pool(bo);
	}

	/* Save the purged state. */
	args->retained = bo->madv != __VC4_MADV_PURGED;

	/* Update internal madv state only if the bo was not purged. */
	if (bo->madv != __VC4_MADV_PURGED)
		bo->madv = args->madv;

	mutex_unlock(&bo->madv_lock);

	ret = 0;

out_put_gem:
	drm_gem_object_put(gem_obj);

	return ret;
}