// SPDX-License-Identifier: GPL-2.0+
/* Copyright (C) 2018 Broadcom */

/**
 * DOC: Broadcom V3D scheduling
 *
 * The shared DRM GPU scheduler is used to coordinate submitting jobs
 * to the hardware. Each DRM fd (roughly a client process) gets its
 * own scheduler entity, which will process jobs in order. The GPU
 * scheduler will round-robin between clients to submit the next job.
 *
 * For simplicity, and in order to keep latency low for interactive
 * jobs when bulk background jobs are queued up, we submit a new job
 * to the HW only when it has completed the last one, instead of
 * filling up the CT[01]Q FIFOs with jobs. Similarly, we use
 * drm_sched_job_add_dependency() to manage the dependency between bin and
 * render, instead of having the clients submit jobs using the HW's
 * semaphores to interlock between them.
 */

#include <linux/sched/clock.h>
#include <linux/kthread.h>

#include "v3d_drv.h"
#include "v3d_regs.h"
#include "v3d_trace.h"

static struct v3d_job *
to_v3d_job(struct drm_sched_job *sched_job)
{
	return container_of(sched_job, struct v3d_job, base);
}

static struct v3d_bin_job *
to_bin_job(struct drm_sched_job *sched_job)
{
	return container_of(sched_job, struct v3d_bin_job, base.base);
}

static struct v3d_render_job *
to_render_job(struct drm_sched_job *sched_job)
{
	return container_of(sched_job, struct v3d_render_job, base.base);
}

static struct v3d_tfu_job *
to_tfu_job(struct drm_sched_job *sched_job)
{
	return container_of(sched_job, struct v3d_tfu_job, base.base);
}

static struct v3d_csd_job *
to_csd_job(struct drm_sched_job *sched_job)
{
	return container_of(sched_job, struct v3d_csd_job, base.base);
}

static void
v3d_sched_job_free(struct drm_sched_job *sched_job)
{
	struct v3d_job *job = to_v3d_job(sched_job);

	v3d_job_cleanup(job);
}

static void
v3d_switch_perfmon(struct v3d_dev *v3d, struct v3d_job *job)
{
	if (job->perfmon != v3d->active_perfmon)
		v3d_perfmon_stop(v3d, v3d->active_perfmon, true);

	if (job->perfmon && v3d->active_perfmon != job->perfmon)
		v3d_perfmon_start(v3d, job->perfmon);
}
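
/*
 * Common shape of the .run_job hooks below: record the job as the
 * queue's active job, create the fence that the IRQ handler will
 * signal when the hardware finishes, start the runtime accounting,
 * and then write the registers that actually kick off the job. The
 * returned fence is what the scheduler waits on before handing this
 * queue its next job, which (together with hw_jobs_limit = 1 in
 * v3d_sched_init()) gives the "one job in the HW at a time" behaviour
 * described in the DOC comment above.
 *
 * Illustrative sketch only (the real code lives in the submit ioctl
 * path, not in this file): the bin->render dependency mentioned above
 * is expressed by adding the bin job's scheduler fence as a
 * dependency of the render job, roughly
 *
 *	drm_sched_job_add_dependency(&render->base.base,
 *				     dma_fence_get(&bin->base.base.s_fence->finished));
 *
 * so v3d_render_job_run() is not called until binning has completed.
 */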
static struct dma_fence *v3d_bin_job_run(struct drm_sched_job *sched_job)
{
	struct v3d_bin_job *job = to_bin_job(sched_job);
	struct v3d_dev *v3d = job->base.v3d;
	struct v3d_file_priv *file = job->base.file->driver_priv;
	struct drm_device *dev = &v3d->drm;
	struct dma_fence *fence;
	unsigned long irqflags;

	if (unlikely(job->base.base.s_fence->finished.error))
		return NULL;

	/* Lock required around bin_job update vs
	 * v3d_overflow_mem_work().
	 */
	spin_lock_irqsave(&v3d->job_lock, irqflags);
	v3d->bin_job = job;
	/* Clear out the overflow allocation, so we don't
	 * reuse the overflow attached to a previous job.
	 */
	V3D_CORE_WRITE(0, V3D_PTB_BPOS, 0);
	spin_unlock_irqrestore(&v3d->job_lock, irqflags);

	v3d_invalidate_caches(v3d);

	fence = v3d_fence_create(v3d, V3D_BIN);
	if (IS_ERR(fence))
		return NULL;

	if (job->base.irq_fence)
		dma_fence_put(job->base.irq_fence);
	job->base.irq_fence = dma_fence_get(fence);

	trace_v3d_submit_cl(dev, false, to_v3d_fence(fence)->seqno,
			    job->start, job->end);

	file->start_ns[V3D_BIN] = local_clock();
	v3d->queue[V3D_BIN].start_ns = file->start_ns[V3D_BIN];

	v3d_switch_perfmon(v3d, &job->base);

	/* Set the current and end address of the control list.
	 * Writing the end register is what starts the job.
	 */
	if (job->qma) {
		V3D_CORE_WRITE(0, V3D_CLE_CT0QMA, job->qma);
		V3D_CORE_WRITE(0, V3D_CLE_CT0QMS, job->qms);
	}
	if (job->qts) {
		V3D_CORE_WRITE(0, V3D_CLE_CT0QTS,
			       V3D_CLE_CT0QTS_ENABLE |
			       job->qts);
	}
	V3D_CORE_WRITE(0, V3D_CLE_CT0QBA, job->start);
	V3D_CORE_WRITE(0, V3D_CLE_CT0QEA, job->end);

	return fence;
}

static struct dma_fence *v3d_render_job_run(struct drm_sched_job *sched_job)
{
	struct v3d_render_job *job = to_render_job(sched_job);
	struct v3d_dev *v3d = job->base.v3d;
	struct v3d_file_priv *file = job->base.file->driver_priv;
	struct drm_device *dev = &v3d->drm;
	struct dma_fence *fence;

	if (unlikely(job->base.base.s_fence->finished.error))
		return NULL;

	v3d->render_job = job;

	/* Can we avoid this flush? We need to be careful of
	 * scheduling, though -- imagine job0 rendering to texture and
	 * job1 reading, and them being executed as bin0, bin1,
	 * render0, render1, so that render1's flush at bin time
	 * wasn't enough.
	 */
	v3d_invalidate_caches(v3d);

	fence = v3d_fence_create(v3d, V3D_RENDER);
	if (IS_ERR(fence))
		return NULL;

	if (job->base.irq_fence)
		dma_fence_put(job->base.irq_fence);
	job->base.irq_fence = dma_fence_get(fence);

	trace_v3d_submit_cl(dev, true, to_v3d_fence(fence)->seqno,
			    job->start, job->end);

	file->start_ns[V3D_RENDER] = local_clock();
	v3d->queue[V3D_RENDER].start_ns = file->start_ns[V3D_RENDER];

	v3d_switch_perfmon(v3d, &job->base);

	/* XXX: Set the QCFG */

	/* Set the current and end address of the control list.
	 * Writing the end register is what starts the job.
	 */
	V3D_CORE_WRITE(0, V3D_CLE_CT1QBA, job->start);
	V3D_CORE_WRITE(0, V3D_CLE_CT1QEA, job->end);

	return fence;
}
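
/*
 * The TFU register offsets differ between V3D 4.x and 7.x, so they
 * are looked up through v3d->ver; V3D 7.1 additionally has a separate
 * IOC register and always programs all four coefficient registers.
 */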
static struct dma_fence *
v3d_tfu_job_run(struct drm_sched_job *sched_job)
{
	struct v3d_tfu_job *job = to_tfu_job(sched_job);
	struct v3d_dev *v3d = job->base.v3d;
	struct v3d_file_priv *file = job->base.file->driver_priv;
	struct drm_device *dev = &v3d->drm;
	struct dma_fence *fence;

	fence = v3d_fence_create(v3d, V3D_TFU);
	if (IS_ERR(fence))
		return NULL;

	v3d->tfu_job = job;
	if (job->base.irq_fence)
		dma_fence_put(job->base.irq_fence);
	job->base.irq_fence = dma_fence_get(fence);

	trace_v3d_submit_tfu(dev, to_v3d_fence(fence)->seqno);

	file->start_ns[V3D_TFU] = local_clock();
	v3d->queue[V3D_TFU].start_ns = file->start_ns[V3D_TFU];

	V3D_WRITE(V3D_TFU_IIA(v3d->ver), job->args.iia);
	V3D_WRITE(V3D_TFU_IIS(v3d->ver), job->args.iis);
	V3D_WRITE(V3D_TFU_ICA(v3d->ver), job->args.ica);
	V3D_WRITE(V3D_TFU_IUA(v3d->ver), job->args.iua);
	V3D_WRITE(V3D_TFU_IOA(v3d->ver), job->args.ioa);
	if (v3d->ver >= 71)
		V3D_WRITE(V3D_V7_TFU_IOC, job->args.v71.ioc);
	V3D_WRITE(V3D_TFU_IOS(v3d->ver), job->args.ios);
	V3D_WRITE(V3D_TFU_COEF0(v3d->ver), job->args.coef[0]);
	if (v3d->ver >= 71 || (job->args.coef[0] & V3D_TFU_COEF0_USECOEF)) {
		V3D_WRITE(V3D_TFU_COEF1(v3d->ver), job->args.coef[1]);
		V3D_WRITE(V3D_TFU_COEF2(v3d->ver), job->args.coef[2]);
		V3D_WRITE(V3D_TFU_COEF3(v3d->ver), job->args.coef[3]);
	}
	/* ICFG kicks off the job. */
	V3D_WRITE(V3D_TFU_ICFG(v3d->ver), job->args.icfg | V3D_TFU_ICFG_IOC);

	return fence;
}

static struct dma_fence *
v3d_csd_job_run(struct drm_sched_job *sched_job)
{
	struct v3d_csd_job *job = to_csd_job(sched_job);
	struct v3d_dev *v3d = job->base.v3d;
	struct v3d_file_priv *file = job->base.file->driver_priv;
	struct drm_device *dev = &v3d->drm;
	struct dma_fence *fence;
	int i, csd_cfg0_reg, csd_cfg_reg_count;

	v3d->csd_job = job;

	v3d_invalidate_caches(v3d);

	fence = v3d_fence_create(v3d, V3D_CSD);
	if (IS_ERR(fence))
		return NULL;

	if (job->base.irq_fence)
		dma_fence_put(job->base.irq_fence);
	job->base.irq_fence = dma_fence_get(fence);

	trace_v3d_submit_csd(dev, to_v3d_fence(fence)->seqno);

	file->start_ns[V3D_CSD] = local_clock();
	v3d->queue[V3D_CSD].start_ns = file->start_ns[V3D_CSD];

	v3d_switch_perfmon(v3d, &job->base);

	csd_cfg0_reg = V3D_CSD_QUEUED_CFG0(v3d->ver);
	csd_cfg_reg_count = v3d->ver < 71 ? 6 : 7;
	for (i = 1; i <= csd_cfg_reg_count; i++)
		V3D_CORE_WRITE(0, csd_cfg0_reg + 4 * i, job->args.cfg[i]);
	/* CFG0 write kicks off the job. */
	V3D_CORE_WRITE(0, csd_cfg0_reg, job->args.cfg[0]);

	return fence;
}
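
/*
 * Unlike the other queues, the cache clean runs synchronously on the
 * CPU, so there is no hardware fence to hand back: returning NULL
 * lets the scheduler treat the job as finished as soon as .run_job
 * returns, and the runtime accounting is completed inline here.
 */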
static struct dma_fence *
v3d_cache_clean_job_run(struct drm_sched_job *sched_job)
{
	struct v3d_job *job = to_v3d_job(sched_job);
	struct v3d_dev *v3d = job->v3d;
	struct v3d_file_priv *file = job->file->driver_priv;
	u64 runtime;

	file->start_ns[V3D_CACHE_CLEAN] = local_clock();
	v3d->queue[V3D_CACHE_CLEAN].start_ns = file->start_ns[V3D_CACHE_CLEAN];

	v3d_clean_caches(v3d);

	runtime = local_clock() - file->start_ns[V3D_CACHE_CLEAN];

	file->enabled_ns[V3D_CACHE_CLEAN] += runtime;
	v3d->queue[V3D_CACHE_CLEAN].enabled_ns += runtime;

	file->jobs_sent[V3D_CACHE_CLEAN]++;
	v3d->queue[V3D_CACHE_CLEAN].jobs_sent++;

	file->start_ns[V3D_CACHE_CLEAN] = 0;
	v3d->queue[V3D_CACHE_CLEAN].start_ns = 0;

	return NULL;
}

static enum drm_gpu_sched_stat
v3d_gpu_reset_for_timeout(struct v3d_dev *v3d, struct drm_sched_job *sched_job)
{
	enum v3d_queue q;

	mutex_lock(&v3d->reset_lock);

	/* block scheduler */
	for (q = 0; q < V3D_MAX_QUEUES; q++)
		drm_sched_stop(&v3d->queue[q].sched, sched_job);

	if (sched_job)
		drm_sched_increase_karma(sched_job);

	/* get the GPU back into the init state */
	v3d_reset(v3d);

	for (q = 0; q < V3D_MAX_QUEUES; q++)
		drm_sched_resubmit_jobs(&v3d->queue[q].sched);

	/* Unblock schedulers and restart their jobs. */
	for (q = 0; q < V3D_MAX_QUEUES; q++) {
		drm_sched_start(&v3d->queue[q].sched, true);
	}

	mutex_unlock(&v3d->reset_lock);

	return DRM_GPU_SCHED_STAT_NOMINAL;
}
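
/*
 * Timeout handling: the CL and CSD handlers below first check whether
 * the hardware has made progress since the last timeout (current and
 * return addresses for bin/render, completed batches for CSD) and, if
 * so, just let the scheduler rearm its timer. The generic handler
 * used for the TFU and cache-clean queues has no progress check. A
 * real hang goes through v3d_gpu_reset_for_timeout(), which stops
 * every queue, resets the GPU and resubmits the outstanding jobs.
 */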

/* If the current address or return address have changed, then the GPU
 * has probably made progress and we should delay the reset. This
 * could fail if the GPU got in an infinite loop in the CL, but that
 * is pretty unlikely outside of an i-g-t testcase.
 */
static enum drm_gpu_sched_stat
v3d_cl_job_timedout(struct drm_sched_job *sched_job, enum v3d_queue q,
		    u32 *timedout_ctca, u32 *timedout_ctra)
{
	struct v3d_job *job = to_v3d_job(sched_job);
	struct v3d_dev *v3d = job->v3d;
	u32 ctca = V3D_CORE_READ(0, V3D_CLE_CTNCA(q));
	u32 ctra = V3D_CORE_READ(0, V3D_CLE_CTNRA(q));

	if (*timedout_ctca != ctca || *timedout_ctra != ctra) {
		*timedout_ctca = ctca;
		*timedout_ctra = ctra;
		return DRM_GPU_SCHED_STAT_NOMINAL;
	}

	return v3d_gpu_reset_for_timeout(v3d, sched_job);
}

static enum drm_gpu_sched_stat
v3d_bin_job_timedout(struct drm_sched_job *sched_job)
{
	struct v3d_bin_job *job = to_bin_job(sched_job);

	return v3d_cl_job_timedout(sched_job, V3D_BIN,
				   &job->timedout_ctca, &job->timedout_ctra);
}

static enum drm_gpu_sched_stat
v3d_render_job_timedout(struct drm_sched_job *sched_job)
{
	struct v3d_render_job *job = to_render_job(sched_job);

	return v3d_cl_job_timedout(sched_job, V3D_RENDER,
				   &job->timedout_ctca, &job->timedout_ctra);
}

static enum drm_gpu_sched_stat
v3d_generic_job_timedout(struct drm_sched_job *sched_job)
{
	struct v3d_job *job = to_v3d_job(sched_job);

	return v3d_gpu_reset_for_timeout(job->v3d, sched_job);
}

static enum drm_gpu_sched_stat
v3d_csd_job_timedout(struct drm_sched_job *sched_job)
{
	struct v3d_csd_job *job = to_csd_job(sched_job);
	struct v3d_dev *v3d = job->base.v3d;
	u32 batches = V3D_CORE_READ(0, V3D_CSD_CURRENT_CFG4(v3d->ver));

	/* If we've made progress, skip reset and let the timer get
	 * rearmed.
	 */
	if (job->timedout_batches != batches) {
		job->timedout_batches = batches;
		return DRM_GPU_SCHED_STAT_NOMINAL;
	}

	return v3d_gpu_reset_for_timeout(v3d, sched_job);
}

static const struct drm_sched_backend_ops v3d_bin_sched_ops = {
	.run_job = v3d_bin_job_run,
	.timedout_job = v3d_bin_job_timedout,
	.free_job = v3d_sched_job_free,
};

static const struct drm_sched_backend_ops v3d_render_sched_ops = {
	.run_job = v3d_render_job_run,
	.timedout_job = v3d_render_job_timedout,
	.free_job = v3d_sched_job_free,
};

static const struct drm_sched_backend_ops v3d_tfu_sched_ops = {
	.run_job = v3d_tfu_job_run,
	.timedout_job = v3d_generic_job_timedout,
	.free_job = v3d_sched_job_free,
};

static const struct drm_sched_backend_ops v3d_csd_sched_ops = {
	.run_job = v3d_csd_job_run,
	.timedout_job = v3d_csd_job_timedout,
	.free_job = v3d_sched_job_free
};

static const struct drm_sched_backend_ops v3d_cache_clean_sched_ops = {
	.run_job = v3d_cache_clean_job_run,
	.timedout_job = v3d_generic_job_timedout,
	.free_job = v3d_sched_job_free
};
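
/*
 * One drm_gpu_scheduler instance is created per V3D queue (the CSD
 * and cache-clean queues only when the hardware has a CSD). The
 * hw_jobs_limit of 1 keeps a single job in flight per queue, matching
 * the DOC comment at the top of this file, and a job that has not
 * completed within 500ms gets its queue's .timedout_job hook called.
 */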
int
v3d_sched_init(struct v3d_dev *v3d)
{
	int hw_jobs_limit = 1;
	int job_hang_limit = 0;
	int hang_limit_ms = 500;
	int ret;

	ret = drm_sched_init(&v3d->queue[V3D_BIN].sched,
			     &v3d_bin_sched_ops, NULL,
			     DRM_SCHED_PRIORITY_COUNT,
			     hw_jobs_limit, job_hang_limit,
			     msecs_to_jiffies(hang_limit_ms), NULL,
			     NULL, "v3d_bin", v3d->drm.dev);
	if (ret)
		return ret;

	ret = drm_sched_init(&v3d->queue[V3D_RENDER].sched,
			     &v3d_render_sched_ops, NULL,
			     DRM_SCHED_PRIORITY_COUNT,
			     hw_jobs_limit, job_hang_limit,
			     msecs_to_jiffies(hang_limit_ms), NULL,
			     NULL, "v3d_render", v3d->drm.dev);
	if (ret)
		goto fail;

	ret = drm_sched_init(&v3d->queue[V3D_TFU].sched,
			     &v3d_tfu_sched_ops, NULL,
			     DRM_SCHED_PRIORITY_COUNT,
			     hw_jobs_limit, job_hang_limit,
			     msecs_to_jiffies(hang_limit_ms), NULL,
			     NULL, "v3d_tfu", v3d->drm.dev);
	if (ret)
		goto fail;

	if (v3d_has_csd(v3d)) {
		ret = drm_sched_init(&v3d->queue[V3D_CSD].sched,
				     &v3d_csd_sched_ops, NULL,
				     DRM_SCHED_PRIORITY_COUNT,
				     hw_jobs_limit, job_hang_limit,
				     msecs_to_jiffies(hang_limit_ms), NULL,
				     NULL, "v3d_csd", v3d->drm.dev);
		if (ret)
			goto fail;

		ret = drm_sched_init(&v3d->queue[V3D_CACHE_CLEAN].sched,
				     &v3d_cache_clean_sched_ops, NULL,
				     DRM_SCHED_PRIORITY_COUNT,
				     hw_jobs_limit, job_hang_limit,
				     msecs_to_jiffies(hang_limit_ms), NULL,
				     NULL, "v3d_cache_clean", v3d->drm.dev);
		if (ret)
			goto fail;
	}

	return 0;

fail:
	v3d_sched_fini(v3d);
	return ret;
}

void
v3d_sched_fini(struct v3d_dev *v3d)
{
	enum v3d_queue q;

	for (q = 0; q < V3D_MAX_QUEUES; q++) {
		if (v3d->queue[q].sched.ready)
			drm_sched_fini(&v3d->queue[q].sched);
	}
}