// SPDX-License-Identifier: GPL-2.0 OR MIT
/* Copyright 2017-2019 Qiang Yu <yuq825@gmail.com> */

#include <linux/hardirq.h>
#include <linux/iosys-map.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/pm_runtime.h>

#include "lima_devfreq.h"
#include "lima_drv.h"
#include "lima_sched.h"
#include "lima_vm.h"
#include "lima_mmu.h"
#include "lima_l2_cache.h"
#include "lima_gem.h"
#include "lima_trace.h"

struct lima_fence {
	struct dma_fence base;
	struct lima_sched_pipe *pipe;
};

static struct kmem_cache *lima_fence_slab;
static int lima_fence_slab_refcnt;

int lima_sched_slab_init(void)
{
	if (!lima_fence_slab) {
		lima_fence_slab = kmem_cache_create(
			"lima_fence", sizeof(struct lima_fence), 0,
			SLAB_HWCACHE_ALIGN, NULL);
		if (!lima_fence_slab)
			return -ENOMEM;
	}

	lima_fence_slab_refcnt++;
	return 0;
}

void lima_sched_slab_fini(void)
{
	if (!--lima_fence_slab_refcnt) {
		kmem_cache_destroy(lima_fence_slab);
		lima_fence_slab = NULL;
	}
}

static inline struct lima_fence *to_lima_fence(struct dma_fence *fence)
{
	return container_of(fence, struct lima_fence, base);
}

static const char *lima_fence_get_driver_name(struct dma_fence *fence)
{
	return "lima";
}

static const char *lima_fence_get_timeline_name(struct dma_fence *fence)
{
	struct lima_fence *f = to_lima_fence(fence);

	return f->pipe->base.name;
}

static void lima_fence_release_rcu(struct rcu_head *rcu)
{
	struct dma_fence *f = container_of(rcu, struct dma_fence, rcu);
	struct lima_fence *fence = to_lima_fence(f);

	kmem_cache_free(lima_fence_slab, fence);
}

static void lima_fence_release(struct dma_fence *fence)
{
	struct lima_fence *f = to_lima_fence(fence);

	call_rcu(&f->base.rcu, lima_fence_release_rcu);
}

static const struct dma_fence_ops lima_fence_ops = {
	.get_driver_name = lima_fence_get_driver_name,
	.get_timeline_name = lima_fence_get_timeline_name,
	.release = lima_fence_release,
};

static struct lima_fence *lima_fence_create(struct lima_sched_pipe *pipe)
{
	struct lima_fence *fence;

	fence = kmem_cache_zalloc(lima_fence_slab, GFP_KERNEL);
	if (!fence)
		return NULL;

	fence->pipe = pipe;
	dma_fence_init(&fence->base, &lima_fence_ops, &pipe->fence_lock,
		       pipe->fence_context, ++pipe->fence_seqno);

	return fence;
}

static inline struct lima_sched_task *to_lima_task(struct drm_sched_job *job)
{
	return container_of(job, struct lima_sched_task, base);
}

static inline struct lima_sched_pipe *to_lima_pipe(struct drm_gpu_scheduler *sched)
{
	return container_of(sched, struct lima_sched_pipe, base);
}

int lima_sched_task_init(struct lima_sched_task *task,
			 struct lima_sched_context *context,
			 struct lima_bo **bos, int num_bos,
			 struct lima_vm *vm,
			 u64 drm_client_id)
{
	int err, i;

	task->bos = kmemdup(bos, sizeof(*bos) * num_bos, GFP_KERNEL);
	if (!task->bos)
		return -ENOMEM;

	for (i = 0; i < num_bos; i++)
		drm_gem_object_get(&bos[i]->base.base);

	err = drm_sched_job_init(&task->base, &context->base, 1, vm,
				 drm_client_id);
	if (err) {
		kfree(task->bos);
		return err;
	}

	drm_sched_job_arm(&task->base);

	task->num_bos = num_bos;
	task->vm = lima_vm_get(vm);

	return 0;
}

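/* Drop the references taken in lima_sched_task_init(): the scheduler job,
 * the GEM objects backing the task and the VM.
 */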
void lima_sched_task_fini(struct lima_sched_task *task)
{
	int i;

	drm_sched_job_cleanup(&task->base);

	if (task->bos) {
		for (i = 0; i < task->num_bos; i++)
			drm_gem_object_put(&task->bos[i]->base.base);
		kfree(task->bos);
	}

	lima_vm_put(task->vm);
}

int lima_sched_context_init(struct lima_sched_pipe *pipe,
			    struct lima_sched_context *context)
{
	struct drm_gpu_scheduler *sched = &pipe->base;

	return drm_sched_entity_init(&context->base, DRM_SCHED_PRIORITY_NORMAL,
				     &sched, 1, NULL);
}

void lima_sched_context_fini(struct lima_sched_pipe *pipe,
			     struct lima_sched_context *context)
{
	drm_sched_entity_destroy(&context->base);
}

struct dma_fence *lima_sched_context_queue_task(struct lima_sched_task *task)
{
	struct dma_fence *fence = dma_fence_get(&task->base.s_fence->finished);

	trace_lima_task_submit(task);
	drm_sched_entity_push_job(&task->base);
	return fence;
}

static int lima_pm_busy(struct lima_device *ldev)
{
	int ret;

	/* resume GPU if it has been suspended by runtime PM */
	ret = pm_runtime_resume_and_get(ldev->dev);
	if (ret < 0)
		return ret;

	lima_devfreq_record_busy(&ldev->devfreq);
	return 0;
}

static void lima_pm_idle(struct lima_device *ldev)
{
	lima_devfreq_record_idle(&ldev->devfreq);

	/* GPU can do auto runtime suspend */
	pm_runtime_mark_last_busy(ldev->dev);
	pm_runtime_put_autosuspend(ldev->dev);
}

static struct dma_fence *lima_sched_run_job(struct drm_sched_job *job)
{
	struct lima_sched_task *task = to_lima_task(job);
	struct lima_sched_pipe *pipe = to_lima_pipe(job->sched);
	struct lima_device *ldev = pipe->ldev;
	struct lima_fence *fence;
	int i, err;

	/* after GPU reset */
	if (job->s_fence->finished.error < 0)
		return NULL;

	fence = lima_fence_create(pipe);
	if (!fence)
		return NULL;

	err = lima_pm_busy(ldev);
	if (err < 0) {
		dma_fence_put(&fence->base);
		return NULL;
	}

	task->fence = &fence->base;

	/* take an extra reference for the caller, otherwise the irq handler
	 * may consume the fence before the caller gets to use it
	 */
	dma_fence_get(task->fence);

	pipe->current_task = task;

	/* This is needed for the MMU to work correctly, otherwise GP/PP
	 * will hang or page fault for unknown reasons after running for
	 * a while.
	 *
	 * Need to investigate:
	 * 1. is it related to the TLB?
	 * 2. how much performance is lost to the L2 cache flush?
	 * 3. can we flush less often, given that all GP/PP share the
	 *    same L2 cache on Mali400?
	 *
	 * TODO:
	 * 1. move this to task fini to save some wait time?
	 * 2. when GP/PP use different L2 caches, does PP need to wait
	 *    for the GP L2 cache flush?
	 */
	for (i = 0; i < pipe->num_l2_cache; i++)
		lima_l2_cache_flush(pipe->l2_cache[i]);

	lima_vm_put(pipe->current_vm);
	pipe->current_vm = lima_vm_get(task->vm);

	if (pipe->bcast_mmu)
		lima_mmu_switch_vm(pipe->bcast_mmu, pipe->current_vm);
	else {
		for (i = 0; i < pipe->num_mmu; i++)
			lima_mmu_switch_vm(pipe->mmu[i], pipe->current_vm);
	}

	trace_lima_task_run(task);

	pipe->error = false;
	pipe->task_run(pipe, task);

	return task->fence;
}

static void lima_sched_build_error_task_list(struct lima_sched_task *task)
{
	struct lima_sched_error_task *et;
	struct lima_sched_pipe *pipe = to_lima_pipe(task->base.sched);
	struct lima_ip *ip = pipe->processor[0];
	int pipe_id = ip->id == lima_ip_gp ? lima_pipe_gp : lima_pipe_pp;
	struct lima_device *dev = ip->dev;
	struct lima_sched_context *sched_ctx =
		container_of(task->base.entity,
			     struct lima_sched_context, base);
	struct lima_ctx *ctx =
		container_of(sched_ctx, struct lima_ctx, context[pipe_id]);
	struct lima_dump_task *dt;
	struct lima_dump_chunk *chunk;
	struct lima_dump_chunk_pid *pid_chunk;
	struct lima_dump_chunk_buffer *buffer_chunk;
	u32 size, task_size, mem_size;
	int i;
	struct iosys_map map;
	int ret;

	mutex_lock(&dev->error_task_list_lock);

	if (dev->dump.num_tasks >= lima_max_error_tasks) {
		dev_info(dev->dev, "fail to save task state from %s pid %d: "
			 "error task list is full\n", ctx->pname, ctx->pid);
		goto out;
	}

	/* frame chunk */
	size = sizeof(struct lima_dump_chunk) + pipe->frame_size;
	/* process name chunk */
	size += sizeof(struct lima_dump_chunk) + sizeof(ctx->pname);
	/* pid chunk */
	size += sizeof(struct lima_dump_chunk);
	/* buffer chunks */
	for (i = 0; i < task->num_bos; i++) {
		struct lima_bo *bo = task->bos[i];

		size += sizeof(struct lima_dump_chunk);
		size += bo->heap_size ? bo->heap_size : lima_bo_size(bo);
	}

	task_size = size + sizeof(struct lima_dump_task);
	mem_size = task_size + sizeof(*et);
	et = kvmalloc(mem_size, GFP_KERNEL);
	if (!et) {
		dev_err(dev->dev, "fail to alloc task dump buffer of size %x\n",
			mem_size);
		goto out;
	}

	et->data = et + 1;
	et->size = task_size;

	dt = et->data;
	memset(dt, 0, sizeof(*dt));
	dt->id = pipe_id;
	dt->size = size;

	chunk = (struct lima_dump_chunk *)(dt + 1);
	memset(chunk, 0, sizeof(*chunk));
	chunk->id = LIMA_DUMP_CHUNK_FRAME;
	chunk->size = pipe->frame_size;
	memcpy(chunk + 1, task->frame, pipe->frame_size);
	dt->num_chunks++;

	chunk = (void *)(chunk + 1) + chunk->size;
	memset(chunk, 0, sizeof(*chunk));
	chunk->id = LIMA_DUMP_CHUNK_PROCESS_NAME;
	chunk->size = sizeof(ctx->pname);
	memcpy(chunk + 1, ctx->pname, sizeof(ctx->pname));
	dt->num_chunks++;

	pid_chunk = (void *)(chunk + 1) + chunk->size;
	memset(pid_chunk, 0, sizeof(*pid_chunk));
	pid_chunk->id = LIMA_DUMP_CHUNK_PROCESS_ID;
	pid_chunk->pid = ctx->pid;
	dt->num_chunks++;

	buffer_chunk = (void *)(pid_chunk + 1) + pid_chunk->size;
	for (i = 0; i < task->num_bos; i++) {
		struct lima_bo *bo = task->bos[i];
		void *data;

		memset(buffer_chunk, 0, sizeof(*buffer_chunk));
		buffer_chunk->id = LIMA_DUMP_CHUNK_BUFFER;
		buffer_chunk->va = lima_vm_get_va(task->vm, bo);

		if (bo->heap_size) {
			buffer_chunk->size = bo->heap_size;

			data = vmap(bo->base.pages, bo->heap_size >> PAGE_SHIFT,
				    VM_MAP, pgprot_writecombine(PAGE_KERNEL));
			if (!data) {
				kvfree(et);
				goto out;
			}

			memcpy(buffer_chunk + 1, data, buffer_chunk->size);

			vunmap(data);
		} else {
			buffer_chunk->size = lima_bo_size(bo);

			ret = drm_gem_vmap(&bo->base.base, &map);
			if (ret) {
				kvfree(et);
				goto out;
			}

			memcpy(buffer_chunk + 1, map.vaddr, buffer_chunk->size);

			drm_gem_vunmap(&bo->base.base, &map);
		}

		buffer_chunk = (void *)(buffer_chunk + 1) + buffer_chunk->size;
		dt->num_chunks++;
	}

	list_add(&et->list, &dev->error_task_list);
	dev->dump.size += et->size;
	dev->dump.num_tasks++;

	dev_info(dev->dev, "save error task state success\n");

out:
	mutex_unlock(&dev->error_task_list_lock);
}

static enum drm_gpu_sched_stat lima_sched_timedout_job(struct drm_sched_job *job)
{
	struct lima_sched_pipe *pipe = to_lima_pipe(job->sched);
	struct lima_sched_task *task = to_lima_task(job);
	struct lima_device *ldev = pipe->ldev;
	struct lima_ip *ip = pipe->processor[0];
	int i;

	/*
	 * If the GPU managed to complete this job's fence, the timeout is
	 * spurious. Bail out.
	 */
	if (dma_fence_is_signaled(task->fence)) {
		DRM_WARN("%s spurious timeout\n", lima_ip_name(ip));
		return DRM_GPU_SCHED_STAT_NOMINAL;
	}

	/*
	 * The Lima IRQ handler may take a long time to process an interrupt
	 * if another IRQ handler is hogging the processing.
	 * In order to catch such cases and not report spurious Lima job
	 * timeouts, synchronize the IRQ handler and re-check the fence
	 * status.
	 */
	for (i = 0; i < pipe->num_processor; i++)
		synchronize_irq(pipe->processor[i]->irq);
	if (pipe->bcast_processor)
		synchronize_irq(pipe->bcast_processor->irq);

	if (dma_fence_is_signaled(task->fence)) {
		DRM_WARN("%s unexpectedly high interrupt latency\n", lima_ip_name(ip));
		return DRM_GPU_SCHED_STAT_NOMINAL;
	}

	/*
	 * The task might still finish while this timeout handler runs.
	 * To prevent a race condition on its completion, mask all irqs
	 * on the running core until the next hard reset completes.
	 */
	pipe->task_mask_irq(pipe);

	if (!pipe->error)
		DRM_ERROR("%s job timeout\n", lima_ip_name(ip));

	drm_sched_stop(&pipe->base, &task->base);

	drm_sched_increase_karma(&task->base);

	if (lima_max_error_tasks)
		lima_sched_build_error_task_list(task);

	pipe->task_error(pipe);

	if (pipe->bcast_mmu)
		lima_mmu_page_fault_resume(pipe->bcast_mmu);
	else {
		for (i = 0; i < pipe->num_mmu; i++)
			lima_mmu_page_fault_resume(pipe->mmu[i]);
	}

	lima_vm_put(pipe->current_vm);
	pipe->current_vm = NULL;
	pipe->current_task = NULL;

	lima_pm_idle(ldev);

	drm_sched_resubmit_jobs(&pipe->base);
	drm_sched_start(&pipe->base, 0);

	return DRM_GPU_SCHED_STAT_NOMINAL;
}

static void lima_sched_free_job(struct drm_sched_job *job)
{
	struct lima_sched_task *task = to_lima_task(job);
	struct lima_sched_pipe *pipe = to_lima_pipe(job->sched);
	struct lima_vm *vm = task->vm;
	struct lima_bo **bos = task->bos;
	int i;

	dma_fence_put(task->fence);

	for (i = 0; i < task->num_bos; i++)
		lima_vm_bo_del(vm, bos[i]);

	lima_sched_task_fini(task);
	kmem_cache_free(pipe->task_slab, task);
}

static const struct drm_sched_backend_ops lima_sched_ops = {
	.run_job = lima_sched_run_job,
	.timedout_job = lima_sched_timedout_job,
	.free_job = lima_sched_free_job,
};

static void lima_sched_recover_work(struct work_struct *work)
{
	struct lima_sched_pipe *pipe =
		container_of(work, struct lima_sched_pipe, recover_work);
	int i;

	for (i = 0; i < pipe->num_l2_cache; i++)
		lima_l2_cache_flush(pipe->l2_cache[i]);

	if (pipe->bcast_mmu) {
		lima_mmu_flush_tlb(pipe->bcast_mmu);
	} else {
		for (i = 0; i < pipe->num_mmu; i++)
			lima_mmu_flush_tlb(pipe->mmu[i]);
	}

	if (pipe->task_recover(pipe))
		drm_sched_fault(&pipe->base);
}

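/* One scheduler instance per pipe (GP or PP). credit_limit = 1 keeps a
 * single job in flight on the hardware; the timeout falls back to 10
 * seconds when lima_sched_timeout_ms is not set.
 */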
int lima_sched_pipe_init(struct lima_sched_pipe *pipe, const char *name)
{
	unsigned int timeout = lima_sched_timeout_ms > 0 ?
			       lima_sched_timeout_ms : 10000;
	const struct drm_sched_init_args args = {
		.ops = &lima_sched_ops,
		.num_rqs = DRM_SCHED_PRIORITY_COUNT,
		.credit_limit = 1,
		.hang_limit = lima_job_hang_limit,
		.timeout = msecs_to_jiffies(timeout),
		.name = name,
		.dev = pipe->ldev->dev,
	};

	pipe->fence_context = dma_fence_context_alloc(1);
	spin_lock_init(&pipe->fence_lock);

	INIT_WORK(&pipe->recover_work, lima_sched_recover_work);

	return drm_sched_init(&pipe->base, &args);
}

void lima_sched_pipe_fini(struct lima_sched_pipe *pipe)
{
	drm_sched_fini(&pipe->base);
}

void lima_sched_pipe_task_done(struct lima_sched_pipe *pipe)
{
	struct lima_sched_task *task = pipe->current_task;
	struct lima_device *ldev = pipe->ldev;

	if (pipe->error) {
		if (task && task->recoverable)
			schedule_work(&pipe->recover_work);
		else
			drm_sched_fault(&pipe->base);
	} else {
		pipe->task_fini(pipe);
		dma_fence_signal(task->fence);

		lima_pm_idle(ldev);
	}
}