/*
 * Copyright 2015 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 *
 */
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/sched.h>

#include <drm/drm_drv.h>

#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_reset.h"

static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
{
	struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
	struct amdgpu_job *job = to_amdgpu_job(s_job);
	struct amdgpu_task_info ti;
	struct amdgpu_device *adev = ring->adev;
	int idx;
	int r;

	if (!drm_dev_enter(adev_to_drm(adev), &idx)) {
		DRM_INFO("%s - device unplugged skipping recovery on scheduler:%s",
			 __func__, s_job->sched->name);

		/* Effectively the job is aborted as the device is gone */
		return DRM_GPU_SCHED_STAT_ENODEV;
	}

	memset(&ti, 0, sizeof(struct amdgpu_task_info));
	adev->job_hang = true;

	if (amdgpu_gpu_recovery &&
	    amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
		DRM_ERROR("ring %s timeout, but soft recovered\n",
			  s_job->sched->name);
		goto exit;
	}

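	/*
	 * Soft recovery either failed or is disabled, so dump the offending
	 * process before deciding whether a full GPU reset is needed.
	 */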
	amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
	DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
		  job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
		  ring->fence_drv.sync_seq);
	DRM_ERROR("Process information: process %s pid %d thread %s pid %d\n",
		  ti.process_name, ti.tgid, ti.task_name, ti.pid);

	if (amdgpu_device_should_recover_gpu(ring->adev)) {
		struct amdgpu_reset_context reset_context;
		memset(&reset_context, 0, sizeof(reset_context));

		reset_context.method = AMD_RESET_METHOD_NONE;
		reset_context.reset_req_dev = adev;
		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
		clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);

		r = amdgpu_device_gpu_recover(ring->adev, job, &reset_context);
		if (r)
			DRM_ERROR("GPU Recovery Failed: %d\n", r);
	} else {
		drm_sched_suspend_timeout(&ring->sched);
		if (amdgpu_sriov_vf(adev))
			adev->virt.tdr_debug = true;
	}

exit:
	adev->job_hang = false;
	drm_dev_exit(idx);
	return DRM_GPU_SCHED_STAT_NOMINAL;
}

int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm,
		     struct drm_sched_entity *entity, void *owner,
		     unsigned int num_ibs, struct amdgpu_job **job)
{
	if (num_ibs == 0)
		return -EINVAL;

	*job = kzalloc(struct_size(*job, ibs, num_ibs), GFP_KERNEL);
	if (!*job)
		return -ENOMEM;

	/*
	 * Initialize the scheduler to at least some ring so that we always
	 * have a pointer to adev.
	 */
	(*job)->base.sched = &adev->rings[0]->sched;
	(*job)->vm = vm;

	amdgpu_sync_create(&(*job)->explicit_sync);
	(*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
	(*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET;

	if (!entity)
		return 0;

	return drm_sched_job_init(&(*job)->base, entity, owner);
}

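/*
 * Convenience wrapper around amdgpu_job_alloc() for the common case of a
 * kernel-internal submission without a VM and with a single IB allocated
 * from the given IB pool.
 */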
int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev,
			     struct drm_sched_entity *entity, void *owner,
			     size_t size, enum amdgpu_ib_pool_type pool_type,
			     struct amdgpu_job **job)
{
	int r;

	r = amdgpu_job_alloc(adev, NULL, entity, owner, 1, job);
	if (r)
		return r;

	(*job)->num_ibs = 1;
	r = amdgpu_ib_get(adev, NULL, size, pool_type, &(*job)->ibs[0]);
	if (r) {
		if (entity)
			drm_sched_job_cleanup(&(*job)->base);
		kfree(*job);
	}

	return r;
}

void amdgpu_job_set_resources(struct amdgpu_job *job, struct amdgpu_bo *gds,
			      struct amdgpu_bo *gws, struct amdgpu_bo *oa)
{
	if (gds) {
		job->gds_base = amdgpu_bo_gpu_offset(gds) >> PAGE_SHIFT;
		job->gds_size = amdgpu_bo_size(gds) >> PAGE_SHIFT;
	}
	if (gws) {
		job->gws_base = amdgpu_bo_gpu_offset(gws) >> PAGE_SHIFT;
		job->gws_size = amdgpu_bo_size(gws) >> PAGE_SHIFT;
	}
	if (oa) {
		job->oa_base = amdgpu_bo_gpu_offset(oa) >> PAGE_SHIFT;
		job->oa_size = amdgpu_bo_size(oa) >> PAGE_SHIFT;
	}
}

void amdgpu_job_free_resources(struct amdgpu_job *job)
{
	struct amdgpu_ring *ring = to_amdgpu_ring(job->base.sched);
	struct dma_fence *f;
	unsigned i;

	/* use sched fence if available */
	f = job->base.s_fence ? &job->base.s_fence->finished : &job->hw_fence;
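	/*
	 * Hand each IB back to the IB pool together with that fence, so the
	 * backing memory is only reused once the job has actually finished.
	 */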
	for (i = 0; i < job->num_ibs; ++i)
		amdgpu_ib_free(ring->adev, &job->ibs[i], f);
}

static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
{
	struct amdgpu_job *job = to_amdgpu_job(s_job);

	drm_sched_job_cleanup(s_job);

	amdgpu_sync_free(&job->explicit_sync);
	dma_fence_put(&job->hw_fence);
}

void amdgpu_job_set_gang_leader(struct amdgpu_job *job,
				struct amdgpu_job *leader)
{
	struct dma_fence *fence = &leader->base.s_fence->scheduled;

	WARN_ON(job->gang_submit);

	/*
	 * Don't add a reference when we are the gang leader to avoid circular
	 * dependency.
	 */
	if (job != leader)
		dma_fence_get(fence);
	job->gang_submit = fence;
}

void amdgpu_job_free(struct amdgpu_job *job)
{
	if (job->base.entity)
		drm_sched_job_cleanup(&job->base);

	amdgpu_job_free_resources(job);
	amdgpu_sync_free(&job->explicit_sync);
	if (job->gang_submit != &job->base.s_fence->scheduled)
		dma_fence_put(job->gang_submit);

	if (!job->hw_fence.ops)
		kfree(job);
	else
		dma_fence_put(&job->hw_fence);
}

struct dma_fence *amdgpu_job_submit(struct amdgpu_job *job)
{
	struct dma_fence *f;

	drm_sched_job_arm(&job->base);
	f = dma_fence_get(&job->base.s_fence->finished);
	amdgpu_job_free_resources(job);
	drm_sched_entity_push_job(&job->base);

	return f;
}

int amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring *ring,
			     struct dma_fence **fence)
{
	int r;

	job->base.sched = &ring->sched;
	r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job, fence);

	if (r)
		return r;

	amdgpu_job_free(job);
	return 0;
}

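/*
 * drm_sched dependency callback: return the next fence the scheduler must
 * wait on before this job can run, i.e. a fence from grabbing a VMID for
 * jobs that still need one, and afterwards the gang switch fence for gang
 * submissions.
 */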
static struct dma_fence *
amdgpu_job_dependency(struct drm_sched_job *sched_job,
		      struct drm_sched_entity *s_entity)
{
	struct amdgpu_ring *ring = to_amdgpu_ring(s_entity->rq->sched);
	struct amdgpu_job *job = to_amdgpu_job(sched_job);
	struct dma_fence *fence = NULL;
	int r;

	while (!fence && job->vm && !job->vmid) {
		r = amdgpu_vmid_grab(job->vm, ring, job, &fence);
		if (r)
			DRM_ERROR("Error getting VM ID (%d)\n", r);
	}

	if (!fence && job->gang_submit)
		fence = amdgpu_device_switch_gang(ring->adev, job->gang_submit);

	return fence;
}

static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
{
	struct amdgpu_ring *ring = to_amdgpu_ring(sched_job->sched);
	struct amdgpu_device *adev = ring->adev;
	struct dma_fence *fence = NULL, *finished;
	struct amdgpu_job *job;
	int r = 0;

	job = to_amdgpu_job(sched_job);
	finished = &job->base.s_fence->finished;

	trace_amdgpu_sched_run_job(job);

	/* Skip job if VRAM is lost and never resubmit gangs */
	if (job->vram_lost_counter != atomic_read(&adev->vram_lost_counter) ||
	    (job->job_run_counter && job->gang_submit))
		dma_fence_set_error(finished, -ECANCELED);

	if (finished->error < 0) {
		DRM_INFO("Skip scheduling IBs!\n");
	} else {
		r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
				       &fence);
		if (r)
			DRM_ERROR("Error scheduling IBs (%d)\n", r);
	}

	job->job_run_counter++;
	amdgpu_job_free_resources(job);

	fence = r ? ERR_PTR(r) : fence;
	return fence;
}

#define to_drm_sched_job(sched_job)		\
		container_of((sched_job), struct drm_sched_job, queue_node)

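/*
 * Fail all jobs on @sched: pop the jobs still waiting in the entity queues
 * and signal their fences with -EHWPOISON, then mark the finished fence of
 * every job already pushed to the hardware ring with the same error.
 */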
void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched)
{
	struct drm_sched_job *s_job;
	struct drm_sched_entity *s_entity = NULL;
	int i;

	/* Signal all jobs not yet scheduled */
	for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
		struct drm_sched_rq *rq = &sched->sched_rq[i];
		spin_lock(&rq->lock);
		list_for_each_entry(s_entity, &rq->entities, list) {
			while ((s_job = to_drm_sched_job(spsc_queue_pop(&s_entity->job_queue)))) {
				struct drm_sched_fence *s_fence = s_job->s_fence;

				dma_fence_signal(&s_fence->scheduled);
				dma_fence_set_error(&s_fence->finished, -EHWPOISON);
				dma_fence_signal(&s_fence->finished);
			}
		}
		spin_unlock(&rq->lock);
	}

	/* Signal all jobs already scheduled to HW */
	list_for_each_entry(s_job, &sched->pending_list, list) {
		struct drm_sched_fence *s_fence = s_job->s_fence;

		dma_fence_set_error(&s_fence->finished, -EHWPOISON);
		dma_fence_signal(&s_fence->finished);
	}
}

const struct drm_sched_backend_ops amdgpu_sched_ops = {
	.dependency = amdgpu_job_dependency,
	.run_job = amdgpu_job_run,
	.timedout_job = amdgpu_job_timedout,
	.free_job = amdgpu_job_free_cb
};
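/*
 * Usage sketch (not a real call site; only the helpers above are the actual
 * API, "entity" stands in for whatever drm_sched_entity the caller owns):
 * a kernel-internal user typically allocates a single-IB job, fills the IB
 * and pushes it through its scheduler entity, e.g.:
 *
 *	struct amdgpu_job *job;
 *	struct dma_fence *f;
 *	int r;
 *
 *	r = amdgpu_job_alloc_with_ib(adev, &entity, AMDGPU_FENCE_OWNER_UNDEFINED,
 *				     64, AMDGPU_IB_POOL_DELAYED, &job);
 *	if (r)
 *		return r;
 *
 *	... emit commands into job->ibs[0] ...
 *
 *	f = amdgpu_job_submit(job);
 *	dma_fence_put(f);
 */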