// SPDX-License-Identifier: MIT
/*
 * Copyright © 2022 Intel Corporation
 */

#include "xe_exec.h"

#include <drm/drm_device.h>
#include <drm/drm_file.h>
#include <drm/xe_drm.h>
#include <linux/delay.h>

#include "xe_bo.h"
#include "xe_device.h"
#include "xe_engine.h"
#include "xe_macros.h"
#include "xe_sched_job.h"
#include "xe_sync.h"
#include "xe_vm.h"

/**
 * DOC: Execbuf (User GPU command submission)
 *
 * Execs have historically been rather complicated in DRM drivers (at least in
 * the i915) because of a few things:
 *
 * - Passing in a list of BOs which are read / written to, creating implicit
 *   syncs
 * - Binding at exec time
 * - Flow controlling the ring at exec time
 *
 * In XE we avoid all of this complication by not allowing a BO list to be
 * passed into an exec, by using the dma-buf implicit sync uAPI, by having
 * binds as separate operations, and by using the DRM scheduler to flow control
 * the ring. Let's dive deeper into each of these.
 *
 * We can get away without a BO list by forcing the user to use in / out fences
 * on every exec rather than having the kernel track BO dependencies (e.g. if
 * the user knows an exec writes to a BO and a later exec reads from that BO,
 * it is the user's responsibility to pass an in / out fence between the two
 * execs).
 *
 * Implicit dependencies for external BOs are handled by using the dma-buf
 * implicit dependency uAPI (TODO: add link). To make this work, each exec must
 * install the job's fence into the DMA_RESV_USAGE_WRITE slot of every external
 * BO mapped in the VM.
 *
 * We do not allow a user to trigger a bind at exec time; rather, we have a VM
 * bind IOCTL which uses the same in / out fence interface as exec. In that
 * sense, a VM bind is basically the same operation as an exec from the user's
 * perspective. e.g. if an exec depends on a VM bind, use the in / out fence
 * interface (struct drm_xe_sync) to synchronize, just like syncing between two
 * dependent execs.
 *
 * Although a user cannot trigger a bind, we still have to rebind userptrs in
 * the VM that have been invalidated since the last exec; likewise, we also
 * have to rebind BOs that have been evicted by the kernel. We schedule these
 * rebinds behind any pending kernel operations on any external BOs in the VM
 * or any BOs private to the VM. This is accomplished by the rebinds waiting on
 * the BO's DMA_RESV_USAGE_KERNEL slot (kernel ops) and kernel ops waiting on
 * all BO slots (inflight execs are in the DMA_RESV_USAGE_BOOKKEEP slot for
 * private BOs and in DMA_RESV_USAGE_WRITE for external BOs).
 *
 * Rebinds / dma-resv usage applies to non-compute mode VMs only; for compute
 * mode VMs we use preempt fences and a rebind worker (TODO: add link).
 *
 * There is no need to flow control the ring in the exec as we write the ring
 * at submission time and set the DRM scheduler max job limit to SIZE_OF_RING /
 * MAX_JOB_SIZE. The DRM scheduler will then hold all jobs until space in the
 * ring is available.
 *
 * All of this results in a rather simple exec implementation.
 *
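 * For example, if one exec writes a BO and a later exec reads it, the user
 * reuses the same syncobj as the out-fence of the writing exec and the
 * in-fence of the reading exec. A rough userspace sketch, assuming a
 * hypothetical helper exec_one() that fills out struct drm_xe_exec / struct
 * drm_xe_sync and calls DRM_IOCTL_XE_EXEC (see xe_drm.h for the actual
 * layout):
 *
 * .. code-block:: c
 *
 *	// exec_one() is a hypothetical wrapper, not part of the uAPI
 *	exec_one(fd, engine_id, batch_write, in_syncobj, sync_ab);
 *	exec_one(fd, engine_id, batch_read, sync_ab, out_syncobj);
 *
 * The ordering between the two execs comes entirely from the syncobj passed
 * between them, not from any kernel-side tracking of the BO.
 *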
 * Flow
 * ~~~~
 *
 * .. code-block::
 *
 *	Parse input arguments
 *	Wait for any async VM bind passed as in-fences to start
 *	<----------------------------------------------------------------------|
 *	Lock global VM lock in read mode                                       |
 *	Pin userptrs (also finds userptr invalidated since last exec)          |
 *	Lock exec (VM dma-resv lock, external BOs dma-resv locks)              |
 *	Validate BOs that have been evicted                                    |
 *	Create job                                                             |
 *	Rebind invalidated userptrs + evicted BOs (non-compute-mode)           |
 *	Add rebind fence dependency to job                                     |
 *	Add job VM dma-resv bookkeeping slot (non-compute mode)                |
 *	Add job to external BOs dma-resv write slots (non-compute mode)        |
 *	Check if any userptrs invalidated since pin ------ Drop locks ---------|
 *	Install in / out fences for job
 *	Submit job
 *	Unlock all
 */

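/* Retry window for exec when BO validation hits a transient -ENOMEM */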
#define XE_EXEC_BIND_RETRY_TIMEOUT_MS 1000

static int xe_exec_begin(struct xe_engine *e, struct ww_acquire_ctx *ww,
			 struct ttm_validate_buffer tv_onstack[],
			 struct ttm_validate_buffer **tv,
			 struct list_head *objs)
{
	struct xe_vm *vm = e->vm;
	struct xe_vma *vma;
	LIST_HEAD(dups);
	ktime_t end = 0;
	int err = 0;

	*tv = NULL;
	if (xe_vm_no_dma_fences(e->vm))
		return 0;

retry:
	err = xe_vm_lock_dma_resv(vm, ww, tv_onstack, tv, objs, true, 1);
	if (err)
		return err;

	/*
	 * Validate BOs that have been evicted (i.e. make sure the
	 * BOs have valid placements, possibly moving an evicted BO back
	 * to a location where the GPU can access it).
	 */
	list_for_each_entry(vma, &vm->rebind_list, rebind_link) {
		if (xe_vma_is_userptr(vma))
			continue;

		err = xe_bo_validate(vma->bo, vm, false);
		if (err) {
			xe_vm_unlock_dma_resv(vm, tv_onstack, *tv, ww, objs);
			*tv = NULL;
			break;
		}
	}

	/*
	 * With multiple active VMs, under memory pressure, it is possible that
	 * ttm_bo_validate() runs into -EDEADLK, in which case it returns
	 * -ENOMEM. Until TTM properly handles locking in such scenarios, the
	 * best thing the driver can do is retry with a timeout.
	 */
	if (err == -ENOMEM) {
		ktime_t cur = ktime_get();

		end = end ? : ktime_add_ms(cur, XE_EXEC_BIND_RETRY_TIMEOUT_MS);
		if (ktime_before(cur, end)) {
			msleep(20);
			goto retry;
		}
	}

	return err;
}

static void xe_exec_end(struct xe_engine *e,
			struct ttm_validate_buffer *tv_onstack,
			struct ttm_validate_buffer *tv,
			struct ww_acquire_ctx *ww,
			struct list_head *objs)
{
	if (!xe_vm_no_dma_fences(e->vm))
		xe_vm_unlock_dma_resv(e->vm, tv_onstack, tv, ww, objs);
}

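/*
 * Implements the flow documented in the DOC comment above: parse arguments and
 * syncs, wait for any async VM binds passed as in-fences to start, pin
 * userptrs, validate evicted BOs, create the job, add its dependencies, and
 * hand the job to the DRM scheduler.
 */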
int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
{
	struct xe_device *xe = to_xe_device(dev);
	struct xe_file *xef = to_xe_file(file);
	struct drm_xe_exec *args = data;
	struct drm_xe_sync __user *syncs_user = u64_to_user_ptr(args->syncs);
	u64 __user *addresses_user = u64_to_user_ptr(args->address);
	struct xe_engine *engine;
	struct xe_sync_entry *syncs = NULL;
	u64 addresses[XE_HW_ENGINE_MAX_INSTANCE];
	struct ttm_validate_buffer tv_onstack[XE_ONSTACK_TV];
	struct ttm_validate_buffer *tv = NULL;
	u32 i, num_syncs = 0;
	struct xe_sched_job *job;
	struct dma_fence *rebind_fence;
	struct xe_vm *vm;
	struct ww_acquire_ctx ww;
	struct list_head objs;
	bool write_locked;
	int err = 0;

	if (XE_IOCTL_ERR(xe, args->extensions) ||
	    XE_IOCTL_ERR(xe, args->pad[0] || args->pad[1] || args->pad[2]) ||
	    XE_IOCTL_ERR(xe, args->reserved[0] || args->reserved[1]))
		return -EINVAL;

	engine = xe_engine_lookup(xef, args->engine_id);
	if (XE_IOCTL_ERR(xe, !engine))
		return -ENOENT;

	if (XE_IOCTL_ERR(xe, engine->flags & ENGINE_FLAG_VM))
		return -EINVAL;

	if (XE_IOCTL_ERR(xe, engine->width != args->num_batch_buffer))
		return -EINVAL;

	if (XE_IOCTL_ERR(xe, engine->flags & ENGINE_FLAG_BANNED)) {
		err = -ECANCELED;
		goto err_engine;
	}

	if (args->num_syncs) {
		syncs = kcalloc(args->num_syncs, sizeof(*syncs), GFP_KERNEL);
		if (!syncs) {
			err = -ENOMEM;
			goto err_engine;
		}
	}

	vm = engine->vm;

	for (i = 0; i < args->num_syncs; i++) {
		err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs++],
					  &syncs_user[i], true,
					  xe_vm_no_dma_fences(vm));
		if (err)
			goto err_syncs;
	}

	if (xe_engine_is_parallel(engine)) {
		err = __copy_from_user(addresses, addresses_user, sizeof(u64) *
				       engine->width);
		if (err) {
			err = -EFAULT;
			goto err_syncs;
		}
	}

	/*
	 * We can't install a job into the VM dma-resv shared slot before an
	 * async VM bind passed in as an in-fence without risking a deadlock,
	 * as the bind can trigger an eviction which in turn depends on
	 * anything in the VM dma-resv shared slots. Not an ideal solution, but
	 * we wait for all dependent async VM binds to start (install correct
	 * fences into dma-resv slots) before moving forward.
	 */
	if (!xe_vm_no_dma_fences(vm) &&
	    vm->flags & XE_VM_FLAG_ASYNC_BIND_OPS) {
		for (i = 0; i < args->num_syncs; i++) {
			struct dma_fence *fence = syncs[i].fence;

			if (fence) {
				err = xe_vm_async_fence_wait_start(fence);
				if (err)
					goto err_syncs;
			}
		}
	}

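	/*
	 * Userptr repinning below is done with vm->lock held in write mode and
	 * then downgraded; everything else in the exec path only needs the
	 * lock in read mode.
	 */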
retry:
	if (!xe_vm_no_dma_fences(vm) && xe_vm_userptr_check_repin(vm)) {
		err = down_write_killable(&vm->lock);
		write_locked = true;
	} else {
		err = down_read_interruptible(&vm->lock);
		write_locked = false;
	}
	if (err)
		goto err_syncs;

	/* We don't allow execs while the VM is in error state */
	if (vm->async_ops.error) {
		err = vm->async_ops.error;
		goto err_unlock_list;
	}

	/*
	 * Extreme corner case where we exit a VM error state with a munmap
	 * style VM unbind inflight which requires a rebind. In this case the
	 * rebind needs to install some fences into the dma-resv slots. The
	 * worker to do this is already queued, so let it make progress by
	 * dropping vm->lock, flushing the worker and retrying the exec.
	 */
	if (vm->async_ops.munmap_rebind_inflight) {
		if (write_locked)
			up_write(&vm->lock);
		else
			up_read(&vm->lock);
		flush_work(&vm->async_ops.work);
		goto retry;
	}

	if (write_locked) {
		err = xe_vm_userptr_pin(vm);
		downgrade_write(&vm->lock);
		write_locked = false;
		if (err)
			goto err_unlock_list;
	}

	err = xe_exec_begin(engine, &ww, tv_onstack, &tv, &objs);
	if (err)
		goto err_unlock_list;

	if (xe_vm_is_closed(engine->vm)) {
		drm_warn(&xe->drm, "Trying to schedule after vm is closed\n");
		err = -EIO;
		goto err_engine_end;
	}

	job = xe_sched_job_create(engine, xe_engine_is_parallel(engine) ?
				  addresses : &args->address);
	if (IS_ERR(job)) {
		err = PTR_ERR(job);
		goto err_engine_end;
	}

	/*
	 * Rebind any invalidated userptrs or evicted BOs in the VM, non-compute
	 * VM mode only.
	 */
	rebind_fence = xe_vm_rebind(vm, false);
	if (IS_ERR(rebind_fence)) {
		err = PTR_ERR(rebind_fence);
		goto err_put_job;
	}

	/*
	 * We store the rebind_fence in the VM so subsequent execs don't get
	 * scheduled before the rebinds of userptrs / evicted BOs are complete.
	 */
	if (rebind_fence) {
		dma_fence_put(vm->rebind_fence);
		vm->rebind_fence = rebind_fence;
	}
	if (vm->rebind_fence) {
		if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
			     &vm->rebind_fence->flags)) {
			dma_fence_put(vm->rebind_fence);
			vm->rebind_fence = NULL;
		} else {
			dma_fence_get(vm->rebind_fence);
			err = drm_sched_job_add_dependency(&job->drm,
							   vm->rebind_fence);
			if (err)
				goto err_put_job;
		}
	}

	/* Wait behind munmap style rebinds */
	if (!xe_vm_no_dma_fences(vm)) {
		err = drm_sched_job_add_resv_dependencies(&job->drm,
							  &vm->resv,
							  DMA_RESV_USAGE_KERNEL);
		if (err)
			goto err_put_job;
	}

	for (i = 0; i < num_syncs && !err; i++)
		err = xe_sync_entry_add_deps(&syncs[i], job);
	if (err)
		goto err_put_job;

	if (!xe_vm_no_dma_fences(vm)) {
		err = down_read_interruptible(&vm->userptr.notifier_lock);
		if (err)
			goto err_put_job;

		err = __xe_vm_userptr_needs_repin(vm);
		if (err)
			goto err_repin;
	}

	/*
	 * Point of no return: if we error after this point, just set an error
	 * on the job and let the DRM scheduler / backend clean up the job.
	 */
	xe_sched_job_arm(job);
	if (!xe_vm_no_dma_fences(vm)) {
		/* Block userptr invalidations / BO eviction */
		dma_resv_add_fence(&vm->resv,
				   &job->drm.s_fence->finished,
				   DMA_RESV_USAGE_BOOKKEEP);

		/*
		 * Make implicit sync work across drivers, assuming all external
		 * BOs are written as we don't pass in a read / write list.
		 */
		xe_vm_fence_all_extobjs(vm, &job->drm.s_fence->finished,
					DMA_RESV_USAGE_WRITE);
	}

	for (i = 0; i < num_syncs; i++)
		xe_sync_entry_signal(&syncs[i], job,
				     &job->drm.s_fence->finished);

	xe_sched_job_push(job);
	xe_vm_reactivate_rebind(vm);

err_repin:
	if (!xe_vm_no_dma_fences(vm))
		up_read(&vm->userptr.notifier_lock);
err_put_job:
	if (err)
		xe_sched_job_put(job);
err_engine_end:
	xe_exec_end(engine, tv_onstack, tv, &ww, &objs);
err_unlock_list:
	if (write_locked)
		up_write(&vm->lock);
	else
		up_read(&vm->lock);
	if (err == -EAGAIN)
		goto retry;
err_syncs:
	for (i = 0; i < num_syncs; i++)
		xe_sync_entry_cleanup(&syncs[i]);
	kfree(syncs);
err_engine:
	xe_engine_put(engine);

	return err;
}