// SPDX-License-Identifier: MIT
/*
 * Copyright © 2020 Intel Corporation
 */

#include "xe_migrate.h"

#include <linux/bitfield.h>
#include <linux/sizes.h>

#include <drm/drm_managed.h>
#include <drm/ttm/ttm_tt.h>
#include <drm/xe_drm.h>

#include "generated/xe_wa_oob.h"
#include "instructions/xe_mi_commands.h"
#include "regs/xe_gpu_commands.h"
#include "tests/xe_test.h"
#include "xe_assert.h"
#include "xe_bb.h"
#include "xe_bo.h"
#include "xe_exec_queue.h"
#include "xe_ggtt.h"
#include "xe_gt.h"
#include "xe_hw_engine.h"
#include "xe_lrc.h"
#include "xe_map.h"
#include "xe_mocs.h"
#include "xe_pt.h"
#include "xe_res_cursor.h"
#include "xe_sched_job.h"
#include "xe_sync.h"
#include "xe_trace.h"
#include "xe_vm.h"
#include "xe_wa.h"

/**
 * struct xe_migrate - migrate context.
 */
struct xe_migrate {
	/** @q: Default exec queue used for migration */
	struct xe_exec_queue *q;
	/** @tile: Backpointer to the tile this struct xe_migrate belongs to. */
	struct xe_tile *tile;
	/** @job_mutex: Timeline mutex for @q. */
	struct mutex job_mutex;
	/** @pt_bo: Page-table buffer object. */
	struct xe_bo *pt_bo;
	/**
	 * @cleared_bo: Zeroed out bo used as a source for CCS metadata clears
	 */
	struct xe_bo *cleared_bo;
	/** @batch_base_ofs: VM offset of the migration batch buffer */
	u64 batch_base_ofs;
	/** @usm_batch_base_ofs: VM offset of the usm batch buffer */
	u64 usm_batch_base_ofs;
	/** @cleared_vram_ofs: VM offset of @cleared_bo. */
	u64 cleared_vram_ofs;
	/**
	 * @fence: dma-fence representing the last migration job batch.
	 * Protected by @job_mutex.
	 */
	struct dma_fence *fence;
	/**
	 * @vm_update_sa: For integrated, used to suballocate page-tables
	 * out of the pt_bo.
	 */
	struct drm_suballoc_manager vm_update_sa;
};

#define MAX_PREEMPTDISABLE_TRANSFER SZ_8M /* Around 1ms. */
#define NUM_KERNEL_PDE 17
#define NUM_PT_SLOTS 32
#define NUM_PT_PER_BLIT (MAX_PREEMPTDISABLE_TRANSFER / SZ_2M)

/**
 * xe_tile_migrate_engine() - Get this tile's migrate engine.
 * @tile: The tile.
 *
 * Returns the default migrate engine of this tile.
 * TODO: Perhaps this function is slightly misplaced, and even unneeded?
 *
 * Return: The default migrate engine
 */
struct xe_exec_queue *xe_tile_migrate_engine(struct xe_tile *tile)
{
	return tile->migrate->q;
}

static void xe_migrate_fini(struct drm_device *dev, void *arg)
{
	struct xe_migrate *m = arg;

	xe_vm_lock(m->q->vm, false);
	xe_bo_unpin(m->pt_bo);
	if (m->cleared_bo)
		xe_bo_unpin(m->cleared_bo);
	xe_vm_unlock(m->q->vm);

	dma_fence_put(m->fence);
	if (m->cleared_bo)
		xe_bo_put(m->cleared_bo);
	xe_bo_put(m->pt_bo);
	drm_suballoc_manager_fini(&m->vm_update_sa);
	mutex_destroy(&m->job_mutex);
	xe_vm_close_and_put(m->q->vm);
	xe_exec_queue_put(m->q);
}

static u64 xe_migrate_vm_addr(u64 slot, u32 level)
{
	XE_WARN_ON(slot >= NUM_PT_SLOTS);

	/* First slot is reserved for mapping of PT bo and bb, start from 1 */
	return (slot + 1ULL) << xe_pt_shift(level + 1);
}

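/*
 * Convert a VRAM device physical address to an offset in the migrate VM.
 * The migrate VM identity-maps all of VRAM at a 256 GiB offset (entry 256
 * of the level-2 page table, 1 GiB per entry; see the "Identity map the
 * entire vram" setup in xe_migrate_prepare_vm()), so DPA @addr resolves to
 * VM address 256 GiB + (@addr - dpa_base).
 */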
static u64 xe_migrate_vram_ofs(struct xe_device *xe, u64 addr)
{
	/*
	 * Remove the DPA to get a correct offset into identity table for the
	 * migrate offset
	 */
	addr -= xe->mem.vram.dpa_base;
	return addr + (256ULL << xe_pt_shift(2));
}

/*
 * For flat CCS clearing we need a cleared chunk of memory to copy from,
 * since the CCS clearing mode of XY_FAST_COLOR_BLT appears to be buggy
 * (it clears only 14 bytes in each chunk of 16).
 * If clearing the main surface one can use the part of the main surface
 * already cleared, but for clearing as part of copying non-compressed
 * data out of system memory, we don't readily have a cleared part of
 * VRAM to copy from, so create one to use for that case.
 */
static int xe_migrate_create_cleared_bo(struct xe_migrate *m, struct xe_vm *vm)
{
	struct xe_tile *tile = m->tile;
	struct xe_device *xe = vm->xe;
	size_t cleared_size;
	u64 vram_addr;

	if (!xe_device_has_flat_ccs(xe))
		return 0;

	cleared_size = xe_device_ccs_bytes(xe, MAX_PREEMPTDISABLE_TRANSFER);
	cleared_size = PAGE_ALIGN(cleared_size);
	m->cleared_bo = xe_bo_create_pin_map(xe, tile, vm, cleared_size,
					     ttm_bo_type_kernel,
					     XE_BO_CREATE_VRAM_IF_DGFX(tile) |
					     XE_BO_CREATE_PINNED_BIT);
	if (IS_ERR(m->cleared_bo))
		return PTR_ERR(m->cleared_bo);

	xe_map_memset(xe, &m->cleared_bo->vmap, 0, 0x00, cleared_size);
	vram_addr = xe_bo_addr(m->cleared_bo, 0, XE_PAGE_SIZE);
	m->cleared_vram_ofs = xe_migrate_vram_ofs(xe, vram_addr);

	return 0;
}

static int xe_migrate_prepare_vm(struct xe_tile *tile, struct xe_migrate *m,
				 struct xe_vm *vm)
{
	struct xe_device *xe = tile_to_xe(tile);
	u16 pat_index = xe->pat.idx[XE_CACHE_WB];
	u8 id = tile->id;
	u32 num_entries = NUM_PT_SLOTS, num_level = vm->pt_root[id]->level;
	u32 map_ofs, level, i;
	struct xe_bo *bo, *batch = tile->mem.kernel_bb_pool->bo;
	u64 entry;
	int ret;

	/* Can't bump NUM_PT_SLOTS too high */
	BUILD_BUG_ON(NUM_PT_SLOTS > SZ_2M/XE_PAGE_SIZE);
	/* Must be a multiple of 64K to support all platforms */
	BUILD_BUG_ON(NUM_PT_SLOTS * XE_PAGE_SIZE % SZ_64K);
	/* And one slot reserved for the 4KiB page table updates */
	BUILD_BUG_ON(!(NUM_KERNEL_PDE & 1));

	/* Need to be sure everything fits in the first PT, or create more */
	xe_tile_assert(tile, m->batch_base_ofs + batch->size < SZ_2M);

	bo = xe_bo_create_pin_map(vm->xe, tile, vm,
				  num_entries * XE_PAGE_SIZE,
				  ttm_bo_type_kernel,
				  XE_BO_CREATE_VRAM_IF_DGFX(tile) |
				  XE_BO_CREATE_PINNED_BIT);
	if (IS_ERR(bo))
		return PTR_ERR(bo);

	ret = xe_migrate_create_cleared_bo(m, vm);
	if (ret) {
		xe_bo_put(bo);
		return ret;
	}

	entry = vm->pt_ops->pde_encode_bo(bo, bo->size - XE_PAGE_SIZE, pat_index);
	xe_pt_write(xe, &vm->pt_root[id]->bo->vmap, 0, entry);

	map_ofs = (num_entries - num_level) * XE_PAGE_SIZE;

	/* Map the entire BO in our level 0 pt */
	for (i = 0, level = 0; i < num_entries; level++) {
		entry = vm->pt_ops->pte_encode_bo(bo, i * XE_PAGE_SIZE,
						  pat_index, 0);

		xe_map_wr(xe, &bo->vmap, map_ofs + level * 8, u64, entry);

		if (vm->flags & XE_VM_FLAG_64K)
			i += 16;
		else
			i += 1;
	}

	if (!IS_DGFX(xe)) {
		xe_tile_assert(tile, !xe->info.supports_usm);

		/* Write out batch too */
		m->batch_base_ofs = NUM_PT_SLOTS * XE_PAGE_SIZE;
		for (i = 0; i < batch->size;
		     i += vm->flags & XE_VM_FLAG_64K ? XE_64K_PAGE_SIZE :
		     XE_PAGE_SIZE) {
			entry = vm->pt_ops->pte_encode_bo(batch, i,
							  pat_index, 0);

			xe_map_wr(xe, &bo->vmap, map_ofs + level * 8, u64,
				  entry);
			level++;
		}
	} else {
		u64 batch_addr = xe_bo_addr(batch, 0, XE_PAGE_SIZE);

		m->batch_base_ofs = xe_migrate_vram_ofs(xe, batch_addr);

		if (xe->info.supports_usm) {
			batch = tile->primary_gt->usm.bb_pool->bo;
			batch_addr = xe_bo_addr(batch, 0, XE_PAGE_SIZE);
			m->usm_batch_base_ofs = xe_migrate_vram_ofs(xe, batch_addr);
		}
	}

	for (level = 1; level < num_level; level++) {
		u32 flags = 0;

		if (vm->flags & XE_VM_FLAG_64K && level == 1)
			flags = XE_PDE_64K;

		entry = vm->pt_ops->pde_encode_bo(bo, map_ofs + (level - 1) *
						  XE_PAGE_SIZE, pat_index);
		xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE * level, u64,
			  entry | flags);
	}

	/* Write PDE's that point to our BO. */
	for (i = 0; i < num_entries - num_level; i++) {
		entry = vm->pt_ops->pde_encode_bo(bo, i * XE_PAGE_SIZE,
						  pat_index);

		xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE +
			  (i + 1) * 8, u64, entry);
	}

	/* Identity map the entire vram at 256GiB offset */
	if (IS_DGFX(xe)) {
		u64 pos, ofs, flags;

		level = 2;
		ofs = map_ofs + XE_PAGE_SIZE * level + 256 * 8;
		flags = vm->pt_ops->pte_encode_addr(xe, 0, pat_index, level,
						    true, 0);

		/*
		 * Use 1GB pages, it shouldn't matter the physical amount of
		 * vram is less, when we don't access it.
		 */
		for (pos = xe->mem.vram.dpa_base;
		     pos < xe->mem.vram.actual_physical_size + xe->mem.vram.dpa_base;
		     pos += SZ_1G, ofs += 8)
			xe_map_wr(xe, &bo->vmap, ofs, u64, pos | flags);
	}

	/*
	 * Example layout created above, with root level = 3:
	 * [PT0...PT7]: kernel PT's for copy/clear; 64KiB or 4KiB PTE's
	 * [PT8]: Kernel PT for VM_BIND, 4 KiB PTE's
	 * [PT9...PT28]: Userspace PT's for VM_BIND, 4 KiB PTE's
	 * [PT29 = PDE 0] [PT30 = PDE 1] [PT31 = PDE 2]
	 *
	 * This makes the lowest part of the VM point to the pagetables.
	 * Hence the lowest 2M in the vm should point to itself, and with a
	 * few writes and flushes, other parts of the VM can be used for
	 * copying and clearing.
	 *
	 * For performance, the kernel reserves PDE's, so about 20 are left
	 * for async VM updates.
	 *
	 * To make it easier to work, each scratch PT is put in slot (1 + PT #)
	 * everywhere, this allows lockless updates to scratch pages by using
	 * the different addresses in VM.
	 */
#define NUM_VMUSA_UNIT_PER_PAGE	32
#define VM_SA_UPDATE_UNIT_SIZE		(XE_PAGE_SIZE / NUM_VMUSA_UNIT_PER_PAGE)
#define NUM_VMUSA_WRITES_PER_UNIT	(VM_SA_UPDATE_UNIT_SIZE / sizeof(u64))
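	/*
	 * Suballocator for page-table update scratch space: each 4 KiB page
	 * of the update area is split into NUM_VMUSA_UNIT_PER_PAGE (32) units
	 * of VM_SA_UPDATE_UNIT_SIZE (128) bytes, i.e. NUM_VMUSA_WRITES_PER_UNIT
	 * (16) qword writes per unit. Only the PT pages below map_ofs that are
	 * not reserved for kernel PDEs are handed to the suballocator.
	 */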
	drm_suballoc_manager_init(&m->vm_update_sa,
				  (map_ofs / XE_PAGE_SIZE - NUM_KERNEL_PDE) *
				  NUM_VMUSA_UNIT_PER_PAGE, 0);

	m->pt_bo = bo;
	return 0;
}

/*
 * Due to workaround 16017236439, odd instance hardware copy engines are
 * faster than even instance ones.
 * This function returns the mask involving all fast copy engines and the
 * reserved copy engine to be used as logical mask for migrate engine.
 * Including the reserved copy engine is required to avoid deadlocks due to
 * migrate jobs servicing the faults getting stuck behind the job that faulted.
 */
static u32 xe_migrate_usm_logical_mask(struct xe_gt *gt)
{
	u32 logical_mask = 0;
	struct xe_hw_engine *hwe;
	enum xe_hw_engine_id id;

	for_each_hw_engine(hwe, gt, id) {
		if (hwe->class != XE_ENGINE_CLASS_COPY)
			continue;

		if (!XE_WA(gt, 16017236439) ||
		    xe_gt_is_usm_hwe(gt, hwe) || hwe->instance & 1)
			logical_mask |= BIT(hwe->logical_instance);
	}

	return logical_mask;
}

/**
 * xe_migrate_init() - Initialize a migrate context
 * @tile: Back-pointer to the tile we're initializing for.
 *
 * Return: Pointer to a migrate context on success. Error pointer on error.
 */
struct xe_migrate *xe_migrate_init(struct xe_tile *tile)
{
	struct xe_device *xe = tile_to_xe(tile);
	struct xe_gt *primary_gt = tile->primary_gt;
	struct xe_migrate *m;
	struct xe_vm *vm;
	int err;

	m = drmm_kzalloc(&xe->drm, sizeof(*m), GFP_KERNEL);
	if (!m)
		return ERR_PTR(-ENOMEM);

	m->tile = tile;

	/* Special layout, prepared below.. */
	vm = xe_vm_create(xe, XE_VM_FLAG_MIGRATION |
			  XE_VM_FLAG_SET_TILE_ID(tile));
	if (IS_ERR(vm))
		return ERR_CAST(vm);

	xe_vm_lock(vm, false);
	err = xe_migrate_prepare_vm(tile, m, vm);
	xe_vm_unlock(vm);
	if (err) {
		xe_vm_close_and_put(vm);
		return ERR_PTR(err);
	}

	if (xe->info.supports_usm) {
		struct xe_hw_engine *hwe = xe_gt_hw_engine(primary_gt,
							   XE_ENGINE_CLASS_COPY,
							   primary_gt->usm.reserved_bcs_instance,
							   false);
		u32 logical_mask = xe_migrate_usm_logical_mask(primary_gt);

		if (!hwe || !logical_mask)
			return ERR_PTR(-EINVAL);

		m->q = xe_exec_queue_create(xe, vm, logical_mask, 1, hwe,
					    EXEC_QUEUE_FLAG_KERNEL |
					    EXEC_QUEUE_FLAG_PERMANENT);
	} else {
		m->q = xe_exec_queue_create_class(xe, primary_gt, vm,
						  XE_ENGINE_CLASS_COPY,
						  EXEC_QUEUE_FLAG_KERNEL |
						  EXEC_QUEUE_FLAG_PERMANENT);
	}
	if (IS_ERR(m->q)) {
		xe_vm_close_and_put(vm);
		return ERR_CAST(m->q);
	}
	if (xe->info.supports_usm)
		m->q->priority = XE_EXEC_QUEUE_PRIORITY_KERNEL;

	mutex_init(&m->job_mutex);

	err = drmm_add_action_or_reset(&xe->drm, xe_migrate_fini, m);
	if (err)
		return ERR_PTR(err);

	return m;
}

static u64 xe_migrate_res_sizes(struct xe_res_cursor *cur)
{
	/*
	 * For VRAM we use identity mapped pages so we are limited to current
	 * cursor size. For system we program the pages ourselves so we have no
	 * such limitation.
	 */
	return min_t(u64, MAX_PREEMPTDISABLE_TRANSFER,
		     mem_type_is_vram(cur->mem_type) ? cur->size :
		     cur->remaining);
}

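/*
 * Compute the batch-buffer space needed for one chunk. For system memory,
 * each 4 KiB page takes one qword PTE (2 dwords) plus a 3-dword
 * MI_STORE_DATA_IMM header per run of at most 0x1ff qwords, on top of the
 * copy/clear command itself (@cmd_size). For VRAM the identity map is used
 * directly, so only the command itself is counted. The GPU VM offset to use
 * for the chunk is returned via @L0_ofs.
 */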
static u32 pte_update_size(struct xe_migrate *m,
			   bool is_vram,
			   struct ttm_resource *res,
			   struct xe_res_cursor *cur,
			   u64 *L0, u64 *L0_ofs, u32 *L0_pt,
			   u32 cmd_size, u32 pt_ofs, u32 avail_pts)
{
	u32 cmds = 0;

	*L0_pt = pt_ofs;
	if (!is_vram) {
		/* Clip L0 to available size */
		u64 size = min(*L0, (u64)avail_pts * SZ_2M);
		u64 num_4k_pages = DIV_ROUND_UP(size, XE_PAGE_SIZE);

		*L0 = size;
		*L0_ofs = xe_migrate_vm_addr(pt_ofs, 0);

		/* MI_STORE_DATA_IMM */
		cmds += 3 * DIV_ROUND_UP(num_4k_pages, 0x1ff);

		/* PDE qwords */
		cmds += num_4k_pages * 2;

		/* Each chunk has a single blit command */
		cmds += cmd_size;
	} else {
		/* Offset into identity map. */
		*L0_ofs = xe_migrate_vram_ofs(tile_to_xe(m->tile),
					      cur->start + vram_region_gpu_offset(res));
		cmds += cmd_size;
	}

	return cmds;
}

static void emit_pte(struct xe_migrate *m,
		     struct xe_bb *bb, u32 at_pt,
		     bool is_vram,
		     struct xe_res_cursor *cur,
		     u32 size, struct xe_bo *bo)
{
	u16 pat_index = tile_to_xe(m->tile)->pat.idx[XE_CACHE_WB];
	u32 ptes;
	u64 ofs = at_pt * XE_PAGE_SIZE;
	u64 cur_ofs;

	/*
	 * FIXME: Emitting VRAM PTEs to L0 PTs is forbidden. Currently
	 * we're only emitting VRAM PTEs during sanity tests, so when
	 * that's moved to a Kunit test, we should condition VRAM PTEs
	 * on running tests.
	 */

	ptes = DIV_ROUND_UP(size, XE_PAGE_SIZE);

	while (ptes) {
		u32 chunk = min(0x1ffU, ptes);

		bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk);
		bb->cs[bb->len++] = ofs;
		bb->cs[bb->len++] = 0;

		cur_ofs = ofs;
		ofs += chunk * 8;
		ptes -= chunk;

		while (chunk--) {
			u64 addr, flags = 0;
			bool devmem = false;

			addr = xe_res_dma(cur) & PAGE_MASK;
			if (is_vram) {
				/* Is this a 64K PTE entry? */
				if ((m->q->vm->flags & XE_VM_FLAG_64K) &&
				    !(cur_ofs & (16 * 8 - 1))) {
					xe_tile_assert(m->tile, IS_ALIGNED(addr, SZ_64K));
					flags |= XE_PTE_PS64;
				}

				addr += vram_region_gpu_offset(bo->ttm.resource);
				devmem = true;
			}

			addr = m->q->vm->pt_ops->pte_encode_addr(m->tile->xe,
								 addr, pat_index,
								 0, devmem, flags);
			bb->cs[bb->len++] = lower_32_bits(addr);
			bb->cs[bb->len++] = upper_32_bits(addr);

			xe_res_next(cur, min_t(u32, size, PAGE_SIZE));
			cur_ofs += 8;
		}
	}
}

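/*
 * Emit an XY_CTRL_SURF_COPY_BLT to move CCS (compression control surface)
 * metadata, with each side accessed either "indirectly" through its main
 * surface or directly as a linear buffer. Used both to copy CCS state
 * alongside the main data and to clear CCS state by copying from a zeroed
 * source (m->cleared_bo).
 */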
#define EMIT_COPY_CCS_DW 5
static void emit_copy_ccs(struct xe_gt *gt, struct xe_bb *bb,
			  u64 dst_ofs, bool dst_is_indirect,
			  u64 src_ofs, bool src_is_indirect,
			  u32 size)
{
	struct xe_device *xe = gt_to_xe(gt);
	u32 *cs = bb->cs + bb->len;
	u32 num_ccs_blks;
	u32 mocs;

	num_ccs_blks = DIV_ROUND_UP(xe_device_ccs_bytes(gt_to_xe(gt), size),
				    NUM_CCS_BYTES_PER_BLOCK);
	xe_gt_assert(gt, num_ccs_blks <= NUM_CCS_BLKS_PER_XFER);

	if (GRAPHICS_VERx100(xe) >= 2000)
		mocs = FIELD_PREP(XE2_XY_CTRL_SURF_MOCS_INDEX_MASK, gt->mocs.uc_index);
	else
		mocs = FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, gt->mocs.uc_index);

	*cs++ = XY_CTRL_SURF_COPY_BLT |
		(src_is_indirect ? 0x0 : 0x1) << SRC_ACCESS_TYPE_SHIFT |
		(dst_is_indirect ? 0x0 : 0x1) << DST_ACCESS_TYPE_SHIFT |
		((num_ccs_blks - 1) & CCS_SIZE_MASK) << CCS_SIZE_SHIFT;
	*cs++ = lower_32_bits(src_ofs);
	*cs++ = upper_32_bits(src_ofs) | mocs;
	*cs++ = lower_32_bits(dst_ofs);
	*cs++ = upper_32_bits(dst_ofs) | mocs;

	bb->len = cs - bb->cs;
}

#define EMIT_COPY_DW 10
static void emit_copy(struct xe_gt *gt, struct xe_bb *bb,
		      u64 src_ofs, u64 dst_ofs, unsigned int size,
		      unsigned int pitch)
{
	struct xe_device *xe = gt_to_xe(gt);
	u32 mocs = 0;
	u32 tile_y = 0;

	xe_gt_assert(gt, size / pitch <= S16_MAX);
	xe_gt_assert(gt, pitch / 4 <= S16_MAX);
	xe_gt_assert(gt, pitch <= U16_MAX);

	if (GRAPHICS_VER(xe) >= 20)
		mocs = FIELD_PREP(XE2_XY_FAST_COPY_BLT_MOCS_INDEX_MASK, gt->mocs.uc_index);

	if (GRAPHICS_VERx100(xe) >= 1250)
		tile_y = XY_FAST_COPY_BLT_D1_SRC_TILE4 | XY_FAST_COPY_BLT_D1_DST_TILE4;

	bb->cs[bb->len++] = XY_FAST_COPY_BLT_CMD | (10 - 2);
	bb->cs[bb->len++] = XY_FAST_COPY_BLT_DEPTH_32 | pitch | tile_y | mocs;
	bb->cs[bb->len++] = 0;
	bb->cs[bb->len++] = (size / pitch) << 16 | pitch / 4;
	bb->cs[bb->len++] = lower_32_bits(dst_ofs);
	bb->cs[bb->len++] = upper_32_bits(dst_ofs);
	bb->cs[bb->len++] = 0;
	bb->cs[bb->len++] = pitch | mocs;
	bb->cs[bb->len++] = lower_32_bits(src_ofs);
	bb->cs[bb->len++] = upper_32_bits(src_ofs);
}

static int job_add_deps(struct xe_sched_job *job, struct dma_resv *resv,
			enum dma_resv_usage usage)
{
	return drm_sched_job_add_resv_dependencies(&job->drm, resv, usage);
}

static u64 xe_migrate_batch_base(struct xe_migrate *m, bool usm)
{
	return usm ? m->usm_batch_base_ofs : m->batch_base_ofs;
}

static u32 xe_migrate_ccs_copy(struct xe_migrate *m,
			       struct xe_bb *bb,
			       u64 src_ofs, bool src_is_vram,
			       u64 dst_ofs, bool dst_is_vram, u32 dst_size,
			       u64 ccs_ofs, bool copy_ccs)
{
	struct xe_gt *gt = m->tile->primary_gt;
	u32 flush_flags = 0;

	if (xe_device_has_flat_ccs(gt_to_xe(gt)) && !copy_ccs && dst_is_vram) {
		/*
		 * If the src is already in vram, then it should already
		 * have been cleared by us, or has been populated by the
		 * user. Make sure we copy the CCS aux state as-is.
		 *
		 * Otherwise if the bo doesn't have any CCS metadata attached,
		 * we still need to clear it for security reasons.
		 */
		u64 ccs_src_ofs = src_is_vram ? src_ofs : m->cleared_vram_ofs;

		emit_copy_ccs(gt, bb,
			      dst_ofs, true,
			      ccs_src_ofs, src_is_vram, dst_size);

		flush_flags = MI_FLUSH_DW_CCS;
	} else if (copy_ccs) {
		if (!src_is_vram)
			src_ofs = ccs_ofs;
		else if (!dst_is_vram)
			dst_ofs = ccs_ofs;

		/*
		 * At the moment, we don't support copying CCS metadata from
		 * system to system.
		 */
		xe_gt_assert(gt, src_is_vram || dst_is_vram);

		emit_copy_ccs(gt, bb, dst_ofs, dst_is_vram, src_ofs,
			      src_is_vram, dst_size);
		if (dst_is_vram)
			flush_flags = MI_FLUSH_DW_CCS;
	}

	return flush_flags;
}

/**
 * xe_migrate_copy() - Copy content of TTM resources.
 * @m: The migration context.
 * @src_bo: The buffer object @src is currently bound to.
 * @dst_bo: If copying between resources created for the same bo, set this to
 * the same value as @src_bo. If copying between buffer objects, set it to
 * the buffer object @dst is currently bound to.
 * @src: The source TTM resource.
 * @dst: The dst TTM resource.
 *
 * Copies the contents of @src to @dst: On flat CCS devices,
 * the CCS metadata is copied as well if needed, or if not present,
 * the CCS metadata of @dst is cleared for security reasons.
 *
 * Return: Pointer to a dma_fence representing the last copy batch, or
 * an error pointer on failure. If there is a failure, any copy operation
 * started by the function call has been synced.
 */
struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
				  struct xe_bo *src_bo,
				  struct xe_bo *dst_bo,
				  struct ttm_resource *src,
				  struct ttm_resource *dst)
{
	struct xe_gt *gt = m->tile->primary_gt;
	struct xe_device *xe = gt_to_xe(gt);
	struct dma_fence *fence = NULL;
	u64 size = src_bo->size;
	struct xe_res_cursor src_it, dst_it, ccs_it;
	u64 src_L0_ofs, dst_L0_ofs;
	u32 src_L0_pt, dst_L0_pt;
	u64 src_L0, dst_L0;
	int pass = 0;
	int err;
	bool src_is_vram = mem_type_is_vram(src->mem_type);
	bool dst_is_vram = mem_type_is_vram(dst->mem_type);
	bool copy_ccs = xe_device_has_flat_ccs(xe) &&
		xe_bo_needs_ccs_pages(src_bo) && xe_bo_needs_ccs_pages(dst_bo);
	bool copy_system_ccs = copy_ccs && (!src_is_vram || !dst_is_vram);

	/* Copying CCS between two different BOs is not supported yet. */
	if (XE_WARN_ON(copy_ccs && src_bo != dst_bo))
		return ERR_PTR(-EINVAL);

	if (src_bo != dst_bo && XE_WARN_ON(src_bo->size != dst_bo->size))
		return ERR_PTR(-EINVAL);

	if (!src_is_vram)
		xe_res_first_sg(xe_bo_sg(src_bo), 0, size, &src_it);
	else
		xe_res_first(src, 0, size, &src_it);
	if (!dst_is_vram)
		xe_res_first_sg(xe_bo_sg(dst_bo), 0, size, &dst_it);
	else
		xe_res_first(dst, 0, size, &dst_it);

	if (copy_system_ccs)
		xe_res_first_sg(xe_bo_sg(src_bo), xe_bo_ccs_pages_start(src_bo),
				PAGE_ALIGN(xe_device_ccs_bytes(xe, size)),
				&ccs_it);

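	/*
	 * Copy in passes of at most MAX_PREEMPTDISABLE_TRANSFER bytes. Each
	 * pass builds one batch: PTEs for any system-memory side are emitted
	 * first, then MI_BATCH_BUFFER_END terminates the page-table update
	 * part, and the copy (plus any CCS handling) is recorded from
	 * update_idx onwards as the second part of the migration job.
	 */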
	while (size) {
		u32 batch_size = 2; /* arb_clear() + MI_BATCH_BUFFER_END */
		struct xe_sched_job *job;
		struct xe_bb *bb;
		u32 flush_flags;
		u32 update_idx;
		u64 ccs_ofs, ccs_size;
		u32 ccs_pt;
		bool usm = xe->info.supports_usm;

		src_L0 = xe_migrate_res_sizes(&src_it);
		dst_L0 = xe_migrate_res_sizes(&dst_it);

		drm_dbg(&xe->drm, "Pass %u, sizes: %llu & %llu\n",
			pass++, src_L0, dst_L0);

		src_L0 = min(src_L0, dst_L0);

		batch_size += pte_update_size(m, src_is_vram, src, &src_it, &src_L0,
					      &src_L0_ofs, &src_L0_pt, 0, 0,
					      NUM_PT_PER_BLIT);

		batch_size += pte_update_size(m, dst_is_vram, dst, &dst_it, &src_L0,
					      &dst_L0_ofs, &dst_L0_pt, 0,
					      NUM_PT_PER_BLIT, NUM_PT_PER_BLIT);

		if (copy_system_ccs) {
			ccs_size = xe_device_ccs_bytes(xe, src_L0);
			batch_size += pte_update_size(m, false, NULL, &ccs_it, &ccs_size,
						      &ccs_ofs, &ccs_pt, 0,
						      2 * NUM_PT_PER_BLIT,
						      NUM_PT_PER_BLIT);
		}

		/* Add copy commands size here */
		batch_size += EMIT_COPY_DW +
			(xe_device_has_flat_ccs(xe) ? EMIT_COPY_CCS_DW : 0);

		bb = xe_bb_new(gt, batch_size, usm);
		if (IS_ERR(bb)) {
			err = PTR_ERR(bb);
			goto err_sync;
		}

		if (!src_is_vram)
			emit_pte(m, bb, src_L0_pt, src_is_vram, &src_it, src_L0,
				 src_bo);
		else
			xe_res_next(&src_it, src_L0);

		if (!dst_is_vram)
			emit_pte(m, bb, dst_L0_pt, dst_is_vram, &dst_it, src_L0,
				 dst_bo);
		else
			xe_res_next(&dst_it, src_L0);

		if (copy_system_ccs)
			emit_pte(m, bb, ccs_pt, false, &ccs_it, ccs_size, src_bo);

		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
		update_idx = bb->len;

		emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, src_L0,
			  XE_PAGE_SIZE);
		flush_flags = xe_migrate_ccs_copy(m, bb, src_L0_ofs, src_is_vram,
						  dst_L0_ofs, dst_is_vram,
						  src_L0, ccs_ofs, copy_ccs);

		mutex_lock(&m->job_mutex);
		job = xe_bb_create_migration_job(m->q, bb,
						 xe_migrate_batch_base(m, usm),
						 update_idx);
		if (IS_ERR(job)) {
			err = PTR_ERR(job);
			goto err;
		}

		xe_sched_job_add_migrate_flush(job, flush_flags);
		if (!fence) {
			err = job_add_deps(job, src_bo->ttm.base.resv,
					   DMA_RESV_USAGE_BOOKKEEP);
			if (!err && src_bo != dst_bo)
				err = job_add_deps(job, dst_bo->ttm.base.resv,
						   DMA_RESV_USAGE_BOOKKEEP);
			if (err)
				goto err_job;
		}

		xe_sched_job_arm(job);
		dma_fence_put(fence);
		fence = dma_fence_get(&job->drm.s_fence->finished);
		xe_sched_job_push(job);

		dma_fence_put(m->fence);
		m->fence = dma_fence_get(fence);

		mutex_unlock(&m->job_mutex);

		xe_bb_free(bb, fence);
		size -= src_L0;
		continue;

err_job:
		xe_sched_job_put(job);
err:
		mutex_unlock(&m->job_mutex);
		xe_bb_free(bb, NULL);

err_sync:
		/* Sync partial copy if any. FIXME: under job_mutex? */
		if (fence) {
			dma_fence_wait(fence, false);
			dma_fence_put(fence);
		}

		return ERR_PTR(err);
	}

	return fence;
}

static void emit_clear_link_copy(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs,
				 u32 size, u32 pitch)
{
	struct xe_device *xe = gt_to_xe(gt);
	u32 *cs = bb->cs + bb->len;
	u32 len = PVC_MEM_SET_CMD_LEN_DW;

	*cs++ = PVC_MEM_SET_CMD | PVC_MEM_SET_MATRIX | (len - 2);
	*cs++ = pitch - 1;
	*cs++ = (size / pitch) - 1;
	*cs++ = pitch - 1;
	*cs++ = lower_32_bits(src_ofs);
	*cs++ = upper_32_bits(src_ofs);
	if (GRAPHICS_VERx100(xe) >= 2000)
		*cs++ = FIELD_PREP(XE2_MEM_SET_MOCS_INDEX_MASK, gt->mocs.uc_index);
	else
		*cs++ = FIELD_PREP(PVC_MEM_SET_MOCS_INDEX_MASK, gt->mocs.uc_index);

	xe_gt_assert(gt, cs - bb->cs == len + bb->len);

	bb->len += len;
}

static void emit_clear_main_copy(struct xe_gt *gt, struct xe_bb *bb,
				 u64 src_ofs, u32 size, u32 pitch, bool is_vram)
{
	struct xe_device *xe = gt_to_xe(gt);
	u32 *cs = bb->cs + bb->len;
	u32 len = XY_FAST_COLOR_BLT_DW;

	if (GRAPHICS_VERx100(xe) < 1250)
		len = 11;

	*cs++ = XY_FAST_COLOR_BLT_CMD | XY_FAST_COLOR_BLT_DEPTH_32 |
		(len - 2);
	if (GRAPHICS_VERx100(xe) >= 2000)
		*cs++ = FIELD_PREP(XE2_XY_FAST_COLOR_BLT_MOCS_INDEX_MASK, gt->mocs.uc_index) |
			(pitch - 1);
	else
		*cs++ = FIELD_PREP(XY_FAST_COLOR_BLT_MOCS_MASK, gt->mocs.uc_index) |
			(pitch - 1);
	*cs++ = 0;
	*cs++ = (size / pitch) << 16 | pitch / 4;
	*cs++ = lower_32_bits(src_ofs);
	*cs++ = upper_32_bits(src_ofs);
	*cs++ = (is_vram ? 0x0 : 0x1) << XY_FAST_COLOR_BLT_MEM_TYPE_SHIFT;
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = 0;

	if (len > 11) {
		*cs++ = 0;
		*cs++ = 0;
		*cs++ = 0;
		*cs++ = 0;
		*cs++ = 0;
	}

	xe_gt_assert(gt, cs - bb->cs == len + bb->len);

	bb->len += len;
}

static bool has_service_copy_support(struct xe_gt *gt)
{
	/*
	 * What we care about is whether the architecture was designed with
	 * service copy functionality (specifically the new MEM_SET / MEM_COPY
	 * instructions) so check the architectural engine list rather than the
	 * actual list since these instructions are usable on BCS0 even if
	 * all of the actual service copy engines (BCS1-BCS8) have been fused
	 * off.
	 */
	return gt->info.__engine_mask & GENMASK(XE_HW_ENGINE_BCS8,
						XE_HW_ENGINE_BCS1);
}

static u32 emit_clear_cmd_len(struct xe_gt *gt)
{
	if (has_service_copy_support(gt))
		return PVC_MEM_SET_CMD_LEN_DW;
	else
		return XY_FAST_COLOR_BLT_DW;
}

static void emit_clear(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs,
		       u32 size, u32 pitch, bool is_vram)
{
	if (has_service_copy_support(gt))
		emit_clear_link_copy(gt, bb, src_ofs, size, pitch);
	else
		emit_clear_main_copy(gt, bb, src_ofs, size, pitch,
				     is_vram);
}

/**
 * xe_migrate_clear() - Clear content of a TTM resource.
 * @m: The migration context.
 * @bo: The buffer object @dst is currently bound to.
 * @dst: The dst TTM resource to be cleared.
 *
 * Clear the contents of @dst to zero. On flat CCS devices,
 * the CCS metadata is cleared to zero as well on VRAM destinations.
 * TODO: Eliminate the @bo argument.
 *
 * Return: Pointer to a dma_fence representing the last clear batch, or
 * an error pointer on failure. If there is a failure, any clear operation
 * started by the function call has been synced.
 */
struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
				   struct xe_bo *bo,
				   struct ttm_resource *dst)
{
	bool clear_vram = mem_type_is_vram(dst->mem_type);
	struct xe_gt *gt = m->tile->primary_gt;
	struct xe_device *xe = gt_to_xe(gt);
	struct dma_fence *fence = NULL;
	u64 size = bo->size;
	struct xe_res_cursor src_it;
	struct ttm_resource *src = dst;
	int err;
	int pass = 0;

	if (!clear_vram)
		xe_res_first_sg(xe_bo_sg(bo), 0, bo->size, &src_it);
	else
		xe_res_first(src, 0, bo->size, &src_it);

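	/*
	 * Like xe_migrate_copy(), clear in passes of at most
	 * MAX_PREEMPTDISABLE_TRANSFER bytes, with the page-table update part
	 * of each batch separated from the clear commands by update_idx.
	 */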
	while (size) {
		u64 clear_L0_ofs;
		u32 clear_L0_pt;
		u32 flush_flags = 0;
		u64 clear_L0;
		struct xe_sched_job *job;
		struct xe_bb *bb;
		u32 batch_size, update_idx;
		bool usm = xe->info.supports_usm;

		clear_L0 = xe_migrate_res_sizes(&src_it);
		drm_dbg(&xe->drm, "Pass %u, size: %llu\n", pass++, clear_L0);

		/* Calculate final sizes and batch size.. */
		batch_size = 2 +
			pte_update_size(m, clear_vram, src, &src_it,
					&clear_L0, &clear_L0_ofs, &clear_L0_pt,
					emit_clear_cmd_len(gt), 0,
					NUM_PT_PER_BLIT);
		if (xe_device_has_flat_ccs(xe) && clear_vram)
			batch_size += EMIT_COPY_CCS_DW;

		/* Clear commands */

		if (WARN_ON_ONCE(!clear_L0))
			break;

		bb = xe_bb_new(gt, batch_size, usm);
		if (IS_ERR(bb)) {
			err = PTR_ERR(bb);
			goto err_sync;
		}

		size -= clear_L0;

		/* Preemption is enabled again by the ring ops. */
		if (!clear_vram) {
			emit_pte(m, bb, clear_L0_pt, clear_vram, &src_it, clear_L0,
				 bo);
		} else {
			xe_res_next(&src_it, clear_L0);
		}
		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
		update_idx = bb->len;

		emit_clear(gt, bb, clear_L0_ofs, clear_L0, XE_PAGE_SIZE,
			   clear_vram);
		if (xe_device_has_flat_ccs(xe) && clear_vram) {
			emit_copy_ccs(gt, bb, clear_L0_ofs, true,
				      m->cleared_vram_ofs, false, clear_L0);
			flush_flags = MI_FLUSH_DW_CCS;
		}

		mutex_lock(&m->job_mutex);
		job = xe_bb_create_migration_job(m->q, bb,
						 xe_migrate_batch_base(m, usm),
						 update_idx);
		if (IS_ERR(job)) {
			err = PTR_ERR(job);
			goto err;
		}

		xe_sched_job_add_migrate_flush(job, flush_flags);
		if (!fence) {
			/*
			 * There can't be anything userspace related at this
			 * point, so we just need to respect any potential move
			 * fences, which are always tracked as
			 * DMA_RESV_USAGE_KERNEL.
			 */
			err = job_add_deps(job, bo->ttm.base.resv,
					   DMA_RESV_USAGE_KERNEL);
			if (err)
				goto err_job;
		}

		xe_sched_job_arm(job);
		dma_fence_put(fence);
		fence = dma_fence_get(&job->drm.s_fence->finished);
		xe_sched_job_push(job);

		dma_fence_put(m->fence);
		m->fence = dma_fence_get(fence);

		mutex_unlock(&m->job_mutex);

		xe_bb_free(bb, fence);
		continue;

err_job:
		xe_sched_job_put(job);
err:
		mutex_unlock(&m->job_mutex);
		xe_bb_free(bb, NULL);
err_sync:
		/* Sync partial clears if any. FIXME: job_mutex? */
		if (fence) {
			dma_fence_wait(fence, false);
			dma_fence_put(fence);
		}

		return ERR_PTR(err);
	}

	return fence;
}

static void write_pgtable(struct xe_tile *tile, struct xe_bb *bb, u64 ppgtt_ofs,
			  const struct xe_vm_pgtable_update *update,
			  struct xe_migrate_pt_update *pt_update)
{
	const struct xe_migrate_pt_update_ops *ops = pt_update->ops;
	u32 chunk;
	u32 ofs = update->ofs, size = update->qwords;

	/*
	 * If we have 512 entries (max), we would populate it ourselves,
	 * and update the PDE above it to the new pointer.
	 * The only time this can happen is if we have to update the top
	 * PDE. This requires a BO that is almost vm->size big.
	 *
	 * This shouldn't be possible in practice.. might change when 16K
	 * pages are used. Hence the assert.
	 */
	xe_tile_assert(tile, update->qwords <= 0x1ff);
	if (!ppgtt_ofs)
		ppgtt_ofs = xe_migrate_vram_ofs(tile_to_xe(tile),
						xe_bo_addr(update->pt_bo, 0,
							   XE_PAGE_SIZE));

	do {
		u64 addr = ppgtt_ofs + ofs * 8;

		chunk = min(update->qwords, 0x1ffU);

		/* Ensure populatefn can do memset64 by aligning bb->cs */
		if (!(bb->len & 1))
			bb->cs[bb->len++] = MI_NOOP;

		bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk);
		bb->cs[bb->len++] = lower_32_bits(addr);
		bb->cs[bb->len++] = upper_32_bits(addr);
		ops->populate(pt_update, tile, NULL, bb->cs + bb->len, ofs, chunk,
			      update);

		bb->len += chunk * 2;
		ofs += chunk;
		size -= chunk;
	} while (size);
}

struct xe_vm *xe_migrate_get_vm(struct xe_migrate *m)
{
	return xe_vm_get(m->q->vm);
}

#if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST)
struct migrate_test_params {
	struct xe_test_priv base;
	bool force_gpu;
};

#define to_migrate_test_params(_priv) \
	container_of(_priv, struct migrate_test_params, base)
#endif

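/*
 * CPU path for page-table updates: apply @updates directly through the CPU
 * mapping of the page-table BOs. Returns ERR_PTR(-ETIME) when the updates
 * cannot be applied immediately (forced GPU path in tests, or unsignalled
 * kernel/bookkeep fences on the bo or vm resv), in which case the caller
 * falls back to the pipelined GPU path.
 */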
static struct dma_fence *
xe_migrate_update_pgtables_cpu(struct xe_migrate *m,
			       struct xe_vm *vm, struct xe_bo *bo,
			       const struct xe_vm_pgtable_update *updates,
			       u32 num_updates, bool wait_vm,
			       struct xe_migrate_pt_update *pt_update)
{
	XE_TEST_DECLARE(struct migrate_test_params *test =
				to_migrate_test_params
				(xe_cur_kunit_priv(XE_TEST_LIVE_MIGRATE));)
	const struct xe_migrate_pt_update_ops *ops = pt_update->ops;
	struct dma_fence *fence;
	int err;
	u32 i;

	if (XE_TEST_ONLY(test && test->force_gpu))
		return ERR_PTR(-ETIME);

	if (bo && !dma_resv_test_signaled(bo->ttm.base.resv,
					  DMA_RESV_USAGE_KERNEL))
		return ERR_PTR(-ETIME);

	if (wait_vm && !dma_resv_test_signaled(xe_vm_resv(vm),
					       DMA_RESV_USAGE_BOOKKEEP))
		return ERR_PTR(-ETIME);

	if (ops->pre_commit) {
		pt_update->job = NULL;
		err = ops->pre_commit(pt_update);
		if (err)
			return ERR_PTR(err);
	}
	for (i = 0; i < num_updates; i++) {
		const struct xe_vm_pgtable_update *update = &updates[i];

		ops->populate(pt_update, m->tile, &update->pt_bo->vmap, NULL,
			      update->ofs, update->qwords, update);
	}

	if (vm) {
		trace_xe_vm_cpu_bind(vm);
		xe_device_wmb(vm->xe);
	}

	fence = dma_fence_get_stub();

	return fence;
}

static bool no_in_syncs(struct xe_sync_entry *syncs, u32 num_syncs)
{
	int i;

	for (i = 0; i < num_syncs; i++) {
		struct dma_fence *fence = syncs[i].fence;

		if (fence && !test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
				       &fence->flags))
			return false;
	}

	return true;
}

/**
 * xe_migrate_update_pgtables() - Pipelined page-table update
 * @m: The migrate context.
 * @vm: The vm we'll be updating.
 * @bo: The bo whose dma-resv we will await before updating, or NULL if userptr.
 * @q: The exec queue to be used for the update or NULL if the default
 * migration engine is to be used.
 * @updates: An array of update descriptors.
 * @num_updates: Number of descriptors in @updates.
 * @syncs: Array of xe_sync_entry to await before updating. Note that waits
 * will block the engine timeline.
 * @num_syncs: Number of entries in @syncs.
 * @pt_update: Pointer to a struct xe_migrate_pt_update, which contains
 * pointers to callback functions and, if subclassed, private arguments to
 * those.
 *
 * Perform a pipelined page-table update. The update descriptors are typically
 * built under the same lock critical section as a call to this function. If
 * using the default engine for the updates, they will be performed in the
 * order they grab the job_mutex. If different engines are used, external
 * synchronization is needed for overlapping updates to maintain page-table
 * consistency. Note that the meaning of "overlapping" is that the updates
 * touch the same page-table, which might be a higher-level page-directory.
 * If no pipelining is needed, then updates may be performed by the cpu.
 *
 * Return: A dma_fence that, when signaled, indicates the update completion.
 */
struct dma_fence *
xe_migrate_update_pgtables(struct xe_migrate *m,
			   struct xe_vm *vm,
			   struct xe_bo *bo,
			   struct xe_exec_queue *q,
			   const struct xe_vm_pgtable_update *updates,
			   u32 num_updates,
			   struct xe_sync_entry *syncs, u32 num_syncs,
			   struct xe_migrate_pt_update *pt_update)
{
	const struct xe_migrate_pt_update_ops *ops = pt_update->ops;
	struct xe_tile *tile = m->tile;
	struct xe_gt *gt = tile->primary_gt;
	struct xe_device *xe = tile_to_xe(tile);
	struct xe_sched_job *job;
	struct dma_fence *fence;
	struct drm_suballoc *sa_bo = NULL;
	struct xe_vma *vma = pt_update->vma;
	struct xe_bb *bb;
	u32 i, batch_size, ppgtt_ofs, update_idx, page_ofs = 0;
	u64 addr;
	int err = 0;
	bool usm = !q && xe->info.supports_usm;
	bool first_munmap_rebind = vma &&
		vma->gpuva.flags & XE_VMA_FIRST_REBIND;
	struct xe_exec_queue *q_override = !q ? m->q : q;
	u16 pat_index = xe->pat.idx[XE_CACHE_WB];

	/* Use the CPU if no in-syncs and engine is idle */
	if (no_in_syncs(syncs, num_syncs) && xe_exec_queue_is_idle(q_override)) {
		fence = xe_migrate_update_pgtables_cpu(m, vm, bo, updates,
						       num_updates,
						       first_munmap_rebind,
						       pt_update);
		if (!IS_ERR(fence) || fence == ERR_PTR(-EAGAIN))
			return fence;
	}

	/* fixed + PTE entries */
	if (IS_DGFX(xe))
		batch_size = 2;
	else
		batch_size = 6 + num_updates * 2;

	for (i = 0; i < num_updates; i++) {
		u32 num_cmds = DIV_ROUND_UP(updates[i].qwords, 0x1ff);

		/* align noop + MI_STORE_DATA_IMM cmd prefix */
		batch_size += 4 * num_cmds + updates[i].qwords * 2;
	}

	/*
	 * XXX: Create temp bo to copy from, if batch_size becomes too big?
	 *
	 * Worst case: Sum(2 * (each lower level page size) + (top level page size))
	 * Should be reasonably bound..
	 */
	xe_tile_assert(tile, batch_size < SZ_128K);

	bb = xe_bb_new(gt, batch_size, !q && xe->info.supports_usm);
	if (IS_ERR(bb))
		return ERR_CAST(bb);

	/* For sysmem PTE's, need to map them in our hole.. */
	if (!IS_DGFX(xe)) {
		ppgtt_ofs = NUM_KERNEL_PDE - 1;
		if (q) {
			xe_tile_assert(tile, num_updates <= NUM_VMUSA_WRITES_PER_UNIT);

			sa_bo = drm_suballoc_new(&m->vm_update_sa, 1,
						 GFP_KERNEL, true, 0);
			if (IS_ERR(sa_bo)) {
				err = PTR_ERR(sa_bo);
				goto err;
			}

			ppgtt_ofs = NUM_KERNEL_PDE +
				(drm_suballoc_soffset(sa_bo) /
				 NUM_VMUSA_UNIT_PER_PAGE);
			page_ofs = (drm_suballoc_soffset(sa_bo) %
				    NUM_VMUSA_UNIT_PER_PAGE) *
				VM_SA_UPDATE_UNIT_SIZE;
		}

		/* Map our PT's to gtt */
		bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(num_updates);
		bb->cs[bb->len++] = ppgtt_ofs * XE_PAGE_SIZE + page_ofs;
		bb->cs[bb->len++] = 0; /* upper_32_bits */

		for (i = 0; i < num_updates; i++) {
			struct xe_bo *pt_bo = updates[i].pt_bo;

			xe_tile_assert(tile, pt_bo->size == SZ_4K);

			addr = vm->pt_ops->pte_encode_bo(pt_bo, 0, pat_index, 0);
			bb->cs[bb->len++] = lower_32_bits(addr);
			bb->cs[bb->len++] = upper_32_bits(addr);
		}

		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
		update_idx = bb->len;

		addr = xe_migrate_vm_addr(ppgtt_ofs, 0) +
			(page_ofs / sizeof(u64)) * XE_PAGE_SIZE;
		for (i = 0; i < num_updates; i++)
			write_pgtable(tile, bb, addr + i * XE_PAGE_SIZE,
				      &updates[i], pt_update);
	} else {
		/* phys pages, no preamble required */
		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
		update_idx = bb->len;

		for (i = 0; i < num_updates; i++)
			write_pgtable(tile, bb, 0, &updates[i], pt_update);
	}

	if (!q)
		mutex_lock(&m->job_mutex);

	job = xe_bb_create_migration_job(q ?: m->q, bb,
					 xe_migrate_batch_base(m, usm),
					 update_idx);
	if (IS_ERR(job)) {
		err = PTR_ERR(job);
		goto err_bb;
	}

	/* Wait on BO move */
	if (bo) {
		err = job_add_deps(job, bo->ttm.base.resv,
				   DMA_RESV_USAGE_KERNEL);
		if (err)
			goto err_job;
	}

	/*
	 * Munmap style VM unbind, need to wait for all jobs to be complete /
	 * trigger preempts before moving forward
	 */
	if (first_munmap_rebind) {
		err = job_add_deps(job, xe_vm_resv(vm),
				   DMA_RESV_USAGE_BOOKKEEP);
		if (err)
			goto err_job;
	}

	for (i = 0; !err && i < num_syncs; i++)
		err = xe_sync_entry_add_deps(&syncs[i], job);

	if (err)
		goto err_job;

	if (ops->pre_commit) {
		pt_update->job = job;
		err = ops->pre_commit(pt_update);
		if (err)
			goto err_job;
	}
	xe_sched_job_arm(job);
	fence = dma_fence_get(&job->drm.s_fence->finished);
	xe_sched_job_push(job);

	if (!q)
		mutex_unlock(&m->job_mutex);

	xe_bb_free(bb, fence);
	drm_suballoc_free(sa_bo, fence);

	return fence;

err_job:
	xe_sched_job_put(job);
err_bb:
	if (!q)
		mutex_unlock(&m->job_mutex);
	xe_bb_free(bb, NULL);
err:
	drm_suballoc_free(sa_bo, NULL);
	return ERR_PTR(err);
}

/**
 * xe_migrate_wait() - Complete all operations using the xe_migrate context
 * @m: Migrate context to wait for.
 *
 * Waits until the GPU no longer uses the migrate context's default engine
 * or its page-table objects. FIXME: What about separate page-table update
 * engines?
 */
void xe_migrate_wait(struct xe_migrate *m)
{
	if (m->fence)
		dma_fence_wait(m->fence, false);
}

#if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST)
#include "tests/xe_migrate.c"
#endif