// SPDX-License-Identifier: MIT
/*
 * Copyright © 2020 Intel Corporation
 */

#include "xe_migrate.h"

#include <linux/bitfield.h>
#include <linux/sizes.h>

#include <drm/drm_managed.h>
#include <drm/ttm/ttm_tt.h>
#include <drm/xe_drm.h>

#include "generated/xe_wa_oob.h"
#include "instructions/xe_mi_commands.h"
#include "regs/xe_gpu_commands.h"
#include "tests/xe_test.h"
#include "xe_assert.h"
#include "xe_bb.h"
#include "xe_bo.h"
#include "xe_exec_queue.h"
#include "xe_ggtt.h"
#include "xe_gt.h"
#include "xe_hw_engine.h"
#include "xe_lrc.h"
#include "xe_map.h"
#include "xe_mocs.h"
#include "xe_pt.h"
#include "xe_res_cursor.h"
#include "xe_sched_job.h"
#include "xe_sync.h"
#include "xe_trace.h"
#include "xe_vm.h"
#include "xe_wa.h"

/**
 * struct xe_migrate - migrate context.
 */
struct xe_migrate {
	/** @q: Default exec queue used for migration */
	struct xe_exec_queue *q;
	/** @tile: Backpointer to the tile this struct xe_migrate belongs to. */
	struct xe_tile *tile;
	/** @job_mutex: Timeline mutex for @q. */
	struct mutex job_mutex;
	/** @pt_bo: Page-table buffer object. */
	struct xe_bo *pt_bo;
	/**
	 * @cleared_bo: Zeroed out bo used as a source for CCS metadata clears
	 */
	struct xe_bo *cleared_bo;
	/** @batch_base_ofs: VM offset of the migration batch buffer */
	u64 batch_base_ofs;
	/** @usm_batch_base_ofs: VM offset of the usm batch buffer */
	u64 usm_batch_base_ofs;
	/** @cleared_vram_ofs: VM offset of @cleared_bo. */
	u64 cleared_vram_ofs;
	/**
	 * @fence: dma-fence representing the last migration job batch.
	 * Protected by @job_mutex.
	 */
	struct dma_fence *fence;
	/**
	 * @vm_update_sa: For integrated, used to suballocate page-tables
	 * out of the pt_bo.
	 */
	struct drm_suballoc_manager vm_update_sa;
};

#define MAX_PREEMPTDISABLE_TRANSFER SZ_8M /* Around 1ms. */
#define NUM_KERNEL_PDE 17
#define NUM_PT_SLOTS 32
#define NUM_PT_PER_BLIT (MAX_PREEMPTDISABLE_TRANSFER / SZ_2M)

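/*
 * For illustration: a level-0 page table (512 PTEs, each mapping 4 KiB)
 * spans 2 MiB of the migrate VM, so one MAX_PREEMPTDISABLE_TRANSFER chunk
 * of 8 MiB consumes NUM_PT_PER_BLIT = 4 of the NUM_PT_SLOTS = 32 slots.
 */
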
/**
 * xe_tile_migrate_engine() - Get this tile's migrate engine.
 * @tile: The tile.
 *
 * Returns the default migrate engine of this tile.
 * TODO: Perhaps this function is slightly misplaced, and even unneeded?
 *
 * Return: The default migrate engine
 */
struct xe_exec_queue *xe_tile_migrate_engine(struct xe_tile *tile)
{
	return tile->migrate->q;
}

static void xe_migrate_fini(struct drm_device *dev, void *arg)
{
	struct xe_migrate *m = arg;

	xe_vm_lock(m->q->vm, false);
	xe_bo_unpin(m->pt_bo);
	if (m->cleared_bo)
		xe_bo_unpin(m->cleared_bo);
	xe_vm_unlock(m->q->vm);

	dma_fence_put(m->fence);
	if (m->cleared_bo)
		xe_bo_put(m->cleared_bo);
	xe_bo_put(m->pt_bo);
	drm_suballoc_manager_fini(&m->vm_update_sa);
	mutex_destroy(&m->job_mutex);
	xe_vm_close_and_put(m->q->vm);
	xe_exec_queue_put(m->q);
}

static u64 xe_migrate_vm_addr(u64 slot, u32 level)
{
	XE_WARN_ON(slot >= NUM_PT_SLOTS);

	/* First slot is reserved for mapping of PT bo and bb, start from 1 */
	return (slot + 1ULL) << xe_pt_shift(level + 1);
}

static u64 xe_migrate_vram_ofs(struct xe_device *xe, u64 addr)
{
	/*
	 * Remove the DPA to get a correct offset into identity table for the
	 * migrate offset
	 */
	addr -= xe->mem.vram.dpa_base;
	return addr + (256ULL << xe_pt_shift(2));
}

/*
 * For flat CCS clearing we need a cleared chunk of memory to copy from,
 * since the CCS clearing mode of XY_FAST_COLOR_BLT appears to be buggy
 * (it clears only 14 bytes in each chunk of 16).
 * If clearing the main surface one can use the part of the main surface
 * already cleared, but for clearing as part of copying non-compressed
 * data out of system memory, we don't readily have a cleared part of
 * VRAM to copy from, so create one to use for that case.
 */
static int xe_migrate_create_cleared_bo(struct xe_migrate *m, struct xe_vm *vm)
{
	struct xe_tile *tile = m->tile;
	struct xe_device *xe = vm->xe;
	size_t cleared_size;
	u64 vram_addr;

	if (!xe_device_has_flat_ccs(xe))
		return 0;

	cleared_size = xe_device_ccs_bytes(xe, MAX_PREEMPTDISABLE_TRANSFER);
	cleared_size = PAGE_ALIGN(cleared_size);
	m->cleared_bo = xe_bo_create_pin_map(xe, tile, vm, cleared_size,
					     ttm_bo_type_kernel,
					     XE_BO_CREATE_VRAM_IF_DGFX(tile) |
					     XE_BO_CREATE_PINNED_BIT);
	if (IS_ERR(m->cleared_bo))
		return PTR_ERR(m->cleared_bo);

	xe_map_memset(xe, &m->cleared_bo->vmap, 0, 0x00, cleared_size);
	vram_addr = xe_bo_addr(m->cleared_bo, 0, XE_PAGE_SIZE);
	m->cleared_vram_ofs = xe_migrate_vram_ofs(xe, vram_addr);

	return 0;
}

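/*
 * Worked example for xe_migrate_vram_ofs() above (illustrative numbers
 * only): with a zero dpa_base, a VRAM address of 0x10000 maps to the VM
 * offset 256 GiB + 64 KiB, i.e. into the 1 GiB identity-map entries that
 * xe_migrate_prepare_vm() below writes starting at the 256 GiB mark.
 */
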
static int xe_migrate_prepare_vm(struct xe_tile *tile, struct xe_migrate *m,
				 struct xe_vm *vm)
{
	struct xe_device *xe = tile_to_xe(tile);
	u16 pat_index = xe->pat.idx[XE_CACHE_WB];
	u8 id = tile->id;
	u32 num_entries = NUM_PT_SLOTS, num_level = vm->pt_root[id]->level;
	u32 map_ofs, level, i;
	struct xe_bo *bo, *batch = tile->mem.kernel_bb_pool->bo;
	u64 entry;
	int ret;

	/* Can't bump NUM_PT_SLOTS too high */
	BUILD_BUG_ON(NUM_PT_SLOTS > SZ_2M/XE_PAGE_SIZE);
	/* Must be a multiple of 64K to support all platforms */
	BUILD_BUG_ON(NUM_PT_SLOTS * XE_PAGE_SIZE % SZ_64K);
	/* And one slot reserved for the 4KiB page table updates */
	BUILD_BUG_ON(!(NUM_KERNEL_PDE & 1));

	/* Need to be sure everything fits in the first PT, or create more */
	xe_tile_assert(tile, m->batch_base_ofs + batch->size < SZ_2M);

	bo = xe_bo_create_pin_map(vm->xe, tile, vm,
				  num_entries * XE_PAGE_SIZE,
				  ttm_bo_type_kernel,
				  XE_BO_CREATE_VRAM_IF_DGFX(tile) |
				  XE_BO_CREATE_PINNED_BIT);
	if (IS_ERR(bo))
		return PTR_ERR(bo);

	ret = xe_migrate_create_cleared_bo(m, vm);
	if (ret) {
		xe_bo_put(bo);
		return ret;
	}

	entry = vm->pt_ops->pde_encode_bo(bo, bo->size - XE_PAGE_SIZE, pat_index);
	xe_pt_write(xe, &vm->pt_root[id]->bo->vmap, 0, entry);

	map_ofs = (num_entries - num_level) * XE_PAGE_SIZE;

	/* Map the entire BO in our level 0 pt */
	for (i = 0, level = 0; i < num_entries; level++) {
		entry = vm->pt_ops->pte_encode_bo(bo, i * XE_PAGE_SIZE,
						  pat_index, 0);

		xe_map_wr(xe, &bo->vmap, map_ofs + level * 8, u64, entry);

		if (vm->flags & XE_VM_FLAG_64K)
			i += 16;
		else
			i += 1;
	}

	if (!IS_DGFX(xe)) {
		xe_tile_assert(tile, !xe->info.supports_usm);

		/* Write out batch too */
		m->batch_base_ofs = NUM_PT_SLOTS * XE_PAGE_SIZE;
		for (i = 0; i < batch->size;
		     i += vm->flags & XE_VM_FLAG_64K ? XE_64K_PAGE_SIZE :
		     XE_PAGE_SIZE) {
			entry = vm->pt_ops->pte_encode_bo(batch, i,
							  pat_index, 0);

			xe_map_wr(xe, &bo->vmap, map_ofs + level * 8, u64,
				  entry);
			level++;
		}
	} else {
		u64 batch_addr = xe_bo_addr(batch, 0, XE_PAGE_SIZE);

		m->batch_base_ofs = xe_migrate_vram_ofs(xe, batch_addr);

		if (xe->info.supports_usm) {
			batch = tile->primary_gt->usm.bb_pool->bo;
			batch_addr = xe_bo_addr(batch, 0, XE_PAGE_SIZE);
			m->usm_batch_base_ofs = xe_migrate_vram_ofs(xe, batch_addr);
		}
	}

	for (level = 1; level < num_level; level++) {
		u32 flags = 0;

		if (vm->flags & XE_VM_FLAG_64K && level == 1)
			flags = XE_PDE_64K;

		entry = vm->pt_ops->pde_encode_bo(bo, map_ofs + (level - 1) *
						  XE_PAGE_SIZE, pat_index);
		xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE * level, u64,
			  entry | flags);
	}

	/* Write PDE's that point to our BO. */
	for (i = 0; i < num_entries - num_level; i++) {
		entry = vm->pt_ops->pde_encode_bo(bo, i * XE_PAGE_SIZE,
						  pat_index);

		xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE +
			  (i + 1) * 8, u64, entry);
	}

	/* Identity map the entire vram at 256GiB offset */
	if (IS_DGFX(xe)) {
		u64 pos, ofs, flags;

		level = 2;
		ofs = map_ofs + XE_PAGE_SIZE * level + 256 * 8;
		flags = vm->pt_ops->pte_encode_addr(xe, 0, pat_index, level,
						    true, 0);

		/*
		 * Use 1GB pages, it shouldn't matter the physical amount of
		 * vram is less, when we don't access it.
		 */
		for (pos = xe->mem.vram.dpa_base;
		     pos < xe->mem.vram.actual_physical_size + xe->mem.vram.dpa_base;
		     pos += SZ_1G, ofs += 8)
			xe_map_wr(xe, &bo->vmap, ofs, u64, pos | flags);
	}

	/*
	 * Example layout created above, with root level = 3:
	 * [PT0...PT7]: kernel PT's for copy/clear; 64 KiB or 4 KiB PTE's
	 * [PT8]: Kernel PT for VM_BIND, 4 KiB PTE's
	 * [PT9...PT28]: Userspace PT's for VM_BIND, 4 KiB PTE's
	 * [PT29 = PDE 0] [PT30 = PDE 1] [PT31 = PDE 2]
	 *
	 * This makes the lowest part of the VM point to the pagetables.
	 * Hence the lowest 2M in the vm points to itself. With a few writes
	 * and flushes, other parts of the VM can then be used for either
	 * copying or clearing.
	 *
	 * For performance, the kernel reserves PDE's, so about 20 are left
	 * for async VM updates.
	 *
	 * To make things easier, each scratch PT is put in slot (1 + PT #)
	 * everywhere; this allows lockless updates to scratch pages by using
	 * the different addresses in the VM.
	 */
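	/*
	 * Granularity of the suballocator below, assuming a 4 KiB
	 * XE_PAGE_SIZE: NUM_VMUSA_UNIT_PER_PAGE = 32 units per page-table
	 * page, so each unit (VM_SA_UPDATE_UNIT_SIZE) is 128 bytes and holds
	 * NUM_VMUSA_WRITES_PER_UNIT = 16 qword PTE writes for one pipelined
	 * page-table update.
	 */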
#define NUM_VMUSA_UNIT_PER_PAGE	32
#define VM_SA_UPDATE_UNIT_SIZE	(XE_PAGE_SIZE / NUM_VMUSA_UNIT_PER_PAGE)
#define NUM_VMUSA_WRITES_PER_UNIT	(VM_SA_UPDATE_UNIT_SIZE / sizeof(u64))
	drm_suballoc_manager_init(&m->vm_update_sa,
				  (map_ofs / XE_PAGE_SIZE - NUM_KERNEL_PDE) *
				  NUM_VMUSA_UNIT_PER_PAGE, 0);

	m->pt_bo = bo;
	return 0;
}

/*
 * Due to workaround 16017236439, odd instance hardware copy engines are
 * faster than even instance ones.
 * This function returns the mask of all fast copy engines plus the
 * reserved copy engine, to be used as the logical mask for the migrate
 * engine. Including the reserved copy engine is required to avoid
 * deadlocks where migrate jobs servicing the faults get stuck behind the
 * job that faulted.
 */
static u32 xe_migrate_usm_logical_mask(struct xe_gt *gt)
{
	u32 logical_mask = 0;
	struct xe_hw_engine *hwe;
	enum xe_hw_engine_id id;

	for_each_hw_engine(hwe, gt, id) {
		if (hwe->class != XE_ENGINE_CLASS_COPY)
			continue;

		if (!XE_WA(gt, 16017236439) ||
		    xe_gt_is_usm_hwe(gt, hwe) || hwe->instance & 1)
			logical_mask |= BIT(hwe->logical_instance);
	}

	return logical_mask;
}

/**
 * xe_migrate_init() - Initialize a migrate context
 * @tile: Back-pointer to the tile we're initializing for.
 *
 * Return: Pointer to a migrate context on success. Error pointer on error.
 */
struct xe_migrate *xe_migrate_init(struct xe_tile *tile)
{
	struct xe_device *xe = tile_to_xe(tile);
	struct xe_gt *primary_gt = tile->primary_gt;
	struct xe_migrate *m;
	struct xe_vm *vm;
	int err;

	m = drmm_kzalloc(&xe->drm, sizeof(*m), GFP_KERNEL);
	if (!m)
		return ERR_PTR(-ENOMEM);

	m->tile = tile;

	/* Special layout, prepared below.. */
	vm = xe_vm_create(xe, XE_VM_FLAG_MIGRATION |
			  XE_VM_FLAG_SET_TILE_ID(tile));
	if (IS_ERR(vm))
		return ERR_CAST(vm);

	xe_vm_lock(vm, false);
	err = xe_migrate_prepare_vm(tile, m, vm);
	xe_vm_unlock(vm);
	if (err) {
		xe_vm_close_and_put(vm);
		return ERR_PTR(err);
	}

	if (xe->info.supports_usm) {
		struct xe_hw_engine *hwe = xe_gt_hw_engine(primary_gt,
							   XE_ENGINE_CLASS_COPY,
							   primary_gt->usm.reserved_bcs_instance,
							   false);
		u32 logical_mask = xe_migrate_usm_logical_mask(primary_gt);

		if (!hwe || !logical_mask)
			return ERR_PTR(-EINVAL);

		m->q = xe_exec_queue_create(xe, vm, logical_mask, 1, hwe,
					    EXEC_QUEUE_FLAG_KERNEL |
					    EXEC_QUEUE_FLAG_PERMANENT);
	} else {
		m->q = xe_exec_queue_create_class(xe, primary_gt, vm,
						  XE_ENGINE_CLASS_COPY,
						  EXEC_QUEUE_FLAG_KERNEL |
						  EXEC_QUEUE_FLAG_PERMANENT);
	}
	if (IS_ERR(m->q)) {
		xe_vm_close_and_put(vm);
		return ERR_CAST(m->q);
	}
	if (xe->info.supports_usm)
		m->q->priority = XE_EXEC_QUEUE_PRIORITY_KERNEL;

	mutex_init(&m->job_mutex);

	err = drmm_add_action_or_reset(&xe->drm, xe_migrate_fini, m);
	if (err)
		return ERR_PTR(err);

	return m;
}

static void emit_arb_clear(struct xe_bb *bb)
{
	/* 1 dword */
	bb->cs[bb->len++] = MI_ARB_ON_OFF | MI_ARB_DISABLE;
}

static u64 xe_migrate_res_sizes(struct xe_res_cursor *cur)
{
	/*
	 * For VRAM we use identity mapped pages so we are limited to current
	 * cursor size. For system we program the pages ourselves so we have no
	 * such limitation.
	 */
	return min_t(u64, MAX_PREEMPTDISABLE_TRANSFER,
		     mem_type_is_vram(cur->mem_type) ? cur->size :
		     cur->remaining);
}

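/*
 * Illustrative dword budget from pte_update_size() below for a full 8 MiB
 * system-memory chunk: 2048 4 KiB PTEs are emitted as qwords (4096 dwords),
 * split across DIV_ROUND_UP(2048, 511) = 5 MI_STORE_DATA_IMM headers
 * (15 dwords), plus cmd_size for the blit command itself.
 */
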
static u32 pte_update_size(struct xe_migrate *m,
			   bool is_vram,
			   struct ttm_resource *res,
			   struct xe_res_cursor *cur,
			   u64 *L0, u64 *L0_ofs, u32 *L0_pt,
			   u32 cmd_size, u32 pt_ofs, u32 avail_pts)
{
	u32 cmds = 0;

	*L0_pt = pt_ofs;
	if (!is_vram) {
		/* Clip L0 to available size */
		u64 size = min(*L0, (u64)avail_pts * SZ_2M);
		u64 num_4k_pages = DIV_ROUND_UP(size, XE_PAGE_SIZE);

		*L0 = size;
		*L0_ofs = xe_migrate_vm_addr(pt_ofs, 0);

		/* MI_STORE_DATA_IMM */
		cmds += 3 * DIV_ROUND_UP(num_4k_pages, 0x1ff);

		/* PDE qwords */
		cmds += num_4k_pages * 2;

		/* Each chunk has a single blit command */
		cmds += cmd_size;
	} else {
		/* Offset into identity map. */
		*L0_ofs = xe_migrate_vram_ofs(tile_to_xe(m->tile),
					      cur->start + vram_region_gpu_offset(res));
		cmds += cmd_size;
	}

	return cmds;
}

static void emit_pte(struct xe_migrate *m,
		     struct xe_bb *bb, u32 at_pt,
		     bool is_vram,
		     struct xe_res_cursor *cur,
		     u32 size, struct xe_bo *bo)
{
	u16 pat_index = tile_to_xe(m->tile)->pat.idx[XE_CACHE_WB];
	u32 ptes;
	u64 ofs = at_pt * XE_PAGE_SIZE;
	u64 cur_ofs;

	/*
	 * FIXME: Emitting VRAM PTEs to L0 PTs is forbidden. Currently
	 * we're only emitting VRAM PTEs during sanity tests, so when
	 * that's moved to a Kunit test, we should condition VRAM PTEs
	 * on running tests.
	 */

	ptes = DIV_ROUND_UP(size, XE_PAGE_SIZE);

	while (ptes) {
		u32 chunk = min(0x1ffU, ptes);

		bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk);
		bb->cs[bb->len++] = ofs;
		bb->cs[bb->len++] = 0;

		cur_ofs = ofs;
		ofs += chunk * 8;
		ptes -= chunk;

		while (chunk--) {
			u64 addr, flags = 0;
			bool devmem = false;

			addr = xe_res_dma(cur) & PAGE_MASK;
			if (is_vram) {
				/* Is this a 64K PTE entry? */
				if ((m->q->vm->flags & XE_VM_FLAG_64K) &&
				    !(cur_ofs & (16 * 8 - 1))) {
					xe_tile_assert(m->tile, IS_ALIGNED(addr, SZ_64K));
					flags |= XE_PTE_PS64;
				}

				addr += vram_region_gpu_offset(bo->ttm.resource);
				devmem = true;
			}

			addr = m->q->vm->pt_ops->pte_encode_addr(m->tile->xe,
								 addr, pat_index,
								 0, devmem, flags);
			bb->cs[bb->len++] = lower_32_bits(addr);
			bb->cs[bb->len++] = upper_32_bits(addr);

			xe_res_next(cur, min_t(u32, size, PAGE_SIZE));
			cur_ofs += 8;
		}
	}
}

#define EMIT_COPY_CCS_DW 5
static void emit_copy_ccs(struct xe_gt *gt, struct xe_bb *bb,
			  u64 dst_ofs, bool dst_is_indirect,
			  u64 src_ofs, bool src_is_indirect,
			  u32 size)
{
	struct xe_device *xe = gt_to_xe(gt);
	u32 *cs = bb->cs + bb->len;
	u32 num_ccs_blks;
	u32 mocs;

	num_ccs_blks = DIV_ROUND_UP(xe_device_ccs_bytes(gt_to_xe(gt), size),
				    NUM_CCS_BYTES_PER_BLOCK);
	xe_gt_assert(gt, num_ccs_blks <= NUM_CCS_BLKS_PER_XFER);

	if (GRAPHICS_VERx100(xe) >= 2000)
		mocs = FIELD_PREP(XE2_XY_CTRL_SURF_MOCS_INDEX_MASK, gt->mocs.uc_index);
	else
		mocs = FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, gt->mocs.uc_index);

	*cs++ = XY_CTRL_SURF_COPY_BLT |
		(src_is_indirect ? 0x0 : 0x1) << SRC_ACCESS_TYPE_SHIFT |
		(dst_is_indirect ? 0x0 : 0x1) << DST_ACCESS_TYPE_SHIFT |
		((num_ccs_blks - 1) & CCS_SIZE_MASK) << CCS_SIZE_SHIFT;
	*cs++ = lower_32_bits(src_ofs);
	*cs++ = upper_32_bits(src_ofs) | mocs;
	*cs++ = lower_32_bits(dst_ofs);
	*cs++ = upper_32_bits(dst_ofs) | mocs;

	bb->len = cs - bb->cs;
}

#define EMIT_COPY_DW 10
static void emit_copy(struct xe_gt *gt, struct xe_bb *bb,
		      u64 src_ofs, u64 dst_ofs, unsigned int size,
		      unsigned int pitch)
{
	struct xe_device *xe = gt_to_xe(gt);
	u32 mocs = 0;
	u32 tile_y = 0;

	xe_gt_assert(gt, size / pitch <= S16_MAX);
	xe_gt_assert(gt, pitch / 4 <= S16_MAX);
	xe_gt_assert(gt, pitch <= U16_MAX);

	if (GRAPHICS_VER(xe) >= 20)
		mocs = FIELD_PREP(XE2_XY_FAST_COPY_BLT_MOCS_INDEX_MASK, gt->mocs.uc_index);

	if (GRAPHICS_VERx100(xe) >= 1250)
		tile_y = XY_FAST_COPY_BLT_D1_SRC_TILE4 | XY_FAST_COPY_BLT_D1_DST_TILE4;

	bb->cs[bb->len++] = XY_FAST_COPY_BLT_CMD | (10 - 2);
	bb->cs[bb->len++] = XY_FAST_COPY_BLT_DEPTH_32 | pitch | tile_y | mocs;
	bb->cs[bb->len++] = 0;
	bb->cs[bb->len++] = (size / pitch) << 16 | pitch / 4;
	bb->cs[bb->len++] = lower_32_bits(dst_ofs);
	bb->cs[bb->len++] = upper_32_bits(dst_ofs);
	bb->cs[bb->len++] = 0;
	bb->cs[bb->len++] = pitch | mocs;
	bb->cs[bb->len++] = lower_32_bits(src_ofs);
	bb->cs[bb->len++] = upper_32_bits(src_ofs);
}

static int job_add_deps(struct xe_sched_job *job, struct dma_resv *resv,
			enum dma_resv_usage usage)
{
	return drm_sched_job_add_resv_dependencies(&job->drm, resv, usage);
}

static u64 xe_migrate_batch_base(struct xe_migrate *m, bool usm)
{
	return usm ? m->usm_batch_base_ofs : m->batch_base_ofs;
}

static u32 xe_migrate_ccs_copy(struct xe_migrate *m,
			       struct xe_bb *bb,
			       u64 src_ofs, bool src_is_vram,
			       u64 dst_ofs, bool dst_is_vram, u32 dst_size,
			       u64 ccs_ofs, bool copy_ccs)
{
	struct xe_gt *gt = m->tile->primary_gt;
	u32 flush_flags = 0;

	if (xe_device_has_flat_ccs(gt_to_xe(gt)) && !copy_ccs && dst_is_vram) {
		/*
		 * If the src is already in vram, then it should already
		 * have been cleared by us, or has been populated by the
		 * user. Make sure we copy the CCS aux state as-is.
		 *
		 * Otherwise if the bo doesn't have any CCS metadata attached,
		 * we still need to clear it for security reasons.
		 */
		u64 ccs_src_ofs = src_is_vram ? src_ofs : m->cleared_vram_ofs;

		emit_copy_ccs(gt, bb,
			      dst_ofs, true,
			      ccs_src_ofs, src_is_vram, dst_size);

		flush_flags = MI_FLUSH_DW_CCS;
	} else if (copy_ccs) {
		if (!src_is_vram)
			src_ofs = ccs_ofs;
		else if (!dst_is_vram)
			dst_ofs = ccs_ofs;

		/*
		 * At the moment, we don't support copying CCS metadata from
		 * system to system.
		 */
		xe_gt_assert(gt, src_is_vram || dst_is_vram);

		emit_copy_ccs(gt, bb, dst_ofs, dst_is_vram, src_ofs,
			      src_is_vram, dst_size);
		if (dst_is_vram)
			flush_flags = MI_FLUSH_DW_CCS;
	}

	return flush_flags;
}

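/*
 * In short (as implemented above): with flat CCS, a VRAM destination always
 * has its CCS state written, either copied from the identity-mapped VRAM
 * source or taken from the pre-cleared scratch bo, so stale compression
 * metadata is never left behind. An explicit CCS copy instead routes the
 * system-memory side through the separately mapped ccs_ofs pages.
 */
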
/**
 * xe_migrate_copy() - Copy content of TTM resources.
 * @m: The migration context.
 * @src_bo: The buffer object @src is currently bound to.
 * @dst_bo: If copying between resources created for the same bo, set this to
 * the same value as @src_bo. If copying between buffer objects, set it to
 * the buffer object @dst is currently bound to.
 * @src: The source TTM resource.
 * @dst: The dst TTM resource.
 *
 * Copies the contents of @src to @dst. On flat CCS devices,
 * the CCS metadata is copied as well if needed, or if not present,
 * the CCS metadata of @dst is cleared for security reasons.
 *
 * Return: Pointer to a dma_fence representing the last copy batch, or
 * an error pointer on failure. If there is a failure, any copy operation
 * started by the function call has been synced.
 */
struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
				  struct xe_bo *src_bo,
				  struct xe_bo *dst_bo,
				  struct ttm_resource *src,
				  struct ttm_resource *dst)
{
	struct xe_gt *gt = m->tile->primary_gt;
	struct xe_device *xe = gt_to_xe(gt);
	struct dma_fence *fence = NULL;
	u64 size = src_bo->size;
	struct xe_res_cursor src_it, dst_it, ccs_it;
	u64 src_L0_ofs, dst_L0_ofs;
	u32 src_L0_pt, dst_L0_pt;
	u64 src_L0, dst_L0;
	int pass = 0;
	int err;
	bool src_is_vram = mem_type_is_vram(src->mem_type);
	bool dst_is_vram = mem_type_is_vram(dst->mem_type);
	bool copy_ccs = xe_device_has_flat_ccs(xe) &&
		xe_bo_needs_ccs_pages(src_bo) && xe_bo_needs_ccs_pages(dst_bo);
	bool copy_system_ccs = copy_ccs && (!src_is_vram || !dst_is_vram);

	/* Copying CCS between two different BOs is not supported yet. */
	if (XE_WARN_ON(copy_ccs && src_bo != dst_bo))
		return ERR_PTR(-EINVAL);

	if (src_bo != dst_bo && XE_WARN_ON(src_bo->size != dst_bo->size))
		return ERR_PTR(-EINVAL);

	if (!src_is_vram)
		xe_res_first_sg(xe_bo_get_sg(src_bo), 0, size, &src_it);
	else
		xe_res_first(src, 0, size, &src_it);
	if (!dst_is_vram)
		xe_res_first_sg(xe_bo_get_sg(dst_bo), 0, size, &dst_it);
	else
		xe_res_first(dst, 0, size, &dst_it);

	if (copy_system_ccs)
		xe_res_first_sg(xe_bo_get_sg(src_bo), xe_bo_ccs_pages_start(src_bo),
				PAGE_ALIGN(xe_device_ccs_bytes(xe, size)),
				&ccs_it);

	while (size) {
		u32 batch_size = 2; /* arb_clear() + MI_BATCH_BUFFER_END */
		struct xe_sched_job *job;
		struct xe_bb *bb;
		u32 flush_flags;
		u32 update_idx;
		u64 ccs_ofs, ccs_size;
		u32 ccs_pt;
		bool usm = xe->info.supports_usm;

		src_L0 = xe_migrate_res_sizes(&src_it);
		dst_L0 = xe_migrate_res_sizes(&dst_it);

		drm_dbg(&xe->drm, "Pass %u, sizes: %llu & %llu\n",
			pass++, src_L0, dst_L0);

		src_L0 = min(src_L0, dst_L0);

		batch_size += pte_update_size(m, src_is_vram, src, &src_it, &src_L0,
					      &src_L0_ofs, &src_L0_pt, 0, 0,
					      NUM_PT_PER_BLIT);

		batch_size += pte_update_size(m, dst_is_vram, dst, &dst_it, &src_L0,
					      &dst_L0_ofs, &dst_L0_pt, 0,
					      NUM_PT_PER_BLIT, NUM_PT_PER_BLIT);

		if (copy_system_ccs) {
			ccs_size = xe_device_ccs_bytes(xe, src_L0);
			batch_size += pte_update_size(m, false, NULL, &ccs_it, &ccs_size,
						      &ccs_ofs, &ccs_pt, 0,
						      2 * NUM_PT_PER_BLIT,
						      NUM_PT_PER_BLIT);
		}

		/* Add copy commands size here */
		batch_size += EMIT_COPY_DW +
			(xe_device_has_flat_ccs(xe) ? EMIT_COPY_CCS_DW : 0);

		bb = xe_bb_new(gt, batch_size, usm);
		if (IS_ERR(bb)) {
			err = PTR_ERR(bb);
			goto err_sync;
		}

		/* Preemption is enabled again by the ring ops. */
		if (!src_is_vram || !dst_is_vram)
			emit_arb_clear(bb);

		if (!src_is_vram)
			emit_pte(m, bb, src_L0_pt, src_is_vram, &src_it, src_L0,
				 src_bo);
		else
			xe_res_next(&src_it, src_L0);

		if (!dst_is_vram)
			emit_pte(m, bb, dst_L0_pt, dst_is_vram, &dst_it, src_L0,
				 dst_bo);
		else
			xe_res_next(&dst_it, src_L0);

		if (copy_system_ccs)
			emit_pte(m, bb, ccs_pt, false, &ccs_it, ccs_size, src_bo);

		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
		update_idx = bb->len;

		emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, src_L0,
			  XE_PAGE_SIZE);
		flush_flags = xe_migrate_ccs_copy(m, bb, src_L0_ofs, src_is_vram,
						  dst_L0_ofs, dst_is_vram,
						  src_L0, ccs_ofs, copy_ccs);

		mutex_lock(&m->job_mutex);
		job = xe_bb_create_migration_job(m->q, bb,
						 xe_migrate_batch_base(m, usm),
						 update_idx);
		if (IS_ERR(job)) {
			err = PTR_ERR(job);
			goto err;
		}

		xe_sched_job_add_migrate_flush(job, flush_flags);
		if (!fence) {
			err = job_add_deps(job, src_bo->ttm.base.resv,
					   DMA_RESV_USAGE_BOOKKEEP);
			if (!err && src_bo != dst_bo)
				err = job_add_deps(job, dst_bo->ttm.base.resv,
						   DMA_RESV_USAGE_BOOKKEEP);
			if (err)
				goto err_job;
		}

		xe_sched_job_arm(job);
		dma_fence_put(fence);
		fence = dma_fence_get(&job->drm.s_fence->finished);
		xe_sched_job_push(job);

		dma_fence_put(m->fence);
		m->fence = dma_fence_get(fence);

		mutex_unlock(&m->job_mutex);

		xe_bb_free(bb, fence);
		size -= src_L0;
		continue;

err_job:
		xe_sched_job_put(job);
err:
		mutex_unlock(&m->job_mutex);
		xe_bb_free(bb, NULL);

err_sync:
		/* Sync partial copy if any. FIXME: under job_mutex? */
		if (fence) {
			dma_fence_wait(fence, false);
			dma_fence_put(fence);
		}

		return ERR_PTR(err);
	}

	return fence;
}

static void emit_clear_link_copy(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs,
				 u32 size, u32 pitch)
{
	struct xe_device *xe = gt_to_xe(gt);
	u32 *cs = bb->cs + bb->len;
	u32 len = PVC_MEM_SET_CMD_LEN_DW;

	*cs++ = PVC_MEM_SET_CMD | PVC_MEM_SET_MATRIX | (len - 2);
	*cs++ = pitch - 1;
	*cs++ = (size / pitch) - 1;
	*cs++ = pitch - 1;
	*cs++ = lower_32_bits(src_ofs);
	*cs++ = upper_32_bits(src_ofs);
	if (GRAPHICS_VERx100(xe) >= 2000)
		*cs++ = FIELD_PREP(XE2_MEM_SET_MOCS_INDEX_MASK, gt->mocs.uc_index);
	else
		*cs++ = FIELD_PREP(PVC_MEM_SET_MOCS_INDEX_MASK, gt->mocs.uc_index);

	xe_gt_assert(gt, cs - bb->cs == len + bb->len);

	bb->len += len;
}

static void emit_clear_main_copy(struct xe_gt *gt, struct xe_bb *bb,
				 u64 src_ofs, u32 size, u32 pitch, bool is_vram)
{
	struct xe_device *xe = gt_to_xe(gt);
	u32 *cs = bb->cs + bb->len;
	u32 len = XY_FAST_COLOR_BLT_DW;

	if (GRAPHICS_VERx100(xe) < 1250)
		len = 11;

	*cs++ = XY_FAST_COLOR_BLT_CMD | XY_FAST_COLOR_BLT_DEPTH_32 |
		(len - 2);
	if (GRAPHICS_VERx100(xe) >= 2000)
		*cs++ = FIELD_PREP(XE2_XY_FAST_COLOR_BLT_MOCS_INDEX_MASK, gt->mocs.uc_index) |
			(pitch - 1);
	else
		*cs++ = FIELD_PREP(XY_FAST_COLOR_BLT_MOCS_MASK, gt->mocs.uc_index) |
			(pitch - 1);
	*cs++ = 0;
	*cs++ = (size / pitch) << 16 | pitch / 4;
	*cs++ = lower_32_bits(src_ofs);
	*cs++ = upper_32_bits(src_ofs);
	*cs++ = (is_vram ? 0x0 : 0x1) << XY_FAST_COLOR_BLT_MEM_TYPE_SHIFT;
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = 0;

	if (len > 11) {
		*cs++ = 0;
		*cs++ = 0;
		*cs++ = 0;
		*cs++ = 0;
		*cs++ = 0;
	}

	xe_gt_assert(gt, cs - bb->cs == len + bb->len);

	bb->len += len;
}

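/*
 * Note on the two lengths used above: the base XY_FAST_COLOR_BLT packet is
 * 11 dwords, while graphics version 12.50+ uses the full
 * XY_FAST_COLOR_BLT_DW length (the assert implies 16), hence the five
 * extra zero dwords appended when len > 11.
 */
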
static bool has_service_copy_support(struct xe_gt *gt)
{
	/*
	 * What we care about is whether the architecture was designed with
	 * service copy functionality (specifically the new MEM_SET / MEM_COPY
	 * instructions) so check the architectural engine list rather than the
	 * actual list since these instructions are usable on BCS0 even if
	 * all of the actual service copy engines (BCS1-BCS8) have been fused
	 * off.
	 */
	return gt->info.__engine_mask & GENMASK(XE_HW_ENGINE_BCS8,
						XE_HW_ENGINE_BCS1);
}

static u32 emit_clear_cmd_len(struct xe_gt *gt)
{
	if (has_service_copy_support(gt))
		return PVC_MEM_SET_CMD_LEN_DW;
	else
		return XY_FAST_COLOR_BLT_DW;
}

static void emit_clear(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs,
		       u32 size, u32 pitch, bool is_vram)
{
	if (has_service_copy_support(gt))
		emit_clear_link_copy(gt, bb, src_ofs, size, pitch);
	else
		emit_clear_main_copy(gt, bb, src_ofs, size, pitch,
				     is_vram);
}

/**
 * xe_migrate_clear() - Clear content of a TTM resource.
 * @m: The migration context.
 * @bo: The buffer object @dst is currently bound to.
 * @dst: The dst TTM resource to be cleared.
 *
 * Clear the contents of @dst to zero. On flat CCS devices,
 * the CCS metadata is cleared to zero as well on VRAM destinations.
 * TODO: Eliminate the @bo argument.
 *
 * Return: Pointer to a dma_fence representing the last clear batch, or
 * an error pointer on failure. If there is a failure, any clear operation
 * started by the function call has been synced.
 */
struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
				   struct xe_bo *bo,
				   struct ttm_resource *dst)
{
	bool clear_vram = mem_type_is_vram(dst->mem_type);
	struct xe_gt *gt = m->tile->primary_gt;
	struct xe_device *xe = gt_to_xe(gt);
	struct dma_fence *fence = NULL;
	u64 size = bo->size;
	struct xe_res_cursor src_it;
	struct ttm_resource *src = dst;
	int err;
	int pass = 0;

	if (!clear_vram)
		xe_res_first_sg(xe_bo_get_sg(bo), 0, bo->size, &src_it);
	else
		xe_res_first(src, 0, bo->size, &src_it);

	while (size) {
		u64 clear_L0_ofs;
		u32 clear_L0_pt;
		u32 flush_flags = 0;
		u64 clear_L0;
		struct xe_sched_job *job;
		struct xe_bb *bb;
		u32 batch_size, update_idx;
		bool usm = xe->info.supports_usm;

		clear_L0 = xe_migrate_res_sizes(&src_it);
		drm_dbg(&xe->drm, "Pass %u, size: %llu\n", pass++, clear_L0);

		/* Calculate final sizes and batch size.. */
		batch_size = 2 +
			pte_update_size(m, clear_vram, src, &src_it,
					&clear_L0, &clear_L0_ofs, &clear_L0_pt,
					emit_clear_cmd_len(gt), 0,
					NUM_PT_PER_BLIT);
		if (xe_device_has_flat_ccs(xe) && clear_vram)
			batch_size += EMIT_COPY_CCS_DW;

		/* Clear commands */

		if (WARN_ON_ONCE(!clear_L0))
			break;

		bb = xe_bb_new(gt, batch_size, usm);
		if (IS_ERR(bb)) {
			err = PTR_ERR(bb);
			goto err_sync;
		}

		size -= clear_L0;

		/* TODO: Add dependencies here */

		/* Preemption is enabled again by the ring ops. */
		if (!clear_vram) {
			emit_arb_clear(bb);
			emit_pte(m, bb, clear_L0_pt, clear_vram, &src_it, clear_L0,
				 bo);
		} else {
			xe_res_next(&src_it, clear_L0);
		}
		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
		update_idx = bb->len;

		emit_clear(gt, bb, clear_L0_ofs, clear_L0, XE_PAGE_SIZE,
			   clear_vram);
		if (xe_device_has_flat_ccs(xe) && clear_vram) {
			emit_copy_ccs(gt, bb, clear_L0_ofs, true,
				      m->cleared_vram_ofs, false, clear_L0);
			flush_flags = MI_FLUSH_DW_CCS;
		}

		mutex_lock(&m->job_mutex);
		job = xe_bb_create_migration_job(m->q, bb,
						 xe_migrate_batch_base(m, usm),
						 update_idx);
		if (IS_ERR(job)) {
			err = PTR_ERR(job);
			goto err;
		}

		xe_sched_job_add_migrate_flush(job, flush_flags);

		xe_sched_job_arm(job);
		dma_fence_put(fence);
		fence = dma_fence_get(&job->drm.s_fence->finished);
		xe_sched_job_push(job);

		dma_fence_put(m->fence);
		m->fence = dma_fence_get(fence);

		mutex_unlock(&m->job_mutex);

		xe_bb_free(bb, fence);
		continue;

err:
		mutex_unlock(&m->job_mutex);
		xe_bb_free(bb, NULL);
err_sync:
		/* Sync partial copies if any. FIXME: job_mutex? */
		if (fence) {
			dma_fence_wait(m->fence, false);
			dma_fence_put(fence);
		}

		return ERR_PTR(err);
	}

	return fence;
}

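/*
 * As the loop above shows, a flat-CCS VRAM clear is a two-step operation:
 * the main surface is cleared first, then its CCS state is overwritten from
 * the pre-cleared scratch bo at m->cleared_vram_ofs, so data and
 * compression metadata both end up zeroed.
 */
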
static void write_pgtable(struct xe_tile *tile, struct xe_bb *bb, u64 ppgtt_ofs,
			  const struct xe_vm_pgtable_update *update,
			  struct xe_migrate_pt_update *pt_update)
{
	const struct xe_migrate_pt_update_ops *ops = pt_update->ops;
	u32 chunk;
	u32 ofs = update->ofs, size = update->qwords;

	/*
	 * If we have 512 entries (max), we would populate it ourselves,
	 * and update the PDE above it to the new pointer.
	 * The only time this can happen is if we have to update the top
	 * PDE. This requires a BO that is almost vm->size big.
	 *
	 * This shouldn't be possible in practice.. might change when 16K
	 * pages are used. Hence the assert.
	 */
	xe_tile_assert(tile, update->qwords <= 0x1ff);
	if (!ppgtt_ofs)
		ppgtt_ofs = xe_migrate_vram_ofs(tile_to_xe(tile),
						xe_bo_addr(update->pt_bo, 0,
							   XE_PAGE_SIZE));

	do {
		u64 addr = ppgtt_ofs + ofs * 8;

		chunk = min(update->qwords, 0x1ffU);

		/* Ensure populatefn can do memset64 by aligning bb->cs */
		if (!(bb->len & 1))
			bb->cs[bb->len++] = MI_NOOP;

		bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk);
		bb->cs[bb->len++] = lower_32_bits(addr);
		bb->cs[bb->len++] = upper_32_bits(addr);
		ops->populate(pt_update, tile, NULL, bb->cs + bb->len, ofs, chunk,
			      update);

		bb->len += chunk * 2;
		ofs += chunk;
		size -= chunk;
	} while (size);
}

struct xe_vm *xe_migrate_get_vm(struct xe_migrate *m)
{
	return xe_vm_get(m->q->vm);
}

#if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST)
struct migrate_test_params {
	struct xe_test_priv base;
	bool force_gpu;
};

#define to_migrate_test_params(_priv) \
	container_of(_priv, struct migrate_test_params, base)
#endif

static struct dma_fence *
xe_migrate_update_pgtables_cpu(struct xe_migrate *m,
			       struct xe_vm *vm, struct xe_bo *bo,
			       const struct xe_vm_pgtable_update *updates,
			       u32 num_updates, bool wait_vm,
			       struct xe_migrate_pt_update *pt_update)
{
	XE_TEST_DECLARE(struct migrate_test_params *test =
				to_migrate_test_params
				(xe_cur_kunit_priv(XE_TEST_LIVE_MIGRATE));)
	const struct xe_migrate_pt_update_ops *ops = pt_update->ops;
	struct dma_fence *fence;
	int err;
	u32 i;

	if (XE_TEST_ONLY(test && test->force_gpu))
		return ERR_PTR(-ETIME);

	if (bo && !dma_resv_test_signaled(bo->ttm.base.resv,
					  DMA_RESV_USAGE_KERNEL))
		return ERR_PTR(-ETIME);

	if (wait_vm && !dma_resv_test_signaled(xe_vm_resv(vm),
					       DMA_RESV_USAGE_BOOKKEEP))
		return ERR_PTR(-ETIME);

	if (ops->pre_commit) {
		pt_update->job = NULL;
		err = ops->pre_commit(pt_update);
		if (err)
			return ERR_PTR(err);
	}
	for (i = 0; i < num_updates; i++) {
		const struct xe_vm_pgtable_update *update = &updates[i];

		ops->populate(pt_update, m->tile, &update->pt_bo->vmap, NULL,
			      update->ofs, update->qwords, update);
	}

	if (vm) {
		trace_xe_vm_cpu_bind(vm);
		xe_device_wmb(vm->xe);
	}

	fence = dma_fence_get_stub();

	return fence;
}

static bool no_in_syncs(struct xe_sync_entry *syncs, u32 num_syncs)
{
	int i;

	for (i = 0; i < num_syncs; i++) {
		struct dma_fence *fence = syncs[i].fence;

		if (fence && !test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
				       &fence->flags))
			return false;
	}

	return true;
}

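/*
 * Taken together with xe_migrate_update_pgtables_cpu() above: when none of
 * the input syncs are still pending and the target queue is idle, the
 * update is applied directly by the CPU and a signaled stub fence is
 * returned. If the relevant reservations are still busy, the CPU path
 * bails with -ETIME and the GPU path below is used instead.
 */
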
/**
 * xe_migrate_update_pgtables() - Pipelined page-table update
 * @m: The migrate context.
 * @vm: The vm we'll be updating.
 * @bo: The bo whose dma-resv we will await before updating, or NULL if userptr.
 * @q: The exec queue to be used for the update or NULL if the default
 * migration engine is to be used.
 * @updates: An array of update descriptors.
 * @num_updates: Number of descriptors in @updates.
 * @syncs: Array of xe_sync_entry to await before updating. Note that waits
 * will block the engine timeline.
 * @num_syncs: Number of entries in @syncs.
 * @pt_update: Pointer to a struct xe_migrate_pt_update, which contains
 * pointers to callback functions and, if subclassed, private arguments to
 * those.
 *
 * Perform a pipelined page-table update. The update descriptors are typically
 * built under the same lock critical section as a call to this function. If
 * using the default engine for the updates, they will be performed in the
 * order they grab the job_mutex. If different engines are used, external
 * synchronization is needed for overlapping updates to maintain page-table
 * consistency. Note that the meaning of "overlapping" is that the updates
 * touch the same page-table, which might be a higher-level page-directory.
 * If no pipelining is needed, then updates may be performed by the cpu.
 *
 * Return: A dma_fence that, when signaled, indicates the update completion.
 */
struct dma_fence *
xe_migrate_update_pgtables(struct xe_migrate *m,
			   struct xe_vm *vm,
			   struct xe_bo *bo,
			   struct xe_exec_queue *q,
			   const struct xe_vm_pgtable_update *updates,
			   u32 num_updates,
			   struct xe_sync_entry *syncs, u32 num_syncs,
			   struct xe_migrate_pt_update *pt_update)
{
	const struct xe_migrate_pt_update_ops *ops = pt_update->ops;
	struct xe_tile *tile = m->tile;
	struct xe_gt *gt = tile->primary_gt;
	struct xe_device *xe = tile_to_xe(tile);
	struct xe_sched_job *job;
	struct dma_fence *fence;
	struct drm_suballoc *sa_bo = NULL;
	struct xe_vma *vma = pt_update->vma;
	struct xe_bb *bb;
	u32 i, batch_size, ppgtt_ofs, update_idx, page_ofs = 0;
	u64 addr;
	int err = 0;
	bool usm = !q && xe->info.supports_usm;
	bool first_munmap_rebind = vma &&
		vma->gpuva.flags & XE_VMA_FIRST_REBIND;
	struct xe_exec_queue *q_override = !q ? m->q : q;
	u16 pat_index = xe->pat.idx[XE_CACHE_WB];

	/* Use the CPU if no in syncs and engine is idle */
	if (no_in_syncs(syncs, num_syncs) && xe_exec_queue_is_idle(q_override)) {
		fence = xe_migrate_update_pgtables_cpu(m, vm, bo, updates,
						       num_updates,
						       first_munmap_rebind,
						       pt_update);
		if (!IS_ERR(fence) || fence == ERR_PTR(-EAGAIN))
			return fence;
	}

	/* fixed + PTE entries */
	if (IS_DGFX(xe))
		batch_size = 2;
	else
		batch_size = 6 + num_updates * 2;

	for (i = 0; i < num_updates; i++) {
		u32 num_cmds = DIV_ROUND_UP(updates[i].qwords, 0x1ff);

		/* align noop + MI_STORE_DATA_IMM cmd prefix */
		batch_size += 4 * num_cmds + updates[i].qwords * 2;
	}

	/*
	 * XXX: Create temp bo to copy from, if batch_size becomes too big?
	 *
	 * Worst case: Sum(2 * (each lower level page size) + (top level page size))
	 * Should be reasonably bound..
	 */
	xe_tile_assert(tile, batch_size < SZ_128K);

	bb = xe_bb_new(gt, batch_size, !q && xe->info.supports_usm);
	if (IS_ERR(bb))
		return ERR_CAST(bb);

	/* For sysmem PTE's, need to map them in our hole.. */
	if (!IS_DGFX(xe)) {
		ppgtt_ofs = NUM_KERNEL_PDE - 1;
		if (q) {
			xe_tile_assert(tile, num_updates <= NUM_VMUSA_WRITES_PER_UNIT);

			sa_bo = drm_suballoc_new(&m->vm_update_sa, 1,
						 GFP_KERNEL, true, 0);
			if (IS_ERR(sa_bo)) {
				err = PTR_ERR(sa_bo);
				goto err;
			}

			ppgtt_ofs = NUM_KERNEL_PDE +
				(drm_suballoc_soffset(sa_bo) /
				 NUM_VMUSA_UNIT_PER_PAGE);
			page_ofs = (drm_suballoc_soffset(sa_bo) %
				    NUM_VMUSA_UNIT_PER_PAGE) *
				VM_SA_UPDATE_UNIT_SIZE;
		}

		/* Preemption is enabled again by the ring ops. */
		emit_arb_clear(bb);

		/* Map our PT's to gtt */
		bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(num_updates);
		bb->cs[bb->len++] = ppgtt_ofs * XE_PAGE_SIZE + page_ofs;
		bb->cs[bb->len++] = 0; /* upper_32_bits */

		for (i = 0; i < num_updates; i++) {
			struct xe_bo *pt_bo = updates[i].pt_bo;

			xe_tile_assert(tile, pt_bo->size == SZ_4K);

			addr = vm->pt_ops->pte_encode_bo(pt_bo, 0, pat_index, 0);
			bb->cs[bb->len++] = lower_32_bits(addr);
			bb->cs[bb->len++] = upper_32_bits(addr);
		}

		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
		update_idx = bb->len;

		addr = xe_migrate_vm_addr(ppgtt_ofs, 0) +
			(page_ofs / sizeof(u64)) * XE_PAGE_SIZE;
		for (i = 0; i < num_updates; i++)
			write_pgtable(tile, bb, addr + i * XE_PAGE_SIZE,
				      &updates[i], pt_update);
	} else {
		/* phys pages, no preamble required */
		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
		update_idx = bb->len;

		/* Preemption is enabled again by the ring ops. */
		emit_arb_clear(bb);
		for (i = 0; i < num_updates; i++)
			write_pgtable(tile, bb, 0, &updates[i], pt_update);
	}

	if (!q)
		mutex_lock(&m->job_mutex);

	job = xe_bb_create_migration_job(q ?: m->q, bb,
					 xe_migrate_batch_base(m, usm),
					 update_idx);
	if (IS_ERR(job)) {
		err = PTR_ERR(job);
		goto err_bb;
	}

	/* Wait on BO move */
	if (bo) {
		err = job_add_deps(job, bo->ttm.base.resv,
				   DMA_RESV_USAGE_KERNEL);
		if (err)
			goto err_job;
	}

	/*
	 * Munmap style VM unbind, need to wait for all jobs to be complete /
	 * trigger preempts before moving forward
	 */
	if (first_munmap_rebind) {
		err = job_add_deps(job, xe_vm_resv(vm),
				   DMA_RESV_USAGE_BOOKKEEP);
		if (err)
			goto err_job;
	}

	for (i = 0; !err && i < num_syncs; i++)
		err = xe_sync_entry_add_deps(&syncs[i], job);

	if (err)
		goto err_job;

	if (ops->pre_commit) {
		pt_update->job = job;
		err = ops->pre_commit(pt_update);
		if (err)
			goto err_job;
	}
	xe_sched_job_arm(job);
	fence = dma_fence_get(&job->drm.s_fence->finished);
	xe_sched_job_push(job);

	if (!q)
		mutex_unlock(&m->job_mutex);

	xe_bb_free(bb, fence);
	drm_suballoc_free(sa_bo, fence);

	return fence;

err_job:
	xe_sched_job_put(job);
err_bb:
	if (!q)
		mutex_unlock(&m->job_mutex);
	xe_bb_free(bb, NULL);
err:
	drm_suballoc_free(sa_bo, NULL);
	return ERR_PTR(err);
}

/**
 * xe_migrate_wait() - Complete all operations using the xe_migrate context
 * @m: Migrate context to wait for.
 *
 * Waits until the GPU no longer uses the migrate context's default engine
 * or its page-table objects. FIXME: What about separate page-table update
 * engines?
 */
void xe_migrate_wait(struct xe_migrate *m)
{
	if (m->fence)
		dma_fence_wait(m->fence, false);
}

#if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST)
#include "tests/xe_migrate.c"
#endif