// SPDX-License-Identifier: MIT
/*
 * Copyright © 2020 Intel Corporation
 */

#include "xe_migrate.h"

#include <linux/bitfield.h>
#include <linux/sizes.h>

#include <drm/drm_managed.h>
#include <drm/ttm/ttm_tt.h>
#include <drm/xe_drm.h>

#include "generated/xe_wa_oob.h"
#include "instructions/xe_mi_commands.h"
#include "regs/xe_gpu_commands.h"
#include "tests/xe_test.h"
#include "xe_assert.h"
#include "xe_bb.h"
#include "xe_bo.h"
#include "xe_exec_queue.h"
#include "xe_ggtt.h"
#include "xe_gt.h"
#include "xe_hw_engine.h"
#include "xe_lrc.h"
#include "xe_map.h"
#include "xe_mocs.h"
#include "xe_pt.h"
#include "xe_res_cursor.h"
#include "xe_sched_job.h"
#include "xe_sync.h"
#include "xe_trace.h"
#include "xe_vm.h"
#include "xe_wa.h"

/**
 * struct xe_migrate - migrate context.
 */
struct xe_migrate {
	/** @q: Default exec queue used for migration */
	struct xe_exec_queue *q;
	/** @tile: Backpointer to the tile this struct xe_migrate belongs to. */
	struct xe_tile *tile;
	/** @job_mutex: Timeline mutex for @q. */
	struct mutex job_mutex;
	/** @pt_bo: Page-table buffer object. */
	struct xe_bo *pt_bo;
	/** @batch_base_ofs: VM offset of the migration batch buffer */
	u64 batch_base_ofs;
	/** @usm_batch_base_ofs: VM offset of the usm batch buffer */
	u64 usm_batch_base_ofs;
	/** @cleared_mem_ofs: VM offset of the NULL-mapped region used as a zero source for CCS clears. */
	u64 cleared_mem_ofs;
	/**
	 * @fence: dma-fence representing the last migration job batch.
	 * Protected by @job_mutex.
	 */
	struct dma_fence *fence;
	/**
	 * @vm_update_sa: For integrated, used to suballocate page-tables
	 * out of the pt_bo.
	 */
	struct drm_suballoc_manager vm_update_sa;
	/** @min_chunk_size: For dgfx, minimum chunk size */
	u64 min_chunk_size;
};

#define MAX_PREEMPTDISABLE_TRANSFER SZ_8M /* Around 1ms. */
#define MAX_CCS_LIMITED_TRANSFER SZ_4M /* XE_PAGE_SIZE * (FIELD_MAX(XE2_CCS_SIZE_MASK) + 1) */
#define NUM_KERNEL_PDE 17
#define NUM_PT_SLOTS 32
#define LEVEL0_PAGE_TABLE_ENCODE_SIZE SZ_2M

/**
 * xe_tile_migrate_engine() - Get this tile's migrate engine.
 * @tile: The tile.
 *
 * Returns the default migrate engine of this tile.
 * TODO: Perhaps this function is slightly misplaced, and even unneeded?
 *
 * Return: The default migrate engine
 */
struct xe_exec_queue *xe_tile_migrate_engine(struct xe_tile *tile)
{
	return tile->migrate->q;
}

static void xe_migrate_fini(struct drm_device *dev, void *arg)
{
	struct xe_migrate *m = arg;

	xe_vm_lock(m->q->vm, false);
	xe_bo_unpin(m->pt_bo);
	xe_vm_unlock(m->q->vm);

	dma_fence_put(m->fence);
	xe_bo_put(m->pt_bo);
	drm_suballoc_manager_fini(&m->vm_update_sa);
	mutex_destroy(&m->job_mutex);
	xe_vm_close_and_put(m->q->vm);
	xe_exec_queue_put(m->q);
}

static u64 xe_migrate_vm_addr(u64 slot, u32 level)
{
	XE_WARN_ON(slot >= NUM_PT_SLOTS);

	/* First slot is reserved for mapping of PT bo and bb, start from 1 */
	return (slot + 1ULL) << xe_pt_shift(level + 1);
}

static u64 xe_migrate_vram_ofs(struct xe_device *xe, u64 addr)
{
	/*
	 * Subtract the DPA base to get the correct offset into the identity
	 * table for the migrate offset.
	 */
	addr -= xe->mem.vram.dpa_base;
	return addr + (256ULL << xe_pt_shift(2));
}

static int xe_migrate_prepare_vm(struct xe_tile *tile, struct xe_migrate *m,
				 struct xe_vm *vm)
{
	struct xe_device *xe = tile_to_xe(tile);
	u16 pat_index = xe->pat.idx[XE_CACHE_WB];
	u8 id = tile->id;
	u32 num_entries = NUM_PT_SLOTS, num_level = vm->pt_root[id]->level;
	u32 map_ofs, level, i;
	struct xe_bo *bo, *batch = tile->mem.kernel_bb_pool->bo;
	u64 entry;

	/* Can't bump NUM_PT_SLOTS too high */
	BUILD_BUG_ON(NUM_PT_SLOTS > SZ_2M/XE_PAGE_SIZE);
	/* Must be a multiple of 64K to support all platforms */
	BUILD_BUG_ON(NUM_PT_SLOTS * XE_PAGE_SIZE % SZ_64K);
	/* And one slot reserved for the 4KiB page table updates */
	BUILD_BUG_ON(!(NUM_KERNEL_PDE & 1));

	/* Need to be sure everything fits in the first PT, or create more */
	xe_tile_assert(tile, m->batch_base_ofs + batch->size < SZ_2M);

	bo = xe_bo_create_pin_map(vm->xe, tile, vm,
				  num_entries * XE_PAGE_SIZE,
				  ttm_bo_type_kernel,
				  XE_BO_CREATE_VRAM_IF_DGFX(tile) |
				  XE_BO_CREATE_PINNED_BIT);
	if (IS_ERR(bo))
		return PTR_ERR(bo);

	entry = vm->pt_ops->pde_encode_bo(bo, bo->size - XE_PAGE_SIZE, pat_index);
	xe_pt_write(xe, &vm->pt_root[id]->bo->vmap, 0, entry);

	map_ofs = (num_entries - num_level) * XE_PAGE_SIZE;

	/* Map the entire BO in our level 0 pt */
	for (i = 0, level = 0; i < num_entries; level++) {
		entry = vm->pt_ops->pte_encode_bo(bo, i * XE_PAGE_SIZE,
						  pat_index, 0);

		xe_map_wr(xe, &bo->vmap, map_ofs + level * 8, u64, entry);

		if (vm->flags & XE_VM_FLAG_64K)
			i += 16;
		else
			i += 1;
	}

	if (!IS_DGFX(xe)) {
		/* Write out batch too */
		m->batch_base_ofs = NUM_PT_SLOTS * XE_PAGE_SIZE;
		for (i = 0; i < batch->size;
		     i += vm->flags & XE_VM_FLAG_64K ? XE_64K_PAGE_SIZE :
		     XE_PAGE_SIZE) {
			entry = vm->pt_ops->pte_encode_bo(batch, i,
							  pat_index, 0);

			xe_map_wr(xe, &bo->vmap, map_ofs + level * 8, u64,
				  entry);
			level++;
		}
		if (xe->info.has_usm) {
			xe_tile_assert(tile, batch->size == SZ_1M);

			batch = tile->primary_gt->usm.bb_pool->bo;
			m->usm_batch_base_ofs = m->batch_base_ofs + SZ_1M;
			xe_tile_assert(tile, batch->size == SZ_512K);

			for (i = 0; i < batch->size;
			     i += vm->flags & XE_VM_FLAG_64K ? XE_64K_PAGE_SIZE :
			     XE_PAGE_SIZE) {
				entry = vm->pt_ops->pte_encode_bo(batch, i,
								  pat_index, 0);

				xe_map_wr(xe, &bo->vmap, map_ofs + level * 8, u64,
					  entry);
				level++;
			}
		}
	} else {
		u64 batch_addr = xe_bo_addr(batch, 0, XE_PAGE_SIZE);

		m->batch_base_ofs = xe_migrate_vram_ofs(xe, batch_addr);

		if (xe->info.has_usm) {
			batch = tile->primary_gt->usm.bb_pool->bo;
			batch_addr = xe_bo_addr(batch, 0, XE_PAGE_SIZE);
			m->usm_batch_base_ofs = xe_migrate_vram_ofs(xe, batch_addr);
		}
	}

	for (level = 1; level < num_level; level++) {
		u32 flags = 0;

		if (vm->flags & XE_VM_FLAG_64K && level == 1)
			flags = XE_PDE_64K;

		entry = vm->pt_ops->pde_encode_bo(bo, map_ofs + (level - 1) *
						  XE_PAGE_SIZE, pat_index);
		xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE * level, u64,
			  entry | flags);
	}

	/* Write PDE's that point to our BO. */
	for (i = 0; i < num_entries - num_level; i++) {
		entry = vm->pt_ops->pde_encode_bo(bo, i * XE_PAGE_SIZE,
						  pat_index);

		xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE +
			  (i + 1) * 8, u64, entry);
	}

	/* Set up a 1GiB NULL mapping at 255GiB offset. */
	level = 2;
	xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE * level + 255 * 8, u64,
		  vm->pt_ops->pte_encode_addr(xe, 0, pat_index, level, IS_DGFX(xe), 0)
		  | XE_PTE_NULL);
	m->cleared_mem_ofs = (255ULL << xe_pt_shift(level));

	/* Identity map the entire vram at 256GiB offset */
	if (IS_DGFX(xe)) {
		u64 pos, ofs, flags;

		level = 2;
		ofs = map_ofs + XE_PAGE_SIZE * level + 256 * 8;
		flags = vm->pt_ops->pte_encode_addr(xe, 0, pat_index, level,
						    true, 0);

		/*
		 * Use 1GB pages; it doesn't matter if the physical amount of
		 * vram is less, as long as we don't access the unmapped part.
		 */
		for (pos = xe->mem.vram.dpa_base;
		     pos < xe->mem.vram.actual_physical_size + xe->mem.vram.dpa_base;
		     pos += SZ_1G, ofs += 8)
			xe_map_wr(xe, &bo->vmap, ofs, u64, pos | flags);
	}

	/*
	 * Example layout created above, with root level = 3:
	 * [PT0...PT7]: kernel PT's for copy/clear; 64 or 4KiB PTE's
	 * [PT8]: Kernel PT for VM_BIND, 4 KiB PTE's
	 * [PT9...PT28]: Userspace PT's for VM_BIND, 4 KiB PTE's
	 * [PT29 = PDE 0] [PT30 = PDE 1] [PT31 = PDE 2]
	 *
	 * This makes the lowest part of the VM point to the pagetables.
	 * Hence the lowest 2M in the vm points to itself; with a few writes
	 * and flushes, other parts of the VM can be used for copying and
	 * clearing.
	 *
	 * For performance, the kernel reserves PDE's, so about 20 are left
	 * for async VM updates.
	 *
	 * To make it easier to work, each scratch PT is put in slot (1 + PT #)
	 * everywhere. This allows lockless updates to scratch pages by using
	 * the different addresses in the VM.
	 */
#define NUM_VMUSA_UNIT_PER_PAGE	32
#define VM_SA_UPDATE_UNIT_SIZE		(XE_PAGE_SIZE / NUM_VMUSA_UNIT_PER_PAGE)
#define NUM_VMUSA_WRITES_PER_UNIT	(VM_SA_UPDATE_UNIT_SIZE / sizeof(u64))
	drm_suballoc_manager_init(&m->vm_update_sa,
				  (map_ofs / XE_PAGE_SIZE - NUM_KERNEL_PDE) *
				  NUM_VMUSA_UNIT_PER_PAGE, 0);

	m->pt_bo = bo;
	return 0;
}

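/*
 * Illustrative arithmetic (not part of the driver), assuming
 * xe_pt_shift(level) == 12 + 9 * level for 4KiB pages, which the
 * 255GiB/256GiB comments above imply:
 *
 *	xe_migrate_vm_addr(0, 0)  == 1ULL << 21	(2MiB, first PT slot)
 *	xe_migrate_vm_addr(31, 0) == 32ULL << 21	(64MiB, last PT slot)
 *	m->cleared_mem_ofs        == 255ULL << 30	(255GiB NULL mapping)
 *	xe_migrate_vram_ofs(xe, xe->mem.vram.dpa_base)
 *	                          == 256ULL << 30	(start of identity map)
 *
 * so the batch buffers, scratch PTs and the VRAM identity map never overlap.
 */
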
/*
 * Due to workaround 16017236439, odd instance hardware copy engines are
 * faster than even instance ones.
 * This function returns the mask involving all fast copy engines and the
 * reserved copy engine to be used as logical mask for migrate engine.
 * Including the reserved copy engine is required to avoid deadlocks due to
 * migrate jobs servicing the faults getting stuck behind the job that faulted.
 */
static u32 xe_migrate_usm_logical_mask(struct xe_gt *gt)
{
	u32 logical_mask = 0;
	struct xe_hw_engine *hwe;
	enum xe_hw_engine_id id;

	for_each_hw_engine(hwe, gt, id) {
		if (hwe->class != XE_ENGINE_CLASS_COPY)
			continue;

		if (!XE_WA(gt, 16017236439) ||
		    xe_gt_is_usm_hwe(gt, hwe) || hwe->instance & 1)
			logical_mask |= BIT(hwe->logical_instance);
	}

	return logical_mask;
}

/**
 * xe_migrate_init() - Initialize a migrate context
 * @tile: Back-pointer to the tile we're initializing for.
 *
 * Return: Pointer to a migrate context on success. Error pointer on error.
 */
struct xe_migrate *xe_migrate_init(struct xe_tile *tile)
{
	struct xe_device *xe = tile_to_xe(tile);
	struct xe_gt *primary_gt = tile->primary_gt;
	struct xe_migrate *m;
	struct xe_vm *vm;
	int err;

	m = drmm_kzalloc(&xe->drm, sizeof(*m), GFP_KERNEL);
	if (!m)
		return ERR_PTR(-ENOMEM);

	m->tile = tile;

	/* Special layout, prepared below. */
	vm = xe_vm_create(xe, XE_VM_FLAG_MIGRATION |
			  XE_VM_FLAG_SET_TILE_ID(tile));
	if (IS_ERR(vm))
		return ERR_CAST(vm);

	xe_vm_lock(vm, false);
	err = xe_migrate_prepare_vm(tile, m, vm);
	xe_vm_unlock(vm);
	if (err) {
		xe_vm_close_and_put(vm);
		return ERR_PTR(err);
	}

	if (xe->info.has_usm) {
		struct xe_hw_engine *hwe = xe_gt_hw_engine(primary_gt,
							   XE_ENGINE_CLASS_COPY,
							   primary_gt->usm.reserved_bcs_instance,
							   false);
		u32 logical_mask = xe_migrate_usm_logical_mask(primary_gt);

		if (!hwe || !logical_mask)
			return ERR_PTR(-EINVAL);

		m->q = xe_exec_queue_create(xe, vm, logical_mask, 1, hwe,
					    EXEC_QUEUE_FLAG_KERNEL |
					    EXEC_QUEUE_FLAG_PERMANENT |
					    EXEC_QUEUE_FLAG_HIGH_PRIORITY);
	} else {
		m->q = xe_exec_queue_create_class(xe, primary_gt, vm,
						  XE_ENGINE_CLASS_COPY,
						  EXEC_QUEUE_FLAG_KERNEL |
						  EXEC_QUEUE_FLAG_PERMANENT);
	}
	if (IS_ERR(m->q)) {
		xe_vm_close_and_put(vm);
		return ERR_CAST(m->q);
	}

	mutex_init(&m->job_mutex);

	err = drmm_add_action_or_reset(&xe->drm, xe_migrate_fini, m);
	if (err)
		return ERR_PTR(err);

	if (IS_DGFX(xe)) {
		if (xe_device_has_flat_ccs(xe))
			/* min chunk size corresponds to 4K of CCS Metadata */
			m->min_chunk_size = SZ_4K * SZ_64K /
				xe_device_ccs_bytes(xe, SZ_64K);
		else
			/* Somewhat arbitrary to avoid a huge amount of blits */
			m->min_chunk_size = SZ_64K;
		m->min_chunk_size = roundup_pow_of_two(m->min_chunk_size);
		drm_dbg(&xe->drm, "Migrate min chunk size is 0x%08llx\n",
			(unsigned long long)m->min_chunk_size);
	}

	return m;
}

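/*
 * Illustrative usage sketch (not part of the driver): the tile init path is
 * expected to create the context once and stash it, roughly:
 *
 *	tile->migrate = xe_migrate_init(tile);
 *	if (IS_ERR(tile->migrate))
 *		return PTR_ERR(tile->migrate);
 *
 * Teardown happens through the drmm action registered above
 * (xe_migrate_fini), so no explicit cleanup call is needed.
 * For the flat-CCS case above, assuming the usual 1:256 CCS ratio,
 * xe_device_ccs_bytes(xe, SZ_64K) == 256 and min_chunk_size becomes
 * SZ_4K * SZ_64K / 256 == 1MiB, i.e. 4KiB of CCS data per chunk.
 */
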
static u64 max_mem_transfer_per_pass(struct xe_device *xe)
{
	if (!IS_DGFX(xe) && xe_device_has_flat_ccs(xe))
		return MAX_CCS_LIMITED_TRANSFER;

	return MAX_PREEMPTDISABLE_TRANSFER;
}

static u64 xe_migrate_res_sizes(struct xe_migrate *m, struct xe_res_cursor *cur)
{
	struct xe_device *xe = tile_to_xe(m->tile);
	u64 size = min_t(u64, max_mem_transfer_per_pass(xe), cur->remaining);

	if (mem_type_is_vram(cur->mem_type)) {
		/*
		 * VRAM we want to blit in chunks with sizes aligned to
		 * min_chunk_size in order for the offset to CCS metadata to be
		 * page-aligned. If it's the last chunk it may be smaller.
		 *
		 * Another constraint is that we need to limit the blit to
		 * the VRAM block size, unless size is smaller than
		 * min_chunk_size.
		 */
		u64 chunk = max_t(u64, cur->size, m->min_chunk_size);

		size = min_t(u64, size, chunk);
		if (size > m->min_chunk_size)
			size = round_down(size, m->min_chunk_size);
	}

	return size;
}

static bool xe_migrate_allow_identity(u64 size, const struct xe_res_cursor *cur)
{
	/* If the chunk is not fragmented, allow identity map. */
	return cur->size >= size;
}

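/*
 * Worked example (illustrative only): on dgfx with min_chunk_size == 64KiB,
 * a cursor with remaining == 9MiB whose current VRAM block is
 * cur->size == 3MiB gives:
 *
 *	size  = min(MAX_PREEMPTDISABLE_TRANSFER, 9MiB) = 8MiB
 *	chunk = max(3MiB, 64KiB)                       = 3MiB
 *	size  = min(8MiB, 3MiB)                        = 3MiB
 *	size  = round_down(3MiB, 64KiB)                = 3MiB
 *
 * The pass is thus clipped to the current block, and
 * xe_migrate_allow_identity() returns true because the chunk is not
 * fragmented (cur->size >= size).
 */
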
static u32 pte_update_size(struct xe_migrate *m,
			   bool is_vram,
			   struct ttm_resource *res,
			   struct xe_res_cursor *cur,
			   u64 *L0, u64 *L0_ofs, u32 *L0_pt,
			   u32 cmd_size, u32 pt_ofs, u32 avail_pts)
{
	u32 cmds = 0;

	*L0_pt = pt_ofs;
	if (is_vram && xe_migrate_allow_identity(*L0, cur)) {
		/* Offset into identity map. */
		*L0_ofs = xe_migrate_vram_ofs(tile_to_xe(m->tile),
					      cur->start + vram_region_gpu_offset(res));
		cmds += cmd_size;
	} else {
		/* Clip L0 to available size */
		u64 size = min(*L0, (u64)avail_pts * SZ_2M);
		u64 num_4k_pages = DIV_ROUND_UP(size, XE_PAGE_SIZE);

		*L0 = size;
		*L0_ofs = xe_migrate_vm_addr(pt_ofs, 0);

		/* MI_STORE_DATA_IMM */
		cmds += 3 * DIV_ROUND_UP(num_4k_pages, 0x1ff);

		/* PDE qwords */
		cmds += num_4k_pages * 2;

		/* Each chunk has a single blit command */
		cmds += cmd_size;
	}

	return cmds;
}

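/*
 * Worked example (illustrative only): a non-contiguous 8MiB chunk taking
 * the "else" branch above with avail_pts == 4 yields:
 *
 *	size         = min(8MiB, 4 * 2MiB) = 8MiB
 *	num_4k_pages = 8MiB / 4KiB         = 2048
 *	cmds         = 3 * DIV_ROUND_UP(2048, 511)	(5 MI_STORE_DATA_IMM headers)
 *	             + 2048 * 2			(one PTE qword per 4KiB page)
 *	             + cmd_size
 *
 * which matches what emit_pte() writes below: a 3-dword header per
 * 511-qword chunk followed by the PTE qwords themselves.
 */
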
static void emit_pte(struct xe_migrate *m,
		     struct xe_bb *bb, u32 at_pt,
		     bool is_vram, bool is_comp_pte,
		     struct xe_res_cursor *cur,
		     u32 size, struct ttm_resource *res)
{
	struct xe_device *xe = tile_to_xe(m->tile);
	struct xe_vm *vm = m->q->vm;
	u16 pat_index;
	u32 ptes;
	u64 ofs = at_pt * XE_PAGE_SIZE;
	u64 cur_ofs;

	/* Indirect access needs compression enabled uncached PAT index */
	if (GRAPHICS_VERx100(xe) >= 2000)
		pat_index = is_comp_pte ? xe->pat.idx[XE_CACHE_NONE_COMPRESSION] :
					  xe->pat.idx[XE_CACHE_WB];
	else
		pat_index = xe->pat.idx[XE_CACHE_WB];

	ptes = DIV_ROUND_UP(size, XE_PAGE_SIZE);

	while (ptes) {
		u32 chunk = min(0x1ffU, ptes);

		bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk);
		bb->cs[bb->len++] = ofs;
		bb->cs[bb->len++] = 0;

		cur_ofs = ofs;
		ofs += chunk * 8;
		ptes -= chunk;

		while (chunk--) {
			u64 addr, flags = 0;
			bool devmem = false;

			addr = xe_res_dma(cur) & PAGE_MASK;
			if (is_vram) {
				if (vm->flags & XE_VM_FLAG_64K) {
					u64 va = cur_ofs * XE_PAGE_SIZE / 8;

					xe_assert(xe, (va & (SZ_64K - 1)) ==
						  (addr & (SZ_64K - 1)));

					flags |= XE_PTE_PS64;
				}

				addr += vram_region_gpu_offset(res);
				devmem = true;
			}

			addr = vm->pt_ops->pte_encode_addr(m->tile->xe,
							   addr, pat_index,
							   0, devmem, flags);
			bb->cs[bb->len++] = lower_32_bits(addr);
			bb->cs[bb->len++] = upper_32_bits(addr);

			xe_res_next(cur, min_t(u32, size, PAGE_SIZE));
			cur_ofs += 8;
		}
	}
}

#define EMIT_COPY_CCS_DW 5
static void emit_copy_ccs(struct xe_gt *gt, struct xe_bb *bb,
			  u64 dst_ofs, bool dst_is_indirect,
			  u64 src_ofs, bool src_is_indirect,
			  u32 size)
{
	struct xe_device *xe = gt_to_xe(gt);
	u32 *cs = bb->cs + bb->len;
	u32 num_ccs_blks;
	u32 num_pages;
	u32 ccs_copy_size;
	u32 mocs;

	if (GRAPHICS_VERx100(xe) >= 2000) {
		num_pages = DIV_ROUND_UP(size, XE_PAGE_SIZE);
		xe_gt_assert(gt, FIELD_FIT(XE2_CCS_SIZE_MASK, num_pages - 1));

		ccs_copy_size = REG_FIELD_PREP(XE2_CCS_SIZE_MASK, num_pages - 1);
		mocs = FIELD_PREP(XE2_XY_CTRL_SURF_MOCS_INDEX_MASK, gt->mocs.uc_index);

	} else {
		num_ccs_blks = DIV_ROUND_UP(xe_device_ccs_bytes(gt_to_xe(gt), size),
					    NUM_CCS_BYTES_PER_BLOCK);
		xe_gt_assert(gt, FIELD_FIT(CCS_SIZE_MASK, num_ccs_blks - 1));

		ccs_copy_size = REG_FIELD_PREP(CCS_SIZE_MASK, num_ccs_blks - 1);
		mocs = FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, gt->mocs.uc_index);
	}

	*cs++ = XY_CTRL_SURF_COPY_BLT |
		(src_is_indirect ? 0x0 : 0x1) << SRC_ACCESS_TYPE_SHIFT |
		(dst_is_indirect ? 0x0 : 0x1) << DST_ACCESS_TYPE_SHIFT |
		ccs_copy_size;
	*cs++ = lower_32_bits(src_ofs);
	*cs++ = upper_32_bits(src_ofs) | mocs;
	*cs++ = lower_32_bits(dst_ofs);
	*cs++ = upper_32_bits(dst_ofs) | mocs;

	bb->len = cs - bb->cs;
}

#define EMIT_COPY_DW 10
static void emit_copy(struct xe_gt *gt, struct xe_bb *bb,
		      u64 src_ofs, u64 dst_ofs, unsigned int size,
		      unsigned int pitch)
{
	struct xe_device *xe = gt_to_xe(gt);
	u32 mocs = 0;
	u32 tile_y = 0;

	xe_gt_assert(gt, size / pitch <= S16_MAX);
	xe_gt_assert(gt, pitch / 4 <= S16_MAX);
	xe_gt_assert(gt, pitch <= U16_MAX);

	if (GRAPHICS_VER(xe) >= 20)
		mocs = FIELD_PREP(XE2_XY_FAST_COPY_BLT_MOCS_INDEX_MASK, gt->mocs.uc_index);

	if (GRAPHICS_VERx100(xe) >= 1250)
		tile_y = XY_FAST_COPY_BLT_D1_SRC_TILE4 | XY_FAST_COPY_BLT_D1_DST_TILE4;

	bb->cs[bb->len++] = XY_FAST_COPY_BLT_CMD | (10 - 2);
	bb->cs[bb->len++] = XY_FAST_COPY_BLT_DEPTH_32 | pitch | tile_y | mocs;
	bb->cs[bb->len++] = 0;
	bb->cs[bb->len++] = (size / pitch) << 16 | pitch / 4;
	bb->cs[bb->len++] = lower_32_bits(dst_ofs);
	bb->cs[bb->len++] = upper_32_bits(dst_ofs);
	bb->cs[bb->len++] = 0;
	bb->cs[bb->len++] = pitch | mocs;
	bb->cs[bb->len++] = lower_32_bits(src_ofs);
	bb->cs[bb->len++] = upper_32_bits(src_ofs);
}

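/*
 * Illustrative note (not part of the driver): the migrate paths below call
 * emit_copy() with pitch == XE_PAGE_SIZE, so an 8MiB chunk becomes a
 * 2048-row by 4096-byte copy:
 *
 *	size / pitch = 2048	(rows, well within the S16_MAX assert)
 *	pitch / 4    = 1024	(row width in dwords)
 *
 * i.e. the (size / pitch) << 16 | pitch / 4 dword above encodes
 * (2048 << 16) | 1024 for such a pass.
 */
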
static int job_add_deps(struct xe_sched_job *job, struct dma_resv *resv,
			enum dma_resv_usage usage)
{
	return drm_sched_job_add_resv_dependencies(&job->drm, resv, usage);
}

static u64 xe_migrate_batch_base(struct xe_migrate *m, bool usm)
{
	return usm ? m->usm_batch_base_ofs : m->batch_base_ofs;
}

static u32 xe_migrate_ccs_copy(struct xe_migrate *m,
			       struct xe_bb *bb,
			       u64 src_ofs, bool src_is_indirect,
			       u64 dst_ofs, bool dst_is_indirect, u32 dst_size,
			       u64 ccs_ofs, bool copy_ccs)
{
	struct xe_gt *gt = m->tile->primary_gt;
	u32 flush_flags = 0;

	if (xe_device_has_flat_ccs(gt_to_xe(gt)) && !copy_ccs && dst_is_indirect) {
		/*
		 * If the src is already in vram, then it should already
		 * have been cleared by us, or has been populated by the
		 * user. Make sure we copy the CCS aux state as-is.
		 *
		 * Otherwise if the bo doesn't have any CCS metadata attached,
		 * we still need to clear it for security reasons.
		 */
		u64 ccs_src_ofs = src_is_indirect ? src_ofs : m->cleared_mem_ofs;

		emit_copy_ccs(gt, bb,
			      dst_ofs, true,
			      ccs_src_ofs, src_is_indirect, dst_size);

		flush_flags = MI_FLUSH_DW_CCS;
	} else if (copy_ccs) {
		if (!src_is_indirect)
			src_ofs = ccs_ofs;
		else if (!dst_is_indirect)
			dst_ofs = ccs_ofs;

		xe_gt_assert(gt, src_is_indirect || dst_is_indirect);

		emit_copy_ccs(gt, bb, dst_ofs, dst_is_indirect, src_ofs,
			      src_is_indirect, dst_size);
		if (dst_is_indirect)
			flush_flags = MI_FLUSH_DW_CCS;
	}

	return flush_flags;
}

/**
 * xe_migrate_copy() - Copy content of TTM resources.
 * @m: The migration context.
 * @src_bo: The buffer object @src is currently bound to.
 * @dst_bo: If copying between resources created for the same bo, set this to
 * the same value as @src_bo. If copying between buffer objects, set it to
 * the buffer object @dst is currently bound to.
 * @src: The source TTM resource.
 * @dst: The dst TTM resource.
 * @copy_only_ccs: If true copy only CCS metadata
 *
 * Copies the contents of @src to @dst: On flat CCS devices,
 * the CCS metadata is copied as well if needed, or if not present,
 * the CCS metadata of @dst is cleared for security reasons.
 *
 * Return: Pointer to a dma_fence representing the last copy batch, or
 * an error pointer on failure. If there is a failure, any copy operation
 * started by the function call has been synced.
 */
struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
				  struct xe_bo *src_bo,
				  struct xe_bo *dst_bo,
				  struct ttm_resource *src,
				  struct ttm_resource *dst,
				  bool copy_only_ccs)
{
	struct xe_gt *gt = m->tile->primary_gt;
	struct xe_device *xe = gt_to_xe(gt);
	struct dma_fence *fence = NULL;
	u64 size = src_bo->size;
	struct xe_res_cursor src_it, dst_it, ccs_it;
	u64 src_L0_ofs, dst_L0_ofs;
	u32 src_L0_pt, dst_L0_pt;
	u64 src_L0, dst_L0;
	int pass = 0;
	int err;
	bool src_is_pltt = src->mem_type == XE_PL_TT;
	bool dst_is_pltt = dst->mem_type == XE_PL_TT;
	bool src_is_vram = mem_type_is_vram(src->mem_type);
	bool dst_is_vram = mem_type_is_vram(dst->mem_type);
	bool copy_ccs = xe_device_has_flat_ccs(xe) &&
		xe_bo_needs_ccs_pages(src_bo) && xe_bo_needs_ccs_pages(dst_bo);
	bool copy_system_ccs = copy_ccs && (!src_is_vram || !dst_is_vram);

	/* Copying CCS between two different BOs is not supported yet. */
	if (XE_WARN_ON(copy_ccs && src_bo != dst_bo))
		return ERR_PTR(-EINVAL);

	if (src_bo != dst_bo && XE_WARN_ON(src_bo->size != dst_bo->size))
		return ERR_PTR(-EINVAL);

	if (!src_is_vram)
		xe_res_first_sg(xe_bo_sg(src_bo), 0, size, &src_it);
	else
		xe_res_first(src, 0, size, &src_it);
	if (!dst_is_vram)
		xe_res_first_sg(xe_bo_sg(dst_bo), 0, size, &dst_it);
	else
		xe_res_first(dst, 0, size, &dst_it);

	if (copy_system_ccs)
		xe_res_first_sg(xe_bo_sg(src_bo), xe_bo_ccs_pages_start(src_bo),
				PAGE_ALIGN(xe_device_ccs_bytes(xe, size)),
				&ccs_it);

	while (size) {
		u32 batch_size = 2; /* arb_clear() + MI_BATCH_BUFFER_END */
		struct xe_sched_job *job;
		struct xe_bb *bb;
		u32 flush_flags;
		u32 update_idx;
		u64 ccs_ofs, ccs_size;
		u32 ccs_pt;

		bool usm = xe->info.has_usm;
		u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE;

		src_L0 = xe_migrate_res_sizes(m, &src_it);
		dst_L0 = xe_migrate_res_sizes(m, &dst_it);

		drm_dbg(&xe->drm, "Pass %u, sizes: %llu & %llu\n",
			pass++, src_L0, dst_L0);

		src_L0 = min(src_L0, dst_L0);

		batch_size += pte_update_size(m, src_is_vram, src, &src_it, &src_L0,
					      &src_L0_ofs, &src_L0_pt, 0, 0,
					      avail_pts);

		batch_size += pte_update_size(m, dst_is_vram, dst, &dst_it, &src_L0,
					      &dst_L0_ofs, &dst_L0_pt, 0,
					      avail_pts, avail_pts);

		if (copy_system_ccs) {
			ccs_size = xe_device_ccs_bytes(xe, src_L0);
			batch_size += pte_update_size(m, false, NULL, &ccs_it, &ccs_size,
						      &ccs_ofs, &ccs_pt, 0,
						      2 * avail_pts,
						      avail_pts);
			xe_assert(xe, IS_ALIGNED(ccs_it.start, PAGE_SIZE));
		}

		/* Add copy commands size here */
		batch_size += ((copy_only_ccs) ? 0 : EMIT_COPY_DW) +
			((xe_device_has_flat_ccs(xe) ? EMIT_COPY_CCS_DW : 0));

		bb = xe_bb_new(gt, batch_size, usm);
		if (IS_ERR(bb)) {
			err = PTR_ERR(bb);
			goto err_sync;
		}

		if (src_is_vram && xe_migrate_allow_identity(src_L0, &src_it))
			xe_res_next(&src_it, src_L0);
		else
			emit_pte(m, bb, src_L0_pt, src_is_vram, copy_system_ccs,
				 &src_it, src_L0, src);

		if (dst_is_vram && xe_migrate_allow_identity(src_L0, &dst_it))
			xe_res_next(&dst_it, src_L0);
		else
			emit_pte(m, bb, dst_L0_pt, dst_is_vram, copy_system_ccs,
				 &dst_it, src_L0, dst);

		if (copy_system_ccs)
			emit_pte(m, bb, ccs_pt, false, false, &ccs_it, ccs_size, src);

		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
		update_idx = bb->len;

		if (!copy_only_ccs)
			emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, src_L0, XE_PAGE_SIZE);

		flush_flags = xe_migrate_ccs_copy(m, bb, src_L0_ofs,
						  IS_DGFX(xe) ? src_is_vram : src_is_pltt,
						  dst_L0_ofs,
						  IS_DGFX(xe) ? dst_is_vram : dst_is_pltt,
						  src_L0, ccs_ofs, copy_ccs);

		mutex_lock(&m->job_mutex);
		job = xe_bb_create_migration_job(m->q, bb,
						 xe_migrate_batch_base(m, usm),
						 update_idx);
		if (IS_ERR(job)) {
			err = PTR_ERR(job);
			goto err;
		}

		xe_sched_job_add_migrate_flush(job, flush_flags);
		if (!fence) {
			err = job_add_deps(job, src_bo->ttm.base.resv,
					   DMA_RESV_USAGE_BOOKKEEP);
			if (!err && src_bo != dst_bo)
				err = job_add_deps(job, dst_bo->ttm.base.resv,
						   DMA_RESV_USAGE_BOOKKEEP);
			if (err)
				goto err_job;
		}

		xe_sched_job_arm(job);
		dma_fence_put(fence);
		fence = dma_fence_get(&job->drm.s_fence->finished);
		xe_sched_job_push(job);

		dma_fence_put(m->fence);
		m->fence = dma_fence_get(fence);

		mutex_unlock(&m->job_mutex);

		xe_bb_free(bb, fence);
		size -= src_L0;
		continue;

err_job:
		xe_sched_job_put(job);
err:
		mutex_unlock(&m->job_mutex);
		xe_bb_free(bb, NULL);

err_sync:
		/* Sync partial copy if any. FIXME: under job_mutex? */
		if (fence) {
			dma_fence_wait(fence, false);
			dma_fence_put(fence);
		}

		return ERR_PTR(err);
	}

	return fence;
}

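/*
 * Illustrative usage sketch (not part of the driver): a buffer-object move
 * path is expected to drive xe_migrate_copy() roughly as:
 *
 *	fence = xe_migrate_copy(m, bo, bo, old_mem, new_mem, false);
 *	if (IS_ERR(fence))
 *		return PTR_ERR(fence);
 *	dma_fence_wait(fence, false);
 *	dma_fence_put(fence);
 *
 * where old_mem/new_mem are the TTM resources being migrated between.
 * On error the function has already synced any partially submitted passes,
 * so the caller does not need to clean up earlier jobs.
 */
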
static void emit_clear_link_copy(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs,
				 u32 size, u32 pitch)
{
	struct xe_device *xe = gt_to_xe(gt);
	u32 *cs = bb->cs + bb->len;
	u32 len = PVC_MEM_SET_CMD_LEN_DW;

	*cs++ = PVC_MEM_SET_CMD | PVC_MEM_SET_MATRIX | (len - 2);
	*cs++ = pitch - 1;
	*cs++ = (size / pitch) - 1;
	*cs++ = pitch - 1;
	*cs++ = lower_32_bits(src_ofs);
	*cs++ = upper_32_bits(src_ofs);
	if (GRAPHICS_VERx100(xe) >= 2000)
		*cs++ = FIELD_PREP(XE2_MEM_SET_MOCS_INDEX_MASK, gt->mocs.uc_index);
	else
		*cs++ = FIELD_PREP(PVC_MEM_SET_MOCS_INDEX_MASK, gt->mocs.uc_index);

	xe_gt_assert(gt, cs - bb->cs == len + bb->len);

	bb->len += len;
}

static void emit_clear_main_copy(struct xe_gt *gt, struct xe_bb *bb,
				 u64 src_ofs, u32 size, u32 pitch, bool is_vram)
{
	struct xe_device *xe = gt_to_xe(gt);
	u32 *cs = bb->cs + bb->len;
	u32 len = XY_FAST_COLOR_BLT_DW;

	if (GRAPHICS_VERx100(xe) < 1250)
		len = 11;

	*cs++ = XY_FAST_COLOR_BLT_CMD | XY_FAST_COLOR_BLT_DEPTH_32 |
		(len - 2);
	if (GRAPHICS_VERx100(xe) >= 2000)
		*cs++ = FIELD_PREP(XE2_XY_FAST_COLOR_BLT_MOCS_INDEX_MASK, gt->mocs.uc_index) |
			(pitch - 1);
	else
		*cs++ = FIELD_PREP(XY_FAST_COLOR_BLT_MOCS_MASK, gt->mocs.uc_index) |
			(pitch - 1);
	*cs++ = 0;
	*cs++ = (size / pitch) << 16 | pitch / 4;
	*cs++ = lower_32_bits(src_ofs);
	*cs++ = upper_32_bits(src_ofs);
	*cs++ = (is_vram ? 0x0 : 0x1) << XY_FAST_COLOR_BLT_MEM_TYPE_SHIFT;
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = 0;

	if (len > 11) {
		*cs++ = 0;
		*cs++ = 0;
		*cs++ = 0;
		*cs++ = 0;
		*cs++ = 0;
	}

	xe_gt_assert(gt, cs - bb->cs == len + bb->len);

	bb->len += len;
}

static bool has_service_copy_support(struct xe_gt *gt)
{
	/*
	 * What we care about is whether the architecture was designed with
	 * service copy functionality (specifically the new MEM_SET / MEM_COPY
	 * instructions) so check the architectural engine list rather than the
	 * actual list since these instructions are usable on BCS0 even if
	 * all of the actual service copy engines (BCS1-BCS8) have been fused
	 * off.
	 */
	return gt->info.__engine_mask & GENMASK(XE_HW_ENGINE_BCS8,
						XE_HW_ENGINE_BCS1);
}

static u32 emit_clear_cmd_len(struct xe_gt *gt)
{
	if (has_service_copy_support(gt))
		return PVC_MEM_SET_CMD_LEN_DW;
	else
		return XY_FAST_COLOR_BLT_DW;
}

static void emit_clear(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs,
		       u32 size, u32 pitch, bool is_vram)
{
	if (has_service_copy_support(gt))
		emit_clear_link_copy(gt, bb, src_ofs, size, pitch);
	else
		emit_clear_main_copy(gt, bb, src_ofs, size, pitch,
				     is_vram);
}

/**
 * xe_migrate_clear() - Clear content of TTM resources.
 * @m: The migration context.
 * @bo: The buffer object @dst is currently bound to.
 * @dst: The dst TTM resource to be cleared.
 *
 * Clear the contents of @dst to zero. On flat CCS devices,
 * the CCS metadata is cleared to zero as well on VRAM destinations.
 * TODO: Eliminate the @bo argument.
 *
 * Return: Pointer to a dma_fence representing the last clear batch, or
 * an error pointer on failure. If there is a failure, any clear operation
 * started by the function call has been synced.
 */
struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
				   struct xe_bo *bo,
				   struct ttm_resource *dst)
{
	bool clear_vram = mem_type_is_vram(dst->mem_type);
	struct xe_gt *gt = m->tile->primary_gt;
	struct xe_device *xe = gt_to_xe(gt);
	bool clear_system_ccs = (xe_bo_needs_ccs_pages(bo) && !IS_DGFX(xe)) ? true : false;
	struct dma_fence *fence = NULL;
	u64 size = bo->size;
	struct xe_res_cursor src_it;
	struct ttm_resource *src = dst;
	int err;
	int pass = 0;

	if (!clear_vram)
		xe_res_first_sg(xe_bo_sg(bo), 0, bo->size, &src_it);
	else
		xe_res_first(src, 0, bo->size, &src_it);

	while (size) {
		u64 clear_L0_ofs;
		u32 clear_L0_pt;
		u32 flush_flags = 0;
		u64 clear_L0;
		struct xe_sched_job *job;
		struct xe_bb *bb;
		u32 batch_size, update_idx;

		bool usm = xe->info.has_usm;
		u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE;

		clear_L0 = xe_migrate_res_sizes(m, &src_it);

		drm_dbg(&xe->drm, "Pass %u, size: %llu\n", pass++, clear_L0);

		/* Calculate final sizes and batch size.. */
		batch_size = 2 +
			pte_update_size(m, clear_vram, src, &src_it,
					&clear_L0, &clear_L0_ofs, &clear_L0_pt,
					clear_system_ccs ? 0 : emit_clear_cmd_len(gt), 0,
					avail_pts);

		if (xe_device_has_flat_ccs(xe))
			batch_size += EMIT_COPY_CCS_DW;

		/* Clear commands */

		if (WARN_ON_ONCE(!clear_L0))
			break;

		bb = xe_bb_new(gt, batch_size, usm);
		if (IS_ERR(bb)) {
			err = PTR_ERR(bb);
			goto err_sync;
		}

		size -= clear_L0;
		/* Preemption is enabled again by the ring ops. */
		if (clear_vram && xe_migrate_allow_identity(clear_L0, &src_it))
			xe_res_next(&src_it, clear_L0);
		else
			emit_pte(m, bb, clear_L0_pt, clear_vram, clear_system_ccs,
				 &src_it, clear_L0, dst);

		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
		update_idx = bb->len;

		if (!clear_system_ccs)
			emit_clear(gt, bb, clear_L0_ofs, clear_L0, XE_PAGE_SIZE, clear_vram);

		if (xe_device_has_flat_ccs(xe)) {
			emit_copy_ccs(gt, bb, clear_L0_ofs, true,
				      m->cleared_mem_ofs, false, clear_L0);
			flush_flags = MI_FLUSH_DW_CCS;
		}

		mutex_lock(&m->job_mutex);
		job = xe_bb_create_migration_job(m->q, bb,
						 xe_migrate_batch_base(m, usm),
						 update_idx);
		if (IS_ERR(job)) {
			err = PTR_ERR(job);
			goto err;
		}

		xe_sched_job_add_migrate_flush(job, flush_flags);
		if (!fence) {
			/*
			 * There can't be anything userspace related at this
			 * point, so we just need to respect any potential move
			 * fences, which are always tracked as
			 * DMA_RESV_USAGE_KERNEL.
			 */
			err = job_add_deps(job, bo->ttm.base.resv,
					   DMA_RESV_USAGE_KERNEL);
			if (err)
				goto err_job;
		}

		xe_sched_job_arm(job);
		dma_fence_put(fence);
		fence = dma_fence_get(&job->drm.s_fence->finished);
		xe_sched_job_push(job);

		dma_fence_put(m->fence);
		m->fence = dma_fence_get(fence);

		mutex_unlock(&m->job_mutex);

		xe_bb_free(bb, fence);
		continue;

err_job:
		xe_sched_job_put(job);
err:
		mutex_unlock(&m->job_mutex);
		xe_bb_free(bb, NULL);
err_sync:
		/* Sync partial copies if any. FIXME: job_mutex? */
		if (fence) {
			dma_fence_wait(m->fence, false);
			dma_fence_put(fence);
		}

		return ERR_PTR(err);
	}

	if (clear_system_ccs)
		bo->ccs_cleared = true;

	return fence;
}

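/*
 * Illustrative usage sketch (not part of the driver): a clear path for a
 * freshly allocated VRAM buffer object might look like:
 *
 *	fence = xe_migrate_clear(m, bo, bo->ttm.resource);
 *	if (IS_ERR(fence))
 *		return PTR_ERR(fence);
 *	dma_fence_wait(fence, false);
 *	dma_fence_put(fence);
 *
 * On flat-CCS devices the CCS metadata is scrubbed as well, using the
 * cleared_mem_ofs NULL mapping set up in xe_migrate_prepare_vm() as the
 * zero source.
 */
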
static void write_pgtable(struct xe_tile *tile, struct xe_bb *bb, u64 ppgtt_ofs,
			  const struct xe_vm_pgtable_update *update,
			  struct xe_migrate_pt_update *pt_update)
{
	const struct xe_migrate_pt_update_ops *ops = pt_update->ops;
	u32 chunk;
	u32 ofs = update->ofs, size = update->qwords;

	/*
	 * If we have 512 entries (max), we would populate it ourselves,
	 * and update the PDE above it to the new pointer.
	 * The only time this can happen is if we have to update the top
	 * PDE. This requires a BO that is almost vm->size big.
	 *
	 * This shouldn't be possible in practice.. might change when 16K
	 * pages are used. Hence the assert.
	 */
	xe_tile_assert(tile, update->qwords <= 0x1ff);
	if (!ppgtt_ofs)
		ppgtt_ofs = xe_migrate_vram_ofs(tile_to_xe(tile),
						xe_bo_addr(update->pt_bo, 0,
							   XE_PAGE_SIZE));

	do {
		u64 addr = ppgtt_ofs + ofs * 8;

		chunk = min(update->qwords, 0x1ffU);

		/* Ensure populatefn can do memset64 by aligning bb->cs */
		if (!(bb->len & 1))
			bb->cs[bb->len++] = MI_NOOP;

		bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk);
		bb->cs[bb->len++] = lower_32_bits(addr);
		bb->cs[bb->len++] = upper_32_bits(addr);
		ops->populate(pt_update, tile, NULL, bb->cs + bb->len, ofs, chunk,
			      update);

		bb->len += chunk * 2;
		ofs += chunk;
		size -= chunk;
	} while (size);
}

struct xe_vm *xe_migrate_get_vm(struct xe_migrate *m)
{
	return xe_vm_get(m->q->vm);
}

#if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST)
struct migrate_test_params {
	struct xe_test_priv base;
	bool force_gpu;
};

#define to_migrate_test_params(_priv) \
	container_of(_priv, struct migrate_test_params, base)
#endif

static struct dma_fence *
xe_migrate_update_pgtables_cpu(struct xe_migrate *m,
			       struct xe_vm *vm, struct xe_bo *bo,
			       const struct xe_vm_pgtable_update *updates,
			       u32 num_updates, bool wait_vm,
			       struct xe_migrate_pt_update *pt_update)
{
	XE_TEST_DECLARE(struct migrate_test_params *test =
			to_migrate_test_params
			(xe_cur_kunit_priv(XE_TEST_LIVE_MIGRATE));)
	const struct xe_migrate_pt_update_ops *ops = pt_update->ops;
	struct dma_fence *fence;
	int err;
	u32 i;

	if (XE_TEST_ONLY(test && test->force_gpu))
		return ERR_PTR(-ETIME);

	if (bo && !dma_resv_test_signaled(bo->ttm.base.resv,
					  DMA_RESV_USAGE_KERNEL))
		return ERR_PTR(-ETIME);

	if (wait_vm && !dma_resv_test_signaled(xe_vm_resv(vm),
					       DMA_RESV_USAGE_BOOKKEEP))
		return ERR_PTR(-ETIME);

	if (ops->pre_commit) {
		pt_update->job = NULL;
		err = ops->pre_commit(pt_update);
		if (err)
			return ERR_PTR(err);
	}
	for (i = 0; i < num_updates; i++) {
		const struct xe_vm_pgtable_update *update = &updates[i];

		ops->populate(pt_update, m->tile, &update->pt_bo->vmap, NULL,
			      update->ofs, update->qwords, update);
	}

	if (vm) {
		trace_xe_vm_cpu_bind(vm);
		xe_device_wmb(vm->xe);
	}

	fence = dma_fence_get_stub();

	return fence;
}

static bool no_in_syncs(struct xe_vm *vm, struct xe_exec_queue *q,
			struct xe_sync_entry *syncs, u32 num_syncs)
{
	struct dma_fence *fence;
	int i;

	for (i = 0; i < num_syncs; i++) {
		fence = syncs[i].fence;

		if (fence && !test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
				       &fence->flags))
			return false;
	}
	if (q) {
		fence = xe_exec_queue_last_fence_get(q, vm);
		if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags)) {
			dma_fence_put(fence);
			return false;
		}
		dma_fence_put(fence);
	}

	return true;
}

/**
 * xe_migrate_update_pgtables() - Pipelined page-table update
 * @m: The migrate context.
 * @vm: The vm we'll be updating.
 * @bo: The bo whose dma-resv we will await before updating, or NULL if userptr.
 * @q: The exec queue to be used for the update or NULL if the default
 * migration engine is to be used.
 * @updates: An array of update descriptors.
 * @num_updates: Number of descriptors in @updates.
 * @syncs: Array of xe_sync_entry to await before updating. Note that waits
 * will block the engine timeline.
 * @num_syncs: Number of entries in @syncs.
 * @pt_update: Pointer to a struct xe_migrate_pt_update, which contains
 * pointers to callback functions and, if subclassed, private arguments to
 * those.
 *
 * Perform a pipelined page-table update. The update descriptors are typically
 * built under the same lock critical section as a call to this function. If
 * using the default engine for the updates, they will be performed in the
 * order they grab the job_mutex. If different engines are used, external
 * synchronization is needed for overlapping updates to maintain page-table
 * consistency. Note that the meaning of "overlapping" is that the updates
 * touch the same page-table, which might be a higher-level page-directory.
 * If no pipelining is needed, then updates may be performed by the cpu.
 *
 * Return: A dma_fence that, when signaled, indicates the update completion.
 */
struct dma_fence *
xe_migrate_update_pgtables(struct xe_migrate *m,
			   struct xe_vm *vm,
			   struct xe_bo *bo,
			   struct xe_exec_queue *q,
			   const struct xe_vm_pgtable_update *updates,
			   u32 num_updates,
			   struct xe_sync_entry *syncs, u32 num_syncs,
			   struct xe_migrate_pt_update *pt_update)
{
	const struct xe_migrate_pt_update_ops *ops = pt_update->ops;
	struct xe_tile *tile = m->tile;
	struct xe_gt *gt = tile->primary_gt;
	struct xe_device *xe = tile_to_xe(tile);
	struct xe_sched_job *job;
	struct dma_fence *fence;
	struct drm_suballoc *sa_bo = NULL;
	struct xe_vma *vma = pt_update->vma;
	struct xe_bb *bb;
	u32 i, batch_size, ppgtt_ofs, update_idx, page_ofs = 0;
	u64 addr;
	int err = 0;
	bool usm = !q && xe->info.has_usm;
	bool first_munmap_rebind = vma &&
		vma->gpuva.flags & XE_VMA_FIRST_REBIND;
	struct xe_exec_queue *q_override = !q ? m->q : q;
	u16 pat_index = xe->pat.idx[XE_CACHE_WB];

	/* Use the CPU if no in syncs and engine is idle */
	if (no_in_syncs(vm, q, syncs, num_syncs) && xe_exec_queue_is_idle(q_override)) {
		fence = xe_migrate_update_pgtables_cpu(m, vm, bo, updates,
						       num_updates,
						       first_munmap_rebind,
						       pt_update);
		if (!IS_ERR(fence) || fence == ERR_PTR(-EAGAIN))
			return fence;
	}

	/* fixed + PTE entries */
	if (IS_DGFX(xe))
		batch_size = 2;
	else
		batch_size = 6 + num_updates * 2;

	for (i = 0; i < num_updates; i++) {
		u32 num_cmds = DIV_ROUND_UP(updates[i].qwords, 0x1ff);

		/* align noop + MI_STORE_DATA_IMM cmd prefix */
		batch_size += 4 * num_cmds + updates[i].qwords * 2;
	}

	/*
	 * XXX: Create temp bo to copy from, if batch_size becomes too big?
	 *
	 * Worst case: Sum(2 * (each lower level page size) + (top level page size))
	 * Should be reasonably bound..
	 */
	xe_tile_assert(tile, batch_size < SZ_128K);

	bb = xe_bb_new(gt, batch_size, !q && xe->info.has_usm);
	if (IS_ERR(bb))
		return ERR_CAST(bb);

	/* For sysmem PTE's, need to map them in our hole.. */
	if (!IS_DGFX(xe)) {
		ppgtt_ofs = NUM_KERNEL_PDE - 1;
		if (q) {
			xe_tile_assert(tile, num_updates <= NUM_VMUSA_WRITES_PER_UNIT);

			sa_bo = drm_suballoc_new(&m->vm_update_sa, 1,
						 GFP_KERNEL, true, 0);
			if (IS_ERR(sa_bo)) {
				err = PTR_ERR(sa_bo);
				goto err;
			}

			ppgtt_ofs = NUM_KERNEL_PDE +
				(drm_suballoc_soffset(sa_bo) /
				 NUM_VMUSA_UNIT_PER_PAGE);
			page_ofs = (drm_suballoc_soffset(sa_bo) %
				    NUM_VMUSA_UNIT_PER_PAGE) *
				VM_SA_UPDATE_UNIT_SIZE;
		}

		/* Map our PT's to gtt */
		bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(num_updates);
		bb->cs[bb->len++] = ppgtt_ofs * XE_PAGE_SIZE + page_ofs;
		bb->cs[bb->len++] = 0; /* upper_32_bits */

		for (i = 0; i < num_updates; i++) {
			struct xe_bo *pt_bo = updates[i].pt_bo;

			xe_tile_assert(tile, pt_bo->size == SZ_4K);

			addr = vm->pt_ops->pte_encode_bo(pt_bo, 0, pat_index, 0);
			bb->cs[bb->len++] = lower_32_bits(addr);
			bb->cs[bb->len++] = upper_32_bits(addr);
		}

		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
		update_idx = bb->len;

		addr = xe_migrate_vm_addr(ppgtt_ofs, 0) +
			(page_ofs / sizeof(u64)) * XE_PAGE_SIZE;
		for (i = 0; i < num_updates; i++)
			write_pgtable(tile, bb, addr + i * XE_PAGE_SIZE,
				      &updates[i], pt_update);
	} else {
		/* phys pages, no preamble required */
		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
		update_idx = bb->len;

		for (i = 0; i < num_updates; i++)
			write_pgtable(tile, bb, 0, &updates[i], pt_update);
	}

	if (!q)
		mutex_lock(&m->job_mutex);

	job = xe_bb_create_migration_job(q ?: m->q, bb,
					 xe_migrate_batch_base(m, usm),
					 update_idx);
	if (IS_ERR(job)) {
		err = PTR_ERR(job);
		goto err_bb;
	}

	/* Wait on BO move */
	if (bo) {
		err = job_add_deps(job, bo->ttm.base.resv,
				   DMA_RESV_USAGE_KERNEL);
		if (err)
			goto err_job;
	}

	/*
	 * Munmap style VM unbind, need to wait for all jobs to be complete /
	 * trigger preempts before moving forward
	 */
	if (first_munmap_rebind) {
		err = job_add_deps(job, xe_vm_resv(vm),
				   DMA_RESV_USAGE_BOOKKEEP);
		if (err)
			goto err_job;
	}

	err = xe_sched_job_last_fence_add_dep(job, vm);
	for (i = 0; !err && i < num_syncs; i++)
		err = xe_sync_entry_add_deps(&syncs[i], job);

	if (err)
		goto err_job;

	if (ops->pre_commit) {
		pt_update->job = job;
		err = ops->pre_commit(pt_update);
		if (err)
			goto err_job;
	}
	xe_sched_job_arm(job);
	fence = dma_fence_get(&job->drm.s_fence->finished);
	xe_sched_job_push(job);

	if (!q)
		mutex_unlock(&m->job_mutex);

	xe_bb_free(bb, fence);
	drm_suballoc_free(sa_bo, fence);

	return fence;

err_job:
	xe_sched_job_put(job);
err_bb:
	if (!q)
		mutex_unlock(&m->job_mutex);
	xe_bb_free(bb, NULL);
err:
	drm_suballoc_free(sa_bo, NULL);
	return ERR_PTR(err);
}

/**
 * xe_migrate_wait() - Complete all operations using the xe_migrate context
 * @m: Migrate context to wait for.
 *
 * Waits until the GPU no longer uses the migrate context's default engine
 * or its page-table objects. FIXME: What about separate page-table update
 * engines?
 */
void xe_migrate_wait(struct xe_migrate *m)
{
	if (m->fence)
		dma_fence_wait(m->fence, false);
}

#if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST)
#include "tests/xe_migrate.c"
#endif