// SPDX-License-Identifier: MIT
/*
 * Copyright © 2020 Intel Corporation
 */

#include "xe_migrate.h"

#include <linux/bitfield.h>
#include <linux/sizes.h>

#include <drm/drm_managed.h>
#include <drm/drm_pagemap.h>
#include <drm/ttm/ttm_tt.h>
#include <uapi/drm/xe_drm.h>

#include <generated/xe_wa_oob.h>

#include "instructions/xe_gpu_commands.h"
#include "instructions/xe_mi_commands.h"
#include "regs/xe_gtt_defs.h"
#include "tests/xe_test.h"
#include "xe_assert.h"
#include "xe_bb.h"
#include "xe_bo.h"
#include "xe_exec_queue.h"
#include "xe_ggtt.h"
#include "xe_gt.h"
#include "xe_hw_engine.h"
#include "xe_lrc.h"
#include "xe_map.h"
#include "xe_mocs.h"
#include "xe_pt.h"
#include "xe_res_cursor.h"
#include "xe_sa.h"
#include "xe_sched_job.h"
#include "xe_sync.h"
#include "xe_trace_bo.h"
#include "xe_validation.h"
#include "xe_vm.h"
#include "xe_vram.h"

/**
 * struct xe_migrate - migrate context.
 */
struct xe_migrate {
	/** @q: Default exec queue used for migration */
	struct xe_exec_queue *q;
	/** @tile: Backpointer to the tile this struct xe_migrate belongs to. */
	struct xe_tile *tile;
	/** @job_mutex: Timeline mutex for @q. */
	struct mutex job_mutex;
	/** @pt_bo: Page-table buffer object. */
	struct xe_bo *pt_bo;
	/** @batch_base_ofs: VM offset of the migration batch buffer */
	u64 batch_base_ofs;
	/** @usm_batch_base_ofs: VM offset of the usm batch buffer */
	u64 usm_batch_base_ofs;
	/** @cleared_mem_ofs: VM offset of cleared memory used as a CCS clear source. */
	u64 cleared_mem_ofs;
	/**
	 * @fence: dma-fence representing the last migration job batch.
	 * Protected by @job_mutex.
	 */
	struct dma_fence *fence;
	/**
	 * @vm_update_sa: For integrated, used to suballocate page-tables
	 * out of the pt_bo.
	 */
	struct drm_suballoc_manager vm_update_sa;
	/** @min_chunk_size: For dgfx, minimum chunk size */
	u64 min_chunk_size;
};

#define MAX_PREEMPTDISABLE_TRANSFER SZ_8M /* Around 1ms. */
#define MAX_CCS_LIMITED_TRANSFER SZ_4M /* XE_PAGE_SIZE * (FIELD_MAX(XE2_CCS_SIZE_MASK) + 1) */
#define NUM_KERNEL_PDE 15
#define NUM_PT_SLOTS 32
#define LEVEL0_PAGE_TABLE_ENCODE_SIZE SZ_2M
#define MAX_NUM_PTE 512
#define IDENTITY_OFFSET 256ULL

/*
 * Although MI_STORE_DATA_IMM's "length" field is 10 bits, 0x3FE is the largest
 * legal value accepted. Since that instruction field is always stored in
 * (val-2) format, this translates to 0x400 dwords for the true maximum length
 * of the instruction. Subtracting the instruction header (1 dword) and
 * address (2 dwords), that leaves 0x3FD dwords (0x1FE qwords) for PTE values.
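 *
 * With 4 KiB pages this means a single MI_STORE_DATA_IMM can emit PTEs
 * covering just under 2 MiB (0x1FE * 4 KiB) of memory.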
 */
#define MAX_PTE_PER_SDI 0x1FEU

static void xe_migrate_fini(void *arg)
{
	struct xe_migrate *m = arg;

	xe_vm_lock(m->q->vm, false);
	xe_bo_unpin(m->pt_bo);
	xe_vm_unlock(m->q->vm);

	dma_fence_put(m->fence);
	xe_bo_put(m->pt_bo);
	drm_suballoc_manager_fini(&m->vm_update_sa);
	mutex_destroy(&m->job_mutex);
	xe_vm_close_and_put(m->q->vm);
	xe_exec_queue_put(m->q);
}

static u64 xe_migrate_vm_addr(u64 slot, u32 level)
{
	XE_WARN_ON(slot >= NUM_PT_SLOTS);

	/* First slot is reserved for mapping of PT bo and bb, start from 1 */
	return (slot + 1ULL) << xe_pt_shift(level + 1);
}

static u64 xe_migrate_vram_ofs(struct xe_device *xe, u64 addr, bool is_comp_pte)
{
	/*
	 * Remove the DPA to get a correct offset into identity table for the
	 * migrate offset
	 */
	u64 identity_offset = IDENTITY_OFFSET;

	if (GRAPHICS_VER(xe) >= 20 && is_comp_pte)
		identity_offset += DIV_ROUND_UP_ULL(xe_vram_region_actual_physical_size
						    (xe->mem.vram), SZ_1G);

	addr -= xe_vram_region_dpa_base(xe->mem.vram);
	return addr + (identity_offset << xe_pt_shift(2));
}

static void xe_migrate_program_identity(struct xe_device *xe, struct xe_vm *vm, struct xe_bo *bo,
					u64 map_ofs, u64 vram_offset, u16 pat_index, u64 pt_2m_ofs)
{
	struct xe_vram_region *vram = xe->mem.vram;
	resource_size_t dpa_base = xe_vram_region_dpa_base(vram);
	u64 pos, ofs, flags;
	u64 entry;
	/* XXX: Unclear if this should be usable_size? */
	u64 vram_limit = xe_vram_region_actual_physical_size(vram) + dpa_base;
	u32 level = 2;

	ofs = map_ofs + XE_PAGE_SIZE * level + vram_offset * 8;
	flags = vm->pt_ops->pte_encode_addr(xe, 0, pat_index, level,
					    true, 0);

	xe_assert(xe, IS_ALIGNED(xe_vram_region_usable_size(vram), SZ_2M));

	/*
	 * Use 1GB pages when possible, the last chunk always uses 2M
	 * pages as mixing reserved memory (stolen, WOCPM) with a single
	 * mapping is not allowed on certain platforms.
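	 *
	 * Each iteration of the loop below therefore writes one 1 GiB PTE,
	 * while the final chunk is instead described by 2 MiB PTEs in the
	 * dedicated page table at pt_2m_ofs.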
	 */
	for (pos = dpa_base; pos < vram_limit;
	     pos += SZ_1G, ofs += 8) {
		if (pos + SZ_1G >= vram_limit) {
			entry = vm->pt_ops->pde_encode_bo(bo, pt_2m_ofs);
			xe_map_wr(xe, &bo->vmap, ofs, u64, entry);

			flags = vm->pt_ops->pte_encode_addr(xe, 0,
							    pat_index,
							    level - 1,
							    true, 0);

			for (ofs = pt_2m_ofs; pos < vram_limit;
			     pos += SZ_2M, ofs += 8)
				xe_map_wr(xe, &bo->vmap, ofs, u64, pos | flags);
			break;	/* Ensure pos == vram_limit assert correct */
		}

		xe_map_wr(xe, &bo->vmap, ofs, u64, pos | flags);
	}

	xe_assert(xe, pos == vram_limit);
}

static int xe_migrate_prepare_vm(struct xe_tile *tile, struct xe_migrate *m,
				 struct xe_vm *vm, struct drm_exec *exec)
{
	struct xe_device *xe = tile_to_xe(tile);
	u16 pat_index = xe->pat.idx[XE_CACHE_WB];
	u8 id = tile->id;
	u32 num_entries = NUM_PT_SLOTS, num_level = vm->pt_root[id]->level;
#define VRAM_IDENTITY_MAP_COUNT 2
	u32 num_setup = num_level + VRAM_IDENTITY_MAP_COUNT;
#undef VRAM_IDENTITY_MAP_COUNT
	u32 map_ofs, level, i;
	struct xe_bo *bo, *batch = tile->mem.kernel_bb_pool->bo;
	u64 entry, pt29_ofs;

	/* Can't bump NUM_PT_SLOTS too high */
	BUILD_BUG_ON(NUM_PT_SLOTS > SZ_2M / XE_PAGE_SIZE);
	/* Must be a multiple of 64K to support all platforms */
	BUILD_BUG_ON(NUM_PT_SLOTS * XE_PAGE_SIZE % SZ_64K);
	/* And one slot reserved for the 4KiB page table updates */
	BUILD_BUG_ON(!(NUM_KERNEL_PDE & 1));

	/* Need to be sure everything fits in the first PT, or create more */
	xe_tile_assert(tile, m->batch_base_ofs + xe_bo_size(batch) < SZ_2M);

	bo = xe_bo_create_pin_map(vm->xe, tile, vm,
				  num_entries * XE_PAGE_SIZE,
				  ttm_bo_type_kernel,
				  XE_BO_FLAG_VRAM_IF_DGFX(tile) |
				  XE_BO_FLAG_PAGETABLE, exec);
	if (IS_ERR(bo))
		return PTR_ERR(bo);

	/* PT30 & PT31 reserved for 2M identity map */
	pt29_ofs = xe_bo_size(bo) - 3 * XE_PAGE_SIZE;
	entry = vm->pt_ops->pde_encode_bo(bo, pt29_ofs);
	xe_pt_write(xe, &vm->pt_root[id]->bo->vmap, 0, entry);

	map_ofs = (num_entries - num_setup) * XE_PAGE_SIZE;

	/* Map the entire BO in our level 0 pt */
	for (i = 0, level = 0; i < num_entries; level++) {
		entry = vm->pt_ops->pte_encode_bo(bo, i * XE_PAGE_SIZE,
						  pat_index, 0);

		xe_map_wr(xe, &bo->vmap, map_ofs + level * 8, u64, entry);

		if (vm->flags & XE_VM_FLAG_64K)
			i += 16;
		else
			i += 1;
	}

	if (!IS_DGFX(xe)) {
		/* Write out batch too */
		m->batch_base_ofs = NUM_PT_SLOTS * XE_PAGE_SIZE;
		for (i = 0; i < xe_bo_size(batch);
		     i += vm->flags & XE_VM_FLAG_64K ? XE_64K_PAGE_SIZE :
		     XE_PAGE_SIZE) {
			entry = vm->pt_ops->pte_encode_bo(batch, i,
							  pat_index, 0);

			xe_map_wr(xe, &bo->vmap, map_ofs + level * 8, u64,
				  entry);
			level++;
		}
		if (xe->info.has_usm) {
			xe_tile_assert(tile, xe_bo_size(batch) == SZ_1M);

			batch = tile->primary_gt->usm.bb_pool->bo;
			m->usm_batch_base_ofs = m->batch_base_ofs + SZ_1M;
			xe_tile_assert(tile, xe_bo_size(batch) == SZ_512K);

			for (i = 0; i < xe_bo_size(batch);
			     i += vm->flags & XE_VM_FLAG_64K ? XE_64K_PAGE_SIZE :
			     XE_PAGE_SIZE) {
				entry = vm->pt_ops->pte_encode_bo(batch, i,
								  pat_index, 0);

				xe_map_wr(xe, &bo->vmap, map_ofs + level * 8, u64,
					  entry);
				level++;
			}
		}
	} else {
		u64 batch_addr = xe_bo_addr(batch, 0, XE_PAGE_SIZE);

		m->batch_base_ofs = xe_migrate_vram_ofs(xe, batch_addr, false);

		if (xe->info.has_usm) {
			batch = tile->primary_gt->usm.bb_pool->bo;
			batch_addr = xe_bo_addr(batch, 0, XE_PAGE_SIZE);
			m->usm_batch_base_ofs = xe_migrate_vram_ofs(xe, batch_addr, false);
		}
	}

	for (level = 1; level < num_level; level++) {
		u32 flags = 0;

		if (vm->flags & XE_VM_FLAG_64K && level == 1)
			flags = XE_PDE_64K;

		entry = vm->pt_ops->pde_encode_bo(bo, map_ofs + (u64)(level - 1) *
						  XE_PAGE_SIZE);
		xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE * level, u64,
			  entry | flags);
	}

	/* Write PDE's that point to our BO. */
	for (i = 0; i < map_ofs / XE_PAGE_SIZE; i++) {
		entry = vm->pt_ops->pde_encode_bo(bo, (u64)i * XE_PAGE_SIZE);

		xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE +
			  (i + 1) * 8, u64, entry);
	}

	/* Set up a 1GiB NULL mapping at 255GiB offset. */
	level = 2;
	xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE * level + 255 * 8, u64,
		  vm->pt_ops->pte_encode_addr(xe, 0, pat_index, level, IS_DGFX(xe), 0)
		  | XE_PTE_NULL);
	m->cleared_mem_ofs = (255ULL << xe_pt_shift(level));

	/* Identity map the entire vram at 256GiB offset */
	if (IS_DGFX(xe)) {
		u64 pt30_ofs = xe_bo_size(bo) - 2 * XE_PAGE_SIZE;
		resource_size_t actual_phy_size = xe_vram_region_actual_physical_size(xe->mem.vram);

		xe_migrate_program_identity(xe, vm, bo, map_ofs, IDENTITY_OFFSET,
					    pat_index, pt30_ofs);
		xe_assert(xe, actual_phy_size <= (MAX_NUM_PTE - IDENTITY_OFFSET) * SZ_1G);

		/*
		 * Identity map the entire vram for compressed pat_index for xe2+
		 * if flat ccs is enabled.
		 */
		if (GRAPHICS_VER(xe) >= 20 && xe_device_has_flat_ccs(xe)) {
			u16 comp_pat_index = xe->pat.idx[XE_CACHE_NONE_COMPRESSION];
			u64 vram_offset = IDENTITY_OFFSET +
				DIV_ROUND_UP_ULL(actual_phy_size, SZ_1G);
			u64 pt31_ofs = xe_bo_size(bo) - XE_PAGE_SIZE;

			xe_assert(xe, actual_phy_size <= (MAX_NUM_PTE - IDENTITY_OFFSET -
							  IDENTITY_OFFSET / 2) * SZ_1G);
			xe_migrate_program_identity(xe, vm, bo, map_ofs, vram_offset,
						    comp_pat_index, pt31_ofs);
		}
	}

	/*
	 * Example layout created above, with root level = 3:
	 * [PT0...PT7]: kernel PT's for copy/clear; 64 or 4KiB PTE's
	 * [PT8]: Kernel PT for VM_BIND, 4 KiB PTE's
	 * [PT9...PT26]: Userspace PT's for VM_BIND, 4 KiB PTE's
	 * [PT27 = PDE 0] [PT28 = PDE 1] [PT29 = PDE 2] [PT30 & PT31 = 2M vram identity map]
	 *
	 * This makes the lowest part of the VM point to the pagetables.
	 * Hence the lowest 2M in the vm should point to itself. With a few
	 * writes and flushes, other parts of the VM can be used for copying
	 * and clearing.
	 *
	 * For performance, the kernel reserves PDE's, so about 20 are left
	 * for async VM updates.
	 *
	 * To make it easier to work, each scratch PT is put in slot (1 + PT #)
	 * everywhere, this allows lockless updates to scratch pages by using
	 * the different addresses in VM.
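	 *
	 * E.g. with this layout xe_migrate_vm_addr(0, 0) is 2 MiB: the lowest
	 * 2 MiB maps the page tables themselves, scratch PT slot 0 is
	 * addressed at VA 2 MiB, slot 1 at 4 MiB, and so on.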
	 */
#define NUM_VMUSA_UNIT_PER_PAGE 32
#define VM_SA_UPDATE_UNIT_SIZE (XE_PAGE_SIZE / NUM_VMUSA_UNIT_PER_PAGE)
#define NUM_VMUSA_WRITES_PER_UNIT (VM_SA_UPDATE_UNIT_SIZE / sizeof(u64))
	drm_suballoc_manager_init(&m->vm_update_sa,
				  (size_t)(map_ofs / XE_PAGE_SIZE - NUM_KERNEL_PDE) *
				  NUM_VMUSA_UNIT_PER_PAGE, 0);

	m->pt_bo = bo;
	return 0;
}

/*
 * Including the reserved copy engine is required to avoid deadlocks due to
 * migrate jobs servicing the faults getting stuck behind the job that faulted.
 */
static u32 xe_migrate_usm_logical_mask(struct xe_gt *gt)
{
	u32 logical_mask = 0;
	struct xe_hw_engine *hwe;
	enum xe_hw_engine_id id;

	for_each_hw_engine(hwe, gt, id) {
		if (hwe->class != XE_ENGINE_CLASS_COPY)
			continue;

		if (xe_gt_is_usm_hwe(gt, hwe))
			logical_mask |= BIT(hwe->logical_instance);
	}

	return logical_mask;
}

static bool xe_migrate_needs_ccs_emit(struct xe_device *xe)
{
	return xe_device_has_flat_ccs(xe) && !(GRAPHICS_VER(xe) >= 20 && IS_DGFX(xe));
}

/**
 * xe_migrate_alloc - Allocate a migrate struct for a given &xe_tile
 * @tile: &xe_tile
 *
 * Allocates a &xe_migrate for a given tile.
 *
 * Return: &xe_migrate on success, or NULL when out of memory.
 */
struct xe_migrate *xe_migrate_alloc(struct xe_tile *tile)
{
	struct xe_migrate *m = drmm_kzalloc(&tile_to_xe(tile)->drm, sizeof(*m), GFP_KERNEL);

	if (m)
		m->tile = tile;
	return m;
}

/**
 * xe_migrate_init() - Initialize a migrate context
 * @m: The migration context
 *
 * Return: 0 if successful, negative error code on failure
 */
int xe_migrate_init(struct xe_migrate *m)
{
	struct xe_tile *tile = m->tile;
	struct xe_gt *primary_gt = tile->primary_gt;
	struct xe_device *xe = tile_to_xe(tile);
	struct xe_validation_ctx ctx;
	struct drm_exec exec;
	struct xe_vm *vm;
	int err;

	/* Special layout, prepared below.. */
	vm = xe_vm_create(xe, XE_VM_FLAG_MIGRATION |
			  XE_VM_FLAG_SET_TILE_ID(tile), NULL);
	if (IS_ERR(vm))
		return PTR_ERR(vm);

	err = 0;
	xe_validation_guard(&ctx, &xe->val, &exec, (struct xe_val_flags) {}, err) {
		err = xe_vm_drm_exec_lock(vm, &exec);
		drm_exec_retry_on_contention(&exec);
		err = xe_migrate_prepare_vm(tile, m, vm, &exec);
		drm_exec_retry_on_contention(&exec);
		xe_validation_retry_on_oom(&ctx, &err);
	}
	if (err)
		return err;

	if (xe->info.has_usm) {
		struct xe_hw_engine *hwe = xe_gt_hw_engine(primary_gt,
							   XE_ENGINE_CLASS_COPY,
							   primary_gt->usm.reserved_bcs_instance,
							   false);
		u32 logical_mask = xe_migrate_usm_logical_mask(primary_gt);

		if (!hwe || !logical_mask) {
			err = -EINVAL;
			goto err_out;
		}

		/*
		 * XXX: Currently only reserving 1 (likely slow) BCS instance on
		 * PVC, may want to revisit if performance is needed.
		 */
		m->q = xe_exec_queue_create(xe, vm, logical_mask, 1, hwe,
					    EXEC_QUEUE_FLAG_KERNEL |
					    EXEC_QUEUE_FLAG_PERMANENT |
					    EXEC_QUEUE_FLAG_HIGH_PRIORITY |
					    EXEC_QUEUE_FLAG_MIGRATE, 0);
	} else {
		m->q = xe_exec_queue_create_class(xe, primary_gt, vm,
						  XE_ENGINE_CLASS_COPY,
						  EXEC_QUEUE_FLAG_KERNEL |
						  EXEC_QUEUE_FLAG_PERMANENT |
						  EXEC_QUEUE_FLAG_MIGRATE, 0);
	}
	if (IS_ERR(m->q)) {
		err = PTR_ERR(m->q);
		goto err_out;
	}

	mutex_init(&m->job_mutex);
	fs_reclaim_acquire(GFP_KERNEL);
	might_lock(&m->job_mutex);
	fs_reclaim_release(GFP_KERNEL);

	err = devm_add_action_or_reset(xe->drm.dev, xe_migrate_fini, m);
	if (err)
		return err;

	if (IS_DGFX(xe)) {
		if (xe_migrate_needs_ccs_emit(xe))
			/* min chunk size corresponds to 4K of CCS Metadata */
			m->min_chunk_size = SZ_4K * SZ_64K /
				xe_device_ccs_bytes(xe, SZ_64K);
		else
			/* Somewhat arbitrary to avoid a huge amount of blits */
			m->min_chunk_size = SZ_64K;
		m->min_chunk_size = roundup_pow_of_two(m->min_chunk_size);
		drm_dbg(&xe->drm, "Migrate min chunk size is 0x%08llx\n",
			(unsigned long long)m->min_chunk_size);
	}

	return err;

err_out:
	xe_vm_close_and_put(vm);
	return err;
}

static u64 max_mem_transfer_per_pass(struct xe_device *xe)
{
	if (!IS_DGFX(xe) && xe_device_has_flat_ccs(xe))
		return MAX_CCS_LIMITED_TRANSFER;

	return MAX_PREEMPTDISABLE_TRANSFER;
}

static u64 xe_migrate_res_sizes(struct xe_migrate *m, struct xe_res_cursor *cur)
{
	struct xe_device *xe = tile_to_xe(m->tile);
	u64 size = min_t(u64, max_mem_transfer_per_pass(xe), cur->remaining);

	if (mem_type_is_vram(cur->mem_type)) {
		/*
		 * VRAM we want to blit in chunks with sizes aligned to
		 * min_chunk_size in order for the offset to CCS metadata to be
		 * page-aligned. If it's the last chunk it may be smaller.
		 *
		 * Another constraint is that we need to limit the blit to
		 * the VRAM block size, unless size is smaller than
		 * min_chunk_size.
		 */
		u64 chunk = max_t(u64, cur->size, m->min_chunk_size);

		size = min_t(u64, size, chunk);
		if (size > m->min_chunk_size)
			size = round_down(size, m->min_chunk_size);
	}

	return size;
}

static bool xe_migrate_allow_identity(u64 size, const struct xe_res_cursor *cur)
{
	/* If the chunk is not fragmented, allow identity map. */
	return cur->size >= size;
}

#define PTE_UPDATE_FLAG_IS_VRAM BIT(0)
#define PTE_UPDATE_FLAG_IS_COMP_PTE BIT(1)

static u32 pte_update_size(struct xe_migrate *m,
			   u32 flags,
			   struct ttm_resource *res,
			   struct xe_res_cursor *cur,
			   u64 *L0, u64 *L0_ofs, u32 *L0_pt,
			   u32 cmd_size, u32 pt_ofs, u32 avail_pts)
{
	u32 cmds = 0;
	bool is_vram = PTE_UPDATE_FLAG_IS_VRAM & flags;
	bool is_comp_pte = PTE_UPDATE_FLAG_IS_COMP_PTE & flags;

	*L0_pt = pt_ofs;
	if (is_vram && xe_migrate_allow_identity(*L0, cur)) {
		/*
		 * Offset into identity map.
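		 * No PTEs need to be emitted for identity-mapped VRAM, so
		 * only the copy/clear command itself is accounted for here.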
		 */
		*L0_ofs = xe_migrate_vram_ofs(tile_to_xe(m->tile),
					      cur->start + vram_region_gpu_offset(res),
					      is_comp_pte);
		cmds += cmd_size;
	} else {
		/* Clip L0 to available size */
		u64 size = min(*L0, (u64)avail_pts * SZ_2M);
		u32 num_4k_pages = (size + XE_PAGE_SIZE - 1) >> XE_PTE_SHIFT;

		*L0 = size;
		*L0_ofs = xe_migrate_vm_addr(pt_ofs, 0);

		/* MI_STORE_DATA_IMM */
		cmds += 3 * DIV_ROUND_UP(num_4k_pages, MAX_PTE_PER_SDI);

		/* PDE qwords */
		cmds += num_4k_pages * 2;

		/* Each chunk has a single blit command */
		cmds += cmd_size;
	}

	return cmds;
}

static void emit_pte(struct xe_migrate *m,
		     struct xe_bb *bb, u32 at_pt,
		     bool is_vram, bool is_comp_pte,
		     struct xe_res_cursor *cur,
		     u32 size, struct ttm_resource *res)
{
	struct xe_device *xe = tile_to_xe(m->tile);
	struct xe_vm *vm = m->q->vm;
	u16 pat_index;
	u32 ptes;
	u64 ofs = (u64)at_pt * XE_PAGE_SIZE;
	u64 cur_ofs;

	/* Indirect access needs compression enabled uncached PAT index */
	if (GRAPHICS_VERx100(xe) >= 2000)
		pat_index = is_comp_pte ? xe->pat.idx[XE_CACHE_NONE_COMPRESSION] :
					  xe->pat.idx[XE_CACHE_WB];
	else
		pat_index = xe->pat.idx[XE_CACHE_WB];

	ptes = DIV_ROUND_UP(size, XE_PAGE_SIZE);

	while (ptes) {
		u32 chunk = min(MAX_PTE_PER_SDI, ptes);

		bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk);
		bb->cs[bb->len++] = ofs;
		bb->cs[bb->len++] = 0;

		cur_ofs = ofs;
		ofs += chunk * 8;
		ptes -= chunk;

		while (chunk--) {
			u64 addr, flags = 0;
			bool devmem = false;

			addr = xe_res_dma(cur) & PAGE_MASK;
			if (is_vram) {
				if (vm->flags & XE_VM_FLAG_64K) {
					u64 va = cur_ofs * XE_PAGE_SIZE / 8;

					xe_assert(xe, (va & (SZ_64K - 1)) ==
						  (addr & (SZ_64K - 1)));

					flags |= XE_PTE_PS64;
				}

				addr += vram_region_gpu_offset(res);
				devmem = true;
			}

			addr = vm->pt_ops->pte_encode_addr(m->tile->xe,
							   addr, pat_index,
							   0, devmem, flags);
			bb->cs[bb->len++] = lower_32_bits(addr);
			bb->cs[bb->len++] = upper_32_bits(addr);

			xe_res_next(cur, min_t(u32, size, PAGE_SIZE));
			cur_ofs += 8;
		}
	}
}

#define EMIT_COPY_CCS_DW 5
static void emit_copy_ccs(struct xe_gt *gt, struct xe_bb *bb,
			  u64 dst_ofs, bool dst_is_indirect,
			  u64 src_ofs, bool src_is_indirect,
			  u32 size)
{
	struct xe_device *xe = gt_to_xe(gt);
	u32 *cs = bb->cs + bb->len;
	u32 num_ccs_blks;
	u32 num_pages;
	u32 ccs_copy_size;
	u32 mocs;

	if (GRAPHICS_VERx100(xe) >= 2000) {
		num_pages = DIV_ROUND_UP(size, XE_PAGE_SIZE);
		xe_gt_assert(gt, FIELD_FIT(XE2_CCS_SIZE_MASK, num_pages - 1));

		ccs_copy_size = REG_FIELD_PREP(XE2_CCS_SIZE_MASK, num_pages - 1);
		mocs = FIELD_PREP(XE2_XY_CTRL_SURF_MOCS_INDEX_MASK, gt->mocs.uc_index);

	} else {
		num_ccs_blks = DIV_ROUND_UP(xe_device_ccs_bytes(gt_to_xe(gt), size),
					    NUM_CCS_BYTES_PER_BLOCK);
		xe_gt_assert(gt, FIELD_FIT(CCS_SIZE_MASK, num_ccs_blks - 1));

		ccs_copy_size = REG_FIELD_PREP(CCS_SIZE_MASK, num_ccs_blks - 1);
		mocs = FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, gt->mocs.uc_index);
	}

	*cs++ = XY_CTRL_SURF_COPY_BLT |
		(src_is_indirect ? 0x0 : 0x1) << SRC_ACCESS_TYPE_SHIFT |
		(dst_is_indirect ? 0x0 : 0x1) << DST_ACCESS_TYPE_SHIFT |
		ccs_copy_size;
	*cs++ = lower_32_bits(src_ofs);
	*cs++ = upper_32_bits(src_ofs) | mocs;
	*cs++ = lower_32_bits(dst_ofs);
	*cs++ = upper_32_bits(dst_ofs) | mocs;

	bb->len = cs - bb->cs;
}

#define EMIT_COPY_DW 10
static void emit_copy(struct xe_gt *gt, struct xe_bb *bb,
		      u64 src_ofs, u64 dst_ofs, unsigned int size,
		      unsigned int pitch)
{
	struct xe_device *xe = gt_to_xe(gt);
	u32 mocs = 0;
	u32 tile_y = 0;

	xe_gt_assert(gt, !(pitch & 3));
	xe_gt_assert(gt, size / pitch <= S16_MAX);
	xe_gt_assert(gt, pitch / 4 <= S16_MAX);
	xe_gt_assert(gt, pitch <= U16_MAX);

	if (GRAPHICS_VER(xe) >= 20)
		mocs = FIELD_PREP(XE2_XY_FAST_COPY_BLT_MOCS_INDEX_MASK, gt->mocs.uc_index);

	if (GRAPHICS_VERx100(xe) >= 1250)
		tile_y = XY_FAST_COPY_BLT_D1_SRC_TILE4 | XY_FAST_COPY_BLT_D1_DST_TILE4;

	bb->cs[bb->len++] = XY_FAST_COPY_BLT_CMD | (10 - 2);
	bb->cs[bb->len++] = XY_FAST_COPY_BLT_DEPTH_32 | pitch | tile_y | mocs;
	bb->cs[bb->len++] = 0;
	bb->cs[bb->len++] = (size / pitch) << 16 | pitch / 4;
	bb->cs[bb->len++] = lower_32_bits(dst_ofs);
	bb->cs[bb->len++] = upper_32_bits(dst_ofs);
	bb->cs[bb->len++] = 0;
	bb->cs[bb->len++] = pitch | mocs;
	bb->cs[bb->len++] = lower_32_bits(src_ofs);
	bb->cs[bb->len++] = upper_32_bits(src_ofs);
}

static u64 xe_migrate_batch_base(struct xe_migrate *m, bool usm)
{
	return usm ? m->usm_batch_base_ofs : m->batch_base_ofs;
}

static u32 xe_migrate_ccs_copy(struct xe_migrate *m,
			       struct xe_bb *bb,
			       u64 src_ofs, bool src_is_indirect,
			       u64 dst_ofs, bool dst_is_indirect, u32 dst_size,
			       u64 ccs_ofs, bool copy_ccs)
{
	struct xe_gt *gt = m->tile->primary_gt;
	u32 flush_flags = 0;

	if (!copy_ccs && dst_is_indirect) {
		/*
		 * If the src is already in vram, then it should already
		 * have been cleared by us, or has been populated by the
		 * user. Make sure we copy the CCS aux state as-is.
		 *
		 * Otherwise if the bo doesn't have any CCS metadata attached,
		 * we still need to clear it for security reasons.
		 */
		u64 ccs_src_ofs = src_is_indirect ? src_ofs : m->cleared_mem_ofs;

		emit_copy_ccs(gt, bb,
			      dst_ofs, true,
			      ccs_src_ofs, src_is_indirect, dst_size);

		flush_flags = MI_FLUSH_DW_CCS;
	} else if (copy_ccs) {
		if (!src_is_indirect)
			src_ofs = ccs_ofs;
		else if (!dst_is_indirect)
			dst_ofs = ccs_ofs;

		xe_gt_assert(gt, src_is_indirect || dst_is_indirect);

		emit_copy_ccs(gt, bb, dst_ofs, dst_is_indirect, src_ofs,
			      src_is_indirect, dst_size);
		if (dst_is_indirect)
			flush_flags = MI_FLUSH_DW_CCS;
	}

	return flush_flags;
}

/**
 * xe_migrate_copy() - Copy content of TTM resources.
 * @m: The migration context.
 * @src_bo: The buffer object @src is currently bound to.
 * @dst_bo: If copying between resources created for the same bo, set this to
 * the same value as @src_bo. If copying between buffer objects, set it to
 * the buffer object @dst is currently bound to.
 * @src: The source TTM resource.
 * @dst: The dst TTM resource.
 * @copy_only_ccs: If true, copy only CCS metadata.
 *
 * Copies the contents of @src to @dst: On flat CCS devices,
 * the CCS metadata is copied as well if needed, or if not present,
 * the CCS metadata of @dst is cleared for security reasons.
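 *
 * The copy is performed in chunks of at most max_mem_transfer_per_pass()
 * bytes, with each chunk submitted as a separate migration job.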
 *
 * Return: Pointer to a dma_fence representing the last copy batch, or
 * an error pointer on failure. If there is a failure, any copy operation
 * started by the function call has been synced.
 */
struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
				  struct xe_bo *src_bo,
				  struct xe_bo *dst_bo,
				  struct ttm_resource *src,
				  struct ttm_resource *dst,
				  bool copy_only_ccs)
{
	struct xe_gt *gt = m->tile->primary_gt;
	struct xe_device *xe = gt_to_xe(gt);
	struct dma_fence *fence = NULL;
	u64 size = xe_bo_size(src_bo);
	struct xe_res_cursor src_it, dst_it, ccs_it;
	u64 src_L0_ofs, dst_L0_ofs;
	u32 src_L0_pt, dst_L0_pt;
	u64 src_L0, dst_L0;
	int pass = 0;
	int err;
	bool src_is_pltt = src->mem_type == XE_PL_TT;
	bool dst_is_pltt = dst->mem_type == XE_PL_TT;
	bool src_is_vram = mem_type_is_vram(src->mem_type);
	bool dst_is_vram = mem_type_is_vram(dst->mem_type);
	bool type_device = src_bo->ttm.type == ttm_bo_type_device;
	bool needs_ccs_emit = type_device && xe_migrate_needs_ccs_emit(xe);
	bool copy_ccs = xe_device_has_flat_ccs(xe) &&
		xe_bo_needs_ccs_pages(src_bo) && xe_bo_needs_ccs_pages(dst_bo);
	bool copy_system_ccs = copy_ccs && (!src_is_vram || !dst_is_vram);
	bool use_comp_pat = type_device && xe_device_has_flat_ccs(xe) &&
		GRAPHICS_VER(xe) >= 20 && src_is_vram && !dst_is_vram;

	/* Copying CCS between two different BOs is not supported yet. */
	if (XE_WARN_ON(copy_ccs && src_bo != dst_bo))
		return ERR_PTR(-EINVAL);

	if (src_bo != dst_bo && XE_WARN_ON(xe_bo_size(src_bo) != xe_bo_size(dst_bo)))
		return ERR_PTR(-EINVAL);

	if (!src_is_vram)
		xe_res_first_sg(xe_bo_sg(src_bo), 0, size, &src_it);
	else
		xe_res_first(src, 0, size, &src_it);
	if (!dst_is_vram)
		xe_res_first_sg(xe_bo_sg(dst_bo), 0, size, &dst_it);
	else
		xe_res_first(dst, 0, size, &dst_it);

	if (copy_system_ccs)
		xe_res_first_sg(xe_bo_sg(src_bo), xe_bo_ccs_pages_start(src_bo),
				PAGE_ALIGN(xe_device_ccs_bytes(xe, size)),
				&ccs_it);

	while (size) {
		u32 batch_size = 2; /* arb_clear() + MI_BATCH_BUFFER_END */
		struct xe_sched_job *job;
		struct xe_bb *bb;
		u32 flush_flags = 0;
		u32 update_idx;
		u64 ccs_ofs, ccs_size;
		u32 ccs_pt;
		u32 pte_flags;

		bool usm = xe->info.has_usm;
		u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE;

		src_L0 = xe_migrate_res_sizes(m, &src_it);
		dst_L0 = xe_migrate_res_sizes(m, &dst_it);

		drm_dbg(&xe->drm, "Pass %u, sizes: %llu & %llu\n",
			pass++, src_L0, dst_L0);

		src_L0 = min(src_L0, dst_L0);

		pte_flags = src_is_vram ? PTE_UPDATE_FLAG_IS_VRAM : 0;
		pte_flags |= use_comp_pat ? PTE_UPDATE_FLAG_IS_COMP_PTE : 0;
		batch_size += pte_update_size(m, pte_flags, src, &src_it, &src_L0,
					      &src_L0_ofs, &src_L0_pt, 0, 0,
					      avail_pts);
		if (copy_only_ccs) {
			dst_L0_ofs = src_L0_ofs;
		} else {
			pte_flags = dst_is_vram ? PTE_UPDATE_FLAG_IS_VRAM : 0;
			batch_size += pte_update_size(m, pte_flags, dst,
						      &dst_it, &src_L0,
						      &dst_L0_ofs, &dst_L0_pt,
						      0, avail_pts, avail_pts);
		}

		if (copy_system_ccs) {
			xe_assert(xe, type_device);
			ccs_size = xe_device_ccs_bytes(xe, src_L0);
			batch_size += pte_update_size(m, 0, NULL, &ccs_it, &ccs_size,
						      &ccs_ofs, &ccs_pt, 0,
						      2 * avail_pts,
						      avail_pts);
			xe_assert(xe, IS_ALIGNED(ccs_it.start, PAGE_SIZE));
		}

		/* Add copy commands size here */
		batch_size += ((copy_only_ccs) ? 0 : EMIT_COPY_DW) +
			((needs_ccs_emit ? EMIT_COPY_CCS_DW : 0));

		bb = xe_bb_new(gt, batch_size, usm);
		if (IS_ERR(bb)) {
			err = PTR_ERR(bb);
			goto err_sync;
		}

		if (src_is_vram && xe_migrate_allow_identity(src_L0, &src_it))
			xe_res_next(&src_it, src_L0);
		else
			emit_pte(m, bb, src_L0_pt, src_is_vram, copy_system_ccs || use_comp_pat,
				 &src_it, src_L0, src);

		if (dst_is_vram && xe_migrate_allow_identity(src_L0, &dst_it))
			xe_res_next(&dst_it, src_L0);
		else if (!copy_only_ccs)
			emit_pte(m, bb, dst_L0_pt, dst_is_vram, copy_system_ccs,
				 &dst_it, src_L0, dst);

		if (copy_system_ccs)
			emit_pte(m, bb, ccs_pt, false, false, &ccs_it, ccs_size, src);

		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
		update_idx = bb->len;

		if (!copy_only_ccs)
			emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, src_L0, XE_PAGE_SIZE);

		if (needs_ccs_emit)
			flush_flags = xe_migrate_ccs_copy(m, bb, src_L0_ofs,
							  IS_DGFX(xe) ? src_is_vram : src_is_pltt,
							  dst_L0_ofs,
							  IS_DGFX(xe) ? dst_is_vram : dst_is_pltt,
							  src_L0, ccs_ofs, copy_ccs);

		job = xe_bb_create_migration_job(m->q, bb,
						 xe_migrate_batch_base(m, usm),
						 update_idx);
		if (IS_ERR(job)) {
			err = PTR_ERR(job);
			goto err;
		}

		xe_sched_job_add_migrate_flush(job, flush_flags | MI_INVALIDATE_TLB);
		if (!fence) {
			err = xe_sched_job_add_deps(job, src_bo->ttm.base.resv,
						    DMA_RESV_USAGE_BOOKKEEP);
			if (!err && src_bo->ttm.base.resv != dst_bo->ttm.base.resv)
				err = xe_sched_job_add_deps(job, dst_bo->ttm.base.resv,
							    DMA_RESV_USAGE_BOOKKEEP);
			if (err)
				goto err_job;
		}

		mutex_lock(&m->job_mutex);
		xe_sched_job_arm(job);
		dma_fence_put(fence);
		fence = dma_fence_get(&job->drm.s_fence->finished);
		xe_sched_job_push(job);

		dma_fence_put(m->fence);
		m->fence = dma_fence_get(fence);

		mutex_unlock(&m->job_mutex);

		xe_bb_free(bb, fence);
		size -= src_L0;
		continue;

err_job:
		xe_sched_job_put(job);
err:
		xe_bb_free(bb, NULL);

err_sync:
		/* Sync partial copy if any. FIXME: under job_mutex? */
		if (fence) {
			dma_fence_wait(fence, false);
			dma_fence_put(fence);
		}

		return ERR_PTR(err);
	}

	return fence;
}

/**
 * xe_migrate_lrc() - Get the LRC from migrate context.
 * @migrate: Migrate context.
 *
 * Return: Pointer to LRC on success, error on failure
 */
struct xe_lrc *xe_migrate_lrc(struct xe_migrate *migrate)
{
	return migrate->q->lrc[0];
}

static int emit_flush_invalidate(struct xe_exec_queue *q, u32 *dw, int i,
				 u32 flags)
{
	struct xe_lrc *lrc = xe_exec_queue_lrc(q);

	dw[i++] = MI_FLUSH_DW | MI_INVALIDATE_TLB | MI_FLUSH_DW_OP_STOREDW |
		  MI_FLUSH_IMM_DW | flags;
	dw[i++] = lower_32_bits(xe_lrc_start_seqno_ggtt_addr(lrc)) |
		  MI_FLUSH_DW_USE_GTT;
	dw[i++] = upper_32_bits(xe_lrc_start_seqno_ggtt_addr(lrc));
	dw[i++] = MI_NOOP;
	dw[i++] = MI_NOOP;

	return i;
}

/**
 * xe_migrate_ccs_rw_copy() - Copy CCS content of TTM resources.
 * @tile: Tile whose migration context is to be used.
 * @q: Exec queue to be used along with the migration context.
 * @src_bo: The buffer object @src is currently bound to.
 * @read_write: Whether to create BB commands for a CCS read or a CCS write.
 *
 * Creates batch buffer instructions to copy CCS metadata from the CCS pool to
 * memory and vice versa.
 *
 * This function should only be called for IGPU.
 *
 * Return: 0 if successful, negative error code on failure.
 */
int xe_migrate_ccs_rw_copy(struct xe_tile *tile, struct xe_exec_queue *q,
			   struct xe_bo *src_bo,
			   enum xe_sriov_vf_ccs_rw_ctxs read_write)
{
	bool src_is_pltt = read_write == XE_SRIOV_VF_CCS_READ_CTX;
	bool dst_is_pltt = read_write == XE_SRIOV_VF_CCS_WRITE_CTX;
	struct ttm_resource *src = src_bo->ttm.resource;
	struct xe_migrate *m = tile->migrate;
	struct xe_gt *gt = tile->primary_gt;
	u32 batch_size, batch_size_allocated;
	struct xe_device *xe = gt_to_xe(gt);
	struct xe_res_cursor src_it, ccs_it;
	u64 size = xe_bo_size(src_bo);
	struct xe_bb *bb = NULL;
	u64 src_L0, src_L0_ofs;
	u32 src_L0_pt;
	int err;

	xe_res_first_sg(xe_bo_sg(src_bo), 0, size, &src_it);

	xe_res_first_sg(xe_bo_sg(src_bo), xe_bo_ccs_pages_start(src_bo),
			PAGE_ALIGN(xe_device_ccs_bytes(xe, size)),
			&ccs_it);

	/* Calculate batch buffer size */
	batch_size = 0;
	while (size) {
		batch_size += 10; /* Flush + ggtt addr + 2 NOP */
		u64 ccs_ofs, ccs_size;
		u32 ccs_pt;

		u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE;

		src_L0 = min_t(u64, max_mem_transfer_per_pass(xe), size);

		batch_size += pte_update_size(m, false, src, &src_it, &src_L0,
					      &src_L0_ofs, &src_L0_pt, 0, 0,
					      avail_pts);

		ccs_size = xe_device_ccs_bytes(xe, src_L0);
		batch_size += pte_update_size(m, 0, NULL, &ccs_it, &ccs_size, &ccs_ofs,
					      &ccs_pt, 0, avail_pts, avail_pts);
		xe_assert(xe, IS_ALIGNED(ccs_it.start, PAGE_SIZE));

		/* Add copy commands size here */
		batch_size += EMIT_COPY_CCS_DW;

		size -= src_L0;
	}

	bb = xe_bb_ccs_new(gt, batch_size, read_write);
	if (IS_ERR(bb)) {
		drm_err(&xe->drm, "BB allocation failed.\n");
		err = PTR_ERR(bb);
		goto err_ret;
	}

	batch_size_allocated = batch_size;
	size = xe_bo_size(src_bo);
	batch_size = 0;

	/*
	 * Emit PTE and copy commands here.
	 * The CCS copy command can only support a limited size. If the size to be
	 * copied is more than the limit, divide the copy into chunks. So, calculate
	 * sizes here again before the copy command is emitted.
	 */
	while (size) {
		batch_size += 10; /* Flush + ggtt addr + 2 NOP */
		u32 flush_flags = 0;
		u64 ccs_ofs, ccs_size;
		u32 ccs_pt;

		u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE;

		src_L0 = xe_migrate_res_sizes(m, &src_it);

		batch_size += pte_update_size(m, false, src, &src_it, &src_L0,
					      &src_L0_ofs, &src_L0_pt, 0, 0,
					      avail_pts);

		ccs_size = xe_device_ccs_bytes(xe, src_L0);
		batch_size += pte_update_size(m, 0, NULL, &ccs_it, &ccs_size, &ccs_ofs,
					      &ccs_pt, 0, avail_pts, avail_pts);
		xe_assert(xe, IS_ALIGNED(ccs_it.start, PAGE_SIZE));
		batch_size += EMIT_COPY_CCS_DW;

		emit_pte(m, bb, src_L0_pt, false, true, &src_it, src_L0, src);

		emit_pte(m, bb, ccs_pt, false, false, &ccs_it, ccs_size, src);

		bb->len = emit_flush_invalidate(q, bb->cs, bb->len, flush_flags);
		flush_flags = xe_migrate_ccs_copy(m, bb, src_L0_ofs, src_is_pltt,
						  src_L0_ofs, dst_is_pltt,
						  src_L0, ccs_ofs, true);
		bb->len = emit_flush_invalidate(q, bb->cs, bb->len, flush_flags);

		size -= src_L0;
	}

	xe_assert(xe, (batch_size_allocated == bb->len));
	src_bo->bb_ccs[read_write] = bb;

	return 0;

err_ret:
	return err;
}

/**
 * xe_migrate_exec_queue() - Get the execution queue from migrate context.
 * @migrate: Migrate context.
 *
 * Return: Pointer to execution queue on success, error on failure
 */
struct xe_exec_queue *xe_migrate_exec_queue(struct xe_migrate *migrate)
{
	return migrate->q;
}

static void emit_clear_link_copy(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs,
				 u32 size, u32 pitch)
{
	struct xe_device *xe = gt_to_xe(gt);
	u32 *cs = bb->cs + bb->len;
	u32 len = PVC_MEM_SET_CMD_LEN_DW;

	*cs++ = PVC_MEM_SET_CMD | PVC_MEM_SET_MATRIX | (len - 2);
	*cs++ = pitch - 1;
	*cs++ = (size / pitch) - 1;
	*cs++ = pitch - 1;
	*cs++ = lower_32_bits(src_ofs);
	*cs++ = upper_32_bits(src_ofs);
	if (GRAPHICS_VERx100(xe) >= 2000)
		*cs++ = FIELD_PREP(XE2_MEM_SET_MOCS_INDEX_MASK, gt->mocs.uc_index);
	else
		*cs++ = FIELD_PREP(PVC_MEM_SET_MOCS_INDEX_MASK, gt->mocs.uc_index);

	xe_gt_assert(gt, cs - bb->cs == len + bb->len);

	bb->len += len;
}

static void emit_clear_main_copy(struct xe_gt *gt, struct xe_bb *bb,
				 u64 src_ofs, u32 size, u32 pitch, bool is_vram)
{
	struct xe_device *xe = gt_to_xe(gt);
	u32 *cs = bb->cs + bb->len;
	u32 len = XY_FAST_COLOR_BLT_DW;

	if (GRAPHICS_VERx100(xe) < 1250)
		len = 11;

	*cs++ = XY_FAST_COLOR_BLT_CMD | XY_FAST_COLOR_BLT_DEPTH_32 |
		(len - 2);
	if (GRAPHICS_VERx100(xe) >= 2000)
		*cs++ = FIELD_PREP(XE2_XY_FAST_COLOR_BLT_MOCS_INDEX_MASK, gt->mocs.uc_index) |
			(pitch - 1);
	else
		*cs++ = FIELD_PREP(XY_FAST_COLOR_BLT_MOCS_MASK, gt->mocs.uc_index) |
			(pitch - 1);
	*cs++ = 0;
	*cs++ = (size / pitch) << 16 | pitch / 4;
	*cs++ = lower_32_bits(src_ofs);
	*cs++ = upper_32_bits(src_ofs);
	*cs++ = (is_vram ? 0x0 : 0x1) << XY_FAST_COLOR_BLT_MEM_TYPE_SHIFT;
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = 0;

	if (len > 11) {
		*cs++ = 0;
		*cs++ = 0;
		*cs++ = 0;
		*cs++ = 0;
		*cs++ = 0;
	}

	xe_gt_assert(gt, cs - bb->cs == len + bb->len);

	bb->len += len;
}

static bool has_service_copy_support(struct xe_gt *gt)
{
	/*
	 * What we care about is whether the architecture was designed with
	 * service copy functionality (specifically the new MEM_SET / MEM_COPY
	 * instructions) so check the architectural engine list rather than the
	 * actual list since these instructions are usable on BCS0 even if
	 * all of the actual service copy engines (BCS1-BCS8) have been fused
	 * off.
	 */
	return gt->info.engine_mask & GENMASK(XE_HW_ENGINE_BCS8,
					      XE_HW_ENGINE_BCS1);
}

static u32 emit_clear_cmd_len(struct xe_gt *gt)
{
	if (has_service_copy_support(gt))
		return PVC_MEM_SET_CMD_LEN_DW;
	else
		return XY_FAST_COLOR_BLT_DW;
}

static void emit_clear(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs,
		       u32 size, u32 pitch, bool is_vram)
{
	if (has_service_copy_support(gt))
		emit_clear_link_copy(gt, bb, src_ofs, size, pitch);
	else
		emit_clear_main_copy(gt, bb, src_ofs, size, pitch,
				     is_vram);
}

/**
 * xe_migrate_clear() - Clear content of a TTM resource.
 * @m: The migration context.
 * @bo: The buffer object @dst is currently bound to.
 * @dst: The dst TTM resource to be cleared.
 * @clear_flags: flags to specify which data to clear: CCS, BO, or both.
 *
 * Clear the contents of @dst to zero when XE_MIGRATE_CLEAR_FLAG_BO_DATA is set.
 * On flat CCS devices, the CCS metadata is cleared to zero with XE_MIGRATE_CLEAR_FLAG_CCS_DATA.
 * Set XE_MIGRATE_CLEAR_FLAG_FULL to clear the bo as well as the CCS metadata.
 * TODO: Eliminate the @bo argument.
 *
 * Return: Pointer to a dma_fence representing the last clear batch, or
 * an error pointer on failure. If there is a failure, any clear operation
 * started by the function call has been synced.
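 *
 * Like xe_migrate_copy(), the clear is split into chunks and submitted as
 * separate jobs on the migrate exec queue.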
 */
struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
				   struct xe_bo *bo,
				   struct ttm_resource *dst,
				   u32 clear_flags)
{
	bool clear_vram = mem_type_is_vram(dst->mem_type);
	bool clear_bo_data = XE_MIGRATE_CLEAR_FLAG_BO_DATA & clear_flags;
	bool clear_ccs = XE_MIGRATE_CLEAR_FLAG_CCS_DATA & clear_flags;
	struct xe_gt *gt = m->tile->primary_gt;
	struct xe_device *xe = gt_to_xe(gt);
	bool clear_only_system_ccs = false;
	struct dma_fence *fence = NULL;
	u64 size = xe_bo_size(bo);
	struct xe_res_cursor src_it;
	struct ttm_resource *src = dst;
	int err;

	if (WARN_ON(!clear_bo_data && !clear_ccs))
		return NULL;

	if (!clear_bo_data && clear_ccs && !IS_DGFX(xe))
		clear_only_system_ccs = true;

	if (!clear_vram)
		xe_res_first_sg(xe_bo_sg(bo), 0, xe_bo_size(bo), &src_it);
	else
		xe_res_first(src, 0, xe_bo_size(bo), &src_it);

	while (size) {
		u64 clear_L0_ofs;
		u32 clear_L0_pt;
		u32 flush_flags = 0;
		u64 clear_L0;
		struct xe_sched_job *job;
		struct xe_bb *bb;
		u32 batch_size, update_idx;
		u32 pte_flags;

		bool usm = xe->info.has_usm;
		u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE;

		clear_L0 = xe_migrate_res_sizes(m, &src_it);

		/* Calculate final sizes and batch size.. */
		pte_flags = clear_vram ? PTE_UPDATE_FLAG_IS_VRAM : 0;
		batch_size = 2 +
			pte_update_size(m, pte_flags, src, &src_it,
					&clear_L0, &clear_L0_ofs, &clear_L0_pt,
					clear_bo_data ? emit_clear_cmd_len(gt) : 0, 0,
					avail_pts);

		if (xe_migrate_needs_ccs_emit(xe))
			batch_size += EMIT_COPY_CCS_DW;

		/* Clear commands */

		if (WARN_ON_ONCE(!clear_L0))
			break;

		bb = xe_bb_new(gt, batch_size, usm);
		if (IS_ERR(bb)) {
			err = PTR_ERR(bb);
			goto err_sync;
		}

		size -= clear_L0;
		/* Preemption is enabled again by the ring ops. */
		if (clear_vram && xe_migrate_allow_identity(clear_L0, &src_it)) {
			xe_res_next(&src_it, clear_L0);
		} else {
			emit_pte(m, bb, clear_L0_pt, clear_vram,
				 clear_only_system_ccs, &src_it, clear_L0, dst);
			flush_flags |= MI_INVALIDATE_TLB;
		}

		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
		update_idx = bb->len;

		if (clear_bo_data)
			emit_clear(gt, bb, clear_L0_ofs, clear_L0, XE_PAGE_SIZE, clear_vram);

		if (xe_migrate_needs_ccs_emit(xe)) {
			emit_copy_ccs(gt, bb, clear_L0_ofs, true,
				      m->cleared_mem_ofs, false, clear_L0);
			flush_flags |= MI_FLUSH_DW_CCS;
		}

		job = xe_bb_create_migration_job(m->q, bb,
						 xe_migrate_batch_base(m, usm),
						 update_idx);
		if (IS_ERR(job)) {
			err = PTR_ERR(job);
			goto err;
		}

		xe_sched_job_add_migrate_flush(job, flush_flags);
		if (!fence) {
			/*
			 * There can't be anything userspace related at this
			 * point, so we just need to respect any potential move
			 * fences, which are always tracked as
			 * DMA_RESV_USAGE_KERNEL.
			 */
			err = xe_sched_job_add_deps(job, bo->ttm.base.resv,
						    DMA_RESV_USAGE_KERNEL);
			if (err)
				goto err_job;
		}

		mutex_lock(&m->job_mutex);
		xe_sched_job_arm(job);
		dma_fence_put(fence);
		fence = dma_fence_get(&job->drm.s_fence->finished);
		xe_sched_job_push(job);

		dma_fence_put(m->fence);
		m->fence = dma_fence_get(fence);

		mutex_unlock(&m->job_mutex);

		xe_bb_free(bb, fence);
		continue;

err_job:
		xe_sched_job_put(job);
err:
		xe_bb_free(bb, NULL);
err_sync:
		/* Sync partial copies if any. FIXME: job_mutex? */
		if (fence) {
			dma_fence_wait(fence, false);
			dma_fence_put(fence);
		}

		return ERR_PTR(err);
	}

	if (clear_ccs)
		bo->ccs_cleared = true;

	return fence;
}

static void write_pgtable(struct xe_tile *tile, struct xe_bb *bb, u64 ppgtt_ofs,
			  const struct xe_vm_pgtable_update_op *pt_op,
			  const struct xe_vm_pgtable_update *update,
			  struct xe_migrate_pt_update *pt_update)
{
	const struct xe_migrate_pt_update_ops *ops = pt_update->ops;
	u32 chunk;
	u32 ofs = update->ofs, size = update->qwords;

	/*
	 * If we have 512 entries (max), we would populate it ourselves,
	 * and update the PDE above it to the new pointer.
	 * The only time this can happen is if we have to update the top
	 * PDE. This requires a BO that is almost vm->size big.
	 *
	 * This shouldn't be possible in practice.. might change when 16K
	 * pages are used. Hence the assert.
	 */
	xe_tile_assert(tile, update->qwords < MAX_NUM_PTE);
	if (!ppgtt_ofs)
		ppgtt_ofs = xe_migrate_vram_ofs(tile_to_xe(tile),
						xe_bo_addr(update->pt_bo, 0,
							   XE_PAGE_SIZE), false);

	do {
		u64 addr = ppgtt_ofs + ofs * 8;

		chunk = min(size, MAX_PTE_PER_SDI);

		/* Ensure populatefn can do memset64 by aligning bb->cs */
		if (!(bb->len & 1))
			bb->cs[bb->len++] = MI_NOOP;

		bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk);
		bb->cs[bb->len++] = lower_32_bits(addr);
		bb->cs[bb->len++] = upper_32_bits(addr);
		if (pt_op->bind)
			ops->populate(pt_update, tile, NULL, bb->cs + bb->len,
				      ofs, chunk, update);
		else
			ops->clear(pt_update, tile, NULL, bb->cs + bb->len,
				   ofs, chunk, update);

		bb->len += chunk * 2;
		ofs += chunk;
		size -= chunk;
	} while (size);
}

struct xe_vm *xe_migrate_get_vm(struct xe_migrate *m)
{
	return xe_vm_get(m->q->vm);
}

#if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST)
struct migrate_test_params {
	struct xe_test_priv base;
	bool force_gpu;
};

#define to_migrate_test_params(_priv) \
	container_of(_priv, struct migrate_test_params, base)
#endif

static struct dma_fence *
xe_migrate_update_pgtables_cpu(struct xe_migrate *m,
			       struct xe_migrate_pt_update *pt_update)
{
	XE_TEST_DECLARE(struct migrate_test_params *test =
				to_migrate_test_params
				(xe_cur_kunit_priv(XE_TEST_LIVE_MIGRATE));)
	const struct xe_migrate_pt_update_ops *ops = pt_update->ops;
	struct xe_vm *vm = pt_update->vops->vm;
	struct xe_vm_pgtable_update_ops *pt_update_ops =
		&pt_update->vops->pt_update_ops[pt_update->tile_id];
	int err;
	u32 i, j;

	if (XE_TEST_ONLY(test && test->force_gpu))
		return ERR_PTR(-ETIME);

	if (ops->pre_commit) {
		pt_update->job = NULL;
		err = ops->pre_commit(pt_update);
		if (err)
			return ERR_PTR(err);
	}

	for (i = 0; i < pt_update_ops->num_ops; ++i) {
		const struct xe_vm_pgtable_update_op *pt_op =
			&pt_update_ops->ops[i];

		for (j = 0; j < pt_op->num_entries; j++) {
			const struct xe_vm_pgtable_update *update =
				&pt_op->entries[j];

			if (pt_op->bind)
				ops->populate(pt_update, m->tile,
					      &update->pt_bo->vmap, NULL,
					      update->ofs, update->qwords,
					      update);
			else
				ops->clear(pt_update, m->tile,
					   &update->pt_bo->vmap, NULL,
					   update->ofs, update->qwords, update);
		}
	}

	trace_xe_vm_cpu_bind(vm);
	xe_device_wmb(vm->xe);

	return dma_fence_get_stub();
}

static struct dma_fence *
__xe_migrate_update_pgtables(struct xe_migrate *m,
			     struct xe_migrate_pt_update *pt_update,
			     struct xe_vm_pgtable_update_ops *pt_update_ops)
{
	const struct xe_migrate_pt_update_ops *ops = pt_update->ops;
	struct xe_tile *tile = m->tile;
	struct xe_gt *gt = tile->primary_gt;
	struct xe_device *xe = tile_to_xe(tile);
	struct xe_sched_job *job;
	struct dma_fence *fence;
	struct drm_suballoc *sa_bo = NULL;
	struct xe_bb *bb;
	u32 i, j, batch_size = 0, ppgtt_ofs, update_idx, page_ofs = 0;
	u32 num_updates = 0, current_update = 0;
	u64 addr;
	int err = 0;
	bool is_migrate = pt_update_ops->q == m->q;
	bool usm = is_migrate && xe->info.has_usm;

	for (i = 0; i < pt_update_ops->num_ops; ++i) {
		struct xe_vm_pgtable_update_op *pt_op = &pt_update_ops->ops[i];
		struct xe_vm_pgtable_update *updates = pt_op->entries;

		num_updates += pt_op->num_entries;
		for (j = 0; j < pt_op->num_entries; ++j) {
			u32 num_cmds = DIV_ROUND_UP(updates[j].qwords,
						    MAX_PTE_PER_SDI);

			/* align noop + MI_STORE_DATA_IMM cmd prefix */
			batch_size += 4 * num_cmds + updates[j].qwords * 2;
		}
	}

	/* fixed + PTE entries */
	if (IS_DGFX(xe))
		batch_size += 2;
	else
		batch_size += 6 * (num_updates / MAX_PTE_PER_SDI + 1) +
			num_updates * 2;

	bb = xe_bb_new(gt, batch_size, usm);
	if (IS_ERR(bb))
		return ERR_CAST(bb);

	/*
	 * For sysmem PTE's, need to map them in our hole..
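	 *
	 * The page-table BOs are mapped through the migrate VM's reserved
	 * kernel PDE slot when the update runs on the migrate exec queue,
	 * or through a VM-update suballocation for other exec queues.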
	 */
	if (!IS_DGFX(xe)) {
		u16 pat_index = xe->pat.idx[XE_CACHE_WB];
		u32 ptes, ofs;

		ppgtt_ofs = NUM_KERNEL_PDE - 1;
		if (!is_migrate) {
			u32 num_units = DIV_ROUND_UP(num_updates,
						     NUM_VMUSA_WRITES_PER_UNIT);

			if (num_units > m->vm_update_sa.size) {
				err = -ENOBUFS;
				goto err_bb;
			}
			sa_bo = drm_suballoc_new(&m->vm_update_sa, num_units,
						 GFP_KERNEL, true, 0);
			if (IS_ERR(sa_bo)) {
				err = PTR_ERR(sa_bo);
				goto err_bb;
			}

			ppgtt_ofs = NUM_KERNEL_PDE +
				(drm_suballoc_soffset(sa_bo) /
				 NUM_VMUSA_UNIT_PER_PAGE);
			page_ofs = (drm_suballoc_soffset(sa_bo) %
				    NUM_VMUSA_UNIT_PER_PAGE) *
				VM_SA_UPDATE_UNIT_SIZE;
		}

		/* Map our PT's to gtt */
		i = 0;
		j = 0;
		ptes = num_updates;
		ofs = ppgtt_ofs * XE_PAGE_SIZE + page_ofs;
		while (ptes) {
			u32 chunk = min(MAX_PTE_PER_SDI, ptes);
			u32 idx = 0;

			bb->cs[bb->len++] = MI_STORE_DATA_IMM |
				MI_SDI_NUM_QW(chunk);
			bb->cs[bb->len++] = ofs;
			bb->cs[bb->len++] = 0; /* upper_32_bits */

			for (; i < pt_update_ops->num_ops; ++i) {
				struct xe_vm_pgtable_update_op *pt_op =
					&pt_update_ops->ops[i];
				struct xe_vm_pgtable_update *updates = pt_op->entries;

				for (; j < pt_op->num_entries; ++j, ++current_update, ++idx) {
					struct xe_vm *vm = pt_update->vops->vm;
					struct xe_bo *pt_bo = updates[j].pt_bo;

					if (idx == chunk)
						goto next_cmd;

					xe_tile_assert(tile, xe_bo_size(pt_bo) == SZ_4K);

					/* Map a PT at most once */
					if (pt_bo->update_index < 0)
						pt_bo->update_index = current_update;

					addr = vm->pt_ops->pte_encode_bo(pt_bo, 0,
									 pat_index, 0);
					bb->cs[bb->len++] = lower_32_bits(addr);
					bb->cs[bb->len++] = upper_32_bits(addr);
				}

				j = 0;
			}

next_cmd:
			ptes -= chunk;
			ofs += chunk * sizeof(u64);
		}

		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
		update_idx = bb->len;

		addr = xe_migrate_vm_addr(ppgtt_ofs, 0) +
			(page_ofs / sizeof(u64)) * XE_PAGE_SIZE;
		for (i = 0; i < pt_update_ops->num_ops; ++i) {
			struct xe_vm_pgtable_update_op *pt_op =
				&pt_update_ops->ops[i];
			struct xe_vm_pgtable_update *updates = pt_op->entries;

			for (j = 0; j < pt_op->num_entries; ++j) {
				struct xe_bo *pt_bo = updates[j].pt_bo;

				write_pgtable(tile, bb, addr +
					      pt_bo->update_index * XE_PAGE_SIZE,
					      pt_op, &updates[j], pt_update);
			}
		}
	} else {
		/* phys pages, no preamble required */
		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
		update_idx = bb->len;

		for (i = 0; i < pt_update_ops->num_ops; ++i) {
			struct xe_vm_pgtable_update_op *pt_op =
				&pt_update_ops->ops[i];
			struct xe_vm_pgtable_update *updates = pt_op->entries;

			for (j = 0; j < pt_op->num_entries; ++j)
				write_pgtable(tile, bb, 0, pt_op, &updates[j],
					      pt_update);
		}
	}

	job = xe_bb_create_migration_job(pt_update_ops->q, bb,
					 xe_migrate_batch_base(m, usm),
					 update_idx);
	if (IS_ERR(job)) {
		err = PTR_ERR(job);
		goto err_sa;
	}

	xe_sched_job_add_migrate_flush(job, MI_INVALIDATE_TLB);

	if (ops->pre_commit) {
		pt_update->job = job;
		err = ops->pre_commit(pt_update);
		if (err)
			goto err_job;
	}
	if (is_migrate)
		mutex_lock(&m->job_mutex);

	xe_sched_job_arm(job);
	fence = dma_fence_get(&job->drm.s_fence->finished);
	xe_sched_job_push(job);

	if (is_migrate)
		mutex_unlock(&m->job_mutex);

	xe_bb_free(bb, fence);
	drm_suballoc_free(sa_bo, fence);

	return fence;

err_job:
	xe_sched_job_put(job);
err_sa:
	drm_suballoc_free(sa_bo, NULL);
err_bb:
	xe_bb_free(bb, NULL);
	return ERR_PTR(err);
}

/**
 * xe_migrate_update_pgtables() - Pipelined page-table update
 * @m: The migrate context.
 * @pt_update: PT update arguments
 *
 * Perform a pipelined page-table update. The update descriptors are typically
 * built under the same lock critical section as a call to this function. If
 * using the default engine for the updates, they will be performed in the
 * order they grab the job_mutex. If different engines are used, external
 * synchronization is needed for overlapping updates to maintain page-table
 * consistency. Note that the meaning of "overlapping" is that the updates
 * touch the same page-table, which might be a higher-level page-directory.
 * If no pipelining is needed, then updates may be performed by the cpu.
 *
 * Return: A dma_fence that, when signaled, indicates the update completion.
 */
struct dma_fence *
xe_migrate_update_pgtables(struct xe_migrate *m,
			   struct xe_migrate_pt_update *pt_update)
{
	struct xe_vm_pgtable_update_ops *pt_update_ops =
		&pt_update->vops->pt_update_ops[pt_update->tile_id];
	struct dma_fence *fence;

	fence = xe_migrate_update_pgtables_cpu(m, pt_update);

	/* -ETIME indicates a job is needed, anything else is a legitimate error */
	if (!IS_ERR(fence) || PTR_ERR(fence) != -ETIME)
		return fence;

	return __xe_migrate_update_pgtables(m, pt_update, pt_update_ops);
}

/**
 * xe_migrate_wait() - Complete all operations using the xe_migrate context
 * @m: Migrate context to wait for.
 *
 * Waits until the GPU no longer uses the migrate context's default engine
 * or its page-table objects. FIXME: What about separate page-table update
 * engines?
 */
void xe_migrate_wait(struct xe_migrate *m)
{
	if (m->fence)
		dma_fence_wait(m->fence, false);
}

static u32 pte_update_cmd_size(u64 size)
{
	u32 num_dword;
	u64 entries = DIV_U64_ROUND_UP(size, XE_PAGE_SIZE);

	XE_WARN_ON(size > MAX_PREEMPTDISABLE_TRANSFER);

	/*
	 * MI_STORE_DATA_IMM command is used to update the page table. Each
	 * instruction can update at most MAX_PTE_PER_SDI PTE entries.
	 * To update n (n <= MAX_PTE_PER_SDI) PTE entries, we need:
	 *
	 * - 1 dword for the MI_STORE_DATA_IMM command header (opcode etc)
	 * - 2 dword for the page table's physical location
	 * - 2*n dword for value of pte to fill (each pte entry is 2 dwords)
	 */
	num_dword = (1 + 2) * DIV_U64_ROUND_UP(entries, MAX_PTE_PER_SDI);
	num_dword += entries * 2;

	return num_dword;
}

static void build_pt_update_batch_sram(struct xe_migrate *m,
				       struct xe_bb *bb, u32 pt_offset,
				       struct drm_pagemap_addr *sram_addr,
				       u32 size)
{
	u16 pat_index = tile_to_xe(m->tile)->pat.idx[XE_CACHE_WB];
	u32 ptes;
	int i = 0;

	ptes = DIV_ROUND_UP(size, XE_PAGE_SIZE);
	while (ptes) {
		u32 chunk = min(MAX_PTE_PER_SDI, ptes);

		bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk);
		bb->cs[bb->len++] = pt_offset;
		bb->cs[bb->len++] = 0;

		pt_offset += chunk * 8;
		ptes -= chunk;

		while (chunk--) {
			u64 addr = sram_addr[i].addr & PAGE_MASK;

			xe_tile_assert(m->tile, sram_addr[i].proto ==
				       DRM_INTERCONNECT_SYSTEM);
			xe_tile_assert(m->tile, addr);
			addr = m->q->vm->pt_ops->pte_encode_addr(m->tile->xe,
								 addr, pat_index,
								 0, false, 0);
			bb->cs[bb->len++] = lower_32_bits(addr);
			bb->cs[bb->len++] = upper_32_bits(addr);

			i++;
		}
	}
}

enum xe_migrate_copy_dir {
	XE_MIGRATE_COPY_TO_VRAM,
	XE_MIGRATE_COPY_TO_SRAM,
};

#define XE_CACHELINE_BYTES 64ull
#define XE_CACHELINE_MASK (XE_CACHELINE_BYTES - 1)

static struct dma_fence *xe_migrate_vram(struct xe_migrate *m,
					 unsigned long len,
					 unsigned long sram_offset,
					 struct drm_pagemap_addr *sram_addr,
					 u64 vram_addr,
					 const enum xe_migrate_copy_dir dir)
{
	struct xe_gt *gt = m->tile->primary_gt;
	struct xe_device *xe = gt_to_xe(gt);
	bool use_usm_batch = xe->info.has_usm;
	struct dma_fence *fence = NULL;
	u32 batch_size = 2;
	u64 src_L0_ofs, dst_L0_ofs;
	struct xe_sched_job *job;
	struct xe_bb *bb;
	u32 update_idx, pt_slot = 0;
	unsigned long npages = DIV_ROUND_UP(len + sram_offset, PAGE_SIZE);
	unsigned int pitch = len >= PAGE_SIZE && !(len & ~PAGE_MASK) ?
		PAGE_SIZE : 4;
	int err;
	unsigned long i, j;

	if (drm_WARN_ON(&xe->drm, (len & XE_CACHELINE_MASK) ||
			(sram_offset | vram_addr) & XE_CACHELINE_MASK))
		return ERR_PTR(-EOPNOTSUPP);

	xe_assert(xe, npages * PAGE_SIZE <= MAX_PREEMPTDISABLE_TRANSFER);

	batch_size += pte_update_cmd_size(len);
	batch_size += EMIT_COPY_DW;

	bb = xe_bb_new(gt, batch_size, use_usm_batch);
	if (IS_ERR(bb)) {
		err = PTR_ERR(bb);
		return ERR_PTR(err);
	}

	/*
	 * If the order of a struct drm_pagemap_addr entry is greater than 0,
	 * the entry is populated by GPU pagemap but subsequent entries within
	 * the range of that order are not populated.
	 * build_pt_update_batch_sram() expects a fully populated array of
	 * struct drm_pagemap_addr. Ensure this is the case even with higher
	 * orders.
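	 *
	 * E.g. for a higher-order entry, the loop below fills the following
	 * slots with consecutive PAGE_SIZE offsets from the first address.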
static void build_pt_update_batch_sram(struct xe_migrate *m,
				       struct xe_bb *bb, u32 pt_offset,
				       struct drm_pagemap_addr *sram_addr,
				       u32 size)
{
	u16 pat_index = tile_to_xe(m->tile)->pat.idx[XE_CACHE_WB];
	u32 ptes;
	int i = 0;

	ptes = DIV_ROUND_UP(size, XE_PAGE_SIZE);
	while (ptes) {
		u32 chunk = min(MAX_PTE_PER_SDI, ptes);

		bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk);
		bb->cs[bb->len++] = pt_offset;
		bb->cs[bb->len++] = 0;

		pt_offset += chunk * 8;
		ptes -= chunk;

		while (chunk--) {
			u64 addr = sram_addr[i].addr & PAGE_MASK;

			xe_tile_assert(m->tile, sram_addr[i].proto ==
				       DRM_INTERCONNECT_SYSTEM);
			xe_tile_assert(m->tile, addr);
			addr = m->q->vm->pt_ops->pte_encode_addr(m->tile->xe,
								 addr, pat_index,
								 0, false, 0);
			bb->cs[bb->len++] = lower_32_bits(addr);
			bb->cs[bb->len++] = upper_32_bits(addr);

			i++;
		}
	}
}

enum xe_migrate_copy_dir {
	XE_MIGRATE_COPY_TO_VRAM,
	XE_MIGRATE_COPY_TO_SRAM,
};

#define XE_CACHELINE_BYTES 64ull
#define XE_CACHELINE_MASK (XE_CACHELINE_BYTES - 1)

static struct dma_fence *xe_migrate_vram(struct xe_migrate *m,
					 unsigned long len,
					 unsigned long sram_offset,
					 struct drm_pagemap_addr *sram_addr,
					 u64 vram_addr,
					 const enum xe_migrate_copy_dir dir)
{
	struct xe_gt *gt = m->tile->primary_gt;
	struct xe_device *xe = gt_to_xe(gt);
	bool use_usm_batch = xe->info.has_usm;
	struct dma_fence *fence = NULL;
	u32 batch_size = 2;
	u64 src_L0_ofs, dst_L0_ofs;
	struct xe_sched_job *job;
	struct xe_bb *bb;
	u32 update_idx, pt_slot = 0;
	unsigned long npages = DIV_ROUND_UP(len + sram_offset, PAGE_SIZE);
	unsigned int pitch = len >= PAGE_SIZE && !(len & ~PAGE_MASK) ?
		PAGE_SIZE : 4;
	int err;
	unsigned long i, j;

	if (drm_WARN_ON(&xe->drm, (len & XE_CACHELINE_MASK) ||
			(sram_offset | vram_addr) & XE_CACHELINE_MASK))
		return ERR_PTR(-EOPNOTSUPP);

	xe_assert(xe, npages * PAGE_SIZE <= MAX_PREEMPTDISABLE_TRANSFER);

	batch_size += pte_update_cmd_size(len);
	batch_size += EMIT_COPY_DW;

	bb = xe_bb_new(gt, batch_size, use_usm_batch);
	if (IS_ERR(bb)) {
		err = PTR_ERR(bb);
		return ERR_PTR(err);
	}

	/*
	 * If the order of a struct drm_pagemap_addr entry is greater than 0,
	 * the entry is populated by the GPU pagemap but subsequent entries
	 * within the range of that order are not populated.
	 * build_pt_update_batch_sram() expects a fully populated array of
	 * struct drm_pagemap_addr. Ensure this is the case even with higher
	 * orders.
	 */
	for (i = 0; i < npages;) {
		unsigned int order = sram_addr[i].order;

		for (j = 1; j < NR_PAGES(order) && i + j < npages; j++)
			if (!sram_addr[i + j].addr)
				sram_addr[i + j].addr = sram_addr[i].addr + j * PAGE_SIZE;

		i += NR_PAGES(order);
	}

	build_pt_update_batch_sram(m, bb, pt_slot * XE_PAGE_SIZE,
				   sram_addr, len + sram_offset);

	if (dir == XE_MIGRATE_COPY_TO_VRAM) {
		src_L0_ofs = xe_migrate_vm_addr(pt_slot, 0) + sram_offset;
		dst_L0_ofs = xe_migrate_vram_ofs(xe, vram_addr, false);
	} else {
		src_L0_ofs = xe_migrate_vram_ofs(xe, vram_addr, false);
		dst_L0_ofs = xe_migrate_vm_addr(pt_slot, 0) + sram_offset;
	}

	bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
	update_idx = bb->len;

	emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, len, pitch);

	job = xe_bb_create_migration_job(m->q, bb,
					 xe_migrate_batch_base(m, use_usm_batch),
					 update_idx);
	if (IS_ERR(job)) {
		err = PTR_ERR(job);
		goto err;
	}

	xe_sched_job_add_migrate_flush(job, MI_INVALIDATE_TLB);

	mutex_lock(&m->job_mutex);
	xe_sched_job_arm(job);
	fence = dma_fence_get(&job->drm.s_fence->finished);
	xe_sched_job_push(job);

	dma_fence_put(m->fence);
	m->fence = dma_fence_get(fence);
	mutex_unlock(&m->job_mutex);

	xe_bb_free(bb, fence);

	return fence;

err:
	xe_bb_free(bb, NULL);

	return ERR_PTR(err);
}

/**
 * xe_migrate_to_vram() - Migrate to VRAM
 * @m: The migration context.
 * @npages: Number of pages to migrate.
 * @src_addr: Array of DMA information (source of migrate)
 * @dst_addr: Device physical address of VRAM (destination of migrate)
 *
 * Copy from an array of DMA addresses to a VRAM device physical address.
 *
 * Return: dma fence for migrate to signal completion on success, ERR_PTR on
 * failure
 */
struct dma_fence *xe_migrate_to_vram(struct xe_migrate *m,
				     unsigned long npages,
				     struct drm_pagemap_addr *src_addr,
				     u64 dst_addr)
{
	return xe_migrate_vram(m, npages * PAGE_SIZE, 0, src_addr, dst_addr,
			       XE_MIGRATE_COPY_TO_VRAM);
}

/**
 * xe_migrate_from_vram() - Migrate from VRAM
 * @m: The migration context.
 * @npages: Number of pages to migrate.
 * @src_addr: Device physical address of VRAM (source of migrate)
 * @dst_addr: Array of DMA information (destination of migrate)
 *
 * Copy from a VRAM device physical address to an array of DMA addresses.
 *
 * Return: dma fence for migrate to signal completion on success, ERR_PTR on
 * failure
 */
struct dma_fence *xe_migrate_from_vram(struct xe_migrate *m,
				       unsigned long npages,
				       u64 src_addr,
				       struct drm_pagemap_addr *dst_addr)
{
	return xe_migrate_vram(m, npages * PAGE_SIZE, 0, dst_addr, src_addr,
			       XE_MIGRATE_COPY_TO_SRAM);
}
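/*
 * Illustrative sketch only, not part of the driver: a typical caller of the
 * two helpers above, assuming @src_addr holds npages dma-mapped system pages
 * and @vram_dpa is a hypothetical device physical VRAM address:
 *
 *	fence = xe_migrate_to_vram(m, npages, src_addr, vram_dpa);
 *	if (IS_ERR(fence))
 *		return PTR_ERR(fence);
 *	dma_fence_wait(fence, false);
 *	dma_fence_put(fence);
 *
 * Copying back out of VRAM works the same way through
 * xe_migrate_from_vram(), with the address array as the destination.
 */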
static void xe_migrate_dma_unmap(struct xe_device *xe,
				 struct drm_pagemap_addr *pagemap_addr,
				 int len, int write)
{
	unsigned long i, npages = DIV_ROUND_UP(len, PAGE_SIZE);

	for (i = 0; i < npages; ++i) {
		if (!pagemap_addr[i].addr)
			break;

		dma_unmap_page(xe->drm.dev, pagemap_addr[i].addr, PAGE_SIZE,
			       write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
	}
	kfree(pagemap_addr);
}

static struct drm_pagemap_addr *xe_migrate_dma_map(struct xe_device *xe,
						   void *buf, int len,
						   int write)
{
	struct drm_pagemap_addr *pagemap_addr;
	unsigned long i, npages = DIV_ROUND_UP(len, PAGE_SIZE);

	pagemap_addr = kcalloc(npages, sizeof(*pagemap_addr), GFP_KERNEL);
	if (!pagemap_addr)
		return ERR_PTR(-ENOMEM);

	for (i = 0; i < npages; ++i) {
		dma_addr_t addr;
		struct page *page;
		enum dma_data_direction dir = write ? DMA_TO_DEVICE :
			DMA_FROM_DEVICE;

		if (is_vmalloc_addr(buf))
			page = vmalloc_to_page(buf);
		else
			page = virt_to_page(buf);

		addr = dma_map_page(xe->drm.dev, page, 0, PAGE_SIZE, dir);
		if (dma_mapping_error(xe->drm.dev, addr))
			goto err_fault;

		pagemap_addr[i] =
			drm_pagemap_addr_encode(addr,
						DRM_INTERCONNECT_SYSTEM,
						0, dir);
		buf += PAGE_SIZE;
	}

	return pagemap_addr;

err_fault:
	xe_migrate_dma_unmap(xe, pagemap_addr, len, write);
	return ERR_PTR(-EFAULT);
}
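/*
 * Note on the two helpers above: xe_migrate_dma_map() zero-allocates the
 * drm_pagemap_addr array with kcalloc(), and xe_migrate_dma_unmap() stops at
 * the first zero dma address. A partially mapped array can therefore be
 * handed straight back to xe_migrate_dma_unmap() on the error path without
 * tracking how many pages were successfully mapped.
 */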
/**
 * xe_migrate_access_memory - Access memory of a BO via GPU
 *
 * @m: The migration context.
 * @bo: buffer object
 * @offset: access offset into buffer object
 * @buf: pointer to caller memory to read into or write from
 * @len: length of access
 * @write: write access
 *
 * Access memory of a BO via the GPU, either reading into or writing from a
 * passed-in pointer. The pointer is DMA-mapped for GPU access and GPU copy
 * commands are issued to transfer data between it and the BO.
 *
 * Returns:
 * 0 if successful, negative error code on failure.
 */
int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo,
			     unsigned long offset, void *buf, int len,
			     int write)
{
	struct xe_tile *tile = m->tile;
	struct xe_device *xe = tile_to_xe(tile);
	struct xe_res_cursor cursor;
	struct dma_fence *fence = NULL;
	struct drm_pagemap_addr *pagemap_addr;
	unsigned long page_offset = (unsigned long)buf & ~PAGE_MASK;
	int bytes_left = len, current_page = 0;
	void *orig_buf = buf;

	xe_bo_assert_held(bo);

	/* Use a bounce buffer for small and unaligned accesses */
	if (!IS_ALIGNED(len, XE_CACHELINE_BYTES) ||
	    !IS_ALIGNED((unsigned long)buf + offset, XE_CACHELINE_BYTES)) {
		int buf_offset = 0;
		void *bounce;
		int err;

		BUILD_BUG_ON(!is_power_of_2(XE_CACHELINE_BYTES));
		bounce = kmalloc(XE_CACHELINE_BYTES, GFP_KERNEL);
		if (!bounce)
			return -ENOMEM;

		/*
		 * Less than ideal for large unaligned accesses, but this
		 * should be fairly rare; can fix up if it becomes common.
		 */
		do {
			int copy_bytes = min_t(int, bytes_left,
					       XE_CACHELINE_BYTES -
					       (offset & XE_CACHELINE_MASK));
			int ptr_offset = offset & XE_CACHELINE_MASK;

			err = xe_migrate_access_memory(m, bo,
						       offset &
						       ~XE_CACHELINE_MASK,
						       bounce,
						       XE_CACHELINE_BYTES, 0);
			if (err)
				break;

			if (write) {
				memcpy(bounce + ptr_offset, buf + buf_offset, copy_bytes);

				err = xe_migrate_access_memory(m, bo,
							       offset & ~XE_CACHELINE_MASK,
							       bounce,
							       XE_CACHELINE_BYTES, write);
				if (err)
					break;
			} else {
				memcpy(buf + buf_offset, bounce + ptr_offset,
				       copy_bytes);
			}

			bytes_left -= copy_bytes;
			buf_offset += copy_bytes;
			offset += copy_bytes;
		} while (bytes_left);

		kfree(bounce);
		return err;
	}

	pagemap_addr = xe_migrate_dma_map(xe, buf, len + page_offset, write);
	if (IS_ERR(pagemap_addr))
		return PTR_ERR(pagemap_addr);

	xe_res_first(bo->ttm.resource, offset, xe_bo_size(bo) - offset, &cursor);

	do {
		struct dma_fence *__fence;
		u64 vram_addr = vram_region_gpu_offset(bo->ttm.resource) +
			cursor.start;
		int current_bytes;

		if (cursor.size > MAX_PREEMPTDISABLE_TRANSFER)
			current_bytes = min_t(int, bytes_left,
					      MAX_PREEMPTDISABLE_TRANSFER);
		else
			current_bytes = min_t(int, bytes_left, cursor.size);

		if (current_bytes & ~PAGE_MASK) {
			int pitch = 4;

			current_bytes = min_t(int, current_bytes, S16_MAX * pitch);
		}

		__fence = xe_migrate_vram(m, current_bytes,
					  (unsigned long)buf & ~PAGE_MASK,
					  &pagemap_addr[current_page],
					  vram_addr, write ?
					  XE_MIGRATE_COPY_TO_VRAM :
					  XE_MIGRATE_COPY_TO_SRAM);
		if (IS_ERR(__fence)) {
			if (fence) {
				dma_fence_wait(fence, false);
				dma_fence_put(fence);
			}
			fence = __fence;
			goto out_err;
		}

		dma_fence_put(fence);
		fence = __fence;

		buf += current_bytes;
		offset += current_bytes;
		current_page = (int)(buf - orig_buf) / PAGE_SIZE;
		bytes_left -= current_bytes;
		if (bytes_left)
			xe_res_next(&cursor, current_bytes);
	} while (bytes_left);

	dma_fence_wait(fence, false);
	dma_fence_put(fence);

out_err:
	xe_migrate_dma_unmap(xe, pagemap_addr, len + page_offset, write);
	return IS_ERR(fence) ? PTR_ERR(fence) : 0;
}

/**
 * xe_migrate_job_lock() - Lock migrate job lock
 * @m: The migration context.
 * @q: Queue associated with the operation which requires a lock
 *
 * Lock the migrate job lock if the queue is a migration queue, otherwise
 * assert the VM's dma-resv is held (user queues have their own locking).
 */
void xe_migrate_job_lock(struct xe_migrate *m, struct xe_exec_queue *q)
{
	bool is_migrate = q == m->q;

	if (is_migrate)
		mutex_lock(&m->job_mutex);
	else
		xe_vm_assert_held(q->vm); /* User queue VMs should be locked */
}

/**
 * xe_migrate_job_unlock() - Unlock migrate job lock
 * @m: The migration context.
 * @q: Queue associated with the operation which requires a lock
 *
 * Unlock the migrate job lock if the queue is a migration queue, otherwise
 * assert the VM's dma-resv is held (user queues have their own locking).
 */
void xe_migrate_job_unlock(struct xe_migrate *m, struct xe_exec_queue *q)
{
	bool is_migrate = q == m->q;

	if (is_migrate)
		mutex_unlock(&m->job_mutex);
	else
		xe_vm_assert_held(q->vm); /* User queue VMs should be locked */
}
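/*
 * Illustrative sketch only, not part of the driver: submission paths that
 * arm and push jobs against either the migrate queue or a user queue can
 * bracket that step with the two helpers above so the migrate queue's
 * timeline mutex is respected:
 *
 *	xe_migrate_job_lock(m, q);
 *	xe_sched_job_arm(job);
 *	fence = dma_fence_get(&job->drm.s_fence->finished);
 *	xe_sched_job_push(job);
 *	xe_migrate_job_unlock(m, q);
 */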
2169 */ 2170 void xe_migrate_job_unlock(struct xe_migrate *m, struct xe_exec_queue *q) 2171 { 2172 bool is_migrate = q == m->q; 2173 2174 if (is_migrate) 2175 mutex_unlock(&m->job_mutex); 2176 else 2177 xe_vm_assert_held(q->vm); /* User queues VM's should be locked */ 2178 } 2179 2180 #if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST) 2181 #include "tests/xe_migrate.c" 2182 #endif 2183