// SPDX-License-Identifier: MIT
/*
 * Copyright © 2020 Intel Corporation
 */

#include "xe_migrate.h"

#include <linux/bitfield.h>
#include <linux/sizes.h>

#include <drm/drm_managed.h>
#include <drm/ttm/ttm_tt.h>
#include <uapi/drm/xe_drm.h>

#include <generated/xe_wa_oob.h>

#include "instructions/xe_gpu_commands.h"
#include "instructions/xe_mi_commands.h"
#include "regs/xe_gtt_defs.h"
#include "tests/xe_test.h"
#include "xe_assert.h"
#include "xe_bb.h"
#include "xe_bo.h"
#include "xe_exec_queue.h"
#include "xe_ggtt.h"
#include "xe_gt.h"
#include "xe_hw_engine.h"
#include "xe_lrc.h"
#include "xe_map.h"
#include "xe_mocs.h"
#include "xe_pt.h"
#include "xe_res_cursor.h"
#include "xe_sa.h"
#include "xe_sched_job.h"
#include "xe_sync.h"
#include "xe_trace_bo.h"
#include "xe_vm.h"
#include "xe_vram.h"

/**
 * struct xe_migrate - migrate context.
 */
struct xe_migrate {
	/** @q: Default exec queue used for migration */
	struct xe_exec_queue *q;
	/** @tile: Backpointer to the tile this struct xe_migrate belongs to. */
	struct xe_tile *tile;
	/** @job_mutex: Timeline mutex for @q. */
	struct mutex job_mutex;
	/** @pt_bo: Page-table buffer object. */
	struct xe_bo *pt_bo;
	/** @batch_base_ofs: VM offset of the migration batch buffer */
	u64 batch_base_ofs;
	/** @usm_batch_base_ofs: VM offset of the usm batch buffer */
	u64 usm_batch_base_ofs;
	/** @cleared_mem_ofs: VM offset of @cleared_bo. */
	u64 cleared_mem_ofs;
	/**
	 * @fence: dma-fence representing the last migration job batch.
	 * Protected by @job_mutex.
	 */
	struct dma_fence *fence;
	/**
	 * @vm_update_sa: For integrated, used to suballocate page-tables
	 * out of the pt_bo.
	 */
	struct drm_suballoc_manager vm_update_sa;
	/** @min_chunk_size: For dgfx, minimum chunk size */
	u64 min_chunk_size;
};

#define MAX_PREEMPTDISABLE_TRANSFER SZ_8M /* Around 1ms. */
#define MAX_CCS_LIMITED_TRANSFER SZ_4M /* XE_PAGE_SIZE * (FIELD_MAX(XE2_CCS_SIZE_MASK) + 1) */
#define NUM_KERNEL_PDE 15
#define NUM_PT_SLOTS 32
#define LEVEL0_PAGE_TABLE_ENCODE_SIZE SZ_2M
#define MAX_NUM_PTE 512
#define IDENTITY_OFFSET 256ULL

/*
 * Although MI_STORE_DATA_IMM's "length" field is 10-bits, 0x3FE is the largest
 * legal value accepted. Since that instruction field is always stored in
 * (val-2) format, this translates to 0x400 dwords for the true maximum length
 * of the instruction. Subtracting the instruction header (1 dword) and
 * address (2 dwords), that leaves 0x3FD dwords (0x1FE qwords) for PTE values.
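 *
 * As a worked example of that arithmetic: programming the length field to
 * 0x3FE gives an instruction of 0x3FE + 2 = 0x400 dwords; removing the
 * 1-dword header and 2-dword destination address leaves 0x400 - 3 = 0x3FD
 * dwords, and since each PTE is a qword (2 dwords) that rounds down to
 * 0x3FD / 2 = 0x1FE PTEs per instruction.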
 */
#define MAX_PTE_PER_SDI 0x1FEU

static void xe_migrate_fini(void *arg)
{
	struct xe_migrate *m = arg;

	xe_vm_lock(m->q->vm, false);
	xe_bo_unpin(m->pt_bo);
	xe_vm_unlock(m->q->vm);

	dma_fence_put(m->fence);
	xe_bo_put(m->pt_bo);
	drm_suballoc_manager_fini(&m->vm_update_sa);
	mutex_destroy(&m->job_mutex);
	xe_vm_close_and_put(m->q->vm);
	xe_exec_queue_put(m->q);
}

static u64 xe_migrate_vm_addr(u64 slot, u32 level)
{
	XE_WARN_ON(slot >= NUM_PT_SLOTS);

	/* First slot is reserved for mapping of PT bo and bb, start from 1 */
	return (slot + 1ULL) << xe_pt_shift(level + 1);
}

static u64 xe_migrate_vram_ofs(struct xe_device *xe, u64 addr, bool is_comp_pte)
{
	/*
	 * Remove the DPA base to get a correct offset into the identity table
	 * for the migrate offset
	 */
	u64 identity_offset = IDENTITY_OFFSET;

	if (GRAPHICS_VER(xe) >= 20 && is_comp_pte)
		identity_offset += DIV_ROUND_UP_ULL(xe_vram_region_actual_physical_size
						    (xe->mem.vram), SZ_1G);

	addr -= xe_vram_region_dpa_base(xe->mem.vram);
	return addr + (identity_offset << xe_pt_shift(2));
}

static void xe_migrate_program_identity(struct xe_device *xe, struct xe_vm *vm, struct xe_bo *bo,
					u64 map_ofs, u64 vram_offset, u16 pat_index, u64 pt_2m_ofs)
{
	struct xe_vram_region *vram = xe->mem.vram;
	resource_size_t dpa_base = xe_vram_region_dpa_base(vram);
	u64 pos, ofs, flags;
	u64 entry;
	/* XXX: Unclear if this should be usable_size? */
	u64 vram_limit = xe_vram_region_actual_physical_size(vram) + dpa_base;
	u32 level = 2;

	ofs = map_ofs + XE_PAGE_SIZE * level + vram_offset * 8;
	flags = vm->pt_ops->pte_encode_addr(xe, 0, pat_index, level,
					    true, 0);

	xe_assert(xe, IS_ALIGNED(xe_vram_region_usable_size(vram), SZ_2M));

	/*
	 * Use 1GB pages when possible; the last chunk always uses 2M
	 * pages as mixing reserved memory (stolen, WOCPM) with a single
	 * mapping is not allowed on certain platforms.
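	 *
	 * As an illustrative example (sizes assumed, not taken from any real
	 * SKU): with a 3.5 GiB region starting at dpa_base, the first three
	 * entries are 1 GiB PTEs, while the final 512 MiB is covered by
	 * 256 2 MiB PTEs written into the page table at pt_2m_ofs, whose PDE
	 * replaces what would have been the last 1 GiB entry.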
150 */ 151 for (pos = dpa_base; pos < vram_limit; 152 pos += SZ_1G, ofs += 8) { 153 if (pos + SZ_1G >= vram_limit) { 154 entry = vm->pt_ops->pde_encode_bo(bo, pt_2m_ofs, 155 pat_index); 156 xe_map_wr(xe, &bo->vmap, ofs, u64, entry); 157 158 flags = vm->pt_ops->pte_encode_addr(xe, 0, 159 pat_index, 160 level - 1, 161 true, 0); 162 163 for (ofs = pt_2m_ofs; pos < vram_limit; 164 pos += SZ_2M, ofs += 8) 165 xe_map_wr(xe, &bo->vmap, ofs, u64, pos | flags); 166 break; /* Ensure pos == vram_limit assert correct */ 167 } 168 169 xe_map_wr(xe, &bo->vmap, ofs, u64, pos | flags); 170 } 171 172 xe_assert(xe, pos == vram_limit); 173 } 174 175 static int xe_migrate_prepare_vm(struct xe_tile *tile, struct xe_migrate *m, 176 struct xe_vm *vm) 177 { 178 struct xe_device *xe = tile_to_xe(tile); 179 u16 pat_index = xe->pat.idx[XE_CACHE_WB]; 180 u8 id = tile->id; 181 u32 num_entries = NUM_PT_SLOTS, num_level = vm->pt_root[id]->level; 182 #define VRAM_IDENTITY_MAP_COUNT 2 183 u32 num_setup = num_level + VRAM_IDENTITY_MAP_COUNT; 184 #undef VRAM_IDENTITY_MAP_COUNT 185 u32 map_ofs, level, i; 186 struct xe_bo *bo, *batch = tile->mem.kernel_bb_pool->bo; 187 u64 entry, pt29_ofs; 188 189 /* Can't bump NUM_PT_SLOTS too high */ 190 BUILD_BUG_ON(NUM_PT_SLOTS > SZ_2M/XE_PAGE_SIZE); 191 /* Must be a multiple of 64K to support all platforms */ 192 BUILD_BUG_ON(NUM_PT_SLOTS * XE_PAGE_SIZE % SZ_64K); 193 /* And one slot reserved for the 4KiB page table updates */ 194 BUILD_BUG_ON(!(NUM_KERNEL_PDE & 1)); 195 196 /* Need to be sure everything fits in the first PT, or create more */ 197 xe_tile_assert(tile, m->batch_base_ofs + xe_bo_size(batch) < SZ_2M); 198 199 bo = xe_bo_create_pin_map(vm->xe, tile, vm, 200 num_entries * XE_PAGE_SIZE, 201 ttm_bo_type_kernel, 202 XE_BO_FLAG_VRAM_IF_DGFX(tile) | 203 XE_BO_FLAG_PAGETABLE); 204 if (IS_ERR(bo)) 205 return PTR_ERR(bo); 206 207 /* PT30 & PT31 reserved for 2M identity map */ 208 pt29_ofs = xe_bo_size(bo) - 3 * XE_PAGE_SIZE; 209 entry = vm->pt_ops->pde_encode_bo(bo, pt29_ofs, pat_index); 210 xe_pt_write(xe, &vm->pt_root[id]->bo->vmap, 0, entry); 211 212 map_ofs = (num_entries - num_setup) * XE_PAGE_SIZE; 213 214 /* Map the entire BO in our level 0 pt */ 215 for (i = 0, level = 0; i < num_entries; level++) { 216 entry = vm->pt_ops->pte_encode_bo(bo, i * XE_PAGE_SIZE, 217 pat_index, 0); 218 219 xe_map_wr(xe, &bo->vmap, map_ofs + level * 8, u64, entry); 220 221 if (vm->flags & XE_VM_FLAG_64K) 222 i += 16; 223 else 224 i += 1; 225 } 226 227 if (!IS_DGFX(xe)) { 228 /* Write out batch too */ 229 m->batch_base_ofs = NUM_PT_SLOTS * XE_PAGE_SIZE; 230 for (i = 0; i < xe_bo_size(batch); 231 i += vm->flags & XE_VM_FLAG_64K ? XE_64K_PAGE_SIZE : 232 XE_PAGE_SIZE) { 233 entry = vm->pt_ops->pte_encode_bo(batch, i, 234 pat_index, 0); 235 236 xe_map_wr(xe, &bo->vmap, map_ofs + level * 8, u64, 237 entry); 238 level++; 239 } 240 if (xe->info.has_usm) { 241 xe_tile_assert(tile, xe_bo_size(batch) == SZ_1M); 242 243 batch = tile->primary_gt->usm.bb_pool->bo; 244 m->usm_batch_base_ofs = m->batch_base_ofs + SZ_1M; 245 xe_tile_assert(tile, xe_bo_size(batch) == SZ_512K); 246 247 for (i = 0; i < xe_bo_size(batch); 248 i += vm->flags & XE_VM_FLAG_64K ? 
							XE_64K_PAGE_SIZE :
							XE_PAGE_SIZE) {
				entry = vm->pt_ops->pte_encode_bo(batch, i,
								  pat_index, 0);

				xe_map_wr(xe, &bo->vmap, map_ofs + level * 8, u64,
					  entry);
				level++;
			}
		}
	} else {
		u64 batch_addr = xe_bo_addr(batch, 0, XE_PAGE_SIZE);

		m->batch_base_ofs = xe_migrate_vram_ofs(xe, batch_addr, false);

		if (xe->info.has_usm) {
			batch = tile->primary_gt->usm.bb_pool->bo;
			batch_addr = xe_bo_addr(batch, 0, XE_PAGE_SIZE);
			m->usm_batch_base_ofs = xe_migrate_vram_ofs(xe, batch_addr, false);
		}
	}

	for (level = 1; level < num_level; level++) {
		u32 flags = 0;

		if (vm->flags & XE_VM_FLAG_64K && level == 1)
			flags = XE_PDE_64K;

		entry = vm->pt_ops->pde_encode_bo(bo, map_ofs + (u64)(level - 1) *
						  XE_PAGE_SIZE, pat_index);
		xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE * level, u64,
			  entry | flags);
	}

	/* Write PDE's that point to our BO. */
	for (i = 0; i < map_ofs / PAGE_SIZE; i++) {
		entry = vm->pt_ops->pde_encode_bo(bo, (u64)i * XE_PAGE_SIZE,
						  pat_index);

		xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE +
			  (i + 1) * 8, u64, entry);
	}

	/* Set up a 1GiB NULL mapping at 255GiB offset. */
	level = 2;
	xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE * level + 255 * 8, u64,
		  vm->pt_ops->pte_encode_addr(xe, 0, pat_index, level, IS_DGFX(xe), 0)
		  | XE_PTE_NULL);
	m->cleared_mem_ofs = (255ULL << xe_pt_shift(level));

	/* Identity map the entire vram at 256GiB offset */
	if (IS_DGFX(xe)) {
		u64 pt30_ofs = xe_bo_size(bo) - 2 * XE_PAGE_SIZE;
		resource_size_t actual_phy_size = xe_vram_region_actual_physical_size(xe->mem.vram);

		xe_migrate_program_identity(xe, vm, bo, map_ofs, IDENTITY_OFFSET,
					    pat_index, pt30_ofs);
		xe_assert(xe, actual_phy_size <= (MAX_NUM_PTE - IDENTITY_OFFSET) * SZ_1G);

		/*
		 * Identity map the entire vram for compressed pat_index for xe2+
		 * if flat ccs is enabled.
		 */
		if (GRAPHICS_VER(xe) >= 20 && xe_device_has_flat_ccs(xe)) {
			u16 comp_pat_index = xe->pat.idx[XE_CACHE_NONE_COMPRESSION];
			u64 vram_offset = IDENTITY_OFFSET +
				DIV_ROUND_UP_ULL(actual_phy_size, SZ_1G);
			u64 pt31_ofs = xe_bo_size(bo) - XE_PAGE_SIZE;

			xe_assert(xe, actual_phy_size <= (MAX_NUM_PTE - IDENTITY_OFFSET -
							  IDENTITY_OFFSET / 2) * SZ_1G);
			xe_migrate_program_identity(xe, vm, bo, map_ofs, vram_offset,
						    comp_pat_index, pt31_ofs);
		}
	}

	/*
	 * Example layout created above, with root level = 3:
	 * [PT0...PT7]: kernel PT's for copy/clear; 64 or 4KiB PTE's
	 * [PT8]: Kernel PT for VM_BIND, 4 KiB PTE's
	 * [PT9...PT26]: Userspace PT's for VM_BIND, 4 KiB PTE's
	 * [PT27 = PDE 0] [PT28 = PDE 1] [PT29 = PDE 2] [PT30 & PT31 = 2M vram identity map]
	 *
	 * This makes the lowest part of the VM point to the pagetables.
	 * Hence the lowest 2M in the VM should point to itself, and with a few
	 * writes and flushes, other parts of the VM can be used for either
	 * copying or clearing.
	 *
	 * For performance, the kernel reserves PDE's, so about 20 are left
	 * for async VM updates.
	 *
	 * To make this easier to work with, each scratch PT is put in slot
	 * (1 + PT #) everywhere; this allows lockless updates to scratch pages
	 * by using the different addresses in the VM.
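	 *
	 * For example (following xe_migrate_vm_addr() above), scratch PT
	 * slot 5 becomes visible at VM address (5 + 1) << xe_pt_shift(1),
	 * i.e. 6 * SZ_2M, since each level-0 page table spans 2MiB of
	 * address space.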
342 */ 343 #define NUM_VMUSA_UNIT_PER_PAGE 32 344 #define VM_SA_UPDATE_UNIT_SIZE (XE_PAGE_SIZE / NUM_VMUSA_UNIT_PER_PAGE) 345 #define NUM_VMUSA_WRITES_PER_UNIT (VM_SA_UPDATE_UNIT_SIZE / sizeof(u64)) 346 drm_suballoc_manager_init(&m->vm_update_sa, 347 (size_t)(map_ofs / XE_PAGE_SIZE - NUM_KERNEL_PDE) * 348 NUM_VMUSA_UNIT_PER_PAGE, 0); 349 350 m->pt_bo = bo; 351 return 0; 352 } 353 354 /* 355 * Including the reserved copy engine is required to avoid deadlocks due to 356 * migrate jobs servicing the faults gets stuck behind the job that faulted. 357 */ 358 static u32 xe_migrate_usm_logical_mask(struct xe_gt *gt) 359 { 360 u32 logical_mask = 0; 361 struct xe_hw_engine *hwe; 362 enum xe_hw_engine_id id; 363 364 for_each_hw_engine(hwe, gt, id) { 365 if (hwe->class != XE_ENGINE_CLASS_COPY) 366 continue; 367 368 if (xe_gt_is_usm_hwe(gt, hwe)) 369 logical_mask |= BIT(hwe->logical_instance); 370 } 371 372 return logical_mask; 373 } 374 375 static bool xe_migrate_needs_ccs_emit(struct xe_device *xe) 376 { 377 return xe_device_has_flat_ccs(xe) && !(GRAPHICS_VER(xe) >= 20 && IS_DGFX(xe)); 378 } 379 380 /** 381 * xe_migrate_alloc - Allocate a migrate struct for a given &xe_tile 382 * @tile: &xe_tile 383 * 384 * Allocates a &xe_migrate for a given tile. 385 * 386 * Return: &xe_migrate on success, or NULL when out of memory. 387 */ 388 struct xe_migrate *xe_migrate_alloc(struct xe_tile *tile) 389 { 390 struct xe_migrate *m = drmm_kzalloc(&tile_to_xe(tile)->drm, sizeof(*m), GFP_KERNEL); 391 392 if (m) 393 m->tile = tile; 394 return m; 395 } 396 397 /** 398 * xe_migrate_init() - Initialize a migrate context 399 * @tile: Back-pointer to the tile we're initializing for. 400 * 401 * Return: Pointer to a migrate context on success. Error pointer on error. 402 */ 403 struct xe_migrate *xe_migrate_init(struct xe_tile *tile) 404 { 405 struct xe_device *xe = tile_to_xe(tile); 406 struct xe_gt *primary_gt = tile->primary_gt; 407 struct xe_migrate *m = tile->migrate; 408 struct xe_vm *vm; 409 int err; 410 411 /* Special layout, prepared below.. */ 412 vm = xe_vm_create(xe, XE_VM_FLAG_MIGRATION | 413 XE_VM_FLAG_SET_TILE_ID(tile)); 414 if (IS_ERR(vm)) 415 return ERR_CAST(vm); 416 417 xe_vm_lock(vm, false); 418 err = xe_migrate_prepare_vm(tile, m, vm); 419 xe_vm_unlock(vm); 420 if (err) { 421 xe_vm_close_and_put(vm); 422 return ERR_PTR(err); 423 } 424 425 if (xe->info.has_usm) { 426 struct xe_hw_engine *hwe = xe_gt_hw_engine(primary_gt, 427 XE_ENGINE_CLASS_COPY, 428 primary_gt->usm.reserved_bcs_instance, 429 false); 430 u32 logical_mask = xe_migrate_usm_logical_mask(primary_gt); 431 432 if (!hwe || !logical_mask) 433 return ERR_PTR(-EINVAL); 434 435 /* 436 * XXX: Currently only reserving 1 (likely slow) BCS instance on 437 * PVC, may want to revisit if performance is needed. 
438 */ 439 m->q = xe_exec_queue_create(xe, vm, logical_mask, 1, hwe, 440 EXEC_QUEUE_FLAG_KERNEL | 441 EXEC_QUEUE_FLAG_PERMANENT | 442 EXEC_QUEUE_FLAG_HIGH_PRIORITY | 443 EXEC_QUEUE_FLAG_MIGRATE, 0); 444 } else { 445 m->q = xe_exec_queue_create_class(xe, primary_gt, vm, 446 XE_ENGINE_CLASS_COPY, 447 EXEC_QUEUE_FLAG_KERNEL | 448 EXEC_QUEUE_FLAG_PERMANENT | 449 EXEC_QUEUE_FLAG_MIGRATE, 0); 450 } 451 if (IS_ERR(m->q)) { 452 xe_vm_close_and_put(vm); 453 return ERR_CAST(m->q); 454 } 455 456 mutex_init(&m->job_mutex); 457 fs_reclaim_acquire(GFP_KERNEL); 458 might_lock(&m->job_mutex); 459 fs_reclaim_release(GFP_KERNEL); 460 461 err = devm_add_action_or_reset(xe->drm.dev, xe_migrate_fini, m); 462 if (err) 463 return ERR_PTR(err); 464 465 if (IS_DGFX(xe)) { 466 if (xe_migrate_needs_ccs_emit(xe)) 467 /* min chunk size corresponds to 4K of CCS Metadata */ 468 m->min_chunk_size = SZ_4K * SZ_64K / 469 xe_device_ccs_bytes(xe, SZ_64K); 470 else 471 /* Somewhat arbitrary to avoid a huge amount of blits */ 472 m->min_chunk_size = SZ_64K; 473 m->min_chunk_size = roundup_pow_of_two(m->min_chunk_size); 474 drm_dbg(&xe->drm, "Migrate min chunk size is 0x%08llx\n", 475 (unsigned long long)m->min_chunk_size); 476 } 477 478 return m; 479 } 480 481 static u64 max_mem_transfer_per_pass(struct xe_device *xe) 482 { 483 if (!IS_DGFX(xe) && xe_device_has_flat_ccs(xe)) 484 return MAX_CCS_LIMITED_TRANSFER; 485 486 return MAX_PREEMPTDISABLE_TRANSFER; 487 } 488 489 static u64 xe_migrate_res_sizes(struct xe_migrate *m, struct xe_res_cursor *cur) 490 { 491 struct xe_device *xe = tile_to_xe(m->tile); 492 u64 size = min_t(u64, max_mem_transfer_per_pass(xe), cur->remaining); 493 494 if (mem_type_is_vram(cur->mem_type)) { 495 /* 496 * VRAM we want to blit in chunks with sizes aligned to 497 * min_chunk_size in order for the offset to CCS metadata to be 498 * page-aligned. If it's the last chunk it may be smaller. 499 * 500 * Another constraint is that we need to limit the blit to 501 * the VRAM block size, unless size is smaller than 502 * min_chunk_size. 503 */ 504 u64 chunk = max_t(u64, cur->size, m->min_chunk_size); 505 506 size = min_t(u64, size, chunk); 507 if (size > m->min_chunk_size) 508 size = round_down(size, m->min_chunk_size); 509 } 510 511 return size; 512 } 513 514 static bool xe_migrate_allow_identity(u64 size, const struct xe_res_cursor *cur) 515 { 516 /* If the chunk is not fragmented, allow identity map. */ 517 return cur->size >= size; 518 } 519 520 #define PTE_UPDATE_FLAG_IS_VRAM BIT(0) 521 #define PTE_UPDATE_FLAG_IS_COMP_PTE BIT(1) 522 523 static u32 pte_update_size(struct xe_migrate *m, 524 u32 flags, 525 struct ttm_resource *res, 526 struct xe_res_cursor *cur, 527 u64 *L0, u64 *L0_ofs, u32 *L0_pt, 528 u32 cmd_size, u32 pt_ofs, u32 avail_pts) 529 { 530 u32 cmds = 0; 531 bool is_vram = PTE_UPDATE_FLAG_IS_VRAM & flags; 532 bool is_comp_pte = PTE_UPDATE_FLAG_IS_COMP_PTE & flags; 533 534 *L0_pt = pt_ofs; 535 if (is_vram && xe_migrate_allow_identity(*L0, cur)) { 536 /* Offset into identity map. 
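		 * xe_migrate_vram_ofs() turns the device physical address
		 * into an offset into the identity map programmed at
		 * IDENTITY_OFFSET GiB (or just above it for the compressed
		 * PAT view), so no PTE emission is needed for this chunk.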
*/ 537 *L0_ofs = xe_migrate_vram_ofs(tile_to_xe(m->tile), 538 cur->start + vram_region_gpu_offset(res), 539 is_comp_pte); 540 cmds += cmd_size; 541 } else { 542 /* Clip L0 to available size */ 543 u64 size = min(*L0, (u64)avail_pts * SZ_2M); 544 u32 num_4k_pages = (size + XE_PAGE_SIZE - 1) >> XE_PTE_SHIFT; 545 546 *L0 = size; 547 *L0_ofs = xe_migrate_vm_addr(pt_ofs, 0); 548 549 /* MI_STORE_DATA_IMM */ 550 cmds += 3 * DIV_ROUND_UP(num_4k_pages, MAX_PTE_PER_SDI); 551 552 /* PDE qwords */ 553 cmds += num_4k_pages * 2; 554 555 /* Each chunk has a single blit command */ 556 cmds += cmd_size; 557 } 558 559 return cmds; 560 } 561 562 static void emit_pte(struct xe_migrate *m, 563 struct xe_bb *bb, u32 at_pt, 564 bool is_vram, bool is_comp_pte, 565 struct xe_res_cursor *cur, 566 u32 size, struct ttm_resource *res) 567 { 568 struct xe_device *xe = tile_to_xe(m->tile); 569 struct xe_vm *vm = m->q->vm; 570 u16 pat_index; 571 u32 ptes; 572 u64 ofs = (u64)at_pt * XE_PAGE_SIZE; 573 u64 cur_ofs; 574 575 /* Indirect access needs compression enabled uncached PAT index */ 576 if (GRAPHICS_VERx100(xe) >= 2000) 577 pat_index = is_comp_pte ? xe->pat.idx[XE_CACHE_NONE_COMPRESSION] : 578 xe->pat.idx[XE_CACHE_WB]; 579 else 580 pat_index = xe->pat.idx[XE_CACHE_WB]; 581 582 ptes = DIV_ROUND_UP(size, XE_PAGE_SIZE); 583 584 while (ptes) { 585 u32 chunk = min(MAX_PTE_PER_SDI, ptes); 586 587 bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk); 588 bb->cs[bb->len++] = ofs; 589 bb->cs[bb->len++] = 0; 590 591 cur_ofs = ofs; 592 ofs += chunk * 8; 593 ptes -= chunk; 594 595 while (chunk--) { 596 u64 addr, flags = 0; 597 bool devmem = false; 598 599 addr = xe_res_dma(cur) & PAGE_MASK; 600 if (is_vram) { 601 if (vm->flags & XE_VM_FLAG_64K) { 602 u64 va = cur_ofs * XE_PAGE_SIZE / 8; 603 604 xe_assert(xe, (va & (SZ_64K - 1)) == 605 (addr & (SZ_64K - 1))); 606 607 flags |= XE_PTE_PS64; 608 } 609 610 addr += vram_region_gpu_offset(res); 611 devmem = true; 612 } 613 614 addr = vm->pt_ops->pte_encode_addr(m->tile->xe, 615 addr, pat_index, 616 0, devmem, flags); 617 bb->cs[bb->len++] = lower_32_bits(addr); 618 bb->cs[bb->len++] = upper_32_bits(addr); 619 620 xe_res_next(cur, min_t(u32, size, PAGE_SIZE)); 621 cur_ofs += 8; 622 } 623 } 624 } 625 626 #define EMIT_COPY_CCS_DW 5 627 static void emit_copy_ccs(struct xe_gt *gt, struct xe_bb *bb, 628 u64 dst_ofs, bool dst_is_indirect, 629 u64 src_ofs, bool src_is_indirect, 630 u32 size) 631 { 632 struct xe_device *xe = gt_to_xe(gt); 633 u32 *cs = bb->cs + bb->len; 634 u32 num_ccs_blks; 635 u32 num_pages; 636 u32 ccs_copy_size; 637 u32 mocs; 638 639 if (GRAPHICS_VERx100(xe) >= 2000) { 640 num_pages = DIV_ROUND_UP(size, XE_PAGE_SIZE); 641 xe_gt_assert(gt, FIELD_FIT(XE2_CCS_SIZE_MASK, num_pages - 1)); 642 643 ccs_copy_size = REG_FIELD_PREP(XE2_CCS_SIZE_MASK, num_pages - 1); 644 mocs = FIELD_PREP(XE2_XY_CTRL_SURF_MOCS_INDEX_MASK, gt->mocs.uc_index); 645 646 } else { 647 num_ccs_blks = DIV_ROUND_UP(xe_device_ccs_bytes(gt_to_xe(gt), size), 648 NUM_CCS_BYTES_PER_BLOCK); 649 xe_gt_assert(gt, FIELD_FIT(CCS_SIZE_MASK, num_ccs_blks - 1)); 650 651 ccs_copy_size = REG_FIELD_PREP(CCS_SIZE_MASK, num_ccs_blks - 1); 652 mocs = FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, gt->mocs.uc_index); 653 } 654 655 *cs++ = XY_CTRL_SURF_COPY_BLT | 656 (src_is_indirect ? 0x0 : 0x1) << SRC_ACCESS_TYPE_SHIFT | 657 (dst_is_indirect ? 
0x0 : 0x1) << DST_ACCESS_TYPE_SHIFT | 658 ccs_copy_size; 659 *cs++ = lower_32_bits(src_ofs); 660 *cs++ = upper_32_bits(src_ofs) | mocs; 661 *cs++ = lower_32_bits(dst_ofs); 662 *cs++ = upper_32_bits(dst_ofs) | mocs; 663 664 bb->len = cs - bb->cs; 665 } 666 667 #define EMIT_COPY_DW 10 668 static void emit_copy(struct xe_gt *gt, struct xe_bb *bb, 669 u64 src_ofs, u64 dst_ofs, unsigned int size, 670 unsigned int pitch) 671 { 672 struct xe_device *xe = gt_to_xe(gt); 673 u32 mocs = 0; 674 u32 tile_y = 0; 675 676 xe_gt_assert(gt, !(pitch & 3)); 677 xe_gt_assert(gt, size / pitch <= S16_MAX); 678 xe_gt_assert(gt, pitch / 4 <= S16_MAX); 679 xe_gt_assert(gt, pitch <= U16_MAX); 680 681 if (GRAPHICS_VER(xe) >= 20) 682 mocs = FIELD_PREP(XE2_XY_FAST_COPY_BLT_MOCS_INDEX_MASK, gt->mocs.uc_index); 683 684 if (GRAPHICS_VERx100(xe) >= 1250) 685 tile_y = XY_FAST_COPY_BLT_D1_SRC_TILE4 | XY_FAST_COPY_BLT_D1_DST_TILE4; 686 687 bb->cs[bb->len++] = XY_FAST_COPY_BLT_CMD | (10 - 2); 688 bb->cs[bb->len++] = XY_FAST_COPY_BLT_DEPTH_32 | pitch | tile_y | mocs; 689 bb->cs[bb->len++] = 0; 690 bb->cs[bb->len++] = (size / pitch) << 16 | pitch / 4; 691 bb->cs[bb->len++] = lower_32_bits(dst_ofs); 692 bb->cs[bb->len++] = upper_32_bits(dst_ofs); 693 bb->cs[bb->len++] = 0; 694 bb->cs[bb->len++] = pitch | mocs; 695 bb->cs[bb->len++] = lower_32_bits(src_ofs); 696 bb->cs[bb->len++] = upper_32_bits(src_ofs); 697 } 698 699 static u64 xe_migrate_batch_base(struct xe_migrate *m, bool usm) 700 { 701 return usm ? m->usm_batch_base_ofs : m->batch_base_ofs; 702 } 703 704 static u32 xe_migrate_ccs_copy(struct xe_migrate *m, 705 struct xe_bb *bb, 706 u64 src_ofs, bool src_is_indirect, 707 u64 dst_ofs, bool dst_is_indirect, u32 dst_size, 708 u64 ccs_ofs, bool copy_ccs) 709 { 710 struct xe_gt *gt = m->tile->primary_gt; 711 u32 flush_flags = 0; 712 713 if (!copy_ccs && dst_is_indirect) { 714 /* 715 * If the src is already in vram, then it should already 716 * have been cleared by us, or has been populated by the 717 * user. Make sure we copy the CCS aux state as-is. 718 * 719 * Otherwise if the bo doesn't have any CCS metadata attached, 720 * we still need to clear it for security reasons. 721 */ 722 u64 ccs_src_ofs = src_is_indirect ? src_ofs : m->cleared_mem_ofs; 723 724 emit_copy_ccs(gt, bb, 725 dst_ofs, true, 726 ccs_src_ofs, src_is_indirect, dst_size); 727 728 flush_flags = MI_FLUSH_DW_CCS; 729 } else if (copy_ccs) { 730 if (!src_is_indirect) 731 src_ofs = ccs_ofs; 732 else if (!dst_is_indirect) 733 dst_ofs = ccs_ofs; 734 735 xe_gt_assert(gt, src_is_indirect || dst_is_indirect); 736 737 emit_copy_ccs(gt, bb, dst_ofs, dst_is_indirect, src_ofs, 738 src_is_indirect, dst_size); 739 if (dst_is_indirect) 740 flush_flags = MI_FLUSH_DW_CCS; 741 } 742 743 return flush_flags; 744 } 745 746 /** 747 * xe_migrate_copy() - Copy content of TTM resources. 748 * @m: The migration context. 749 * @src_bo: The buffer object @src is currently bound to. 750 * @dst_bo: If copying between resources created for the same bo, set this to 751 * the same value as @src_bo. If copying between buffer objects, set it to 752 * the buffer object @dst is currently bound to. 753 * @src: The source TTM resource. 754 * @dst: The dst TTM resource. 755 * @copy_only_ccs: If true copy only CCS metadata 756 * 757 * Copies the contents of @src to @dst: On flat CCS devices, 758 * the CCS metadata is copied as well if needed, or if not present, 759 * the CCS metadata of @dst is cleared for security reasons. 
760 * 761 * Return: Pointer to a dma_fence representing the last copy batch, or 762 * an error pointer on failure. If there is a failure, any copy operation 763 * started by the function call has been synced. 764 */ 765 struct dma_fence *xe_migrate_copy(struct xe_migrate *m, 766 struct xe_bo *src_bo, 767 struct xe_bo *dst_bo, 768 struct ttm_resource *src, 769 struct ttm_resource *dst, 770 bool copy_only_ccs) 771 { 772 struct xe_gt *gt = m->tile->primary_gt; 773 struct xe_device *xe = gt_to_xe(gt); 774 struct dma_fence *fence = NULL; 775 u64 size = xe_bo_size(src_bo); 776 struct xe_res_cursor src_it, dst_it, ccs_it; 777 u64 src_L0_ofs, dst_L0_ofs; 778 u32 src_L0_pt, dst_L0_pt; 779 u64 src_L0, dst_L0; 780 int pass = 0; 781 int err; 782 bool src_is_pltt = src->mem_type == XE_PL_TT; 783 bool dst_is_pltt = dst->mem_type == XE_PL_TT; 784 bool src_is_vram = mem_type_is_vram(src->mem_type); 785 bool dst_is_vram = mem_type_is_vram(dst->mem_type); 786 bool type_device = src_bo->ttm.type == ttm_bo_type_device; 787 bool needs_ccs_emit = type_device && xe_migrate_needs_ccs_emit(xe); 788 bool copy_ccs = xe_device_has_flat_ccs(xe) && 789 xe_bo_needs_ccs_pages(src_bo) && xe_bo_needs_ccs_pages(dst_bo); 790 bool copy_system_ccs = copy_ccs && (!src_is_vram || !dst_is_vram); 791 bool use_comp_pat = type_device && xe_device_has_flat_ccs(xe) && 792 GRAPHICS_VER(xe) >= 20 && src_is_vram && !dst_is_vram; 793 794 /* Copying CCS between two different BOs is not supported yet. */ 795 if (XE_WARN_ON(copy_ccs && src_bo != dst_bo)) 796 return ERR_PTR(-EINVAL); 797 798 if (src_bo != dst_bo && XE_WARN_ON(xe_bo_size(src_bo) != xe_bo_size(dst_bo))) 799 return ERR_PTR(-EINVAL); 800 801 if (!src_is_vram) 802 xe_res_first_sg(xe_bo_sg(src_bo), 0, size, &src_it); 803 else 804 xe_res_first(src, 0, size, &src_it); 805 if (!dst_is_vram) 806 xe_res_first_sg(xe_bo_sg(dst_bo), 0, size, &dst_it); 807 else 808 xe_res_first(dst, 0, size, &dst_it); 809 810 if (copy_system_ccs) 811 xe_res_first_sg(xe_bo_sg(src_bo), xe_bo_ccs_pages_start(src_bo), 812 PAGE_ALIGN(xe_device_ccs_bytes(xe, size)), 813 &ccs_it); 814 815 while (size) { 816 u32 batch_size = 2; /* arb_clear() + MI_BATCH_BUFFER_END */ 817 struct xe_sched_job *job; 818 struct xe_bb *bb; 819 u32 flush_flags = 0; 820 u32 update_idx; 821 u64 ccs_ofs, ccs_size; 822 u32 ccs_pt; 823 u32 pte_flags; 824 825 bool usm = xe->info.has_usm; 826 u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE; 827 828 src_L0 = xe_migrate_res_sizes(m, &src_it); 829 dst_L0 = xe_migrate_res_sizes(m, &dst_it); 830 831 drm_dbg(&xe->drm, "Pass %u, sizes: %llu & %llu\n", 832 pass++, src_L0, dst_L0); 833 834 src_L0 = min(src_L0, dst_L0); 835 836 pte_flags = src_is_vram ? PTE_UPDATE_FLAG_IS_VRAM : 0; 837 pte_flags |= use_comp_pat ? PTE_UPDATE_FLAG_IS_COMP_PTE : 0; 838 batch_size += pte_update_size(m, pte_flags, src, &src_it, &src_L0, 839 &src_L0_ofs, &src_L0_pt, 0, 0, 840 avail_pts); 841 842 pte_flags = dst_is_vram ? PTE_UPDATE_FLAG_IS_VRAM : 0; 843 batch_size += pte_update_size(m, pte_flags, dst, &dst_it, &src_L0, 844 &dst_L0_ofs, &dst_L0_pt, 0, 845 avail_pts, avail_pts); 846 847 if (copy_system_ccs) { 848 xe_assert(xe, type_device); 849 ccs_size = xe_device_ccs_bytes(xe, src_L0); 850 batch_size += pte_update_size(m, 0, NULL, &ccs_it, &ccs_size, 851 &ccs_ofs, &ccs_pt, 0, 852 2 * avail_pts, 853 avail_pts); 854 xe_assert(xe, IS_ALIGNED(ccs_it.start, PAGE_SIZE)); 855 } 856 857 /* Add copy commands size here */ 858 batch_size += ((copy_only_ccs) ? 0 : EMIT_COPY_DW) + 859 ((needs_ccs_emit ? 
EMIT_COPY_CCS_DW : 0)); 860 861 bb = xe_bb_new(gt, batch_size, usm); 862 if (IS_ERR(bb)) { 863 err = PTR_ERR(bb); 864 goto err_sync; 865 } 866 867 if (src_is_vram && xe_migrate_allow_identity(src_L0, &src_it)) 868 xe_res_next(&src_it, src_L0); 869 else 870 emit_pte(m, bb, src_L0_pt, src_is_vram, copy_system_ccs || use_comp_pat, 871 &src_it, src_L0, src); 872 873 if (dst_is_vram && xe_migrate_allow_identity(src_L0, &dst_it)) 874 xe_res_next(&dst_it, src_L0); 875 else 876 emit_pte(m, bb, dst_L0_pt, dst_is_vram, copy_system_ccs, 877 &dst_it, src_L0, dst); 878 879 if (copy_system_ccs) 880 emit_pte(m, bb, ccs_pt, false, false, &ccs_it, ccs_size, src); 881 882 bb->cs[bb->len++] = MI_BATCH_BUFFER_END; 883 update_idx = bb->len; 884 885 if (!copy_only_ccs) 886 emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, src_L0, XE_PAGE_SIZE); 887 888 if (needs_ccs_emit) 889 flush_flags = xe_migrate_ccs_copy(m, bb, src_L0_ofs, 890 IS_DGFX(xe) ? src_is_vram : src_is_pltt, 891 dst_L0_ofs, 892 IS_DGFX(xe) ? dst_is_vram : dst_is_pltt, 893 src_L0, ccs_ofs, copy_ccs); 894 895 job = xe_bb_create_migration_job(m->q, bb, 896 xe_migrate_batch_base(m, usm), 897 update_idx); 898 if (IS_ERR(job)) { 899 err = PTR_ERR(job); 900 goto err; 901 } 902 903 xe_sched_job_add_migrate_flush(job, flush_flags); 904 if (!fence) { 905 err = xe_sched_job_add_deps(job, src_bo->ttm.base.resv, 906 DMA_RESV_USAGE_BOOKKEEP); 907 if (!err && src_bo != dst_bo) 908 err = xe_sched_job_add_deps(job, dst_bo->ttm.base.resv, 909 DMA_RESV_USAGE_BOOKKEEP); 910 if (err) 911 goto err_job; 912 } 913 914 mutex_lock(&m->job_mutex); 915 xe_sched_job_arm(job); 916 dma_fence_put(fence); 917 fence = dma_fence_get(&job->drm.s_fence->finished); 918 xe_sched_job_push(job); 919 920 dma_fence_put(m->fence); 921 m->fence = dma_fence_get(fence); 922 923 mutex_unlock(&m->job_mutex); 924 925 xe_bb_free(bb, fence); 926 size -= src_L0; 927 continue; 928 929 err_job: 930 xe_sched_job_put(job); 931 err: 932 xe_bb_free(bb, NULL); 933 934 err_sync: 935 /* Sync partial copy if any. FIXME: under job_mutex? */ 936 if (fence) { 937 dma_fence_wait(fence, false); 938 dma_fence_put(fence); 939 } 940 941 return ERR_PTR(err); 942 } 943 944 return fence; 945 } 946 947 /** 948 * xe_get_migrate_lrc() - Get the LRC from migrate context. 949 * @migrate: Migrate context. 950 * 951 * Return: Pointer to LRC on success, error on failure 952 */ 953 struct xe_lrc *xe_migrate_lrc(struct xe_migrate *migrate) 954 { 955 return migrate->q->lrc[0]; 956 } 957 958 static int emit_flush_invalidate(struct xe_migrate *m, u32 *dw, int i, 959 u32 flags) 960 { 961 dw[i++] = MI_FLUSH_DW | MI_INVALIDATE_TLB | MI_FLUSH_DW_OP_STOREDW | 962 MI_FLUSH_IMM_DW | flags; 963 dw[i++] = lower_32_bits(xe_lrc_start_seqno_ggtt_addr(xe_migrate_lrc(m))) | 964 MI_FLUSH_DW_USE_GTT; 965 dw[i++] = upper_32_bits(xe_lrc_start_seqno_ggtt_addr(xe_migrate_lrc(m))); 966 dw[i++] = MI_NOOP; 967 dw[i++] = MI_NOOP; 968 969 return i; 970 } 971 972 /** 973 * xe_migrate_ccs_rw_copy() - Copy content of TTM resources. 974 * @m: The migration context. 975 * @src_bo: The buffer object @src is currently bound to. 976 * @read_write : Creates BB commands for CCS read/write. 977 * 978 * Creates batch buffer instructions to copy CCS metadata from CCS pool to 979 * memory and vice versa. 980 * 981 * This function should only be called for IGPU. 982 * 983 * Return: 0 if successful, negative error code on failure. 
984 */ 985 int xe_migrate_ccs_rw_copy(struct xe_migrate *m, 986 struct xe_bo *src_bo, 987 enum xe_sriov_vf_ccs_rw_ctxs read_write) 988 989 { 990 bool src_is_pltt = read_write == XE_SRIOV_VF_CCS_READ_CTX; 991 bool dst_is_pltt = read_write == XE_SRIOV_VF_CCS_WRITE_CTX; 992 struct ttm_resource *src = src_bo->ttm.resource; 993 struct xe_gt *gt = m->tile->primary_gt; 994 u32 batch_size, batch_size_allocated; 995 struct xe_device *xe = gt_to_xe(gt); 996 struct xe_res_cursor src_it, ccs_it; 997 u64 size = xe_bo_size(src_bo); 998 struct xe_bb *bb = NULL; 999 u64 src_L0, src_L0_ofs; 1000 u32 src_L0_pt; 1001 int err; 1002 1003 xe_res_first_sg(xe_bo_sg(src_bo), 0, size, &src_it); 1004 1005 xe_res_first_sg(xe_bo_sg(src_bo), xe_bo_ccs_pages_start(src_bo), 1006 PAGE_ALIGN(xe_device_ccs_bytes(xe, size)), 1007 &ccs_it); 1008 1009 /* Calculate Batch buffer size */ 1010 batch_size = 0; 1011 while (size) { 1012 batch_size += 10; /* Flush + ggtt addr + 2 NOP */ 1013 u64 ccs_ofs, ccs_size; 1014 u32 ccs_pt; 1015 1016 u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE; 1017 1018 src_L0 = min_t(u64, max_mem_transfer_per_pass(xe), size); 1019 1020 batch_size += pte_update_size(m, false, src, &src_it, &src_L0, 1021 &src_L0_ofs, &src_L0_pt, 0, 0, 1022 avail_pts); 1023 1024 ccs_size = xe_device_ccs_bytes(xe, src_L0); 1025 batch_size += pte_update_size(m, 0, NULL, &ccs_it, &ccs_size, &ccs_ofs, 1026 &ccs_pt, 0, avail_pts, avail_pts); 1027 xe_assert(xe, IS_ALIGNED(ccs_it.start, PAGE_SIZE)); 1028 1029 /* Add copy commands size here */ 1030 batch_size += EMIT_COPY_CCS_DW; 1031 1032 size -= src_L0; 1033 } 1034 1035 bb = xe_bb_ccs_new(gt, batch_size, read_write); 1036 if (IS_ERR(bb)) { 1037 drm_err(&xe->drm, "BB allocation failed.\n"); 1038 err = PTR_ERR(bb); 1039 goto err_ret; 1040 } 1041 1042 batch_size_allocated = batch_size; 1043 size = xe_bo_size(src_bo); 1044 batch_size = 0; 1045 1046 /* 1047 * Emit PTE and copy commands here. 1048 * The CCS copy command can only support limited size. If the size to be 1049 * copied is more than the limit, divide copy into chunks. So, calculate 1050 * sizes here again before copy command is emitted. 
1051 */ 1052 while (size) { 1053 batch_size += 10; /* Flush + ggtt addr + 2 NOP */ 1054 u32 flush_flags = 0; 1055 u64 ccs_ofs, ccs_size; 1056 u32 ccs_pt; 1057 1058 u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE; 1059 1060 src_L0 = xe_migrate_res_sizes(m, &src_it); 1061 1062 batch_size += pte_update_size(m, false, src, &src_it, &src_L0, 1063 &src_L0_ofs, &src_L0_pt, 0, 0, 1064 avail_pts); 1065 1066 ccs_size = xe_device_ccs_bytes(xe, src_L0); 1067 batch_size += pte_update_size(m, 0, NULL, &ccs_it, &ccs_size, &ccs_ofs, 1068 &ccs_pt, 0, avail_pts, avail_pts); 1069 xe_assert(xe, IS_ALIGNED(ccs_it.start, PAGE_SIZE)); 1070 batch_size += EMIT_COPY_CCS_DW; 1071 1072 emit_pte(m, bb, src_L0_pt, false, true, &src_it, src_L0, src); 1073 1074 emit_pte(m, bb, ccs_pt, false, false, &ccs_it, ccs_size, src); 1075 1076 bb->len = emit_flush_invalidate(m, bb->cs, bb->len, flush_flags); 1077 flush_flags = xe_migrate_ccs_copy(m, bb, src_L0_ofs, src_is_pltt, 1078 src_L0_ofs, dst_is_pltt, 1079 src_L0, ccs_ofs, true); 1080 bb->len = emit_flush_invalidate(m, bb->cs, bb->len, flush_flags); 1081 1082 size -= src_L0; 1083 } 1084 1085 xe_assert(xe, (batch_size_allocated == bb->len)); 1086 src_bo->bb_ccs[read_write] = bb; 1087 1088 return 0; 1089 1090 err_ret: 1091 return err; 1092 } 1093 1094 /** 1095 * xe_get_migrate_exec_queue() - Get the execution queue from migrate context. 1096 * @migrate: Migrate context. 1097 * 1098 * Return: Pointer to execution queue on success, error on failure 1099 */ 1100 struct xe_exec_queue *xe_migrate_exec_queue(struct xe_migrate *migrate) 1101 { 1102 return migrate->q; 1103 } 1104 1105 static void emit_clear_link_copy(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs, 1106 u32 size, u32 pitch) 1107 { 1108 struct xe_device *xe = gt_to_xe(gt); 1109 u32 *cs = bb->cs + bb->len; 1110 u32 len = PVC_MEM_SET_CMD_LEN_DW; 1111 1112 *cs++ = PVC_MEM_SET_CMD | PVC_MEM_SET_MATRIX | (len - 2); 1113 *cs++ = pitch - 1; 1114 *cs++ = (size / pitch) - 1; 1115 *cs++ = pitch - 1; 1116 *cs++ = lower_32_bits(src_ofs); 1117 *cs++ = upper_32_bits(src_ofs); 1118 if (GRAPHICS_VERx100(xe) >= 2000) 1119 *cs++ = FIELD_PREP(XE2_MEM_SET_MOCS_INDEX_MASK, gt->mocs.uc_index); 1120 else 1121 *cs++ = FIELD_PREP(PVC_MEM_SET_MOCS_INDEX_MASK, gt->mocs.uc_index); 1122 1123 xe_gt_assert(gt, cs - bb->cs == len + bb->len); 1124 1125 bb->len += len; 1126 } 1127 1128 static void emit_clear_main_copy(struct xe_gt *gt, struct xe_bb *bb, 1129 u64 src_ofs, u32 size, u32 pitch, bool is_vram) 1130 { 1131 struct xe_device *xe = gt_to_xe(gt); 1132 u32 *cs = bb->cs + bb->len; 1133 u32 len = XY_FAST_COLOR_BLT_DW; 1134 1135 if (GRAPHICS_VERx100(xe) < 1250) 1136 len = 11; 1137 1138 *cs++ = XY_FAST_COLOR_BLT_CMD | XY_FAST_COLOR_BLT_DEPTH_32 | 1139 (len - 2); 1140 if (GRAPHICS_VERx100(xe) >= 2000) 1141 *cs++ = FIELD_PREP(XE2_XY_FAST_COLOR_BLT_MOCS_INDEX_MASK, gt->mocs.uc_index) | 1142 (pitch - 1); 1143 else 1144 *cs++ = FIELD_PREP(XY_FAST_COLOR_BLT_MOCS_MASK, gt->mocs.uc_index) | 1145 (pitch - 1); 1146 *cs++ = 0; 1147 *cs++ = (size / pitch) << 16 | pitch / 4; 1148 *cs++ = lower_32_bits(src_ofs); 1149 *cs++ = upper_32_bits(src_ofs); 1150 *cs++ = (is_vram ? 
0x0 : 0x1) << XY_FAST_COLOR_BLT_MEM_TYPE_SHIFT; 1151 *cs++ = 0; 1152 *cs++ = 0; 1153 *cs++ = 0; 1154 *cs++ = 0; 1155 1156 if (len > 11) { 1157 *cs++ = 0; 1158 *cs++ = 0; 1159 *cs++ = 0; 1160 *cs++ = 0; 1161 *cs++ = 0; 1162 } 1163 1164 xe_gt_assert(gt, cs - bb->cs == len + bb->len); 1165 1166 bb->len += len; 1167 } 1168 1169 static bool has_service_copy_support(struct xe_gt *gt) 1170 { 1171 /* 1172 * What we care about is whether the architecture was designed with 1173 * service copy functionality (specifically the new MEM_SET / MEM_COPY 1174 * instructions) so check the architectural engine list rather than the 1175 * actual list since these instructions are usable on BCS0 even if 1176 * all of the actual service copy engines (BCS1-BCS8) have been fused 1177 * off. 1178 */ 1179 return gt->info.engine_mask & GENMASK(XE_HW_ENGINE_BCS8, 1180 XE_HW_ENGINE_BCS1); 1181 } 1182 1183 static u32 emit_clear_cmd_len(struct xe_gt *gt) 1184 { 1185 if (has_service_copy_support(gt)) 1186 return PVC_MEM_SET_CMD_LEN_DW; 1187 else 1188 return XY_FAST_COLOR_BLT_DW; 1189 } 1190 1191 static void emit_clear(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs, 1192 u32 size, u32 pitch, bool is_vram) 1193 { 1194 if (has_service_copy_support(gt)) 1195 emit_clear_link_copy(gt, bb, src_ofs, size, pitch); 1196 else 1197 emit_clear_main_copy(gt, bb, src_ofs, size, pitch, 1198 is_vram); 1199 } 1200 1201 /** 1202 * xe_migrate_clear() - Copy content of TTM resources. 1203 * @m: The migration context. 1204 * @bo: The buffer object @dst is currently bound to. 1205 * @dst: The dst TTM resource to be cleared. 1206 * @clear_flags: flags to specify which data to clear: CCS, BO, or both. 1207 * 1208 * Clear the contents of @dst to zero when XE_MIGRATE_CLEAR_FLAG_BO_DATA is set. 1209 * On flat CCS devices, the CCS metadata is cleared to zero with XE_MIGRATE_CLEAR_FLAG_CCS_DATA. 1210 * Set XE_MIGRATE_CLEAR_FLAG_FULL to clear bo as well as CCS metadata. 1211 * TODO: Eliminate the @bo argument. 1212 * 1213 * Return: Pointer to a dma_fence representing the last clear batch, or 1214 * an error pointer on failure. If there is a failure, any clear operation 1215 * started by the function call has been synced. 
1216 */ 1217 struct dma_fence *xe_migrate_clear(struct xe_migrate *m, 1218 struct xe_bo *bo, 1219 struct ttm_resource *dst, 1220 u32 clear_flags) 1221 { 1222 bool clear_vram = mem_type_is_vram(dst->mem_type); 1223 bool clear_bo_data = XE_MIGRATE_CLEAR_FLAG_BO_DATA & clear_flags; 1224 bool clear_ccs = XE_MIGRATE_CLEAR_FLAG_CCS_DATA & clear_flags; 1225 struct xe_gt *gt = m->tile->primary_gt; 1226 struct xe_device *xe = gt_to_xe(gt); 1227 bool clear_only_system_ccs = false; 1228 struct dma_fence *fence = NULL; 1229 u64 size = xe_bo_size(bo); 1230 struct xe_res_cursor src_it; 1231 struct ttm_resource *src = dst; 1232 int err; 1233 1234 if (WARN_ON(!clear_bo_data && !clear_ccs)) 1235 return NULL; 1236 1237 if (!clear_bo_data && clear_ccs && !IS_DGFX(xe)) 1238 clear_only_system_ccs = true; 1239 1240 if (!clear_vram) 1241 xe_res_first_sg(xe_bo_sg(bo), 0, xe_bo_size(bo), &src_it); 1242 else 1243 xe_res_first(src, 0, xe_bo_size(bo), &src_it); 1244 1245 while (size) { 1246 u64 clear_L0_ofs; 1247 u32 clear_L0_pt; 1248 u32 flush_flags = 0; 1249 u64 clear_L0; 1250 struct xe_sched_job *job; 1251 struct xe_bb *bb; 1252 u32 batch_size, update_idx; 1253 u32 pte_flags; 1254 1255 bool usm = xe->info.has_usm; 1256 u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE; 1257 1258 clear_L0 = xe_migrate_res_sizes(m, &src_it); 1259 1260 /* Calculate final sizes and batch size.. */ 1261 pte_flags = clear_vram ? PTE_UPDATE_FLAG_IS_VRAM : 0; 1262 batch_size = 2 + 1263 pte_update_size(m, pte_flags, src, &src_it, 1264 &clear_L0, &clear_L0_ofs, &clear_L0_pt, 1265 clear_bo_data ? emit_clear_cmd_len(gt) : 0, 0, 1266 avail_pts); 1267 1268 if (xe_migrate_needs_ccs_emit(xe)) 1269 batch_size += EMIT_COPY_CCS_DW; 1270 1271 /* Clear commands */ 1272 1273 if (WARN_ON_ONCE(!clear_L0)) 1274 break; 1275 1276 bb = xe_bb_new(gt, batch_size, usm); 1277 if (IS_ERR(bb)) { 1278 err = PTR_ERR(bb); 1279 goto err_sync; 1280 } 1281 1282 size -= clear_L0; 1283 /* Preemption is enabled again by the ring ops. */ 1284 if (clear_vram && xe_migrate_allow_identity(clear_L0, &src_it)) 1285 xe_res_next(&src_it, clear_L0); 1286 else 1287 emit_pte(m, bb, clear_L0_pt, clear_vram, clear_only_system_ccs, 1288 &src_it, clear_L0, dst); 1289 1290 bb->cs[bb->len++] = MI_BATCH_BUFFER_END; 1291 update_idx = bb->len; 1292 1293 if (clear_bo_data) 1294 emit_clear(gt, bb, clear_L0_ofs, clear_L0, XE_PAGE_SIZE, clear_vram); 1295 1296 if (xe_migrate_needs_ccs_emit(xe)) { 1297 emit_copy_ccs(gt, bb, clear_L0_ofs, true, 1298 m->cleared_mem_ofs, false, clear_L0); 1299 flush_flags = MI_FLUSH_DW_CCS; 1300 } 1301 1302 job = xe_bb_create_migration_job(m->q, bb, 1303 xe_migrate_batch_base(m, usm), 1304 update_idx); 1305 if (IS_ERR(job)) { 1306 err = PTR_ERR(job); 1307 goto err; 1308 } 1309 1310 xe_sched_job_add_migrate_flush(job, flush_flags); 1311 if (!fence) { 1312 /* 1313 * There can't be anything userspace related at this 1314 * point, so we just need to respect any potential move 1315 * fences, which are always tracked as 1316 * DMA_RESV_USAGE_KERNEL. 
1317 */ 1318 err = xe_sched_job_add_deps(job, bo->ttm.base.resv, 1319 DMA_RESV_USAGE_KERNEL); 1320 if (err) 1321 goto err_job; 1322 } 1323 1324 mutex_lock(&m->job_mutex); 1325 xe_sched_job_arm(job); 1326 dma_fence_put(fence); 1327 fence = dma_fence_get(&job->drm.s_fence->finished); 1328 xe_sched_job_push(job); 1329 1330 dma_fence_put(m->fence); 1331 m->fence = dma_fence_get(fence); 1332 1333 mutex_unlock(&m->job_mutex); 1334 1335 xe_bb_free(bb, fence); 1336 continue; 1337 1338 err_job: 1339 xe_sched_job_put(job); 1340 err: 1341 xe_bb_free(bb, NULL); 1342 err_sync: 1343 /* Sync partial copies if any. FIXME: job_mutex? */ 1344 if (fence) { 1345 dma_fence_wait(fence, false); 1346 dma_fence_put(fence); 1347 } 1348 1349 return ERR_PTR(err); 1350 } 1351 1352 if (clear_ccs) 1353 bo->ccs_cleared = true; 1354 1355 return fence; 1356 } 1357 1358 static void write_pgtable(struct xe_tile *tile, struct xe_bb *bb, u64 ppgtt_ofs, 1359 const struct xe_vm_pgtable_update_op *pt_op, 1360 const struct xe_vm_pgtable_update *update, 1361 struct xe_migrate_pt_update *pt_update) 1362 { 1363 const struct xe_migrate_pt_update_ops *ops = pt_update->ops; 1364 u32 chunk; 1365 u32 ofs = update->ofs, size = update->qwords; 1366 1367 /* 1368 * If we have 512 entries (max), we would populate it ourselves, 1369 * and update the PDE above it to the new pointer. 1370 * The only time this can only happen if we have to update the top 1371 * PDE. This requires a BO that is almost vm->size big. 1372 * 1373 * This shouldn't be possible in practice.. might change when 16K 1374 * pages are used. Hence the assert. 1375 */ 1376 xe_tile_assert(tile, update->qwords < MAX_NUM_PTE); 1377 if (!ppgtt_ofs) 1378 ppgtt_ofs = xe_migrate_vram_ofs(tile_to_xe(tile), 1379 xe_bo_addr(update->pt_bo, 0, 1380 XE_PAGE_SIZE), false); 1381 1382 do { 1383 u64 addr = ppgtt_ofs + ofs * 8; 1384 1385 chunk = min(size, MAX_PTE_PER_SDI); 1386 1387 /* Ensure populatefn can do memset64 by aligning bb->cs */ 1388 if (!(bb->len & 1)) 1389 bb->cs[bb->len++] = MI_NOOP; 1390 1391 bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk); 1392 bb->cs[bb->len++] = lower_32_bits(addr); 1393 bb->cs[bb->len++] = upper_32_bits(addr); 1394 if (pt_op->bind) 1395 ops->populate(pt_update, tile, NULL, bb->cs + bb->len, 1396 ofs, chunk, update); 1397 else 1398 ops->clear(pt_update, tile, NULL, bb->cs + bb->len, 1399 ofs, chunk, update); 1400 1401 bb->len += chunk * 2; 1402 ofs += chunk; 1403 size -= chunk; 1404 } while (size); 1405 } 1406 1407 struct xe_vm *xe_migrate_get_vm(struct xe_migrate *m) 1408 { 1409 return xe_vm_get(m->q->vm); 1410 } 1411 1412 #if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST) 1413 struct migrate_test_params { 1414 struct xe_test_priv base; 1415 bool force_gpu; 1416 }; 1417 1418 #define to_migrate_test_params(_priv) \ 1419 container_of(_priv, struct migrate_test_params, base) 1420 #endif 1421 1422 static struct dma_fence * 1423 xe_migrate_update_pgtables_cpu(struct xe_migrate *m, 1424 struct xe_migrate_pt_update *pt_update) 1425 { 1426 XE_TEST_DECLARE(struct migrate_test_params *test = 1427 to_migrate_test_params 1428 (xe_cur_kunit_priv(XE_TEST_LIVE_MIGRATE));) 1429 const struct xe_migrate_pt_update_ops *ops = pt_update->ops; 1430 struct xe_vm *vm = pt_update->vops->vm; 1431 struct xe_vm_pgtable_update_ops *pt_update_ops = 1432 &pt_update->vops->pt_update_ops[pt_update->tile_id]; 1433 int err; 1434 u32 i, j; 1435 1436 if (XE_TEST_ONLY(test && test->force_gpu)) 1437 return ERR_PTR(-ETIME); 1438 1439 if (ops->pre_commit) { 1440 pt_update->job = NULL; 1441 err = 
ops->pre_commit(pt_update); 1442 if (err) 1443 return ERR_PTR(err); 1444 } 1445 1446 for (i = 0; i < pt_update_ops->num_ops; ++i) { 1447 const struct xe_vm_pgtable_update_op *pt_op = 1448 &pt_update_ops->ops[i]; 1449 1450 for (j = 0; j < pt_op->num_entries; j++) { 1451 const struct xe_vm_pgtable_update *update = 1452 &pt_op->entries[j]; 1453 1454 if (pt_op->bind) 1455 ops->populate(pt_update, m->tile, 1456 &update->pt_bo->vmap, NULL, 1457 update->ofs, update->qwords, 1458 update); 1459 else 1460 ops->clear(pt_update, m->tile, 1461 &update->pt_bo->vmap, NULL, 1462 update->ofs, update->qwords, update); 1463 } 1464 } 1465 1466 trace_xe_vm_cpu_bind(vm); 1467 xe_device_wmb(vm->xe); 1468 1469 return dma_fence_get_stub(); 1470 } 1471 1472 static struct dma_fence * 1473 __xe_migrate_update_pgtables(struct xe_migrate *m, 1474 struct xe_migrate_pt_update *pt_update, 1475 struct xe_vm_pgtable_update_ops *pt_update_ops) 1476 { 1477 const struct xe_migrate_pt_update_ops *ops = pt_update->ops; 1478 struct xe_tile *tile = m->tile; 1479 struct xe_gt *gt = tile->primary_gt; 1480 struct xe_device *xe = tile_to_xe(tile); 1481 struct xe_sched_job *job; 1482 struct dma_fence *fence; 1483 struct drm_suballoc *sa_bo = NULL; 1484 struct xe_bb *bb; 1485 u32 i, j, batch_size = 0, ppgtt_ofs, update_idx, page_ofs = 0; 1486 u32 num_updates = 0, current_update = 0; 1487 u64 addr; 1488 int err = 0; 1489 bool is_migrate = pt_update_ops->q == m->q; 1490 bool usm = is_migrate && xe->info.has_usm; 1491 1492 for (i = 0; i < pt_update_ops->num_ops; ++i) { 1493 struct xe_vm_pgtable_update_op *pt_op = &pt_update_ops->ops[i]; 1494 struct xe_vm_pgtable_update *updates = pt_op->entries; 1495 1496 num_updates += pt_op->num_entries; 1497 for (j = 0; j < pt_op->num_entries; ++j) { 1498 u32 num_cmds = DIV_ROUND_UP(updates[j].qwords, 1499 MAX_PTE_PER_SDI); 1500 1501 /* align noop + MI_STORE_DATA_IMM cmd prefix */ 1502 batch_size += 4 * num_cmds + updates[j].qwords * 2; 1503 } 1504 } 1505 1506 /* fixed + PTE entries */ 1507 if (IS_DGFX(xe)) 1508 batch_size += 2; 1509 else 1510 batch_size += 6 * (num_updates / MAX_PTE_PER_SDI + 1) + 1511 num_updates * 2; 1512 1513 bb = xe_bb_new(gt, batch_size, usm); 1514 if (IS_ERR(bb)) 1515 return ERR_CAST(bb); 1516 1517 /* For sysmem PTE's, need to map them in our hole.. 
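	 *
	 * Each page of the update area is carved into NUM_VMUSA_UNIT_PER_PAGE
	 * (32) suballocation units of VM_SA_UPDATE_UNIT_SIZE bytes; with 4KiB
	 * pages that is 128 bytes, or NUM_VMUSA_WRITES_PER_UNIT (16) qword PTE
	 * writes, per unit, which is what the drm_suballoc sizing below is
	 * based on.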
*/ 1518 if (!IS_DGFX(xe)) { 1519 u16 pat_index = xe->pat.idx[XE_CACHE_WB]; 1520 u32 ptes, ofs; 1521 1522 ppgtt_ofs = NUM_KERNEL_PDE - 1; 1523 if (!is_migrate) { 1524 u32 num_units = DIV_ROUND_UP(num_updates, 1525 NUM_VMUSA_WRITES_PER_UNIT); 1526 1527 if (num_units > m->vm_update_sa.size) { 1528 err = -ENOBUFS; 1529 goto err_bb; 1530 } 1531 sa_bo = drm_suballoc_new(&m->vm_update_sa, num_units, 1532 GFP_KERNEL, true, 0); 1533 if (IS_ERR(sa_bo)) { 1534 err = PTR_ERR(sa_bo); 1535 goto err_bb; 1536 } 1537 1538 ppgtt_ofs = NUM_KERNEL_PDE + 1539 (drm_suballoc_soffset(sa_bo) / 1540 NUM_VMUSA_UNIT_PER_PAGE); 1541 page_ofs = (drm_suballoc_soffset(sa_bo) % 1542 NUM_VMUSA_UNIT_PER_PAGE) * 1543 VM_SA_UPDATE_UNIT_SIZE; 1544 } 1545 1546 /* Map our PT's to gtt */ 1547 i = 0; 1548 j = 0; 1549 ptes = num_updates; 1550 ofs = ppgtt_ofs * XE_PAGE_SIZE + page_ofs; 1551 while (ptes) { 1552 u32 chunk = min(MAX_PTE_PER_SDI, ptes); 1553 u32 idx = 0; 1554 1555 bb->cs[bb->len++] = MI_STORE_DATA_IMM | 1556 MI_SDI_NUM_QW(chunk); 1557 bb->cs[bb->len++] = ofs; 1558 bb->cs[bb->len++] = 0; /* upper_32_bits */ 1559 1560 for (; i < pt_update_ops->num_ops; ++i) { 1561 struct xe_vm_pgtable_update_op *pt_op = 1562 &pt_update_ops->ops[i]; 1563 struct xe_vm_pgtable_update *updates = pt_op->entries; 1564 1565 for (; j < pt_op->num_entries; ++j, ++current_update, ++idx) { 1566 struct xe_vm *vm = pt_update->vops->vm; 1567 struct xe_bo *pt_bo = updates[j].pt_bo; 1568 1569 if (idx == chunk) 1570 goto next_cmd; 1571 1572 xe_tile_assert(tile, xe_bo_size(pt_bo) == SZ_4K); 1573 1574 /* Map a PT at most once */ 1575 if (pt_bo->update_index < 0) 1576 pt_bo->update_index = current_update; 1577 1578 addr = vm->pt_ops->pte_encode_bo(pt_bo, 0, 1579 pat_index, 0); 1580 bb->cs[bb->len++] = lower_32_bits(addr); 1581 bb->cs[bb->len++] = upper_32_bits(addr); 1582 } 1583 1584 j = 0; 1585 } 1586 1587 next_cmd: 1588 ptes -= chunk; 1589 ofs += chunk * sizeof(u64); 1590 } 1591 1592 bb->cs[bb->len++] = MI_BATCH_BUFFER_END; 1593 update_idx = bb->len; 1594 1595 addr = xe_migrate_vm_addr(ppgtt_ofs, 0) + 1596 (page_ofs / sizeof(u64)) * XE_PAGE_SIZE; 1597 for (i = 0; i < pt_update_ops->num_ops; ++i) { 1598 struct xe_vm_pgtable_update_op *pt_op = 1599 &pt_update_ops->ops[i]; 1600 struct xe_vm_pgtable_update *updates = pt_op->entries; 1601 1602 for (j = 0; j < pt_op->num_entries; ++j) { 1603 struct xe_bo *pt_bo = updates[j].pt_bo; 1604 1605 write_pgtable(tile, bb, addr + 1606 pt_bo->update_index * XE_PAGE_SIZE, 1607 pt_op, &updates[j], pt_update); 1608 } 1609 } 1610 } else { 1611 /* phys pages, no preamble required */ 1612 bb->cs[bb->len++] = MI_BATCH_BUFFER_END; 1613 update_idx = bb->len; 1614 1615 for (i = 0; i < pt_update_ops->num_ops; ++i) { 1616 struct xe_vm_pgtable_update_op *pt_op = 1617 &pt_update_ops->ops[i]; 1618 struct xe_vm_pgtable_update *updates = pt_op->entries; 1619 1620 for (j = 0; j < pt_op->num_entries; ++j) 1621 write_pgtable(tile, bb, 0, pt_op, &updates[j], 1622 pt_update); 1623 } 1624 } 1625 1626 job = xe_bb_create_migration_job(pt_update_ops->q, bb, 1627 xe_migrate_batch_base(m, usm), 1628 update_idx); 1629 if (IS_ERR(job)) { 1630 err = PTR_ERR(job); 1631 goto err_sa; 1632 } 1633 1634 if (ops->pre_commit) { 1635 pt_update->job = job; 1636 err = ops->pre_commit(pt_update); 1637 if (err) 1638 goto err_job; 1639 } 1640 if (is_migrate) 1641 mutex_lock(&m->job_mutex); 1642 1643 xe_sched_job_arm(job); 1644 fence = dma_fence_get(&job->drm.s_fence->finished); 1645 xe_sched_job_push(job); 1646 1647 if (is_migrate) 1648 mutex_unlock(&m->job_mutex); 

	xe_bb_free(bb, fence);
	drm_suballoc_free(sa_bo, fence);

	return fence;

err_job:
	xe_sched_job_put(job);
err_sa:
	drm_suballoc_free(sa_bo, NULL);
err_bb:
	xe_bb_free(bb, NULL);
	return ERR_PTR(err);
}

/**
 * xe_migrate_update_pgtables() - Pipelined page-table update
 * @m: The migrate context.
 * @pt_update: PT update arguments
 *
 * Perform a pipelined page-table update. The update descriptors are typically
 * built under the same lock critical section as a call to this function. If
 * using the default engine for the updates, they will be performed in the
 * order they grab the job_mutex. If different engines are used, external
 * synchronization is needed for overlapping updates to maintain page-table
 * consistency. Note that the meaning of "overlapping" is that the updates
 * touch the same page-table, which might be a higher-level page-directory.
 * If no pipelining is needed, then updates may be performed by the CPU.
 *
 * Return: A dma_fence that, when signaled, indicates the update completion.
 */
struct dma_fence *
xe_migrate_update_pgtables(struct xe_migrate *m,
			   struct xe_migrate_pt_update *pt_update)
{
	struct xe_vm_pgtable_update_ops *pt_update_ops =
		&pt_update->vops->pt_update_ops[pt_update->tile_id];
	struct dma_fence *fence;

	fence = xe_migrate_update_pgtables_cpu(m, pt_update);

	/* -ETIME indicates a job is needed, anything else is a legitimate error */
	if (!IS_ERR(fence) || PTR_ERR(fence) != -ETIME)
		return fence;

	return __xe_migrate_update_pgtables(m, pt_update, pt_update_ops);
}

/**
 * xe_migrate_wait() - Complete all operations using the xe_migrate context
 * @m: Migrate context to wait for.
 *
 * Waits until the GPU no longer uses the migrate context's default engine
 * or its page-table objects. FIXME: What about separate page-table update
 * engines?
 */
void xe_migrate_wait(struct xe_migrate *m)
{
	if (m->fence)
		dma_fence_wait(m->fence, false);
}

static u32 pte_update_cmd_size(u64 size)
{
	u32 num_dword;
	u64 entries = DIV_U64_ROUND_UP(size, XE_PAGE_SIZE);

	XE_WARN_ON(size > MAX_PREEMPTDISABLE_TRANSFER);

	/*
	 * MI_STORE_DATA_IMM command is used to update the page table. Each
	 * instruction can update at most MAX_PTE_PER_SDI pte entries.
To 1722 * update n (n <= MAX_PTE_PER_SDI) pte entries, we need: 1723 * 1724 * - 1 dword for the MI_STORE_DATA_IMM command header (opcode etc) 1725 * - 2 dword for the page table's physical location 1726 * - 2*n dword for value of pte to fill (each pte entry is 2 dwords) 1727 */ 1728 num_dword = (1 + 2) * DIV_U64_ROUND_UP(entries, MAX_PTE_PER_SDI); 1729 num_dword += entries * 2; 1730 1731 return num_dword; 1732 } 1733 1734 static void build_pt_update_batch_sram(struct xe_migrate *m, 1735 struct xe_bb *bb, u32 pt_offset, 1736 dma_addr_t *sram_addr, u32 size) 1737 { 1738 u16 pat_index = tile_to_xe(m->tile)->pat.idx[XE_CACHE_WB]; 1739 u32 ptes; 1740 int i = 0; 1741 1742 ptes = DIV_ROUND_UP(size, XE_PAGE_SIZE); 1743 while (ptes) { 1744 u32 chunk = min(MAX_PTE_PER_SDI, ptes); 1745 1746 bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk); 1747 bb->cs[bb->len++] = pt_offset; 1748 bb->cs[bb->len++] = 0; 1749 1750 pt_offset += chunk * 8; 1751 ptes -= chunk; 1752 1753 while (chunk--) { 1754 u64 addr = sram_addr[i++] & PAGE_MASK; 1755 1756 xe_tile_assert(m->tile, addr); 1757 addr = m->q->vm->pt_ops->pte_encode_addr(m->tile->xe, 1758 addr, pat_index, 1759 0, false, 0); 1760 bb->cs[bb->len++] = lower_32_bits(addr); 1761 bb->cs[bb->len++] = upper_32_bits(addr); 1762 } 1763 } 1764 } 1765 1766 enum xe_migrate_copy_dir { 1767 XE_MIGRATE_COPY_TO_VRAM, 1768 XE_MIGRATE_COPY_TO_SRAM, 1769 }; 1770 1771 #define XE_CACHELINE_BYTES 64ull 1772 #define XE_CACHELINE_MASK (XE_CACHELINE_BYTES - 1) 1773 1774 static struct dma_fence *xe_migrate_vram(struct xe_migrate *m, 1775 unsigned long len, 1776 unsigned long sram_offset, 1777 dma_addr_t *sram_addr, u64 vram_addr, 1778 const enum xe_migrate_copy_dir dir) 1779 { 1780 struct xe_gt *gt = m->tile->primary_gt; 1781 struct xe_device *xe = gt_to_xe(gt); 1782 bool use_usm_batch = xe->info.has_usm; 1783 struct dma_fence *fence = NULL; 1784 u32 batch_size = 2; 1785 u64 src_L0_ofs, dst_L0_ofs; 1786 struct xe_sched_job *job; 1787 struct xe_bb *bb; 1788 u32 update_idx, pt_slot = 0; 1789 unsigned long npages = DIV_ROUND_UP(len + sram_offset, PAGE_SIZE); 1790 unsigned int pitch = len >= PAGE_SIZE && !(len & ~PAGE_MASK) ? 
			     PAGE_SIZE : 4;
	int err;

	if (drm_WARN_ON(&xe->drm, (len & XE_CACHELINE_MASK) ||
			(sram_offset | vram_addr) & XE_CACHELINE_MASK))
		return ERR_PTR(-EOPNOTSUPP);

	xe_assert(xe, npages * PAGE_SIZE <= MAX_PREEMPTDISABLE_TRANSFER);

	batch_size += pte_update_cmd_size(len);
	batch_size += EMIT_COPY_DW;

	bb = xe_bb_new(gt, batch_size, use_usm_batch);
	if (IS_ERR(bb)) {
		err = PTR_ERR(bb);
		return ERR_PTR(err);
	}

	build_pt_update_batch_sram(m, bb, pt_slot * XE_PAGE_SIZE,
				   sram_addr, len + sram_offset);

	if (dir == XE_MIGRATE_COPY_TO_VRAM) {
		src_L0_ofs = xe_migrate_vm_addr(pt_slot, 0) + sram_offset;
		dst_L0_ofs = xe_migrate_vram_ofs(xe, vram_addr, false);
	} else {
		src_L0_ofs = xe_migrate_vram_ofs(xe, vram_addr, false);
		dst_L0_ofs = xe_migrate_vm_addr(pt_slot, 0) + sram_offset;
	}

	bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
	update_idx = bb->len;

	emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, len, pitch);

	job = xe_bb_create_migration_job(m->q, bb,
					 xe_migrate_batch_base(m, use_usm_batch),
					 update_idx);
	if (IS_ERR(job)) {
		err = PTR_ERR(job);
		goto err;
	}

	xe_sched_job_add_migrate_flush(job, 0);

	mutex_lock(&m->job_mutex);
	xe_sched_job_arm(job);
	fence = dma_fence_get(&job->drm.s_fence->finished);
	xe_sched_job_push(job);

	dma_fence_put(m->fence);
	m->fence = dma_fence_get(fence);
	mutex_unlock(&m->job_mutex);

	xe_bb_free(bb, fence);

	return fence;

err:
	xe_bb_free(bb, NULL);

	return ERR_PTR(err);
}

/**
 * xe_migrate_to_vram() - Migrate to VRAM
 * @m: The migration context.
 * @npages: Number of pages to migrate.
 * @src_addr: Array of dma addresses (source of migrate)
 * @dst_addr: Device physical address of VRAM (destination of migrate)
 *
 * Copy from an array of dma addresses to a VRAM device physical address
 *
 * Return: dma fence for migrate to signal completion on success, ERR_PTR on
 * failure
 */
struct dma_fence *xe_migrate_to_vram(struct xe_migrate *m,
				     unsigned long npages,
				     dma_addr_t *src_addr,
				     u64 dst_addr)
{
	return xe_migrate_vram(m, npages * PAGE_SIZE, 0, src_addr, dst_addr,
			       XE_MIGRATE_COPY_TO_VRAM);
}

/**
 * xe_migrate_from_vram() - Migrate from VRAM
 * @m: The migration context.
 * @npages: Number of pages to migrate.
 * @src_addr: Device physical address of VRAM (source of migrate)
 * @dst_addr: Array of dma addresses (destination of migrate)
 *
 * Copy from a VRAM device physical address to an array of dma addresses
 *
 * Return: dma fence for migrate to signal completion on success, ERR_PTR on
 * failure
 */
struct dma_fence *xe_migrate_from_vram(struct xe_migrate *m,
				       unsigned long npages,
				       u64 src_addr,
				       dma_addr_t *dst_addr)
{
	return xe_migrate_vram(m, npages * PAGE_SIZE, 0, dst_addr, src_addr,
			       XE_MIGRATE_COPY_TO_SRAM);
}

static void xe_migrate_dma_unmap(struct xe_device *xe, dma_addr_t *dma_addr,
				 int len, int write)
{
	unsigned long i, npages = DIV_ROUND_UP(len, PAGE_SIZE);

	for (i = 0; i < npages; ++i) {
		if (!dma_addr[i])
			break;

		dma_unmap_page(xe->drm.dev, dma_addr[i], PAGE_SIZE,
			       write ?

enum xe_migrate_copy_dir {
	XE_MIGRATE_COPY_TO_VRAM,
	XE_MIGRATE_COPY_TO_SRAM,
};

#define XE_CACHELINE_BYTES	64ull
#define XE_CACHELINE_MASK	(XE_CACHELINE_BYTES - 1)

static struct dma_fence *xe_migrate_vram(struct xe_migrate *m,
					 unsigned long len,
					 unsigned long sram_offset,
					 dma_addr_t *sram_addr, u64 vram_addr,
					 const enum xe_migrate_copy_dir dir)
{
	struct xe_gt *gt = m->tile->primary_gt;
	struct xe_device *xe = gt_to_xe(gt);
	bool use_usm_batch = xe->info.has_usm;
	struct dma_fence *fence = NULL;
	u32 batch_size = 2;
	u64 src_L0_ofs, dst_L0_ofs;
	struct xe_sched_job *job;
	struct xe_bb *bb;
	u32 update_idx, pt_slot = 0;
	unsigned long npages = DIV_ROUND_UP(len + sram_offset, PAGE_SIZE);
	unsigned int pitch = len >= PAGE_SIZE && !(len & ~PAGE_MASK) ?
			     PAGE_SIZE : 4;
	int err;

	if (drm_WARN_ON(&xe->drm, (len & XE_CACHELINE_MASK) ||
			(sram_offset | vram_addr) & XE_CACHELINE_MASK))
		return ERR_PTR(-EOPNOTSUPP);

	xe_assert(xe, npages * PAGE_SIZE <= MAX_PREEMPTDISABLE_TRANSFER);

	batch_size += pte_update_cmd_size(len);
	batch_size += EMIT_COPY_DW;

	bb = xe_bb_new(gt, batch_size, use_usm_batch);
	if (IS_ERR(bb)) {
		err = PTR_ERR(bb);
		return ERR_PTR(err);
	}

	build_pt_update_batch_sram(m, bb, pt_slot * XE_PAGE_SIZE,
				   sram_addr, len + sram_offset);

	if (dir == XE_MIGRATE_COPY_TO_VRAM) {
		src_L0_ofs = xe_migrate_vm_addr(pt_slot, 0) + sram_offset;
		dst_L0_ofs = xe_migrate_vram_ofs(xe, vram_addr, false);
	} else {
		src_L0_ofs = xe_migrate_vram_ofs(xe, vram_addr, false);
		dst_L0_ofs = xe_migrate_vm_addr(pt_slot, 0) + sram_offset;
	}

	bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
	update_idx = bb->len;

	emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, len, pitch);

	job = xe_bb_create_migration_job(m->q, bb,
					 xe_migrate_batch_base(m, use_usm_batch),
					 update_idx);
	if (IS_ERR(job)) {
		err = PTR_ERR(job);
		goto err;
	}

	xe_sched_job_add_migrate_flush(job, 0);

	mutex_lock(&m->job_mutex);
	xe_sched_job_arm(job);
	fence = dma_fence_get(&job->drm.s_fence->finished);
	xe_sched_job_push(job);

	dma_fence_put(m->fence);
	m->fence = dma_fence_get(fence);
	mutex_unlock(&m->job_mutex);

	xe_bb_free(bb, fence);

	return fence;

err:
	xe_bb_free(bb, NULL);

	return ERR_PTR(err);
}

/**
 * xe_migrate_to_vram() - Migrate to VRAM
 * @m: The migration context.
 * @npages: Number of pages to migrate.
 * @src_addr: Array of DMA addresses (source of the migration)
 * @dst_addr: Device physical address of VRAM (destination of the migration)
 *
 * Copy from an array of DMA addresses to a VRAM device physical address.
 *
 * Return: dma fence for migrate to signal completion on success, ERR_PTR on
 * failure
 */
struct dma_fence *xe_migrate_to_vram(struct xe_migrate *m,
				     unsigned long npages,
				     dma_addr_t *src_addr,
				     u64 dst_addr)
{
	return xe_migrate_vram(m, npages * PAGE_SIZE, 0, src_addr, dst_addr,
			       XE_MIGRATE_COPY_TO_VRAM);
}

/**
 * xe_migrate_from_vram() - Migrate from VRAM
 * @m: The migration context.
 * @npages: Number of pages to migrate.
 * @src_addr: Device physical address of VRAM (source of the migration)
 * @dst_addr: Array of DMA addresses (destination of the migration)
 *
 * Copy from a VRAM device physical address to an array of DMA addresses.
 *
 * Return: dma fence for migrate to signal completion on success, ERR_PTR on
 * failure
 */
struct dma_fence *xe_migrate_from_vram(struct xe_migrate *m,
				       unsigned long npages,
				       u64 src_addr,
				       dma_addr_t *dst_addr)
{
	return xe_migrate_vram(m, npages * PAGE_SIZE, 0, dst_addr, src_addr,
			       XE_MIGRATE_COPY_TO_SRAM);
}
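
/*
 * Illustrative usage sketch (editor's addition, not driver code): a caller
 * that already holds one dma_addr_t per page can migrate them to a VRAM
 * device physical address and wait synchronously on the returned fence.
 * The names "npages", "dma_addrs" and "vram_dpa" are hypothetical.
 *
 *	struct dma_fence *fence;
 *
 *	fence = xe_migrate_to_vram(m, npages, dma_addrs, vram_dpa);
 *	if (IS_ERR(fence))
 *		return PTR_ERR(fence);
 *
 *	dma_fence_wait(fence, false);
 *	dma_fence_put(fence);
 *
 * xe_migrate_from_vram() is the mirror image, copying from vram_dpa back
 * into the pages behind dma_addrs.
 */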

static void xe_migrate_dma_unmap(struct xe_device *xe, dma_addr_t *dma_addr,
				 int len, int write)
{
	unsigned long i, npages = DIV_ROUND_UP(len, PAGE_SIZE);

	for (i = 0; i < npages; ++i) {
		if (!dma_addr[i])
			break;

		dma_unmap_page(xe->drm.dev, dma_addr[i], PAGE_SIZE,
			       write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
	}
	kfree(dma_addr);
}

static dma_addr_t *xe_migrate_dma_map(struct xe_device *xe,
				      void *buf, int len, int write)
{
	dma_addr_t *dma_addr;
	unsigned long i, npages = DIV_ROUND_UP(len, PAGE_SIZE);

	dma_addr = kcalloc(npages, sizeof(*dma_addr), GFP_KERNEL);
	if (!dma_addr)
		return ERR_PTR(-ENOMEM);

	for (i = 0; i < npages; ++i) {
		dma_addr_t addr;
		struct page *page;

		if (is_vmalloc_addr(buf))
			page = vmalloc_to_page(buf);
		else
			page = virt_to_page(buf);

		addr = dma_map_page(xe->drm.dev, page, 0, PAGE_SIZE,
				    write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
		if (dma_mapping_error(xe->drm.dev, addr))
			goto err_fault;

		dma_addr[i] = addr;
		buf += PAGE_SIZE;
	}

	return dma_addr;

err_fault:
	xe_migrate_dma_unmap(xe, dma_addr, len, write);
	return ERR_PTR(-EFAULT);
}

/**
 * xe_migrate_access_memory() - Access memory of a BO via GPU
 *
 * @m: The migration context.
 * @bo: buffer object
 * @offset: access offset into buffer object
 * @buf: pointer to caller memory to read into or write from
 * @len: length of access
 * @write: write access
 *
 * Access memory of a BO via the GPU, either reading into or writing from a
 * passed-in pointer. The pointer is DMA mapped for GPU access and GPU
 * commands are issued to copy between it and the BO.
 *
 * Return:
 * 0 if successful, negative error code on failure.
 */
int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo,
			     unsigned long offset, void *buf, int len,
			     int write)
{
	struct xe_tile *tile = m->tile;
	struct xe_device *xe = tile_to_xe(tile);
	struct xe_res_cursor cursor;
	struct dma_fence *fence = NULL;
	dma_addr_t *dma_addr;
	unsigned long page_offset = (unsigned long)buf & ~PAGE_MASK;
	int bytes_left = len, current_page = 0;
	void *orig_buf = buf;

	xe_bo_assert_held(bo);

	/* Use a bounce buffer for small and unaligned accesses */
	if (!IS_ALIGNED(len, XE_CACHELINE_BYTES) ||
	    !IS_ALIGNED((unsigned long)buf + offset, XE_CACHELINE_BYTES)) {
		int buf_offset = 0;

		/*
		 * Less than ideal for large unaligned accesses, but those
		 * should be fairly rare; this can be fixed up if it becomes
		 * common.
		 */
		do {
			u8 bounce[XE_CACHELINE_BYTES];
			void *ptr = (void *)bounce;
			int err;
			int copy_bytes = min_t(int, bytes_left,
					       XE_CACHELINE_BYTES -
					       (offset & XE_CACHELINE_MASK));
			int ptr_offset = offset & XE_CACHELINE_MASK;

			err = xe_migrate_access_memory(m, bo,
						       offset &
						       ~XE_CACHELINE_MASK,
						       (void *)ptr,
						       sizeof(bounce), 0);
			if (err)
				return err;

			if (write) {
				memcpy(ptr + ptr_offset, buf + buf_offset,
				       copy_bytes);

				err = xe_migrate_access_memory(m, bo,
							       offset & ~XE_CACHELINE_MASK,
							       (void *)ptr,
							       sizeof(bounce), write);
				if (err)
					return err;
			} else {
				memcpy(buf + buf_offset, ptr + ptr_offset,
				       copy_bytes);
			}

			bytes_left -= copy_bytes;
			buf_offset += copy_bytes;
			offset += copy_bytes;
		} while (bytes_left);

		return 0;
	}

	dma_addr = xe_migrate_dma_map(xe, buf, len + page_offset, write);
	if (IS_ERR(dma_addr))
		return PTR_ERR(dma_addr);

	xe_res_first(bo->ttm.resource, offset, xe_bo_size(bo) - offset, &cursor);

	do {
		struct dma_fence *__fence;
		u64 vram_addr = vram_region_gpu_offset(bo->ttm.resource) +
			cursor.start;
		int current_bytes;

		if (cursor.size > MAX_PREEMPTDISABLE_TRANSFER)
			current_bytes = min_t(int, bytes_left,
					      MAX_PREEMPTDISABLE_TRANSFER);
		else
			current_bytes = min_t(int, bytes_left, cursor.size);

		if (fence)
			dma_fence_put(fence);

		__fence = xe_migrate_vram(m, current_bytes,
					  (unsigned long)buf & ~PAGE_MASK,
					  dma_addr + current_page,
					  vram_addr, write ?
					  XE_MIGRATE_COPY_TO_VRAM :
					  XE_MIGRATE_COPY_TO_SRAM);
		if (IS_ERR(__fence)) {
			if (fence)
				dma_fence_wait(fence, false);
			fence = __fence;
			goto out_err;
		}
		fence = __fence;

		buf += current_bytes;
		offset += current_bytes;
		current_page = (int)(buf - orig_buf) / PAGE_SIZE;
		bytes_left -= current_bytes;
		if (bytes_left)
			xe_res_next(&cursor, current_bytes);
	} while (bytes_left);

	dma_fence_wait(fence, false);
	dma_fence_put(fence);

out_err:
	xe_migrate_dma_unmap(xe, dma_addr, len + page_offset, write);
	return IS_ERR(fence) ? PTR_ERR(fence) : 0;
}
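
/*
 * Illustrative usage sketch (editor's addition, not driver code): with the
 * BO's dma-resv held, as xe_bo_assert_held() above requires, a debugger-style
 * read of 256 bytes at byte offset 0x1000 into a caller buffer could look
 * like the following. "bo" and "data" are hypothetical.
 *
 *	u8 data[256];
 *	int err;
 *
 *	err = xe_migrate_access_memory(m, bo, 0x1000, data, sizeof(data), 0);
 *
 * The same call with write == 1 copies the contents of "data" into the BO
 * instead; unaligned lengths and offsets take the bounce-buffer path above.
 */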

/**
 * xe_migrate_job_lock() - Lock migrate job lock
 * @m: The migration context.
 * @q: Queue associated with the operation which requires a lock
 *
 * Lock the migrate job lock if the queue is a migration queue, otherwise
 * assert the VM's dma-resv is held (user queues have their own locking).
 */
void xe_migrate_job_lock(struct xe_migrate *m, struct xe_exec_queue *q)
{
	bool is_migrate = q == m->q;

	if (is_migrate)
		mutex_lock(&m->job_mutex);
	else
		xe_vm_assert_held(q->vm); /* User queue VMs should be locked */
}

/**
 * xe_migrate_job_unlock() - Unlock migrate job lock
 * @m: The migration context.
 * @q: Queue associated with the operation which requires a lock
 *
 * Unlock the migrate job lock if the queue is a migration queue, otherwise
 * assert the VM's dma-resv is held (user queues have their own locking).
 */
void xe_migrate_job_unlock(struct xe_migrate *m, struct xe_exec_queue *q)
{
	bool is_migrate = q == m->q;

	if (is_migrate)
		mutex_unlock(&m->job_mutex);
	else
		xe_vm_assert_held(q->vm); /* User queue VMs should be locked */
}

#if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST)
#include "tests/xe_migrate.c"
#endif
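
/*
 * Illustrative usage sketch for xe_migrate_job_lock()/xe_migrate_job_unlock()
 * above (editor's addition, not driver code): callers bracket job-timeline
 * updates with the pair, passing the queue they operate on; "q" and the
 * update itself are hypothetical.
 *
 *	xe_migrate_job_lock(m, q);
 *	... update state tied to the migration job timeline ...
 *	xe_migrate_job_unlock(m, q);
 */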