1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright © 2020 Intel Corporation 4 */ 5 6 #include "xe_migrate.h" 7 8 #include <linux/bitfield.h> 9 #include <linux/sizes.h> 10 11 #include <drm/drm_managed.h> 12 #include <drm/drm_pagemap.h> 13 #include <drm/ttm/ttm_tt.h> 14 #include <uapi/drm/xe_drm.h> 15 16 #include <generated/xe_wa_oob.h> 17 18 #include "instructions/xe_gpu_commands.h" 19 #include "instructions/xe_mi_commands.h" 20 #include "regs/xe_gtt_defs.h" 21 #include "tests/xe_test.h" 22 #include "xe_assert.h" 23 #include "xe_bb.h" 24 #include "xe_bo.h" 25 #include "xe_exec_queue.h" 26 #include "xe_ggtt.h" 27 #include "xe_gt.h" 28 #include "xe_gt_printk.h" 29 #include "xe_hw_engine.h" 30 #include "xe_lrc.h" 31 #include "xe_map.h" 32 #include "xe_mem_pool.h" 33 #include "xe_mocs.h" 34 #include "xe_pat.h" 35 #include "xe_printk.h" 36 #include "xe_pt.h" 37 #include "xe_res_cursor.h" 38 #include "xe_sa.h" 39 #include "xe_sched_job.h" 40 #include "xe_sriov_vf_ccs.h" 41 #include "xe_svm.h" 42 #include "xe_sync.h" 43 #include "xe_trace_bo.h" 44 #include "xe_validation.h" 45 #include "xe_vm.h" 46 #include "xe_vram.h" 47 48 /** 49 * struct xe_migrate - migrate context. 50 */ 51 struct xe_migrate { 52 /** @q: Default exec queue used for migration */ 53 struct xe_exec_queue *q; 54 /** @tile: Backpointer to the tile this struct xe_migrate belongs to. */ 55 struct xe_tile *tile; 56 /** @job_mutex: Timeline mutex for @eng. */ 57 struct mutex job_mutex; 58 /** @pt_bo: Page-table buffer object. */ 59 struct xe_bo *pt_bo; 60 /** @batch_base_ofs: VM offset of the migration batch buffer */ 61 u64 batch_base_ofs; 62 /** @usm_batch_base_ofs: VM offset of the usm batch buffer */ 63 u64 usm_batch_base_ofs; 64 /** @cleared_mem_ofs: VM offset of @cleared_bo. 
*/ 65 u64 cleared_mem_ofs; 66 /** @large_page_copy_ofs: VM offset of 2M pages used for large copies */ 67 u64 large_page_copy_ofs; 68 /** 69 * @large_page_copy_pdes: BO offset to writeout 2M pages (PDEs) used for 70 * large copies 71 */ 72 u64 large_page_copy_pdes; 73 /** 74 * @fence: dma-fence representing the last migration job batch. 75 * Protected by @job_mutex. 76 */ 77 struct dma_fence *fence; 78 /** 79 * @vm_update_sa: For integrated, used to suballocate page-tables 80 * out of the pt_bo. 81 */ 82 struct drm_suballoc_manager vm_update_sa; 83 /** @min_chunk_size: For dgfx, Minimum chunk size */ 84 u64 min_chunk_size; 85 }; 86 87 #define MAX_PREEMPTDISABLE_TRANSFER SZ_8M /* Around 1ms. */ 88 #define MAX_CCS_LIMITED_TRANSFER SZ_4M /* XE_PAGE_SIZE * (FIELD_MAX(XE2_CCS_SIZE_MASK) + 1) */ 89 #define NUM_KERNEL_PDE 15 90 #define NUM_PT_SLOTS 32 91 #define LEVEL0_PAGE_TABLE_ENCODE_SIZE SZ_2M 92 #define MAX_NUM_PTE 512 93 #define IDENTITY_OFFSET 256ULL 94 95 /* 96 * Although MI_STORE_DATA_IMM's "length" field is 10-bits, 0x3FE is the largest 97 * legal value accepted. Since that instruction field is always stored in 98 * (val-2) format, this translates to 0x400 dwords for the true maximum length 99 * of the instruction. Subtracting the instruction header (1 dword) and 100 * address (2 dwords), that leaves 0x3FD dwords (0x1FE qwords) for PTE values. 
101 */ 102 #define MAX_PTE_PER_SDI 0x1FEU 103 104 static void xe_migrate_fini(void *arg) 105 { 106 struct xe_migrate *m = arg; 107 108 xe_vm_lock(m->q->vm, false); 109 xe_bo_unpin(m->pt_bo); 110 xe_vm_unlock(m->q->vm); 111 112 dma_fence_put(m->fence); 113 xe_bo_put(m->pt_bo); 114 drm_suballoc_manager_fini(&m->vm_update_sa); 115 mutex_destroy(&m->job_mutex); 116 xe_vm_close_and_put(m->q->vm); 117 xe_exec_queue_put(m->q); 118 } 119 120 static u64 xe_migrate_vm_addr(u64 slot, u32 level) 121 { 122 XE_WARN_ON(slot >= NUM_PT_SLOTS); 123 124 /* First slot is reserved for mapping of PT bo and bb, start from 1 */ 125 return (slot + 1ULL) << xe_pt_shift(level + 1); 126 } 127 128 static u64 xe_migrate_vram_ofs(struct xe_device *xe, u64 addr, bool is_comp_pte) 129 { 130 /* 131 * Remove the DPA to get a correct offset into identity table for the 132 * migrate offset 133 */ 134 u64 identity_offset = IDENTITY_OFFSET; 135 136 if (GRAPHICS_VER(xe) >= 20 && is_comp_pte) 137 identity_offset += DIV_ROUND_UP_ULL(xe_vram_region_actual_physical_size 138 (xe->mem.vram), SZ_1G); 139 140 addr -= xe_vram_region_dpa_base(xe->mem.vram); 141 return addr + (identity_offset << xe_pt_shift(2)); 142 } 143 144 static void xe_migrate_program_identity(struct xe_device *xe, struct xe_vm *vm, struct xe_bo *bo, 145 u64 map_ofs, u64 vram_offset, u16 pat_index, u64 pt_2m_ofs) 146 { 147 struct xe_vram_region *vram = xe->mem.vram; 148 resource_size_t dpa_base = xe_vram_region_dpa_base(vram); 149 u64 pos, ofs, flags; 150 u64 entry; 151 /* XXX: Unclear if this should be usable_size? 
*/ 152 u64 vram_limit = xe_vram_region_actual_physical_size(vram) + dpa_base; 153 u32 level = 2; 154 155 ofs = map_ofs + XE_PAGE_SIZE * level + vram_offset * 8; 156 flags = vm->pt_ops->pte_encode_addr(xe, 0, pat_index, level, 157 true, 0); 158 159 xe_assert(xe, IS_ALIGNED(xe_vram_region_usable_size(vram), SZ_2M)); 160 161 /* 162 * Use 1GB pages when possible, last chunk always use 2M 163 * pages as mixing reserved memory (stolen, WOCPM) with a single 164 * mapping is not allowed on certain platforms. 165 */ 166 for (pos = dpa_base; pos < vram_limit; 167 pos += SZ_1G, ofs += 8) { 168 if (pos + SZ_1G >= vram_limit) { 169 entry = vm->pt_ops->pde_encode_bo(bo, pt_2m_ofs); 170 xe_map_wr(xe, &bo->vmap, ofs, u64, entry); 171 172 flags = vm->pt_ops->pte_encode_addr(xe, 0, 173 pat_index, 174 level - 1, 175 true, 0); 176 177 for (ofs = pt_2m_ofs; pos < vram_limit; 178 pos += SZ_2M, ofs += 8) 179 xe_map_wr(xe, &bo->vmap, ofs, u64, pos | flags); 180 break; /* Ensure pos == vram_limit assert correct */ 181 } 182 183 xe_map_wr(xe, &bo->vmap, ofs, u64, pos | flags); 184 } 185 186 xe_assert(xe, pos == vram_limit); 187 } 188 189 static int xe_migrate_pt_bo_alloc(struct xe_tile *tile, struct xe_migrate *m, 190 struct xe_vm *vm, struct drm_exec *exec) 191 { 192 struct xe_bo *bo, *batch = tile->mem.kernel_bb_pool->bo; 193 u32 num_entries = NUM_PT_SLOTS; 194 195 /* Can't bump NUM_PT_SLOTS too high */ 196 BUILD_BUG_ON(NUM_PT_SLOTS > SZ_2M/XE_PAGE_SIZE); 197 /* Must be a multiple of 64K to support all platforms */ 198 BUILD_BUG_ON(NUM_PT_SLOTS * XE_PAGE_SIZE % SZ_64K); 199 /* And one slot reserved for the 4KiB page table updates */ 200 BUILD_BUG_ON(!(NUM_KERNEL_PDE & 1)); 201 202 /* Need to be sure everything fits in the first PT, or create more */ 203 xe_tile_assert(tile, m->batch_base_ofs + xe_bo_size(batch) < SZ_2M); 204 205 bo = xe_bo_create_pin_map(vm->xe, tile, vm, 206 num_entries * XE_PAGE_SIZE, 207 ttm_bo_type_kernel, 208 XE_BO_FLAG_VRAM_IF_DGFX(tile) | 209 XE_BO_FLAG_PAGETABLE, 
exec); 210 if (IS_ERR(bo)) 211 return PTR_ERR(bo); 212 213 m->pt_bo = bo; 214 return 0; 215 } 216 217 static void xe_migrate_prepare_vm(struct xe_tile *tile, struct xe_migrate *m, 218 struct xe_vm *vm, u32 *ofs) 219 { 220 struct xe_device *xe = tile_to_xe(tile); 221 u16 pat_index = xe_cache_pat_idx(xe, XE_CACHE_WB); 222 u8 id = tile->id; 223 u32 num_entries = NUM_PT_SLOTS, num_level = vm->pt_root[id]->level; 224 #define VRAM_IDENTITY_MAP_COUNT 2 225 u32 num_setup = num_level + VRAM_IDENTITY_MAP_COUNT; 226 #undef VRAM_IDENTITY_MAP_COUNT 227 u32 map_ofs, level, i; 228 struct xe_bo *bo = m->pt_bo, *batch = tile->mem.kernel_bb_pool->bo; 229 u64 entry, pt29_ofs; 230 231 /* PT30 & PT31 reserved for 2M identity map */ 232 pt29_ofs = xe_bo_size(bo) - 3 * XE_PAGE_SIZE; 233 entry = vm->pt_ops->pde_encode_bo(bo, pt29_ofs); 234 xe_pt_write(xe, &vm->pt_root[id]->bo->vmap, 0, entry); 235 236 map_ofs = (num_entries - num_setup) * XE_PAGE_SIZE; 237 238 /* Map the entire BO in our level 0 pt */ 239 for (i = 0, level = 0; i < num_entries; level++) { 240 entry = vm->pt_ops->pte_encode_bo(bo, i * XE_PAGE_SIZE, 241 pat_index, 0); 242 243 xe_map_wr(xe, &bo->vmap, map_ofs + level * 8, u64, entry); 244 245 if (vm->flags & XE_VM_FLAG_64K) 246 i += 16; 247 else 248 i += 1; 249 } 250 251 if (!IS_DGFX(xe)) { 252 /* Write out batch too */ 253 m->batch_base_ofs = NUM_PT_SLOTS * XE_PAGE_SIZE; 254 for (i = 0; i < xe_bo_size(batch); 255 i += vm->flags & XE_VM_FLAG_64K ? XE_64K_PAGE_SIZE : 256 XE_PAGE_SIZE) { 257 entry = vm->pt_ops->pte_encode_bo(batch, i, 258 pat_index, 0); 259 260 xe_map_wr(xe, &bo->vmap, map_ofs + level * 8, u64, 261 entry); 262 level++; 263 } 264 if (xe->info.has_usm) { 265 xe_tile_assert(tile, xe_bo_size(batch) == SZ_1M); 266 267 batch = tile->primary_gt->usm.bb_pool->bo; 268 m->usm_batch_base_ofs = m->batch_base_ofs + SZ_1M; 269 xe_tile_assert(tile, xe_bo_size(batch) == SZ_512K); 270 271 for (i = 0; i < xe_bo_size(batch); 272 i += vm->flags & XE_VM_FLAG_64K ? 
XE_64K_PAGE_SIZE : 273 XE_PAGE_SIZE) { 274 entry = vm->pt_ops->pte_encode_bo(batch, i, 275 pat_index, 0); 276 277 xe_map_wr(xe, &bo->vmap, map_ofs + level * 8, u64, 278 entry); 279 level++; 280 } 281 } 282 } else { 283 u64 batch_addr = xe_bo_addr(batch, 0, XE_PAGE_SIZE); 284 285 m->batch_base_ofs = xe_migrate_vram_ofs(xe, batch_addr, false); 286 287 if (xe->info.has_usm) { 288 batch = tile->primary_gt->usm.bb_pool->bo; 289 batch_addr = xe_bo_addr(batch, 0, XE_PAGE_SIZE); 290 m->usm_batch_base_ofs = xe_migrate_vram_ofs(xe, batch_addr, false); 291 } 292 } 293 294 for (level = 1; level < num_level; level++) { 295 u32 flags = 0; 296 297 if (vm->flags & XE_VM_FLAG_64K && level == 1) 298 flags = XE_PDE_64K; 299 300 entry = vm->pt_ops->pde_encode_bo(bo, map_ofs + (u64)(level - 1) * 301 XE_PAGE_SIZE); 302 xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE * level, u64, 303 entry | flags); 304 } 305 306 /* Write PDE's that point to our BO. */ 307 for (i = 0; i < map_ofs / XE_PAGE_SIZE; i++) { 308 entry = vm->pt_ops->pde_encode_bo(bo, (u64)i * XE_PAGE_SIZE); 309 310 xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE + 311 (i + 1) * 8, u64, entry); 312 } 313 314 /* Reserve 2M PDEs */ 315 level = 1; 316 m->large_page_copy_ofs = NUM_PT_SLOTS << xe_pt_shift(level); 317 m->large_page_copy_pdes = map_ofs + XE_PAGE_SIZE * level + 318 NUM_PT_SLOTS * 8; 319 320 /* Set up a 1GiB NULL mapping at 255GiB offset. 
*/ 321 level = 2; 322 xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE * level + 255 * 8, u64, 323 vm->pt_ops->pte_encode_addr(xe, 0, pat_index, level, IS_DGFX(xe), 0) 324 | XE_PTE_NULL); 325 m->cleared_mem_ofs = (255ULL << xe_pt_shift(level)); 326 327 /* Identity map the entire vram at 256GiB offset */ 328 if (IS_DGFX(xe)) { 329 u64 pt30_ofs = xe_bo_size(bo) - 2 * XE_PAGE_SIZE; 330 resource_size_t actual_phy_size = xe_vram_region_actual_physical_size(xe->mem.vram); 331 332 xe_migrate_program_identity(xe, vm, bo, map_ofs, IDENTITY_OFFSET, 333 pat_index, pt30_ofs); 334 xe_assert(xe, actual_phy_size <= (MAX_NUM_PTE - IDENTITY_OFFSET) * SZ_1G); 335 336 /* 337 * Identity map the entire vram for compressed pat_index for xe2+ 338 * if flat ccs is enabled. 339 */ 340 if (GRAPHICS_VER(xe) >= 20 && xe_device_has_flat_ccs(xe)) { 341 u16 comp_pat_index = xe_cache_pat_idx(xe, XE_CACHE_NONE_COMPRESSION); 342 u64 vram_offset = IDENTITY_OFFSET + 343 DIV_ROUND_UP_ULL(actual_phy_size, SZ_1G); 344 u64 pt31_ofs = xe_bo_size(bo) - XE_PAGE_SIZE; 345 346 xe_assert(xe, actual_phy_size <= (MAX_NUM_PTE - IDENTITY_OFFSET - 347 IDENTITY_OFFSET / 2) * SZ_1G); 348 xe_migrate_program_identity(xe, vm, bo, map_ofs, vram_offset, 349 comp_pat_index, pt31_ofs); 350 } 351 } 352 353 if (ofs) 354 *ofs = map_ofs; 355 } 356 357 static void xe_migrate_suballoc_manager_init(struct xe_migrate *m, u32 map_ofs) 358 { 359 /* 360 * Example layout created above, with root level = 3: 361 * [PT0...PT7]: kernel PT's for copy/clear; 64 or 4KiB PTE's 362 * [PT8]: Kernel PT for VM_BIND, 4 KiB PTE's 363 * [PT9...PT26]: Userspace PT's for VM_BIND, 4 KiB PTE's 364 * [PT27 = PDE 0] [PT28 = PDE 1] [PT29 = PDE 2] [PT30 & PT31 = 2M vram identity map] 365 * 366 * This makes the lowest part of the VM point to the pagetables. 367 * Hence the lowest 2M in the vm should point to itself, with a few writes 368 * and flushes, other parts of the VM can be used either for copying and 369 * clearing. 
370 * 371 * For performance, the kernel reserves PDE's, so about 20 are left 372 * for async VM updates. 373 * 374 * To make it easier to work, each scratch PT is put in slot (1 + PT #) 375 * everywhere, this allows lockless updates to scratch pages by using 376 * the different addresses in VM. 377 */ 378 #define NUM_VMUSA_UNIT_PER_PAGE 32 379 #define VM_SA_UPDATE_UNIT_SIZE (XE_PAGE_SIZE / NUM_VMUSA_UNIT_PER_PAGE) 380 #define NUM_VMUSA_WRITES_PER_UNIT (VM_SA_UPDATE_UNIT_SIZE / sizeof(u64)) 381 drm_suballoc_manager_init(&m->vm_update_sa, 382 (size_t)(map_ofs / XE_PAGE_SIZE - NUM_KERNEL_PDE) * 383 NUM_VMUSA_UNIT_PER_PAGE, 0); 384 } 385 386 /* 387 * Including the reserved copy engine is required to avoid deadlocks due to 388 * migrate jobs servicing the faults gets stuck behind the job that faulted. 389 */ 390 static u32 xe_migrate_usm_logical_mask(struct xe_gt *gt) 391 { 392 u32 logical_mask = 0; 393 struct xe_hw_engine *hwe; 394 enum xe_hw_engine_id id; 395 396 for_each_hw_engine(hwe, gt, id) { 397 if (hwe->class != XE_ENGINE_CLASS_COPY) 398 continue; 399 400 if (xe_gt_is_usm_hwe(gt, hwe)) 401 logical_mask |= BIT(hwe->logical_instance); 402 } 403 404 return logical_mask; 405 } 406 407 static bool xe_migrate_needs_ccs_emit(struct xe_device *xe) 408 { 409 return xe_device_has_flat_ccs(xe) && !(GRAPHICS_VER(xe) >= 20 && IS_DGFX(xe)); 410 } 411 412 /** 413 * xe_migrate_alloc - Allocate a migrate struct for a given &xe_tile 414 * @tile: &xe_tile 415 * 416 * Allocates a &xe_migrate for a given tile. 417 * 418 * Return: &xe_migrate on success, or NULL when out of memory. 
419 */ 420 struct xe_migrate *xe_migrate_alloc(struct xe_tile *tile) 421 { 422 struct xe_migrate *m = drmm_kzalloc(&tile_to_xe(tile)->drm, sizeof(*m), GFP_KERNEL); 423 424 if (m) 425 m->tile = tile; 426 return m; 427 } 428 429 static int xe_migrate_lock_prepare_vm(struct xe_tile *tile, struct xe_migrate *m, struct xe_vm *vm) 430 { 431 struct xe_device *xe = tile_to_xe(tile); 432 struct xe_validation_ctx ctx; 433 struct drm_exec exec; 434 u32 map_ofs; 435 int err = 0; 436 437 xe_validation_guard(&ctx, &xe->val, &exec, (struct xe_val_flags) {}, err) { 438 err = xe_vm_drm_exec_lock(vm, &exec); 439 if (err) 440 return err; 441 442 drm_exec_retry_on_contention(&exec); 443 444 err = xe_migrate_pt_bo_alloc(tile, m, vm, &exec); 445 if (err) 446 return err; 447 448 xe_migrate_prepare_vm(tile, m, vm, &map_ofs); 449 xe_migrate_suballoc_manager_init(m, map_ofs); 450 drm_exec_retry_on_contention(&exec); 451 xe_validation_retry_on_oom(&ctx, &err); 452 } 453 454 return err; 455 } 456 457 /** 458 * xe_migrate_init() - Initialize a migrate context 459 * @m: The migration context 460 * 461 * Return: 0 if successful, negative error code on failure 462 */ 463 int xe_migrate_init(struct xe_migrate *m) 464 { 465 struct xe_tile *tile = m->tile; 466 struct xe_gt *primary_gt = tile->primary_gt; 467 struct xe_device *xe = tile_to_xe(tile); 468 struct xe_vm *vm; 469 int err; 470 471 /* Special layout, prepared below.. 
*/ 472 vm = xe_vm_create(xe, XE_VM_FLAG_MIGRATION | 473 XE_VM_FLAG_SET_TILE_ID(tile), NULL); 474 if (IS_ERR(vm)) 475 return PTR_ERR(vm); 476 477 err = xe_migrate_lock_prepare_vm(tile, m, vm); 478 if (err) 479 goto err_out; 480 481 if (xe->info.has_usm) { 482 struct xe_hw_engine *hwe = xe_gt_hw_engine(primary_gt, 483 XE_ENGINE_CLASS_COPY, 484 primary_gt->usm.reserved_bcs_instance, 485 false); 486 u32 logical_mask = xe_migrate_usm_logical_mask(primary_gt); 487 488 if (!hwe || !logical_mask) { 489 err = -EINVAL; 490 goto err_out; 491 } 492 493 /* 494 * XXX: Currently only reserving 1 (likely slow) BCS instance on 495 * PVC, may want to revisit if performance is needed. 496 */ 497 m->q = xe_exec_queue_create(xe, vm, logical_mask, 1, hwe, 498 EXEC_QUEUE_FLAG_KERNEL | 499 EXEC_QUEUE_FLAG_PERMANENT | 500 EXEC_QUEUE_FLAG_HIGH_PRIORITY | 501 EXEC_QUEUE_FLAG_MIGRATE | 502 EXEC_QUEUE_FLAG_LOW_LATENCY, 0); 503 } else { 504 m->q = xe_exec_queue_create_class(xe, primary_gt, vm, 505 XE_ENGINE_CLASS_COPY, 506 EXEC_QUEUE_FLAG_KERNEL | 507 EXEC_QUEUE_FLAG_PERMANENT | 508 EXEC_QUEUE_FLAG_MIGRATE, 0); 509 } 510 if (IS_ERR(m->q)) { 511 err = PTR_ERR(m->q); 512 goto err_out; 513 } 514 515 mutex_init(&m->job_mutex); 516 fs_reclaim_acquire(GFP_KERNEL); 517 might_lock(&m->job_mutex); 518 fs_reclaim_release(GFP_KERNEL); 519 520 err = devm_add_action_or_reset(xe->drm.dev, xe_migrate_fini, m); 521 if (err) 522 return err; 523 524 if (IS_DGFX(xe)) { 525 if (xe_migrate_needs_ccs_emit(xe)) 526 /* min chunk size corresponds to 4K of CCS Metadata */ 527 m->min_chunk_size = SZ_4K * SZ_64K / 528 xe_device_ccs_bytes(xe, SZ_64K); 529 else 530 /* Somewhat arbitrary to avoid a huge amount of blits */ 531 m->min_chunk_size = SZ_64K; 532 m->min_chunk_size = roundup_pow_of_two(m->min_chunk_size); 533 drm_dbg(&xe->drm, "Migrate min chunk size is 0x%08llx\n", 534 (unsigned long long)m->min_chunk_size); 535 } 536 537 return err; 538 539 err_out: 540 xe_vm_close_and_put(vm); 541 return err; 542 543 } 544 545 
static u64 max_mem_transfer_per_pass(struct xe_device *xe) 546 { 547 if (!IS_DGFX(xe) && xe_device_has_flat_ccs(xe)) 548 return MAX_CCS_LIMITED_TRANSFER; 549 550 return MAX_PREEMPTDISABLE_TRANSFER; 551 } 552 553 static u64 xe_migrate_res_sizes(struct xe_migrate *m, struct xe_res_cursor *cur) 554 { 555 struct xe_device *xe = tile_to_xe(m->tile); 556 u64 size = min_t(u64, max_mem_transfer_per_pass(xe), cur->remaining); 557 558 if (mem_type_is_vram(cur->mem_type)) { 559 /* 560 * VRAM we want to blit in chunks with sizes aligned to 561 * min_chunk_size in order for the offset to CCS metadata to be 562 * page-aligned. If it's the last chunk it may be smaller. 563 * 564 * Another constraint is that we need to limit the blit to 565 * the VRAM block size, unless size is smaller than 566 * min_chunk_size. 567 */ 568 u64 chunk = max_t(u64, cur->size, m->min_chunk_size); 569 570 size = min_t(u64, size, chunk); 571 if (size > m->min_chunk_size) 572 size = round_down(size, m->min_chunk_size); 573 } 574 575 return size; 576 } 577 578 static bool xe_migrate_allow_identity(u64 size, const struct xe_res_cursor *cur) 579 { 580 /* If the chunk is not fragmented, allow identity map. */ 581 return cur->size >= size; 582 } 583 584 #define PTE_UPDATE_FLAG_IS_VRAM BIT(0) 585 #define PTE_UPDATE_FLAG_IS_COMP_PTE BIT(1) 586 587 static u32 pte_update_size(struct xe_migrate *m, 588 u32 flags, 589 struct ttm_resource *res, 590 struct xe_res_cursor *cur, 591 u64 *L0, u64 *L0_ofs, u32 *L0_pt, 592 u32 cmd_size, u32 pt_ofs, u32 avail_pts) 593 { 594 u32 cmds = 0; 595 bool is_vram = PTE_UPDATE_FLAG_IS_VRAM & flags; 596 bool is_comp_pte = PTE_UPDATE_FLAG_IS_COMP_PTE & flags; 597 598 *L0_pt = pt_ofs; 599 if (is_vram && xe_migrate_allow_identity(*L0, cur)) { 600 /* Offset into identity map. 
*/ 601 *L0_ofs = xe_migrate_vram_ofs(tile_to_xe(m->tile), 602 cur->start + vram_region_gpu_offset(res), 603 is_comp_pte); 604 cmds += cmd_size; 605 } else { 606 /* Clip L0 to available size */ 607 u64 size = min(*L0, (u64)avail_pts * SZ_2M); 608 u32 num_4k_pages = (size + XE_PAGE_SIZE - 1) >> XE_PTE_SHIFT; 609 610 *L0 = size; 611 *L0_ofs = xe_migrate_vm_addr(pt_ofs, 0); 612 613 /* MI_STORE_DATA_IMM */ 614 cmds += 3 * DIV_ROUND_UP(num_4k_pages, MAX_PTE_PER_SDI); 615 616 /* PDE qwords */ 617 cmds += num_4k_pages * 2; 618 619 /* Each chunk has a single blit command */ 620 cmds += cmd_size; 621 } 622 623 return cmds; 624 } 625 626 static void emit_pte(struct xe_migrate *m, 627 struct xe_bb *bb, u32 at_pt, 628 bool is_vram, bool is_comp_pte, 629 struct xe_res_cursor *cur, 630 u32 size, struct ttm_resource *res) 631 { 632 struct xe_device *xe = tile_to_xe(m->tile); 633 struct xe_vm *vm = m->q->vm; 634 u16 pat_index; 635 u32 ptes; 636 u64 ofs = (u64)at_pt * XE_PAGE_SIZE; 637 u64 cur_ofs; 638 639 /* Indirect access needs compression enabled uncached PAT index */ 640 if (GRAPHICS_VERx100(xe) >= 2000) 641 pat_index = is_comp_pte ? 
xe_cache_pat_idx(xe, XE_CACHE_NONE_COMPRESSION) : 642 xe_cache_pat_idx(xe, XE_CACHE_WB); 643 else 644 pat_index = xe_cache_pat_idx(xe, XE_CACHE_WB); 645 646 ptes = DIV_ROUND_UP(size, XE_PAGE_SIZE); 647 648 while (ptes) { 649 u32 chunk = min(MAX_PTE_PER_SDI, ptes); 650 651 bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk); 652 bb->cs[bb->len++] = ofs; 653 bb->cs[bb->len++] = 0; 654 655 cur_ofs = ofs; 656 ofs += chunk * 8; 657 ptes -= chunk; 658 659 while (chunk--) { 660 u64 addr, flags = 0; 661 bool devmem = false; 662 663 addr = xe_res_dma(cur) & PAGE_MASK; 664 if (is_vram) { 665 if (vm->flags & XE_VM_FLAG_64K) { 666 u64 va = cur_ofs * XE_PAGE_SIZE / 8; 667 668 xe_assert(xe, (va & (SZ_64K - 1)) == 669 (addr & (SZ_64K - 1))); 670 671 flags |= XE_PTE_PS64; 672 } 673 674 addr += vram_region_gpu_offset(res); 675 devmem = true; 676 } 677 678 addr = vm->pt_ops->pte_encode_addr(m->tile->xe, 679 addr, pat_index, 680 0, devmem, flags); 681 bb->cs[bb->len++] = lower_32_bits(addr); 682 bb->cs[bb->len++] = upper_32_bits(addr); 683 684 xe_res_next(cur, min_t(u32, size, PAGE_SIZE)); 685 cur_ofs += 8; 686 } 687 } 688 } 689 690 #define EMIT_COPY_CCS_DW 5 691 static void emit_copy_ccs(struct xe_gt *gt, struct xe_bb *bb, 692 u64 dst_ofs, bool dst_is_indirect, 693 u64 src_ofs, bool src_is_indirect, 694 u32 size) 695 { 696 struct xe_device *xe = gt_to_xe(gt); 697 u32 *cs = bb->cs + bb->len; 698 u32 num_ccs_blks; 699 u32 num_pages; 700 u32 ccs_copy_size; 701 u32 mocs; 702 703 if (GRAPHICS_VERx100(xe) >= 2000) { 704 num_pages = DIV_ROUND_UP(size, XE_PAGE_SIZE); 705 xe_gt_assert(gt, FIELD_FIT(XE2_CCS_SIZE_MASK, num_pages - 1)); 706 707 ccs_copy_size = REG_FIELD_PREP(XE2_CCS_SIZE_MASK, num_pages - 1); 708 mocs = FIELD_PREP(XE2_XY_CTRL_SURF_MOCS_INDEX_MASK, gt->mocs.uc_index); 709 710 } else { 711 num_ccs_blks = DIV_ROUND_UP(xe_device_ccs_bytes(gt_to_xe(gt), size), 712 NUM_CCS_BYTES_PER_BLOCK); 713 xe_gt_assert(gt, FIELD_FIT(CCS_SIZE_MASK, num_ccs_blks - 1)); 714 715 
ccs_copy_size = REG_FIELD_PREP(CCS_SIZE_MASK, num_ccs_blks - 1); 716 mocs = FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, gt->mocs.uc_index); 717 } 718 719 *cs++ = XY_CTRL_SURF_COPY_BLT | 720 (src_is_indirect ? 0x0 : 0x1) << SRC_ACCESS_TYPE_SHIFT | 721 (dst_is_indirect ? 0x0 : 0x1) << DST_ACCESS_TYPE_SHIFT | 722 ccs_copy_size; 723 *cs++ = lower_32_bits(src_ofs); 724 *cs++ = upper_32_bits(src_ofs) | mocs; 725 *cs++ = lower_32_bits(dst_ofs); 726 *cs++ = upper_32_bits(dst_ofs) | mocs; 727 728 bb->len = cs - bb->cs; 729 } 730 731 #define EMIT_COPY_DW 10 732 static void emit_xy_fast_copy(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs, 733 u64 dst_ofs, unsigned int size, 734 unsigned int pitch) 735 { 736 struct xe_device *xe = gt_to_xe(gt); 737 u32 mocs = 0; 738 u32 tile_y = 0; 739 740 xe_gt_assert(gt, !(pitch & 3)); 741 xe_gt_assert(gt, size / pitch <= S16_MAX); 742 xe_gt_assert(gt, pitch / 4 <= S16_MAX); 743 xe_gt_assert(gt, pitch <= U16_MAX); 744 745 if (GRAPHICS_VER(xe) >= 20) 746 mocs = FIELD_PREP(XE2_XY_FAST_COPY_BLT_MOCS_INDEX_MASK, gt->mocs.uc_index); 747 748 if (GRAPHICS_VERx100(xe) >= 1250) 749 tile_y = XY_FAST_COPY_BLT_D1_SRC_TILE4 | XY_FAST_COPY_BLT_D1_DST_TILE4; 750 751 bb->cs[bb->len++] = XY_FAST_COPY_BLT_CMD | (10 - 2); 752 bb->cs[bb->len++] = XY_FAST_COPY_BLT_DEPTH_32 | pitch | tile_y | mocs; 753 bb->cs[bb->len++] = 0; 754 bb->cs[bb->len++] = (size / pitch) << 16 | pitch / 4; 755 bb->cs[bb->len++] = lower_32_bits(dst_ofs); 756 bb->cs[bb->len++] = upper_32_bits(dst_ofs); 757 bb->cs[bb->len++] = 0; 758 bb->cs[bb->len++] = pitch | mocs; 759 bb->cs[bb->len++] = lower_32_bits(src_ofs); 760 bb->cs[bb->len++] = upper_32_bits(src_ofs); 761 } 762 763 #define PAGE_COPY_MODE_PS SZ_256 /* hw uses 256 bytes as the page-size */ 764 static void emit_mem_copy(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs, 765 u64 dst_ofs, unsigned int size, unsigned int pitch) 766 { 767 u32 mode, copy_type, width; 768 769 xe_gt_assert(gt, IS_ALIGNED(size, pitch)); 770 xe_gt_assert(gt, pitch <= 
U16_MAX); 771 xe_gt_assert(gt, pitch); 772 xe_gt_assert(gt, size); 773 774 if (IS_ALIGNED(size, PAGE_COPY_MODE_PS) && 775 IS_ALIGNED(lower_32_bits(src_ofs), PAGE_COPY_MODE_PS) && 776 IS_ALIGNED(lower_32_bits(dst_ofs), PAGE_COPY_MODE_PS)) { 777 mode = MEM_COPY_PAGE_COPY_MODE; 778 copy_type = 0; /* linear copy */ 779 width = size / PAGE_COPY_MODE_PS; 780 } else if (pitch > 1) { 781 xe_gt_assert(gt, size / pitch <= U16_MAX); 782 mode = 0; /* BYTE_COPY */ 783 copy_type = MEM_COPY_MATRIX_COPY; 784 width = pitch; 785 } else { 786 mode = 0; /* BYTE_COPY */ 787 copy_type = 0; /* linear copy */ 788 width = size; 789 } 790 791 xe_gt_assert(gt, width <= U16_MAX); 792 793 bb->cs[bb->len++] = MEM_COPY_CMD | mode | copy_type; 794 bb->cs[bb->len++] = width - 1; 795 bb->cs[bb->len++] = size / pitch - 1; /* ignored by hw for page-copy/linear above */ 796 bb->cs[bb->len++] = pitch - 1; 797 bb->cs[bb->len++] = pitch - 1; 798 bb->cs[bb->len++] = lower_32_bits(src_ofs); 799 bb->cs[bb->len++] = upper_32_bits(src_ofs); 800 bb->cs[bb->len++] = lower_32_bits(dst_ofs); 801 bb->cs[bb->len++] = upper_32_bits(dst_ofs); 802 bb->cs[bb->len++] = FIELD_PREP(MEM_COPY_SRC_MOCS_INDEX_MASK, gt->mocs.uc_index) | 803 FIELD_PREP(MEM_COPY_DST_MOCS_INDEX_MASK, gt->mocs.uc_index); 804 } 805 806 static void emit_copy(struct xe_gt *gt, struct xe_bb *bb, 807 u64 src_ofs, u64 dst_ofs, unsigned int size, 808 unsigned int pitch) 809 { 810 struct xe_device *xe = gt_to_xe(gt); 811 812 if (xe->info.has_mem_copy_instr) 813 emit_mem_copy(gt, bb, src_ofs, dst_ofs, size, pitch); 814 else 815 emit_xy_fast_copy(gt, bb, src_ofs, dst_ofs, size, pitch); 816 } 817 818 static u64 xe_migrate_batch_base(struct xe_migrate *m, bool usm) 819 { 820 return usm ? 
m->usm_batch_base_ofs : m->batch_base_ofs; 821 } 822 823 static u32 xe_migrate_ccs_copy(struct xe_migrate *m, 824 struct xe_bb *bb, 825 u64 src_ofs, bool src_is_indirect, 826 u64 dst_ofs, bool dst_is_indirect, u32 dst_size, 827 u64 ccs_ofs, bool copy_ccs) 828 { 829 struct xe_gt *gt = m->tile->primary_gt; 830 u32 flush_flags = 0; 831 832 if (!copy_ccs && dst_is_indirect) { 833 /* 834 * If the src is already in vram, then it should already 835 * have been cleared by us, or has been populated by the 836 * user. Make sure we copy the CCS aux state as-is. 837 * 838 * Otherwise if the bo doesn't have any CCS metadata attached, 839 * we still need to clear it for security reasons. 840 */ 841 u64 ccs_src_ofs = src_is_indirect ? src_ofs : m->cleared_mem_ofs; 842 843 emit_copy_ccs(gt, bb, 844 dst_ofs, true, 845 ccs_src_ofs, src_is_indirect, dst_size); 846 847 flush_flags = MI_FLUSH_DW_CCS; 848 } else if (copy_ccs) { 849 if (!src_is_indirect) 850 src_ofs = ccs_ofs; 851 else if (!dst_is_indirect) 852 dst_ofs = ccs_ofs; 853 854 xe_gt_assert(gt, src_is_indirect || dst_is_indirect); 855 856 emit_copy_ccs(gt, bb, dst_ofs, dst_is_indirect, src_ofs, 857 src_is_indirect, dst_size); 858 if (dst_is_indirect) 859 flush_flags = MI_FLUSH_DW_CCS; 860 } 861 862 return flush_flags; 863 } 864 865 static struct dma_fence *__xe_migrate_copy(struct xe_migrate *m, 866 struct xe_bo *src_bo, 867 struct xe_bo *dst_bo, 868 struct ttm_resource *src, 869 struct ttm_resource *dst, 870 bool copy_only_ccs, 871 bool is_vram_resolve) 872 { 873 struct xe_gt *gt = m->tile->primary_gt; 874 struct xe_device *xe = gt_to_xe(gt); 875 struct dma_fence *fence = NULL; 876 u64 size = xe_bo_size(src_bo); 877 struct xe_res_cursor src_it, dst_it, ccs_it; 878 u64 src_L0_ofs, dst_L0_ofs; 879 u32 src_L0_pt, dst_L0_pt; 880 u64 src_L0, dst_L0; 881 int pass = 0; 882 int err; 883 bool src_is_pltt = src->mem_type == XE_PL_TT; 884 bool dst_is_pltt = dst->mem_type == XE_PL_TT; 885 bool src_is_vram = 
mem_type_is_vram(src->mem_type); 886 bool dst_is_vram = mem_type_is_vram(dst->mem_type); 887 bool type_device = src_bo->ttm.type == ttm_bo_type_device; 888 bool needs_ccs_emit = type_device && xe_migrate_needs_ccs_emit(xe); 889 bool copy_ccs = xe_device_has_flat_ccs(xe) && 890 xe_bo_needs_ccs_pages(src_bo) && xe_bo_needs_ccs_pages(dst_bo); 891 bool copy_system_ccs = copy_ccs && (!src_is_vram || !dst_is_vram); 892 893 /* 894 * For decompression operation, always use the compression PAT index. 895 * Otherwise, only use the compression PAT index for device memory 896 * when copying from VRAM to system memory. 897 */ 898 bool use_comp_pat = is_vram_resolve || (type_device && 899 xe_device_has_flat_ccs(xe) && 900 GRAPHICS_VER(xe) >= 20 && src_is_vram && !dst_is_vram); 901 902 /* Copying CCS between two different BOs is not supported yet. */ 903 if (XE_WARN_ON(copy_ccs && src_bo != dst_bo)) 904 return ERR_PTR(-EINVAL); 905 906 if (src_bo != dst_bo && XE_WARN_ON(xe_bo_size(src_bo) != xe_bo_size(dst_bo))) 907 return ERR_PTR(-EINVAL); 908 909 if (!src_is_vram) 910 xe_res_first_sg(xe_bo_sg(src_bo), 0, size, &src_it); 911 else 912 xe_res_first(src, 0, size, &src_it); 913 if (!dst_is_vram) 914 xe_res_first_sg(xe_bo_sg(dst_bo), 0, size, &dst_it); 915 else 916 xe_res_first(dst, 0, size, &dst_it); 917 918 if (copy_system_ccs) 919 xe_res_first_sg(xe_bo_sg(src_bo), xe_bo_ccs_pages_start(src_bo), 920 PAGE_ALIGN(xe_device_ccs_bytes(xe, size)), 921 &ccs_it); 922 923 while (size) { 924 u32 batch_size = 1; /* MI_BATCH_BUFFER_END */ 925 struct xe_sched_job *job; 926 struct xe_bb *bb; 927 u32 flush_flags = 0; 928 u32 update_idx; 929 u64 ccs_ofs, ccs_size; 930 u32 ccs_pt; 931 u32 pte_flags; 932 933 bool usm = xe->info.has_usm; 934 u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE; 935 936 src_L0 = xe_migrate_res_sizes(m, &src_it); 937 dst_L0 = xe_migrate_res_sizes(m, &dst_it); 938 939 drm_dbg(&xe->drm, "Pass %u, sizes: %llu & %llu\n", 940 pass++, src_L0, 
dst_L0); 941 942 src_L0 = min(src_L0, dst_L0); 943 944 pte_flags = src_is_vram ? PTE_UPDATE_FLAG_IS_VRAM : 0; 945 pte_flags |= use_comp_pat ? PTE_UPDATE_FLAG_IS_COMP_PTE : 0; 946 batch_size += pte_update_size(m, pte_flags, src, &src_it, &src_L0, 947 &src_L0_ofs, &src_L0_pt, 0, 0, 948 avail_pts); 949 if (copy_only_ccs) { 950 dst_L0_ofs = src_L0_ofs; 951 } else { 952 pte_flags = dst_is_vram ? PTE_UPDATE_FLAG_IS_VRAM : 0; 953 batch_size += pte_update_size(m, pte_flags, dst, 954 &dst_it, &src_L0, 955 &dst_L0_ofs, &dst_L0_pt, 956 0, avail_pts, avail_pts); 957 } 958 959 if (copy_system_ccs) { 960 xe_assert(xe, type_device); 961 ccs_size = xe_device_ccs_bytes(xe, src_L0); 962 batch_size += pte_update_size(m, 0, NULL, &ccs_it, &ccs_size, 963 &ccs_ofs, &ccs_pt, 0, 964 2 * avail_pts, 965 avail_pts); 966 xe_assert(xe, IS_ALIGNED(ccs_it.start, PAGE_SIZE)); 967 } 968 969 /* Add copy commands size here */ 970 batch_size += ((copy_only_ccs) ? 0 : EMIT_COPY_DW) + 971 ((needs_ccs_emit ? EMIT_COPY_CCS_DW : 0)); 972 973 bb = xe_bb_new(gt, batch_size, usm); 974 if (IS_ERR(bb)) { 975 err = PTR_ERR(bb); 976 goto err_sync; 977 } 978 979 if (src_is_vram && xe_migrate_allow_identity(src_L0, &src_it)) 980 xe_res_next(&src_it, src_L0); 981 else 982 emit_pte(m, bb, src_L0_pt, src_is_vram, copy_system_ccs || use_comp_pat, 983 &src_it, src_L0, src); 984 985 if (dst_is_vram && xe_migrate_allow_identity(src_L0, &dst_it)) 986 xe_res_next(&dst_it, src_L0); 987 else if (!copy_only_ccs) 988 emit_pte(m, bb, dst_L0_pt, dst_is_vram, copy_system_ccs, 989 &dst_it, src_L0, dst); 990 991 if (copy_system_ccs) 992 emit_pte(m, bb, ccs_pt, false, false, &ccs_it, ccs_size, src); 993 994 bb->cs[bb->len++] = MI_BATCH_BUFFER_END; 995 update_idx = bb->len; 996 997 if (!copy_only_ccs) 998 emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, src_L0, XE_PAGE_SIZE); 999 1000 if (needs_ccs_emit) 1001 flush_flags = xe_migrate_ccs_copy(m, bb, src_L0_ofs, 1002 IS_DGFX(xe) ? 
src_is_vram : src_is_pltt, 1003 dst_L0_ofs, 1004 IS_DGFX(xe) ? dst_is_vram : dst_is_pltt, 1005 src_L0, ccs_ofs, copy_ccs); 1006 1007 job = xe_bb_create_migration_job(m->q, bb, 1008 xe_migrate_batch_base(m, usm), 1009 update_idx); 1010 if (IS_ERR(job)) { 1011 err = PTR_ERR(job); 1012 goto err; 1013 } 1014 1015 xe_sched_job_add_migrate_flush(job, flush_flags | MI_INVALIDATE_TLB); 1016 if (!fence) { 1017 err = xe_sched_job_add_deps(job, src_bo->ttm.base.resv, 1018 DMA_RESV_USAGE_BOOKKEEP); 1019 if (!err && src_bo->ttm.base.resv != dst_bo->ttm.base.resv) 1020 err = xe_sched_job_add_deps(job, dst_bo->ttm.base.resv, 1021 DMA_RESV_USAGE_BOOKKEEP); 1022 if (err) 1023 goto err_job; 1024 } 1025 1026 mutex_lock(&m->job_mutex); 1027 xe_sched_job_arm(job); 1028 dma_fence_put(fence); 1029 fence = dma_fence_get(&job->drm.s_fence->finished); 1030 xe_sched_job_push(job); 1031 1032 dma_fence_put(m->fence); 1033 m->fence = dma_fence_get(fence); 1034 1035 mutex_unlock(&m->job_mutex); 1036 1037 xe_bb_free(bb, fence); 1038 size -= src_L0; 1039 continue; 1040 1041 err_job: 1042 xe_sched_job_put(job); 1043 err: 1044 xe_bb_free(bb, NULL); 1045 1046 err_sync: 1047 /* Sync partial copy if any. FIXME: under job_mutex? */ 1048 if (fence) { 1049 dma_fence_wait(fence, false); 1050 dma_fence_put(fence); 1051 } 1052 1053 return ERR_PTR(err); 1054 } 1055 1056 return fence; 1057 } 1058 1059 /** 1060 * xe_migrate_copy() - Copy content of TTM resources. 1061 * @m: The migration context. 1062 * @src_bo: The buffer object @src is currently bound to. 1063 * @dst_bo: If copying between resources created for the same bo, set this to 1064 * the same value as @src_bo. If copying between buffer objects, set it to 1065 * the buffer object @dst is currently bound to. 1066 * @src: The source TTM resource. 1067 * @dst: The dst TTM resource. 
 * @copy_only_ccs: If true copy only CCS metadata
 *
 * Copies the contents of @src to @dst: On flat CCS devices,
 * the CCS metadata is copied as well if needed, or if not present,
 * the CCS metadata of @dst is cleared for security reasons.
 *
 * Return: Pointer to a dma_fence representing the last copy batch, or
 * an error pointer on failure. If there is a failure, any copy operation
 * started by the function call has been synced.
 */
struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
				  struct xe_bo *src_bo,
				  struct xe_bo *dst_bo,
				  struct ttm_resource *src,
				  struct ttm_resource *dst,
				  bool copy_only_ccs)
{
	/* Plain copy: the trailing resolve argument is false. */
	return __xe_migrate_copy(m, src_bo, dst_bo, src, dst, copy_only_ccs, false);
}

/**
 * xe_migrate_resolve() - Resolve and decompress a buffer object if required.
 * @m: The migrate context
 * @bo: The buffer object to resolve
 * @res: The TTM resource of @bo; it is used as both source and destination
 *
 * Wrapper around __xe_migrate_copy() with is_vram_resolve set to true
 * to trigger decompression if needed. @bo and @res are passed as both the
 * source and the destination, and copy_only_ccs is false.
 *
 * Return: A dma_fence that signals on completion, or an ERR_PTR on failure.
 */
struct dma_fence *xe_migrate_resolve(struct xe_migrate *m,
				     struct xe_bo *bo,
				     struct ttm_resource *res)
{
	return __xe_migrate_copy(m, bo, bo, res, res, false, true);
}

/**
 * xe_migrate_lrc() - Get the LRC from migrate context.
 * @migrate: Migrate context.
 *
 * Return: Pointer to the first LRC (index 0) of the migrate context's
 * default exec queue. No error is generated by this function.
 */
struct xe_lrc *xe_migrate_lrc(struct xe_migrate *migrate)
{
	return migrate->q->lrc[0];
}

/*
 * Address inside the migrate VM that emit_flush_invalidate() uses as the
 * target of its flush command.
 */
static u64 migrate_vm_ppgtt_addr_tlb_inval(void)
{
	/*
	 * The migrate VM is self-referential so it can modify its own PTEs (see
	 * pte_update_size() or emit_pte() functions). We reserve NUM_KERNEL_PDE
	 * entries for kernel operations (copies, clears, CCS migrate), and
	 * suballocate the rest to user operations (binds/unbinds). With
	 * NUM_KERNEL_PDE = 15, NUM_KERNEL_PDE - 1 is already used for PTE updates,
	 * so assign NUM_KERNEL_PDE - 2 for TLB invalidation.
	 */
	return (NUM_KERNEL_PDE - 2) * XE_PAGE_SIZE;
}

/*
 * Write a five-dword MI_FLUSH_DW command (with MI_INVALIDATE_TLB, the
 * MI_FLUSH_DW_OP_STOREDW and MI_FLUSH_IMM_DW bits, plus the caller's @flags)
 * into @dw starting at index @i. The address operand is
 * migrate_vm_ppgtt_addr_tlb_inval() and the two trailing dwords are MI_NOOP.
 *
 * Return: the index just past the emitted command (@i + 5).
 */
static int emit_flush_invalidate(u32 *dw, int i, u32 flags)
{
	u64 addr = migrate_vm_ppgtt_addr_tlb_inval();

	dw[i++] = MI_FLUSH_DW | MI_INVALIDATE_TLB | MI_FLUSH_DW_OP_STOREDW |
		  MI_FLUSH_IMM_DW | flags;
	dw[i++] = lower_32_bits(addr);
	dw[i++] = upper_32_bits(addr);
	dw[i++] = MI_NOOP;
	dw[i++] = MI_NOOP;

	return i;
}

/**
 * xe_migrate_ccs_rw_copy() - Build the CCS read/write copy batch buffer of a BO.
 * @tile: Tile whose migration context is to be used.
 * @q: Execution queue to be used along with the migration context.
 *     NOTE(review): the function body does not reference @q - confirm
 *     whether the parameter is still needed.
 * @src_bo: The buffer object @src is currently bound to.
 * @read_write: Selects the CCS context (read or write) the BB commands are
 *              created for.
 *
 * Creates batch buffer instructions to copy CCS metadata from CCS pool to
 * memory and vice versa.
 *
 * This function should only be called for IGPU.
 *
 * Return: 0 if successful, negative error code on failure.
 */
int xe_migrate_ccs_rw_copy(struct xe_tile *tile, struct xe_exec_queue *q,
			   struct xe_bo *src_bo,
			   enum xe_sriov_vf_ccs_rw_ctxs read_write)

{
	/* Exactly one side is flagged, depending on the selected context. */
	bool src_is_pltt = read_write == XE_SRIOV_VF_CCS_READ_CTX;
	bool dst_is_pltt = read_write == XE_SRIOV_VF_CCS_WRITE_CTX;
	struct ttm_resource *src = src_bo->ttm.resource;
	struct xe_migrate *m = tile->migrate;
	struct xe_gt *gt = tile->primary_gt;
	u32 batch_size, batch_size_allocated;
	struct xe_device *xe = gt_to_xe(gt);
	struct xe_res_cursor src_it, ccs_it;
	struct xe_mem_pool *bb_pool;
	struct xe_sriov_vf_ccs_ctx *ctx;
	u64 size = xe_bo_size(src_bo);
	struct xe_mem_pool_node *bb;
	u64 src_L0, src_L0_ofs;
	struct xe_bb xe_bb_tmp;
	u32 src_L0_pt;
	int err;

	ctx = &xe->sriov.vf.ccs.contexts[read_write];

	/* One cursor over the BO data pages ... */
	xe_res_first_sg(xe_bo_sg(src_bo), 0, size, &src_it);

	/* ... and one over the CCS pages of the same sg table. */
	xe_res_first_sg(xe_bo_sg(src_bo), xe_bo_ccs_pages_start(src_bo),
			PAGE_ALIGN(xe_device_ccs_bytes(xe, size)),
			&ccs_it);

	/*
	 * Calculate Batch buffer size. This is a sizing-only pass: nothing is
	 * emitted here, the commands are written by the second loop below.
	 */
	batch_size = 0;
	while (size) {
		/* Two emit_flush_invalidate() calls of five dwords each. */
		batch_size += 10; /* Flush + ggtt addr + 2 NOP */
		u64 ccs_ofs, ccs_size;
		u32 ccs_pt;

		u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE;

		src_L0 = min_t(u64, max_mem_transfer_per_pass(xe), size);

		batch_size += pte_update_size(m, false, src, &src_it, &src_L0,
					      &src_L0_ofs, &src_L0_pt, 0, 0,
					      avail_pts);

		ccs_size = xe_device_ccs_bytes(xe, src_L0);
		batch_size += pte_update_size(m, 0, NULL, &ccs_it, &ccs_size, &ccs_ofs,
					      &ccs_pt, 0, avail_pts, avail_pts);
		xe_assert(xe, IS_ALIGNED(ccs_it.start, PAGE_SIZE));

		/* Add copy commands size here */
		batch_size += EMIT_COPY_CCS_DW;

		size -= src_L0;
	}

	bb = xe_mem_pool_alloc_node();
	if (IS_ERR(bb))
		return PTR_ERR(bb);

	bb_pool = ctx->mem.ccs_bb_pool;
	/* Everything from the shadow swap to the shadow sync runs under the guard. */
	scoped_guard(mutex, xe_mem_pool_bo_swap_guard(bb_pool)) {
		xe_mem_pool_swap_shadow_locked(bb_pool);

		err = xe_mem_pool_insert_node(bb_pool, bb, batch_size * sizeof(u32));
		if (err) {
			xe_gt_err(gt, "BB allocation failed.\n");
			/*
			 * NOTE(review): the node is released with kfree() here while
			 * xe_migrate_ccs_rw_copy_clear() uses xe_mem_pool_free_node();
			 * presumably correct because the node was never inserted -
			 * confirm against the xe_mem_pool API.
			 */
			kfree(bb);
			return err;
		}

		/* Remember the computed size; batch_size is recomputed below. */
		batch_size_allocated = batch_size;
		size = xe_bo_size(src_bo);
		batch_size = 0;

		/* Temporary xe_bb view onto the pool node so the emit helpers can be used. */
		xe_bb_tmp = (struct xe_bb){ .cs = xe_mem_pool_node_cpu_addr(bb), .len = 0 };
		/*
		 * Emit PTE and copy commands here.
		 * The CCS copy command can only support limited size. If the size to be
		 * copied is more than the limit, divide copy into chunks. So, calculate
		 * sizes here again before copy command is emitted.
		 */

		while (size) {
			batch_size += 10; /* Flush + ggtt addr + 2 NOP */
			u32 flush_flags = 0;
			u64 ccs_ofs, ccs_size;
			u32 ccs_pt;

			u32 avail_pts = max_mem_transfer_per_pass(xe) /
					LEVEL0_PAGE_TABLE_ENCODE_SIZE;

			src_L0 = xe_migrate_res_sizes(m, &src_it);

			batch_size += pte_update_size(m, false, src, &src_it, &src_L0,
						      &src_L0_ofs, &src_L0_pt, 0, 0,
						      avail_pts);

			ccs_size = xe_device_ccs_bytes(xe, src_L0);
			batch_size += pte_update_size(m, 0, NULL, &ccs_it, &ccs_size, &ccs_ofs,
						      &ccs_pt, 0, avail_pts, avail_pts);
			xe_assert(xe, IS_ALIGNED(ccs_it.start, PAGE_SIZE));
			batch_size += EMIT_COPY_CCS_DW;

			emit_pte(m, &xe_bb_tmp, src_L0_pt, false, true, &src_it, src_L0, src);

			emit_pte(m, &xe_bb_tmp, ccs_pt, false, false, &ccs_it, ccs_size, src);

			/* The CCS copy is bracketed by a flush before and after it. */
			xe_bb_tmp.len = emit_flush_invalidate(xe_bb_tmp.cs, xe_bb_tmp.len,
							      flush_flags);
			flush_flags = xe_migrate_ccs_copy(m, &xe_bb_tmp, src_L0_ofs, src_is_pltt,
							  src_L0_ofs, dst_is_pltt,
							  src_L0, ccs_ofs, true);
			xe_bb_tmp.len = emit_flush_invalidate(xe_bb_tmp.cs, xe_bb_tmp.len,
							      flush_flags);

			size -= src_L0;
		}

		/* The emitted length must match what the sizing pass reserved. */
		xe_assert(xe, (batch_size_allocated == xe_bb_tmp.len));
		xe_assert(xe, bb->sa_node.size == xe_bb_tmp.len * sizeof(u32));
		src_bo->bb_ccs[read_write] = bb;

		xe_sriov_vf_ccs_rw_update_bb_addr(ctx);
		xe_mem_pool_sync_shadow_locked(bb);
	}

	return 0;
}

/**
 * xe_migrate_ccs_rw_copy_clear() - Clear the CCS read/write batch buffer
 * content.
 * @src_bo: The buffer object @src is currently bound to.
 * @read_write: Selects the CCS context (read or write) whose BB is cleared.
 *
 * Directly clearing the BB lacks atomicity and can lead to undefined
 * behavior if the vCPU is halted mid-operation during the clearing
 * process. To avoid this issue, we use a shadow buffer object approach.
 *
 * First swap the SA BO address with the shadow BO, perform the clearing
 * operation on the BB, update the shadow BO in the ring buffer, then
 * sync the shadow and the actual buffer to maintain consistency.
 *
 * The pool node is freed and @src_bo's bb_ccs slot for @read_write is reset
 * to NULL before returning.
 *
 * Returns: None.
 */
void xe_migrate_ccs_rw_copy_clear(struct xe_bo *src_bo,
				  enum xe_sriov_vf_ccs_rw_ctxs read_write)
{
	/* NOTE(review): bb is dereferenced unconditionally - callers presumably guarantee it is set. */
	struct xe_mem_pool_node *bb = src_bo->bb_ccs[read_write];
	struct xe_device *xe = xe_bo_device(src_bo);
	struct xe_mem_pool *bb_pool;
	struct xe_sriov_vf_ccs_ctx *ctx;
	u32 *cs;

	xe_assert(xe, IS_SRIOV_VF(xe));

	ctx = &xe->sriov.vf.ccs.contexts[read_write];
	bb_pool = ctx->mem.ccs_bb_pool;

	scoped_guard(mutex, xe_mem_pool_bo_swap_guard(bb_pool)) {
		xe_mem_pool_swap_shadow_locked(bb_pool);

		/* Overwrite the whole node with MI_NOOP. */
		cs = xe_mem_pool_node_cpu_addr(bb);
		memset(cs, MI_NOOP, bb->sa_node.size);
		xe_sriov_vf_ccs_rw_update_bb_addr(ctx);

		xe_mem_pool_sync_shadow_locked(bb);
		xe_mem_pool_free_node(bb);
		src_bo->bb_ccs[read_write] = NULL;
	}
}

/**
 * xe_migrate_exec_queue() - Get the execution queue from migrate context.
 * @migrate: Migrate context.
1334 * 1335 * Return: Pointer to execution queue on success, error on failure 1336 */ 1337 struct xe_exec_queue *xe_migrate_exec_queue(struct xe_migrate *migrate) 1338 { 1339 return migrate->q; 1340 } 1341 1342 /** 1343 * xe_migrate_vram_copy_chunk() - Copy a chunk of a VRAM buffer object. 1344 * @vram_bo: The VRAM buffer object. 1345 * @vram_offset: The VRAM offset. 1346 * @sysmem_bo: The sysmem buffer object. 1347 * @sysmem_offset: The sysmem offset. 1348 * @size: The size of VRAM chunk to copy. 1349 * @dir: The direction of the copy operation. 1350 * 1351 * Copies a portion of a buffer object between VRAM and system memory. 1352 * On Xe2 platforms that support flat CCS, VRAM data is decompressed when 1353 * copying to system memory. 1354 * 1355 * Return: Pointer to a dma_fence representing the last copy batch, or 1356 * an error pointer on failure. If there is a failure, any copy operation 1357 * started by the function call has been synced. 1358 */ 1359 struct dma_fence *xe_migrate_vram_copy_chunk(struct xe_bo *vram_bo, u64 vram_offset, 1360 struct xe_bo *sysmem_bo, u64 sysmem_offset, 1361 u64 size, enum xe_migrate_copy_dir dir) 1362 { 1363 struct xe_device *xe = xe_bo_device(vram_bo); 1364 struct xe_tile *tile = vram_bo->tile; 1365 struct xe_gt *gt = tile->primary_gt; 1366 struct xe_migrate *m = tile->migrate; 1367 struct dma_fence *fence = NULL; 1368 struct ttm_resource *vram = vram_bo->ttm.resource; 1369 struct ttm_resource *sysmem = sysmem_bo->ttm.resource; 1370 struct xe_res_cursor vram_it, sysmem_it; 1371 u64 vram_L0_ofs, sysmem_L0_ofs; 1372 u32 vram_L0_pt, sysmem_L0_pt; 1373 u64 vram_L0, sysmem_L0; 1374 bool to_sysmem = (dir == XE_MIGRATE_COPY_TO_SRAM); 1375 bool use_comp_pat = to_sysmem && 1376 GRAPHICS_VER(xe) >= 20 && xe_device_has_flat_ccs(xe); 1377 int pass = 0; 1378 int err; 1379 1380 xe_assert(xe, IS_ALIGNED(vram_offset | sysmem_offset | size, PAGE_SIZE)); 1381 xe_assert(xe, xe_bo_is_vram(vram_bo)); 1382 xe_assert(xe, !xe_bo_is_vram(sysmem_bo)); 
1383 xe_assert(xe, !range_overflows(vram_offset, size, (u64)vram_bo->ttm.base.size)); 1384 xe_assert(xe, !range_overflows(sysmem_offset, size, (u64)sysmem_bo->ttm.base.size)); 1385 1386 xe_res_first(vram, vram_offset, size, &vram_it); 1387 xe_res_first_sg(xe_bo_sg(sysmem_bo), sysmem_offset, size, &sysmem_it); 1388 1389 while (size) { 1390 u32 pte_flags = PTE_UPDATE_FLAG_IS_VRAM; 1391 u32 batch_size = 2; /* arb_clear() + MI_BATCH_BUFFER_END */ 1392 struct xe_sched_job *job; 1393 struct xe_bb *bb; 1394 u32 update_idx; 1395 bool usm = xe->info.has_usm; 1396 u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE; 1397 1398 sysmem_L0 = xe_migrate_res_sizes(m, &sysmem_it); 1399 vram_L0 = min(xe_migrate_res_sizes(m, &vram_it), sysmem_L0); 1400 1401 xe_dbg(xe, "Pass %u, size: %llu\n", pass++, vram_L0); 1402 1403 pte_flags |= use_comp_pat ? PTE_UPDATE_FLAG_IS_COMP_PTE : 0; 1404 batch_size += pte_update_size(m, pte_flags, vram, &vram_it, &vram_L0, 1405 &vram_L0_ofs, &vram_L0_pt, 0, 0, avail_pts); 1406 1407 batch_size += pte_update_size(m, 0, sysmem, &sysmem_it, &vram_L0, &sysmem_L0_ofs, 1408 &sysmem_L0_pt, 0, avail_pts, avail_pts); 1409 batch_size += EMIT_COPY_DW; 1410 1411 bb = xe_bb_new(gt, batch_size, usm); 1412 if (IS_ERR(bb)) { 1413 err = PTR_ERR(bb); 1414 return ERR_PTR(err); 1415 } 1416 1417 if (xe_migrate_allow_identity(vram_L0, &vram_it)) 1418 xe_res_next(&vram_it, vram_L0); 1419 else 1420 emit_pte(m, bb, vram_L0_pt, true, use_comp_pat, &vram_it, vram_L0, vram); 1421 1422 emit_pte(m, bb, sysmem_L0_pt, false, false, &sysmem_it, vram_L0, sysmem); 1423 1424 bb->cs[bb->len++] = MI_BATCH_BUFFER_END; 1425 update_idx = bb->len; 1426 1427 if (to_sysmem) 1428 emit_copy(gt, bb, vram_L0_ofs, sysmem_L0_ofs, vram_L0, XE_PAGE_SIZE); 1429 else 1430 emit_copy(gt, bb, sysmem_L0_ofs, vram_L0_ofs, vram_L0, XE_PAGE_SIZE); 1431 1432 job = xe_bb_create_migration_job(m->q, bb, xe_migrate_batch_base(m, usm), 1433 update_idx); 1434 if (IS_ERR(job)) { 1435 
xe_bb_free(bb, NULL); 1436 err = PTR_ERR(job); 1437 return ERR_PTR(err); 1438 } 1439 1440 xe_sched_job_add_migrate_flush(job, MI_INVALIDATE_TLB); 1441 1442 xe_assert(xe, dma_resv_test_signaled(vram_bo->ttm.base.resv, 1443 DMA_RESV_USAGE_BOOKKEEP)); 1444 xe_assert(xe, dma_resv_test_signaled(sysmem_bo->ttm.base.resv, 1445 DMA_RESV_USAGE_BOOKKEEP)); 1446 1447 scoped_guard(mutex, &m->job_mutex) { 1448 xe_sched_job_arm(job); 1449 dma_fence_put(fence); 1450 fence = dma_fence_get(&job->drm.s_fence->finished); 1451 xe_sched_job_push(job); 1452 1453 dma_fence_put(m->fence); 1454 m->fence = dma_fence_get(fence); 1455 } 1456 1457 xe_bb_free(bb, fence); 1458 size -= vram_L0; 1459 } 1460 1461 return fence; 1462 } 1463 1464 static void emit_clear_link_copy(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs, 1465 u32 size, u32 pitch) 1466 { 1467 struct xe_device *xe = gt_to_xe(gt); 1468 u32 *cs = bb->cs + bb->len; 1469 u32 len = PVC_MEM_SET_CMD_LEN_DW; 1470 1471 *cs++ = PVC_MEM_SET_CMD | PVC_MEM_SET_MATRIX | (len - 2); 1472 *cs++ = pitch - 1; 1473 *cs++ = (size / pitch) - 1; 1474 *cs++ = pitch - 1; 1475 *cs++ = lower_32_bits(src_ofs); 1476 *cs++ = upper_32_bits(src_ofs); 1477 if (GRAPHICS_VERx100(xe) >= 2000) 1478 *cs++ = FIELD_PREP(XE2_MEM_SET_MOCS_INDEX_MASK, gt->mocs.uc_index); 1479 else 1480 *cs++ = FIELD_PREP(PVC_MEM_SET_MOCS_INDEX_MASK, gt->mocs.uc_index); 1481 1482 xe_gt_assert(gt, cs - bb->cs == len + bb->len); 1483 1484 bb->len += len; 1485 } 1486 1487 static void emit_clear_main_copy(struct xe_gt *gt, struct xe_bb *bb, 1488 u64 src_ofs, u32 size, u32 pitch, bool is_vram) 1489 { 1490 struct xe_device *xe = gt_to_xe(gt); 1491 u32 *cs = bb->cs + bb->len; 1492 u32 len = XY_FAST_COLOR_BLT_DW; 1493 1494 if (GRAPHICS_VERx100(xe) < 1250) 1495 len = 11; 1496 1497 *cs++ = XY_FAST_COLOR_BLT_CMD | XY_FAST_COLOR_BLT_DEPTH_32 | 1498 (len - 2); 1499 if (GRAPHICS_VERx100(xe) >= 2000) 1500 *cs++ = FIELD_PREP(XE2_XY_FAST_COLOR_BLT_MOCS_INDEX_MASK, gt->mocs.uc_index) | 1501 (pitch - 1); 1502 
else 1503 *cs++ = FIELD_PREP(XY_FAST_COLOR_BLT_MOCS_MASK, gt->mocs.uc_index) | 1504 (pitch - 1); 1505 *cs++ = 0; 1506 *cs++ = (size / pitch) << 16 | pitch / 4; 1507 *cs++ = lower_32_bits(src_ofs); 1508 *cs++ = upper_32_bits(src_ofs); 1509 *cs++ = (is_vram ? 0x0 : 0x1) << XY_FAST_COLOR_BLT_MEM_TYPE_SHIFT; 1510 *cs++ = 0; 1511 *cs++ = 0; 1512 *cs++ = 0; 1513 *cs++ = 0; 1514 1515 if (len > 11) { 1516 *cs++ = 0; 1517 *cs++ = 0; 1518 *cs++ = 0; 1519 *cs++ = 0; 1520 *cs++ = 0; 1521 } 1522 1523 xe_gt_assert(gt, cs - bb->cs == len + bb->len); 1524 1525 bb->len += len; 1526 } 1527 1528 static bool has_service_copy_support(struct xe_gt *gt) 1529 { 1530 /* 1531 * What we care about is whether the architecture was designed with 1532 * service copy functionality (specifically the new MEM_SET / MEM_COPY 1533 * instructions) so check the architectural engine list rather than the 1534 * actual list since these instructions are usable on BCS0 even if 1535 * all of the actual service copy engines (BCS1-BCS8) have been fused 1536 * off. 1537 */ 1538 return gt->info.engine_mask & GENMASK(XE_HW_ENGINE_BCS8, 1539 XE_HW_ENGINE_BCS1); 1540 } 1541 1542 static u32 emit_clear_cmd_len(struct xe_gt *gt) 1543 { 1544 if (has_service_copy_support(gt)) 1545 return PVC_MEM_SET_CMD_LEN_DW; 1546 else 1547 return XY_FAST_COLOR_BLT_DW; 1548 } 1549 1550 static void emit_clear(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs, 1551 u32 size, u32 pitch, bool is_vram) 1552 { 1553 if (has_service_copy_support(gt)) 1554 emit_clear_link_copy(gt, bb, src_ofs, size, pitch); 1555 else 1556 emit_clear_main_copy(gt, bb, src_ofs, size, pitch, 1557 is_vram); 1558 } 1559 1560 /** 1561 * xe_migrate_clear() - Copy content of TTM resources. 1562 * @m: The migration context. 1563 * @bo: The buffer object @dst is currently bound to. 1564 * @dst: The dst TTM resource to be cleared. 1565 * @clear_flags: flags to specify which data to clear: CCS, BO, or both. 
 *
 * Clear the contents of @dst to zero when XE_MIGRATE_CLEAR_FLAG_BO_DATA is set.
 * On flat CCS devices, the CCS metadata is cleared to zero with XE_MIGRATE_CLEAR_FLAG_CCS_DATA.
 * Set XE_MIGRATE_CLEAR_FLAG_FULL to clear bo as well as CCS metadata.
 * TODO: Eliminate the @bo argument.
 *
 * Return: Pointer to a dma_fence representing the last clear batch, or
 * an error pointer on failure. If there is a failure, any clear operation
 * started by the function call has been synced. NULL is returned (with a
 * warning) when @clear_flags selects neither BO data nor CCS data.
 */
struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
				   struct xe_bo *bo,
				   struct ttm_resource *dst,
				   u32 clear_flags)
{
	bool clear_vram = mem_type_is_vram(dst->mem_type);
	bool clear_bo_data = XE_MIGRATE_CLEAR_FLAG_BO_DATA & clear_flags;
	bool clear_ccs = XE_MIGRATE_CLEAR_FLAG_CCS_DATA & clear_flags;
	struct xe_gt *gt = m->tile->primary_gt;
	struct xe_device *xe = gt_to_xe(gt);
	bool clear_only_system_ccs = false;
	struct dma_fence *fence = NULL;
	u64 size = xe_bo_size(bo);
	struct xe_res_cursor src_it;
	struct ttm_resource *src = dst;
	int err;

	if (WARN_ON(!clear_bo_data && !clear_ccs))
		return NULL;

	/* CCS-only clear on an integrated device. */
	if (!clear_bo_data && clear_ccs && !IS_DGFX(xe))
		clear_only_system_ccs = true;

	if (!clear_vram)
		xe_res_first_sg(xe_bo_sg(bo), 0, xe_bo_size(bo), &src_it);
	else
		xe_res_first(src, 0, xe_bo_size(bo), &src_it);

	/* One job per pass; each pass clears clear_L0 bytes. */
	while (size) {
		u64 clear_L0_ofs;
		u32 clear_L0_pt;
		u32 flush_flags = 0;
		u64 clear_L0;
		struct xe_sched_job *job;
		struct xe_bb *bb;
		u32 batch_size, update_idx;
		u32 pte_flags;

		bool usm = xe->info.has_usm;
		u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE;

		clear_L0 = xe_migrate_res_sizes(m, &src_it);

		/* Calculate final sizes and batch size.. */
		pte_flags = clear_vram ? PTE_UPDATE_FLAG_IS_VRAM : 0;
		batch_size = 1 +
			pte_update_size(m, pte_flags, src, &src_it,
					&clear_L0, &clear_L0_ofs, &clear_L0_pt,
					clear_bo_data ? emit_clear_cmd_len(gt) : 0, 0,
					avail_pts);

		if (xe_migrate_needs_ccs_emit(xe))
			batch_size += EMIT_COPY_CCS_DW;

		/* Clear commands */

		/* A zero-sized pass would never make progress; bail out of the loop. */
		if (WARN_ON_ONCE(!clear_L0))
			break;

		bb = xe_bb_new(gt, batch_size, usm);
		if (IS_ERR(bb)) {
			err = PTR_ERR(bb);
			goto err_sync;
		}

		size -= clear_L0;
		/* Preemption is enabled again by the ring ops. */
		if (clear_vram && xe_migrate_allow_identity(clear_L0, &src_it)) {
			xe_res_next(&src_it, clear_L0);
		} else {
			emit_pte(m, bb, clear_L0_pt, clear_vram,
				 clear_only_system_ccs, &src_it, clear_L0, dst);
			/* PTEs were rewritten, so the job must invalidate the TLB. */
			flush_flags |= MI_INVALIDATE_TLB;
		}

		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
		update_idx = bb->len;

		if (clear_bo_data)
			emit_clear(gt, bb, clear_L0_ofs, clear_L0, XE_PAGE_SIZE, clear_vram);

		if (xe_migrate_needs_ccs_emit(xe)) {
			emit_copy_ccs(gt, bb, clear_L0_ofs, true,
				      m->cleared_mem_ofs, false, clear_L0);
			flush_flags |= MI_FLUSH_DW_CCS;
		}

		job = xe_bb_create_migration_job(m->q, bb,
						 xe_migrate_batch_base(m, usm),
						 update_idx);
		if (IS_ERR(job)) {
			err = PTR_ERR(job);
			goto err;
		}

		xe_sched_job_add_migrate_flush(job, flush_flags);
		/* Only the first job of the sequence picks up the resv dependencies. */
		if (!fence) {
			/*
			 * There can't be anything userspace related at this
			 * point, so we just need to respect any potential move
			 * fences, which are always tracked as
			 * DMA_RESV_USAGE_KERNEL.
			 */
			err = xe_sched_job_add_deps(job, bo->ttm.base.resv,
						    DMA_RESV_USAGE_KERNEL);
			if (err)
				goto err_job;
		}

		mutex_lock(&m->job_mutex);
		xe_sched_job_arm(job);
		dma_fence_put(fence);
		fence = dma_fence_get(&job->drm.s_fence->finished);
		xe_sched_job_push(job);

		dma_fence_put(m->fence);
		m->fence = dma_fence_get(fence);

		mutex_unlock(&m->job_mutex);

		xe_bb_free(bb, fence);
		continue;

		/* Error unwinding; only reachable through the gotos above. */
err_job:
		xe_sched_job_put(job);
err:
		xe_bb_free(bb, NULL);
err_sync:
		/* Sync partial copies if any. FIXME: job_mutex? */
		if (fence) {
			dma_fence_wait(fence, false);
			dma_fence_put(fence);
		}

		return ERR_PTR(err);
	}

	if (clear_ccs)
		bo->ccs_cleared = true;

	return fence;
}

/*
 * Append MI_STORE_DATA_IMM commands to @bb that write the @update->qwords
 * entries described by @update, split into chunks of at most
 * MAX_PTE_PER_SDI qwords. The entry values are produced in place by the
 * populate (bind) or clear (unbind) callback of @pt_update->ops.
 *
 * A @ppgtt_ofs of 0 means "derive the target address from @update->pt_bo".
 */
static void write_pgtable(struct xe_tile *tile, struct xe_bb *bb, u64 ppgtt_ofs,
			  const struct xe_vm_pgtable_update_op *pt_op,
			  const struct xe_vm_pgtable_update *update,
			  struct xe_migrate_pt_update *pt_update)
{
	const struct xe_migrate_pt_update_ops *ops = pt_update->ops;
	u32 chunk;
	u32 ofs = update->ofs, size = update->qwords;

	/*
	 * If we have 512 entries (max), we would populate it ourselves,
	 * and update the PDE above it to the new pointer.
	 * The only time this can happen is if we have to update the top
	 * PDE. This requires a BO that is almost vm->size big.
	 *
	 * This shouldn't be possible in practice.. might change when 16K
	 * pages are used. Hence the assert.
	 */
	xe_tile_assert(tile, update->qwords < MAX_NUM_PTE);
	if (!ppgtt_ofs)
		ppgtt_ofs = xe_migrate_vram_ofs(tile_to_xe(tile),
						xe_bo_addr(update->pt_bo, 0,
							   XE_PAGE_SIZE), false);

	do {
		/* Each entry is one qword (8 bytes). */
		u64 addr = ppgtt_ofs + ofs * 8;

		chunk = min(size, MAX_PTE_PER_SDI);

		/* Ensure populatefn can do memset64 by aligning bb->cs */
		if (!(bb->len & 1))
			bb->cs[bb->len++] = MI_NOOP;

		bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk);
		bb->cs[bb->len++] = lower_32_bits(addr);
		bb->cs[bb->len++] = upper_32_bits(addr);
		if (pt_op->bind)
			ops->populate(pt_update, tile, NULL, bb->cs + bb->len,
				      ofs, chunk, update);
		else
			ops->clear(pt_update, tile, NULL, bb->cs + bb->len,
				   ofs, chunk, update);

		/* The callback filled chunk qwords, i.e. 2 * chunk dwords. */
		bb->len += chunk * 2;
		ofs += chunk;
		size -= chunk;
	} while (size);
}

/*
 * Return the result of xe_vm_get() on the VM of the migrate context's
 * default exec queue (presumably a referenced pointer the caller must
 * release - confirm against xe_vm_get()).
 */
struct xe_vm *xe_migrate_get_vm(struct xe_migrate *m)
{
	return xe_vm_get(m->q->vm);
}

#if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST)
/* KUnit parameters; force_gpu makes the CPU page-table update path bail out. */
struct migrate_test_params {
	struct xe_test_priv base;
	bool force_gpu;
};

#define to_migrate_test_params(_priv) \
	container_of(_priv, struct migrate_test_params, base)
#endif

/*
 * Perform the page-table update directly from the CPU through the PT BOs'
 * vmaps.
 *
 * Return: the dma-fence stub on success, ERR_PTR(-ETIME) when a KUnit test
 * forces the GPU path, or the error returned by the pre_commit callback.
 */
static struct dma_fence *
xe_migrate_update_pgtables_cpu(struct xe_migrate *m,
			       struct xe_migrate_pt_update *pt_update)
{
	XE_TEST_DECLARE(struct migrate_test_params *test =
				to_migrate_test_params
				(xe_cur_kunit_priv(XE_TEST_LIVE_MIGRATE));)
	const struct xe_migrate_pt_update_ops *ops = pt_update->ops;
	struct xe_vm *vm = pt_update->vops->vm;
	struct xe_vm_pgtable_update_ops *pt_update_ops =
		&pt_update->vops->pt_update_ops[pt_update->tile_id];
	int err;
	u32 i, j;

	if (XE_TEST_ONLY(test && test->force_gpu))
		return ERR_PTR(-ETIME);

	if (ops->pre_commit) {
		/* No job exists on the CPU path. */
		pt_update->job = NULL;
		err = ops->pre_commit(pt_update);
		if (err)
			return ERR_PTR(err);
	}

	for (i = 0; i < pt_update_ops->num_ops; ++i) {
		const struct xe_vm_pgtable_update_op *pt_op =
			&pt_update_ops->ops[i];

		for (j = 0; j < pt_op->num_entries; j++) {
			const struct xe_vm_pgtable_update *update =
				&pt_op->entries[j];

			if (pt_op->bind)
				ops->populate(pt_update, m->tile,
					      &update->pt_bo->vmap, NULL,
					      update->ofs, update->qwords,
					      update);
			else
				ops->clear(pt_update, m->tile,
					   &update->pt_bo->vmap, NULL,
					   update->ofs, update->qwords, update);
		}
	}

	trace_xe_vm_cpu_bind(vm);
	xe_device_wmb(vm->xe);

	return dma_fence_get_stub();
}

/*
 * GPU path of the page-table update: build a batch of MI_STORE_DATA_IMM
 * writes for all entries in @pt_update_ops and submit it on
 * @pt_update_ops->q.
 *
 * On non-discrete devices the page-table BOs are first mapped into the
 * migrate VM by a preamble that precedes the first MI_BATCH_BUFFER_END; on
 * discrete devices the writes are addressed through write_pgtable()'s
 * ppgtt_ofs == 0 path and no preamble is needed.
 *
 * Return: the job's finished fence, or an error pointer on failure.
 */
static struct dma_fence *
__xe_migrate_update_pgtables(struct xe_migrate *m,
			     struct xe_migrate_pt_update *pt_update,
			     struct xe_vm_pgtable_update_ops *pt_update_ops)
{
	const struct xe_migrate_pt_update_ops *ops = pt_update->ops;
	struct xe_tile *tile = m->tile;
	struct xe_gt *gt = tile->primary_gt;
	struct xe_device *xe = tile_to_xe(tile);
	struct xe_sched_job *job;
	struct dma_fence *fence;
	struct drm_suballoc *sa_bo = NULL;
	struct xe_bb *bb;
	u32 i, j, batch_size = 0, ppgtt_ofs, update_idx, page_ofs = 0;
	u32 num_updates = 0, current_update = 0;
	u64 addr;
	int err = 0;
	/* True when the update runs on the migrate context's own queue. */
	bool is_migrate = pt_update_ops->q == m->q;
	bool usm = is_migrate && xe->info.has_usm;

	for (i = 0; i < pt_update_ops->num_ops; ++i) {
		struct xe_vm_pgtable_update_op *pt_op = &pt_update_ops->ops[i];
		struct xe_vm_pgtable_update *updates = pt_op->entries;

		num_updates += pt_op->num_entries;
		for (j = 0; j < pt_op->num_entries; ++j) {
			u32 num_cmds = DIV_ROUND_UP(updates[j].qwords,
						    MAX_PTE_PER_SDI);

			/* align noop + MI_STORE_DATA_IMM cmd prefix */
			batch_size += 4 * num_cmds + updates[j].qwords * 2;
		}
	}

	/* fixed + PTE entries */
	if (IS_DGFX(xe))
		batch_size += 2;
	else
		batch_size += 6 * (num_updates / MAX_PTE_PER_SDI + 1) +
			num_updates * 2;

	bb = xe_bb_new(gt, batch_size, usm);
	if (IS_ERR(bb))
		return ERR_CAST(bb);

	/* For sysmem PTE's, need to map them in our hole.. */
	if (!IS_DGFX(xe)) {
		u16 pat_index = xe_cache_pat_idx(xe, XE_CACHE_WB);
		u32 ptes, ofs;

		/* Default slot, used when running on the migrate queue. */
		ppgtt_ofs = NUM_KERNEL_PDE - 1;
		if (!is_migrate) {
			/* Other queues get a suballocated range from vm_update_sa. */
			u32 num_units = DIV_ROUND_UP(num_updates,
						     NUM_VMUSA_WRITES_PER_UNIT);

			if (num_units > m->vm_update_sa.size) {
				err = -ENOBUFS;
				goto err_bb;
			}
			sa_bo = drm_suballoc_new(&m->vm_update_sa, num_units,
						 GFP_KERNEL, true, 0);
			if (IS_ERR(sa_bo)) {
				err = PTR_ERR(sa_bo);
				goto err_bb;
			}

			ppgtt_ofs = NUM_KERNEL_PDE +
				(drm_suballoc_soffset(sa_bo) /
				 NUM_VMUSA_UNIT_PER_PAGE);
			page_ofs = (drm_suballoc_soffset(sa_bo) %
				    NUM_VMUSA_UNIT_PER_PAGE) *
				VM_SA_UPDATE_UNIT_SIZE;
		}

		/* Map our PT's to gtt */
		i = 0;
		j = 0;
		ptes = num_updates;
		ofs = ppgtt_ofs * XE_PAGE_SIZE + page_ofs;
		while (ptes) {
			u32 chunk = min(MAX_PTE_PER_SDI, ptes);
			u32 idx = 0;

			bb->cs[bb->len++] = MI_STORE_DATA_IMM |
				MI_SDI_NUM_QW(chunk);
			bb->cs[bb->len++] = ofs;
			bb->cs[bb->len++] = 0; /* upper_32_bits */

			/*
			 * i and j persist across commands so that the walk
			 * resumes where the previous command stopped.
			 */
			for (; i < pt_update_ops->num_ops; ++i) {
				struct xe_vm_pgtable_update_op *pt_op =
					&pt_update_ops->ops[i];
				struct xe_vm_pgtable_update *updates = pt_op->entries;

				for (; j < pt_op->num_entries; ++j, ++current_update, ++idx) {
					struct xe_vm *vm = pt_update->vops->vm;
					struct xe_bo *pt_bo = updates[j].pt_bo;

					/* This command is full; start the next one. */
					if (idx == chunk)
						goto next_cmd;

					xe_tile_assert(tile, xe_bo_size(pt_bo) == SZ_4K);

					/* Map a PT at most once */
					if (pt_bo->update_index < 0)
						pt_bo->update_index = current_update;

					addr = vm->pt_ops->pte_encode_bo(pt_bo, 0,
									 pat_index, 0);
					bb->cs[bb->len++] = lower_32_bits(addr);
					bb->cs[bb->len++] = upper_32_bits(addr);
				}

				j = 0;
			}

next_cmd:
			ptes -= chunk;
			ofs += chunk * sizeof(u64);
		}

		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
		update_idx = bb->len;

		/* Each PTE written above maps one XE_PAGE_SIZE page of the VM. */
		addr = xe_migrate_vm_addr(ppgtt_ofs, 0) +
			(page_ofs / sizeof(u64)) * XE_PAGE_SIZE;
		for (i = 0; i < pt_update_ops->num_ops; ++i) {
			struct xe_vm_pgtable_update_op *pt_op =
				&pt_update_ops->ops[i];
			struct xe_vm_pgtable_update *updates = pt_op->entries;

			for (j = 0; j < pt_op->num_entries; ++j) {
				struct xe_bo *pt_bo = updates[j].pt_bo;

				write_pgtable(tile, bb, addr +
					      pt_bo->update_index * XE_PAGE_SIZE,
					      pt_op, &updates[j], pt_update);
			}
		}
	} else {
		/* phys pages, no preamble required */
		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
		update_idx = bb->len;

		for (i = 0; i < pt_update_ops->num_ops; ++i) {
			struct xe_vm_pgtable_update_op *pt_op =
				&pt_update_ops->ops[i];
			struct xe_vm_pgtable_update *updates = pt_op->entries;

			for (j = 0; j < pt_op->num_entries; ++j)
				write_pgtable(tile, bb, 0, pt_op, &updates[j],
					      pt_update);
		}
	}

	job = xe_bb_create_migration_job(pt_update_ops->q, bb,
					 xe_migrate_batch_base(m, usm),
					 update_idx);
	if (IS_ERR(job)) {
		err = PTR_ERR(job);
		goto err_sa;
	}

	xe_sched_job_add_migrate_flush(job, MI_INVALIDATE_TLB);

	if (ops->pre_commit) {
		pt_update->job = job;
		err = ops->pre_commit(pt_update);
		if (err)
			goto err_job;
	}
	/* job_mutex is only taken when submitting on the migrate queue. */
	if (is_migrate)
		mutex_lock(&m->job_mutex);

	xe_sched_job_arm(job);
	fence = dma_fence_get(&job->drm.s_fence->finished);
	xe_sched_job_push(job);

	if (is_migrate)
		mutex_unlock(&m->job_mutex);

	xe_bb_free(bb, fence);
	drm_suballoc_free(sa_bo, fence);

	return fence;

err_job:
	xe_sched_job_put(job);
err_sa:
	drm_suballoc_free(sa_bo, NULL);
err_bb:
	xe_bb_free(bb, NULL);
	return ERR_PTR(err);
}

/**
 * xe_migrate_update_pgtables() - Pipelined page-table update
 * @m: The migrate context.
 * @pt_update: PT update arguments
 *
 * Perform a pipelined page-table update. The update descriptors are typically
 * built under the same lock critical section as a call to this function. If
 * using the default engine for the updates, they will be performed in the
 * order they grab the job_mutex. If different engines are used, external
 * synchronization is needed for overlapping updates to maintain page-table
 * consistency. Note that the meaning of "overlapping" is that the updates
 * touch the same page-table, which might be a higher-level page-directory.
 * If no pipelining is needed, then updates may be performed by the cpu.
 *
 * Return: A dma_fence that, when signaled, indicates the update completion.
 */
struct dma_fence *
xe_migrate_update_pgtables(struct xe_migrate *m,
			   struct xe_migrate_pt_update *pt_update)

{
	struct xe_vm_pgtable_update_ops *pt_update_ops =
		&pt_update->vops->pt_update_ops[pt_update->tile_id];
	struct dma_fence *fence;

	/* Try the CPU path first and fall back to a GPU job only on -ETIME. */
	fence = xe_migrate_update_pgtables_cpu(m, pt_update);

	/* -ETIME indicates a job is needed, anything else is legit error */
	if (!IS_ERR(fence) || PTR_ERR(fence) != -ETIME)
		return fence;

	return __xe_migrate_update_pgtables(m, pt_update, pt_update_ops);
}

/**
 * xe_migrate_wait() - Complete all operations using the xe_migrate context
 * @m: Migrate context to wait for.
 *
 * Waits until the GPU no longer uses the migrate context's default engine
 * or its page-table objects. FIXME: What about separate page-table update
 * engines?
2068 */ 2069 void xe_migrate_wait(struct xe_migrate *m) 2070 { 2071 if (m->fence) 2072 dma_fence_wait(m->fence, false); 2073 } 2074 2075 static u32 pte_update_cmd_size(u64 size) 2076 { 2077 u32 num_dword; 2078 u64 entries = DIV_U64_ROUND_UP(size, XE_PAGE_SIZE); 2079 2080 XE_WARN_ON(size > MAX_PREEMPTDISABLE_TRANSFER); 2081 2082 /* 2083 * MI_STORE_DATA_IMM command is used to update page table. Each 2084 * instruction can update maximumly MAX_PTE_PER_SDI pte entries. To 2085 * update n (n <= MAX_PTE_PER_SDI) pte entries, we need: 2086 * 2087 * - 1 dword for the MI_STORE_DATA_IMM command header (opcode etc) 2088 * - 2 dword for the page table's physical location 2089 * - 2*n dword for value of pte to fill (each pte entry is 2 dwords) 2090 */ 2091 num_dword = (1 + 2) * DIV_U64_ROUND_UP(entries, MAX_PTE_PER_SDI); 2092 num_dword += entries * 2; 2093 2094 return num_dword; 2095 } 2096 2097 static void build_pt_update_batch_sram(struct xe_migrate *m, 2098 struct xe_bb *bb, u32 pt_offset, 2099 struct drm_pagemap_addr *sram_addr, 2100 u32 size, int level) 2101 { 2102 u16 pat_index = xe_cache_pat_idx(tile_to_xe(m->tile), XE_CACHE_WB); 2103 u64 gpu_page_size = 0x1ull << xe_pt_shift(level); 2104 u32 ptes; 2105 int i = 0; 2106 2107 xe_tile_assert(m->tile, PAGE_ALIGNED(size)); 2108 2109 ptes = DIV_ROUND_UP(size, gpu_page_size); 2110 while (ptes) { 2111 u32 chunk = min(MAX_PTE_PER_SDI, ptes); 2112 2113 if (!level) 2114 chunk = ALIGN_DOWN(chunk, PAGE_SIZE / XE_PAGE_SIZE); 2115 2116 bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk); 2117 bb->cs[bb->len++] = pt_offset; 2118 bb->cs[bb->len++] = 0; 2119 2120 pt_offset += chunk * 8; 2121 ptes -= chunk; 2122 2123 while (chunk--) { 2124 u64 addr = sram_addr[i].addr; 2125 u64 pte; 2126 2127 xe_tile_assert(m->tile, sram_addr[i].proto == 2128 DRM_INTERCONNECT_SYSTEM || 2129 sram_addr[i].proto == XE_INTERCONNECT_P2P); 2130 xe_tile_assert(m->tile, addr); 2131 xe_tile_assert(m->tile, PAGE_ALIGNED(addr)); 2132 2133 again: 2134 pte = 
m->q->vm->pt_ops->pte_encode_addr(m->tile->xe, 2135 addr, pat_index, 2136 level, false, 0); 2137 bb->cs[bb->len++] = lower_32_bits(pte); 2138 bb->cs[bb->len++] = upper_32_bits(pte); 2139 2140 if (gpu_page_size < PAGE_SIZE) { 2141 addr += XE_PAGE_SIZE; 2142 if (!PAGE_ALIGNED(addr)) { 2143 chunk--; 2144 goto again; 2145 } 2146 i++; 2147 } else { 2148 i += gpu_page_size / PAGE_SIZE; 2149 } 2150 } 2151 } 2152 } 2153 2154 static bool xe_migrate_vram_use_pde(struct drm_pagemap_addr *sram_addr, 2155 unsigned long size) 2156 { 2157 u32 large_size = (0x1 << xe_pt_shift(1)); 2158 unsigned long i, incr = large_size / PAGE_SIZE; 2159 2160 for (i = 0; i < DIV_ROUND_UP(size, PAGE_SIZE); i += incr) 2161 if (PAGE_SIZE << sram_addr[i].order != large_size) 2162 return false; 2163 2164 return true; 2165 } 2166 2167 #define XE_CACHELINE_BYTES 64ull 2168 #define XE_CACHELINE_MASK (XE_CACHELINE_BYTES - 1) 2169 2170 static u32 xe_migrate_copy_pitch(struct xe_device *xe, u32 len) 2171 { 2172 u32 pitch; 2173 2174 if (IS_ALIGNED(len, PAGE_SIZE)) 2175 pitch = PAGE_SIZE; 2176 else if (IS_ALIGNED(len, SZ_4K)) 2177 pitch = SZ_4K; 2178 else if (IS_ALIGNED(len, SZ_256)) 2179 pitch = SZ_256; 2180 else if (IS_ALIGNED(len, 4)) 2181 pitch = 4; 2182 else 2183 pitch = 1; 2184 2185 xe_assert(xe, pitch > 1 || xe->info.has_mem_copy_instr); 2186 return pitch; 2187 } 2188 2189 static struct dma_fence *xe_migrate_vram(struct xe_migrate *m, 2190 unsigned long len, 2191 unsigned long sram_offset, 2192 struct drm_pagemap_addr *sram_addr, 2193 u64 vram_addr, 2194 struct dma_fence *deps, 2195 const enum xe_migrate_copy_dir dir) 2196 { 2197 struct xe_gt *gt = m->tile->primary_gt; 2198 struct xe_device *xe = gt_to_xe(gt); 2199 bool use_usm_batch = xe->info.has_usm; 2200 struct dma_fence *fence = NULL; 2201 u32 batch_size = 1; 2202 u64 src_L0_ofs, dst_L0_ofs; 2203 struct xe_sched_job *job; 2204 struct xe_bb *bb; 2205 u32 update_idx, pt_slot = 0; 2206 unsigned long npages = DIV_ROUND_UP(len + sram_offset, PAGE_SIZE); 
2207 unsigned int pitch = xe_migrate_copy_pitch(xe, len); 2208 int err; 2209 unsigned long i, j; 2210 bool use_pde = xe_migrate_vram_use_pde(sram_addr, len + sram_offset); 2211 2212 if (!xe->info.has_mem_copy_instr && 2213 drm_WARN_ON(&xe->drm, 2214 (!IS_ALIGNED(len, pitch)) || (sram_offset | vram_addr) & XE_CACHELINE_MASK)) 2215 return ERR_PTR(-EOPNOTSUPP); 2216 2217 xe_assert(xe, npages * PAGE_SIZE <= MAX_PREEMPTDISABLE_TRANSFER); 2218 2219 batch_size += pte_update_cmd_size(npages << PAGE_SHIFT); 2220 batch_size += EMIT_COPY_DW; 2221 2222 bb = xe_bb_new(gt, batch_size, use_usm_batch); 2223 if (IS_ERR(bb)) { 2224 err = PTR_ERR(bb); 2225 return ERR_PTR(err); 2226 } 2227 2228 /* 2229 * If the order of a struct drm_pagemap_addr entry is greater than 0, 2230 * the entry is populated by GPU pagemap but subsequent entries within 2231 * the range of that order are not populated. 2232 * build_pt_update_batch_sram() expects a fully populated array of 2233 * struct drm_pagemap_addr. Ensure this is the case even with higher 2234 * orders. 
2235 */ 2236 for (i = 0; !use_pde && i < npages;) { 2237 unsigned int order = sram_addr[i].order; 2238 2239 for (j = 1; j < NR_PAGES(order) && i + j < npages; j++) 2240 if (!sram_addr[i + j].addr) 2241 sram_addr[i + j].addr = sram_addr[i].addr + j * PAGE_SIZE; 2242 2243 i += NR_PAGES(order); 2244 } 2245 2246 if (use_pde) 2247 build_pt_update_batch_sram(m, bb, m->large_page_copy_pdes, 2248 sram_addr, npages << PAGE_SHIFT, 1); 2249 else 2250 build_pt_update_batch_sram(m, bb, pt_slot * XE_PAGE_SIZE, 2251 sram_addr, npages << PAGE_SHIFT, 0); 2252 2253 if (dir == XE_MIGRATE_COPY_TO_VRAM) { 2254 if (use_pde) 2255 src_L0_ofs = m->large_page_copy_ofs + sram_offset; 2256 else 2257 src_L0_ofs = xe_migrate_vm_addr(pt_slot, 0) + sram_offset; 2258 dst_L0_ofs = xe_migrate_vram_ofs(xe, vram_addr, false); 2259 2260 } else { 2261 src_L0_ofs = xe_migrate_vram_ofs(xe, vram_addr, false); 2262 if (use_pde) 2263 dst_L0_ofs = m->large_page_copy_ofs + sram_offset; 2264 else 2265 dst_L0_ofs = xe_migrate_vm_addr(pt_slot, 0) + sram_offset; 2266 } 2267 2268 bb->cs[bb->len++] = MI_BATCH_BUFFER_END; 2269 update_idx = bb->len; 2270 2271 emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, len, pitch); 2272 2273 job = xe_bb_create_migration_job(m->q, bb, 2274 xe_migrate_batch_base(m, use_usm_batch), 2275 update_idx); 2276 if (IS_ERR(job)) { 2277 err = PTR_ERR(job); 2278 goto err; 2279 } 2280 2281 xe_sched_job_add_migrate_flush(job, MI_INVALIDATE_TLB); 2282 2283 if (deps && !dma_fence_is_signaled(deps)) { 2284 dma_fence_get(deps); 2285 err = drm_sched_job_add_dependency(&job->drm, deps); 2286 if (err) 2287 dma_fence_wait(deps, false); 2288 err = 0; 2289 } 2290 2291 mutex_lock(&m->job_mutex); 2292 xe_sched_job_arm(job); 2293 fence = dma_fence_get(&job->drm.s_fence->finished); 2294 xe_sched_job_push(job); 2295 2296 dma_fence_put(m->fence); 2297 m->fence = dma_fence_get(fence); 2298 mutex_unlock(&m->job_mutex); 2299 2300 xe_bb_free(bb, fence); 2301 2302 return fence; 2303 2304 err: 2305 xe_bb_free(bb, NULL); 
2306 2307 return ERR_PTR(err); 2308 } 2309 2310 /** 2311 * xe_migrate_to_vram() - Migrate to VRAM 2312 * @m: The migration context. 2313 * @npages: Number of pages to migrate. 2314 * @src_addr: Array of DMA information (source of migrate) 2315 * @dst_addr: Device physical address of VRAM (destination of migrate) 2316 * @deps: struct dma_fence representing the dependencies that need 2317 * to be signaled before migration. 2318 * 2319 * Copy from an array dma addresses to a VRAM device physical address 2320 * 2321 * Return: dma fence for migrate to signal completion on success, ERR_PTR on 2322 * failure 2323 */ 2324 struct dma_fence *xe_migrate_to_vram(struct xe_migrate *m, 2325 unsigned long npages, 2326 struct drm_pagemap_addr *src_addr, 2327 u64 dst_addr, 2328 struct dma_fence *deps) 2329 { 2330 return xe_migrate_vram(m, npages * PAGE_SIZE, 0, src_addr, dst_addr, 2331 deps, XE_MIGRATE_COPY_TO_VRAM); 2332 } 2333 2334 /** 2335 * xe_migrate_from_vram() - Migrate from VRAM 2336 * @m: The migration context. 2337 * @npages: Number of pages to migrate. 2338 * @src_addr: Device physical address of VRAM (source of migrate) 2339 * @dst_addr: Array of DMA information (destination of migrate) 2340 * @deps: struct dma_fence representing the dependencies that need 2341 * to be signaled before migration. 
2342 * 2343 * Copy from a VRAM device physical address to an array dma addresses 2344 * 2345 * Return: dma fence for migrate to signal completion on success, ERR_PTR on 2346 * failure 2347 */ 2348 struct dma_fence *xe_migrate_from_vram(struct xe_migrate *m, 2349 unsigned long npages, 2350 u64 src_addr, 2351 struct drm_pagemap_addr *dst_addr, 2352 struct dma_fence *deps) 2353 { 2354 return xe_migrate_vram(m, npages * PAGE_SIZE, 0, dst_addr, src_addr, 2355 deps, XE_MIGRATE_COPY_TO_SRAM); 2356 } 2357 2358 static void xe_migrate_dma_unmap(struct xe_device *xe, 2359 struct drm_pagemap_addr *pagemap_addr, 2360 int len, int write) 2361 { 2362 unsigned long i, npages = DIV_ROUND_UP(len, PAGE_SIZE); 2363 2364 for (i = 0; i < npages; ++i) { 2365 if (!pagemap_addr[i].addr) 2366 break; 2367 2368 dma_unmap_page(xe->drm.dev, pagemap_addr[i].addr, PAGE_SIZE, 2369 write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); 2370 } 2371 kfree(pagemap_addr); 2372 } 2373 2374 static struct drm_pagemap_addr *xe_migrate_dma_map(struct xe_device *xe, 2375 void *buf, int len, 2376 int write) 2377 { 2378 struct drm_pagemap_addr *pagemap_addr; 2379 unsigned long i, npages = DIV_ROUND_UP(len, PAGE_SIZE); 2380 2381 pagemap_addr = kzalloc_objs(*pagemap_addr, npages); 2382 if (!pagemap_addr) 2383 return ERR_PTR(-ENOMEM); 2384 2385 for (i = 0; i < npages; ++i) { 2386 dma_addr_t addr; 2387 struct page *page; 2388 enum dma_data_direction dir = write ? 
DMA_TO_DEVICE : 2389 DMA_FROM_DEVICE; 2390 2391 if (is_vmalloc_addr(buf)) 2392 page = vmalloc_to_page(buf); 2393 else 2394 page = virt_to_page(buf); 2395 2396 addr = dma_map_page(xe->drm.dev, page, 0, PAGE_SIZE, dir); 2397 if (dma_mapping_error(xe->drm.dev, addr)) 2398 goto err_fault; 2399 2400 pagemap_addr[i] = 2401 drm_pagemap_addr_encode(addr, 2402 DRM_INTERCONNECT_SYSTEM, 2403 0, dir); 2404 buf += PAGE_SIZE; 2405 } 2406 2407 return pagemap_addr; 2408 2409 err_fault: 2410 xe_migrate_dma_unmap(xe, pagemap_addr, len, write); 2411 return ERR_PTR(-EFAULT); 2412 } 2413 2414 /** 2415 * xe_migrate_access_memory - Access memory of a BO via GPU 2416 * 2417 * @m: The migration context. 2418 * @bo: buffer object 2419 * @offset: access offset into buffer object 2420 * @buf: pointer to caller memory to read into or write from 2421 * @len: length of access 2422 * @write: write access 2423 * 2424 * Access memory of a BO via GPU either reading in or writing from a passed in 2425 * pointer. Pointer is dma mapped for GPU access and GPU commands are issued to 2426 * read to or write from pointer. 2427 * 2428 * Returns: 2429 * 0 if successful, negative error code on failure. 
2430 */ 2431 int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo, 2432 unsigned long offset, void *buf, int len, 2433 int write) 2434 { 2435 struct xe_tile *tile = m->tile; 2436 struct xe_device *xe = tile_to_xe(tile); 2437 struct xe_res_cursor cursor; 2438 struct dma_fence *fence = NULL; 2439 struct drm_pagemap_addr *pagemap_addr; 2440 unsigned long page_offset = (unsigned long)buf & ~PAGE_MASK; 2441 int bytes_left = len, current_page = 0; 2442 void *orig_buf = buf; 2443 2444 xe_bo_assert_held(bo); 2445 2446 /* Use bounce buffer for small access and unaligned access */ 2447 if (!xe->info.has_mem_copy_instr && 2448 (!IS_ALIGNED(len, 4) || 2449 !IS_ALIGNED(page_offset, XE_CACHELINE_BYTES) || 2450 !IS_ALIGNED(offset, XE_CACHELINE_BYTES))) { 2451 int buf_offset = 0; 2452 void *bounce; 2453 int err; 2454 2455 BUILD_BUG_ON(!is_power_of_2(XE_CACHELINE_BYTES)); 2456 bounce = kmalloc(XE_CACHELINE_BYTES, GFP_KERNEL); 2457 if (!bounce) 2458 return -ENOMEM; 2459 2460 /* 2461 * Less than ideal for large unaligned access but this should be 2462 * fairly rare, can fixup if this becomes common. 
2463 */ 2464 do { 2465 int copy_bytes = min_t(int, bytes_left, 2466 XE_CACHELINE_BYTES - 2467 (offset & XE_CACHELINE_MASK)); 2468 int ptr_offset = offset & XE_CACHELINE_MASK; 2469 2470 err = xe_migrate_access_memory(m, bo, 2471 offset & 2472 ~XE_CACHELINE_MASK, 2473 bounce, 2474 XE_CACHELINE_BYTES, 0); 2475 if (err) 2476 break; 2477 2478 if (write) { 2479 memcpy(bounce + ptr_offset, buf + buf_offset, copy_bytes); 2480 2481 err = xe_migrate_access_memory(m, bo, 2482 offset & ~XE_CACHELINE_MASK, 2483 bounce, 2484 XE_CACHELINE_BYTES, write); 2485 if (err) 2486 break; 2487 } else { 2488 memcpy(buf + buf_offset, bounce + ptr_offset, 2489 copy_bytes); 2490 } 2491 2492 bytes_left -= copy_bytes; 2493 buf_offset += copy_bytes; 2494 offset += copy_bytes; 2495 } while (bytes_left); 2496 2497 kfree(bounce); 2498 return err; 2499 } 2500 2501 pagemap_addr = xe_migrate_dma_map(xe, buf, len + page_offset, write); 2502 if (IS_ERR(pagemap_addr)) 2503 return PTR_ERR(pagemap_addr); 2504 2505 xe_res_first(bo->ttm.resource, offset, xe_bo_size(bo) - offset, &cursor); 2506 2507 do { 2508 struct dma_fence *__fence; 2509 u64 vram_addr = vram_region_gpu_offset(bo->ttm.resource) + 2510 cursor.start; 2511 int current_bytes; 2512 u32 pitch; 2513 2514 if (cursor.size > MAX_PREEMPTDISABLE_TRANSFER) 2515 current_bytes = min_t(int, bytes_left, 2516 MAX_PREEMPTDISABLE_TRANSFER); 2517 else 2518 current_bytes = min_t(int, bytes_left, cursor.size); 2519 2520 pitch = xe_migrate_copy_pitch(xe, current_bytes); 2521 if (xe->info.has_mem_copy_instr) 2522 current_bytes = min_t(int, current_bytes, U16_MAX * pitch); 2523 else 2524 current_bytes = min_t(int, current_bytes, 2525 round_down(S16_MAX * pitch, 2526 XE_CACHELINE_BYTES)); 2527 2528 __fence = xe_migrate_vram(m, current_bytes, 2529 (unsigned long)buf & ~PAGE_MASK, 2530 &pagemap_addr[current_page], 2531 vram_addr, NULL, write ? 
2532 XE_MIGRATE_COPY_TO_VRAM : 2533 XE_MIGRATE_COPY_TO_SRAM); 2534 if (IS_ERR(__fence)) { 2535 if (fence) { 2536 dma_fence_wait(fence, false); 2537 dma_fence_put(fence); 2538 } 2539 fence = __fence; 2540 goto out_err; 2541 } 2542 2543 dma_fence_put(fence); 2544 fence = __fence; 2545 2546 buf += current_bytes; 2547 offset += current_bytes; 2548 current_page = (int)(buf - orig_buf) / PAGE_SIZE; 2549 bytes_left -= current_bytes; 2550 if (bytes_left) 2551 xe_res_next(&cursor, current_bytes); 2552 } while (bytes_left); 2553 2554 dma_fence_wait(fence, false); 2555 dma_fence_put(fence); 2556 2557 out_err: 2558 xe_migrate_dma_unmap(xe, pagemap_addr, len + page_offset, write); 2559 return IS_ERR(fence) ? PTR_ERR(fence) : 0; 2560 } 2561 2562 /** 2563 * xe_migrate_job_lock() - Lock migrate job lock 2564 * @m: The migration context. 2565 * @q: Queue associated with the operation which requires a lock 2566 * 2567 * Lock the migrate job lock if the queue is a migration queue, otherwise 2568 * assert the VM's dma-resv is held (user queue's have own locking). 2569 */ 2570 void xe_migrate_job_lock(struct xe_migrate *m, struct xe_exec_queue *q) 2571 { 2572 bool is_migrate = q == m->q; 2573 2574 if (is_migrate) 2575 mutex_lock(&m->job_mutex); 2576 else 2577 xe_vm_assert_held(q->user_vm); /* User queues VM's should be locked */ 2578 } 2579 2580 /** 2581 * xe_migrate_job_unlock() - Unlock migrate job lock 2582 * @m: The migration context. 2583 * @q: Queue associated with the operation which requires a lock 2584 * 2585 * Unlock the migrate job lock if the queue is a migration queue, otherwise 2586 * assert the VM's dma-resv is held (user queue's have own locking). 
2587 */ 2588 void xe_migrate_job_unlock(struct xe_migrate *m, struct xe_exec_queue *q) 2589 { 2590 bool is_migrate = q == m->q; 2591 2592 if (is_migrate) 2593 mutex_unlock(&m->job_mutex); 2594 else 2595 xe_vm_assert_held(q->user_vm); /* User queues VM's should be locked */ 2596 } 2597 2598 #if IS_ENABLED(CONFIG_PROVE_LOCKING) 2599 /** 2600 * xe_migrate_job_lock_assert() - Assert migrate job lock held of queue 2601 * @q: Migrate queue 2602 */ 2603 void xe_migrate_job_lock_assert(struct xe_exec_queue *q) 2604 { 2605 struct xe_migrate *m = gt_to_tile(q->gt)->migrate; 2606 2607 xe_gt_assert(q->gt, q == m->q); 2608 lockdep_assert_held(&m->job_mutex); 2609 } 2610 #endif 2611 2612 #if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST) 2613 #include "tests/xe_migrate.c" 2614 #endif 2615