// SPDX-License-Identifier: MIT
/*
 * Copyright © 2020 Intel Corporation
 */

#include "xe_migrate.h"

#include <linux/bitfield.h>
#include <linux/sizes.h>

#include <drm/drm_managed.h>
#include <drm/ttm/ttm_tt.h>
#include <drm/xe_drm.h>

#include "generated/xe_wa_oob.h"
#include "instructions/xe_mi_commands.h"
#include "regs/xe_gpu_commands.h"
#include "tests/xe_test.h"
#include "xe_assert.h"
#include "xe_bb.h"
#include "xe_bo.h"
#include "xe_exec_queue.h"
#include "xe_ggtt.h"
#include "xe_gt.h"
#include "xe_hw_engine.h"
#include "xe_lrc.h"
#include "xe_map.h"
#include "xe_mocs.h"
#include "xe_pt.h"
#include "xe_res_cursor.h"
#include "xe_sched_job.h"
#include "xe_sync.h"
#include "xe_trace.h"
#include "xe_vm.h"
#include "xe_wa.h"

/**
 * struct xe_migrate - migrate context.
 */
struct xe_migrate {
	/** @q: Default exec queue used for migration */
	struct xe_exec_queue *q;
	/** @tile: Backpointer to the tile this struct xe_migrate belongs to. */
	struct xe_tile *tile;
	/** @job_mutex: Timeline mutex for @q. */
	struct mutex job_mutex;
	/** @pt_bo: Page-table buffer object. */
	struct xe_bo *pt_bo;
	/** @batch_base_ofs: VM offset of the migration batch buffer */
	u64 batch_base_ofs;
	/** @usm_batch_base_ofs: VM offset of the usm batch buffer */
	u64 usm_batch_base_ofs;
	/**
	 * @cleared_mem_ofs: VM offset of the 1GiB NULL mapping used as a
	 * zeroed source when clearing CCS metadata.
	 */
	u64 cleared_mem_ofs;
	/**
	 * @fence: dma-fence representing the last migration job batch.
	 * Protected by @job_mutex.
	 */
	struct dma_fence *fence;
	/**
	 * @vm_update_sa: For integrated, used to suballocate page-tables
	 * out of the pt_bo.
	 */
	struct drm_suballoc_manager vm_update_sa;
	/** @min_chunk_size: For dgfx, minimum chunk size */
	u64 min_chunk_size;
};

#define MAX_PREEMPTDISABLE_TRANSFER SZ_8M /* Around 1ms. */
#define MAX_CCS_LIMITED_TRANSFER SZ_4M /* XE_PAGE_SIZE * (FIELD_MAX(XE2_CCS_SIZE_MASK) + 1) */
#define NUM_KERNEL_PDE 17
#define NUM_PT_SLOTS 32
#define LEVEL0_PAGE_TABLE_ENCODE_SIZE SZ_2M

/**
 * xe_tile_migrate_engine() - Get this tile's migrate engine.
 * @tile: The tile.
 *
 * Returns the default migrate engine of this tile.
 * TODO: Perhaps this function is slightly misplaced, and even unneeded?
 *
 * Return: The default migrate engine
 */
struct xe_exec_queue *xe_tile_migrate_engine(struct xe_tile *tile)
{
	return tile->migrate->q;
}

static void xe_migrate_fini(struct drm_device *dev, void *arg)
{
	struct xe_migrate *m = arg;

	xe_vm_lock(m->q->vm, false);
	xe_bo_unpin(m->pt_bo);
	xe_vm_unlock(m->q->vm);

	dma_fence_put(m->fence);
	xe_bo_put(m->pt_bo);
	drm_suballoc_manager_fini(&m->vm_update_sa);
	mutex_destroy(&m->job_mutex);
	xe_vm_close_and_put(m->q->vm);
	xe_exec_queue_put(m->q);
}

static u64 xe_migrate_vm_addr(u64 slot, u32 level)
{
	XE_WARN_ON(slot >= NUM_PT_SLOTS);

	/* First slot is reserved for mapping of PT bo and bb, start from 1 */
	return (slot + 1ULL) << xe_pt_shift(level + 1);
}

static u64 xe_migrate_vram_ofs(struct xe_device *xe, u64 addr)
{
	/*
	 * Remove the DPA to get a correct offset into identity table for the
	 * migrate offset
	 */
	addr -= xe->mem.vram.dpa_base;
	return addr + (256ULL << xe_pt_shift(2));
}

static int xe_migrate_prepare_vm(struct xe_tile *tile, struct xe_migrate *m,
				 struct xe_vm *vm)
{
	struct xe_device *xe = tile_to_xe(tile);
	u16 pat_index = xe->pat.idx[XE_CACHE_WB];
	u8 id = tile->id;
	u32 num_entries = NUM_PT_SLOTS, num_level = vm->pt_root[id]->level;
	u32 map_ofs, level, i;
	struct xe_bo *bo, *batch = tile->mem.kernel_bb_pool->bo;
	u64 entry;

	/* Can't bump NUM_PT_SLOTS too high */
	BUILD_BUG_ON(NUM_PT_SLOTS > SZ_2M/XE_PAGE_SIZE);
	/* Must be a multiple of 64K to support all platforms */
	BUILD_BUG_ON(NUM_PT_SLOTS * XE_PAGE_SIZE % SZ_64K);
	/* And one slot reserved for the 4KiB page table updates */
	BUILD_BUG_ON(!(NUM_KERNEL_PDE & 1));

	/* Need to be sure everything fits in the first PT, or create more */
	xe_tile_assert(tile, m->batch_base_ofs + batch->size < SZ_2M);

	bo = xe_bo_create_pin_map(vm->xe, tile, vm,
				  num_entries * XE_PAGE_SIZE,
				  ttm_bo_type_kernel,
				  XE_BO_CREATE_VRAM_IF_DGFX(tile) |
				  XE_BO_CREATE_PINNED_BIT);
	if (IS_ERR(bo))
		return PTR_ERR(bo);

	entry = vm->pt_ops->pde_encode_bo(bo, bo->size - XE_PAGE_SIZE, pat_index);
	xe_pt_write(xe, &vm->pt_root[id]->bo->vmap, 0, entry);

	map_ofs = (num_entries - num_level) * XE_PAGE_SIZE;

	/* Map the entire BO in our level 0 pt */
	for (i = 0, level = 0; i < num_entries; level++) {
		entry = vm->pt_ops->pte_encode_bo(bo, i * XE_PAGE_SIZE,
						  pat_index, 0);

		xe_map_wr(xe, &bo->vmap, map_ofs + level * 8, u64, entry);

		if (vm->flags & XE_VM_FLAG_64K)
			i += 16;
		else
			i += 1;
	}

	if (!IS_DGFX(xe)) {
		/* Write out batch too */
		m->batch_base_ofs = NUM_PT_SLOTS * XE_PAGE_SIZE;
		if (xe->info.has_usm) {
			batch = tile->primary_gt->usm.bb_pool->bo;
			m->usm_batch_base_ofs = m->batch_base_ofs;
		}

		for (i = 0; i < batch->size;
		     i += vm->flags & XE_VM_FLAG_64K ?
		     XE_64K_PAGE_SIZE : XE_PAGE_SIZE) {
			entry = vm->pt_ops->pte_encode_bo(batch, i,
							  pat_index, 0);

			xe_map_wr(xe, &bo->vmap, map_ofs + level * 8, u64,
				  entry);
			level++;
		}
	} else {
		u64 batch_addr = xe_bo_addr(batch, 0, XE_PAGE_SIZE);

		m->batch_base_ofs = xe_migrate_vram_ofs(xe, batch_addr);

		if (xe->info.has_usm) {
			batch = tile->primary_gt->usm.bb_pool->bo;
			batch_addr = xe_bo_addr(batch, 0, XE_PAGE_SIZE);
			m->usm_batch_base_ofs = xe_migrate_vram_ofs(xe, batch_addr);
		}
	}

	for (level = 1; level < num_level; level++) {
		u32 flags = 0;

		if (vm->flags & XE_VM_FLAG_64K && level == 1)
			flags = XE_PDE_64K;

		entry = vm->pt_ops->pde_encode_bo(bo, map_ofs + (level - 1) *
						  XE_PAGE_SIZE, pat_index);
		xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE * level, u64,
			  entry | flags);
	}

	/* Write PDE's that point to our BO. */
	for (i = 0; i < num_entries - num_level; i++) {
		entry = vm->pt_ops->pde_encode_bo(bo, i * XE_PAGE_SIZE,
						  pat_index);

		xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE +
			  (i + 1) * 8, u64, entry);
	}

	/* Set up a 1GiB NULL mapping at 255GiB offset. */
	level = 2;
	xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE * level + 255 * 8, u64,
		  vm->pt_ops->pte_encode_addr(xe, 0, pat_index, level, IS_DGFX(xe), 0)
		  | XE_PTE_NULL);
	m->cleared_mem_ofs = (255ULL << xe_pt_shift(level));

	/* Identity map the entire vram at 256GiB offset */
	if (IS_DGFX(xe)) {
		u64 pos, ofs, flags;

		level = 2;
		ofs = map_ofs + XE_PAGE_SIZE * level + 256 * 8;
		flags = vm->pt_ops->pte_encode_addr(xe, 0, pat_index, level,
						    true, 0);

		/*
		 * Use 1GB pages; it shouldn't matter that the physical amount
		 * of vram is smaller, as long as we don't access the excess
		 * range.
		 */
		for (pos = xe->mem.vram.dpa_base;
		     pos < xe->mem.vram.actual_physical_size + xe->mem.vram.dpa_base;
		     pos += SZ_1G, ofs += 8)
			xe_map_wr(xe, &bo->vmap, ofs, u64, pos | flags);
	}

	/*
	 * Example layout created above, with root level = 3:
	 * [PT0...PT7]: kernel PT's for copy/clear; 64 or 4KiB PTE's
	 * [PT8]: Kernel PT for VM_BIND, 4 KiB PTE's
	 * [PT9...PT28]: Userspace PT's for VM_BIND, 4 KiB PTE's
	 * [PT29 = PDE 0] [PT30 = PDE 1] [PT31 = PDE 2]
	 *
	 * This makes the lowest part of the VM point to the pagetables.
	 * Hence the lowest 2M in the vm points to itself, and with a few
	 * writes and flushes, other parts of the VM can be used for either
	 * copying or clearing.
	 *
	 * For performance, the kernel reserves PDE's, so about 20 are left
	 * for async VM updates.
	 *
	 * To make it easier to work with, each scratch PT is put in slot
	 * (1 + PT #) everywhere, which allows lockless updates to scratch
	 * pages by using the different addresses in the VM.
	 */
#define NUM_VMUSA_UNIT_PER_PAGE	32
#define VM_SA_UPDATE_UNIT_SIZE		(XE_PAGE_SIZE / NUM_VMUSA_UNIT_PER_PAGE)
#define NUM_VMUSA_WRITES_PER_UNIT	(VM_SA_UPDATE_UNIT_SIZE / sizeof(u64))
	drm_suballoc_manager_init(&m->vm_update_sa,
				  (map_ofs / XE_PAGE_SIZE - NUM_KERNEL_PDE) *
				  NUM_VMUSA_UNIT_PER_PAGE, 0);

	m->pt_bo = bo;
	return 0;
}

/*
 * Due to workaround 16017236439, odd instance hardware copy engines are
 * faster than even instance ones.
 * This function returns the mask involving all fast copy engines and the
 * reserved copy engine to be used as logical mask for migrate engine.
 * Including the reserved copy engine is required to avoid deadlocks caused by
 * migrate jobs servicing faults getting stuck behind the job that faulted.
 */
static u32 xe_migrate_usm_logical_mask(struct xe_gt *gt)
{
	u32 logical_mask = 0;
	struct xe_hw_engine *hwe;
	enum xe_hw_engine_id id;

	for_each_hw_engine(hwe, gt, id) {
		if (hwe->class != XE_ENGINE_CLASS_COPY)
			continue;

		if (!XE_WA(gt, 16017236439) ||
		    xe_gt_is_usm_hwe(gt, hwe) || hwe->instance & 1)
			logical_mask |= BIT(hwe->logical_instance);
	}

	return logical_mask;
}

/**
 * xe_migrate_init() - Initialize a migrate context
 * @tile: Back-pointer to the tile we're initializing for.
 *
 * Return: Pointer to a migrate context on success. Error pointer on error.
 */
struct xe_migrate *xe_migrate_init(struct xe_tile *tile)
{
	struct xe_device *xe = tile_to_xe(tile);
	struct xe_gt *primary_gt = tile->primary_gt;
	struct xe_migrate *m;
	struct xe_vm *vm;
	int err;

	m = drmm_kzalloc(&xe->drm, sizeof(*m), GFP_KERNEL);
	if (!m)
		return ERR_PTR(-ENOMEM);

	m->tile = tile;

	/* Special layout, prepared below.. */
	vm = xe_vm_create(xe, XE_VM_FLAG_MIGRATION |
			  XE_VM_FLAG_SET_TILE_ID(tile));
	if (IS_ERR(vm))
		return ERR_CAST(vm);

	xe_vm_lock(vm, false);
	err = xe_migrate_prepare_vm(tile, m, vm);
	xe_vm_unlock(vm);
	if (err) {
		xe_vm_close_and_put(vm);
		return ERR_PTR(err);
	}

	if (xe->info.has_usm) {
		struct xe_hw_engine *hwe = xe_gt_hw_engine(primary_gt,
							   XE_ENGINE_CLASS_COPY,
							   primary_gt->usm.reserved_bcs_instance,
							   false);
		u32 logical_mask = xe_migrate_usm_logical_mask(primary_gt);

		if (!hwe || !logical_mask)
			return ERR_PTR(-EINVAL);

		m->q = xe_exec_queue_create(xe, vm, logical_mask, 1, hwe,
					    EXEC_QUEUE_FLAG_KERNEL |
					    EXEC_QUEUE_FLAG_PERMANENT |
					    EXEC_QUEUE_FLAG_HIGH_PRIORITY);
	} else {
		m->q = xe_exec_queue_create_class(xe, primary_gt, vm,
						  XE_ENGINE_CLASS_COPY,
						  EXEC_QUEUE_FLAG_KERNEL |
						  EXEC_QUEUE_FLAG_PERMANENT);
	}
	if (IS_ERR(m->q)) {
		xe_vm_close_and_put(vm);
		return ERR_CAST(m->q);
	}

	mutex_init(&m->job_mutex);

	err = drmm_add_action_or_reset(&xe->drm, xe_migrate_fini, m);
	if (err)
		return ERR_PTR(err);

	if (IS_DGFX(xe)) {
		if (xe_device_has_flat_ccs(xe))
			/* min chunk size corresponds to 4K of CCS Metadata */
			m->min_chunk_size = SZ_4K * SZ_64K /
				xe_device_ccs_bytes(xe, SZ_64K);
		else
			/* Somewhat arbitrary to avoid a huge amount of blits */
			m->min_chunk_size = SZ_64K;
		m->min_chunk_size = roundup_pow_of_two(m->min_chunk_size);
		drm_dbg(&xe->drm, "Migrate min chunk size is 0x%08llx\n",
			(unsigned long long)m->min_chunk_size);
	}

	return m;
}

static u64 max_mem_transfer_per_pass(struct xe_device *xe)
{
	if (!IS_DGFX(xe) && xe_device_has_flat_ccs(xe))
		return MAX_CCS_LIMITED_TRANSFER;

	return MAX_PREEMPTDISABLE_TRANSFER;
}

static u64 xe_migrate_res_sizes(struct xe_migrate *m, struct xe_res_cursor *cur)
{
	struct xe_device *xe = tile_to_xe(m->tile);
	u64 size = min_t(u64, max_mem_transfer_per_pass(xe), cur->remaining);

	if (mem_type_is_vram(cur->mem_type)) {
		/*
		 * For VRAM we want to blit in chunks with sizes aligned to
		 * min_chunk_size in order for the offset to CCS metadata to be
		 * page-aligned. If it's the last chunk it may be smaller.
		 *
		 * Another constraint is that we need to limit the blit to
		 * the VRAM block size, unless size is smaller than
		 * min_chunk_size.
		 */
		u64 chunk = max_t(u64, cur->size, m->min_chunk_size);

		size = min_t(u64, size, chunk);
		if (size > m->min_chunk_size)
			size = round_down(size, m->min_chunk_size);
	}

	return size;
}

static bool xe_migrate_allow_identity(u64 size, const struct xe_res_cursor *cur)
{
	/* If the chunk is not fragmented, allow identity map. */
	return cur->size >= size;
}

/*
 * Compute the number of batch-buffer dwords needed to set up the PTEs for one
 * chunk, clipping *L0 to what fits in the available page-table slots. When the
 * chunk can be accessed through the VRAM identity map, only the blit command
 * itself is accounted for and *L0_ofs is the identity-map offset.
 */
static u32 pte_update_size(struct xe_migrate *m,
			   bool is_vram,
			   struct ttm_resource *res,
			   struct xe_res_cursor *cur,
			   u64 *L0, u64 *L0_ofs, u32 *L0_pt,
			   u32 cmd_size, u32 pt_ofs, u32 avail_pts)
{
	u32 cmds = 0;

	*L0_pt = pt_ofs;
	if (is_vram && xe_migrate_allow_identity(*L0, cur)) {
		/* Offset into identity map. */
		*L0_ofs = xe_migrate_vram_ofs(tile_to_xe(m->tile),
					      cur->start + vram_region_gpu_offset(res));
		cmds += cmd_size;
	} else {
		/* Clip L0 to available size */
		u64 size = min(*L0, (u64)avail_pts * SZ_2M);
		u64 num_4k_pages = DIV_ROUND_UP(size, XE_PAGE_SIZE);

		*L0 = size;
		*L0_ofs = xe_migrate_vm_addr(pt_ofs, 0);

		/* MI_STORE_DATA_IMM */
		cmds += 3 * DIV_ROUND_UP(num_4k_pages, 0x1ff);

		/* PDE qwords */
		cmds += num_4k_pages * 2;

		/* Each chunk has a single blit command */
		cmds += cmd_size;
	}

	return cmds;
}

static void emit_pte(struct xe_migrate *m,
		     struct xe_bb *bb, u32 at_pt,
		     bool is_vram, bool is_comp_pte,
		     struct xe_res_cursor *cur,
		     u32 size, struct ttm_resource *res)
{
	struct xe_device *xe = tile_to_xe(m->tile);
	struct xe_vm *vm = m->q->vm;
	u16 pat_index;
	u32 ptes;
	u64 ofs = at_pt * XE_PAGE_SIZE;
	u64 cur_ofs;

	/* Indirect access needs compression enabled uncached PAT index */
	if (GRAPHICS_VERx100(xe) >= 2000)
		pat_index = is_comp_pte ?
			xe->pat.idx[XE_CACHE_NONE_COMPRESSION] :
			xe->pat.idx[XE_CACHE_WB];
	else
		pat_index = xe->pat.idx[XE_CACHE_WB];

	ptes = DIV_ROUND_UP(size, XE_PAGE_SIZE);

	while (ptes) {
		u32 chunk = min(0x1ffU, ptes);

		bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk);
		bb->cs[bb->len++] = ofs;
		bb->cs[bb->len++] = 0;

		cur_ofs = ofs;
		ofs += chunk * 8;
		ptes -= chunk;

		while (chunk--) {
			u64 addr, flags = 0;
			bool devmem = false;

			addr = xe_res_dma(cur) & PAGE_MASK;
			if (is_vram) {
				if (vm->flags & XE_VM_FLAG_64K) {
					u64 va = cur_ofs * XE_PAGE_SIZE / 8;

					xe_assert(xe, (va & (SZ_64K - 1)) ==
						  (addr & (SZ_64K - 1)));

					flags |= XE_PTE_PS64;
				}

				addr += vram_region_gpu_offset(res);
				devmem = true;
			}

			addr = vm->pt_ops->pte_encode_addr(m->tile->xe,
							   addr, pat_index,
							   0, devmem, flags);
			bb->cs[bb->len++] = lower_32_bits(addr);
			bb->cs[bb->len++] = upper_32_bits(addr);

			xe_res_next(cur, min_t(u32, size, PAGE_SIZE));
			cur_ofs += 8;
		}
	}
}

#define EMIT_COPY_CCS_DW 5
static void emit_copy_ccs(struct xe_gt *gt, struct xe_bb *bb,
			  u64 dst_ofs, bool dst_is_indirect,
			  u64 src_ofs, bool src_is_indirect,
			  u32 size)
{
	struct xe_device *xe = gt_to_xe(gt);
	u32 *cs = bb->cs + bb->len;
	u32 num_ccs_blks;
	u32 num_pages;
	u32 ccs_copy_size;
	u32 mocs;

	if (GRAPHICS_VERx100(xe) >= 2000) {
		num_pages = DIV_ROUND_UP(size, XE_PAGE_SIZE);
		xe_gt_assert(gt, FIELD_FIT(XE2_CCS_SIZE_MASK, num_pages - 1));

		ccs_copy_size = REG_FIELD_PREP(XE2_CCS_SIZE_MASK, num_pages - 1);
		mocs = FIELD_PREP(XE2_XY_CTRL_SURF_MOCS_INDEX_MASK, gt->mocs.uc_index);

	} else {
		num_ccs_blks = DIV_ROUND_UP(xe_device_ccs_bytes(gt_to_xe(gt), size),
					    NUM_CCS_BYTES_PER_BLOCK);
		xe_gt_assert(gt, FIELD_FIT(CCS_SIZE_MASK, num_ccs_blks - 1));

		ccs_copy_size = REG_FIELD_PREP(CCS_SIZE_MASK, num_ccs_blks - 1);
		mocs = FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, gt->mocs.uc_index);
	}

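	/*
	 * XY_CTRL_SURF_COPY_BLT is EMIT_COPY_CCS_DW (5) dwords: the command
	 * dword carrying the access types and the CCS copy size, then the
	 * lower/upper halves of the source and destination offsets, with the
	 * uncached MOCS index packed into the upper halves.
	 */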
	*cs++ = XY_CTRL_SURF_COPY_BLT |
		(src_is_indirect ? 0x0 : 0x1) << SRC_ACCESS_TYPE_SHIFT |
		(dst_is_indirect ? 0x0 : 0x1) << DST_ACCESS_TYPE_SHIFT |
		ccs_copy_size;
	*cs++ = lower_32_bits(src_ofs);
	*cs++ = upper_32_bits(src_ofs) | mocs;
	*cs++ = lower_32_bits(dst_ofs);
	*cs++ = upper_32_bits(dst_ofs) | mocs;

	bb->len = cs - bb->cs;
}

#define EMIT_COPY_DW 10
static void emit_copy(struct xe_gt *gt, struct xe_bb *bb,
		      u64 src_ofs, u64 dst_ofs, unsigned int size,
		      unsigned int pitch)
{
	struct xe_device *xe = gt_to_xe(gt);
	u32 mocs = 0;
	u32 tile_y = 0;

	xe_gt_assert(gt, size / pitch <= S16_MAX);
	xe_gt_assert(gt, pitch / 4 <= S16_MAX);
	xe_gt_assert(gt, pitch <= U16_MAX);

	if (GRAPHICS_VER(xe) >= 20)
		mocs = FIELD_PREP(XE2_XY_FAST_COPY_BLT_MOCS_INDEX_MASK, gt->mocs.uc_index);

	if (GRAPHICS_VERx100(xe) >= 1250)
		tile_y = XY_FAST_COPY_BLT_D1_SRC_TILE4 | XY_FAST_COPY_BLT_D1_DST_TILE4;

	bb->cs[bb->len++] = XY_FAST_COPY_BLT_CMD | (10 - 2);
	bb->cs[bb->len++] = XY_FAST_COPY_BLT_DEPTH_32 | pitch | tile_y | mocs;
	bb->cs[bb->len++] = 0;
	bb->cs[bb->len++] = (size / pitch) << 16 | pitch / 4;
	bb->cs[bb->len++] = lower_32_bits(dst_ofs);
	bb->cs[bb->len++] = upper_32_bits(dst_ofs);
	bb->cs[bb->len++] = 0;
	bb->cs[bb->len++] = pitch | mocs;
	bb->cs[bb->len++] = lower_32_bits(src_ofs);
	bb->cs[bb->len++] = upper_32_bits(src_ofs);
}

static int job_add_deps(struct xe_sched_job *job, struct dma_resv *resv,
			enum dma_resv_usage usage)
{
	return drm_sched_job_add_resv_dependencies(&job->drm, resv, usage);
}

static u64 xe_migrate_batch_base(struct xe_migrate *m, bool usm)
{
	return usm ? m->usm_batch_base_ofs : m->batch_base_ofs;
}

static u32 xe_migrate_ccs_copy(struct xe_migrate *m,
			       struct xe_bb *bb,
			       u64 src_ofs, bool src_is_indirect,
			       u64 dst_ofs, bool dst_is_indirect, u32 dst_size,
			       u64 ccs_ofs, bool copy_ccs)
{
	struct xe_gt *gt = m->tile->primary_gt;
	u32 flush_flags = 0;

	if (xe_device_has_flat_ccs(gt_to_xe(gt)) && !copy_ccs && dst_is_indirect) {
		/*
		 * If the src is already in vram, then it should already
		 * have been cleared by us, or has been populated by the
		 * user. Make sure we copy the CCS aux state as-is.
		 *
		 * Otherwise if the bo doesn't have any CCS metadata attached,
		 * we still need to clear it for security reasons.
		 */
		u64 ccs_src_ofs = src_is_indirect ? src_ofs : m->cleared_mem_ofs;

		emit_copy_ccs(gt, bb,
			      dst_ofs, true,
			      ccs_src_ofs, src_is_indirect, dst_size);

		flush_flags = MI_FLUSH_DW_CCS;
	} else if (copy_ccs) {
		if (!src_is_indirect)
			src_ofs = ccs_ofs;
		else if (!dst_is_indirect)
			dst_ofs = ccs_ofs;

		xe_gt_assert(gt, src_is_indirect || dst_is_indirect);

		emit_copy_ccs(gt, bb, dst_ofs, dst_is_indirect, src_ofs,
			      src_is_indirect, dst_size);
		if (dst_is_indirect)
			flush_flags = MI_FLUSH_DW_CCS;
	}

	return flush_flags;
}

/**
 * xe_migrate_copy() - Copy content of TTM resources.
 * @m: The migration context.
 * @src_bo: The buffer object @src is currently bound to.
 * @dst_bo: If copying between resources created for the same bo, set this to
 * the same value as @src_bo. If copying between buffer objects, set it to
 * the buffer object @dst is currently bound to.
 * @src: The source TTM resource.
 * @dst: The dst TTM resource.
 * @copy_only_ccs: If true, copy only CCS metadata
 *
 * Copies the contents of @src to @dst: On flat CCS devices,
 * the CCS metadata is copied as well if needed, or if not present,
 * the CCS metadata of @dst is cleared for security reasons.
 *
 * Return: Pointer to a dma_fence representing the last copy batch, or
 * an error pointer on failure. If there is a failure, any copy operation
 * started by the function call has been synced.
 */
struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
				  struct xe_bo *src_bo,
				  struct xe_bo *dst_bo,
				  struct ttm_resource *src,
				  struct ttm_resource *dst,
				  bool copy_only_ccs)
{
	struct xe_gt *gt = m->tile->primary_gt;
	struct xe_device *xe = gt_to_xe(gt);
	struct dma_fence *fence = NULL;
	u64 size = src_bo->size;
	struct xe_res_cursor src_it, dst_it, ccs_it;
	u64 src_L0_ofs, dst_L0_ofs;
	u32 src_L0_pt, dst_L0_pt;
	u64 src_L0, dst_L0;
	int pass = 0;
	int err;
	bool src_is_pltt = src->mem_type == XE_PL_TT;
	bool dst_is_pltt = dst->mem_type == XE_PL_TT;
	bool src_is_vram = mem_type_is_vram(src->mem_type);
	bool dst_is_vram = mem_type_is_vram(dst->mem_type);
	bool copy_ccs = xe_device_has_flat_ccs(xe) &&
		xe_bo_needs_ccs_pages(src_bo) && xe_bo_needs_ccs_pages(dst_bo);
	bool copy_system_ccs = copy_ccs && (!src_is_vram || !dst_is_vram);

	/* Copying CCS between two different BOs is not supported yet. */
	if (XE_WARN_ON(copy_ccs && src_bo != dst_bo))
		return ERR_PTR(-EINVAL);

	if (src_bo != dst_bo && XE_WARN_ON(src_bo->size != dst_bo->size))
		return ERR_PTR(-EINVAL);

	if (!src_is_vram)
		xe_res_first_sg(xe_bo_sg(src_bo), 0, size, &src_it);
	else
		xe_res_first(src, 0, size, &src_it);
	if (!dst_is_vram)
		xe_res_first_sg(xe_bo_sg(dst_bo), 0, size, &dst_it);
	else
		xe_res_first(dst, 0, size, &dst_it);

	if (copy_system_ccs)
		xe_res_first_sg(xe_bo_sg(src_bo), xe_bo_ccs_pages_start(src_bo),
				PAGE_ALIGN(xe_device_ccs_bytes(xe, size)),
				&ccs_it);

	while (size) {
		u32 batch_size = 2; /* arb_clear() + MI_BATCH_BUFFER_END */
		struct xe_sched_job *job;
		struct xe_bb *bb;
		u32 flush_flags;
		u32 update_idx;
		u64 ccs_ofs, ccs_size;
		u32 ccs_pt;

		bool usm = xe->info.has_usm;
		u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE;

		src_L0 = xe_migrate_res_sizes(m, &src_it);
		dst_L0 = xe_migrate_res_sizes(m, &dst_it);

		drm_dbg(&xe->drm, "Pass %u, sizes: %llu & %llu\n",
			pass++, src_L0, dst_L0);

		src_L0 = min(src_L0, dst_L0);

		batch_size += pte_update_size(m, src_is_vram, src, &src_it, &src_L0,
					      &src_L0_ofs, &src_L0_pt, 0, 0,
					      avail_pts);

		batch_size += pte_update_size(m, dst_is_vram, dst, &dst_it, &src_L0,
					      &dst_L0_ofs, &dst_L0_pt, 0,
					      avail_pts, avail_pts);

		if (copy_system_ccs) {
			ccs_size = xe_device_ccs_bytes(xe, src_L0);
			batch_size += pte_update_size(m, false, NULL, &ccs_it, &ccs_size,
						      &ccs_ofs, &ccs_pt, 0,
						      2 * avail_pts,
						      avail_pts);
			xe_assert(xe, IS_ALIGNED(ccs_it.start, PAGE_SIZE));
		}

		/* Add copy commands size here */
		batch_size += ((copy_only_ccs) ? 0 : EMIT_COPY_DW) +
			((xe_device_has_flat_ccs(xe) ?
			  EMIT_COPY_CCS_DW : 0));

		bb = xe_bb_new(gt, batch_size, usm);
		if (IS_ERR(bb)) {
			err = PTR_ERR(bb);
			goto err_sync;
		}

		if (src_is_vram && xe_migrate_allow_identity(src_L0, &src_it))
			xe_res_next(&src_it, src_L0);
		else
			emit_pte(m, bb, src_L0_pt, src_is_vram, copy_system_ccs,
				 &src_it, src_L0, src);

		if (dst_is_vram && xe_migrate_allow_identity(src_L0, &dst_it))
			xe_res_next(&dst_it, src_L0);
		else
			emit_pte(m, bb, dst_L0_pt, dst_is_vram, copy_system_ccs,
				 &dst_it, src_L0, dst);

		if (copy_system_ccs)
			emit_pte(m, bb, ccs_pt, false, false, &ccs_it, ccs_size, src);

		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
		update_idx = bb->len;

		if (!copy_only_ccs)
			emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, src_L0, XE_PAGE_SIZE);

		flush_flags = xe_migrate_ccs_copy(m, bb, src_L0_ofs,
						  IS_DGFX(xe) ? src_is_vram : src_is_pltt,
						  dst_L0_ofs,
						  IS_DGFX(xe) ? dst_is_vram : dst_is_pltt,
						  src_L0, ccs_ofs, copy_ccs);

		mutex_lock(&m->job_mutex);
		job = xe_bb_create_migration_job(m->q, bb,
						 xe_migrate_batch_base(m, usm),
						 update_idx);
		if (IS_ERR(job)) {
			err = PTR_ERR(job);
			goto err;
		}

		xe_sched_job_add_migrate_flush(job, flush_flags);
		if (!fence) {
			err = job_add_deps(job, src_bo->ttm.base.resv,
					   DMA_RESV_USAGE_BOOKKEEP);
			if (!err && src_bo != dst_bo)
				err = job_add_deps(job, dst_bo->ttm.base.resv,
						   DMA_RESV_USAGE_BOOKKEEP);
			if (err)
				goto err_job;
		}

		xe_sched_job_arm(job);
		dma_fence_put(fence);
		fence = dma_fence_get(&job->drm.s_fence->finished);
		xe_sched_job_push(job);

		dma_fence_put(m->fence);
		m->fence = dma_fence_get(fence);

		mutex_unlock(&m->job_mutex);

		xe_bb_free(bb, fence);
		size -= src_L0;
		continue;

err_job:
		xe_sched_job_put(job);
err:
		mutex_unlock(&m->job_mutex);
		xe_bb_free(bb, NULL);

err_sync:
		/* Sync partial copy if any. FIXME: under job_mutex? */
		if (fence) {
			dma_fence_wait(fence, false);
			dma_fence_put(fence);
		}

		return ERR_PTR(err);
	}

	return fence;
}

static void emit_clear_link_copy(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs,
				 u32 size, u32 pitch)
{
	struct xe_device *xe = gt_to_xe(gt);
	u32 *cs = bb->cs + bb->len;
	u32 len = PVC_MEM_SET_CMD_LEN_DW;

	*cs++ = PVC_MEM_SET_CMD | PVC_MEM_SET_MATRIX | (len - 2);
	*cs++ = pitch - 1;
	*cs++ = (size / pitch) - 1;
	*cs++ = pitch - 1;
	*cs++ = lower_32_bits(src_ofs);
	*cs++ = upper_32_bits(src_ofs);
	if (GRAPHICS_VERx100(xe) >= 2000)
		*cs++ = FIELD_PREP(XE2_MEM_SET_MOCS_INDEX_MASK, gt->mocs.uc_index);
	else
		*cs++ = FIELD_PREP(PVC_MEM_SET_MOCS_INDEX_MASK, gt->mocs.uc_index);

	xe_gt_assert(gt, cs - bb->cs == len + bb->len);

	bb->len += len;
}

static void emit_clear_main_copy(struct xe_gt *gt, struct xe_bb *bb,
				 u64 src_ofs, u32 size, u32 pitch, bool is_vram)
{
	struct xe_device *xe = gt_to_xe(gt);
	u32 *cs = bb->cs + bb->len;
	u32 len = XY_FAST_COLOR_BLT_DW;

	if (GRAPHICS_VERx100(xe) < 1250)
		len = 11;

	*cs++ = XY_FAST_COLOR_BLT_CMD | XY_FAST_COLOR_BLT_DEPTH_32 |
		(len - 2);
	if (GRAPHICS_VERx100(xe) >= 2000)
		*cs++ = FIELD_PREP(XE2_XY_FAST_COLOR_BLT_MOCS_INDEX_MASK, gt->mocs.uc_index) |
			(pitch - 1);
	else
		*cs++ = FIELD_PREP(XY_FAST_COLOR_BLT_MOCS_MASK, gt->mocs.uc_index) |
			(pitch - 1);
	*cs++ = 0;
	*cs++ = (size / pitch) << 16 | pitch / 4;
	*cs++ = lower_32_bits(src_ofs);
	*cs++ = upper_32_bits(src_ofs);
	*cs++ = (is_vram ? 0x0 : 0x1) << XY_FAST_COLOR_BLT_MEM_TYPE_SHIFT;
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = 0;

	if (len > 11) {
		*cs++ = 0;
		*cs++ = 0;
		*cs++ = 0;
		*cs++ = 0;
		*cs++ = 0;
	}

	xe_gt_assert(gt, cs - bb->cs == len + bb->len);

	bb->len += len;
}

static bool has_service_copy_support(struct xe_gt *gt)
{
	/*
	 * What we care about is whether the architecture was designed with
	 * service copy functionality (specifically the new MEM_SET / MEM_COPY
	 * instructions) so check the architectural engine list rather than the
	 * actual list since these instructions are usable on BCS0 even if
	 * all of the actual service copy engines (BCS1-BCS8) have been fused
	 * off.
	 */
	return gt->info.__engine_mask & GENMASK(XE_HW_ENGINE_BCS8,
						XE_HW_ENGINE_BCS1);
}

static u32 emit_clear_cmd_len(struct xe_gt *gt)
{
	if (has_service_copy_support(gt))
		return PVC_MEM_SET_CMD_LEN_DW;
	else
		return XY_FAST_COLOR_BLT_DW;
}

static void emit_clear(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs,
		       u32 size, u32 pitch, bool is_vram)
{
	if (has_service_copy_support(gt))
		emit_clear_link_copy(gt, bb, src_ofs, size, pitch);
	else
		emit_clear_main_copy(gt, bb, src_ofs, size, pitch,
				     is_vram);
}

/**
 * xe_migrate_clear() - Clear content of a TTM resource.
 * @m: The migration context.
 * @bo: The buffer object @dst is currently bound to.
 * @dst: The dst TTM resource to be cleared.
 *
 * Clear the contents of @dst to zero. On flat CCS devices,
 * the CCS metadata is cleared to zero as well on VRAM destinations.
 * TODO: Eliminate the @bo argument.
 *
 * Return: Pointer to a dma_fence representing the last clear batch, or
 * an error pointer on failure.
 * If there is a failure, any clear operation
 * started by the function call has been synced.
 */
struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
				   struct xe_bo *bo,
				   struct ttm_resource *dst)
{
	bool clear_vram = mem_type_is_vram(dst->mem_type);
	struct xe_gt *gt = m->tile->primary_gt;
	struct xe_device *xe = gt_to_xe(gt);
	bool clear_system_ccs = (xe_bo_needs_ccs_pages(bo) && !IS_DGFX(xe)) ? true : false;
	struct dma_fence *fence = NULL;
	u64 size = bo->size;
	struct xe_res_cursor src_it;
	struct ttm_resource *src = dst;
	int err;
	int pass = 0;

	if (!clear_vram)
		xe_res_first_sg(xe_bo_sg(bo), 0, bo->size, &src_it);
	else
		xe_res_first(src, 0, bo->size, &src_it);

	while (size) {
		u64 clear_L0_ofs;
		u32 clear_L0_pt;
		u32 flush_flags = 0;
		u64 clear_L0;
		struct xe_sched_job *job;
		struct xe_bb *bb;
		u32 batch_size, update_idx;

		bool usm = xe->info.has_usm;
		u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE;

		clear_L0 = xe_migrate_res_sizes(m, &src_it);

		drm_dbg(&xe->drm, "Pass %u, size: %llu\n", pass++, clear_L0);

		/* Calculate final sizes and batch size.. */
		batch_size = 2 +
			pte_update_size(m, clear_vram, src, &src_it,
					&clear_L0, &clear_L0_ofs, &clear_L0_pt,
					clear_system_ccs ? 0 : emit_clear_cmd_len(gt), 0,
					avail_pts);

		if (xe_device_has_flat_ccs(xe))
			batch_size += EMIT_COPY_CCS_DW;

		/* Clear commands */

		if (WARN_ON_ONCE(!clear_L0))
			break;

		bb = xe_bb_new(gt, batch_size, usm);
		if (IS_ERR(bb)) {
			err = PTR_ERR(bb);
			goto err_sync;
		}

		size -= clear_L0;
		/* Preemption is enabled again by the ring ops. */
		if (clear_vram && xe_migrate_allow_identity(clear_L0, &src_it))
			xe_res_next(&src_it, clear_L0);
		else
			emit_pte(m, bb, clear_L0_pt, clear_vram, clear_system_ccs,
				 &src_it, clear_L0, dst);

		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
		update_idx = bb->len;

		if (!clear_system_ccs)
			emit_clear(gt, bb, clear_L0_ofs, clear_L0, XE_PAGE_SIZE, clear_vram);

		if (xe_device_has_flat_ccs(xe)) {
			emit_copy_ccs(gt, bb, clear_L0_ofs, true,
				      m->cleared_mem_ofs, false, clear_L0);
			flush_flags = MI_FLUSH_DW_CCS;
		}

		mutex_lock(&m->job_mutex);
		job = xe_bb_create_migration_job(m->q, bb,
						 xe_migrate_batch_base(m, usm),
						 update_idx);
		if (IS_ERR(job)) {
			err = PTR_ERR(job);
			goto err;
		}

		xe_sched_job_add_migrate_flush(job, flush_flags);
		if (!fence) {
			/*
			 * There can't be anything userspace related at this
			 * point, so we just need to respect any potential move
			 * fences, which are always tracked as
			 * DMA_RESV_USAGE_KERNEL.
			 */
			err = job_add_deps(job, bo->ttm.base.resv,
					   DMA_RESV_USAGE_KERNEL);
			if (err)
				goto err_job;
		}

		xe_sched_job_arm(job);
		dma_fence_put(fence);
		fence = dma_fence_get(&job->drm.s_fence->finished);
		xe_sched_job_push(job);

		dma_fence_put(m->fence);
		m->fence = dma_fence_get(fence);

		mutex_unlock(&m->job_mutex);

		xe_bb_free(bb, fence);
		continue;

err_job:
		xe_sched_job_put(job);
err:
		mutex_unlock(&m->job_mutex);
		xe_bb_free(bb, NULL);
err_sync:
		/* Sync partial copies if any. FIXME: job_mutex? */
		if (fence) {
			dma_fence_wait(m->fence, false);
			dma_fence_put(fence);
		}

		return ERR_PTR(err);
	}

	if (clear_system_ccs)
		bo->ccs_cleared = true;

	return fence;
}

static void write_pgtable(struct xe_tile *tile, struct xe_bb *bb, u64 ppgtt_ofs,
			  const struct xe_vm_pgtable_update *update,
			  struct xe_migrate_pt_update *pt_update)
{
	const struct xe_migrate_pt_update_ops *ops = pt_update->ops;
	u32 chunk;
	u32 ofs = update->ofs, size = update->qwords;

	/*
	 * If we have 512 entries (max), we would populate it ourselves,
	 * and update the PDE above it to the new pointer.
	 * The only time this can happen is if we have to update the top
	 * PDE. This requires a BO that is almost vm->size big.
	 *
	 * This shouldn't be possible in practice.. might change when 16K
	 * pages are used. Hence the assert.
	 */
	xe_tile_assert(tile, update->qwords <= 0x1ff);
	if (!ppgtt_ofs)
		ppgtt_ofs = xe_migrate_vram_ofs(tile_to_xe(tile),
						xe_bo_addr(update->pt_bo, 0,
							   XE_PAGE_SIZE));

	do {
		u64 addr = ppgtt_ofs + ofs * 8;

		chunk = min(update->qwords, 0x1ffU);

		/* Ensure populatefn can do memset64 by aligning bb->cs */
		if (!(bb->len & 1))
			bb->cs[bb->len++] = MI_NOOP;

		bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk);
		bb->cs[bb->len++] = lower_32_bits(addr);
		bb->cs[bb->len++] = upper_32_bits(addr);
		ops->populate(pt_update, tile, NULL, bb->cs + bb->len, ofs, chunk,
			      update);

		bb->len += chunk * 2;
		ofs += chunk;
		size -= chunk;
	} while (size);
}

struct xe_vm *xe_migrate_get_vm(struct xe_migrate *m)
{
	return xe_vm_get(m->q->vm);
}

#if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST)
struct migrate_test_params {
	struct xe_test_priv base;
	bool force_gpu;
};

#define to_migrate_test_params(_priv) \
	container_of(_priv, struct migrate_test_params, base)
#endif

static struct dma_fence *
xe_migrate_update_pgtables_cpu(struct xe_migrate *m,
			       struct xe_vm *vm, struct xe_bo *bo,
			       const struct xe_vm_pgtable_update *updates,
			       u32 num_updates, bool wait_vm,
			       struct xe_migrate_pt_update *pt_update)
{
	XE_TEST_DECLARE(struct migrate_test_params *test =
			to_migrate_test_params
			(xe_cur_kunit_priv(XE_TEST_LIVE_MIGRATE));)
	const struct xe_migrate_pt_update_ops *ops = pt_update->ops;
	struct dma_fence *fence;
	int err;
	u32 i;

	if (XE_TEST_ONLY(test && test->force_gpu))
		return ERR_PTR(-ETIME);

	if (bo && !dma_resv_test_signaled(bo->ttm.base.resv,
					  DMA_RESV_USAGE_KERNEL))
		return ERR_PTR(-ETIME);

	if (wait_vm && !dma_resv_test_signaled(xe_vm_resv(vm),
					       DMA_RESV_USAGE_BOOKKEEP))
		return ERR_PTR(-ETIME);

	if (ops->pre_commit) {
		pt_update->job = NULL;
		err = ops->pre_commit(pt_update);
		if (err)
			return ERR_PTR(err);
	}
	for (i = 0; i < num_updates; i++) {
		const struct xe_vm_pgtable_update *update = &updates[i];

		ops->populate(pt_update, m->tile, &update->pt_bo->vmap, NULL,
			      update->ofs, update->qwords, update);
	}

	if (vm) {
		trace_xe_vm_cpu_bind(vm);
		xe_device_wmb(vm->xe);
	}

	fence = dma_fence_get_stub();

	return fence;
}

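/*
 * Return true if all input syncs, and the last fence of @q if one is given,
 * have already signaled, so that the page-table update may be applied
 * directly by the CPU instead of being pipelined on the GPU.
 */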
static bool no_in_syncs(struct xe_vm *vm, struct xe_exec_queue *q,
			struct xe_sync_entry *syncs, u32 num_syncs)
{
	struct dma_fence *fence;
	int i;

	for (i = 0; i < num_syncs; i++) {
		fence = syncs[i].fence;

		if (fence && !test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
				       &fence->flags))
			return false;
	}
	if (q) {
		fence = xe_exec_queue_last_fence_get(q, vm);
		if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
			return false;
	}

	return true;
}

/**
 * xe_migrate_update_pgtables() - Pipelined page-table update
 * @m: The migrate context.
 * @vm: The vm we'll be updating.
 * @bo: The bo whose dma-resv we will await before updating, or NULL if userptr.
 * @q: The exec queue to be used for the update or NULL if the default
 * migration engine is to be used.
 * @updates: An array of update descriptors.
 * @num_updates: Number of descriptors in @updates.
 * @syncs: Array of xe_sync_entry to await before updating. Note that waits
 * will block the engine timeline.
 * @num_syncs: Number of entries in @syncs.
 * @pt_update: Pointer to a struct xe_migrate_pt_update, which contains
 * pointers to callback functions and, if subclassed, private arguments to
 * those.
 *
 * Perform a pipelined page-table update. The update descriptors are typically
 * built under the same lock critical section as a call to this function. If
 * using the default engine for the updates, they will be performed in the
 * order they grab the job_mutex. If different engines are used, external
 * synchronization is needed for overlapping updates to maintain page-table
 * consistency. Note that the meaning of "overlapping" is that the updates
 * touch the same page-table, which might be a higher-level page-directory.
 * If no pipelining is needed, then updates may be performed by the cpu.
 *
 * Return: A dma_fence that, when signaled, indicates the update completion.
 */
struct dma_fence *
xe_migrate_update_pgtables(struct xe_migrate *m,
			   struct xe_vm *vm,
			   struct xe_bo *bo,
			   struct xe_exec_queue *q,
			   const struct xe_vm_pgtable_update *updates,
			   u32 num_updates,
			   struct xe_sync_entry *syncs, u32 num_syncs,
			   struct xe_migrate_pt_update *pt_update)
{
	const struct xe_migrate_pt_update_ops *ops = pt_update->ops;
	struct xe_tile *tile = m->tile;
	struct xe_gt *gt = tile->primary_gt;
	struct xe_device *xe = tile_to_xe(tile);
	struct xe_sched_job *job;
	struct dma_fence *fence;
	struct drm_suballoc *sa_bo = NULL;
	struct xe_vma *vma = pt_update->vma;
	struct xe_bb *bb;
	u32 i, batch_size, ppgtt_ofs, update_idx, page_ofs = 0;
	u64 addr;
	int err = 0;
	bool usm = !q && xe->info.has_usm;
	bool first_munmap_rebind = vma &&
		vma->gpuva.flags & XE_VMA_FIRST_REBIND;
	struct xe_exec_queue *q_override = !q ? m->q : q;
	u16 pat_index = xe->pat.idx[XE_CACHE_WB];

	/* Use the CPU if no in syncs and engine is idle */
	if (no_in_syncs(vm, q, syncs, num_syncs) && xe_exec_queue_is_idle(q_override)) {
		fence = xe_migrate_update_pgtables_cpu(m, vm, bo, updates,
						       num_updates,
						       first_munmap_rebind,
						       pt_update);
		if (!IS_ERR(fence) || fence == ERR_PTR(-EAGAIN))
			return fence;
	}

	/* fixed + PTE entries */
	if (IS_DGFX(xe))
		batch_size = 2;
	else
		batch_size = 6 + num_updates * 2;

	for (i = 0; i < num_updates; i++) {
		u32 num_cmds = DIV_ROUND_UP(updates[i].qwords, 0x1ff);

		/* align noop + MI_STORE_DATA_IMM cmd prefix */
		batch_size += 4 * num_cmds + updates[i].qwords * 2;
	}

	/*
	 * XXX: Create temp bo to copy from, if batch_size becomes too big?
	 *
	 * Worst case: Sum(2 * (each lower level page size) + (top level page size))
	 * Should be reasonably bound..
	 */
	xe_tile_assert(tile, batch_size < SZ_128K);

	bb = xe_bb_new(gt, batch_size, !q && xe->info.has_usm);
	if (IS_ERR(bb))
		return ERR_CAST(bb);

	/* For sysmem PTE's, need to map them in our hole.. */
	if (!IS_DGFX(xe)) {
		ppgtt_ofs = NUM_KERNEL_PDE - 1;
		if (q) {
			xe_tile_assert(tile, num_updates <= NUM_VMUSA_WRITES_PER_UNIT);

			sa_bo = drm_suballoc_new(&m->vm_update_sa, 1,
						 GFP_KERNEL, true, 0);
			if (IS_ERR(sa_bo)) {
				err = PTR_ERR(sa_bo);
				goto err;
			}

			ppgtt_ofs = NUM_KERNEL_PDE +
				(drm_suballoc_soffset(sa_bo) /
				 NUM_VMUSA_UNIT_PER_PAGE);
			page_ofs = (drm_suballoc_soffset(sa_bo) %
				    NUM_VMUSA_UNIT_PER_PAGE) *
				    VM_SA_UPDATE_UNIT_SIZE;
		}

		/* Map our PT's to gtt */
		bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(num_updates);
		bb->cs[bb->len++] = ppgtt_ofs * XE_PAGE_SIZE + page_ofs;
		bb->cs[bb->len++] = 0; /* upper_32_bits */

		for (i = 0; i < num_updates; i++) {
			struct xe_bo *pt_bo = updates[i].pt_bo;

			xe_tile_assert(tile, pt_bo->size == SZ_4K);

			addr = vm->pt_ops->pte_encode_bo(pt_bo, 0, pat_index, 0);
			bb->cs[bb->len++] = lower_32_bits(addr);
			bb->cs[bb->len++] = upper_32_bits(addr);
		}

		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
		update_idx = bb->len;

		addr = xe_migrate_vm_addr(ppgtt_ofs, 0) +
			(page_ofs / sizeof(u64)) * XE_PAGE_SIZE;
		for (i = 0; i < num_updates; i++)
			write_pgtable(tile, bb, addr + i * XE_PAGE_SIZE,
				      &updates[i], pt_update);
	} else {
		/* phys pages, no preamble required */
		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
		update_idx = bb->len;

		for (i = 0; i < num_updates; i++)
			write_pgtable(tile, bb, 0, &updates[i], pt_update);
	}

	if (!q)
		mutex_lock(&m->job_mutex);

	job = xe_bb_create_migration_job(q ?: m->q, bb,
					 xe_migrate_batch_base(m, usm),
					 update_idx);
	if (IS_ERR(job)) {
		err = PTR_ERR(job);
		goto err_bb;
	}

	/* Wait on BO move */
	if (bo) {
		err = job_add_deps(job, bo->ttm.base.resv,
				   DMA_RESV_USAGE_KERNEL);
		if (err)
			goto err_job;
	}

	/*
	 * Munmap style VM unbind, need to wait for all jobs to be complete /
	 * trigger preempts before moving forward
	 */
	if (first_munmap_rebind) {
		err = job_add_deps(job, xe_vm_resv(vm),
				   DMA_RESV_USAGE_BOOKKEEP);
		if (err)
			goto err_job;
	}

	err = xe_sched_job_last_fence_add_dep(job, vm);
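	/*
	 * Add dependencies on the caller-provided in-syncs; as noted in the
	 * kernel-doc above, these waits block the engine timeline.
	 */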
	for (i = 0; !err && i < num_syncs; i++)
		err = xe_sync_entry_add_deps(&syncs[i], job);

	if (err)
		goto err_job;

	if (ops->pre_commit) {
		pt_update->job = job;
		err = ops->pre_commit(pt_update);
		if (err)
			goto err_job;
	}
	xe_sched_job_arm(job);
	fence = dma_fence_get(&job->drm.s_fence->finished);
	xe_sched_job_push(job);

	if (!q)
		mutex_unlock(&m->job_mutex);

	xe_bb_free(bb, fence);
	drm_suballoc_free(sa_bo, fence);

	return fence;

err_job:
	xe_sched_job_put(job);
err_bb:
	if (!q)
		mutex_unlock(&m->job_mutex);
	xe_bb_free(bb, NULL);
err:
	drm_suballoc_free(sa_bo, NULL);
	return ERR_PTR(err);
}

/**
 * xe_migrate_wait() - Complete all operations using the xe_migrate context
 * @m: Migrate context to wait for.
 *
 * Waits until the GPU no longer uses the migrate context's default engine
 * or its page-table objects. FIXME: What about separate page-table update
 * engines?
 */
void xe_migrate_wait(struct xe_migrate *m)
{
	if (m->fence)
		dma_fence_wait(m->fence, false);
}

#if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST)
#include "tests/xe_migrate.c"
#endif