1 // SPDX-License-Identifier: MIT
2 /*
3 * Copyright © 2020 Intel Corporation
4 */
5
6 #include "xe_migrate.h"
7
8 #include <linux/bitfield.h>
9 #include <linux/sizes.h>
10
11 #include <drm/drm_managed.h>
12 #include <drm/drm_pagemap.h>
13 #include <drm/ttm/ttm_tt.h>
14 #include <uapi/drm/xe_drm.h>
15
16 #include <generated/xe_wa_oob.h>
17
18 #include "instructions/xe_gpu_commands.h"
19 #include "instructions/xe_mi_commands.h"
20 #include "regs/xe_gtt_defs.h"
21 #include "tests/xe_test.h"
22 #include "xe_assert.h"
23 #include "xe_bb.h"
24 #include "xe_bo.h"
25 #include "xe_exec_queue.h"
26 #include "xe_ggtt.h"
27 #include "xe_gt.h"
28 #include "xe_hw_engine.h"
29 #include "xe_lrc.h"
30 #include "xe_map.h"
31 #include "xe_mocs.h"
32 #include "xe_pt.h"
33 #include "xe_res_cursor.h"
34 #include "xe_sa.h"
35 #include "xe_sched_job.h"
36 #include "xe_sync.h"
37 #include "xe_trace_bo.h"
38 #include "xe_validation.h"
39 #include "xe_vm.h"
40 #include "xe_vram.h"
41
42 /**
43 * struct xe_migrate - migrate context.
44 */
45 struct xe_migrate {
46 /** @q: Default exec queue used for migration */
47 struct xe_exec_queue *q;
48 /** @tile: Backpointer to the tile this struct xe_migrate belongs to. */
49 struct xe_tile *tile;
50 /** @job_mutex: Timeline mutex for @q. */
51 struct mutex job_mutex;
52 /** @pt_bo: Page-table buffer object. */
53 struct xe_bo *pt_bo;
54 /** @batch_base_ofs: VM offset of the migration batch buffer */
55 u64 batch_base_ofs;
56 /** @usm_batch_base_ofs: VM offset of the usm batch buffer */
57 u64 usm_batch_base_ofs;
58 /** @cleared_mem_ofs: VM offset of @cleared_bo. */
59 u64 cleared_mem_ofs;
60 /**
61 * @fence: dma-fence representing the last migration job batch.
62 * Protected by @job_mutex.
63 */
64 struct dma_fence *fence;
65 /**
66 * @vm_update_sa: For integrated, used to suballocate page-tables
67 * out of the pt_bo.
68 */
69 struct drm_suballoc_manager vm_update_sa;
70 /** @min_chunk_size: For dgfx, minimum chunk size */
71 u64 min_chunk_size;
72 };
73
74 #define MAX_PREEMPTDISABLE_TRANSFER SZ_8M /* Around 1ms. */
75 #define MAX_CCS_LIMITED_TRANSFER SZ_4M /* XE_PAGE_SIZE * (FIELD_MAX(XE2_CCS_SIZE_MASK) + 1) */
76 #define NUM_KERNEL_PDE 15
77 #define NUM_PT_SLOTS 32
78 #define LEVEL0_PAGE_TABLE_ENCODE_SIZE SZ_2M
79 #define MAX_NUM_PTE 512
80 #define IDENTITY_OFFSET 256ULL
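/* IDENTITY_OFFSET is a level-2 PDE index: the VRAM identity map starts at 256 GiB of VA, and each level-2 entry covers 1 GiB. */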
81
82 /*
83 * Although MI_STORE_DATA_IMM's "length" field is 10-bits, 0x3FE is the largest
84 * legal value accepted. Since that instruction field is always stored in
85 * (val-2) format, this translates to 0x400 dwords for the true maximum length
86 * of the instruction. Subtracting the instruction header (1 dword) and
87 * address (2 dwords), that leaves 0x3FD dwords (0x1FE qwords) for PTE values.
88 */
89 #define MAX_PTE_PER_SDI 0x1FEU
90
91 static void xe_migrate_fini(void *arg)
92 {
93 struct xe_migrate *m = arg;
94
95 xe_vm_lock(m->q->vm, false);
96 xe_bo_unpin(m->pt_bo);
97 xe_vm_unlock(m->q->vm);
98
99 dma_fence_put(m->fence);
100 xe_bo_put(m->pt_bo);
101 drm_suballoc_manager_fini(&m->vm_update_sa);
102 mutex_destroy(&m->job_mutex);
103 xe_vm_close_and_put(m->q->vm);
104 xe_exec_queue_put(m->q);
105 }
106
107 static u64 xe_migrate_vm_addr(u64 slot, u32 level)
108 {
109 XE_WARN_ON(slot >= NUM_PT_SLOTS);
110
111 /* First slot is reserved for mapping of PT bo and bb, start from 1 */
112 return (slot + 1ULL) << xe_pt_shift(level + 1);
113 }
114
115 static u64 xe_migrate_vram_ofs(struct xe_device *xe, u64 addr, bool is_comp_pte)
116 {
117 /*
118 * Remove the DPA to get a correct offset into identity table for the
119 * migrate offset
120 */
121 u64 identity_offset = IDENTITY_OFFSET;
122
123 if (GRAPHICS_VER(xe) >= 20 && is_comp_pte)
124 identity_offset += DIV_ROUND_UP_ULL(xe_vram_region_actual_physical_size
125 (xe->mem.vram), SZ_1G);
126
127 addr -= xe_vram_region_dpa_base(xe->mem.vram);
128 return addr + (identity_offset << xe_pt_shift(2));
129 }
130
131 static void xe_migrate_program_identity(struct xe_device *xe, struct xe_vm *vm, struct xe_bo *bo,
132 u64 map_ofs, u64 vram_offset, u16 pat_index, u64 pt_2m_ofs)
133 {
134 struct xe_vram_region *vram = xe->mem.vram;
135 resource_size_t dpa_base = xe_vram_region_dpa_base(vram);
136 u64 pos, ofs, flags;
137 u64 entry;
138 /* XXX: Unclear if this should be usable_size? */
139 u64 vram_limit = xe_vram_region_actual_physical_size(vram) + dpa_base;
140 u32 level = 2;
141
142 ofs = map_ofs + XE_PAGE_SIZE * level + vram_offset * 8;
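/* @ofs points at entry @vram_offset in the level-2 page of the scratch PT layout; each 8-byte entry written below maps 1 GiB (the final chunk switches to 2M pages). */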
143 flags = vm->pt_ops->pte_encode_addr(xe, 0, pat_index, level,
144 true, 0);
145
146 xe_assert(xe, IS_ALIGNED(xe_vram_region_usable_size(vram), SZ_2M));
147
148 /*
149 * Use 1GB pages when possible; the last chunk always uses 2M
150 * pages, as mixing reserved memory (stolen, WOPCM) with a single
151 * mapping is not allowed on certain platforms.
152 */
153 for (pos = dpa_base; pos < vram_limit;
154 pos += SZ_1G, ofs += 8) {
155 if (pos + SZ_1G >= vram_limit) {
156 entry = vm->pt_ops->pde_encode_bo(bo, pt_2m_ofs);
157 xe_map_wr(xe, &bo->vmap, ofs, u64, entry);
158
159 flags = vm->pt_ops->pte_encode_addr(xe, 0,
160 pat_index,
161 level - 1,
162 true, 0);
163
164 for (ofs = pt_2m_ofs; pos < vram_limit;
165 pos += SZ_2M, ofs += 8)
166 xe_map_wr(xe, &bo->vmap, ofs, u64, pos | flags);
167 break; /* Ensure the pos == vram_limit assert below holds */
168 }
169
170 xe_map_wr(xe, &bo->vmap, ofs, u64, pos | flags);
171 }
172
173 xe_assert(xe, pos == vram_limit);
174 }
175
176 static int xe_migrate_prepare_vm(struct xe_tile *tile, struct xe_migrate *m,
177 struct xe_vm *vm, struct drm_exec *exec)
178 {
179 struct xe_device *xe = tile_to_xe(tile);
180 u16 pat_index = xe->pat.idx[XE_CACHE_WB];
181 u8 id = tile->id;
182 u32 num_entries = NUM_PT_SLOTS, num_level = vm->pt_root[id]->level;
183 #define VRAM_IDENTITY_MAP_COUNT 2
184 u32 num_setup = num_level + VRAM_IDENTITY_MAP_COUNT;
185 #undef VRAM_IDENTITY_MAP_COUNT
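/* num_setup reserves the top pages of the PT BO: one page per page-table level for the PDE chain, plus two pages for the 2M VRAM identity maps (see the layout comment further down). */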
186 u32 map_ofs, level, i;
187 struct xe_bo *bo, *batch = tile->mem.kernel_bb_pool->bo;
188 u64 entry, pt29_ofs;
189
190 /* Can't bump NUM_PT_SLOTS too high */
191 BUILD_BUG_ON(NUM_PT_SLOTS > SZ_2M/XE_PAGE_SIZE);
192 /* Must be a multiple of 64K to support all platforms */
193 BUILD_BUG_ON(NUM_PT_SLOTS * XE_PAGE_SIZE % SZ_64K);
194 /* And one slot reserved for the 4KiB page table updates */
195 BUILD_BUG_ON(!(NUM_KERNEL_PDE & 1));
196
197 /* Need to be sure everything fits in the first PT, or create more */
198 xe_tile_assert(tile, m->batch_base_ofs + xe_bo_size(batch) < SZ_2M);
199
200 bo = xe_bo_create_pin_map(vm->xe, tile, vm,
201 num_entries * XE_PAGE_SIZE,
202 ttm_bo_type_kernel,
203 XE_BO_FLAG_VRAM_IF_DGFX(tile) |
204 XE_BO_FLAG_PAGETABLE, exec);
205 if (IS_ERR(bo))
206 return PTR_ERR(bo);
207
208 /* PT30 & PT31 reserved for 2M identity map */
209 pt29_ofs = xe_bo_size(bo) - 3 * XE_PAGE_SIZE;
210 entry = vm->pt_ops->pde_encode_bo(bo, pt29_ofs);
211 xe_pt_write(xe, &vm->pt_root[id]->bo->vmap, 0, entry);
212
213 map_ofs = (num_entries - num_setup) * XE_PAGE_SIZE;
214
215 /* Map the entire BO in our level 0 pt */
216 for (i = 0, level = 0; i < num_entries; level++) {
217 entry = vm->pt_ops->pte_encode_bo(bo, i * XE_PAGE_SIZE,
218 pat_index, 0);
219
220 xe_map_wr(xe, &bo->vmap, map_ofs + level * 8, u64, entry);
221
222 if (vm->flags & XE_VM_FLAG_64K)
223 i += 16;
224 else
225 i += 1;
226 }
227
228 if (!IS_DGFX(xe)) {
229 /* Write out batch too */
230 m->batch_base_ofs = NUM_PT_SLOTS * XE_PAGE_SIZE;
231 for (i = 0; i < xe_bo_size(batch);
232 i += vm->flags & XE_VM_FLAG_64K ? XE_64K_PAGE_SIZE :
233 XE_PAGE_SIZE) {
234 entry = vm->pt_ops->pte_encode_bo(batch, i,
235 pat_index, 0);
236
237 xe_map_wr(xe, &bo->vmap, map_ofs + level * 8, u64,
238 entry);
239 level++;
240 }
241 if (xe->info.has_usm) {
242 xe_tile_assert(tile, xe_bo_size(batch) == SZ_1M);
243
244 batch = tile->primary_gt->usm.bb_pool->bo;
245 m->usm_batch_base_ofs = m->batch_base_ofs + SZ_1M;
246 xe_tile_assert(tile, xe_bo_size(batch) == SZ_512K);
247
248 for (i = 0; i < xe_bo_size(batch);
249 i += vm->flags & XE_VM_FLAG_64K ? XE_64K_PAGE_SIZE :
250 XE_PAGE_SIZE) {
251 entry = vm->pt_ops->pte_encode_bo(batch, i,
252 pat_index, 0);
253
254 xe_map_wr(xe, &bo->vmap, map_ofs + level * 8, u64,
255 entry);
256 level++;
257 }
258 }
259 } else {
260 u64 batch_addr = xe_bo_addr(batch, 0, XE_PAGE_SIZE);
261
262 m->batch_base_ofs = xe_migrate_vram_ofs(xe, batch_addr, false);
263
264 if (xe->info.has_usm) {
265 batch = tile->primary_gt->usm.bb_pool->bo;
266 batch_addr = xe_bo_addr(batch, 0, XE_PAGE_SIZE);
267 m->usm_batch_base_ofs = xe_migrate_vram_ofs(xe, batch_addr, false);
268 }
269 }
270
271 for (level = 1; level < num_level; level++) {
272 u32 flags = 0;
273
274 if (vm->flags & XE_VM_FLAG_64K && level == 1)
275 flags = XE_PDE_64K;
276
277 entry = vm->pt_ops->pde_encode_bo(bo, map_ofs + (u64)(level - 1) *
278 XE_PAGE_SIZE);
279 xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE * level, u64,
280 entry | flags);
281 }
282
283 /* Write PDE's that point to our BO. */
284 for (i = 0; i < map_ofs / XE_PAGE_SIZE; i++) {
285 entry = vm->pt_ops->pde_encode_bo(bo, (u64)i * XE_PAGE_SIZE);
286
287 xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE +
288 (i + 1) * 8, u64, entry);
289 }
290
291 /* Set up a 1GiB NULL mapping at 255GiB offset. */
292 level = 2;
293 xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE * level + 255 * 8, u64,
294 vm->pt_ops->pte_encode_addr(xe, 0, pat_index, level, IS_DGFX(xe), 0)
295 | XE_PTE_NULL);
296 m->cleared_mem_ofs = (255ULL << xe_pt_shift(level));
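/* Reads through the NULL mapping above return zeroes, so @cleared_mem_ofs can serve as an always-zero source, e.g. when clearing CCS metadata. */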
297
298 /* Identity map the entire vram at 256GiB offset */
299 if (IS_DGFX(xe)) {
300 u64 pt30_ofs = xe_bo_size(bo) - 2 * XE_PAGE_SIZE;
301 resource_size_t actual_phy_size = xe_vram_region_actual_physical_size(xe->mem.vram);
302
303 xe_migrate_program_identity(xe, vm, bo, map_ofs, IDENTITY_OFFSET,
304 pat_index, pt30_ofs);
305 xe_assert(xe, actual_phy_size <= (MAX_NUM_PTE - IDENTITY_OFFSET) * SZ_1G);
306
307 /*
308 * Identity map the entire vram for compressed pat_index for xe2+
309 * if flat ccs is enabled.
310 */
311 if (GRAPHICS_VER(xe) >= 20 && xe_device_has_flat_ccs(xe)) {
312 u16 comp_pat_index = xe->pat.idx[XE_CACHE_NONE_COMPRESSION];
313 u64 vram_offset = IDENTITY_OFFSET +
314 DIV_ROUND_UP_ULL(actual_phy_size, SZ_1G);
315 u64 pt31_ofs = xe_bo_size(bo) - XE_PAGE_SIZE;
316
317 xe_assert(xe, actual_phy_size <= (MAX_NUM_PTE - IDENTITY_OFFSET -
318 IDENTITY_OFFSET / 2) * SZ_1G);
319 xe_migrate_program_identity(xe, vm, bo, map_ofs, vram_offset,
320 comp_pat_index, pt31_ofs);
321 }
322 }
323
324 /*
325 * Example layout created above, with root level = 3:
326 * [PT0...PT7]: kernel PT's for copy/clear; 64 KiB or 4 KiB PTEs
327 * [PT8]: Kernel PT for VM_BIND, 4 KiB PTE's
328 * [PT9...PT26]: Userspace PT's for VM_BIND, 4 KiB PTE's
329 * [PT27 = PDE 0] [PT28 = PDE 1] [PT29 = PDE 2] [PT30 & PT31 = 2M vram identity map]
330 *
331 * This makes the lowest part of the VM point to the pagetables.
332 * Hence the lowest 2M in the VM should point to itself; with a few writes
333 * and flushes, other parts of the VM can be used for either copying or
334 * clearing.
335 *
336 * For performance, the kernel reserves PDE's, so about 20 are left
337 * for async VM updates.
338 *
339 * To make it easier to work with, each scratch PT is put in slot (1 + PT #)
340 * everywhere; this allows lockless updates to scratch pages by using
341 * the different addresses in the VM.
342 */
343 #define NUM_VMUSA_UNIT_PER_PAGE 32
344 #define VM_SA_UPDATE_UNIT_SIZE (XE_PAGE_SIZE / NUM_VMUSA_UNIT_PER_PAGE)
345 #define NUM_VMUSA_WRITES_PER_UNIT (VM_SA_UPDATE_UNIT_SIZE / sizeof(u64))
346 drm_suballoc_manager_init(&m->vm_update_sa,
347 (size_t)(map_ofs / XE_PAGE_SIZE - NUM_KERNEL_PDE) *
348 NUM_VMUSA_UNIT_PER_PAGE, 0);
349
350 m->pt_bo = bo;
351 return 0;
352 }
353
354 /*
355 * Including the reserved copy engine is required to avoid deadlocks due to
356 * migrate jobs servicing the faults getting stuck behind the job that faulted.
357 */
358 static u32 xe_migrate_usm_logical_mask(struct xe_gt *gt)
359 {
360 u32 logical_mask = 0;
361 struct xe_hw_engine *hwe;
362 enum xe_hw_engine_id id;
363
364 for_each_hw_engine(hwe, gt, id) {
365 if (hwe->class != XE_ENGINE_CLASS_COPY)
366 continue;
367
368 if (xe_gt_is_usm_hwe(gt, hwe))
369 logical_mask |= BIT(hwe->logical_instance);
370 }
371
372 return logical_mask;
373 }
374
375 static bool xe_migrate_needs_ccs_emit(struct xe_device *xe)
376 {
377 return xe_device_has_flat_ccs(xe) && !(GRAPHICS_VER(xe) >= 20 && IS_DGFX(xe));
378 }
379
380 /**
381 * xe_migrate_alloc - Allocate a migrate struct for a given &xe_tile
382 * @tile: &xe_tile
383 *
384 * Allocates a &xe_migrate for a given tile.
385 *
386 * Return: &xe_migrate on success, or NULL when out of memory.
387 */
388 struct xe_migrate *xe_migrate_alloc(struct xe_tile *tile)
389 {
390 struct xe_migrate *m = drmm_kzalloc(&tile_to_xe(tile)->drm, sizeof(*m), GFP_KERNEL);
391
392 if (m)
393 m->tile = tile;
394 return m;
395 }
396
397 static int xe_migrate_lock_prepare_vm(struct xe_tile *tile, struct xe_migrate *m, struct xe_vm *vm)
398 {
399 struct xe_device *xe = tile_to_xe(tile);
400 struct xe_validation_ctx ctx;
401 struct drm_exec exec;
402 int err = 0;
403
404 xe_validation_guard(&ctx, &xe->val, &exec, (struct xe_val_flags) {}, err) {
405 err = xe_vm_drm_exec_lock(vm, &exec);
406 drm_exec_retry_on_contention(&exec);
407 err = xe_migrate_prepare_vm(tile, m, vm, &exec);
408 drm_exec_retry_on_contention(&exec);
409 xe_validation_retry_on_oom(&ctx, &err);
410 }
411
412 return err;
413 }
414
415 /**
416 * xe_migrate_init() - Initialize a migrate context
417 * @m: The migration context
418 *
419 * Return: 0 if successful, negative error code on failure
420 */
421 int xe_migrate_init(struct xe_migrate *m)
422 {
423 struct xe_tile *tile = m->tile;
424 struct xe_gt *primary_gt = tile->primary_gt;
425 struct xe_device *xe = tile_to_xe(tile);
426 struct xe_vm *vm;
427 int err;
428
429 /* Special layout, prepared below.. */
430 vm = xe_vm_create(xe, XE_VM_FLAG_MIGRATION |
431 XE_VM_FLAG_SET_TILE_ID(tile), NULL);
432 if (IS_ERR(vm))
433 return PTR_ERR(vm);
434
435 err = xe_migrate_lock_prepare_vm(tile, m, vm);
436 if (err)
437 goto err_out;
438
439 if (xe->info.has_usm) {
440 struct xe_hw_engine *hwe = xe_gt_hw_engine(primary_gt,
441 XE_ENGINE_CLASS_COPY,
442 primary_gt->usm.reserved_bcs_instance,
443 false);
444 u32 logical_mask = xe_migrate_usm_logical_mask(primary_gt);
445
446 if (!hwe || !logical_mask) {
447 err = -EINVAL;
448 goto err_out;
449 }
450
451 /*
452 * XXX: Currently only reserving 1 (likely slow) BCS instance on
453 * PVC, may want to revisit if performance is needed.
454 */
455 m->q = xe_exec_queue_create(xe, vm, logical_mask, 1, hwe,
456 EXEC_QUEUE_FLAG_KERNEL |
457 EXEC_QUEUE_FLAG_PERMANENT |
458 EXEC_QUEUE_FLAG_HIGH_PRIORITY |
459 EXEC_QUEUE_FLAG_MIGRATE, 0);
460 } else {
461 m->q = xe_exec_queue_create_class(xe, primary_gt, vm,
462 XE_ENGINE_CLASS_COPY,
463 EXEC_QUEUE_FLAG_KERNEL |
464 EXEC_QUEUE_FLAG_PERMANENT |
465 EXEC_QUEUE_FLAG_MIGRATE, 0);
466 }
467 if (IS_ERR(m->q)) {
468 err = PTR_ERR(m->q);
469 goto err_out;
470 }
471
472 mutex_init(&m->job_mutex);
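/* Record with lockdep that @job_mutex may be taken in reclaim context, so reclaim-capable allocations under @job_mutex get flagged. */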
473 fs_reclaim_acquire(GFP_KERNEL);
474 might_lock(&m->job_mutex);
475 fs_reclaim_release(GFP_KERNEL);
476
477 err = devm_add_action_or_reset(xe->drm.dev, xe_migrate_fini, m);
478 if (err)
479 return err;
480
481 if (IS_DGFX(xe)) {
482 if (xe_migrate_needs_ccs_emit(xe))
483 /* min chunk size corresponds to 4K of CCS Metadata */
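/* e.g. assuming a 1:256 main-to-CCS ratio this works out to SZ_4K * SZ_64K / 256 = 1M; the real ratio comes from xe_device_ccs_bytes() */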
484 m->min_chunk_size = SZ_4K * SZ_64K /
485 xe_device_ccs_bytes(xe, SZ_64K);
486 else
487 /* Somewhat arbitrary to avoid a huge amount of blits */
488 m->min_chunk_size = SZ_64K;
489 m->min_chunk_size = roundup_pow_of_two(m->min_chunk_size);
490 drm_dbg(&xe->drm, "Migrate min chunk size is 0x%08llx\n",
491 (unsigned long long)m->min_chunk_size);
492 }
493
494 return err;
495
496 err_out:
497 xe_vm_close_and_put(vm);
498 return err;
499
500 }
501
502 static u64 max_mem_transfer_per_pass(struct xe_device *xe)
503 {
504 if (!IS_DGFX(xe) && xe_device_has_flat_ccs(xe))
505 return MAX_CCS_LIMITED_TRANSFER;
506
507 return MAX_PREEMPTDISABLE_TRANSFER;
508 }
509
510 static u64 xe_migrate_res_sizes(struct xe_migrate *m, struct xe_res_cursor *cur)
511 {
512 struct xe_device *xe = tile_to_xe(m->tile);
513 u64 size = min_t(u64, max_mem_transfer_per_pass(xe), cur->remaining);
514
515 if (mem_type_is_vram(cur->mem_type)) {
516 /*
517 * For VRAM we want to blit in chunks with sizes aligned to
518 * min_chunk_size in order for the offset to CCS metadata to be
519 * page-aligned. If it's the last chunk it may be smaller.
520 *
521 * Another constraint is that we need to limit the blit to
522 * the VRAM block size, unless size is smaller than
523 * min_chunk_size.
524 */
525 u64 chunk = max_t(u64, cur->size, m->min_chunk_size);
526
527 size = min_t(u64, size, chunk);
528 if (size > m->min_chunk_size)
529 size = round_down(size, m->min_chunk_size);
530 }
531
532 return size;
533 }
534
535 static bool xe_migrate_allow_identity(u64 size, const struct xe_res_cursor *cur)
536 {
537 /* If the chunk is not fragmented, allow identity map. */
538 return cur->size >= size;
539 }
540
541 #define PTE_UPDATE_FLAG_IS_VRAM BIT(0)
542 #define PTE_UPDATE_FLAG_IS_COMP_PTE BIT(1)
543
544 static u32 pte_update_size(struct xe_migrate *m,
545 u32 flags,
546 struct ttm_resource *res,
547 struct xe_res_cursor *cur,
548 u64 *L0, u64 *L0_ofs, u32 *L0_pt,
549 u32 cmd_size, u32 pt_ofs, u32 avail_pts)
550 {
551 u32 cmds = 0;
552 bool is_vram = PTE_UPDATE_FLAG_IS_VRAM & flags;
553 bool is_comp_pte = PTE_UPDATE_FLAG_IS_COMP_PTE & flags;
554
555 *L0_pt = pt_ofs;
556 if (is_vram && xe_migrate_allow_identity(*L0, cur)) {
557 /* Offset into identity map. */
558 *L0_ofs = xe_migrate_vram_ofs(tile_to_xe(m->tile),
559 cur->start + vram_region_gpu_offset(res),
560 is_comp_pte);
561 cmds += cmd_size;
562 } else {
563 /* Clip L0 to available size */
564 u64 size = min(*L0, (u64)avail_pts * SZ_2M);
565 u32 num_4k_pages = (size + XE_PAGE_SIZE - 1) >> XE_PTE_SHIFT;
566
567 *L0 = size;
568 *L0_ofs = xe_migrate_vm_addr(pt_ofs, 0);
569
570 /* MI_STORE_DATA_IMM */
571 cmds += 3 * DIV_ROUND_UP(num_4k_pages, MAX_PTE_PER_SDI);
572
573 /* PTE qwords (two dwords each) */
574 cmds += num_4k_pages * 2;
575
576 /* Each chunk has a single blit command */
577 cmds += cmd_size;
578 }
579
580 return cmds;
581 }
582
583 static void emit_pte(struct xe_migrate *m,
584 struct xe_bb *bb, u32 at_pt,
585 bool is_vram, bool is_comp_pte,
586 struct xe_res_cursor *cur,
587 u32 size, struct ttm_resource *res)
588 {
589 struct xe_device *xe = tile_to_xe(m->tile);
590 struct xe_vm *vm = m->q->vm;
591 u16 pat_index;
592 u32 ptes;
593 u64 ofs = (u64)at_pt * XE_PAGE_SIZE;
594 u64 cur_ofs;
595
596 /* Indirect access needs a compression-enabled, uncached PAT index */
597 if (GRAPHICS_VERx100(xe) >= 2000)
598 pat_index = is_comp_pte ? xe->pat.idx[XE_CACHE_NONE_COMPRESSION] :
599 xe->pat.idx[XE_CACHE_WB];
600 else
601 pat_index = xe->pat.idx[XE_CACHE_WB];
602
603 ptes = DIV_ROUND_UP(size, XE_PAGE_SIZE);
604
605 while (ptes) {
606 u32 chunk = min(MAX_PTE_PER_SDI, ptes);
607
608 bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk);
609 bb->cs[bb->len++] = ofs;
610 bb->cs[bb->len++] = 0;
611
612 cur_ofs = ofs;
613 ofs += chunk * 8;
614 ptes -= chunk;
615
616 while (chunk--) {
617 u64 addr, flags = 0;
618 bool devmem = false;
619
620 addr = xe_res_dma(cur) & PAGE_MASK;
621 if (is_vram) {
622 if (vm->flags & XE_VM_FLAG_64K) {
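/* Each 8-byte PTE maps XE_PAGE_SIZE of VA, so @va below recovers the VA being written; on 64K VMs it must share 64K alignment with the DMA address. */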
623 u64 va = cur_ofs * XE_PAGE_SIZE / 8;
624
625 xe_assert(xe, (va & (SZ_64K - 1)) ==
626 (addr & (SZ_64K - 1)));
627
628 flags |= XE_PTE_PS64;
629 }
630
631 addr += vram_region_gpu_offset(res);
632 devmem = true;
633 }
634
635 addr = vm->pt_ops->pte_encode_addr(m->tile->xe,
636 addr, pat_index,
637 0, devmem, flags);
638 bb->cs[bb->len++] = lower_32_bits(addr);
639 bb->cs[bb->len++] = upper_32_bits(addr);
640
641 xe_res_next(cur, min_t(u32, size, PAGE_SIZE));
642 cur_ofs += 8;
643 }
644 }
645 }
646
647 #define EMIT_COPY_CCS_DW 5
648 static void emit_copy_ccs(struct xe_gt *gt, struct xe_bb *bb,
649 u64 dst_ofs, bool dst_is_indirect,
650 u64 src_ofs, bool src_is_indirect,
651 u32 size)
652 {
653 struct xe_device *xe = gt_to_xe(gt);
654 u32 *cs = bb->cs + bb->len;
655 u32 num_ccs_blks;
656 u32 num_pages;
657 u32 ccs_copy_size;
658 u32 mocs;
659
660 if (GRAPHICS_VERx100(xe) >= 2000) {
661 num_pages = DIV_ROUND_UP(size, XE_PAGE_SIZE);
662 xe_gt_assert(gt, FIELD_FIT(XE2_CCS_SIZE_MASK, num_pages - 1));
663
664 ccs_copy_size = REG_FIELD_PREP(XE2_CCS_SIZE_MASK, num_pages - 1);
665 mocs = FIELD_PREP(XE2_XY_CTRL_SURF_MOCS_INDEX_MASK, gt->mocs.uc_index);
666
667 } else {
668 num_ccs_blks = DIV_ROUND_UP(xe_device_ccs_bytes(gt_to_xe(gt), size),
669 NUM_CCS_BYTES_PER_BLOCK);
670 xe_gt_assert(gt, FIELD_FIT(CCS_SIZE_MASK, num_ccs_blks - 1));
671
672 ccs_copy_size = REG_FIELD_PREP(CCS_SIZE_MASK, num_ccs_blks - 1);
673 mocs = FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, gt->mocs.uc_index);
674 }
675
676 *cs++ = XY_CTRL_SURF_COPY_BLT |
677 (src_is_indirect ? 0x0 : 0x1) << SRC_ACCESS_TYPE_SHIFT |
678 (dst_is_indirect ? 0x0 : 0x1) << DST_ACCESS_TYPE_SHIFT |
679 ccs_copy_size;
680 *cs++ = lower_32_bits(src_ofs);
681 *cs++ = upper_32_bits(src_ofs) | mocs;
682 *cs++ = lower_32_bits(dst_ofs);
683 *cs++ = upper_32_bits(dst_ofs) | mocs;
684
685 bb->len = cs - bb->cs;
686 }
687
688 #define EMIT_COPY_DW 10
689 static void emit_copy(struct xe_gt *gt, struct xe_bb *bb,
690 u64 src_ofs, u64 dst_ofs, unsigned int size,
691 unsigned int pitch)
692 {
693 struct xe_device *xe = gt_to_xe(gt);
694 u32 mocs = 0;
695 u32 tile_y = 0;
696
697 xe_gt_assert(gt, !(pitch & 3));
698 xe_gt_assert(gt, size / pitch <= S16_MAX);
699 xe_gt_assert(gt, pitch / 4 <= S16_MAX);
700 xe_gt_assert(gt, pitch <= U16_MAX);
701
702 if (GRAPHICS_VER(xe) >= 20)
703 mocs = FIELD_PREP(XE2_XY_FAST_COPY_BLT_MOCS_INDEX_MASK, gt->mocs.uc_index);
704
705 if (GRAPHICS_VERx100(xe) >= 1250)
706 tile_y = XY_FAST_COPY_BLT_D1_SRC_TILE4 | XY_FAST_COPY_BLT_D1_DST_TILE4;
707
708 bb->cs[bb->len++] = XY_FAST_COPY_BLT_CMD | (10 - 2);
709 bb->cs[bb->len++] = XY_FAST_COPY_BLT_DEPTH_32 | pitch | tile_y | mocs;
710 bb->cs[bb->len++] = 0;
711 bb->cs[bb->len++] = (size / pitch) << 16 | pitch / 4;
712 bb->cs[bb->len++] = lower_32_bits(dst_ofs);
713 bb->cs[bb->len++] = upper_32_bits(dst_ofs);
714 bb->cs[bb->len++] = 0;
715 bb->cs[bb->len++] = pitch | mocs;
716 bb->cs[bb->len++] = lower_32_bits(src_ofs);
717 bb->cs[bb->len++] = upper_32_bits(src_ofs);
718 }
719
720 static u64 xe_migrate_batch_base(struct xe_migrate *m, bool usm)
721 {
722 return usm ? m->usm_batch_base_ofs : m->batch_base_ofs;
723 }
724
725 static u32 xe_migrate_ccs_copy(struct xe_migrate *m,
726 struct xe_bb *bb,
727 u64 src_ofs, bool src_is_indirect,
728 u64 dst_ofs, bool dst_is_indirect, u32 dst_size,
729 u64 ccs_ofs, bool copy_ccs)
730 {
731 struct xe_gt *gt = m->tile->primary_gt;
732 u32 flush_flags = 0;
733
734 if (!copy_ccs && dst_is_indirect) {
735 /*
736 * If the src is already in vram, then it should already
737 * have been cleared by us, or has been populated by the
738 * user. Make sure we copy the CCS aux state as-is.
739 *
740 * Otherwise if the bo doesn't have any CCS metadata attached,
741 * we still need to clear it for security reasons.
742 */
743 u64 ccs_src_ofs = src_is_indirect ? src_ofs : m->cleared_mem_ofs;
744
745 emit_copy_ccs(gt, bb,
746 dst_ofs, true,
747 ccs_src_ofs, src_is_indirect, dst_size);
748
749 flush_flags = MI_FLUSH_DW_CCS;
750 } else if (copy_ccs) {
751 if (!src_is_indirect)
752 src_ofs = ccs_ofs;
753 else if (!dst_is_indirect)
754 dst_ofs = ccs_ofs;
755
756 xe_gt_assert(gt, src_is_indirect || dst_is_indirect);
757
758 emit_copy_ccs(gt, bb, dst_ofs, dst_is_indirect, src_ofs,
759 src_is_indirect, dst_size);
760 if (dst_is_indirect)
761 flush_flags = MI_FLUSH_DW_CCS;
762 }
763
764 return flush_flags;
765 }
766
767 /**
768 * xe_migrate_copy() - Copy content of TTM resources.
769 * @m: The migration context.
770 * @src_bo: The buffer object @src is currently bound to.
771 * @dst_bo: If copying between resources created for the same bo, set this to
772 * the same value as @src_bo. If copying between buffer objects, set it to
773 * the buffer object @dst is currently bound to.
774 * @src: The source TTM resource.
775 * @dst: The dst TTM resource.
776 * @copy_only_ccs: If true copy only CCS metadata
777 *
778 * Copies the contents of @src to @dst: On flat CCS devices,
779 * the CCS metadata is copied as well if needed, or if not present,
780 * the CCS metadata of @dst is cleared for security reasons.
781 *
782 * Return: Pointer to a dma_fence representing the last copy batch, or
783 * an error pointer on failure. If there is a failure, any copy operation
784 * started by the function call has been synced.
785 */
786 struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
787 struct xe_bo *src_bo,
788 struct xe_bo *dst_bo,
789 struct ttm_resource *src,
790 struct ttm_resource *dst,
791 bool copy_only_ccs)
792 {
793 struct xe_gt *gt = m->tile->primary_gt;
794 struct xe_device *xe = gt_to_xe(gt);
795 struct dma_fence *fence = NULL;
796 u64 size = xe_bo_size(src_bo);
797 struct xe_res_cursor src_it, dst_it, ccs_it;
798 u64 src_L0_ofs, dst_L0_ofs;
799 u32 src_L0_pt, dst_L0_pt;
800 u64 src_L0, dst_L0;
801 int pass = 0;
802 int err;
803 bool src_is_pltt = src->mem_type == XE_PL_TT;
804 bool dst_is_pltt = dst->mem_type == XE_PL_TT;
805 bool src_is_vram = mem_type_is_vram(src->mem_type);
806 bool dst_is_vram = mem_type_is_vram(dst->mem_type);
807 bool type_device = src_bo->ttm.type == ttm_bo_type_device;
808 bool needs_ccs_emit = type_device && xe_migrate_needs_ccs_emit(xe);
809 bool copy_ccs = xe_device_has_flat_ccs(xe) &&
810 xe_bo_needs_ccs_pages(src_bo) && xe_bo_needs_ccs_pages(dst_bo);
811 bool copy_system_ccs = copy_ccs && (!src_is_vram || !dst_is_vram);
812 bool use_comp_pat = type_device && xe_device_has_flat_ccs(xe) &&
813 GRAPHICS_VER(xe) >= 20 && src_is_vram && !dst_is_vram;
814
815 /* Copying CCS between two different BOs is not supported yet. */
816 if (XE_WARN_ON(copy_ccs && src_bo != dst_bo))
817 return ERR_PTR(-EINVAL);
818
819 if (src_bo != dst_bo && XE_WARN_ON(xe_bo_size(src_bo) != xe_bo_size(dst_bo)))
820 return ERR_PTR(-EINVAL);
821
822 if (!src_is_vram)
823 xe_res_first_sg(xe_bo_sg(src_bo), 0, size, &src_it);
824 else
825 xe_res_first(src, 0, size, &src_it);
826 if (!dst_is_vram)
827 xe_res_first_sg(xe_bo_sg(dst_bo), 0, size, &dst_it);
828 else
829 xe_res_first(dst, 0, size, &dst_it);
830
831 if (copy_system_ccs)
832 xe_res_first_sg(xe_bo_sg(src_bo), xe_bo_ccs_pages_start(src_bo),
833 PAGE_ALIGN(xe_device_ccs_bytes(xe, size)),
834 &ccs_it);
835
836 while (size) {
837 u32 batch_size = 2; /* arb_clear() + MI_BATCH_BUFFER_END */
838 struct xe_sched_job *job;
839 struct xe_bb *bb;
840 u32 flush_flags = 0;
841 u32 update_idx;
842 u64 ccs_ofs, ccs_size;
843 u32 ccs_pt;
844 u32 pte_flags;
845
846 bool usm = xe->info.has_usm;
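/* Each level-0 PT slot covers LEVEL0_PAGE_TABLE_ENCODE_SIZE (2M) of VA, so this is the number of PT slots one full pass can consume per mapping. */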
847 u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE;
848
849 src_L0 = xe_migrate_res_sizes(m, &src_it);
850 dst_L0 = xe_migrate_res_sizes(m, &dst_it);
851
852 drm_dbg(&xe->drm, "Pass %u, sizes: %llu & %llu\n",
853 pass++, src_L0, dst_L0);
854
855 src_L0 = min(src_L0, dst_L0);
856
857 pte_flags = src_is_vram ? PTE_UPDATE_FLAG_IS_VRAM : 0;
858 pte_flags |= use_comp_pat ? PTE_UPDATE_FLAG_IS_COMP_PTE : 0;
859 batch_size += pte_update_size(m, pte_flags, src, &src_it, &src_L0,
860 &src_L0_ofs, &src_L0_pt, 0, 0,
861 avail_pts);
862 if (copy_only_ccs) {
863 dst_L0_ofs = src_L0_ofs;
864 } else {
865 pte_flags = dst_is_vram ? PTE_UPDATE_FLAG_IS_VRAM : 0;
866 batch_size += pte_update_size(m, pte_flags, dst,
867 &dst_it, &src_L0,
868 &dst_L0_ofs, &dst_L0_pt,
869 0, avail_pts, avail_pts);
870 }
871
872 if (copy_system_ccs) {
873 xe_assert(xe, type_device);
874 ccs_size = xe_device_ccs_bytes(xe, src_L0);
875 batch_size += pte_update_size(m, 0, NULL, &ccs_it, &ccs_size,
876 &ccs_ofs, &ccs_pt, 0,
877 2 * avail_pts,
878 avail_pts);
879 xe_assert(xe, IS_ALIGNED(ccs_it.start, PAGE_SIZE));
880 }
881
882 /* Add copy commands size here */
883 batch_size += ((copy_only_ccs) ? 0 : EMIT_COPY_DW) +
884 ((needs_ccs_emit ? EMIT_COPY_CCS_DW : 0));
885
886 bb = xe_bb_new(gt, batch_size, usm);
887 if (IS_ERR(bb)) {
888 err = PTR_ERR(bb);
889 goto err_sync;
890 }
891
892 if (src_is_vram && xe_migrate_allow_identity(src_L0, &src_it))
893 xe_res_next(&src_it, src_L0);
894 else
895 emit_pte(m, bb, src_L0_pt, src_is_vram, copy_system_ccs || use_comp_pat,
896 &src_it, src_L0, src);
897
898 if (dst_is_vram && xe_migrate_allow_identity(src_L0, &dst_it))
899 xe_res_next(&dst_it, src_L0);
900 else if (!copy_only_ccs)
901 emit_pte(m, bb, dst_L0_pt, dst_is_vram, copy_system_ccs,
902 &dst_it, src_L0, dst);
903
904 if (copy_system_ccs)
905 emit_pte(m, bb, ccs_pt, false, false, &ccs_it, ccs_size, src);
906
907 bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
908 update_idx = bb->len;
909
910 if (!copy_only_ccs)
911 emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, src_L0, XE_PAGE_SIZE);
912
913 if (needs_ccs_emit)
914 flush_flags = xe_migrate_ccs_copy(m, bb, src_L0_ofs,
915 IS_DGFX(xe) ? src_is_vram : src_is_pltt,
916 dst_L0_ofs,
917 IS_DGFX(xe) ? dst_is_vram : dst_is_pltt,
918 src_L0, ccs_ofs, copy_ccs);
919
920 job = xe_bb_create_migration_job(m->q, bb,
921 xe_migrate_batch_base(m, usm),
922 update_idx);
923 if (IS_ERR(job)) {
924 err = PTR_ERR(job);
925 goto err;
926 }
927
928 xe_sched_job_add_migrate_flush(job, flush_flags | MI_INVALIDATE_TLB);
929 if (!fence) {
930 err = xe_sched_job_add_deps(job, src_bo->ttm.base.resv,
931 DMA_RESV_USAGE_BOOKKEEP);
932 if (!err && src_bo->ttm.base.resv != dst_bo->ttm.base.resv)
933 err = xe_sched_job_add_deps(job, dst_bo->ttm.base.resv,
934 DMA_RESV_USAGE_BOOKKEEP);
935 if (err)
936 goto err_job;
937 }
938
939 mutex_lock(&m->job_mutex);
940 xe_sched_job_arm(job);
941 dma_fence_put(fence);
942 fence = dma_fence_get(&job->drm.s_fence->finished);
943 xe_sched_job_push(job);
944
945 dma_fence_put(m->fence);
946 m->fence = dma_fence_get(fence);
947
948 mutex_unlock(&m->job_mutex);
949
950 xe_bb_free(bb, fence);
951 size -= src_L0;
952 continue;
953
954 err_job:
955 xe_sched_job_put(job);
956 err:
957 xe_bb_free(bb, NULL);
958
959 err_sync:
960 /* Sync partial copy if any. FIXME: under job_mutex? */
961 if (fence) {
962 dma_fence_wait(fence, false);
963 dma_fence_put(fence);
964 }
965
966 return ERR_PTR(err);
967 }
968
969 return fence;
970 }
971
972 /**
973 * xe_migrate_lrc() - Get the LRC from migrate context.
974 * @migrate: Migrate context.
975 *
976 * Return: Pointer to LRC on success, error on failure
977 */
978 struct xe_lrc *xe_migrate_lrc(struct xe_migrate *migrate)
979 {
980 return migrate->q->lrc[0];
981 }
982
983 static int emit_flush_invalidate(struct xe_exec_queue *q, u32 *dw, int i,
984 u32 flags)
985 {
986 struct xe_lrc *lrc = xe_exec_queue_lrc(q);
987 dw[i++] = MI_FLUSH_DW | MI_INVALIDATE_TLB | MI_FLUSH_DW_OP_STOREDW |
988 MI_FLUSH_IMM_DW | flags;
989 dw[i++] = lower_32_bits(xe_lrc_start_seqno_ggtt_addr(lrc)) |
990 MI_FLUSH_DW_USE_GTT;
991 dw[i++] = upper_32_bits(xe_lrc_start_seqno_ggtt_addr(lrc));
992 dw[i++] = MI_NOOP;
993 dw[i++] = MI_NOOP;
994
995 return i;
996 }
997
998 /**
999 * xe_migrate_ccs_rw_copy() - Copy content of TTM resources.
1000 * @tile: Tile whose migration context is to be used.
1001 * @q: Exec queue to be used along with the migration context.
1002 * @src_bo: The buffer object whose CCS metadata is copied.
1003 * @read_write: Whether to build BB commands for the CCS read or write context.
1004 *
1005 * Creates batch buffer instructions to copy CCS metadata from CCS pool to
1006 * memory and vice versa.
1007 *
1008 * This function should only be called for IGPU.
1009 *
1010 * Return: 0 if successful, negative error code on failure.
1011 */
1012 int xe_migrate_ccs_rw_copy(struct xe_tile *tile, struct xe_exec_queue *q,
1013 struct xe_bo *src_bo,
1014 enum xe_sriov_vf_ccs_rw_ctxs read_write)
1015
1016 {
1017 bool src_is_pltt = read_write == XE_SRIOV_VF_CCS_READ_CTX;
1018 bool dst_is_pltt = read_write == XE_SRIOV_VF_CCS_WRITE_CTX;
1019 struct ttm_resource *src = src_bo->ttm.resource;
1020 struct xe_migrate *m = tile->migrate;
1021 struct xe_gt *gt = tile->primary_gt;
1022 u32 batch_size, batch_size_allocated;
1023 struct xe_device *xe = gt_to_xe(gt);
1024 struct xe_res_cursor src_it, ccs_it;
1025 u64 size = xe_bo_size(src_bo);
1026 struct xe_bb *bb = NULL;
1027 u64 src_L0, src_L0_ofs;
1028 u32 src_L0_pt;
1029 int err;
1030
1031 xe_res_first_sg(xe_bo_sg(src_bo), 0, size, &src_it);
1032
1033 xe_res_first_sg(xe_bo_sg(src_bo), xe_bo_ccs_pages_start(src_bo),
1034 PAGE_ALIGN(xe_device_ccs_bytes(xe, size)),
1035 &ccs_it);
1036
1037 /* Calculate Batch buffer size */
1038 batch_size = 0;
1039 while (size) {
1040 batch_size += 10; /* Flush + ggtt addr + 2 NOP */
1041 u64 ccs_ofs, ccs_size;
1042 u32 ccs_pt;
1043
1044 u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE;
1045
1046 src_L0 = min_t(u64, max_mem_transfer_per_pass(xe), size);
1047
1048 batch_size += pte_update_size(m, false, src, &src_it, &src_L0,
1049 &src_L0_ofs, &src_L0_pt, 0, 0,
1050 avail_pts);
1051
1052 ccs_size = xe_device_ccs_bytes(xe, src_L0);
1053 batch_size += pte_update_size(m, 0, NULL, &ccs_it, &ccs_size, &ccs_ofs,
1054 &ccs_pt, 0, avail_pts, avail_pts);
1055 xe_assert(xe, IS_ALIGNED(ccs_it.start, PAGE_SIZE));
1056
1057 /* Add copy commands size here */
1058 batch_size += EMIT_COPY_CCS_DW;
1059
1060 size -= src_L0;
1061 }
1062
1063 bb = xe_bb_ccs_new(gt, batch_size, read_write);
1064 if (IS_ERR(bb)) {
1065 drm_err(&xe->drm, "BB allocation failed.\n");
1066 err = PTR_ERR(bb);
1067 goto err_ret;
1068 }
1069
1070 batch_size_allocated = batch_size;
1071 size = xe_bo_size(src_bo);
1072 batch_size = 0;
1073
1074 /*
1075 * Emit PTE and copy commands here.
1076 * The CCS copy command can only support limited size. If the size to be
1077 * copied is more than the limit, divide copy into chunks. So, calculate
1078 * sizes here again before copy command is emitted.
1079 */
1080 while (size) {
1081 batch_size += 10; /* Flush + ggtt addr + 2 NOP */
1082 u32 flush_flags = 0;
1083 u64 ccs_ofs, ccs_size;
1084 u32 ccs_pt;
1085
1086 u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE;
1087
1088 src_L0 = xe_migrate_res_sizes(m, &src_it);
1089
1090 batch_size += pte_update_size(m, false, src, &src_it, &src_L0,
1091 &src_L0_ofs, &src_L0_pt, 0, 0,
1092 avail_pts);
1093
1094 ccs_size = xe_device_ccs_bytes(xe, src_L0);
1095 batch_size += pte_update_size(m, 0, NULL, &ccs_it, &ccs_size, &ccs_ofs,
1096 &ccs_pt, 0, avail_pts, avail_pts);
1097 xe_assert(xe, IS_ALIGNED(ccs_it.start, PAGE_SIZE));
1098 batch_size += EMIT_COPY_CCS_DW;
1099
1100 emit_pte(m, bb, src_L0_pt, false, true, &src_it, src_L0, src);
1101
1102 emit_pte(m, bb, ccs_pt, false, false, &ccs_it, ccs_size, src);
1103
1104 bb->len = emit_flush_invalidate(q, bb->cs, bb->len, flush_flags);
1105 flush_flags = xe_migrate_ccs_copy(m, bb, src_L0_ofs, src_is_pltt,
1106 src_L0_ofs, dst_is_pltt,
1107 src_L0, ccs_ofs, true);
1108 bb->len = emit_flush_invalidate(q, bb->cs, bb->len, flush_flags);
1109
1110 size -= src_L0;
1111 }
1112
1113 xe_assert(xe, (batch_size_allocated == bb->len));
1114 src_bo->bb_ccs[read_write] = bb;
1115
1116 return 0;
1117
1118 err_ret:
1119 return err;
1120 }
1121
1122 /**
1123 * xe_migrate_exec_queue() - Get the execution queue from migrate context.
1124 * @migrate: Migrate context.
1125 *
1126 * Return: Pointer to execution queue on success, error on failure
1127 */
1128 struct xe_exec_queue *xe_migrate_exec_queue(struct xe_migrate *migrate)
1129 {
1130 return migrate->q;
1131 }
1132
1133 static void emit_clear_link_copy(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs,
1134 u32 size, u32 pitch)
1135 {
1136 struct xe_device *xe = gt_to_xe(gt);
1137 u32 *cs = bb->cs + bb->len;
1138 u32 len = PVC_MEM_SET_CMD_LEN_DW;
1139
1140 *cs++ = PVC_MEM_SET_CMD | PVC_MEM_SET_MATRIX | (len - 2);
1141 *cs++ = pitch - 1;
1142 *cs++ = (size / pitch) - 1;
1143 *cs++ = pitch - 1;
1144 *cs++ = lower_32_bits(src_ofs);
1145 *cs++ = upper_32_bits(src_ofs);
1146 if (GRAPHICS_VERx100(xe) >= 2000)
1147 *cs++ = FIELD_PREP(XE2_MEM_SET_MOCS_INDEX_MASK, gt->mocs.uc_index);
1148 else
1149 *cs++ = FIELD_PREP(PVC_MEM_SET_MOCS_INDEX_MASK, gt->mocs.uc_index);
1150
1151 xe_gt_assert(gt, cs - bb->cs == len + bb->len);
1152
1153 bb->len += len;
1154 }
1155
1156 static void emit_clear_main_copy(struct xe_gt *gt, struct xe_bb *bb,
1157 u64 src_ofs, u32 size, u32 pitch, bool is_vram)
1158 {
1159 struct xe_device *xe = gt_to_xe(gt);
1160 u32 *cs = bb->cs + bb->len;
1161 u32 len = XY_FAST_COLOR_BLT_DW;
1162
1163 if (GRAPHICS_VERx100(xe) < 1250)
1164 len = 11;
1165
1166 *cs++ = XY_FAST_COLOR_BLT_CMD | XY_FAST_COLOR_BLT_DEPTH_32 |
1167 (len - 2);
1168 if (GRAPHICS_VERx100(xe) >= 2000)
1169 *cs++ = FIELD_PREP(XE2_XY_FAST_COLOR_BLT_MOCS_INDEX_MASK, gt->mocs.uc_index) |
1170 (pitch - 1);
1171 else
1172 *cs++ = FIELD_PREP(XY_FAST_COLOR_BLT_MOCS_MASK, gt->mocs.uc_index) |
1173 (pitch - 1);
1174 *cs++ = 0;
1175 *cs++ = (size / pitch) << 16 | pitch / 4;
1176 *cs++ = lower_32_bits(src_ofs);
1177 *cs++ = upper_32_bits(src_ofs);
1178 *cs++ = (is_vram ? 0x0 : 0x1) << XY_FAST_COLOR_BLT_MEM_TYPE_SHIFT;
1179 *cs++ = 0;
1180 *cs++ = 0;
1181 *cs++ = 0;
1182 *cs++ = 0;
1183
1184 if (len > 11) {
1185 *cs++ = 0;
1186 *cs++ = 0;
1187 *cs++ = 0;
1188 *cs++ = 0;
1189 *cs++ = 0;
1190 }
1191
1192 xe_gt_assert(gt, cs - bb->cs == len + bb->len);
1193
1194 bb->len += len;
1195 }
1196
1197 static bool has_service_copy_support(struct xe_gt *gt)
1198 {
1199 /*
1200 * What we care about is whether the architecture was designed with
1201 * service copy functionality (specifically the new MEM_SET / MEM_COPY
1202 * instructions) so check the architectural engine list rather than the
1203 * actual list since these instructions are usable on BCS0 even if
1204 * all of the actual service copy engines (BCS1-BCS8) have been fused
1205 * off.
1206 */
1207 return gt->info.engine_mask & GENMASK(XE_HW_ENGINE_BCS8,
1208 XE_HW_ENGINE_BCS1);
1209 }
1210
1211 static u32 emit_clear_cmd_len(struct xe_gt *gt)
1212 {
1213 if (has_service_copy_support(gt))
1214 return PVC_MEM_SET_CMD_LEN_DW;
1215 else
1216 return XY_FAST_COLOR_BLT_DW;
1217 }
1218
1219 static void emit_clear(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs,
1220 u32 size, u32 pitch, bool is_vram)
1221 {
1222 if (has_service_copy_support(gt))
1223 emit_clear_link_copy(gt, bb, src_ofs, size, pitch);
1224 else
1225 emit_clear_main_copy(gt, bb, src_ofs, size, pitch,
1226 is_vram);
1227 }
1228
1229 /**
1230 * xe_migrate_clear() - Clear content of TTM resources.
1231 * @m: The migration context.
1232 * @bo: The buffer object @dst is currently bound to.
1233 * @dst: The dst TTM resource to be cleared.
1234 * @clear_flags: flags to specify which data to clear: CCS, BO, or both.
1235 *
1236 * Clear the contents of @dst to zero when XE_MIGRATE_CLEAR_FLAG_BO_DATA is set.
1237 * On flat CCS devices, the CCS metadata is cleared to zero with XE_MIGRATE_CLEAR_FLAG_CCS_DATA.
1238 * Set XE_MIGRATE_CLEAR_FLAG_FULL to clear bo as well as CCS metadata.
1239 * TODO: Eliminate the @bo argument.
1240 *
1241 * Return: Pointer to a dma_fence representing the last clear batch, or
1242 * an error pointer on failure. If there is a failure, any clear operation
1243 * started by the function call has been synced.
1244 */
1245 struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
1246 struct xe_bo *bo,
1247 struct ttm_resource *dst,
1248 u32 clear_flags)
1249 {
1250 bool clear_vram = mem_type_is_vram(dst->mem_type);
1251 bool clear_bo_data = XE_MIGRATE_CLEAR_FLAG_BO_DATA & clear_flags;
1252 bool clear_ccs = XE_MIGRATE_CLEAR_FLAG_CCS_DATA & clear_flags;
1253 struct xe_gt *gt = m->tile->primary_gt;
1254 struct xe_device *xe = gt_to_xe(gt);
1255 bool clear_only_system_ccs = false;
1256 struct dma_fence *fence = NULL;
1257 u64 size = xe_bo_size(bo);
1258 struct xe_res_cursor src_it;
1259 struct ttm_resource *src = dst;
1260 int err;
1261
1262 if (WARN_ON(!clear_bo_data && !clear_ccs))
1263 return NULL;
1264
1265 if (!clear_bo_data && clear_ccs && !IS_DGFX(xe))
1266 clear_only_system_ccs = true;
1267
1268 if (!clear_vram)
1269 xe_res_first_sg(xe_bo_sg(bo), 0, xe_bo_size(bo), &src_it);
1270 else
1271 xe_res_first(src, 0, xe_bo_size(bo), &src_it);
1272
1273 while (size) {
1274 u64 clear_L0_ofs;
1275 u32 clear_L0_pt;
1276 u32 flush_flags = 0;
1277 u64 clear_L0;
1278 struct xe_sched_job *job;
1279 struct xe_bb *bb;
1280 u32 batch_size, update_idx;
1281 u32 pte_flags;
1282
1283 bool usm = xe->info.has_usm;
1284 u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE;
1285
1286 clear_L0 = xe_migrate_res_sizes(m, &src_it);
1287
1288 /* Calculate final sizes and batch size.. */
1289 pte_flags = clear_vram ? PTE_UPDATE_FLAG_IS_VRAM : 0;
1290 batch_size = 2 +
1291 pte_update_size(m, pte_flags, src, &src_it,
1292 &clear_L0, &clear_L0_ofs, &clear_L0_pt,
1293 clear_bo_data ? emit_clear_cmd_len(gt) : 0, 0,
1294 avail_pts);
1295
1296 if (xe_migrate_needs_ccs_emit(xe))
1297 batch_size += EMIT_COPY_CCS_DW;
1298
1299 /* Clear commands */
1300
1301 if (WARN_ON_ONCE(!clear_L0))
1302 break;
1303
1304 bb = xe_bb_new(gt, batch_size, usm);
1305 if (IS_ERR(bb)) {
1306 err = PTR_ERR(bb);
1307 goto err_sync;
1308 }
1309
1310 size -= clear_L0;
1311 /* Preemption is enabled again by the ring ops. */
1312 if (clear_vram && xe_migrate_allow_identity(clear_L0, &src_it)) {
1313 xe_res_next(&src_it, clear_L0);
1314 } else {
1315 emit_pte(m, bb, clear_L0_pt, clear_vram,
1316 clear_only_system_ccs, &src_it, clear_L0, dst);
1317 flush_flags |= MI_INVALIDATE_TLB;
1318 }
1319
1320 bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
1321 update_idx = bb->len;
1322
1323 if (clear_bo_data)
1324 emit_clear(gt, bb, clear_L0_ofs, clear_L0, XE_PAGE_SIZE, clear_vram);
1325
1326 if (xe_migrate_needs_ccs_emit(xe)) {
1327 emit_copy_ccs(gt, bb, clear_L0_ofs, true,
1328 m->cleared_mem_ofs, false, clear_L0);
1329 flush_flags |= MI_FLUSH_DW_CCS;
1330 }
1331
1332 job = xe_bb_create_migration_job(m->q, bb,
1333 xe_migrate_batch_base(m, usm),
1334 update_idx);
1335 if (IS_ERR(job)) {
1336 err = PTR_ERR(job);
1337 goto err;
1338 }
1339
1340 xe_sched_job_add_migrate_flush(job, flush_flags);
1341 if (!fence) {
1342 /*
1343 * There can't be anything userspace related at this
1344 * point, so we just need to respect any potential move
1345 * fences, which are always tracked as
1346 * DMA_RESV_USAGE_KERNEL.
1347 */
1348 err = xe_sched_job_add_deps(job, bo->ttm.base.resv,
1349 DMA_RESV_USAGE_KERNEL);
1350 if (err)
1351 goto err_job;
1352 }
1353
1354 mutex_lock(&m->job_mutex);
1355 xe_sched_job_arm(job);
1356 dma_fence_put(fence);
1357 fence = dma_fence_get(&job->drm.s_fence->finished);
1358 xe_sched_job_push(job);
1359
1360 dma_fence_put(m->fence);
1361 m->fence = dma_fence_get(fence);
1362
1363 mutex_unlock(&m->job_mutex);
1364
1365 xe_bb_free(bb, fence);
1366 continue;
1367
1368 err_job:
1369 xe_sched_job_put(job);
1370 err:
1371 xe_bb_free(bb, NULL);
1372 err_sync:
1373 /* Sync partial copies if any. FIXME: job_mutex? */
1374 if (fence) {
1375 dma_fence_wait(fence, false);
1376 dma_fence_put(fence);
1377 }
1378
1379 return ERR_PTR(err);
1380 }
1381
1382 if (clear_ccs)
1383 bo->ccs_cleared = true;
1384
1385 return fence;
1386 }
1387
1388 static void write_pgtable(struct xe_tile *tile, struct xe_bb *bb, u64 ppgtt_ofs,
1389 const struct xe_vm_pgtable_update_op *pt_op,
1390 const struct xe_vm_pgtable_update *update,
1391 struct xe_migrate_pt_update *pt_update)
1392 {
1393 const struct xe_migrate_pt_update_ops *ops = pt_update->ops;
1394 u32 chunk;
1395 u32 ofs = update->ofs, size = update->qwords;
1396
1397 /*
1398 * If we have 512 entries (max), we would populate it ourselves,
1399 * and update the PDE above it to the new pointer.
1400 * The only time this can happen is if we have to update the top
1401 * PDE. This requires a BO that is almost vm->size big.
1402 *
1403 * This shouldn't be possible in practice, but might change when 16K
1404 * pages are used. Hence the assert.
1405 */
1406 xe_tile_assert(tile, update->qwords < MAX_NUM_PTE);
1407 if (!ppgtt_ofs)
1408 ppgtt_ofs = xe_migrate_vram_ofs(tile_to_xe(tile),
1409 xe_bo_addr(update->pt_bo, 0,
1410 XE_PAGE_SIZE), false);
1411
1412 do {
1413 u64 addr = ppgtt_ofs + ofs * 8;
1414
1415 chunk = min(size, MAX_PTE_PER_SDI);
1416
1417 /* Ensure populatefn can do memset64 by aligning bb->cs */
1418 if (!(bb->len & 1))
1419 bb->cs[bb->len++] = MI_NOOP;
1420
1421 bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk);
1422 bb->cs[bb->len++] = lower_32_bits(addr);
1423 bb->cs[bb->len++] = upper_32_bits(addr);
1424 if (pt_op->bind)
1425 ops->populate(pt_update, tile, NULL, bb->cs + bb->len,
1426 ofs, chunk, update);
1427 else
1428 ops->clear(pt_update, tile, NULL, bb->cs + bb->len,
1429 ofs, chunk, update);
1430
1431 bb->len += chunk * 2;
1432 ofs += chunk;
1433 size -= chunk;
1434 } while (size);
1435 }
1436
1437 struct xe_vm *xe_migrate_get_vm(struct xe_migrate *m)
1438 {
1439 return xe_vm_get(m->q->vm);
1440 }
1441
1442 #if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST)
1443 struct migrate_test_params {
1444 struct xe_test_priv base;
1445 bool force_gpu;
1446 };
1447
1448 #define to_migrate_test_params(_priv) \
1449 container_of(_priv, struct migrate_test_params, base)
1450 #endif
1451
1452 static struct dma_fence *
1453 xe_migrate_update_pgtables_cpu(struct xe_migrate *m,
1454 struct xe_migrate_pt_update *pt_update)
1455 {
1456 XE_TEST_DECLARE(struct migrate_test_params *test =
1457 to_migrate_test_params
1458 (xe_cur_kunit_priv(XE_TEST_LIVE_MIGRATE));)
1459 const struct xe_migrate_pt_update_ops *ops = pt_update->ops;
1460 struct xe_vm *vm = pt_update->vops->vm;
1461 struct xe_vm_pgtable_update_ops *pt_update_ops =
1462 &pt_update->vops->pt_update_ops[pt_update->tile_id];
1463 int err;
1464 u32 i, j;
1465
1466 if (XE_TEST_ONLY(test && test->force_gpu))
1467 return ERR_PTR(-ETIME);
1468
1469 if (ops->pre_commit) {
1470 pt_update->job = NULL;
1471 err = ops->pre_commit(pt_update);
1472 if (err)
1473 return ERR_PTR(err);
1474 }
1475
1476 for (i = 0; i < pt_update_ops->num_ops; ++i) {
1477 const struct xe_vm_pgtable_update_op *pt_op =
1478 &pt_update_ops->ops[i];
1479
1480 for (j = 0; j < pt_op->num_entries; j++) {
1481 const struct xe_vm_pgtable_update *update =
1482 &pt_op->entries[j];
1483
1484 if (pt_op->bind)
1485 ops->populate(pt_update, m->tile,
1486 &update->pt_bo->vmap, NULL,
1487 update->ofs, update->qwords,
1488 update);
1489 else
1490 ops->clear(pt_update, m->tile,
1491 &update->pt_bo->vmap, NULL,
1492 update->ofs, update->qwords, update);
1493 }
1494 }
1495
1496 trace_xe_vm_cpu_bind(vm);
1497 xe_device_wmb(vm->xe);
1498
1499 return dma_fence_get_stub();
1500 }
1501
1502 static struct dma_fence *
1503 __xe_migrate_update_pgtables(struct xe_migrate *m,
1504 struct xe_migrate_pt_update *pt_update,
1505 struct xe_vm_pgtable_update_ops *pt_update_ops)
1506 {
1507 const struct xe_migrate_pt_update_ops *ops = pt_update->ops;
1508 struct xe_tile *tile = m->tile;
1509 struct xe_gt *gt = tile->primary_gt;
1510 struct xe_device *xe = tile_to_xe(tile);
1511 struct xe_sched_job *job;
1512 struct dma_fence *fence;
1513 struct drm_suballoc *sa_bo = NULL;
1514 struct xe_bb *bb;
1515 u32 i, j, batch_size = 0, ppgtt_ofs, update_idx, page_ofs = 0;
1516 u32 num_updates = 0, current_update = 0;
1517 u64 addr;
1518 int err = 0;
1519 bool is_migrate = pt_update_ops->q == m->q;
1520 bool usm = is_migrate && xe->info.has_usm;
1521
1522 for (i = 0; i < pt_update_ops->num_ops; ++i) {
1523 struct xe_vm_pgtable_update_op *pt_op = &pt_update_ops->ops[i];
1524 struct xe_vm_pgtable_update *updates = pt_op->entries;
1525
1526 num_updates += pt_op->num_entries;
1527 for (j = 0; j < pt_op->num_entries; ++j) {
1528 u32 num_cmds = DIV_ROUND_UP(updates[j].qwords,
1529 MAX_PTE_PER_SDI);
1530
1531 /* align noop + MI_STORE_DATA_IMM cmd prefix */
1532 batch_size += 4 * num_cmds + updates[j].qwords * 2;
1533 }
1534 }
1535
1536 /* fixed + PTE entries */
1537 if (IS_DGFX(xe))
1538 batch_size += 2;
1539 else
1540 batch_size += 6 * (num_updates / MAX_PTE_PER_SDI + 1) +
1541 num_updates * 2;
1542
1543 bb = xe_bb_new(gt, batch_size, usm);
1544 if (IS_ERR(bb))
1545 return ERR_CAST(bb);
1546
1547 /* For sysmem PTE's, need to map them in our hole.. */
1548 if (!IS_DGFX(xe)) {
1549 u16 pat_index = xe->pat.idx[XE_CACHE_WB];
1550 u32 ptes, ofs;
1551
1552 ppgtt_ofs = NUM_KERNEL_PDE - 1;
1553 if (!is_migrate) {
1554 u32 num_units = DIV_ROUND_UP(num_updates,
1555 NUM_VMUSA_WRITES_PER_UNIT);
1556
1557 if (num_units > m->vm_update_sa.size) {
1558 err = -ENOBUFS;
1559 goto err_bb;
1560 }
1561 sa_bo = drm_suballoc_new(&m->vm_update_sa, num_units,
1562 GFP_KERNEL, true, 0);
1563 if (IS_ERR(sa_bo)) {
1564 err = PTR_ERR(sa_bo);
1565 goto err_bb;
1566 }
1567
1568 ppgtt_ofs = NUM_KERNEL_PDE +
1569 (drm_suballoc_soffset(sa_bo) /
1570 NUM_VMUSA_UNIT_PER_PAGE);
1571 page_ofs = (drm_suballoc_soffset(sa_bo) %
1572 NUM_VMUSA_UNIT_PER_PAGE) *
1573 VM_SA_UPDATE_UNIT_SIZE;
1574 }
1575
1576 /* Map our PT's to gtt */
1577 i = 0;
1578 j = 0;
1579 ptes = num_updates;
1580 ofs = ppgtt_ofs * XE_PAGE_SIZE + page_ofs;
1581 while (ptes) {
1582 u32 chunk = min(MAX_PTE_PER_SDI, ptes);
1583 u32 idx = 0;
1584
1585 bb->cs[bb->len++] = MI_STORE_DATA_IMM |
1586 MI_SDI_NUM_QW(chunk);
1587 bb->cs[bb->len++] = ofs;
1588 bb->cs[bb->len++] = 0; /* upper_32_bits */
1589
1590 for (; i < pt_update_ops->num_ops; ++i) {
1591 struct xe_vm_pgtable_update_op *pt_op =
1592 &pt_update_ops->ops[i];
1593 struct xe_vm_pgtable_update *updates = pt_op->entries;
1594
1595 for (; j < pt_op->num_entries; ++j, ++current_update, ++idx) {
1596 struct xe_vm *vm = pt_update->vops->vm;
1597 struct xe_bo *pt_bo = updates[j].pt_bo;
1598
1599 if (idx == chunk)
1600 goto next_cmd;
1601
1602 xe_tile_assert(tile, xe_bo_size(pt_bo) == SZ_4K);
1603
1604 /* Map a PT at most once */
1605 if (pt_bo->update_index < 0)
1606 pt_bo->update_index = current_update;
1607
1608 addr = vm->pt_ops->pte_encode_bo(pt_bo, 0,
1609 pat_index, 0);
1610 bb->cs[bb->len++] = lower_32_bits(addr);
1611 bb->cs[bb->len++] = upper_32_bits(addr);
1612 }
1613
1614 j = 0;
1615 }
1616
1617 next_cmd:
1618 ptes -= chunk;
1619 ofs += chunk * sizeof(u64);
1620 }
1621
1622 bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
1623 update_idx = bb->len;
1624
1625 addr = xe_migrate_vm_addr(ppgtt_ofs, 0) +
1626 (page_ofs / sizeof(u64)) * XE_PAGE_SIZE;
1627 for (i = 0; i < pt_update_ops->num_ops; ++i) {
1628 struct xe_vm_pgtable_update_op *pt_op =
1629 &pt_update_ops->ops[i];
1630 struct xe_vm_pgtable_update *updates = pt_op->entries;
1631
1632 for (j = 0; j < pt_op->num_entries; ++j) {
1633 struct xe_bo *pt_bo = updates[j].pt_bo;
1634
1635 write_pgtable(tile, bb, addr +
1636 pt_bo->update_index * XE_PAGE_SIZE,
1637 pt_op, &updates[j], pt_update);
1638 }
1639 }
1640 } else {
1641 /* phys pages, no preamble required */
1642 bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
1643 update_idx = bb->len;
1644
1645 for (i = 0; i < pt_update_ops->num_ops; ++i) {
1646 struct xe_vm_pgtable_update_op *pt_op =
1647 &pt_update_ops->ops[i];
1648 struct xe_vm_pgtable_update *updates = pt_op->entries;
1649
1650 for (j = 0; j < pt_op->num_entries; ++j)
1651 write_pgtable(tile, bb, 0, pt_op, &updates[j],
1652 pt_update);
1653 }
1654 }
1655
1656 job = xe_bb_create_migration_job(pt_update_ops->q, bb,
1657 xe_migrate_batch_base(m, usm),
1658 update_idx);
1659 if (IS_ERR(job)) {
1660 err = PTR_ERR(job);
1661 goto err_sa;
1662 }
1663
1664 xe_sched_job_add_migrate_flush(job, MI_INVALIDATE_TLB);
1665
1666 if (ops->pre_commit) {
1667 pt_update->job = job;
1668 err = ops->pre_commit(pt_update);
1669 if (err)
1670 goto err_job;
1671 }
1672 if (is_migrate)
1673 mutex_lock(&m->job_mutex);
1674
1675 xe_sched_job_arm(job);
1676 fence = dma_fence_get(&job->drm.s_fence->finished);
1677 xe_sched_job_push(job);
1678
1679 if (is_migrate)
1680 mutex_unlock(&m->job_mutex);
1681
1682 xe_bb_free(bb, fence);
1683 drm_suballoc_free(sa_bo, fence);
1684
1685 return fence;
1686
1687 err_job:
1688 xe_sched_job_put(job);
1689 err_sa:
1690 drm_suballoc_free(sa_bo, NULL);
1691 err_bb:
1692 xe_bb_free(bb, NULL);
1693 return ERR_PTR(err);
1694 }
1695
1696 /**
1697 * xe_migrate_update_pgtables() - Pipelined page-table update
1698 * @m: The migrate context.
1699 * @pt_update: PT update arguments
1700 *
1701 * Perform a pipelined page-table update. The update descriptors are typically
1702 * built under the same lock critical section as a call to this function. If
1703 * using the default engine for the updates, they will be performed in the
1704 * order they grab the job_mutex. If different engines are used, external
1705 * synchronization is needed for overlapping updates to maintain page-table
1706 * consistency. Note that the meaning of "overlapping" is that the updates
1707 * touch the same page-table, which might be a higher-level page-directory.
1708 * If no pipelining is needed, then updates may be performed by the cpu.
1709 *
1710 * Return: A dma_fence that, when signaled, indicates the update completion.
1711 */
1712 struct dma_fence *
1713 xe_migrate_update_pgtables(struct xe_migrate *m,
1714 struct xe_migrate_pt_update *pt_update)
1715
1716 {
1717 struct xe_vm_pgtable_update_ops *pt_update_ops =
1718 &pt_update->vops->pt_update_ops[pt_update->tile_id];
1719 struct dma_fence *fence;
1720
1721 fence = xe_migrate_update_pgtables_cpu(m, pt_update);
1722
1723 /* -ETIME indicates a job is needed, anything else is legit error */
1724 if (!IS_ERR(fence) || PTR_ERR(fence) != -ETIME)
1725 return fence;
1726
1727 return __xe_migrate_update_pgtables(m, pt_update, pt_update_ops);
1728 }
1729
1730 /**
1731 * xe_migrate_wait() - Complete all operations using the xe_migrate context
1732 * @m: Migrate context to wait for.
1733 *
1734 * Waits until the GPU no longer uses the migrate context's default engine
1735 * or its page-table objects. FIXME: What about separate page-table update
1736 * engines?
1737 */
1738 void xe_migrate_wait(struct xe_migrate *m)
1739 {
1740 if (m->fence)
1741 dma_fence_wait(m->fence, false);
1742 }
1743
1744 static u32 pte_update_cmd_size(u64 size)
1745 {
1746 u32 num_dword;
1747 u64 entries = DIV_U64_ROUND_UP(size, XE_PAGE_SIZE);
1748
1749 XE_WARN_ON(size > MAX_PREEMPTDISABLE_TRANSFER);
1750
1751 /*
1752 	 * The MI_STORE_DATA_IMM command is used to update the page table. Each
1753 	 * instruction can update at most MAX_PTE_PER_SDI PTE entries. To
1754 	 * update n (n <= MAX_PTE_PER_SDI) PTE entries, we need:
1755 	 *
1756 	 * - 1 dword for the MI_STORE_DATA_IMM command header (opcode etc.)
1757 	 * - 2 dwords for the page table's physical location
1758 	 * - 2*n dwords for the PTE values to fill (each PTE entry is 2 dwords)
1759 */
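	/*
	 * Worked example (illustrative, assuming XE_PAGE_SIZE is 4 KiB): a
	 * 2 MiB update covers 512 PTEs.  With MAX_PTE_PER_SDI == 0x1FE (510),
	 * that needs DIV_U64_ROUND_UP(512, 510) == 2 MI_STORE_DATA_IMM
	 * commands, i.e. num_dword = (1 + 2) * 2 + 512 * 2 = 1030 dwords.
	 */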
1760 num_dword = (1 + 2) * DIV_U64_ROUND_UP(entries, MAX_PTE_PER_SDI);
1761 num_dword += entries * 2;
1762
1763 return num_dword;
1764 }
1765
1766 static void build_pt_update_batch_sram(struct xe_migrate *m,
1767 struct xe_bb *bb, u32 pt_offset,
1768 struct drm_pagemap_addr *sram_addr,
1769 u32 size)
1770 {
1771 u16 pat_index = tile_to_xe(m->tile)->pat.idx[XE_CACHE_WB];
1772 u32 ptes;
1773 int i = 0;
1774
1775 ptes = DIV_ROUND_UP(size, XE_PAGE_SIZE);
1776 while (ptes) {
1777 u32 chunk = min(MAX_PTE_PER_SDI, ptes);
1778
1779 bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk);
1780 bb->cs[bb->len++] = pt_offset;
1781 bb->cs[bb->len++] = 0;
1782
1783 pt_offset += chunk * 8;
1784 ptes -= chunk;
1785
1786 while (chunk--) {
1787 u64 addr = sram_addr[i].addr & PAGE_MASK;
1788
1789 xe_tile_assert(m->tile, sram_addr[i].proto ==
1790 DRM_INTERCONNECT_SYSTEM);
1791 xe_tile_assert(m->tile, addr);
1792 addr = m->q->vm->pt_ops->pte_encode_addr(m->tile->xe,
1793 addr, pat_index,
1794 0, false, 0);
1795 bb->cs[bb->len++] = lower_32_bits(addr);
1796 bb->cs[bb->len++] = upper_32_bits(addr);
1797
1798 i++;
1799 }
1800 }
1801 }
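/*
 * For reference, each MI_STORE_DATA_IMM chunk emitted above has the following
 * dword layout (a description of the code above, not an extra command):
 *
 *	dw0:   MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk)
 *	dw1:   pt_offset (low address dword of the PT slot being written)
 *	dw2:   0 (high address dword)
 *	dw3..: chunk qword-sized PTEs, each encoded via pte_encode_addr()
 */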
1802
1803 enum xe_migrate_copy_dir {
1804 XE_MIGRATE_COPY_TO_VRAM,
1805 XE_MIGRATE_COPY_TO_SRAM,
1806 };
1807
1808 #define XE_CACHELINE_BYTES 64ull
1809 #define XE_CACHELINE_MASK (XE_CACHELINE_BYTES - 1)
1810
1811 static struct dma_fence *xe_migrate_vram(struct xe_migrate *m,
1812 unsigned long len,
1813 unsigned long sram_offset,
1814 struct drm_pagemap_addr *sram_addr,
1815 u64 vram_addr,
1816 const enum xe_migrate_copy_dir dir)
1817 {
1818 struct xe_gt *gt = m->tile->primary_gt;
1819 struct xe_device *xe = gt_to_xe(gt);
1820 bool use_usm_batch = xe->info.has_usm;
1821 struct dma_fence *fence = NULL;
1822 u32 batch_size = 2;
1823 u64 src_L0_ofs, dst_L0_ofs;
1824 struct xe_sched_job *job;
1825 struct xe_bb *bb;
1826 u32 update_idx, pt_slot = 0;
1827 unsigned long npages = DIV_ROUND_UP(len + sram_offset, PAGE_SIZE);
1828 unsigned int pitch = len >= PAGE_SIZE && !(len & ~PAGE_MASK) ?
1829 PAGE_SIZE : 4;
1830 int err;
1831 unsigned long i, j;
1832
1833 if (drm_WARN_ON(&xe->drm, (len & XE_CACHELINE_MASK) ||
1834 (sram_offset | vram_addr) & XE_CACHELINE_MASK))
1835 return ERR_PTR(-EOPNOTSUPP);
1836
1837 xe_assert(xe, npages * PAGE_SIZE <= MAX_PREEMPTDISABLE_TRANSFER);
1838
1839 batch_size += pte_update_cmd_size(len);
1840 batch_size += EMIT_COPY_DW;
1841
1842 bb = xe_bb_new(gt, batch_size, use_usm_batch);
1843 if (IS_ERR(bb)) {
1844 err = PTR_ERR(bb);
1845 return ERR_PTR(err);
1846 }
1847
1848 /*
1849 * If the order of a struct drm_pagemap_addr entry is greater than 0,
1850 	 * the entry is populated by the GPU pagemap, but the subsequent entries
1851 	 * within the range of that order are not populated.
1852 * build_pt_update_batch_sram() expects a fully populated array of
1853 * struct drm_pagemap_addr. Ensure this is the case even with higher
1854 * orders.
1855 */
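	/*
	 * Illustrative example: for an entry whose order covers four pages,
	 * with only sram_addr[i].addr populated, the loop below fills
	 * sram_addr[i + 1], [i + 2] and [i + 3] with addr + PAGE_SIZE,
	 * addr + 2 * PAGE_SIZE and addr + 3 * PAGE_SIZE respectively.
	 */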
1856 for (i = 0; i < npages;) {
1857 unsigned int order = sram_addr[i].order;
1858
1859 for (j = 1; j < NR_PAGES(order) && i + j < npages; j++)
1860 if (!sram_addr[i + j].addr)
1861 sram_addr[i + j].addr = sram_addr[i].addr + j * PAGE_SIZE;
1862
1863 i += NR_PAGES(order);
1864 }
1865
1866 build_pt_update_batch_sram(m, bb, pt_slot * XE_PAGE_SIZE,
1867 sram_addr, len + sram_offset);
1868
1869 if (dir == XE_MIGRATE_COPY_TO_VRAM) {
1870 src_L0_ofs = xe_migrate_vm_addr(pt_slot, 0) + sram_offset;
1871 dst_L0_ofs = xe_migrate_vram_ofs(xe, vram_addr, false);
1872
1873 } else {
1874 src_L0_ofs = xe_migrate_vram_ofs(xe, vram_addr, false);
1875 dst_L0_ofs = xe_migrate_vm_addr(pt_slot, 0) + sram_offset;
1876 }
1877
1878 bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
1879 update_idx = bb->len;
1880
1881 emit_copy(gt, bb, src_L0_ofs, dst_L0_ofs, len, pitch);
1882
1883 job = xe_bb_create_migration_job(m->q, bb,
1884 xe_migrate_batch_base(m, use_usm_batch),
1885 update_idx);
1886 if (IS_ERR(job)) {
1887 err = PTR_ERR(job);
1888 goto err;
1889 }
1890
1891 xe_sched_job_add_migrate_flush(job, MI_INVALIDATE_TLB);
1892
1893 mutex_lock(&m->job_mutex);
1894 xe_sched_job_arm(job);
1895 fence = dma_fence_get(&job->drm.s_fence->finished);
1896 xe_sched_job_push(job);
1897
1898 dma_fence_put(m->fence);
1899 m->fence = dma_fence_get(fence);
1900 mutex_unlock(&m->job_mutex);
1901
1902 xe_bb_free(bb, fence);
1903
1904 return fence;
1905
1906 err:
1907 xe_bb_free(bb, NULL);
1908
1909 return ERR_PTR(err);
1910 }
1911
1912 /**
1913 * xe_migrate_to_vram() - Migrate to VRAM
1914 * @m: The migration context.
1915 * @npages: Number of pages to migrate.
1916 * @src_addr: Array of DMA information (source of migrate)
1917 * @dst_addr: Device physical address of VRAM (destination of migrate)
1918 *
1919  * Copy from an array of DMA addresses to a VRAM device physical address.
1920  *
1921  * Return: dma fence for the migration to signal completion on success,
1922  * ERR_PTR on failure
1923 */
1924 struct dma_fence *xe_migrate_to_vram(struct xe_migrate *m,
1925 unsigned long npages,
1926 struct drm_pagemap_addr *src_addr,
1927 u64 dst_addr)
1928 {
1929 return xe_migrate_vram(m, npages * PAGE_SIZE, 0, src_addr, dst_addr,
1930 XE_MIGRATE_COPY_TO_VRAM);
1931 }
1932
1933 /**
1934 * xe_migrate_from_vram() - Migrate from VRAM
1935 * @m: The migration context.
1936 * @npages: Number of pages to migrate.
1937 * @src_addr: Device physical address of VRAM (source of migrate)
1938 * @dst_addr: Array of DMA information (destination of migrate)
1939 *
1940  * Copy from a VRAM device physical address to an array of DMA addresses.
1941  *
1942  * Return: dma fence for the migration to signal completion on success,
1943  * ERR_PTR on failure
1944 */
1945 struct dma_fence *xe_migrate_from_vram(struct xe_migrate *m,
1946 unsigned long npages,
1947 u64 src_addr,
1948 struct drm_pagemap_addr *dst_addr)
1949 {
1950 return xe_migrate_vram(m, npages * PAGE_SIZE, 0, dst_addr, src_addr,
1951 XE_MIGRATE_COPY_TO_SRAM);
1952 }
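/*
 * Illustrative use of the two wrappers above (hypothetical caller, names made
 * up for the example):
 *
 *	fence = xe_migrate_to_vram(m, npages, pagemap_addr, vram_dpa);
 *	if (!IS_ERR(fence)) {
 *		dma_fence_wait(fence, false);
 *		dma_fence_put(fence);
 *	}
 *
 * The reverse direction is xe_migrate_from_vram(m, npages, vram_dpa,
 * pagemap_addr).  Both are thin wrappers around xe_migrate_vram() with
 * sram_offset == 0.
 */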
1953
1954 static void xe_migrate_dma_unmap(struct xe_device *xe,
1955 struct drm_pagemap_addr *pagemap_addr,
1956 int len, int write)
1957 {
1958 unsigned long i, npages = DIV_ROUND_UP(len, PAGE_SIZE);
1959
1960 for (i = 0; i < npages; ++i) {
1961 if (!pagemap_addr[i].addr)
1962 break;
1963
1964 dma_unmap_page(xe->drm.dev, pagemap_addr[i].addr, PAGE_SIZE,
1965 write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
1966 }
1967 kfree(pagemap_addr);
1968 }
1969
1970 static struct drm_pagemap_addr *xe_migrate_dma_map(struct xe_device *xe,
1971 void *buf, int len,
1972 int write)
1973 {
1974 struct drm_pagemap_addr *pagemap_addr;
1975 unsigned long i, npages = DIV_ROUND_UP(len, PAGE_SIZE);
1976
1977 pagemap_addr = kcalloc(npages, sizeof(*pagemap_addr), GFP_KERNEL);
1978 if (!pagemap_addr)
1979 return ERR_PTR(-ENOMEM);
1980
1981 for (i = 0; i < npages; ++i) {
1982 dma_addr_t addr;
1983 struct page *page;
1984 enum dma_data_direction dir = write ? DMA_TO_DEVICE :
1985 DMA_FROM_DEVICE;
1986
1987 if (is_vmalloc_addr(buf))
1988 page = vmalloc_to_page(buf);
1989 else
1990 page = virt_to_page(buf);
1991
1992 addr = dma_map_page(xe->drm.dev, page, 0, PAGE_SIZE, dir);
1993 if (dma_mapping_error(xe->drm.dev, addr))
1994 goto err_fault;
1995
1996 pagemap_addr[i] =
1997 drm_pagemap_addr_encode(addr,
1998 DRM_INTERCONNECT_SYSTEM,
1999 0, dir);
2000 buf += PAGE_SIZE;
2001 }
2002
2003 return pagemap_addr;
2004
2005 err_fault:
2006 xe_migrate_dma_unmap(xe, pagemap_addr, len, write);
2007 return ERR_PTR(-EFAULT);
2008 }
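/*
 * Note (illustrative): xe_migrate_dma_map() maps the linear buffer one page
 * at a time, so e.g. a 10000-byte buffer with a 4 KiB PAGE_SIZE results in
 * DIV_ROUND_UP(10000, PAGE_SIZE) == 3 mapped pages; xe_migrate_dma_unmap()
 * stops at the first unpopulated entry when tearing the mapping down.
 */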
2009
2010 /**
2011  * xe_migrate_access_memory() - Access memory of a BO via GPU
2012 *
2013 * @m: The migration context.
2014 * @bo: buffer object
2015 * @offset: access offset into buffer object
2016 * @buf: pointer to caller memory to read into or write from
2017 * @len: length of access
2018 * @write: write access
2019 *
2020  * Access memory of a BO via the GPU, either reading into or writing from a
2021  * caller-supplied pointer. The pointer is DMA-mapped for GPU access, and GPU
2022  * commands are issued to copy between the BO and the pointer.
2023 *
2024  * Return:
2025  * 0 if successful, negative error code on failure.
2026 */
2027 int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo,
2028 unsigned long offset, void *buf, int len,
2029 int write)
2030 {
2031 struct xe_tile *tile = m->tile;
2032 struct xe_device *xe = tile_to_xe(tile);
2033 struct xe_res_cursor cursor;
2034 struct dma_fence *fence = NULL;
2035 struct drm_pagemap_addr *pagemap_addr;
2036 unsigned long page_offset = (unsigned long)buf & ~PAGE_MASK;
2037 int bytes_left = len, current_page = 0;
2038 void *orig_buf = buf;
2039
2040 xe_bo_assert_held(bo);
2041
2042 	/* Use a bounce buffer for small and unaligned accesses */
2043 if (!IS_ALIGNED(len, XE_CACHELINE_BYTES) ||
2044 !IS_ALIGNED((unsigned long)buf + offset, XE_CACHELINE_BYTES)) {
2045 int buf_offset = 0;
2046 void *bounce;
2047 int err;
2048
2049 BUILD_BUG_ON(!is_power_of_2(XE_CACHELINE_BYTES));
2050 bounce = kmalloc(XE_CACHELINE_BYTES, GFP_KERNEL);
2051 if (!bounce)
2052 return -ENOMEM;
2053
2054 /*
2055 		 * Less than ideal for large unaligned accesses, but these should
2056 		 * be fairly rare; we can fix this up if it becomes common.
2057 */
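		/*
		 * Worked example (illustrative): a 100-byte access at
		 * offset 30 is split into cacheline-granular chunks of
		 * 64 - (30 & 63) = 34 bytes, then 64 bytes, then the final
		 * 2 bytes, each going through the bounce buffer (as a
		 * read-modify-write in the write case).
		 */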
2058 do {
2059 int copy_bytes = min_t(int, bytes_left,
2060 XE_CACHELINE_BYTES -
2061 (offset & XE_CACHELINE_MASK));
2062 int ptr_offset = offset & XE_CACHELINE_MASK;
2063
2064 err = xe_migrate_access_memory(m, bo,
2065 offset &
2066 ~XE_CACHELINE_MASK,
2067 bounce,
2068 XE_CACHELINE_BYTES, 0);
2069 if (err)
2070 break;
2071
2072 if (write) {
2073 memcpy(bounce + ptr_offset, buf + buf_offset, copy_bytes);
2074
2075 err = xe_migrate_access_memory(m, bo,
2076 offset & ~XE_CACHELINE_MASK,
2077 bounce,
2078 XE_CACHELINE_BYTES, write);
2079 if (err)
2080 break;
2081 } else {
2082 memcpy(buf + buf_offset, bounce + ptr_offset,
2083 copy_bytes);
2084 }
2085
2086 bytes_left -= copy_bytes;
2087 buf_offset += copy_bytes;
2088 offset += copy_bytes;
2089 } while (bytes_left);
2090
2091 kfree(bounce);
2092 return err;
2093 }
2094
2095 pagemap_addr = xe_migrate_dma_map(xe, buf, len + page_offset, write);
2096 if (IS_ERR(pagemap_addr))
2097 return PTR_ERR(pagemap_addr);
2098
2099 xe_res_first(bo->ttm.resource, offset, xe_bo_size(bo) - offset, &cursor);
2100
2101 do {
2102 struct dma_fence *__fence;
2103 u64 vram_addr = vram_region_gpu_offset(bo->ttm.resource) +
2104 cursor.start;
2105 int current_bytes;
2106
2107 if (cursor.size > MAX_PREEMPTDISABLE_TRANSFER)
2108 current_bytes = min_t(int, bytes_left,
2109 MAX_PREEMPTDISABLE_TRANSFER);
2110 else
2111 current_bytes = min_t(int, bytes_left, cursor.size);
2112
2113 if (current_bytes & ~PAGE_MASK) {
2114 int pitch = 4;
2115
2116 current_bytes = min_t(int, current_bytes,
2117 round_down(S16_MAX * pitch,
2118 XE_CACHELINE_BYTES));
2119 }
2120
2121 __fence = xe_migrate_vram(m, current_bytes,
2122 (unsigned long)buf & ~PAGE_MASK,
2123 &pagemap_addr[current_page],
2124 vram_addr, write ?
2125 XE_MIGRATE_COPY_TO_VRAM :
2126 XE_MIGRATE_COPY_TO_SRAM);
2127 if (IS_ERR(__fence)) {
2128 if (fence) {
2129 dma_fence_wait(fence, false);
2130 dma_fence_put(fence);
2131 }
2132 fence = __fence;
2133 goto out_err;
2134 }
2135
2136 dma_fence_put(fence);
2137 fence = __fence;
2138
2139 buf += current_bytes;
2140 offset += current_bytes;
2141 current_page = (int)(buf - orig_buf) / PAGE_SIZE;
2142 bytes_left -= current_bytes;
2143 if (bytes_left)
2144 xe_res_next(&cursor, current_bytes);
2145 } while (bytes_left);
2146
2147 dma_fence_wait(fence, false);
2148 dma_fence_put(fence);
2149
2150 out_err:
2151 xe_migrate_dma_unmap(xe, pagemap_addr, len + page_offset, write);
2152 return IS_ERR(fence) ? PTR_ERR(fence) : 0;
2153 }
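/*
 * Illustrative (hypothetical) usage of xe_migrate_access_memory(): reading
 * the first 4 KiB of a BO into a kernel buffer might look like
 *
 *	buf = kzalloc(SZ_4K, GFP_KERNEL);
 *	...
 *	xe_bo_lock(bo, false);
 *	err = xe_migrate_access_memory(m, bo, 0, buf, SZ_4K, 0);
 *	xe_bo_unlock(bo);
 *
 * The BO must be held for the duration of the call (xe_bo_assert_held()
 * above); write == 0 reads from the BO into the buffer, non-zero writes the
 * buffer into the BO.
 */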
2154
2155 /**
2156 * xe_migrate_job_lock() - Lock migrate job lock
2157 * @m: The migration context.
2158 * @q: Queue associated with the operation which requires a lock
2159 *
2160 * Lock the migrate job lock if the queue is a migration queue, otherwise
2161  * assert that the VM's dma-resv is held (user queues have their own locking).
2162 */
2163 void xe_migrate_job_lock(struct xe_migrate *m, struct xe_exec_queue *q)
2164 {
2165 bool is_migrate = q == m->q;
2166
2167 if (is_migrate)
2168 mutex_lock(&m->job_mutex);
2169 else
2170 		xe_vm_assert_held(q->vm); /* User queue VMs should be locked */
2171 }
2172
2173 /**
2174 * xe_migrate_job_unlock() - Unlock migrate job lock
2175 * @m: The migration context.
2176 * @q: Queue associated with the operation which requires a lock
2177 *
2178 * Unlock the migrate job lock if the queue is a migration queue, otherwise
2179  * assert that the VM's dma-resv is held (user queues have their own locking).
2180 */
2181 void xe_migrate_job_unlock(struct xe_migrate *m, struct xe_exec_queue *q)
2182 {
2183 bool is_migrate = q == m->q;
2184
2185 if (is_migrate)
2186 mutex_unlock(&m->job_mutex);
2187 else
2188 		xe_vm_assert_held(q->vm); /* User queue VMs should be locked */
2189 }
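/*
 * Illustrative pairing of the two helpers above (sketch only): a caller might
 * bracket job submission with them, e.g.
 *
 *	xe_migrate_job_lock(m, q);
 *	xe_sched_job_arm(job);
 *	fence = dma_fence_get(&job->drm.s_fence->finished);
 *	xe_sched_job_push(job);
 *	xe_migrate_job_unlock(m, q);
 *
 * which serializes on the migrate context's @job_mutex only when @q is the
 * migration queue itself.
 */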
2190
2191 #if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST)
2192 #include "tests/xe_migrate.c"
2193 #endif
2194