// SPDX-License-Identifier: MIT
/*
 * Copyright © 2022 Intel Corporation
 */

#include "xe_ring_ops.h"

#include "regs/xe_gpu_commands.h"
#include "regs/xe_gt_regs.h"
#include "regs/xe_lrc_layout.h"
#include "regs/xe_regs.h"
#include "xe_engine_types.h"
#include "xe_gt.h"
#include "xe_lrc.h"
#include "xe_macros.h"
#include "xe_sched_job.h"
#include "xe_vm_types.h"

/*
 * 3D-related flags that can't be set on _engines_ that lack access to the 3D
 * pipeline (i.e., CCS engines).
 */
#define PIPE_CONTROL_3D_ENGINE_FLAGS ( \
	PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | \
	PIPE_CONTROL_DEPTH_CACHE_FLUSH | \
	PIPE_CONTROL_TILE_CACHE_FLUSH | \
	PIPE_CONTROL_DEPTH_STALL | \
	PIPE_CONTROL_STALL_AT_SCOREBOARD | \
	PIPE_CONTROL_PSD_SYNC | \
	PIPE_CONTROL_AMFS_FLUSH | \
	PIPE_CONTROL_VF_CACHE_INVALIDATE | \
	PIPE_CONTROL_GLOBAL_SNAPSHOT_RESET)

/* 3D-related flags that can't be set on _platforms_ that lack a 3D pipeline */
#define PIPE_CONTROL_3D_ARCH_FLAGS ( \
	PIPE_CONTROL_3D_ENGINE_FLAGS | \
	PIPE_CONTROL_INDIRECT_STATE_DISABLE | \
	PIPE_CONTROL_FLUSH_ENABLE | \
	PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | \
	PIPE_CONTROL_DC_FLUSH_ENABLE)

/*
 * On Gen12+ the command streamer's pre-parser (prefetcher) is toggled
 * through MI_ARB_CHECK: BIT(8) is the write-enable mask for the disable
 * state carried in bit 0.
 */
static u32 preparser_disable(bool state)
{
	return MI_ARB_CHECK | BIT(8) | state;
}

/* Invalidate the engine's AuxCCS table cache via an MMIO-remapped LRI. */
static int emit_aux_table_inv(struct xe_gt *gt, struct xe_reg reg,
			      u32 *dw, int i)
{
	dw[i++] = MI_LOAD_REGISTER_IMM(1) | MI_LRI_MMIO_REMAP_EN;
	dw[i++] = reg.addr + gt->mmio.adj_offset;
	dw[i++] = AUX_INV;
	dw[i++] = MI_NOOP;

	return i;
}

static int emit_user_interrupt(u32 *dw, int i)
{
	dw[i++] = MI_USER_INTERRUPT;
	dw[i++] = MI_ARB_ON_OFF | MI_ARB_ENABLE;
	dw[i++] = MI_ARB_CHECK;

	return i;
}

static int emit_store_imm_ggtt(u32 addr, u32 value, u32 *dw, int i)
{
	dw[i++] = MI_STORE_DATA_IMM | BIT(22) /* GGTT */ | 2;
	dw[i++] = addr;
	dw[i++] = 0;
	dw[i++] = value;

	return i;
}

static int emit_flush_imm_ggtt(u32 addr, u32 value, u32 *dw, int i)
{
	dw[i++] = (MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW;
	dw[i++] = addr | MI_FLUSH_DW_USE_GTT;
	dw[i++] = 0;
	dw[i++] = value;

	return i;
}

static int emit_bb_start(u64 batch_addr, u32 ppgtt_flag, u32 *dw, int i)
{
	dw[i++] = MI_BATCH_BUFFER_START | ppgtt_flag;
	dw[i++] = lower_32_bits(batch_addr);
	dw[i++] = upper_32_bits(batch_addr);

	return i;
}

static int emit_flush_invalidate(u32 flag, u32 *dw, int i)
{
	dw[i] = MI_FLUSH_DW + 1;
	dw[i] |= flag;
	dw[i++] |= MI_INVALIDATE_TLB | MI_FLUSH_DW_OP_STOREDW |
		MI_FLUSH_DW_STORE_INDEX;

	dw[i++] = LRC_PPHWSP_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
	dw[i++] = 0;
	dw[i++] = ~0U;

	return i;
}

static int emit_pipe_invalidate(u32 mask_flags, u32 *dw, int i)
{
	u32 flags = PIPE_CONTROL_CS_STALL |
		PIPE_CONTROL_COMMAND_CACHE_INVALIDATE |
		PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE |
		PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
		PIPE_CONTROL_VF_CACHE_INVALIDATE |
		PIPE_CONTROL_CONST_CACHE_INVALIDATE |
		PIPE_CONTROL_STATE_CACHE_INVALIDATE |
		PIPE_CONTROL_QW_WRITE |
		PIPE_CONTROL_STORE_DATA_INDEX;

	flags &= ~mask_flags;

	dw[i++] = GFX_OP_PIPE_CONTROL(6);
	dw[i++] = flags;
	dw[i++] = LRC_PPHWSP_SCRATCH_ADDR;
	dw[i++] = 0;
	dw[i++] = 0;
	dw[i++] = 0;

	return i;
}
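/*
 * All of the emit_* helpers in this file share one convention: they
 * append dwords to dw[] starting at index i and return the updated
 * index, so callers build a command stream by chaining calls. A minimal
 * sketch (illustrative only; "addr" and "val" are placeholder values,
 * not names used by this driver):
 *
 *	u32 dw[MAX_JOB_SIZE_DW], i = 0;
 *
 *	i = emit_store_imm_ggtt(addr, val, dw, i);	// 4 dwords
 *	i = emit_user_interrupt(dw, i);			// 3 dwords
 *	// dw[0..i) now holds a 7-dword command stream
 */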
#define MI_STORE_QWORD_IMM_GEN8_POSTED (MI_INSTR(0x20, 3) | (1 << 21))

static int emit_store_imm_ppgtt_posted(u64 addr, u64 value,
				       u32 *dw, int i)
{
	dw[i++] = MI_STORE_QWORD_IMM_GEN8_POSTED;
	dw[i++] = lower_32_bits(addr);
	dw[i++] = upper_32_bits(addr);
	dw[i++] = lower_32_bits(value);
	dw[i++] = upper_32_bits(value);

	return i;
}

static int emit_pipe_imm_ggtt(u32 addr, u32 value, bool stall_only, u32 *dw,
			      int i)
{
	dw[i++] = GFX_OP_PIPE_CONTROL(6);
	dw[i++] = (stall_only ? PIPE_CONTROL_CS_STALL :
		   PIPE_CONTROL_FLUSH_ENABLE | PIPE_CONTROL_CS_STALL) |
		PIPE_CONTROL_GLOBAL_GTT_IVB | PIPE_CONTROL_QW_WRITE;
	dw[i++] = addr;
	dw[i++] = 0;
	dw[i++] = value;
	dw[i++] = 0; /* We're thrashing one extra dword. */

	return i;
}

static u32 get_ppgtt_flag(struct xe_sched_job *job)
{
	/*
	 * BIT(8) is the MI_BATCH_BUFFER_START address-space bit (PPGTT);
	 * kernel workaround jobs run their batches from the GGTT instead.
	 */
	return !(job->engine->flags & ENGINE_FLAG_WA) ? BIT(8) : 0;
}

static void __emit_job_gen12_copy(struct xe_sched_job *job, struct xe_lrc *lrc,
				  u64 batch_addr, u32 seqno)
{
	u32 dw[MAX_JOB_SIZE_DW], i = 0;
	u32 ppgtt_flag = get_ppgtt_flag(job);

	i = emit_store_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
				seqno, dw, i);

	i = emit_bb_start(batch_addr, ppgtt_flag, dw, i);

	if (job->user_fence.used)
		i = emit_store_imm_ppgtt_posted(job->user_fence.addr,
						job->user_fence.value,
						dw, i);

	i = emit_flush_imm_ggtt(xe_lrc_seqno_ggtt_addr(lrc), seqno, dw, i);

	i = emit_user_interrupt(dw, i);

	XE_BUG_ON(i > MAX_JOB_SIZE_DW);

	xe_lrc_write_ring(lrc, dw, i * sizeof(*dw));
}

static bool has_aux_ccs(struct xe_device *xe)
{
	/*
	 * PVC is a special case that has no compression of either type
	 * (FlatCCS or AuxCCS).
	 */
	if (xe->info.platform == XE_PVC)
		return false;

	return !xe->info.has_flat_ccs;
}

static void __emit_job_gen12_video(struct xe_sched_job *job, struct xe_lrc *lrc,
				   u64 batch_addr, u32 seqno)
{
	u32 dw[MAX_JOB_SIZE_DW], i = 0;
	u32 ppgtt_flag = get_ppgtt_flag(job);
	struct xe_gt *gt = job->engine->gt;
	struct xe_device *xe = gt_to_xe(gt);
	bool decode = job->engine->class == XE_ENGINE_CLASS_VIDEO_DECODE;

	dw[i++] = preparser_disable(true);

	/* hsdes: 1809175790 */
	if (has_aux_ccs(xe)) {
		if (decode)
			i = emit_aux_table_inv(gt, VD0_AUX_INV, dw, i);
		else
			i = emit_aux_table_inv(gt, VE0_AUX_INV, dw, i);
	}
	dw[i++] = preparser_disable(false);

	i = emit_store_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
				seqno, dw, i);

	i = emit_bb_start(batch_addr, ppgtt_flag, dw, i);

	if (job->user_fence.used)
		i = emit_store_imm_ppgtt_posted(job->user_fence.addr,
						job->user_fence.value,
						dw, i);

	i = emit_flush_imm_ggtt(xe_lrc_seqno_ggtt_addr(lrc), seqno, dw, i);

	i = emit_user_interrupt(dw, i);

	XE_BUG_ON(i > MAX_JOB_SIZE_DW);

	xe_lrc_write_ring(lrc, dw, i * sizeof(*dw));
}
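/*
 * Worked example of the PIPE_CONTROL masking done below (illustrative,
 * not driver code): on a part with no render engine at all, lacks_render
 * is true and every 3D-architecture flag is stripped before
 * emit_pipe_invalidate() writes the PIPE_CONTROL:
 *
 *	flags = PIPE_CONTROL_CS_STALL | ... | PIPE_CONTROL_VF_CACHE_INVALIDATE;
 *	flags &= ~PIPE_CONTROL_3D_ARCH_FLAGS;
 *	// VF_CACHE_INVALIDATE (3D-only) drops out; CS_STALL survives
 *
 * A CCS engine on a platform that does have render instead masks only
 * PIPE_CONTROL_3D_ENGINE_FLAGS, so e.g. TEXTURE_CACHE_INVALIDATE is kept.
 */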
static void __emit_job_gen12_render_compute(struct xe_sched_job *job,
					    struct xe_lrc *lrc,
					    u64 batch_addr, u32 seqno)
{
	u32 dw[MAX_JOB_SIZE_DW], i = 0;
	u32 ppgtt_flag = get_ppgtt_flag(job);
	struct xe_gt *gt = job->engine->gt;
	struct xe_device *xe = gt_to_xe(gt);
	bool lacks_render = !(gt->info.engine_mask & XE_HW_ENGINE_RCS_MASK);
	u32 mask_flags = 0;

	dw[i++] = preparser_disable(true);
	if (lacks_render)
		mask_flags = PIPE_CONTROL_3D_ARCH_FLAGS;
	else if (job->engine->class == XE_ENGINE_CLASS_COMPUTE)
		mask_flags = PIPE_CONTROL_3D_ENGINE_FLAGS;
	i = emit_pipe_invalidate(mask_flags, dw, i);

	/* hsdes: 1809175790 */
	if (has_aux_ccs(xe))
		i = emit_aux_table_inv(gt, CCS_AUX_INV, dw, i);

	dw[i++] = preparser_disable(false);

	i = emit_store_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
				seqno, dw, i);

	i = emit_bb_start(batch_addr, ppgtt_flag, dw, i);

	if (job->user_fence.used)
		i = emit_store_imm_ppgtt_posted(job->user_fence.addr,
						job->user_fence.value,
						dw, i);

	i = emit_pipe_imm_ggtt(xe_lrc_seqno_ggtt_addr(lrc), seqno, lacks_render, dw, i);

	i = emit_user_interrupt(dw, i);

	XE_BUG_ON(i > MAX_JOB_SIZE_DW);

	xe_lrc_write_ring(lrc, dw, i * sizeof(*dw));
}

/*
 * Migration jobs carry two batches (typically the first updates PTEs
 * that the second relies on), with a flush/TLB invalidation emitted
 * between them.
 */
static void emit_migration_job_gen12(struct xe_sched_job *job,
				     struct xe_lrc *lrc, u32 seqno)
{
	u32 dw[MAX_JOB_SIZE_DW], i = 0;

	i = emit_store_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
				seqno, dw, i);

	i = emit_bb_start(job->batch_addr[0], BIT(8), dw, i);

	/* XXX: Do we need this? Leaving for now. */
	dw[i++] = preparser_disable(true);
	i = emit_flush_invalidate(0, dw, i);
	dw[i++] = preparser_disable(false);

	i = emit_bb_start(job->batch_addr[1], BIT(8), dw, i);

	dw[i++] = (MI_FLUSH_DW | MI_INVALIDATE_TLB | job->migrate_flush_flags |
		   MI_FLUSH_DW_OP_STOREDW) + 1;
	dw[i++] = xe_lrc_seqno_ggtt_addr(lrc) | MI_FLUSH_DW_USE_GTT;
	dw[i++] = 0;
	dw[i++] = seqno; /* value */

	i = emit_user_interrupt(dw, i);

	XE_BUG_ON(i > MAX_JOB_SIZE_DW);

	xe_lrc_write_ring(lrc, dw, i * sizeof(*dw));
}

static void emit_job_gen12_copy(struct xe_sched_job *job)
{
	int i;

	if (xe_sched_job_is_migration(job->engine)) {
		emit_migration_job_gen12(job, job->engine->lrc,
					 xe_sched_job_seqno(job));
		return;
	}

	for (i = 0; i < job->engine->width; ++i)
		__emit_job_gen12_copy(job, job->engine->lrc + i,
				      job->batch_addr[i],
				      xe_sched_job_seqno(job));
}

static void emit_job_gen12_video(struct xe_sched_job *job)
{
	int i;

	/* FIXME: Not doing parallel handshake for now */
	for (i = 0; i < job->engine->width; ++i)
		__emit_job_gen12_video(job, job->engine->lrc + i,
				       job->batch_addr[i],
				       xe_sched_job_seqno(job));
}

static void emit_job_gen12_render_compute(struct xe_sched_job *job)
{
	int i;

	for (i = 0; i < job->engine->width; ++i)
		__emit_job_gen12_render_compute(job, job->engine->lrc + i,
						job->batch_addr[i],
						xe_sched_job_seqno(job));
}

static const struct xe_ring_ops ring_ops_gen12_copy = {
	.emit_job = emit_job_gen12_copy,
};

static const struct xe_ring_ops ring_ops_gen12_video = {
	.emit_job = emit_job_gen12_video,
};

static const struct xe_ring_ops ring_ops_gen12_render_compute = {
	.emit_job = emit_job_gen12_render_compute,
};

const struct xe_ring_ops *
xe_ring_ops_get(struct xe_gt *gt, enum xe_engine_class class)
{
	switch (class) {
	case XE_ENGINE_CLASS_COPY:
		return &ring_ops_gen12_copy;
	case XE_ENGINE_CLASS_VIDEO_DECODE:
	case XE_ENGINE_CLASS_VIDEO_ENHANCE:
		return &ring_ops_gen12_video;
	case XE_ENGINE_CLASS_RENDER:
	case XE_ENGINE_CLASS_COMPUTE:
		return &ring_ops_gen12_render_compute;
	default:
		return NULL;
	}
}
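/*
 * Usage sketch for the vfunc table returned above (hypothetical caller;
 * "gt", "engine" and "job" stand in for whatever the real call sites
 * have at hand — this is not code from this file's callers):
 *
 *	const struct xe_ring_ops *ops =
 *		xe_ring_ops_get(gt, engine->class);
 *
 *	if (ops)
 *		ops->emit_job(job);	// writes the job into its LRC ring
 */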