// SPDX-License-Identifier: MIT
/*
 * Copyright © 2020 Intel Corporation
 */

#include "gen6_engine_cs.h"
#include "intel_engine.h"
#include "intel_engine_regs.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
#include "intel_gt_irq.h"
#include "intel_gt_pm_irq.h"
#include "intel_ring.h"

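/*
 * Byte offset of the scratch dword inside the hardware status page,
 * used as the post-sync write target for MI_FLUSH_DW below.
 */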
#define HWS_SCRATCH_ADDR	(I915_GEM_HWS_SCRATCH * sizeof(u32))

/*
 * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
 * implementing two workarounds on gen6. From section 1.4.7.1
 * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
 *
 * [DevSNB-C+{W/A}] Before any depth stall flush (including those
 * produced by non-pipelined state commands), software needs to first
 * send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
 * 0.
 *
 * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
 * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
 *
 * And the workaround for these two requires this workaround first:
 *
 * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
 * BEFORE the pipe-control with a post-sync op and no write-cache
 * flushes.
 *
 * And this last workaround is tricky because of the requirements on
 * that bit. From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
 * volume 2 part 1:
 *
 *     "1 of the following must also be set:
 *      - Render Target Cache Flush Enable ([12] of DW1)
 *      - Depth Cache Flush Enable ([0] of DW1)
 *      - Stall at Pixel Scoreboard ([1] of DW1)
 *      - Depth Stall ([13] of DW1)
 *      - Post-Sync Operation ([13] of DW1)
 *      - Notify Enable ([8] of DW1)"
 *
 * The cache flushes require the workaround flush that triggered this
 * one, so we can't use it. Depth stall would trigger the same.
 * Post-sync nonzero is what triggered this second workaround, so we
 * can't use that one either. Notify enable is IRQs, which aren't
 * really our business. That leaves only stall at scoreboard.
 */
static int
gen6_emit_post_sync_nonzero_flush(struct i915_request *rq)
{
        u32 scratch_addr =
                intel_gt_scratch_offset(rq->engine->gt,
                                        INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
        u32 *cs;

        cs = intel_ring_begin(rq, 6);
        if (IS_ERR(cs))
                return PTR_ERR(cs);

        *cs++ = GFX_OP_PIPE_CONTROL(5);
        *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
        *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
        *cs++ = 0; /* low dword */
        *cs++ = 0; /* high dword */
        *cs++ = MI_NOOP;
        intel_ring_advance(rq, cs);

        cs = intel_ring_begin(rq, 6);
        if (IS_ERR(cs))
                return PTR_ERR(cs);

        *cs++ = GFX_OP_PIPE_CONTROL(5);
        *cs++ = PIPE_CONTROL_QW_WRITE;
        *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
        *cs++ = 0;
        *cs++ = 0;
        *cs++ = MI_NOOP;
        intel_ring_advance(rq, cs);

        return 0;
}

int gen6_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
        u32 scratch_addr =
                intel_gt_scratch_offset(rq->engine->gt,
                                        INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
        u32 *cs, flags = 0;
        int ret;

        /* Force SNB workarounds for PIPE_CONTROL flushes */
        ret = gen6_emit_post_sync_nonzero_flush(rq);
        if (ret)
                return ret;

        /*
         * Just flush everything. Experiments have shown that reducing the
         * number of bits based on the write domains has little performance
         * impact. And when rearranging requests, the order of flushes is
         * unknown.
         */
        if (mode & EMIT_FLUSH) {
                flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
                flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
                /*
                 * Ensure that any following seqno writes only happen
                 * when the render cache is indeed flushed.
                 */
                flags |= PIPE_CONTROL_CS_STALL;
        }
        if (mode & EMIT_INVALIDATE) {
                flags |= PIPE_CONTROL_TLB_INVALIDATE;
                flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
                /*
                 * TLB invalidate requires a post-sync write.
                 */
                flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL;
        }

        cs = intel_ring_begin(rq, 4);
        if (IS_ERR(cs))
                return PTR_ERR(cs);

        *cs++ = GFX_OP_PIPE_CONTROL(4);
        *cs++ = flags;
        *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
        *cs++ = 0;
        intel_ring_advance(rq, cs);

        return 0;
}

u32 *gen6_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
        /* First we do the gen6_emit_post_sync_nonzero_flush w/a */
        *cs++ = GFX_OP_PIPE_CONTROL(4);
        *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
        *cs++ = 0;
        *cs++ = 0;

        *cs++ = GFX_OP_PIPE_CONTROL(4);
        *cs++ = PIPE_CONTROL_QW_WRITE;
        *cs++ = intel_gt_scratch_offset(rq->engine->gt,
                                        INTEL_GT_SCRATCH_FIELD_DEFAULT) |
                PIPE_CONTROL_GLOBAL_GTT;
        *cs++ = 0;

        /* Finally we can flush and with it emit the breadcrumb */
        *cs++ = GFX_OP_PIPE_CONTROL(4);
        *cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
                 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                 PIPE_CONTROL_DC_FLUSH_ENABLE |
                 PIPE_CONTROL_QW_WRITE |
                 PIPE_CONTROL_CS_STALL);
        *cs++ = i915_request_active_seqno(rq) |
                PIPE_CONTROL_GLOBAL_GTT;
        *cs++ = rq->fence.seqno;

        *cs++ = MI_USER_INTERRUPT;
        *cs++ = MI_NOOP;

        rq->tail = intel_ring_offset(rq, cs);
        assert_ring_tail_valid(rq->ring, rq->tail);

        return cs;
}

static int mi_flush_dw(struct i915_request *rq, u32 flags)
{
        u32 cmd, *cs;

        cs = intel_ring_begin(rq, 4);
        if (IS_ERR(cs))
                return PTR_ERR(cs);

        cmd = MI_FLUSH_DW;

        /*
         * We always require a command barrier so that subsequent
         * commands, such as breadcrumb interrupts, are strictly ordered
         * wrt the contents of the write cache being flushed to memory
         * (and thus being coherent from the CPU).
         */
        cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;

        /*
         * Bspec vol 1c.3 - blitter engine command streamer:
         * "If ENABLED, all TLBs will be invalidated once the flush
         * operation is complete. This bit is only valid when the
         * Post-Sync Operation field is a value of 1h or 3h."
         */
        cmd |= flags;

        *cs++ = cmd;
        *cs++ = HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
        *cs++ = 0;
        *cs++ = MI_NOOP;

        intel_ring_advance(rq, cs);

        return 0;
}

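/*
 * Common MI_FLUSH_DW wrapper: apply the engine-specific invalidation
 * flags only when the caller actually asked for an invalidate.
 */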
static int gen6_flush_dw(struct i915_request *rq, u32 mode, u32 invflags)
{
        return mi_flush_dw(rq, mode & EMIT_INVALIDATE ? invflags : 0);
}

int gen6_emit_flush_xcs(struct i915_request *rq, u32 mode)
{
        return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB);
}

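/* In addition to the TLBs, the video ring also requests the BSD-specific invalidate. */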
int gen6_emit_flush_vcs(struct i915_request *rq, u32 mode)
{
        return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB | MI_INVALIDATE_BSD);
}

int gen6_emit_bb_start(struct i915_request *rq,
                       u64 offset, u32 len,
                       unsigned int dispatch_flags)
{
        u32 security;
        u32 *cs;

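        /*
         * Batches are executed as non-secure by default; only a caller
         * that explicitly requests I915_DISPATCH_SECURE gets the
         * privileged (secure) dispatch with the bit cleared.
         */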
        security = MI_BATCH_NON_SECURE_I965;
        if (dispatch_flags & I915_DISPATCH_SECURE)
                security = 0;

        cs = intel_ring_begin(rq, 2);
        if (IS_ERR(cs))
                return PTR_ERR(cs);

        cs = __gen6_emit_bb_start(cs, offset, security);
        intel_ring_advance(rq, cs);

        return 0;
}

int
hsw_emit_bb_start(struct i915_request *rq,
                  u64 offset, u32 len,
                  unsigned int dispatch_flags)
{
        u32 security;
        u32 *cs;

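        /*
         * By default run the batch from the PPGTT and as non-secure;
         * a secure dispatch clears both bits (privileged batch, global
         * GTT addressing).
         */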
        security = MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW;
        if (dispatch_flags & I915_DISPATCH_SECURE)
                security = 0;

        cs = intel_ring_begin(rq, 2);
        if (IS_ERR(cs))
                return PTR_ERR(cs);

        cs = __gen6_emit_bb_start(cs, offset, security);
        intel_ring_advance(rq, cs);

        return 0;
}

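/*
 * Emit a stall-at-scoreboard PIPE_CONTROL with the CS stall bit set,
 * used as a preparatory workaround before the main gen7 flush below.
 */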
static int gen7_stall_cs(struct i915_request *rq)
{
        u32 *cs;

        cs = intel_ring_begin(rq, 4);
        if (IS_ERR(cs))
                return PTR_ERR(cs);

        *cs++ = GFX_OP_PIPE_CONTROL(4);
        *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
        *cs++ = 0;
        *cs++ = 0;
        intel_ring_advance(rq, cs);

        return 0;
}

int gen7_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
        u32 scratch_addr =
                intel_gt_scratch_offset(rq->engine->gt,
                                        INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
        u32 *cs, flags = 0;

        /*
         * Ensure that any following seqno writes only happen when the render
         * cache is indeed flushed.
         *
         * Workaround: 4th PIPE_CONTROL command (except the ones with only
         * read-cache invalidate bits set) must have the CS_STALL bit set. We
         * don't try to be clever and just set it unconditionally.
         */
        flags |= PIPE_CONTROL_CS_STALL;

        /*
         * CS_STALL suggests at least a post-sync write.
         */
        flags |= PIPE_CONTROL_QW_WRITE;
        flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;

        /*
         * Just flush everything. Experiments have shown that reducing the
         * number of bits based on the write domains has little performance
         * impact.
         */
        if (mode & EMIT_FLUSH) {
                flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
                flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
                flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
                flags |= PIPE_CONTROL_FLUSH_ENABLE;
        }
        if (mode & EMIT_INVALIDATE) {
                flags |= PIPE_CONTROL_TLB_INVALIDATE;
                flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
                flags |= PIPE_CONTROL_MEDIA_STATE_CLEAR;

                /*
                 * Workaround: we must issue a pipe_control with CS-stall bit
                 * set before a pipe_control command that has the state cache
                 * invalidate bit set.
                 */
                gen7_stall_cs(rq);
        }

        cs = intel_ring_begin(rq, 4);
        if (IS_ERR(cs))
                return PTR_ERR(cs);

        *cs++ = GFX_OP_PIPE_CONTROL(4);
        *cs++ = flags;
        *cs++ = scratch_addr;
        *cs++ = 0;
        intel_ring_advance(rq, cs);

        return 0;
}

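/*
 * Emit the render breadcrumb: a full flush with a CS stall and a
 * post-sync write of the request's seqno, followed by the user
 * interrupt.
 */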
u32 *gen7_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
        *cs++ = GFX_OP_PIPE_CONTROL(4);
        *cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
                 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                 PIPE_CONTROL_DC_FLUSH_ENABLE |
                 PIPE_CONTROL_FLUSH_ENABLE |
                 PIPE_CONTROL_QW_WRITE |
                 PIPE_CONTROL_GLOBAL_GTT_IVB |
                 PIPE_CONTROL_CS_STALL);
        *cs++ = i915_request_active_seqno(rq);
        *cs++ = rq->fence.seqno;

        *cs++ = MI_USER_INTERRUPT;
        *cs++ = MI_NOOP;

        rq->tail = intel_ring_offset(rq, cs);
        assert_ring_tail_valid(rq->ring, rq->tail);

        return cs;
}

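/*
 * The xcs breadcrumb is an MI_FLUSH_DW whose post-sync operation stores
 * the request's seqno into the per-engine HWSP; the GEM_BUG_ONs check
 * that the request's timeline does indeed live in that status page.
 */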
u32 *gen6_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
        GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
        GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR);

        *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
        *cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
        *cs++ = rq->fence.seqno;

        *cs++ = MI_USER_INTERRUPT;

        rq->tail = intel_ring_offset(rq, cs);
        assert_ring_tail_valid(rq->ring, rq->tail);

        return cs;
}

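/*
 * On gen7 the non-render rings apparently need the seqno store to be
 * repeated (and chased with an extra MI_FLUSH_DW) so that the write is
 * visible in memory by the time the MI_USER_INTERRUPT is serviced; the
 * repeat count below looks to be an empirically chosen workaround.
 */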
#define GEN7_XCS_WA 32
u32 *gen7_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
        int i;

        GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
        GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR);

        *cs++ = MI_FLUSH_DW | MI_INVALIDATE_TLB |
                MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
        *cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
        *cs++ = rq->fence.seqno;

        for (i = 0; i < GEN7_XCS_WA; i++) {
                *cs++ = MI_STORE_DWORD_INDEX;
                *cs++ = I915_GEM_HWS_SEQNO_ADDR;
                *cs++ = rq->fence.seqno;
        }

        *cs++ = MI_FLUSH_DW;
        *cs++ = 0;
        *cs++ = 0;

        *cs++ = MI_USER_INTERRUPT;
        *cs++ = MI_NOOP;

        rq->tail = intel_ring_offset(rq, cs);
        assert_ring_tail_valid(rq->ring, rq->tail);

        return cs;
}
#undef GEN7_XCS_WA

void gen6_irq_enable(struct intel_engine_cs *engine)
{
        ENGINE_WRITE(engine, RING_IMR,
                     ~(engine->irq_enable_mask | engine->irq_keep_mask));

        /* Flush/delay to ensure the RING_IMR is active before the GT IMR */
        ENGINE_POSTING_READ(engine, RING_IMR);

        gen5_gt_enable_irq(engine->gt, engine->irq_enable_mask);
}

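/* Mask all engine interrupts bar irq_keep_mask, then disable the engine's bit in the GT IMR. */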
void gen6_irq_disable(struct intel_engine_cs *engine)
{
        ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
        gen5_gt_disable_irq(engine->gt, engine->irq_enable_mask);
}

void hsw_irq_enable_vecs(struct intel_engine_cs *engine)
{
        ENGINE_WRITE(engine, RING_IMR, ~engine->irq_enable_mask);

        /* Flush/delay to ensure the RING_IMR is active before the GT IMR */
        ENGINE_POSTING_READ(engine, RING_IMR);

        gen6_gt_pm_unmask_irq(engine->gt, engine->irq_enable_mask);
}

void hsw_irq_disable_vecs(struct intel_engine_cs *engine)
{
        ENGINE_WRITE(engine, RING_IMR, ~0);
        gen6_gt_pm_mask_irq(engine->gt, engine->irq_enable_mask);
}