xref: /linux/drivers/gpu/drm/i915/gt/gen6_engine_cs.c (revision e7d759f31ca295d589f7420719c311870bb3166f)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2020 Intel Corporation
4  */
5 
6 #include "gen6_engine_cs.h"
7 #include "intel_engine.h"
8 #include "intel_engine_regs.h"
9 #include "intel_gpu_commands.h"
10 #include "intel_gt.h"
11 #include "intel_gt_irq.h"
12 #include "intel_gt_pm_irq.h"
13 #include "intel_ring.h"
14 
15 #define HWS_SCRATCH_ADDR	(I915_GEM_HWS_SCRATCH * sizeof(u32))
16 
17 /*
18  * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
19  * implementing two workarounds on gen6.  From section 1.4.7.1
20  * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
21  *
22  * [DevSNB-C+{W/A}] Before any depth stall flush (including those
23  * produced by non-pipelined state commands), software needs to first
24  * send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
25  * 0.
26  *
27  * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
28  * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
29  *
30  * And the workaround for these two requires this workaround first:
31  *
32  * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
33  * BEFORE the pipe-control with a post-sync op and no write-cache
34  * flushes.
35  *
36  * And this last workaround is tricky because of the requirements on
37  * that bit.  From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
38  * volume 2 part 1:
39  *
40  *     "1 of the following must also be set:
41  *      - Render Target Cache Flush Enable ([12] of DW1)
42  *      - Depth Cache Flush Enable ([0] of DW1)
43  *      - Stall at Pixel Scoreboard ([1] of DW1)
44  *      - Depth Stall ([13] of DW1)
45  *      - Post-Sync Operation ([13] of DW1)
46  *      - Notify Enable ([8] of DW1)"
47  *
48  * The cache flushes require the workaround flush that triggered this
49  * one, so we can't use it.  Depth stall would trigger the same.
50  * Post-sync nonzero is what triggered this second workaround, so we
51  * can't use that one either.  Notify enable is IRQs, which aren't
52  * really our business.  That leaves only stall at scoreboard.
53  */
54 static int
55 gen6_emit_post_sync_nonzero_flush(struct i915_request *rq)
56 {
57 	u32 scratch_addr =
58 		intel_gt_scratch_offset(rq->engine->gt,
59 					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
60 	u32 *cs;
61 
62 	cs = intel_ring_begin(rq, 6);
63 	if (IS_ERR(cs))
64 		return PTR_ERR(cs);
65 
66 	*cs++ = GFX_OP_PIPE_CONTROL(5);
67 	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
68 	*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
69 	*cs++ = 0; /* low dword */
70 	*cs++ = 0; /* high dword */
71 	*cs++ = MI_NOOP;
72 	intel_ring_advance(rq, cs);
73 
74 	cs = intel_ring_begin(rq, 6);
75 	if (IS_ERR(cs))
76 		return PTR_ERR(cs);
77 
78 	*cs++ = GFX_OP_PIPE_CONTROL(5);
79 	*cs++ = PIPE_CONTROL_QW_WRITE;
80 	*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
81 	*cs++ = 0;
82 	*cs++ = 0;
83 	*cs++ = MI_NOOP;
84 	intel_ring_advance(rq, cs);
85 
86 	return 0;
87 }
88 
89 int gen6_emit_flush_rcs(struct i915_request *rq, u32 mode)
90 {
91 	u32 scratch_addr =
92 		intel_gt_scratch_offset(rq->engine->gt,
93 					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
94 	u32 *cs, flags = 0;
95 	int ret;
96 
97 	/* Force SNB workarounds for PIPE_CONTROL flushes */
98 	ret = gen6_emit_post_sync_nonzero_flush(rq);
99 	if (ret)
100 		return ret;
101 
102 	/*
103 	 * Just flush everything.  Experiments have shown that reducing the
104 	 * number of bits based on the write domains has little performance
105 	 * impact. And when rearranging requests, the order of flushes is
106 	 * unknown.
107 	 */
108 	if (mode & EMIT_FLUSH) {
109 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
110 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
111 		/*
112 		 * Ensure that any following seqno writes only happen
113 		 * when the render cache is indeed flushed.
114 		 */
115 		flags |= PIPE_CONTROL_CS_STALL;
116 	}
117 	if (mode & EMIT_INVALIDATE) {
118 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
119 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
120 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
121 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
122 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
123 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
124 		/*
125 		 * TLB invalidate requires a post-sync write.
126 		 */
127 		flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL;
128 	}
129 
130 	cs = intel_ring_begin(rq, 4);
131 	if (IS_ERR(cs))
132 		return PTR_ERR(cs);
133 
134 	*cs++ = GFX_OP_PIPE_CONTROL(4);
135 	*cs++ = flags;
136 	*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
137 	*cs++ = 0;
138 	intel_ring_advance(rq, cs);
139 
140 	return 0;
141 }
142 
/*
 * Emit the end-of-request breadcrumb for the gen6 render engine.
 *
 * Writes three PIPE_CONTROLs into the ring via @cs: the two-part
 * post-sync-nonzero workaround (stall-at-scoreboard, then a QW write to
 * the scratch page), followed by the real flush that writes
 * rq->fence.seqno to the active seqno address, and finally an
 * MI_USER_INTERRUPT to signal completion.  Updates rq->tail and returns
 * the advanced @cs pointer.
 */
u32 *gen6_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
	/* First we do the gen6_emit_post_sync_nonzero_flush w/a */
	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
	*cs++ = 0;
	*cs++ = 0;

	/* Non-zero post-sync op: QW write to the default scratch slot. */
	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = PIPE_CONTROL_QW_WRITE;
	*cs++ = intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_DEFAULT) |
		PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = 0;

	/* Finally we can flush and with it emit the breadcrumb */
	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
		 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
		 PIPE_CONTROL_DC_FLUSH_ENABLE |
		 PIPE_CONTROL_QW_WRITE |
		 PIPE_CONTROL_CS_STALL);
	/* Seqno write lands at the active timeline's GGTT address. */
	*cs++ = i915_request_active_seqno(rq) |
		PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = rq->fence.seqno;

	*cs++ = MI_USER_INTERRUPT;
	*cs++ = MI_NOOP;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}
177 
178 static int mi_flush_dw(struct i915_request *rq, u32 flags)
179 {
180 	u32 cmd, *cs;
181 
182 	cs = intel_ring_begin(rq, 4);
183 	if (IS_ERR(cs))
184 		return PTR_ERR(cs);
185 
186 	cmd = MI_FLUSH_DW;
187 
188 	/*
189 	 * We always require a command barrier so that subsequent
190 	 * commands, such as breadcrumb interrupts, are strictly ordered
191 	 * wrt the contents of the write cache being flushed to memory
192 	 * (and thus being coherent from the CPU).
193 	 */
194 	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
195 
196 	/*
197 	 * Bspec vol 1c.3 - blitter engine command streamer:
198 	 * "If ENABLED, all TLBs will be invalidated once the flush
199 	 * operation is complete. This bit is only valid when the
200 	 * Post-Sync Operation field is a value of 1h or 3h."
201 	 */
202 	cmd |= flags;
203 
204 	*cs++ = cmd;
205 	*cs++ = HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
206 	*cs++ = 0;
207 	*cs++ = MI_NOOP;
208 
209 	intel_ring_advance(rq, cs);
210 
211 	return 0;
212 }
213 
214 static int gen6_flush_dw(struct i915_request *rq, u32 mode, u32 invflags)
215 {
216 	return mi_flush_dw(rq, mode & EMIT_INVALIDATE ? invflags : 0);
217 }
218 
219 int gen6_emit_flush_xcs(struct i915_request *rq, u32 mode)
220 {
221 	return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB);
222 }
223 
224 int gen6_emit_flush_vcs(struct i915_request *rq, u32 mode)
225 {
226 	return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB | MI_INVALIDATE_BSD);
227 }
228 
229 int gen6_emit_bb_start(struct i915_request *rq,
230 		       u64 offset, u32 len,
231 		       unsigned int dispatch_flags)
232 {
233 	u32 security;
234 	u32 *cs;
235 
236 	security = MI_BATCH_NON_SECURE_I965;
237 	if (dispatch_flags & I915_DISPATCH_SECURE)
238 		security = 0;
239 
240 	cs = intel_ring_begin(rq, 2);
241 	if (IS_ERR(cs))
242 		return PTR_ERR(cs);
243 
244 	cs = __gen6_emit_bb_start(cs, offset, security);
245 	intel_ring_advance(rq, cs);
246 
247 	return 0;
248 }
249 
250 int
251 hsw_emit_bb_start(struct i915_request *rq,
252 		  u64 offset, u32 len,
253 		  unsigned int dispatch_flags)
254 {
255 	u32 security;
256 	u32 *cs;
257 
258 	security = MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW;
259 	if (dispatch_flags & I915_DISPATCH_SECURE)
260 		security = 0;
261 
262 	cs = intel_ring_begin(rq, 2);
263 	if (IS_ERR(cs))
264 		return PTR_ERR(cs);
265 
266 	cs = __gen6_emit_bb_start(cs, offset, security);
267 	intel_ring_advance(rq, cs);
268 
269 	return 0;
270 }
271 
272 static int gen7_stall_cs(struct i915_request *rq)
273 {
274 	u32 *cs;
275 
276 	cs = intel_ring_begin(rq, 4);
277 	if (IS_ERR(cs))
278 		return PTR_ERR(cs);
279 
280 	*cs++ = GFX_OP_PIPE_CONTROL(4);
281 	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
282 	*cs++ = 0;
283 	*cs++ = 0;
284 	intel_ring_advance(rq, cs);
285 
286 	return 0;
287 }
288 
289 int gen7_emit_flush_rcs(struct i915_request *rq, u32 mode)
290 {
291 	u32 scratch_addr =
292 		intel_gt_scratch_offset(rq->engine->gt,
293 					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
294 	u32 *cs, flags = 0;
295 
296 	/*
297 	 * Ensure that any following seqno writes only happen when the render
298 	 * cache is indeed flushed.
299 	 *
300 	 * Workaround: 4th PIPE_CONTROL command (except the ones with only
301 	 * read-cache invalidate bits set) must have the CS_STALL bit set. We
302 	 * don't try to be clever and just set it unconditionally.
303 	 */
304 	flags |= PIPE_CONTROL_CS_STALL;
305 
306 	/*
307 	 * CS_STALL suggests at least a post-sync write.
308 	 */
309 	flags |= PIPE_CONTROL_QW_WRITE;
310 	flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
311 
312 	/*
313 	 * Just flush everything.  Experiments have shown that reducing the
314 	 * number of bits based on the write domains has little performance
315 	 * impact.
316 	 */
317 	if (mode & EMIT_FLUSH) {
318 		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
319 		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
320 		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
321 		flags |= PIPE_CONTROL_FLUSH_ENABLE;
322 	}
323 	if (mode & EMIT_INVALIDATE) {
324 		flags |= PIPE_CONTROL_TLB_INVALIDATE;
325 		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
326 		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
327 		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
328 		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
329 		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
330 		flags |= PIPE_CONTROL_MEDIA_STATE_CLEAR;
331 
332 		/*
333 		 * Workaround: we must issue a pipe_control with CS-stall bit
334 		 * set before a pipe_control command that has the state cache
335 		 * invalidate bit set.
336 		 */
337 		gen7_stall_cs(rq);
338 	}
339 
340 	cs = intel_ring_begin(rq, 4);
341 	if (IS_ERR(cs))
342 		return PTR_ERR(cs);
343 
344 	*cs++ = GFX_OP_PIPE_CONTROL(4);
345 	*cs++ = flags;
346 	*cs++ = scratch_addr;
347 	*cs++ = 0;
348 	intel_ring_advance(rq, cs);
349 
350 	return 0;
351 }
352 
/*
 * Emit the end-of-request breadcrumb for the gen7 render engine.
 *
 * A single flushing PIPE_CONTROL writes rq->fence.seqno to the active
 * seqno address (post-sync QW write), followed by MI_USER_INTERRUPT.
 * Updates rq->tail and returns the advanced @cs pointer.
 */
u32 *gen7_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
		 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
		 PIPE_CONTROL_DC_FLUSH_ENABLE |
		 PIPE_CONTROL_FLUSH_ENABLE |
		 PIPE_CONTROL_QW_WRITE |
		 PIPE_CONTROL_GLOBAL_GTT_IVB |
		 PIPE_CONTROL_CS_STALL);
	*cs++ = i915_request_active_seqno(rq);
	*cs++ = rq->fence.seqno;

	*cs++ = MI_USER_INTERRUPT;
	*cs++ = MI_NOOP;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}
374 
/*
 * Emit the end-of-request breadcrumb for gen6 non-render engines.
 *
 * MI_FLUSH_DW with a post-sync store writes rq->fence.seqno into the
 * per-engine HWSP seqno slot, then MI_USER_INTERRUPT signals the CPU.
 * Updates rq->tail and returns the advanced @cs pointer.
 */
u32 *gen6_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
	/* The seqno must live in the engine's own status page HWSP. */
	GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
	GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR);

	*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
	*cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
	*cs++ = rq->fence.seqno;

	*cs++ = MI_USER_INTERRUPT;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}
391 
/* Number of extra MI_STORE_DWORD_INDEX repeats emitted as a gen7 w/a. */
#define GEN7_XCS_WA 32
/*
 * Emit the end-of-request breadcrumb for gen7 non-render engines.
 *
 * As gen6_emit_breadcrumb_xcs, but adds a TLB invalidate on the flush,
 * repeats the seqno store GEN7_XCS_WA times via MI_STORE_DWORD_INDEX,
 * and finishes with a plain MI_FLUSH_DW before the user interrupt.
 * Updates rq->tail and returns the advanced @cs pointer.
 */
u32 *gen7_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
	int i;

	/* The seqno must live in the engine's own status page HWSP. */
	GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
	GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR);

	*cs++ = MI_FLUSH_DW | MI_INVALIDATE_TLB |
		MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
	*cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
	*cs++ = rq->fence.seqno;

	/* Repeat the seqno store; workaround repetition count. */
	for (i = 0; i < GEN7_XCS_WA; i++) {
		*cs++ = MI_STORE_DWORD_INDEX;
		*cs++ = I915_GEM_HWS_SEQNO_ADDR;
		*cs++ = rq->fence.seqno;
	}

	*cs++ = MI_FLUSH_DW;
	*cs++ = 0;
	*cs++ = 0;

	*cs++ = MI_USER_INTERRUPT;
	*cs++ = MI_NOOP;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}
#undef GEN7_XCS_WA
424 
/*
 * Unmask this engine's interrupts: first in the per-engine RING_IMR
 * (keeping irq_keep_mask bits unmasked too), then in the GT-level IMR.
 */
void gen6_irq_enable(struct intel_engine_cs *engine)
{
	ENGINE_WRITE(engine, RING_IMR,
		     ~(engine->irq_enable_mask | engine->irq_keep_mask));

	/* Flush/delay to ensure the RING_IMR is active before the GT IMR */
	ENGINE_POSTING_READ(engine, RING_IMR);

	gen5_gt_enable_irq(engine->gt, engine->irq_enable_mask);
}
435 
/*
 * Mask this engine's interrupts in RING_IMR (leaving only the
 * irq_keep_mask bits unmasked) and in the GT-level IMR.
 */
void gen6_irq_disable(struct intel_engine_cs *engine)
{
	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
	gen5_gt_disable_irq(engine->gt, engine->irq_enable_mask);
}
441 
/*
 * Unmask the HSW VECS engine's interrupts: per-engine RING_IMR first,
 * then the PM interrupt mask (VECS interrupts route via the PM IIR).
 */
void hsw_irq_enable_vecs(struct intel_engine_cs *engine)
{
	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_enable_mask);

	/* Flush/delay to ensure the RING_IMR is active before the GT IMR */
	ENGINE_POSTING_READ(engine, RING_IMR);

	gen6_gt_pm_unmask_irq(engine->gt, engine->irq_enable_mask);
}
451 
/*
 * Mask all of the HSW VECS engine's interrupts in RING_IMR, then mask
 * its bits in the PM interrupt mask.
 */
void hsw_irq_disable_vecs(struct intel_engine_cs *engine)
{
	ENGINE_WRITE(engine, RING_IMR, ~0);
	gen6_gt_pm_mask_irq(engine->gt, engine->irq_enable_mask);
}
457