xref: /linux/drivers/gpu/drm/i915/gt/gen2_engine_cs.c (revision b8321ed4a40c02054f930ca59d3570caa27bc86c)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2020 Intel Corporation
4  */
5 
6 #include "gen2_engine_cs.h"
7 #include "i915_drv.h"
8 #include "i915_reg.h"
9 #include "intel_engine.h"
10 #include "intel_engine_regs.h"
11 #include "intel_gpu_commands.h"
12 #include "intel_gt.h"
13 #include "intel_gt_irq.h"
14 #include "intel_ring.h"
15 
16 int gen2_emit_flush(struct i915_request *rq, u32 mode)
17 {
18 	unsigned int num_store_dw = 12;
19 	u32 cmd, *cs;
20 
21 	cmd = MI_FLUSH;
22 	if (mode & EMIT_INVALIDATE)
23 		cmd |= MI_READ_FLUSH;
24 
25 	cs = intel_ring_begin(rq, 2 + 4 * num_store_dw);
26 	if (IS_ERR(cs))
27 		return PTR_ERR(cs);
28 
29 	*cs++ = cmd;
30 	while (num_store_dw--) {
31 		*cs++ = MI_STORE_DWORD_INDEX;
32 		*cs++ = I915_GEM_HWS_SCRATCH * sizeof(u32);
33 		*cs++ = 0;
34 		*cs++ = MI_FLUSH | MI_NO_WRITE_FLUSH;
35 	}
36 	*cs++ = cmd;
37 
38 	intel_ring_advance(rq, cs);
39 
40 	return 0;
41 }
42 
43 int gen4_emit_flush_rcs(struct i915_request *rq, u32 mode)
44 {
45 	u32 cmd, *cs;
46 	int i;
47 
48 	/*
49 	 * read/write caches:
50 	 *
51 	 * I915_GEM_DOMAIN_RENDER is always invalidated, but is
52 	 * only flushed if MI_NO_WRITE_FLUSH is unset.  On 965, it is
53 	 * also flushed at 2d versus 3d pipeline switches.
54 	 *
55 	 * read-only caches:
56 	 *
57 	 * I915_GEM_DOMAIN_SAMPLER is flushed on pre-965 if
58 	 * MI_READ_FLUSH is set, and is always flushed on 965.
59 	 *
60 	 * I915_GEM_DOMAIN_COMMAND may not exist?
61 	 *
62 	 * I915_GEM_DOMAIN_INSTRUCTION, which exists on 965, is
63 	 * invalidated when MI_EXE_FLUSH is set.
64 	 *
65 	 * I915_GEM_DOMAIN_VERTEX, which exists on 965, is
66 	 * invalidated with every MI_FLUSH.
67 	 *
68 	 * TLBs:
69 	 *
70 	 * On 965, TLBs associated with I915_GEM_DOMAIN_COMMAND
71 	 * and I915_GEM_DOMAIN_CPU in are invalidated at PTE write and
72 	 * I915_GEM_DOMAIN_RENDER and I915_GEM_DOMAIN_SAMPLER
73 	 * are flushed at any MI_FLUSH.
74 	 */
75 
76 	cmd = MI_FLUSH;
77 	if (mode & EMIT_INVALIDATE) {
78 		cmd |= MI_EXE_FLUSH;
79 		if (IS_G4X(rq->engine->i915) || GRAPHICS_VER(rq->engine->i915) == 5)
80 			cmd |= MI_INVALIDATE_ISP;
81 	}
82 
83 	i = 2;
84 	if (mode & EMIT_INVALIDATE)
85 		i += 20;
86 
87 	cs = intel_ring_begin(rq, i);
88 	if (IS_ERR(cs))
89 		return PTR_ERR(cs);
90 
91 	*cs++ = cmd;
92 
93 	/*
94 	 * A random delay to let the CS invalidate take effect? Without this
95 	 * delay, the GPU relocation path fails as the CS does not see
96 	 * the updated contents. Just as important, if we apply the flushes
97 	 * to the EMIT_FLUSH branch (i.e. immediately after the relocation
98 	 * write and before the invalidate on the next batch), the relocations
99 	 * still fail. This implies that is a delay following invalidation
100 	 * that is required to reset the caches as opposed to a delay to
101 	 * ensure the memory is written.
102 	 */
103 	if (mode & EMIT_INVALIDATE) {
104 		*cs++ = GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE;
105 		*cs++ = intel_gt_scratch_offset(rq->engine->gt,
106 						INTEL_GT_SCRATCH_FIELD_DEFAULT) |
107 			PIPE_CONTROL_GLOBAL_GTT;
108 		*cs++ = 0;
109 		*cs++ = 0;
110 
111 		for (i = 0; i < 12; i++)
112 			*cs++ = MI_FLUSH;
113 
114 		*cs++ = GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE;
115 		*cs++ = intel_gt_scratch_offset(rq->engine->gt,
116 						INTEL_GT_SCRATCH_FIELD_DEFAULT) |
117 			PIPE_CONTROL_GLOBAL_GTT;
118 		*cs++ = 0;
119 		*cs++ = 0;
120 	}
121 
122 	*cs++ = cmd;
123 
124 	intel_ring_advance(rq, cs);
125 
126 	return 0;
127 }
128 
129 int gen4_emit_flush_vcs(struct i915_request *rq, u32 mode)
130 {
131 	u32 *cs;
132 
133 	cs = intel_ring_begin(rq, 2);
134 	if (IS_ERR(cs))
135 		return PTR_ERR(cs);
136 
137 	*cs++ = MI_FLUSH;
138 	*cs++ = MI_NOOP;
139 	intel_ring_advance(rq, cs);
140 
141 	return 0;
142 }
143 
144 static u32 *__gen2_emit_breadcrumb(struct i915_request *rq, u32 *cs,
145 				   int flush, int post)
146 {
147 	GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
148 	GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR);
149 
150 	*cs++ = MI_FLUSH;
151 
152 	while (flush--) {
153 		*cs++ = MI_STORE_DWORD_INDEX;
154 		*cs++ = I915_GEM_HWS_SCRATCH * sizeof(u32);
155 		*cs++ = rq->fence.seqno;
156 	}
157 
158 	while (post--) {
159 		*cs++ = MI_STORE_DWORD_INDEX;
160 		*cs++ = I915_GEM_HWS_SEQNO_ADDR;
161 		*cs++ = rq->fence.seqno;
162 	}
163 
164 	*cs++ = MI_USER_INTERRUPT;
165 
166 	rq->tail = intel_ring_offset(rq, cs);
167 	assert_ring_tail_valid(rq->ring, rq->tail);
168 
169 	return cs;
170 }
171 
172 u32 *gen3_emit_breadcrumb(struct i915_request *rq, u32 *cs)
173 {
174 	return __gen2_emit_breadcrumb(rq, cs, 16, 8);
175 }
176 
177 u32 *gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs)
178 {
179 	return __gen2_emit_breadcrumb(rq, cs, 8, 8);
180 }
181 
182 /* Just userspace ABI convention to limit the wa batch bo to a resonable size */
183 #define I830_BATCH_LIMIT SZ_256K
184 #define I830_TLB_ENTRIES (2)
185 #define I830_WA_SIZE max(I830_TLB_ENTRIES * SZ_4K, I830_BATCH_LIMIT)
186 int i830_emit_bb_start(struct i915_request *rq,
187 		       u64 offset, u32 len,
188 		       unsigned int dispatch_flags)
189 {
190 	u32 *cs, cs_offset =
191 		intel_gt_scratch_offset(rq->engine->gt,
192 					INTEL_GT_SCRATCH_FIELD_DEFAULT);
193 
194 	GEM_BUG_ON(rq->engine->gt->scratch->size < I830_WA_SIZE);
195 
196 	cs = intel_ring_begin(rq, 6);
197 	if (IS_ERR(cs))
198 		return PTR_ERR(cs);
199 
200 	/* Evict the invalid PTE TLBs */
201 	*cs++ = COLOR_BLT_CMD | BLT_WRITE_RGBA;
202 	*cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | 4096;
203 	*cs++ = I830_TLB_ENTRIES << 16 | 4; /* load each page */
204 	*cs++ = cs_offset;
205 	*cs++ = 0xdeadbeef;
206 	*cs++ = MI_NOOP;
207 	intel_ring_advance(rq, cs);
208 
209 	if ((dispatch_flags & I915_DISPATCH_PINNED) == 0) {
210 		if (len > I830_BATCH_LIMIT)
211 			return -ENOSPC;
212 
213 		cs = intel_ring_begin(rq, 6 + 2);
214 		if (IS_ERR(cs))
215 			return PTR_ERR(cs);
216 
217 		/*
218 		 * Blit the batch (which has now all relocs applied) to the
219 		 * stable batch scratch bo area (so that the CS never
220 		 * stumbles over its tlb invalidation bug) ...
221 		 */
222 		*cs++ = SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (6 - 2);
223 		*cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | 4096;
224 		*cs++ = DIV_ROUND_UP(len, 4096) << 16 | 4096;
225 		*cs++ = cs_offset;
226 		*cs++ = 4096;
227 		*cs++ = offset;
228 
229 		*cs++ = MI_FLUSH;
230 		*cs++ = MI_NOOP;
231 		intel_ring_advance(rq, cs);
232 
233 		/* ... and execute it. */
234 		offset = cs_offset;
235 	}
236 
237 	if (!(dispatch_flags & I915_DISPATCH_SECURE))
238 		offset |= MI_BATCH_NON_SECURE;
239 
240 	cs = intel_ring_begin(rq, 2);
241 	if (IS_ERR(cs))
242 		return PTR_ERR(cs);
243 
244 	*cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
245 	*cs++ = offset;
246 	intel_ring_advance(rq, cs);
247 
248 	return 0;
249 }
250 
251 int gen3_emit_bb_start(struct i915_request *rq,
252 		       u64 offset, u32 len,
253 		       unsigned int dispatch_flags)
254 {
255 	u32 *cs;
256 
257 	if (!(dispatch_flags & I915_DISPATCH_SECURE))
258 		offset |= MI_BATCH_NON_SECURE;
259 
260 	cs = intel_ring_begin(rq, 2);
261 	if (IS_ERR(cs))
262 		return PTR_ERR(cs);
263 
264 	*cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
265 	*cs++ = offset;
266 	intel_ring_advance(rq, cs);
267 
268 	return 0;
269 }
270 
271 int gen4_emit_bb_start(struct i915_request *rq,
272 		       u64 offset, u32 length,
273 		       unsigned int dispatch_flags)
274 {
275 	u32 security;
276 	u32 *cs;
277 
278 	security = MI_BATCH_NON_SECURE_I965;
279 	if (dispatch_flags & I915_DISPATCH_SECURE)
280 		security = 0;
281 
282 	cs = intel_ring_begin(rq, 2);
283 	if (IS_ERR(cs))
284 		return PTR_ERR(cs);
285 
286 	*cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT | security;
287 	*cs++ = offset;
288 	intel_ring_advance(rq, cs);
289 
290 	return 0;
291 }
292 
293 void gen2_irq_enable(struct intel_engine_cs *engine)
294 {
295 	struct drm_i915_private *i915 = engine->i915;
296 
297 	i915->irq_mask &= ~engine->irq_enable_mask;
298 	intel_uncore_write16(&i915->uncore, GEN2_IMR, i915->irq_mask);
299 	ENGINE_POSTING_READ16(engine, RING_IMR);
300 }
301 
302 void gen2_irq_disable(struct intel_engine_cs *engine)
303 {
304 	struct drm_i915_private *i915 = engine->i915;
305 
306 	i915->irq_mask |= engine->irq_enable_mask;
307 	intel_uncore_write16(&i915->uncore, GEN2_IMR, i915->irq_mask);
308 }
309 
310 void gen3_irq_enable(struct intel_engine_cs *engine)
311 {
312 	engine->i915->irq_mask &= ~engine->irq_enable_mask;
313 	intel_uncore_write(engine->uncore, GEN2_IMR, engine->i915->irq_mask);
314 	intel_uncore_posting_read_fw(engine->uncore, GEN2_IMR);
315 }
316 
317 void gen3_irq_disable(struct intel_engine_cs *engine)
318 {
319 	engine->i915->irq_mask |= engine->irq_enable_mask;
320 	intel_uncore_write(engine->uncore, GEN2_IMR, engine->i915->irq_mask);
321 }
322 
323 void gen5_irq_enable(struct intel_engine_cs *engine)
324 {
325 	gen5_gt_enable_irq(engine->gt, engine->irq_enable_mask);
326 }
327 
328 void gen5_irq_disable(struct intel_engine_cs *engine)
329 {
330 	gen5_gt_disable_irq(engine->gt, engine->irq_enable_mask);
331 }
332