xref: /linux/drivers/gpu/drm/vc4/vc4_validate_shaders.c (revision 5d563a5da8717629ae72f9eadf1e0e340bd1658b)
1 /*
2  * Copyright © 2014 Broadcom
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 /**
25  * DOC: Shader validator for VC4.
26  *
27  * Since the VC4 has no IOMMU between it and system memory, a user
28  * with access to execute shaders could escalate privilege by
29  * overwriting system memory (using the VPM write address register in
30  * the general-purpose DMA mode) or reading system memory it shouldn't
31  * (reading it as a texture, uniform data, or direct-addressed TMU
32  * lookup).
33  *
34  * The shader validator walks over a shader's BO, ensuring that its
35  * accesses are appropriately bounded, and recording where texture
36  * accesses are made so that we can do relocations for them in the
37  * uniform stream.
38  *
39  * Shader BOs are immutable for their lifetimes (enforced by not
40  * allowing mmaps, GEM prime export, or rendering to them from a CL), so
41  * this validation is only performed at BO creation time.
42  */
43 
44 #include <drm/drm_print.h>
45 
46 #include "vc4_drv.h"
47 #include "vc4_qpu_defines.h"
48 
49 #define LIVE_REG_COUNT (32 + 32 + 4)
50 
51 struct vc4_shader_validation_state {
52 	/* Current IP being validated. */
53 	uint32_t ip;
54 
55 	/* IP at the end of the BO, do not read shader[max_ip] */
56 	uint32_t max_ip;
57 
58 	uint64_t *shader;
59 
60 	struct vc4_texture_sample_info tmu_setup[2];
61 	int tmu_write_count[2];
62 
63 	/* For registers that were last written to by a MIN instruction with
64 	 * one argument being a uniform, the address of the uniform.
65 	 * Otherwise, ~0.
66 	 *
67 	 * This is used for the validation of direct address memory reads.
68 	 */
69 	uint32_t live_min_clamp_offsets[LIVE_REG_COUNT];
70 	bool live_max_clamp_regs[LIVE_REG_COUNT];
71 	uint32_t live_immediates[LIVE_REG_COUNT];
72 
73 	/* Bitfield of which IPs are used as branch targets.
74 	 *
75 	 * Used for validation that the uniform stream is updated at the right
76 	 * points and clearing the texturing/clamping state.
77 	 */
78 	unsigned long *branch_targets;
79 
80 	/* Set when entering a basic block, and cleared when the uniform
81 	 * address update is found.  This is used to make sure that we don't
82 	 * read uniforms when the address is undefined.
83 	 */
84 	bool needs_uniform_address_update;
85 
86 	/* Set when we find a backwards branch.  If the branch is backwards,
87 	 * the taraget is probably doing an address reset to read uniforms,
88 	 * and so we need to be sure that a uniforms address is present in the
89 	 * stream, even if the shader didn't need to read uniforms in later
90 	 * basic blocks.
91 	 */
92 	bool needs_uniform_address_for_loop;
93 
94 	/* Set when we find an instruction writing the top half of the
95 	 * register files.  If we allowed writing the unusable regs in
96 	 * a threaded shader, then the other shader running on our
97 	 * QPU's clamp validation would be invalid.
98 	 */
99 	bool all_registers_used;
100 };
101 
102 static uint32_t
103 waddr_to_live_reg_index(uint32_t waddr, bool is_b)
104 {
105 	if (waddr < 32) {
106 		if (is_b)
107 			return 32 + waddr;
108 		else
109 			return waddr;
110 	} else if (waddr <= QPU_W_ACC3) {
111 		return 64 + waddr - QPU_W_ACC0;
112 	} else {
113 		return ~0;
114 	}
115 }
116 
117 static uint32_t
118 raddr_add_a_to_live_reg_index(uint64_t inst)
119 {
120 	uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
121 	uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A);
122 	uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
123 	uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
124 
125 	if (add_a == QPU_MUX_A)
126 		return raddr_a;
127 	else if (add_a == QPU_MUX_B && sig != QPU_SIG_SMALL_IMM)
128 		return 32 + raddr_b;
129 	else if (add_a <= QPU_MUX_R3)
130 		return 64 + add_a;
131 	else
132 		return ~0;
133 }
134 
/* Report whether a live-register slot lies in the upper 16 registers of
 * register file A (slots 16..31) or register file B (slots 48..63).
 * Slots at 64 and above are never in the upper half.
 */
static bool
live_reg_is_upper_half(uint32_t lri)
{
	if (lri >= 32 + 32)
		return false;

	/* Both register files are 32 entries wide, so the position within
	 * the file is the slot number modulo 32.
	 */
	return (lri % 32) >= 16;
}
141 
142 static bool
143 is_tmu_submit(uint32_t waddr)
144 {
145 	return (waddr == QPU_W_TMU0_S ||
146 		waddr == QPU_W_TMU1_S);
147 }
148 
149 static bool
150 is_tmu_write(uint32_t waddr)
151 {
152 	return (waddr >= QPU_W_TMU0_S &&
153 		waddr <= QPU_W_TMU1_B);
154 }
155 
156 static bool
157 record_texture_sample(struct vc4_validated_shader_info *validated_shader,
158 		      struct vc4_shader_validation_state *validation_state,
159 		      int tmu)
160 {
161 	uint32_t s = validated_shader->num_texture_samples;
162 	int i;
163 	struct vc4_texture_sample_info *temp_samples;
164 
165 	temp_samples = krealloc(validated_shader->texture_samples,
166 				(s + 1) * sizeof(*temp_samples),
167 				GFP_KERNEL);
168 	if (!temp_samples)
169 		return false;
170 
171 	memcpy(&temp_samples[s],
172 	       &validation_state->tmu_setup[tmu],
173 	       sizeof(*temp_samples));
174 
175 	validated_shader->num_texture_samples = s + 1;
176 	validated_shader->texture_samples = temp_samples;
177 
178 	for (i = 0; i < 4; i++)
179 		validation_state->tmu_setup[tmu].p_offset[i] = ~0;
180 
181 	return true;
182 }
183 
184 static bool
185 check_tmu_write(struct vc4_validated_shader_info *validated_shader,
186 		struct vc4_shader_validation_state *validation_state,
187 		bool is_mul)
188 {
189 	uint64_t inst = validation_state->shader[validation_state->ip];
190 	uint32_t waddr = (is_mul ?
191 			  QPU_GET_FIELD(inst, QPU_WADDR_MUL) :
192 			  QPU_GET_FIELD(inst, QPU_WADDR_ADD));
193 	uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
194 	uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
195 	int tmu = waddr > QPU_W_TMU0_B;
196 	bool submit = is_tmu_submit(waddr);
197 	bool is_direct = submit && validation_state->tmu_write_count[tmu] == 0;
198 	uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
199 
200 	if (is_direct) {
201 		uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
202 		uint32_t clamp_reg, clamp_offset;
203 
204 		if (sig == QPU_SIG_SMALL_IMM) {
205 			DRM_DEBUG("direct TMU read used small immediate\n");
206 			return false;
207 		}
208 
209 		/* Make sure that this texture load is an add of the base
210 		 * address of the UBO to a clamped offset within the UBO.
211 		 */
212 		if (is_mul ||
213 		    QPU_GET_FIELD(inst, QPU_OP_ADD) != QPU_A_ADD) {
214 			DRM_DEBUG("direct TMU load wasn't an add\n");
215 			return false;
216 		}
217 
218 		/* We assert that the clamped address is the first
219 		 * argument, and the UBO base address is the second argument.
220 		 * This is arbitrary, but simpler than supporting flipping the
221 		 * two either way.
222 		 */
223 		clamp_reg = raddr_add_a_to_live_reg_index(inst);
224 		if (clamp_reg == ~0) {
225 			DRM_DEBUG("direct TMU load wasn't clamped\n");
226 			return false;
227 		}
228 
229 		clamp_offset = validation_state->live_min_clamp_offsets[clamp_reg];
230 		if (clamp_offset == ~0) {
231 			DRM_DEBUG("direct TMU load wasn't clamped\n");
232 			return false;
233 		}
234 
235 		/* Store the clamp value's offset in p1 (see reloc_tex() in
236 		 * vc4_validate.c).
237 		 */
238 		validation_state->tmu_setup[tmu].p_offset[1] =
239 			clamp_offset;
240 
241 		if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) &&
242 		    !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF)) {
243 			DRM_DEBUG("direct TMU load didn't add to a uniform\n");
244 			return false;
245 		}
246 
247 		validation_state->tmu_setup[tmu].is_direct = true;
248 	} else {
249 		if (raddr_a == QPU_R_UNIF || (sig != QPU_SIG_SMALL_IMM &&
250 					      raddr_b == QPU_R_UNIF)) {
251 			DRM_DEBUG("uniform read in the same instruction as "
252 				  "texture setup.\n");
253 			return false;
254 		}
255 	}
256 
257 	if (validation_state->tmu_write_count[tmu] >= 4) {
258 		DRM_DEBUG("TMU%d got too many parameters before dispatch\n",
259 			  tmu);
260 		return false;
261 	}
262 	validation_state->tmu_setup[tmu].p_offset[validation_state->tmu_write_count[tmu]] =
263 		validated_shader->uniforms_size;
264 	validation_state->tmu_write_count[tmu]++;
265 	/* Since direct uses a RADDR uniform reference, it will get counted in
266 	 * check_instruction_reads()
267 	 */
268 	if (!is_direct) {
269 		if (validation_state->needs_uniform_address_update) {
270 			DRM_DEBUG("Texturing with undefined uniform address\n");
271 			return false;
272 		}
273 
274 		validated_shader->uniforms_size += 4;
275 	}
276 
277 	if (submit) {
278 		if (!record_texture_sample(validated_shader,
279 					   validation_state, tmu)) {
280 			return false;
281 		}
282 
283 		validation_state->tmu_write_count[tmu] = 0;
284 	}
285 
286 	return true;
287 }
288 
289 static bool require_uniform_address_uniform(struct vc4_validated_shader_info *validated_shader)
290 {
291 	uint32_t o = validated_shader->num_uniform_addr_offsets;
292 	uint32_t num_uniforms = validated_shader->uniforms_size / 4;
293 	u32 *offsets;
294 
295 	offsets = krealloc_array(validated_shader->uniform_addr_offsets,
296 				 o + 1,
297 				 sizeof(*validated_shader->uniform_addr_offsets),
298 				 GFP_KERNEL);
299 	if (!offsets)
300 		return false;
301 
302 	validated_shader->uniform_addr_offsets = offsets;
303 	validated_shader->uniform_addr_offsets[o] = num_uniforms;
304 	validated_shader->num_uniform_addr_offsets++;
305 
306 	return true;
307 }
308 
309 static bool
310 validate_uniform_address_write(struct vc4_validated_shader_info *validated_shader,
311 			       struct vc4_shader_validation_state *validation_state,
312 			       bool is_mul)
313 {
314 	uint64_t inst = validation_state->shader[validation_state->ip];
315 	u32 add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
316 	u32 raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
317 	u32 raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
318 	u32 add_lri = raddr_add_a_to_live_reg_index(inst);
319 	/* We want our reset to be pointing at whatever uniform follows the
320 	 * uniforms base address.
321 	 */
322 	u32 expected_offset = validated_shader->uniforms_size + 4;
323 
324 	/* We only support absolute uniform address changes, and we
325 	 * require that they be in the current basic block before any
326 	 * of its uniform reads.
327 	 *
328 	 * One could potentially emit more efficient QPU code, by
329 	 * noticing that (say) an if statement does uniform control
330 	 * flow for all threads and that the if reads the same number
331 	 * of uniforms on each side.  However, this scheme is easy to
332 	 * validate so it's all we allow for now.
333 	 */
334 	switch (QPU_GET_FIELD(inst, QPU_SIG)) {
335 	case QPU_SIG_NONE:
336 	case QPU_SIG_SCOREBOARD_UNLOCK:
337 	case QPU_SIG_COLOR_LOAD:
338 	case QPU_SIG_LOAD_TMU0:
339 	case QPU_SIG_LOAD_TMU1:
340 		break;
341 	default:
342 		DRM_DEBUG("uniforms address change must be "
343 			  "normal math\n");
344 		return false;
345 	}
346 
347 	if (is_mul || QPU_GET_FIELD(inst, QPU_OP_ADD) != QPU_A_ADD) {
348 		DRM_DEBUG("Uniform address reset must be an ADD.\n");
349 		return false;
350 	}
351 
352 	if (QPU_GET_FIELD(inst, QPU_COND_ADD) != QPU_COND_ALWAYS) {
353 		DRM_DEBUG("Uniform address reset must be unconditional.\n");
354 		return false;
355 	}
356 
357 	if (QPU_GET_FIELD(inst, QPU_PACK) != QPU_PACK_A_NOP &&
358 	    !(inst & QPU_PM)) {
359 		DRM_DEBUG("No packing allowed on uniforms reset\n");
360 		return false;
361 	}
362 
363 	if (add_lri == -1) {
364 		DRM_DEBUG("First argument of uniform address write must be "
365 			  "an immediate value.\n");
366 		return false;
367 	}
368 
369 	if (validation_state->live_immediates[add_lri] != expected_offset) {
370 		DRM_DEBUG("Resetting uniforms with offset %db instead of %db\n",
371 			  validation_state->live_immediates[add_lri],
372 			  expected_offset);
373 		return false;
374 	}
375 
376 	if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) &&
377 	    !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF)) {
378 		DRM_DEBUG("Second argument of uniform address write must be "
379 			  "a uniform.\n");
380 		return false;
381 	}
382 
383 	validation_state->needs_uniform_address_update = false;
384 	validation_state->needs_uniform_address_for_loop = false;
385 	return require_uniform_address_uniform(validated_shader);
386 }
387 
388 static bool
389 check_reg_write(struct vc4_validated_shader_info *validated_shader,
390 		struct vc4_shader_validation_state *validation_state,
391 		bool is_mul)
392 {
393 	uint64_t inst = validation_state->shader[validation_state->ip];
394 	uint32_t waddr = (is_mul ?
395 			  QPU_GET_FIELD(inst, QPU_WADDR_MUL) :
396 			  QPU_GET_FIELD(inst, QPU_WADDR_ADD));
397 	uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
398 	bool ws = inst & QPU_WS;
399 	bool is_b = is_mul ^ ws;
400 	u32 lri = waddr_to_live_reg_index(waddr, is_b);
401 
402 	if (lri != -1) {
403 		uint32_t cond_add = QPU_GET_FIELD(inst, QPU_COND_ADD);
404 		uint32_t cond_mul = QPU_GET_FIELD(inst, QPU_COND_MUL);
405 
406 		if (sig == QPU_SIG_LOAD_IMM &&
407 		    QPU_GET_FIELD(inst, QPU_PACK) == QPU_PACK_A_NOP &&
408 		    ((is_mul && cond_mul == QPU_COND_ALWAYS) ||
409 		     (!is_mul && cond_add == QPU_COND_ALWAYS))) {
410 			validation_state->live_immediates[lri] =
411 				QPU_GET_FIELD(inst, QPU_LOAD_IMM);
412 		} else {
413 			validation_state->live_immediates[lri] = ~0;
414 		}
415 
416 		if (live_reg_is_upper_half(lri))
417 			validation_state->all_registers_used = true;
418 	}
419 
420 	switch (waddr) {
421 	case QPU_W_UNIFORMS_ADDRESS:
422 		if (is_b) {
423 			DRM_DEBUG("relative uniforms address change "
424 				  "unsupported\n");
425 			return false;
426 		}
427 
428 		return validate_uniform_address_write(validated_shader,
429 						      validation_state,
430 						      is_mul);
431 
432 	case QPU_W_TLB_COLOR_MS:
433 	case QPU_W_TLB_COLOR_ALL:
434 	case QPU_W_TLB_Z:
435 		/* These only interact with the tile buffer, not main memory,
436 		 * so they're safe.
437 		 */
438 		return true;
439 
440 	case QPU_W_TMU0_S:
441 	case QPU_W_TMU0_T:
442 	case QPU_W_TMU0_R:
443 	case QPU_W_TMU0_B:
444 	case QPU_W_TMU1_S:
445 	case QPU_W_TMU1_T:
446 	case QPU_W_TMU1_R:
447 	case QPU_W_TMU1_B:
448 		return check_tmu_write(validated_shader, validation_state,
449 				       is_mul);
450 
451 	case QPU_W_HOST_INT:
452 	case QPU_W_TMU_NOSWAP:
453 	case QPU_W_TLB_ALPHA_MASK:
454 	case QPU_W_MUTEX_RELEASE:
455 		/* XXX: I haven't thought about these, so don't support them
456 		 * for now.
457 		 */
458 		DRM_DEBUG("Unsupported waddr %d\n", waddr);
459 		return false;
460 
461 	case QPU_W_VPM_ADDR:
462 		DRM_DEBUG("General VPM DMA unsupported\n");
463 		return false;
464 
465 	case QPU_W_VPM:
466 	case QPU_W_VPMVCD_SETUP:
467 		/* We allow VPM setup in general, even including VPM DMA
468 		 * configuration setup, because the (unsafe) DMA can only be
469 		 * triggered by QPU_W_VPM_ADDR writes.
470 		 */
471 		return true;
472 
473 	case QPU_W_TLB_STENCIL_SETUP:
474 		return true;
475 	}
476 
477 	return true;
478 }
479 
480 static void
481 track_live_clamps(struct vc4_validated_shader_info *validated_shader,
482 		  struct vc4_shader_validation_state *validation_state)
483 {
484 	uint64_t inst = validation_state->shader[validation_state->ip];
485 	uint32_t op_add = QPU_GET_FIELD(inst, QPU_OP_ADD);
486 	uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
487 	uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
488 	uint32_t cond_add = QPU_GET_FIELD(inst, QPU_COND_ADD);
489 	uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A);
490 	uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
491 	uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
492 	uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
493 	uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
494 	bool ws = inst & QPU_WS;
495 	uint32_t lri_add_a, lri_add, lri_mul;
496 	bool add_a_is_min_0;
497 
498 	/* Check whether OP_ADD's A argumennt comes from a live MAX(x, 0),
499 	 * before we clear previous live state.
500 	 */
501 	lri_add_a = raddr_add_a_to_live_reg_index(inst);
502 	add_a_is_min_0 = (lri_add_a != ~0 &&
503 			  validation_state->live_max_clamp_regs[lri_add_a]);
504 
505 	/* Clear live state for registers written by our instruction. */
506 	lri_add = waddr_to_live_reg_index(waddr_add, ws);
507 	lri_mul = waddr_to_live_reg_index(waddr_mul, !ws);
508 	if (lri_mul != ~0) {
509 		validation_state->live_max_clamp_regs[lri_mul] = false;
510 		validation_state->live_min_clamp_offsets[lri_mul] = ~0;
511 	}
512 	if (lri_add != ~0) {
513 		validation_state->live_max_clamp_regs[lri_add] = false;
514 		validation_state->live_min_clamp_offsets[lri_add] = ~0;
515 	} else {
516 		/* Nothing further to do for live tracking, since only ADDs
517 		 * generate new live clamp registers.
518 		 */
519 		return;
520 	}
521 
522 	/* Now, handle remaining live clamp tracking for the ADD operation. */
523 
524 	if (cond_add != QPU_COND_ALWAYS)
525 		return;
526 
527 	if (op_add == QPU_A_MAX) {
528 		/* Track live clamps of a value to a minimum of 0 (in either
529 		 * arg).
530 		 */
531 		if (sig != QPU_SIG_SMALL_IMM || raddr_b != 0 ||
532 		    (add_a != QPU_MUX_B && add_b != QPU_MUX_B)) {
533 			return;
534 		}
535 
536 		validation_state->live_max_clamp_regs[lri_add] = true;
537 	} else if (op_add == QPU_A_MIN) {
538 		/* Track live clamps of a value clamped to a minimum of 0 and
539 		 * a maximum of some uniform's offset.
540 		 */
541 		if (!add_a_is_min_0)
542 			return;
543 
544 		if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) &&
545 		    !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF &&
546 		      sig != QPU_SIG_SMALL_IMM)) {
547 			return;
548 		}
549 
550 		validation_state->live_min_clamp_offsets[lri_add] =
551 			validated_shader->uniforms_size;
552 	}
553 }
554 
555 static bool
556 check_instruction_writes(struct vc4_validated_shader_info *validated_shader,
557 			 struct vc4_shader_validation_state *validation_state)
558 {
559 	uint64_t inst = validation_state->shader[validation_state->ip];
560 	uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
561 	uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
562 	bool ok;
563 
564 	if (is_tmu_write(waddr_add) && is_tmu_write(waddr_mul)) {
565 		DRM_DEBUG("ADD and MUL both set up textures\n");
566 		return false;
567 	}
568 
569 	ok = (check_reg_write(validated_shader, validation_state, false) &&
570 	      check_reg_write(validated_shader, validation_state, true));
571 
572 	track_live_clamps(validated_shader, validation_state);
573 
574 	return ok;
575 }
576 
577 static bool
578 check_branch(uint64_t inst,
579 	     struct vc4_validated_shader_info *validated_shader,
580 	     struct vc4_shader_validation_state *validation_state,
581 	     int ip)
582 {
583 	int32_t branch_imm = QPU_GET_FIELD(inst, QPU_BRANCH_TARGET);
584 	uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
585 	uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
586 
587 	if ((int)branch_imm < 0)
588 		validation_state->needs_uniform_address_for_loop = true;
589 
590 	/* We don't want to have to worry about validation of this, and
591 	 * there's no need for it.
592 	 */
593 	if (waddr_add != QPU_W_NOP || waddr_mul != QPU_W_NOP) {
594 		DRM_DEBUG("branch instruction at %d wrote a register.\n",
595 			  validation_state->ip);
596 		return false;
597 	}
598 
599 	return true;
600 }
601 
602 static bool
603 check_instruction_reads(struct vc4_validated_shader_info *validated_shader,
604 			struct vc4_shader_validation_state *validation_state)
605 {
606 	uint64_t inst = validation_state->shader[validation_state->ip];
607 	uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
608 	uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
609 	uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
610 
611 	if (raddr_a == QPU_R_UNIF ||
612 	    (raddr_b == QPU_R_UNIF && sig != QPU_SIG_SMALL_IMM)) {
613 		/* This can't overflow the uint32_t, because we're reading 8
614 		 * bytes of instruction to increment by 4 here, so we'd
615 		 * already be OOM.
616 		 */
617 		validated_shader->uniforms_size += 4;
618 
619 		if (validation_state->needs_uniform_address_update) {
620 			DRM_DEBUG("Uniform read with undefined uniform "
621 				  "address\n");
622 			return false;
623 		}
624 	}
625 
626 	if ((raddr_a >= 16 && raddr_a < 32) ||
627 	    (raddr_b >= 16 && raddr_b < 32 && sig != QPU_SIG_SMALL_IMM)) {
628 		validation_state->all_registers_used = true;
629 	}
630 
631 	return true;
632 }
633 
634 /* Make sure that all branches are absolute and point within the shader, and
635  * note their targets for later.
636  */
637 static bool
638 vc4_validate_branches(struct vc4_shader_validation_state *validation_state)
639 {
640 	uint32_t max_branch_target = 0;
641 	int ip;
642 	int last_branch = -2;
643 
644 	for (ip = 0; ip < validation_state->max_ip; ip++) {
645 		uint64_t inst = validation_state->shader[ip];
646 		int32_t branch_imm = QPU_GET_FIELD(inst, QPU_BRANCH_TARGET);
647 		uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
648 		uint32_t after_delay_ip = ip + 4;
649 		uint32_t branch_target_ip;
650 
651 		if (sig == QPU_SIG_PROG_END) {
652 			/* There are two delay slots after program end is
653 			 * signaled that are still executed, then we're
654 			 * finished.  validation_state->max_ip is the
655 			 * instruction after the last valid instruction in the
656 			 * program.
657 			 */
658 			validation_state->max_ip = ip + 3;
659 			continue;
660 		}
661 
662 		if (sig != QPU_SIG_BRANCH)
663 			continue;
664 
665 		if (ip - last_branch < 4) {
666 			DRM_DEBUG("Branch at %d during delay slots\n", ip);
667 			return false;
668 		}
669 		last_branch = ip;
670 
671 		if (inst & QPU_BRANCH_REG) {
672 			DRM_DEBUG("branching from register relative "
673 				  "not supported\n");
674 			return false;
675 		}
676 
677 		if (!(inst & QPU_BRANCH_REL)) {
678 			DRM_DEBUG("relative branching required\n");
679 			return false;
680 		}
681 
682 		/* The actual branch target is the instruction after the delay
683 		 * slots, plus whatever byte offset is in the low 32 bits of
684 		 * the instruction.  Make sure we're not branching beyond the
685 		 * end of the shader object.
686 		 */
687 		if (branch_imm % sizeof(inst) != 0) {
688 			DRM_DEBUG("branch target not aligned\n");
689 			return false;
690 		}
691 
692 		branch_target_ip = after_delay_ip + (branch_imm >> 3);
693 		if (branch_target_ip >= validation_state->max_ip) {
694 			DRM_DEBUG("Branch at %d outside of shader (ip %d/%d)\n",
695 				  ip, branch_target_ip,
696 				  validation_state->max_ip);
697 			return false;
698 		}
699 		set_bit(branch_target_ip, validation_state->branch_targets);
700 
701 		/* Make sure that the non-branching path is also not outside
702 		 * the shader.
703 		 */
704 		if (after_delay_ip >= validation_state->max_ip) {
705 			DRM_DEBUG("Branch at %d continues past shader end "
706 				  "(%d/%d)\n",
707 				  ip, after_delay_ip, validation_state->max_ip);
708 			return false;
709 		}
710 		set_bit(after_delay_ip, validation_state->branch_targets);
711 		max_branch_target = max(max_branch_target, after_delay_ip);
712 	}
713 
714 	if (max_branch_target > validation_state->max_ip - 3) {
715 		DRM_DEBUG("Branch landed after QPU_SIG_PROG_END");
716 		return false;
717 	}
718 
719 	return true;
720 }
721 
722 /* Resets any known state for the shader, used when we may be branched to from
723  * multiple locations in the program (or at shader start).
724  */
725 static void
726 reset_validation_state(struct vc4_shader_validation_state *validation_state)
727 {
728 	int i;
729 
730 	for (i = 0; i < 8; i++)
731 		validation_state->tmu_setup[i / 4].p_offset[i % 4] = ~0;
732 
733 	for (i = 0; i < LIVE_REG_COUNT; i++) {
734 		validation_state->live_min_clamp_offsets[i] = ~0;
735 		validation_state->live_max_clamp_regs[i] = false;
736 		validation_state->live_immediates[i] = ~0;
737 	}
738 }
739 
740 static bool
741 texturing_in_progress(struct vc4_shader_validation_state *validation_state)
742 {
743 	return (validation_state->tmu_write_count[0] != 0 ||
744 		validation_state->tmu_write_count[1] != 0);
745 }
746 
747 static bool
748 vc4_handle_branch_target(struct vc4_shader_validation_state *validation_state)
749 {
750 	uint32_t ip = validation_state->ip;
751 
752 	if (!test_bit(ip, validation_state->branch_targets))
753 		return true;
754 
755 	if (texturing_in_progress(validation_state)) {
756 		DRM_DEBUG("Branch target landed during TMU setup\n");
757 		return false;
758 	}
759 
760 	/* Reset our live values tracking, since this instruction may have
761 	 * multiple predecessors.
762 	 *
763 	 * One could potentially do analysis to determine that, for
764 	 * example, all predecessors have a live max clamp in the same
765 	 * register, but we don't bother with that.
766 	 */
767 	reset_validation_state(validation_state);
768 
769 	/* Since we've entered a basic block from potentially multiple
770 	 * predecessors, we need the uniforms address to be updated before any
771 	 * unforms are read.  We require that after any branch point, the next
772 	 * uniform to be loaded is a uniform address offset.  That uniform's
773 	 * offset will be marked by the uniform address register write
774 	 * validation, or a one-off the end-of-program check.
775 	 */
776 	validation_state->needs_uniform_address_update = true;
777 
778 	return true;
779 }
780 
781 struct vc4_validated_shader_info *
782 vc4_validate_shader(struct drm_gem_dma_object *shader_obj)
783 {
784 	struct vc4_dev *vc4 = to_vc4_dev(shader_obj->base.dev);
785 	bool found_shader_end = false;
786 	int shader_end_ip = 0;
787 	uint32_t last_thread_switch_ip = -3;
788 	uint32_t ip;
789 	struct vc4_validated_shader_info *validated_shader = NULL;
790 	struct vc4_shader_validation_state validation_state;
791 
792 	if (WARN_ON_ONCE(vc4->gen > VC4_GEN_4))
793 		return NULL;
794 
795 	memset(&validation_state, 0, sizeof(validation_state));
796 	validation_state.shader = shader_obj->vaddr;
797 	validation_state.max_ip = shader_obj->base.size / sizeof(uint64_t);
798 
799 	reset_validation_state(&validation_state);
800 
801 	validation_state.branch_targets =
802 		kcalloc(BITS_TO_LONGS(validation_state.max_ip),
803 			sizeof(unsigned long), GFP_KERNEL);
804 	if (!validation_state.branch_targets)
805 		goto fail;
806 
807 	validated_shader = kzalloc_objs(*validated_shader, 1);
808 	if (!validated_shader)
809 		goto fail;
810 
811 	if (!vc4_validate_branches(&validation_state))
812 		goto fail;
813 
814 	for (ip = 0; ip < validation_state.max_ip; ip++) {
815 		uint64_t inst = validation_state.shader[ip];
816 		uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
817 
818 		validation_state.ip = ip;
819 
820 		if (!vc4_handle_branch_target(&validation_state))
821 			goto fail;
822 
823 		if (ip == last_thread_switch_ip + 3) {
824 			/* Reset r0-r3 live clamp data */
825 			int i;
826 
827 			for (i = 64; i < LIVE_REG_COUNT; i++) {
828 				validation_state.live_min_clamp_offsets[i] = ~0;
829 				validation_state.live_max_clamp_regs[i] = false;
830 				validation_state.live_immediates[i] = ~0;
831 			}
832 		}
833 
834 		switch (sig) {
835 		case QPU_SIG_NONE:
836 		case QPU_SIG_WAIT_FOR_SCOREBOARD:
837 		case QPU_SIG_SCOREBOARD_UNLOCK:
838 		case QPU_SIG_COLOR_LOAD:
839 		case QPU_SIG_LOAD_TMU0:
840 		case QPU_SIG_LOAD_TMU1:
841 		case QPU_SIG_PROG_END:
842 		case QPU_SIG_SMALL_IMM:
843 		case QPU_SIG_THREAD_SWITCH:
844 		case QPU_SIG_LAST_THREAD_SWITCH:
845 			if (!check_instruction_writes(validated_shader,
846 						      &validation_state)) {
847 				DRM_DEBUG("Bad write at ip %d\n", ip);
848 				goto fail;
849 			}
850 
851 			if (!check_instruction_reads(validated_shader,
852 						     &validation_state))
853 				goto fail;
854 
855 			if (sig == QPU_SIG_PROG_END) {
856 				found_shader_end = true;
857 				shader_end_ip = ip;
858 			}
859 
860 			if (sig == QPU_SIG_THREAD_SWITCH ||
861 			    sig == QPU_SIG_LAST_THREAD_SWITCH) {
862 				validated_shader->is_threaded = true;
863 
864 				if (ip < last_thread_switch_ip + 3) {
865 					DRM_DEBUG("Thread switch too soon after "
866 						  "last switch at ip %d\n", ip);
867 					goto fail;
868 				}
869 				last_thread_switch_ip = ip;
870 			}
871 
872 			break;
873 
874 		case QPU_SIG_LOAD_IMM:
875 			if (!check_instruction_writes(validated_shader,
876 						      &validation_state)) {
877 				DRM_DEBUG("Bad LOAD_IMM write at ip %d\n", ip);
878 				goto fail;
879 			}
880 			break;
881 
882 		case QPU_SIG_BRANCH:
883 			if (!check_branch(inst, validated_shader,
884 					  &validation_state, ip))
885 				goto fail;
886 
887 			if (ip < last_thread_switch_ip + 3) {
888 				DRM_DEBUG("Branch in thread switch at ip %d",
889 					  ip);
890 				goto fail;
891 			}
892 
893 			break;
894 		default:
895 			DRM_DEBUG("Unsupported QPU signal %d at "
896 				  "instruction %d\n", sig, ip);
897 			goto fail;
898 		}
899 
900 		/* There are two delay slots after program end is signaled
901 		 * that are still executed, then we're finished.
902 		 */
903 		if (found_shader_end && ip == shader_end_ip + 2)
904 			break;
905 	}
906 
907 	if (ip == validation_state.max_ip) {
908 		DRM_DEBUG("shader failed to terminate before "
909 			  "shader BO end at %zd\n",
910 			  shader_obj->base.size);
911 		goto fail;
912 	}
913 
914 	/* Might corrupt other thread */
915 	if (validated_shader->is_threaded &&
916 	    validation_state.all_registers_used) {
917 		DRM_DEBUG("Shader uses threading, but uses the upper "
918 			  "half of the registers, too\n");
919 		goto fail;
920 	}
921 
922 	/* If we did a backwards branch and we haven't emitted a uniforms
923 	 * reset since then, we still need the uniforms stream to have the
924 	 * uniforms address available so that the backwards branch can do its
925 	 * uniforms reset.
926 	 *
927 	 * We could potentially prove that the backwards branch doesn't
928 	 * contain any uses of uniforms until program exit, but that doesn't
929 	 * seem to be worth the trouble.
930 	 */
931 	if (validation_state.needs_uniform_address_for_loop) {
932 		if (!require_uniform_address_uniform(validated_shader))
933 			goto fail;
934 		validated_shader->uniforms_size += 4;
935 	}
936 
937 	/* Again, no chance of integer overflow here because the worst case
938 	 * scenario is 8 bytes of uniforms plus handles per 8-byte
939 	 * instruction.
940 	 */
941 	validated_shader->uniforms_src_size =
942 		(validated_shader->uniforms_size +
943 		 4 * validated_shader->num_texture_samples);
944 
945 	kfree(validation_state.branch_targets);
946 
947 	return validated_shader;
948 
949 fail:
950 	kfree(validation_state.branch_targets);
951 	if (validated_shader) {
952 		kfree(validated_shader->uniform_addr_offsets);
953 		kfree(validated_shader->texture_samples);
954 		kfree(validated_shader);
955 	}
956 	return NULL;
957 }
958