1 /*
2 * Copyright © 2014 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /**
25 * DOC: Shader validator for VC4.
26 *
27 * Since the VC4 has no IOMMU between it and system memory, a user
28 * with access to execute shaders could escalate privilege by
29 * overwriting system memory (using the VPM write address register in
30 * the general-purpose DMA mode) or reading system memory it shouldn't
31 * (reading it as a texture, uniform data, or direct-addressed TMU
32 * lookup).
33 *
34 * The shader validator walks over a shader's BO, ensuring that its
35 * accesses are appropriately bounded, and recording where texture
36 * accesses are made so that we can do relocations for them in the
37 * uniform stream.
38 *
 * Shader BOs are immutable for their lifetimes (enforced by not
 * allowing mmaps, GEM prime export, or rendering to them from a CL),
 * so this validation is only performed at BO creation time.
42 */
43
44 #include <drm/drm_print.h>
45
46 #include "vc4_drv.h"
47 #include "vc4_qpu_defines.h"
48
49 #define LIVE_REG_COUNT (32 + 32 + 4)
50
struct vc4_shader_validation_state {
	/* Current IP being validated. */
	uint32_t ip;

	/* IP at the end of the BO, do not read shader[max_ip] */
	uint32_t max_ip;

	/* Kernel mapping of the shader BO's instructions. */
	uint64_t *shader;

	/* In-flight TMU parameter writes for TMU0/TMU1, and how many
	 * parameters have been written since the last submit.
	 */
	struct vc4_texture_sample_info tmu_setup[2];
	int tmu_write_count[2];

	/* For registers that were last written to by a MIN instruction with
	 * one argument being a uniform, the address of the uniform.
	 * Otherwise, ~0.
	 *
	 * This is used for the validation of direct address memory reads.
	 */
	uint32_t live_min_clamp_offsets[LIVE_REG_COUNT];
	/* True for registers last written by a MAX(x, 0) (see
	 * track_live_clamps()).
	 */
	bool live_max_clamp_regs[LIVE_REG_COUNT];
	/* Immediate value last loaded into each register, or ~0 if the
	 * register doesn't hold a known immediate.
	 */
	uint32_t live_immediates[LIVE_REG_COUNT];

	/* Bitfield of which IPs are used as branch targets.
	 *
	 * Used for validation that the uniform stream is updated at the right
	 * points and clearing the texturing/clamping state.
	 */
	unsigned long *branch_targets;

	/* Set when entering a basic block, and cleared when the uniform
	 * address update is found. This is used to make sure that we don't
	 * read uniforms when the address is undefined.
	 */
	bool needs_uniform_address_update;

	/* Set when we find a backwards branch. If the branch is backwards,
	 * the target is probably doing an address reset to read uniforms,
	 * and so we need to be sure that a uniforms address is present in the
	 * stream, even if the shader didn't need to read uniforms in later
	 * basic blocks.
	 */
	bool needs_uniform_address_for_loop;

	/* Set when we find an instruction writing the top half of the
	 * register files. If we allowed writing the unusable regs in
	 * a threaded shader, then the other shader running on our
	 * QPU's clamp validation would be invalid.
	 */
	bool all_registers_used;
};
101
102 static uint32_t
waddr_to_live_reg_index(uint32_t waddr,bool is_b)103 waddr_to_live_reg_index(uint32_t waddr, bool is_b)
104 {
105 if (waddr < 32) {
106 if (is_b)
107 return 32 + waddr;
108 else
109 return waddr;
110 } else if (waddr <= QPU_W_ACC3) {
111 return 64 + waddr - QPU_W_ACC0;
112 } else {
113 return ~0;
114 }
115 }
116
117 static uint32_t
raddr_add_a_to_live_reg_index(uint64_t inst)118 raddr_add_a_to_live_reg_index(uint64_t inst)
119 {
120 uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
121 uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A);
122 uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
123 uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
124
125 if (add_a == QPU_MUX_A)
126 return raddr_a;
127 else if (add_a == QPU_MUX_B && sig != QPU_SIG_SMALL_IMM)
128 return 32 + raddr_b;
129 else if (add_a <= QPU_MUX_R3)
130 return 64 + add_a;
131 else
132 return ~0;
133 }
134
/* Returns whether a live-register index refers to the upper half
 * (registers 16-31) of regfile A or B, which threaded shaders must
 * not touch.
 */
static bool
live_reg_is_upper_half(uint32_t lri)
{
	/* Accumulators (index 64+) are never in the upper half. */
	if (lri >= 64)
		return false;

	return (lri % 32) >= 16;
}
141
142 static bool
is_tmu_submit(uint32_t waddr)143 is_tmu_submit(uint32_t waddr)
144 {
145 return (waddr == QPU_W_TMU0_S ||
146 waddr == QPU_W_TMU1_S);
147 }
148
149 static bool
is_tmu_write(uint32_t waddr)150 is_tmu_write(uint32_t waddr)
151 {
152 return (waddr >= QPU_W_TMU0_S &&
153 waddr <= QPU_W_TMU1_B);
154 }
155
156 static bool
record_texture_sample(struct vc4_validated_shader_info * validated_shader,struct vc4_shader_validation_state * validation_state,int tmu)157 record_texture_sample(struct vc4_validated_shader_info *validated_shader,
158 struct vc4_shader_validation_state *validation_state,
159 int tmu)
160 {
161 uint32_t s = validated_shader->num_texture_samples;
162 int i;
163 struct vc4_texture_sample_info *temp_samples;
164
165 temp_samples = krealloc(validated_shader->texture_samples,
166 (s + 1) * sizeof(*temp_samples),
167 GFP_KERNEL);
168 if (!temp_samples)
169 return false;
170
171 memcpy(&temp_samples[s],
172 &validation_state->tmu_setup[tmu],
173 sizeof(*temp_samples));
174
175 validated_shader->num_texture_samples = s + 1;
176 validated_shader->texture_samples = temp_samples;
177
178 for (i = 0; i < 4; i++)
179 validation_state->tmu_setup[tmu].p_offset[i] = ~0;
180
181 return true;
182 }
183
/* Validates a write to one of the TMU registers, recording where the
 * uniform stream will need relocation and checking that
 * direct-addressed memory reads are bounded by a live clamp.
 */
static bool
check_tmu_write(struct vc4_validated_shader_info *validated_shader,
		struct vc4_shader_validation_state *validation_state,
		bool is_mul)
{
	uint64_t inst = validation_state->shader[validation_state->ip];
	uint32_t waddr = (is_mul ?
			  QPU_GET_FIELD(inst, QPU_WADDR_MUL) :
			  QPU_GET_FIELD(inst, QPU_WADDR_ADD));
	uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
	uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
	/* TMU1's registers sit above TMU0's in the waddr space (see
	 * is_tmu_write()).
	 */
	int tmu = waddr > QPU_W_TMU0_B;
	bool submit = is_tmu_submit(waddr);
	/* A submit with no parameters written yet is a direct memory read. */
	bool is_direct = submit && validation_state->tmu_write_count[tmu] == 0;
	uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

	if (is_direct) {
		uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
		uint32_t clamp_reg, clamp_offset;

		if (sig == QPU_SIG_SMALL_IMM) {
			DRM_DEBUG("direct TMU read used small immediate\n");
			return false;
		}

		/* Make sure that this texture load is an add of the base
		 * address of the UBO to a clamped offset within the UBO.
		 */
		if (is_mul ||
		    QPU_GET_FIELD(inst, QPU_OP_ADD) != QPU_A_ADD) {
			DRM_DEBUG("direct TMU load wasn't an add\n");
			return false;
		}

		/* We assert that the clamped address is the first
		 * argument, and the UBO base address is the second argument.
		 * This is arbitrary, but simpler than supporting flipping the
		 * two either way.
		 */
		clamp_reg = raddr_add_a_to_live_reg_index(inst);
		if (clamp_reg == ~0) {
			DRM_DEBUG("direct TMU load wasn't clamped\n");
			return false;
		}

		clamp_offset = validation_state->live_min_clamp_offsets[clamp_reg];
		if (clamp_offset == ~0) {
			DRM_DEBUG("direct TMU load wasn't clamped\n");
			return false;
		}

		/* Store the clamp value's offset in p1 (see reloc_tex() in
		 * vc4_validate.c).
		 */
		validation_state->tmu_setup[tmu].p_offset[1] =
			clamp_offset;

		if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) &&
		    !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF)) {
			DRM_DEBUG("direct TMU load didn't add to a uniform\n");
			return false;
		}

		validation_state->tmu_setup[tmu].is_direct = true;
	} else {
		if (raddr_a == QPU_R_UNIF || (sig != QPU_SIG_SMALL_IMM &&
					      raddr_b == QPU_R_UNIF)) {
			DRM_DEBUG("uniform read in the same instruction as "
				  "texture setup.\n");
			return false;
		}
	}

	if (validation_state->tmu_write_count[tmu] >= 4) {
		DRM_DEBUG("TMU%d got too many parameters before dispatch\n",
			  tmu);
		return false;
	}
	validation_state->tmu_setup[tmu].p_offset[validation_state->tmu_write_count[tmu]] =
		validated_shader->uniforms_size;
	validation_state->tmu_write_count[tmu]++;
	/* Since direct uses a RADDR uniform reference, it will get counted in
	 * check_instruction_reads()
	 */
	if (!is_direct) {
		if (validation_state->needs_uniform_address_update) {
			DRM_DEBUG("Texturing with undefined uniform address\n");
			return false;
		}

		validated_shader->uniforms_size += 4;
	}

	if (submit) {
		if (!record_texture_sample(validated_shader,
					   validation_state, tmu)) {
			return false;
		}

		validation_state->tmu_write_count[tmu] = 0;
	}

	return true;
}
288
require_uniform_address_uniform(struct vc4_validated_shader_info * validated_shader)289 static bool require_uniform_address_uniform(struct vc4_validated_shader_info *validated_shader)
290 {
291 uint32_t o = validated_shader->num_uniform_addr_offsets;
292 uint32_t num_uniforms = validated_shader->uniforms_size / 4;
293
294 validated_shader->uniform_addr_offsets =
295 krealloc(validated_shader->uniform_addr_offsets,
296 (o + 1) *
297 sizeof(*validated_shader->uniform_addr_offsets),
298 GFP_KERNEL);
299 if (!validated_shader->uniform_addr_offsets)
300 return false;
301
302 validated_shader->uniform_addr_offsets[o] = num_uniforms;
303 validated_shader->num_uniform_addr_offsets++;
304
305 return true;
306 }
307
/* Validates a write to QPU_W_UNIFORMS_ADDRESS, which resets where the
 * uniform stream is read from.  The write must be an unconditional,
 * unpacked ADD of a known immediate offset (pointing just past the
 * base-address uniform) to a uniform read.
 */
static bool
validate_uniform_address_write(struct vc4_validated_shader_info *validated_shader,
			       struct vc4_shader_validation_state *validation_state,
			       bool is_mul)
{
	uint64_t inst = validation_state->shader[validation_state->ip];
	u32 add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
	u32 raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
	u32 raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
	u32 add_lri = raddr_add_a_to_live_reg_index(inst);
	/* We want our reset to be pointing at whatever uniform follows the
	 * uniforms base address.
	 */
	u32 expected_offset = validated_shader->uniforms_size + 4;

	/* We only support absolute uniform address changes, and we
	 * require that they be in the current basic block before any
	 * of its uniform reads.
	 *
	 * One could potentially emit more efficient QPU code, by
	 * noticing that (say) an if statement does uniform control
	 * flow for all threads and that the if reads the same number
	 * of uniforms on each side. However, this scheme is easy to
	 * validate so it's all we allow for now.
	 */
	switch (QPU_GET_FIELD(inst, QPU_SIG)) {
	case QPU_SIG_NONE:
	case QPU_SIG_SCOREBOARD_UNLOCK:
	case QPU_SIG_COLOR_LOAD:
	case QPU_SIG_LOAD_TMU0:
	case QPU_SIG_LOAD_TMU1:
		break;
	default:
		DRM_DEBUG("uniforms address change must be "
			  "normal math\n");
		return false;
	}

	if (is_mul || QPU_GET_FIELD(inst, QPU_OP_ADD) != QPU_A_ADD) {
		DRM_DEBUG("Uniform address reset must be an ADD.\n");
		return false;
	}

	if (QPU_GET_FIELD(inst, QPU_COND_ADD) != QPU_COND_ALWAYS) {
		DRM_DEBUG("Uniform address reset must be unconditional.\n");
		return false;
	}

	if (QPU_GET_FIELD(inst, QPU_PACK) != QPU_PACK_A_NOP &&
	    !(inst & QPU_PM)) {
		DRM_DEBUG("No packing allowed on uniforms reset\n");
		return false;
	}

	/* add_lri is a u32, so -1 here converts to the ~0 "not a live
	 * register" value from raddr_add_a_to_live_reg_index().
	 */
	if (add_lri == -1) {
		DRM_DEBUG("First argument of uniform address write must be "
			  "an immediate value.\n");
		return false;
	}

	if (validation_state->live_immediates[add_lri] != expected_offset) {
		DRM_DEBUG("Resetting uniforms with offset %db instead of %db\n",
			  validation_state->live_immediates[add_lri],
			  expected_offset);
		return false;
	}

	if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) &&
	    !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF)) {
		DRM_DEBUG("Second argument of uniform address write must be "
			  "a uniform.\n");
		return false;
	}

	validation_state->needs_uniform_address_update = false;
	validation_state->needs_uniform_address_for_loop = false;
	return require_uniform_address_uniform(validated_shader);
}
386
/* Validates the register write performed by either the ADD or MUL op
 * of the current instruction, tracking live immediate values for the
 * destination and rejecting writes to unsafe hardware registers.
 */
static bool
check_reg_write(struct vc4_validated_shader_info *validated_shader,
		struct vc4_shader_validation_state *validation_state,
		bool is_mul)
{
	uint64_t inst = validation_state->shader[validation_state->ip];
	uint32_t waddr = (is_mul ?
			  QPU_GET_FIELD(inst, QPU_WADDR_MUL) :
			  QPU_GET_FIELD(inst, QPU_WADDR_ADD));
	uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
	bool ws = inst & QPU_WS;
	/* The WS bit swaps which regfile the ADD and MUL ops write to. */
	bool is_b = is_mul ^ ws;
	u32 lri = waddr_to_live_reg_index(waddr, is_b);

	if (lri != -1) {
		uint32_t cond_add = QPU_GET_FIELD(inst, QPU_COND_ADD);
		uint32_t cond_mul = QPU_GET_FIELD(inst, QPU_COND_MUL);

		/* Only an unconditional, unpacked LOAD_IMM leaves a
		 * known immediate in the register; anything else
		 * invalidates the tracking.
		 */
		if (sig == QPU_SIG_LOAD_IMM &&
		    QPU_GET_FIELD(inst, QPU_PACK) == QPU_PACK_A_NOP &&
		    ((is_mul && cond_mul == QPU_COND_ALWAYS) ||
		     (!is_mul && cond_add == QPU_COND_ALWAYS))) {
			validation_state->live_immediates[lri] =
				QPU_GET_FIELD(inst, QPU_LOAD_IMM);
		} else {
			validation_state->live_immediates[lri] = ~0;
		}

		if (live_reg_is_upper_half(lri))
			validation_state->all_registers_used = true;
	}

	switch (waddr) {
	case QPU_W_UNIFORMS_ADDRESS:
		if (is_b) {
			DRM_DEBUG("relative uniforms address change "
				  "unsupported\n");
			return false;
		}

		return validate_uniform_address_write(validated_shader,
						      validation_state,
						      is_mul);

	case QPU_W_TLB_COLOR_MS:
	case QPU_W_TLB_COLOR_ALL:
	case QPU_W_TLB_Z:
		/* These only interact with the tile buffer, not main memory,
		 * so they're safe.
		 */
		return true;

	case QPU_W_TMU0_S:
	case QPU_W_TMU0_T:
	case QPU_W_TMU0_R:
	case QPU_W_TMU0_B:
	case QPU_W_TMU1_S:
	case QPU_W_TMU1_T:
	case QPU_W_TMU1_R:
	case QPU_W_TMU1_B:
		return check_tmu_write(validated_shader, validation_state,
				       is_mul);

	case QPU_W_HOST_INT:
	case QPU_W_TMU_NOSWAP:
	case QPU_W_TLB_ALPHA_MASK:
	case QPU_W_MUTEX_RELEASE:
		/* XXX: I haven't thought about these, so don't support them
		 * for now.
		 */
		DRM_DEBUG("Unsupported waddr %d\n", waddr);
		return false;

	case QPU_W_VPM_ADDR:
		DRM_DEBUG("General VPM DMA unsupported\n");
		return false;

	case QPU_W_VPM:
	case QPU_W_VPMVCD_SETUP:
		/* We allow VPM setup in general, even including VPM DMA
		 * configuration setup, because the (unsafe) DMA can only be
		 * triggered by QPU_W_VPM_ADDR writes.
		 */
		return true;

	case QPU_W_TLB_STENCIL_SETUP:
		return true;
	}

	return true;
}
478
/* Tracks which registers hold values clamped to [0, some uniform's
 * offset], which direct-addressed TMU reads rely on for their bounds
 * checking (see check_tmu_write()).
 */
static void
track_live_clamps(struct vc4_validated_shader_info *validated_shader,
		  struct vc4_shader_validation_state *validation_state)
{
	uint64_t inst = validation_state->shader[validation_state->ip];
	uint32_t op_add = QPU_GET_FIELD(inst, QPU_OP_ADD);
	uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
	uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
	uint32_t cond_add = QPU_GET_FIELD(inst, QPU_COND_ADD);
	uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A);
	uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
	uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
	uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
	uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
	bool ws = inst & QPU_WS;
	uint32_t lri_add_a, lri_add, lri_mul;
	bool add_a_is_min_0;

	/* Check whether OP_ADD's A argument comes from a live MAX(x, 0),
	 * before we clear previous live state.
	 */
	lri_add_a = raddr_add_a_to_live_reg_index(inst);
	add_a_is_min_0 = (lri_add_a != ~0 &&
			  validation_state->live_max_clamp_regs[lri_add_a]);

	/* Clear live state for registers written by our instruction. */
	lri_add = waddr_to_live_reg_index(waddr_add, ws);
	lri_mul = waddr_to_live_reg_index(waddr_mul, !ws);
	if (lri_mul != ~0) {
		validation_state->live_max_clamp_regs[lri_mul] = false;
		validation_state->live_min_clamp_offsets[lri_mul] = ~0;
	}
	if (lri_add != ~0) {
		validation_state->live_max_clamp_regs[lri_add] = false;
		validation_state->live_min_clamp_offsets[lri_add] = ~0;
	} else {
		/* Nothing further to do for live tracking, since only ADDs
		 * generate new live clamp registers.
		 */
		return;
	}

	/* Now, handle remaining live clamp tracking for the ADD operation. */

	if (cond_add != QPU_COND_ALWAYS)
		return;

	if (op_add == QPU_A_MAX) {
		/* Track live clamps of a value to a minimum of 0 (in either
		 * arg).
		 */
		if (sig != QPU_SIG_SMALL_IMM || raddr_b != 0 ||
		    (add_a != QPU_MUX_B && add_b != QPU_MUX_B)) {
			return;
		}

		validation_state->live_max_clamp_regs[lri_add] = true;
	} else if (op_add == QPU_A_MIN) {
		/* Track live clamps of a value clamped to a minimum of 0 and
		 * a maximum of some uniform's offset.
		 */
		if (!add_a_is_min_0)
			return;

		if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) &&
		    !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF &&
		      sig != QPU_SIG_SMALL_IMM)) {
			return;
		}

		validation_state->live_min_clamp_offsets[lri_add] =
			validated_shader->uniforms_size;
	}
}
553
554 static bool
check_instruction_writes(struct vc4_validated_shader_info * validated_shader,struct vc4_shader_validation_state * validation_state)555 check_instruction_writes(struct vc4_validated_shader_info *validated_shader,
556 struct vc4_shader_validation_state *validation_state)
557 {
558 uint64_t inst = validation_state->shader[validation_state->ip];
559 uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
560 uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
561 bool ok;
562
563 if (is_tmu_write(waddr_add) && is_tmu_write(waddr_mul)) {
564 DRM_DEBUG("ADD and MUL both set up textures\n");
565 return false;
566 }
567
568 ok = (check_reg_write(validated_shader, validation_state, false) &&
569 check_reg_write(validated_shader, validation_state, true));
570
571 track_live_clamps(validated_shader, validation_state);
572
573 return ok;
574 }
575
576 static bool
check_branch(uint64_t inst,struct vc4_validated_shader_info * validated_shader,struct vc4_shader_validation_state * validation_state,int ip)577 check_branch(uint64_t inst,
578 struct vc4_validated_shader_info *validated_shader,
579 struct vc4_shader_validation_state *validation_state,
580 int ip)
581 {
582 int32_t branch_imm = QPU_GET_FIELD(inst, QPU_BRANCH_TARGET);
583 uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
584 uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
585
586 if ((int)branch_imm < 0)
587 validation_state->needs_uniform_address_for_loop = true;
588
589 /* We don't want to have to worry about validation of this, and
590 * there's no need for it.
591 */
592 if (waddr_add != QPU_W_NOP || waddr_mul != QPU_W_NOP) {
593 DRM_DEBUG("branch instruction at %d wrote a register.\n",
594 validation_state->ip);
595 return false;
596 }
597
598 return true;
599 }
600
601 static bool
check_instruction_reads(struct vc4_validated_shader_info * validated_shader,struct vc4_shader_validation_state * validation_state)602 check_instruction_reads(struct vc4_validated_shader_info *validated_shader,
603 struct vc4_shader_validation_state *validation_state)
604 {
605 uint64_t inst = validation_state->shader[validation_state->ip];
606 uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
607 uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
608 uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
609
610 if (raddr_a == QPU_R_UNIF ||
611 (raddr_b == QPU_R_UNIF && sig != QPU_SIG_SMALL_IMM)) {
612 /* This can't overflow the uint32_t, because we're reading 8
613 * bytes of instruction to increment by 4 here, so we'd
614 * already be OOM.
615 */
616 validated_shader->uniforms_size += 4;
617
618 if (validation_state->needs_uniform_address_update) {
619 DRM_DEBUG("Uniform read with undefined uniform "
620 "address\n");
621 return false;
622 }
623 }
624
625 if ((raddr_a >= 16 && raddr_a < 32) ||
626 (raddr_b >= 16 && raddr_b < 32 && sig != QPU_SIG_SMALL_IMM)) {
627 validation_state->all_registers_used = true;
628 }
629
630 return true;
631 }
632
633 /* Make sure that all branches are absolute and point within the shader, and
634 * note their targets for later.
635 */
static bool
vc4_validate_branches(struct vc4_shader_validation_state *validation_state)
{
	uint32_t max_branch_target = 0;
	int ip;
	/* Starting at -2 also rejects branches in the first two
	 * instructions (the "during delay slots" check below).
	 */
	int last_branch = -2;

	for (ip = 0; ip < validation_state->max_ip; ip++) {
		uint64_t inst = validation_state->shader[ip];
		int32_t branch_imm = QPU_GET_FIELD(inst, QPU_BRANCH_TARGET);
		uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
		/* Branches take effect after three delay-slot instructions. */
		uint32_t after_delay_ip = ip + 4;
		uint32_t branch_target_ip;

		if (sig == QPU_SIG_PROG_END) {
			/* There are two delay slots after program end is
			 * signaled that are still executed, then we're
			 * finished. validation_state->max_ip is the
			 * instruction after the last valid instruction in the
			 * program.
			 */
			validation_state->max_ip = ip + 3;
			continue;
		}

		if (sig != QPU_SIG_BRANCH)
			continue;

		if (ip - last_branch < 4) {
			DRM_DEBUG("Branch at %d during delay slots\n", ip);
			return false;
		}
		last_branch = ip;

		if (inst & QPU_BRANCH_REG) {
			DRM_DEBUG("branching from register relative "
				  "not supported\n");
			return false;
		}

		if (!(inst & QPU_BRANCH_REL)) {
			DRM_DEBUG("relative branching required\n");
			return false;
		}

		/* The actual branch target is the instruction after the delay
		 * slots, plus whatever byte offset is in the low 32 bits of
		 * the instruction. Make sure we're not branching beyond the
		 * end of the shader object.
		 */
		if (branch_imm % sizeof(inst) != 0) {
			DRM_DEBUG("branch target not aligned\n");
			return false;
		}

		/* branch_imm is a byte offset; instructions are 8 bytes. */
		branch_target_ip = after_delay_ip + (branch_imm >> 3);
		if (branch_target_ip >= validation_state->max_ip) {
			DRM_DEBUG("Branch at %d outside of shader (ip %d/%d)\n",
				  ip, branch_target_ip,
				  validation_state->max_ip);
			return false;
		}
		set_bit(branch_target_ip, validation_state->branch_targets);

		/* Make sure that the non-branching path is also not outside
		 * the shader.
		 */
		if (after_delay_ip >= validation_state->max_ip) {
			DRM_DEBUG("Branch at %d continues past shader end "
				  "(%d/%d)\n",
				  ip, after_delay_ip, validation_state->max_ip);
			return false;
		}
		set_bit(after_delay_ip, validation_state->branch_targets);
		max_branch_target = max(max_branch_target, after_delay_ip);
	}

	if (max_branch_target > validation_state->max_ip - 3) {
		DRM_DEBUG("Branch landed after QPU_SIG_PROG_END");
		return false;
	}

	return true;
}
720
721 /* Resets any known state for the shader, used when we may be branched to from
722 * multiple locations in the program (or at shader start).
723 */
724 static void
reset_validation_state(struct vc4_shader_validation_state * validation_state)725 reset_validation_state(struct vc4_shader_validation_state *validation_state)
726 {
727 int i;
728
729 for (i = 0; i < 8; i++)
730 validation_state->tmu_setup[i / 4].p_offset[i % 4] = ~0;
731
732 for (i = 0; i < LIVE_REG_COUNT; i++) {
733 validation_state->live_min_clamp_offsets[i] = ~0;
734 validation_state->live_max_clamp_regs[i] = false;
735 validation_state->live_immediates[i] = ~0;
736 }
737 }
738
739 static bool
texturing_in_progress(struct vc4_shader_validation_state * validation_state)740 texturing_in_progress(struct vc4_shader_validation_state *validation_state)
741 {
742 return (validation_state->tmu_write_count[0] != 0 ||
743 validation_state->tmu_write_count[1] != 0);
744 }
745
/* Checks and resets the per-basic-block validation state when the
 * current instruction may be entered from a branch.
 */
static bool
vc4_handle_branch_target(struct vc4_shader_validation_state *validation_state)
{
	uint32_t ip = validation_state->ip;

	if (!test_bit(ip, validation_state->branch_targets))
		return true;

	if (texturing_in_progress(validation_state)) {
		DRM_DEBUG("Branch target landed during TMU setup\n");
		return false;
	}

	/* Reset our live values tracking, since this instruction may have
	 * multiple predecessors.
	 *
	 * One could potentially do analysis to determine that, for
	 * example, all predecessors have a live max clamp in the same
	 * register, but we don't bother with that.
	 */
	reset_validation_state(validation_state);

	/* Since we've entered a basic block from potentially multiple
	 * predecessors, we need the uniforms address to be updated before any
	 * uniforms are read. We require that after any branch point, the next
	 * uniform to be loaded is a uniform address offset. That uniform's
	 * offset will be marked by the uniform address register write
	 * validation, or a one-off the end-of-program check.
	 */
	validation_state->needs_uniform_address_update = true;

	return true;
}
779
/*
 * vc4_validate_shader() - Validates the contents of a shader BO.
 * @shader_obj: BO holding the QPU instruction stream to validate.
 *
 * Walks every instruction, validating reads/writes and recording the
 * uniform/texture relocation information.  Returns a newly allocated
 * vc4_validated_shader_info (owned by the caller) on success, or NULL
 * on validation failure or allocation error.
 */
struct vc4_validated_shader_info *
vc4_validate_shader(struct drm_gem_dma_object *shader_obj)
{
	struct vc4_dev *vc4 = to_vc4_dev(shader_obj->base.dev);
	bool found_shader_end = false;
	int shader_end_ip = 0;
	/* -3 so early instructions aren't treated as being inside a
	 * thread-switch window (see the "+ 3" checks below).
	 */
	uint32_t last_thread_switch_ip = -3;
	uint32_t ip;
	struct vc4_validated_shader_info *validated_shader = NULL;
	struct vc4_shader_validation_state validation_state;

	/* Shader validation only applies to VC4-generation hardware. */
	if (WARN_ON_ONCE(vc4->gen > VC4_GEN_4))
		return NULL;

	memset(&validation_state, 0, sizeof(validation_state));
	validation_state.shader = shader_obj->vaddr;
	validation_state.max_ip = shader_obj->base.size / sizeof(uint64_t);

	reset_validation_state(&validation_state);

	validation_state.branch_targets =
		kcalloc(BITS_TO_LONGS(validation_state.max_ip),
			sizeof(unsigned long), GFP_KERNEL);
	if (!validation_state.branch_targets)
		goto fail;

	validated_shader = kzalloc_objs(*validated_shader, 1);
	if (!validated_shader)
		goto fail;

	if (!vc4_validate_branches(&validation_state))
		goto fail;

	for (ip = 0; ip < validation_state.max_ip; ip++) {
		uint64_t inst = validation_state.shader[ip];
		uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

		validation_state.ip = ip;

		if (!vc4_handle_branch_target(&validation_state))
			goto fail;

		if (ip == last_thread_switch_ip + 3) {
			/* Reset r0-r3 live clamp data */
			int i;

			/* Indices 64+ are the accumulators (see
			 * waddr_to_live_reg_index()).
			 */
			for (i = 64; i < LIVE_REG_COUNT; i++) {
				validation_state.live_min_clamp_offsets[i] = ~0;
				validation_state.live_max_clamp_regs[i] = false;
				validation_state.live_immediates[i] = ~0;
			}
		}

		switch (sig) {
		case QPU_SIG_NONE:
		case QPU_SIG_WAIT_FOR_SCOREBOARD:
		case QPU_SIG_SCOREBOARD_UNLOCK:
		case QPU_SIG_COLOR_LOAD:
		case QPU_SIG_LOAD_TMU0:
		case QPU_SIG_LOAD_TMU1:
		case QPU_SIG_PROG_END:
		case QPU_SIG_SMALL_IMM:
		case QPU_SIG_THREAD_SWITCH:
		case QPU_SIG_LAST_THREAD_SWITCH:
			if (!check_instruction_writes(validated_shader,
						      &validation_state)) {
				DRM_DEBUG("Bad write at ip %d\n", ip);
				goto fail;
			}

			if (!check_instruction_reads(validated_shader,
						     &validation_state))
				goto fail;

			if (sig == QPU_SIG_PROG_END) {
				found_shader_end = true;
				shader_end_ip = ip;
			}

			if (sig == QPU_SIG_THREAD_SWITCH ||
			    sig == QPU_SIG_LAST_THREAD_SWITCH) {
				validated_shader->is_threaded = true;

				if (ip < last_thread_switch_ip + 3) {
					DRM_DEBUG("Thread switch too soon after "
						  "last switch at ip %d\n", ip);
					goto fail;
				}
				last_thread_switch_ip = ip;
			}

			break;

		case QPU_SIG_LOAD_IMM:
			if (!check_instruction_writes(validated_shader,
						      &validation_state)) {
				DRM_DEBUG("Bad LOAD_IMM write at ip %d\n", ip);
				goto fail;
			}
			break;

		case QPU_SIG_BRANCH:
			if (!check_branch(inst, validated_shader,
					  &validation_state, ip))
				goto fail;

			if (ip < last_thread_switch_ip + 3) {
				DRM_DEBUG("Branch in thread switch at ip %d",
					  ip);
				goto fail;
			}

			break;
		default:
			DRM_DEBUG("Unsupported QPU signal %d at "
				  "instruction %d\n", sig, ip);
			goto fail;
		}

		/* There are two delay slots after program end is signaled
		 * that are still executed, then we're finished.
		 */
		if (found_shader_end && ip == shader_end_ip + 2)
			break;
	}

	if (ip == validation_state.max_ip) {
		DRM_DEBUG("shader failed to terminate before "
			  "shader BO end at %zd\n",
			  shader_obj->base.size);
		goto fail;
	}

	/* Might corrupt other thread */
	if (validated_shader->is_threaded &&
	    validation_state.all_registers_used) {
		DRM_DEBUG("Shader uses threading, but uses the upper "
			  "half of the registers, too\n");
		goto fail;
	}

	/* If we did a backwards branch and we haven't emitted a uniforms
	 * reset since then, we still need the uniforms stream to have the
	 * uniforms address available so that the backwards branch can do its
	 * uniforms reset.
	 *
	 * We could potentially prove that the backwards branch doesn't
	 * contain any uses of uniforms until program exit, but that doesn't
	 * seem to be worth the trouble.
	 */
	if (validation_state.needs_uniform_address_for_loop) {
		if (!require_uniform_address_uniform(validated_shader))
			goto fail;
		validated_shader->uniforms_size += 4;
	}

	/* Again, no chance of integer overflow here because the worst case
	 * scenario is 8 bytes of uniforms plus handles per 8-byte
	 * instruction.
	 */
	validated_shader->uniforms_src_size =
		(validated_shader->uniforms_size +
		 4 * validated_shader->num_texture_samples);

	kfree(validation_state.branch_targets);

	return validated_shader;

fail:
	kfree(validation_state.branch_targets);
	if (validated_shader) {
		kfree(validated_shader->uniform_addr_offsets);
		kfree(validated_shader->texture_samples);
		kfree(validated_shader);
	}
	return NULL;
}
957