/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * DOC: Shader validator for VC4.
 *
 * Since the VC4 has no IOMMU between it and system memory, a user
 * with access to execute shaders could escalate privilege by
 * overwriting system memory (using the VPM write address register in
 * the general-purpose DMA mode) or reading system memory it shouldn't
 * (reading it as a texture, uniform data, or direct-addressed TMU
 * lookup).
 *
 * The shader validator walks over a shader's BO, ensuring that its
 * accesses are appropriately bounded, and recording where texture
 * accesses are made so that we can do relocations for them in the
 * uniform stream.
 *
 * Shader BOs are immutable for their lifetimes (enforced by not
 * allowing mmaps, GEM prime export, or rendering to them from a CL),
 * so this validation is only performed at BO creation time.
 */

#include <drm/drm_print.h>

#include "vc4_drv.h"
#include "vc4_qpu_defines.h"

#define LIVE_REG_COUNT (32 + 32 + 4)

struct vc4_shader_validation_state {
	/* Current IP being validated. */
	uint32_t ip;

	/* IP at the end of the BO, do not read shader[max_ip] */
	uint32_t max_ip;

	uint64_t *shader;

	struct vc4_texture_sample_info tmu_setup[2];
	int tmu_write_count[2];

	/* For registers that were last written to by a MIN instruction with
	 * one argument being a uniform, the address of the uniform.
	 * Otherwise, ~0.
	 *
	 * This is used for the validation of direct address memory reads.
	 */
	uint32_t live_min_clamp_offsets[LIVE_REG_COUNT];
	bool live_max_clamp_regs[LIVE_REG_COUNT];
	uint32_t live_immediates[LIVE_REG_COUNT];

	/* Bitfield of which IPs are used as branch targets.
	 *
	 * Used for validation that the uniform stream is updated at the right
	 * points and clearing the texturing/clamping state.
	 */
	unsigned long *branch_targets;

	/* Set when entering a basic block, and cleared when the uniform
	 * address update is found.  This is used to make sure that we don't
	 * read uniforms when the address is undefined.
	 */
	bool needs_uniform_address_update;

	/* Set when we find a backwards branch.  If the branch is backwards,
	 * the target is probably doing an address reset to read uniforms,
	 * and so we need to be sure that a uniforms address is present in the
	 * stream, even if the shader didn't need to read uniforms in later
	 * basic blocks.
	 */
	bool needs_uniform_address_for_loop;

	/* Set when we find an instruction writing the top half of the
	 * register files.  If we allowed writing the unusable regs in
	 * a threaded shader, then the other shader running on our
	 * QPU's clamp validation would be invalid.
	 */
	bool all_registers_used;
};

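/* Maps a write address to an index into the live register tracking arrays
 * (regfile A is 0-31, regfile B is 32-63, accumulators r0-r3 are 64-67).
 * Returns ~0 for write addresses we don't track.
 */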
static uint32_t
waddr_to_live_reg_index(uint32_t waddr, bool is_b)
{
	if (waddr < 32) {
		if (is_b)
			return 32 + waddr;
		else
			return waddr;
	} else if (waddr <= QPU_W_ACC3) {
		return 64 + waddr - QPU_W_ACC0;
	} else {
		return ~0;
	}
}

static uint32_t
raddr_add_a_to_live_reg_index(uint64_t inst)
{
	uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
	uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A);
	uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
	uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);

	if (add_a == QPU_MUX_A)
		return raddr_a;
	else if (add_a == QPU_MUX_B && sig != QPU_SIG_SMALL_IMM)
		return 32 + raddr_b;
	else if (add_a <= QPU_MUX_R3)
		return 64 + add_a;
	else
		return ~0;
}

static bool
live_reg_is_upper_half(uint32_t lri)
{
	return (lri >= 16 && lri < 32) ||
	       (lri >= 32 + 16 && lri < 32 + 32);
}

static bool
is_tmu_submit(uint32_t waddr)
{
	return (waddr == QPU_W_TMU0_S ||
		waddr == QPU_W_TMU1_S);
}

static bool
is_tmu_write(uint32_t waddr)
{
	return (waddr >= QPU_W_TMU0_S &&
		waddr <= QPU_W_TMU1_B);
}

static bool
record_texture_sample(struct vc4_validated_shader_info *validated_shader,
		      struct vc4_shader_validation_state *validation_state,
		      int tmu)
{
	uint32_t s = validated_shader->num_texture_samples;
	int i;
	struct vc4_texture_sample_info *temp_samples;

	temp_samples = krealloc(validated_shader->texture_samples,
				(s + 1) * sizeof(*temp_samples),
				GFP_KERNEL);
	if (!temp_samples)
		return false;

	memcpy(&temp_samples[s],
	       &validation_state->tmu_setup[tmu],
	       sizeof(*temp_samples));

	validated_shader->num_texture_samples = s + 1;
	validated_shader->texture_samples = temp_samples;

	for (i = 0; i < 4; i++)
		validation_state->tmu_setup[tmu].p_offset[i] = ~0;

	return true;
}

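/* Validates a write to one of the TMU setup registers (s, t, r, b) of TMU0
 * or TMU1.  Records where in the uniform stream each parameter's relocation
 * lands, enforces the clamp requirement for direct-addressed lookups, and
 * records the completed sample once the submit (write to the "s" register)
 * is seen.
 */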
static bool
check_tmu_write(struct vc4_validated_shader_info *validated_shader,
		struct vc4_shader_validation_state *validation_state,
		bool is_mul)
{
	uint64_t inst = validation_state->shader[validation_state->ip];
	uint32_t waddr = (is_mul ?
			  QPU_GET_FIELD(inst, QPU_WADDR_MUL) :
			  QPU_GET_FIELD(inst, QPU_WADDR_ADD));
	uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
	uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
	int tmu = waddr > QPU_W_TMU0_B;
	bool submit = is_tmu_submit(waddr);
	bool is_direct = submit && validation_state->tmu_write_count[tmu] == 0;
	uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

	if (is_direct) {
		uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
		uint32_t clamp_reg, clamp_offset;

		if (sig == QPU_SIG_SMALL_IMM) {
			DRM_DEBUG("direct TMU read used small immediate\n");
			return false;
		}

		/* Make sure that this texture load is an add of the base
		 * address of the UBO to a clamped offset within the UBO.
		 */
		if (is_mul ||
		    QPU_GET_FIELD(inst, QPU_OP_ADD) != QPU_A_ADD) {
			DRM_DEBUG("direct TMU load wasn't an add\n");
			return false;
		}

		/* We assert that the clamped address is the first
		 * argument, and the UBO base address is the second argument.
		 * This is arbitrary, but simpler than supporting flipping the
		 * two either way.
		 */
		clamp_reg = raddr_add_a_to_live_reg_index(inst);
		if (clamp_reg == ~0) {
			DRM_DEBUG("direct TMU load wasn't clamped\n");
			return false;
		}

		clamp_offset = validation_state->live_min_clamp_offsets[clamp_reg];
		if (clamp_offset == ~0) {
			DRM_DEBUG("direct TMU load wasn't clamped\n");
			return false;
		}

		/* Store the clamp value's offset in p1 (see reloc_tex() in
		 * vc4_validate.c).
		 */
		validation_state->tmu_setup[tmu].p_offset[1] =
			clamp_offset;

		if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) &&
		    !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF)) {
			DRM_DEBUG("direct TMU load didn't add to a uniform\n");
			return false;
		}

		validation_state->tmu_setup[tmu].is_direct = true;
	} else {
		if (raddr_a == QPU_R_UNIF || (sig != QPU_SIG_SMALL_IMM &&
					      raddr_b == QPU_R_UNIF)) {
			DRM_DEBUG("uniform read in the same instruction as "
				  "texture setup.\n");
			return false;
		}
	}

	if (validation_state->tmu_write_count[tmu] >= 4) {
		DRM_DEBUG("TMU%d got too many parameters before dispatch\n",
			  tmu);
		return false;
	}
	validation_state->tmu_setup[tmu].p_offset[validation_state->tmu_write_count[tmu]] =
		validated_shader->uniforms_size;
	validation_state->tmu_write_count[tmu]++;
	/* Since direct uses a RADDR uniform reference, it will get counted in
	 * check_instruction_reads()
	 */
	if (!is_direct) {
		if (validation_state->needs_uniform_address_update) {
			DRM_DEBUG("Texturing with undefined uniform address\n");
			return false;
		}

		validated_shader->uniforms_size += 4;
	}

	if (submit) {
		if (!record_texture_sample(validated_shader,
					   validation_state, tmu)) {
			return false;
		}

		validation_state->tmu_write_count[tmu] = 0;
	}

	return true;
}

static bool require_uniform_address_uniform(struct vc4_validated_shader_info *validated_shader)
{
	uint32_t o = validated_shader->num_uniform_addr_offsets;
	uint32_t num_uniforms = validated_shader->uniforms_size / 4;

	validated_shader->uniform_addr_offsets =
		krealloc(validated_shader->uniform_addr_offsets,
			 (o + 1) *
			 sizeof(*validated_shader->uniform_addr_offsets),
			 GFP_KERNEL);
	if (!validated_shader->uniform_addr_offsets)
		return false;

	validated_shader->uniform_addr_offsets[o] = num_uniforms;
	validated_shader->num_uniform_addr_offsets++;

	return true;
}

static bool
validate_uniform_address_write(struct vc4_validated_shader_info *validated_shader,
			       struct vc4_shader_validation_state *validation_state,
			       bool is_mul)
{
	uint64_t inst = validation_state->shader[validation_state->ip];
	u32 add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
	u32 raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
	u32 raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
	u32 add_lri = raddr_add_a_to_live_reg_index(inst);
	/* We want our reset to be pointing at whatever uniform follows the
	 * uniforms base address.
	 */
	u32 expected_offset = validated_shader->uniforms_size + 4;

	/* We only support absolute uniform address changes, and we
	 * require that they be in the current basic block before any
	 * of its uniform reads.
	 *
	 * One could potentially emit more efficient QPU code, by
	 * noticing that (say) an if statement does uniform control
	 * flow for all threads and that the if reads the same number
	 * of uniforms on each side.  However, this scheme is easy to
	 * validate so it's all we allow for now.
	 */
	switch (QPU_GET_FIELD(inst, QPU_SIG)) {
	case QPU_SIG_NONE:
	case QPU_SIG_SCOREBOARD_UNLOCK:
	case QPU_SIG_COLOR_LOAD:
	case QPU_SIG_LOAD_TMU0:
	case QPU_SIG_LOAD_TMU1:
		break;
	default:
		DRM_DEBUG("uniforms address change must be "
			  "normal math\n");
		return false;
	}

	if (is_mul || QPU_GET_FIELD(inst, QPU_OP_ADD) != QPU_A_ADD) {
		DRM_DEBUG("Uniform address reset must be an ADD.\n");
		return false;
	}

	if (QPU_GET_FIELD(inst, QPU_COND_ADD) != QPU_COND_ALWAYS) {
		DRM_DEBUG("Uniform address reset must be unconditional.\n");
		return false;
	}

	if (QPU_GET_FIELD(inst, QPU_PACK) != QPU_PACK_A_NOP &&
	    !(inst & QPU_PM)) {
		DRM_DEBUG("No packing allowed on uniforms reset\n");
		return false;
	}

	if (add_lri == -1) {
		DRM_DEBUG("First argument of uniform address write must be "
			  "an immediate value.\n");
		return false;
	}

	if (validation_state->live_immediates[add_lri] != expected_offset) {
		DRM_DEBUG("Resetting uniforms with offset %db instead of %db\n",
			  validation_state->live_immediates[add_lri],
			  expected_offset);
		return false;
	}

	if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) &&
	    !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF)) {
		DRM_DEBUG("Second argument of uniform address write must be "
			  "a uniform.\n");
		return false;
	}

	validation_state->needs_uniform_address_update = false;
	validation_state->needs_uniform_address_for_loop = false;
	return require_uniform_address_uniform(validated_shader);
}

static bool
check_reg_write(struct vc4_validated_shader_info *validated_shader,
		struct vc4_shader_validation_state *validation_state,
		bool is_mul)
{
	uint64_t inst = validation_state->shader[validation_state->ip];
	uint32_t waddr = (is_mul ?
			  QPU_GET_FIELD(inst, QPU_WADDR_MUL) :
			  QPU_GET_FIELD(inst, QPU_WADDR_ADD));
	uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
	bool ws = inst & QPU_WS;
	bool is_b = is_mul ^ ws;
	u32 lri = waddr_to_live_reg_index(waddr, is_b);

	if (lri != -1) {
		uint32_t cond_add = QPU_GET_FIELD(inst, QPU_COND_ADD);
		uint32_t cond_mul = QPU_GET_FIELD(inst, QPU_COND_MUL);

		if (sig == QPU_SIG_LOAD_IMM &&
		    QPU_GET_FIELD(inst, QPU_PACK) == QPU_PACK_A_NOP &&
		    ((is_mul && cond_mul == QPU_COND_ALWAYS) ||
		     (!is_mul && cond_add == QPU_COND_ALWAYS))) {
			validation_state->live_immediates[lri] =
				QPU_GET_FIELD(inst, QPU_LOAD_IMM);
		} else {
			validation_state->live_immediates[lri] = ~0;
		}

		if (live_reg_is_upper_half(lri))
			validation_state->all_registers_used = true;
	}

	switch (waddr) {
	case QPU_W_UNIFORMS_ADDRESS:
		if (is_b) {
			DRM_DEBUG("relative uniforms address change "
				  "unsupported\n");
			return false;
		}

		return validate_uniform_address_write(validated_shader,
						      validation_state,
						      is_mul);

	case QPU_W_TLB_COLOR_MS:
	case QPU_W_TLB_COLOR_ALL:
	case QPU_W_TLB_Z:
		/* These only interact with the tile buffer, not main memory,
		 * so they're safe.
		 */
		return true;

	case QPU_W_TMU0_S:
	case QPU_W_TMU0_T:
	case QPU_W_TMU0_R:
	case QPU_W_TMU0_B:
	case QPU_W_TMU1_S:
	case QPU_W_TMU1_T:
	case QPU_W_TMU1_R:
	case QPU_W_TMU1_B:
		return check_tmu_write(validated_shader, validation_state,
				       is_mul);

	case QPU_W_HOST_INT:
	case QPU_W_TMU_NOSWAP:
	case QPU_W_TLB_ALPHA_MASK:
	case QPU_W_MUTEX_RELEASE:
		/* XXX: I haven't thought about these, so don't support them
		 * for now.
		 */
		DRM_DEBUG("Unsupported waddr %d\n", waddr);
		return false;

	case QPU_W_VPM_ADDR:
		DRM_DEBUG("General VPM DMA unsupported\n");
		return false;

	case QPU_W_VPM:
	case QPU_W_VPMVCD_SETUP:
		/* We allow VPM setup in general, even including VPM DMA
		 * configuration setup, because the (unsafe) DMA can only be
		 * triggered by QPU_W_VPM_ADDR writes.
		 */
		return true;

	case QPU_W_TLB_STENCIL_SETUP:
		return true;
	}

	return true;
}

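/* Tracks which live registers hold a value clamped to a minimum of 0 (via
 * MAX against small immediate 0) and which hold a value further clamped to
 * some uniform's offset (via MIN), so that check_tmu_write() can validate
 * direct-addressed TMU lookups.
 */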
static void
track_live_clamps(struct vc4_validated_shader_info *validated_shader,
		  struct vc4_shader_validation_state *validation_state)
{
	uint64_t inst = validation_state->shader[validation_state->ip];
	uint32_t op_add = QPU_GET_FIELD(inst, QPU_OP_ADD);
	uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
	uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
	uint32_t cond_add = QPU_GET_FIELD(inst, QPU_COND_ADD);
	uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A);
	uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
	uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
	uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
	uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
	bool ws = inst & QPU_WS;
	uint32_t lri_add_a, lri_add, lri_mul;
	bool add_a_is_min_0;

	/* Check whether OP_ADD's A argument comes from a live MAX(x, 0),
	 * before we clear previous live state.
	 */
	lri_add_a = raddr_add_a_to_live_reg_index(inst);
	add_a_is_min_0 = (lri_add_a != ~0 &&
			  validation_state->live_max_clamp_regs[lri_add_a]);

	/* Clear live state for registers written by our instruction. */
	lri_add = waddr_to_live_reg_index(waddr_add, ws);
	lri_mul = waddr_to_live_reg_index(waddr_mul, !ws);
	if (lri_mul != ~0) {
		validation_state->live_max_clamp_regs[lri_mul] = false;
		validation_state->live_min_clamp_offsets[lri_mul] = ~0;
	}
	if (lri_add != ~0) {
		validation_state->live_max_clamp_regs[lri_add] = false;
		validation_state->live_min_clamp_offsets[lri_add] = ~0;
	} else {
		/* Nothing further to do for live tracking, since only ADDs
		 * generate new live clamp registers.
		 */
		return;
	}

	/* Now, handle remaining live clamp tracking for the ADD operation. */

	if (cond_add != QPU_COND_ALWAYS)
		return;

	if (op_add == QPU_A_MAX) {
		/* Track live clamps of a value to a minimum of 0 (in either
		 * arg).
		 */
		if (sig != QPU_SIG_SMALL_IMM || raddr_b != 0 ||
		    (add_a != QPU_MUX_B && add_b != QPU_MUX_B)) {
			return;
		}

		validation_state->live_max_clamp_regs[lri_add] = true;
	} else if (op_add == QPU_A_MIN) {
		/* Track live clamps of a value clamped to a minimum of 0 and
		 * a maximum of some uniform's offset.
		 */
		if (!add_a_is_min_0)
			return;

		if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) &&
		    !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF &&
		      sig != QPU_SIG_SMALL_IMM)) {
			return;
		}

		validation_state->live_min_clamp_offsets[lri_add] =
			validated_shader->uniforms_size;
	}
}

static bool
check_instruction_writes(struct vc4_validated_shader_info *validated_shader,
			 struct vc4_shader_validation_state *validation_state)
{
	uint64_t inst = validation_state->shader[validation_state->ip];
	uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
	uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
	bool ok;

	if (is_tmu_write(waddr_add) && is_tmu_write(waddr_mul)) {
		DRM_DEBUG("ADD and MUL both set up textures\n");
		return false;
	}

	ok = (check_reg_write(validated_shader, validation_state, false) &&
	      check_reg_write(validated_shader, validation_state, true));

	track_live_clamps(validated_shader, validation_state);

	return ok;
}

static bool
check_branch(uint64_t inst,
	     struct vc4_validated_shader_info *validated_shader,
	     struct vc4_shader_validation_state *validation_state,
	     int ip)
{
	int32_t branch_imm = QPU_GET_FIELD(inst, QPU_BRANCH_TARGET);
	uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
	uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);

	if ((int)branch_imm < 0)
		validation_state->needs_uniform_address_for_loop = true;

	/* We don't want to have to worry about validation of this, and
	 * there's no need for it.
	 */
	if (waddr_add != QPU_W_NOP || waddr_mul != QPU_W_NOP) {
		DRM_DEBUG("branch instruction at %d wrote a register.\n",
			  validation_state->ip);
		return false;
	}

	return true;
}

static bool
check_instruction_reads(struct vc4_validated_shader_info *validated_shader,
			struct vc4_shader_validation_state *validation_state)
{
	uint64_t inst = validation_state->shader[validation_state->ip];
	uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
	uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
	uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

	if (raddr_a == QPU_R_UNIF ||
	    (raddr_b == QPU_R_UNIF && sig != QPU_SIG_SMALL_IMM)) {
		/* This can't overflow the uint32_t, because we're reading 8
		 * bytes of instruction to increment by 4 here, so we'd
		 * already be OOM.
		 */
		validated_shader->uniforms_size += 4;

		if (validation_state->needs_uniform_address_update) {
			DRM_DEBUG("Uniform read with undefined uniform "
				  "address\n");
			return false;
		}
	}

	if ((raddr_a >= 16 && raddr_a < 32) ||
	    (raddr_b >= 16 && raddr_b < 32 && sig != QPU_SIG_SMALL_IMM)) {
		validation_state->all_registers_used = true;
	}

	return true;
}

/* Make sure that all branches are absolute and point within the shader, and
 * note their targets for later.
 */
static bool
vc4_validate_branches(struct vc4_shader_validation_state *validation_state)
{
	uint32_t max_branch_target = 0;
	int ip;
	int last_branch = -2;

	for (ip = 0; ip < validation_state->max_ip; ip++) {
		uint64_t inst = validation_state->shader[ip];
		int32_t branch_imm = QPU_GET_FIELD(inst, QPU_BRANCH_TARGET);
		uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
		uint32_t after_delay_ip = ip + 4;
		uint32_t branch_target_ip;

		if (sig == QPU_SIG_PROG_END) {
			/* There are two delay slots after program end is
			 * signaled that are still executed, then we're
			 * finished.  validation_state->max_ip is the
			 * instruction after the last valid instruction in the
			 * program.
			 */
			validation_state->max_ip = ip + 3;
			continue;
		}

		if (sig != QPU_SIG_BRANCH)
			continue;

		if (ip - last_branch < 4) {
			DRM_DEBUG("Branch at %d during delay slots\n", ip);
			return false;
		}
		last_branch = ip;

		if (inst & QPU_BRANCH_REG) {
			DRM_DEBUG("branching from register relative "
				  "not supported\n");
			return false;
		}

		if (!(inst & QPU_BRANCH_REL)) {
			DRM_DEBUG("relative branching required\n");
			return false;
		}

		/* The actual branch target is the instruction after the delay
		 * slots, plus whatever byte offset is in the low 32 bits of
		 * the instruction.  Make sure we're not branching beyond the
		 * end of the shader object.
		 */
		if (branch_imm % sizeof(inst) != 0) {
			DRM_DEBUG("branch target not aligned\n");
			return false;
		}

		branch_target_ip = after_delay_ip + (branch_imm >> 3);
		if (branch_target_ip >= validation_state->max_ip) {
			DRM_DEBUG("Branch at %d outside of shader (ip %d/%d)\n",
				  ip, branch_target_ip,
				  validation_state->max_ip);
			return false;
		}
		set_bit(branch_target_ip, validation_state->branch_targets);

		/* Make sure that the non-branching path is also not outside
		 * the shader.
		 */
		if (after_delay_ip >= validation_state->max_ip) {
			DRM_DEBUG("Branch at %d continues past shader end "
				  "(%d/%d)\n",
				  ip, after_delay_ip, validation_state->max_ip);
			return false;
		}
		set_bit(after_delay_ip, validation_state->branch_targets);
		max_branch_target = max(max_branch_target, after_delay_ip);
	}

	if (max_branch_target > validation_state->max_ip - 3) {
		DRM_DEBUG("Branch landed after QPU_SIG_PROG_END");
		return false;
	}

	return true;
}

/* Resets any known state for the shader, used when we may be branched to from
 * multiple locations in the program (or at shader start).
 */
static void
reset_validation_state(struct vc4_shader_validation_state *validation_state)
{
	int i;

	for (i = 0; i < 8; i++)
		validation_state->tmu_setup[i / 4].p_offset[i % 4] = ~0;

	for (i = 0; i < LIVE_REG_COUNT; i++) {
		validation_state->live_min_clamp_offsets[i] = ~0;
		validation_state->live_max_clamp_regs[i] = false;
		validation_state->live_immediates[i] = ~0;
	}
}

static bool
texturing_in_progress(struct vc4_shader_validation_state *validation_state)
{
	return (validation_state->tmu_write_count[0] != 0 ||
		validation_state->tmu_write_count[1] != 0);
}

static bool
vc4_handle_branch_target(struct vc4_shader_validation_state *validation_state)
{
	uint32_t ip = validation_state->ip;

	if (!test_bit(ip, validation_state->branch_targets))
		return true;

	if (texturing_in_progress(validation_state)) {
		DRM_DEBUG("Branch target landed during TMU setup\n");
		return false;
	}

	/* Reset our live values tracking, since this instruction may have
	 * multiple predecessors.
	 *
	 * One could potentially do analysis to determine that, for
	 * example, all predecessors have a live max clamp in the same
	 * register, but we don't bother with that.
	 */
	reset_validation_state(validation_state);

	/* Since we've entered a basic block from potentially multiple
	 * predecessors, we need the uniforms address to be updated before any
	 * uniforms are read.  We require that after any branch point, the
	 * next uniform to be loaded is a uniform address offset.  That
	 * uniform's offset will be marked by the uniform address register
	 * write validation, or a one-off check at the end of the program.
	 */
	validation_state->needs_uniform_address_update = true;

	return true;
}

struct vc4_validated_shader_info *
vc4_validate_shader(struct drm_gem_dma_object *shader_obj)
{
	struct vc4_dev *vc4 = to_vc4_dev(shader_obj->base.dev);
	bool found_shader_end = false;
	int shader_end_ip = 0;
	uint32_t last_thread_switch_ip = -3;
	uint32_t ip;
	struct vc4_validated_shader_info *validated_shader = NULL;
	struct vc4_shader_validation_state validation_state;

	if (WARN_ON_ONCE(vc4->gen > VC4_GEN_4))
		return NULL;

	memset(&validation_state, 0, sizeof(validation_state));
	validation_state.shader = shader_obj->vaddr;
	validation_state.max_ip = shader_obj->base.size / sizeof(uint64_t);

	reset_validation_state(&validation_state);

	validation_state.branch_targets =
		kcalloc(BITS_TO_LONGS(validation_state.max_ip),
			sizeof(unsigned long), GFP_KERNEL);
	if (!validation_state.branch_targets)
		goto fail;

	validated_shader = kcalloc(1, sizeof(*validated_shader), GFP_KERNEL);
	if (!validated_shader)
		goto fail;

	if (!vc4_validate_branches(&validation_state))
		goto fail;

	for (ip = 0; ip < validation_state.max_ip; ip++) {
		uint64_t inst = validation_state.shader[ip];
		uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

		validation_state.ip = ip;

		if (!vc4_handle_branch_target(&validation_state))
			goto fail;

		if (ip == last_thread_switch_ip + 3) {
			/* Reset r0-r3 live clamp data */
			int i;

			for (i = 64; i < LIVE_REG_COUNT; i++) {
				validation_state.live_min_clamp_offsets[i] = ~0;
				validation_state.live_max_clamp_regs[i] = false;
				validation_state.live_immediates[i] = ~0;
			}
		}

		switch (sig) {
		case QPU_SIG_NONE:
		case QPU_SIG_WAIT_FOR_SCOREBOARD:
		case QPU_SIG_SCOREBOARD_UNLOCK:
		case QPU_SIG_COLOR_LOAD:
		case QPU_SIG_LOAD_TMU0:
		case QPU_SIG_LOAD_TMU1:
		case QPU_SIG_PROG_END:
		case QPU_SIG_SMALL_IMM:
		case QPU_SIG_THREAD_SWITCH:
		case QPU_SIG_LAST_THREAD_SWITCH:
			if (!check_instruction_writes(validated_shader,
						      &validation_state)) {
				DRM_DEBUG("Bad write at ip %d\n", ip);
				goto fail;
			}

			if (!check_instruction_reads(validated_shader,
						     &validation_state))
				goto fail;

			if (sig == QPU_SIG_PROG_END) {
				found_shader_end = true;
				shader_end_ip = ip;
			}

			if (sig == QPU_SIG_THREAD_SWITCH ||
			    sig == QPU_SIG_LAST_THREAD_SWITCH) {
				validated_shader->is_threaded = true;

				if (ip < last_thread_switch_ip + 3) {
					DRM_DEBUG("Thread switch too soon after "
						  "last switch at ip %d\n", ip);
					goto fail;
				}
				last_thread_switch_ip = ip;
			}

			break;

		case QPU_SIG_LOAD_IMM:
			if (!check_instruction_writes(validated_shader,
						      &validation_state)) {
				DRM_DEBUG("Bad LOAD_IMM write at ip %d\n", ip);
				goto fail;
			}
			break;

		case QPU_SIG_BRANCH:
			if (!check_branch(inst, validated_shader,
					  &validation_state, ip))
				goto fail;

			if (ip < last_thread_switch_ip + 3) {
				DRM_DEBUG("Branch in thread switch at ip %d",
					  ip);
				goto fail;
			}

			break;
		default:
			DRM_DEBUG("Unsupported QPU signal %d at "
				  "instruction %d\n", sig, ip);
			goto fail;
		}

		/* There are two delay slots after program end is signaled
		 * that are still executed, then we're finished.
		 */
		if (found_shader_end && ip == shader_end_ip + 2)
			break;
	}

	if (ip == validation_state.max_ip) {
		DRM_DEBUG("shader failed to terminate before "
			  "shader BO end at %zd\n",
			  shader_obj->base.size);
		goto fail;
	}

	/* Might corrupt other thread */
	if (validated_shader->is_threaded &&
	    validation_state.all_registers_used) {
		DRM_DEBUG("Shader uses threading, but uses the upper "
			  "half of the registers, too\n");
		goto fail;
	}

	/* If we did a backwards branch and we haven't emitted a uniforms
	 * reset since then, we still need the uniforms stream to have the
	 * uniforms address available so that the backwards branch can do its
	 * uniforms reset.
	 *
	 * We could potentially prove that the backwards branch doesn't
	 * contain any uses of uniforms until program exit, but that doesn't
	 * seem to be worth the trouble.
	 */
	if (validation_state.needs_uniform_address_for_loop) {
		if (!require_uniform_address_uniform(validated_shader))
			goto fail;
		validated_shader->uniforms_size += 4;
	}

	/* Again, no chance of integer overflow here because the worst case
	 * scenario is 8 bytes of uniforms plus handles per 8-byte
	 * instruction.
	 */
	validated_shader->uniforms_src_size =
		(validated_shader->uniforms_size +
		 4 * validated_shader->num_texture_samples);

	kfree(validation_state.branch_targets);

	return validated_shader;

fail:
	kfree(validation_state.branch_targets);
	if (validated_shader) {
		kfree(validated_shader->uniform_addr_offsets);
		kfree(validated_shader->texture_samples);
		kfree(validated_shader);
	}
	return NULL;
}