/*
 * Copyright (c) 2008 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *    Keith Packard <keithp@keithp.com>
 *    Mika Kuoppala <mika.kuoppala@intel.com>
 *
 */

#include <linux/ascii85.h>
#include <linux/nmi.h>
#include <linux/scatterlist.h>
#include <linux/stop_machine.h>
#include <linux/utsname.h>
#include <linux/zlib.h>

#include <drm/drm_print.h>

#include "display/intel_atomic.h"
#include "display/intel_overlay.h"

#include "gem/i915_gem_context.h"

#include "i915_drv.h"
#include "i915_gpu_error.h"
#include "i915_scatterlist.h"
#include "intel_csr.h"

static inline const struct intel_engine_cs *
engine_lookup(const struct drm_i915_private *i915, unsigned int id)
{
	if (id >= I915_NUM_ENGINES)
		return NULL;

	return i915->engine[id];
}

static inline const char *
__engine_name(const struct intel_engine_cs *engine)
{
	return engine ? engine->name : "";
}

static const char *
engine_name(const struct drm_i915_private *i915, unsigned int id)
{
	return __engine_name(engine_lookup(i915, id));
}

static const char *tiling_flag(int tiling)
{
	switch (tiling) {
	default:
	case I915_TILING_NONE: return "";
	case I915_TILING_X: return " X";
	case I915_TILING_Y: return " Y";
	}
}

static const char *dirty_flag(int dirty)
{
	return dirty ? " dirty" : "";
}

static const char *purgeable_flag(int purgeable)
{
	return purgeable ? " purgeable" : "";
}

static void __sg_set_buf(struct scatterlist *sg,
			 void *addr, unsigned int len, loff_t it)
{
	sg->page_link = (unsigned long)virt_to_page(addr);
	sg->offset = offset_in_page(addr);
	sg->length = len;
	sg->dma_address = it;
}

static bool __i915_error_grow(struct drm_i915_error_state_buf *e, size_t len)
{
	if (!len)
		return false;

	if (e->bytes + len + 1 <= e->size)
		return true;

	if (e->bytes) {
		__sg_set_buf(e->cur++, e->buf, e->bytes, e->iter);
		e->iter += e->bytes;
		e->buf = NULL;
		e->bytes = 0;
	}

	if (e->cur == e->end) {
		struct scatterlist *sgl;

		sgl = (typeof(sgl))__get_free_page(GFP_KERNEL);
		if (!sgl) {
			e->err = -ENOMEM;
			return false;
		}

		if (e->cur) {
			e->cur->offset = 0;
			e->cur->length = 0;
			e->cur->page_link =
				(unsigned long)sgl | SG_CHAIN;
		} else {
			e->sgl = sgl;
		}

		e->cur = sgl;
		e->end = sgl + SG_MAX_SINGLE_ALLOC - 1;
	}

	e->size = ALIGN(len + 1, SZ_64K);
	e->buf = kmalloc(e->size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
	if (!e->buf) {
		e->size = PAGE_ALIGN(len + 1);
		e->buf = kmalloc(e->size, GFP_KERNEL);
	}
	if (!e->buf) {
		e->err = -ENOMEM;
		return false;
	}

	return true;
}

__printf(2, 0)
static void i915_error_vprintf(struct drm_i915_error_state_buf *e,
			       const char *fmt, va_list args)
{
	va_list ap;
	int len;

	if (e->err)
		return;

	va_copy(ap, args);
	len = vsnprintf(NULL, 0, fmt, ap);
	va_end(ap);
	if (len <= 0) {
		e->err = len;
		return;
	}

	if (!__i915_error_grow(e, len))
		return;

	GEM_BUG_ON(e->bytes >= e->size);
	len = vscnprintf(e->buf + e->bytes, e->size - e->bytes, fmt, args);
	if (len < 0) {
		e->err = len;
		return;
	}
	e->bytes += len;
}

static void i915_error_puts(struct drm_i915_error_state_buf *e, const char *str)
{
	unsigned len;

	if (e->err || !str)
		return;

	len = strlen(str);
	if (!__i915_error_grow(e, len))
		return;

	GEM_BUG_ON(e->bytes + len > e->size);
	memcpy(e->buf + e->bytes, str, len);
	e->bytes += len;
}

#define err_printf(e, ...)
i915_error_printf(e, __VA_ARGS__) 197 #define err_puts(e, s) i915_error_puts(e, s) 198 199 static void __i915_printfn_error(struct drm_printer *p, struct va_format *vaf) 200 { 201 i915_error_vprintf(p->arg, vaf->fmt, *vaf->va); 202 } 203 204 static inline struct drm_printer 205 i915_error_printer(struct drm_i915_error_state_buf *e) 206 { 207 struct drm_printer p = { 208 .printfn = __i915_printfn_error, 209 .arg = e, 210 }; 211 return p; 212 } 213 214 #ifdef CONFIG_DRM_I915_COMPRESS_ERROR 215 216 struct compress { 217 struct z_stream_s zstream; 218 void *tmp; 219 }; 220 221 static bool compress_init(struct compress *c) 222 { 223 struct z_stream_s *zstream = memset(&c->zstream, 0, sizeof(c->zstream)); 224 225 zstream->workspace = 226 kmalloc(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), 227 GFP_ATOMIC | __GFP_NOWARN); 228 if (!zstream->workspace) 229 return false; 230 231 if (zlib_deflateInit(zstream, Z_DEFAULT_COMPRESSION) != Z_OK) { 232 kfree(zstream->workspace); 233 return false; 234 } 235 236 c->tmp = NULL; 237 if (i915_has_memcpy_from_wc()) 238 c->tmp = (void *)__get_free_page(GFP_ATOMIC | __GFP_NOWARN); 239 240 return true; 241 } 242 243 static void *compress_next_page(struct drm_i915_error_object *dst) 244 { 245 unsigned long page; 246 247 if (dst->page_count >= dst->num_pages) 248 return ERR_PTR(-ENOSPC); 249 250 page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN); 251 if (!page) 252 return ERR_PTR(-ENOMEM); 253 254 return dst->pages[dst->page_count++] = (void *)page; 255 } 256 257 static int compress_page(struct compress *c, 258 void *src, 259 struct drm_i915_error_object *dst) 260 { 261 struct z_stream_s *zstream = &c->zstream; 262 263 zstream->next_in = src; 264 if (c->tmp && i915_memcpy_from_wc(c->tmp, src, PAGE_SIZE)) 265 zstream->next_in = c->tmp; 266 zstream->avail_in = PAGE_SIZE; 267 268 do { 269 if (zstream->avail_out == 0) { 270 zstream->next_out = compress_next_page(dst); 271 if (IS_ERR(zstream->next_out)) 272 return PTR_ERR(zstream->next_out); 273 274 zstream->avail_out = PAGE_SIZE; 275 } 276 277 if (zlib_deflate(zstream, Z_NO_FLUSH) != Z_OK) 278 return -EIO; 279 280 touch_nmi_watchdog(); 281 } while (zstream->avail_in); 282 283 /* Fallback to uncompressed if we increase size? 
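	 * (Note: the size check just below is currently compiled out via
	 * "if (0 && ...)", so a buffer that grows under compression is still
	 * kept in its compressed form rather than rejected with -E2BIG.)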
*/ 284 if (0 && zstream->total_out > zstream->total_in) 285 return -E2BIG; 286 287 return 0; 288 } 289 290 static int compress_flush(struct compress *c, 291 struct drm_i915_error_object *dst) 292 { 293 struct z_stream_s *zstream = &c->zstream; 294 295 do { 296 switch (zlib_deflate(zstream, Z_FINISH)) { 297 case Z_OK: /* more space requested */ 298 zstream->next_out = compress_next_page(dst); 299 if (IS_ERR(zstream->next_out)) 300 return PTR_ERR(zstream->next_out); 301 302 zstream->avail_out = PAGE_SIZE; 303 break; 304 305 case Z_STREAM_END: 306 goto end; 307 308 default: /* any error */ 309 return -EIO; 310 } 311 } while (1); 312 313 end: 314 memset(zstream->next_out, 0, zstream->avail_out); 315 dst->unused = zstream->avail_out; 316 return 0; 317 } 318 319 static void compress_fini(struct compress *c, 320 struct drm_i915_error_object *dst) 321 { 322 struct z_stream_s *zstream = &c->zstream; 323 324 zlib_deflateEnd(zstream); 325 kfree(zstream->workspace); 326 if (c->tmp) 327 free_page((unsigned long)c->tmp); 328 } 329 330 static void err_compression_marker(struct drm_i915_error_state_buf *m) 331 { 332 err_puts(m, ":"); 333 } 334 335 #else 336 337 struct compress { 338 }; 339 340 static bool compress_init(struct compress *c) 341 { 342 return true; 343 } 344 345 static int compress_page(struct compress *c, 346 void *src, 347 struct drm_i915_error_object *dst) 348 { 349 unsigned long page; 350 void *ptr; 351 352 page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN); 353 if (!page) 354 return -ENOMEM; 355 356 ptr = (void *)page; 357 if (!i915_memcpy_from_wc(ptr, src, PAGE_SIZE)) 358 memcpy(ptr, src, PAGE_SIZE); 359 dst->pages[dst->page_count++] = ptr; 360 361 return 0; 362 } 363 364 static int compress_flush(struct compress *c, 365 struct drm_i915_error_object *dst) 366 { 367 return 0; 368 } 369 370 static void compress_fini(struct compress *c, 371 struct drm_i915_error_object *dst) 372 { 373 } 374 375 static void err_compression_marker(struct drm_i915_error_state_buf *m) 376 { 377 err_puts(m, "~"); 378 } 379 380 #endif 381 382 static void print_error_buffers(struct drm_i915_error_state_buf *m, 383 const char *name, 384 struct drm_i915_error_buffer *err, 385 int count) 386 { 387 err_printf(m, "%s [%d]:\n", name, count); 388 389 while (count--) { 390 err_printf(m, " %08x_%08x %8u %02x %02x", 391 upper_32_bits(err->gtt_offset), 392 lower_32_bits(err->gtt_offset), 393 err->size, 394 err->read_domains, 395 err->write_domain); 396 err_puts(m, tiling_flag(err->tiling)); 397 err_puts(m, dirty_flag(err->dirty)); 398 err_puts(m, purgeable_flag(err->purgeable)); 399 err_puts(m, err->userptr ? 
" userptr" : ""); 400 err_puts(m, i915_cache_level_str(m->i915, err->cache_level)); 401 402 if (err->name) 403 err_printf(m, " (name: %d)", err->name); 404 if (err->fence_reg != I915_FENCE_REG_NONE) 405 err_printf(m, " (fence: %d)", err->fence_reg); 406 407 err_puts(m, "\n"); 408 err++; 409 } 410 } 411 412 static void error_print_instdone(struct drm_i915_error_state_buf *m, 413 const struct drm_i915_error_engine *ee) 414 { 415 int slice; 416 int subslice; 417 418 err_printf(m, " INSTDONE: 0x%08x\n", 419 ee->instdone.instdone); 420 421 if (ee->engine_id != RCS0 || INTEL_GEN(m->i915) <= 3) 422 return; 423 424 err_printf(m, " SC_INSTDONE: 0x%08x\n", 425 ee->instdone.slice_common); 426 427 if (INTEL_GEN(m->i915) <= 6) 428 return; 429 430 for_each_instdone_slice_subslice(m->i915, slice, subslice) 431 err_printf(m, " SAMPLER_INSTDONE[%d][%d]: 0x%08x\n", 432 slice, subslice, 433 ee->instdone.sampler[slice][subslice]); 434 435 for_each_instdone_slice_subslice(m->i915, slice, subslice) 436 err_printf(m, " ROW_INSTDONE[%d][%d]: 0x%08x\n", 437 slice, subslice, 438 ee->instdone.row[slice][subslice]); 439 } 440 441 static void error_print_request(struct drm_i915_error_state_buf *m, 442 const char *prefix, 443 const struct drm_i915_error_request *erq, 444 const unsigned long epoch) 445 { 446 if (!erq->seqno) 447 return; 448 449 err_printf(m, "%s pid %d, seqno %8x:%08x%s%s, prio %d, emitted %dms, start %08x, head %08x, tail %08x\n", 450 prefix, erq->pid, erq->context, erq->seqno, 451 test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, 452 &erq->flags) ? "!" : "", 453 test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, 454 &erq->flags) ? "+" : "", 455 erq->sched_attr.priority, 456 jiffies_to_msecs(erq->jiffies - epoch), 457 erq->start, erq->head, erq->tail); 458 } 459 460 static void error_print_context(struct drm_i915_error_state_buf *m, 461 const char *header, 462 const struct drm_i915_error_context *ctx) 463 { 464 err_printf(m, "%s%s[%d] hw_id %d, prio %d, guilty %d active %d\n", 465 header, ctx->comm, ctx->pid, ctx->hw_id, 466 ctx->sched_attr.priority, ctx->guilty, ctx->active); 467 } 468 469 static void error_print_engine(struct drm_i915_error_state_buf *m, 470 const struct drm_i915_error_engine *ee, 471 const unsigned long epoch) 472 { 473 int n; 474 475 err_printf(m, "%s command stream:\n", 476 engine_name(m->i915, ee->engine_id)); 477 err_printf(m, " IDLE?: %s\n", yesno(ee->idle)); 478 err_printf(m, " START: 0x%08x\n", ee->start); 479 err_printf(m, " HEAD: 0x%08x [0x%08x]\n", ee->head, ee->rq_head); 480 err_printf(m, " TAIL: 0x%08x [0x%08x, 0x%08x]\n", 481 ee->tail, ee->rq_post, ee->rq_tail); 482 err_printf(m, " CTL: 0x%08x\n", ee->ctl); 483 err_printf(m, " MODE: 0x%08x\n", ee->mode); 484 err_printf(m, " HWS: 0x%08x\n", ee->hws); 485 err_printf(m, " ACTHD: 0x%08x %08x\n", 486 (u32)(ee->acthd>>32), (u32)ee->acthd); 487 err_printf(m, " IPEIR: 0x%08x\n", ee->ipeir); 488 err_printf(m, " IPEHR: 0x%08x\n", ee->ipehr); 489 490 error_print_instdone(m, ee); 491 492 if (ee->batchbuffer) { 493 u64 start = ee->batchbuffer->gtt_offset; 494 u64 end = start + ee->batchbuffer->gtt_size; 495 496 err_printf(m, " batch: [0x%08x_%08x, 0x%08x_%08x]\n", 497 upper_32_bits(start), lower_32_bits(start), 498 upper_32_bits(end), lower_32_bits(end)); 499 } 500 if (INTEL_GEN(m->i915) >= 4) { 501 err_printf(m, " BBADDR: 0x%08x_%08x\n", 502 (u32)(ee->bbaddr>>32), (u32)ee->bbaddr); 503 err_printf(m, " BB_STATE: 0x%08x\n", ee->bbstate); 504 err_printf(m, " INSTPS: 0x%08x\n", ee->instps); 505 } 506 err_printf(m, " INSTPM: 0x%08x\n", ee->instpm); 507 
err_printf(m, " FADDR: 0x%08x %08x\n", upper_32_bits(ee->faddr), 508 lower_32_bits(ee->faddr)); 509 if (INTEL_GEN(m->i915) >= 6) { 510 err_printf(m, " RC PSMI: 0x%08x\n", ee->rc_psmi); 511 err_printf(m, " FAULT_REG: 0x%08x\n", ee->fault_reg); 512 } 513 if (HAS_PPGTT(m->i915)) { 514 err_printf(m, " GFX_MODE: 0x%08x\n", ee->vm_info.gfx_mode); 515 516 if (INTEL_GEN(m->i915) >= 8) { 517 int i; 518 for (i = 0; i < 4; i++) 519 err_printf(m, " PDP%d: 0x%016llx\n", 520 i, ee->vm_info.pdp[i]); 521 } else { 522 err_printf(m, " PP_DIR_BASE: 0x%08x\n", 523 ee->vm_info.pp_dir_base); 524 } 525 } 526 err_printf(m, " ring->head: 0x%08x\n", ee->cpu_ring_head); 527 err_printf(m, " ring->tail: 0x%08x\n", ee->cpu_ring_tail); 528 err_printf(m, " hangcheck timestamp: %dms (%lu%s)\n", 529 jiffies_to_msecs(ee->hangcheck_timestamp - epoch), 530 ee->hangcheck_timestamp, 531 ee->hangcheck_timestamp == epoch ? "; epoch" : ""); 532 err_printf(m, " engine reset count: %u\n", ee->reset_count); 533 534 for (n = 0; n < ee->num_ports; n++) { 535 err_printf(m, " ELSP[%d]:", n); 536 error_print_request(m, " ", &ee->execlist[n], epoch); 537 } 538 539 error_print_context(m, " Active context: ", &ee->context); 540 } 541 542 void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...) 543 { 544 va_list args; 545 546 va_start(args, f); 547 i915_error_vprintf(e, f, args); 548 va_end(args); 549 } 550 551 static void print_error_obj(struct drm_i915_error_state_buf *m, 552 struct intel_engine_cs *engine, 553 const char *name, 554 struct drm_i915_error_object *obj) 555 { 556 char out[ASCII85_BUFSZ]; 557 int page; 558 559 if (!obj) 560 return; 561 562 if (name) { 563 err_printf(m, "%s --- %s = 0x%08x %08x\n", 564 engine ? engine->name : "global", name, 565 upper_32_bits(obj->gtt_offset), 566 lower_32_bits(obj->gtt_offset)); 567 } 568 569 err_compression_marker(m); 570 for (page = 0; page < obj->page_count; page++) { 571 int i, len; 572 573 len = PAGE_SIZE; 574 if (page == obj->page_count - 1) 575 len -= obj->unused; 576 len = ascii85_encode_len(len); 577 578 for (i = 0; i < len; i++) 579 err_puts(m, ascii85_encode(obj->pages[page][i], out)); 580 } 581 err_puts(m, "\n"); 582 } 583 584 static void err_print_capabilities(struct drm_i915_error_state_buf *m, 585 const struct intel_device_info *info, 586 const struct intel_runtime_info *runtime, 587 const struct intel_driver_caps *caps) 588 { 589 struct drm_printer p = i915_error_printer(m); 590 591 intel_device_info_dump_flags(info, &p); 592 intel_driver_caps_print(caps, &p); 593 intel_device_info_dump_topology(&runtime->sseu, &p); 594 } 595 596 static void err_print_params(struct drm_i915_error_state_buf *m, 597 const struct i915_params *params) 598 { 599 struct drm_printer p = i915_error_printer(m); 600 601 i915_params_dump(params, &p); 602 } 603 604 static void err_print_pciid(struct drm_i915_error_state_buf *m, 605 struct drm_i915_private *i915) 606 { 607 struct pci_dev *pdev = i915->drm.pdev; 608 609 err_printf(m, "PCI ID: 0x%04x\n", pdev->device); 610 err_printf(m, "PCI Revision: 0x%02x\n", pdev->revision); 611 err_printf(m, "PCI Subsystem: %04x:%04x\n", 612 pdev->subsystem_vendor, 613 pdev->subsystem_device); 614 } 615 616 static void err_print_uc(struct drm_i915_error_state_buf *m, 617 const struct i915_error_uc *error_uc) 618 { 619 struct drm_printer p = i915_error_printer(m); 620 const struct i915_gpu_state *error = 621 container_of(error_uc, typeof(*error), uc); 622 623 if (!error->device_info.has_guc) 624 return; 625 626 intel_uc_fw_dump(&error_uc->guc_fw, &p); 
627 intel_uc_fw_dump(&error_uc->huc_fw, &p); 628 print_error_obj(m, NULL, "GuC log buffer", error_uc->guc_log); 629 } 630 631 static void err_free_sgl(struct scatterlist *sgl) 632 { 633 while (sgl) { 634 struct scatterlist *sg; 635 636 for (sg = sgl; !sg_is_chain(sg); sg++) { 637 kfree(sg_virt(sg)); 638 if (sg_is_last(sg)) 639 break; 640 } 641 642 sg = sg_is_last(sg) ? NULL : sg_chain_ptr(sg); 643 free_page((unsigned long)sgl); 644 sgl = sg; 645 } 646 } 647 648 static void __err_print_to_sgl(struct drm_i915_error_state_buf *m, 649 struct i915_gpu_state *error) 650 { 651 struct drm_i915_error_object *obj; 652 struct timespec64 ts; 653 int i, j; 654 655 if (*error->error_msg) 656 err_printf(m, "%s\n", error->error_msg); 657 err_printf(m, "Kernel: %s %s\n", 658 init_utsname()->release, 659 init_utsname()->machine); 660 ts = ktime_to_timespec64(error->time); 661 err_printf(m, "Time: %lld s %ld us\n", 662 (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC); 663 ts = ktime_to_timespec64(error->boottime); 664 err_printf(m, "Boottime: %lld s %ld us\n", 665 (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC); 666 ts = ktime_to_timespec64(error->uptime); 667 err_printf(m, "Uptime: %lld s %ld us\n", 668 (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC); 669 err_printf(m, "Epoch: %lu jiffies (%u HZ)\n", error->epoch, HZ); 670 err_printf(m, "Capture: %lu jiffies; %d ms ago, %d ms after epoch\n", 671 error->capture, 672 jiffies_to_msecs(jiffies - error->capture), 673 jiffies_to_msecs(error->capture - error->epoch)); 674 675 for (i = 0; i < ARRAY_SIZE(error->engine); i++) { 676 if (!error->engine[i].context.pid) 677 continue; 678 679 err_printf(m, "Active process (on ring %s): %s [%d]\n", 680 engine_name(m->i915, i), 681 error->engine[i].context.comm, 682 error->engine[i].context.pid); 683 } 684 err_printf(m, "Reset count: %u\n", error->reset_count); 685 err_printf(m, "Suspend count: %u\n", error->suspend_count); 686 err_printf(m, "Platform: %s\n", intel_platform_name(error->device_info.platform)); 687 err_printf(m, "Subplatform: 0x%x\n", 688 intel_subplatform(&error->runtime_info, 689 error->device_info.platform)); 690 err_print_pciid(m, m->i915); 691 692 err_printf(m, "IOMMU enabled?: %d\n", error->iommu); 693 694 if (HAS_CSR(m->i915)) { 695 struct intel_csr *csr = &m->i915->csr; 696 697 err_printf(m, "DMC loaded: %s\n", 698 yesno(csr->dmc_payload != NULL)); 699 err_printf(m, "DMC fw version: %d.%d\n", 700 CSR_VERSION_MAJOR(csr->version), 701 CSR_VERSION_MINOR(csr->version)); 702 } 703 704 err_printf(m, "GT awake: %s\n", yesno(error->awake)); 705 err_printf(m, "RPM wakelock: %s\n", yesno(error->wakelock)); 706 err_printf(m, "PM suspended: %s\n", yesno(error->suspended)); 707 err_printf(m, "EIR: 0x%08x\n", error->eir); 708 err_printf(m, "IER: 0x%08x\n", error->ier); 709 for (i = 0; i < error->ngtier; i++) 710 err_printf(m, "GTIER[%d]: 0x%08x\n", i, error->gtier[i]); 711 err_printf(m, "PGTBL_ER: 0x%08x\n", error->pgtbl_er); 712 err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake); 713 err_printf(m, "DERRMR: 0x%08x\n", error->derrmr); 714 err_printf(m, "CCID: 0x%08x\n", error->ccid); 715 716 for (i = 0; i < error->nfence; i++) 717 err_printf(m, " fence[%d] = %08llx\n", i, error->fence[i]); 718 719 if (INTEL_GEN(m->i915) >= 6) { 720 err_printf(m, "ERROR: 0x%08x\n", error->error); 721 722 if (INTEL_GEN(m->i915) >= 8) 723 err_printf(m, "FAULT_TLB_DATA: 0x%08x 0x%08x\n", 724 error->fault_data1, error->fault_data0); 725 726 err_printf(m, "DONE_REG: 0x%08x\n", error->done_reg); 727 } 728 729 if (IS_GEN(m->i915, 7)) 730 
err_printf(m, "ERR_INT: 0x%08x\n", error->err_int); 731 732 for (i = 0; i < ARRAY_SIZE(error->engine); i++) { 733 if (error->engine[i].engine_id != -1) 734 error_print_engine(m, &error->engine[i], error->epoch); 735 } 736 737 for (i = 0; i < ARRAY_SIZE(error->active_vm); i++) { 738 char buf[128]; 739 int len, first = 1; 740 741 if (!error->active_vm[i]) 742 break; 743 744 len = scnprintf(buf, sizeof(buf), "Active ("); 745 for (j = 0; j < ARRAY_SIZE(error->engine); j++) { 746 if (error->engine[j].vm != error->active_vm[i]) 747 continue; 748 749 len += scnprintf(buf + len, sizeof(buf), "%s%s", 750 first ? "" : ", ", 751 m->i915->engine[j]->name); 752 first = 0; 753 } 754 scnprintf(buf + len, sizeof(buf), ")"); 755 print_error_buffers(m, buf, 756 error->active_bo[i], 757 error->active_bo_count[i]); 758 } 759 760 print_error_buffers(m, "Pinned (global)", 761 error->pinned_bo, 762 error->pinned_bo_count); 763 764 for (i = 0; i < ARRAY_SIZE(error->engine); i++) { 765 const struct drm_i915_error_engine *ee = &error->engine[i]; 766 767 obj = ee->batchbuffer; 768 if (obj) { 769 err_puts(m, m->i915->engine[i]->name); 770 if (ee->context.pid) 771 err_printf(m, " (submitted by %s [%d])", 772 ee->context.comm, 773 ee->context.pid); 774 err_printf(m, " --- gtt_offset = 0x%08x %08x\n", 775 upper_32_bits(obj->gtt_offset), 776 lower_32_bits(obj->gtt_offset)); 777 print_error_obj(m, m->i915->engine[i], NULL, obj); 778 } 779 780 for (j = 0; j < ee->user_bo_count; j++) 781 print_error_obj(m, m->i915->engine[i], 782 "user", ee->user_bo[j]); 783 784 if (ee->num_requests) { 785 err_printf(m, "%s --- %d requests\n", 786 m->i915->engine[i]->name, 787 ee->num_requests); 788 for (j = 0; j < ee->num_requests; j++) 789 error_print_request(m, " ", 790 &ee->requests[j], 791 error->epoch); 792 } 793 794 print_error_obj(m, m->i915->engine[i], 795 "ringbuffer", ee->ringbuffer); 796 797 print_error_obj(m, m->i915->engine[i], 798 "HW Status", ee->hws_page); 799 800 print_error_obj(m, m->i915->engine[i], 801 "HW context", ee->ctx); 802 803 print_error_obj(m, m->i915->engine[i], 804 "WA context", ee->wa_ctx); 805 806 print_error_obj(m, m->i915->engine[i], 807 "WA batchbuffer", ee->wa_batchbuffer); 808 809 print_error_obj(m, m->i915->engine[i], 810 "NULL context", ee->default_state); 811 } 812 813 if (error->overlay) 814 intel_overlay_print_error_state(m, error->overlay); 815 816 if (error->display) 817 intel_display_print_error_state(m, error->display); 818 819 err_print_capabilities(m, &error->device_info, &error->runtime_info, 820 &error->driver_caps); 821 err_print_params(m, &error->params); 822 err_print_uc(m, &error->uc); 823 } 824 825 static int err_print_to_sgl(struct i915_gpu_state *error) 826 { 827 struct drm_i915_error_state_buf m; 828 829 if (IS_ERR(error)) 830 return PTR_ERR(error); 831 832 if (READ_ONCE(error->sgl)) 833 return 0; 834 835 memset(&m, 0, sizeof(m)); 836 m.i915 = error->i915; 837 838 __err_print_to_sgl(&m, error); 839 840 if (m.buf) { 841 __sg_set_buf(m.cur++, m.buf, m.bytes, m.iter); 842 m.bytes = 0; 843 m.buf = NULL; 844 } 845 if (m.cur) { 846 GEM_BUG_ON(m.end < m.cur); 847 sg_mark_end(m.cur - 1); 848 } 849 GEM_BUG_ON(m.sgl && !m.cur); 850 851 if (m.err) { 852 err_free_sgl(m.sgl); 853 return m.err; 854 } 855 856 if (cmpxchg(&error->sgl, NULL, m.sgl)) 857 err_free_sgl(m.sgl); 858 859 return 0; 860 } 861 862 ssize_t i915_gpu_state_copy_to_buffer(struct i915_gpu_state *error, 863 char *buf, loff_t off, size_t rem) 864 { 865 struct scatterlist *sg; 866 size_t count; 867 loff_t pos; 868 int err; 869 870 
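	/*
	 * The error-state text is generated lazily: err_print_to_sgl() below
	 * renders it into a scatterlist of page-sized chunks on first use,
	 * and error->fit remembers the chunk that satisfied the previous
	 * read so sequential reads can resume without rescanning the list.
	 */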
if (!error || !rem) 871 return 0; 872 873 err = err_print_to_sgl(error); 874 if (err) 875 return err; 876 877 sg = READ_ONCE(error->fit); 878 if (!sg || off < sg->dma_address) 879 sg = error->sgl; 880 if (!sg) 881 return 0; 882 883 pos = sg->dma_address; 884 count = 0; 885 do { 886 size_t len, start; 887 888 if (sg_is_chain(sg)) { 889 sg = sg_chain_ptr(sg); 890 GEM_BUG_ON(sg_is_chain(sg)); 891 } 892 893 len = sg->length; 894 if (pos + len <= off) { 895 pos += len; 896 continue; 897 } 898 899 start = sg->offset; 900 if (pos < off) { 901 GEM_BUG_ON(off - pos > len); 902 len -= off - pos; 903 start += off - pos; 904 pos = off; 905 } 906 907 len = min(len, rem); 908 GEM_BUG_ON(!len || len > sg->length); 909 910 memcpy(buf, page_address(sg_page(sg)) + start, len); 911 912 count += len; 913 pos += len; 914 915 buf += len; 916 rem -= len; 917 if (!rem) { 918 WRITE_ONCE(error->fit, sg); 919 break; 920 } 921 } while (!sg_is_last(sg++)); 922 923 return count; 924 } 925 926 static void i915_error_object_free(struct drm_i915_error_object *obj) 927 { 928 int page; 929 930 if (obj == NULL) 931 return; 932 933 for (page = 0; page < obj->page_count; page++) 934 free_page((unsigned long)obj->pages[page]); 935 936 kfree(obj); 937 } 938 939 940 static void cleanup_params(struct i915_gpu_state *error) 941 { 942 i915_params_free(&error->params); 943 } 944 945 static void cleanup_uc_state(struct i915_gpu_state *error) 946 { 947 struct i915_error_uc *error_uc = &error->uc; 948 949 kfree(error_uc->guc_fw.path); 950 kfree(error_uc->huc_fw.path); 951 i915_error_object_free(error_uc->guc_log); 952 } 953 954 void __i915_gpu_state_free(struct kref *error_ref) 955 { 956 struct i915_gpu_state *error = 957 container_of(error_ref, typeof(*error), ref); 958 long i, j; 959 960 for (i = 0; i < ARRAY_SIZE(error->engine); i++) { 961 struct drm_i915_error_engine *ee = &error->engine[i]; 962 963 for (j = 0; j < ee->user_bo_count; j++) 964 i915_error_object_free(ee->user_bo[j]); 965 kfree(ee->user_bo); 966 967 i915_error_object_free(ee->batchbuffer); 968 i915_error_object_free(ee->wa_batchbuffer); 969 i915_error_object_free(ee->ringbuffer); 970 i915_error_object_free(ee->hws_page); 971 i915_error_object_free(ee->ctx); 972 i915_error_object_free(ee->wa_ctx); 973 974 kfree(ee->requests); 975 } 976 977 for (i = 0; i < ARRAY_SIZE(error->active_bo); i++) 978 kfree(error->active_bo[i]); 979 kfree(error->pinned_bo); 980 981 kfree(error->overlay); 982 kfree(error->display); 983 984 cleanup_params(error); 985 cleanup_uc_state(error); 986 987 err_free_sgl(error->sgl); 988 kfree(error); 989 } 990 991 static struct drm_i915_error_object * 992 i915_error_object_create(struct drm_i915_private *i915, 993 struct i915_vma *vma) 994 { 995 struct i915_ggtt *ggtt = &i915->ggtt; 996 const u64 slot = ggtt->error_capture.start; 997 struct drm_i915_error_object *dst; 998 struct compress compress; 999 unsigned long num_pages; 1000 struct sgt_iter iter; 1001 dma_addr_t dma; 1002 int ret; 1003 1004 if (!vma || !vma->pages) 1005 return NULL; 1006 1007 num_pages = min_t(u64, vma->size, vma->obj->base.size) >> PAGE_SHIFT; 1008 num_pages = DIV_ROUND_UP(10 * num_pages, 8); /* worstcase zlib growth */ 1009 dst = kmalloc(sizeof(*dst) + num_pages * sizeof(u32 *), 1010 GFP_ATOMIC | __GFP_NOWARN); 1011 if (!dst) 1012 return NULL; 1013 1014 dst->gtt_offset = vma->node.start; 1015 dst->gtt_size = vma->node.size; 1016 dst->num_pages = num_pages; 1017 dst->page_count = 0; 1018 dst->unused = 0; 1019 1020 if (!compress_init(&compress)) { 1021 kfree(dst); 1022 return NULL; 
	}

	ret = -EINVAL;
	for_each_sgt_dma(dma, iter, vma->pages) {
		void __iomem *s;

		ggtt->vm.insert_page(&ggtt->vm, dma, slot, I915_CACHE_NONE, 0);

		s = io_mapping_map_atomic_wc(&ggtt->iomap, slot);
		ret = compress_page(&compress, (void __force *)s, dst);
		io_mapping_unmap_atomic(s);
		if (ret)
			break;
	}

	if (ret || compress_flush(&compress, dst)) {
		while (dst->page_count--)
			free_page((unsigned long)dst->pages[dst->page_count]);
		kfree(dst);
		dst = NULL;
	}

	compress_fini(&compress, dst);
	return dst;
}

static void capture_bo(struct drm_i915_error_buffer *err,
		       struct i915_vma *vma)
{
	struct drm_i915_gem_object *obj = vma->obj;

	err->size = obj->base.size;
	err->name = obj->base.name;

	err->gtt_offset = vma->node.start;
	err->read_domains = obj->read_domains;
	err->write_domain = obj->write_domain;
	err->fence_reg = vma->fence ? vma->fence->id : -1;
	err->tiling = i915_gem_object_get_tiling(obj);
	err->dirty = obj->mm.dirty;
	err->purgeable = obj->mm.madv != I915_MADV_WILLNEED;
	err->userptr = obj->userptr.mm != NULL;
	err->cache_level = obj->cache_level;
}

static u32 capture_error_bo(struct drm_i915_error_buffer *err,
			    int count, struct list_head *head,
			    unsigned int flags)
#define ACTIVE_ONLY BIT(0)
#define PINNED_ONLY BIT(1)
{
	struct i915_vma *vma;
	int i = 0;

	list_for_each_entry(vma, head, vm_link) {
		if (!vma->obj)
			continue;

		if (flags & ACTIVE_ONLY && !i915_vma_is_active(vma))
			continue;

		if (flags & PINNED_ONLY && !i915_vma_is_pinned(vma))
			continue;

		capture_bo(err++, vma);
		if (++i == count)
			break;
	}

	return i;
}

/*
 * Generate a semi-unique error code. The code is not meant to have meaning;
 * its only purpose is to try to prevent duplicate bug reports by grossly
 * estimating a GPU error state.
 *
 * TODO Ideally, hashing the batchbuffer would be a very nice way to determine
 * the hang if we could strip the GTT offset information from it.
 *
 * It's only a small step better than a random number in its current form.
 */
static u32 i915_error_generate_code(struct i915_gpu_state *error,
				    intel_engine_mask_t engine_mask)
{
	/*
	 * IPEHR would be an ideal way to detect errors, as it's the gross
	 * measure of "the command that hung". However, it often holds very
	 * common synchronization commands which almost always appear when
	 * the hang is strictly a client bug, so use instdone to help
	 * differentiate those cases.
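	 *
	 * The value computed here shows up as the last field of the
	 * "GPU HANG: ecode <gen>:<engines>:<code>" message that error_msg()
	 * later emits to the kernel log.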
1113 */ 1114 if (engine_mask) { 1115 struct drm_i915_error_engine *ee = 1116 &error->engine[ffs(engine_mask)]; 1117 1118 return ee->ipehr ^ ee->instdone.instdone; 1119 } 1120 1121 return 0; 1122 } 1123 1124 static void gem_record_fences(struct i915_gpu_state *error) 1125 { 1126 struct drm_i915_private *dev_priv = error->i915; 1127 struct intel_uncore *uncore = &dev_priv->uncore; 1128 int i; 1129 1130 if (INTEL_GEN(dev_priv) >= 6) { 1131 for (i = 0; i < dev_priv->ggtt.num_fences; i++) 1132 error->fence[i] = 1133 intel_uncore_read64(uncore, 1134 FENCE_REG_GEN6_LO(i)); 1135 } else if (INTEL_GEN(dev_priv) >= 4) { 1136 for (i = 0; i < dev_priv->ggtt.num_fences; i++) 1137 error->fence[i] = 1138 intel_uncore_read64(uncore, 1139 FENCE_REG_965_LO(i)); 1140 } else { 1141 for (i = 0; i < dev_priv->ggtt.num_fences; i++) 1142 error->fence[i] = 1143 intel_uncore_read(uncore, FENCE_REG(i)); 1144 } 1145 error->nfence = i; 1146 } 1147 1148 static void error_record_engine_registers(struct i915_gpu_state *error, 1149 struct intel_engine_cs *engine, 1150 struct drm_i915_error_engine *ee) 1151 { 1152 struct drm_i915_private *dev_priv = engine->i915; 1153 1154 if (INTEL_GEN(dev_priv) >= 6) { 1155 ee->rc_psmi = ENGINE_READ(engine, RING_PSMI_CTL); 1156 if (INTEL_GEN(dev_priv) >= 8) 1157 ee->fault_reg = I915_READ(GEN8_RING_FAULT_REG); 1158 else 1159 ee->fault_reg = GEN6_RING_FAULT_REG_READ(engine); 1160 } 1161 1162 if (INTEL_GEN(dev_priv) >= 4) { 1163 ee->faddr = ENGINE_READ(engine, RING_DMA_FADD); 1164 ee->ipeir = ENGINE_READ(engine, RING_IPEIR); 1165 ee->ipehr = ENGINE_READ(engine, RING_IPEHR); 1166 ee->instps = ENGINE_READ(engine, RING_INSTPS); 1167 ee->bbaddr = ENGINE_READ(engine, RING_BBADDR); 1168 if (INTEL_GEN(dev_priv) >= 8) { 1169 ee->faddr |= (u64)ENGINE_READ(engine, RING_DMA_FADD_UDW) << 32; 1170 ee->bbaddr |= (u64)ENGINE_READ(engine, RING_BBADDR_UDW) << 32; 1171 } 1172 ee->bbstate = ENGINE_READ(engine, RING_BBSTATE); 1173 } else { 1174 ee->faddr = ENGINE_READ(engine, DMA_FADD_I8XX); 1175 ee->ipeir = ENGINE_READ(engine, IPEIR); 1176 ee->ipehr = ENGINE_READ(engine, IPEHR); 1177 } 1178 1179 intel_engine_get_instdone(engine, &ee->instdone); 1180 1181 ee->instpm = ENGINE_READ(engine, RING_INSTPM); 1182 ee->acthd = intel_engine_get_active_head(engine); 1183 ee->start = ENGINE_READ(engine, RING_START); 1184 ee->head = ENGINE_READ(engine, RING_HEAD); 1185 ee->tail = ENGINE_READ(engine, RING_TAIL); 1186 ee->ctl = ENGINE_READ(engine, RING_CTL); 1187 if (INTEL_GEN(dev_priv) > 2) 1188 ee->mode = ENGINE_READ(engine, RING_MI_MODE); 1189 1190 if (!HWS_NEEDS_PHYSICAL(dev_priv)) { 1191 i915_reg_t mmio; 1192 1193 if (IS_GEN(dev_priv, 7)) { 1194 switch (engine->id) { 1195 default: 1196 MISSING_CASE(engine->id); 1197 case RCS0: 1198 mmio = RENDER_HWS_PGA_GEN7; 1199 break; 1200 case BCS0: 1201 mmio = BLT_HWS_PGA_GEN7; 1202 break; 1203 case VCS0: 1204 mmio = BSD_HWS_PGA_GEN7; 1205 break; 1206 case VECS0: 1207 mmio = VEBOX_HWS_PGA_GEN7; 1208 break; 1209 } 1210 } else if (IS_GEN(engine->i915, 6)) { 1211 mmio = RING_HWS_PGA_GEN6(engine->mmio_base); 1212 } else { 1213 /* XXX: gen8 returns to sanity */ 1214 mmio = RING_HWS_PGA(engine->mmio_base); 1215 } 1216 1217 ee->hws = I915_READ(mmio); 1218 } 1219 1220 ee->idle = intel_engine_is_idle(engine); 1221 if (!ee->idle) 1222 ee->hangcheck_timestamp = engine->hangcheck.action_timestamp; 1223 ee->reset_count = i915_reset_engine_count(&dev_priv->gpu_error, 1224 engine); 1225 1226 if (HAS_PPGTT(dev_priv)) { 1227 int i; 1228 1229 ee->vm_info.gfx_mode = ENGINE_READ(engine, 
RING_MODE_GEN7); 1230 1231 if (IS_GEN(dev_priv, 6)) { 1232 ee->vm_info.pp_dir_base = 1233 ENGINE_READ(engine, RING_PP_DIR_BASE_READ); 1234 } else if (IS_GEN(dev_priv, 7)) { 1235 ee->vm_info.pp_dir_base = 1236 ENGINE_READ(engine, RING_PP_DIR_BASE); 1237 } else if (INTEL_GEN(dev_priv) >= 8) { 1238 u32 base = engine->mmio_base; 1239 1240 for (i = 0; i < 4; i++) { 1241 ee->vm_info.pdp[i] = 1242 I915_READ(GEN8_RING_PDP_UDW(base, i)); 1243 ee->vm_info.pdp[i] <<= 32; 1244 ee->vm_info.pdp[i] |= 1245 I915_READ(GEN8_RING_PDP_LDW(base, i)); 1246 } 1247 } 1248 } 1249 } 1250 1251 static void record_request(struct i915_request *request, 1252 struct drm_i915_error_request *erq) 1253 { 1254 struct i915_gem_context *ctx = request->gem_context; 1255 1256 erq->flags = request->fence.flags; 1257 erq->context = request->fence.context; 1258 erq->seqno = request->fence.seqno; 1259 erq->sched_attr = request->sched.attr; 1260 erq->jiffies = request->emitted_jiffies; 1261 erq->start = i915_ggtt_offset(request->ring->vma); 1262 erq->head = request->head; 1263 erq->tail = request->tail; 1264 1265 rcu_read_lock(); 1266 erq->pid = ctx->pid ? pid_nr(ctx->pid) : 0; 1267 rcu_read_unlock(); 1268 } 1269 1270 static void engine_record_requests(struct intel_engine_cs *engine, 1271 struct i915_request *first, 1272 struct drm_i915_error_engine *ee) 1273 { 1274 struct i915_request *request; 1275 int count; 1276 1277 count = 0; 1278 request = first; 1279 list_for_each_entry_from(request, &engine->active.requests, sched.link) 1280 count++; 1281 if (!count) 1282 return; 1283 1284 ee->requests = kcalloc(count, sizeof(*ee->requests), GFP_ATOMIC); 1285 if (!ee->requests) 1286 return; 1287 1288 ee->num_requests = count; 1289 1290 count = 0; 1291 request = first; 1292 list_for_each_entry_from(request, 1293 &engine->active.requests, sched.link) { 1294 if (count >= ee->num_requests) { 1295 /* 1296 * If the ring request list was changed in 1297 * between the point where the error request 1298 * list was created and dimensioned and this 1299 * point then just exit early to avoid crashes. 1300 * 1301 * We don't need to communicate that the 1302 * request list changed state during error 1303 * state capture and that the error state is 1304 * slightly incorrect as a consequence since we 1305 * are typically only interested in the request 1306 * list state at the point of error state 1307 * capture, not in any changes happening during 1308 * the capture. 
1309 */ 1310 break; 1311 } 1312 1313 record_request(request, &ee->requests[count++]); 1314 } 1315 ee->num_requests = count; 1316 } 1317 1318 static void error_record_engine_execlists(struct intel_engine_cs *engine, 1319 struct drm_i915_error_engine *ee) 1320 { 1321 const struct intel_engine_execlists * const execlists = &engine->execlists; 1322 unsigned int n; 1323 1324 for (n = 0; n < execlists_num_ports(execlists); n++) { 1325 struct i915_request *rq = port_request(&execlists->port[n]); 1326 1327 if (!rq) 1328 break; 1329 1330 record_request(rq, &ee->execlist[n]); 1331 } 1332 1333 ee->num_ports = n; 1334 } 1335 1336 static void record_context(struct drm_i915_error_context *e, 1337 struct i915_gem_context *ctx) 1338 { 1339 if (ctx->pid) { 1340 struct task_struct *task; 1341 1342 rcu_read_lock(); 1343 task = pid_task(ctx->pid, PIDTYPE_PID); 1344 if (task) { 1345 strcpy(e->comm, task->comm); 1346 e->pid = task->pid; 1347 } 1348 rcu_read_unlock(); 1349 } 1350 1351 e->hw_id = ctx->hw_id; 1352 e->sched_attr = ctx->sched; 1353 e->guilty = atomic_read(&ctx->guilty_count); 1354 e->active = atomic_read(&ctx->active_count); 1355 } 1356 1357 static void request_record_user_bo(struct i915_request *request, 1358 struct drm_i915_error_engine *ee) 1359 { 1360 struct i915_capture_list *c; 1361 struct drm_i915_error_object **bo; 1362 long count, max; 1363 1364 max = 0; 1365 for (c = request->capture_list; c; c = c->next) 1366 max++; 1367 if (!max) 1368 return; 1369 1370 bo = kmalloc_array(max, sizeof(*bo), GFP_ATOMIC); 1371 if (!bo) { 1372 /* If we can't capture everything, try to capture something. */ 1373 max = min_t(long, max, PAGE_SIZE / sizeof(*bo)); 1374 bo = kmalloc_array(max, sizeof(*bo), GFP_ATOMIC); 1375 } 1376 if (!bo) 1377 return; 1378 1379 count = 0; 1380 for (c = request->capture_list; c; c = c->next) { 1381 bo[count] = i915_error_object_create(request->i915, c->vma); 1382 if (!bo[count]) 1383 break; 1384 if (++count == max) 1385 break; 1386 } 1387 1388 ee->user_bo = bo; 1389 ee->user_bo_count = count; 1390 } 1391 1392 static struct drm_i915_error_object * 1393 capture_object(struct drm_i915_private *dev_priv, 1394 struct drm_i915_gem_object *obj) 1395 { 1396 if (obj && i915_gem_object_has_pages(obj)) { 1397 struct i915_vma fake = { 1398 .node = { .start = U64_MAX, .size = obj->base.size }, 1399 .size = obj->base.size, 1400 .pages = obj->mm.pages, 1401 .obj = obj, 1402 }; 1403 1404 return i915_error_object_create(dev_priv, &fake); 1405 } else { 1406 return NULL; 1407 } 1408 } 1409 1410 static void gem_record_rings(struct i915_gpu_state *error) 1411 { 1412 struct drm_i915_private *i915 = error->i915; 1413 struct i915_ggtt *ggtt = &i915->ggtt; 1414 int i; 1415 1416 for (i = 0; i < I915_NUM_ENGINES; i++) { 1417 struct intel_engine_cs *engine = i915->engine[i]; 1418 struct drm_i915_error_engine *ee = &error->engine[i]; 1419 struct i915_request *request; 1420 1421 ee->engine_id = -1; 1422 1423 if (!engine) 1424 continue; 1425 1426 ee->engine_id = i; 1427 1428 error_record_engine_registers(error, engine, ee); 1429 error_record_engine_execlists(engine, ee); 1430 1431 request = intel_engine_find_active_request(engine); 1432 if (request) { 1433 struct i915_gem_context *ctx = request->gem_context; 1434 struct intel_ring *ring; 1435 1436 ee->vm = ctx->vm ?: &ggtt->vm; 1437 1438 record_context(&ee->context, ctx); 1439 1440 /* We need to copy these to an anonymous buffer 1441 * as the simplest method to avoid being overwritten 1442 * by userspace. 
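			 * The copy is made by i915_error_object_create(),
			 * which reads each page back through the reserved
			 * GGTT error-capture slot and stores it (zlib
			 * compressed when CONFIG_DRM_I915_COMPRESS_ERROR is
			 * enabled) in kernel memory.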
1443 */ 1444 ee->batchbuffer = 1445 i915_error_object_create(i915, request->batch); 1446 1447 if (HAS_BROKEN_CS_TLB(i915)) 1448 ee->wa_batchbuffer = 1449 i915_error_object_create(i915, 1450 i915->gt.scratch); 1451 request_record_user_bo(request, ee); 1452 1453 ee->ctx = 1454 i915_error_object_create(i915, 1455 request->hw_context->state); 1456 1457 error->simulated |= 1458 i915_gem_context_no_error_capture(ctx); 1459 1460 ee->rq_head = request->head; 1461 ee->rq_post = request->postfix; 1462 ee->rq_tail = request->tail; 1463 1464 ring = request->ring; 1465 ee->cpu_ring_head = ring->head; 1466 ee->cpu_ring_tail = ring->tail; 1467 ee->ringbuffer = 1468 i915_error_object_create(i915, ring->vma); 1469 1470 engine_record_requests(engine, request, ee); 1471 } 1472 1473 ee->hws_page = 1474 i915_error_object_create(i915, 1475 engine->status_page.vma); 1476 1477 ee->wa_ctx = i915_error_object_create(i915, engine->wa_ctx.vma); 1478 1479 ee->default_state = capture_object(i915, engine->default_state); 1480 } 1481 } 1482 1483 static void gem_capture_vm(struct i915_gpu_state *error, 1484 struct i915_address_space *vm, 1485 int idx) 1486 { 1487 struct drm_i915_error_buffer *active_bo; 1488 struct i915_vma *vma; 1489 int count; 1490 1491 count = 0; 1492 list_for_each_entry(vma, &vm->bound_list, vm_link) 1493 if (i915_vma_is_active(vma)) 1494 count++; 1495 1496 active_bo = NULL; 1497 if (count) 1498 active_bo = kcalloc(count, sizeof(*active_bo), GFP_ATOMIC); 1499 if (active_bo) 1500 count = capture_error_bo(active_bo, 1501 count, &vm->bound_list, 1502 ACTIVE_ONLY); 1503 else 1504 count = 0; 1505 1506 error->active_vm[idx] = vm; 1507 error->active_bo[idx] = active_bo; 1508 error->active_bo_count[idx] = count; 1509 } 1510 1511 static void capture_active_buffers(struct i915_gpu_state *error) 1512 { 1513 int cnt = 0, i, j; 1514 1515 BUILD_BUG_ON(ARRAY_SIZE(error->engine) > ARRAY_SIZE(error->active_bo)); 1516 BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_vm)); 1517 BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_bo_count)); 1518 1519 /* Scan each engine looking for unique active contexts/vm */ 1520 for (i = 0; i < ARRAY_SIZE(error->engine); i++) { 1521 struct drm_i915_error_engine *ee = &error->engine[i]; 1522 bool found; 1523 1524 if (!ee->vm) 1525 continue; 1526 1527 found = false; 1528 for (j = 0; j < i && !found; j++) 1529 found = error->engine[j].vm == ee->vm; 1530 if (!found) 1531 gem_capture_vm(error, ee->vm, cnt++); 1532 } 1533 } 1534 1535 static void capture_pinned_buffers(struct i915_gpu_state *error) 1536 { 1537 struct i915_address_space *vm = &error->i915->ggtt.vm; 1538 struct drm_i915_error_buffer *bo; 1539 struct i915_vma *vma; 1540 int count; 1541 1542 count = 0; 1543 list_for_each_entry(vma, &vm->bound_list, vm_link) 1544 count++; 1545 1546 bo = NULL; 1547 if (count) 1548 bo = kcalloc(count, sizeof(*bo), GFP_ATOMIC); 1549 if (!bo) 1550 return; 1551 1552 error->pinned_bo_count = 1553 capture_error_bo(bo, count, &vm->bound_list, PINNED_ONLY); 1554 error->pinned_bo = bo; 1555 } 1556 1557 static void capture_uc_state(struct i915_gpu_state *error) 1558 { 1559 struct drm_i915_private *i915 = error->i915; 1560 struct i915_error_uc *error_uc = &error->uc; 1561 1562 /* Capturing uC state won't be useful if there is no GuC */ 1563 if (!error->device_info.has_guc) 1564 return; 1565 1566 error_uc->guc_fw = i915->guc.fw; 1567 error_uc->huc_fw = i915->huc.fw; 1568 1569 /* Non-default firmware paths will be specified by the modparam. 
	 * As modparams are generally accessible from userspace, make
	 * explicit copies of the firmware paths.
	 */
	error_uc->guc_fw.path = kstrdup(i915->guc.fw.path, GFP_ATOMIC);
	error_uc->huc_fw.path = kstrdup(i915->huc.fw.path, GFP_ATOMIC);
	error_uc->guc_log = i915_error_object_create(i915, i915->guc.log.vma);
}

/* Capture all registers which don't fit into another category. */
static void capture_reg_state(struct i915_gpu_state *error)
{
	struct drm_i915_private *i915 = error->i915;
	struct intel_uncore *uncore = &i915->uncore;
	int i;

	/* General organization
	 * 1. Registers specific to a single generation
	 * 2. Registers which belong to multiple generations
	 * 3. Feature specific registers.
	 * 4. Everything else
	 * Please try to follow the order.
	 */

	/* 1: Registers specific to a single generation */
	if (IS_VALLEYVIEW(i915)) {
		error->gtier[0] = intel_uncore_read(uncore, GTIER);
		error->ier = intel_uncore_read(uncore, VLV_IER);
		error->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_VLV);
	}

	if (IS_GEN(i915, 7))
		error->err_int = intel_uncore_read(uncore, GEN7_ERR_INT);

	if (INTEL_GEN(i915) >= 8) {
		error->fault_data0 = intel_uncore_read(uncore,
						       GEN8_FAULT_TLB_DATA0);
		error->fault_data1 = intel_uncore_read(uncore,
						       GEN8_FAULT_TLB_DATA1);
	}

	if (IS_GEN(i915, 6)) {
		error->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE);
		error->gab_ctl = intel_uncore_read(uncore, GAB_CTL);
		error->gfx_mode = intel_uncore_read(uncore, GFX_MODE);
	}

	/* 2: Registers which belong to multiple generations */
	if (INTEL_GEN(i915) >= 7)
		error->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_MT);

	if (INTEL_GEN(i915) >= 6) {
		error->derrmr = intel_uncore_read(uncore, DERRMR);
		error->error = intel_uncore_read(uncore, ERROR_GEN6);
		error->done_reg = intel_uncore_read(uncore, DONE_REG);
	}

	if (INTEL_GEN(i915) >= 5)
		error->ccid = intel_uncore_read(uncore, CCID(RENDER_RING_BASE));

	/* 3: Feature specific registers */
	if (IS_GEN_RANGE(i915, 6, 7)) {
		error->gam_ecochk = intel_uncore_read(uncore, GAM_ECOCHK);
		error->gac_eco = intel_uncore_read(uncore, GAC_ECO_BITS);
	}

	/* 4: Everything else */
	if (INTEL_GEN(i915) >= 11) {
		error->ier = intel_uncore_read(uncore, GEN8_DE_MISC_IER);
		error->gtier[0] =
			intel_uncore_read(uncore,
					  GEN11_RENDER_COPY_INTR_ENABLE);
		error->gtier[1] =
			intel_uncore_read(uncore, GEN11_VCS_VECS_INTR_ENABLE);
		error->gtier[2] =
			intel_uncore_read(uncore, GEN11_GUC_SG_INTR_ENABLE);
		error->gtier[3] =
			intel_uncore_read(uncore,
					  GEN11_GPM_WGBOXPERF_INTR_ENABLE);
		error->gtier[4] =
			intel_uncore_read(uncore,
					  GEN11_CRYPTO_RSVD_INTR_ENABLE);
		error->gtier[5] =
			intel_uncore_read(uncore,
					  GEN11_GUNIT_CSME_INTR_ENABLE);
		error->ngtier = 6;
	} else if (INTEL_GEN(i915) >= 8) {
		error->ier = intel_uncore_read(uncore, GEN8_DE_MISC_IER);
		for (i = 0; i < 4; i++)
			error->gtier[i] = intel_uncore_read(uncore,
							    GEN8_GT_IER(i));
		error->ngtier = 4;
	} else if (HAS_PCH_SPLIT(i915)) {
		error->ier = intel_uncore_read(uncore, DEIER);
		error->gtier[0] = intel_uncore_read(uncore, GTIER);
		error->ngtier = 1;
	} else if (IS_GEN(i915, 2)) {
		error->ier = intel_uncore_read16(uncore, GEN2_IER);
	}
else if (!IS_VALLEYVIEW(i915)) { 1668 error->ier = intel_uncore_read(uncore, GEN2_IER); 1669 } 1670 error->eir = intel_uncore_read(uncore, EIR); 1671 error->pgtbl_er = intel_uncore_read(uncore, PGTBL_ER); 1672 } 1673 1674 static const char * 1675 error_msg(struct i915_gpu_state *error, 1676 intel_engine_mask_t engines, const char *msg) 1677 { 1678 int len; 1679 int i; 1680 1681 for (i = 0; i < ARRAY_SIZE(error->engine); i++) 1682 if (!error->engine[i].context.pid) 1683 engines &= ~BIT(i); 1684 1685 len = scnprintf(error->error_msg, sizeof(error->error_msg), 1686 "GPU HANG: ecode %d:%x:0x%08x", 1687 INTEL_GEN(error->i915), engines, 1688 i915_error_generate_code(error, engines)); 1689 if (engines) { 1690 /* Just show the first executing process, more is confusing */ 1691 i = __ffs(engines); 1692 len += scnprintf(error->error_msg + len, 1693 sizeof(error->error_msg) - len, 1694 ", in %s [%d]", 1695 error->engine[i].context.comm, 1696 error->engine[i].context.pid); 1697 } 1698 if (msg) 1699 len += scnprintf(error->error_msg + len, 1700 sizeof(error->error_msg) - len, 1701 ", %s", msg); 1702 1703 return error->error_msg; 1704 } 1705 1706 static void capture_gen_state(struct i915_gpu_state *error) 1707 { 1708 struct drm_i915_private *i915 = error->i915; 1709 1710 error->awake = i915->gt.awake; 1711 error->wakelock = atomic_read(&i915->runtime_pm.wakeref_count); 1712 error->suspended = i915->runtime_pm.suspended; 1713 1714 error->iommu = -1; 1715 #ifdef CONFIG_INTEL_IOMMU 1716 error->iommu = intel_iommu_gfx_mapped; 1717 #endif 1718 error->reset_count = i915_reset_count(&i915->gpu_error); 1719 error->suspend_count = i915->suspend_count; 1720 1721 memcpy(&error->device_info, 1722 INTEL_INFO(i915), 1723 sizeof(error->device_info)); 1724 memcpy(&error->runtime_info, 1725 RUNTIME_INFO(i915), 1726 sizeof(error->runtime_info)); 1727 error->driver_caps = i915->caps; 1728 } 1729 1730 static void capture_params(struct i915_gpu_state *error) 1731 { 1732 i915_params_copy(&error->params, &i915_modparams); 1733 } 1734 1735 static unsigned long capture_find_epoch(const struct i915_gpu_state *error) 1736 { 1737 unsigned long epoch = error->capture; 1738 int i; 1739 1740 for (i = 0; i < ARRAY_SIZE(error->engine); i++) { 1741 const struct drm_i915_error_engine *ee = &error->engine[i]; 1742 1743 if (ee->hangcheck_timestamp && 1744 time_before(ee->hangcheck_timestamp, epoch)) 1745 epoch = ee->hangcheck_timestamp; 1746 } 1747 1748 return epoch; 1749 } 1750 1751 static void capture_finish(struct i915_gpu_state *error) 1752 { 1753 struct i915_ggtt *ggtt = &error->i915->ggtt; 1754 const u64 slot = ggtt->error_capture.start; 1755 1756 ggtt->vm.clear_range(&ggtt->vm, slot, PAGE_SIZE); 1757 } 1758 1759 static int capture(void *data) 1760 { 1761 struct i915_gpu_state *error = data; 1762 1763 error->time = ktime_get_real(); 1764 error->boottime = ktime_get_boottime(); 1765 error->uptime = ktime_sub(ktime_get(), 1766 error->i915->gt.last_init_time); 1767 error->capture = jiffies; 1768 1769 capture_params(error); 1770 capture_gen_state(error); 1771 capture_uc_state(error); 1772 capture_reg_state(error); 1773 gem_record_fences(error); 1774 gem_record_rings(error); 1775 capture_active_buffers(error); 1776 capture_pinned_buffers(error); 1777 1778 error->overlay = intel_overlay_capture_error_state(error->i915); 1779 error->display = intel_display_capture_error_state(error->i915); 1780 1781 error->epoch = capture_find_epoch(error); 1782 1783 capture_finish(error); 1784 return 0; 1785 } 1786 1787 #define DAY_AS_SECONDS(x) (24 * 60 * 
60 * (x)) 1788 1789 struct i915_gpu_state * 1790 i915_capture_gpu_state(struct drm_i915_private *i915) 1791 { 1792 struct i915_gpu_state *error; 1793 1794 /* Check if GPU capture has been disabled */ 1795 error = READ_ONCE(i915->gpu_error.first_error); 1796 if (IS_ERR(error)) 1797 return error; 1798 1799 error = kzalloc(sizeof(*error), GFP_ATOMIC); 1800 if (!error) { 1801 i915_disable_error_state(i915, -ENOMEM); 1802 return ERR_PTR(-ENOMEM); 1803 } 1804 1805 kref_init(&error->ref); 1806 error->i915 = i915; 1807 1808 stop_machine(capture, error, NULL); 1809 1810 return error; 1811 } 1812 1813 /** 1814 * i915_capture_error_state - capture an error record for later analysis 1815 * @i915: i915 device 1816 * @engine_mask: the mask of engines triggering the hang 1817 * @msg: a message to insert into the error capture header 1818 * 1819 * Should be called when an error is detected (either a hang or an error 1820 * interrupt) to capture error state from the time of the error. Fills 1821 * out a structure which becomes available in debugfs for user level tools 1822 * to pick up. 1823 */ 1824 void i915_capture_error_state(struct drm_i915_private *i915, 1825 intel_engine_mask_t engine_mask, 1826 const char *msg) 1827 { 1828 static bool warned; 1829 struct i915_gpu_state *error; 1830 unsigned long flags; 1831 1832 if (!i915_modparams.error_capture) 1833 return; 1834 1835 if (READ_ONCE(i915->gpu_error.first_error)) 1836 return; 1837 1838 error = i915_capture_gpu_state(i915); 1839 if (IS_ERR(error)) 1840 return; 1841 1842 dev_info(i915->drm.dev, "%s\n", error_msg(error, engine_mask, msg)); 1843 1844 if (!error->simulated) { 1845 spin_lock_irqsave(&i915->gpu_error.lock, flags); 1846 if (!i915->gpu_error.first_error) { 1847 i915->gpu_error.first_error = error; 1848 error = NULL; 1849 } 1850 spin_unlock_irqrestore(&i915->gpu_error.lock, flags); 1851 } 1852 1853 if (error) { 1854 __i915_gpu_state_free(&error->ref); 1855 return; 1856 } 1857 1858 if (!warned && 1859 ktime_get_real_seconds() - DRIVER_TIMESTAMP < DAY_AS_SECONDS(180)) { 1860 DRM_INFO("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n"); 1861 DRM_INFO("Please file a _new_ bug report on bugs.freedesktop.org against DRI -> DRM/Intel\n"); 1862 DRM_INFO("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n"); 1863 DRM_INFO("The gpu crash dump is required to analyze gpu hangs, so please always attach it.\n"); 1864 DRM_INFO("GPU crash dump saved to /sys/class/drm/card%d/error\n", 1865 i915->drm.primary->index); 1866 warned = true; 1867 } 1868 } 1869 1870 struct i915_gpu_state * 1871 i915_first_error_state(struct drm_i915_private *i915) 1872 { 1873 struct i915_gpu_state *error; 1874 1875 spin_lock_irq(&i915->gpu_error.lock); 1876 error = i915->gpu_error.first_error; 1877 if (!IS_ERR_OR_NULL(error)) 1878 i915_gpu_state_get(error); 1879 spin_unlock_irq(&i915->gpu_error.lock); 1880 1881 return error; 1882 } 1883 1884 void i915_reset_error_state(struct drm_i915_private *i915) 1885 { 1886 struct i915_gpu_state *error; 1887 1888 spin_lock_irq(&i915->gpu_error.lock); 1889 error = i915->gpu_error.first_error; 1890 if (error != ERR_PTR(-ENODEV)) /* if disabled, always disabled */ 1891 i915->gpu_error.first_error = NULL; 1892 spin_unlock_irq(&i915->gpu_error.lock); 1893 1894 if (!IS_ERR_OR_NULL(error)) 1895 i915_gpu_state_put(error); 1896 } 1897 1898 void i915_disable_error_state(struct drm_i915_private *i915, int err) 1899 { 1900 spin_lock_irq(&i915->gpu_error.lock); 1901 if 
(!i915->gpu_error.first_error)
		i915->gpu_error.first_error = ERR_PTR(err);
	spin_unlock_irq(&i915->gpu_error.lock);
}