/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>

#include <drm/drm_atomic_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/yellow_carp_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
111 "POLARIS11", 112 "POLARIS12", 113 "VEGAM", 114 "VEGA10", 115 "VEGA12", 116 "VEGA20", 117 "RAVEN", 118 "ARCTURUS", 119 "RENOIR", 120 "ALDEBARAN", 121 "NAVI10", 122 "CYAN_SKILLFISH", 123 "NAVI14", 124 "NAVI12", 125 "SIENNA_CICHLID", 126 "NAVY_FLOUNDER", 127 "VANGOGH", 128 "DIMGREY_CAVEFISH", 129 "BEIGE_GOBY", 130 "YELLOW_CARP", 131 "IP DISCOVERY", 132 "LAST", 133 }; 134 135 /** 136 * DOC: pcie_replay_count 137 * 138 * The amdgpu driver provides a sysfs API for reporting the total number 139 * of PCIe replays (NAKs) 140 * The file pcie_replay_count is used for this and returns the total 141 * number of replays as a sum of the NAKs generated and NAKs received 142 */ 143 144 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 145 struct device_attribute *attr, char *buf) 146 { 147 struct drm_device *ddev = dev_get_drvdata(dev); 148 struct amdgpu_device *adev = drm_to_adev(ddev); 149 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 150 151 return sysfs_emit(buf, "%llu\n", cnt); 152 } 153 154 static DEVICE_ATTR(pcie_replay_count, S_IRUGO, 155 amdgpu_device_get_pcie_replay_count, NULL); 156 157 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 158 159 /** 160 * DOC: product_name 161 * 162 * The amdgpu driver provides a sysfs API for reporting the product name 163 * for the device 164 * The file serial_number is used for this and returns the product name 165 * as returned from the FRU. 166 * NOTE: This is only available for certain server cards 167 */ 168 169 static ssize_t amdgpu_device_get_product_name(struct device *dev, 170 struct device_attribute *attr, char *buf) 171 { 172 struct drm_device *ddev = dev_get_drvdata(dev); 173 struct amdgpu_device *adev = drm_to_adev(ddev); 174 175 return sysfs_emit(buf, "%s\n", adev->product_name); 176 } 177 178 static DEVICE_ATTR(product_name, S_IRUGO, 179 amdgpu_device_get_product_name, NULL); 180 181 /** 182 * DOC: product_number 183 * 184 * The amdgpu driver provides a sysfs API for reporting the part number 185 * for the device 186 * The file serial_number is used for this and returns the part number 187 * as returned from the FRU. 188 * NOTE: This is only available for certain server cards 189 */ 190 191 static ssize_t amdgpu_device_get_product_number(struct device *dev, 192 struct device_attribute *attr, char *buf) 193 { 194 struct drm_device *ddev = dev_get_drvdata(dev); 195 struct amdgpu_device *adev = drm_to_adev(ddev); 196 197 return sysfs_emit(buf, "%s\n", adev->product_number); 198 } 199 200 static DEVICE_ATTR(product_number, S_IRUGO, 201 amdgpu_device_get_product_number, NULL); 202 203 /** 204 * DOC: serial_number 205 * 206 * The amdgpu driver provides a sysfs API for reporting the serial number 207 * for the device 208 * The file serial_number is used for this and returns the serial number 209 * as returned from the FRU. 210 * NOTE: This is only available for certain server cards 211 */ 212 213 static ssize_t amdgpu_device_get_serial_number(struct device *dev, 214 struct device_attribute *attr, char *buf) 215 { 216 struct drm_device *ddev = dev_get_drvdata(dev); 217 struct amdgpu_device *adev = drm_to_adev(ddev); 218 219 return sysfs_emit(buf, "%s\n", adev->serial); 220 } 221 222 static DEVICE_ATTR(serial_number, S_IRUGO, 223 amdgpu_device_get_serial_number, NULL); 224 225 /** 226 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 227 * 228 * @dev: drm_device pointer 229 * 230 * Returns true if the device is a dGPU with ATPX power control, 231 * otherwise return false. 
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise returns false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the buffer behind @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by the vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the buffer behind @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value means how many bytes have been transferred.
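 *
 * Illustrative sketch (not code from this file): a caller is expected to
 * fall back to MM_INDEX/MM_DATA access for whatever the aperture path could
 * not cover, which is what amdgpu_device_vram_access() below does:
 *
 *	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
 *	if (count < size)
 *		amdgpu_device_mm_access(adev, pos + count,
 *					(char *)buf + count,
 *					size - count, write);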
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the buffer behind @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try to use the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_sem))
			up_read(&adev->reset_sem);
		else
			lockdep_assert_held(&adev->reset_sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
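 *
 * Illustrative sketch (not code from this file): most callers reach this
 * helper through register access wrappers such as RREG32() rather than
 * calling it directly, e.g.
 *
 *	u32 val = amdgpu_device_rreg(adev, reg, 0);
 *	u32 raw = amdgpu_device_rreg(adev, reg, AMDGPU_REGS_NO_KIQ);
 *
 * where AMDGPU_REGS_NO_KIQ bypasses the KIQ path used under SR-IOV.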
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */
/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
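 *
 * Illustrative sketch (not code from this file): the write-side counterpart
 * of amdgpu_device_rreg(), typically reached through wrappers such as
 * WREG32():
 *
 *	amdgpu_device_wreg(adev, reg, val, 0);
 *	amdgpu_device_wreg(adev, reg, val, AMDGPU_REGS_NO_KIQ);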
532 */ 533 void amdgpu_device_wreg(struct amdgpu_device *adev, 534 uint32_t reg, uint32_t v, 535 uint32_t acc_flags) 536 { 537 if (amdgpu_device_skip_hw_access(adev)) 538 return; 539 540 if ((reg * 4) < adev->rmmio_size) { 541 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 542 amdgpu_sriov_runtime(adev) && 543 down_read_trylock(&adev->reset_sem)) { 544 amdgpu_kiq_wreg(adev, reg, v); 545 up_read(&adev->reset_sem); 546 } else { 547 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 548 } 549 } else { 550 adev->pcie_wreg(adev, reg * 4, v); 551 } 552 553 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 554 } 555 556 /** 557 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 558 * 559 * @adev: amdgpu_device pointer 560 * @reg: mmio/rlc register 561 * @v: value to write 562 * 563 * this function is invoked only for the debugfs register access 564 */ 565 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 566 uint32_t reg, uint32_t v) 567 { 568 if (amdgpu_device_skip_hw_access(adev)) 569 return; 570 571 if (amdgpu_sriov_fullaccess(adev) && 572 adev->gfx.rlc.funcs && 573 adev->gfx.rlc.funcs->is_rlcg_access_range) { 574 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 575 return amdgpu_sriov_wreg(adev, reg, v, 0, 0); 576 } else if ((reg * 4) >= adev->rmmio_size) { 577 adev->pcie_wreg(adev, reg * 4, v); 578 } else { 579 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 580 } 581 } 582 583 /** 584 * amdgpu_mm_rdoorbell - read a doorbell dword 585 * 586 * @adev: amdgpu_device pointer 587 * @index: doorbell index 588 * 589 * Returns the value in the doorbell aperture at the 590 * requested doorbell index (CIK). 591 */ 592 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) 593 { 594 if (amdgpu_device_skip_hw_access(adev)) 595 return 0; 596 597 if (index < adev->doorbell.num_doorbells) { 598 return readl(adev->doorbell.ptr + index); 599 } else { 600 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 601 return 0; 602 } 603 } 604 605 /** 606 * amdgpu_mm_wdoorbell - write a doorbell dword 607 * 608 * @adev: amdgpu_device pointer 609 * @index: doorbell index 610 * @v: value to write 611 * 612 * Writes @v to the doorbell aperture at the 613 * requested doorbell index (CIK). 614 */ 615 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) 616 { 617 if (amdgpu_device_skip_hw_access(adev)) 618 return; 619 620 if (index < adev->doorbell.num_doorbells) { 621 writel(v, adev->doorbell.ptr + index); 622 } else { 623 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 624 } 625 } 626 627 /** 628 * amdgpu_mm_rdoorbell64 - read a doorbell Qword 629 * 630 * @adev: amdgpu_device pointer 631 * @index: doorbell index 632 * 633 * Returns the value in the doorbell aperture at the 634 * requested doorbell index (VEGA10+). 635 */ 636 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) 637 { 638 if (amdgpu_device_skip_hw_access(adev)) 639 return 0; 640 641 if (index < adev->doorbell.num_doorbells) { 642 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index)); 643 } else { 644 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); 645 return 0; 646 } 647 } 648 649 /** 650 * amdgpu_mm_wdoorbell64 - write a doorbell Qword 651 * 652 * @adev: amdgpu_device pointer 653 * @index: doorbell index 654 * @v: value to write 655 * 656 * Writes @v to the doorbell aperture at the 657 * requested doorbell index (VEGA10+). 
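 *
 * Illustrative sketch (not code from this file): a ring backend on a
 * doorbell-capable ASIC would typically publish its write pointer with
 * something like
 *
 *	amdgpu_mm_wdoorbell64(adev, ring->doorbell_index, ring->wptr);
 *
 * the exact value written is IP specific.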
658 */ 659 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) 660 { 661 if (amdgpu_device_skip_hw_access(adev)) 662 return; 663 664 if (index < adev->doorbell.num_doorbells) { 665 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v); 666 } else { 667 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); 668 } 669 } 670 671 /** 672 * amdgpu_device_indirect_rreg - read an indirect register 673 * 674 * @adev: amdgpu_device pointer 675 * @pcie_index: mmio register offset 676 * @pcie_data: mmio register offset 677 * @reg_addr: indirect register address to read from 678 * 679 * Returns the value of indirect register @reg_addr 680 */ 681 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 682 u32 pcie_index, u32 pcie_data, 683 u32 reg_addr) 684 { 685 unsigned long flags; 686 u32 r; 687 void __iomem *pcie_index_offset; 688 void __iomem *pcie_data_offset; 689 690 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 691 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 692 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 693 694 writel(reg_addr, pcie_index_offset); 695 readl(pcie_index_offset); 696 r = readl(pcie_data_offset); 697 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 698 699 return r; 700 } 701 702 /** 703 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 704 * 705 * @adev: amdgpu_device pointer 706 * @pcie_index: mmio register offset 707 * @pcie_data: mmio register offset 708 * @reg_addr: indirect register address to read from 709 * 710 * Returns the value of indirect register @reg_addr 711 */ 712 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 713 u32 pcie_index, u32 pcie_data, 714 u32 reg_addr) 715 { 716 unsigned long flags; 717 u64 r; 718 void __iomem *pcie_index_offset; 719 void __iomem *pcie_data_offset; 720 721 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 722 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 723 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 724 725 /* read low 32 bits */ 726 writel(reg_addr, pcie_index_offset); 727 readl(pcie_index_offset); 728 r = readl(pcie_data_offset); 729 /* read high 32 bits */ 730 writel(reg_addr + 4, pcie_index_offset); 731 readl(pcie_index_offset); 732 r |= ((u64)readl(pcie_data_offset) << 32); 733 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 734 735 return r; 736 } 737 738 /** 739 * amdgpu_device_indirect_wreg - write an indirect register address 740 * 741 * @adev: amdgpu_device pointer 742 * @pcie_index: mmio register offset 743 * @pcie_data: mmio register offset 744 * @reg_addr: indirect register offset 745 * @reg_data: indirect register data 746 * 747 */ 748 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 749 u32 pcie_index, u32 pcie_data, 750 u32 reg_addr, u32 reg_data) 751 { 752 unsigned long flags; 753 void __iomem *pcie_index_offset; 754 void __iomem *pcie_data_offset; 755 756 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 757 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 758 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 759 760 writel(reg_addr, pcie_index_offset); 761 readl(pcie_index_offset); 762 writel(reg_data, pcie_data_offset); 763 readl(pcie_data_offset); 764 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 765 } 766 767 /** 768 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 769 * 770 * @adev: amdgpu_device pointer 771 * @pcie_index: mmio register offset 772 * @pcie_data: mmio register 
offset 773 * @reg_addr: indirect register offset 774 * @reg_data: indirect register data 775 * 776 */ 777 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 778 u32 pcie_index, u32 pcie_data, 779 u32 reg_addr, u64 reg_data) 780 { 781 unsigned long flags; 782 void __iomem *pcie_index_offset; 783 void __iomem *pcie_data_offset; 784 785 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 786 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 787 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 788 789 /* write low 32 bits */ 790 writel(reg_addr, pcie_index_offset); 791 readl(pcie_index_offset); 792 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 793 readl(pcie_data_offset); 794 /* write high 32 bits */ 795 writel(reg_addr + 4, pcie_index_offset); 796 readl(pcie_index_offset); 797 writel((u32)(reg_data >> 32), pcie_data_offset); 798 readl(pcie_data_offset); 799 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 800 } 801 802 /** 803 * amdgpu_invalid_rreg - dummy reg read function 804 * 805 * @adev: amdgpu_device pointer 806 * @reg: offset of register 807 * 808 * Dummy register read function. Used for register blocks 809 * that certain asics don't have (all asics). 810 * Returns the value in the register. 811 */ 812 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 813 { 814 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 815 BUG(); 816 return 0; 817 } 818 819 /** 820 * amdgpu_invalid_wreg - dummy reg write function 821 * 822 * @adev: amdgpu_device pointer 823 * @reg: offset of register 824 * @v: value to write to the register 825 * 826 * Dummy register read function. Used for register blocks 827 * that certain asics don't have (all asics). 828 */ 829 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 830 { 831 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 832 reg, v); 833 BUG(); 834 } 835 836 /** 837 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 838 * 839 * @adev: amdgpu_device pointer 840 * @reg: offset of register 841 * 842 * Dummy register read function. Used for register blocks 843 * that certain asics don't have (all asics). 844 * Returns the value in the register. 845 */ 846 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 847 { 848 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 849 BUG(); 850 return 0; 851 } 852 853 /** 854 * amdgpu_invalid_wreg64 - dummy reg write function 855 * 856 * @adev: amdgpu_device pointer 857 * @reg: offset of register 858 * @v: value to write to the register 859 * 860 * Dummy register read function. Used for register blocks 861 * that certain asics don't have (all asics). 862 */ 863 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 864 { 865 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 866 reg, v); 867 BUG(); 868 } 869 870 /** 871 * amdgpu_block_invalid_rreg - dummy reg read function 872 * 873 * @adev: amdgpu_device pointer 874 * @block: offset of instance 875 * @reg: offset of register 876 * 877 * Dummy register read function. Used for register blocks 878 * that certain asics don't have (all asics). 879 * Returns the value in the register. 
880 */ 881 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 882 uint32_t block, uint32_t reg) 883 { 884 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 885 reg, block); 886 BUG(); 887 return 0; 888 } 889 890 /** 891 * amdgpu_block_invalid_wreg - dummy reg write function 892 * 893 * @adev: amdgpu_device pointer 894 * @block: offset of instance 895 * @reg: offset of register 896 * @v: value to write to the register 897 * 898 * Dummy register read function. Used for register blocks 899 * that certain asics don't have (all asics). 900 */ 901 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 902 uint32_t block, 903 uint32_t reg, uint32_t v) 904 { 905 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 906 reg, block, v); 907 BUG(); 908 } 909 910 /** 911 * amdgpu_device_asic_init - Wrapper for atom asic_init 912 * 913 * @adev: amdgpu_device pointer 914 * 915 * Does any asic specific work and then calls atom asic init. 916 */ 917 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 918 { 919 amdgpu_asic_pre_asic_init(adev); 920 921 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 922 } 923 924 /** 925 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page 926 * 927 * @adev: amdgpu_device pointer 928 * 929 * Allocates a scratch page of VRAM for use by various things in the 930 * driver. 931 */ 932 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev) 933 { 934 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, 935 PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM, 936 &adev->vram_scratch.robj, 937 &adev->vram_scratch.gpu_addr, 938 (void **)&adev->vram_scratch.ptr); 939 } 940 941 /** 942 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page 943 * 944 * @adev: amdgpu_device pointer 945 * 946 * Frees the VRAM scratch page. 947 */ 948 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev) 949 { 950 amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL); 951 } 952 953 /** 954 * amdgpu_device_program_register_sequence - program an array of registers. 955 * 956 * @adev: amdgpu_device pointer 957 * @registers: pointer to the register array 958 * @array_size: size of the register array 959 * 960 * Programs an array or registers with and and or masks. 961 * This is a helper for setting golden registers. 962 */ 963 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 964 const u32 *registers, 965 const u32 array_size) 966 { 967 u32 tmp, reg, and_mask, or_mask; 968 int i; 969 970 if (array_size % 3) 971 return; 972 973 for (i = 0; i < array_size; i +=3) { 974 reg = registers[i + 0]; 975 and_mask = registers[i + 1]; 976 or_mask = registers[i + 2]; 977 978 if (and_mask == 0xffffffff) { 979 tmp = or_mask; 980 } else { 981 tmp = RREG32(reg); 982 tmp &= ~and_mask; 983 if (adev->family >= AMDGPU_FAMILY_AI) 984 tmp |= (or_mask & and_mask); 985 else 986 tmp |= or_mask; 987 } 988 WREG32(reg, tmp); 989 } 990 } 991 992 /** 993 * amdgpu_device_pci_config_reset - reset the GPU 994 * 995 * @adev: amdgpu_device pointer 996 * 997 * Resets the GPU using the pci config reset sequence. 998 * Only applicable to asics prior to vega10. 
999 */ 1000 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 1001 { 1002 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 1003 } 1004 1005 /** 1006 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 1007 * 1008 * @adev: amdgpu_device pointer 1009 * 1010 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 1011 */ 1012 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 1013 { 1014 return pci_reset_function(adev->pdev); 1015 } 1016 1017 /* 1018 * GPU doorbell aperture helpers function. 1019 */ 1020 /** 1021 * amdgpu_device_doorbell_init - Init doorbell driver information. 1022 * 1023 * @adev: amdgpu_device pointer 1024 * 1025 * Init doorbell driver information (CIK) 1026 * Returns 0 on success, error on failure. 1027 */ 1028 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev) 1029 { 1030 1031 /* No doorbell on SI hardware generation */ 1032 if (adev->asic_type < CHIP_BONAIRE) { 1033 adev->doorbell.base = 0; 1034 adev->doorbell.size = 0; 1035 adev->doorbell.num_doorbells = 0; 1036 adev->doorbell.ptr = NULL; 1037 return 0; 1038 } 1039 1040 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET) 1041 return -EINVAL; 1042 1043 amdgpu_asic_init_doorbell_index(adev); 1044 1045 /* doorbell bar mapping */ 1046 adev->doorbell.base = pci_resource_start(adev->pdev, 2); 1047 adev->doorbell.size = pci_resource_len(adev->pdev, 2); 1048 1049 adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32), 1050 adev->doorbell_index.max_assignment+1); 1051 if (adev->doorbell.num_doorbells == 0) 1052 return -EINVAL; 1053 1054 /* For Vega, reserve and map two pages on doorbell BAR since SDMA 1055 * paging queue doorbell use the second page. The 1056 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the 1057 * doorbells are in the first page. So with paging queue enabled, 1058 * the max num_doorbells should + 1 page (0x400 in dword) 1059 */ 1060 if (adev->asic_type >= CHIP_VEGA10) 1061 adev->doorbell.num_doorbells += 0x400; 1062 1063 adev->doorbell.ptr = ioremap(adev->doorbell.base, 1064 adev->doorbell.num_doorbells * 1065 sizeof(u32)); 1066 if (adev->doorbell.ptr == NULL) 1067 return -ENOMEM; 1068 1069 return 0; 1070 } 1071 1072 /** 1073 * amdgpu_device_doorbell_fini - Tear down doorbell driver information. 1074 * 1075 * @adev: amdgpu_device pointer 1076 * 1077 * Tear down doorbell driver information (CIK) 1078 */ 1079 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev) 1080 { 1081 iounmap(adev->doorbell.ptr); 1082 adev->doorbell.ptr = NULL; 1083 } 1084 1085 1086 1087 /* 1088 * amdgpu_device_wb_*() 1089 * Writeback is the method by which the GPU updates special pages in memory 1090 * with the status of certain GPU events (fences, ring pointers,etc.). 1091 */ 1092 1093 /** 1094 * amdgpu_device_wb_fini - Disable Writeback and free memory 1095 * 1096 * @adev: amdgpu_device pointer 1097 * 1098 * Disables Writeback and frees the Writeback memory (all asics). 1099 * Used at driver shutdown. 1100 */ 1101 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1102 { 1103 if (adev->wb.wb_obj) { 1104 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1105 &adev->wb.gpu_addr, 1106 (void **)&adev->wb.wb); 1107 adev->wb.wb_obj = NULL; 1108 } 1109 } 1110 1111 /** 1112 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory 1113 * 1114 * @adev: amdgpu_device pointer 1115 * 1116 * Initializes writeback and allocates writeback memory (all asics). 1117 * Used at driver startup. 
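 *
 * Illustrative sketch (not code from this file): individual slots are then
 * handed out with amdgpu_device_wb_get() and returned with
 * amdgpu_device_wb_free(), roughly like
 *
 *	u32 index;
 *
 *	if (!amdgpu_device_wb_get(adev, &index)) {
 *		u64 gpu_addr = adev->wb.gpu_addr + index * 4;
 *		u32 value = adev->wb.wb[index];
 *		amdgpu_device_wb_free(adev, index);
 *	}
 *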
1118 * Returns 0 on success or an -error on failure. 1119 */ 1120 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1121 { 1122 int r; 1123 1124 if (adev->wb.wb_obj == NULL) { 1125 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1126 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1127 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1128 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1129 (void **)&adev->wb.wb); 1130 if (r) { 1131 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1132 return r; 1133 } 1134 1135 adev->wb.num_wb = AMDGPU_MAX_WB; 1136 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1137 1138 /* clear wb memory */ 1139 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1140 } 1141 1142 return 0; 1143 } 1144 1145 /** 1146 * amdgpu_device_wb_get - Allocate a wb entry 1147 * 1148 * @adev: amdgpu_device pointer 1149 * @wb: wb index 1150 * 1151 * Allocate a wb slot for use by the driver (all asics). 1152 * Returns 0 on success or -EINVAL on failure. 1153 */ 1154 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1155 { 1156 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1157 1158 if (offset < adev->wb.num_wb) { 1159 __set_bit(offset, adev->wb.used); 1160 *wb = offset << 3; /* convert to dw offset */ 1161 return 0; 1162 } else { 1163 return -EINVAL; 1164 } 1165 } 1166 1167 /** 1168 * amdgpu_device_wb_free - Free a wb entry 1169 * 1170 * @adev: amdgpu_device pointer 1171 * @wb: wb index 1172 * 1173 * Free a wb slot allocated for use by the driver (all asics) 1174 */ 1175 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1176 { 1177 wb >>= 3; 1178 if (wb < adev->wb.num_wb) 1179 __clear_bit(wb, adev->wb.used); 1180 } 1181 1182 /** 1183 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1184 * 1185 * @adev: amdgpu_device pointer 1186 * 1187 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1188 * to fail, but if any of the BARs is not accessible after the size we abort 1189 * driver loading by returning -ENODEV. 1190 */ 1191 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1192 { 1193 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1194 struct pci_bus *root; 1195 struct resource *res; 1196 unsigned i; 1197 u16 cmd; 1198 int r; 1199 1200 /* Bypass for VF */ 1201 if (amdgpu_sriov_vf(adev)) 1202 return 0; 1203 1204 /* skip if the bios has already enabled large BAR */ 1205 if (adev->gmc.real_vram_size && 1206 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1207 return 0; 1208 1209 /* Check if the root BUS has 64bit memory resources */ 1210 root = adev->pdev->bus; 1211 while (root->parent) 1212 root = root->parent; 1213 1214 pci_bus_for_each_resource(root, res, i) { 1215 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1216 res->start > 0x100000000ull) 1217 break; 1218 } 1219 1220 /* Trying to resize is pointless without a root hub window above 4GB */ 1221 if (!res) 1222 return 0; 1223 1224 /* Limit the BAR size to what is available */ 1225 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1226 rbar_size); 1227 1228 /* Disable memory decoding while we change the BAR addresses and size */ 1229 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1230 pci_write_config_word(adev->pdev, PCI_COMMAND, 1231 cmd & ~PCI_COMMAND_MEMORY); 1232 1233 /* Free the VRAM and doorbell BAR, we most likely need to move both. 
*/ 1234 amdgpu_device_doorbell_fini(adev); 1235 if (adev->asic_type >= CHIP_BONAIRE) 1236 pci_release_resource(adev->pdev, 2); 1237 1238 pci_release_resource(adev->pdev, 0); 1239 1240 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1241 if (r == -ENOSPC) 1242 DRM_INFO("Not enough PCI address space for a large BAR."); 1243 else if (r && r != -ENOTSUPP) 1244 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1245 1246 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1247 1248 /* When the doorbell or fb BAR isn't available we have no chance of 1249 * using the device. 1250 */ 1251 r = amdgpu_device_doorbell_init(adev); 1252 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1253 return -ENODEV; 1254 1255 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1256 1257 return 0; 1258 } 1259 1260 /* 1261 * GPU helpers function. 1262 */ 1263 /** 1264 * amdgpu_device_need_post - check if the hw need post or not 1265 * 1266 * @adev: amdgpu_device pointer 1267 * 1268 * Check if the asic has been initialized (all asics) at driver startup 1269 * or post is needed if hw reset is performed. 1270 * Returns true if need or false if not. 1271 */ 1272 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1273 { 1274 uint32_t reg; 1275 1276 if (amdgpu_sriov_vf(adev)) 1277 return false; 1278 1279 if (amdgpu_passthrough(adev)) { 1280 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1281 * some old smc fw still need driver do vPost otherwise gpu hang, while 1282 * those smc fw version above 22.15 doesn't have this flaw, so we force 1283 * vpost executed for smc version below 22.15 1284 */ 1285 if (adev->asic_type == CHIP_FIJI) { 1286 int err; 1287 uint32_t fw_ver; 1288 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1289 /* force vPost if error occured */ 1290 if (err) 1291 return true; 1292 1293 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1294 if (fw_ver < 0x00160e00) 1295 return true; 1296 } 1297 } 1298 1299 /* Don't post if we need to reset whole hive on init */ 1300 if (adev->gmc.xgmi.pending_reset) 1301 return false; 1302 1303 if (adev->has_hw_reset) { 1304 adev->has_hw_reset = false; 1305 return true; 1306 } 1307 1308 /* bios scratch used on CIK+ */ 1309 if (adev->asic_type >= CHIP_BONAIRE) 1310 return amdgpu_atombios_scratch_need_asic_init(adev); 1311 1312 /* check MEM_SIZE for older asics */ 1313 reg = amdgpu_asic_get_config_memsize(adev); 1314 1315 if ((reg != 0) && (reg != 0xffffffff)) 1316 return false; 1317 1318 return true; 1319 } 1320 1321 /** 1322 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1323 * 1324 * @adev: amdgpu_device pointer 1325 * 1326 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1327 * be set for this device. 1328 * 1329 * Returns true if it should be used or false if not. 1330 */ 1331 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1332 { 1333 switch (amdgpu_aspm) { 1334 case -1: 1335 break; 1336 case 0: 1337 return false; 1338 case 1: 1339 return true; 1340 default: 1341 return false; 1342 } 1343 return pcie_aspm_enabled(adev->pdev); 1344 } 1345 1346 /* if we get transitioned to only one device, take VGA back */ 1347 /** 1348 * amdgpu_device_vga_set_decode - enable/disable vga decode 1349 * 1350 * @pdev: PCI device pointer 1351 * @state: enable/disable vga decode 1352 * 1353 * Enable/disable vga decode (all asics). 1354 * Returns VGA resource flags. 
1355 */ 1356 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1357 bool state) 1358 { 1359 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1360 amdgpu_asic_set_vga_state(adev, state); 1361 if (state) 1362 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1363 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1364 else 1365 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1366 } 1367 1368 /** 1369 * amdgpu_device_check_block_size - validate the vm block size 1370 * 1371 * @adev: amdgpu_device pointer 1372 * 1373 * Validates the vm block size specified via module parameter. 1374 * The vm block size defines number of bits in page table versus page directory, 1375 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1376 * page table and the remaining bits are in the page directory. 1377 */ 1378 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1379 { 1380 /* defines number of bits in page table versus page directory, 1381 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1382 * page table and the remaining bits are in the page directory */ 1383 if (amdgpu_vm_block_size == -1) 1384 return; 1385 1386 if (amdgpu_vm_block_size < 9) { 1387 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1388 amdgpu_vm_block_size); 1389 amdgpu_vm_block_size = -1; 1390 } 1391 } 1392 1393 /** 1394 * amdgpu_device_check_vm_size - validate the vm size 1395 * 1396 * @adev: amdgpu_device pointer 1397 * 1398 * Validates the vm size in GB specified via module parameter. 1399 * The VM size is the size of the GPU virtual memory space in GB. 1400 */ 1401 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1402 { 1403 /* no need to check the default value */ 1404 if (amdgpu_vm_size == -1) 1405 return; 1406 1407 if (amdgpu_vm_size < 1) { 1408 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1409 amdgpu_vm_size); 1410 amdgpu_vm_size = -1; 1411 } 1412 } 1413 1414 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1415 { 1416 struct sysinfo si; 1417 bool is_os_64 = (sizeof(void *) == 8); 1418 uint64_t total_memory; 1419 uint64_t dram_size_seven_GB = 0x1B8000000; 1420 uint64_t dram_size_three_GB = 0xB8000000; 1421 1422 if (amdgpu_smu_memory_pool_size == 0) 1423 return; 1424 1425 if (!is_os_64) { 1426 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1427 goto def_value; 1428 } 1429 si_meminfo(&si); 1430 total_memory = (uint64_t)si.totalram * si.mem_unit; 1431 1432 if ((amdgpu_smu_memory_pool_size == 1) || 1433 (amdgpu_smu_memory_pool_size == 2)) { 1434 if (total_memory < dram_size_three_GB) 1435 goto def_value1; 1436 } else if ((amdgpu_smu_memory_pool_size == 4) || 1437 (amdgpu_smu_memory_pool_size == 8)) { 1438 if (total_memory < dram_size_seven_GB) 1439 goto def_value1; 1440 } else { 1441 DRM_WARN("Smu memory pool size not supported\n"); 1442 goto def_value; 1443 } 1444 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1445 1446 return; 1447 1448 def_value1: 1449 DRM_WARN("No enough system memory\n"); 1450 def_value: 1451 adev->pm.smu_prv_buffer_size = 0; 1452 } 1453 1454 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 1455 { 1456 if (!(adev->flags & AMD_IS_APU) || 1457 adev->asic_type < CHIP_RAVEN) 1458 return 0; 1459 1460 switch (adev->asic_type) { 1461 case CHIP_RAVEN: 1462 if (adev->pdev->device == 0x15dd) 1463 adev->apu_flags |= AMD_APU_IS_RAVEN; 1464 if (adev->pdev->device == 0x15d8) 1465 adev->apu_flags |= AMD_APU_IS_PICASSO; 1466 break; 1467 
case CHIP_RENOIR: 1468 if ((adev->pdev->device == 0x1636) || 1469 (adev->pdev->device == 0x164c)) 1470 adev->apu_flags |= AMD_APU_IS_RENOIR; 1471 else 1472 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 1473 break; 1474 case CHIP_VANGOGH: 1475 adev->apu_flags |= AMD_APU_IS_VANGOGH; 1476 break; 1477 case CHIP_YELLOW_CARP: 1478 break; 1479 case CHIP_CYAN_SKILLFISH: 1480 if ((adev->pdev->device == 0x13FE) || 1481 (adev->pdev->device == 0x143F)) 1482 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 1483 break; 1484 default: 1485 break; 1486 } 1487 1488 return 0; 1489 } 1490 1491 /** 1492 * amdgpu_device_check_arguments - validate module params 1493 * 1494 * @adev: amdgpu_device pointer 1495 * 1496 * Validates certain module parameters and updates 1497 * the associated values used by the driver (all asics). 1498 */ 1499 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1500 { 1501 if (amdgpu_sched_jobs < 4) { 1502 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1503 amdgpu_sched_jobs); 1504 amdgpu_sched_jobs = 4; 1505 } else if (!is_power_of_2(amdgpu_sched_jobs)){ 1506 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1507 amdgpu_sched_jobs); 1508 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1509 } 1510 1511 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1512 /* gart size must be greater or equal to 32M */ 1513 dev_warn(adev->dev, "gart size (%d) too small\n", 1514 amdgpu_gart_size); 1515 amdgpu_gart_size = -1; 1516 } 1517 1518 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1519 /* gtt size must be greater or equal to 32M */ 1520 dev_warn(adev->dev, "gtt size (%d) too small\n", 1521 amdgpu_gtt_size); 1522 amdgpu_gtt_size = -1; 1523 } 1524 1525 /* valid range is between 4 and 9 inclusive */ 1526 if (amdgpu_vm_fragment_size != -1 && 1527 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1528 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1529 amdgpu_vm_fragment_size = -1; 1530 } 1531 1532 if (amdgpu_sched_hw_submission < 2) { 1533 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1534 amdgpu_sched_hw_submission); 1535 amdgpu_sched_hw_submission = 2; 1536 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1537 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1538 amdgpu_sched_hw_submission); 1539 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1540 } 1541 1542 amdgpu_device_check_smu_prv_buffer_size(adev); 1543 1544 amdgpu_device_check_vm_size(adev); 1545 1546 amdgpu_device_check_block_size(adev); 1547 1548 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1549 1550 amdgpu_gmc_tmz_set(adev); 1551 1552 amdgpu_gmc_noretry_set(adev); 1553 1554 return 0; 1555 } 1556 1557 /** 1558 * amdgpu_switcheroo_set_state - set switcheroo state 1559 * 1560 * @pdev: pci dev pointer 1561 * @state: vga_switcheroo state 1562 * 1563 * Callback for the switcheroo driver. Suspends or resumes the 1564 * the asics before or after it is powered up using ACPI methods. 
1565 */ 1566 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 1567 enum vga_switcheroo_state state) 1568 { 1569 struct drm_device *dev = pci_get_drvdata(pdev); 1570 int r; 1571 1572 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 1573 return; 1574 1575 if (state == VGA_SWITCHEROO_ON) { 1576 pr_info("switched on\n"); 1577 /* don't suspend or resume card normally */ 1578 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1579 1580 pci_set_power_state(pdev, PCI_D0); 1581 amdgpu_device_load_pci_state(pdev); 1582 r = pci_enable_device(pdev); 1583 if (r) 1584 DRM_WARN("pci_enable_device failed (%d)\n", r); 1585 amdgpu_device_resume(dev, true); 1586 1587 dev->switch_power_state = DRM_SWITCH_POWER_ON; 1588 } else { 1589 pr_info("switched off\n"); 1590 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1591 amdgpu_device_suspend(dev, true); 1592 amdgpu_device_cache_pci_state(pdev); 1593 /* Shut down the device */ 1594 pci_disable_device(pdev); 1595 pci_set_power_state(pdev, PCI_D3cold); 1596 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 1597 } 1598 } 1599 1600 /** 1601 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 1602 * 1603 * @pdev: pci dev pointer 1604 * 1605 * Callback for the switcheroo driver. Check of the switcheroo 1606 * state can be changed. 1607 * Returns true if the state can be changed, false if not. 1608 */ 1609 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 1610 { 1611 struct drm_device *dev = pci_get_drvdata(pdev); 1612 1613 /* 1614 * FIXME: open_count is protected by drm_global_mutex but that would lead to 1615 * locking inversion with the driver load path. And the access here is 1616 * completely racy anyway. So don't bother with locking for now. 1617 */ 1618 return atomic_read(&dev->open_count) == 0; 1619 } 1620 1621 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 1622 .set_gpu_state = amdgpu_switcheroo_set_state, 1623 .reprobe = NULL, 1624 .can_switch = amdgpu_switcheroo_can_switch, 1625 }; 1626 1627 /** 1628 * amdgpu_device_ip_set_clockgating_state - set the CG state 1629 * 1630 * @dev: amdgpu_device pointer 1631 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1632 * @state: clockgating state (gate or ungate) 1633 * 1634 * Sets the requested clockgating state for all instances of 1635 * the hardware IP specified. 1636 * Returns the error code from the last instance. 1637 */ 1638 int amdgpu_device_ip_set_clockgating_state(void *dev, 1639 enum amd_ip_block_type block_type, 1640 enum amd_clockgating_state state) 1641 { 1642 struct amdgpu_device *adev = dev; 1643 int i, r = 0; 1644 1645 for (i = 0; i < adev->num_ip_blocks; i++) { 1646 if (!adev->ip_blocks[i].status.valid) 1647 continue; 1648 if (adev->ip_blocks[i].version->type != block_type) 1649 continue; 1650 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 1651 continue; 1652 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 1653 (void *)adev, state); 1654 if (r) 1655 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 1656 adev->ip_blocks[i].version->funcs->name, r); 1657 } 1658 return r; 1659 } 1660 1661 /** 1662 * amdgpu_device_ip_set_powergating_state - set the PG state 1663 * 1664 * @dev: amdgpu_device pointer 1665 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1666 * @state: powergating state (gate or ungate) 1667 * 1668 * Sets the requested powergating state for all instances of 1669 * the hardware IP specified. 1670 * Returns the error code from the last instance. 
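 *
 * Illustrative sketch (not code from this file): IP code gates or ungates a
 * block with something like
 *
 *	amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCN,
 *					       AMD_PG_STATE_GATE);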
1671 */ 1672 int amdgpu_device_ip_set_powergating_state(void *dev, 1673 enum amd_ip_block_type block_type, 1674 enum amd_powergating_state state) 1675 { 1676 struct amdgpu_device *adev = dev; 1677 int i, r = 0; 1678 1679 for (i = 0; i < adev->num_ip_blocks; i++) { 1680 if (!adev->ip_blocks[i].status.valid) 1681 continue; 1682 if (adev->ip_blocks[i].version->type != block_type) 1683 continue; 1684 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 1685 continue; 1686 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 1687 (void *)adev, state); 1688 if (r) 1689 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 1690 adev->ip_blocks[i].version->funcs->name, r); 1691 } 1692 return r; 1693 } 1694 1695 /** 1696 * amdgpu_device_ip_get_clockgating_state - get the CG state 1697 * 1698 * @adev: amdgpu_device pointer 1699 * @flags: clockgating feature flags 1700 * 1701 * Walks the list of IPs on the device and updates the clockgating 1702 * flags for each IP. 1703 * Updates @flags with the feature flags for each hardware IP where 1704 * clockgating is enabled. 1705 */ 1706 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 1707 u32 *flags) 1708 { 1709 int i; 1710 1711 for (i = 0; i < adev->num_ip_blocks; i++) { 1712 if (!adev->ip_blocks[i].status.valid) 1713 continue; 1714 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 1715 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 1716 } 1717 } 1718 1719 /** 1720 * amdgpu_device_ip_wait_for_idle - wait for idle 1721 * 1722 * @adev: amdgpu_device pointer 1723 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1724 * 1725 * Waits for the request hardware IP to be idle. 1726 * Returns 0 for success or a negative error code on failure. 1727 */ 1728 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 1729 enum amd_ip_block_type block_type) 1730 { 1731 int i, r; 1732 1733 for (i = 0; i < adev->num_ip_blocks; i++) { 1734 if (!adev->ip_blocks[i].status.valid) 1735 continue; 1736 if (adev->ip_blocks[i].version->type == block_type) { 1737 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev); 1738 if (r) 1739 return r; 1740 break; 1741 } 1742 } 1743 return 0; 1744 1745 } 1746 1747 /** 1748 * amdgpu_device_ip_is_idle - is the hardware IP idle 1749 * 1750 * @adev: amdgpu_device pointer 1751 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1752 * 1753 * Check if the hardware IP is idle or not. 1754 * Returns true if it the IP is idle, false if not. 1755 */ 1756 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev, 1757 enum amd_ip_block_type block_type) 1758 { 1759 int i; 1760 1761 for (i = 0; i < adev->num_ip_blocks; i++) { 1762 if (!adev->ip_blocks[i].status.valid) 1763 continue; 1764 if (adev->ip_blocks[i].version->type == block_type) 1765 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev); 1766 } 1767 return true; 1768 1769 } 1770 1771 /** 1772 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 1773 * 1774 * @adev: amdgpu_device pointer 1775 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 1776 * 1777 * Returns a pointer to the hardware IP block structure 1778 * if it exists for the asic, otherwise NULL. 
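 *
 * Illustrative sketch (not code from this file): looking up a block and
 * checking its version, e.g.
 *
 *	struct amdgpu_ip_block *ip =
 *		amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *
 *	if (ip)
 *		DRM_INFO("GFX IP v%d.%d\n", ip->version->major,
 *			 ip->version->minor);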
1779 */ 1780 struct amdgpu_ip_block * 1781 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 1782 enum amd_ip_block_type type) 1783 { 1784 int i; 1785 1786 for (i = 0; i < adev->num_ip_blocks; i++) 1787 if (adev->ip_blocks[i].version->type == type) 1788 return &adev->ip_blocks[i]; 1789 1790 return NULL; 1791 } 1792 1793 /** 1794 * amdgpu_device_ip_block_version_cmp 1795 * 1796 * @adev: amdgpu_device pointer 1797 * @type: enum amd_ip_block_type 1798 * @major: major version 1799 * @minor: minor version 1800 * 1801 * return 0 if equal or greater 1802 * return 1 if smaller or the ip_block doesn't exist 1803 */ 1804 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 1805 enum amd_ip_block_type type, 1806 u32 major, u32 minor) 1807 { 1808 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 1809 1810 if (ip_block && ((ip_block->version->major > major) || 1811 ((ip_block->version->major == major) && 1812 (ip_block->version->minor >= minor)))) 1813 return 0; 1814 1815 return 1; 1816 } 1817 1818 /** 1819 * amdgpu_device_ip_block_add 1820 * 1821 * @adev: amdgpu_device pointer 1822 * @ip_block_version: pointer to the IP to add 1823 * 1824 * Adds the IP block driver information to the collection of IPs 1825 * on the asic. 1826 */ 1827 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 1828 const struct amdgpu_ip_block_version *ip_block_version) 1829 { 1830 if (!ip_block_version) 1831 return -EINVAL; 1832 1833 switch (ip_block_version->type) { 1834 case AMD_IP_BLOCK_TYPE_VCN: 1835 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 1836 return 0; 1837 break; 1838 case AMD_IP_BLOCK_TYPE_JPEG: 1839 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 1840 return 0; 1841 break; 1842 default: 1843 break; 1844 } 1845 1846 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, 1847 ip_block_version->funcs->name); 1848 1849 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 1850 1851 return 0; 1852 } 1853 1854 /** 1855 * amdgpu_device_enable_virtual_display - enable virtual display feature 1856 * 1857 * @adev: amdgpu_device pointer 1858 * 1859 * Enabled the virtual display feature if the user has enabled it via 1860 * the module parameter virtual_display. This feature provides a virtual 1861 * display hardware on headless boards or in virtualized environments. 1862 * This function parses and validates the configuration string specified by 1863 * the user and configues the virtual display configuration (number of 1864 * virtual connectors, crtcs, etc.) specified. 
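 *
 * Illustrative sketch (the PCI address below is only a placeholder): the
 * string is a semicolon separated list of entries of the form
 * "<pci address>[,<number of crtcs>]", or "all" to match every device, e.g.
 *
 *	modprobe amdgpu virtual_display=0000:03:00.0,2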
1865 */ 1866 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 1867 { 1868 adev->enable_virtual_display = false; 1869 1870 if (amdgpu_virtual_display) { 1871 const char *pci_address_name = pci_name(adev->pdev); 1872 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 1873 1874 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 1875 pciaddstr_tmp = pciaddstr; 1876 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 1877 pciaddname = strsep(&pciaddname_tmp, ","); 1878 if (!strcmp("all", pciaddname) 1879 || !strcmp(pci_address_name, pciaddname)) { 1880 long num_crtc; 1881 int res = -1; 1882 1883 adev->enable_virtual_display = true; 1884 1885 if (pciaddname_tmp) 1886 res = kstrtol(pciaddname_tmp, 10, 1887 &num_crtc); 1888 1889 if (!res) { 1890 if (num_crtc < 1) 1891 num_crtc = 1; 1892 if (num_crtc > 6) 1893 num_crtc = 6; 1894 adev->mode_info.num_crtc = num_crtc; 1895 } else { 1896 adev->mode_info.num_crtc = 1; 1897 } 1898 break; 1899 } 1900 } 1901 1902 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 1903 amdgpu_virtual_display, pci_address_name, 1904 adev->enable_virtual_display, adev->mode_info.num_crtc); 1905 1906 kfree(pciaddstr); 1907 } 1908 } 1909 1910 /** 1911 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 1912 * 1913 * @adev: amdgpu_device pointer 1914 * 1915 * Parses the asic configuration parameters specified in the gpu info 1916 * firmware and makes them availale to the driver for use in configuring 1917 * the asic. 1918 * Returns 0 on success, -EINVAL on failure. 1919 */ 1920 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 1921 { 1922 const char *chip_name; 1923 char fw_name[40]; 1924 int err; 1925 const struct gpu_info_firmware_header_v1_0 *hdr; 1926 1927 adev->firmware.gpu_info_fw = NULL; 1928 1929 if (adev->mman.discovery_bin) { 1930 amdgpu_discovery_get_gfx_info(adev); 1931 1932 /* 1933 * FIXME: The bounding box is still needed by Navi12, so 1934 * temporarily read it from gpu_info firmware. Should be droped 1935 * when DAL no longer needs it. 
1936 */ 1937 if (adev->asic_type != CHIP_NAVI12) 1938 return 0; 1939 } 1940 1941 switch (adev->asic_type) { 1942 #ifdef CONFIG_DRM_AMDGPU_SI 1943 case CHIP_VERDE: 1944 case CHIP_TAHITI: 1945 case CHIP_PITCAIRN: 1946 case CHIP_OLAND: 1947 case CHIP_HAINAN: 1948 #endif 1949 #ifdef CONFIG_DRM_AMDGPU_CIK 1950 case CHIP_BONAIRE: 1951 case CHIP_HAWAII: 1952 case CHIP_KAVERI: 1953 case CHIP_KABINI: 1954 case CHIP_MULLINS: 1955 #endif 1956 case CHIP_TOPAZ: 1957 case CHIP_TONGA: 1958 case CHIP_FIJI: 1959 case CHIP_POLARIS10: 1960 case CHIP_POLARIS11: 1961 case CHIP_POLARIS12: 1962 case CHIP_VEGAM: 1963 case CHIP_CARRIZO: 1964 case CHIP_STONEY: 1965 case CHIP_VEGA20: 1966 case CHIP_ALDEBARAN: 1967 case CHIP_SIENNA_CICHLID: 1968 case CHIP_NAVY_FLOUNDER: 1969 case CHIP_DIMGREY_CAVEFISH: 1970 case CHIP_BEIGE_GOBY: 1971 default: 1972 return 0; 1973 case CHIP_VEGA10: 1974 chip_name = "vega10"; 1975 break; 1976 case CHIP_VEGA12: 1977 chip_name = "vega12"; 1978 break; 1979 case CHIP_RAVEN: 1980 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 1981 chip_name = "raven2"; 1982 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 1983 chip_name = "picasso"; 1984 else 1985 chip_name = "raven"; 1986 break; 1987 case CHIP_ARCTURUS: 1988 chip_name = "arcturus"; 1989 break; 1990 case CHIP_RENOIR: 1991 if (adev->apu_flags & AMD_APU_IS_RENOIR) 1992 chip_name = "renoir"; 1993 else 1994 chip_name = "green_sardine"; 1995 break; 1996 case CHIP_NAVI10: 1997 chip_name = "navi10"; 1998 break; 1999 case CHIP_NAVI14: 2000 chip_name = "navi14"; 2001 break; 2002 case CHIP_NAVI12: 2003 chip_name = "navi12"; 2004 break; 2005 case CHIP_VANGOGH: 2006 chip_name = "vangogh"; 2007 break; 2008 case CHIP_YELLOW_CARP: 2009 chip_name = "yellow_carp"; 2010 break; 2011 } 2012 2013 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 2014 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev); 2015 if (err) { 2016 dev_err(adev->dev, 2017 "Failed to load gpu_info firmware \"%s\"\n", 2018 fw_name); 2019 goto out; 2020 } 2021 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw); 2022 if (err) { 2023 dev_err(adev->dev, 2024 "Failed to validate gpu_info firmware \"%s\"\n", 2025 fw_name); 2026 goto out; 2027 } 2028 2029 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2030 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2031 2032 switch (hdr->version_major) { 2033 case 1: 2034 { 2035 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2036 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2037 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2038 2039 /* 2040 * Should be droped when DAL no longer needs it. 
2041 */ 2042 if (adev->asic_type == CHIP_NAVI12) 2043 goto parse_soc_bounding_box; 2044 2045 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2046 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2047 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2048 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2049 adev->gfx.config.max_texture_channel_caches = 2050 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2051 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2052 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2053 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2054 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2055 adev->gfx.config.double_offchip_lds_buf = 2056 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2057 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2058 adev->gfx.cu_info.max_waves_per_simd = 2059 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2060 adev->gfx.cu_info.max_scratch_slots_per_cu = 2061 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2062 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2063 if (hdr->version_minor >= 1) { 2064 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2065 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2066 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2067 adev->gfx.config.num_sc_per_sh = 2068 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2069 adev->gfx.config.num_packer_per_sc = 2070 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2071 } 2072 2073 parse_soc_bounding_box: 2074 /* 2075 * soc bounding box info is not integrated in disocovery table, 2076 * we always need to parse it from gpu info firmware if needed. 2077 */ 2078 if (hdr->version_minor == 2) { 2079 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2080 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2081 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2082 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2083 } 2084 break; 2085 } 2086 default: 2087 dev_err(adev->dev, 2088 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2089 err = -EINVAL; 2090 goto out; 2091 } 2092 out: 2093 return err; 2094 } 2095 2096 /** 2097 * amdgpu_device_ip_early_init - run early init for hardware IPs 2098 * 2099 * @adev: amdgpu_device pointer 2100 * 2101 * Early initialization pass for hardware IPs. The hardware IPs that make 2102 * up each asic are discovered each IP's early_init callback is run. This 2103 * is the first stage in initializing the asic. 2104 * Returns 0 on success, negative error code on failure. 
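 *
 * Note: this is only the first of the init passes; sw_init/hw_init run later
 * from amdgpu_device_ip_init() and late_init from amdgpu_device_ip_late_init().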
2105 */ 2106 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2107 { 2108 struct drm_device *dev = adev_to_drm(adev); 2109 struct pci_dev *parent; 2110 int i, r; 2111 2112 amdgpu_device_enable_virtual_display(adev); 2113 2114 if (amdgpu_sriov_vf(adev)) { 2115 r = amdgpu_virt_request_full_gpu(adev, true); 2116 if (r) 2117 return r; 2118 } 2119 2120 switch (adev->asic_type) { 2121 #ifdef CONFIG_DRM_AMDGPU_SI 2122 case CHIP_VERDE: 2123 case CHIP_TAHITI: 2124 case CHIP_PITCAIRN: 2125 case CHIP_OLAND: 2126 case CHIP_HAINAN: 2127 adev->family = AMDGPU_FAMILY_SI; 2128 r = si_set_ip_blocks(adev); 2129 if (r) 2130 return r; 2131 break; 2132 #endif 2133 #ifdef CONFIG_DRM_AMDGPU_CIK 2134 case CHIP_BONAIRE: 2135 case CHIP_HAWAII: 2136 case CHIP_KAVERI: 2137 case CHIP_KABINI: 2138 case CHIP_MULLINS: 2139 if (adev->flags & AMD_IS_APU) 2140 adev->family = AMDGPU_FAMILY_KV; 2141 else 2142 adev->family = AMDGPU_FAMILY_CI; 2143 2144 r = cik_set_ip_blocks(adev); 2145 if (r) 2146 return r; 2147 break; 2148 #endif 2149 case CHIP_TOPAZ: 2150 case CHIP_TONGA: 2151 case CHIP_FIJI: 2152 case CHIP_POLARIS10: 2153 case CHIP_POLARIS11: 2154 case CHIP_POLARIS12: 2155 case CHIP_VEGAM: 2156 case CHIP_CARRIZO: 2157 case CHIP_STONEY: 2158 if (adev->flags & AMD_IS_APU) 2159 adev->family = AMDGPU_FAMILY_CZ; 2160 else 2161 adev->family = AMDGPU_FAMILY_VI; 2162 2163 r = vi_set_ip_blocks(adev); 2164 if (r) 2165 return r; 2166 break; 2167 default: 2168 r = amdgpu_discovery_set_ip_blocks(adev); 2169 if (r) 2170 return r; 2171 break; 2172 } 2173 2174 if (amdgpu_has_atpx() && 2175 (amdgpu_is_atpx_hybrid() || 2176 amdgpu_has_atpx_dgpu_power_cntl()) && 2177 ((adev->flags & AMD_IS_APU) == 0) && 2178 !pci_is_thunderbolt_attached(to_pci_dev(dev->dev))) 2179 adev->flags |= AMD_IS_PX; 2180 2181 parent = pci_upstream_bridge(adev->pdev); 2182 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2183 2184 amdgpu_amdkfd_device_probe(adev); 2185 2186 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2187 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2188 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2189 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2190 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2191 2192 for (i = 0; i < adev->num_ip_blocks; i++) { 2193 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2194 DRM_ERROR("disabled ip block: %d <%s>\n", 2195 i, adev->ip_blocks[i].version->funcs->name); 2196 adev->ip_blocks[i].status.valid = false; 2197 } else { 2198 if (adev->ip_blocks[i].version->funcs->early_init) { 2199 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2200 if (r == -ENOENT) { 2201 adev->ip_blocks[i].status.valid = false; 2202 } else if (r) { 2203 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2204 adev->ip_blocks[i].version->funcs->name, r); 2205 return r; 2206 } else { 2207 adev->ip_blocks[i].status.valid = true; 2208 } 2209 } else { 2210 adev->ip_blocks[i].status.valid = true; 2211 } 2212 } 2213 /* get the vbios after the asic_funcs are set up */ 2214 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2215 r = amdgpu_device_parse_gpu_info_fw(adev); 2216 if (r) 2217 return r; 2218 2219 /* Read BIOS */ 2220 if (!amdgpu_get_bios(adev)) 2221 return -EINVAL; 2222 2223 r = amdgpu_atombios_init(adev); 2224 if (r) { 2225 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2226 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2227 return r; 2228 } 2229 2230 /*get pf2vf msg info at it's earliest time*/ 2231 if (amdgpu_sriov_vf(adev)) 2232 amdgpu_virt_init_data_exchange(adev); 2233 2234 } 2235 } 2236 2237 adev->cg_flags &= amdgpu_cg_mask; 2238 adev->pg_flags &= amdgpu_pg_mask; 2239 2240 return 0; 2241 } 2242 2243 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2244 { 2245 int i, r; 2246 2247 for (i = 0; i < adev->num_ip_blocks; i++) { 2248 if (!adev->ip_blocks[i].status.sw) 2249 continue; 2250 if (adev->ip_blocks[i].status.hw) 2251 continue; 2252 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2253 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2254 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2255 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2256 if (r) { 2257 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2258 adev->ip_blocks[i].version->funcs->name, r); 2259 return r; 2260 } 2261 adev->ip_blocks[i].status.hw = true; 2262 } 2263 } 2264 2265 return 0; 2266 } 2267 2268 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2269 { 2270 int i, r; 2271 2272 for (i = 0; i < adev->num_ip_blocks; i++) { 2273 if (!adev->ip_blocks[i].status.sw) 2274 continue; 2275 if (adev->ip_blocks[i].status.hw) 2276 continue; 2277 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2278 if (r) { 2279 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2280 adev->ip_blocks[i].version->funcs->name, r); 2281 return r; 2282 } 2283 adev->ip_blocks[i].status.hw = true; 2284 } 2285 2286 return 0; 2287 } 2288 2289 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2290 { 2291 int r = 0; 2292 int i; 2293 uint32_t smu_version; 2294 2295 if (adev->asic_type >= CHIP_VEGA10) { 2296 for (i = 0; i < adev->num_ip_blocks; i++) { 2297 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2298 continue; 2299 2300 if 
(!adev->ip_blocks[i].status.sw) 2301 continue; 2302 2303 /* no need to do the fw loading again if already done*/ 2304 if (adev->ip_blocks[i].status.hw == true) 2305 break; 2306 2307 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2308 r = adev->ip_blocks[i].version->funcs->resume(adev); 2309 if (r) { 2310 DRM_ERROR("resume of IP block <%s> failed %d\n", 2311 adev->ip_blocks[i].version->funcs->name, r); 2312 return r; 2313 } 2314 } else { 2315 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2316 if (r) { 2317 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2318 adev->ip_blocks[i].version->funcs->name, r); 2319 return r; 2320 } 2321 } 2322 2323 adev->ip_blocks[i].status.hw = true; 2324 break; 2325 } 2326 } 2327 2328 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2329 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2330 2331 return r; 2332 } 2333 2334 /** 2335 * amdgpu_device_ip_init - run init for hardware IPs 2336 * 2337 * @adev: amdgpu_device pointer 2338 * 2339 * Main initialization pass for hardware IPs. The list of all the hardware 2340 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2341 * are run. sw_init initializes the software state associated with each IP 2342 * and hw_init initializes the hardware associated with each IP. 2343 * Returns 0 on success, negative error code on failure. 2344 */ 2345 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2346 { 2347 int i, r; 2348 2349 r = amdgpu_ras_init(adev); 2350 if (r) 2351 return r; 2352 2353 for (i = 0; i < adev->num_ip_blocks; i++) { 2354 if (!adev->ip_blocks[i].status.valid) 2355 continue; 2356 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2357 if (r) { 2358 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2359 adev->ip_blocks[i].version->funcs->name, r); 2360 goto init_failed; 2361 } 2362 adev->ip_blocks[i].status.sw = true; 2363 2364 /* need to do gmc hw init early so we can allocate gpu mem */ 2365 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2366 /* Try to reserve bad pages early */ 2367 if (amdgpu_sriov_vf(adev)) 2368 amdgpu_virt_exchange_data(adev); 2369 2370 r = amdgpu_device_vram_scratch_init(adev); 2371 if (r) { 2372 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); 2373 goto init_failed; 2374 } 2375 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2376 if (r) { 2377 DRM_ERROR("hw_init %d failed %d\n", i, r); 2378 goto init_failed; 2379 } 2380 r = amdgpu_device_wb_init(adev); 2381 if (r) { 2382 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2383 goto init_failed; 2384 } 2385 adev->ip_blocks[i].status.hw = true; 2386 2387 /* right after GMC hw init, we create CSA */ 2388 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) { 2389 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2390 AMDGPU_GEM_DOMAIN_VRAM, 2391 AMDGPU_CSA_SIZE); 2392 if (r) { 2393 DRM_ERROR("allocate CSA failed %d\n", r); 2394 goto init_failed; 2395 } 2396 } 2397 } 2398 } 2399 2400 if (amdgpu_sriov_vf(adev)) 2401 amdgpu_virt_init_data_exchange(adev); 2402 2403 r = amdgpu_ib_pool_init(adev); 2404 if (r) { 2405 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2406 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2407 goto init_failed; 2408 } 2409 2410 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2411 if (r) 2412 goto init_failed; 2413 2414 r = amdgpu_device_ip_hw_init_phase1(adev); 2415 if (r) 2416 goto init_failed; 2417 2418 r = amdgpu_device_fw_loading(adev); 2419 if (r) 2420 goto 
init_failed; 2421 2422 r = amdgpu_device_ip_hw_init_phase2(adev); 2423 if (r) 2424 goto init_failed; 2425 2426 /* 2427 * retired pages will be loaded from eeprom and reserved here, 2428 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2429 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2430 * for I2C communication, which is only true at this point. 2431 * 2432 * amdgpu_ras_recovery_init may fail, but the upper layer only cares about 2433 * failures caused by a bad gpu state and stops the amdgpu init process 2434 * accordingly. For other failures, it still releases all 2435 * the resources and prints an error message, rather than returning a 2436 * negative value to the upper level. 2437 * 2438 * Note: theoretically, this should be called before all vram allocations 2439 * to keep retired pages from being allocated and reused 2440 */ 2441 r = amdgpu_ras_recovery_init(adev); 2442 if (r) 2443 goto init_failed; 2444 2445 if (adev->gmc.xgmi.num_physical_nodes > 1) 2446 amdgpu_xgmi_add_device(adev); 2447 2448 /* Don't init kfd if the whole hive needs to be reset during init */ 2449 if (!adev->gmc.xgmi.pending_reset) 2450 amdgpu_amdkfd_device_init(adev); 2451 2452 amdgpu_fru_get_product_info(adev); 2453 2454 init_failed: 2455 if (amdgpu_sriov_vf(adev)) 2456 amdgpu_virt_release_full_gpu(adev, true); 2457 2458 return r; 2459 } 2460 2461 /** 2462 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2463 * 2464 * @adev: amdgpu_device pointer 2465 * 2466 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2467 * this function before a GPU reset. If the value is retained after a 2468 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 2469 */ 2470 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2471 { 2472 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2473 } 2474 2475 /** 2476 * amdgpu_device_check_vram_lost - check if vram is valid 2477 * 2478 * @adev: amdgpu_device pointer 2479 * 2480 * Checks the reset magic value written to the gart pointer in VRAM. 2481 * The driver calls this after a GPU reset to see if the contents of 2482 * VRAM is lost or not. 2483 * returns true if vram is lost, false if not. 2484 */ 2485 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2486 { 2487 if (memcmp(adev->gart.ptr, adev->reset_magic, 2488 AMDGPU_RESET_MAGIC_NUM)) 2489 return true; 2490 2491 if (!amdgpu_in_reset(adev)) 2492 return false; 2493 2494 /* 2495 * For all ASICs with baco/mode1 reset, the VRAM is 2496 * always assumed to be lost. 2497 */ 2498 switch (amdgpu_asic_reset_method(adev)) { 2499 case AMD_RESET_METHOD_BACO: 2500 case AMD_RESET_METHOD_MODE1: 2501 return true; 2502 default: 2503 return false; 2504 } 2505 } 2506 2507 /** 2508 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2509 * 2510 * @adev: amdgpu_device pointer 2511 * @state: clockgating state (gate or ungate) 2512 * 2513 * The list of all the hardware IPs that make up the asic is walked and the 2514 * set_clockgating_state callbacks are run. 2515 * The late initialization pass enables clockgating for hardware IPs; 2516 * the fini or suspend pass disables clockgating for hardware IPs. 2517 * Returns 0 on success, negative error code on failure.
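 *
 * Note: when gating, the IP list below is walked in init order; when
 * ungating it is walked in reverse. UVD/VCE/VCN/JPEG are skipped here
 * since their clockgating is handled separately.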
2518 */ 2519 2520 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2521 enum amd_clockgating_state state) 2522 { 2523 int i, j, r; 2524 2525 if (amdgpu_emu_mode == 1) 2526 return 0; 2527 2528 for (j = 0; j < adev->num_ip_blocks; j++) { 2529 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2530 if (!adev->ip_blocks[i].status.late_initialized) 2531 continue; 2532 /* skip CG for GFX on S0ix */ 2533 if (adev->in_s0ix && 2534 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2535 continue; 2536 /* skip CG for VCE/UVD, it's handled specially */ 2537 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2538 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2539 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2540 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2541 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2542 /* enable clockgating to save power */ 2543 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2544 state); 2545 if (r) { 2546 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2547 adev->ip_blocks[i].version->funcs->name, r); 2548 return r; 2549 } 2550 } 2551 } 2552 2553 return 0; 2554 } 2555 2556 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2557 enum amd_powergating_state state) 2558 { 2559 int i, j, r; 2560 2561 if (amdgpu_emu_mode == 1) 2562 return 0; 2563 2564 for (j = 0; j < adev->num_ip_blocks; j++) { 2565 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2566 if (!adev->ip_blocks[i].status.late_initialized) 2567 continue; 2568 /* skip PG for GFX on S0ix */ 2569 if (adev->in_s0ix && 2570 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX) 2571 continue; 2572 /* skip CG for VCE/UVD, it's handled specially */ 2573 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2574 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2575 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2576 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2577 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2578 /* enable powergating to save power */ 2579 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2580 state); 2581 if (r) { 2582 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2583 adev->ip_blocks[i].version->funcs->name, r); 2584 return r; 2585 } 2586 } 2587 } 2588 return 0; 2589 } 2590 2591 static int amdgpu_device_enable_mgpu_fan_boost(void) 2592 { 2593 struct amdgpu_gpu_instance *gpu_ins; 2594 struct amdgpu_device *adev; 2595 int i, ret = 0; 2596 2597 mutex_lock(&mgpu_info.mutex); 2598 2599 /* 2600 * MGPU fan boost feature should be enabled 2601 * only when there are two or more dGPUs in 2602 * the system 2603 */ 2604 if (mgpu_info.num_dgpu < 2) 2605 goto out; 2606 2607 for (i = 0; i < mgpu_info.num_dgpu; i++) { 2608 gpu_ins = &(mgpu_info.gpu_ins[i]); 2609 adev = gpu_ins->adev; 2610 if (!(adev->flags & AMD_IS_APU) && 2611 !gpu_ins->mgpu_fan_enabled) { 2612 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 2613 if (ret) 2614 break; 2615 2616 gpu_ins->mgpu_fan_enabled = 1; 2617 } 2618 } 2619 2620 out: 2621 mutex_unlock(&mgpu_info.mutex); 2622 2623 return ret; 2624 } 2625 2626 /** 2627 * amdgpu_device_ip_late_init - run late init for hardware IPs 2628 * 2629 * @adev: amdgpu_device pointer 2630 * 2631 * Late initialization pass for hardware IPs. 
The list of all the hardware 2632 * IPs that make up the asic is walked and the late_init callbacks are run. 2633 * late_init covers any special initialization that an IP requires 2634 * after all of them have been initialized or something that needs to happen 2635 * late in the init process. 2636 * Returns 0 on success, negative error code on failure. 2637 */ 2638 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 2639 { 2640 struct amdgpu_gpu_instance *gpu_instance; 2641 int i = 0, r; 2642 2643 for (i = 0; i < adev->num_ip_blocks; i++) { 2644 if (!adev->ip_blocks[i].status.hw) 2645 continue; 2646 if (adev->ip_blocks[i].version->funcs->late_init) { 2647 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 2648 if (r) { 2649 DRM_ERROR("late_init of IP block <%s> failed %d\n", 2650 adev->ip_blocks[i].version->funcs->name, r); 2651 return r; 2652 } 2653 } 2654 adev->ip_blocks[i].status.late_initialized = true; 2655 } 2656 2657 r = amdgpu_ras_late_init(adev); 2658 if (r) { 2659 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 2660 return r; 2661 } 2662 2663 amdgpu_ras_set_error_query_ready(adev, true); 2664 2665 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 2666 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 2667 2668 amdgpu_device_fill_reset_magic(adev); 2669 2670 r = amdgpu_device_enable_mgpu_fan_boost(); 2671 if (r) 2672 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 2673 2674 /* For passthrough configuration on arcturus and aldebaran, enable special SBR handling */ 2675 if (amdgpu_passthrough(adev) && ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1)|| 2676 adev->asic_type == CHIP_ALDEBARAN )) 2677 amdgpu_dpm_handle_passthrough_sbr(adev, true); 2678 2679 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2680 mutex_lock(&mgpu_info.mutex); 2681 2682 /* 2683 * Reset device p-state to low as this was booted with high. 2684 * 2685 * This should be performed only after all devices from the same 2686 * hive get initialized. 2687 * 2688 * However, it's not known in advance how many devices are in the hive, 2689 * as this is counted one by one as the devices initialize. 2690 * 2691 * So we wait until all XGMI interlinked devices are initialized. 2692 * This may bring some delays as those devices may come from 2693 * different hives. But that should be OK.
2694 */ 2695 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 2696 for (i = 0; i < mgpu_info.num_gpu; i++) { 2697 gpu_instance = &(mgpu_info.gpu_ins[i]); 2698 if (gpu_instance->adev->flags & AMD_IS_APU) 2699 continue; 2700 2701 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 2702 AMDGPU_XGMI_PSTATE_MIN); 2703 if (r) { 2704 DRM_ERROR("pstate setting failed (%d).\n", r); 2705 break; 2706 } 2707 } 2708 } 2709 2710 mutex_unlock(&mgpu_info.mutex); 2711 } 2712 2713 return 0; 2714 } 2715 2716 /** 2717 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 2718 * 2719 * @adev: amdgpu_device pointer 2720 * 2721 * For ASICs need to disable SMC first 2722 */ 2723 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 2724 { 2725 int i, r; 2726 2727 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)) 2728 return; 2729 2730 for (i = 0; i < adev->num_ip_blocks; i++) { 2731 if (!adev->ip_blocks[i].status.hw) 2732 continue; 2733 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2734 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2735 /* XXX handle errors */ 2736 if (r) { 2737 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2738 adev->ip_blocks[i].version->funcs->name, r); 2739 } 2740 adev->ip_blocks[i].status.hw = false; 2741 break; 2742 } 2743 } 2744 } 2745 2746 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 2747 { 2748 int i, r; 2749 2750 for (i = 0; i < adev->num_ip_blocks; i++) { 2751 if (!adev->ip_blocks[i].version->funcs->early_fini) 2752 continue; 2753 2754 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 2755 if (r) { 2756 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 2757 adev->ip_blocks[i].version->funcs->name, r); 2758 } 2759 } 2760 2761 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2762 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2763 2764 amdgpu_amdkfd_suspend(adev, false); 2765 2766 /* Workaroud for ASICs need to disable SMC first */ 2767 amdgpu_device_smu_fini_early(adev); 2768 2769 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2770 if (!adev->ip_blocks[i].status.hw) 2771 continue; 2772 2773 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 2774 /* XXX handle errors */ 2775 if (r) { 2776 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 2777 adev->ip_blocks[i].version->funcs->name, r); 2778 } 2779 2780 adev->ip_blocks[i].status.hw = false; 2781 } 2782 2783 if (amdgpu_sriov_vf(adev)) { 2784 if (amdgpu_virt_release_full_gpu(adev, false)) 2785 DRM_ERROR("failed to release exclusive mode on fini\n"); 2786 } 2787 2788 return 0; 2789 } 2790 2791 /** 2792 * amdgpu_device_ip_fini - run fini for hardware IPs 2793 * 2794 * @adev: amdgpu_device pointer 2795 * 2796 * Main teardown pass for hardware IPs. The list of all the hardware 2797 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 2798 * are run. hw_fini tears down the hardware associated with each IP 2799 * and sw_fini tears down any software state associated with each IP. 2800 * Returns 0 on success, negative error code on failure. 
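 *
 * Note: in the current flow hw_fini has already been run by
 * amdgpu_device_ip_fini_early(); this pass walks the IP list in reverse
 * and runs sw_fini followed by late_fini.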
2801 */ 2802 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 2803 { 2804 int i, r; 2805 2806 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 2807 amdgpu_virt_release_ras_err_handler_data(adev); 2808 2809 if (adev->gmc.xgmi.num_physical_nodes > 1) 2810 amdgpu_xgmi_remove_device(adev); 2811 2812 amdgpu_amdkfd_device_fini_sw(adev); 2813 2814 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2815 if (!adev->ip_blocks[i].status.sw) 2816 continue; 2817 2818 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2819 amdgpu_ucode_free_bo(adev); 2820 amdgpu_free_static_csa(&adev->virt.csa_obj); 2821 amdgpu_device_wb_fini(adev); 2822 amdgpu_device_vram_scratch_fini(adev); 2823 amdgpu_ib_pool_fini(adev); 2824 } 2825 2826 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 2827 /* XXX handle errors */ 2828 if (r) { 2829 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 2830 adev->ip_blocks[i].version->funcs->name, r); 2831 } 2832 adev->ip_blocks[i].status.sw = false; 2833 adev->ip_blocks[i].status.valid = false; 2834 } 2835 2836 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2837 if (!adev->ip_blocks[i].status.late_initialized) 2838 continue; 2839 if (adev->ip_blocks[i].version->funcs->late_fini) 2840 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 2841 adev->ip_blocks[i].status.late_initialized = false; 2842 } 2843 2844 amdgpu_ras_fini(adev); 2845 2846 return 0; 2847 } 2848 2849 /** 2850 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 2851 * 2852 * @work: work_struct. 2853 */ 2854 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 2855 { 2856 struct amdgpu_device *adev = 2857 container_of(work, struct amdgpu_device, delayed_init_work.work); 2858 int r; 2859 2860 r = amdgpu_ib_ring_tests(adev); 2861 if (r) 2862 DRM_ERROR("ib ring test failed (%d).\n", r); 2863 } 2864 2865 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 2866 { 2867 struct amdgpu_device *adev = 2868 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 2869 2870 WARN_ON_ONCE(adev->gfx.gfx_off_state); 2871 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 2872 2873 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 2874 adev->gfx.gfx_off_state = true; 2875 } 2876 2877 /** 2878 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 2879 * 2880 * @adev: amdgpu_device pointer 2881 * 2882 * Main suspend function for hardware IPs. The list of all the hardware 2883 * IPs that make up the asic is walked, clockgating is disabled and the 2884 * suspend callbacks are run. suspend puts the hardware and software state 2885 * in each IP into a state suitable for suspend. 2886 * Returns 0 on success, negative error code on failure. 
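 *
 * Note: phase 1 only suspends the display (DCE) blocks; every other IP is
 * deferred to amdgpu_device_ip_suspend_phase2().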
2887 */ 2888 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 2889 { 2890 int i, r; 2891 2892 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 2893 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 2894 2895 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2896 if (!adev->ip_blocks[i].status.valid) 2897 continue; 2898 2899 /* displays are handled separately */ 2900 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 2901 continue; 2902 2903 /* XXX handle errors */ 2904 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2905 /* XXX handle errors */ 2906 if (r) { 2907 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2908 adev->ip_blocks[i].version->funcs->name, r); 2909 return r; 2910 } 2911 2912 adev->ip_blocks[i].status.hw = false; 2913 } 2914 2915 return 0; 2916 } 2917 2918 /** 2919 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 2920 * 2921 * @adev: amdgpu_device pointer 2922 * 2923 * Main suspend function for hardware IPs. The list of all the hardware 2924 * IPs that make up the asic is walked, clockgating is disabled and the 2925 * suspend callbacks are run. suspend puts the hardware and software state 2926 * in each IP into a state suitable for suspend. 2927 * Returns 0 on success, negative error code on failure. 2928 */ 2929 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 2930 { 2931 int i, r; 2932 2933 if (adev->in_s0ix) 2934 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 2935 2936 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 2937 if (!adev->ip_blocks[i].status.valid) 2938 continue; 2939 /* displays are handled in phase1 */ 2940 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 2941 continue; 2942 /* PSP lost connection when err_event_athub occurs */ 2943 if (amdgpu_ras_intr_triggered() && 2944 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 2945 adev->ip_blocks[i].status.hw = false; 2946 continue; 2947 } 2948 2949 /* skip unnecessary suspend if we do not initialize them yet */ 2950 if (adev->gmc.xgmi.pending_reset && 2951 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 2952 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 2953 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2954 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 2955 adev->ip_blocks[i].status.hw = false; 2956 continue; 2957 } 2958 2959 /* skip suspend of gfx and psp for S0ix 2960 * gfx is in gfxoff state, so on resume it will exit gfxoff just 2961 * like at runtime. PSP is also part of the always on hardware 2962 * so no need to suspend it. 
2963 */ 2964 if (adev->in_s0ix && 2965 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 2966 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)) 2967 continue; 2968 2969 /* XXX handle errors */ 2970 r = adev->ip_blocks[i].version->funcs->suspend(adev); 2971 /* XXX handle errors */ 2972 if (r) { 2973 DRM_ERROR("suspend of IP block <%s> failed %d\n", 2974 adev->ip_blocks[i].version->funcs->name, r); 2975 } 2976 adev->ip_blocks[i].status.hw = false; 2977 /* handle putting the SMC in the appropriate state */ 2978 if(!amdgpu_sriov_vf(adev)){ 2979 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 2980 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 2981 if (r) { 2982 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 2983 adev->mp1_state, r); 2984 return r; 2985 } 2986 } 2987 } 2988 } 2989 2990 return 0; 2991 } 2992 2993 /** 2994 * amdgpu_device_ip_suspend - run suspend for hardware IPs 2995 * 2996 * @adev: amdgpu_device pointer 2997 * 2998 * Main suspend function for hardware IPs. The list of all the hardware 2999 * IPs that make up the asic is walked, clockgating is disabled and the 3000 * suspend callbacks are run. suspend puts the hardware and software state 3001 * in each IP into a state suitable for suspend. 3002 * Returns 0 on success, negative error code on failure. 3003 */ 3004 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3005 { 3006 int r; 3007 3008 if (amdgpu_sriov_vf(adev)) { 3009 amdgpu_virt_fini_data_exchange(adev); 3010 amdgpu_virt_request_full_gpu(adev, false); 3011 } 3012 3013 r = amdgpu_device_ip_suspend_phase1(adev); 3014 if (r) 3015 return r; 3016 r = amdgpu_device_ip_suspend_phase2(adev); 3017 3018 if (amdgpu_sriov_vf(adev)) 3019 amdgpu_virt_release_full_gpu(adev, false); 3020 3021 return r; 3022 } 3023 3024 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3025 { 3026 int i, r; 3027 3028 static enum amd_ip_block_type ip_order[] = { 3029 AMD_IP_BLOCK_TYPE_GMC, 3030 AMD_IP_BLOCK_TYPE_COMMON, 3031 AMD_IP_BLOCK_TYPE_PSP, 3032 AMD_IP_BLOCK_TYPE_IH, 3033 }; 3034 3035 for (i = 0; i < adev->num_ip_blocks; i++) { 3036 int j; 3037 struct amdgpu_ip_block *block; 3038 3039 block = &adev->ip_blocks[i]; 3040 block->status.hw = false; 3041 3042 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3043 3044 if (block->version->type != ip_order[j] || 3045 !block->status.valid) 3046 continue; 3047 3048 r = block->version->funcs->hw_init(adev); 3049 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3050 if (r) 3051 return r; 3052 block->status.hw = true; 3053 } 3054 } 3055 3056 return 0; 3057 } 3058 3059 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3060 { 3061 int i, r; 3062 3063 static enum amd_ip_block_type ip_order[] = { 3064 AMD_IP_BLOCK_TYPE_SMC, 3065 AMD_IP_BLOCK_TYPE_DCE, 3066 AMD_IP_BLOCK_TYPE_GFX, 3067 AMD_IP_BLOCK_TYPE_SDMA, 3068 AMD_IP_BLOCK_TYPE_UVD, 3069 AMD_IP_BLOCK_TYPE_VCE, 3070 AMD_IP_BLOCK_TYPE_VCN 3071 }; 3072 3073 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3074 int j; 3075 struct amdgpu_ip_block *block; 3076 3077 for (j = 0; j < adev->num_ip_blocks; j++) { 3078 block = &adev->ip_blocks[j]; 3079 3080 if (block->version->type != ip_order[i] || 3081 !block->status.valid || 3082 block->status.hw) 3083 continue; 3084 3085 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3086 r = block->version->funcs->resume(adev); 3087 else 3088 r = block->version->funcs->hw_init(adev); 3089 3090 DRM_INFO("RE-INIT-late: %s %s\n", 
block->version->funcs->name, r?"failed":"succeeded"); 3091 if (r) 3092 return r; 3093 block->status.hw = true; 3094 } 3095 } 3096 3097 return 0; 3098 } 3099 3100 /** 3101 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs (phase 1) 3102 * 3103 * @adev: amdgpu_device pointer 3104 * 3105 * First resume function for hardware IPs. The list of all the hardware 3106 * IPs that make up the asic is walked and the resume callbacks are run for 3107 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3108 * after a suspend and updates the software state as necessary. This 3109 * function is also used for restoring the GPU after a GPU reset. 3110 * Returns 0 on success, negative error code on failure. 3111 */ 3112 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3113 { 3114 int i, r; 3115 3116 for (i = 0; i < adev->num_ip_blocks; i++) { 3117 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3118 continue; 3119 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3120 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3121 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 3122 3123 r = adev->ip_blocks[i].version->funcs->resume(adev); 3124 if (r) { 3125 DRM_ERROR("resume of IP block <%s> failed %d\n", 3126 adev->ip_blocks[i].version->funcs->name, r); 3127 return r; 3128 } 3129 adev->ip_blocks[i].status.hw = true; 3130 } 3131 } 3132 3133 return 0; 3134 } 3135 3136 /** 3137 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs (phase 2) 3138 * 3139 * @adev: amdgpu_device pointer 3140 * 3141 * Second resume function for hardware IPs. The list of all the hardware 3142 * IPs that make up the asic is walked and the resume callbacks are run for 3143 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3144 * functional state after a suspend and updates the software state as 3145 * necessary. This function is also used for restoring the GPU after a GPU 3146 * reset. 3147 * Returns 0 on success, negative error code on failure. 3148 */ 3149 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3150 { 3151 int i, r; 3152 3153 for (i = 0; i < adev->num_ip_blocks; i++) { 3154 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3155 continue; 3156 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3157 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3158 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3159 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3160 continue; 3161 r = adev->ip_blocks[i].version->funcs->resume(adev); 3162 if (r) { 3163 DRM_ERROR("resume of IP block <%s> failed %d\n", 3164 adev->ip_blocks[i].version->funcs->name, r); 3165 return r; 3166 } 3167 adev->ip_blocks[i].status.hw = true; 3168 } 3169 3170 return 0; 3171 } 3172 3173 /** 3174 * amdgpu_device_ip_resume - run resume for hardware IPs 3175 * 3176 * @adev: amdgpu_device pointer 3177 * 3178 * Main resume function for hardware IPs. The hardware IPs 3179 * are split into two resume functions because they 3180 * are also used in recovering from a GPU reset and some additional 3181 * steps need to be taken between them. In this case (S3/S4) they are 3182 * run sequentially. 3183 * Returns 0 on success, negative error code on failure.
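 *
 * The order used below is amdgpu_amdkfd_resume_iommu(), then
 * amdgpu_device_ip_resume_phase1() (COMMON/GMC/IH), then
 * amdgpu_device_fw_loading(), and finally amdgpu_device_ip_resume_phase2()
 * for the remaining blocks.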
3184 */ 3185 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3186 { 3187 int r; 3188 3189 r = amdgpu_amdkfd_resume_iommu(adev); 3190 if (r) 3191 return r; 3192 3193 r = amdgpu_device_ip_resume_phase1(adev); 3194 if (r) 3195 return r; 3196 3197 r = amdgpu_device_fw_loading(adev); 3198 if (r) 3199 return r; 3200 3201 r = amdgpu_device_ip_resume_phase2(adev); 3202 3203 return r; 3204 } 3205 3206 /** 3207 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3208 * 3209 * @adev: amdgpu_device pointer 3210 * 3211 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3212 */ 3213 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3214 { 3215 if (amdgpu_sriov_vf(adev)) { 3216 if (adev->is_atom_fw) { 3217 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3218 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3219 } else { 3220 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3221 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3222 } 3223 3224 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3225 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3226 } 3227 } 3228 3229 /** 3230 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3231 * 3232 * @asic_type: AMD asic type 3233 * 3234 * Check if there is DC (new modesetting infrastructre) support for an asic. 3235 * returns true if DC has support, false if not. 3236 */ 3237 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3238 { 3239 switch (asic_type) { 3240 #ifdef CONFIG_DRM_AMDGPU_SI 3241 case CHIP_HAINAN: 3242 #endif 3243 case CHIP_TOPAZ: 3244 /* chips with no display hardware */ 3245 return false; 3246 #if defined(CONFIG_DRM_AMD_DC) 3247 case CHIP_TAHITI: 3248 case CHIP_PITCAIRN: 3249 case CHIP_VERDE: 3250 case CHIP_OLAND: 3251 /* 3252 * We have systems in the wild with these ASICs that require 3253 * LVDS and VGA support which is not supported with DC. 3254 * 3255 * Fallback to the non-DC driver here by default so as not to 3256 * cause regressions. 3257 */ 3258 #if defined(CONFIG_DRM_AMD_DC_SI) 3259 return amdgpu_dc > 0; 3260 #else 3261 return false; 3262 #endif 3263 case CHIP_BONAIRE: 3264 case CHIP_KAVERI: 3265 case CHIP_KABINI: 3266 case CHIP_MULLINS: 3267 /* 3268 * We have systems in the wild with these ASICs that require 3269 * LVDS and VGA support which is not supported with DC. 3270 * 3271 * Fallback to the non-DC driver here by default so as not to 3272 * cause regressions. 
3273 */ 3274 return amdgpu_dc > 0; 3275 case CHIP_HAWAII: 3276 case CHIP_CARRIZO: 3277 case CHIP_STONEY: 3278 case CHIP_POLARIS10: 3279 case CHIP_POLARIS11: 3280 case CHIP_POLARIS12: 3281 case CHIP_VEGAM: 3282 case CHIP_TONGA: 3283 case CHIP_FIJI: 3284 case CHIP_VEGA10: 3285 case CHIP_VEGA12: 3286 case CHIP_VEGA20: 3287 #if defined(CONFIG_DRM_AMD_DC_DCN) 3288 case CHIP_RAVEN: 3289 case CHIP_NAVI10: 3290 case CHIP_NAVI14: 3291 case CHIP_NAVI12: 3292 case CHIP_RENOIR: 3293 case CHIP_CYAN_SKILLFISH: 3294 case CHIP_SIENNA_CICHLID: 3295 case CHIP_NAVY_FLOUNDER: 3296 case CHIP_DIMGREY_CAVEFISH: 3297 case CHIP_BEIGE_GOBY: 3298 case CHIP_VANGOGH: 3299 case CHIP_YELLOW_CARP: 3300 #endif 3301 default: 3302 return amdgpu_dc != 0; 3303 #else 3304 default: 3305 if (amdgpu_dc > 0) 3306 DRM_INFO_ONCE("Display Core has been requested via kernel parameter " 3307 "but isn't supported by ASIC, ignoring\n"); 3308 return false; 3309 #endif 3310 } 3311 } 3312 3313 /** 3314 * amdgpu_device_has_dc_support - check if dc is supported 3315 * 3316 * @adev: amdgpu_device pointer 3317 * 3318 * Returns true for supported, false for not supported 3319 */ 3320 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3321 { 3322 if (amdgpu_sriov_vf(adev) || 3323 adev->enable_virtual_display || 3324 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3325 return false; 3326 3327 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3328 } 3329 3330 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3331 { 3332 struct amdgpu_device *adev = 3333 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3334 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3335 3336 /* It's a bug to not have a hive within this function */ 3337 if (WARN_ON(!hive)) 3338 return; 3339 3340 /* 3341 * Use task barrier to synchronize all xgmi reset works across the 3342 * hive. task_barrier_enter and task_barrier_exit will block 3343 * until all the threads running the xgmi reset works reach 3344 * those points. task_barrier_full will do both blocks. 3345 */ 3346 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3347 3348 task_barrier_enter(&hive->tb); 3349 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3350 3351 if (adev->asic_reset_res) 3352 goto fail; 3353 3354 task_barrier_exit(&hive->tb); 3355 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3356 3357 if (adev->asic_reset_res) 3358 goto fail; 3359 3360 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && 3361 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 3362 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); 3363 } else { 3364 3365 task_barrier_full(&hive->tb); 3366 adev->asic_reset_res = amdgpu_asic_reset(adev); 3367 } 3368 3369 fail: 3370 if (adev->asic_reset_res) 3371 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3372 adev->asic_reset_res, adev_to_drm(adev)->unique); 3373 amdgpu_put_xgmi_hive(hive); 3374 } 3375 3376 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3377 { 3378 char *input = amdgpu_lockup_timeout; 3379 char *timeout_setting = NULL; 3380 int index = 0; 3381 long timeout; 3382 int ret = 0; 3383 3384 /* 3385 * By default timeout for non compute jobs is 10000 3386 * and 60000 for compute jobs. 3387 * In SR-IOV or passthrough mode, timeout for compute 3388 * jobs are 60000 by default. 
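 *
 * The parameter is parsed below as up to four comma separated values
 * mapping to gfx,compute,sdma,video (in ms); 0 keeps the default, a
 * negative value disables the timeout, and a single value applies to all
 * non-compute jobs. For example (values illustrative):
 *   amdgpu.lockup_timeout=10000,60000,10000,10000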
3389 */ 3390 adev->gfx_timeout = msecs_to_jiffies(10000); 3391 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3392 if (amdgpu_sriov_vf(adev)) 3393 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 3394 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3395 else 3396 adev->compute_timeout = msecs_to_jiffies(60000); 3397 3398 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3399 while ((timeout_setting = strsep(&input, ",")) && 3400 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3401 ret = kstrtol(timeout_setting, 0, &timeout); 3402 if (ret) 3403 return ret; 3404 3405 if (timeout == 0) { 3406 index++; 3407 continue; 3408 } else if (timeout < 0) { 3409 timeout = MAX_SCHEDULE_TIMEOUT; 3410 dev_warn(adev->dev, "lockup timeout disabled"); 3411 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3412 } else { 3413 timeout = msecs_to_jiffies(timeout); 3414 } 3415 3416 switch (index++) { 3417 case 0: 3418 adev->gfx_timeout = timeout; 3419 break; 3420 case 1: 3421 adev->compute_timeout = timeout; 3422 break; 3423 case 2: 3424 adev->sdma_timeout = timeout; 3425 break; 3426 case 3: 3427 adev->video_timeout = timeout; 3428 break; 3429 default: 3430 break; 3431 } 3432 } 3433 /* 3434 * There is only one value specified and 3435 * it should apply to all non-compute jobs. 3436 */ 3437 if (index == 1) { 3438 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3439 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3440 adev->compute_timeout = adev->gfx_timeout; 3441 } 3442 } 3443 3444 return ret; 3445 } 3446 3447 /** 3448 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3449 * 3450 * @adev: amdgpu_device pointer 3451 * 3452 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3453 */ 3454 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3455 { 3456 struct iommu_domain *domain; 3457 3458 domain = iommu_get_domain_for_dev(adev->dev); 3459 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3460 adev->ram_is_direct_mapped = true; 3461 } 3462 3463 static const struct attribute *amdgpu_dev_attributes[] = { 3464 &dev_attr_product_name.attr, 3465 &dev_attr_product_number.attr, 3466 &dev_attr_serial_number.attr, 3467 &dev_attr_pcie_replay_count.attr, 3468 NULL 3469 }; 3470 3471 /** 3472 * amdgpu_device_init - initialize the driver 3473 * 3474 * @adev: amdgpu_device pointer 3475 * @flags: driver flags 3476 * 3477 * Initializes the driver info and hw (all asics). 3478 * Returns 0 for success or an error on failure. 3479 * Called at driver startup. 
3480 */ 3481 int amdgpu_device_init(struct amdgpu_device *adev, 3482 uint32_t flags) 3483 { 3484 struct drm_device *ddev = adev_to_drm(adev); 3485 struct pci_dev *pdev = adev->pdev; 3486 int r, i; 3487 bool px = false; 3488 u32 max_MBps; 3489 3490 adev->shutdown = false; 3491 adev->flags = flags; 3492 3493 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3494 adev->asic_type = amdgpu_force_asic_type; 3495 else 3496 adev->asic_type = flags & AMD_ASIC_MASK; 3497 3498 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3499 if (amdgpu_emu_mode == 1) 3500 adev->usec_timeout *= 10; 3501 adev->gmc.gart_size = 512 * 1024 * 1024; 3502 adev->accel_working = false; 3503 adev->num_rings = 0; 3504 adev->mman.buffer_funcs = NULL; 3505 adev->mman.buffer_funcs_ring = NULL; 3506 adev->vm_manager.vm_pte_funcs = NULL; 3507 adev->vm_manager.vm_pte_num_scheds = 0; 3508 adev->gmc.gmc_funcs = NULL; 3509 adev->harvest_ip_mask = 0x0; 3510 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3511 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3512 3513 adev->smc_rreg = &amdgpu_invalid_rreg; 3514 adev->smc_wreg = &amdgpu_invalid_wreg; 3515 adev->pcie_rreg = &amdgpu_invalid_rreg; 3516 adev->pcie_wreg = &amdgpu_invalid_wreg; 3517 adev->pciep_rreg = &amdgpu_invalid_rreg; 3518 adev->pciep_wreg = &amdgpu_invalid_wreg; 3519 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3520 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3521 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3522 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3523 adev->didt_rreg = &amdgpu_invalid_rreg; 3524 adev->didt_wreg = &amdgpu_invalid_wreg; 3525 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3526 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3527 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3528 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3529 3530 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3531 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3532 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3533 3534 /* mutex initialization are all done here so we 3535 * can recall function without having locking issues */ 3536 mutex_init(&adev->firmware.mutex); 3537 mutex_init(&adev->pm.mutex); 3538 mutex_init(&adev->gfx.gpu_clock_mutex); 3539 mutex_init(&adev->srbm_mutex); 3540 mutex_init(&adev->gfx.pipe_reserve_mutex); 3541 mutex_init(&adev->gfx.gfx_off_mutex); 3542 mutex_init(&adev->grbm_idx_mutex); 3543 mutex_init(&adev->mn_lock); 3544 mutex_init(&adev->virt.vf_errors.lock); 3545 hash_init(adev->mn_hash); 3546 atomic_set(&adev->in_gpu_reset, 0); 3547 init_rwsem(&adev->reset_sem); 3548 mutex_init(&adev->psp.mutex); 3549 mutex_init(&adev->notifier_lock); 3550 mutex_init(&adev->pm.stable_pstate_ctx_lock); 3551 3552 amdgpu_device_init_apu_flags(adev); 3553 3554 r = amdgpu_device_check_arguments(adev); 3555 if (r) 3556 return r; 3557 3558 spin_lock_init(&adev->mmio_idx_lock); 3559 spin_lock_init(&adev->smc_idx_lock); 3560 spin_lock_init(&adev->pcie_idx_lock); 3561 spin_lock_init(&adev->uvd_ctx_idx_lock); 3562 spin_lock_init(&adev->didt_idx_lock); 3563 spin_lock_init(&adev->gc_cac_idx_lock); 3564 spin_lock_init(&adev->se_cac_idx_lock); 3565 spin_lock_init(&adev->audio_endpt_idx_lock); 3566 spin_lock_init(&adev->mm_stats.lock); 3567 3568 INIT_LIST_HEAD(&adev->shadow_list); 3569 mutex_init(&adev->shadow_list_lock); 3570 3571 INIT_LIST_HEAD(&adev->reset_list); 3572 3573 INIT_LIST_HEAD(&adev->ras_list); 3574 3575 
INIT_DELAYED_WORK(&adev->delayed_init_work, 3576 amdgpu_device_delayed_init_work_handler); 3577 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 3578 amdgpu_device_delay_enable_gfx_off); 3579 3580 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 3581 3582 adev->gfx.gfx_off_req_count = 1; 3583 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 3584 3585 atomic_set(&adev->throttling_logging_enabled, 1); 3586 /* 3587 * If throttling continues, logging will be performed every minute 3588 * to avoid log flooding. "-1" is subtracted since the thermal 3589 * throttling interrupt comes every second. Thus, the total logging 3590 * interval is 59 seconds(retelimited printk interval) + 1(waiting 3591 * for throttling interrupt) = 60 seconds. 3592 */ 3593 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 3594 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 3595 3596 /* Registers mapping */ 3597 /* TODO: block userspace mapping of io register */ 3598 if (adev->asic_type >= CHIP_BONAIRE) { 3599 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 3600 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 3601 } else { 3602 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 3603 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 3604 } 3605 3606 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 3607 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 3608 3609 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 3610 if (adev->rmmio == NULL) { 3611 return -ENOMEM; 3612 } 3613 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 3614 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); 3615 3616 amdgpu_device_get_pcie_info(adev); 3617 3618 if (amdgpu_mcbp) 3619 DRM_INFO("MCBP is enabled\n"); 3620 3621 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10) 3622 adev->enable_mes = true; 3623 3624 /* detect hw virtualization here */ 3625 amdgpu_detect_virtualization(adev); 3626 3627 r = amdgpu_device_get_job_timeout_settings(adev); 3628 if (r) { 3629 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 3630 return r; 3631 } 3632 3633 /* early init functions */ 3634 r = amdgpu_device_ip_early_init(adev); 3635 if (r) 3636 return r; 3637 3638 /* Need to get xgmi info early to decide the reset behavior*/ 3639 if (adev->gmc.xgmi.supported) { 3640 r = adev->gfxhub.funcs->get_xgmi_info(adev); 3641 if (r) 3642 return r; 3643 } 3644 3645 /* enable PCIE atomic ops */ 3646 if (amdgpu_sriov_vf(adev)) 3647 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 3648 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_enabled_flags == 3649 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3650 else 3651 adev->have_atomics_support = 3652 !pci_enable_atomic_ops_to_root(adev->pdev, 3653 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 3654 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 3655 if (!adev->have_atomics_support) 3656 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 3657 3658 /* doorbell bar mapping and doorbell index init*/ 3659 amdgpu_device_doorbell_init(adev); 3660 3661 if (amdgpu_emu_mode == 1) { 3662 /* post the asic on emulation mode */ 3663 emu_soc_asic_init(adev); 3664 goto fence_driver_init; 3665 } 3666 3667 amdgpu_reset_init(adev); 3668 3669 /* detect if we are with an SRIOV vbios */ 3670 amdgpu_device_detect_sriov_bios(adev); 3671 3672 /* check if we need to reset the asic 3673 * E.g., driver was not cleanly unloaded previously, etc. 
3674 */ 3675 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 3676 if (adev->gmc.xgmi.num_physical_nodes) { 3677 dev_info(adev->dev, "Pending hive reset.\n"); 3678 adev->gmc.xgmi.pending_reset = true; 3679 /* Only need to init necessary block for SMU to handle the reset */ 3680 for (i = 0; i < adev->num_ip_blocks; i++) { 3681 if (!adev->ip_blocks[i].status.valid) 3682 continue; 3683 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3684 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3685 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3686 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 3687 DRM_DEBUG("IP %s disabled for hw_init.\n", 3688 adev->ip_blocks[i].version->funcs->name); 3689 adev->ip_blocks[i].status.hw = true; 3690 } 3691 } 3692 } else { 3693 r = amdgpu_asic_reset(adev); 3694 if (r) { 3695 dev_err(adev->dev, "asic reset on init failed\n"); 3696 goto failed; 3697 } 3698 } 3699 } 3700 3701 pci_enable_pcie_error_reporting(adev->pdev); 3702 3703 /* Post card if necessary */ 3704 if (amdgpu_device_need_post(adev)) { 3705 if (!adev->bios) { 3706 dev_err(adev->dev, "no vBIOS found\n"); 3707 r = -EINVAL; 3708 goto failed; 3709 } 3710 DRM_INFO("GPU posting now...\n"); 3711 r = amdgpu_device_asic_init(adev); 3712 if (r) { 3713 dev_err(adev->dev, "gpu post error!\n"); 3714 goto failed; 3715 } 3716 } 3717 3718 if (adev->is_atom_fw) { 3719 /* Initialize clocks */ 3720 r = amdgpu_atomfirmware_get_clock_info(adev); 3721 if (r) { 3722 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 3723 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3724 goto failed; 3725 } 3726 } else { 3727 /* Initialize clocks */ 3728 r = amdgpu_atombios_get_clock_info(adev); 3729 if (r) { 3730 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 3731 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 3732 goto failed; 3733 } 3734 /* init i2c buses */ 3735 if (!amdgpu_device_has_dc_support(adev)) 3736 amdgpu_atombios_i2c_init(adev); 3737 } 3738 3739 fence_driver_init: 3740 /* Fence driver */ 3741 r = amdgpu_fence_driver_sw_init(adev); 3742 if (r) { 3743 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 3744 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 3745 goto failed; 3746 } 3747 3748 /* init the mode config */ 3749 drm_mode_config_init(adev_to_drm(adev)); 3750 3751 r = amdgpu_device_ip_init(adev); 3752 if (r) { 3753 /* failed in exclusive mode due to timeout */ 3754 if (amdgpu_sriov_vf(adev) && 3755 !amdgpu_sriov_runtime(adev) && 3756 amdgpu_virt_mmio_blocked(adev) && 3757 !amdgpu_virt_wait_reset(adev)) { 3758 dev_err(adev->dev, "VF exclusive mode timeout\n"); 3759 /* Don't send request since VF is inactive. */ 3760 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 3761 adev->virt.ops = NULL; 3762 r = -EAGAIN; 3763 goto release_ras_con; 3764 } 3765 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 3766 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 3767 goto release_ras_con; 3768 } 3769 3770 amdgpu_fence_driver_hw_init(adev); 3771 3772 dev_info(adev->dev, 3773 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 3774 adev->gfx.config.max_shader_engines, 3775 adev->gfx.config.max_sh_per_se, 3776 adev->gfx.config.max_cu_per_sh, 3777 adev->gfx.cu_info.number); 3778 3779 adev->accel_working = true; 3780 3781 amdgpu_vm_check_compute_bug(adev); 3782 3783 /* Initialize the buffer migration limit. 
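	 * A negative amdgpu.moverate keeps the default of 8 MB/s; the value is
	 * stored as log2_max_MBps so the migration throttling math can divide
	 * cheaply.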
*/ 3784 if (amdgpu_moverate >= 0) 3785 max_MBps = amdgpu_moverate; 3786 else 3787 max_MBps = 8; /* Allow 8 MB/s. */ 3788 /* Get a log2 for easy divisions. */ 3789 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 3790 3791 r = amdgpu_pm_sysfs_init(adev); 3792 if (r) { 3793 adev->pm_sysfs_en = false; 3794 DRM_ERROR("registering pm debugfs failed (%d).\n", r); 3795 } else 3796 adev->pm_sysfs_en = true; 3797 3798 r = amdgpu_ucode_sysfs_init(adev); 3799 if (r) { 3800 adev->ucode_sysfs_en = false; 3801 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 3802 } else 3803 adev->ucode_sysfs_en = true; 3804 3805 if ((amdgpu_testing & 1)) { 3806 if (adev->accel_working) 3807 amdgpu_test_moves(adev); 3808 else 3809 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n"); 3810 } 3811 if (amdgpu_benchmarking) { 3812 if (adev->accel_working) 3813 amdgpu_benchmark(adev, amdgpu_benchmarking); 3814 else 3815 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n"); 3816 } 3817 3818 /* 3819 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 3820 * Otherwise the mgpu fan boost feature will be skipped due to the 3821 * gpu instance is counted less. 3822 */ 3823 amdgpu_register_gpu_instance(adev); 3824 3825 /* enable clockgating, etc. after ib tests, etc. since some blocks require 3826 * explicit gating rather than handling it automatically. 3827 */ 3828 if (!adev->gmc.xgmi.pending_reset) { 3829 r = amdgpu_device_ip_late_init(adev); 3830 if (r) { 3831 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 3832 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 3833 goto release_ras_con; 3834 } 3835 /* must succeed. */ 3836 amdgpu_ras_resume(adev); 3837 queue_delayed_work(system_wq, &adev->delayed_init_work, 3838 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3839 } 3840 3841 if (amdgpu_sriov_vf(adev)) 3842 flush_delayed_work(&adev->delayed_init_work); 3843 3844 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 3845 if (r) 3846 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 3847 3848 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 3849 r = amdgpu_pmu_init(adev); 3850 if (r) 3851 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 3852 3853 /* Have stored pci confspace at hand for restore in sudden PCI error */ 3854 if (amdgpu_device_cache_pci_state(adev->pdev)) 3855 pci_restore_state(pdev); 3856 3857 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 3858 /* this will fail for cards that aren't VGA class devices, just 3859 * ignore it */ 3860 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3861 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 3862 3863 if (amdgpu_device_supports_px(ddev)) { 3864 px = true; 3865 vga_switcheroo_register_client(adev->pdev, 3866 &amdgpu_switcheroo_ops, px); 3867 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 3868 } 3869 3870 if (adev->gmc.xgmi.pending_reset) 3871 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 3872 msecs_to_jiffies(AMDGPU_RESUME_MS)); 3873 3874 amdgpu_device_check_iommu_direct_map(adev); 3875 3876 return 0; 3877 3878 release_ras_con: 3879 amdgpu_release_ras_context(adev); 3880 3881 failed: 3882 amdgpu_vf_error_trans_all(adev); 3883 3884 return r; 3885 } 3886 3887 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 3888 { 3889 3890 /* Clear all CPU mappings pointing to this device */ 3891 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 3892 3893 /* Unmap all mapped bars - Doorbell, registers 
and VRAM */ 3894 amdgpu_device_doorbell_fini(adev); 3895 3896 iounmap(adev->rmmio); 3897 adev->rmmio = NULL; 3898 if (adev->mman.aper_base_kaddr) 3899 iounmap(adev->mman.aper_base_kaddr); 3900 adev->mman.aper_base_kaddr = NULL; 3901 3902 /* Memory manager related */ 3903 if (!adev->gmc.xgmi.connected_to_cpu) { 3904 arch_phys_wc_del(adev->gmc.vram_mtrr); 3905 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 3906 } 3907 } 3908 3909 /** 3910 * amdgpu_device_fini_hw - tear down the driver 3911 * 3912 * @adev: amdgpu_device pointer 3913 * 3914 * Tear down the driver info (all asics). 3915 * Called at driver shutdown. 3916 */ 3917 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 3918 { 3919 dev_info(adev->dev, "amdgpu: finishing device.\n"); 3920 flush_delayed_work(&adev->delayed_init_work); 3921 if (adev->mman.initialized) { 3922 flush_delayed_work(&adev->mman.bdev.wq); 3923 ttm_bo_lock_delayed_workqueue(&adev->mman.bdev); 3924 } 3925 adev->shutdown = true; 3926 3927 /* make sure IB test finished before entering exclusive mode 3928 * to avoid preemption on IB test 3929 * */ 3930 if (amdgpu_sriov_vf(adev)) { 3931 amdgpu_virt_request_full_gpu(adev, false); 3932 amdgpu_virt_fini_data_exchange(adev); 3933 } 3934 3935 /* disable all interrupts */ 3936 amdgpu_irq_disable_all(adev); 3937 if (adev->mode_info.mode_config_initialized){ 3938 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 3939 drm_helper_force_disable_all(adev_to_drm(adev)); 3940 else 3941 drm_atomic_helper_shutdown(adev_to_drm(adev)); 3942 } 3943 amdgpu_fence_driver_hw_fini(adev); 3944 3945 if (adev->pm_sysfs_en) 3946 amdgpu_pm_sysfs_fini(adev); 3947 if (adev->ucode_sysfs_en) 3948 amdgpu_ucode_sysfs_fini(adev); 3949 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 3950 3951 /* disable ras feature must before hw fini */ 3952 amdgpu_ras_pre_fini(adev); 3953 3954 amdgpu_device_ip_fini_early(adev); 3955 3956 amdgpu_irq_fini_hw(adev); 3957 3958 if (adev->mman.initialized) 3959 ttm_device_clear_dma_mappings(&adev->mman.bdev); 3960 3961 amdgpu_gart_dummy_page_fini(adev); 3962 3963 if (drm_dev_is_unplugged(adev_to_drm(adev))) 3964 amdgpu_device_unmap_mmio(adev); 3965 3966 } 3967 3968 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 3969 { 3970 int idx; 3971 3972 amdgpu_fence_driver_sw_fini(adev); 3973 amdgpu_device_ip_fini(adev); 3974 release_firmware(adev->firmware.gpu_info_fw); 3975 adev->firmware.gpu_info_fw = NULL; 3976 adev->accel_working = false; 3977 3978 amdgpu_reset_fini(adev); 3979 3980 /* free i2c buses */ 3981 if (!amdgpu_device_has_dc_support(adev)) 3982 amdgpu_i2c_fini(adev); 3983 3984 if (amdgpu_emu_mode != 1) 3985 amdgpu_atombios_fini(adev); 3986 3987 kfree(adev->bios); 3988 adev->bios = NULL; 3989 if (amdgpu_device_supports_px(adev_to_drm(adev))) { 3990 vga_switcheroo_unregister_client(adev->pdev); 3991 vga_switcheroo_fini_domain_pm_ops(adev->dev); 3992 } 3993 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 3994 vga_client_unregister(adev->pdev); 3995 3996 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 3997 3998 iounmap(adev->rmmio); 3999 adev->rmmio = NULL; 4000 amdgpu_device_doorbell_fini(adev); 4001 drm_dev_exit(idx); 4002 } 4003 4004 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4005 amdgpu_pmu_fini(adev); 4006 if (adev->mman.discovery_bin) 4007 amdgpu_discovery_fini(adev); 4008 4009 kfree(adev->pci_state); 4010 4011 } 4012 4013 /** 4014 * amdgpu_device_evict_resources - evict device resources 4015 * @adev: amdgpu device object 4016 * 4017 * Evicts all ttm device resources(vram BOs, 
gart table) from the lru list 4018 * of the vram memory type. Mainly used for evicting device resources 4019 * at suspend time. 4020 * 4021 */ 4022 static void amdgpu_device_evict_resources(struct amdgpu_device *adev) 4023 { 4024 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4025 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4026 return; 4027 4028 if (amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM)) 4029 DRM_WARN("evicting device resources failed\n"); 4030 4031 } 4032 4033 /* 4034 * Suspend & resume. 4035 */ 4036 /** 4037 * amdgpu_device_suspend - initiate device suspend 4038 * 4039 * @dev: drm dev pointer 4040 * @fbcon : notify the fbdev of suspend 4041 * 4042 * Puts the hw in the suspend state (all asics). 4043 * Returns 0 for success or an error on failure. 4044 * Called at driver suspend. 4045 */ 4046 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4047 { 4048 struct amdgpu_device *adev = drm_to_adev(dev); 4049 4050 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4051 return 0; 4052 4053 adev->in_suspend = true; 4054 4055 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4056 DRM_WARN("smart shift update failed\n"); 4057 4058 drm_kms_helper_poll_disable(dev); 4059 4060 if (fbcon) 4061 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4062 4063 cancel_delayed_work_sync(&adev->delayed_init_work); 4064 4065 amdgpu_ras_suspend(adev); 4066 4067 amdgpu_device_ip_suspend_phase1(adev); 4068 4069 if (!adev->in_s0ix) 4070 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4071 4072 amdgpu_device_evict_resources(adev); 4073 4074 amdgpu_fence_driver_hw_fini(adev); 4075 4076 amdgpu_device_ip_suspend_phase2(adev); 4077 4078 return 0; 4079 } 4080 4081 /** 4082 * amdgpu_device_resume - initiate device resume 4083 * 4084 * @dev: drm dev pointer 4085 * @fbcon : notify the fbdev of resume 4086 * 4087 * Bring the hw back to operating state (all asics). 4088 * Returns 0 for success or an error on failure. 4089 * Called at driver resume. 4090 */ 4091 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4092 { 4093 struct amdgpu_device *adev = drm_to_adev(dev); 4094 int r = 0; 4095 4096 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4097 return 0; 4098 4099 if (adev->in_s0ix) 4100 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4101 4102 /* post card */ 4103 if (amdgpu_device_need_post(adev)) { 4104 r = amdgpu_device_asic_init(adev); 4105 if (r) 4106 dev_err(adev->dev, "amdgpu asic init failed\n"); 4107 } 4108 4109 r = amdgpu_device_ip_resume(adev); 4110 if (r) { 4111 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4112 return r; 4113 } 4114 amdgpu_fence_driver_hw_init(adev); 4115 4116 r = amdgpu_device_ip_late_init(adev); 4117 if (r) 4118 return r; 4119 4120 queue_delayed_work(system_wq, &adev->delayed_init_work, 4121 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4122 4123 if (!adev->in_s0ix) { 4124 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4125 if (r) 4126 return r; 4127 } 4128 4129 /* Make sure IB tests flushed */ 4130 flush_delayed_work(&adev->delayed_init_work); 4131 4132 if (fbcon) 4133 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4134 4135 drm_kms_helper_poll_enable(dev); 4136 4137 amdgpu_ras_resume(adev); 4138 4139 /* 4140 * Most of the connector probing functions try to acquire runtime pm 4141 * refs to ensure that the GPU is powered on when connector polling is 4142 * performed. 
Since we're calling this from a runtime PM callback, 4143 * trying to acquire rpm refs will cause us to deadlock. 4144 * 4145 * Since we're guaranteed to be holding the rpm lock, it's safe to 4146 * temporarily disable the rpm helpers so this doesn't deadlock us. 4147 */ 4148 #ifdef CONFIG_PM 4149 dev->dev->power.disable_depth++; 4150 #endif 4151 if (!amdgpu_device_has_dc_support(adev)) 4152 drm_helper_hpd_irq_event(dev); 4153 else 4154 drm_kms_helper_hotplug_event(dev); 4155 #ifdef CONFIG_PM 4156 dev->dev->power.disable_depth--; 4157 #endif 4158 adev->in_suspend = false; 4159 4160 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4161 DRM_WARN("smart shift update failed\n"); 4162 4163 return 0; 4164 } 4165 4166 /** 4167 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4168 * 4169 * @adev: amdgpu_device pointer 4170 * 4171 * The list of all the hardware IPs that make up the asic is walked and 4172 * the check_soft_reset callbacks are run. check_soft_reset determines 4173 * if the asic is still hung or not. 4174 * Returns true if any of the IPs are still in a hung state, false if not. 4175 */ 4176 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4177 { 4178 int i; 4179 bool asic_hang = false; 4180 4181 if (amdgpu_sriov_vf(adev)) 4182 return true; 4183 4184 if (amdgpu_asic_need_full_reset(adev)) 4185 return true; 4186 4187 for (i = 0; i < adev->num_ip_blocks; i++) { 4188 if (!adev->ip_blocks[i].status.valid) 4189 continue; 4190 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4191 adev->ip_blocks[i].status.hang = 4192 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4193 if (adev->ip_blocks[i].status.hang) { 4194 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4195 asic_hang = true; 4196 } 4197 } 4198 return asic_hang; 4199 } 4200 4201 /** 4202 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4203 * 4204 * @adev: amdgpu_device pointer 4205 * 4206 * The list of all the hardware IPs that make up the asic is walked and the 4207 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4208 * handles any IP specific hardware or software state changes that are 4209 * necessary for a soft reset to succeed. 4210 * Returns 0 on success, negative error code on failure. 4211 */ 4212 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4213 { 4214 int i, r = 0; 4215 4216 for (i = 0; i < adev->num_ip_blocks; i++) { 4217 if (!adev->ip_blocks[i].status.valid) 4218 continue; 4219 if (adev->ip_blocks[i].status.hang && 4220 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4221 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4222 if (r) 4223 return r; 4224 } 4225 } 4226 4227 return 0; 4228 } 4229 4230 /** 4231 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4232 * 4233 * @adev: amdgpu_device pointer 4234 * 4235 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4236 * reset is necessary to recover. 4237 * Returns true if a full asic reset is required, false if not. 
4238 */ 4239 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4240 { 4241 int i; 4242 4243 if (amdgpu_asic_need_full_reset(adev)) 4244 return true; 4245 4246 for (i = 0; i < adev->num_ip_blocks; i++) { 4247 if (!adev->ip_blocks[i].status.valid) 4248 continue; 4249 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4250 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4251 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4252 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4253 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4254 if (adev->ip_blocks[i].status.hang) { 4255 dev_info(adev->dev, "Some block need full reset!\n"); 4256 return true; 4257 } 4258 } 4259 } 4260 return false; 4261 } 4262 4263 /** 4264 * amdgpu_device_ip_soft_reset - do a soft reset 4265 * 4266 * @adev: amdgpu_device pointer 4267 * 4268 * The list of all the hardware IPs that make up the asic is walked and the 4269 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4270 * IP specific hardware or software state changes that are necessary to soft 4271 * reset the IP. 4272 * Returns 0 on success, negative error code on failure. 4273 */ 4274 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4275 { 4276 int i, r = 0; 4277 4278 for (i = 0; i < adev->num_ip_blocks; i++) { 4279 if (!adev->ip_blocks[i].status.valid) 4280 continue; 4281 if (adev->ip_blocks[i].status.hang && 4282 adev->ip_blocks[i].version->funcs->soft_reset) { 4283 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4284 if (r) 4285 return r; 4286 } 4287 } 4288 4289 return 0; 4290 } 4291 4292 /** 4293 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4294 * 4295 * @adev: amdgpu_device pointer 4296 * 4297 * The list of all the hardware IPs that make up the asic is walked and the 4298 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4299 * handles any IP specific hardware or software state changes that are 4300 * necessary after the IP has been soft reset. 4301 * Returns 0 on success, negative error code on failure. 4302 */ 4303 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4304 { 4305 int i, r = 0; 4306 4307 for (i = 0; i < adev->num_ip_blocks; i++) { 4308 if (!adev->ip_blocks[i].status.valid) 4309 continue; 4310 if (adev->ip_blocks[i].status.hang && 4311 adev->ip_blocks[i].version->funcs->post_soft_reset) 4312 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4313 if (r) 4314 return r; 4315 } 4316 4317 return 0; 4318 } 4319 4320 /** 4321 * amdgpu_device_recover_vram - Recover some VRAM contents 4322 * 4323 * @adev: amdgpu_device pointer 4324 * 4325 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4326 * restore things like GPUVM page tables after a GPU reset where 4327 * the contents of VRAM might be lost. 4328 * 4329 * Returns: 4330 * 0 on success, negative error code on failure. 
4331 */ 4332 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4333 { 4334 struct dma_fence *fence = NULL, *next = NULL; 4335 struct amdgpu_bo *shadow; 4336 struct amdgpu_bo_vm *vmbo; 4337 long r = 1, tmo; 4338 4339 if (amdgpu_sriov_runtime(adev)) 4340 tmo = msecs_to_jiffies(8000); 4341 else 4342 tmo = msecs_to_jiffies(100); 4343 4344 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4345 mutex_lock(&adev->shadow_list_lock); 4346 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4347 shadow = &vmbo->bo; 4348 /* No need to recover an evicted BO */ 4349 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4350 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4351 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4352 continue; 4353 4354 r = amdgpu_bo_restore_shadow(shadow, &next); 4355 if (r) 4356 break; 4357 4358 if (fence) { 4359 tmo = dma_fence_wait_timeout(fence, false, tmo); 4360 dma_fence_put(fence); 4361 fence = next; 4362 if (tmo == 0) { 4363 r = -ETIMEDOUT; 4364 break; 4365 } else if (tmo < 0) { 4366 r = tmo; 4367 break; 4368 } 4369 } else { 4370 fence = next; 4371 } 4372 } 4373 mutex_unlock(&adev->shadow_list_lock); 4374 4375 if (fence) 4376 tmo = dma_fence_wait_timeout(fence, false, tmo); 4377 dma_fence_put(fence); 4378 4379 if (r < 0 || tmo <= 0) { 4380 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4381 return -EIO; 4382 } 4383 4384 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4385 return 0; 4386 } 4387 4388 4389 /** 4390 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4391 * 4392 * @adev: amdgpu_device pointer 4393 * @from_hypervisor: request from hypervisor 4394 * 4395 * do VF FLR and reinitialize Asic 4396 * return 0 means succeeded otherwise failed 4397 */ 4398 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4399 bool from_hypervisor) 4400 { 4401 int r; 4402 struct amdgpu_hive_info *hive = NULL; 4403 int retry_limit = 0; 4404 4405 retry: 4406 amdgpu_amdkfd_pre_reset(adev); 4407 4408 amdgpu_amdkfd_pre_reset(adev); 4409 4410 if (from_hypervisor) 4411 r = amdgpu_virt_request_full_gpu(adev, true); 4412 else 4413 r = amdgpu_virt_reset_gpu(adev); 4414 if (r) 4415 return r; 4416 4417 /* Resume IP prior to SMC */ 4418 r = amdgpu_device_ip_reinit_early_sriov(adev); 4419 if (r) 4420 goto error; 4421 4422 amdgpu_virt_init_data_exchange(adev); 4423 4424 r = amdgpu_device_fw_loading(adev); 4425 if (r) 4426 return r; 4427 4428 /* now we are okay to resume SMC/CP/SDMA */ 4429 r = amdgpu_device_ip_reinit_late_sriov(adev); 4430 if (r) 4431 goto error; 4432 4433 hive = amdgpu_get_xgmi_hive(adev); 4434 /* Update PSP FW topology after reset */ 4435 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 4436 r = amdgpu_xgmi_update_topology(hive, adev); 4437 4438 if (hive) 4439 amdgpu_put_xgmi_hive(hive); 4440 4441 if (!r) { 4442 amdgpu_irq_gpu_reset_resume_helper(adev); 4443 r = amdgpu_ib_ring_tests(adev); 4444 amdgpu_amdkfd_post_reset(adev); 4445 } 4446 4447 error: 4448 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 4449 amdgpu_inc_vram_lost(adev); 4450 r = amdgpu_device_recover_vram(adev); 4451 } 4452 amdgpu_virt_release_full_gpu(adev, true); 4453 4454 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 4455 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) { 4456 retry_limit++; 4457 goto retry; 4458 } else 4459 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 4460 } 4461 4462 return r; 4463 } 4464 4465 /** 4466 * amdgpu_device_has_job_running - check if there 
is any job in mirror list 4467 * 4468 * @adev: amdgpu_device pointer 4469 * 4470 * check if there is any job in mirror list 4471 */ 4472 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 4473 { 4474 int i; 4475 struct drm_sched_job *job; 4476 4477 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4478 struct amdgpu_ring *ring = adev->rings[i]; 4479 4480 if (!ring || !ring->sched.thread) 4481 continue; 4482 4483 spin_lock(&ring->sched.job_list_lock); 4484 job = list_first_entry_or_null(&ring->sched.pending_list, 4485 struct drm_sched_job, list); 4486 spin_unlock(&ring->sched.job_list_lock); 4487 if (job) 4488 return true; 4489 } 4490 return false; 4491 } 4492 4493 /** 4494 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 4495 * 4496 * @adev: amdgpu_device pointer 4497 * 4498 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 4499 * a hung GPU. 4500 */ 4501 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 4502 { 4503 if (!amdgpu_device_ip_check_soft_reset(adev)) { 4504 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n"); 4505 return false; 4506 } 4507 4508 if (amdgpu_gpu_recovery == 0) 4509 goto disabled; 4510 4511 if (amdgpu_sriov_vf(adev)) 4512 return true; 4513 4514 if (amdgpu_gpu_recovery == -1) { 4515 switch (adev->asic_type) { 4516 #ifdef CONFIG_DRM_AMDGPU_SI 4517 case CHIP_VERDE: 4518 case CHIP_TAHITI: 4519 case CHIP_PITCAIRN: 4520 case CHIP_OLAND: 4521 case CHIP_HAINAN: 4522 #endif 4523 #ifdef CONFIG_DRM_AMDGPU_CIK 4524 case CHIP_KAVERI: 4525 case CHIP_KABINI: 4526 case CHIP_MULLINS: 4527 #endif 4528 case CHIP_CARRIZO: 4529 case CHIP_STONEY: 4530 case CHIP_CYAN_SKILLFISH: 4531 goto disabled; 4532 default: 4533 break; 4534 } 4535 } 4536 4537 return true; 4538 4539 disabled: 4540 dev_info(adev->dev, "GPU recovery disabled.\n"); 4541 return false; 4542 } 4543 4544 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 4545 { 4546 u32 i; 4547 int ret = 0; 4548 4549 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 4550 4551 dev_info(adev->dev, "GPU mode1 reset\n"); 4552 4553 /* disable BM */ 4554 pci_clear_master(adev->pdev); 4555 4556 amdgpu_device_cache_pci_state(adev->pdev); 4557 4558 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 4559 dev_info(adev->dev, "GPU smu mode1 reset\n"); 4560 ret = amdgpu_dpm_mode1_reset(adev); 4561 } else { 4562 dev_info(adev->dev, "GPU psp mode1 reset\n"); 4563 ret = psp_gpu_reset(adev); 4564 } 4565 4566 if (ret) 4567 dev_err(adev->dev, "GPU mode1 reset failed\n"); 4568 4569 amdgpu_device_load_pci_state(adev->pdev); 4570 4571 /* wait for asic to come out of reset */ 4572 for (i = 0; i < adev->usec_timeout; i++) { 4573 u32 memsize = adev->nbio.funcs->get_memsize(adev); 4574 4575 if (memsize != 0xffffffff) 4576 break; 4577 udelay(1); 4578 } 4579 4580 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 4581 return ret; 4582 } 4583 4584 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4585 struct amdgpu_reset_context *reset_context) 4586 { 4587 int i, r = 0; 4588 struct amdgpu_job *job = NULL; 4589 bool need_full_reset = 4590 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4591 4592 if (reset_context->reset_req_dev == adev) 4593 job = reset_context->job; 4594 4595 if (amdgpu_sriov_vf(adev)) { 4596 /* stop the data exchange thread */ 4597 amdgpu_virt_fini_data_exchange(adev); 4598 } 4599 4600 /* block all schedulers and reset given job's ring */ 4601 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4602 struct amdgpu_ring *ring = adev->rings[i]; 4603 
4604 if (!ring || !ring->sched.thread) 4605 continue; 4606 4607 /*clear job fence from fence drv to avoid force_completion 4608 *leave NULL and vm flush fence in fence drv */ 4609 amdgpu_fence_driver_clear_job_fences(ring); 4610 4611 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 4612 amdgpu_fence_driver_force_completion(ring); 4613 } 4614 4615 if (job && job->vm) 4616 drm_sched_increase_karma(&job->base); 4617 4618 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4619 /* If reset handler not implemented, continue; otherwise return */ 4620 if (r == -ENOSYS) 4621 r = 0; 4622 else 4623 return r; 4624 4625 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4626 if (!amdgpu_sriov_vf(adev)) { 4627 4628 if (!need_full_reset) 4629 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 4630 4631 if (!need_full_reset) { 4632 amdgpu_device_ip_pre_soft_reset(adev); 4633 r = amdgpu_device_ip_soft_reset(adev); 4634 amdgpu_device_ip_post_soft_reset(adev); 4635 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 4636 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 4637 need_full_reset = true; 4638 } 4639 } 4640 4641 if (need_full_reset) 4642 r = amdgpu_device_ip_suspend(adev); 4643 if (need_full_reset) 4644 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4645 else 4646 clear_bit(AMDGPU_NEED_FULL_RESET, 4647 &reset_context->flags); 4648 } 4649 4650 return r; 4651 } 4652 4653 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 4654 struct amdgpu_reset_context *reset_context) 4655 { 4656 struct amdgpu_device *tmp_adev = NULL; 4657 bool need_full_reset, skip_hw_reset, vram_lost = false; 4658 int r = 0; 4659 4660 /* Try reset handler method first */ 4661 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 4662 reset_list); 4663 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 4664 /* If reset handler not implemented, continue; otherwise return */ 4665 if (r == -ENOSYS) 4666 r = 0; 4667 else 4668 return r; 4669 4670 /* Reset handler not implemented, use the default method */ 4671 need_full_reset = 4672 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4673 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 4674 4675 /* 4676 * ASIC reset has to be done on all XGMI hive nodes ASAP 4677 * to allow proper links negotiation in FW (within 1 sec) 4678 */ 4679 if (!skip_hw_reset && need_full_reset) { 4680 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4681 /* For XGMI run all resets in parallel to speed up the process */ 4682 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4683 tmp_adev->gmc.xgmi.pending_reset = false; 4684 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 4685 r = -EALREADY; 4686 } else 4687 r = amdgpu_asic_reset(tmp_adev); 4688 4689 if (r) { 4690 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 4691 r, adev_to_drm(tmp_adev)->unique); 4692 break; 4693 } 4694 } 4695 4696 /* For XGMI wait for all resets to complete before proceed */ 4697 if (!r) { 4698 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4699 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 4700 flush_work(&tmp_adev->xgmi_reset_work); 4701 r = tmp_adev->asic_reset_res; 4702 if (r) 4703 break; 4704 } 4705 } 4706 } 4707 } 4708 4709 if (!r && amdgpu_ras_intr_triggered()) { 4710 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4711 if (tmp_adev->mmhub.ras && 
tmp_adev->mmhub.ras->ras_block.hw_ops && 4712 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) 4713 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev); 4714 } 4715 4716 amdgpu_ras_intr_cleared(); 4717 } 4718 4719 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4720 if (need_full_reset) { 4721 /* post card */ 4722 r = amdgpu_device_asic_init(tmp_adev); 4723 if (r) { 4724 dev_warn(tmp_adev->dev, "asic atom init failed!"); 4725 } else { 4726 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 4727 r = amdgpu_amdkfd_resume_iommu(tmp_adev); 4728 if (r) 4729 goto out; 4730 4731 r = amdgpu_device_ip_resume_phase1(tmp_adev); 4732 if (r) 4733 goto out; 4734 4735 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 4736 if (vram_lost) { 4737 DRM_INFO("VRAM is lost due to GPU reset!\n"); 4738 amdgpu_inc_vram_lost(tmp_adev); 4739 } 4740 4741 r = amdgpu_device_fw_loading(tmp_adev); 4742 if (r) 4743 return r; 4744 4745 r = amdgpu_device_ip_resume_phase2(tmp_adev); 4746 if (r) 4747 goto out; 4748 4749 if (vram_lost) 4750 amdgpu_device_fill_reset_magic(tmp_adev); 4751 4752 /* 4753 * Add this ASIC back as tracked since its reset has 4754 * already completed successfully. 4755 */ 4756 amdgpu_register_gpu_instance(tmp_adev); 4757 4758 if (!reset_context->hive && 4759 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4760 amdgpu_xgmi_add_device(tmp_adev); 4761 4762 r = amdgpu_device_ip_late_init(tmp_adev); 4763 if (r) 4764 goto out; 4765 4766 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 4767 4768 /* 4769 * The GPU enters a bad state once the number of 4770 * faulty pages retired by ECC has reached the 4771 * threshold, and RAS recovery is scheduled next. 4772 * So add a check here to abort recovery if the 4773 * bad page threshold has indeed been exceeded, 4774 * and remind the user to retire this GPU or set 4775 * a larger bad_page_threshold value to fix this 4776 * before probing the driver again. 4777 */ 4778 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 4779 /* must succeed.
*/ 4780 amdgpu_ras_resume(tmp_adev); 4781 } else { 4782 r = -EINVAL; 4783 goto out; 4784 } 4785 4786 /* Update PSP FW topology after reset */ 4787 if (reset_context->hive && 4788 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4789 r = amdgpu_xgmi_update_topology( 4790 reset_context->hive, tmp_adev); 4791 } 4792 } 4793 4794 out: 4795 if (!r) { 4796 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 4797 r = amdgpu_ib_ring_tests(tmp_adev); 4798 if (r) { 4799 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 4800 need_full_reset = true; 4801 r = -EAGAIN; 4802 goto end; 4803 } 4804 } 4805 4806 if (!r) 4807 r = amdgpu_device_recover_vram(tmp_adev); 4808 else 4809 tmp_adev->asic_reset_res = r; 4810 } 4811 4812 end: 4813 if (need_full_reset) 4814 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4815 else 4816 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4817 return r; 4818 } 4819 4820 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, 4821 struct amdgpu_hive_info *hive) 4822 { 4823 if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0) 4824 return false; 4825 4826 if (hive) { 4827 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock); 4828 } else { 4829 down_write(&adev->reset_sem); 4830 } 4831 4832 switch (amdgpu_asic_reset_method(adev)) { 4833 case AMD_RESET_METHOD_MODE1: 4834 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 4835 break; 4836 case AMD_RESET_METHOD_MODE2: 4837 adev->mp1_state = PP_MP1_STATE_RESET; 4838 break; 4839 default: 4840 adev->mp1_state = PP_MP1_STATE_NONE; 4841 break; 4842 } 4843 4844 return true; 4845 } 4846 4847 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) 4848 { 4849 amdgpu_vf_error_trans_all(adev); 4850 adev->mp1_state = PP_MP1_STATE_NONE; 4851 atomic_set(&adev->in_gpu_reset, 0); 4852 up_write(&adev->reset_sem); 4853 } 4854 4855 /* 4856 * to lockup a list of amdgpu devices in a hive safely, if not a hive 4857 * with multiple nodes, it will be similar as amdgpu_device_lock_adev. 4858 * 4859 * unlock won't require roll back. 4860 */ 4861 static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive) 4862 { 4863 struct amdgpu_device *tmp_adev = NULL; 4864 4865 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 4866 if (!hive) { 4867 dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes"); 4868 return -ENODEV; 4869 } 4870 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 4871 if (!amdgpu_device_lock_adev(tmp_adev, hive)) 4872 goto roll_back; 4873 } 4874 } else if (!amdgpu_device_lock_adev(adev, hive)) 4875 return -EAGAIN; 4876 4877 return 0; 4878 roll_back: 4879 if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) { 4880 /* 4881 * if the lockup iteration break in the middle of a hive, 4882 * it may means there may has a race issue, 4883 * or a hive device locked up independently. 4884 * we may be in trouble and may not, so will try to roll back 4885 * the lock and give out a warnning. 4886 */ 4887 dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. 
Rolling back to unlock"); 4888 list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) { 4889 amdgpu_device_unlock_adev(tmp_adev); 4890 } 4891 } 4892 return -EAGAIN; 4893 } 4894 4895 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 4896 { 4897 struct pci_dev *p = NULL; 4898 4899 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 4900 adev->pdev->bus->number, 1); 4901 if (p) { 4902 pm_runtime_enable(&(p->dev)); 4903 pm_runtime_resume(&(p->dev)); 4904 } 4905 } 4906 4907 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 4908 { 4909 enum amd_reset_method reset_method; 4910 struct pci_dev *p = NULL; 4911 u64 expires; 4912 4913 /* 4914 * For now, only BACO and mode1 reset are confirmed to 4915 * suffer from the audio issue if not properly suspended. 4916 */ 4917 reset_method = amdgpu_asic_reset_method(adev); 4918 if ((reset_method != AMD_RESET_METHOD_BACO) && 4919 (reset_method != AMD_RESET_METHOD_MODE1)) 4920 return -EINVAL; 4921 4922 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 4923 adev->pdev->bus->number, 1); 4924 if (!p) 4925 return -ENODEV; 4926 4927 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 4928 if (!expires) 4929 /* 4930 * If we cannot get the audio device autosuspend delay, 4931 * a fixed 4s interval will be used. Since 3s is the 4932 * audio controller's default autosuspend delay setting, 4933 * the 4s used here is guaranteed to cover that. 4934 */ 4935 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 4936 4937 while (!pm_runtime_status_suspended(&(p->dev))) { 4938 if (!pm_runtime_suspend(&(p->dev))) 4939 break; 4940 4941 if (expires < ktime_get_mono_fast_ns()) { 4942 dev_warn(adev->dev, "failed to suspend display audio\n"); 4943 /* TODO: abort the succeeding gpu reset? */ 4944 return -ETIMEDOUT; 4945 } 4946 } 4947 4948 pm_runtime_disable(&(p->dev)); 4949 4950 return 0; 4951 } 4952 4953 static void amdgpu_device_recheck_guilty_jobs( 4954 struct amdgpu_device *adev, struct list_head *device_list_handle, 4955 struct amdgpu_reset_context *reset_context) 4956 { 4957 int i, r = 0; 4958 4959 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4960 struct amdgpu_ring *ring = adev->rings[i]; 4961 int ret = 0; 4962 struct drm_sched_job *s_job; 4963 4964 if (!ring || !ring->sched.thread) 4965 continue; 4966 4967 s_job = list_first_entry_or_null(&ring->sched.pending_list, 4968 struct drm_sched_job, list); 4969 if (s_job == NULL) 4970 continue; 4971 4972 /* clear the job's guilty flag and rely on the following step to decide the real one */ 4973 drm_sched_reset_karma(s_job); 4974 /* the real bad job will be resubmitted twice, so take an extra dma_fence_get 4975 * to keep the fence reference count balanced */ 4976 dma_fence_get(s_job->s_fence->parent); 4977 drm_sched_resubmit_jobs_ext(&ring->sched, 1); 4978 4979 ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout); 4980 if (ret == 0) { /* timeout */ 4981 DRM_ERROR("Found the real bad job!
ring:%s, job_id:%llx\n", 4982 ring->sched.name, s_job->id); 4983 4984 /* set guilty */ 4985 drm_sched_increase_karma(s_job); 4986 retry: 4987 /* do hw reset */ 4988 if (amdgpu_sriov_vf(adev)) { 4989 amdgpu_virt_fini_data_exchange(adev); 4990 r = amdgpu_device_reset_sriov(adev, false); 4991 if (r) 4992 adev->asic_reset_res = r; 4993 } else { 4994 clear_bit(AMDGPU_SKIP_HW_RESET, 4995 &reset_context->flags); 4996 r = amdgpu_do_asic_reset(device_list_handle, 4997 reset_context); 4998 if (r && r == -EAGAIN) 4999 goto retry; 5000 } 5001 5002 /* 5003 * add reset counter so that the following 5004 * resubmitted job could flush vmid 5005 */ 5006 atomic_inc(&adev->gpu_reset_counter); 5007 continue; 5008 } 5009 5010 /* got the hw fence, signal finished fence */ 5011 atomic_dec(ring->sched.score); 5012 dma_fence_put(s_job->s_fence->parent); 5013 dma_fence_get(&s_job->s_fence->finished); 5014 dma_fence_signal(&s_job->s_fence->finished); 5015 dma_fence_put(&s_job->s_fence->finished); 5016 5017 /* remove node from list and free the job */ 5018 spin_lock(&ring->sched.job_list_lock); 5019 list_del_init(&s_job->list); 5020 spin_unlock(&ring->sched.job_list_lock); 5021 ring->sched.ops->free_job(s_job); 5022 } 5023 } 5024 5025 /** 5026 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5027 * 5028 * @adev: amdgpu_device pointer 5029 * @job: which job trigger hang 5030 * 5031 * Attempt to reset the GPU if it has hung (all asics). 5032 * Attempt to do soft-reset or full-reset and reinitialize Asic 5033 * Returns 0 for success or an error on failure. 5034 */ 5035 5036 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5037 struct amdgpu_job *job) 5038 { 5039 struct list_head device_list, *device_list_handle = NULL; 5040 bool job_signaled = false; 5041 struct amdgpu_hive_info *hive = NULL; 5042 struct amdgpu_device *tmp_adev = NULL; 5043 int i, r = 0; 5044 bool need_emergency_restart = false; 5045 bool audio_suspended = false; 5046 int tmp_vram_lost_counter; 5047 struct amdgpu_reset_context reset_context; 5048 5049 memset(&reset_context, 0, sizeof(reset_context)); 5050 5051 /* 5052 * Special case: RAS triggered and full reset isn't supported 5053 */ 5054 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5055 5056 /* 5057 * Flush RAM to disk so that after reboot 5058 * the user can read log and see why the system rebooted. 5059 */ 5060 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { 5061 DRM_WARN("Emergency reboot."); 5062 5063 ksys_sync_helper(); 5064 emergency_restart(); 5065 } 5066 5067 dev_info(adev->dev, "GPU %s begin!\n", 5068 need_emergency_restart ? "jobs stop":"reset"); 5069 5070 /* 5071 * Here we trylock to avoid chain of resets executing from 5072 * either trigger by jobs on different adevs in XGMI hive or jobs on 5073 * different schedulers for same device while this TO handler is running. 5074 * We always reset all schedulers for device and all devices for XGMI 5075 * hive so that should take care of them too. 5076 */ 5077 if (!amdgpu_sriov_vf(adev)) 5078 hive = amdgpu_get_xgmi_hive(adev); 5079 if (hive) { 5080 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) { 5081 DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", 5082 job ? 
job->base.id : -1, hive->hive_id); 5083 amdgpu_put_xgmi_hive(hive); 5084 if (job && job->vm) 5085 drm_sched_increase_karma(&job->base); 5086 return 0; 5087 } 5088 mutex_lock(&hive->hive_lock); 5089 } 5090 5091 reset_context.method = AMD_RESET_METHOD_NONE; 5092 reset_context.reset_req_dev = adev; 5093 reset_context.job = job; 5094 reset_context.hive = hive; 5095 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 5096 5097 /* 5098 * lock the device before we try to operate the linked list 5099 * if didn't get the device lock, don't touch the linked list since 5100 * others may iterating it. 5101 */ 5102 r = amdgpu_device_lock_hive_adev(adev, hive); 5103 if (r) { 5104 dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress", 5105 job ? job->base.id : -1); 5106 5107 /* even we skipped this reset, still need to set the job to guilty */ 5108 if (job && job->vm) 5109 drm_sched_increase_karma(&job->base); 5110 goto skip_recovery; 5111 } 5112 5113 /* 5114 * Build list of devices to reset. 5115 * In case we are in XGMI hive mode, resort the device list 5116 * to put adev in the 1st position. 5117 */ 5118 INIT_LIST_HEAD(&device_list); 5119 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5120 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) 5121 list_add_tail(&tmp_adev->reset_list, &device_list); 5122 if (!list_is_first(&adev->reset_list, &device_list)) 5123 list_rotate_to_front(&adev->reset_list, &device_list); 5124 device_list_handle = &device_list; 5125 } else { 5126 list_add_tail(&adev->reset_list, &device_list); 5127 device_list_handle = &device_list; 5128 } 5129 5130 /* block all schedulers and reset given job's ring */ 5131 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5132 /* 5133 * Try to put the audio codec into suspend state 5134 * before gpu reset started. 5135 * 5136 * Due to the power domain of the graphics device 5137 * is shared with AZ power domain. Without this, 5138 * we may change the audio hardware from behind 5139 * the audio driver's back. That will trigger 5140 * some audio codec errors. 5141 */ 5142 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5143 audio_suspended = true; 5144 5145 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5146 5147 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5148 5149 if (!amdgpu_sriov_vf(tmp_adev)) 5150 amdgpu_amdkfd_pre_reset(tmp_adev); 5151 5152 /* 5153 * Mark these ASICs to be reseted as untracked first 5154 * And add them back after reset completed 5155 */ 5156 amdgpu_unregister_gpu_instance(tmp_adev); 5157 5158 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 5159 5160 /* disable ras on ALL IPs */ 5161 if (!need_emergency_restart && 5162 amdgpu_device_ip_need_full_reset(tmp_adev)) 5163 amdgpu_ras_suspend(tmp_adev); 5164 5165 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5166 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5167 5168 if (!ring || !ring->sched.thread) 5169 continue; 5170 5171 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5172 5173 if (need_emergency_restart) 5174 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5175 } 5176 atomic_inc(&tmp_adev->gpu_reset_counter); 5177 } 5178 5179 if (need_emergency_restart) 5180 goto skip_sched_resume; 5181 5182 /* 5183 * Must check guilty signal here since after this point all old 5184 * HW fences are force signaled. 
5185 * 5186 * job->base holds a reference to the parent fence 5187 */ 5188 if (job && job->base.s_fence->parent && 5189 dma_fence_is_signaled(job->base.s_fence->parent)) { 5190 job_signaled = true; 5191 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5192 goto skip_hw_reset; 5193 } 5194 5195 retry: /* Pre-ASIC-reset for the rest of the adevs in the XGMI hive. */ 5196 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5197 r = amdgpu_device_pre_asic_reset(tmp_adev, &reset_context); 5198 /* TODO: Should we stop? */ 5199 if (r) { 5200 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5201 r, adev_to_drm(tmp_adev)->unique); 5202 tmp_adev->asic_reset_res = r; 5203 } 5204 } 5205 5206 tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter)); 5207 /* Actual ASIC resets if needed. */ 5208 /* Host driver will handle XGMI hive reset for SRIOV */ 5209 if (amdgpu_sriov_vf(adev)) { 5210 r = amdgpu_device_reset_sriov(adev, job ? false : true); 5211 if (r) 5212 adev->asic_reset_res = r; 5213 } else { 5214 r = amdgpu_do_asic_reset(device_list_handle, &reset_context); 5215 if (r && r == -EAGAIN) 5216 goto retry; 5217 } 5218 5219 skip_hw_reset: 5220 5221 /* Post ASIC reset for all devs. */ 5222 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5223 5224 /* 5225 * Sometimes a later bad compute job can block a good gfx job because 5226 * the gfx and compute rings share internal GC hardware. Add an extra 5227 * guilty-job recheck step to find the real guilty job: it synchronously 5228 * resubmits and waits for the first job to be signaled. If that wait 5229 * times out, the job is identified as the real guilty one. 5230 */ 5231 if (amdgpu_gpu_recovery == 2 && 5232 !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter))) 5233 amdgpu_device_recheck_guilty_jobs( 5234 tmp_adev, device_list_handle, &reset_context); 5235 5236 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5237 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5238 5239 if (!ring || !ring->sched.thread) 5240 continue; 5241 5242 /* No point in resubmitting jobs if we didn't HW reset */ 5243 if (!tmp_adev->asic_reset_res && !job_signaled) 5244 drm_sched_resubmit_jobs(&ring->sched); 5245 5246 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res); 5247 } 5248 5249 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) { 5250 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5251 } 5252 5253 if (tmp_adev->asic_reset_res) 5254 r = tmp_adev->asic_reset_res; 5255 5256 tmp_adev->asic_reset_res = 0; 5257 5258 if (r) { 5259 /* bad news, how do we tell it to userspace ?
*/ 5260 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5261 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5262 } else { 5263 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5264 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5265 DRM_WARN("smart shift update failed\n"); 5266 } 5267 } 5268 5269 skip_sched_resume: 5270 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5271 /* unlock kfd: SRIOV would do it separately */ 5272 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5273 amdgpu_amdkfd_post_reset(tmp_adev); 5274 5275 /* kfd_post_reset will do nothing if kfd device is not initialized, 5276 * need to bring up kfd here if it's not be initialized before 5277 */ 5278 if (!adev->kfd.init_complete) 5279 amdgpu_amdkfd_device_init(adev); 5280 5281 if (audio_suspended) 5282 amdgpu_device_resume_display_audio(tmp_adev); 5283 amdgpu_device_unlock_adev(tmp_adev); 5284 } 5285 5286 skip_recovery: 5287 if (hive) { 5288 atomic_set(&hive->in_reset, 0); 5289 mutex_unlock(&hive->hive_lock); 5290 amdgpu_put_xgmi_hive(hive); 5291 } 5292 5293 if (r && r != -EAGAIN) 5294 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5295 return r; 5296 } 5297 5298 /** 5299 * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot 5300 * 5301 * @adev: amdgpu_device pointer 5302 * 5303 * Fetchs and stores in the driver the PCIE capabilities (gen speed 5304 * and lanes) of the slot the device is in. Handles APUs and 5305 * virtualized environments where PCIE config space may not be available. 5306 */ 5307 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5308 { 5309 struct pci_dev *pdev; 5310 enum pci_bus_speed speed_cap, platform_speed_cap; 5311 enum pcie_link_width platform_link_width; 5312 5313 if (amdgpu_pcie_gen_cap) 5314 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5315 5316 if (amdgpu_pcie_lane_cap) 5317 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5318 5319 /* covers APUs as well */ 5320 if (pci_is_root_bus(adev->pdev->bus)) { 5321 if (adev->pm.pcie_gen_mask == 0) 5322 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5323 if (adev->pm.pcie_mlw_mask == 0) 5324 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5325 return; 5326 } 5327 5328 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5329 return; 5330 5331 pcie_bandwidth_available(adev->pdev, NULL, 5332 &platform_speed_cap, &platform_link_width); 5333 5334 if (adev->pm.pcie_gen_mask == 0) { 5335 /* asic caps */ 5336 pdev = adev->pdev; 5337 speed_cap = pcie_get_speed_cap(pdev); 5338 if (speed_cap == PCI_SPEED_UNKNOWN) { 5339 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5340 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5341 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5342 } else { 5343 if (speed_cap == PCIE_SPEED_32_0GT) 5344 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5345 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5346 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5347 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5348 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5349 else if (speed_cap == PCIE_SPEED_16_0GT) 5350 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5351 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5352 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5353 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5354 else if (speed_cap == PCIE_SPEED_8_0GT) 5355 adev->pm.pcie_gen_mask |= 
(CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5356 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5357 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5358 else if (speed_cap == PCIE_SPEED_5_0GT) 5359 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5360 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5361 else 5362 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5363 } 5364 /* platform caps */ 5365 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5366 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5367 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5368 } else { 5369 if (platform_speed_cap == PCIE_SPEED_32_0GT) 5370 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5371 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5372 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5373 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5374 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 5375 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 5376 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5377 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5378 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5379 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 5380 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 5381 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5382 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5383 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 5384 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 5385 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5386 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 5387 else 5388 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 5389 5390 } 5391 } 5392 if (adev->pm.pcie_mlw_mask == 0) { 5393 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 5394 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 5395 } else { 5396 switch (platform_link_width) { 5397 case PCIE_LNK_X32: 5398 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 5399 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5400 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5401 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5402 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5403 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5404 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5405 break; 5406 case PCIE_LNK_X16: 5407 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 5408 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5409 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5410 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5411 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5412 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5413 break; 5414 case PCIE_LNK_X12: 5415 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 5416 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5417 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5418 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5419 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5420 break; 5421 case PCIE_LNK_X8: 5422 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 5423 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5424 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5425 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5426 break; 5427 case PCIE_LNK_X4: 5428 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 5429 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5430 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5431 break; 5432 case PCIE_LNK_X2: 5433 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 5434 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 5435 break; 5436 case PCIE_LNK_X1: 5437 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 5438 break; 5439 default: 5440 break; 5441 } 5442 } 5443 } 5444 } 5445 5446 int amdgpu_device_baco_enter(struct drm_device *dev) 5447 { 5448 struct amdgpu_device *adev = drm_to_adev(dev); 5449 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5450 
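/* BACO (Bus Active, Chip Off) entry is only attempted on boards that advertise support for it. */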
5451 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5452 return -ENOTSUPP; 5453 5454 if (ras && adev->ras_enabled && 5455 adev->nbio.funcs->enable_doorbell_interrupt) 5456 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 5457 5458 return amdgpu_dpm_baco_enter(adev); 5459 } 5460 5461 int amdgpu_device_baco_exit(struct drm_device *dev) 5462 { 5463 struct amdgpu_device *adev = drm_to_adev(dev); 5464 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 5465 int ret = 0; 5466 5467 if (!amdgpu_device_supports_baco(adev_to_drm(adev))) 5468 return -ENOTSUPP; 5469 5470 ret = amdgpu_dpm_baco_exit(adev); 5471 if (ret) 5472 return ret; 5473 5474 if (ras && adev->ras_enabled && 5475 adev->nbio.funcs->enable_doorbell_interrupt) 5476 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 5477 5478 if (amdgpu_passthrough(adev) && 5479 adev->nbio.funcs->clear_doorbell_interrupt) 5480 adev->nbio.funcs->clear_doorbell_interrupt(adev); 5481 5482 return 0; 5483 } 5484 5485 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev) 5486 { 5487 int i; 5488 5489 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5490 struct amdgpu_ring *ring = adev->rings[i]; 5491 5492 if (!ring || !ring->sched.thread) 5493 continue; 5494 5495 cancel_delayed_work_sync(&ring->sched.work_tdr); 5496 } 5497 } 5498 5499 /** 5500 * amdgpu_pci_error_detected - Called when a PCI error is detected. 5501 * @pdev: PCI device struct 5502 * @state: PCI channel state 5503 * 5504 * Description: Called when a PCI error is detected. 5505 * 5506 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 5507 */ 5508 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 5509 { 5510 struct drm_device *dev = pci_get_drvdata(pdev); 5511 struct amdgpu_device *adev = drm_to_adev(dev); 5512 int i; 5513 5514 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 5515 5516 if (adev->gmc.xgmi.num_physical_nodes > 1) { 5517 DRM_WARN("No support for XGMI hive yet..."); 5518 return PCI_ERS_RESULT_DISCONNECT; 5519 } 5520 5521 adev->pci_channel_state = state; 5522 5523 switch (state) { 5524 case pci_channel_io_normal: 5525 return PCI_ERS_RESULT_CAN_RECOVER; 5526 /* Fatal error, prepare for slot reset */ 5527 case pci_channel_io_frozen: 5528 /* 5529 * Cancel and wait for all TDRs in progress if failing to 5530 * set adev->in_gpu_reset in amdgpu_device_lock_adev 5531 * 5532 * Locking adev->reset_sem will prevent any external access 5533 * to GPU during PCI error recovery 5534 */ 5535 while (!amdgpu_device_lock_adev(adev, NULL)) 5536 amdgpu_cancel_all_tdr(adev); 5537 5538 /* 5539 * Block any work scheduling as we do for regular GPU reset 5540 * for the duration of the recovery 5541 */ 5542 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5543 struct amdgpu_ring *ring = adev->rings[i]; 5544 5545 if (!ring || !ring->sched.thread) 5546 continue; 5547 5548 drm_sched_stop(&ring->sched, NULL); 5549 } 5550 atomic_inc(&adev->gpu_reset_counter); 5551 return PCI_ERS_RESULT_NEED_RESET; 5552 case pci_channel_io_perm_failure: 5553 /* Permanent error, prepare for device removal */ 5554 return PCI_ERS_RESULT_DISCONNECT; 5555 } 5556 5557 return PCI_ERS_RESULT_NEED_RESET; 5558 } 5559 5560 /** 5561 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 5562 * @pdev: pointer to PCI device 5563 */ 5564 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 5565 { 5566 5567 DRM_INFO("PCI error: mmio enabled callback!!\n"); 5568 5569 /* TODO - dump whatever for debugging purposes */ 5570 5571 /* This called only 
if amdgpu_pci_error_detected returns 5572 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 5573 * works, no need to reset slot. 5574 */ 5575 5576 return PCI_ERS_RESULT_RECOVERED; 5577 } 5578 5579 /** 5580 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 5581 * @pdev: PCI device struct 5582 * 5583 * Description: This routine is called by the pci error recovery 5584 * code after the PCI slot has been reset, just before we 5585 * should resume normal operations. 5586 */ 5587 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 5588 { 5589 struct drm_device *dev = pci_get_drvdata(pdev); 5590 struct amdgpu_device *adev = drm_to_adev(dev); 5591 int r, i; 5592 struct amdgpu_reset_context reset_context; 5593 u32 memsize; 5594 struct list_head device_list; 5595 5596 DRM_INFO("PCI error: slot reset callback!!\n"); 5597 5598 memset(&reset_context, 0, sizeof(reset_context)); 5599 5600 INIT_LIST_HEAD(&device_list); 5601 list_add_tail(&adev->reset_list, &device_list); 5602 5603 /* wait for asic to come out of reset */ 5604 msleep(500); 5605 5606 /* Restore PCI confspace */ 5607 amdgpu_device_load_pci_state(pdev); 5608 5609 /* confirm ASIC came out of reset */ 5610 for (i = 0; i < adev->usec_timeout; i++) { 5611 memsize = amdgpu_asic_get_config_memsize(adev); 5612 5613 if (memsize != 0xffffffff) 5614 break; 5615 udelay(1); 5616 } 5617 if (memsize == 0xffffffff) { 5618 r = -ETIME; 5619 goto out; 5620 } 5621 5622 reset_context.method = AMD_RESET_METHOD_NONE; 5623 reset_context.reset_req_dev = adev; 5624 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 5625 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 5626 5627 adev->no_hw_access = true; 5628 r = amdgpu_device_pre_asic_reset(adev, &reset_context); 5629 adev->no_hw_access = false; 5630 if (r) 5631 goto out; 5632 5633 r = amdgpu_do_asic_reset(&device_list, &reset_context); 5634 5635 out: 5636 if (!r) { 5637 if (amdgpu_device_cache_pci_state(adev->pdev)) 5638 pci_restore_state(adev->pdev); 5639 5640 DRM_INFO("PCIe error recovery succeeded\n"); 5641 } else { 5642 DRM_ERROR("PCIe error recovery failed, err:%d", r); 5643 amdgpu_device_unlock_adev(adev); 5644 } 5645 5646 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 5647 } 5648 5649 /** 5650 * amdgpu_pci_resume() - resume normal ops after PCI reset 5651 * @pdev: pointer to PCI device 5652 * 5653 * Called when the error recovery driver tells us that its 5654 * OK to resume normal operation. 

bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}
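
/*
 * The two helpers above form a cache/restore pair around the PCI core's
 * saved-state API: amdgpu_device_cache_pci_state() snapshots the config
 * space with pci_save_state()/pci_store_saved_state(), and
 * amdgpu_device_load_pci_state() later replays it with
 * pci_load_saved_state()/pci_restore_state(). Within this file the pair is
 * used by the PCI error recovery path, e.g. (illustrative excerpt):
 *
 *	amdgpu_device_load_pci_state(pdev);		// in amdgpu_pci_slot_reset()
 *	...
 *	if (amdgpu_device_cache_pci_state(adev->pdev))	// refresh the snapshot
 *		pci_restore_state(adev->pdev);
 */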

void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if (adev->flags & AMD_IS_APU)
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if (adev->flags & AMD_IS_APU)
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. It helps to maintain the error context when an error occurs.
 * Compared to a simple hang, the system will stay stable at least for SSH
 * access. Then it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc.),
 *    clears all CPU mappings to the device, disallows remappings through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in-flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
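
/*
 * amdgpu_device_pcie_port_rreg()/_wreg() go through an indirect index/data
 * register pair exposed by the NBIO block: the register offset is written to
 * the index register and read back (the read-back likely serves to flush the
 * posted index write), then the payload is read from or written to the data
 * register, all under pcie_idx_lock so the index/data sequence cannot be
 * interleaved.
 *
 * Illustrative read-modify-write through this window (SOME_FIELD_MASK is a
 * placeholder, not a real register field):
 *
 *	u32 v = amdgpu_device_pcie_port_rreg(adev, reg);
 *	v &= ~SOME_FIELD_MASK;
 *	amdgpu_device_pcie_port_wreg(adev, reg, v);
 */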