/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/pci-p2pdma.h>
#include <linux/apple-gmux.h>

#include <drm/drm_aperture.h>
#include <drm/drm_atomic_helper.h>
#include <drm/drm_crtc_helper.h>
#include <drm/drm_fb_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/device.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"
#include "amdgpu_virt.h"
#include "amdgpu_dev_coredump.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

#if IS_ENABLED(CONFIG_X86)
#include <asm/intel-family.h>
#endif

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
#define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
#define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
#define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)

static const
struct drm_driver amdgpu_kms_driver;

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"ALDEBARAN",
	"NAVI10",
	"CYAN_SKILLFISH",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"VANGOGH",
	"DIMGREY_CAVEFISH",
	"BEIGE_GOBY",
	"YELLOW_CARP",
	"IP DISCOVERY",
	"LAST",
};

static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return sysfs_emit(buf, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, 0444,
		amdgpu_device_get_pcie_replay_count, NULL);

static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
					  struct bin_attribute *attr, char *buf,
					  loff_t ppos, size_t count)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	ssize_t bytes_read;

	switch (ppos) {
	case AMDGPU_SYS_REG_STATE_XGMI:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_WAFL:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_PCIE:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR_1:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count);
		break;
	default:
		return -EINVAL;
	}

	return bytes_read;
}

BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL,
	 AMDGPU_SYS_REG_STATE_END);

int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev)
{
	int ret;

	if (!amdgpu_asic_get_reg_state_supported(adev))
		return 0;

	ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state);

	return ret;
}

void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
{
	if (!amdgpu_asic_get_reg_state_supported(adev))
		return;
	sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
}

/**
 * DOC: board_info
 *
 * The amdgpu driver provides a sysfs API for giving board related information.
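 * The board_info file is typically found in the device's sysfs directory
 * (assumed path, e.g. /sys/bus/pci/devices/<domain:bus:dev.fn>/board_info).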
 * It provides the form factor information in the format
 *
 *   type : form factor
 *
 * Possible form factor values
 *
 * - "cem"     - PCIE CEM card
 * - "oam"     - Open Compute Accelerator Module
 * - "unknown" - Not known
 *
 */

static ssize_t amdgpu_device_get_board_info(struct device *dev,
					    struct device_attribute *attr,
					    char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
	const char *pkg;

	if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
		pkg_type = adev->smuio.funcs->get_pkg_type(adev);

	switch (pkg_type) {
	case AMDGPU_PKG_TYPE_CEM:
		pkg = "cem";
		break;
	case AMDGPU_PKG_TYPE_OAM:
		pkg = "oam";
		break;
	default:
		pkg = "unknown";
		break;
	}

	return sysfs_emit(buf, "%s : %s\n", "type", pkg);
}

static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);

static struct attribute *amdgpu_board_attrs[] = {
	&dev_attr_board_info.attr,
	NULL,
};

static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
					     struct attribute *attr, int n)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	if (adev->flags & AMD_IS_APU)
		return 0;

	return attr->mode;
}

static const struct attribute_group amdgpu_board_attrs_group = {
	.attrs = amdgpu_board_attrs,
	.is_visible = amdgpu_board_attrs_is_visible
};

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);


/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise returns false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

/**
 * amdgpu_device_supports_smart_shift - Is the device dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
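 * Smart Shift requires both BOCO support and ACPI power shift control.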
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by the vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value means how many bytes have been transferred.
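 * Only the CPU-visible part of VRAM (up to adev->gmc.visible_vram_size) can be
 * reached through the aperture, so the transfer may be shorter than @size;
 * amdgpu_device_vram_access() falls back to MM_INDEX/MM_DATA for the rest.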
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try to use the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore; if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
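 * Registers that fall outside the MMIO BAR are read through the PCIE
 * index/data interface; under SR-IOV runtime the KIQ path may be used instead.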
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}


/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
				uint32_t reg, uint32_t acc_flags,
				uint32_t xcc_id)
{
	uint32_t ret, rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, xcc_id);
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	return ret;
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write to a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/**
 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 * @xcc_id: xcc accelerated compute core id
 *
 * This function is invoked only for debugfs register access.
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v,
			     uint32_t xcc_id)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Writes the value specified to the offset specified.
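 * Under SR-IOV the write may be routed through the RLCG interface (for VFs
 * without runtime access) or through the KIQ, depending on @acc_flags.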
 */
void amdgpu_device_xcc_wreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t v,
			    uint32_t acc_flags, uint32_t xcc_id)
{
	uint32_t rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, true,
							 &rlcg_flag)) {
			amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, xcc_id);
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u32 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
				    u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	if (unlikely(!adev->nbio.funcs)) {
		pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
		pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
	} else {
		pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
		pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	}

	if (reg_addr >> 32) {
		if (unlikely(!adev->nbio.funcs))
			pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
		else
			pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	} else {
		pcie_index_hi = 0;
	}

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
				      u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
			pcie_index_hi * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r |= ((u64)readl(pcie_data_offset) << 32);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
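	/* the read back above flushes the posted index write so the data
	 * access below hits the selected register (same pattern as the
	 * other indirect accessors in this file)
	 */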
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
				     u64 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
				       u64 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_rev_id - query device rev_id
 *
 * @adev: amdgpu_device pointer
 *
 * Return device rev_id
 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
{
	return adev->nbio.funcs->get_rev_id(adev);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	int ret;

	amdgpu_asic_pre_asic_init(adev);

	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
		amdgpu_psp_wait_for_bootloader(adev);
		ret = amdgpu_atomfirmware_asic_init(adev, true);
		return ret;
	} else {
		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
	}

	return 0;
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
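 * The page is placed in VRAM (with GTT as a fallback domain) and is mapped
 * for CPU access via adev->mem_scratch.ptr.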
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or an -error on failure.
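 * Each writeback slot is 256 bits wide; amdgpu_device_wb_get() hands out
 * slots as dword offsets into this buffer.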
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics).
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	wb >>= 3;
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
}

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
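 * The resize is skipped for SR-IOV VFs and when the BIOS has already exposed
 * a BAR that covers all of VRAM.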
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned int i;
	u16 cmd;
	int r;

	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
		return 0;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
	if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
		DRM_WARN("System can't access extended configuration space, please check!!\n");

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
		return false;

	return true;
}

/*
 * GPU helper functions.
 */
/**
 * amdgpu_device_need_post - check if the hw needs post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or if post is needed after a hw reset.
 * Returns true if post is needed or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (!amdgpu_device_read_bios(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
		 * some old smc fw still need driver do vPost otherwise gpu hang, while
		 * those smc fw version above 22.15 doesn't have this flaw, so we force
		 * vpost executed for smc version below 22.15
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if an error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			release_firmware(adev->pm.fw);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->gmc.xgmi.pending_reset)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/*
 * Check whether seamless boot is supported.
 *
 * So far we only support seamless boot on DCE 3.0 or later.
 * If users report that it works on older ASICS as well, we may
 * loosen this.
 */
bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
{
	switch (amdgpu_seamless) {
	case -1:
		break;
	case 1:
		return true;
	case 0:
		return false;
	default:
		DRM_ERROR("Invalid value for amdgpu.seamless: %d\n",
			  amdgpu_seamless);
		return false;
	}

	if (!(adev->flags & AMD_IS_APU))
		return false;

	if (adev->mman.keep_stolen_vga_memory)
		return false;

	return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
}

/*
 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
 * don't support dynamic speed switching. Until we have confirmation from Intel
 * that a specific host supports it, it's safer that we keep it disabled for all.
 *
 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
 */
static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	/* eGPU change speeds based on USB4 fabric conditions */
	if (dev_is_removable(adev->dev))
		return true;

	if (c->x86_vendor == X86_VENDOR_INTEL)
		return false;
#endif
	return true;
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
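 * ASPM is left disabled for APUs and when PCIe DPM is not enabled in
 * pp_feature.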
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
	switch (amdgpu_aspm) {
	case -1:
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	if (adev->flags & AMD_IS_APU)
		return false;
	if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK))
		return false;
	return pcie_aspm_enabled(adev->pdev);
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
		bool state)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));

	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines number of bits in page table versus page directory,
 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
 * page table and the remaining bits are in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory
	 */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}

/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
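 * For example, loading with amdgpu.vm_size=256 would request a 256 GB
 * per-VM address space (illustrative value, not a recommendation).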
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
{
	if (!(adev->flags & AMD_IS_APU) ||
	    adev->asic_type < CHIP_RAVEN)
		return 0;

	switch (adev->asic_type) {
	case CHIP_RAVEN:
		if (adev->pdev->device == 0x15dd)
			adev->apu_flags |= AMD_APU_IS_RAVEN;
		if (adev->pdev->device == 0x15d8)
			adev->apu_flags |= AMD_APU_IS_PICASSO;
		break;
	case CHIP_RENOIR:
		if ((adev->pdev->device == 0x1636) ||
		    (adev->pdev->device == 0x164c))
			adev->apu_flags |= AMD_APU_IS_RENOIR;
		else
			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
		break;
	case CHIP_VANGOGH:
		adev->apu_flags |= AMD_APU_IS_VANGOGH;
		break;
	case CHIP_YELLOW_CARP:
		break;
	case CHIP_CYAN_SKILLFISH:
		if ((adev->pdev->device == 0x13FE) ||
		    (adev->pdev->device == 0x143F))
			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
		break;
	default:
		break;
	}

	return 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
			 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	if (amdgpu_sched_hw_submission < 2) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = 2;
	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
	}

	if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
		dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
		amdgpu_reset_method = -1;
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	return 0;
}

/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver. Suspends or resumes
 * the asics before or after it is powered up using ACPI methods.
1898 */ 1899 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 1900 enum vga_switcheroo_state state) 1901 { 1902 struct drm_device *dev = pci_get_drvdata(pdev); 1903 int r; 1904 1905 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 1906 return; 1907 1908 if (state == VGA_SWITCHEROO_ON) { 1909 pr_info("switched on\n"); 1910 /* don't suspend or resume card normally */ 1911 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1912 1913 pci_set_power_state(pdev, PCI_D0); 1914 amdgpu_device_load_pci_state(pdev); 1915 r = pci_enable_device(pdev); 1916 if (r) 1917 DRM_WARN("pci_enable_device failed (%d)\n", r); 1918 amdgpu_device_resume(dev, true); 1919 1920 dev->switch_power_state = DRM_SWITCH_POWER_ON; 1921 } else { 1922 pr_info("switched off\n"); 1923 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 1924 amdgpu_device_prepare(dev); 1925 amdgpu_device_suspend(dev, true); 1926 amdgpu_device_cache_pci_state(pdev); 1927 /* Shut down the device */ 1928 pci_disable_device(pdev); 1929 pci_set_power_state(pdev, PCI_D3cold); 1930 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 1931 } 1932 } 1933 1934 /** 1935 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 1936 * 1937 * @pdev: pci dev pointer 1938 * 1939 * Callback for the switcheroo driver. Check of the switcheroo 1940 * state can be changed. 1941 * Returns true if the state can be changed, false if not. 1942 */ 1943 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 1944 { 1945 struct drm_device *dev = pci_get_drvdata(pdev); 1946 1947 /* 1948 * FIXME: open_count is protected by drm_global_mutex but that would lead to 1949 * locking inversion with the driver load path. And the access here is 1950 * completely racy anyway. So don't bother with locking for now. 1951 */ 1952 return atomic_read(&dev->open_count) == 0; 1953 } 1954 1955 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 1956 .set_gpu_state = amdgpu_switcheroo_set_state, 1957 .reprobe = NULL, 1958 .can_switch = amdgpu_switcheroo_can_switch, 1959 }; 1960 1961 /** 1962 * amdgpu_device_ip_set_clockgating_state - set the CG state 1963 * 1964 * @dev: amdgpu_device pointer 1965 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 1966 * @state: clockgating state (gate or ungate) 1967 * 1968 * Sets the requested clockgating state for all instances of 1969 * the hardware IP specified. 1970 * Returns the error code from the last instance. 1971 */ 1972 int amdgpu_device_ip_set_clockgating_state(void *dev, 1973 enum amd_ip_block_type block_type, 1974 enum amd_clockgating_state state) 1975 { 1976 struct amdgpu_device *adev = dev; 1977 int i, r = 0; 1978 1979 for (i = 0; i < adev->num_ip_blocks; i++) { 1980 if (!adev->ip_blocks[i].status.valid) 1981 continue; 1982 if (adev->ip_blocks[i].version->type != block_type) 1983 continue; 1984 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 1985 continue; 1986 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 1987 (void *)adev, state); 1988 if (r) 1989 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 1990 adev->ip_blocks[i].version->funcs->name, r); 1991 } 1992 return r; 1993 } 1994 1995 /** 1996 * amdgpu_device_ip_set_powergating_state - set the PG state 1997 * 1998 * @dev: amdgpu_device pointer 1999 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2000 * @state: powergating state (gate or ungate) 2001 * 2002 * Sets the requested powergating state for all instances of 2003 * the hardware IP specified. 
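 *
 * A minimal usage sketch (callers in this file pass the amdgpu_device
 * pointer as @dev):
 *
 *   r = amdgpu_device_ip_set_powergating_state(adev,
 *                                              AMD_IP_BLOCK_TYPE_GFX,
 *                                              AMD_PG_STATE_GATE);
 *   if (r)
 *           DRM_ERROR("gating GFX failed %d\n", r);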
2004 * Returns the error code from the last instance. 2005 */ 2006 int amdgpu_device_ip_set_powergating_state(void *dev, 2007 enum amd_ip_block_type block_type, 2008 enum amd_powergating_state state) 2009 { 2010 struct amdgpu_device *adev = dev; 2011 int i, r = 0; 2012 2013 for (i = 0; i < adev->num_ip_blocks; i++) { 2014 if (!adev->ip_blocks[i].status.valid) 2015 continue; 2016 if (adev->ip_blocks[i].version->type != block_type) 2017 continue; 2018 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 2019 continue; 2020 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 2021 (void *)adev, state); 2022 if (r) 2023 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 2024 adev->ip_blocks[i].version->funcs->name, r); 2025 } 2026 return r; 2027 } 2028 2029 /** 2030 * amdgpu_device_ip_get_clockgating_state - get the CG state 2031 * 2032 * @adev: amdgpu_device pointer 2033 * @flags: clockgating feature flags 2034 * 2035 * Walks the list of IPs on the device and updates the clockgating 2036 * flags for each IP. 2037 * Updates @flags with the feature flags for each hardware IP where 2038 * clockgating is enabled. 2039 */ 2040 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 2041 u64 *flags) 2042 { 2043 int i; 2044 2045 for (i = 0; i < adev->num_ip_blocks; i++) { 2046 if (!adev->ip_blocks[i].status.valid) 2047 continue; 2048 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 2049 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 2050 } 2051 } 2052 2053 /** 2054 * amdgpu_device_ip_wait_for_idle - wait for idle 2055 * 2056 * @adev: amdgpu_device pointer 2057 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2058 * 2059 * Waits for the request hardware IP to be idle. 2060 * Returns 0 for success or a negative error code on failure. 2061 */ 2062 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 2063 enum amd_ip_block_type block_type) 2064 { 2065 int i, r; 2066 2067 for (i = 0; i < adev->num_ip_blocks; i++) { 2068 if (!adev->ip_blocks[i].status.valid) 2069 continue; 2070 if (adev->ip_blocks[i].version->type == block_type) { 2071 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev); 2072 if (r) 2073 return r; 2074 break; 2075 } 2076 } 2077 return 0; 2078 2079 } 2080 2081 /** 2082 * amdgpu_device_ip_is_idle - is the hardware IP idle 2083 * 2084 * @adev: amdgpu_device pointer 2085 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2086 * 2087 * Check if the hardware IP is idle or not. 2088 * Returns true if it the IP is idle, false if not. 2089 */ 2090 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev, 2091 enum amd_ip_block_type block_type) 2092 { 2093 int i; 2094 2095 for (i = 0; i < adev->num_ip_blocks; i++) { 2096 if (!adev->ip_blocks[i].status.valid) 2097 continue; 2098 if (adev->ip_blocks[i].version->type == block_type) 2099 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev); 2100 } 2101 return true; 2102 2103 } 2104 2105 /** 2106 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 2107 * 2108 * @adev: amdgpu_device pointer 2109 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 2110 * 2111 * Returns a pointer to the hardware IP block structure 2112 * if it exists for the asic, otherwise NULL. 
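 *
 * A brief usage sketch:
 *
 *   struct amdgpu_ip_block *ip_block =
 *           amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *   if (ip_block)
 *           DRM_INFO("GFX IP v%d.%d\n", ip_block->version->major,
 *                    ip_block->version->minor);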
2113 */ 2114 struct amdgpu_ip_block * 2115 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 2116 enum amd_ip_block_type type) 2117 { 2118 int i; 2119 2120 for (i = 0; i < adev->num_ip_blocks; i++) 2121 if (adev->ip_blocks[i].version->type == type) 2122 return &adev->ip_blocks[i]; 2123 2124 return NULL; 2125 } 2126 2127 /** 2128 * amdgpu_device_ip_block_version_cmp 2129 * 2130 * @adev: amdgpu_device pointer 2131 * @type: enum amd_ip_block_type 2132 * @major: major version 2133 * @minor: minor version 2134 * 2135 * return 0 if equal or greater 2136 * return 1 if smaller or the ip_block doesn't exist 2137 */ 2138 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 2139 enum amd_ip_block_type type, 2140 u32 major, u32 minor) 2141 { 2142 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 2143 2144 if (ip_block && ((ip_block->version->major > major) || 2145 ((ip_block->version->major == major) && 2146 (ip_block->version->minor >= minor)))) 2147 return 0; 2148 2149 return 1; 2150 } 2151 2152 /** 2153 * amdgpu_device_ip_block_add 2154 * 2155 * @adev: amdgpu_device pointer 2156 * @ip_block_version: pointer to the IP to add 2157 * 2158 * Adds the IP block driver information to the collection of IPs 2159 * on the asic. 2160 */ 2161 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 2162 const struct amdgpu_ip_block_version *ip_block_version) 2163 { 2164 if (!ip_block_version) 2165 return -EINVAL; 2166 2167 switch (ip_block_version->type) { 2168 case AMD_IP_BLOCK_TYPE_VCN: 2169 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2170 return 0; 2171 break; 2172 case AMD_IP_BLOCK_TYPE_JPEG: 2173 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2174 return 0; 2175 break; 2176 default: 2177 break; 2178 } 2179 2180 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, 2181 ip_block_version->funcs->name); 2182 2183 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2184 2185 return 0; 2186 } 2187 2188 /** 2189 * amdgpu_device_enable_virtual_display - enable virtual display feature 2190 * 2191 * @adev: amdgpu_device pointer 2192 * 2193 * Enabled the virtual display feature if the user has enabled it via 2194 * the module parameter virtual_display. This feature provides a virtual 2195 * display hardware on headless boards or in virtualized environments. 2196 * This function parses and validates the configuration string specified by 2197 * the user and configues the virtual display configuration (number of 2198 * virtual connectors, crtcs, etc.) specified. 
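 *
 * The parameter is a semicolon-separated list of "<PCI address>,<CRTC
 * count>" entries, or the keyword "all". For example (hypothetical
 * addresses):
 *
 *   amdgpu.virtual_display=0000:04:00.0,2;0000:05:00.0,1
 *   amdgpu.virtual_display=all
 *
 * A missing CRTC count defaults to 1, and values outside 1..6 are clamped.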
2199 */ 2200 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2201 { 2202 adev->enable_virtual_display = false; 2203 2204 if (amdgpu_virtual_display) { 2205 const char *pci_address_name = pci_name(adev->pdev); 2206 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2207 2208 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2209 pciaddstr_tmp = pciaddstr; 2210 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2211 pciaddname = strsep(&pciaddname_tmp, ","); 2212 if (!strcmp("all", pciaddname) 2213 || !strcmp(pci_address_name, pciaddname)) { 2214 long num_crtc; 2215 int res = -1; 2216 2217 adev->enable_virtual_display = true; 2218 2219 if (pciaddname_tmp) 2220 res = kstrtol(pciaddname_tmp, 10, 2221 &num_crtc); 2222 2223 if (!res) { 2224 if (num_crtc < 1) 2225 num_crtc = 1; 2226 if (num_crtc > 6) 2227 num_crtc = 6; 2228 adev->mode_info.num_crtc = num_crtc; 2229 } else { 2230 adev->mode_info.num_crtc = 1; 2231 } 2232 break; 2233 } 2234 } 2235 2236 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2237 amdgpu_virtual_display, pci_address_name, 2238 adev->enable_virtual_display, adev->mode_info.num_crtc); 2239 2240 kfree(pciaddstr); 2241 } 2242 } 2243 2244 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2245 { 2246 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2247 adev->mode_info.num_crtc = 1; 2248 adev->enable_virtual_display = true; 2249 DRM_INFO("virtual_display:%d, num_crtc:%d\n", 2250 adev->enable_virtual_display, adev->mode_info.num_crtc); 2251 } 2252 } 2253 2254 /** 2255 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2256 * 2257 * @adev: amdgpu_device pointer 2258 * 2259 * Parses the asic configuration parameters specified in the gpu info 2260 * firmware and makes them availale to the driver for use in configuring 2261 * the asic. 2262 * Returns 0 on success, -EINVAL on failure. 
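 *
 * The firmware file is looked up as "amdgpu/<chip>_gpu_info.bin" (for
 * example "amdgpu/vega10_gpu_info.bin" or "amdgpu/navi12_gpu_info.bin")
 * and is only consulted when no IP discovery table is available.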
2263 */ 2264 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2265 { 2266 const char *chip_name; 2267 char fw_name[40]; 2268 int err; 2269 const struct gpu_info_firmware_header_v1_0 *hdr; 2270 2271 adev->firmware.gpu_info_fw = NULL; 2272 2273 if (adev->mman.discovery_bin) 2274 return 0; 2275 2276 switch (adev->asic_type) { 2277 default: 2278 return 0; 2279 case CHIP_VEGA10: 2280 chip_name = "vega10"; 2281 break; 2282 case CHIP_VEGA12: 2283 chip_name = "vega12"; 2284 break; 2285 case CHIP_RAVEN: 2286 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2287 chip_name = "raven2"; 2288 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2289 chip_name = "picasso"; 2290 else 2291 chip_name = "raven"; 2292 break; 2293 case CHIP_ARCTURUS: 2294 chip_name = "arcturus"; 2295 break; 2296 case CHIP_NAVI12: 2297 chip_name = "navi12"; 2298 break; 2299 } 2300 2301 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); 2302 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name); 2303 if (err) { 2304 dev_err(adev->dev, 2305 "Failed to get gpu_info firmware \"%s\"\n", 2306 fw_name); 2307 goto out; 2308 } 2309 2310 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2311 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2312 2313 switch (hdr->version_major) { 2314 case 1: 2315 { 2316 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2317 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2318 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2319 2320 /* 2321 * Should be droped when DAL no longer needs it. 2322 */ 2323 if (adev->asic_type == CHIP_NAVI12) 2324 goto parse_soc_bounding_box; 2325 2326 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2327 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2328 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2329 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2330 adev->gfx.config.max_texture_channel_caches = 2331 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2332 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2333 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2334 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2335 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2336 adev->gfx.config.double_offchip_lds_buf = 2337 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2338 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2339 adev->gfx.cu_info.max_waves_per_simd = 2340 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2341 adev->gfx.cu_info.max_scratch_slots_per_cu = 2342 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2343 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2344 if (hdr->version_minor >= 1) { 2345 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2346 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2347 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2348 adev->gfx.config.num_sc_per_sh = 2349 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2350 adev->gfx.config.num_packer_per_sc = 2351 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2352 } 2353 2354 parse_soc_bounding_box: 2355 /* 2356 * soc bounding box info is not integrated in disocovery table, 2357 * we always need to parse it from gpu info firmware if needed. 
2358 */ 2359 if (hdr->version_minor == 2) { 2360 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2361 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2362 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2363 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2364 } 2365 break; 2366 } 2367 default: 2368 dev_err(adev->dev, 2369 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2370 err = -EINVAL; 2371 goto out; 2372 } 2373 out: 2374 return err; 2375 } 2376 2377 /** 2378 * amdgpu_device_ip_early_init - run early init for hardware IPs 2379 * 2380 * @adev: amdgpu_device pointer 2381 * 2382 * Early initialization pass for hardware IPs. The hardware IPs that make 2383 * up each asic are discovered each IP's early_init callback is run. This 2384 * is the first stage in initializing the asic. 2385 * Returns 0 on success, negative error code on failure. 2386 */ 2387 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2388 { 2389 struct pci_dev *parent; 2390 int i, r; 2391 bool total; 2392 2393 amdgpu_device_enable_virtual_display(adev); 2394 2395 if (amdgpu_sriov_vf(adev)) { 2396 r = amdgpu_virt_request_full_gpu(adev, true); 2397 if (r) 2398 return r; 2399 } 2400 2401 switch (adev->asic_type) { 2402 #ifdef CONFIG_DRM_AMDGPU_SI 2403 case CHIP_VERDE: 2404 case CHIP_TAHITI: 2405 case CHIP_PITCAIRN: 2406 case CHIP_OLAND: 2407 case CHIP_HAINAN: 2408 adev->family = AMDGPU_FAMILY_SI; 2409 r = si_set_ip_blocks(adev); 2410 if (r) 2411 return r; 2412 break; 2413 #endif 2414 #ifdef CONFIG_DRM_AMDGPU_CIK 2415 case CHIP_BONAIRE: 2416 case CHIP_HAWAII: 2417 case CHIP_KAVERI: 2418 case CHIP_KABINI: 2419 case CHIP_MULLINS: 2420 if (adev->flags & AMD_IS_APU) 2421 adev->family = AMDGPU_FAMILY_KV; 2422 else 2423 adev->family = AMDGPU_FAMILY_CI; 2424 2425 r = cik_set_ip_blocks(adev); 2426 if (r) 2427 return r; 2428 break; 2429 #endif 2430 case CHIP_TOPAZ: 2431 case CHIP_TONGA: 2432 case CHIP_FIJI: 2433 case CHIP_POLARIS10: 2434 case CHIP_POLARIS11: 2435 case CHIP_POLARIS12: 2436 case CHIP_VEGAM: 2437 case CHIP_CARRIZO: 2438 case CHIP_STONEY: 2439 if (adev->flags & AMD_IS_APU) 2440 adev->family = AMDGPU_FAMILY_CZ; 2441 else 2442 adev->family = AMDGPU_FAMILY_VI; 2443 2444 r = vi_set_ip_blocks(adev); 2445 if (r) 2446 return r; 2447 break; 2448 default: 2449 r = amdgpu_discovery_set_ip_blocks(adev); 2450 if (r) 2451 return r; 2452 break; 2453 } 2454 2455 if (amdgpu_has_atpx() && 2456 (amdgpu_is_atpx_hybrid() || 2457 amdgpu_has_atpx_dgpu_power_cntl()) && 2458 ((adev->flags & AMD_IS_APU) == 0) && 2459 !dev_is_removable(&adev->pdev->dev)) 2460 adev->flags |= AMD_IS_PX; 2461 2462 if (!(adev->flags & AMD_IS_APU)) { 2463 parent = pcie_find_root_port(adev->pdev); 2464 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2465 } 2466 2467 2468 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2469 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2470 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2471 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2472 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2473 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2474 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2475 2476 total = true; 2477 for (i = 0; i < adev->num_ip_blocks; i++) { 2478 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2479 DRM_WARN("disabled ip block: %d <%s>\n", 2480 i, adev->ip_blocks[i].version->funcs->name); 2481 adev->ip_blocks[i].status.valid = false; 2482 } else { 2483 if (adev->ip_blocks[i].version->funcs->early_init) { 2484 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2485 if (r == -ENOENT) { 2486 adev->ip_blocks[i].status.valid = false; 2487 } else if (r) { 2488 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2489 adev->ip_blocks[i].version->funcs->name, r); 2490 total = false; 2491 } else { 2492 adev->ip_blocks[i].status.valid = true; 2493 } 2494 } else { 2495 adev->ip_blocks[i].status.valid = true; 2496 } 2497 } 2498 /* get the vbios after the asic_funcs are set up */ 2499 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2500 r = amdgpu_device_parse_gpu_info_fw(adev); 2501 if (r) 2502 return r; 2503 2504 /* Read BIOS */ 2505 if (amdgpu_device_read_bios(adev)) { 2506 if (!amdgpu_get_bios(adev)) 2507 return -EINVAL; 2508 2509 r = amdgpu_atombios_init(adev); 2510 if (r) { 2511 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2512 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2513 return r; 2514 } 2515 } 2516 2517 /*get pf2vf msg info at it's earliest time*/ 2518 if (amdgpu_sriov_vf(adev)) 2519 amdgpu_virt_init_data_exchange(adev); 2520 2521 } 2522 } 2523 if (!total) 2524 return -ENODEV; 2525 2526 amdgpu_amdkfd_device_probe(adev); 2527 adev->cg_flags &= amdgpu_cg_mask; 2528 adev->pg_flags &= amdgpu_pg_mask; 2529 2530 return 0; 2531 } 2532 2533 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2534 { 2535 int i, r; 2536 2537 for (i = 0; i < adev->num_ip_blocks; i++) { 2538 if (!adev->ip_blocks[i].status.sw) 2539 continue; 2540 if (adev->ip_blocks[i].status.hw) 2541 continue; 2542 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2543 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2544 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2545 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2546 if (r) { 2547 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2548 adev->ip_blocks[i].version->funcs->name, r); 2549 return r; 2550 } 2551 adev->ip_blocks[i].status.hw = true; 2552 } 2553 } 2554 2555 return 0; 2556 } 2557 2558 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2559 { 2560 int i, r; 2561 2562 for (i = 0; i < adev->num_ip_blocks; i++) { 2563 if (!adev->ip_blocks[i].status.sw) 2564 continue; 2565 if (adev->ip_blocks[i].status.hw) 2566 continue; 2567 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2568 if (r) { 2569 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2570 adev->ip_blocks[i].version->funcs->name, r); 2571 return r; 2572 } 2573 adev->ip_blocks[i].status.hw = true; 2574 } 2575 2576 return 0; 2577 } 2578 2579 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2580 { 2581 int r = 0; 2582 int i; 2583 uint32_t 
smu_version; 2584 2585 if (adev->asic_type >= CHIP_VEGA10) { 2586 for (i = 0; i < adev->num_ip_blocks; i++) { 2587 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2588 continue; 2589 2590 if (!adev->ip_blocks[i].status.sw) 2591 continue; 2592 2593 /* no need to do the fw loading again if already done*/ 2594 if (adev->ip_blocks[i].status.hw == true) 2595 break; 2596 2597 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2598 r = adev->ip_blocks[i].version->funcs->resume(adev); 2599 if (r) { 2600 DRM_ERROR("resume of IP block <%s> failed %d\n", 2601 adev->ip_blocks[i].version->funcs->name, r); 2602 return r; 2603 } 2604 } else { 2605 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2606 if (r) { 2607 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2608 adev->ip_blocks[i].version->funcs->name, r); 2609 return r; 2610 } 2611 } 2612 2613 adev->ip_blocks[i].status.hw = true; 2614 break; 2615 } 2616 } 2617 2618 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2619 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2620 2621 return r; 2622 } 2623 2624 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2625 { 2626 long timeout; 2627 int r, i; 2628 2629 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2630 struct amdgpu_ring *ring = adev->rings[i]; 2631 2632 /* No need to setup the GPU scheduler for rings that don't need it */ 2633 if (!ring || ring->no_scheduler) 2634 continue; 2635 2636 switch (ring->funcs->type) { 2637 case AMDGPU_RING_TYPE_GFX: 2638 timeout = adev->gfx_timeout; 2639 break; 2640 case AMDGPU_RING_TYPE_COMPUTE: 2641 timeout = adev->compute_timeout; 2642 break; 2643 case AMDGPU_RING_TYPE_SDMA: 2644 timeout = adev->sdma_timeout; 2645 break; 2646 default: 2647 timeout = adev->video_timeout; 2648 break; 2649 } 2650 2651 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL, 2652 DRM_SCHED_PRIORITY_COUNT, 2653 ring->num_hw_submission, 0, 2654 timeout, adev->reset_domain->wq, 2655 ring->sched_score, ring->name, 2656 adev->dev); 2657 if (r) { 2658 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2659 ring->name); 2660 return r; 2661 } 2662 r = amdgpu_uvd_entity_init(adev, ring); 2663 if (r) { 2664 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n", 2665 ring->name); 2666 return r; 2667 } 2668 r = amdgpu_vce_entity_init(adev, ring); 2669 if (r) { 2670 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n", 2671 ring->name); 2672 return r; 2673 } 2674 } 2675 2676 amdgpu_xcp_update_partition_sched_list(adev); 2677 2678 return 0; 2679 } 2680 2681 2682 /** 2683 * amdgpu_device_ip_init - run init for hardware IPs 2684 * 2685 * @adev: amdgpu_device pointer 2686 * 2687 * Main initialization pass for hardware IPs. The list of all the hardware 2688 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2689 * are run. sw_init initializes the software state associated with each IP 2690 * and hw_init initializes the hardware associated with each IP. 2691 * Returns 0 on success, negative error code on failure. 
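 *
 * In rough order this is: sw_init for every valid block (with COMMON and
 * GMC hardware brought up early so memory can be allocated), followed
 * schematically by:
 *
 *   r = amdgpu_device_ip_hw_init_phase1(adev);
 *   if (!r)
 *           r = amdgpu_device_fw_loading(adev);
 *   if (!r)
 *           r = amdgpu_device_ip_hw_init_phase2(adev);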
2692 */ 2693 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2694 { 2695 int i, r; 2696 2697 r = amdgpu_ras_init(adev); 2698 if (r) 2699 return r; 2700 2701 for (i = 0; i < adev->num_ip_blocks; i++) { 2702 if (!adev->ip_blocks[i].status.valid) 2703 continue; 2704 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2705 if (r) { 2706 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2707 adev->ip_blocks[i].version->funcs->name, r); 2708 goto init_failed; 2709 } 2710 adev->ip_blocks[i].status.sw = true; 2711 2712 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2713 /* need to do common hw init early so everything is set up for gmc */ 2714 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2715 if (r) { 2716 DRM_ERROR("hw_init %d failed %d\n", i, r); 2717 goto init_failed; 2718 } 2719 adev->ip_blocks[i].status.hw = true; 2720 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2721 /* need to do gmc hw init early so we can allocate gpu mem */ 2722 /* Try to reserve bad pages early */ 2723 if (amdgpu_sriov_vf(adev)) 2724 amdgpu_virt_exchange_data(adev); 2725 2726 r = amdgpu_device_mem_scratch_init(adev); 2727 if (r) { 2728 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2729 goto init_failed; 2730 } 2731 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2732 if (r) { 2733 DRM_ERROR("hw_init %d failed %d\n", i, r); 2734 goto init_failed; 2735 } 2736 r = amdgpu_device_wb_init(adev); 2737 if (r) { 2738 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2739 goto init_failed; 2740 } 2741 adev->ip_blocks[i].status.hw = true; 2742 2743 /* right after GMC hw init, we create CSA */ 2744 if (adev->gfx.mcbp) { 2745 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2746 AMDGPU_GEM_DOMAIN_VRAM | 2747 AMDGPU_GEM_DOMAIN_GTT, 2748 AMDGPU_CSA_SIZE); 2749 if (r) { 2750 DRM_ERROR("allocate CSA failed %d\n", r); 2751 goto init_failed; 2752 } 2753 } 2754 2755 r = amdgpu_seq64_init(adev); 2756 if (r) { 2757 DRM_ERROR("allocate seq64 failed %d\n", r); 2758 goto init_failed; 2759 } 2760 } 2761 } 2762 2763 if (amdgpu_sriov_vf(adev)) 2764 amdgpu_virt_init_data_exchange(adev); 2765 2766 r = amdgpu_ib_pool_init(adev); 2767 if (r) { 2768 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2769 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2770 goto init_failed; 2771 } 2772 2773 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2774 if (r) 2775 goto init_failed; 2776 2777 r = amdgpu_device_ip_hw_init_phase1(adev); 2778 if (r) 2779 goto init_failed; 2780 2781 r = amdgpu_device_fw_loading(adev); 2782 if (r) 2783 goto init_failed; 2784 2785 r = amdgpu_device_ip_hw_init_phase2(adev); 2786 if (r) 2787 goto init_failed; 2788 2789 /* 2790 * retired pages will be loaded from eeprom and reserved here, 2791 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2792 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2793 * for I2C communication which only true at this point. 2794 * 2795 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2796 * failure from bad gpu situation and stop amdgpu init process 2797 * accordingly. For other failed cases, it will still release all 2798 * the resource and print error message, rather than returning one 2799 * negative value to upper level. 
2800 * 2801 * Note: theoretically, this should be called before all vram allocations 2802 * to protect retired page from abusing 2803 */ 2804 r = amdgpu_ras_recovery_init(adev); 2805 if (r) 2806 goto init_failed; 2807 2808 /** 2809 * In case of XGMI grab extra reference for reset domain for this device 2810 */ 2811 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2812 if (amdgpu_xgmi_add_device(adev) == 0) { 2813 if (!amdgpu_sriov_vf(adev)) { 2814 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2815 2816 if (WARN_ON(!hive)) { 2817 r = -ENOENT; 2818 goto init_failed; 2819 } 2820 2821 if (!hive->reset_domain || 2822 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 2823 r = -ENOENT; 2824 amdgpu_put_xgmi_hive(hive); 2825 goto init_failed; 2826 } 2827 2828 /* Drop the early temporary reset domain we created for device */ 2829 amdgpu_reset_put_reset_domain(adev->reset_domain); 2830 adev->reset_domain = hive->reset_domain; 2831 amdgpu_put_xgmi_hive(hive); 2832 } 2833 } 2834 } 2835 2836 r = amdgpu_device_init_schedulers(adev); 2837 if (r) 2838 goto init_failed; 2839 2840 if (adev->mman.buffer_funcs_ring->sched.ready) 2841 amdgpu_ttm_set_buffer_funcs_status(adev, true); 2842 2843 /* Don't init kfd if whole hive need to be reset during init */ 2844 if (!adev->gmc.xgmi.pending_reset) { 2845 kgd2kfd_init_zone_device(adev); 2846 amdgpu_amdkfd_device_init(adev); 2847 } 2848 2849 amdgpu_fru_get_product_info(adev); 2850 2851 init_failed: 2852 2853 return r; 2854 } 2855 2856 /** 2857 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2858 * 2859 * @adev: amdgpu_device pointer 2860 * 2861 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2862 * this function before a GPU reset. If the value is retained after a 2863 * GPU reset, VRAM has not been lost. Some GPU resets may destry VRAM contents. 2864 */ 2865 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2866 { 2867 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2868 } 2869 2870 /** 2871 * amdgpu_device_check_vram_lost - check if vram is valid 2872 * 2873 * @adev: amdgpu_device pointer 2874 * 2875 * Checks the reset magic value written to the gart pointer in VRAM. 2876 * The driver calls this after a GPU reset to see if the contents of 2877 * VRAM is lost or now. 2878 * returns true if vram is lost, false if not. 2879 */ 2880 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2881 { 2882 if (memcmp(adev->gart.ptr, adev->reset_magic, 2883 AMDGPU_RESET_MAGIC_NUM)) 2884 return true; 2885 2886 if (!amdgpu_in_reset(adev)) 2887 return false; 2888 2889 /* 2890 * For all ASICs with baco/mode1 reset, the VRAM is 2891 * always assumed to be lost. 2892 */ 2893 switch (amdgpu_asic_reset_method(adev)) { 2894 case AMD_RESET_METHOD_BACO: 2895 case AMD_RESET_METHOD_MODE1: 2896 return true; 2897 default: 2898 return false; 2899 } 2900 } 2901 2902 /** 2903 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2904 * 2905 * @adev: amdgpu_device pointer 2906 * @state: clockgating state (gate or ungate) 2907 * 2908 * The list of all the hardware IPs that make up the asic is walked and the 2909 * set_clockgating_state callbacks are run. 2910 * Late initialization pass enabling clockgating for hardware IPs. 2911 * Fini or suspend, pass disabling clockgating for hardware IPs. 2912 * Returns 0 on success, negative error code on failure. 
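 *
 * Typical usage in this file pairs the two states, e.g.
 * amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE) during late init and
 * amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE) on fini/suspend.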
2913 */ 2914 2915 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 2916 enum amd_clockgating_state state) 2917 { 2918 int i, j, r; 2919 2920 if (amdgpu_emu_mode == 1) 2921 return 0; 2922 2923 for (j = 0; j < adev->num_ip_blocks; j++) { 2924 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2925 if (!adev->ip_blocks[i].status.late_initialized) 2926 continue; 2927 /* skip CG for GFX, SDMA on S0ix */ 2928 if (adev->in_s0ix && 2929 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2930 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2931 continue; 2932 /* skip CG for VCE/UVD, it's handled specially */ 2933 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2934 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2935 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2936 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2937 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 2938 /* enable clockgating to save power */ 2939 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 2940 state); 2941 if (r) { 2942 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 2943 adev->ip_blocks[i].version->funcs->name, r); 2944 return r; 2945 } 2946 } 2947 } 2948 2949 return 0; 2950 } 2951 2952 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 2953 enum amd_powergating_state state) 2954 { 2955 int i, j, r; 2956 2957 if (amdgpu_emu_mode == 1) 2958 return 0; 2959 2960 for (j = 0; j < adev->num_ip_blocks; j++) { 2961 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 2962 if (!adev->ip_blocks[i].status.late_initialized) 2963 continue; 2964 /* skip PG for GFX, SDMA on S0ix */ 2965 if (adev->in_s0ix && 2966 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 2967 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 2968 continue; 2969 /* skip CG for VCE/UVD, it's handled specially */ 2970 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 2971 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 2972 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 2973 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 2974 adev->ip_blocks[i].version->funcs->set_powergating_state) { 2975 /* enable powergating to save power */ 2976 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 2977 state); 2978 if (r) { 2979 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 2980 adev->ip_blocks[i].version->funcs->name, r); 2981 return r; 2982 } 2983 } 2984 } 2985 return 0; 2986 } 2987 2988 static int amdgpu_device_enable_mgpu_fan_boost(void) 2989 { 2990 struct amdgpu_gpu_instance *gpu_ins; 2991 struct amdgpu_device *adev; 2992 int i, ret = 0; 2993 2994 mutex_lock(&mgpu_info.mutex); 2995 2996 /* 2997 * MGPU fan boost feature should be enabled 2998 * only when there are two or more dGPUs in 2999 * the system 3000 */ 3001 if (mgpu_info.num_dgpu < 2) 3002 goto out; 3003 3004 for (i = 0; i < mgpu_info.num_dgpu; i++) { 3005 gpu_ins = &(mgpu_info.gpu_ins[i]); 3006 adev = gpu_ins->adev; 3007 if (!(adev->flags & AMD_IS_APU) && 3008 !gpu_ins->mgpu_fan_enabled) { 3009 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 3010 if (ret) 3011 break; 3012 3013 gpu_ins->mgpu_fan_enabled = 1; 3014 } 3015 } 3016 3017 out: 3018 mutex_unlock(&mgpu_info.mutex); 3019 3020 return ret; 3021 } 3022 3023 /** 3024 * amdgpu_device_ip_late_init - run late init for hardware IPs 3025 * 3026 * @adev: 
amdgpu_device pointer 3027 * 3028 * Late initialization pass for hardware IPs. The list of all the hardware 3029 * IPs that make up the asic is walked and the late_init callbacks are run. 3030 * late_init covers any special initialization that an IP requires 3031 * after all of the IPs have been initialized or something that needs to happen 3032 * late in the init process. 3033 * Returns 0 on success, negative error code on failure. 3034 */ 3035 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 3036 { 3037 struct amdgpu_gpu_instance *gpu_instance; 3038 int i = 0, r; 3039 3040 for (i = 0; i < adev->num_ip_blocks; i++) { 3041 if (!adev->ip_blocks[i].status.hw) 3042 continue; 3043 if (adev->ip_blocks[i].version->funcs->late_init) { 3044 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 3045 if (r) { 3046 DRM_ERROR("late_init of IP block <%s> failed %d\n", 3047 adev->ip_blocks[i].version->funcs->name, r); 3048 return r; 3049 } 3050 } 3051 adev->ip_blocks[i].status.late_initialized = true; 3052 } 3053 3054 r = amdgpu_ras_late_init(adev); 3055 if (r) { 3056 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 3057 return r; 3058 } 3059 3060 amdgpu_ras_set_error_query_ready(adev, true); 3061 3062 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3063 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3064 3065 amdgpu_device_fill_reset_magic(adev); 3066 3067 r = amdgpu_device_enable_mgpu_fan_boost(); 3068 if (r) 3069 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 3070 3071 /* For passthrough configuration on arcturus and aldebaran, enable special handling for SBR */ 3072 if (amdgpu_passthrough(adev) && 3073 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 3074 adev->asic_type == CHIP_ALDEBARAN)) 3075 amdgpu_dpm_handle_passthrough_sbr(adev, true); 3076 3077 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3078 mutex_lock(&mgpu_info.mutex); 3079 3080 /* 3081 * Reset device p-state to low as it was booted with high. 3082 * 3083 * This should be performed only after all devices from the same 3084 * hive get initialized. 3085 * 3086 * However, the number of devices in a hive is not known in advance, 3087 * as it is counted one by one while the devices initialize. 3088 * 3089 * So we wait until all XGMI interlinked devices are initialized. 3090 * This may bring some delays as those devices may come from 3091 * different hives. But that should be OK.
3092 */ 3093 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3094 for (i = 0; i < mgpu_info.num_gpu; i++) { 3095 gpu_instance = &(mgpu_info.gpu_ins[i]); 3096 if (gpu_instance->adev->flags & AMD_IS_APU) 3097 continue; 3098 3099 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3100 AMDGPU_XGMI_PSTATE_MIN); 3101 if (r) { 3102 DRM_ERROR("pstate setting failed (%d).\n", r); 3103 break; 3104 } 3105 } 3106 } 3107 3108 mutex_unlock(&mgpu_info.mutex); 3109 } 3110 3111 return 0; 3112 } 3113 3114 /** 3115 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3116 * 3117 * @adev: amdgpu_device pointer 3118 * 3119 * For ASICs need to disable SMC first 3120 */ 3121 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3122 { 3123 int i, r; 3124 3125 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3126 return; 3127 3128 for (i = 0; i < adev->num_ip_blocks; i++) { 3129 if (!adev->ip_blocks[i].status.hw) 3130 continue; 3131 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3132 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 3133 /* XXX handle errors */ 3134 if (r) { 3135 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3136 adev->ip_blocks[i].version->funcs->name, r); 3137 } 3138 adev->ip_blocks[i].status.hw = false; 3139 break; 3140 } 3141 } 3142 } 3143 3144 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3145 { 3146 int i, r; 3147 3148 for (i = 0; i < adev->num_ip_blocks; i++) { 3149 if (!adev->ip_blocks[i].version->funcs->early_fini) 3150 continue; 3151 3152 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 3153 if (r) { 3154 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 3155 adev->ip_blocks[i].version->funcs->name, r); 3156 } 3157 } 3158 3159 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3160 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3161 3162 amdgpu_amdkfd_suspend(adev, false); 3163 3164 /* Workaroud for ASICs need to disable SMC first */ 3165 amdgpu_device_smu_fini_early(adev); 3166 3167 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3168 if (!adev->ip_blocks[i].status.hw) 3169 continue; 3170 3171 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 3172 /* XXX handle errors */ 3173 if (r) { 3174 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3175 adev->ip_blocks[i].version->funcs->name, r); 3176 } 3177 3178 adev->ip_blocks[i].status.hw = false; 3179 } 3180 3181 if (amdgpu_sriov_vf(adev)) { 3182 if (amdgpu_virt_release_full_gpu(adev, false)) 3183 DRM_ERROR("failed to release exclusive mode on fini\n"); 3184 } 3185 3186 return 0; 3187 } 3188 3189 /** 3190 * amdgpu_device_ip_fini - run fini for hardware IPs 3191 * 3192 * @adev: amdgpu_device pointer 3193 * 3194 * Main teardown pass for hardware IPs. The list of all the hardware 3195 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3196 * are run. hw_fini tears down the hardware associated with each IP 3197 * and sw_fini tears down any software state associated with each IP. 3198 * Returns 0 on success, negative error code on failure. 
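 *
 * Teardown runs in the reverse order of initialization: the loops below
 * walk from adev->num_ip_blocks - 1 down to 0 so that blocks are torn down
 * before the blocks they depend on.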
3199 */ 3200 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3201 { 3202 int i, r; 3203 3204 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3205 amdgpu_virt_release_ras_err_handler_data(adev); 3206 3207 if (adev->gmc.xgmi.num_physical_nodes > 1) 3208 amdgpu_xgmi_remove_device(adev); 3209 3210 amdgpu_amdkfd_device_fini_sw(adev); 3211 3212 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3213 if (!adev->ip_blocks[i].status.sw) 3214 continue; 3215 3216 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3217 amdgpu_ucode_free_bo(adev); 3218 amdgpu_free_static_csa(&adev->virt.csa_obj); 3219 amdgpu_device_wb_fini(adev); 3220 amdgpu_device_mem_scratch_fini(adev); 3221 amdgpu_ib_pool_fini(adev); 3222 amdgpu_seq64_fini(adev); 3223 } 3224 3225 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 3226 /* XXX handle errors */ 3227 if (r) { 3228 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 3229 adev->ip_blocks[i].version->funcs->name, r); 3230 } 3231 adev->ip_blocks[i].status.sw = false; 3232 adev->ip_blocks[i].status.valid = false; 3233 } 3234 3235 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3236 if (!adev->ip_blocks[i].status.late_initialized) 3237 continue; 3238 if (adev->ip_blocks[i].version->funcs->late_fini) 3239 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 3240 adev->ip_blocks[i].status.late_initialized = false; 3241 } 3242 3243 amdgpu_ras_fini(adev); 3244 3245 return 0; 3246 } 3247 3248 /** 3249 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3250 * 3251 * @work: work_struct. 3252 */ 3253 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3254 { 3255 struct amdgpu_device *adev = 3256 container_of(work, struct amdgpu_device, delayed_init_work.work); 3257 int r; 3258 3259 r = amdgpu_ib_ring_tests(adev); 3260 if (r) 3261 DRM_ERROR("ib ring test failed (%d).\n", r); 3262 } 3263 3264 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3265 { 3266 struct amdgpu_device *adev = 3267 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3268 3269 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3270 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3271 3272 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 3273 adev->gfx.gfx_off_state = true; 3274 } 3275 3276 /** 3277 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3278 * 3279 * @adev: amdgpu_device pointer 3280 * 3281 * Main suspend function for hardware IPs. The list of all the hardware 3282 * IPs that make up the asic is walked, clockgating is disabled and the 3283 * suspend callbacks are run. suspend puts the hardware and software state 3284 * in each IP into a state suitable for suspend. 3285 * Returns 0 on success, negative error code on failure. 3286 */ 3287 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3288 { 3289 int i, r; 3290 3291 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3292 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3293 3294 /* 3295 * Per PMFW team's suggestion, driver needs to handle gfxoff 3296 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3297 * scenario. Add the missing df cstate disablement here. 
3298 */ 3299 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3300 dev_warn(adev->dev, "Failed to disallow df cstate"); 3301 3302 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3303 if (!adev->ip_blocks[i].status.valid) 3304 continue; 3305 3306 /* displays are handled separately */ 3307 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3308 continue; 3309 3310 /* XXX handle errors */ 3311 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3312 /* XXX handle errors */ 3313 if (r) { 3314 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3315 adev->ip_blocks[i].version->funcs->name, r); 3316 return r; 3317 } 3318 3319 adev->ip_blocks[i].status.hw = false; 3320 } 3321 3322 return 0; 3323 } 3324 3325 /** 3326 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3327 * 3328 * @adev: amdgpu_device pointer 3329 * 3330 * Main suspend function for hardware IPs. The list of all the hardware 3331 * IPs that make up the asic is walked, clockgating is disabled and the 3332 * suspend callbacks are run. suspend puts the hardware and software state 3333 * in each IP into a state suitable for suspend. 3334 * Returns 0 on success, negative error code on failure. 3335 */ 3336 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3337 { 3338 int i, r; 3339 3340 if (adev->in_s0ix) 3341 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3342 3343 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3344 if (!adev->ip_blocks[i].status.valid) 3345 continue; 3346 /* displays are handled in phase1 */ 3347 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3348 continue; 3349 /* PSP lost connection when err_event_athub occurs */ 3350 if (amdgpu_ras_intr_triggered() && 3351 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3352 adev->ip_blocks[i].status.hw = false; 3353 continue; 3354 } 3355 3356 /* skip unnecessary suspend if we do not initialize them yet */ 3357 if (adev->gmc.xgmi.pending_reset && 3358 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3359 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 3360 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3361 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 3362 adev->ip_blocks[i].status.hw = false; 3363 continue; 3364 } 3365 3366 /* skip suspend of gfx/mes and psp for S0ix 3367 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3368 * like at runtime. PSP is also part of the always on hardware 3369 * so no need to suspend it. 3370 */ 3371 if (adev->in_s0ix && 3372 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3373 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3374 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3375 continue; 3376 3377 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3378 if (adev->in_s0ix && 3379 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3380 IP_VERSION(5, 0, 0)) && 3381 (adev->ip_blocks[i].version->type == 3382 AMD_IP_BLOCK_TYPE_SDMA)) 3383 continue; 3384 3385 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3386 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3387 * from this location and RLC Autoload automatically also gets loaded 3388 * from here based on PMFW -> PSP message during re-init sequence. 3389 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3390 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3391 */ 3392 if (amdgpu_in_reset(adev) && 3393 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3394 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3395 continue; 3396 3397 /* XXX handle errors */ 3398 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3399 /* XXX handle errors */ 3400 if (r) { 3401 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3402 adev->ip_blocks[i].version->funcs->name, r); 3403 } 3404 adev->ip_blocks[i].status.hw = false; 3405 /* handle putting the SMC in the appropriate state */ 3406 if (!amdgpu_sriov_vf(adev)) { 3407 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3408 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3409 if (r) { 3410 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3411 adev->mp1_state, r); 3412 return r; 3413 } 3414 } 3415 } 3416 } 3417 3418 return 0; 3419 } 3420 3421 /** 3422 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3423 * 3424 * @adev: amdgpu_device pointer 3425 * 3426 * Main suspend function for hardware IPs. The list of all the hardware 3427 * IPs that make up the asic is walked, clockgating is disabled and the 3428 * suspend callbacks are run. suspend puts the hardware and software state 3429 * in each IP into a state suitable for suspend. 3430 * Returns 0 on success, negative error code on failure. 3431 */ 3432 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3433 { 3434 int r; 3435 3436 if (amdgpu_sriov_vf(adev)) { 3437 amdgpu_virt_fini_data_exchange(adev); 3438 amdgpu_virt_request_full_gpu(adev, false); 3439 } 3440 3441 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3442 3443 r = amdgpu_device_ip_suspend_phase1(adev); 3444 if (r) 3445 return r; 3446 r = amdgpu_device_ip_suspend_phase2(adev); 3447 3448 if (amdgpu_sriov_vf(adev)) 3449 amdgpu_virt_release_full_gpu(adev, false); 3450 3451 return r; 3452 } 3453 3454 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3455 { 3456 int i, r; 3457 3458 static enum amd_ip_block_type ip_order[] = { 3459 AMD_IP_BLOCK_TYPE_COMMON, 3460 AMD_IP_BLOCK_TYPE_GMC, 3461 AMD_IP_BLOCK_TYPE_PSP, 3462 AMD_IP_BLOCK_TYPE_IH, 3463 }; 3464 3465 for (i = 0; i < adev->num_ip_blocks; i++) { 3466 int j; 3467 struct amdgpu_ip_block *block; 3468 3469 block = &adev->ip_blocks[i]; 3470 block->status.hw = false; 3471 3472 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3473 3474 if (block->version->type != ip_order[j] || 3475 !block->status.valid) 3476 continue; 3477 3478 r = block->version->funcs->hw_init(adev); 3479 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3480 if (r) 3481 return r; 3482 block->status.hw = true; 3483 } 3484 } 3485 3486 return 0; 3487 } 3488 3489 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3490 { 3491 int i, r; 3492 3493 static enum amd_ip_block_type ip_order[] = { 3494 AMD_IP_BLOCK_TYPE_SMC, 3495 AMD_IP_BLOCK_TYPE_DCE, 3496 AMD_IP_BLOCK_TYPE_GFX, 3497 AMD_IP_BLOCK_TYPE_SDMA, 3498 AMD_IP_BLOCK_TYPE_MES, 3499 AMD_IP_BLOCK_TYPE_UVD, 3500 AMD_IP_BLOCK_TYPE_VCE, 3501 AMD_IP_BLOCK_TYPE_VCN, 3502 AMD_IP_BLOCK_TYPE_JPEG 3503 }; 3504 3505 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3506 int j; 3507 struct amdgpu_ip_block *block; 3508 3509 for (j = 0; j < adev->num_ip_blocks; j++) { 3510 block = &adev->ip_blocks[j]; 3511 3512 if (block->version->type != ip_order[i] || 3513 !block->status.valid || 3514 block->status.hw) 3515 continue; 3516 3517 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3518 r = block->version->funcs->resume(adev); 3519 else 
3520 r = block->version->funcs->hw_init(adev); 3521 3522 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3523 if (r) 3524 return r; 3525 block->status.hw = true; 3526 } 3527 } 3528 3529 return 0; 3530 } 3531 3532 /** 3533 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3534 * 3535 * @adev: amdgpu_device pointer 3536 * 3537 * First resume function for hardware IPs. The list of all the hardware 3538 * IPs that make up the asic is walked and the resume callbacks are run for 3539 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3540 * after a suspend and updates the software state as necessary. This 3541 * function is also used for restoring the GPU after a GPU reset. 3542 * Returns 0 on success, negative error code on failure. 3543 */ 3544 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3545 { 3546 int i, r; 3547 3548 for (i = 0; i < adev->num_ip_blocks; i++) { 3549 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3550 continue; 3551 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3552 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3553 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3554 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3555 3556 r = adev->ip_blocks[i].version->funcs->resume(adev); 3557 if (r) { 3558 DRM_ERROR("resume of IP block <%s> failed %d\n", 3559 adev->ip_blocks[i].version->funcs->name, r); 3560 return r; 3561 } 3562 adev->ip_blocks[i].status.hw = true; 3563 } 3564 } 3565 3566 return 0; 3567 } 3568 3569 /** 3570 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3571 * 3572 * @adev: amdgpu_device pointer 3573 * 3574 * First resume function for hardware IPs. The list of all the hardware 3575 * IPs that make up the asic is walked and the resume callbacks are run for 3576 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3577 * functional state after a suspend and updates the software state as 3578 * necessary. This function is also used for restoring the GPU after a GPU 3579 * reset. 3580 * Returns 0 on success, negative error code on failure. 3581 */ 3582 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3583 { 3584 int i, r; 3585 3586 for (i = 0; i < adev->num_ip_blocks; i++) { 3587 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3588 continue; 3589 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3590 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3591 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3592 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3593 continue; 3594 r = adev->ip_blocks[i].version->funcs->resume(adev); 3595 if (r) { 3596 DRM_ERROR("resume of IP block <%s> failed %d\n", 3597 adev->ip_blocks[i].version->funcs->name, r); 3598 return r; 3599 } 3600 adev->ip_blocks[i].status.hw = true; 3601 } 3602 3603 return 0; 3604 } 3605 3606 /** 3607 * amdgpu_device_ip_resume - run resume for hardware IPs 3608 * 3609 * @adev: amdgpu_device pointer 3610 * 3611 * Main resume function for hardware IPs. The hardware IPs 3612 * are split into two resume functions because they are 3613 * also used in recovering from a GPU reset and some additional 3614 * steps need to be take between them. In this case (S3/S4) they are 3615 * run sequentially. 3616 * Returns 0 on success, negative error code on failure. 
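 *
 * Phase 1 below resumes COMMON, GMC and IH (plus PSP when running as an
 * SR-IOV VF), firmware is then reloaded via amdgpu_device_fw_loading(),
 * and phase 2 resumes the remaining blocks.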
3617 */ 3618 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3619 { 3620 int r; 3621 3622 r = amdgpu_device_ip_resume_phase1(adev); 3623 if (r) 3624 return r; 3625 3626 r = amdgpu_device_fw_loading(adev); 3627 if (r) 3628 return r; 3629 3630 r = amdgpu_device_ip_resume_phase2(adev); 3631 3632 if (adev->mman.buffer_funcs_ring->sched.ready) 3633 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3634 3635 return r; 3636 } 3637 3638 /** 3639 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3640 * 3641 * @adev: amdgpu_device pointer 3642 * 3643 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3644 */ 3645 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3646 { 3647 if (amdgpu_sriov_vf(adev)) { 3648 if (adev->is_atom_fw) { 3649 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3650 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3651 } else { 3652 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3653 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3654 } 3655 3656 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3657 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3658 } 3659 } 3660 3661 /** 3662 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3663 * 3664 * @asic_type: AMD asic type 3665 * 3666 * Check if there is DC (new modesetting infrastructre) support for an asic. 3667 * returns true if DC has support, false if not. 3668 */ 3669 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3670 { 3671 switch (asic_type) { 3672 #ifdef CONFIG_DRM_AMDGPU_SI 3673 case CHIP_HAINAN: 3674 #endif 3675 case CHIP_TOPAZ: 3676 /* chips with no display hardware */ 3677 return false; 3678 #if defined(CONFIG_DRM_AMD_DC) 3679 case CHIP_TAHITI: 3680 case CHIP_PITCAIRN: 3681 case CHIP_VERDE: 3682 case CHIP_OLAND: 3683 /* 3684 * We have systems in the wild with these ASICs that require 3685 * LVDS and VGA support which is not supported with DC. 3686 * 3687 * Fallback to the non-DC driver here by default so as not to 3688 * cause regressions. 3689 */ 3690 #if defined(CONFIG_DRM_AMD_DC_SI) 3691 return amdgpu_dc > 0; 3692 #else 3693 return false; 3694 #endif 3695 case CHIP_BONAIRE: 3696 case CHIP_KAVERI: 3697 case CHIP_KABINI: 3698 case CHIP_MULLINS: 3699 /* 3700 * We have systems in the wild with these ASICs that require 3701 * VGA support which is not supported with DC. 3702 * 3703 * Fallback to the non-DC driver here by default so as not to 3704 * cause regressions. 
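 *
 * Assuming the standard amdgpu module parameter here, passing amdgpu.dc=1
 * on the kernel command line opts these ASICs back into DC, since the
 * check below only returns true for amdgpu_dc > 0.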
3705 */ 3706 return amdgpu_dc > 0; 3707 default: 3708 return amdgpu_dc != 0; 3709 #else 3710 default: 3711 if (amdgpu_dc > 0) 3712 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 3713 return false; 3714 #endif 3715 } 3716 } 3717 3718 /** 3719 * amdgpu_device_has_dc_support - check if dc is supported 3720 * 3721 * @adev: amdgpu_device pointer 3722 * 3723 * Returns true for supported, false for not supported 3724 */ 3725 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3726 { 3727 if (adev->enable_virtual_display || 3728 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3729 return false; 3730 3731 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3732 } 3733 3734 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3735 { 3736 struct amdgpu_device *adev = 3737 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3738 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3739 3740 /* It's a bug to not have a hive within this function */ 3741 if (WARN_ON(!hive)) 3742 return; 3743 3744 /* 3745 * Use task barrier to synchronize all xgmi reset works across the 3746 * hive. task_barrier_enter and task_barrier_exit will block 3747 * until all the threads running the xgmi reset works reach 3748 * those points. task_barrier_full will do both blocks. 3749 */ 3750 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3751 3752 task_barrier_enter(&hive->tb); 3753 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3754 3755 if (adev->asic_reset_res) 3756 goto fail; 3757 3758 task_barrier_exit(&hive->tb); 3759 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3760 3761 if (adev->asic_reset_res) 3762 goto fail; 3763 3764 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 3765 } else { 3766 3767 task_barrier_full(&hive->tb); 3768 adev->asic_reset_res = amdgpu_asic_reset(adev); 3769 } 3770 3771 fail: 3772 if (adev->asic_reset_res) 3773 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3774 adev->asic_reset_res, adev_to_drm(adev)->unique); 3775 amdgpu_put_xgmi_hive(hive); 3776 } 3777 3778 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3779 { 3780 char *input = amdgpu_lockup_timeout; 3781 char *timeout_setting = NULL; 3782 int index = 0; 3783 long timeout; 3784 int ret = 0; 3785 3786 /* 3787 * By default timeout for non compute jobs is 10000 3788 * and 60000 for compute jobs. 3789 * In SR-IOV or passthrough mode, timeout for compute 3790 * jobs are 60000 by default. 3791 */ 3792 adev->gfx_timeout = msecs_to_jiffies(10000); 3793 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3794 if (amdgpu_sriov_vf(adev)) 3795 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3796 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3797 else 3798 adev->compute_timeout = msecs_to_jiffies(60000); 3799 3800 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3801 while ((timeout_setting = strsep(&input, ",")) && 3802 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3803 ret = kstrtol(timeout_setting, 0, &timeout); 3804 if (ret) 3805 return ret; 3806 3807 if (timeout == 0) { 3808 index++; 3809 continue; 3810 } else if (timeout < 0) { 3811 timeout = MAX_SCHEDULE_TIMEOUT; 3812 dev_warn(adev->dev, "lockup timeout disabled"); 3813 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3814 } else { 3815 timeout = msecs_to_jiffies(timeout); 3816 } 3817 3818 switch (index++) { 3819 case 0: 3820 adev->gfx_timeout = timeout; 3821 break; 3822 case 1: 3823 adev->compute_timeout = timeout; 3824 break; 3825 case 2: 3826 adev->sdma_timeout = timeout; 3827 break; 3828 case 3: 3829 adev->video_timeout = timeout; 3830 break; 3831 default: 3832 break; 3833 } 3834 } 3835 /* 3836 * There is only one value specified and 3837 * it should apply to all non-compute jobs. 3838 */ 3839 if (index == 1) { 3840 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3841 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3842 adev->compute_timeout = adev->gfx_timeout; 3843 } 3844 } 3845 3846 return ret; 3847 } 3848 3849 /** 3850 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3851 * 3852 * @adev: amdgpu_device pointer 3853 * 3854 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3855 */ 3856 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3857 { 3858 struct iommu_domain *domain; 3859 3860 domain = iommu_get_domain_for_dev(adev->dev); 3861 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3862 adev->ram_is_direct_mapped = true; 3863 } 3864 3865 static const struct attribute *amdgpu_dev_attributes[] = { 3866 &dev_attr_pcie_replay_count.attr, 3867 NULL 3868 }; 3869 3870 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 3871 { 3872 if (amdgpu_mcbp == 1) 3873 adev->gfx.mcbp = true; 3874 else if (amdgpu_mcbp == 0) 3875 adev->gfx.mcbp = false; 3876 3877 if (amdgpu_sriov_vf(adev)) 3878 adev->gfx.mcbp = true; 3879 3880 if (adev->gfx.mcbp) 3881 DRM_INFO("MCBP is enabled\n"); 3882 } 3883 3884 /** 3885 * amdgpu_device_init - initialize the driver 3886 * 3887 * @adev: amdgpu_device pointer 3888 * @flags: driver flags 3889 * 3890 * Initializes the driver info and hw (all asics). 3891 * Returns 0 for success or an error on failure. 3892 * Called at driver startup. 
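* Most of the heavy lifting is delegated to amdgpu_device_ip_early_init(), amdgpu_device_ip_init() and amdgpu_device_ip_late_init(), which walk the IP blocks in stages.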
3893 */ 3894 int amdgpu_device_init(struct amdgpu_device *adev, 3895 uint32_t flags) 3896 { 3897 struct drm_device *ddev = adev_to_drm(adev); 3898 struct pci_dev *pdev = adev->pdev; 3899 int r, i; 3900 bool px = false; 3901 u32 max_MBps; 3902 int tmp; 3903 3904 adev->shutdown = false; 3905 adev->flags = flags; 3906 3907 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 3908 adev->asic_type = amdgpu_force_asic_type; 3909 else 3910 adev->asic_type = flags & AMD_ASIC_MASK; 3911 3912 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 3913 if (amdgpu_emu_mode == 1) 3914 adev->usec_timeout *= 10; 3915 adev->gmc.gart_size = 512 * 1024 * 1024; 3916 adev->accel_working = false; 3917 adev->num_rings = 0; 3918 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 3919 adev->mman.buffer_funcs = NULL; 3920 adev->mman.buffer_funcs_ring = NULL; 3921 adev->vm_manager.vm_pte_funcs = NULL; 3922 adev->vm_manager.vm_pte_num_scheds = 0; 3923 adev->gmc.gmc_funcs = NULL; 3924 adev->harvest_ip_mask = 0x0; 3925 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 3926 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 3927 3928 adev->smc_rreg = &amdgpu_invalid_rreg; 3929 adev->smc_wreg = &amdgpu_invalid_wreg; 3930 adev->pcie_rreg = &amdgpu_invalid_rreg; 3931 adev->pcie_wreg = &amdgpu_invalid_wreg; 3932 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 3933 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 3934 adev->pciep_rreg = &amdgpu_invalid_rreg; 3935 adev->pciep_wreg = &amdgpu_invalid_wreg; 3936 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 3937 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 3938 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 3939 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 3940 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 3941 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 3942 adev->didt_rreg = &amdgpu_invalid_rreg; 3943 adev->didt_wreg = &amdgpu_invalid_wreg; 3944 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 3945 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 3946 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 3947 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 3948 3949 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 3950 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 3951 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 3952 3953 /* mutex initialization are all done here so we 3954 * can recall function without having locking issues 3955 */ 3956 mutex_init(&adev->firmware.mutex); 3957 mutex_init(&adev->pm.mutex); 3958 mutex_init(&adev->gfx.gpu_clock_mutex); 3959 mutex_init(&adev->srbm_mutex); 3960 mutex_init(&adev->gfx.pipe_reserve_mutex); 3961 mutex_init(&adev->gfx.gfx_off_mutex); 3962 mutex_init(&adev->gfx.partition_mutex); 3963 mutex_init(&adev->grbm_idx_mutex); 3964 mutex_init(&adev->mn_lock); 3965 mutex_init(&adev->virt.vf_errors.lock); 3966 hash_init(adev->mn_hash); 3967 mutex_init(&adev->psp.mutex); 3968 mutex_init(&adev->notifier_lock); 3969 mutex_init(&adev->pm.stable_pstate_ctx_lock); 3970 mutex_init(&adev->benchmark_mutex); 3971 3972 amdgpu_device_init_apu_flags(adev); 3973 3974 r = amdgpu_device_check_arguments(adev); 3975 if (r) 3976 return r; 3977 3978 spin_lock_init(&adev->mmio_idx_lock); 3979 spin_lock_init(&adev->smc_idx_lock); 3980 spin_lock_init(&adev->pcie_idx_lock); 3981 spin_lock_init(&adev->uvd_ctx_idx_lock); 3982 spin_lock_init(&adev->didt_idx_lock); 3983 spin_lock_init(&adev->gc_cac_idx_lock); 3984 spin_lock_init(&adev->se_cac_idx_lock); 
3985 spin_lock_init(&adev->audio_endpt_idx_lock); 3986 spin_lock_init(&adev->mm_stats.lock); 3987 3988 INIT_LIST_HEAD(&adev->shadow_list); 3989 mutex_init(&adev->shadow_list_lock); 3990 3991 INIT_LIST_HEAD(&adev->reset_list); 3992 3993 INIT_LIST_HEAD(&adev->ras_list); 3994 3995 INIT_LIST_HEAD(&adev->pm.od_kobj_list); 3996 3997 INIT_DELAYED_WORK(&adev->delayed_init_work, 3998 amdgpu_device_delayed_init_work_handler); 3999 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 4000 amdgpu_device_delay_enable_gfx_off); 4001 4002 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 4003 4004 adev->gfx.gfx_off_req_count = 1; 4005 adev->gfx.gfx_off_residency = 0; 4006 adev->gfx.gfx_off_entrycount = 0; 4007 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 4008 4009 atomic_set(&adev->throttling_logging_enabled, 1); 4010 /* 4011 * If throttling continues, logging will be performed every minute 4012 * to avoid log flooding. "-1" is subtracted since the thermal 4013 * throttling interrupt comes every second. Thus, the total logging 4014 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting 4015 * for throttling interrupt) = 60 seconds. 4016 */ 4017 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 4018 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 4019 4020 /* Registers mapping */ 4021 /* TODO: block userspace mapping of io register */ 4022 if (adev->asic_type >= CHIP_BONAIRE) { 4023 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 4024 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 4025 } else { 4026 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 4027 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 4028 } 4029 4030 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 4031 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 4032 4033 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 4034 if (!adev->rmmio) 4035 return -ENOMEM; 4036 4037 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 4038 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size); 4039 4040 /* 4041 * The reset domain needs to be present early, before the XGMI hive (if any) 4042 * is discovered and initialized, so that the reset sem and in_gpu_reset flag 4043 * can be used early during init and before any call to RREG32.
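* For devices that later join an XGMI hive, this per-device domain is typically replaced by the hive-wide reset domain.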
4044 */ 4045 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 4046 if (!adev->reset_domain) 4047 return -ENOMEM; 4048 4049 /* detect hw virtualization here */ 4050 amdgpu_detect_virtualization(adev); 4051 4052 amdgpu_device_get_pcie_info(adev); 4053 4054 r = amdgpu_device_get_job_timeout_settings(adev); 4055 if (r) { 4056 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 4057 return r; 4058 } 4059 4060 amdgpu_device_set_mcbp(adev); 4061 4062 /* early init functions */ 4063 r = amdgpu_device_ip_early_init(adev); 4064 if (r) 4065 return r; 4066 4067 /* Get rid of things like offb */ 4068 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver); 4069 if (r) 4070 return r; 4071 4072 /* Enable TMZ based on IP_VERSION */ 4073 amdgpu_gmc_tmz_set(adev); 4074 4075 if (amdgpu_sriov_vf(adev) && 4076 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0)) 4077 /* VF MMIO access (except mailbox range) from CPU 4078 * will be blocked during sriov runtime 4079 */ 4080 adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT; 4081 4082 amdgpu_gmc_noretry_set(adev); 4083 /* Need to get xgmi info early to decide the reset behavior */ 4084 if (adev->gmc.xgmi.supported) { 4085 r = adev->gfxhub.funcs->get_xgmi_info(adev); 4086 if (r) 4087 return r; 4088 } 4089 4090 /* enable PCIE atomic ops */ 4091 if (amdgpu_sriov_vf(adev)) { 4092 if (adev->virt.fw_reserve.p_pf2vf) 4093 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 4094 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 4095 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4096 /* APUs with gfx9 onwards don't rely on PCIe atomics; their internal 4097 * path natively supports atomics, so set have_atomics_support to true. 4098 */ 4099 } else if ((adev->flags & AMD_IS_APU) && 4100 (amdgpu_ip_version(adev, GC_HWIP, 0) > 4101 IP_VERSION(9, 0, 0))) { 4102 adev->have_atomics_support = true; 4103 } else { 4104 adev->have_atomics_support = 4105 !pci_enable_atomic_ops_to_root(adev->pdev, 4106 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 4107 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4108 } 4109 4110 if (!adev->have_atomics_support) 4111 dev_info(adev->dev, "PCIE atomic ops are not supported\n"); 4112 4113 /* doorbell bar mapping and doorbell index init */ 4114 amdgpu_doorbell_init(adev); 4115 4116 if (amdgpu_emu_mode == 1) { 4117 /* post the asic on emulation mode */ 4118 emu_soc_asic_init(adev); 4119 goto fence_driver_init; 4120 } 4121 4122 amdgpu_reset_init(adev); 4123 4124 /* detect if we are running with an SR-IOV vBIOS */ 4125 if (adev->bios) 4126 amdgpu_device_detect_sriov_bios(adev); 4127 4128 /* check if we need to reset the asic 4129 * E.g., driver was not cleanly unloaded previously, etc.
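* For XGMI hives the reset is only flagged as pending here and carried out later via the delayed mgpu reset work.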
4130 */ 4131 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4132 if (adev->gmc.xgmi.num_physical_nodes) { 4133 dev_info(adev->dev, "Pending hive reset.\n"); 4134 adev->gmc.xgmi.pending_reset = true; 4135 /* Only need to init necessary block for SMU to handle the reset */ 4136 for (i = 0; i < adev->num_ip_blocks; i++) { 4137 if (!adev->ip_blocks[i].status.valid) 4138 continue; 4139 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 4140 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 4141 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 4142 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 4143 DRM_DEBUG("IP %s disabled for hw_init.\n", 4144 adev->ip_blocks[i].version->funcs->name); 4145 adev->ip_blocks[i].status.hw = true; 4146 } 4147 } 4148 } else { 4149 tmp = amdgpu_reset_method; 4150 /* It should do a default reset when loading or reloading the driver, 4151 * regardless of the module parameter reset_method. 4152 */ 4153 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4154 r = amdgpu_asic_reset(adev); 4155 amdgpu_reset_method = tmp; 4156 if (r) { 4157 dev_err(adev->dev, "asic reset on init failed\n"); 4158 goto failed; 4159 } 4160 } 4161 } 4162 4163 /* Post card if necessary */ 4164 if (amdgpu_device_need_post(adev)) { 4165 if (!adev->bios) { 4166 dev_err(adev->dev, "no vBIOS found\n"); 4167 r = -EINVAL; 4168 goto failed; 4169 } 4170 DRM_INFO("GPU posting now...\n"); 4171 r = amdgpu_device_asic_init(adev); 4172 if (r) { 4173 dev_err(adev->dev, "gpu post error!\n"); 4174 goto failed; 4175 } 4176 } 4177 4178 if (adev->bios) { 4179 if (adev->is_atom_fw) { 4180 /* Initialize clocks */ 4181 r = amdgpu_atomfirmware_get_clock_info(adev); 4182 if (r) { 4183 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4184 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4185 goto failed; 4186 } 4187 } else { 4188 /* Initialize clocks */ 4189 r = amdgpu_atombios_get_clock_info(adev); 4190 if (r) { 4191 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4192 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4193 goto failed; 4194 } 4195 /* init i2c buses */ 4196 if (!amdgpu_device_has_dc_support(adev)) 4197 amdgpu_atombios_i2c_init(adev); 4198 } 4199 } 4200 4201 fence_driver_init: 4202 /* Fence driver */ 4203 r = amdgpu_fence_driver_sw_init(adev); 4204 if (r) { 4205 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4206 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4207 goto failed; 4208 } 4209 4210 /* init the mode config */ 4211 drm_mode_config_init(adev_to_drm(adev)); 4212 4213 r = amdgpu_device_ip_init(adev); 4214 if (r) { 4215 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4216 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4217 goto release_ras_con; 4218 } 4219 4220 amdgpu_fence_driver_hw_init(adev); 4221 4222 dev_info(adev->dev, 4223 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4224 adev->gfx.config.max_shader_engines, 4225 adev->gfx.config.max_sh_per_se, 4226 adev->gfx.config.max_cu_per_sh, 4227 adev->gfx.cu_info.number); 4228 4229 adev->accel_working = true; 4230 4231 amdgpu_vm_check_compute_bug(adev); 4232 4233 /* Initialize the buffer migration limit. */ 4234 if (amdgpu_moverate >= 0) 4235 max_MBps = amdgpu_moverate; 4236 else 4237 max_MBps = 8; /* Allow 8 MB/s. */ 4238 /* Get a log2 for easy divisions. 
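For example, max_MBps = 8 yields log2_max_MBps = ilog2(8) = 3, so later bandwidth accounting can shift by 3 instead of dividing by the rate.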
*/ 4239 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4240 4241 /* 4242 * Register the gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4243 * Otherwise the mgpu fan boost feature will be skipped because the 4244 * gpu instance count would be too low. 4245 */ 4246 amdgpu_register_gpu_instance(adev); 4247 4248 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4249 * explicit gating rather than handling it automatically. 4250 */ 4251 if (!adev->gmc.xgmi.pending_reset) { 4252 r = amdgpu_device_ip_late_init(adev); 4253 if (r) { 4254 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4255 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4256 goto release_ras_con; 4257 } 4258 /* must succeed. */ 4259 amdgpu_ras_resume(adev); 4260 queue_delayed_work(system_wq, &adev->delayed_init_work, 4261 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4262 } 4263 4264 if (amdgpu_sriov_vf(adev)) { 4265 amdgpu_virt_release_full_gpu(adev, true); 4266 flush_delayed_work(&adev->delayed_init_work); 4267 } 4268 4269 /* 4270 * Register these sysfs files after `late_init`, since some of the 4271 * operations performed in `late_init` might affect the creation of 4272 * the sysfs interfaces. 4273 */ 4274 r = amdgpu_atombios_sysfs_init(adev); 4275 if (r) 4276 drm_err(&adev->ddev, 4277 "registering atombios sysfs failed (%d).\n", r); 4278 4279 r = amdgpu_pm_sysfs_init(adev); 4280 if (r) 4281 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 4282 4283 r = amdgpu_ucode_sysfs_init(adev); 4284 if (r) { 4285 adev->ucode_sysfs_en = false; 4286 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 4287 } else 4288 adev->ucode_sysfs_en = true; 4289 4290 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 4291 if (r) 4292 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4293 4294 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4295 if (r) 4296 dev_err(adev->dev, 4297 "Could not create amdgpu board attributes\n"); 4298 4299 amdgpu_fru_sysfs_init(adev); 4300 amdgpu_reg_state_sysfs_init(adev); 4301 4302 if (IS_ENABLED(CONFIG_PERF_EVENTS)) { 4303 r = amdgpu_pmu_init(adev); 4304 if (r) 4305 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4306 } 4307 /* Have the stored PCI config space at hand for restore in case of a sudden PCI error */ 4308 if (amdgpu_device_cache_pci_state(adev->pdev)) 4309 pci_restore_state(pdev); 4310 4311 /* if we have more than one VGA card, then disable the amdgpu VGA resources */ 4312 /* this will fail for cards that aren't VGA class devices, just 4313 * ignore it 4314 */ 4315 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4316 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4317 4318 px = amdgpu_device_supports_px(ddev); 4319 4320 if (px || (!dev_is_removable(&adev->pdev->dev) && 4321 apple_gmux_detect(NULL, NULL))) 4322 vga_switcheroo_register_client(adev->pdev, 4323 &amdgpu_switcheroo_ops, px); 4324 4325 if (px) 4326 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4327 4328 if (adev->gmc.xgmi.pending_reset) 4329 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 4330 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4331 4332 amdgpu_device_check_iommu_direct_map(adev); 4333 4334 return 0; 4335 4336 release_ras_con: 4337 if (amdgpu_sriov_vf(adev)) 4338 amdgpu_virt_release_full_gpu(adev, true); 4339 4340 /* failed in exclusive mode due to timeout */ 4341 if (amdgpu_sriov_vf(adev) && 4342 !amdgpu_sriov_runtime(adev) && 4343 amdgpu_virt_mmio_blocked(adev) && 4344 !amdgpu_virt_wait_reset(adev)) { 4345
dev_err(adev->dev, "VF exclusive mode timeout\n"); 4346 /* Don't send request since VF is inactive. */ 4347 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4348 adev->virt.ops = NULL; 4349 r = -EAGAIN; 4350 } 4351 amdgpu_release_ras_context(adev); 4352 4353 failed: 4354 amdgpu_vf_error_trans_all(adev); 4355 4356 return r; 4357 } 4358 4359 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4360 { 4361 4362 /* Clear all CPU mappings pointing to this device */ 4363 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4364 4365 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4366 amdgpu_doorbell_fini(adev); 4367 4368 iounmap(adev->rmmio); 4369 adev->rmmio = NULL; 4370 if (adev->mman.aper_base_kaddr) 4371 iounmap(adev->mman.aper_base_kaddr); 4372 adev->mman.aper_base_kaddr = NULL; 4373 4374 /* Memory manager related */ 4375 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4376 arch_phys_wc_del(adev->gmc.vram_mtrr); 4377 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4378 } 4379 } 4380 4381 /** 4382 * amdgpu_device_fini_hw - tear down the driver 4383 * 4384 * @adev: amdgpu_device pointer 4385 * 4386 * Tear down the driver info (all asics). 4387 * Called at driver shutdown. 4388 */ 4389 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4390 { 4391 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4392 flush_delayed_work(&adev->delayed_init_work); 4393 adev->shutdown = true; 4394 4395 /* make sure IB test finished before entering exclusive mode 4396 * to avoid preemption on IB test 4397 */ 4398 if (amdgpu_sriov_vf(adev)) { 4399 amdgpu_virt_request_full_gpu(adev, false); 4400 amdgpu_virt_fini_data_exchange(adev); 4401 } 4402 4403 /* disable all interrupts */ 4404 amdgpu_irq_disable_all(adev); 4405 if (adev->mode_info.mode_config_initialized) { 4406 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4407 drm_helper_force_disable_all(adev_to_drm(adev)); 4408 else 4409 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4410 } 4411 amdgpu_fence_driver_hw_fini(adev); 4412 4413 if (adev->mman.initialized) 4414 drain_workqueue(adev->mman.bdev.wq); 4415 4416 if (adev->pm.sysfs_initialized) 4417 amdgpu_pm_sysfs_fini(adev); 4418 if (adev->ucode_sysfs_en) 4419 amdgpu_ucode_sysfs_fini(adev); 4420 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4421 amdgpu_fru_sysfs_fini(adev); 4422 4423 amdgpu_reg_state_sysfs_fini(adev); 4424 4425 /* disable ras feature must before hw fini */ 4426 amdgpu_ras_pre_fini(adev); 4427 4428 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4429 4430 amdgpu_device_ip_fini_early(adev); 4431 4432 amdgpu_irq_fini_hw(adev); 4433 4434 if (adev->mman.initialized) 4435 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4436 4437 amdgpu_gart_dummy_page_fini(adev); 4438 4439 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4440 amdgpu_device_unmap_mmio(adev); 4441 4442 } 4443 4444 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4445 { 4446 int idx; 4447 bool px; 4448 4449 amdgpu_fence_driver_sw_fini(adev); 4450 amdgpu_device_ip_fini(adev); 4451 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4452 adev->accel_working = false; 4453 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4454 4455 amdgpu_reset_fini(adev); 4456 4457 /* free i2c buses */ 4458 if (!amdgpu_device_has_dc_support(adev)) 4459 amdgpu_i2c_fini(adev); 4460 4461 if (amdgpu_emu_mode != 1) 4462 amdgpu_atombios_fini(adev); 4463 4464 kfree(adev->bios); 4465 adev->bios = NULL; 4466 4467 kfree(adev->fru_info); 4468 
adev->fru_info = NULL; 4469 4470 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4471 4472 if (px || (!dev_is_removable(&adev->pdev->dev) && 4473 apple_gmux_detect(NULL, NULL))) 4474 vga_switcheroo_unregister_client(adev->pdev); 4475 4476 if (px) 4477 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4478 4479 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4480 vga_client_unregister(adev->pdev); 4481 4482 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4483 4484 iounmap(adev->rmmio); 4485 adev->rmmio = NULL; 4486 amdgpu_doorbell_fini(adev); 4487 drm_dev_exit(idx); 4488 } 4489 4490 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4491 amdgpu_pmu_fini(adev); 4492 if (adev->mman.discovery_bin) 4493 amdgpu_discovery_fini(adev); 4494 4495 amdgpu_reset_put_reset_domain(adev->reset_domain); 4496 adev->reset_domain = NULL; 4497 4498 kfree(adev->pci_state); 4499 4500 } 4501 4502 /** 4503 * amdgpu_device_evict_resources - evict device resources 4504 * @adev: amdgpu device object 4505 * 4506 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4507 * of the vram memory type. Mainly used for evicting device resources 4508 * at suspend time. 4509 * 4510 */ 4511 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4512 { 4513 int ret; 4514 4515 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4516 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4517 return 0; 4518 4519 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4520 if (ret) 4521 DRM_WARN("evicting device resources failed\n"); 4522 return ret; 4523 } 4524 4525 /* 4526 * Suspend & resume. 4527 */ 4528 /** 4529 * amdgpu_device_prepare - prepare for device suspend 4530 * 4531 * @dev: drm dev pointer 4532 * 4533 * Prepare to put the hw in the suspend state (all asics). 4534 * Returns 0 for success or an error on failure. 4535 * Called at driver suspend. 4536 */ 4537 int amdgpu_device_prepare(struct drm_device *dev) 4538 { 4539 struct amdgpu_device *adev = drm_to_adev(dev); 4540 int i, r; 4541 4542 amdgpu_choose_low_power_state(adev); 4543 4544 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4545 return 0; 4546 4547 /* Evict the majority of BOs before starting suspend sequence */ 4548 r = amdgpu_device_evict_resources(adev); 4549 if (r) 4550 goto unprepare; 4551 4552 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 4553 4554 for (i = 0; i < adev->num_ip_blocks; i++) { 4555 if (!adev->ip_blocks[i].status.valid) 4556 continue; 4557 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 4558 continue; 4559 r = adev->ip_blocks[i].version->funcs->prepare_suspend((void *)adev); 4560 if (r) 4561 goto unprepare; 4562 } 4563 4564 return 0; 4565 4566 unprepare: 4567 adev->in_s0ix = adev->in_s3 = false; 4568 4569 return r; 4570 } 4571 4572 /** 4573 * amdgpu_device_suspend - initiate device suspend 4574 * 4575 * @dev: drm dev pointer 4576 * @fbcon : notify the fbdev of suspend 4577 * 4578 * Puts the hw in the suspend state (all asics). 4579 * Returns 0 for success or an error on failure. 4580 * Called at driver suspend. 
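* Suspend is performed in two phases: amdgpu_device_ip_suspend_phase1() handles the display IPs first, and amdgpu_device_ip_suspend_phase2() suspends the remaining blocks after KFD suspend and buffer eviction.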
4581 */ 4582 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4583 { 4584 struct amdgpu_device *adev = drm_to_adev(dev); 4585 int r = 0; 4586 4587 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4588 return 0; 4589 4590 adev->in_suspend = true; 4591 4592 if (amdgpu_sriov_vf(adev)) { 4593 amdgpu_virt_fini_data_exchange(adev); 4594 r = amdgpu_virt_request_full_gpu(adev, false); 4595 if (r) 4596 return r; 4597 } 4598 4599 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4600 DRM_WARN("smart shift update failed\n"); 4601 4602 if (fbcon) 4603 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4604 4605 cancel_delayed_work_sync(&adev->delayed_init_work); 4606 4607 amdgpu_ras_suspend(adev); 4608 4609 amdgpu_device_ip_suspend_phase1(adev); 4610 4611 if (!adev->in_s0ix) 4612 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4613 4614 r = amdgpu_device_evict_resources(adev); 4615 if (r) 4616 return r; 4617 4618 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4619 4620 amdgpu_fence_driver_hw_fini(adev); 4621 4622 amdgpu_device_ip_suspend_phase2(adev); 4623 4624 if (amdgpu_sriov_vf(adev)) 4625 amdgpu_virt_release_full_gpu(adev, false); 4626 4627 r = amdgpu_dpm_notify_rlc_state(adev, false); 4628 if (r) 4629 return r; 4630 4631 return 0; 4632 } 4633 4634 /** 4635 * amdgpu_device_resume - initiate device resume 4636 * 4637 * @dev: drm dev pointer 4638 * @fbcon : notify the fbdev of resume 4639 * 4640 * Bring the hw back to operating state (all asics). 4641 * Returns 0 for success or an error on failure. 4642 * Called at driver resume. 4643 */ 4644 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4645 { 4646 struct amdgpu_device *adev = drm_to_adev(dev); 4647 int r = 0; 4648 4649 if (amdgpu_sriov_vf(adev)) { 4650 r = amdgpu_virt_request_full_gpu(adev, true); 4651 if (r) 4652 return r; 4653 } 4654 4655 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4656 return 0; 4657 4658 if (adev->in_s0ix) 4659 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4660 4661 /* post card */ 4662 if (amdgpu_device_need_post(adev)) { 4663 r = amdgpu_device_asic_init(adev); 4664 if (r) 4665 dev_err(adev->dev, "amdgpu asic init failed\n"); 4666 } 4667 4668 r = amdgpu_device_ip_resume(adev); 4669 4670 if (r) { 4671 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4672 goto exit; 4673 } 4674 amdgpu_fence_driver_hw_init(adev); 4675 4676 if (!adev->in_s0ix) { 4677 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4678 if (r) 4679 goto exit; 4680 } 4681 4682 r = amdgpu_device_ip_late_init(adev); 4683 if (r) 4684 goto exit; 4685 4686 queue_delayed_work(system_wq, &adev->delayed_init_work, 4687 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4688 exit: 4689 if (amdgpu_sriov_vf(adev)) { 4690 amdgpu_virt_init_data_exchange(adev); 4691 amdgpu_virt_release_full_gpu(adev, true); 4692 } 4693 4694 if (r) 4695 return r; 4696 4697 /* Make sure IB tests flushed */ 4698 flush_delayed_work(&adev->delayed_init_work); 4699 4700 if (fbcon) 4701 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4702 4703 amdgpu_ras_resume(adev); 4704 4705 if (adev->mode_info.num_crtc) { 4706 /* 4707 * Most of the connector probing functions try to acquire runtime pm 4708 * refs to ensure that the GPU is powered on when connector polling is 4709 * performed. Since we're calling this from a runtime PM callback, 4710 * trying to acquire rpm refs will cause us to deadlock. 
4711 * 4712 * Since we're guaranteed to be holding the rpm lock, it's safe to 4713 * temporarily disable the rpm helpers so this doesn't deadlock us. 4714 */ 4715 #ifdef CONFIG_PM 4716 dev->dev->power.disable_depth++; 4717 #endif 4718 if (!adev->dc_enabled) 4719 drm_helper_hpd_irq_event(dev); 4720 else 4721 drm_kms_helper_hotplug_event(dev); 4722 #ifdef CONFIG_PM 4723 dev->dev->power.disable_depth--; 4724 #endif 4725 } 4726 adev->in_suspend = false; 4727 4728 if (adev->enable_mes) 4729 amdgpu_mes_self_test(adev); 4730 4731 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4732 DRM_WARN("smart shift update failed\n"); 4733 4734 return 0; 4735 } 4736 4737 /** 4738 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4739 * 4740 * @adev: amdgpu_device pointer 4741 * 4742 * The list of all the hardware IPs that make up the asic is walked and 4743 * the check_soft_reset callbacks are run. check_soft_reset determines 4744 * if the asic is still hung or not. 4745 * Returns true if any of the IPs are still in a hung state, false if not. 4746 */ 4747 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4748 { 4749 int i; 4750 bool asic_hang = false; 4751 4752 if (amdgpu_sriov_vf(adev)) 4753 return true; 4754 4755 if (amdgpu_asic_need_full_reset(adev)) 4756 return true; 4757 4758 for (i = 0; i < adev->num_ip_blocks; i++) { 4759 if (!adev->ip_blocks[i].status.valid) 4760 continue; 4761 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4762 adev->ip_blocks[i].status.hang = 4763 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4764 if (adev->ip_blocks[i].status.hang) { 4765 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4766 asic_hang = true; 4767 } 4768 } 4769 return asic_hang; 4770 } 4771 4772 /** 4773 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4774 * 4775 * @adev: amdgpu_device pointer 4776 * 4777 * The list of all the hardware IPs that make up the asic is walked and the 4778 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4779 * handles any IP specific hardware or software state changes that are 4780 * necessary for a soft reset to succeed. 4781 * Returns 0 on success, negative error code on failure. 4782 */ 4783 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4784 { 4785 int i, r = 0; 4786 4787 for (i = 0; i < adev->num_ip_blocks; i++) { 4788 if (!adev->ip_blocks[i].status.valid) 4789 continue; 4790 if (adev->ip_blocks[i].status.hang && 4791 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4792 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4793 if (r) 4794 return r; 4795 } 4796 } 4797 4798 return 0; 4799 } 4800 4801 /** 4802 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4803 * 4804 * @adev: amdgpu_device pointer 4805 * 4806 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4807 * reset is necessary to recover. 4808 * Returns true if a full asic reset is required, false if not. 
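* GMC, SMC, ACP, DCE and PSP cannot be soft reset, so a hang in any of them forces a full reset.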
4809 */ 4810 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4811 { 4812 int i; 4813 4814 if (amdgpu_asic_need_full_reset(adev)) 4815 return true; 4816 4817 for (i = 0; i < adev->num_ip_blocks; i++) { 4818 if (!adev->ip_blocks[i].status.valid) 4819 continue; 4820 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4821 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4822 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4823 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4824 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4825 if (adev->ip_blocks[i].status.hang) { 4826 dev_info(adev->dev, "Some block need full reset!\n"); 4827 return true; 4828 } 4829 } 4830 } 4831 return false; 4832 } 4833 4834 /** 4835 * amdgpu_device_ip_soft_reset - do a soft reset 4836 * 4837 * @adev: amdgpu_device pointer 4838 * 4839 * The list of all the hardware IPs that make up the asic is walked and the 4840 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4841 * IP specific hardware or software state changes that are necessary to soft 4842 * reset the IP. 4843 * Returns 0 on success, negative error code on failure. 4844 */ 4845 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4846 { 4847 int i, r = 0; 4848 4849 for (i = 0; i < adev->num_ip_blocks; i++) { 4850 if (!adev->ip_blocks[i].status.valid) 4851 continue; 4852 if (adev->ip_blocks[i].status.hang && 4853 adev->ip_blocks[i].version->funcs->soft_reset) { 4854 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4855 if (r) 4856 return r; 4857 } 4858 } 4859 4860 return 0; 4861 } 4862 4863 /** 4864 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 4865 * 4866 * @adev: amdgpu_device pointer 4867 * 4868 * The list of all the hardware IPs that make up the asic is walked and the 4869 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 4870 * handles any IP specific hardware or software state changes that are 4871 * necessary after the IP has been soft reset. 4872 * Returns 0 on success, negative error code on failure. 4873 */ 4874 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 4875 { 4876 int i, r = 0; 4877 4878 for (i = 0; i < adev->num_ip_blocks; i++) { 4879 if (!adev->ip_blocks[i].status.valid) 4880 continue; 4881 if (adev->ip_blocks[i].status.hang && 4882 adev->ip_blocks[i].version->funcs->post_soft_reset) 4883 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 4884 if (r) 4885 return r; 4886 } 4887 4888 return 0; 4889 } 4890 4891 /** 4892 * amdgpu_device_recover_vram - Recover some VRAM contents 4893 * 4894 * @adev: amdgpu_device pointer 4895 * 4896 * Restores the contents of VRAM buffers from the shadows in GTT. Used to 4897 * restore things like GPUVM page tables after a GPU reset where 4898 * the contents of VRAM might be lost. 4899 * 4900 * Returns: 4901 * 0 on success, negative error code on failure. 
4902 */ 4903 static int amdgpu_device_recover_vram(struct amdgpu_device *adev) 4904 { 4905 struct dma_fence *fence = NULL, *next = NULL; 4906 struct amdgpu_bo *shadow; 4907 struct amdgpu_bo_vm *vmbo; 4908 long r = 1, tmo; 4909 4910 if (amdgpu_sriov_runtime(adev)) 4911 tmo = msecs_to_jiffies(8000); 4912 else 4913 tmo = msecs_to_jiffies(100); 4914 4915 dev_info(adev->dev, "recover vram bo from shadow start\n"); 4916 mutex_lock(&adev->shadow_list_lock); 4917 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { 4918 /* If vm is compute context or adev is APU, shadow will be NULL */ 4919 if (!vmbo->shadow) 4920 continue; 4921 shadow = vmbo->shadow; 4922 4923 /* No need to recover an evicted BO */ 4924 if (shadow->tbo.resource->mem_type != TTM_PL_TT || 4925 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || 4926 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM) 4927 continue; 4928 4929 r = amdgpu_bo_restore_shadow(shadow, &next); 4930 if (r) 4931 break; 4932 4933 if (fence) { 4934 tmo = dma_fence_wait_timeout(fence, false, tmo); 4935 dma_fence_put(fence); 4936 fence = next; 4937 if (tmo == 0) { 4938 r = -ETIMEDOUT; 4939 break; 4940 } else if (tmo < 0) { 4941 r = tmo; 4942 break; 4943 } 4944 } else { 4945 fence = next; 4946 } 4947 } 4948 mutex_unlock(&adev->shadow_list_lock); 4949 4950 if (fence) 4951 tmo = dma_fence_wait_timeout(fence, false, tmo); 4952 dma_fence_put(fence); 4953 4954 if (r < 0 || tmo <= 0) { 4955 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); 4956 return -EIO; 4957 } 4958 4959 dev_info(adev->dev, "recover vram bo from shadow done\n"); 4960 return 0; 4961 } 4962 4963 4964 /** 4965 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 4966 * 4967 * @adev: amdgpu_device pointer 4968 * @from_hypervisor: request from hypervisor 4969 * 4970 * do VF FLR and reinitialize Asic 4971 * return 0 means succeeded otherwise failed 4972 */ 4973 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 4974 bool from_hypervisor) 4975 { 4976 int r; 4977 struct amdgpu_hive_info *hive = NULL; 4978 int retry_limit = 0; 4979 4980 retry: 4981 amdgpu_amdkfd_pre_reset(adev); 4982 4983 amdgpu_device_stop_pending_resets(adev); 4984 4985 if (from_hypervisor) 4986 r = amdgpu_virt_request_full_gpu(adev, true); 4987 else 4988 r = amdgpu_virt_reset_gpu(adev); 4989 if (r) 4990 return r; 4991 amdgpu_irq_gpu_reset_resume_helper(adev); 4992 4993 /* some sw clean up VF needs to do before recover */ 4994 amdgpu_virt_post_reset(adev); 4995 4996 /* Resume IP prior to SMC */ 4997 r = amdgpu_device_ip_reinit_early_sriov(adev); 4998 if (r) 4999 goto error; 5000 5001 amdgpu_virt_init_data_exchange(adev); 5002 5003 r = amdgpu_device_fw_loading(adev); 5004 if (r) 5005 return r; 5006 5007 /* now we are okay to resume SMC/CP/SDMA */ 5008 r = amdgpu_device_ip_reinit_late_sriov(adev); 5009 if (r) 5010 goto error; 5011 5012 hive = amdgpu_get_xgmi_hive(adev); 5013 /* Update PSP FW topology after reset */ 5014 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5015 r = amdgpu_xgmi_update_topology(hive, adev); 5016 5017 if (hive) 5018 amdgpu_put_xgmi_hive(hive); 5019 5020 if (!r) { 5021 r = amdgpu_ib_ring_tests(adev); 5022 5023 amdgpu_amdkfd_post_reset(adev); 5024 } 5025 5026 error: 5027 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { 5028 amdgpu_inc_vram_lost(adev); 5029 r = amdgpu_device_recover_vram(adev); 5030 } 5031 amdgpu_virt_release_full_gpu(adev, true); 5032 5033 if (AMDGPU_RETRY_SRIOV_RESET(r)) { 5034 if (retry_limit < 
AMDGPU_MAX_RETRY_LIMIT) { 5035 retry_limit++; 5036 goto retry; 5037 } else 5038 DRM_ERROR("GPU reset retry is beyond the retry limit\n"); 5039 } 5040 5041 return r; 5042 } 5043 5044 /** 5045 * amdgpu_device_has_job_running - check if there is any job in mirror list 5046 * 5047 * @adev: amdgpu_device pointer 5048 * 5049 * check if there is any job in mirror list 5050 */ 5051 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5052 { 5053 int i; 5054 struct drm_sched_job *job; 5055 5056 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5057 struct amdgpu_ring *ring = adev->rings[i]; 5058 5059 if (!amdgpu_ring_sched_ready(ring)) 5060 continue; 5061 5062 spin_lock(&ring->sched.job_list_lock); 5063 job = list_first_entry_or_null(&ring->sched.pending_list, 5064 struct drm_sched_job, list); 5065 spin_unlock(&ring->sched.job_list_lock); 5066 if (job) 5067 return true; 5068 } 5069 return false; 5070 } 5071 5072 /** 5073 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5074 * 5075 * @adev: amdgpu_device pointer 5076 * 5077 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5078 * a hung GPU. 5079 */ 5080 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5081 { 5082 5083 if (amdgpu_gpu_recovery == 0) 5084 goto disabled; 5085 5086 /* Skip soft reset check in fatal error mode */ 5087 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5088 return true; 5089 5090 if (amdgpu_sriov_vf(adev)) 5091 return true; 5092 5093 if (amdgpu_gpu_recovery == -1) { 5094 switch (adev->asic_type) { 5095 #ifdef CONFIG_DRM_AMDGPU_SI 5096 case CHIP_VERDE: 5097 case CHIP_TAHITI: 5098 case CHIP_PITCAIRN: 5099 case CHIP_OLAND: 5100 case CHIP_HAINAN: 5101 #endif 5102 #ifdef CONFIG_DRM_AMDGPU_CIK 5103 case CHIP_KAVERI: 5104 case CHIP_KABINI: 5105 case CHIP_MULLINS: 5106 #endif 5107 case CHIP_CARRIZO: 5108 case CHIP_STONEY: 5109 case CHIP_CYAN_SKILLFISH: 5110 goto disabled; 5111 default: 5112 break; 5113 } 5114 } 5115 5116 return true; 5117 5118 disabled: 5119 dev_info(adev->dev, "GPU recovery disabled.\n"); 5120 return false; 5121 } 5122 5123 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5124 { 5125 u32 i; 5126 int ret = 0; 5127 5128 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5129 5130 dev_info(adev->dev, "GPU mode1 reset\n"); 5131 5132 /* disable BM */ 5133 pci_clear_master(adev->pdev); 5134 5135 amdgpu_device_cache_pci_state(adev->pdev); 5136 5137 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5138 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5139 ret = amdgpu_dpm_mode1_reset(adev); 5140 } else { 5141 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5142 ret = psp_gpu_reset(adev); 5143 } 5144 5145 if (ret) 5146 goto mode1_reset_failed; 5147 5148 amdgpu_device_load_pci_state(adev->pdev); 5149 ret = amdgpu_psp_wait_for_bootloader(adev); 5150 if (ret) 5151 goto mode1_reset_failed; 5152 5153 /* wait for asic to come out of reset */ 5154 for (i = 0; i < adev->usec_timeout; i++) { 5155 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5156 5157 if (memsize != 0xffffffff) 5158 break; 5159 udelay(1); 5160 } 5161 5162 if (i >= adev->usec_timeout) { 5163 ret = -ETIMEDOUT; 5164 goto mode1_reset_failed; 5165 } 5166 5167 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5168 5169 return 0; 5170 5171 mode1_reset_failed: 5172 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5173 return ret; 5174 } 5175 5176 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5177 struct amdgpu_reset_context *reset_context) 5178 { 5179 int i, r = 0; 5180 
struct amdgpu_job *job = NULL; 5181 bool need_full_reset = 5182 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5183 5184 if (reset_context->reset_req_dev == adev) 5185 job = reset_context->job; 5186 5187 if (amdgpu_sriov_vf(adev)) { 5188 /* stop the data exchange thread */ 5189 amdgpu_virt_fini_data_exchange(adev); 5190 } 5191 5192 amdgpu_fence_driver_isr_toggle(adev, true); 5193 5194 /* block all schedulers and reset given job's ring */ 5195 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5196 struct amdgpu_ring *ring = adev->rings[i]; 5197 5198 if (!amdgpu_ring_sched_ready(ring)) 5199 continue; 5200 5201 /* Clear job fence from fence drv to avoid force_completion 5202 * leave NULL and vm flush fence in fence drv 5203 */ 5204 amdgpu_fence_driver_clear_job_fences(ring); 5205 5206 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5207 amdgpu_fence_driver_force_completion(ring); 5208 } 5209 5210 amdgpu_fence_driver_isr_toggle(adev, false); 5211 5212 if (job && job->vm) 5213 drm_sched_increase_karma(&job->base); 5214 5215 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5216 /* If reset handler not implemented, continue; otherwise return */ 5217 if (r == -EOPNOTSUPP) 5218 r = 0; 5219 else 5220 return r; 5221 5222 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5223 if (!amdgpu_sriov_vf(adev)) { 5224 5225 if (!need_full_reset) 5226 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5227 5228 if (!need_full_reset && amdgpu_gpu_recovery && 5229 amdgpu_device_ip_check_soft_reset(adev)) { 5230 amdgpu_device_ip_pre_soft_reset(adev); 5231 r = amdgpu_device_ip_soft_reset(adev); 5232 amdgpu_device_ip_post_soft_reset(adev); 5233 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5234 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5235 need_full_reset = true; 5236 } 5237 } 5238 5239 if (need_full_reset) 5240 r = amdgpu_device_ip_suspend(adev); 5241 if (need_full_reset) 5242 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5243 else 5244 clear_bit(AMDGPU_NEED_FULL_RESET, 5245 &reset_context->flags); 5246 } 5247 5248 return r; 5249 } 5250 5251 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) 5252 { 5253 int i; 5254 5255 lockdep_assert_held(&adev->reset_domain->sem); 5256 5257 for (i = 0; i < adev->reset_info.num_regs; i++) { 5258 adev->reset_info.reset_dump_reg_value[i] = 5259 RREG32(adev->reset_info.reset_dump_reg_list[i]); 5260 5261 trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i], 5262 adev->reset_info.reset_dump_reg_value[i]); 5263 } 5264 5265 return 0; 5266 } 5267 5268 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5269 struct amdgpu_reset_context *reset_context) 5270 { 5271 struct amdgpu_device *tmp_adev = NULL; 5272 bool need_full_reset, skip_hw_reset, vram_lost = false; 5273 int r = 0; 5274 5275 /* Try reset handler method first */ 5276 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5277 reset_list); 5278 amdgpu_reset_reg_dumps(tmp_adev); 5279 5280 reset_context->reset_device_list = device_list_handle; 5281 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5282 /* If reset handler not implemented, continue; otherwise return */ 5283 if (r == -EOPNOTSUPP) 5284 r = 0; 5285 else 5286 return r; 5287 5288 /* Reset handler not implemented, use the default method */ 5289 need_full_reset = 5290 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5291 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, 
&reset_context->flags); 5292 5293 /* 5294 * ASIC reset has to be done on all XGMI hive nodes ASAP 5295 * to allow proper link negotiation in FW (within 1 sec) 5296 */ 5297 if (!skip_hw_reset && need_full_reset) { 5298 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5299 /* For XGMI run all resets in parallel to speed up the process */ 5300 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5301 tmp_adev->gmc.xgmi.pending_reset = false; 5302 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 5303 r = -EALREADY; 5304 } else 5305 r = amdgpu_asic_reset(tmp_adev); 5306 5307 if (r) { 5308 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 5309 r, adev_to_drm(tmp_adev)->unique); 5310 goto out; 5311 } 5312 } 5313 5314 /* For XGMI wait for all resets to complete before proceeding */ 5315 if (!r) { 5316 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5317 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5318 flush_work(&tmp_adev->xgmi_reset_work); 5319 r = tmp_adev->asic_reset_res; 5320 if (r) 5321 break; 5322 } 5323 } 5324 } 5325 } 5326 5327 if (!r && amdgpu_ras_intr_triggered()) { 5328 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5329 amdgpu_ras_reset_error_count(tmp_adev, AMDGPU_RAS_BLOCK__MMHUB); 5330 } 5331 5332 amdgpu_ras_intr_cleared(); 5333 } 5334 5335 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5336 if (need_full_reset) { 5337 /* post card */ 5338 amdgpu_ras_set_fed(tmp_adev, false); 5339 r = amdgpu_device_asic_init(tmp_adev); 5340 if (r) { 5341 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5342 } else { 5343 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5344 5345 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5346 if (r) 5347 goto out; 5348 5349 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5350 5351 amdgpu_coredump(tmp_adev, vram_lost, reset_context); 5352 5353 if (vram_lost) { 5354 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5355 amdgpu_inc_vram_lost(tmp_adev); 5356 } 5357 5358 r = amdgpu_device_fw_loading(tmp_adev); 5359 if (r) 5360 return r; 5361 5362 r = amdgpu_xcp_restore_partition_mode( 5363 tmp_adev->xcp_mgr); 5364 if (r) 5365 goto out; 5366 5367 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5368 if (r) 5369 goto out; 5370 5371 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 5372 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5373 5374 if (vram_lost) 5375 amdgpu_device_fill_reset_magic(tmp_adev); 5376 5377 /* 5378 * Add this ASIC back as tracked, as the reset already 5379 * completed successfully. 5380 */ 5381 amdgpu_register_gpu_instance(tmp_adev); 5382 5383 if (!reset_context->hive && 5384 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5385 amdgpu_xgmi_add_device(tmp_adev); 5386 5387 r = amdgpu_device_ip_late_init(tmp_adev); 5388 if (r) 5389 goto out; 5390 5391 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 5392 5393 /* 5394 * The GPU enters a bad state once the number of faulty pages 5395 * reported by ECC reaches the threshold, and RAS recovery is 5396 * scheduled next. So add one check here to break recovery if 5397 * the bad page threshold has indeed been exceeded, and remind 5398 * the user to retire this GPU or to set a bigger 5399 * bad_page_threshold value to get past this 5400 * check the next time the driver 5401 * is probed. 5402 */ 5403 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) { 5404 /* must succeed.
*/ 5405 amdgpu_ras_resume(tmp_adev); 5406 } else { 5407 r = -EINVAL; 5408 goto out; 5409 } 5410 5411 /* Update PSP FW topology after reset */ 5412 if (reset_context->hive && 5413 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5414 r = amdgpu_xgmi_update_topology( 5415 reset_context->hive, tmp_adev); 5416 } 5417 } 5418 5419 out: 5420 if (!r) { 5421 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5422 r = amdgpu_ib_ring_tests(tmp_adev); 5423 if (r) { 5424 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5425 need_full_reset = true; 5426 r = -EAGAIN; 5427 goto end; 5428 } 5429 } 5430 5431 if (!r) 5432 r = amdgpu_device_recover_vram(tmp_adev); 5433 else 5434 tmp_adev->asic_reset_res = r; 5435 } 5436 5437 end: 5438 if (need_full_reset) 5439 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5440 else 5441 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5442 return r; 5443 } 5444 5445 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5446 { 5447 5448 switch (amdgpu_asic_reset_method(adev)) { 5449 case AMD_RESET_METHOD_MODE1: 5450 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5451 break; 5452 case AMD_RESET_METHOD_MODE2: 5453 adev->mp1_state = PP_MP1_STATE_RESET; 5454 break; 5455 default: 5456 adev->mp1_state = PP_MP1_STATE_NONE; 5457 break; 5458 } 5459 } 5460 5461 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5462 { 5463 amdgpu_vf_error_trans_all(adev); 5464 adev->mp1_state = PP_MP1_STATE_NONE; 5465 } 5466 5467 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5468 { 5469 struct pci_dev *p = NULL; 5470 5471 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5472 adev->pdev->bus->number, 1); 5473 if (p) { 5474 pm_runtime_enable(&(p->dev)); 5475 pm_runtime_resume(&(p->dev)); 5476 } 5477 5478 pci_dev_put(p); 5479 } 5480 5481 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5482 { 5483 enum amd_reset_method reset_method; 5484 struct pci_dev *p = NULL; 5485 u64 expires; 5486 5487 /* 5488 * For now, only BACO and mode1 reset are confirmed to 5489 * suffer from the audio issue if not properly suspended. 5490 */ 5491 reset_method = amdgpu_asic_reset_method(adev); 5492 if ((reset_method != AMD_RESET_METHOD_BACO) && 5493 (reset_method != AMD_RESET_METHOD_MODE1)) 5494 return -EINVAL; 5495 5496 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5497 adev->pdev->bus->number, 1); 5498 if (!p) 5499 return -ENODEV; 5500 5501 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5502 if (!expires) 5503 /* 5504 * If we cannot get the audio device autosuspend delay, 5505 * a fixed 4S interval will be used. Since 3S is the 5506 * audio controller's default autosuspend delay setting, 5507 * the 4S used here is guaranteed to cover it. 5508 */ 5509 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5510 5511 while (!pm_runtime_status_suspended(&(p->dev))) { 5512 if (!pm_runtime_suspend(&(p->dev))) 5513 break; 5514 5515 if (expires < ktime_get_mono_fast_ns()) { 5516 dev_warn(adev->dev, "failed to suspend display audio\n"); 5517 pci_dev_put(p); 5518 /* TODO: abort the succeeding gpu reset?
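For now the caller simply treats this as "audio was not suspended" and the GPU reset proceeds anyway.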
*/ 5519 return -ETIMEDOUT; 5520 } 5521 } 5522 5523 pm_runtime_disable(&(p->dev)); 5524 5525 pci_dev_put(p); 5526 return 0; 5527 } 5528 5529 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5530 { 5531 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5532 5533 #if defined(CONFIG_DEBUG_FS) 5534 if (!amdgpu_sriov_vf(adev)) 5535 cancel_work(&adev->reset_work); 5536 #endif 5537 5538 if (adev->kfd.dev) 5539 cancel_work(&adev->kfd.reset_work); 5540 5541 if (amdgpu_sriov_vf(adev)) 5542 cancel_work(&adev->virt.flr_work); 5543 5544 if (con && adev->ras_enabled) 5545 cancel_work(&con->recovery_work); 5546 5547 } 5548 5549 static int amdgpu_device_health_check(struct list_head *device_list_handle) 5550 { 5551 struct amdgpu_device *tmp_adev; 5552 int ret = 0; 5553 u32 status; 5554 5555 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5556 pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status); 5557 if (PCI_POSSIBLE_ERROR(status)) { 5558 dev_err(tmp_adev->dev, "device lost from bus!"); 5559 ret = -ENODEV; 5560 } 5561 } 5562 5563 return ret; 5564 } 5565 5566 /** 5567 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5568 * 5569 * @adev: amdgpu_device pointer 5570 * @job: which job trigger hang 5571 * @reset_context: amdgpu reset context pointer 5572 * 5573 * Attempt to reset the GPU if it has hung (all asics). 5574 * Attempt to do soft-reset or full-reset and reinitialize Asic 5575 * Returns 0 for success or an error on failure. 5576 */ 5577 5578 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5579 struct amdgpu_job *job, 5580 struct amdgpu_reset_context *reset_context) 5581 { 5582 struct list_head device_list, *device_list_handle = NULL; 5583 bool job_signaled = false; 5584 struct amdgpu_hive_info *hive = NULL; 5585 struct amdgpu_device *tmp_adev = NULL; 5586 int i, r = 0; 5587 bool need_emergency_restart = false; 5588 bool audio_suspended = false; 5589 5590 /* 5591 * Special case: RAS triggered and full reset isn't supported 5592 */ 5593 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5594 5595 /* 5596 * Flush RAM to disk so that after reboot 5597 * the user can read log and see why the system rebooted. 5598 */ 5599 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 5600 amdgpu_ras_get_context(adev)->reboot) { 5601 DRM_WARN("Emergency reboot."); 5602 5603 ksys_sync_helper(); 5604 emergency_restart(); 5605 } 5606 5607 dev_info(adev->dev, "GPU %s begin!\n", 5608 need_emergency_restart ? "jobs stop":"reset"); 5609 5610 if (!amdgpu_sriov_vf(adev)) 5611 hive = amdgpu_get_xgmi_hive(adev); 5612 if (hive) 5613 mutex_lock(&hive->hive_lock); 5614 5615 reset_context->job = job; 5616 reset_context->hive = hive; 5617 /* 5618 * Build list of devices to reset. 5619 * In case we are in XGMI hive mode, resort the device list 5620 * to put adev in the 1st position. 
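* The first entry of the resulting list is the device whose reset domain gets locked below; for an XGMI hive all nodes share that domain.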
5621 */ 5622 INIT_LIST_HEAD(&device_list); 5623 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { 5624 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5625 list_add_tail(&tmp_adev->reset_list, &device_list); 5626 if (adev->shutdown) 5627 tmp_adev->shutdown = true; 5628 } 5629 if (!list_is_first(&adev->reset_list, &device_list)) 5630 list_rotate_to_front(&adev->reset_list, &device_list); 5631 device_list_handle = &device_list; 5632 } else { 5633 list_add_tail(&adev->reset_list, &device_list); 5634 device_list_handle = &device_list; 5635 } 5636 5637 if (!amdgpu_sriov_vf(adev)) { 5638 r = amdgpu_device_health_check(device_list_handle); 5639 if (r) 5640 goto end_reset; 5641 } 5642 5643 /* We need to lock reset domain only once both for XGMI and single device */ 5644 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5645 reset_list); 5646 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5647 5648 /* block all schedulers and reset given job's ring */ 5649 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5650 5651 amdgpu_device_set_mp1_state(tmp_adev); 5652 5653 /* 5654 * Try to put the audio codec into the suspend state 5655 * before the gpu reset starts. 5656 * 5657 * Because the power domain of the graphics device 5658 * is shared with the AZ power domain, without this 5659 * we may change the audio hardware from behind 5660 * the audio driver's back. That will trigger 5661 * some audio codec errors. 5662 */ 5663 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5664 audio_suspended = true; 5665 5666 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5667 5668 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5669 5670 if (!amdgpu_sriov_vf(tmp_adev)) 5671 amdgpu_amdkfd_pre_reset(tmp_adev); 5672 5673 /* 5674 * Mark these ASICs, which are about to be reset, as 5675 * untracked first and add them back after the reset completes 5676 */ 5677 amdgpu_unregister_gpu_instance(tmp_adev); 5678 5679 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5680 5681 /* disable ras on ALL IPs */ 5682 if (!need_emergency_restart && 5683 amdgpu_device_ip_need_full_reset(tmp_adev)) 5684 amdgpu_ras_suspend(tmp_adev); 5685 5686 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5687 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5688 5689 if (!amdgpu_ring_sched_ready(ring)) 5690 continue; 5691 5692 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5693 5694 if (need_emergency_restart) 5695 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5696 } 5697 atomic_inc(&tmp_adev->gpu_reset_counter); 5698 } 5699 5700 if (need_emergency_restart) 5701 goto skip_sched_resume; 5702 5703 /* 5704 * Must check guilty signal here since after this point all old 5705 * HW fences are force signaled. 5706 * 5707 * job->base holds a reference to parent fence 5708 */ 5709 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5710 job_signaled = true; 5711 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5712 goto skip_hw_reset; 5713 } 5714 5715 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5716 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5717 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5718 /* TODO: Should we stop? */ 5719 if (r) { 5720 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5721 r, adev_to_drm(tmp_adev)->unique); 5722 tmp_adev->asic_reset_res = r; 5723 } 5724 5725 if (!amdgpu_sriov_vf(tmp_adev)) 5726 /* 5727 * Drop all pending non-scheduler resets.
			 * Scheduler resets were already dropped during drm_sched_stop.
			 */
			amdgpu_device_stop_pending_resets(tmp_adev);
	}

	/* Actual ASIC resets if needed. */
	/* Host driver will handle XGMI hive reset for SRIOV */
	if (amdgpu_sriov_vf(adev)) {
		r = amdgpu_device_reset_sriov(adev, job ? false : true);
		if (r)
			adev->asic_reset_res = r;

		/* Aldebaran and gfx_11_0_3 support RAS in SRIOV, so RAS needs to be resumed during reset */
		if (amdgpu_ip_version(adev, GC_HWIP, 0) ==
			    IP_VERSION(9, 4, 2) ||
		    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
		    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3))
			amdgpu_ras_resume(adev);
	} else {
		r = amdgpu_do_asic_reset(device_list_handle, reset_context);
		if (r && r == -EAGAIN)
			goto retry;
	}

skip_hw_reset:

	/* Post ASIC reset for all devs. */
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!amdgpu_ring_sched_ready(ring))
				continue;

			drm_sched_start(&ring->sched, true);
		}

		if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));

		if (tmp_adev->asic_reset_res)
			r = tmp_adev->asic_reset_res;

		tmp_adev->asic_reset_res = 0;

		if (r) {
			/* bad news, how do we tell it to userspace? */
			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
		} else {
			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
			if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
				DRM_WARN("smart shift update failed\n");
		}
	}

skip_sched_resume:
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		/* unlock kfd: SRIOV would do it separately */
		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_post_reset(tmp_adev);

		/* kfd_post_reset will do nothing if the kfd device is not initialized,
		 * so bring up kfd here if it wasn't initialized before.
		 */
		if (!adev->kfd.init_complete)
			amdgpu_amdkfd_device_init(adev);

		if (audio_suspended)
			amdgpu_device_resume_display_audio(tmp_adev);

		amdgpu_device_unset_mp1_state(tmp_adev);

		amdgpu_ras_set_error_query_ready(tmp_adev, true);
	}

	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
				    reset_list);
	amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);

end_reset:
	if (hive) {
		mutex_unlock(&hive->hive_lock);
		amdgpu_put_xgmi_hive(hive);
	}

	if (r)
		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);

	atomic_set(&adev->reset_domain->reset_res, r);
	return r;
}
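
/*
 * Illustrative sketch only, not part of the driver: a caller such as a
 * job-timeout or error handler would presumably fill an amdgpu_reset_context
 * and invoke the recovery path roughly the way the slot-reset handler further
 * below in this file does; anything outside this file is an assumption:
 *
 *	struct amdgpu_reset_context reset_context;
 *
 *	memset(&reset_context, 0, sizeof(reset_context));
 *	reset_context.method = AMD_RESET_METHOD_NONE;
 *	reset_context.reset_req_dev = adev;
 *	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
 *
 *	r = amdgpu_device_gpu_recover(adev, job, &reset_context);
 */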

/**
 * amdgpu_device_partner_bandwidth - find the bandwidth of the appropriate partner
 *
 * @adev: amdgpu_device pointer
 * @speed: pointer to the speed of the link
 * @width: pointer to the width of the link
 *
 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
 * first physical partner to an AMD dGPU.
 * This will exclude any virtual switches and links.
 */
static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
					    enum pci_bus_speed *speed,
					    enum pcie_link_width *width)
{
	struct pci_dev *parent = adev->pdev;

	if (!speed || !width)
		return;

	*speed = PCI_SPEED_UNKNOWN;
	*width = PCIE_LNK_WIDTH_UNKNOWN;

	while ((parent = pci_upstream_bridge(parent))) {
		/* skip upstream/downstream switches internal to the dGPU */
		if (parent->vendor == PCI_VENDOR_ID_ATI)
			continue;
		*speed = pcie_get_speed_cap(parent);
		*width = pcie_get_width_cap(parent);
		break;
	}
}

/**
 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIe capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIe config space may not be available.
 */
static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
{
	struct pci_dev *pdev;
	enum pci_bus_speed speed_cap, platform_speed_cap;
	enum pcie_link_width platform_link_width;

	if (amdgpu_pcie_gen_cap)
		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;

	if (amdgpu_pcie_lane_cap)
		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;

	/* covers APUs as well */
	if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
		if (adev->pm.pcie_gen_mask == 0)
			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
		if (adev->pm.pcie_mlw_mask == 0)
			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
		return;
	}

	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
		return;

	amdgpu_device_partner_bandwidth(adev, &platform_speed_cap,
					&platform_link_width);

	if (adev->pm.pcie_gen_mask == 0) {
		/* asic caps */
		pdev = adev->pdev;
		speed_cap = pcie_get_speed_cap(pdev);
		if (speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
		} else {
			if (speed_cap == PCIE_SPEED_32_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
			else if (speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
		}
		/* platform caps */
		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
		} else {
			if (platform_speed_cap == PCIE_SPEED_32_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
			else if (platform_speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (platform_speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (platform_speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;

		}
	}
	if (adev->pm.pcie_mlw_mask == 0) {
		if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
		} else {
			switch (platform_link_width) {
			case PCIE_LNK_X32:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X16:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X12:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X8:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X4:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X2:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X1:
				adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
				break;
			default:
				break;
			}
		}
	}
}
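
/*
 * Worked example, illustrative comment only: on a hypothetical board where
 * the ASIC reports PCIE_SPEED_8_0GT and the first physical upstream partner
 * reports PCIE_SPEED_8_0GT at PCIE_LNK_X8, the code above would leave:
 *
 *	adev->pm.pcie_gen_mask =
 *		CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
 *		CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
 *		CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
 *		CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
 *		CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
 *		CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3;
 *	adev->pm.pcie_mlw_mask =
 *		CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
 *		CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
 *		CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
 *		CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
 *
 * Both masks can also be overridden via amdgpu_pcie_gen_cap and
 * amdgpu_pcie_lane_cap, which are checked at the top of the function.
 */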

/**
 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
 *
 * @adev: amdgpu_device pointer
 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
 *
 * Return true if @peer_adev can access (DMA) @adev through the PCIe
 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
 * @peer_adev.
 */
bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
				      struct amdgpu_device *peer_adev)
{
#ifdef CONFIG_HSA_AMD_P2P
	uint64_t address_mask = peer_adev->dev->dma_mask ?
		~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
	resource_size_t aper_limit =
		adev->gmc.aper_base + adev->gmc.aper_size - 1;
	bool p2p_access =
		!adev->gmc.xgmi.connected_to_cpu &&
		!(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);

	return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
		adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
		!(adev->gmc.aper_base & address_mask ||
		  aper_limit & address_mask));
#else
	return false;
#endif
}
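
/*
 * Illustrative sketch only, not part of the driver: a hypothetical caller
 * deciding whether to map peer VRAM would typically check accessibility in
 * both directions before enabling peer-to-peer DMA between two devices:
 *
 *	if (amdgpu_device_is_peer_accessible(adev, peer_adev) &&
 *	    amdgpu_device_is_peer_accessible(peer_adev, adev)) {
 *		// both BARs are reachable; P2P mappings can be used
 *	}
 */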

int amdgpu_device_baco_enter(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (!amdgpu_device_supports_baco(dev))
		return -ENOTSUPP;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);

	return amdgpu_dpm_baco_enter(adev);
}

int amdgpu_device_baco_exit(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	int ret = 0;

	if (!amdgpu_device_supports_baco(dev))
		return -ENOTSUPP;

	ret = amdgpu_dpm_baco_exit(adev);
	if (ret)
		return ret;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);

	if (amdgpu_passthrough(adev) &&
	    adev->nbio.funcs->clear_doorbell_interrupt)
		adev->nbio.funcs->clear_doorbell_interrupt(adev);

	return 0;
}

/**
 * amdgpu_pci_error_detected - Called when a PCI error is detected.
 * @pdev: PCI device struct
 * @state: PCI channel state
 *
 * Description: Called when a PCI error is detected.
 *
 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		DRM_WARN("No support for XGMI hive yet...");
		return PCI_ERS_RESULT_DISCONNECT;
	}

	adev->pci_channel_state = state;

	switch (state) {
	case pci_channel_io_normal:
		return PCI_ERS_RESULT_CAN_RECOVER;
	/* Fatal error, prepare for slot reset */
	case pci_channel_io_frozen:
		/*
		 * Locking adev->reset_domain->sem will prevent any external access
		 * to the GPU during PCI error recovery
		 */
		amdgpu_device_lock_reset_domain(adev->reset_domain);
		amdgpu_device_set_mp1_state(adev);

		/*
		 * Block any work scheduling as we do for a regular GPU reset
		 * for the duration of the recovery
		 */
		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = adev->rings[i];

			if (!amdgpu_ring_sched_ready(ring))
				continue;

			drm_sched_stop(&ring->sched, NULL);
		}
		atomic_inc(&adev->gpu_reset_counter);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		/* Permanent error, prepare for device removal */
		return PCI_ERS_RESULT_DISCONNECT;
	}

	return PCI_ERS_RESULT_NEED_RESET;
}

/**
 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
 * @pdev: pointer to PCI device
 */
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
{
	DRM_INFO("PCI error: mmio enabled callback!!\n");

	/* TODO - dump whatever for debugging purposes */

	/* This is called only if amdgpu_pci_error_detected returns
	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
	 * works, no need to reset the slot.
	 */

	return PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_slot_reset - Called when the PCI slot has been reset.
 * @pdev: PCI device struct
 *
 * Description: This routine is called by the PCI error recovery
 * code after the PCI slot has been reset, just before we
 * should resume normal operations.
 */
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r, i;
	struct amdgpu_reset_context reset_context;
	u32 memsize;
	struct list_head device_list;
	struct amdgpu_hive_info *hive;
	int hive_ras_recovery = 0;
	struct amdgpu_ras *ras;

	/* PCI error slot reset should be skipped during RAS recovery */
	hive = amdgpu_get_xgmi_hive(adev);
	if (hive) {
		hive_ras_recovery = atomic_read(&hive->ras_recovery);
		amdgpu_put_xgmi_hive(hive);
	}
	ras = amdgpu_ras_get_context(adev);
	if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3)) &&
	    ras && (atomic_read(&ras->in_recovery) || hive_ras_recovery))
		return PCI_ERS_RESULT_RECOVERED;

	DRM_INFO("PCI error: slot reset callback!!\n");

	memset(&reset_context, 0, sizeof(reset_context));

	INIT_LIST_HEAD(&device_list);
	list_add_tail(&adev->reset_list, &device_list);

	/* wait for the asic to come out of reset */
	msleep(500);

	/* Restore PCI config space */
	amdgpu_device_load_pci_state(pdev);

	/* confirm the ASIC came out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		memsize = amdgpu_asic_get_config_memsize(adev);

		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}
	if (memsize == 0xffffffff) {
		r = -ETIME;
		goto out;
	}

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);

	adev->no_hw_access = true;
	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
	adev->no_hw_access = false;
	if (r)
		goto out;

	r = amdgpu_do_asic_reset(&device_list, &reset_context);

out:
	if (!r) {
		if (amdgpu_device_cache_pci_state(adev->pdev))
			pci_restore_state(adev->pdev);

		DRM_INFO("PCIe error recovery succeeded\n");
	} else {
		DRM_ERROR("PCIe error recovery failed, err:%d", r);
		amdgpu_device_unset_mp1_state(adev);
		amdgpu_device_unlock_reset_domain(adev->reset_domain);
	}

	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it's
 * OK to resume normal operation.
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: resume callback!!\n");

	/* Only continue execution for the case of pci_channel_io_frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!amdgpu_ring_sched_ready(ring))
			continue;

		drm_sched_start(&ring->sched, true);
	}

	amdgpu_device_unset_mp1_state(adev);
	amdgpu_device_unlock_reset_domain(adev->reset_domain);
}
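
/*
 * Illustrative sketch only, not part of this file: the four callbacks above
 * are presumably wired into the driver's struct pci_error_handlers when the
 * PCI driver is registered, roughly as below; the variable name is an
 * assumption, only the field names come from the core PCI API:
 *
 *	static const struct pci_error_handlers amdgpu_pci_err_handler = {
 *		.error_detected	= amdgpu_pci_error_detected,
 *		.mmio_enabled	= amdgpu_pci_mmio_enabled,
 *		.slot_reset	= amdgpu_pci_slot_reset,
 *		.resume		= amdgpu_pci_resume,
 *	};
 */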

bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
			     struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
				  struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}

int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. It helps preserve the error context when an error occurs.
 * Compared to a simple hang, the system remains stable at least for SSH
 * access. Then it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc),
 *    clears all CPU mappings to the device, disallows remappings through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in-flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	amdgpu_xcp_dev_unplug(adev);
	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				 u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				  u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
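
/*
 * Illustrative sketch only, not part of the driver: the two helpers above
 * implement an index/data pair access protected by pcie_idx_lock, so a
 * hypothetical read-modify-write of a PCIe port register would look roughly
 * like this (the register and bit names are placeholders, not real
 * definitions):
 *
 *	u32 val;
 *
 *	val = amdgpu_device_pcie_port_rreg(adev, HYPOTHETICAL_PCIE_PORT_REG);
 *	val |= HYPOTHETICAL_ENABLE_BIT;
 *	amdgpu_device_pcie_port_wreg(adev, HYPOTHETICAL_PCIE_PORT_REG, val);
 */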

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	do {
		dma_fence_put(old);
		rcu_read_lock();
		old = dma_fence_get_rcu_safe(&adev->gang_submit);
		rcu_read_unlock();

		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old))
			return old;

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	dma_fence_put(old);
	return NULL;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
				    uint32_t inst, uint32_t reg_addr, char reg_name[],
				    uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				 inst, reg_name, (uint32_t)expected_value,
				 (uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}
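
/*
 * Illustrative sketch only, not part of the driver: a caller polling for a
 * status bit with the helper above would look roughly like this; the register
 * offset, name, expected value and mask are hypothetical placeholders:
 *
 *	uint32_t r;
 *
 *	r = amdgpu_device_wait_on_rreg(adev, 0, HYPOTHETICAL_STATUS_REG,
 *				       "HYPOTHETICAL_STATUS", 0x1, 0x1);
 *	if (r)
 *		dev_warn(adev->dev, "status bit never asserted\n");
 */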