1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 29 #include <linux/aperture.h> 30 #include <linux/power_supply.h> 31 #include <linux/kthread.h> 32 #include <linux/module.h> 33 #include <linux/console.h> 34 #include <linux/slab.h> 35 #include <linux/iommu.h> 36 #include <linux/pci.h> 37 #include <linux/pci-p2pdma.h> 38 #include <linux/apple-gmux.h> 39 40 #include <drm/drm_atomic_helper.h> 41 #include <drm/drm_crtc_helper.h> 42 #include <drm/drm_fb_helper.h> 43 #include <drm/drm_probe_helper.h> 44 #include <drm/amdgpu_drm.h> 45 #include <linux/device.h> 46 #include <linux/vgaarb.h> 47 #include <linux/vga_switcheroo.h> 48 #include <linux/efi.h> 49 #include "amdgpu.h" 50 #include "amdgpu_trace.h" 51 #include "amdgpu_i2c.h" 52 #include "atom.h" 53 #include "amdgpu_atombios.h" 54 #include "amdgpu_atomfirmware.h" 55 #include "amd_pcie.h" 56 #ifdef CONFIG_DRM_AMDGPU_SI 57 #include "si.h" 58 #endif 59 #ifdef CONFIG_DRM_AMDGPU_CIK 60 #include "cik.h" 61 #endif 62 #include "vi.h" 63 #include "soc15.h" 64 #include "nv.h" 65 #include "bif/bif_4_1_d.h" 66 #include <linux/firmware.h> 67 #include "amdgpu_vf_error.h" 68 69 #include "amdgpu_amdkfd.h" 70 #include "amdgpu_pm.h" 71 72 #include "amdgpu_xgmi.h" 73 #include "amdgpu_ras.h" 74 #include "amdgpu_pmu.h" 75 #include "amdgpu_fru_eeprom.h" 76 #include "amdgpu_reset.h" 77 #include "amdgpu_virt.h" 78 #include "amdgpu_dev_coredump.h" 79 80 #include <linux/suspend.h> 81 #include <drm/task_barrier.h> 82 #include <linux/pm_runtime.h> 83 84 #include <drm/drm_drv.h> 85 86 #if IS_ENABLED(CONFIG_X86) 87 #include <asm/intel-family.h> 88 #endif 89 90 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 91 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 92 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 93 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 94 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 95 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 96 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 97 98 #define AMDGPU_RESUME_MS 2000 99 #define AMDGPU_MAX_RETRY_LIMIT 2 100 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 101 #define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2) 102 #define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2) 103 #define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2) 104 105 static const 
struct drm_driver amdgpu_kms_driver; 106 107 const char *amdgpu_asic_name[] = { 108 "TAHITI", 109 "PITCAIRN", 110 "VERDE", 111 "OLAND", 112 "HAINAN", 113 "BONAIRE", 114 "KAVERI", 115 "KABINI", 116 "HAWAII", 117 "MULLINS", 118 "TOPAZ", 119 "TONGA", 120 "FIJI", 121 "CARRIZO", 122 "STONEY", 123 "POLARIS10", 124 "POLARIS11", 125 "POLARIS12", 126 "VEGAM", 127 "VEGA10", 128 "VEGA12", 129 "VEGA20", 130 "RAVEN", 131 "ARCTURUS", 132 "RENOIR", 133 "ALDEBARAN", 134 "NAVI10", 135 "CYAN_SKILLFISH", 136 "NAVI14", 137 "NAVI12", 138 "SIENNA_CICHLID", 139 "NAVY_FLOUNDER", 140 "VANGOGH", 141 "DIMGREY_CAVEFISH", 142 "BEIGE_GOBY", 143 "YELLOW_CARP", 144 "IP DISCOVERY", 145 "LAST", 146 }; 147 148 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev); 149 150 /** 151 * DOC: pcie_replay_count 152 * 153 * The amdgpu driver provides a sysfs API for reporting the total number 154 * of PCIe replays (NAKs) 155 * The file pcie_replay_count is used for this and returns the total 156 * number of replays as a sum of the NAKs generated and NAKs received 157 */ 158 159 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 160 struct device_attribute *attr, char *buf) 161 { 162 struct drm_device *ddev = dev_get_drvdata(dev); 163 struct amdgpu_device *adev = drm_to_adev(ddev); 164 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 165 166 return sysfs_emit(buf, "%llu\n", cnt); 167 } 168 169 static DEVICE_ATTR(pcie_replay_count, 0444, 170 amdgpu_device_get_pcie_replay_count, NULL); 171 172 static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj, 173 struct bin_attribute *attr, char *buf, 174 loff_t ppos, size_t count) 175 { 176 struct device *dev = kobj_to_dev(kobj); 177 struct drm_device *ddev = dev_get_drvdata(dev); 178 struct amdgpu_device *adev = drm_to_adev(ddev); 179 ssize_t bytes_read; 180 181 switch (ppos) { 182 case AMDGPU_SYS_REG_STATE_XGMI: 183 bytes_read = amdgpu_asic_get_reg_state( 184 adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count); 185 break; 186 case AMDGPU_SYS_REG_STATE_WAFL: 187 bytes_read = amdgpu_asic_get_reg_state( 188 adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count); 189 break; 190 case AMDGPU_SYS_REG_STATE_PCIE: 191 bytes_read = amdgpu_asic_get_reg_state( 192 adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count); 193 break; 194 case AMDGPU_SYS_REG_STATE_USR: 195 bytes_read = amdgpu_asic_get_reg_state( 196 adev, AMDGPU_REG_STATE_TYPE_USR, buf, count); 197 break; 198 case AMDGPU_SYS_REG_STATE_USR_1: 199 bytes_read = amdgpu_asic_get_reg_state( 200 adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count); 201 break; 202 default: 203 return -EINVAL; 204 } 205 206 return bytes_read; 207 } 208 209 BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL, 210 AMDGPU_SYS_REG_STATE_END); 211 212 int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev) 213 { 214 int ret; 215 216 if (!amdgpu_asic_get_reg_state_supported(adev)) 217 return 0; 218 219 ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 220 221 return ret; 222 } 223 224 void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev) 225 { 226 if (!amdgpu_asic_get_reg_state_supported(adev)) 227 return; 228 sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 229 } 230 231 /** 232 * DOC: board_info 233 * 234 * The amdgpu driver provides a sysfs API for giving board related information. 
 * It provides the form factor information in the format
 *
 *   type : form factor
 *
 * Possible form factor values
 *
 * - "cem"     - PCIE CEM card
 * - "oam"     - Open Compute Accelerator Module
 * - "unknown" - Not known
 *
 */

static ssize_t amdgpu_device_get_board_info(struct device *dev,
					    struct device_attribute *attr,
					    char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
	const char *pkg;

	if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
		pkg_type = adev->smuio.funcs->get_pkg_type(adev);

	switch (pkg_type) {
	case AMDGPU_PKG_TYPE_CEM:
		pkg = "cem";
		break;
	case AMDGPU_PKG_TYPE_OAM:
		pkg = "oam";
		break;
	default:
		pkg = "unknown";
		break;
	}

	return sysfs_emit(buf, "%s : %s\n", "type", pkg);
}

static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);

static struct attribute *amdgpu_board_attrs[] = {
	&dev_attr_board_info.attr,
	NULL,
};

static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
					     struct attribute *attr, int n)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	if (adev->flags & AMD_IS_APU)
		return 0;

	return attr->mode;
}

static const struct attribute_group amdgpu_board_attrs_group = {
	.attrs = amdgpu_board_attrs,
	.is_visible = amdgpu_board_attrs_is_visible
};

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);

/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Return:
 * 1 if the device supports BACO;
 * 3 if the device supports MACO (only works if BACO is supported)
 * otherwise return 0.
 */
int amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
{
	struct drm_device *dev;
	int bamaco_support;

	dev = adev_to_drm(adev);

	adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
	bamaco_support = amdgpu_device_supports_baco(dev);

	switch (amdgpu_runtime_pm) {
	case 2:
		if (bamaco_support & MACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
			dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
		} else if (bamaco_support == BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Requested mode BAMACO not available, falling back to BACO\n");
		}
		break;
	case 1:
		if (bamaco_support & BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Forcing BACO for runtime pm\n");
		}
		break;
	case -1:
	case -2:
		if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
			dev_info(adev->dev, "Using ATPX for runtime pm\n");
		} else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
			dev_info(adev->dev, "Using BOCO for runtime pm\n");
		} else {
			if (!bamaco_support)
				goto no_runtime_pm;

			switch (adev->asic_type) {
			case CHIP_VEGA20:
			case CHIP_ARCTURUS:
				/* BACO is not supported on vega20 and arcturus */
				break;
			case CHIP_VEGA10:
				/* enable BACO as runpm mode if noretry=0 */
				if (!adev->gmc.noretry)
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			default:
				/* enable BACO as runpm mode on CI+ */
				adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			}

			if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
				if (bamaco_support & MACO_SUPPORT) {
					adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
					dev_info(adev->dev, "Using BAMACO for runtime pm\n");
				} else {
					dev_info(adev->dev, "Using BACO for runtime pm\n");
				}
			}
		}
		break;
	case 0:
		dev_info(adev->dev, "runtime pm is manually disabled\n");
		break;
	default:
		break;
	}

no_runtime_pm:
	if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
		dev_info(adev->dev, "Runtime PM not available\n");
}

/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * Returns the number of bytes that have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
				uint32_t reg, uint32_t acc_flags,
				uint32_t xcc_id)
{
	uint32_t ret, rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	return ret;
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
713 */ 714 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) 715 { 716 if (amdgpu_device_skip_hw_access(adev)) 717 return; 718 719 if (offset < adev->rmmio_size) 720 writeb(value, adev->rmmio + offset); 721 else 722 BUG(); 723 } 724 725 /** 726 * amdgpu_device_wreg - write to a memory mapped IO or indirect register 727 * 728 * @adev: amdgpu_device pointer 729 * @reg: dword aligned register offset 730 * @v: 32 bit value to write to the register 731 * @acc_flags: access flags which require special behavior 732 * 733 * Writes the value specified to the offset specified. 734 */ 735 void amdgpu_device_wreg(struct amdgpu_device *adev, 736 uint32_t reg, uint32_t v, 737 uint32_t acc_flags) 738 { 739 if (amdgpu_device_skip_hw_access(adev)) 740 return; 741 742 if ((reg * 4) < adev->rmmio_size) { 743 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 744 amdgpu_sriov_runtime(adev) && 745 down_read_trylock(&adev->reset_domain->sem)) { 746 amdgpu_kiq_wreg(adev, reg, v, 0); 747 up_read(&adev->reset_domain->sem); 748 } else { 749 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 750 } 751 } else { 752 adev->pcie_wreg(adev, reg * 4, v); 753 } 754 755 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 756 } 757 758 /** 759 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 760 * 761 * @adev: amdgpu_device pointer 762 * @reg: mmio/rlc register 763 * @v: value to write 764 * @xcc_id: xcc accelerated compute core id 765 * 766 * this function is invoked only for the debugfs register access 767 */ 768 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 769 uint32_t reg, uint32_t v, 770 uint32_t xcc_id) 771 { 772 if (amdgpu_device_skip_hw_access(adev)) 773 return; 774 775 if (amdgpu_sriov_fullaccess(adev) && 776 adev->gfx.rlc.funcs && 777 adev->gfx.rlc.funcs->is_rlcg_access_range) { 778 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 779 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id); 780 } else if ((reg * 4) >= adev->rmmio_size) { 781 adev->pcie_wreg(adev, reg * 4, v); 782 } else { 783 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 784 } 785 } 786 787 /** 788 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC 789 * 790 * @adev: amdgpu_device pointer 791 * @reg: dword aligned register offset 792 * @v: 32 bit value to write to the register 793 * @acc_flags: access flags which require special behavior 794 * @xcc_id: xcc accelerated compute core id 795 * 796 * Writes the value specified to the offset specified. 
797 */ 798 void amdgpu_device_xcc_wreg(struct amdgpu_device *adev, 799 uint32_t reg, uint32_t v, 800 uint32_t acc_flags, uint32_t xcc_id) 801 { 802 uint32_t rlcg_flag; 803 804 if (amdgpu_device_skip_hw_access(adev)) 805 return; 806 807 if ((reg * 4) < adev->rmmio_size) { 808 if (amdgpu_sriov_vf(adev) && 809 !amdgpu_sriov_runtime(adev) && 810 adev->gfx.rlc.rlcg_reg_access_supported && 811 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, 812 GC_HWIP, true, 813 &rlcg_flag)) { 814 amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id)); 815 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 816 amdgpu_sriov_runtime(adev) && 817 down_read_trylock(&adev->reset_domain->sem)) { 818 amdgpu_kiq_wreg(adev, reg, v, xcc_id); 819 up_read(&adev->reset_domain->sem); 820 } else { 821 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 822 } 823 } else { 824 adev->pcie_wreg(adev, reg * 4, v); 825 } 826 } 827 828 /** 829 * amdgpu_device_indirect_rreg - read an indirect register 830 * 831 * @adev: amdgpu_device pointer 832 * @reg_addr: indirect register address to read from 833 * 834 * Returns the value of indirect register @reg_addr 835 */ 836 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 837 u32 reg_addr) 838 { 839 unsigned long flags, pcie_index, pcie_data; 840 void __iomem *pcie_index_offset; 841 void __iomem *pcie_data_offset; 842 u32 r; 843 844 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 845 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 846 847 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 848 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 849 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 850 851 writel(reg_addr, pcie_index_offset); 852 readl(pcie_index_offset); 853 r = readl(pcie_data_offset); 854 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 855 856 return r; 857 } 858 859 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev, 860 u64 reg_addr) 861 { 862 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 863 u32 r; 864 void __iomem *pcie_index_offset; 865 void __iomem *pcie_index_hi_offset; 866 void __iomem *pcie_data_offset; 867 868 if (unlikely(!adev->nbio.funcs)) { 869 pcie_index = AMDGPU_PCIE_INDEX_FALLBACK; 870 pcie_data = AMDGPU_PCIE_DATA_FALLBACK; 871 } else { 872 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 873 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 874 } 875 876 if (reg_addr >> 32) { 877 if (unlikely(!adev->nbio.funcs)) 878 pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK; 879 else 880 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 881 } else { 882 pcie_index_hi = 0; 883 } 884 885 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 886 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 887 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 888 if (pcie_index_hi != 0) 889 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 890 pcie_index_hi * 4; 891 892 writel(reg_addr, pcie_index_offset); 893 readl(pcie_index_offset); 894 if (pcie_index_hi != 0) { 895 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 896 readl(pcie_index_hi_offset); 897 } 898 r = readl(pcie_data_offset); 899 900 /* clear the high bits */ 901 if (pcie_index_hi != 0) { 902 writel(0, pcie_index_hi_offset); 903 readl(pcie_index_hi_offset); 904 } 905 906 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 907 908 return r; 909 } 910 911 /** 912 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 913 * 914 * @adev: 
amdgpu_device pointer 915 * @reg_addr: indirect register address to read from 916 * 917 * Returns the value of indirect register @reg_addr 918 */ 919 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 920 u32 reg_addr) 921 { 922 unsigned long flags, pcie_index, pcie_data; 923 void __iomem *pcie_index_offset; 924 void __iomem *pcie_data_offset; 925 u64 r; 926 927 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 928 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 929 930 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 931 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 932 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 933 934 /* read low 32 bits */ 935 writel(reg_addr, pcie_index_offset); 936 readl(pcie_index_offset); 937 r = readl(pcie_data_offset); 938 /* read high 32 bits */ 939 writel(reg_addr + 4, pcie_index_offset); 940 readl(pcie_index_offset); 941 r |= ((u64)readl(pcie_data_offset) << 32); 942 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 943 944 return r; 945 } 946 947 u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev, 948 u64 reg_addr) 949 { 950 unsigned long flags, pcie_index, pcie_data; 951 unsigned long pcie_index_hi = 0; 952 void __iomem *pcie_index_offset; 953 void __iomem *pcie_index_hi_offset; 954 void __iomem *pcie_data_offset; 955 u64 r; 956 957 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 958 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 959 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 960 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 961 962 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 963 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 964 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 965 if (pcie_index_hi != 0) 966 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 967 pcie_index_hi * 4; 968 969 /* read low 32 bits */ 970 writel(reg_addr, pcie_index_offset); 971 readl(pcie_index_offset); 972 if (pcie_index_hi != 0) { 973 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 974 readl(pcie_index_hi_offset); 975 } 976 r = readl(pcie_data_offset); 977 /* read high 32 bits */ 978 writel(reg_addr + 4, pcie_index_offset); 979 readl(pcie_index_offset); 980 if (pcie_index_hi != 0) { 981 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 982 readl(pcie_index_hi_offset); 983 } 984 r |= ((u64)readl(pcie_data_offset) << 32); 985 986 /* clear the high bits */ 987 if (pcie_index_hi != 0) { 988 writel(0, pcie_index_hi_offset); 989 readl(pcie_index_hi_offset); 990 } 991 992 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 993 994 return r; 995 } 996 997 /** 998 * amdgpu_device_indirect_wreg - write an indirect register address 999 * 1000 * @adev: amdgpu_device pointer 1001 * @reg_addr: indirect register offset 1002 * @reg_data: indirect register data 1003 * 1004 */ 1005 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 1006 u32 reg_addr, u32 reg_data) 1007 { 1008 unsigned long flags, pcie_index, pcie_data; 1009 void __iomem *pcie_index_offset; 1010 void __iomem *pcie_data_offset; 1011 1012 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1013 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1014 1015 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1016 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1017 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1018 1019 writel(reg_addr, pcie_index_offset); 1020 
readl(pcie_index_offset); 1021 writel(reg_data, pcie_data_offset); 1022 readl(pcie_data_offset); 1023 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1024 } 1025 1026 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev, 1027 u64 reg_addr, u32 reg_data) 1028 { 1029 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 1030 void __iomem *pcie_index_offset; 1031 void __iomem *pcie_index_hi_offset; 1032 void __iomem *pcie_data_offset; 1033 1034 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1035 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1036 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1037 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1038 else 1039 pcie_index_hi = 0; 1040 1041 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1042 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1043 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1044 if (pcie_index_hi != 0) 1045 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1046 pcie_index_hi * 4; 1047 1048 writel(reg_addr, pcie_index_offset); 1049 readl(pcie_index_offset); 1050 if (pcie_index_hi != 0) { 1051 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1052 readl(pcie_index_hi_offset); 1053 } 1054 writel(reg_data, pcie_data_offset); 1055 readl(pcie_data_offset); 1056 1057 /* clear the high bits */ 1058 if (pcie_index_hi != 0) { 1059 writel(0, pcie_index_hi_offset); 1060 readl(pcie_index_hi_offset); 1061 } 1062 1063 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1064 } 1065 1066 /** 1067 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 1068 * 1069 * @adev: amdgpu_device pointer 1070 * @reg_addr: indirect register offset 1071 * @reg_data: indirect register data 1072 * 1073 */ 1074 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 1075 u32 reg_addr, u64 reg_data) 1076 { 1077 unsigned long flags, pcie_index, pcie_data; 1078 void __iomem *pcie_index_offset; 1079 void __iomem *pcie_data_offset; 1080 1081 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1082 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1083 1084 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1085 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1086 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1087 1088 /* write low 32 bits */ 1089 writel(reg_addr, pcie_index_offset); 1090 readl(pcie_index_offset); 1091 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1092 readl(pcie_data_offset); 1093 /* write high 32 bits */ 1094 writel(reg_addr + 4, pcie_index_offset); 1095 readl(pcie_index_offset); 1096 writel((u32)(reg_data >> 32), pcie_data_offset); 1097 readl(pcie_data_offset); 1098 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1099 } 1100 1101 void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev, 1102 u64 reg_addr, u64 reg_data) 1103 { 1104 unsigned long flags, pcie_index, pcie_data; 1105 unsigned long pcie_index_hi = 0; 1106 void __iomem *pcie_index_offset; 1107 void __iomem *pcie_index_hi_offset; 1108 void __iomem *pcie_data_offset; 1109 1110 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1111 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1112 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1113 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1114 1115 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1116 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index 
* 4; 1117 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1118 if (pcie_index_hi != 0) 1119 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1120 pcie_index_hi * 4; 1121 1122 /* write low 32 bits */ 1123 writel(reg_addr, pcie_index_offset); 1124 readl(pcie_index_offset); 1125 if (pcie_index_hi != 0) { 1126 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1127 readl(pcie_index_hi_offset); 1128 } 1129 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1130 readl(pcie_data_offset); 1131 /* write high 32 bits */ 1132 writel(reg_addr + 4, pcie_index_offset); 1133 readl(pcie_index_offset); 1134 if (pcie_index_hi != 0) { 1135 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1136 readl(pcie_index_hi_offset); 1137 } 1138 writel((u32)(reg_data >> 32), pcie_data_offset); 1139 readl(pcie_data_offset); 1140 1141 /* clear the high bits */ 1142 if (pcie_index_hi != 0) { 1143 writel(0, pcie_index_hi_offset); 1144 readl(pcie_index_hi_offset); 1145 } 1146 1147 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1148 } 1149 1150 /** 1151 * amdgpu_device_get_rev_id - query device rev_id 1152 * 1153 * @adev: amdgpu_device pointer 1154 * 1155 * Return device rev_id 1156 */ 1157 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev) 1158 { 1159 return adev->nbio.funcs->get_rev_id(adev); 1160 } 1161 1162 /** 1163 * amdgpu_invalid_rreg - dummy reg read function 1164 * 1165 * @adev: amdgpu_device pointer 1166 * @reg: offset of register 1167 * 1168 * Dummy register read function. Used for register blocks 1169 * that certain asics don't have (all asics). 1170 * Returns the value in the register. 1171 */ 1172 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 1173 { 1174 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 1175 BUG(); 1176 return 0; 1177 } 1178 1179 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg) 1180 { 1181 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 1182 BUG(); 1183 return 0; 1184 } 1185 1186 /** 1187 * amdgpu_invalid_wreg - dummy reg write function 1188 * 1189 * @adev: amdgpu_device pointer 1190 * @reg: offset of register 1191 * @v: value to write to the register 1192 * 1193 * Dummy register read function. Used for register blocks 1194 * that certain asics don't have (all asics). 1195 */ 1196 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 1197 { 1198 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 1199 reg, v); 1200 BUG(); 1201 } 1202 1203 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v) 1204 { 1205 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n", 1206 reg, v); 1207 BUG(); 1208 } 1209 1210 /** 1211 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 1212 * 1213 * @adev: amdgpu_device pointer 1214 * @reg: offset of register 1215 * 1216 * Dummy register read function. Used for register blocks 1217 * that certain asics don't have (all asics). 1218 * Returns the value in the register. 
1219 */ 1220 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 1221 { 1222 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 1223 BUG(); 1224 return 0; 1225 } 1226 1227 static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg) 1228 { 1229 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 1230 BUG(); 1231 return 0; 1232 } 1233 1234 /** 1235 * amdgpu_invalid_wreg64 - dummy reg write function 1236 * 1237 * @adev: amdgpu_device pointer 1238 * @reg: offset of register 1239 * @v: value to write to the register 1240 * 1241 * Dummy register read function. Used for register blocks 1242 * that certain asics don't have (all asics). 1243 */ 1244 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 1245 { 1246 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 1247 reg, v); 1248 BUG(); 1249 } 1250 1251 static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v) 1252 { 1253 DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n", 1254 reg, v); 1255 BUG(); 1256 } 1257 1258 /** 1259 * amdgpu_block_invalid_rreg - dummy reg read function 1260 * 1261 * @adev: amdgpu_device pointer 1262 * @block: offset of instance 1263 * @reg: offset of register 1264 * 1265 * Dummy register read function. Used for register blocks 1266 * that certain asics don't have (all asics). 1267 * Returns the value in the register. 1268 */ 1269 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 1270 uint32_t block, uint32_t reg) 1271 { 1272 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 1273 reg, block); 1274 BUG(); 1275 return 0; 1276 } 1277 1278 /** 1279 * amdgpu_block_invalid_wreg - dummy reg write function 1280 * 1281 * @adev: amdgpu_device pointer 1282 * @block: offset of instance 1283 * @reg: offset of register 1284 * @v: value to write to the register 1285 * 1286 * Dummy register read function. Used for register blocks 1287 * that certain asics don't have (all asics). 1288 */ 1289 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 1290 uint32_t block, 1291 uint32_t reg, uint32_t v) 1292 { 1293 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 1294 reg, block, v); 1295 BUG(); 1296 } 1297 1298 /** 1299 * amdgpu_device_asic_init - Wrapper for atom asic_init 1300 * 1301 * @adev: amdgpu_device pointer 1302 * 1303 * Does any asic specific work and then calls atom asic init. 1304 */ 1305 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 1306 { 1307 int ret; 1308 1309 amdgpu_asic_pre_asic_init(adev); 1310 1311 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 1312 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 1313 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) { 1314 amdgpu_psp_wait_for_bootloader(adev); 1315 ret = amdgpu_atomfirmware_asic_init(adev, true); 1316 return ret; 1317 } else { 1318 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 1319 } 1320 1321 return 0; 1322 } 1323 1324 /** 1325 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page 1326 * 1327 * @adev: amdgpu_device pointer 1328 * 1329 * Allocates a scratch page of VRAM for use by various things in the 1330 * driver. 
1331 */ 1332 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev) 1333 { 1334 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE, 1335 AMDGPU_GEM_DOMAIN_VRAM | 1336 AMDGPU_GEM_DOMAIN_GTT, 1337 &adev->mem_scratch.robj, 1338 &adev->mem_scratch.gpu_addr, 1339 (void **)&adev->mem_scratch.ptr); 1340 } 1341 1342 /** 1343 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page 1344 * 1345 * @adev: amdgpu_device pointer 1346 * 1347 * Frees the VRAM scratch page. 1348 */ 1349 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev) 1350 { 1351 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL); 1352 } 1353 1354 /** 1355 * amdgpu_device_program_register_sequence - program an array of registers. 1356 * 1357 * @adev: amdgpu_device pointer 1358 * @registers: pointer to the register array 1359 * @array_size: size of the register array 1360 * 1361 * Programs an array or registers with and or masks. 1362 * This is a helper for setting golden registers. 1363 */ 1364 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 1365 const u32 *registers, 1366 const u32 array_size) 1367 { 1368 u32 tmp, reg, and_mask, or_mask; 1369 int i; 1370 1371 if (array_size % 3) 1372 return; 1373 1374 for (i = 0; i < array_size; i += 3) { 1375 reg = registers[i + 0]; 1376 and_mask = registers[i + 1]; 1377 or_mask = registers[i + 2]; 1378 1379 if (and_mask == 0xffffffff) { 1380 tmp = or_mask; 1381 } else { 1382 tmp = RREG32(reg); 1383 tmp &= ~and_mask; 1384 if (adev->family >= AMDGPU_FAMILY_AI) 1385 tmp |= (or_mask & and_mask); 1386 else 1387 tmp |= or_mask; 1388 } 1389 WREG32(reg, tmp); 1390 } 1391 } 1392 1393 /** 1394 * amdgpu_device_pci_config_reset - reset the GPU 1395 * 1396 * @adev: amdgpu_device pointer 1397 * 1398 * Resets the GPU using the pci config reset sequence. 1399 * Only applicable to asics prior to vega10. 1400 */ 1401 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 1402 { 1403 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 1404 } 1405 1406 /** 1407 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 1408 * 1409 * @adev: amdgpu_device pointer 1410 * 1411 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 1412 */ 1413 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 1414 { 1415 return pci_reset_function(adev->pdev); 1416 } 1417 1418 /* 1419 * amdgpu_device_wb_*() 1420 * Writeback is the method by which the GPU updates special pages in memory 1421 * with the status of certain GPU events (fences, ring pointers,etc.). 1422 */ 1423 1424 /** 1425 * amdgpu_device_wb_fini - Disable Writeback and free memory 1426 * 1427 * @adev: amdgpu_device pointer 1428 * 1429 * Disables Writeback and frees the Writeback memory (all asics). 1430 * Used at driver shutdown. 1431 */ 1432 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1433 { 1434 if (adev->wb.wb_obj) { 1435 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1436 &adev->wb.gpu_addr, 1437 (void **)&adev->wb.wb); 1438 adev->wb.wb_obj = NULL; 1439 } 1440 } 1441 1442 /** 1443 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory 1444 * 1445 * @adev: amdgpu_device pointer 1446 * 1447 * Initializes writeback and allocates writeback memory (all asics). 1448 * Used at driver startup. 1449 * Returns 0 on success or an -error on failure. 
1450 */ 1451 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1452 { 1453 int r; 1454 1455 if (adev->wb.wb_obj == NULL) { 1456 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1457 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1458 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1459 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1460 (void **)&adev->wb.wb); 1461 if (r) { 1462 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1463 return r; 1464 } 1465 1466 adev->wb.num_wb = AMDGPU_MAX_WB; 1467 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1468 1469 /* clear wb memory */ 1470 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1471 } 1472 1473 return 0; 1474 } 1475 1476 /** 1477 * amdgpu_device_wb_get - Allocate a wb entry 1478 * 1479 * @adev: amdgpu_device pointer 1480 * @wb: wb index 1481 * 1482 * Allocate a wb slot for use by the driver (all asics). 1483 * Returns 0 on success or -EINVAL on failure. 1484 */ 1485 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1486 { 1487 unsigned long flags, offset; 1488 1489 spin_lock_irqsave(&adev->wb.lock, flags); 1490 offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1491 if (offset < adev->wb.num_wb) { 1492 __set_bit(offset, adev->wb.used); 1493 spin_unlock_irqrestore(&adev->wb.lock, flags); 1494 *wb = offset << 3; /* convert to dw offset */ 1495 return 0; 1496 } else { 1497 spin_unlock_irqrestore(&adev->wb.lock, flags); 1498 return -EINVAL; 1499 } 1500 } 1501 1502 /** 1503 * amdgpu_device_wb_free - Free a wb entry 1504 * 1505 * @adev: amdgpu_device pointer 1506 * @wb: wb index 1507 * 1508 * Free a wb slot allocated for use by the driver (all asics) 1509 */ 1510 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1511 { 1512 unsigned long flags; 1513 1514 wb >>= 3; 1515 spin_lock_irqsave(&adev->wb.lock, flags); 1516 if (wb < adev->wb.num_wb) 1517 __clear_bit(wb, adev->wb.used); 1518 spin_unlock_irqrestore(&adev->wb.lock, flags); 1519 } 1520 1521 /** 1522 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1523 * 1524 * @adev: amdgpu_device pointer 1525 * 1526 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1527 * to fail, but if any of the BARs is not accessible after the size we abort 1528 * driver loading by returning -ENODEV. 
1529 */ 1530 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1531 { 1532 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1533 struct pci_bus *root; 1534 struct resource *res; 1535 unsigned int i; 1536 u16 cmd; 1537 int r; 1538 1539 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) 1540 return 0; 1541 1542 /* Bypass for VF */ 1543 if (amdgpu_sriov_vf(adev)) 1544 return 0; 1545 1546 /* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */ 1547 if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR)) 1548 DRM_WARN("System can't access extended configuration space, please check!!\n"); 1549 1550 /* skip if the bios has already enabled large BAR */ 1551 if (adev->gmc.real_vram_size && 1552 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1553 return 0; 1554 1555 /* Check if the root BUS has 64bit memory resources */ 1556 root = adev->pdev->bus; 1557 while (root->parent) 1558 root = root->parent; 1559 1560 pci_bus_for_each_resource(root, res, i) { 1561 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1562 res->start > 0x100000000ull) 1563 break; 1564 } 1565 1566 /* Trying to resize is pointless without a root hub window above 4GB */ 1567 if (!res) 1568 return 0; 1569 1570 /* Limit the BAR size to what is available */ 1571 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1572 rbar_size); 1573 1574 /* Disable memory decoding while we change the BAR addresses and size */ 1575 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1576 pci_write_config_word(adev->pdev, PCI_COMMAND, 1577 cmd & ~PCI_COMMAND_MEMORY); 1578 1579 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1580 amdgpu_doorbell_fini(adev); 1581 if (adev->asic_type >= CHIP_BONAIRE) 1582 pci_release_resource(adev->pdev, 2); 1583 1584 pci_release_resource(adev->pdev, 0); 1585 1586 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1587 if (r == -ENOSPC) 1588 DRM_INFO("Not enough PCI address space for a large BAR."); 1589 else if (r && r != -ENOTSUPP) 1590 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1591 1592 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1593 1594 /* When the doorbell or fb BAR isn't available we have no chance of 1595 * using the device. 1596 */ 1597 r = amdgpu_doorbell_init(adev); 1598 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1599 return -ENODEV; 1600 1601 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1602 1603 return 0; 1604 } 1605 1606 static bool amdgpu_device_read_bios(struct amdgpu_device *adev) 1607 { 1608 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU)) 1609 return false; 1610 1611 return true; 1612 } 1613 1614 /* 1615 * GPU helpers function. 1616 */ 1617 /** 1618 * amdgpu_device_need_post - check if the hw need post or not 1619 * 1620 * @adev: amdgpu_device pointer 1621 * 1622 * Check if the asic has been initialized (all asics) at driver startup 1623 * or post is needed if hw reset is performed. 1624 * Returns true if need or false if not. 
1625 */ 1626 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1627 { 1628 uint32_t reg; 1629 1630 if (amdgpu_sriov_vf(adev)) 1631 return false; 1632 1633 if (!amdgpu_device_read_bios(adev)) 1634 return false; 1635 1636 if (amdgpu_passthrough(adev)) { 1637 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1638 * some old smc fw still need driver do vPost otherwise gpu hang, while 1639 * those smc fw version above 22.15 doesn't have this flaw, so we force 1640 * vpost executed for smc version below 22.15 1641 */ 1642 if (adev->asic_type == CHIP_FIJI) { 1643 int err; 1644 uint32_t fw_ver; 1645 1646 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1647 /* force vPost if error occured */ 1648 if (err) 1649 return true; 1650 1651 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1652 release_firmware(adev->pm.fw); 1653 if (fw_ver < 0x00160e00) 1654 return true; 1655 } 1656 } 1657 1658 /* Don't post if we need to reset whole hive on init */ 1659 if (adev->gmc.xgmi.pending_reset) 1660 return false; 1661 1662 if (adev->has_hw_reset) { 1663 adev->has_hw_reset = false; 1664 return true; 1665 } 1666 1667 /* bios scratch used on CIK+ */ 1668 if (adev->asic_type >= CHIP_BONAIRE) 1669 return amdgpu_atombios_scratch_need_asic_init(adev); 1670 1671 /* check MEM_SIZE for older asics */ 1672 reg = amdgpu_asic_get_config_memsize(adev); 1673 1674 if ((reg != 0) && (reg != 0xffffffff)) 1675 return false; 1676 1677 return true; 1678 } 1679 1680 /* 1681 * Check whether seamless boot is supported. 1682 * 1683 * So far we only support seamless boot on DCE 3.0 or later. 1684 * If users report that it works on older ASICS as well, we may 1685 * loosen this. 1686 */ 1687 bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev) 1688 { 1689 switch (amdgpu_seamless) { 1690 case -1: 1691 break; 1692 case 1: 1693 return true; 1694 case 0: 1695 return false; 1696 default: 1697 DRM_ERROR("Invalid value for amdgpu.seamless: %d\n", 1698 amdgpu_seamless); 1699 return false; 1700 } 1701 1702 if (!(adev->flags & AMD_IS_APU)) 1703 return false; 1704 1705 if (adev->mman.keep_stolen_vga_memory) 1706 return false; 1707 1708 return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0); 1709 } 1710 1711 /* 1712 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids 1713 * don't support dynamic speed switching. Until we have confirmation from Intel 1714 * that a specific host supports it, it's safer that we keep it disabled for all. 1715 * 1716 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/ 1717 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663 1718 */ 1719 static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev) 1720 { 1721 #if IS_ENABLED(CONFIG_X86) 1722 struct cpuinfo_x86 *c = &cpu_data(0); 1723 1724 /* eGPU change speeds based on USB4 fabric conditions */ 1725 if (dev_is_removable(adev->dev)) 1726 return true; 1727 1728 if (c->x86_vendor == X86_VENDOR_INTEL) 1729 return false; 1730 #endif 1731 return true; 1732 } 1733 1734 /** 1735 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1736 * 1737 * @adev: amdgpu_device pointer 1738 * 1739 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1740 * be set for this device. 1741 * 1742 * Returns true if it should be used or false if not. 
1743 */ 1744 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1745 { 1746 switch (amdgpu_aspm) { 1747 case -1: 1748 break; 1749 case 0: 1750 return false; 1751 case 1: 1752 return true; 1753 default: 1754 return false; 1755 } 1756 if (adev->flags & AMD_IS_APU) 1757 return false; 1758 if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK)) 1759 return false; 1760 return pcie_aspm_enabled(adev->pdev); 1761 } 1762 1763 /* if we get transitioned to only one device, take VGA back */ 1764 /** 1765 * amdgpu_device_vga_set_decode - enable/disable vga decode 1766 * 1767 * @pdev: PCI device pointer 1768 * @state: enable/disable vga decode 1769 * 1770 * Enable/disable vga decode (all asics). 1771 * Returns VGA resource flags. 1772 */ 1773 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1774 bool state) 1775 { 1776 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1777 1778 amdgpu_asic_set_vga_state(adev, state); 1779 if (state) 1780 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1781 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1782 else 1783 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1784 } 1785 1786 /** 1787 * amdgpu_device_check_block_size - validate the vm block size 1788 * 1789 * @adev: amdgpu_device pointer 1790 * 1791 * Validates the vm block size specified via module parameter. 1792 * The vm block size defines number of bits in page table versus page directory, 1793 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1794 * page table and the remaining bits are in the page directory. 1795 */ 1796 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1797 { 1798 /* defines number of bits in page table versus page directory, 1799 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1800 * page table and the remaining bits are in the page directory 1801 */ 1802 if (amdgpu_vm_block_size == -1) 1803 return; 1804 1805 if (amdgpu_vm_block_size < 9) { 1806 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1807 amdgpu_vm_block_size); 1808 amdgpu_vm_block_size = -1; 1809 } 1810 } 1811 1812 /** 1813 * amdgpu_device_check_vm_size - validate the vm size 1814 * 1815 * @adev: amdgpu_device pointer 1816 * 1817 * Validates the vm size in GB specified via module parameter. 1818 * The VM size is the size of the GPU virtual memory space in GB. 
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
{
	if (!(adev->flags & AMD_IS_APU) ||
	    adev->asic_type < CHIP_RAVEN)
		return 0;

	switch (adev->asic_type) {
	case CHIP_RAVEN:
		if (adev->pdev->device == 0x15dd)
			adev->apu_flags |= AMD_APU_IS_RAVEN;
		if (adev->pdev->device == 0x15d8)
			adev->apu_flags |= AMD_APU_IS_PICASSO;
		break;
	case CHIP_RENOIR:
		if ((adev->pdev->device == 0x1636) ||
		    (adev->pdev->device == 0x164c))
			adev->apu_flags |= AMD_APU_IS_RENOIR;
		else
			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
		break;
	case CHIP_VANGOGH:
		adev->apu_flags |= AMD_APU_IS_VANGOGH;
		break;
	case CHIP_YELLOW_CARP:
		break;
	case CHIP_CYAN_SKILLFISH:
		if ((adev->pdev->device == 0x13FE) ||
		    (adev->pdev->device == 0x143F))
			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
		break;
	default:
		break;
	}

	return 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
1917 */ 1918 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1919 { 1920 int i; 1921 1922 if (amdgpu_sched_jobs < 4) { 1923 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1924 amdgpu_sched_jobs); 1925 amdgpu_sched_jobs = 4; 1926 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 1927 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1928 amdgpu_sched_jobs); 1929 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1930 } 1931 1932 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1933 /* gart size must be greater or equal to 32M */ 1934 dev_warn(adev->dev, "gart size (%d) too small\n", 1935 amdgpu_gart_size); 1936 amdgpu_gart_size = -1; 1937 } 1938 1939 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1940 /* gtt size must be greater or equal to 32M */ 1941 dev_warn(adev->dev, "gtt size (%d) too small\n", 1942 amdgpu_gtt_size); 1943 amdgpu_gtt_size = -1; 1944 } 1945 1946 /* valid range is between 4 and 9 inclusive */ 1947 if (amdgpu_vm_fragment_size != -1 && 1948 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1949 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1950 amdgpu_vm_fragment_size = -1; 1951 } 1952 1953 if (amdgpu_sched_hw_submission < 2) { 1954 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1955 amdgpu_sched_hw_submission); 1956 amdgpu_sched_hw_submission = 2; 1957 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 1958 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 1959 amdgpu_sched_hw_submission); 1960 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 1961 } 1962 1963 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 1964 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 1965 amdgpu_reset_method = -1; 1966 } 1967 1968 amdgpu_device_check_smu_prv_buffer_size(adev); 1969 1970 amdgpu_device_check_vm_size(adev); 1971 1972 amdgpu_device_check_block_size(adev); 1973 1974 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 1975 1976 for (i = 0; i < MAX_XCP; i++) 1977 adev->enforce_isolation[i] = !!enforce_isolation; 1978 1979 return 0; 1980 } 1981 1982 /** 1983 * amdgpu_switcheroo_set_state - set switcheroo state 1984 * 1985 * @pdev: pci dev pointer 1986 * @state: vga_switcheroo state 1987 * 1988 * Callback for the switcheroo driver. Suspends or resumes 1989 * the asics before or after it is powered up using ACPI methods. 
1990 */ 1991 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 1992 enum vga_switcheroo_state state) 1993 { 1994 struct drm_device *dev = pci_get_drvdata(pdev); 1995 int r; 1996 1997 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 1998 return; 1999 2000 if (state == VGA_SWITCHEROO_ON) { 2001 pr_info("switched on\n"); 2002 /* don't suspend or resume card normally */ 2003 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2004 2005 pci_set_power_state(pdev, PCI_D0); 2006 amdgpu_device_load_pci_state(pdev); 2007 r = pci_enable_device(pdev); 2008 if (r) 2009 DRM_WARN("pci_enable_device failed (%d)\n", r); 2010 amdgpu_device_resume(dev, true); 2011 2012 dev->switch_power_state = DRM_SWITCH_POWER_ON; 2013 } else { 2014 pr_info("switched off\n"); 2015 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2016 amdgpu_device_prepare(dev); 2017 amdgpu_device_suspend(dev, true); 2018 amdgpu_device_cache_pci_state(pdev); 2019 /* Shut down the device */ 2020 pci_disable_device(pdev); 2021 pci_set_power_state(pdev, PCI_D3cold); 2022 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 2023 } 2024 } 2025 2026 /** 2027 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 2028 * 2029 * @pdev: pci dev pointer 2030 * 2031 * Callback for the switcheroo driver. Check of the switcheroo 2032 * state can be changed. 2033 * Returns true if the state can be changed, false if not. 2034 */ 2035 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 2036 { 2037 struct drm_device *dev = pci_get_drvdata(pdev); 2038 2039 /* 2040 * FIXME: open_count is protected by drm_global_mutex but that would lead to 2041 * locking inversion with the driver load path. And the access here is 2042 * completely racy anyway. So don't bother with locking for now. 2043 */ 2044 return atomic_read(&dev->open_count) == 0; 2045 } 2046 2047 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 2048 .set_gpu_state = amdgpu_switcheroo_set_state, 2049 .reprobe = NULL, 2050 .can_switch = amdgpu_switcheroo_can_switch, 2051 }; 2052 2053 /** 2054 * amdgpu_device_ip_set_clockgating_state - set the CG state 2055 * 2056 * @dev: amdgpu_device pointer 2057 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2058 * @state: clockgating state (gate or ungate) 2059 * 2060 * Sets the requested clockgating state for all instances of 2061 * the hardware IP specified. 2062 * Returns the error code from the last instance. 2063 */ 2064 int amdgpu_device_ip_set_clockgating_state(void *dev, 2065 enum amd_ip_block_type block_type, 2066 enum amd_clockgating_state state) 2067 { 2068 struct amdgpu_device *adev = dev; 2069 int i, r = 0; 2070 2071 for (i = 0; i < adev->num_ip_blocks; i++) { 2072 if (!adev->ip_blocks[i].status.valid) 2073 continue; 2074 if (adev->ip_blocks[i].version->type != block_type) 2075 continue; 2076 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 2077 continue; 2078 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 2079 (void *)adev, state); 2080 if (r) 2081 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 2082 adev->ip_blocks[i].version->funcs->name, r); 2083 } 2084 return r; 2085 } 2086 2087 /** 2088 * amdgpu_device_ip_set_powergating_state - set the PG state 2089 * 2090 * @dev: amdgpu_device pointer 2091 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2092 * @state: powergating state (gate or ungate) 2093 * 2094 * Sets the requested powergating state for all instances of 2095 * the hardware IP specified. 
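 *
 * Usage sketch (illustrative only), e.g. gating an IP's power:
 *
 *   amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCN,
 *                                          AMD_PG_STATE_GATE);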
2096 * Returns the error code from the last instance. 2097 */ 2098 int amdgpu_device_ip_set_powergating_state(void *dev, 2099 enum amd_ip_block_type block_type, 2100 enum amd_powergating_state state) 2101 { 2102 struct amdgpu_device *adev = dev; 2103 int i, r = 0; 2104 2105 for (i = 0; i < adev->num_ip_blocks; i++) { 2106 if (!adev->ip_blocks[i].status.valid) 2107 continue; 2108 if (adev->ip_blocks[i].version->type != block_type) 2109 continue; 2110 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 2111 continue; 2112 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 2113 (void *)adev, state); 2114 if (r) 2115 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 2116 adev->ip_blocks[i].version->funcs->name, r); 2117 } 2118 return r; 2119 } 2120 2121 /** 2122 * amdgpu_device_ip_get_clockgating_state - get the CG state 2123 * 2124 * @adev: amdgpu_device pointer 2125 * @flags: clockgating feature flags 2126 * 2127 * Walks the list of IPs on the device and updates the clockgating 2128 * flags for each IP. 2129 * Updates @flags with the feature flags for each hardware IP where 2130 * clockgating is enabled. 2131 */ 2132 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 2133 u64 *flags) 2134 { 2135 int i; 2136 2137 for (i = 0; i < adev->num_ip_blocks; i++) { 2138 if (!adev->ip_blocks[i].status.valid) 2139 continue; 2140 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 2141 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 2142 } 2143 } 2144 2145 /** 2146 * amdgpu_device_ip_wait_for_idle - wait for idle 2147 * 2148 * @adev: amdgpu_device pointer 2149 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2150 * 2151 * Waits for the requested hardware IP to be idle. 2152 * Returns 0 for success or a negative error code on failure. 2153 */ 2154 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 2155 enum amd_ip_block_type block_type) 2156 { 2157 int i, r; 2158 2159 for (i = 0; i < adev->num_ip_blocks; i++) { 2160 if (!adev->ip_blocks[i].status.valid) 2161 continue; 2162 if (adev->ip_blocks[i].version->type == block_type) { 2163 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev); 2164 if (r) 2165 return r; 2166 break; 2167 } 2168 } 2169 return 0; 2170 2171 } 2172 2173 /** 2174 * amdgpu_device_ip_is_idle - is the hardware IP idle 2175 * 2176 * @adev: amdgpu_device pointer 2177 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2178 * 2179 * Check if the hardware IP is idle or not. 2180 * Returns true if the IP is idle, false if not. 2181 */ 2182 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev, 2183 enum amd_ip_block_type block_type) 2184 { 2185 int i; 2186 2187 for (i = 0; i < adev->num_ip_blocks; i++) { 2188 if (!adev->ip_blocks[i].status.valid) 2189 continue; 2190 if (adev->ip_blocks[i].version->type == block_type) 2191 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev); 2192 } 2193 return true; 2194 2195 } 2196 2197 /** 2198 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 2199 * 2200 * @adev: amdgpu_device pointer 2201 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 2202 * 2203 * Returns a pointer to the hardware IP block structure 2204 * if it exists for the asic, otherwise NULL.
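 *
 * Usage sketch (illustrative), mirroring the GFX lookup done later in this
 * file:
 *
 *   struct amdgpu_ip_block *ip_block =
 *           amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *   if (ip_block && ip_block->status.valid)
 *           amdgpu_amdkfd_device_probe(adev);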
2205 */ 2206 struct amdgpu_ip_block * 2207 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 2208 enum amd_ip_block_type type) 2209 { 2210 int i; 2211 2212 for (i = 0; i < adev->num_ip_blocks; i++) 2213 if (adev->ip_blocks[i].version->type == type) 2214 return &adev->ip_blocks[i]; 2215 2216 return NULL; 2217 } 2218 2219 /** 2220 * amdgpu_device_ip_block_version_cmp 2221 * 2222 * @adev: amdgpu_device pointer 2223 * @type: enum amd_ip_block_type 2224 * @major: major version 2225 * @minor: minor version 2226 * 2227 * return 0 if equal or greater 2228 * return 1 if smaller or the ip_block doesn't exist 2229 */ 2230 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 2231 enum amd_ip_block_type type, 2232 u32 major, u32 minor) 2233 { 2234 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 2235 2236 if (ip_block && ((ip_block->version->major > major) || 2237 ((ip_block->version->major == major) && 2238 (ip_block->version->minor >= minor)))) 2239 return 0; 2240 2241 return 1; 2242 } 2243 2244 /** 2245 * amdgpu_device_ip_block_add 2246 * 2247 * @adev: amdgpu_device pointer 2248 * @ip_block_version: pointer to the IP to add 2249 * 2250 * Adds the IP block driver information to the collection of IPs 2251 * on the asic. 2252 */ 2253 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 2254 const struct amdgpu_ip_block_version *ip_block_version) 2255 { 2256 if (!ip_block_version) 2257 return -EINVAL; 2258 2259 switch (ip_block_version->type) { 2260 case AMD_IP_BLOCK_TYPE_VCN: 2261 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2262 return 0; 2263 break; 2264 case AMD_IP_BLOCK_TYPE_JPEG: 2265 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2266 return 0; 2267 break; 2268 default: 2269 break; 2270 } 2271 2272 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, 2273 ip_block_version->funcs->name); 2274 2275 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2276 2277 return 0; 2278 } 2279 2280 /** 2281 * amdgpu_device_enable_virtual_display - enable virtual display feature 2282 * 2283 * @adev: amdgpu_device pointer 2284 * 2285 * Enables the virtual display feature if the user has enabled it via 2286 * the module parameter virtual_display. This feature provides a virtual 2287 * display hardware on headless boards or in virtualized environments. 2288 * This function parses and validates the configuration string specified by 2289 * the user and configures the virtual display configuration (number of 2290 * virtual connectors, crtcs, etc.) specified.
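 *
 * The string parsed below is a semicolon-separated list of
 * "<pci address>,<number of crtcs>" entries, with "all" matching any device.
 * Example (hypothetical address): amdgpu.virtual_display=0000:01:00.0,2
 * enables two virtual CRTCs on that device.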
2291 */ 2292 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2293 { 2294 adev->enable_virtual_display = false; 2295 2296 if (amdgpu_virtual_display) { 2297 const char *pci_address_name = pci_name(adev->pdev); 2298 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2299 2300 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2301 pciaddstr_tmp = pciaddstr; 2302 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2303 pciaddname = strsep(&pciaddname_tmp, ","); 2304 if (!strcmp("all", pciaddname) 2305 || !strcmp(pci_address_name, pciaddname)) { 2306 long num_crtc; 2307 int res = -1; 2308 2309 adev->enable_virtual_display = true; 2310 2311 if (pciaddname_tmp) 2312 res = kstrtol(pciaddname_tmp, 10, 2313 &num_crtc); 2314 2315 if (!res) { 2316 if (num_crtc < 1) 2317 num_crtc = 1; 2318 if (num_crtc > 6) 2319 num_crtc = 6; 2320 adev->mode_info.num_crtc = num_crtc; 2321 } else { 2322 adev->mode_info.num_crtc = 1; 2323 } 2324 break; 2325 } 2326 } 2327 2328 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2329 amdgpu_virtual_display, pci_address_name, 2330 adev->enable_virtual_display, adev->mode_info.num_crtc); 2331 2332 kfree(pciaddstr); 2333 } 2334 } 2335 2336 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2337 { 2338 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2339 adev->mode_info.num_crtc = 1; 2340 adev->enable_virtual_display = true; 2341 DRM_INFO("virtual_display:%d, num_crtc:%d\n", 2342 adev->enable_virtual_display, adev->mode_info.num_crtc); 2343 } 2344 } 2345 2346 /** 2347 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2348 * 2349 * @adev: amdgpu_device pointer 2350 * 2351 * Parses the asic configuration parameters specified in the gpu info 2352 * firmware and makes them available to the driver for use in configuring 2353 * the asic. 2354 * Returns 0 on success, -EINVAL on failure. 2355 */ 2356 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2357 { 2358 const char *chip_name; 2359 int err; 2360 const struct gpu_info_firmware_header_v1_0 *hdr; 2361 2362 adev->firmware.gpu_info_fw = NULL; 2363 2364 if (adev->mman.discovery_bin) 2365 return 0; 2366 2367 switch (adev->asic_type) { 2368 default: 2369 return 0; 2370 case CHIP_VEGA10: 2371 chip_name = "vega10"; 2372 break; 2373 case CHIP_VEGA12: 2374 chip_name = "vega12"; 2375 break; 2376 case CHIP_RAVEN: 2377 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2378 chip_name = "raven2"; 2379 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2380 chip_name = "picasso"; 2381 else 2382 chip_name = "raven"; 2383 break; 2384 case CHIP_ARCTURUS: 2385 chip_name = "arcturus"; 2386 break; 2387 case CHIP_NAVI12: 2388 chip_name = "navi12"; 2389 break; 2390 } 2391 2392 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, 2393 "amdgpu/%s_gpu_info.bin", chip_name); 2394 if (err) { 2395 dev_err(adev->dev, 2396 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n", 2397 chip_name); 2398 goto out; 2399 } 2400 2401 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2402 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2403 2404 switch (hdr->version_major) { 2405 case 1: 2406 { 2407 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2408 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2409 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2410 2411 /* 2412 * Should be dropped when DAL no longer needs it.
2413 */ 2414 if (adev->asic_type == CHIP_NAVI12) 2415 goto parse_soc_bounding_box; 2416 2417 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2418 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2419 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2420 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2421 adev->gfx.config.max_texture_channel_caches = 2422 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2423 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2424 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2425 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2426 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2427 adev->gfx.config.double_offchip_lds_buf = 2428 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2429 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2430 adev->gfx.cu_info.max_waves_per_simd = 2431 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2432 adev->gfx.cu_info.max_scratch_slots_per_cu = 2433 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2434 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2435 if (hdr->version_minor >= 1) { 2436 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2437 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2438 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2439 adev->gfx.config.num_sc_per_sh = 2440 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2441 adev->gfx.config.num_packer_per_sc = 2442 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2443 } 2444 2445 parse_soc_bounding_box: 2446 /* 2447 * soc bounding box info is not integrated in discovery table, 2448 * we always need to parse it from gpu info firmware if needed. 2449 */ 2450 if (hdr->version_minor == 2) { 2451 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2452 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2453 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2454 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2455 } 2456 break; 2457 } 2458 default: 2459 dev_err(adev->dev, 2460 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2461 err = -EINVAL; 2462 goto out; 2463 } 2464 out: 2465 return err; 2466 } 2467 2468 /** 2469 * amdgpu_device_ip_early_init - run early init for hardware IPs 2470 * 2471 * @adev: amdgpu_device pointer 2472 * 2473 * Early initialization pass for hardware IPs. The hardware IPs that make 2474 * up each asic are discovered and each IP's early_init callback is run. This 2475 * is the first stage in initializing the asic. 2476 * Returns 0 on success, negative error code on failure.
2477 */ 2478 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2479 { 2480 struct amdgpu_ip_block *ip_block; 2481 struct pci_dev *parent; 2482 int i, r; 2483 bool total; 2484 2485 amdgpu_device_enable_virtual_display(adev); 2486 2487 if (amdgpu_sriov_vf(adev)) { 2488 r = amdgpu_virt_request_full_gpu(adev, true); 2489 if (r) 2490 return r; 2491 } 2492 2493 switch (adev->asic_type) { 2494 #ifdef CONFIG_DRM_AMDGPU_SI 2495 case CHIP_VERDE: 2496 case CHIP_TAHITI: 2497 case CHIP_PITCAIRN: 2498 case CHIP_OLAND: 2499 case CHIP_HAINAN: 2500 adev->family = AMDGPU_FAMILY_SI; 2501 r = si_set_ip_blocks(adev); 2502 if (r) 2503 return r; 2504 break; 2505 #endif 2506 #ifdef CONFIG_DRM_AMDGPU_CIK 2507 case CHIP_BONAIRE: 2508 case CHIP_HAWAII: 2509 case CHIP_KAVERI: 2510 case CHIP_KABINI: 2511 case CHIP_MULLINS: 2512 if (adev->flags & AMD_IS_APU) 2513 adev->family = AMDGPU_FAMILY_KV; 2514 else 2515 adev->family = AMDGPU_FAMILY_CI; 2516 2517 r = cik_set_ip_blocks(adev); 2518 if (r) 2519 return r; 2520 break; 2521 #endif 2522 case CHIP_TOPAZ: 2523 case CHIP_TONGA: 2524 case CHIP_FIJI: 2525 case CHIP_POLARIS10: 2526 case CHIP_POLARIS11: 2527 case CHIP_POLARIS12: 2528 case CHIP_VEGAM: 2529 case CHIP_CARRIZO: 2530 case CHIP_STONEY: 2531 if (adev->flags & AMD_IS_APU) 2532 adev->family = AMDGPU_FAMILY_CZ; 2533 else 2534 adev->family = AMDGPU_FAMILY_VI; 2535 2536 r = vi_set_ip_blocks(adev); 2537 if (r) 2538 return r; 2539 break; 2540 default: 2541 r = amdgpu_discovery_set_ip_blocks(adev); 2542 if (r) 2543 return r; 2544 break; 2545 } 2546 2547 if (amdgpu_has_atpx() && 2548 (amdgpu_is_atpx_hybrid() || 2549 amdgpu_has_atpx_dgpu_power_cntl()) && 2550 ((adev->flags & AMD_IS_APU) == 0) && 2551 !dev_is_removable(&adev->pdev->dev)) 2552 adev->flags |= AMD_IS_PX; 2553 2554 if (!(adev->flags & AMD_IS_APU)) { 2555 parent = pcie_find_root_port(adev->pdev); 2556 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2557 } 2558 2559 2560 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2561 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2562 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2563 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2564 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2565 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2566 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2567 2568 total = true; 2569 for (i = 0; i < adev->num_ip_blocks; i++) { 2570 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2571 DRM_WARN("disabled ip block: %d <%s>\n", 2572 i, adev->ip_blocks[i].version->funcs->name); 2573 adev->ip_blocks[i].status.valid = false; 2574 } else { 2575 if (adev->ip_blocks[i].version->funcs->early_init) { 2576 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2577 if (r == -ENOENT) { 2578 adev->ip_blocks[i].status.valid = false; 2579 } else if (r) { 2580 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2581 adev->ip_blocks[i].version->funcs->name, r); 2582 total = false; 2583 } else { 2584 adev->ip_blocks[i].status.valid = true; 2585 } 2586 } else { 2587 adev->ip_blocks[i].status.valid = true; 2588 } 2589 } 2590 /* get the vbios after the asic_funcs are set up */ 2591 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2592 r = amdgpu_device_parse_gpu_info_fw(adev); 2593 if (r) 2594 return r; 2595 2596 /* Read BIOS */ 2597 if (amdgpu_device_read_bios(adev)) { 2598 if (!amdgpu_get_bios(adev)) 2599 return -EINVAL; 2600 2601 r = amdgpu_atombios_init(adev); 2602 if (r) { 2603 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2604 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2605 return r; 2606 } 2607 } 2608 2609 /*get pf2vf msg info at it's earliest time*/ 2610 if (amdgpu_sriov_vf(adev)) 2611 amdgpu_virt_init_data_exchange(adev); 2612 2613 } 2614 } 2615 if (!total) 2616 return -ENODEV; 2617 2618 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 2619 if (ip_block->status.valid != false) 2620 amdgpu_amdkfd_device_probe(adev); 2621 2622 adev->cg_flags &= amdgpu_cg_mask; 2623 adev->pg_flags &= amdgpu_pg_mask; 2624 2625 return 0; 2626 } 2627 2628 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2629 { 2630 int i, r; 2631 2632 for (i = 0; i < adev->num_ip_blocks; i++) { 2633 if (!adev->ip_blocks[i].status.sw) 2634 continue; 2635 if (adev->ip_blocks[i].status.hw) 2636 continue; 2637 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2638 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2639 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2640 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2641 if (r) { 2642 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2643 adev->ip_blocks[i].version->funcs->name, r); 2644 return r; 2645 } 2646 adev->ip_blocks[i].status.hw = true; 2647 } 2648 } 2649 2650 return 0; 2651 } 2652 2653 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2654 { 2655 int i, r; 2656 2657 for (i = 0; i < adev->num_ip_blocks; i++) { 2658 if (!adev->ip_blocks[i].status.sw) 2659 continue; 2660 if (adev->ip_blocks[i].status.hw) 2661 continue; 2662 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2663 if (r) { 2664 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2665 adev->ip_blocks[i].version->funcs->name, r); 2666 return r; 2667 } 2668 adev->ip_blocks[i].status.hw = true; 2669 } 2670 2671 return 0; 2672 } 2673 
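/*
 * Ordering sketch (illustrative summary of amdgpu_device_ip_init() below):
 *
 *   amdgpu_device_ip_hw_init_phase1(adev);  - COMMON, IH, and PSP for SR-IOV
 *   amdgpu_device_fw_loading(adev);         - PSP/SMU firmware
 *   amdgpu_device_ip_hw_init_phase2(adev);  - remaining IP blocks
 *
 * so blocks that depend on loaded firmware are only brought up afterwards.
 */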
2674 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2675 { 2676 int r = 0; 2677 int i; 2678 uint32_t smu_version; 2679 2680 if (adev->asic_type >= CHIP_VEGA10) { 2681 for (i = 0; i < adev->num_ip_blocks; i++) { 2682 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2683 continue; 2684 2685 if (!adev->ip_blocks[i].status.sw) 2686 continue; 2687 2688 /* no need to do the fw loading again if already done*/ 2689 if (adev->ip_blocks[i].status.hw == true) 2690 break; 2691 2692 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2693 r = adev->ip_blocks[i].version->funcs->resume(adev); 2694 if (r) { 2695 DRM_ERROR("resume of IP block <%s> failed %d\n", 2696 adev->ip_blocks[i].version->funcs->name, r); 2697 return r; 2698 } 2699 } else { 2700 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2701 if (r) { 2702 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2703 adev->ip_blocks[i].version->funcs->name, r); 2704 return r; 2705 } 2706 } 2707 2708 adev->ip_blocks[i].status.hw = true; 2709 break; 2710 } 2711 } 2712 2713 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2714 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2715 2716 return r; 2717 } 2718 2719 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2720 { 2721 long timeout; 2722 int r, i; 2723 2724 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2725 struct amdgpu_ring *ring = adev->rings[i]; 2726 2727 /* No need to setup the GPU scheduler for rings that don't need it */ 2728 if (!ring || ring->no_scheduler) 2729 continue; 2730 2731 switch (ring->funcs->type) { 2732 case AMDGPU_RING_TYPE_GFX: 2733 timeout = adev->gfx_timeout; 2734 break; 2735 case AMDGPU_RING_TYPE_COMPUTE: 2736 timeout = adev->compute_timeout; 2737 break; 2738 case AMDGPU_RING_TYPE_SDMA: 2739 timeout = adev->sdma_timeout; 2740 break; 2741 default: 2742 timeout = adev->video_timeout; 2743 break; 2744 } 2745 2746 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL, 2747 DRM_SCHED_PRIORITY_COUNT, 2748 ring->num_hw_submission, 0, 2749 timeout, adev->reset_domain->wq, 2750 ring->sched_score, ring->name, 2751 adev->dev); 2752 if (r) { 2753 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2754 ring->name); 2755 return r; 2756 } 2757 r = amdgpu_uvd_entity_init(adev, ring); 2758 if (r) { 2759 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n", 2760 ring->name); 2761 return r; 2762 } 2763 r = amdgpu_vce_entity_init(adev, ring); 2764 if (r) { 2765 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n", 2766 ring->name); 2767 return r; 2768 } 2769 } 2770 2771 amdgpu_xcp_update_partition_sched_list(adev); 2772 2773 return 0; 2774 } 2775 2776 2777 /** 2778 * amdgpu_device_ip_init - run init for hardware IPs 2779 * 2780 * @adev: amdgpu_device pointer 2781 * 2782 * Main initialization pass for hardware IPs. The list of all the hardware 2783 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2784 * are run. sw_init initializes the software state associated with each IP 2785 * and hw_init initializes the hardware associated with each IP. 2786 * Returns 0 on success, negative error code on failure. 
2787 */ 2788 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2789 { 2790 int i, r; 2791 2792 r = amdgpu_ras_init(adev); 2793 if (r) 2794 return r; 2795 2796 for (i = 0; i < adev->num_ip_blocks; i++) { 2797 if (!adev->ip_blocks[i].status.valid) 2798 continue; 2799 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2800 if (r) { 2801 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2802 adev->ip_blocks[i].version->funcs->name, r); 2803 goto init_failed; 2804 } 2805 adev->ip_blocks[i].status.sw = true; 2806 2807 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2808 /* need to do common hw init early so everything is set up for gmc */ 2809 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2810 if (r) { 2811 DRM_ERROR("hw_init %d failed %d\n", i, r); 2812 goto init_failed; 2813 } 2814 adev->ip_blocks[i].status.hw = true; 2815 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2816 /* need to do gmc hw init early so we can allocate gpu mem */ 2817 /* Try to reserve bad pages early */ 2818 if (amdgpu_sriov_vf(adev)) 2819 amdgpu_virt_exchange_data(adev); 2820 2821 r = amdgpu_device_mem_scratch_init(adev); 2822 if (r) { 2823 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2824 goto init_failed; 2825 } 2826 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2827 if (r) { 2828 DRM_ERROR("hw_init %d failed %d\n", i, r); 2829 goto init_failed; 2830 } 2831 r = amdgpu_device_wb_init(adev); 2832 if (r) { 2833 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2834 goto init_failed; 2835 } 2836 adev->ip_blocks[i].status.hw = true; 2837 2838 /* right after GMC hw init, we create CSA */ 2839 if (adev->gfx.mcbp) { 2840 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2841 AMDGPU_GEM_DOMAIN_VRAM | 2842 AMDGPU_GEM_DOMAIN_GTT, 2843 AMDGPU_CSA_SIZE); 2844 if (r) { 2845 DRM_ERROR("allocate CSA failed %d\n", r); 2846 goto init_failed; 2847 } 2848 } 2849 2850 r = amdgpu_seq64_init(adev); 2851 if (r) { 2852 DRM_ERROR("allocate seq64 failed %d\n", r); 2853 goto init_failed; 2854 } 2855 } 2856 } 2857 2858 if (amdgpu_sriov_vf(adev)) 2859 amdgpu_virt_init_data_exchange(adev); 2860 2861 r = amdgpu_ib_pool_init(adev); 2862 if (r) { 2863 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2864 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2865 goto init_failed; 2866 } 2867 2868 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2869 if (r) 2870 goto init_failed; 2871 2872 r = amdgpu_device_ip_hw_init_phase1(adev); 2873 if (r) 2874 goto init_failed; 2875 2876 r = amdgpu_device_fw_loading(adev); 2877 if (r) 2878 goto init_failed; 2879 2880 r = amdgpu_device_ip_hw_init_phase2(adev); 2881 if (r) 2882 goto init_failed; 2883 2884 /* 2885 * retired pages will be loaded from eeprom and reserved here, 2886 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2887 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2888 * for I2C communication which only true at this point. 2889 * 2890 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2891 * failure from bad gpu situation and stop amdgpu init process 2892 * accordingly. For other failed cases, it will still release all 2893 * the resource and print error message, rather than returning one 2894 * negative value to upper level. 
2895 * 2896 * Note: theoretically, this should be called before all vram allocations 2897 * to protect retired pages from being used by new allocations 2898 */ 2899 r = amdgpu_ras_recovery_init(adev); 2900 if (r) 2901 goto init_failed; 2902 2903 /** 2904 * In case of XGMI grab extra reference for reset domain for this device 2905 */ 2906 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2907 if (amdgpu_xgmi_add_device(adev) == 0) { 2908 if (!amdgpu_sriov_vf(adev)) { 2909 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2910 2911 if (WARN_ON(!hive)) { 2912 r = -ENOENT; 2913 goto init_failed; 2914 } 2915 2916 if (!hive->reset_domain || 2917 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 2918 r = -ENOENT; 2919 amdgpu_put_xgmi_hive(hive); 2920 goto init_failed; 2921 } 2922 2923 /* Drop the early temporary reset domain we created for device */ 2924 amdgpu_reset_put_reset_domain(adev->reset_domain); 2925 adev->reset_domain = hive->reset_domain; 2926 amdgpu_put_xgmi_hive(hive); 2927 } 2928 } 2929 } 2930 2931 r = amdgpu_device_init_schedulers(adev); 2932 if (r) 2933 goto init_failed; 2934 2935 if (adev->mman.buffer_funcs_ring->sched.ready) 2936 amdgpu_ttm_set_buffer_funcs_status(adev, true); 2937 2938 /* Don't init kfd if whole hive need to be reset during init */ 2939 if (!adev->gmc.xgmi.pending_reset) { 2940 kgd2kfd_init_zone_device(adev); 2941 amdgpu_amdkfd_device_init(adev); 2942 } 2943 2944 amdgpu_fru_get_product_info(adev); 2945 2946 init_failed: 2947 2948 return r; 2949 } 2950 2951 /** 2952 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 2953 * 2954 * @adev: amdgpu_device pointer 2955 * 2956 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 2957 * this function before a GPU reset. If the value is retained after a 2958 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 2959 */ 2960 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 2961 { 2962 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 2963 } 2964 2965 /** 2966 * amdgpu_device_check_vram_lost - check if vram is valid 2967 * 2968 * @adev: amdgpu_device pointer 2969 * 2970 * Checks the reset magic value written to the gart pointer in VRAM. 2971 * The driver calls this after a GPU reset to see if the contents of 2972 * VRAM is lost or not. 2973 * Returns true if vram is lost, false if not. 2974 */ 2975 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 2976 { 2977 if (memcmp(adev->gart.ptr, adev->reset_magic, 2978 AMDGPU_RESET_MAGIC_NUM)) 2979 return true; 2980 2981 if (!amdgpu_in_reset(adev)) 2982 return false; 2983 2984 /* 2985 * For all ASICs with baco/mode1 reset, the VRAM is 2986 * always assumed to be lost. 2987 */ 2988 switch (amdgpu_asic_reset_method(adev)) { 2989 case AMD_RESET_METHOD_BACO: 2990 case AMD_RESET_METHOD_MODE1: 2991 return true; 2992 default: 2993 return false; 2994 } 2995 } 2996 2997 /** 2998 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 2999 * 3000 * @adev: amdgpu_device pointer 3001 * @state: clockgating state (gate or ungate) 3002 * 3003 * The list of all the hardware IPs that make up the asic is walked and the 3004 * set_clockgating_state callbacks are run. 3005 * The late initialization pass enables clockgating for hardware IPs; 3006 * the fini or suspend pass disables clockgating for hardware IPs. 3007 * Returns 0 on success, negative error code on failure.
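 *
 * Typical usage in this file (see amdgpu_device_ip_late_init()):
 *
 *   amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);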
3008 */ 3009 3010 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 3011 enum amd_clockgating_state state) 3012 { 3013 int i, j, r; 3014 3015 if (amdgpu_emu_mode == 1) 3016 return 0; 3017 3018 for (j = 0; j < adev->num_ip_blocks; j++) { 3019 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3020 if (!adev->ip_blocks[i].status.late_initialized) 3021 continue; 3022 /* skip CG for GFX, SDMA on S0ix */ 3023 if (adev->in_s0ix && 3024 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3025 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3026 continue; 3027 /* skip CG for VCE/UVD, it's handled specially */ 3028 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3029 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3030 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3031 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3032 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 3033 /* enable clockgating to save power */ 3034 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 3035 state); 3036 if (r) { 3037 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 3038 adev->ip_blocks[i].version->funcs->name, r); 3039 return r; 3040 } 3041 } 3042 } 3043 3044 return 0; 3045 } 3046 3047 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 3048 enum amd_powergating_state state) 3049 { 3050 int i, j, r; 3051 3052 if (amdgpu_emu_mode == 1) 3053 return 0; 3054 3055 for (j = 0; j < adev->num_ip_blocks; j++) { 3056 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3057 if (!adev->ip_blocks[i].status.late_initialized) 3058 continue; 3059 /* skip PG for GFX, SDMA on S0ix */ 3060 if (adev->in_s0ix && 3061 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3062 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3063 continue; 3064 /* skip CG for VCE/UVD, it's handled specially */ 3065 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3066 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3067 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3068 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3069 adev->ip_blocks[i].version->funcs->set_powergating_state) { 3070 /* enable powergating to save power */ 3071 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 3072 state); 3073 if (r) { 3074 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 3075 adev->ip_blocks[i].version->funcs->name, r); 3076 return r; 3077 } 3078 } 3079 } 3080 return 0; 3081 } 3082 3083 static int amdgpu_device_enable_mgpu_fan_boost(void) 3084 { 3085 struct amdgpu_gpu_instance *gpu_ins; 3086 struct amdgpu_device *adev; 3087 int i, ret = 0; 3088 3089 mutex_lock(&mgpu_info.mutex); 3090 3091 /* 3092 * MGPU fan boost feature should be enabled 3093 * only when there are two or more dGPUs in 3094 * the system 3095 */ 3096 if (mgpu_info.num_dgpu < 2) 3097 goto out; 3098 3099 for (i = 0; i < mgpu_info.num_dgpu; i++) { 3100 gpu_ins = &(mgpu_info.gpu_ins[i]); 3101 adev = gpu_ins->adev; 3102 if (!(adev->flags & AMD_IS_APU) && 3103 !gpu_ins->mgpu_fan_enabled) { 3104 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 3105 if (ret) 3106 break; 3107 3108 gpu_ins->mgpu_fan_enabled = 1; 3109 } 3110 } 3111 3112 out: 3113 mutex_unlock(&mgpu_info.mutex); 3114 3115 return ret; 3116 } 3117 3118 /** 3119 * amdgpu_device_ip_late_init - run late init for hardware IPs 3120 * 3121 * @adev: 
amdgpu_device pointer 3122 * 3123 * Late initialization pass for hardware IPs. The list of all the hardware 3124 * IPs that make up the asic is walked and the late_init callbacks are run. 3125 * late_init covers any special initialization that an IP requires 3126 * after all of the IPs have been initialized or something that needs to happen 3127 * late in the init process. 3128 * Returns 0 on success, negative error code on failure. 3129 */ 3130 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 3131 { 3132 struct amdgpu_gpu_instance *gpu_instance; 3133 int i = 0, r; 3134 3135 for (i = 0; i < adev->num_ip_blocks; i++) { 3136 if (!adev->ip_blocks[i].status.hw) 3137 continue; 3138 if (adev->ip_blocks[i].version->funcs->late_init) { 3139 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 3140 if (r) { 3141 DRM_ERROR("late_init of IP block <%s> failed %d\n", 3142 adev->ip_blocks[i].version->funcs->name, r); 3143 return r; 3144 } 3145 } 3146 adev->ip_blocks[i].status.late_initialized = true; 3147 } 3148 3149 r = amdgpu_ras_late_init(adev); 3150 if (r) { 3151 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 3152 return r; 3153 } 3154 3155 if (!amdgpu_in_reset(adev)) 3156 amdgpu_ras_set_error_query_ready(adev, true); 3157 3158 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3159 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3160 3161 amdgpu_device_fill_reset_magic(adev); 3162 3163 r = amdgpu_device_enable_mgpu_fan_boost(); 3164 if (r) 3165 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 3166 3167 /* For passthrough configuration on arcturus and aldebaran, enable special handling for SBR */ 3168 if (amdgpu_passthrough(adev) && 3169 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 3170 adev->asic_type == CHIP_ALDEBARAN)) 3171 amdgpu_dpm_handle_passthrough_sbr(adev, true); 3172 3173 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3174 mutex_lock(&mgpu_info.mutex); 3175 3176 /* 3177 * Reset device p-state to low as this was booted with high. 3178 * 3179 * This should be performed only after all devices from the same 3180 * hive get initialized. 3181 * 3182 * However, the number of devices in the hive is not known in advance, 3183 * as it is counted one by one as the devices are initialized. 3184 * 3185 * So, we wait for all XGMI interlinked devices to be initialized. 3186 * This may bring some delays as those devices may come from 3187 * different hives. But that should be OK.
3188 */ 3189 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3190 for (i = 0; i < mgpu_info.num_gpu; i++) { 3191 gpu_instance = &(mgpu_info.gpu_ins[i]); 3192 if (gpu_instance->adev->flags & AMD_IS_APU) 3193 continue; 3194 3195 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3196 AMDGPU_XGMI_PSTATE_MIN); 3197 if (r) { 3198 DRM_ERROR("pstate setting failed (%d).\n", r); 3199 break; 3200 } 3201 } 3202 } 3203 3204 mutex_unlock(&mgpu_info.mutex); 3205 } 3206 3207 return 0; 3208 } 3209 3210 /** 3211 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3212 * 3213 * @adev: amdgpu_device pointer 3214 * 3215 * For ASICs that need to disable the SMC first 3216 */ 3217 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3218 { 3219 int i, r; 3220 3221 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3222 return; 3223 3224 for (i = 0; i < adev->num_ip_blocks; i++) { 3225 if (!adev->ip_blocks[i].status.hw) 3226 continue; 3227 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3228 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 3229 /* XXX handle errors */ 3230 if (r) { 3231 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3232 adev->ip_blocks[i].version->funcs->name, r); 3233 } 3234 adev->ip_blocks[i].status.hw = false; 3235 break; 3236 } 3237 } 3238 } 3239 3240 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3241 { 3242 int i, r; 3243 3244 for (i = 0; i < adev->num_ip_blocks; i++) { 3245 if (!adev->ip_blocks[i].version->funcs->early_fini) 3246 continue; 3247 3248 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 3249 if (r) { 3250 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 3251 adev->ip_blocks[i].version->funcs->name, r); 3252 } 3253 } 3254 3255 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3256 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3257 3258 amdgpu_amdkfd_suspend(adev, false); 3259 3260 /* Workaround for ASICs that need to disable the SMC first */ 3261 amdgpu_device_smu_fini_early(adev); 3262 3263 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3264 if (!adev->ip_blocks[i].status.hw) 3265 continue; 3266 3267 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 3268 /* XXX handle errors */ 3269 if (r) { 3270 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3271 adev->ip_blocks[i].version->funcs->name, r); 3272 } 3273 3274 adev->ip_blocks[i].status.hw = false; 3275 } 3276 3277 if (amdgpu_sriov_vf(adev)) { 3278 if (amdgpu_virt_release_full_gpu(adev, false)) 3279 DRM_ERROR("failed to release exclusive mode on fini\n"); 3280 } 3281 3282 return 0; 3283 } 3284 3285 /** 3286 * amdgpu_device_ip_fini - run fini for hardware IPs 3287 * 3288 * @adev: amdgpu_device pointer 3289 * 3290 * Main teardown pass for hardware IPs. The list of all the hardware 3291 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3292 * are run. hw_fini tears down the hardware associated with each IP 3293 * and sw_fini tears down any software state associated with each IP. 3294 * Returns 0 on success, negative error code on failure.
3295 */ 3296 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3297 { 3298 int i, r; 3299 3300 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3301 amdgpu_virt_release_ras_err_handler_data(adev); 3302 3303 if (adev->gmc.xgmi.num_physical_nodes > 1) 3304 amdgpu_xgmi_remove_device(adev); 3305 3306 amdgpu_amdkfd_device_fini_sw(adev); 3307 3308 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3309 if (!adev->ip_blocks[i].status.sw) 3310 continue; 3311 3312 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3313 amdgpu_ucode_free_bo(adev); 3314 amdgpu_free_static_csa(&adev->virt.csa_obj); 3315 amdgpu_device_wb_fini(adev); 3316 amdgpu_device_mem_scratch_fini(adev); 3317 amdgpu_ib_pool_fini(adev); 3318 amdgpu_seq64_fini(adev); 3319 } 3320 3321 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 3322 /* XXX handle errors */ 3323 if (r) { 3324 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 3325 adev->ip_blocks[i].version->funcs->name, r); 3326 } 3327 adev->ip_blocks[i].status.sw = false; 3328 adev->ip_blocks[i].status.valid = false; 3329 } 3330 3331 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3332 if (!adev->ip_blocks[i].status.late_initialized) 3333 continue; 3334 if (adev->ip_blocks[i].version->funcs->late_fini) 3335 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 3336 adev->ip_blocks[i].status.late_initialized = false; 3337 } 3338 3339 amdgpu_ras_fini(adev); 3340 3341 return 0; 3342 } 3343 3344 /** 3345 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3346 * 3347 * @work: work_struct. 3348 */ 3349 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3350 { 3351 struct amdgpu_device *adev = 3352 container_of(work, struct amdgpu_device, delayed_init_work.work); 3353 int r; 3354 3355 r = amdgpu_ib_ring_tests(adev); 3356 if (r) 3357 DRM_ERROR("ib ring test failed (%d).\n", r); 3358 } 3359 3360 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3361 { 3362 struct amdgpu_device *adev = 3363 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3364 3365 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3366 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3367 3368 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 3369 adev->gfx.gfx_off_state = true; 3370 } 3371 3372 /** 3373 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3374 * 3375 * @adev: amdgpu_device pointer 3376 * 3377 * Main suspend function for hardware IPs. The list of all the hardware 3378 * IPs that make up the asic is walked, clockgating is disabled and the 3379 * suspend callbacks are run. suspend puts the hardware and software state 3380 * in each IP into a state suitable for suspend. 3381 * Returns 0 on success, negative error code on failure. 3382 */ 3383 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3384 { 3385 int i, r; 3386 3387 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3388 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3389 3390 /* 3391 * Per PMFW team's suggestion, driver needs to handle gfxoff 3392 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3393 * scenario. Add the missing df cstate disablement here. 
3394 */ 3395 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3396 dev_warn(adev->dev, "Failed to disallow df cstate"); 3397 3398 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3399 if (!adev->ip_blocks[i].status.valid) 3400 continue; 3401 3402 /* displays are handled separately */ 3403 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3404 continue; 3405 3406 /* XXX handle errors */ 3407 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3408 /* XXX handle errors */ 3409 if (r) { 3410 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3411 adev->ip_blocks[i].version->funcs->name, r); 3412 return r; 3413 } 3414 3415 adev->ip_blocks[i].status.hw = false; 3416 } 3417 3418 return 0; 3419 } 3420 3421 /** 3422 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3423 * 3424 * @adev: amdgpu_device pointer 3425 * 3426 * Main suspend function for hardware IPs. The list of all the hardware 3427 * IPs that make up the asic is walked, clockgating is disabled and the 3428 * suspend callbacks are run. suspend puts the hardware and software state 3429 * in each IP into a state suitable for suspend. 3430 * Returns 0 on success, negative error code on failure. 3431 */ 3432 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3433 { 3434 int i, r; 3435 3436 if (adev->in_s0ix) 3437 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3438 3439 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3440 if (!adev->ip_blocks[i].status.valid) 3441 continue; 3442 /* displays are handled in phase1 */ 3443 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3444 continue; 3445 /* PSP lost connection when err_event_athub occurs */ 3446 if (amdgpu_ras_intr_triggered() && 3447 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3448 adev->ip_blocks[i].status.hw = false; 3449 continue; 3450 } 3451 3452 /* skip unnecessary suspend if we do not initialize them yet */ 3453 if (adev->gmc.xgmi.pending_reset && 3454 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3455 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || 3456 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3457 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { 3458 adev->ip_blocks[i].status.hw = false; 3459 continue; 3460 } 3461 3462 /* skip suspend of gfx/mes and psp for S0ix 3463 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3464 * like at runtime. PSP is also part of the always on hardware 3465 * so no need to suspend it. 3466 */ 3467 if (adev->in_s0ix && 3468 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3469 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3470 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3471 continue; 3472 3473 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3474 if (adev->in_s0ix && 3475 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3476 IP_VERSION(5, 0, 0)) && 3477 (adev->ip_blocks[i].version->type == 3478 AMD_IP_BLOCK_TYPE_SDMA)) 3479 continue; 3480 3481 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3482 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3483 * from this location and RLC Autoload automatically also gets loaded 3484 * from here based on PMFW -> PSP message during re-init sequence. 3485 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3486 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3487 */ 3488 if (amdgpu_in_reset(adev) && 3489 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3490 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3491 continue; 3492 3493 /* XXX handle errors */ 3494 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3495 /* XXX handle errors */ 3496 if (r) { 3497 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3498 adev->ip_blocks[i].version->funcs->name, r); 3499 } 3500 adev->ip_blocks[i].status.hw = false; 3501 /* handle putting the SMC in the appropriate state */ 3502 if (!amdgpu_sriov_vf(adev)) { 3503 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3504 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3505 if (r) { 3506 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3507 adev->mp1_state, r); 3508 return r; 3509 } 3510 } 3511 } 3512 } 3513 3514 return 0; 3515 } 3516 3517 /** 3518 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3519 * 3520 * @adev: amdgpu_device pointer 3521 * 3522 * Main suspend function for hardware IPs. The list of all the hardware 3523 * IPs that make up the asic is walked, clockgating is disabled and the 3524 * suspend callbacks are run. suspend puts the hardware and software state 3525 * in each IP into a state suitable for suspend. 3526 * Returns 0 on success, negative error code on failure. 3527 */ 3528 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3529 { 3530 int r; 3531 3532 if (amdgpu_sriov_vf(adev)) { 3533 amdgpu_virt_fini_data_exchange(adev); 3534 amdgpu_virt_request_full_gpu(adev, false); 3535 } 3536 3537 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3538 3539 r = amdgpu_device_ip_suspend_phase1(adev); 3540 if (r) 3541 return r; 3542 r = amdgpu_device_ip_suspend_phase2(adev); 3543 3544 if (amdgpu_sriov_vf(adev)) 3545 amdgpu_virt_release_full_gpu(adev, false); 3546 3547 return r; 3548 } 3549 3550 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3551 { 3552 int i, r; 3553 3554 static enum amd_ip_block_type ip_order[] = { 3555 AMD_IP_BLOCK_TYPE_COMMON, 3556 AMD_IP_BLOCK_TYPE_GMC, 3557 AMD_IP_BLOCK_TYPE_PSP, 3558 AMD_IP_BLOCK_TYPE_IH, 3559 }; 3560 3561 for (i = 0; i < adev->num_ip_blocks; i++) { 3562 int j; 3563 struct amdgpu_ip_block *block; 3564 3565 block = &adev->ip_blocks[i]; 3566 block->status.hw = false; 3567 3568 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3569 3570 if (block->version->type != ip_order[j] || 3571 !block->status.valid) 3572 continue; 3573 3574 r = block->version->funcs->hw_init(adev); 3575 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3576 if (r) 3577 return r; 3578 block->status.hw = true; 3579 } 3580 } 3581 3582 return 0; 3583 } 3584 3585 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3586 { 3587 int i, r; 3588 3589 static enum amd_ip_block_type ip_order[] = { 3590 AMD_IP_BLOCK_TYPE_SMC, 3591 AMD_IP_BLOCK_TYPE_DCE, 3592 AMD_IP_BLOCK_TYPE_GFX, 3593 AMD_IP_BLOCK_TYPE_SDMA, 3594 AMD_IP_BLOCK_TYPE_MES, 3595 AMD_IP_BLOCK_TYPE_UVD, 3596 AMD_IP_BLOCK_TYPE_VCE, 3597 AMD_IP_BLOCK_TYPE_VCN, 3598 AMD_IP_BLOCK_TYPE_JPEG 3599 }; 3600 3601 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3602 int j; 3603 struct amdgpu_ip_block *block; 3604 3605 for (j = 0; j < adev->num_ip_blocks; j++) { 3606 block = &adev->ip_blocks[j]; 3607 3608 if (block->version->type != ip_order[i] || 3609 !block->status.valid || 3610 block->status.hw) 3611 continue; 3612 3613 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3614 r = block->version->funcs->resume(adev); 3615 else 
3616 r = block->version->funcs->hw_init(adev); 3617 3618 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3619 if (r) 3620 return r; 3621 block->status.hw = true; 3622 } 3623 } 3624 3625 return 0; 3626 } 3627 3628 /** 3629 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3630 * 3631 * @adev: amdgpu_device pointer 3632 * 3633 * First resume function for hardware IPs. The list of all the hardware 3634 * IPs that make up the asic is walked and the resume callbacks are run for 3635 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3636 * after a suspend and updates the software state as necessary. This 3637 * function is also used for restoring the GPU after a GPU reset. 3638 * Returns 0 on success, negative error code on failure. 3639 */ 3640 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3641 { 3642 int i, r; 3643 3644 for (i = 0; i < adev->num_ip_blocks; i++) { 3645 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3646 continue; 3647 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3648 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3649 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3650 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3651 3652 r = adev->ip_blocks[i].version->funcs->resume(adev); 3653 if (r) { 3654 DRM_ERROR("resume of IP block <%s> failed %d\n", 3655 adev->ip_blocks[i].version->funcs->name, r); 3656 return r; 3657 } 3658 adev->ip_blocks[i].status.hw = true; 3659 } 3660 } 3661 3662 return 0; 3663 } 3664 3665 /** 3666 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3667 * 3668 * @adev: amdgpu_device pointer 3669 * 3670 * Second resume function for hardware IPs. The list of all the hardware 3671 * IPs that make up the asic is walked and the resume callbacks are run for 3672 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3673 * functional state after a suspend and updates the software state as 3674 * necessary. This function is also used for restoring the GPU after a GPU 3675 * reset. 3676 * Returns 0 on success, negative error code on failure. 3677 */ 3678 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3679 { 3680 int i, r; 3681 3682 for (i = 0; i < adev->num_ip_blocks; i++) { 3683 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3684 continue; 3685 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3686 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3687 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3688 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3689 continue; 3690 r = adev->ip_blocks[i].version->funcs->resume(adev); 3691 if (r) { 3692 DRM_ERROR("resume of IP block <%s> failed %d\n", 3693 adev->ip_blocks[i].version->funcs->name, r); 3694 return r; 3695 } 3696 adev->ip_blocks[i].status.hw = true; 3697 } 3698 3699 return 0; 3700 } 3701 3702 /** 3703 * amdgpu_device_ip_resume - run resume for hardware IPs 3704 * 3705 * @adev: amdgpu_device pointer 3706 * 3707 * Main resume function for hardware IPs. The hardware IPs 3708 * are split into two resume functions because they are 3709 * also used in recovering from a GPU reset and some additional 3710 * steps need to be taken between them. In this case (S3/S4) they are 3711 * run sequentially. 3712 * Returns 0 on success, negative error code on failure.
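 *
 * The resume path mirrors init: phase1 (COMMON, GMC, IH, and PSP for SR-IOV)
 * runs first, then firmware loading, then phase2 for the remaining blocks,
 * as can be seen in the function body below.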
3713 */ 3714 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3715 { 3716 int r; 3717 3718 r = amdgpu_device_ip_resume_phase1(adev); 3719 if (r) 3720 return r; 3721 3722 r = amdgpu_device_fw_loading(adev); 3723 if (r) 3724 return r; 3725 3726 r = amdgpu_device_ip_resume_phase2(adev); 3727 3728 if (adev->mman.buffer_funcs_ring->sched.ready) 3729 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3730 3731 return r; 3732 } 3733 3734 /** 3735 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3736 * 3737 * @adev: amdgpu_device pointer 3738 * 3739 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3740 */ 3741 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3742 { 3743 if (amdgpu_sriov_vf(adev)) { 3744 if (adev->is_atom_fw) { 3745 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3746 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3747 } else { 3748 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3749 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3750 } 3751 3752 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3753 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3754 } 3755 } 3756 3757 /** 3758 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3759 * 3760 * @asic_type: AMD asic type 3761 * 3762 * Check if there is DC (new modesetting infrastructure) support for an asic. 3763 * Returns true if DC has support, false if not. 3764 */ 3765 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3766 { 3767 switch (asic_type) { 3768 #ifdef CONFIG_DRM_AMDGPU_SI 3769 case CHIP_HAINAN: 3770 #endif 3771 case CHIP_TOPAZ: 3772 /* chips with no display hardware */ 3773 return false; 3774 #if defined(CONFIG_DRM_AMD_DC) 3775 case CHIP_TAHITI: 3776 case CHIP_PITCAIRN: 3777 case CHIP_VERDE: 3778 case CHIP_OLAND: 3779 /* 3780 * We have systems in the wild with these ASICs that require 3781 * LVDS and VGA support which is not supported with DC. 3782 * 3783 * Fall back to the non-DC driver here by default so as not to 3784 * cause regressions. 3785 */ 3786 #if defined(CONFIG_DRM_AMD_DC_SI) 3787 return amdgpu_dc > 0; 3788 #else 3789 return false; 3790 #endif 3791 case CHIP_BONAIRE: 3792 case CHIP_KAVERI: 3793 case CHIP_KABINI: 3794 case CHIP_MULLINS: 3795 /* 3796 * We have systems in the wild with these ASICs that require 3797 * VGA support which is not supported with DC. 3798 * 3799 * Fall back to the non-DC driver here by default so as not to 3800 * cause regressions.
3801 */ 3802 return amdgpu_dc > 0; 3803 default: 3804 return amdgpu_dc != 0; 3805 #else 3806 default: 3807 if (amdgpu_dc > 0) 3808 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 3809 return false; 3810 #endif 3811 } 3812 } 3813 3814 /** 3815 * amdgpu_device_has_dc_support - check if dc is supported 3816 * 3817 * @adev: amdgpu_device pointer 3818 * 3819 * Returns true for supported, false for not supported 3820 */ 3821 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3822 { 3823 if (adev->enable_virtual_display || 3824 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3825 return false; 3826 3827 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3828 } 3829 3830 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3831 { 3832 struct amdgpu_device *adev = 3833 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3834 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3835 3836 /* It's a bug to not have a hive within this function */ 3837 if (WARN_ON(!hive)) 3838 return; 3839 3840 /* 3841 * Use task barrier to synchronize all xgmi reset works across the 3842 * hive. task_barrier_enter and task_barrier_exit will block 3843 * until all the threads running the xgmi reset works reach 3844 * those points. task_barrier_full will do both blocks. 3845 */ 3846 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3847 3848 task_barrier_enter(&hive->tb); 3849 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3850 3851 if (adev->asic_reset_res) 3852 goto fail; 3853 3854 task_barrier_exit(&hive->tb); 3855 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3856 3857 if (adev->asic_reset_res) 3858 goto fail; 3859 3860 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 3861 } else { 3862 3863 task_barrier_full(&hive->tb); 3864 adev->asic_reset_res = amdgpu_asic_reset(adev); 3865 } 3866 3867 fail: 3868 if (adev->asic_reset_res) 3869 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3870 adev->asic_reset_res, adev_to_drm(adev)->unique); 3871 amdgpu_put_xgmi_hive(hive); 3872 } 3873 3874 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3875 { 3876 char *input = amdgpu_lockup_timeout; 3877 char *timeout_setting = NULL; 3878 int index = 0; 3879 long timeout; 3880 int ret = 0; 3881 3882 /* 3883 * By default timeout for non compute jobs is 10000 3884 * and 60000 for compute jobs. 3885 * In SR-IOV or passthrough mode, timeout for compute 3886 * jobs are 60000 by default. 3887 */ 3888 adev->gfx_timeout = msecs_to_jiffies(10000); 3889 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3890 if (amdgpu_sriov_vf(adev)) 3891 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3892 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3893 else 3894 adev->compute_timeout = msecs_to_jiffies(60000); 3895 3896 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3897 while ((timeout_setting = strsep(&input, ",")) && 3898 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3899 ret = kstrtol(timeout_setting, 0, &timeout); 3900 if (ret) 3901 return ret; 3902 3903 if (timeout == 0) { 3904 index++; 3905 continue; 3906 } else if (timeout < 0) { 3907 timeout = MAX_SCHEDULE_TIMEOUT; 3908 dev_warn(adev->dev, "lockup timeout disabled"); 3909 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3910 } else { 3911 timeout = msecs_to_jiffies(timeout); 3912 } 3913 3914 switch (index++) { 3915 case 0: 3916 adev->gfx_timeout = timeout; 3917 break; 3918 case 1: 3919 adev->compute_timeout = timeout; 3920 break; 3921 case 2: 3922 adev->sdma_timeout = timeout; 3923 break; 3924 case 3: 3925 adev->video_timeout = timeout; 3926 break; 3927 default: 3928 break; 3929 } 3930 } 3931 /* 3932 * There is only one value specified and 3933 * it should apply to all non-compute jobs. 3934 */ 3935 if (index == 1) { 3936 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3937 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3938 adev->compute_timeout = adev->gfx_timeout; 3939 } 3940 } 3941 3942 return ret; 3943 } 3944 3945 /** 3946 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 3947 * 3948 * @adev: amdgpu_device pointer 3949 * 3950 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 3951 */ 3952 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 3953 { 3954 struct iommu_domain *domain; 3955 3956 domain = iommu_get_domain_for_dev(adev->dev); 3957 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 3958 adev->ram_is_direct_mapped = true; 3959 } 3960 3961 #if defined(CONFIG_HSA_AMD_P2P) 3962 /** 3963 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled. 3964 * 3965 * @adev: amdgpu_device pointer 3966 * 3967 * return if IOMMU remapping bar address 3968 */ 3969 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev) 3970 { 3971 struct iommu_domain *domain; 3972 3973 domain = iommu_get_domain_for_dev(adev->dev); 3974 if (domain && (domain->type == IOMMU_DOMAIN_DMA || 3975 domain->type == IOMMU_DOMAIN_DMA_FQ)) 3976 return true; 3977 3978 return false; 3979 } 3980 #endif 3981 3982 static const struct attribute *amdgpu_dev_attributes[] = { 3983 &dev_attr_pcie_replay_count.attr, 3984 NULL 3985 }; 3986 3987 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 3988 { 3989 if (amdgpu_mcbp == 1) 3990 adev->gfx.mcbp = true; 3991 else if (amdgpu_mcbp == 0) 3992 adev->gfx.mcbp = false; 3993 3994 if (amdgpu_sriov_vf(adev)) 3995 adev->gfx.mcbp = true; 3996 3997 if (adev->gfx.mcbp) 3998 DRM_INFO("MCBP is enabled\n"); 3999 } 4000 4001 /** 4002 * amdgpu_device_init - initialize the driver 4003 * 4004 * @adev: amdgpu_device pointer 4005 * @flags: driver flags 4006 * 4007 * Initializes the driver info and hw (all asics). 4008 * Returns 0 for success or an error on failure. 4009 * Called at driver startup. 
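 *
 * Broadly (see the body below), this sets up locks, work items and register
 * accessors, maps the MMIO BAR, creates the reset domain, detects
 * virtualization, runs early IP init, optionally resets/posts the ASIC,
 * brings up the fence driver and IP blocks, and finally registers the
 * various sysfs interfaces.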
4010 */ 4011 int amdgpu_device_init(struct amdgpu_device *adev, 4012 uint32_t flags) 4013 { 4014 struct drm_device *ddev = adev_to_drm(adev); 4015 struct pci_dev *pdev = adev->pdev; 4016 int r, i; 4017 bool px = false; 4018 u32 max_MBps; 4019 int tmp; 4020 4021 adev->shutdown = false; 4022 adev->flags = flags; 4023 4024 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 4025 adev->asic_type = amdgpu_force_asic_type; 4026 else 4027 adev->asic_type = flags & AMD_ASIC_MASK; 4028 4029 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 4030 if (amdgpu_emu_mode == 1) 4031 adev->usec_timeout *= 10; 4032 adev->gmc.gart_size = 512 * 1024 * 1024; 4033 adev->accel_working = false; 4034 adev->num_rings = 0; 4035 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 4036 adev->mman.buffer_funcs = NULL; 4037 adev->mman.buffer_funcs_ring = NULL; 4038 adev->vm_manager.vm_pte_funcs = NULL; 4039 adev->vm_manager.vm_pte_num_scheds = 0; 4040 adev->gmc.gmc_funcs = NULL; 4041 adev->harvest_ip_mask = 0x0; 4042 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 4043 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 4044 4045 adev->smc_rreg = &amdgpu_invalid_rreg; 4046 adev->smc_wreg = &amdgpu_invalid_wreg; 4047 adev->pcie_rreg = &amdgpu_invalid_rreg; 4048 adev->pcie_wreg = &amdgpu_invalid_wreg; 4049 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 4050 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 4051 adev->pciep_rreg = &amdgpu_invalid_rreg; 4052 adev->pciep_wreg = &amdgpu_invalid_wreg; 4053 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 4054 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 4055 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 4056 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 4057 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 4058 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 4059 adev->didt_rreg = &amdgpu_invalid_rreg; 4060 adev->didt_wreg = &amdgpu_invalid_wreg; 4061 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 4062 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 4063 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 4064 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 4065 4066 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 4067 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 4068 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 4069 4070 /* mutex initialization are all done here so we 4071 * can recall function without having locking issues 4072 */ 4073 mutex_init(&adev->firmware.mutex); 4074 mutex_init(&adev->pm.mutex); 4075 mutex_init(&adev->gfx.gpu_clock_mutex); 4076 mutex_init(&adev->srbm_mutex); 4077 mutex_init(&adev->gfx.pipe_reserve_mutex); 4078 mutex_init(&adev->gfx.gfx_off_mutex); 4079 mutex_init(&adev->gfx.partition_mutex); 4080 mutex_init(&adev->grbm_idx_mutex); 4081 mutex_init(&adev->mn_lock); 4082 mutex_init(&adev->virt.vf_errors.lock); 4083 mutex_init(&adev->virt.rlcg_reg_lock); 4084 hash_init(adev->mn_hash); 4085 mutex_init(&adev->psp.mutex); 4086 mutex_init(&adev->notifier_lock); 4087 mutex_init(&adev->pm.stable_pstate_ctx_lock); 4088 mutex_init(&adev->benchmark_mutex); 4089 mutex_init(&adev->gfx.reset_sem_mutex); 4090 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */ 4091 mutex_init(&adev->enforce_isolation_mutex); 4092 mutex_init(&adev->gfx.kfd_sch_mutex); 4093 4094 amdgpu_device_init_apu_flags(adev); 4095 4096 r = amdgpu_device_check_arguments(adev); 4097 if (r) 4098 return r; 4099 4100 
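	/* Spinlocks, mostly protecting the indirect register index/data pairs
	 * plus a couple of bookkeeping structures (mm stats, writeback).
	 */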
	spin_lock_init(&adev->mmio_idx_lock);
	spin_lock_init(&adev->smc_idx_lock);
	spin_lock_init(&adev->pcie_idx_lock);
	spin_lock_init(&adev->uvd_ctx_idx_lock);
	spin_lock_init(&adev->didt_idx_lock);
	spin_lock_init(&adev->gc_cac_idx_lock);
	spin_lock_init(&adev->se_cac_idx_lock);
	spin_lock_init(&adev->audio_endpt_idx_lock);
	spin_lock_init(&adev->mm_stats.lock);
	spin_lock_init(&adev->wb.lock);

	INIT_LIST_HEAD(&adev->reset_list);

	INIT_LIST_HEAD(&adev->ras_list);

	INIT_LIST_HEAD(&adev->pm.od_kobj_list);

	INIT_DELAYED_WORK(&adev->delayed_init_work,
			  amdgpu_device_delayed_init_work_handler);
	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
			  amdgpu_device_delay_enable_gfx_off);
	/*
	 * Initialize the enforce_isolation work structures for each XCP
	 * partition. This work handler is responsible for enforcing shader
	 * isolation on AMD GPUs. It counts the number of emitted fences for
	 * each GFX and compute ring. If there are any fences, it schedules
	 * the `enforce_isolation_work` to be run after a delay. If there are
	 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the
	 * runqueue.
	 */
	for (i = 0; i < MAX_XCP; i++) {
		INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work,
				  amdgpu_gfx_enforce_isolation_handler);
		adev->gfx.enforce_isolation[i].adev = adev;
		adev->gfx.enforce_isolation[i].xcp_id = i;
	}

	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);

	adev->gfx.gfx_off_req_count = 1;
	adev->gfx.gfx_off_residency = 0;
	adev->gfx.gfx_off_entrycount = 0;
	adev->pm.ac_power = power_supply_is_system_supplied() > 0;

	atomic_set(&adev->throttling_logging_enabled, 1);
	/*
	 * If throttling continues, logging will be performed every minute
	 * to avoid log flooding. "-1" is subtracted since the thermal
	 * throttling interrupt comes every second. Thus, the total logging
	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
	 * for the throttling interrupt) = 60 seconds.
	 */
	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);

	/* Register mapping */
	/* TODO: block userspace mapping of io register */
	if (adev->asic_type >= CHIP_BONAIRE) {
		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
	} else {
		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
	}

	for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
		atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);

	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
	if (!adev->rmmio)
		return -ENOMEM;

	DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
	DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);

	/*
	 * Reset domain needs to be present early, before the XGMI hive is
	 * discovered (if any) and initialized, to use the reset sem and
	 * in_gpu reset flag early on during init and before calling RREG32.
	 */
	adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
	if (!adev->reset_domain)
		return -ENOMEM;

	/* detect hw virtualization here */
	amdgpu_detect_virtualization(adev);

	amdgpu_device_get_pcie_info(adev);

	r = amdgpu_device_get_job_timeout_settings(adev);
	if (r) {
		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
		return r;
	}

	amdgpu_device_set_mcbp(adev);

	/* early init functions */
	r = amdgpu_device_ip_early_init(adev);
	if (r)
		return r;

	/* Get rid of things like offb */
	r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name);
	if (r)
		return r;

	/* Enable TMZ based on IP_VERSION */
	amdgpu_gmc_tmz_set(adev);

	if (amdgpu_sriov_vf(adev) &&
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
		/* VF MMIO access (except the mailbox range) from the CPU
		 * will be blocked during SR-IOV runtime.
		 */
		adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;

	amdgpu_gmc_noretry_set(adev);
	/* Need to get xgmi info early to decide the reset behavior */
	if (adev->gmc.xgmi.supported) {
		r = adev->gfxhub.funcs->get_xgmi_info(adev);
		if (r)
			return r;
	}

	/* enable PCIE atomic ops */
	if (amdgpu_sriov_vf(adev)) {
		if (adev->virt.fw_reserve.p_pf2vf)
			adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
				adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
				(PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
	/* APUs with gfx9 and newer do not rely on PCIe atomics; their internal
	 * path natively supports atomics, so set have_atomics_support to true.
	 */
	} else if ((adev->flags & AMD_IS_APU) &&
		   (amdgpu_ip_version(adev, GC_HWIP, 0) >
		    IP_VERSION(9, 0, 0))) {
		adev->have_atomics_support = true;
	} else {
		adev->have_atomics_support =
			!pci_enable_atomic_ops_to_root(adev->pdev,
				PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
				PCI_EXP_DEVCAP2_ATOMIC_COMP64);
	}

	if (!adev->have_atomics_support)
		dev_info(adev->dev, "PCIE atomic ops is not supported\n");

	/* doorbell bar mapping and doorbell index init */
	amdgpu_doorbell_init(adev);

	if (amdgpu_emu_mode == 1) {
		/* post the asic on emulation mode */
		emu_soc_asic_init(adev);
		goto fence_driver_init;
	}

	amdgpu_reset_init(adev);

	/* detect whether we are running with an SR-IOV vBIOS */
	if (adev->bios)
		amdgpu_device_detect_sriov_bios(adev);

	/* check if we need to reset the asic
	 * E.g., driver was not cleanly unloaded previously, etc.
4265 */ 4266 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4267 if (adev->gmc.xgmi.num_physical_nodes) { 4268 dev_info(adev->dev, "Pending hive reset.\n"); 4269 adev->gmc.xgmi.pending_reset = true; 4270 /* Only need to init necessary block for SMU to handle the reset */ 4271 for (i = 0; i < adev->num_ip_blocks; i++) { 4272 if (!adev->ip_blocks[i].status.valid) 4273 continue; 4274 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 4275 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 4276 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 4277 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { 4278 DRM_DEBUG("IP %s disabled for hw_init.\n", 4279 adev->ip_blocks[i].version->funcs->name); 4280 adev->ip_blocks[i].status.hw = true; 4281 } 4282 } 4283 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && 4284 !amdgpu_device_has_display_hardware(adev)) { 4285 r = psp_gpu_reset(adev); 4286 } else { 4287 tmp = amdgpu_reset_method; 4288 /* It should do a default reset when loading or reloading the driver, 4289 * regardless of the module parameter reset_method. 4290 */ 4291 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4292 r = amdgpu_asic_reset(adev); 4293 amdgpu_reset_method = tmp; 4294 } 4295 4296 if (r) { 4297 dev_err(adev->dev, "asic reset on init failed\n"); 4298 goto failed; 4299 } 4300 } 4301 4302 /* Post card if necessary */ 4303 if (amdgpu_device_need_post(adev)) { 4304 if (!adev->bios) { 4305 dev_err(adev->dev, "no vBIOS found\n"); 4306 r = -EINVAL; 4307 goto failed; 4308 } 4309 DRM_INFO("GPU posting now...\n"); 4310 r = amdgpu_device_asic_init(adev); 4311 if (r) { 4312 dev_err(adev->dev, "gpu post error!\n"); 4313 goto failed; 4314 } 4315 } 4316 4317 if (adev->bios) { 4318 if (adev->is_atom_fw) { 4319 /* Initialize clocks */ 4320 r = amdgpu_atomfirmware_get_clock_info(adev); 4321 if (r) { 4322 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4323 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4324 goto failed; 4325 } 4326 } else { 4327 /* Initialize clocks */ 4328 r = amdgpu_atombios_get_clock_info(adev); 4329 if (r) { 4330 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4331 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4332 goto failed; 4333 } 4334 /* init i2c buses */ 4335 if (!amdgpu_device_has_dc_support(adev)) 4336 amdgpu_atombios_i2c_init(adev); 4337 } 4338 } 4339 4340 fence_driver_init: 4341 /* Fence driver */ 4342 r = amdgpu_fence_driver_sw_init(adev); 4343 if (r) { 4344 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4345 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4346 goto failed; 4347 } 4348 4349 /* init the mode config */ 4350 drm_mode_config_init(adev_to_drm(adev)); 4351 4352 r = amdgpu_device_ip_init(adev); 4353 if (r) { 4354 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4355 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4356 goto release_ras_con; 4357 } 4358 4359 amdgpu_fence_driver_hw_init(adev); 4360 4361 dev_info(adev->dev, 4362 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4363 adev->gfx.config.max_shader_engines, 4364 adev->gfx.config.max_sh_per_se, 4365 adev->gfx.config.max_cu_per_sh, 4366 adev->gfx.cu_info.number); 4367 4368 adev->accel_working = true; 4369 4370 amdgpu_vm_check_compute_bug(adev); 4371 4372 /* Initialize the buffer migration limit. 
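 * amdgpu.moverate is given in MB/s; a negative value keeps the 8 MB/s
 * default chosen below, and the limit is stored as a log2 so that later
 * throttling math can divide by shifting.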
*/ 4373 if (amdgpu_moverate >= 0) 4374 max_MBps = amdgpu_moverate; 4375 else 4376 max_MBps = 8; /* Allow 8 MB/s. */ 4377 /* Get a log2 for easy divisions. */ 4378 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4379 4380 /* 4381 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4382 * Otherwise the mgpu fan boost feature will be skipped due to the 4383 * gpu instance is counted less. 4384 */ 4385 amdgpu_register_gpu_instance(adev); 4386 4387 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4388 * explicit gating rather than handling it automatically. 4389 */ 4390 if (!adev->gmc.xgmi.pending_reset) { 4391 r = amdgpu_device_ip_late_init(adev); 4392 if (r) { 4393 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4394 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4395 goto release_ras_con; 4396 } 4397 /* must succeed. */ 4398 amdgpu_ras_resume(adev); 4399 queue_delayed_work(system_wq, &adev->delayed_init_work, 4400 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4401 } 4402 4403 if (amdgpu_sriov_vf(adev)) { 4404 amdgpu_virt_release_full_gpu(adev, true); 4405 flush_delayed_work(&adev->delayed_init_work); 4406 } 4407 4408 /* 4409 * Place those sysfs registering after `late_init`. As some of those 4410 * operations performed in `late_init` might affect the sysfs 4411 * interfaces creating. 4412 */ 4413 r = amdgpu_atombios_sysfs_init(adev); 4414 if (r) 4415 drm_err(&adev->ddev, 4416 "registering atombios sysfs failed (%d).\n", r); 4417 4418 r = amdgpu_pm_sysfs_init(adev); 4419 if (r) 4420 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 4421 4422 r = amdgpu_ucode_sysfs_init(adev); 4423 if (r) { 4424 adev->ucode_sysfs_en = false; 4425 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 4426 } else 4427 adev->ucode_sysfs_en = true; 4428 4429 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 4430 if (r) 4431 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4432 4433 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4434 if (r) 4435 dev_err(adev->dev, 4436 "Could not create amdgpu board attributes\n"); 4437 4438 amdgpu_fru_sysfs_init(adev); 4439 amdgpu_reg_state_sysfs_init(adev); 4440 4441 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4442 r = amdgpu_pmu_init(adev); 4443 if (r) 4444 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4445 4446 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4447 if (amdgpu_device_cache_pci_state(adev->pdev)) 4448 pci_restore_state(pdev); 4449 4450 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4451 /* this will fail for cards that aren't VGA class devices, just 4452 * ignore it 4453 */ 4454 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4455 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4456 4457 px = amdgpu_device_supports_px(ddev); 4458 4459 if (px || (!dev_is_removable(&adev->pdev->dev) && 4460 apple_gmux_detect(NULL, NULL))) 4461 vga_switcheroo_register_client(adev->pdev, 4462 &amdgpu_switcheroo_ops, px); 4463 4464 if (px) 4465 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4466 4467 if (adev->gmc.xgmi.pending_reset) 4468 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, 4469 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4470 4471 amdgpu_device_check_iommu_direct_map(adev); 4472 4473 return 0; 4474 4475 release_ras_con: 4476 if (amdgpu_sriov_vf(adev)) 4477 amdgpu_virt_release_full_gpu(adev, true); 4478 4479 /* failed in exclusive mode due to timeout */ 4480 if 
(amdgpu_sriov_vf(adev) && 4481 !amdgpu_sriov_runtime(adev) && 4482 amdgpu_virt_mmio_blocked(adev) && 4483 !amdgpu_virt_wait_reset(adev)) { 4484 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4485 /* Don't send request since VF is inactive. */ 4486 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4487 adev->virt.ops = NULL; 4488 r = -EAGAIN; 4489 } 4490 amdgpu_release_ras_context(adev); 4491 4492 failed: 4493 amdgpu_vf_error_trans_all(adev); 4494 4495 return r; 4496 } 4497 4498 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4499 { 4500 4501 /* Clear all CPU mappings pointing to this device */ 4502 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4503 4504 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4505 amdgpu_doorbell_fini(adev); 4506 4507 iounmap(adev->rmmio); 4508 adev->rmmio = NULL; 4509 if (adev->mman.aper_base_kaddr) 4510 iounmap(adev->mman.aper_base_kaddr); 4511 adev->mman.aper_base_kaddr = NULL; 4512 4513 /* Memory manager related */ 4514 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4515 arch_phys_wc_del(adev->gmc.vram_mtrr); 4516 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4517 } 4518 } 4519 4520 /** 4521 * amdgpu_device_fini_hw - tear down the driver 4522 * 4523 * @adev: amdgpu_device pointer 4524 * 4525 * Tear down the driver info (all asics). 4526 * Called at driver shutdown. 4527 */ 4528 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4529 { 4530 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4531 flush_delayed_work(&adev->delayed_init_work); 4532 4533 if (adev->mman.initialized) 4534 drain_workqueue(adev->mman.bdev.wq); 4535 adev->shutdown = true; 4536 4537 /* make sure IB test finished before entering exclusive mode 4538 * to avoid preemption on IB test 4539 */ 4540 if (amdgpu_sriov_vf(adev)) { 4541 amdgpu_virt_request_full_gpu(adev, false); 4542 amdgpu_virt_fini_data_exchange(adev); 4543 } 4544 4545 /* disable all interrupts */ 4546 amdgpu_irq_disable_all(adev); 4547 if (adev->mode_info.mode_config_initialized) { 4548 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4549 drm_helper_force_disable_all(adev_to_drm(adev)); 4550 else 4551 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4552 } 4553 amdgpu_fence_driver_hw_fini(adev); 4554 4555 if (adev->pm.sysfs_initialized) 4556 amdgpu_pm_sysfs_fini(adev); 4557 if (adev->ucode_sysfs_en) 4558 amdgpu_ucode_sysfs_fini(adev); 4559 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4560 amdgpu_fru_sysfs_fini(adev); 4561 4562 amdgpu_reg_state_sysfs_fini(adev); 4563 4564 /* disable ras feature must before hw fini */ 4565 amdgpu_ras_pre_fini(adev); 4566 4567 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4568 4569 amdgpu_device_ip_fini_early(adev); 4570 4571 amdgpu_irq_fini_hw(adev); 4572 4573 if (adev->mman.initialized) 4574 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4575 4576 amdgpu_gart_dummy_page_fini(adev); 4577 4578 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4579 amdgpu_device_unmap_mmio(adev); 4580 4581 } 4582 4583 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4584 { 4585 int idx; 4586 bool px; 4587 4588 amdgpu_fence_driver_sw_fini(adev); 4589 amdgpu_device_ip_fini(adev); 4590 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4591 adev->accel_working = false; 4592 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4593 4594 amdgpu_reset_fini(adev); 4595 4596 /* free i2c buses */ 4597 if (!amdgpu_device_has_dc_support(adev)) 4598 amdgpu_i2c_fini(adev); 4599 4600 if (amdgpu_emu_mode 
!= 1) 4601 amdgpu_atombios_fini(adev); 4602 4603 kfree(adev->bios); 4604 adev->bios = NULL; 4605 4606 kfree(adev->fru_info); 4607 adev->fru_info = NULL; 4608 4609 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4610 4611 if (px || (!dev_is_removable(&adev->pdev->dev) && 4612 apple_gmux_detect(NULL, NULL))) 4613 vga_switcheroo_unregister_client(adev->pdev); 4614 4615 if (px) 4616 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4617 4618 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4619 vga_client_unregister(adev->pdev); 4620 4621 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4622 4623 iounmap(adev->rmmio); 4624 adev->rmmio = NULL; 4625 amdgpu_doorbell_fini(adev); 4626 drm_dev_exit(idx); 4627 } 4628 4629 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4630 amdgpu_pmu_fini(adev); 4631 if (adev->mman.discovery_bin) 4632 amdgpu_discovery_fini(adev); 4633 4634 amdgpu_reset_put_reset_domain(adev->reset_domain); 4635 adev->reset_domain = NULL; 4636 4637 kfree(adev->pci_state); 4638 4639 } 4640 4641 /** 4642 * amdgpu_device_evict_resources - evict device resources 4643 * @adev: amdgpu device object 4644 * 4645 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4646 * of the vram memory type. Mainly used for evicting device resources 4647 * at suspend time. 4648 * 4649 */ 4650 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4651 { 4652 int ret; 4653 4654 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4655 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4656 return 0; 4657 4658 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4659 if (ret) 4660 DRM_WARN("evicting device resources failed\n"); 4661 return ret; 4662 } 4663 4664 /* 4665 * Suspend & resume. 4666 */ 4667 /** 4668 * amdgpu_device_prepare - prepare for device suspend 4669 * 4670 * @dev: drm dev pointer 4671 * 4672 * Prepare to put the hw in the suspend state (all asics). 4673 * Returns 0 for success or an error on failure. 4674 * Called at driver suspend. 4675 */ 4676 int amdgpu_device_prepare(struct drm_device *dev) 4677 { 4678 struct amdgpu_device *adev = drm_to_adev(dev); 4679 int i, r; 4680 4681 amdgpu_choose_low_power_state(adev); 4682 4683 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4684 return 0; 4685 4686 /* Evict the majority of BOs before starting suspend sequence */ 4687 r = amdgpu_device_evict_resources(adev); 4688 if (r) 4689 goto unprepare; 4690 4691 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 4692 4693 for (i = 0; i < adev->num_ip_blocks; i++) { 4694 if (!adev->ip_blocks[i].status.valid) 4695 continue; 4696 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 4697 continue; 4698 r = adev->ip_blocks[i].version->funcs->prepare_suspend((void *)adev); 4699 if (r) 4700 goto unprepare; 4701 } 4702 4703 return 0; 4704 4705 unprepare: 4706 adev->in_s0ix = adev->in_s3 = false; 4707 4708 return r; 4709 } 4710 4711 /** 4712 * amdgpu_device_suspend - initiate device suspend 4713 * 4714 * @dev: drm dev pointer 4715 * @fbcon : notify the fbdev of suspend 4716 * 4717 * Puts the hw in the suspend state (all asics). 4718 * Returns 0 for success or an error on failure. 4719 * Called at driver suspend. 
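 *
 * Roughly: evict VRAM, run the phase-1 IP suspend callbacks, suspend KFD
 * (unless entering S0ix), evict again, quiesce the fence driver, then run
 * the phase-2 IP suspend callbacks (releasing the GPU again for SR-IOV VFs).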
4720 */ 4721 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4722 { 4723 struct amdgpu_device *adev = drm_to_adev(dev); 4724 int r = 0; 4725 4726 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4727 return 0; 4728 4729 adev->in_suspend = true; 4730 4731 if (amdgpu_sriov_vf(adev)) { 4732 amdgpu_virt_fini_data_exchange(adev); 4733 r = amdgpu_virt_request_full_gpu(adev, false); 4734 if (r) 4735 return r; 4736 } 4737 4738 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4739 DRM_WARN("smart shift update failed\n"); 4740 4741 if (fbcon) 4742 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4743 4744 cancel_delayed_work_sync(&adev->delayed_init_work); 4745 4746 amdgpu_ras_suspend(adev); 4747 4748 amdgpu_device_ip_suspend_phase1(adev); 4749 4750 if (!adev->in_s0ix) 4751 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4752 4753 r = amdgpu_device_evict_resources(adev); 4754 if (r) 4755 return r; 4756 4757 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4758 4759 amdgpu_fence_driver_hw_fini(adev); 4760 4761 amdgpu_device_ip_suspend_phase2(adev); 4762 4763 if (amdgpu_sriov_vf(adev)) 4764 amdgpu_virt_release_full_gpu(adev, false); 4765 4766 r = amdgpu_dpm_notify_rlc_state(adev, false); 4767 if (r) 4768 return r; 4769 4770 return 0; 4771 } 4772 4773 /** 4774 * amdgpu_device_resume - initiate device resume 4775 * 4776 * @dev: drm dev pointer 4777 * @fbcon : notify the fbdev of resume 4778 * 4779 * Bring the hw back to operating state (all asics). 4780 * Returns 0 for success or an error on failure. 4781 * Called at driver resume. 4782 */ 4783 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4784 { 4785 struct amdgpu_device *adev = drm_to_adev(dev); 4786 int r = 0; 4787 4788 if (amdgpu_sriov_vf(adev)) { 4789 r = amdgpu_virt_request_full_gpu(adev, true); 4790 if (r) 4791 return r; 4792 } 4793 4794 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4795 return 0; 4796 4797 if (adev->in_s0ix) 4798 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4799 4800 /* post card */ 4801 if (amdgpu_device_need_post(adev)) { 4802 r = amdgpu_device_asic_init(adev); 4803 if (r) 4804 dev_err(adev->dev, "amdgpu asic init failed\n"); 4805 } 4806 4807 r = amdgpu_device_ip_resume(adev); 4808 4809 if (r) { 4810 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4811 goto exit; 4812 } 4813 amdgpu_fence_driver_hw_init(adev); 4814 4815 if (!adev->in_s0ix) { 4816 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4817 if (r) 4818 goto exit; 4819 } 4820 4821 r = amdgpu_device_ip_late_init(adev); 4822 if (r) 4823 goto exit; 4824 4825 queue_delayed_work(system_wq, &adev->delayed_init_work, 4826 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4827 exit: 4828 if (amdgpu_sriov_vf(adev)) { 4829 amdgpu_virt_init_data_exchange(adev); 4830 amdgpu_virt_release_full_gpu(adev, true); 4831 } 4832 4833 if (r) 4834 return r; 4835 4836 /* Make sure IB tests flushed */ 4837 flush_delayed_work(&adev->delayed_init_work); 4838 4839 if (fbcon) 4840 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4841 4842 amdgpu_ras_resume(adev); 4843 4844 if (adev->mode_info.num_crtc) { 4845 /* 4846 * Most of the connector probing functions try to acquire runtime pm 4847 * refs to ensure that the GPU is powered on when connector polling is 4848 * performed. Since we're calling this from a runtime PM callback, 4849 * trying to acquire rpm refs will cause us to deadlock. 
4850 * 4851 * Since we're guaranteed to be holding the rpm lock, it's safe to 4852 * temporarily disable the rpm helpers so this doesn't deadlock us. 4853 */ 4854 #ifdef CONFIG_PM 4855 dev->dev->power.disable_depth++; 4856 #endif 4857 if (!adev->dc_enabled) 4858 drm_helper_hpd_irq_event(dev); 4859 else 4860 drm_kms_helper_hotplug_event(dev); 4861 #ifdef CONFIG_PM 4862 dev->dev->power.disable_depth--; 4863 #endif 4864 } 4865 adev->in_suspend = false; 4866 4867 if (adev->enable_mes) 4868 amdgpu_mes_self_test(adev); 4869 4870 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4871 DRM_WARN("smart shift update failed\n"); 4872 4873 return 0; 4874 } 4875 4876 /** 4877 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4878 * 4879 * @adev: amdgpu_device pointer 4880 * 4881 * The list of all the hardware IPs that make up the asic is walked and 4882 * the check_soft_reset callbacks are run. check_soft_reset determines 4883 * if the asic is still hung or not. 4884 * Returns true if any of the IPs are still in a hung state, false if not. 4885 */ 4886 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4887 { 4888 int i; 4889 bool asic_hang = false; 4890 4891 if (amdgpu_sriov_vf(adev)) 4892 return true; 4893 4894 if (amdgpu_asic_need_full_reset(adev)) 4895 return true; 4896 4897 for (i = 0; i < adev->num_ip_blocks; i++) { 4898 if (!adev->ip_blocks[i].status.valid) 4899 continue; 4900 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4901 adev->ip_blocks[i].status.hang = 4902 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4903 if (adev->ip_blocks[i].status.hang) { 4904 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4905 asic_hang = true; 4906 } 4907 } 4908 return asic_hang; 4909 } 4910 4911 /** 4912 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4913 * 4914 * @adev: amdgpu_device pointer 4915 * 4916 * The list of all the hardware IPs that make up the asic is walked and the 4917 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4918 * handles any IP specific hardware or software state changes that are 4919 * necessary for a soft reset to succeed. 4920 * Returns 0 on success, negative error code on failure. 4921 */ 4922 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4923 { 4924 int i, r = 0; 4925 4926 for (i = 0; i < adev->num_ip_blocks; i++) { 4927 if (!adev->ip_blocks[i].status.valid) 4928 continue; 4929 if (adev->ip_blocks[i].status.hang && 4930 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4931 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4932 if (r) 4933 return r; 4934 } 4935 } 4936 4937 return 0; 4938 } 4939 4940 /** 4941 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4942 * 4943 * @adev: amdgpu_device pointer 4944 * 4945 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4946 * reset is necessary to recover. 4947 * Returns true if a full asic reset is required, false if not. 
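 *
 * In the walk below, a hang in a GMC, SMC, ACP, DCE or PSP block is what
 * triggers the full reset path.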
4948 */ 4949 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 4950 { 4951 int i; 4952 4953 if (amdgpu_asic_need_full_reset(adev)) 4954 return true; 4955 4956 for (i = 0; i < adev->num_ip_blocks; i++) { 4957 if (!adev->ip_blocks[i].status.valid) 4958 continue; 4959 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 4960 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 4961 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 4962 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 4963 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 4964 if (adev->ip_blocks[i].status.hang) { 4965 dev_info(adev->dev, "Some block need full reset!\n"); 4966 return true; 4967 } 4968 } 4969 } 4970 return false; 4971 } 4972 4973 /** 4974 * amdgpu_device_ip_soft_reset - do a soft reset 4975 * 4976 * @adev: amdgpu_device pointer 4977 * 4978 * The list of all the hardware IPs that make up the asic is walked and the 4979 * soft_reset callbacks are run if the block is hung. soft_reset handles any 4980 * IP specific hardware or software state changes that are necessary to soft 4981 * reset the IP. 4982 * Returns 0 on success, negative error code on failure. 4983 */ 4984 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 4985 { 4986 int i, r = 0; 4987 4988 for (i = 0; i < adev->num_ip_blocks; i++) { 4989 if (!adev->ip_blocks[i].status.valid) 4990 continue; 4991 if (adev->ip_blocks[i].status.hang && 4992 adev->ip_blocks[i].version->funcs->soft_reset) { 4993 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 4994 if (r) 4995 return r; 4996 } 4997 } 4998 4999 return 0; 5000 } 5001 5002 /** 5003 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 5004 * 5005 * @adev: amdgpu_device pointer 5006 * 5007 * The list of all the hardware IPs that make up the asic is walked and the 5008 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 5009 * handles any IP specific hardware or software state changes that are 5010 * necessary after the IP has been soft reset. 5011 * Returns 0 on success, negative error code on failure. 
5012 */ 5013 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 5014 { 5015 int i, r = 0; 5016 5017 for (i = 0; i < adev->num_ip_blocks; i++) { 5018 if (!adev->ip_blocks[i].status.valid) 5019 continue; 5020 if (adev->ip_blocks[i].status.hang && 5021 adev->ip_blocks[i].version->funcs->post_soft_reset) 5022 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 5023 if (r) 5024 return r; 5025 } 5026 5027 return 0; 5028 } 5029 5030 /** 5031 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 5032 * 5033 * @adev: amdgpu_device pointer 5034 * @reset_context: amdgpu reset context pointer 5035 * 5036 * do VF FLR and reinitialize Asic 5037 * return 0 means succeeded otherwise failed 5038 */ 5039 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 5040 struct amdgpu_reset_context *reset_context) 5041 { 5042 int r; 5043 struct amdgpu_hive_info *hive = NULL; 5044 5045 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { 5046 if (!amdgpu_ras_get_fed_status(adev)) 5047 amdgpu_virt_ready_to_reset(adev); 5048 amdgpu_virt_wait_reset(adev); 5049 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5050 r = amdgpu_virt_request_full_gpu(adev, true); 5051 } else { 5052 r = amdgpu_virt_reset_gpu(adev); 5053 } 5054 if (r) 5055 return r; 5056 5057 amdgpu_ras_set_fed(adev, false); 5058 amdgpu_irq_gpu_reset_resume_helper(adev); 5059 5060 /* some sw clean up VF needs to do before recover */ 5061 amdgpu_virt_post_reset(adev); 5062 5063 /* Resume IP prior to SMC */ 5064 r = amdgpu_device_ip_reinit_early_sriov(adev); 5065 if (r) 5066 return r; 5067 5068 amdgpu_virt_init_data_exchange(adev); 5069 5070 r = amdgpu_device_fw_loading(adev); 5071 if (r) 5072 return r; 5073 5074 /* now we are okay to resume SMC/CP/SDMA */ 5075 r = amdgpu_device_ip_reinit_late_sriov(adev); 5076 if (r) 5077 return r; 5078 5079 hive = amdgpu_get_xgmi_hive(adev); 5080 /* Update PSP FW topology after reset */ 5081 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5082 r = amdgpu_xgmi_update_topology(hive, adev); 5083 if (hive) 5084 amdgpu_put_xgmi_hive(hive); 5085 if (r) 5086 return r; 5087 5088 r = amdgpu_ib_ring_tests(adev); 5089 if (r) 5090 return r; 5091 5092 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) 5093 amdgpu_inc_vram_lost(adev); 5094 5095 /* need to be called during full access so we can't do it later like 5096 * bare-metal does. 
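 * (On bare metal, amdgpu_amdkfd_post_reset() is instead called later from
 * amdgpu_device_gpu_recover(), after the schedulers have been restarted.)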
5097 */ 5098 amdgpu_amdkfd_post_reset(adev); 5099 amdgpu_virt_release_full_gpu(adev, true); 5100 5101 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5102 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) || 5103 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 5104 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 5105 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5106 amdgpu_ras_resume(adev); 5107 return 0; 5108 } 5109 5110 /** 5111 * amdgpu_device_has_job_running - check if there is any job in mirror list 5112 * 5113 * @adev: amdgpu_device pointer 5114 * 5115 * check if there is any job in mirror list 5116 */ 5117 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5118 { 5119 int i; 5120 struct drm_sched_job *job; 5121 5122 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5123 struct amdgpu_ring *ring = adev->rings[i]; 5124 5125 if (!amdgpu_ring_sched_ready(ring)) 5126 continue; 5127 5128 spin_lock(&ring->sched.job_list_lock); 5129 job = list_first_entry_or_null(&ring->sched.pending_list, 5130 struct drm_sched_job, list); 5131 spin_unlock(&ring->sched.job_list_lock); 5132 if (job) 5133 return true; 5134 } 5135 return false; 5136 } 5137 5138 /** 5139 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5140 * 5141 * @adev: amdgpu_device pointer 5142 * 5143 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5144 * a hung GPU. 5145 */ 5146 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5147 { 5148 5149 if (amdgpu_gpu_recovery == 0) 5150 goto disabled; 5151 5152 /* Skip soft reset check in fatal error mode */ 5153 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5154 return true; 5155 5156 if (amdgpu_sriov_vf(adev)) 5157 return true; 5158 5159 if (amdgpu_gpu_recovery == -1) { 5160 switch (adev->asic_type) { 5161 #ifdef CONFIG_DRM_AMDGPU_SI 5162 case CHIP_VERDE: 5163 case CHIP_TAHITI: 5164 case CHIP_PITCAIRN: 5165 case CHIP_OLAND: 5166 case CHIP_HAINAN: 5167 #endif 5168 #ifdef CONFIG_DRM_AMDGPU_CIK 5169 case CHIP_KAVERI: 5170 case CHIP_KABINI: 5171 case CHIP_MULLINS: 5172 #endif 5173 case CHIP_CARRIZO: 5174 case CHIP_STONEY: 5175 case CHIP_CYAN_SKILLFISH: 5176 goto disabled; 5177 default: 5178 break; 5179 } 5180 } 5181 5182 return true; 5183 5184 disabled: 5185 dev_info(adev->dev, "GPU recovery disabled.\n"); 5186 return false; 5187 } 5188 5189 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5190 { 5191 u32 i; 5192 int ret = 0; 5193 5194 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5195 5196 dev_info(adev->dev, "GPU mode1 reset\n"); 5197 5198 /* Cache the state before bus master disable. The saved config space 5199 * values are used in other cases like restore after mode-2 reset. 
5200 */ 5201 amdgpu_device_cache_pci_state(adev->pdev); 5202 5203 /* disable BM */ 5204 pci_clear_master(adev->pdev); 5205 5206 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5207 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5208 ret = amdgpu_dpm_mode1_reset(adev); 5209 } else { 5210 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5211 ret = psp_gpu_reset(adev); 5212 } 5213 5214 if (ret) 5215 goto mode1_reset_failed; 5216 5217 amdgpu_device_load_pci_state(adev->pdev); 5218 ret = amdgpu_psp_wait_for_bootloader(adev); 5219 if (ret) 5220 goto mode1_reset_failed; 5221 5222 /* wait for asic to come out of reset */ 5223 for (i = 0; i < adev->usec_timeout; i++) { 5224 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5225 5226 if (memsize != 0xffffffff) 5227 break; 5228 udelay(1); 5229 } 5230 5231 if (i >= adev->usec_timeout) { 5232 ret = -ETIMEDOUT; 5233 goto mode1_reset_failed; 5234 } 5235 5236 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5237 5238 return 0; 5239 5240 mode1_reset_failed: 5241 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5242 return ret; 5243 } 5244 5245 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5246 struct amdgpu_reset_context *reset_context) 5247 { 5248 int i, r = 0; 5249 struct amdgpu_job *job = NULL; 5250 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; 5251 bool need_full_reset = 5252 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5253 5254 if (reset_context->reset_req_dev == adev) 5255 job = reset_context->job; 5256 5257 if (amdgpu_sriov_vf(adev)) 5258 amdgpu_virt_pre_reset(adev); 5259 5260 amdgpu_fence_driver_isr_toggle(adev, true); 5261 5262 /* block all schedulers and reset given job's ring */ 5263 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5264 struct amdgpu_ring *ring = adev->rings[i]; 5265 5266 if (!amdgpu_ring_sched_ready(ring)) 5267 continue; 5268 5269 /* Clear job fence from fence drv to avoid force_completion 5270 * leave NULL and vm flush fence in fence drv 5271 */ 5272 amdgpu_fence_driver_clear_job_fences(ring); 5273 5274 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5275 amdgpu_fence_driver_force_completion(ring); 5276 } 5277 5278 amdgpu_fence_driver_isr_toggle(adev, false); 5279 5280 if (job && job->vm) 5281 drm_sched_increase_karma(&job->base); 5282 5283 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5284 /* If reset handler not implemented, continue; otherwise return */ 5285 if (r == -EOPNOTSUPP) 5286 r = 0; 5287 else 5288 return r; 5289 5290 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5291 if (!amdgpu_sriov_vf(adev)) { 5292 5293 if (!need_full_reset) 5294 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5295 5296 if (!need_full_reset && amdgpu_gpu_recovery && 5297 amdgpu_device_ip_check_soft_reset(adev)) { 5298 amdgpu_device_ip_pre_soft_reset(adev); 5299 r = amdgpu_device_ip_soft_reset(adev); 5300 amdgpu_device_ip_post_soft_reset(adev); 5301 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5302 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5303 need_full_reset = true; 5304 } 5305 } 5306 5307 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { 5308 dev_info(tmp_adev->dev, "Dumping IP State\n"); 5309 /* Trigger ip dump before we reset the asic */ 5310 for (i = 0; i < tmp_adev->num_ip_blocks; i++) 5311 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) 5312 tmp_adev->ip_blocks[i].version->funcs 5313 ->dump_ip_state((void *)tmp_adev); 5314 dev_info(tmp_adev->dev, "Dumping IP 
State Completed\n"); 5315 } 5316 5317 if (need_full_reset) 5318 r = amdgpu_device_ip_suspend(adev); 5319 if (need_full_reset) 5320 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5321 else 5322 clear_bit(AMDGPU_NEED_FULL_RESET, 5323 &reset_context->flags); 5324 } 5325 5326 return r; 5327 } 5328 5329 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5330 struct amdgpu_reset_context *reset_context) 5331 { 5332 struct amdgpu_device *tmp_adev = NULL; 5333 bool need_full_reset, skip_hw_reset, vram_lost = false; 5334 int r = 0; 5335 5336 /* Try reset handler method first */ 5337 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5338 reset_list); 5339 5340 reset_context->reset_device_list = device_list_handle; 5341 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5342 /* If reset handler not implemented, continue; otherwise return */ 5343 if (r == -EOPNOTSUPP) 5344 r = 0; 5345 else 5346 return r; 5347 5348 /* Reset handler not implemented, use the default method */ 5349 need_full_reset = 5350 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5351 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5352 5353 /* 5354 * ASIC reset has to be done on all XGMI hive nodes ASAP 5355 * to allow proper links negotiation in FW (within 1 sec) 5356 */ 5357 if (!skip_hw_reset && need_full_reset) { 5358 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5359 /* For XGMI run all resets in parallel to speed up the process */ 5360 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5361 tmp_adev->gmc.xgmi.pending_reset = false; 5362 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) 5363 r = -EALREADY; 5364 } else 5365 r = amdgpu_asic_reset(tmp_adev); 5366 5367 if (r) { 5368 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", 5369 r, adev_to_drm(tmp_adev)->unique); 5370 goto out; 5371 } 5372 } 5373 5374 /* For XGMI wait for all resets to complete before proceed */ 5375 if (!r) { 5376 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5377 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5378 flush_work(&tmp_adev->xgmi_reset_work); 5379 r = tmp_adev->asic_reset_res; 5380 if (r) 5381 break; 5382 } 5383 } 5384 } 5385 } 5386 5387 if (!r && amdgpu_ras_intr_triggered()) { 5388 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5389 amdgpu_ras_reset_error_count(tmp_adev, AMDGPU_RAS_BLOCK__MMHUB); 5390 } 5391 5392 amdgpu_ras_intr_cleared(); 5393 } 5394 5395 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5396 if (need_full_reset) { 5397 /* post card */ 5398 amdgpu_ras_set_fed(tmp_adev, false); 5399 r = amdgpu_device_asic_init(tmp_adev); 5400 if (r) { 5401 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5402 } else { 5403 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5404 5405 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5406 if (r) 5407 goto out; 5408 5409 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5410 5411 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 5412 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job); 5413 5414 if (vram_lost) { 5415 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5416 amdgpu_inc_vram_lost(tmp_adev); 5417 } 5418 5419 r = amdgpu_device_fw_loading(tmp_adev); 5420 if (r) 5421 return r; 5422 5423 r = amdgpu_xcp_restore_partition_mode( 5424 tmp_adev->xcp_mgr); 5425 if (r) 5426 goto out; 5427 5428 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5429 if (r) 5430 goto out; 5431 5432 if 
(tmp_adev->mman.buffer_funcs_ring->sched.ready) 5433 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5434 5435 if (vram_lost) 5436 amdgpu_device_fill_reset_magic(tmp_adev); 5437 5438 /* 5439 * Add this ASIC as tracked as reset was already 5440 * complete successfully. 5441 */ 5442 amdgpu_register_gpu_instance(tmp_adev); 5443 5444 if (!reset_context->hive && 5445 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5446 amdgpu_xgmi_add_device(tmp_adev); 5447 5448 r = amdgpu_device_ip_late_init(tmp_adev); 5449 if (r) 5450 goto out; 5451 5452 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 5453 5454 /* 5455 * The GPU enters bad state once faulty pages 5456 * by ECC has reached the threshold, and ras 5457 * recovery is scheduled next. So add one check 5458 * here to break recovery if it indeed exceeds 5459 * bad page threshold, and remind user to 5460 * retire this GPU or setting one bigger 5461 * bad_page_threshold value to fix this once 5462 * probing driver again. 5463 */ 5464 if (!amdgpu_ras_is_rma(tmp_adev)) { 5465 /* must succeed. */ 5466 amdgpu_ras_resume(tmp_adev); 5467 } else { 5468 r = -EINVAL; 5469 goto out; 5470 } 5471 5472 /* Update PSP FW topology after reset */ 5473 if (reset_context->hive && 5474 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5475 r = amdgpu_xgmi_update_topology( 5476 reset_context->hive, tmp_adev); 5477 } 5478 } 5479 5480 out: 5481 if (!r) { 5482 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5483 r = amdgpu_ib_ring_tests(tmp_adev); 5484 if (r) { 5485 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5486 need_full_reset = true; 5487 r = -EAGAIN; 5488 goto end; 5489 } 5490 } 5491 5492 if (r) 5493 tmp_adev->asic_reset_res = r; 5494 } 5495 5496 end: 5497 if (need_full_reset) 5498 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5499 else 5500 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5501 return r; 5502 } 5503 5504 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5505 { 5506 5507 switch (amdgpu_asic_reset_method(adev)) { 5508 case AMD_RESET_METHOD_MODE1: 5509 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5510 break; 5511 case AMD_RESET_METHOD_MODE2: 5512 adev->mp1_state = PP_MP1_STATE_RESET; 5513 break; 5514 default: 5515 adev->mp1_state = PP_MP1_STATE_NONE; 5516 break; 5517 } 5518 } 5519 5520 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5521 { 5522 amdgpu_vf_error_trans_all(adev); 5523 adev->mp1_state = PP_MP1_STATE_NONE; 5524 } 5525 5526 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5527 { 5528 struct pci_dev *p = NULL; 5529 5530 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5531 adev->pdev->bus->number, 1); 5532 if (p) { 5533 pm_runtime_enable(&(p->dev)); 5534 pm_runtime_resume(&(p->dev)); 5535 } 5536 5537 pci_dev_put(p); 5538 } 5539 5540 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5541 { 5542 enum amd_reset_method reset_method; 5543 struct pci_dev *p = NULL; 5544 u64 expires; 5545 5546 /* 5547 * For now, only BACO and mode1 reset are confirmed 5548 * to suffer the audio issue without proper suspended. 
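 * In other words, the HDA controller that shares the GPU's power domain
 * should be runtime-suspended before one of those resets starts; that is
 * what the code below tries to enforce.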
5549 */ 5550 reset_method = amdgpu_asic_reset_method(adev); 5551 if ((reset_method != AMD_RESET_METHOD_BACO) && 5552 (reset_method != AMD_RESET_METHOD_MODE1)) 5553 return -EINVAL; 5554 5555 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5556 adev->pdev->bus->number, 1); 5557 if (!p) 5558 return -ENODEV; 5559 5560 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5561 if (!expires) 5562 /* 5563 * If we cannot get the audio device autosuspend delay, 5564 * a fixed 4S interval will be used. Considering 3S is 5565 * the audio controller default autosuspend delay setting. 5566 * 4S used here is guaranteed to cover that. 5567 */ 5568 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5569 5570 while (!pm_runtime_status_suspended(&(p->dev))) { 5571 if (!pm_runtime_suspend(&(p->dev))) 5572 break; 5573 5574 if (expires < ktime_get_mono_fast_ns()) { 5575 dev_warn(adev->dev, "failed to suspend display audio\n"); 5576 pci_dev_put(p); 5577 /* TODO: abort the succeeding gpu reset? */ 5578 return -ETIMEDOUT; 5579 } 5580 } 5581 5582 pm_runtime_disable(&(p->dev)); 5583 5584 pci_dev_put(p); 5585 return 0; 5586 } 5587 5588 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5589 { 5590 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5591 5592 #if defined(CONFIG_DEBUG_FS) 5593 if (!amdgpu_sriov_vf(adev)) 5594 cancel_work(&adev->reset_work); 5595 #endif 5596 5597 if (adev->kfd.dev) 5598 cancel_work(&adev->kfd.reset_work); 5599 5600 if (amdgpu_sriov_vf(adev)) 5601 cancel_work(&adev->virt.flr_work); 5602 5603 if (con && adev->ras_enabled) 5604 cancel_work(&con->recovery_work); 5605 5606 } 5607 5608 static int amdgpu_device_health_check(struct list_head *device_list_handle) 5609 { 5610 struct amdgpu_device *tmp_adev; 5611 int ret = 0; 5612 u32 status; 5613 5614 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5615 pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status); 5616 if (PCI_POSSIBLE_ERROR(status)) { 5617 dev_err(tmp_adev->dev, "device lost from bus!"); 5618 ret = -ENODEV; 5619 } 5620 } 5621 5622 return ret; 5623 } 5624 5625 /** 5626 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5627 * 5628 * @adev: amdgpu_device pointer 5629 * @job: which job trigger hang 5630 * @reset_context: amdgpu reset context pointer 5631 * 5632 * Attempt to reset the GPU if it has hung (all asics). 5633 * Attempt to do soft-reset or full-reset and reinitialize Asic 5634 * Returns 0 for success or an error on failure. 5635 */ 5636 5637 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5638 struct amdgpu_job *job, 5639 struct amdgpu_reset_context *reset_context) 5640 { 5641 struct list_head device_list, *device_list_handle = NULL; 5642 bool job_signaled = false; 5643 struct amdgpu_hive_info *hive = NULL; 5644 struct amdgpu_device *tmp_adev = NULL; 5645 int i, r = 0; 5646 bool need_emergency_restart = false; 5647 bool audio_suspended = false; 5648 int retry_limit = AMDGPU_MAX_RETRY_LIMIT; 5649 5650 /* 5651 * Special case: RAS triggered and full reset isn't supported 5652 */ 5653 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5654 5655 /* 5656 * Flush RAM to disk so that after reboot 5657 * the user can read log and see why the system rebooted. 
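 * (ksys_sync_helper() below performs that flush before emergency_restart()
 * is called.)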
5658 */ 5659 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 5660 amdgpu_ras_get_context(adev)->reboot) { 5661 DRM_WARN("Emergency reboot."); 5662 5663 ksys_sync_helper(); 5664 emergency_restart(); 5665 } 5666 5667 dev_info(adev->dev, "GPU %s begin!\n", 5668 need_emergency_restart ? "jobs stop":"reset"); 5669 5670 if (!amdgpu_sriov_vf(adev)) 5671 hive = amdgpu_get_xgmi_hive(adev); 5672 if (hive) 5673 mutex_lock(&hive->hive_lock); 5674 5675 reset_context->job = job; 5676 reset_context->hive = hive; 5677 /* 5678 * Build list of devices to reset. 5679 * In case we are in XGMI hive mode, resort the device list 5680 * to put adev in the 1st position. 5681 */ 5682 INIT_LIST_HEAD(&device_list); 5683 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) { 5684 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5685 list_add_tail(&tmp_adev->reset_list, &device_list); 5686 if (adev->shutdown) 5687 tmp_adev->shutdown = true; 5688 } 5689 if (!list_is_first(&adev->reset_list, &device_list)) 5690 list_rotate_to_front(&adev->reset_list, &device_list); 5691 device_list_handle = &device_list; 5692 } else { 5693 list_add_tail(&adev->reset_list, &device_list); 5694 device_list_handle = &device_list; 5695 } 5696 5697 if (!amdgpu_sriov_vf(adev)) { 5698 r = amdgpu_device_health_check(device_list_handle); 5699 if (r) 5700 goto end_reset; 5701 } 5702 5703 /* We need to lock reset domain only once both for XGMI and single device */ 5704 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5705 reset_list); 5706 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5707 5708 /* block all schedulers and reset given job's ring */ 5709 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5710 5711 amdgpu_device_set_mp1_state(tmp_adev); 5712 5713 /* 5714 * Try to put the audio codec into suspend state 5715 * before gpu reset started. 5716 * 5717 * Due to the power domain of the graphics device 5718 * is shared with AZ power domain. Without this, 5719 * we may change the audio hardware from behind 5720 * the audio driver's back. That will trigger 5721 * some audio codec errors. 5722 */ 5723 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5724 audio_suspended = true; 5725 5726 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5727 5728 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5729 5730 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context); 5731 5732 /* 5733 * Mark these ASICs to be reseted as untracked first 5734 * And add them back after reset completed 5735 */ 5736 amdgpu_unregister_gpu_instance(tmp_adev); 5737 5738 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5739 5740 /* disable ras on ALL IPs */ 5741 if (!need_emergency_restart && 5742 amdgpu_device_ip_need_full_reset(tmp_adev)) 5743 amdgpu_ras_suspend(tmp_adev); 5744 5745 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5746 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5747 5748 if (!amdgpu_ring_sched_ready(ring)) 5749 continue; 5750 5751 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5752 5753 if (need_emergency_restart) 5754 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5755 } 5756 atomic_inc(&tmp_adev->gpu_reset_counter); 5757 } 5758 5759 if (need_emergency_restart) 5760 goto skip_sched_resume; 5761 5762 /* 5763 * Must check guilty signal here since after this point all old 5764 * HW fences are force signaled. 
5765 * 5766 * job->base holds a reference to parent fence 5767 */ 5768 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5769 job_signaled = true; 5770 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5771 goto skip_hw_reset; 5772 } 5773 5774 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5775 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5776 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5777 /*TODO Should we stop ?*/ 5778 if (r) { 5779 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5780 r, adev_to_drm(tmp_adev)->unique); 5781 tmp_adev->asic_reset_res = r; 5782 } 5783 } 5784 5785 /* Actual ASIC resets if needed.*/ 5786 /* Host driver will handle XGMI hive reset for SRIOV */ 5787 if (amdgpu_sriov_vf(adev)) { 5788 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { 5789 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); 5790 amdgpu_ras_set_fed(adev, true); 5791 set_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5792 } 5793 5794 r = amdgpu_device_reset_sriov(adev, reset_context); 5795 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) { 5796 amdgpu_virt_release_full_gpu(adev, true); 5797 goto retry; 5798 } 5799 if (r) 5800 adev->asic_reset_res = r; 5801 } else { 5802 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5803 if (r && r == -EAGAIN) 5804 goto retry; 5805 } 5806 5807 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5808 /* 5809 * Drop any pending non scheduler resets queued before reset is done. 5810 * Any reset scheduled after this point would be valid. Scheduler resets 5811 * were already dropped during drm_sched_stop and no new ones can come 5812 * in before drm_sched_start. 5813 */ 5814 amdgpu_device_stop_pending_resets(tmp_adev); 5815 } 5816 5817 skip_hw_reset: 5818 5819 /* Post ASIC reset for all devs .*/ 5820 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5821 5822 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5823 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5824 5825 if (!amdgpu_ring_sched_ready(ring)) 5826 continue; 5827 5828 drm_sched_start(&ring->sched, 0); 5829 } 5830 5831 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 5832 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5833 5834 if (tmp_adev->asic_reset_res) 5835 r = tmp_adev->asic_reset_res; 5836 5837 tmp_adev->asic_reset_res = 0; 5838 5839 if (r) { 5840 /* bad news, how to tell it to userspace ? 
			 * for a RAS error, we should report the GPU's bad
			 * status instead of a reset failure
			 */
			if (reset_context->src != AMDGPU_RESET_SRC_RAS ||
			    !amdgpu_ras_eeprom_check_err_threshold(tmp_adev))
				dev_info(tmp_adev->dev, "GPU reset(%d) failed\n",
					atomic_read(&tmp_adev->gpu_reset_counter));
			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
		} else {
			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
			if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
				DRM_WARN("smart shift update failed\n");
		}
	}

skip_sched_resume:
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		/* unlock kfd: SRIOV would do it separately */
		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_post_reset(tmp_adev);

		/* kfd_post_reset will do nothing if the kfd device is not
		 * initialized; bring up kfd here if it was not initialized
		 * before.
		 */
		if (!adev->kfd.init_complete)
			amdgpu_amdkfd_device_init(adev);

		if (audio_suspended)
			amdgpu_device_resume_display_audio(tmp_adev);

		amdgpu_device_unset_mp1_state(tmp_adev);

		amdgpu_ras_set_error_query_ready(tmp_adev, true);
	}

	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
				    reset_list);
	amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);

end_reset:
	if (hive) {
		mutex_unlock(&hive->hive_lock);
		amdgpu_put_xgmi_hive(hive);
	}

	if (r)
		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);

	atomic_set(&adev->reset_domain->reset_res, r);
	return r;
}

/**
 * amdgpu_device_partner_bandwidth - find the bandwidth of the appropriate partner
 *
 * @adev: amdgpu_device pointer
 * @speed: pointer to the speed of the link
 * @width: pointer to the width of the link
 *
 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
 * first physical partner to an AMD dGPU.
 * This will exclude any virtual switches and links.
 */
static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
					    enum pci_bus_speed *speed,
					    enum pcie_link_width *width)
{
	struct pci_dev *parent = adev->pdev;

	if (!speed || !width)
		return;

	*speed = PCI_SPEED_UNKNOWN;
	*width = PCIE_LNK_WIDTH_UNKNOWN;

	if (amdgpu_device_pcie_dynamic_switching_supported(adev)) {
		while ((parent = pci_upstream_bridge(parent))) {
			/* skip upstream/downstream switches internal to dGPU */
			if (parent->vendor == PCI_VENDOR_ID_ATI)
				continue;
			*speed = pcie_get_speed_cap(parent);
			*width = pcie_get_width_cap(parent);
			break;
		}
	} else {
		/* use the current speeds rather than max if switching is not supported */
		pcie_bandwidth_available(adev->pdev, NULL, speed, width);
	}
}

/**
 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIe capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIe config space may not be available.
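 *
 * The masks stored here (adev->pm.pcie_gen_mask and adev->pm.pcie_mlw_mask)
 * are later consulted by the power-management code; a hypothetical check
 * would look like:
 *
 *   if (adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3)
 *           ... expose PCIe gen3 link-speed levels ...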
5939 */ 5940 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 5941 { 5942 struct pci_dev *pdev; 5943 enum pci_bus_speed speed_cap, platform_speed_cap; 5944 enum pcie_link_width platform_link_width; 5945 5946 if (amdgpu_pcie_gen_cap) 5947 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 5948 5949 if (amdgpu_pcie_lane_cap) 5950 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 5951 5952 /* covers APUs as well */ 5953 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 5954 if (adev->pm.pcie_gen_mask == 0) 5955 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 5956 if (adev->pm.pcie_mlw_mask == 0) 5957 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 5958 return; 5959 } 5960 5961 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 5962 return; 5963 5964 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap, 5965 &platform_link_width); 5966 5967 if (adev->pm.pcie_gen_mask == 0) { 5968 /* asic caps */ 5969 pdev = adev->pdev; 5970 speed_cap = pcie_get_speed_cap(pdev); 5971 if (speed_cap == PCI_SPEED_UNKNOWN) { 5972 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5973 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5974 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5975 } else { 5976 if (speed_cap == PCIE_SPEED_32_0GT) 5977 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5978 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5979 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5980 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 5981 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 5982 else if (speed_cap == PCIE_SPEED_16_0GT) 5983 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5984 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5985 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 5986 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 5987 else if (speed_cap == PCIE_SPEED_8_0GT) 5988 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5989 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 5990 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 5991 else if (speed_cap == PCIE_SPEED_5_0GT) 5992 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 5993 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 5994 else 5995 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 5996 } 5997 /* platform caps */ 5998 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 5999 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6000 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6001 } else { 6002 if (platform_speed_cap == PCIE_SPEED_32_0GT) 6003 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6004 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6005 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6006 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6007 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 6008 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 6009 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6010 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6011 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6012 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 6013 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 6014 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6015 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6016 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 6017 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 6018 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6019 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6020 else 6021 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 6022 6023 } 6024 } 6025 if (adev->pm.pcie_mlw_mask == 0) { 6026 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 
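			/*
			 * No usable link-width information from the platform;
			 * fall back to the default width mask so consumers of
			 * pcie_mlw_mask still see a sane capability set.
			 */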
6027 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 6028 } else { 6029 switch (platform_link_width) { 6030 case PCIE_LNK_X32: 6031 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 6032 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6033 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6034 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6035 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6036 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6037 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6038 break; 6039 case PCIE_LNK_X16: 6040 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6041 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6042 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6043 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6044 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6045 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6046 break; 6047 case PCIE_LNK_X12: 6048 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6049 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6050 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6051 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6052 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6053 break; 6054 case PCIE_LNK_X8: 6055 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6056 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6057 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6058 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6059 break; 6060 case PCIE_LNK_X4: 6061 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6062 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6063 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6064 break; 6065 case PCIE_LNK_X2: 6066 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6067 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6068 break; 6069 case PCIE_LNK_X1: 6070 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 6071 break; 6072 default: 6073 break; 6074 } 6075 } 6076 } 6077 } 6078 6079 /** 6080 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 6081 * 6082 * @adev: amdgpu_device pointer 6083 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 6084 * 6085 * Return true if @peer_adev can access (DMA) @adev through the PCIe 6086 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 6087 * @peer_adev. 6088 */ 6089 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 6090 struct amdgpu_device *peer_adev) 6091 { 6092 #ifdef CONFIG_HSA_AMD_P2P 6093 bool p2p_access = 6094 !adev->gmc.xgmi.connected_to_cpu && 6095 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 6096 6097 bool is_large_bar = adev->gmc.visible_vram_size && 6098 adev->gmc.real_vram_size == adev->gmc.visible_vram_size; 6099 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev); 6100 6101 if (!p2p_addressable) { 6102 uint64_t address_mask = peer_adev->dev->dma_mask ? 
6103 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 6104 resource_size_t aper_limit = 6105 adev->gmc.aper_base + adev->gmc.aper_size - 1; 6106 6107 p2p_addressable = !(adev->gmc.aper_base & address_mask || 6108 aper_limit & address_mask); 6109 } 6110 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable; 6111 #else 6112 return false; 6113 #endif 6114 } 6115 6116 int amdgpu_device_baco_enter(struct drm_device *dev) 6117 { 6118 struct amdgpu_device *adev = drm_to_adev(dev); 6119 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6120 6121 if (!amdgpu_device_supports_baco(dev)) 6122 return -ENOTSUPP; 6123 6124 if (ras && adev->ras_enabled && 6125 adev->nbio.funcs->enable_doorbell_interrupt) 6126 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 6127 6128 return amdgpu_dpm_baco_enter(adev); 6129 } 6130 6131 int amdgpu_device_baco_exit(struct drm_device *dev) 6132 { 6133 struct amdgpu_device *adev = drm_to_adev(dev); 6134 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6135 int ret = 0; 6136 6137 if (!amdgpu_device_supports_baco(dev)) 6138 return -ENOTSUPP; 6139 6140 ret = amdgpu_dpm_baco_exit(adev); 6141 if (ret) 6142 return ret; 6143 6144 if (ras && adev->ras_enabled && 6145 adev->nbio.funcs->enable_doorbell_interrupt) 6146 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 6147 6148 if (amdgpu_passthrough(adev) && adev->nbio.funcs && 6149 adev->nbio.funcs->clear_doorbell_interrupt) 6150 adev->nbio.funcs->clear_doorbell_interrupt(adev); 6151 6152 return 0; 6153 } 6154 6155 /** 6156 * amdgpu_pci_error_detected - Called when a PCI error is detected. 6157 * @pdev: PCI device struct 6158 * @state: PCI channel state 6159 * 6160 * Description: Called when a PCI error is detected. 6161 * 6162 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
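 * PCI_ERS_RESULT_CAN_RECOVER is returned for pci_channel_io_normal.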
6163 */ 6164 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 6165 { 6166 struct drm_device *dev = pci_get_drvdata(pdev); 6167 struct amdgpu_device *adev = drm_to_adev(dev); 6168 int i; 6169 6170 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 6171 6172 if (adev->gmc.xgmi.num_physical_nodes > 1) { 6173 DRM_WARN("No support for XGMI hive yet..."); 6174 return PCI_ERS_RESULT_DISCONNECT; 6175 } 6176 6177 adev->pci_channel_state = state; 6178 6179 switch (state) { 6180 case pci_channel_io_normal: 6181 return PCI_ERS_RESULT_CAN_RECOVER; 6182 /* Fatal error, prepare for slot reset */ 6183 case pci_channel_io_frozen: 6184 /* 6185 * Locking adev->reset_domain->sem will prevent any external access 6186 * to GPU during PCI error recovery 6187 */ 6188 amdgpu_device_lock_reset_domain(adev->reset_domain); 6189 amdgpu_device_set_mp1_state(adev); 6190 6191 /* 6192 * Block any work scheduling as we do for regular GPU reset 6193 * for the duration of the recovery 6194 */ 6195 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6196 struct amdgpu_ring *ring = adev->rings[i]; 6197 6198 if (!amdgpu_ring_sched_ready(ring)) 6199 continue; 6200 6201 drm_sched_stop(&ring->sched, NULL); 6202 } 6203 atomic_inc(&adev->gpu_reset_counter); 6204 return PCI_ERS_RESULT_NEED_RESET; 6205 case pci_channel_io_perm_failure: 6206 /* Permanent error, prepare for device removal */ 6207 return PCI_ERS_RESULT_DISCONNECT; 6208 } 6209 6210 return PCI_ERS_RESULT_NEED_RESET; 6211 } 6212 6213 /** 6214 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 6215 * @pdev: pointer to PCI device 6216 */ 6217 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 6218 { 6219 6220 DRM_INFO("PCI error: mmio enabled callback!!\n"); 6221 6222 /* TODO - dump whatever for debugging purposes */ 6223 6224 /* This called only if amdgpu_pci_error_detected returns 6225 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 6226 * works, no need to reset slot. 6227 */ 6228 6229 return PCI_ERS_RESULT_RECOVERED; 6230 } 6231 6232 /** 6233 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 6234 * @pdev: PCI device struct 6235 * 6236 * Description: This routine is called by the pci error recovery 6237 * code after the PCI slot has been reset, just before we 6238 * should resume normal operations. 
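 *
 * Return: PCI_ERS_RESULT_RECOVERED if the ASIC came back up and was
 * reinitialized, PCI_ERS_RESULT_DISCONNECT otherwise.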
 */
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r, i;
	struct amdgpu_reset_context reset_context;
	u32 memsize;
	struct list_head device_list;

	/* PCI error slot reset should be skipped during RAS recovery */
	if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
	    amdgpu_ras_in_recovery(adev))
		return PCI_ERS_RESULT_RECOVERED;

	DRM_INFO("PCI error: slot reset callback!!\n");

	memset(&reset_context, 0, sizeof(reset_context));

	INIT_LIST_HEAD(&device_list);
	list_add_tail(&adev->reset_list, &device_list);

	/* wait for the ASIC to come out of reset */
	msleep(500);

	/* Restore PCI config space */
	amdgpu_device_load_pci_state(pdev);

	/* confirm ASIC came out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		memsize = amdgpu_asic_get_config_memsize(adev);

		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}
	if (memsize == 0xffffffff) {
		r = -ETIME;
		goto out;
	}

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);

	adev->no_hw_access = true;
	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
	adev->no_hw_access = false;
	if (r)
		goto out;

	r = amdgpu_do_asic_reset(&device_list, &reset_context);

out:
	if (!r) {
		if (amdgpu_device_cache_pci_state(adev->pdev))
			pci_restore_state(adev->pdev);

		DRM_INFO("PCIe error recovery succeeded\n");
	} else {
		DRM_ERROR("PCIe error recovery failed, err:%d", r);
		amdgpu_device_unset_mp1_state(adev);
		amdgpu_device_unlock_reset_domain(adev->reset_domain);
	}

	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it's
 * OK to resume normal operation.
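 *
 * Only the pci_channel_io_frozen case needs work here; that is the only
 * path on which amdgpu_pci_error_detected() stopped the schedulers and
 * locked the reset domain.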
6315 */ 6316 void amdgpu_pci_resume(struct pci_dev *pdev) 6317 { 6318 struct drm_device *dev = pci_get_drvdata(pdev); 6319 struct amdgpu_device *adev = drm_to_adev(dev); 6320 int i; 6321 6322 6323 DRM_INFO("PCI error: resume callback!!\n"); 6324 6325 /* Only continue execution for the case of pci_channel_io_frozen */ 6326 if (adev->pci_channel_state != pci_channel_io_frozen) 6327 return; 6328 6329 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6330 struct amdgpu_ring *ring = adev->rings[i]; 6331 6332 if (!amdgpu_ring_sched_ready(ring)) 6333 continue; 6334 6335 drm_sched_start(&ring->sched, 0); 6336 } 6337 6338 amdgpu_device_unset_mp1_state(adev); 6339 amdgpu_device_unlock_reset_domain(adev->reset_domain); 6340 } 6341 6342 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 6343 { 6344 struct drm_device *dev = pci_get_drvdata(pdev); 6345 struct amdgpu_device *adev = drm_to_adev(dev); 6346 int r; 6347 6348 r = pci_save_state(pdev); 6349 if (!r) { 6350 kfree(adev->pci_state); 6351 6352 adev->pci_state = pci_store_saved_state(pdev); 6353 6354 if (!adev->pci_state) { 6355 DRM_ERROR("Failed to store PCI saved state"); 6356 return false; 6357 } 6358 } else { 6359 DRM_WARN("Failed to save PCI state, err:%d\n", r); 6360 return false; 6361 } 6362 6363 return true; 6364 } 6365 6366 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 6367 { 6368 struct drm_device *dev = pci_get_drvdata(pdev); 6369 struct amdgpu_device *adev = drm_to_adev(dev); 6370 int r; 6371 6372 if (!adev->pci_state) 6373 return false; 6374 6375 r = pci_load_saved_state(pdev, adev->pci_state); 6376 6377 if (!r) { 6378 pci_restore_state(pdev); 6379 } else { 6380 DRM_WARN("Failed to load PCI state, err:%d\n", r); 6381 return false; 6382 } 6383 6384 return true; 6385 } 6386 6387 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 6388 struct amdgpu_ring *ring) 6389 { 6390 #ifdef CONFIG_X86_64 6391 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 6392 return; 6393 #endif 6394 if (adev->gmc.xgmi.connected_to_cpu) 6395 return; 6396 6397 if (ring && ring->funcs->emit_hdp_flush) 6398 amdgpu_ring_emit_hdp_flush(ring); 6399 else 6400 amdgpu_asic_flush_hdp(adev, ring); 6401 } 6402 6403 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 6404 struct amdgpu_ring *ring) 6405 { 6406 #ifdef CONFIG_X86_64 6407 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 6408 return; 6409 #endif 6410 if (adev->gmc.xgmi.connected_to_cpu) 6411 return; 6412 6413 amdgpu_asic_invalidate_hdp(adev, ring); 6414 } 6415 6416 int amdgpu_in_reset(struct amdgpu_device *adev) 6417 { 6418 return atomic_read(&adev->reset_domain->in_gpu_reset); 6419 } 6420 6421 /** 6422 * amdgpu_device_halt() - bring hardware to some kind of halt state 6423 * 6424 * @adev: amdgpu_device pointer 6425 * 6426 * Bring hardware to some kind of halt state so that no one can touch it 6427 * any more. It will help to maintain error context when error occurred. 6428 * Compare to a simple hang, the system will keep stable at least for SSH 6429 * access. Then it should be trivial to inspect the hardware state and 6430 * see what's going on. Implemented as following: 6431 * 6432 * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc), 6433 * clears all CPU mappings to device, disallows remappings through page faults 6434 * 2. amdgpu_irq_disable_all() disables all interrupts 6435 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences 6436 * 4. set adev->no_hw_access to avoid potential crashes after setp 5 6437 * 5. 
amdgpu_device_unmap_mmio() clears all MMIO mappings 6438 * 6. pci_disable_device() and pci_wait_for_pending_transaction() 6439 * flush any in flight DMA operations 6440 */ 6441 void amdgpu_device_halt(struct amdgpu_device *adev) 6442 { 6443 struct pci_dev *pdev = adev->pdev; 6444 struct drm_device *ddev = adev_to_drm(adev); 6445 6446 amdgpu_xcp_dev_unplug(adev); 6447 drm_dev_unplug(ddev); 6448 6449 amdgpu_irq_disable_all(adev); 6450 6451 amdgpu_fence_driver_hw_fini(adev); 6452 6453 adev->no_hw_access = true; 6454 6455 amdgpu_device_unmap_mmio(adev); 6456 6457 pci_disable_device(pdev); 6458 pci_wait_for_pending_transaction(pdev); 6459 } 6460 6461 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev, 6462 u32 reg) 6463 { 6464 unsigned long flags, address, data; 6465 u32 r; 6466 6467 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 6468 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 6469 6470 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 6471 WREG32(address, reg * 4); 6472 (void)RREG32(address); 6473 r = RREG32(data); 6474 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 6475 return r; 6476 } 6477 6478 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev, 6479 u32 reg, u32 v) 6480 { 6481 unsigned long flags, address, data; 6482 6483 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 6484 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 6485 6486 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 6487 WREG32(address, reg * 4); 6488 (void)RREG32(address); 6489 WREG32(data, v); 6490 (void)RREG32(data); 6491 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 6492 } 6493 6494 /** 6495 * amdgpu_device_get_gang - return a reference to the current gang 6496 * @adev: amdgpu_device pointer 6497 * 6498 * Returns: A new reference to the current gang leader. 6499 */ 6500 struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev) 6501 { 6502 struct dma_fence *fence; 6503 6504 rcu_read_lock(); 6505 fence = dma_fence_get_rcu_safe(&adev->gang_submit); 6506 rcu_read_unlock(); 6507 return fence; 6508 } 6509 6510 /** 6511 * amdgpu_device_switch_gang - switch to a new gang 6512 * @adev: amdgpu_device pointer 6513 * @gang: the gang to switch to 6514 * 6515 * Try to switch to a new gang. 6516 * Returns: NULL if we switched to the new gang or a reference to the current 6517 * gang leader. 
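 *
 * Illustrative caller pattern (a sketch, not copied from a real caller;
 * exact handling of the returned fence is up to the caller):
 *
 *   fence = amdgpu_device_switch_gang(adev, gang);
 *   if (fence) {
 *           ... wait on or schedule against the old gang leader ...
 *           dma_fence_put(fence);
 *   }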
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	do {
		dma_fence_put(old);
		old = amdgpu_device_get_gang(adev);
		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old))
			return old;

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	dma_fence_put(old);
	return NULL;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
				    uint32_t inst, uint32_t reg_addr, char reg_name[],
				    uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				 inst, reg_name, (uint32_t)expected_value,
				 (uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}
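
/*
 * Example use of amdgpu_device_wait_on_rreg() (illustrative only; the
 * register offset and mask names below are hypothetical):
 *
 *   if (amdgpu_device_wait_on_rreg(adev, 0, regFOO_STATUS, "FOO_STATUS",
 *                                  FOO_STATUS__READY_MASK,
 *                                  FOO_STATUS__READY_MASK))
 *           dev_err(adev->dev, "FOO block never reported ready\n");
 */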