/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/pci-p2pdma.h>
#include <linux/apple-gmux.h>

#include <drm/drm_aperture.h>
#include <drm/drm_atomic_helper.h>
#include <drm/drm_crtc_helper.h>
#include <drm/drm_fb_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/device.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"
#include "amdgpu_virt.h"
#include "amdgpu_dev_coredump.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

#if IS_ENABLED(CONFIG_X86)
#include <asm/intel-family.h>
#endif

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
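/*
 * Fallback dword offsets of the PCIE index/data register pair (and the high
 * index register) used for indirect register access while the NBIO callbacks
 * are not yet available, see amdgpu_device_indirect_rreg_ext().
 */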
#define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
#define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
#define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)

static const
struct drm_driver amdgpu_kms_driver;

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"ALDEBARAN",
	"NAVI10",
	"CYAN_SKILLFISH",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"VANGOGH",
	"DIMGREY_CAVEFISH",
	"BEIGE_GOBY",
	"YELLOW_CARP",
	"IP DISCOVERY",
	"LAST",
};

#define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMDGPU_MAX_IP_NUM - 1, 0)
/*
 * Default init level where all blocks are expected to be initialized. This is
 * the level of initialization expected by default and also after a full reset
 * of the device.
 */
struct amdgpu_init_level amdgpu_init_default = {
	.level = AMDGPU_INIT_LEVEL_DEFAULT,
	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

/*
 * Minimal blocks needed to be initialized before an XGMI hive can be reset.
 * This is used for cases like reset on initialization where the entire hive
 * needs to be reset before first use.
 */
struct amdgpu_init_level amdgpu_init_minimal_xgmi = {
	.level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI,
	.hwini_ip_block_mask =
		BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) |
		BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) |
		BIT(AMD_IP_BLOCK_TYPE_PSP)
};

static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev,
					     enum amd_ip_block_type block)
{
	return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0;
}

void amdgpu_set_init_level(struct amdgpu_device *adev,
			   enum amdgpu_init_lvl_id lvl)
{
	switch (lvl) {
	case AMDGPU_INIT_LEVEL_MINIMAL_XGMI:
		adev->init_lvl = &amdgpu_init_minimal_xgmi;
		break;
	case AMDGPU_INIT_LEVEL_DEFAULT:
		fallthrough;
	default:
		adev->init_lvl = &amdgpu_init_default;
		break;
	}
}

static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return sysfs_emit(buf, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, 0444,
		amdgpu_device_get_pcie_replay_count, NULL);
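/*
 * amdgpu_sysfs_reg_state_get() backs the "reg_state" binary sysfs attribute.
 * The file offset selects which register-state dump (XGMI, WAFL, PCIE or the
 * user-defined ranges) is returned by the ASIC specific callback.
 */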
static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
					  struct bin_attribute *attr, char *buf,
					  loff_t ppos, size_t count)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	ssize_t bytes_read;

	switch (ppos) {
	case AMDGPU_SYS_REG_STATE_XGMI:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_WAFL:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_PCIE:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR_1:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count);
		break;
	default:
		return -EINVAL;
	}

	return bytes_read;
}

BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL,
	 AMDGPU_SYS_REG_STATE_END);

int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev)
{
	int ret;

	if (!amdgpu_asic_get_reg_state_supported(adev))
		return 0;

	ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state);

	return ret;
}

void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
{
	if (!amdgpu_asic_get_reg_state_supported(adev))
		return;
	sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
}

/**
 * DOC: board_info
 *
 * The amdgpu driver provides a sysfs API for giving board related information.
 * It provides the form factor information in the format
 *
 *   type : form factor
 *
 * Possible form factor values
 *
 * - "cem"     - PCIE CEM card
 * - "oam"     - Open Compute Accelerator Module
 * - "unknown" - Not known
 *
 */

static ssize_t amdgpu_device_get_board_info(struct device *dev,
					    struct device_attribute *attr,
					    char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
	const char *pkg;

	if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
		pkg_type = adev->smuio.funcs->get_pkg_type(adev);

	switch (pkg_type) {
	case AMDGPU_PKG_TYPE_CEM:
		pkg = "cem";
		break;
	case AMDGPU_PKG_TYPE_OAM:
		pkg = "oam";
		break;
	default:
		pkg = "unknown";
		break;
	}

	return sysfs_emit(buf, "%s : %s\n", "type", pkg);
}

static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);

static struct attribute *amdgpu_board_attrs[] = {
	&dev_attr_board_info.attr,
	NULL,
};

static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
					     struct attribute *attr, int n)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	if (adev->flags & AMD_IS_APU)
		return 0;

	return attr->mode;
}

static const struct attribute_group amdgpu_board_attrs_group = {
	.attrs = amdgpu_board_attrs,
	.is_visible = amdgpu_board_attrs_is_visible
};

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);

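/*
 * Runtime power control helpers. The functions below detect which power-off
 * mechanism the dGPU supports: ATPX (PX), ACPI power resources (BOCO) or
 * BACO/BAMACO, and the runtime PM mode is picked accordingly.
 */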
/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Return:
 * 1 if the device supports BACO;
 * 3 if the device supports MACO (only works if BACO is supported)
 * otherwise return 0.
 */
int amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
{
	struct drm_device *dev;
	int bamaco_support;

	dev = adev_to_drm(adev);

	adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
	bamaco_support = amdgpu_device_supports_baco(dev);

	switch (amdgpu_runtime_pm) {
	case 2:
		if (bamaco_support & MACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
			dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
		} else if (bamaco_support == BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Requested mode BAMACO not available, falling back to BACO\n");
		}
		break;
	case 1:
		if (bamaco_support & BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Forcing BACO for runtime pm\n");
		}
		break;
	case -1:
	case -2:
		if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
			dev_info(adev->dev, "Using ATPX for runtime pm\n");
		} else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
			dev_info(adev->dev, "Using BOCO for runtime pm\n");
		} else {
			if (!bamaco_support)
				goto no_runtime_pm;

			switch (adev->asic_type) {
			case CHIP_VEGA20:
			case CHIP_ARCTURUS:
				/* BACO is not supported on vega20 and arcturus */
				break;
			case CHIP_VEGA10:
				/* enable BACO as runpm mode if noretry=0 */
				if (!adev->gmc.noretry)
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			default:
				/* enable BACO as runpm mode on CI+ */
				adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			}

			if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
				if (bamaco_support & MACO_SUPPORT) {
					adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
					dev_info(adev->dev, "Using BAMACO for runtime pm\n");
				} else {
					dev_info(adev->dev, "Using BACO for runtime pm\n");
				}
			}
		}
		break;
	case 0:
		dev_info(adev->dev, "runtime pm is manually disabled\n");
		break;
	default:
		break;
	}

no_runtime_pm:
	if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
		dev_info(adev->dev, "Runtime PM not available\n");
}
/**
 * amdgpu_device_supports_smart_shift - Is the device dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must > @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must > @size
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value means how many bytes have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure the HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe to the device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure the HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must > @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try to use the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM index/data to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}
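/*
 * Illustrative use of the helper above (not taken from this file): reading a
 * dword at VRAM offset 0x1000 into a CPU buffer would look roughly like
 *
 *	u32 val;
 *
 *	amdgpu_device_vram_access(adev, 0x1000, &val, sizeof(val), false);
 *
 * The aperture path is tried first and the MM_INDEX/MM_DATA path covers
 * whatever falls outside CPU-visible VRAM.
 */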
/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: bytes offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}


/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
				uint32_t reg, uint32_t acc_flags,
				uint32_t xcc_id)
{
	uint32_t ret, rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	return ret;
}

/*
 * MMIO register write with bytes helper functions
 * @offset: bytes offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/**
 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 * @xcc_id: xcc accelerated compute core id
 *
 * This function is invoked only for debugfs register access.
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v,
			     uint32_t xcc_id)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_xcc_wreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t v,
			    uint32_t acc_flags, uint32_t xcc_id)
{
	uint32_t rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, true,
							 &rlcg_flag)) {
			amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u32 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}
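/**
 * amdgpu_device_indirect_rreg_ext - read an indirect register with a 64-bit
 * address, using the high index register when needed
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */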
u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
				    u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	if (unlikely(!adev->nbio.funcs)) {
		pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
		pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
	} else {
		pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
		pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	}

	if (reg_addr >> 32) {
		if (unlikely(!adev->nbio.funcs))
			pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
		else
			pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	} else {
		pcie_index_hi = 0;
	}

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
				      u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
			pcie_index_hi * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r |= ((u64)readl(pcie_data_offset) << 32);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
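/**
 * amdgpu_device_indirect_wreg_ext - write an indirect register with a 64-bit
 * address, using the high index register when needed
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */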
void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
				     u64 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
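/**
 * amdgpu_device_indirect_wreg64_ext - write a 64bits indirect register at a
 * 64-bit address, using the high index register when needed
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */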
void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
				       u64 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_rev_id - query device rev_id
 *
 * @adev: amdgpu_device pointer
 *
 * Return device rev_id
 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
{
	return adev->nbio.funcs->get_rev_id(adev);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	int ret;

	amdgpu_asic_pre_asic_init(adev);

	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
		amdgpu_psp_wait_for_bootloader(adev);
		ret = amdgpu_atomfirmware_asic_init(adev, true);
		return ret;
	} else {
		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
	}

	return 0;
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}
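/*
 * Note: the golden register arrays consumed above are flat triplets of
 * { register offset, and_mask, or_mask }. Each register is updated as
 * tmp = (RREG32(reg) & ~and_mask) | or_mask, with the or_mask additionally
 * limited to and_mask on AMDGPU_FAMILY_AI and newer.
 */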
/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long flags, offset;

	spin_lock_irqsave(&adev->wb.lock, flags);
	offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	unsigned long flags;

	wb >>= 3;
	spin_lock_irqsave(&adev->wb.lock, flags);
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
	spin_unlock_irqrestore(&adev->wb.lock, flags);
}
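/*
 * Each writeback slot managed above is 256 bits (8 dwords) wide, which is why
 * amdgpu_device_wb_get() scales the slot index by 8 before handing out a dword
 * offset and amdgpu_device_wb_free() shifts it back down.
 */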
/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned int i;
	u16 cmd;
	int r;

	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
		return 0;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
	if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
		DRM_WARN("System can't access extended configuration space, please check!!\n");

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}
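/*
 * amdgpu_device_read_bios() reports whether a video BIOS image should be read
 * at all: APU packages that expose an AID mask skip VBIOS reading entirely.
 */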
static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
		return false;

	return true;
}

/*
 * GPU helpers function.
 */
/**
 * amdgpu_device_need_post - check if the hw need post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if need or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (!amdgpu_device_read_bios(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
		 * some old smc fw still needs the driver to do a vPost, otherwise the gpu
		 * hangs, while smc fw versions above 22.15 don't have this flaw, so we
		 * force vpost to be executed for smc versions below 22.15
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if an error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			release_firmware(adev->pm.fw);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/*
 * Check whether seamless boot is supported.
 *
 * So far we only support seamless boot on DCE 3.0 or later.
 * If users report that it works on older ASICS as well, we may
 * loosen this.
 */
bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
{
	switch (amdgpu_seamless) {
	case -1:
		break;
	case 1:
		return true;
	case 0:
		return false;
	default:
		DRM_ERROR("Invalid value for amdgpu.seamless: %d\n",
			  amdgpu_seamless);
		return false;
	}

	if (!(adev->flags & AMD_IS_APU))
		return false;

	if (adev->mman.keep_stolen_vga_memory)
		return false;

	return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
}

/*
 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
 * don't support dynamic speed switching. Until we have confirmation from Intel
 * that a specific host supports it, it's safer that we keep it disabled for all.
 *
 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
 */
static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	/* eGPUs change speeds based on USB4 fabric conditions */
	if (dev_is_removable(adev->dev))
		return true;

	if (c->x86_vendor == X86_VENDOR_INTEL)
		return false;
#endif
	return true;
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
	switch (amdgpu_aspm) {
	case -1:
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	if (adev->flags & AMD_IS_APU)
		return false;
	if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK))
		return false;
	return pcie_aspm_enabled(adev->pdev);
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
		bool state)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));

	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines the number of bits in the page table versus the
 * page directory; a page is 4KB, so we have 12 bits of offset, a minimum of
 * 9 bits in the page table, and the remaining bits are in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory
	 */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}

/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}
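/**
 * amdgpu_device_init_apu_flags - set APU specific flags
 *
 * @adev: amdgpu_device pointer
 *
 * Derives apu_flags (Raven vs Picasso, Renoir vs Green Sardine, etc.) from
 * the ASIC type and PCI device ID. Only relevant for APUs from Raven onwards.
 */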
1961 */ 1962 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1963 { 1964 int i; 1965 1966 if (amdgpu_sched_jobs < 4) { 1967 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1968 amdgpu_sched_jobs); 1969 amdgpu_sched_jobs = 4; 1970 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 1971 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1972 amdgpu_sched_jobs); 1973 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1974 } 1975 1976 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1977 /* gart size must be greater or equal to 32M */ 1978 dev_warn(adev->dev, "gart size (%d) too small\n", 1979 amdgpu_gart_size); 1980 amdgpu_gart_size = -1; 1981 } 1982 1983 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1984 /* gtt size must be greater or equal to 32M */ 1985 dev_warn(adev->dev, "gtt size (%d) too small\n", 1986 amdgpu_gtt_size); 1987 amdgpu_gtt_size = -1; 1988 } 1989 1990 /* valid range is between 4 and 9 inclusive */ 1991 if (amdgpu_vm_fragment_size != -1 && 1992 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1993 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1994 amdgpu_vm_fragment_size = -1; 1995 } 1996 1997 if (amdgpu_sched_hw_submission < 2) { 1998 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1999 amdgpu_sched_hw_submission); 2000 amdgpu_sched_hw_submission = 2; 2001 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 2002 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 2003 amdgpu_sched_hw_submission); 2004 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 2005 } 2006 2007 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 2008 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 2009 amdgpu_reset_method = -1; 2010 } 2011 2012 amdgpu_device_check_smu_prv_buffer_size(adev); 2013 2014 amdgpu_device_check_vm_size(adev); 2015 2016 amdgpu_device_check_block_size(adev); 2017 2018 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 2019 2020 for (i = 0; i < MAX_XCP; i++) 2021 adev->enforce_isolation[i] = !!enforce_isolation; 2022 2023 return 0; 2024 } 2025 2026 /** 2027 * amdgpu_switcheroo_set_state - set switcheroo state 2028 * 2029 * @pdev: pci dev pointer 2030 * @state: vga_switcheroo state 2031 * 2032 * Callback for the switcheroo driver. Suspends or resumes 2033 * the asics before or after it is powered up using ACPI methods. 
2034 */ 2035 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 2036 enum vga_switcheroo_state state) 2037 { 2038 struct drm_device *dev = pci_get_drvdata(pdev); 2039 int r; 2040 2041 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 2042 return; 2043 2044 if (state == VGA_SWITCHEROO_ON) { 2045 pr_info("switched on\n"); 2046 /* don't suspend or resume card normally */ 2047 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2048 2049 pci_set_power_state(pdev, PCI_D0); 2050 amdgpu_device_load_pci_state(pdev); 2051 r = pci_enable_device(pdev); 2052 if (r) 2053 DRM_WARN("pci_enable_device failed (%d)\n", r); 2054 amdgpu_device_resume(dev, true); 2055 2056 dev->switch_power_state = DRM_SWITCH_POWER_ON; 2057 } else { 2058 pr_info("switched off\n"); 2059 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2060 amdgpu_device_prepare(dev); 2061 amdgpu_device_suspend(dev, true); 2062 amdgpu_device_cache_pci_state(pdev); 2063 /* Shut down the device */ 2064 pci_disable_device(pdev); 2065 pci_set_power_state(pdev, PCI_D3cold); 2066 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 2067 } 2068 } 2069 2070 /** 2071 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 2072 * 2073 * @pdev: pci dev pointer 2074 * 2075 * Callback for the switcheroo driver. Check of the switcheroo 2076 * state can be changed. 2077 * Returns true if the state can be changed, false if not. 2078 */ 2079 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 2080 { 2081 struct drm_device *dev = pci_get_drvdata(pdev); 2082 2083 /* 2084 * FIXME: open_count is protected by drm_global_mutex but that would lead to 2085 * locking inversion with the driver load path. And the access here is 2086 * completely racy anyway. So don't bother with locking for now. 2087 */ 2088 return atomic_read(&dev->open_count) == 0; 2089 } 2090 2091 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 2092 .set_gpu_state = amdgpu_switcheroo_set_state, 2093 .reprobe = NULL, 2094 .can_switch = amdgpu_switcheroo_can_switch, 2095 }; 2096 2097 /** 2098 * amdgpu_device_ip_set_clockgating_state - set the CG state 2099 * 2100 * @dev: amdgpu_device pointer 2101 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2102 * @state: clockgating state (gate or ungate) 2103 * 2104 * Sets the requested clockgating state for all instances of 2105 * the hardware IP specified. 2106 * Returns the error code from the last instance. 2107 */ 2108 int amdgpu_device_ip_set_clockgating_state(void *dev, 2109 enum amd_ip_block_type block_type, 2110 enum amd_clockgating_state state) 2111 { 2112 struct amdgpu_device *adev = dev; 2113 int i, r = 0; 2114 2115 for (i = 0; i < adev->num_ip_blocks; i++) { 2116 if (!adev->ip_blocks[i].status.valid) 2117 continue; 2118 if (adev->ip_blocks[i].version->type != block_type) 2119 continue; 2120 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 2121 continue; 2122 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 2123 (void *)adev, state); 2124 if (r) 2125 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 2126 adev->ip_blocks[i].version->funcs->name, r); 2127 } 2128 return r; 2129 } 2130 2131 /** 2132 * amdgpu_device_ip_set_powergating_state - set the PG state 2133 * 2134 * @dev: amdgpu_device pointer 2135 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2136 * @state: powergating state (gate or ungate) 2137 * 2138 * Sets the requested powergating state for all instances of 2139 * the hardware IP specified. 
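 * An illustrative call (not taken from this file) gating VCN would be:
 *   amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCN,
 *                                          AMD_PG_STATE_GATE);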
2140 * Returns the error code from the last instance.
2141 */
2142 int amdgpu_device_ip_set_powergating_state(void *dev,
2143 enum amd_ip_block_type block_type,
2144 enum amd_powergating_state state)
2145 {
2146 struct amdgpu_device *adev = dev;
2147 int i, r = 0;
2148
2149 for (i = 0; i < adev->num_ip_blocks; i++) {
2150 if (!adev->ip_blocks[i].status.valid)
2151 continue;
2152 if (adev->ip_blocks[i].version->type != block_type)
2153 continue;
2154 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
2155 continue;
2156 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
2157 (void *)adev, state);
2158 if (r)
2159 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
2160 adev->ip_blocks[i].version->funcs->name, r);
2161 }
2162 return r;
2163 }
2164
2165 /**
2166 * amdgpu_device_ip_get_clockgating_state - get the CG state
2167 *
2168 * @adev: amdgpu_device pointer
2169 * @flags: clockgating feature flags
2170 *
2171 * Walks the list of IPs on the device and updates the clockgating
2172 * flags for each IP.
2173 * Updates @flags with the feature flags for each hardware IP where
2174 * clockgating is enabled.
2175 */
2176 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
2177 u64 *flags)
2178 {
2179 int i;
2180
2181 for (i = 0; i < adev->num_ip_blocks; i++) {
2182 if (!adev->ip_blocks[i].status.valid)
2183 continue;
2184 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
2185 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
2186 }
2187 }
2188
2189 /**
2190 * amdgpu_device_ip_wait_for_idle - wait for idle
2191 *
2192 * @adev: amdgpu_device pointer
2193 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2194 *
2195 * Waits for the requested hardware IP to be idle.
2196 * Returns 0 for success or a negative error code on failure.
2197 */
2198 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
2199 enum amd_ip_block_type block_type)
2200 {
2201 int i, r;
2202
2203 for (i = 0; i < adev->num_ip_blocks; i++) {
2204 if (!adev->ip_blocks[i].status.valid)
2205 continue;
2206 if (adev->ip_blocks[i].version->type == block_type) {
2207 r = adev->ip_blocks[i].version->funcs->wait_for_idle(&adev->ip_blocks[i]);
2208 if (r)
2209 return r;
2210 break;
2211 }
2212 }
2213 return 0;
2214
2215 }
2216
2217 /**
2218 * amdgpu_device_ip_is_valid - is the hardware IP enabled
2219 *
2220 * @adev: amdgpu_device pointer
2221 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2222 *
2223 * Check if the hardware IP is enabled or not.
2224 * Returns true if the IP is enabled, false if not.
2225 */
2226 bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev,
2227 enum amd_ip_block_type block_type)
2228 {
2229 int i;
2230
2231 for (i = 0; i < adev->num_ip_blocks; i++) {
2232 if (adev->ip_blocks[i].version->type == block_type)
2233 return adev->ip_blocks[i].status.valid;
2234 }
2235 return false;
2236
2237 }
2238
2239 /**
2240 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
2241 *
2242 * @adev: amdgpu_device pointer
2243 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
2244 *
2245 * Returns a pointer to the hardware IP block structure
2246 * if it exists for the asic, otherwise NULL.
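 * Typical use, sketched from callers later in this file:
 *   struct amdgpu_ip_block *ip_block =
 *           amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *   if (ip_block && ip_block->status.valid)
 *           ... the GFX block is present and enabled ...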
2247 */ 2248 struct amdgpu_ip_block * 2249 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 2250 enum amd_ip_block_type type) 2251 { 2252 int i; 2253 2254 for (i = 0; i < adev->num_ip_blocks; i++) 2255 if (adev->ip_blocks[i].version->type == type) 2256 return &adev->ip_blocks[i]; 2257 2258 return NULL; 2259 } 2260 2261 /** 2262 * amdgpu_device_ip_block_version_cmp 2263 * 2264 * @adev: amdgpu_device pointer 2265 * @type: enum amd_ip_block_type 2266 * @major: major version 2267 * @minor: minor version 2268 * 2269 * return 0 if equal or greater 2270 * return 1 if smaller or the ip_block doesn't exist 2271 */ 2272 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 2273 enum amd_ip_block_type type, 2274 u32 major, u32 minor) 2275 { 2276 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 2277 2278 if (ip_block && ((ip_block->version->major > major) || 2279 ((ip_block->version->major == major) && 2280 (ip_block->version->minor >= minor)))) 2281 return 0; 2282 2283 return 1; 2284 } 2285 2286 /** 2287 * amdgpu_device_ip_block_add 2288 * 2289 * @adev: amdgpu_device pointer 2290 * @ip_block_version: pointer to the IP to add 2291 * 2292 * Adds the IP block driver information to the collection of IPs 2293 * on the asic. 2294 */ 2295 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 2296 const struct amdgpu_ip_block_version *ip_block_version) 2297 { 2298 if (!ip_block_version) 2299 return -EINVAL; 2300 2301 switch (ip_block_version->type) { 2302 case AMD_IP_BLOCK_TYPE_VCN: 2303 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2304 return 0; 2305 break; 2306 case AMD_IP_BLOCK_TYPE_JPEG: 2307 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2308 return 0; 2309 break; 2310 default: 2311 break; 2312 } 2313 2314 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, 2315 ip_block_version->funcs->name); 2316 2317 adev->ip_blocks[adev->num_ip_blocks].adev = adev; 2318 2319 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2320 2321 return 0; 2322 } 2323 2324 /** 2325 * amdgpu_device_enable_virtual_display - enable virtual display feature 2326 * 2327 * @adev: amdgpu_device pointer 2328 * 2329 * Enabled the virtual display feature if the user has enabled it via 2330 * the module parameter virtual_display. This feature provides a virtual 2331 * display hardware on headless boards or in virtualized environments. 2332 * This function parses and validates the configuration string specified by 2333 * the user and configues the virtual display configuration (number of 2334 * virtual connectors, crtcs, etc.) specified. 
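 * The string is a semicolon-separated list of <PCI address>,<crtc count>
 * entries. For example (addresses made up for illustration),
 * virtual_display=0000:01:00.0,2 enables two virtual crtcs on that device,
 * and virtual_display=all,1 enables one on every device; the crtc count is
 * clamped to the range 1-6 below.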
2335 */ 2336 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2337 { 2338 adev->enable_virtual_display = false; 2339 2340 if (amdgpu_virtual_display) { 2341 const char *pci_address_name = pci_name(adev->pdev); 2342 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2343 2344 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2345 pciaddstr_tmp = pciaddstr; 2346 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2347 pciaddname = strsep(&pciaddname_tmp, ","); 2348 if (!strcmp("all", pciaddname) 2349 || !strcmp(pci_address_name, pciaddname)) { 2350 long num_crtc; 2351 int res = -1; 2352 2353 adev->enable_virtual_display = true; 2354 2355 if (pciaddname_tmp) 2356 res = kstrtol(pciaddname_tmp, 10, 2357 &num_crtc); 2358 2359 if (!res) { 2360 if (num_crtc < 1) 2361 num_crtc = 1; 2362 if (num_crtc > 6) 2363 num_crtc = 6; 2364 adev->mode_info.num_crtc = num_crtc; 2365 } else { 2366 adev->mode_info.num_crtc = 1; 2367 } 2368 break; 2369 } 2370 } 2371 2372 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2373 amdgpu_virtual_display, pci_address_name, 2374 adev->enable_virtual_display, adev->mode_info.num_crtc); 2375 2376 kfree(pciaddstr); 2377 } 2378 } 2379 2380 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2381 { 2382 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2383 adev->mode_info.num_crtc = 1; 2384 adev->enable_virtual_display = true; 2385 DRM_INFO("virtual_display:%d, num_crtc:%d\n", 2386 adev->enable_virtual_display, adev->mode_info.num_crtc); 2387 } 2388 } 2389 2390 /** 2391 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2392 * 2393 * @adev: amdgpu_device pointer 2394 * 2395 * Parses the asic configuration parameters specified in the gpu info 2396 * firmware and makes them availale to the driver for use in configuring 2397 * the asic. 2398 * Returns 0 on success, -EINVAL on failure. 2399 */ 2400 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2401 { 2402 const char *chip_name; 2403 int err; 2404 const struct gpu_info_firmware_header_v1_0 *hdr; 2405 2406 adev->firmware.gpu_info_fw = NULL; 2407 2408 if (adev->mman.discovery_bin) 2409 return 0; 2410 2411 switch (adev->asic_type) { 2412 default: 2413 return 0; 2414 case CHIP_VEGA10: 2415 chip_name = "vega10"; 2416 break; 2417 case CHIP_VEGA12: 2418 chip_name = "vega12"; 2419 break; 2420 case CHIP_RAVEN: 2421 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2422 chip_name = "raven2"; 2423 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2424 chip_name = "picasso"; 2425 else 2426 chip_name = "raven"; 2427 break; 2428 case CHIP_ARCTURUS: 2429 chip_name = "arcturus"; 2430 break; 2431 case CHIP_NAVI12: 2432 chip_name = "navi12"; 2433 break; 2434 } 2435 2436 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, 2437 "amdgpu/%s_gpu_info.bin", chip_name); 2438 if (err) { 2439 dev_err(adev->dev, 2440 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n", 2441 chip_name); 2442 goto out; 2443 } 2444 2445 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2446 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2447 2448 switch (hdr->version_major) { 2449 case 1: 2450 { 2451 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2452 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2453 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2454 2455 /* 2456 * Should be droped when DAL no longer needs it. 
2457 */
2458 if (adev->asic_type == CHIP_NAVI12)
2459 goto parse_soc_bounding_box;
2460
2461 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
2462 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
2463 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
2464 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
2465 adev->gfx.config.max_texture_channel_caches =
2466 le32_to_cpu(gpu_info_fw->gc_num_tccs);
2467 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
2468 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
2469 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
2470 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
2471 adev->gfx.config.double_offchip_lds_buf =
2472 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
2473 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
2474 adev->gfx.cu_info.max_waves_per_simd =
2475 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
2476 adev->gfx.cu_info.max_scratch_slots_per_cu =
2477 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
2478 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
2479 if (hdr->version_minor >= 1) {
2480 const struct gpu_info_firmware_v1_1 *gpu_info_fw =
2481 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
2482 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2483 adev->gfx.config.num_sc_per_sh =
2484 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
2485 adev->gfx.config.num_packer_per_sc =
2486 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
2487 }
2488
2489 parse_soc_bounding_box:
2490 /*
2491 * soc bounding box info is not integrated in the discovery table, so it
2492 * always needs to be parsed from the gpu info firmware when required.
2493 */
2494 if (hdr->version_minor == 2) {
2495 const struct gpu_info_firmware_v1_2 *gpu_info_fw =
2496 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
2497 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2498 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
2499 }
2500 break;
2501 }
2502 default:
2503 dev_err(adev->dev,
2504 "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2505 err = -EINVAL;
2506 goto out;
2507 }
2508 out:
2509 return err;
2510 }
2511
2512 /**
2513 * amdgpu_device_ip_early_init - run early init for hardware IPs
2514 *
2515 * @adev: amdgpu_device pointer
2516 *
2517 * Early initialization pass for hardware IPs. The hardware IPs that make
2518 * up each asic are discovered and each IP's early_init callback is run. This
2519 * is the first stage in initializing the asic.
2520 * Returns 0 on success, negative error code on failure.
2521 */ 2522 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2523 { 2524 struct amdgpu_ip_block *ip_block; 2525 struct pci_dev *parent; 2526 int i, r; 2527 bool total; 2528 2529 amdgpu_device_enable_virtual_display(adev); 2530 2531 if (amdgpu_sriov_vf(adev)) { 2532 r = amdgpu_virt_request_full_gpu(adev, true); 2533 if (r) 2534 return r; 2535 } 2536 2537 switch (adev->asic_type) { 2538 #ifdef CONFIG_DRM_AMDGPU_SI 2539 case CHIP_VERDE: 2540 case CHIP_TAHITI: 2541 case CHIP_PITCAIRN: 2542 case CHIP_OLAND: 2543 case CHIP_HAINAN: 2544 adev->family = AMDGPU_FAMILY_SI; 2545 r = si_set_ip_blocks(adev); 2546 if (r) 2547 return r; 2548 break; 2549 #endif 2550 #ifdef CONFIG_DRM_AMDGPU_CIK 2551 case CHIP_BONAIRE: 2552 case CHIP_HAWAII: 2553 case CHIP_KAVERI: 2554 case CHIP_KABINI: 2555 case CHIP_MULLINS: 2556 if (adev->flags & AMD_IS_APU) 2557 adev->family = AMDGPU_FAMILY_KV; 2558 else 2559 adev->family = AMDGPU_FAMILY_CI; 2560 2561 r = cik_set_ip_blocks(adev); 2562 if (r) 2563 return r; 2564 break; 2565 #endif 2566 case CHIP_TOPAZ: 2567 case CHIP_TONGA: 2568 case CHIP_FIJI: 2569 case CHIP_POLARIS10: 2570 case CHIP_POLARIS11: 2571 case CHIP_POLARIS12: 2572 case CHIP_VEGAM: 2573 case CHIP_CARRIZO: 2574 case CHIP_STONEY: 2575 if (adev->flags & AMD_IS_APU) 2576 adev->family = AMDGPU_FAMILY_CZ; 2577 else 2578 adev->family = AMDGPU_FAMILY_VI; 2579 2580 r = vi_set_ip_blocks(adev); 2581 if (r) 2582 return r; 2583 break; 2584 default: 2585 r = amdgpu_discovery_set_ip_blocks(adev); 2586 if (r) 2587 return r; 2588 break; 2589 } 2590 2591 if (amdgpu_has_atpx() && 2592 (amdgpu_is_atpx_hybrid() || 2593 amdgpu_has_atpx_dgpu_power_cntl()) && 2594 ((adev->flags & AMD_IS_APU) == 0) && 2595 !dev_is_removable(&adev->pdev->dev)) 2596 adev->flags |= AMD_IS_PX; 2597 2598 if (!(adev->flags & AMD_IS_APU)) { 2599 parent = pcie_find_root_port(adev->pdev); 2600 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2601 } 2602 2603 2604 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2605 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2606 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2607 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2608 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2609 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2610 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2611 2612 total = true; 2613 for (i = 0; i < adev->num_ip_blocks; i++) { 2614 ip_block = &adev->ip_blocks[i]; 2615 2616 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2617 DRM_WARN("disabled ip block: %d <%s>\n", 2618 i, adev->ip_blocks[i].version->funcs->name); 2619 adev->ip_blocks[i].status.valid = false; 2620 } else if (ip_block->version->funcs->early_init) { 2621 r = ip_block->version->funcs->early_init(ip_block); 2622 if (r == -ENOENT) { 2623 adev->ip_blocks[i].status.valid = false; 2624 } else if (r) { 2625 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2626 adev->ip_blocks[i].version->funcs->name, r); 2627 total = false; 2628 } else { 2629 adev->ip_blocks[i].status.valid = true; 2630 } 2631 } else { 2632 adev->ip_blocks[i].status.valid = true; 2633 } 2634 /* get the vbios after the asic_funcs are set up */ 2635 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2636 r = amdgpu_device_parse_gpu_info_fw(adev); 2637 if (r) 2638 return r; 2639 2640 /* Read BIOS */ 2641 if (amdgpu_device_read_bios(adev)) { 2642 if (!amdgpu_get_bios(adev)) 2643 return -EINVAL; 2644 2645 r = amdgpu_atombios_init(adev); 2646 if (r) { 2647 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2648 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2649 return r; 2650 } 2651 } 2652 2653 /*get pf2vf msg info at it's earliest time*/ 2654 if (amdgpu_sriov_vf(adev)) 2655 amdgpu_virt_init_data_exchange(adev); 2656 2657 } 2658 } 2659 if (!total) 2660 return -ENODEV; 2661 2662 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 2663 if (ip_block->status.valid != false) 2664 amdgpu_amdkfd_device_probe(adev); 2665 2666 adev->cg_flags &= amdgpu_cg_mask; 2667 adev->pg_flags &= amdgpu_pg_mask; 2668 2669 return 0; 2670 } 2671 2672 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2673 { 2674 int i, r; 2675 2676 for (i = 0; i < adev->num_ip_blocks; i++) { 2677 if (!adev->ip_blocks[i].status.sw) 2678 continue; 2679 if (adev->ip_blocks[i].status.hw) 2680 continue; 2681 if (!amdgpu_ip_member_of_hwini( 2682 adev, adev->ip_blocks[i].version->type)) 2683 continue; 2684 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2685 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2686 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2687 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2688 if (r) { 2689 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2690 adev->ip_blocks[i].version->funcs->name, r); 2691 return r; 2692 } 2693 adev->ip_blocks[i].status.hw = true; 2694 } 2695 } 2696 2697 return 0; 2698 } 2699 2700 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2701 { 2702 int i, r; 2703 2704 for (i = 0; i < adev->num_ip_blocks; i++) { 2705 if (!adev->ip_blocks[i].status.sw) 2706 continue; 2707 if (adev->ip_blocks[i].status.hw) 2708 continue; 2709 if (!amdgpu_ip_member_of_hwini( 2710 adev, adev->ip_blocks[i].version->type)) 2711 continue; 2712 r = 
adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2713 if (r) { 2714 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2715 adev->ip_blocks[i].version->funcs->name, r); 2716 return r; 2717 } 2718 adev->ip_blocks[i].status.hw = true; 2719 } 2720 2721 return 0; 2722 } 2723 2724 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2725 { 2726 int r = 0; 2727 int i; 2728 uint32_t smu_version; 2729 2730 if (adev->asic_type >= CHIP_VEGA10) { 2731 for (i = 0; i < adev->num_ip_blocks; i++) { 2732 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2733 continue; 2734 2735 if (!amdgpu_ip_member_of_hwini(adev, 2736 AMD_IP_BLOCK_TYPE_PSP)) 2737 break; 2738 2739 if (!adev->ip_blocks[i].status.sw) 2740 continue; 2741 2742 /* no need to do the fw loading again if already done*/ 2743 if (adev->ip_blocks[i].status.hw == true) 2744 break; 2745 2746 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2747 r = adev->ip_blocks[i].version->funcs->resume(&adev->ip_blocks[i]); 2748 if (r) { 2749 DRM_ERROR("resume of IP block <%s> failed %d\n", 2750 adev->ip_blocks[i].version->funcs->name, r); 2751 return r; 2752 } 2753 } else { 2754 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2755 if (r) { 2756 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2757 adev->ip_blocks[i].version->funcs->name, r); 2758 return r; 2759 } 2760 } 2761 2762 adev->ip_blocks[i].status.hw = true; 2763 break; 2764 } 2765 } 2766 2767 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2768 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2769 2770 return r; 2771 } 2772 2773 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2774 { 2775 long timeout; 2776 int r, i; 2777 2778 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2779 struct amdgpu_ring *ring = adev->rings[i]; 2780 2781 /* No need to setup the GPU scheduler for rings that don't need it */ 2782 if (!ring || ring->no_scheduler) 2783 continue; 2784 2785 switch (ring->funcs->type) { 2786 case AMDGPU_RING_TYPE_GFX: 2787 timeout = adev->gfx_timeout; 2788 break; 2789 case AMDGPU_RING_TYPE_COMPUTE: 2790 timeout = adev->compute_timeout; 2791 break; 2792 case AMDGPU_RING_TYPE_SDMA: 2793 timeout = adev->sdma_timeout; 2794 break; 2795 default: 2796 timeout = adev->video_timeout; 2797 break; 2798 } 2799 2800 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL, 2801 DRM_SCHED_PRIORITY_COUNT, 2802 ring->num_hw_submission, 0, 2803 timeout, adev->reset_domain->wq, 2804 ring->sched_score, ring->name, 2805 adev->dev); 2806 if (r) { 2807 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2808 ring->name); 2809 return r; 2810 } 2811 r = amdgpu_uvd_entity_init(adev, ring); 2812 if (r) { 2813 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n", 2814 ring->name); 2815 return r; 2816 } 2817 r = amdgpu_vce_entity_init(adev, ring); 2818 if (r) { 2819 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n", 2820 ring->name); 2821 return r; 2822 } 2823 } 2824 2825 amdgpu_xcp_update_partition_sched_list(adev); 2826 2827 return 0; 2828 } 2829 2830 2831 /** 2832 * amdgpu_device_ip_init - run init for hardware IPs 2833 * 2834 * @adev: amdgpu_device pointer 2835 * 2836 * Main initialization pass for hardware IPs. The list of all the hardware 2837 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2838 * are run. sw_init initializes the software state associated with each IP 2839 * and hw_init initializes the hardware associated with each IP. 
2840 * Returns 0 on success, negative error code on failure. 2841 */ 2842 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2843 { 2844 bool init_badpage; 2845 int i, r; 2846 2847 r = amdgpu_ras_init(adev); 2848 if (r) 2849 return r; 2850 2851 for (i = 0; i < adev->num_ip_blocks; i++) { 2852 if (!adev->ip_blocks[i].status.valid) 2853 continue; 2854 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]); 2855 if (r) { 2856 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2857 adev->ip_blocks[i].version->funcs->name, r); 2858 goto init_failed; 2859 } 2860 adev->ip_blocks[i].status.sw = true; 2861 2862 if (!amdgpu_ip_member_of_hwini( 2863 adev, adev->ip_blocks[i].version->type)) 2864 continue; 2865 2866 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2867 /* need to do common hw init early so everything is set up for gmc */ 2868 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2869 if (r) { 2870 DRM_ERROR("hw_init %d failed %d\n", i, r); 2871 goto init_failed; 2872 } 2873 adev->ip_blocks[i].status.hw = true; 2874 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2875 /* need to do gmc hw init early so we can allocate gpu mem */ 2876 /* Try to reserve bad pages early */ 2877 if (amdgpu_sriov_vf(adev)) 2878 amdgpu_virt_exchange_data(adev); 2879 2880 r = amdgpu_device_mem_scratch_init(adev); 2881 if (r) { 2882 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2883 goto init_failed; 2884 } 2885 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2886 if (r) { 2887 DRM_ERROR("hw_init %d failed %d\n", i, r); 2888 goto init_failed; 2889 } 2890 r = amdgpu_device_wb_init(adev); 2891 if (r) { 2892 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2893 goto init_failed; 2894 } 2895 adev->ip_blocks[i].status.hw = true; 2896 2897 /* right after GMC hw init, we create CSA */ 2898 if (adev->gfx.mcbp) { 2899 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2900 AMDGPU_GEM_DOMAIN_VRAM | 2901 AMDGPU_GEM_DOMAIN_GTT, 2902 AMDGPU_CSA_SIZE); 2903 if (r) { 2904 DRM_ERROR("allocate CSA failed %d\n", r); 2905 goto init_failed; 2906 } 2907 } 2908 2909 r = amdgpu_seq64_init(adev); 2910 if (r) { 2911 DRM_ERROR("allocate seq64 failed %d\n", r); 2912 goto init_failed; 2913 } 2914 } 2915 } 2916 2917 if (amdgpu_sriov_vf(adev)) 2918 amdgpu_virt_init_data_exchange(adev); 2919 2920 r = amdgpu_ib_pool_init(adev); 2921 if (r) { 2922 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2923 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2924 goto init_failed; 2925 } 2926 2927 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2928 if (r) 2929 goto init_failed; 2930 2931 r = amdgpu_device_ip_hw_init_phase1(adev); 2932 if (r) 2933 goto init_failed; 2934 2935 r = amdgpu_device_fw_loading(adev); 2936 if (r) 2937 goto init_failed; 2938 2939 r = amdgpu_device_ip_hw_init_phase2(adev); 2940 if (r) 2941 goto init_failed; 2942 2943 /* 2944 * retired pages will be loaded from eeprom and reserved here, 2945 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2946 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2947 * for I2C communication which only true at this point. 2948 * 2949 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2950 * failure from bad gpu situation and stop amdgpu init process 2951 * accordingly. 
For other failed cases, it will still release all 2952 * the resource and print error message, rather than returning one 2953 * negative value to upper level. 2954 * 2955 * Note: theoretically, this should be called before all vram allocations 2956 * to protect retired page from abusing 2957 */ 2958 init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 2959 r = amdgpu_ras_recovery_init(adev, init_badpage); 2960 if (r) 2961 goto init_failed; 2962 2963 /** 2964 * In case of XGMI grab extra reference for reset domain for this device 2965 */ 2966 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2967 if (amdgpu_xgmi_add_device(adev) == 0) { 2968 if (!amdgpu_sriov_vf(adev)) { 2969 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2970 2971 if (WARN_ON(!hive)) { 2972 r = -ENOENT; 2973 goto init_failed; 2974 } 2975 2976 if (!hive->reset_domain || 2977 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 2978 r = -ENOENT; 2979 amdgpu_put_xgmi_hive(hive); 2980 goto init_failed; 2981 } 2982 2983 /* Drop the early temporary reset domain we created for device */ 2984 amdgpu_reset_put_reset_domain(adev->reset_domain); 2985 adev->reset_domain = hive->reset_domain; 2986 amdgpu_put_xgmi_hive(hive); 2987 } 2988 } 2989 } 2990 2991 r = amdgpu_device_init_schedulers(adev); 2992 if (r) 2993 goto init_failed; 2994 2995 if (adev->mman.buffer_funcs_ring->sched.ready) 2996 amdgpu_ttm_set_buffer_funcs_status(adev, true); 2997 2998 /* Don't init kfd if whole hive need to be reset during init */ 2999 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 3000 kgd2kfd_init_zone_device(adev); 3001 amdgpu_amdkfd_device_init(adev); 3002 } 3003 3004 amdgpu_fru_get_product_info(adev); 3005 3006 init_failed: 3007 3008 return r; 3009 } 3010 3011 /** 3012 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 3013 * 3014 * @adev: amdgpu_device pointer 3015 * 3016 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 3017 * this function before a GPU reset. If the value is retained after a 3018 * GPU reset, VRAM has not been lost. Some GPU resets may destry VRAM contents. 3019 */ 3020 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 3021 { 3022 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 3023 } 3024 3025 /** 3026 * amdgpu_device_check_vram_lost - check if vram is valid 3027 * 3028 * @adev: amdgpu_device pointer 3029 * 3030 * Checks the reset magic value written to the gart pointer in VRAM. 3031 * The driver calls this after a GPU reset to see if the contents of 3032 * VRAM is lost or now. 3033 * returns true if vram is lost, false if not. 3034 */ 3035 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 3036 { 3037 if (memcmp(adev->gart.ptr, adev->reset_magic, 3038 AMDGPU_RESET_MAGIC_NUM)) 3039 return true; 3040 3041 if (!amdgpu_in_reset(adev)) 3042 return false; 3043 3044 /* 3045 * For all ASICs with baco/mode1 reset, the VRAM is 3046 * always assumed to be lost. 3047 */ 3048 switch (amdgpu_asic_reset_method(adev)) { 3049 case AMD_RESET_METHOD_BACO: 3050 case AMD_RESET_METHOD_MODE1: 3051 return true; 3052 default: 3053 return false; 3054 } 3055 } 3056 3057 /** 3058 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 3059 * 3060 * @adev: amdgpu_device pointer 3061 * @state: clockgating state (gate or ungate) 3062 * 3063 * The list of all the hardware IPs that make up the asic is walked and the 3064 * set_clockgating_state callbacks are run. 
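 * The blocks are visited in list order when gating and in reverse order
 * when ungating (see the index calculation below).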
3065 * Late initialization pass enabling clockgating for hardware IPs. 3066 * Fini or suspend, pass disabling clockgating for hardware IPs. 3067 * Returns 0 on success, negative error code on failure. 3068 */ 3069 3070 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 3071 enum amd_clockgating_state state) 3072 { 3073 int i, j, r; 3074 3075 if (amdgpu_emu_mode == 1) 3076 return 0; 3077 3078 for (j = 0; j < adev->num_ip_blocks; j++) { 3079 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3080 if (!adev->ip_blocks[i].status.late_initialized) 3081 continue; 3082 /* skip CG for GFX, SDMA on S0ix */ 3083 if (adev->in_s0ix && 3084 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3085 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3086 continue; 3087 /* skip CG for VCE/UVD, it's handled specially */ 3088 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3089 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3090 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3091 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3092 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 3093 /* enable clockgating to save power */ 3094 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 3095 state); 3096 if (r) { 3097 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 3098 adev->ip_blocks[i].version->funcs->name, r); 3099 return r; 3100 } 3101 } 3102 } 3103 3104 return 0; 3105 } 3106 3107 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 3108 enum amd_powergating_state state) 3109 { 3110 int i, j, r; 3111 3112 if (amdgpu_emu_mode == 1) 3113 return 0; 3114 3115 for (j = 0; j < adev->num_ip_blocks; j++) { 3116 i = state == AMD_PG_STATE_GATE ? 
j : adev->num_ip_blocks - j - 1;
3117 if (!adev->ip_blocks[i].status.late_initialized)
3118 continue;
3119 /* skip PG for GFX, SDMA on S0ix */
3120 if (adev->in_s0ix &&
3121 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3122 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3123 continue;
3124 /* skip PG for VCE/UVD, it's handled specially */
3125 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
3126 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
3127 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
3128 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
3129 adev->ip_blocks[i].version->funcs->set_powergating_state) {
3130 /* enable powergating to save power */
3131 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
3132 state);
3133 if (r) {
3134 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
3135 adev->ip_blocks[i].version->funcs->name, r);
3136 return r;
3137 }
3138 }
3139 }
3140 return 0;
3141 }
3142
3143 static int amdgpu_device_enable_mgpu_fan_boost(void)
3144 {
3145 struct amdgpu_gpu_instance *gpu_ins;
3146 struct amdgpu_device *adev;
3147 int i, ret = 0;
3148
3149 mutex_lock(&mgpu_info.mutex);
3150
3151 /*
3152 * MGPU fan boost feature should be enabled
3153 * only when there are two or more dGPUs in
3154 * the system
3155 */
3156 if (mgpu_info.num_dgpu < 2)
3157 goto out;
3158
3159 for (i = 0; i < mgpu_info.num_dgpu; i++) {
3160 gpu_ins = &(mgpu_info.gpu_ins[i]);
3161 adev = gpu_ins->adev;
3162 if (!(adev->flags & AMD_IS_APU) &&
3163 !gpu_ins->mgpu_fan_enabled) {
3164 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
3165 if (ret)
3166 break;
3167
3168 gpu_ins->mgpu_fan_enabled = 1;
3169 }
3170 }
3171
3172 out:
3173 mutex_unlock(&mgpu_info.mutex);
3174
3175 return ret;
3176 }
3177
3178 /**
3179 * amdgpu_device_ip_late_init - run late init for hardware IPs
3180 *
3181 * @adev: amdgpu_device pointer
3182 *
3183 * Late initialization pass for hardware IPs. The list of all the hardware
3184 * IPs that make up the asic is walked and the late_init callbacks are run.
3185 * late_init covers any special initialization that an IP requires
3186 * after all of them have been initialized or something that needs to happen
3187 * late in the init process.
3188 * Returns 0 on success, negative error code on failure.
3189 */
3190 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
3191 {
3192 struct amdgpu_gpu_instance *gpu_instance;
3193 int i = 0, r;
3194
3195 for (i = 0; i < adev->num_ip_blocks; i++) {
3196 if (!adev->ip_blocks[i].status.hw)
3197 continue;
3198 if (adev->ip_blocks[i].version->funcs->late_init) {
3199 r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]);
3200 if (r) {
3201 DRM_ERROR("late_init of IP block <%s> failed %d\n",
3202 adev->ip_blocks[i].version->funcs->name, r);
3203 return r;
3204 }
3205 }
3206 adev->ip_blocks[i].status.late_initialized = true;
3207 }
3208
3209 r = amdgpu_ras_late_init(adev);
3210 if (r) {
3211 DRM_ERROR("amdgpu_ras_late_init failed %d", r);
3212 return r;
3213 }
3214
3215 if (!amdgpu_in_reset(adev))
3216 amdgpu_ras_set_error_query_ready(adev, true);
3217
3218 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
3219 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
3220
3221 amdgpu_device_fill_reset_magic(adev);
3222
3223 r = amdgpu_device_enable_mgpu_fan_boost();
3224 if (r)
3225 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
3226
3227 /* For passthrough configurations on arcturus and aldebaran, enable special handling for SBR */
3228 if (amdgpu_passthrough(adev) &&
3229 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
3230 adev->asic_type == CHIP_ALDEBARAN))
3231 amdgpu_dpm_handle_passthrough_sbr(adev, true);
3232
3233 if (adev->gmc.xgmi.num_physical_nodes > 1) {
3234 mutex_lock(&mgpu_info.mutex);
3235
3236 /*
3237 * Reset the device p-state to low as it was booted with high.
3238 *
3239 * This should be performed only after all devices from the same
3240 * hive get initialized.
3241 *
3242 * However, the number of devices in the hive is not known in
3243 * advance; it is counted one by one as the devices are initialized.
3244 *
3245 * So, we wait for all XGMI interlinked devices to be initialized.
3246 * This may bring some delays as those devices may come from
3247 * different hives. But that should be OK.
3248 */ 3249 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3250 for (i = 0; i < mgpu_info.num_gpu; i++) { 3251 gpu_instance = &(mgpu_info.gpu_ins[i]); 3252 if (gpu_instance->adev->flags & AMD_IS_APU) 3253 continue; 3254 3255 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3256 AMDGPU_XGMI_PSTATE_MIN); 3257 if (r) { 3258 DRM_ERROR("pstate setting failed (%d).\n", r); 3259 break; 3260 } 3261 } 3262 } 3263 3264 mutex_unlock(&mgpu_info.mutex); 3265 } 3266 3267 return 0; 3268 } 3269 3270 /** 3271 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3272 * 3273 * @adev: amdgpu_device pointer 3274 * 3275 * For ASICs need to disable SMC first 3276 */ 3277 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3278 { 3279 int i, r; 3280 3281 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3282 return; 3283 3284 for (i = 0; i < adev->num_ip_blocks; i++) { 3285 if (!adev->ip_blocks[i].status.hw) 3286 continue; 3287 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3288 r = adev->ip_blocks[i].version->funcs->hw_fini(&adev->ip_blocks[i]); 3289 /* XXX handle errors */ 3290 if (r) { 3291 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3292 adev->ip_blocks[i].version->funcs->name, r); 3293 } 3294 adev->ip_blocks[i].status.hw = false; 3295 break; 3296 } 3297 } 3298 } 3299 3300 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3301 { 3302 int i, r; 3303 3304 for (i = 0; i < adev->num_ip_blocks; i++) { 3305 if (!adev->ip_blocks[i].version->funcs->early_fini) 3306 continue; 3307 3308 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]); 3309 if (r) { 3310 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 3311 adev->ip_blocks[i].version->funcs->name, r); 3312 } 3313 } 3314 3315 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3316 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3317 3318 amdgpu_amdkfd_suspend(adev, false); 3319 3320 /* Workaroud for ASICs need to disable SMC first */ 3321 amdgpu_device_smu_fini_early(adev); 3322 3323 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3324 if (!adev->ip_blocks[i].status.hw) 3325 continue; 3326 3327 r = adev->ip_blocks[i].version->funcs->hw_fini(&adev->ip_blocks[i]); 3328 /* XXX handle errors */ 3329 if (r) { 3330 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3331 adev->ip_blocks[i].version->funcs->name, r); 3332 } 3333 3334 adev->ip_blocks[i].status.hw = false; 3335 } 3336 3337 if (amdgpu_sriov_vf(adev)) { 3338 if (amdgpu_virt_release_full_gpu(adev, false)) 3339 DRM_ERROR("failed to release exclusive mode on fini\n"); 3340 } 3341 3342 return 0; 3343 } 3344 3345 /** 3346 * amdgpu_device_ip_fini - run fini for hardware IPs 3347 * 3348 * @adev: amdgpu_device pointer 3349 * 3350 * Main teardown pass for hardware IPs. The list of all the hardware 3351 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3352 * are run. hw_fini tears down the hardware associated with each IP 3353 * and sw_fini tears down any software state associated with each IP. 3354 * Returns 0 on success, negative error code on failure. 
3355 */ 3356 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3357 { 3358 int i, r; 3359 3360 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3361 amdgpu_virt_release_ras_err_handler_data(adev); 3362 3363 if (adev->gmc.xgmi.num_physical_nodes > 1) 3364 amdgpu_xgmi_remove_device(adev); 3365 3366 amdgpu_amdkfd_device_fini_sw(adev); 3367 3368 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3369 if (!adev->ip_blocks[i].status.sw) 3370 continue; 3371 3372 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3373 amdgpu_ucode_free_bo(adev); 3374 amdgpu_free_static_csa(&adev->virt.csa_obj); 3375 amdgpu_device_wb_fini(adev); 3376 amdgpu_device_mem_scratch_fini(adev); 3377 amdgpu_ib_pool_fini(adev); 3378 amdgpu_seq64_fini(adev); 3379 } 3380 3381 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]); 3382 /* XXX handle errors */ 3383 if (r) { 3384 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 3385 adev->ip_blocks[i].version->funcs->name, r); 3386 } 3387 adev->ip_blocks[i].status.sw = false; 3388 adev->ip_blocks[i].status.valid = false; 3389 } 3390 3391 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3392 if (!adev->ip_blocks[i].status.late_initialized) 3393 continue; 3394 if (adev->ip_blocks[i].version->funcs->late_fini) 3395 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]); 3396 adev->ip_blocks[i].status.late_initialized = false; 3397 } 3398 3399 amdgpu_ras_fini(adev); 3400 3401 return 0; 3402 } 3403 3404 /** 3405 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3406 * 3407 * @work: work_struct. 3408 */ 3409 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3410 { 3411 struct amdgpu_device *adev = 3412 container_of(work, struct amdgpu_device, delayed_init_work.work); 3413 int r; 3414 3415 r = amdgpu_ib_ring_tests(adev); 3416 if (r) 3417 DRM_ERROR("ib ring test failed (%d).\n", r); 3418 } 3419 3420 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3421 { 3422 struct amdgpu_device *adev = 3423 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3424 3425 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3426 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3427 3428 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 3429 adev->gfx.gfx_off_state = true; 3430 } 3431 3432 /** 3433 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3434 * 3435 * @adev: amdgpu_device pointer 3436 * 3437 * Main suspend function for hardware IPs. The list of all the hardware 3438 * IPs that make up the asic is walked, clockgating is disabled and the 3439 * suspend callbacks are run. suspend puts the hardware and software state 3440 * in each IP into a state suitable for suspend. 3441 * Returns 0 on success, negative error code on failure. 3442 */ 3443 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3444 { 3445 int i, r; 3446 3447 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3448 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3449 3450 /* 3451 * Per PMFW team's suggestion, driver needs to handle gfxoff 3452 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3453 * scenario. Add the missing df cstate disablement here. 
3454 */ 3455 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3456 dev_warn(adev->dev, "Failed to disallow df cstate"); 3457 3458 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3459 if (!adev->ip_blocks[i].status.valid) 3460 continue; 3461 3462 /* displays are handled separately */ 3463 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3464 continue; 3465 3466 /* XXX handle errors */ 3467 r = adev->ip_blocks[i].version->funcs->suspend(&adev->ip_blocks[i]); 3468 /* XXX handle errors */ 3469 if (r) { 3470 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3471 adev->ip_blocks[i].version->funcs->name, r); 3472 return r; 3473 } 3474 3475 adev->ip_blocks[i].status.hw = false; 3476 } 3477 3478 return 0; 3479 } 3480 3481 /** 3482 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3483 * 3484 * @adev: amdgpu_device pointer 3485 * 3486 * Main suspend function for hardware IPs. The list of all the hardware 3487 * IPs that make up the asic is walked, clockgating is disabled and the 3488 * suspend callbacks are run. suspend puts the hardware and software state 3489 * in each IP into a state suitable for suspend. 3490 * Returns 0 on success, negative error code on failure. 3491 */ 3492 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3493 { 3494 int i, r; 3495 3496 if (adev->in_s0ix) 3497 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3498 3499 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3500 if (!adev->ip_blocks[i].status.valid) 3501 continue; 3502 /* displays are handled in phase1 */ 3503 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3504 continue; 3505 /* PSP lost connection when err_event_athub occurs */ 3506 if (amdgpu_ras_intr_triggered() && 3507 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3508 adev->ip_blocks[i].status.hw = false; 3509 continue; 3510 } 3511 3512 /* skip unnecessary suspend if we do not initialize them yet */ 3513 if (!amdgpu_ip_member_of_hwini( 3514 adev, adev->ip_blocks[i].version->type)) 3515 continue; 3516 3517 /* skip suspend of gfx/mes and psp for S0ix 3518 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3519 * like at runtime. PSP is also part of the always on hardware 3520 * so no need to suspend it. 3521 */ 3522 if (adev->in_s0ix && 3523 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3524 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3525 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3526 continue; 3527 3528 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3529 if (adev->in_s0ix && 3530 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3531 IP_VERSION(5, 0, 0)) && 3532 (adev->ip_blocks[i].version->type == 3533 AMD_IP_BLOCK_TYPE_SDMA)) 3534 continue; 3535 3536 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3537 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3538 * from this location and RLC Autoload automatically also gets loaded 3539 * from here based on PMFW -> PSP message during re-init sequence. 3540 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3541 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3542 */ 3543 if (amdgpu_in_reset(adev) && 3544 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3545 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3546 continue; 3547 3548 /* XXX handle errors */ 3549 r = adev->ip_blocks[i].version->funcs->suspend(&adev->ip_blocks[i]); 3550 /* XXX handle errors */ 3551 if (r) { 3552 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3553 adev->ip_blocks[i].version->funcs->name, r); 3554 } 3555 adev->ip_blocks[i].status.hw = false; 3556 /* handle putting the SMC in the appropriate state */ 3557 if (!amdgpu_sriov_vf(adev)) { 3558 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3559 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3560 if (r) { 3561 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3562 adev->mp1_state, r); 3563 return r; 3564 } 3565 } 3566 } 3567 } 3568 3569 return 0; 3570 } 3571 3572 /** 3573 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3574 * 3575 * @adev: amdgpu_device pointer 3576 * 3577 * Main suspend function for hardware IPs. The list of all the hardware 3578 * IPs that make up the asic is walked, clockgating is disabled and the 3579 * suspend callbacks are run. suspend puts the hardware and software state 3580 * in each IP into a state suitable for suspend. 3581 * Returns 0 on success, negative error code on failure. 3582 */ 3583 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3584 { 3585 int r; 3586 3587 if (amdgpu_sriov_vf(adev)) { 3588 amdgpu_virt_fini_data_exchange(adev); 3589 amdgpu_virt_request_full_gpu(adev, false); 3590 } 3591 3592 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3593 3594 r = amdgpu_device_ip_suspend_phase1(adev); 3595 if (r) 3596 return r; 3597 r = amdgpu_device_ip_suspend_phase2(adev); 3598 3599 if (amdgpu_sriov_vf(adev)) 3600 amdgpu_virt_release_full_gpu(adev, false); 3601 3602 return r; 3603 } 3604 3605 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3606 { 3607 int i, r; 3608 3609 static enum amd_ip_block_type ip_order[] = { 3610 AMD_IP_BLOCK_TYPE_COMMON, 3611 AMD_IP_BLOCK_TYPE_GMC, 3612 AMD_IP_BLOCK_TYPE_PSP, 3613 AMD_IP_BLOCK_TYPE_IH, 3614 }; 3615 3616 for (i = 0; i < adev->num_ip_blocks; i++) { 3617 int j; 3618 struct amdgpu_ip_block *block; 3619 3620 block = &adev->ip_blocks[i]; 3621 block->status.hw = false; 3622 3623 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3624 3625 if (block->version->type != ip_order[j] || 3626 !block->status.valid) 3627 continue; 3628 3629 r = block->version->funcs->hw_init(&adev->ip_blocks[i]); 3630 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3631 if (r) 3632 return r; 3633 block->status.hw = true; 3634 } 3635 } 3636 3637 return 0; 3638 } 3639 3640 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3641 { 3642 int i, r; 3643 3644 static enum amd_ip_block_type ip_order[] = { 3645 AMD_IP_BLOCK_TYPE_SMC, 3646 AMD_IP_BLOCK_TYPE_DCE, 3647 AMD_IP_BLOCK_TYPE_GFX, 3648 AMD_IP_BLOCK_TYPE_SDMA, 3649 AMD_IP_BLOCK_TYPE_MES, 3650 AMD_IP_BLOCK_TYPE_UVD, 3651 AMD_IP_BLOCK_TYPE_VCE, 3652 AMD_IP_BLOCK_TYPE_VCN, 3653 AMD_IP_BLOCK_TYPE_JPEG 3654 }; 3655 3656 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3657 int j; 3658 struct amdgpu_ip_block *block; 3659 3660 for (j = 0; j < adev->num_ip_blocks; j++) { 3661 block = &adev->ip_blocks[j]; 3662 3663 if (block->version->type != ip_order[i] || 3664 !block->status.valid || 3665 block->status.hw) 3666 continue; 3667 3668 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3669 r = 
block->version->funcs->resume(block);
3670 else
3671 r = block->version->funcs->hw_init(block);
3672
3673 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3674 if (r)
3675 return r;
3676 block->status.hw = true;
3677 }
3678 }
3679
3680 return 0;
3681 }
3682
3683 /**
3684 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3685 *
3686 * @adev: amdgpu_device pointer
3687 *
3688 * First resume function for hardware IPs. The list of all the hardware
3689 * IPs that make up the asic is walked and the resume callbacks are run for
3690 * COMMON, GMC, and IH. resume puts the hardware into a functional state
3691 * after a suspend and updates the software state as necessary. This
3692 * function is also used for restoring the GPU after a GPU reset.
3693 * Returns 0 on success, negative error code on failure.
3694 */
3695 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
3696 {
3697 int i, r;
3698
3699 for (i = 0; i < adev->num_ip_blocks; i++) {
3700 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3701 continue;
3702 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3703 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3704 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3705 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
3706
3707 r = adev->ip_blocks[i].version->funcs->resume(&adev->ip_blocks[i]);
3708 if (r) {
3709 DRM_ERROR("resume of IP block <%s> failed %d\n",
3710 adev->ip_blocks[i].version->funcs->name, r);
3711 return r;
3712 }
3713 adev->ip_blocks[i].status.hw = true;
3714 }
3715 }
3716
3717 return 0;
3718 }
3719
3720 /**
3721 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3722 *
3723 * @adev: amdgpu_device pointer
3724 *
3725 * Second resume function for hardware IPs. The list of all the hardware
3726 * IPs that make up the asic is walked and the resume callbacks are run for
3727 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
3728 * functional state after a suspend and updates the software state as
3729 * necessary. This function is also used for restoring the GPU after a GPU
3730 * reset.
3731 * Returns 0 on success, negative error code on failure.
3732 */
3733 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
3734 {
3735 int i, r;
3736
3737 for (i = 0; i < adev->num_ip_blocks; i++) {
3738 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3739 continue;
3740 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3741 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3742 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3743 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3744 continue;
3745 r = adev->ip_blocks[i].version->funcs->resume(&adev->ip_blocks[i]);
3746 if (r) {
3747 DRM_ERROR("resume of IP block <%s> failed %d\n",
3748 adev->ip_blocks[i].version->funcs->name, r);
3749 return r;
3750 }
3751 adev->ip_blocks[i].status.hw = true;
3752 }
3753
3754 return 0;
3755 }
3756
3757 /**
3758 * amdgpu_device_ip_resume - run resume for hardware IPs
3759 *
3760 * @adev: amdgpu_device pointer
3761 *
3762 * Main resume function for hardware IPs. The hardware IPs
3763 * are split into two resume functions because they are
3764 * also used in recovering from a GPU reset and some additional
3765 * steps need to be taken between them.
In this case (S3/S4) they are 3766 * run sequentially. 3767 * Returns 0 on success, negative error code on failure. 3768 */ 3769 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3770 { 3771 int r; 3772 3773 r = amdgpu_device_ip_resume_phase1(adev); 3774 if (r) 3775 return r; 3776 3777 r = amdgpu_device_fw_loading(adev); 3778 if (r) 3779 return r; 3780 3781 r = amdgpu_device_ip_resume_phase2(adev); 3782 3783 if (adev->mman.buffer_funcs_ring->sched.ready) 3784 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3785 3786 return r; 3787 } 3788 3789 /** 3790 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3791 * 3792 * @adev: amdgpu_device pointer 3793 * 3794 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3795 */ 3796 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3797 { 3798 if (amdgpu_sriov_vf(adev)) { 3799 if (adev->is_atom_fw) { 3800 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3801 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3802 } else { 3803 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3804 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3805 } 3806 3807 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3808 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3809 } 3810 } 3811 3812 /** 3813 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3814 * 3815 * @asic_type: AMD asic type 3816 * 3817 * Check if there is DC (new modesetting infrastructre) support for an asic. 3818 * returns true if DC has support, false if not. 3819 */ 3820 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3821 { 3822 switch (asic_type) { 3823 #ifdef CONFIG_DRM_AMDGPU_SI 3824 case CHIP_HAINAN: 3825 #endif 3826 case CHIP_TOPAZ: 3827 /* chips with no display hardware */ 3828 return false; 3829 #if defined(CONFIG_DRM_AMD_DC) 3830 case CHIP_TAHITI: 3831 case CHIP_PITCAIRN: 3832 case CHIP_VERDE: 3833 case CHIP_OLAND: 3834 /* 3835 * We have systems in the wild with these ASICs that require 3836 * LVDS and VGA support which is not supported with DC. 3837 * 3838 * Fallback to the non-DC driver here by default so as not to 3839 * cause regressions. 3840 */ 3841 #if defined(CONFIG_DRM_AMD_DC_SI) 3842 return amdgpu_dc > 0; 3843 #else 3844 return false; 3845 #endif 3846 case CHIP_BONAIRE: 3847 case CHIP_KAVERI: 3848 case CHIP_KABINI: 3849 case CHIP_MULLINS: 3850 /* 3851 * We have systems in the wild with these ASICs that require 3852 * VGA support which is not supported with DC. 3853 * 3854 * Fallback to the non-DC driver here by default so as not to 3855 * cause regressions. 
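	 *
	 * (DC is used on these parts only when it is explicitly requested,
	 * e.g. with amdgpu.dc=1; the default of -1 keeps the legacy path.)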
3856 */ 3857 return amdgpu_dc > 0; 3858 default: 3859 return amdgpu_dc != 0; 3860 #else 3861 default: 3862 if (amdgpu_dc > 0) 3863 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 3864 return false; 3865 #endif 3866 } 3867 } 3868 3869 /** 3870 * amdgpu_device_has_dc_support - check if dc is supported 3871 * 3872 * @adev: amdgpu_device pointer 3873 * 3874 * Returns true for supported, false for not supported 3875 */ 3876 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3877 { 3878 if (adev->enable_virtual_display || 3879 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3880 return false; 3881 3882 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3883 } 3884 3885 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3886 { 3887 struct amdgpu_device *adev = 3888 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3889 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3890 3891 /* It's a bug to not have a hive within this function */ 3892 if (WARN_ON(!hive)) 3893 return; 3894 3895 /* 3896 * Use task barrier to synchronize all xgmi reset works across the 3897 * hive. task_barrier_enter and task_barrier_exit will block 3898 * until all the threads running the xgmi reset works reach 3899 * those points. task_barrier_full will do both blocks. 3900 */ 3901 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3902 3903 task_barrier_enter(&hive->tb); 3904 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3905 3906 if (adev->asic_reset_res) 3907 goto fail; 3908 3909 task_barrier_exit(&hive->tb); 3910 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3911 3912 if (adev->asic_reset_res) 3913 goto fail; 3914 3915 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 3916 } else { 3917 3918 task_barrier_full(&hive->tb); 3919 adev->asic_reset_res = amdgpu_asic_reset(adev); 3920 } 3921 3922 fail: 3923 if (adev->asic_reset_res) 3924 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3925 adev->asic_reset_res, adev_to_drm(adev)->unique); 3926 amdgpu_put_xgmi_hive(hive); 3927 } 3928 3929 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3930 { 3931 char *input = amdgpu_lockup_timeout; 3932 char *timeout_setting = NULL; 3933 int index = 0; 3934 long timeout; 3935 int ret = 0; 3936 3937 /* 3938 * By default timeout for non compute jobs is 10000 3939 * and 60000 for compute jobs. 3940 * In SR-IOV or passthrough mode, timeout for compute 3941 * jobs are 60000 by default. 3942 */ 3943 adev->gfx_timeout = msecs_to_jiffies(10000); 3944 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3945 if (amdgpu_sriov_vf(adev)) 3946 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3947 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3948 else 3949 adev->compute_timeout = msecs_to_jiffies(60000); 3950 3951 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3952 while ((timeout_setting = strsep(&input, ",")) && 3953 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3954 ret = kstrtol(timeout_setting, 0, &timeout); 3955 if (ret) 3956 return ret; 3957 3958 if (timeout == 0) { 3959 index++; 3960 continue; 3961 } else if (timeout < 0) { 3962 timeout = MAX_SCHEDULE_TIMEOUT; 3963 dev_warn(adev->dev, "lockup timeout disabled"); 3964 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3965 } else { 3966 timeout = msecs_to_jiffies(timeout); 3967 } 3968 3969 switch (index++) { 3970 case 0: 3971 adev->gfx_timeout = timeout; 3972 break; 3973 case 1: 3974 adev->compute_timeout = timeout; 3975 break; 3976 case 2: 3977 adev->sdma_timeout = timeout; 3978 break; 3979 case 3: 3980 adev->video_timeout = timeout; 3981 break; 3982 default: 3983 break; 3984 } 3985 } 3986 /* 3987 * There is only one value specified and 3988 * it should apply to all non-compute jobs. 3989 */ 3990 if (index == 1) { 3991 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3992 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3993 adev->compute_timeout = adev->gfx_timeout; 3994 } 3995 } 3996 3997 return ret; 3998 } 3999 4000 /** 4001 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 4002 * 4003 * @adev: amdgpu_device pointer 4004 * 4005 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 4006 */ 4007 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 4008 { 4009 struct iommu_domain *domain; 4010 4011 domain = iommu_get_domain_for_dev(adev->dev); 4012 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 4013 adev->ram_is_direct_mapped = true; 4014 } 4015 4016 #if defined(CONFIG_HSA_AMD_P2P) 4017 /** 4018 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled. 4019 * 4020 * @adev: amdgpu_device pointer 4021 * 4022 * return if IOMMU remapping bar address 4023 */ 4024 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev) 4025 { 4026 struct iommu_domain *domain; 4027 4028 domain = iommu_get_domain_for_dev(adev->dev); 4029 if (domain && (domain->type == IOMMU_DOMAIN_DMA || 4030 domain->type == IOMMU_DOMAIN_DMA_FQ)) 4031 return true; 4032 4033 return false; 4034 } 4035 #endif 4036 4037 static const struct attribute *amdgpu_dev_attributes[] = { 4038 &dev_attr_pcie_replay_count.attr, 4039 NULL 4040 }; 4041 4042 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 4043 { 4044 if (amdgpu_mcbp == 1) 4045 adev->gfx.mcbp = true; 4046 else if (amdgpu_mcbp == 0) 4047 adev->gfx.mcbp = false; 4048 4049 if (amdgpu_sriov_vf(adev)) 4050 adev->gfx.mcbp = true; 4051 4052 if (adev->gfx.mcbp) 4053 DRM_INFO("MCBP is enabled\n"); 4054 } 4055 4056 /** 4057 * amdgpu_device_init - initialize the driver 4058 * 4059 * @adev: amdgpu_device pointer 4060 * @flags: driver flags 4061 * 4062 * Initializes the driver info and hw (all asics). 4063 * Returns 0 for success or an error on failure. 4064 * Called at driver startup. 
4065 */ 4066 int amdgpu_device_init(struct amdgpu_device *adev, 4067 uint32_t flags) 4068 { 4069 struct drm_device *ddev = adev_to_drm(adev); 4070 struct pci_dev *pdev = adev->pdev; 4071 int r, i; 4072 bool px = false; 4073 u32 max_MBps; 4074 int tmp; 4075 4076 adev->shutdown = false; 4077 adev->flags = flags; 4078 4079 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 4080 adev->asic_type = amdgpu_force_asic_type; 4081 else 4082 adev->asic_type = flags & AMD_ASIC_MASK; 4083 4084 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 4085 if (amdgpu_emu_mode == 1) 4086 adev->usec_timeout *= 10; 4087 adev->gmc.gart_size = 512 * 1024 * 1024; 4088 adev->accel_working = false; 4089 adev->num_rings = 0; 4090 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 4091 adev->mman.buffer_funcs = NULL; 4092 adev->mman.buffer_funcs_ring = NULL; 4093 adev->vm_manager.vm_pte_funcs = NULL; 4094 adev->vm_manager.vm_pte_num_scheds = 0; 4095 adev->gmc.gmc_funcs = NULL; 4096 adev->harvest_ip_mask = 0x0; 4097 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 4098 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 4099 4100 adev->smc_rreg = &amdgpu_invalid_rreg; 4101 adev->smc_wreg = &amdgpu_invalid_wreg; 4102 adev->pcie_rreg = &amdgpu_invalid_rreg; 4103 adev->pcie_wreg = &amdgpu_invalid_wreg; 4104 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 4105 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 4106 adev->pciep_rreg = &amdgpu_invalid_rreg; 4107 adev->pciep_wreg = &amdgpu_invalid_wreg; 4108 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 4109 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 4110 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 4111 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 4112 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 4113 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 4114 adev->didt_rreg = &amdgpu_invalid_rreg; 4115 adev->didt_wreg = &amdgpu_invalid_wreg; 4116 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 4117 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 4118 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 4119 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 4120 4121 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 4122 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 4123 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 4124 4125 /* mutex initialization are all done here so we 4126 * can recall function without having locking issues 4127 */ 4128 mutex_init(&adev->firmware.mutex); 4129 mutex_init(&adev->pm.mutex); 4130 mutex_init(&adev->gfx.gpu_clock_mutex); 4131 mutex_init(&adev->srbm_mutex); 4132 mutex_init(&adev->gfx.pipe_reserve_mutex); 4133 mutex_init(&adev->gfx.gfx_off_mutex); 4134 mutex_init(&adev->gfx.partition_mutex); 4135 mutex_init(&adev->grbm_idx_mutex); 4136 mutex_init(&adev->mn_lock); 4137 mutex_init(&adev->virt.vf_errors.lock); 4138 mutex_init(&adev->virt.rlcg_reg_lock); 4139 hash_init(adev->mn_hash); 4140 mutex_init(&adev->psp.mutex); 4141 mutex_init(&adev->notifier_lock); 4142 mutex_init(&adev->pm.stable_pstate_ctx_lock); 4143 mutex_init(&adev->benchmark_mutex); 4144 mutex_init(&adev->gfx.reset_sem_mutex); 4145 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */ 4146 mutex_init(&adev->enforce_isolation_mutex); 4147 mutex_init(&adev->gfx.kfd_sch_mutex); 4148 4149 amdgpu_device_init_apu_flags(adev); 4150 4151 r = amdgpu_device_check_arguments(adev); 4152 if (r) 4153 return r; 4154 4155 
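
	/*
	 * Spinlocks protecting the indirect register index/data pairs
	 * (SMC, PCIe, UVD context, DIDT, GC_CAC, etc.) and other per-device
	 * bookkeeping used by the register access helpers.
	 */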
	spin_lock_init(&adev->mmio_idx_lock);
	spin_lock_init(&adev->smc_idx_lock);
	spin_lock_init(&adev->pcie_idx_lock);
	spin_lock_init(&adev->uvd_ctx_idx_lock);
	spin_lock_init(&adev->didt_idx_lock);
	spin_lock_init(&adev->gc_cac_idx_lock);
	spin_lock_init(&adev->se_cac_idx_lock);
	spin_lock_init(&adev->audio_endpt_idx_lock);
	spin_lock_init(&adev->mm_stats.lock);
	spin_lock_init(&adev->wb.lock);

	INIT_LIST_HEAD(&adev->reset_list);

	INIT_LIST_HEAD(&adev->ras_list);

	INIT_LIST_HEAD(&adev->pm.od_kobj_list);

	INIT_DELAYED_WORK(&adev->delayed_init_work,
			  amdgpu_device_delayed_init_work_handler);
	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
			  amdgpu_device_delay_enable_gfx_off);
	/*
	 * Initialize the enforce_isolation work structures for each XCP
	 * partition. This work handler is responsible for enforcing shader
	 * isolation on AMD GPUs. It counts the number of emitted fences for
	 * each GFX and compute ring. If there are any fences, it schedules
	 * the `enforce_isolation_work` to be run after a delay. If there are
	 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the
	 * runqueue.
	 */
	for (i = 0; i < MAX_XCP; i++) {
		INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work,
				  amdgpu_gfx_enforce_isolation_handler);
		adev->gfx.enforce_isolation[i].adev = adev;
		adev->gfx.enforce_isolation[i].xcp_id = i;
	}

	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);

	adev->gfx.gfx_off_req_count = 1;
	adev->gfx.gfx_off_residency = 0;
	adev->gfx.gfx_off_entrycount = 0;
	adev->pm.ac_power = power_supply_is_system_supplied() > 0;

	atomic_set(&adev->throttling_logging_enabled, 1);
	/*
	 * If throttling continues, logging will be performed every minute
	 * to avoid log flooding. "-1" is subtracted since the thermal
	 * throttling interrupt comes every second. Thus, the total logging
	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
	 * for the throttling interrupt) = 60 seconds.
	 */
	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);

	/* Registers mapping */
	/* TODO: block userspace mapping of io register */
	if (adev->asic_type >= CHIP_BONAIRE) {
		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
	} else {
		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
	}

	for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
		atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);

	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
	if (!adev->rmmio)
		return -ENOMEM;

	DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
	DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);

	/*
	 * Reset domain needs to be present early, before the XGMI hive is
	 * discovered (if any) and initialized, to use the reset sem and
	 * in_gpu reset flag early on during init and before calling RREG32.
	 */
	adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
	if (!adev->reset_domain)
		return -ENOMEM;

	/* detect hw virtualization here */
	amdgpu_detect_virtualization(adev);

	amdgpu_device_get_pcie_info(adev);

	r = amdgpu_device_get_job_timeout_settings(adev);
	if (r) {
		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
		return r;
	}

	amdgpu_device_set_mcbp(adev);

	/*
	 * By default, use default mode where all blocks are expected to be
	 * initialized. At present a 'swinit' of blocks is required to be
	 * completed before the need for a different level is detected.
	 */
	amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT);
	/* early init functions */
	r = amdgpu_device_ip_early_init(adev);
	if (r)
		return r;

	/* Get rid of things like offb */
	r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver);
	if (r)
		return r;

	/* Enable TMZ based on IP_VERSION */
	amdgpu_gmc_tmz_set(adev);

	if (amdgpu_sriov_vf(adev) &&
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
		/* VF MMIO access (except mailbox range) from CPU
		 * will be blocked during sriov runtime
		 */
		adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;

	amdgpu_gmc_noretry_set(adev);
	/* Need to get xgmi info early to decide the reset behavior */
	if (adev->gmc.xgmi.supported) {
		r = adev->gfxhub.funcs->get_xgmi_info(adev);
		if (r)
			return r;
	}

	/* enable PCIe atomic ops */
	if (amdgpu_sriov_vf(adev)) {
		if (adev->virt.fw_reserve.p_pf2vf)
			adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
						      adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
				(PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
	/* APUs with gfx9 onwards don't rely on PCIe atomics; their internal
	 * path natively supports atomics, so set have_atomics_support to true.
	 */
	} else if ((adev->flags & AMD_IS_APU) &&
		   (amdgpu_ip_version(adev, GC_HWIP, 0) >
		    IP_VERSION(9, 0, 0))) {
		adev->have_atomics_support = true;
	} else {
		adev->have_atomics_support =
			!pci_enable_atomic_ops_to_root(adev->pdev,
						       PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
						       PCI_EXP_DEVCAP2_ATOMIC_COMP64);
	}

	if (!adev->have_atomics_support)
		dev_info(adev->dev, "PCIe atomic ops are not supported\n");

	/* doorbell bar mapping and doorbell index init */
	amdgpu_doorbell_init(adev);

	if (amdgpu_emu_mode == 1) {
		/* post the asic on emulation mode */
		emu_soc_asic_init(adev);
		goto fence_driver_init;
	}

	amdgpu_reset_init(adev);

	/* detect if we are with an SRIOV vbios */
	if (adev->bios)
		amdgpu_device_detect_sriov_bios(adev);

	/* check if we need to reset the asic
	 * E.g., driver was not cleanly unloaded previously, etc.
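	 * For XGMI hives the reset is deferred here: the device is brought up
	 * at the minimal init level and the whole hive is reset later via
	 * amdgpu_xgmi_reset_on_init().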
4326 */ 4327 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4328 if (adev->gmc.xgmi.num_physical_nodes) { 4329 dev_info(adev->dev, "Pending hive reset.\n"); 4330 amdgpu_set_init_level(adev, 4331 AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 4332 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && 4333 !amdgpu_device_has_display_hardware(adev)) { 4334 r = psp_gpu_reset(adev); 4335 } else { 4336 tmp = amdgpu_reset_method; 4337 /* It should do a default reset when loading or reloading the driver, 4338 * regardless of the module parameter reset_method. 4339 */ 4340 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4341 r = amdgpu_asic_reset(adev); 4342 amdgpu_reset_method = tmp; 4343 } 4344 4345 if (r) { 4346 dev_err(adev->dev, "asic reset on init failed\n"); 4347 goto failed; 4348 } 4349 } 4350 4351 /* Post card if necessary */ 4352 if (amdgpu_device_need_post(adev)) { 4353 if (!adev->bios) { 4354 dev_err(adev->dev, "no vBIOS found\n"); 4355 r = -EINVAL; 4356 goto failed; 4357 } 4358 DRM_INFO("GPU posting now...\n"); 4359 r = amdgpu_device_asic_init(adev); 4360 if (r) { 4361 dev_err(adev->dev, "gpu post error!\n"); 4362 goto failed; 4363 } 4364 } 4365 4366 if (adev->bios) { 4367 if (adev->is_atom_fw) { 4368 /* Initialize clocks */ 4369 r = amdgpu_atomfirmware_get_clock_info(adev); 4370 if (r) { 4371 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4372 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4373 goto failed; 4374 } 4375 } else { 4376 /* Initialize clocks */ 4377 r = amdgpu_atombios_get_clock_info(adev); 4378 if (r) { 4379 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4380 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4381 goto failed; 4382 } 4383 /* init i2c buses */ 4384 if (!amdgpu_device_has_dc_support(adev)) 4385 amdgpu_atombios_i2c_init(adev); 4386 } 4387 } 4388 4389 fence_driver_init: 4390 /* Fence driver */ 4391 r = amdgpu_fence_driver_sw_init(adev); 4392 if (r) { 4393 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4394 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4395 goto failed; 4396 } 4397 4398 /* init the mode config */ 4399 drm_mode_config_init(adev_to_drm(adev)); 4400 4401 r = amdgpu_device_ip_init(adev); 4402 if (r) { 4403 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4404 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4405 goto release_ras_con; 4406 } 4407 4408 amdgpu_fence_driver_hw_init(adev); 4409 4410 dev_info(adev->dev, 4411 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4412 adev->gfx.config.max_shader_engines, 4413 adev->gfx.config.max_sh_per_se, 4414 adev->gfx.config.max_cu_per_sh, 4415 adev->gfx.cu_info.number); 4416 4417 adev->accel_working = true; 4418 4419 amdgpu_vm_check_compute_bug(adev); 4420 4421 /* Initialize the buffer migration limit. */ 4422 if (amdgpu_moverate >= 0) 4423 max_MBps = amdgpu_moverate; 4424 else 4425 max_MBps = 8; /* Allow 8 MB/s. */ 4426 /* Get a log2 for easy divisions. */ 4427 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4428 4429 /* 4430 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4431 * Otherwise the mgpu fan boost feature will be skipped due to the 4432 * gpu instance is counted less. 4433 */ 4434 amdgpu_register_gpu_instance(adev); 4435 4436 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4437 * explicit gating rather than handling it automatically. 
4438 */ 4439 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 4440 r = amdgpu_device_ip_late_init(adev); 4441 if (r) { 4442 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4443 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4444 goto release_ras_con; 4445 } 4446 /* must succeed. */ 4447 amdgpu_ras_resume(adev); 4448 queue_delayed_work(system_wq, &adev->delayed_init_work, 4449 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4450 } 4451 4452 if (amdgpu_sriov_vf(adev)) { 4453 amdgpu_virt_release_full_gpu(adev, true); 4454 flush_delayed_work(&adev->delayed_init_work); 4455 } 4456 4457 /* 4458 * Place those sysfs registering after `late_init`. As some of those 4459 * operations performed in `late_init` might affect the sysfs 4460 * interfaces creating. 4461 */ 4462 r = amdgpu_atombios_sysfs_init(adev); 4463 if (r) 4464 drm_err(&adev->ddev, 4465 "registering atombios sysfs failed (%d).\n", r); 4466 4467 r = amdgpu_pm_sysfs_init(adev); 4468 if (r) 4469 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 4470 4471 r = amdgpu_ucode_sysfs_init(adev); 4472 if (r) { 4473 adev->ucode_sysfs_en = false; 4474 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 4475 } else 4476 adev->ucode_sysfs_en = true; 4477 4478 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 4479 if (r) 4480 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4481 4482 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4483 if (r) 4484 dev_err(adev->dev, 4485 "Could not create amdgpu board attributes\n"); 4486 4487 amdgpu_fru_sysfs_init(adev); 4488 amdgpu_reg_state_sysfs_init(adev); 4489 amdgpu_xcp_cfg_sysfs_init(adev); 4490 4491 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4492 r = amdgpu_pmu_init(adev); 4493 if (r) 4494 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4495 4496 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4497 if (amdgpu_device_cache_pci_state(adev->pdev)) 4498 pci_restore_state(pdev); 4499 4500 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4501 /* this will fail for cards that aren't VGA class devices, just 4502 * ignore it 4503 */ 4504 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4505 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4506 4507 px = amdgpu_device_supports_px(ddev); 4508 4509 if (px || (!dev_is_removable(&adev->pdev->dev) && 4510 apple_gmux_detect(NULL, NULL))) 4511 vga_switcheroo_register_client(adev->pdev, 4512 &amdgpu_switcheroo_ops, px); 4513 4514 if (px) 4515 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4516 4517 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 4518 amdgpu_xgmi_reset_on_init(adev); 4519 4520 amdgpu_device_check_iommu_direct_map(adev); 4521 4522 return 0; 4523 4524 release_ras_con: 4525 if (amdgpu_sriov_vf(adev)) 4526 amdgpu_virt_release_full_gpu(adev, true); 4527 4528 /* failed in exclusive mode due to timeout */ 4529 if (amdgpu_sriov_vf(adev) && 4530 !amdgpu_sriov_runtime(adev) && 4531 amdgpu_virt_mmio_blocked(adev) && 4532 !amdgpu_virt_wait_reset(adev)) { 4533 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4534 /* Don't send request since VF is inactive. 
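		 * Clear the runtime cap and virt ops so no further requests are
		 * sent to the host, and return -EAGAIN so the probe can be
		 * retried.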
*/ 4535 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4536 adev->virt.ops = NULL; 4537 r = -EAGAIN; 4538 } 4539 amdgpu_release_ras_context(adev); 4540 4541 failed: 4542 amdgpu_vf_error_trans_all(adev); 4543 4544 return r; 4545 } 4546 4547 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4548 { 4549 4550 /* Clear all CPU mappings pointing to this device */ 4551 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4552 4553 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4554 amdgpu_doorbell_fini(adev); 4555 4556 iounmap(adev->rmmio); 4557 adev->rmmio = NULL; 4558 if (adev->mman.aper_base_kaddr) 4559 iounmap(adev->mman.aper_base_kaddr); 4560 adev->mman.aper_base_kaddr = NULL; 4561 4562 /* Memory manager related */ 4563 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4564 arch_phys_wc_del(adev->gmc.vram_mtrr); 4565 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4566 } 4567 } 4568 4569 /** 4570 * amdgpu_device_fini_hw - tear down the driver 4571 * 4572 * @adev: amdgpu_device pointer 4573 * 4574 * Tear down the driver info (all asics). 4575 * Called at driver shutdown. 4576 */ 4577 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4578 { 4579 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4580 flush_delayed_work(&adev->delayed_init_work); 4581 4582 if (adev->mman.initialized) 4583 drain_workqueue(adev->mman.bdev.wq); 4584 adev->shutdown = true; 4585 4586 /* make sure IB test finished before entering exclusive mode 4587 * to avoid preemption on IB test 4588 */ 4589 if (amdgpu_sriov_vf(adev)) { 4590 amdgpu_virt_request_full_gpu(adev, false); 4591 amdgpu_virt_fini_data_exchange(adev); 4592 } 4593 4594 /* disable all interrupts */ 4595 amdgpu_irq_disable_all(adev); 4596 if (adev->mode_info.mode_config_initialized) { 4597 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4598 drm_helper_force_disable_all(adev_to_drm(adev)); 4599 else 4600 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4601 } 4602 amdgpu_fence_driver_hw_fini(adev); 4603 4604 if (adev->pm.sysfs_initialized) 4605 amdgpu_pm_sysfs_fini(adev); 4606 if (adev->ucode_sysfs_en) 4607 amdgpu_ucode_sysfs_fini(adev); 4608 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4609 amdgpu_fru_sysfs_fini(adev); 4610 4611 amdgpu_reg_state_sysfs_fini(adev); 4612 amdgpu_xcp_cfg_sysfs_fini(adev); 4613 4614 /* disable ras feature must before hw fini */ 4615 amdgpu_ras_pre_fini(adev); 4616 4617 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4618 4619 amdgpu_device_ip_fini_early(adev); 4620 4621 amdgpu_irq_fini_hw(adev); 4622 4623 if (adev->mman.initialized) 4624 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4625 4626 amdgpu_gart_dummy_page_fini(adev); 4627 4628 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4629 amdgpu_device_unmap_mmio(adev); 4630 4631 } 4632 4633 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4634 { 4635 int idx; 4636 bool px; 4637 4638 amdgpu_fence_driver_sw_fini(adev); 4639 amdgpu_device_ip_fini(adev); 4640 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4641 adev->accel_working = false; 4642 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4643 4644 amdgpu_reset_fini(adev); 4645 4646 /* free i2c buses */ 4647 if (!amdgpu_device_has_dc_support(adev)) 4648 amdgpu_i2c_fini(adev); 4649 4650 if (amdgpu_emu_mode != 1) 4651 amdgpu_atombios_fini(adev); 4652 4653 kfree(adev->bios); 4654 adev->bios = NULL; 4655 4656 kfree(adev->fru_info); 4657 adev->fru_info = NULL; 4658 4659 px = 
amdgpu_device_supports_px(adev_to_drm(adev)); 4660 4661 if (px || (!dev_is_removable(&adev->pdev->dev) && 4662 apple_gmux_detect(NULL, NULL))) 4663 vga_switcheroo_unregister_client(adev->pdev); 4664 4665 if (px) 4666 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4667 4668 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4669 vga_client_unregister(adev->pdev); 4670 4671 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4672 4673 iounmap(adev->rmmio); 4674 adev->rmmio = NULL; 4675 amdgpu_doorbell_fini(adev); 4676 drm_dev_exit(idx); 4677 } 4678 4679 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4680 amdgpu_pmu_fini(adev); 4681 if (adev->mman.discovery_bin) 4682 amdgpu_discovery_fini(adev); 4683 4684 amdgpu_reset_put_reset_domain(adev->reset_domain); 4685 adev->reset_domain = NULL; 4686 4687 kfree(adev->pci_state); 4688 4689 } 4690 4691 /** 4692 * amdgpu_device_evict_resources - evict device resources 4693 * @adev: amdgpu device object 4694 * 4695 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4696 * of the vram memory type. Mainly used for evicting device resources 4697 * at suspend time. 4698 * 4699 */ 4700 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4701 { 4702 int ret; 4703 4704 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4705 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4706 return 0; 4707 4708 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4709 if (ret) 4710 DRM_WARN("evicting device resources failed\n"); 4711 return ret; 4712 } 4713 4714 /* 4715 * Suspend & resume. 4716 */ 4717 /** 4718 * amdgpu_device_prepare - prepare for device suspend 4719 * 4720 * @dev: drm dev pointer 4721 * 4722 * Prepare to put the hw in the suspend state (all asics). 4723 * Returns 0 for success or an error on failure. 4724 * Called at driver suspend. 4725 */ 4726 int amdgpu_device_prepare(struct drm_device *dev) 4727 { 4728 struct amdgpu_device *adev = drm_to_adev(dev); 4729 int i, r; 4730 4731 amdgpu_choose_low_power_state(adev); 4732 4733 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4734 return 0; 4735 4736 /* Evict the majority of BOs before starting suspend sequence */ 4737 r = amdgpu_device_evict_resources(adev); 4738 if (r) 4739 goto unprepare; 4740 4741 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 4742 4743 for (i = 0; i < adev->num_ip_blocks; i++) { 4744 if (!adev->ip_blocks[i].status.valid) 4745 continue; 4746 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 4747 continue; 4748 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]); 4749 if (r) 4750 goto unprepare; 4751 } 4752 4753 return 0; 4754 4755 unprepare: 4756 adev->in_s0ix = adev->in_s3 = false; 4757 4758 return r; 4759 } 4760 4761 /** 4762 * amdgpu_device_suspend - initiate device suspend 4763 * 4764 * @dev: drm dev pointer 4765 * @fbcon : notify the fbdev of suspend 4766 * 4767 * Puts the hw in the suspend state (all asics). 4768 * Returns 0 for success or an error on failure. 4769 * Called at driver suspend. 
4770 */ 4771 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4772 { 4773 struct amdgpu_device *adev = drm_to_adev(dev); 4774 int r = 0; 4775 4776 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4777 return 0; 4778 4779 adev->in_suspend = true; 4780 4781 if (amdgpu_sriov_vf(adev)) { 4782 amdgpu_virt_fini_data_exchange(adev); 4783 r = amdgpu_virt_request_full_gpu(adev, false); 4784 if (r) 4785 return r; 4786 } 4787 4788 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4789 DRM_WARN("smart shift update failed\n"); 4790 4791 if (fbcon) 4792 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4793 4794 cancel_delayed_work_sync(&adev->delayed_init_work); 4795 4796 amdgpu_ras_suspend(adev); 4797 4798 amdgpu_device_ip_suspend_phase1(adev); 4799 4800 if (!adev->in_s0ix) 4801 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4802 4803 r = amdgpu_device_evict_resources(adev); 4804 if (r) 4805 return r; 4806 4807 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4808 4809 amdgpu_fence_driver_hw_fini(adev); 4810 4811 amdgpu_device_ip_suspend_phase2(adev); 4812 4813 if (amdgpu_sriov_vf(adev)) 4814 amdgpu_virt_release_full_gpu(adev, false); 4815 4816 r = amdgpu_dpm_notify_rlc_state(adev, false); 4817 if (r) 4818 return r; 4819 4820 return 0; 4821 } 4822 4823 /** 4824 * amdgpu_device_resume - initiate device resume 4825 * 4826 * @dev: drm dev pointer 4827 * @fbcon : notify the fbdev of resume 4828 * 4829 * Bring the hw back to operating state (all asics). 4830 * Returns 0 for success or an error on failure. 4831 * Called at driver resume. 4832 */ 4833 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4834 { 4835 struct amdgpu_device *adev = drm_to_adev(dev); 4836 int r = 0; 4837 4838 if (amdgpu_sriov_vf(adev)) { 4839 r = amdgpu_virt_request_full_gpu(adev, true); 4840 if (r) 4841 return r; 4842 } 4843 4844 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4845 return 0; 4846 4847 if (adev->in_s0ix) 4848 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4849 4850 /* post card */ 4851 if (amdgpu_device_need_post(adev)) { 4852 r = amdgpu_device_asic_init(adev); 4853 if (r) 4854 dev_err(adev->dev, "amdgpu asic init failed\n"); 4855 } 4856 4857 r = amdgpu_device_ip_resume(adev); 4858 4859 if (r) { 4860 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4861 goto exit; 4862 } 4863 amdgpu_fence_driver_hw_init(adev); 4864 4865 if (!adev->in_s0ix) { 4866 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4867 if (r) 4868 goto exit; 4869 } 4870 4871 r = amdgpu_device_ip_late_init(adev); 4872 if (r) 4873 goto exit; 4874 4875 queue_delayed_work(system_wq, &adev->delayed_init_work, 4876 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4877 exit: 4878 if (amdgpu_sriov_vf(adev)) { 4879 amdgpu_virt_init_data_exchange(adev); 4880 amdgpu_virt_release_full_gpu(adev, true); 4881 } 4882 4883 if (r) 4884 return r; 4885 4886 /* Make sure IB tests flushed */ 4887 flush_delayed_work(&adev->delayed_init_work); 4888 4889 if (fbcon) 4890 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4891 4892 amdgpu_ras_resume(adev); 4893 4894 if (adev->mode_info.num_crtc) { 4895 /* 4896 * Most of the connector probing functions try to acquire runtime pm 4897 * refs to ensure that the GPU is powered on when connector polling is 4898 * performed. Since we're calling this from a runtime PM callback, 4899 * trying to acquire rpm refs will cause us to deadlock. 
4900 * 4901 * Since we're guaranteed to be holding the rpm lock, it's safe to 4902 * temporarily disable the rpm helpers so this doesn't deadlock us. 4903 */ 4904 #ifdef CONFIG_PM 4905 dev->dev->power.disable_depth++; 4906 #endif 4907 if (!adev->dc_enabled) 4908 drm_helper_hpd_irq_event(dev); 4909 else 4910 drm_kms_helper_hotplug_event(dev); 4911 #ifdef CONFIG_PM 4912 dev->dev->power.disable_depth--; 4913 #endif 4914 } 4915 adev->in_suspend = false; 4916 4917 if (adev->enable_mes) 4918 amdgpu_mes_self_test(adev); 4919 4920 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4921 DRM_WARN("smart shift update failed\n"); 4922 4923 return 0; 4924 } 4925 4926 /** 4927 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4928 * 4929 * @adev: amdgpu_device pointer 4930 * 4931 * The list of all the hardware IPs that make up the asic is walked and 4932 * the check_soft_reset callbacks are run. check_soft_reset determines 4933 * if the asic is still hung or not. 4934 * Returns true if any of the IPs are still in a hung state, false if not. 4935 */ 4936 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4937 { 4938 int i; 4939 bool asic_hang = false; 4940 4941 if (amdgpu_sriov_vf(adev)) 4942 return true; 4943 4944 if (amdgpu_asic_need_full_reset(adev)) 4945 return true; 4946 4947 for (i = 0; i < adev->num_ip_blocks; i++) { 4948 if (!adev->ip_blocks[i].status.valid) 4949 continue; 4950 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4951 adev->ip_blocks[i].status.hang = 4952 adev->ip_blocks[i].version->funcs->check_soft_reset( 4953 &adev->ip_blocks[i]); 4954 if (adev->ip_blocks[i].status.hang) { 4955 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4956 asic_hang = true; 4957 } 4958 } 4959 return asic_hang; 4960 } 4961 4962 /** 4963 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4964 * 4965 * @adev: amdgpu_device pointer 4966 * 4967 * The list of all the hardware IPs that make up the asic is walked and the 4968 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4969 * handles any IP specific hardware or software state changes that are 4970 * necessary for a soft reset to succeed. 4971 * Returns 0 on success, negative error code on failure. 4972 */ 4973 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4974 { 4975 int i, r = 0; 4976 4977 for (i = 0; i < adev->num_ip_blocks; i++) { 4978 if (!adev->ip_blocks[i].status.valid) 4979 continue; 4980 if (adev->ip_blocks[i].status.hang && 4981 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4982 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]); 4983 if (r) 4984 return r; 4985 } 4986 } 4987 4988 return 0; 4989 } 4990 4991 /** 4992 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4993 * 4994 * @adev: amdgpu_device pointer 4995 * 4996 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4997 * reset is necessary to recover. 4998 * Returns true if a full asic reset is required, false if not. 
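 * (In practice, a hang in the GMC, SMC, ACP, DCE or PSP block forces a
 * full reset.)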
4999 */ 5000 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 5001 { 5002 int i; 5003 5004 if (amdgpu_asic_need_full_reset(adev)) 5005 return true; 5006 5007 for (i = 0; i < adev->num_ip_blocks; i++) { 5008 if (!adev->ip_blocks[i].status.valid) 5009 continue; 5010 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 5011 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 5012 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 5013 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 5014 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 5015 if (adev->ip_blocks[i].status.hang) { 5016 dev_info(adev->dev, "Some block need full reset!\n"); 5017 return true; 5018 } 5019 } 5020 } 5021 return false; 5022 } 5023 5024 /** 5025 * amdgpu_device_ip_soft_reset - do a soft reset 5026 * 5027 * @adev: amdgpu_device pointer 5028 * 5029 * The list of all the hardware IPs that make up the asic is walked and the 5030 * soft_reset callbacks are run if the block is hung. soft_reset handles any 5031 * IP specific hardware or software state changes that are necessary to soft 5032 * reset the IP. 5033 * Returns 0 on success, negative error code on failure. 5034 */ 5035 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 5036 { 5037 int i, r = 0; 5038 5039 for (i = 0; i < adev->num_ip_blocks; i++) { 5040 if (!adev->ip_blocks[i].status.valid) 5041 continue; 5042 if (adev->ip_blocks[i].status.hang && 5043 adev->ip_blocks[i].version->funcs->soft_reset) { 5044 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]); 5045 if (r) 5046 return r; 5047 } 5048 } 5049 5050 return 0; 5051 } 5052 5053 /** 5054 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 5055 * 5056 * @adev: amdgpu_device pointer 5057 * 5058 * The list of all the hardware IPs that make up the asic is walked and the 5059 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 5060 * handles any IP specific hardware or software state changes that are 5061 * necessary after the IP has been soft reset. 5062 * Returns 0 on success, negative error code on failure. 
5063 */ 5064 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 5065 { 5066 int i, r = 0; 5067 5068 for (i = 0; i < adev->num_ip_blocks; i++) { 5069 if (!adev->ip_blocks[i].status.valid) 5070 continue; 5071 if (adev->ip_blocks[i].status.hang && 5072 adev->ip_blocks[i].version->funcs->post_soft_reset) 5073 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]); 5074 if (r) 5075 return r; 5076 } 5077 5078 return 0; 5079 } 5080 5081 /** 5082 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 5083 * 5084 * @adev: amdgpu_device pointer 5085 * @reset_context: amdgpu reset context pointer 5086 * 5087 * do VF FLR and reinitialize Asic 5088 * return 0 means succeeded otherwise failed 5089 */ 5090 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 5091 struct amdgpu_reset_context *reset_context) 5092 { 5093 int r; 5094 struct amdgpu_hive_info *hive = NULL; 5095 5096 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { 5097 if (!amdgpu_ras_get_fed_status(adev)) 5098 amdgpu_virt_ready_to_reset(adev); 5099 amdgpu_virt_wait_reset(adev); 5100 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5101 r = amdgpu_virt_request_full_gpu(adev, true); 5102 } else { 5103 r = amdgpu_virt_reset_gpu(adev); 5104 } 5105 if (r) 5106 return r; 5107 5108 amdgpu_ras_set_fed(adev, false); 5109 amdgpu_irq_gpu_reset_resume_helper(adev); 5110 5111 /* some sw clean up VF needs to do before recover */ 5112 amdgpu_virt_post_reset(adev); 5113 5114 /* Resume IP prior to SMC */ 5115 r = amdgpu_device_ip_reinit_early_sriov(adev); 5116 if (r) 5117 return r; 5118 5119 amdgpu_virt_init_data_exchange(adev); 5120 5121 r = amdgpu_device_fw_loading(adev); 5122 if (r) 5123 return r; 5124 5125 /* now we are okay to resume SMC/CP/SDMA */ 5126 r = amdgpu_device_ip_reinit_late_sriov(adev); 5127 if (r) 5128 return r; 5129 5130 hive = amdgpu_get_xgmi_hive(adev); 5131 /* Update PSP FW topology after reset */ 5132 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5133 r = amdgpu_xgmi_update_topology(hive, adev); 5134 if (hive) 5135 amdgpu_put_xgmi_hive(hive); 5136 if (r) 5137 return r; 5138 5139 r = amdgpu_ib_ring_tests(adev); 5140 if (r) 5141 return r; 5142 5143 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) 5144 amdgpu_inc_vram_lost(adev); 5145 5146 /* need to be called during full access so we can't do it later like 5147 * bare-metal does. 
5148 */ 5149 amdgpu_amdkfd_post_reset(adev); 5150 amdgpu_virt_release_full_gpu(adev, true); 5151 5152 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5153 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) || 5154 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 5155 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 5156 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5157 amdgpu_ras_resume(adev); 5158 return 0; 5159 } 5160 5161 /** 5162 * amdgpu_device_has_job_running - check if there is any job in mirror list 5163 * 5164 * @adev: amdgpu_device pointer 5165 * 5166 * check if there is any job in mirror list 5167 */ 5168 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5169 { 5170 int i; 5171 struct drm_sched_job *job; 5172 5173 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5174 struct amdgpu_ring *ring = adev->rings[i]; 5175 5176 if (!amdgpu_ring_sched_ready(ring)) 5177 continue; 5178 5179 spin_lock(&ring->sched.job_list_lock); 5180 job = list_first_entry_or_null(&ring->sched.pending_list, 5181 struct drm_sched_job, list); 5182 spin_unlock(&ring->sched.job_list_lock); 5183 if (job) 5184 return true; 5185 } 5186 return false; 5187 } 5188 5189 /** 5190 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5191 * 5192 * @adev: amdgpu_device pointer 5193 * 5194 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5195 * a hung GPU. 5196 */ 5197 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5198 { 5199 5200 if (amdgpu_gpu_recovery == 0) 5201 goto disabled; 5202 5203 /* Skip soft reset check in fatal error mode */ 5204 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5205 return true; 5206 5207 if (amdgpu_sriov_vf(adev)) 5208 return true; 5209 5210 if (amdgpu_gpu_recovery == -1) { 5211 switch (adev->asic_type) { 5212 #ifdef CONFIG_DRM_AMDGPU_SI 5213 case CHIP_VERDE: 5214 case CHIP_TAHITI: 5215 case CHIP_PITCAIRN: 5216 case CHIP_OLAND: 5217 case CHIP_HAINAN: 5218 #endif 5219 #ifdef CONFIG_DRM_AMDGPU_CIK 5220 case CHIP_KAVERI: 5221 case CHIP_KABINI: 5222 case CHIP_MULLINS: 5223 #endif 5224 case CHIP_CARRIZO: 5225 case CHIP_STONEY: 5226 case CHIP_CYAN_SKILLFISH: 5227 goto disabled; 5228 default: 5229 break; 5230 } 5231 } 5232 5233 return true; 5234 5235 disabled: 5236 dev_info(adev->dev, "GPU recovery disabled.\n"); 5237 return false; 5238 } 5239 5240 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5241 { 5242 u32 i; 5243 int ret = 0; 5244 5245 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5246 5247 dev_info(adev->dev, "GPU mode1 reset\n"); 5248 5249 /* Cache the state before bus master disable. The saved config space 5250 * values are used in other cases like restore after mode-2 reset. 
5251 */ 5252 amdgpu_device_cache_pci_state(adev->pdev); 5253 5254 /* disable BM */ 5255 pci_clear_master(adev->pdev); 5256 5257 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5258 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5259 ret = amdgpu_dpm_mode1_reset(adev); 5260 } else { 5261 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5262 ret = psp_gpu_reset(adev); 5263 } 5264 5265 if (ret) 5266 goto mode1_reset_failed; 5267 5268 amdgpu_device_load_pci_state(adev->pdev); 5269 ret = amdgpu_psp_wait_for_bootloader(adev); 5270 if (ret) 5271 goto mode1_reset_failed; 5272 5273 /* wait for asic to come out of reset */ 5274 for (i = 0; i < adev->usec_timeout; i++) { 5275 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5276 5277 if (memsize != 0xffffffff) 5278 break; 5279 udelay(1); 5280 } 5281 5282 if (i >= adev->usec_timeout) { 5283 ret = -ETIMEDOUT; 5284 goto mode1_reset_failed; 5285 } 5286 5287 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5288 5289 return 0; 5290 5291 mode1_reset_failed: 5292 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5293 return ret; 5294 } 5295 5296 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5297 struct amdgpu_reset_context *reset_context) 5298 { 5299 int i, r = 0; 5300 struct amdgpu_job *job = NULL; 5301 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; 5302 bool need_full_reset = 5303 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5304 5305 if (reset_context->reset_req_dev == adev) 5306 job = reset_context->job; 5307 5308 if (amdgpu_sriov_vf(adev)) 5309 amdgpu_virt_pre_reset(adev); 5310 5311 amdgpu_fence_driver_isr_toggle(adev, true); 5312 5313 /* block all schedulers and reset given job's ring */ 5314 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5315 struct amdgpu_ring *ring = adev->rings[i]; 5316 5317 if (!amdgpu_ring_sched_ready(ring)) 5318 continue; 5319 5320 /* Clear job fence from fence drv to avoid force_completion 5321 * leave NULL and vm flush fence in fence drv 5322 */ 5323 amdgpu_fence_driver_clear_job_fences(ring); 5324 5325 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5326 amdgpu_fence_driver_force_completion(ring); 5327 } 5328 5329 amdgpu_fence_driver_isr_toggle(adev, false); 5330 5331 if (job && job->vm) 5332 drm_sched_increase_karma(&job->base); 5333 5334 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5335 /* If reset handler not implemented, continue; otherwise return */ 5336 if (r == -EOPNOTSUPP) 5337 r = 0; 5338 else 5339 return r; 5340 5341 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5342 if (!amdgpu_sriov_vf(adev)) { 5343 5344 if (!need_full_reset) 5345 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5346 5347 if (!need_full_reset && amdgpu_gpu_recovery && 5348 amdgpu_device_ip_check_soft_reset(adev)) { 5349 amdgpu_device_ip_pre_soft_reset(adev); 5350 r = amdgpu_device_ip_soft_reset(adev); 5351 amdgpu_device_ip_post_soft_reset(adev); 5352 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5353 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5354 need_full_reset = true; 5355 } 5356 } 5357 5358 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { 5359 dev_info(tmp_adev->dev, "Dumping IP State\n"); 5360 /* Trigger ip dump before we reset the asic */ 5361 for (i = 0; i < tmp_adev->num_ip_blocks; i++) 5362 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) 5363 tmp_adev->ip_blocks[i].version->funcs 5364 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]); 5365 
dev_info(tmp_adev->dev, "Dumping IP State Completed\n"); 5366 } 5367 5368 if (need_full_reset) 5369 r = amdgpu_device_ip_suspend(adev); 5370 if (need_full_reset) 5371 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5372 else 5373 clear_bit(AMDGPU_NEED_FULL_RESET, 5374 &reset_context->flags); 5375 } 5376 5377 return r; 5378 } 5379 5380 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) 5381 { 5382 struct list_head *device_list_handle; 5383 bool full_reset, vram_lost = false; 5384 struct amdgpu_device *tmp_adev; 5385 int r; 5386 5387 device_list_handle = reset_context->reset_device_list; 5388 5389 if (!device_list_handle) 5390 return -EINVAL; 5391 5392 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5393 5394 r = 0; 5395 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5396 /* After reset, it's default init level */ 5397 amdgpu_set_init_level(tmp_adev, AMDGPU_INIT_LEVEL_DEFAULT); 5398 if (full_reset) { 5399 /* post card */ 5400 amdgpu_ras_set_fed(tmp_adev, false); 5401 r = amdgpu_device_asic_init(tmp_adev); 5402 if (r) { 5403 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5404 } else { 5405 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5406 5407 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5408 if (r) 5409 goto out; 5410 5411 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5412 5413 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 5414 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job); 5415 5416 if (vram_lost) { 5417 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5418 amdgpu_inc_vram_lost(tmp_adev); 5419 } 5420 5421 r = amdgpu_device_fw_loading(tmp_adev); 5422 if (r) 5423 return r; 5424 5425 r = amdgpu_xcp_restore_partition_mode( 5426 tmp_adev->xcp_mgr); 5427 if (r) 5428 goto out; 5429 5430 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5431 if (r) 5432 goto out; 5433 5434 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 5435 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5436 5437 if (vram_lost) 5438 amdgpu_device_fill_reset_magic(tmp_adev); 5439 5440 /* 5441 * Add this ASIC as tracked as reset was already 5442 * complete successfully. 5443 */ 5444 amdgpu_register_gpu_instance(tmp_adev); 5445 5446 if (!reset_context->hive && 5447 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5448 amdgpu_xgmi_add_device(tmp_adev); 5449 5450 r = amdgpu_device_ip_late_init(tmp_adev); 5451 if (r) 5452 goto out; 5453 5454 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 5455 5456 /* 5457 * The GPU enters bad state once faulty pages 5458 * by ECC has reached the threshold, and ras 5459 * recovery is scheduled next. So add one check 5460 * here to break recovery if it indeed exceeds 5461 * bad page threshold, and remind user to 5462 * retire this GPU or setting one bigger 5463 * bad_page_threshold value to fix this once 5464 * probing driver again. 5465 */ 5466 if (!amdgpu_ras_is_rma(tmp_adev)) { 5467 /* must succeed. 
*/ 5468 amdgpu_ras_resume(tmp_adev); 5469 } else { 5470 r = -EINVAL; 5471 goto out; 5472 } 5473 5474 /* Update PSP FW topology after reset */ 5475 if (reset_context->hive && 5476 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5477 r = amdgpu_xgmi_update_topology( 5478 reset_context->hive, tmp_adev); 5479 } 5480 } 5481 5482 out: 5483 if (!r) { 5484 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5485 r = amdgpu_ib_ring_tests(tmp_adev); 5486 if (r) { 5487 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5488 r = -EAGAIN; 5489 goto end; 5490 } 5491 } 5492 5493 if (r) 5494 tmp_adev->asic_reset_res = r; 5495 } 5496 5497 end: 5498 return r; 5499 } 5500 5501 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5502 struct amdgpu_reset_context *reset_context) 5503 { 5504 struct amdgpu_device *tmp_adev = NULL; 5505 bool need_full_reset, skip_hw_reset; 5506 int r = 0; 5507 5508 /* Try reset handler method first */ 5509 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5510 reset_list); 5511 5512 reset_context->reset_device_list = device_list_handle; 5513 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5514 /* If reset handler not implemented, continue; otherwise return */ 5515 if (r == -EOPNOTSUPP) 5516 r = 0; 5517 else 5518 return r; 5519 5520 /* Reset handler not implemented, use the default method */ 5521 need_full_reset = 5522 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5523 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5524 5525 /* 5526 * ASIC reset has to be done on all XGMI hive nodes ASAP 5527 * to allow proper links negotiation in FW (within 1 sec) 5528 */ 5529 if (!skip_hw_reset && need_full_reset) { 5530 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5531 /* For XGMI run all resets in parallel to speed up the process */ 5532 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5533 if (!queue_work(system_unbound_wq, 5534 &tmp_adev->xgmi_reset_work)) 5535 r = -EALREADY; 5536 } else 5537 r = amdgpu_asic_reset(tmp_adev); 5538 5539 if (r) { 5540 dev_err(tmp_adev->dev, 5541 "ASIC reset failed with error, %d for drm dev, %s", 5542 r, adev_to_drm(tmp_adev)->unique); 5543 goto out; 5544 } 5545 } 5546 5547 /* For XGMI wait for all resets to complete before proceed */ 5548 if (!r) { 5549 list_for_each_entry(tmp_adev, device_list_handle, 5550 reset_list) { 5551 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5552 flush_work(&tmp_adev->xgmi_reset_work); 5553 r = tmp_adev->asic_reset_res; 5554 if (r) 5555 break; 5556 } 5557 } 5558 } 5559 } 5560 5561 if (!r && amdgpu_ras_intr_triggered()) { 5562 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5563 amdgpu_ras_reset_error_count(tmp_adev, 5564 AMDGPU_RAS_BLOCK__MMHUB); 5565 } 5566 5567 amdgpu_ras_intr_cleared(); 5568 } 5569 5570 r = amdgpu_device_reinit_after_reset(reset_context); 5571 if (r == -EAGAIN) 5572 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5573 else 5574 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5575 5576 out: 5577 return r; 5578 } 5579 5580 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5581 { 5582 5583 switch (amdgpu_asic_reset_method(adev)) { 5584 case AMD_RESET_METHOD_MODE1: 5585 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5586 break; 5587 case AMD_RESET_METHOD_MODE2: 5588 adev->mp1_state = PP_MP1_STATE_RESET; 5589 break; 5590 default: 5591 adev->mp1_state = PP_MP1_STATE_NONE; 5592 break; 5593 } 5594 } 5595 5596 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5597 
{ 5598 amdgpu_vf_error_trans_all(adev); 5599 adev->mp1_state = PP_MP1_STATE_NONE; 5600 } 5601 5602 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5603 { 5604 struct pci_dev *p = NULL; 5605 5606 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5607 adev->pdev->bus->number, 1); 5608 if (p) { 5609 pm_runtime_enable(&(p->dev)); 5610 pm_runtime_resume(&(p->dev)); 5611 } 5612 5613 pci_dev_put(p); 5614 } 5615 5616 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5617 { 5618 enum amd_reset_method reset_method; 5619 struct pci_dev *p = NULL; 5620 u64 expires; 5621 5622 /* 5623 * For now, only BACO and mode1 reset are confirmed 5624 * to suffer the audio issue without proper suspended. 5625 */ 5626 reset_method = amdgpu_asic_reset_method(adev); 5627 if ((reset_method != AMD_RESET_METHOD_BACO) && 5628 (reset_method != AMD_RESET_METHOD_MODE1)) 5629 return -EINVAL; 5630 5631 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5632 adev->pdev->bus->number, 1); 5633 if (!p) 5634 return -ENODEV; 5635 5636 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5637 if (!expires) 5638 /* 5639 * If we cannot get the audio device autosuspend delay, 5640 * a fixed 4S interval will be used. Considering 3S is 5641 * the audio controller default autosuspend delay setting. 5642 * 4S used here is guaranteed to cover that. 5643 */ 5644 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5645 5646 while (!pm_runtime_status_suspended(&(p->dev))) { 5647 if (!pm_runtime_suspend(&(p->dev))) 5648 break; 5649 5650 if (expires < ktime_get_mono_fast_ns()) { 5651 dev_warn(adev->dev, "failed to suspend display audio\n"); 5652 pci_dev_put(p); 5653 /* TODO: abort the succeeding gpu reset? */ 5654 return -ETIMEDOUT; 5655 } 5656 } 5657 5658 pm_runtime_disable(&(p->dev)); 5659 5660 pci_dev_put(p); 5661 return 0; 5662 } 5663 5664 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5665 { 5666 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5667 5668 #if defined(CONFIG_DEBUG_FS) 5669 if (!amdgpu_sriov_vf(adev)) 5670 cancel_work(&adev->reset_work); 5671 #endif 5672 5673 if (adev->kfd.dev) 5674 cancel_work(&adev->kfd.reset_work); 5675 5676 if (amdgpu_sriov_vf(adev)) 5677 cancel_work(&adev->virt.flr_work); 5678 5679 if (con && adev->ras_enabled) 5680 cancel_work(&con->recovery_work); 5681 5682 } 5683 5684 static int amdgpu_device_health_check(struct list_head *device_list_handle) 5685 { 5686 struct amdgpu_device *tmp_adev; 5687 int ret = 0; 5688 u32 status; 5689 5690 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5691 pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status); 5692 if (PCI_POSSIBLE_ERROR(status)) { 5693 dev_err(tmp_adev->dev, "device lost from bus!"); 5694 ret = -ENODEV; 5695 } 5696 } 5697 5698 return ret; 5699 } 5700 5701 /** 5702 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5703 * 5704 * @adev: amdgpu_device pointer 5705 * @job: which job trigger hang 5706 * @reset_context: amdgpu reset context pointer 5707 * 5708 * Attempt to reset the GPU if it has hung (all asics). 5709 * Attempt to do soft-reset or full-reset and reinitialize Asic 5710 * Returns 0 for success or an error on failure. 
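 * Typically invoked from the job timeout handler, the RAS/KFD reset work,
 * or the SR-IOV FLR handler with a pre-filled reset_context.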
5711 */ 5712 5713 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5714 struct amdgpu_job *job, 5715 struct amdgpu_reset_context *reset_context) 5716 { 5717 struct list_head device_list, *device_list_handle = NULL; 5718 bool job_signaled = false; 5719 struct amdgpu_hive_info *hive = NULL; 5720 struct amdgpu_device *tmp_adev = NULL; 5721 int i, r = 0; 5722 bool need_emergency_restart = false; 5723 bool audio_suspended = false; 5724 int retry_limit = AMDGPU_MAX_RETRY_LIMIT; 5725 5726 /* 5727 * Special case: RAS triggered and full reset isn't supported 5728 */ 5729 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5730 5731 /* 5732 * Flush RAM to disk so that after reboot 5733 * the user can read log and see why the system rebooted. 5734 */ 5735 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 5736 amdgpu_ras_get_context(adev)->reboot) { 5737 DRM_WARN("Emergency reboot."); 5738 5739 ksys_sync_helper(); 5740 emergency_restart(); 5741 } 5742 5743 dev_info(adev->dev, "GPU %s begin!\n", 5744 need_emergency_restart ? "jobs stop":"reset"); 5745 5746 if (!amdgpu_sriov_vf(adev)) 5747 hive = amdgpu_get_xgmi_hive(adev); 5748 if (hive) 5749 mutex_lock(&hive->hive_lock); 5750 5751 reset_context->job = job; 5752 reset_context->hive = hive; 5753 /* 5754 * Build list of devices to reset. 5755 * In case we are in XGMI hive mode, resort the device list 5756 * to put adev in the 1st position. 5757 */ 5758 INIT_LIST_HEAD(&device_list); 5759 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) { 5760 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5761 list_add_tail(&tmp_adev->reset_list, &device_list); 5762 if (adev->shutdown) 5763 tmp_adev->shutdown = true; 5764 } 5765 if (!list_is_first(&adev->reset_list, &device_list)) 5766 list_rotate_to_front(&adev->reset_list, &device_list); 5767 device_list_handle = &device_list; 5768 } else { 5769 list_add_tail(&adev->reset_list, &device_list); 5770 device_list_handle = &device_list; 5771 } 5772 5773 if (!amdgpu_sriov_vf(adev)) { 5774 r = amdgpu_device_health_check(device_list_handle); 5775 if (r) 5776 goto end_reset; 5777 } 5778 5779 /* We need to lock reset domain only once both for XGMI and single device */ 5780 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5781 reset_list); 5782 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5783 5784 /* block all schedulers and reset given job's ring */ 5785 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5786 5787 amdgpu_device_set_mp1_state(tmp_adev); 5788 5789 /* 5790 * Try to put the audio codec into suspend state 5791 * before gpu reset started. 5792 * 5793 * Due to the power domain of the graphics device 5794 * is shared with AZ power domain. Without this, 5795 * we may change the audio hardware from behind 5796 * the audio driver's back. That will trigger 5797 * some audio codec errors. 
5798 */ 5799 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5800 audio_suspended = true; 5801 5802 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5803 5804 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5805 5806 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context); 5807 5808 /* 5809 * Mark these ASICs to be reseted as untracked first 5810 * And add them back after reset completed 5811 */ 5812 amdgpu_unregister_gpu_instance(tmp_adev); 5813 5814 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5815 5816 /* disable ras on ALL IPs */ 5817 if (!need_emergency_restart && 5818 amdgpu_device_ip_need_full_reset(tmp_adev)) 5819 amdgpu_ras_suspend(tmp_adev); 5820 5821 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5822 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5823 5824 if (!amdgpu_ring_sched_ready(ring)) 5825 continue; 5826 5827 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5828 5829 if (need_emergency_restart) 5830 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5831 } 5832 atomic_inc(&tmp_adev->gpu_reset_counter); 5833 } 5834 5835 if (need_emergency_restart) 5836 goto skip_sched_resume; 5837 5838 /* 5839 * Must check guilty signal here since after this point all old 5840 * HW fences are force signaled. 5841 * 5842 * job->base holds a reference to parent fence 5843 */ 5844 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5845 job_signaled = true; 5846 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5847 goto skip_hw_reset; 5848 } 5849 5850 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5851 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5852 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5853 /*TODO Should we stop ?*/ 5854 if (r) { 5855 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5856 r, adev_to_drm(tmp_adev)->unique); 5857 tmp_adev->asic_reset_res = r; 5858 } 5859 } 5860 5861 /* Actual ASIC resets if needed.*/ 5862 /* Host driver will handle XGMI hive reset for SRIOV */ 5863 if (amdgpu_sriov_vf(adev)) { 5864 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { 5865 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); 5866 amdgpu_ras_set_fed(adev, true); 5867 set_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5868 } 5869 5870 r = amdgpu_device_reset_sriov(adev, reset_context); 5871 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) { 5872 amdgpu_virt_release_full_gpu(adev, true); 5873 goto retry; 5874 } 5875 if (r) 5876 adev->asic_reset_res = r; 5877 } else { 5878 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5879 if (r && r == -EAGAIN) 5880 goto retry; 5881 } 5882 5883 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5884 /* 5885 * Drop any pending non scheduler resets queued before reset is done. 5886 * Any reset scheduled after this point would be valid. Scheduler resets 5887 * were already dropped during drm_sched_stop and no new ones can come 5888 * in before drm_sched_start. 
5889 */ 5890 amdgpu_device_stop_pending_resets(tmp_adev); 5891 } 5892 5893 skip_hw_reset: 5894 5895 /* Post ASIC reset for all devs .*/ 5896 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5897 5898 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5899 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5900 5901 if (!amdgpu_ring_sched_ready(ring)) 5902 continue; 5903 5904 drm_sched_start(&ring->sched); 5905 } 5906 5907 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 5908 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5909 5910 if (tmp_adev->asic_reset_res) 5911 r = tmp_adev->asic_reset_res; 5912 5913 tmp_adev->asic_reset_res = 0; 5914 5915 if (r) { 5916 /* bad news, how to tell it to userspace ? 5917 * for ras error, we should report GPU bad status instead of 5918 * reset failure 5919 */ 5920 if (reset_context->src != AMDGPU_RESET_SRC_RAS || 5921 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) 5922 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", 5923 atomic_read(&tmp_adev->gpu_reset_counter)); 5924 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5925 } else { 5926 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5927 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5928 DRM_WARN("smart shift update failed\n"); 5929 } 5930 } 5931 5932 skip_sched_resume: 5933 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5934 /* unlock kfd: SRIOV would do it separately */ 5935 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5936 amdgpu_amdkfd_post_reset(tmp_adev); 5937 5938 /* kfd_post_reset will do nothing if kfd device is not initialized, 5939 * need to bring up kfd here if it's not be initialized before 5940 */ 5941 if (!adev->kfd.init_complete) 5942 amdgpu_amdkfd_device_init(adev); 5943 5944 if (audio_suspended) 5945 amdgpu_device_resume_display_audio(tmp_adev); 5946 5947 amdgpu_device_unset_mp1_state(tmp_adev); 5948 5949 amdgpu_ras_set_error_query_ready(tmp_adev, true); 5950 } 5951 5952 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5953 reset_list); 5954 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 5955 5956 end_reset: 5957 if (hive) { 5958 mutex_unlock(&hive->hive_lock); 5959 amdgpu_put_xgmi_hive(hive); 5960 } 5961 5962 if (r) 5963 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5964 5965 atomic_set(&adev->reset_domain->reset_res, r); 5966 return r; 5967 } 5968 5969 /** 5970 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner 5971 * 5972 * @adev: amdgpu_device pointer 5973 * @speed: pointer to the speed of the link 5974 * @width: pointer to the width of the link 5975 * 5976 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 5977 * first physical partner to an AMD dGPU. 5978 * This will exclude any virtual switches and links. 
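 * For example, on a dGPU sitting behind an AMD-internal bridge, the walk
 * skips bridges whose vendor is PCI_VENDOR_ID_ATI and reports the caps of
 * the first non-AMD upstream port instead (see the loop below).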
 */
static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
					    enum pci_bus_speed *speed,
					    enum pcie_link_width *width)
{
	struct pci_dev *parent = adev->pdev;

	if (!speed || !width)
		return;

	*speed = PCI_SPEED_UNKNOWN;
	*width = PCIE_LNK_WIDTH_UNKNOWN;

	if (amdgpu_device_pcie_dynamic_switching_supported(adev)) {
		while ((parent = pci_upstream_bridge(parent))) {
			/* skip upstream/downstream switches internal to dGPU */
			if (parent->vendor == PCI_VENDOR_ID_ATI)
				continue;
			*speed = pcie_get_speed_cap(parent);
			*width = pcie_get_width_cap(parent);
			break;
		}
	} else {
		/* use the current speeds rather than max if switching is not supported */
		pcie_bandwidth_available(adev->pdev, NULL, speed, width);
	}
}

/**
 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIe capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIe config space may not be available.
 */
static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
{
	struct pci_dev *pdev;
	enum pci_bus_speed speed_cap, platform_speed_cap;
	enum pcie_link_width platform_link_width;

	if (amdgpu_pcie_gen_cap)
		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;

	if (amdgpu_pcie_lane_cap)
		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;

	/* covers APUs as well */
	if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
		if (adev->pm.pcie_gen_mask == 0)
			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
		if (adev->pm.pcie_mlw_mask == 0)
			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
		return;
	}

	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
		return;

	amdgpu_device_partner_bandwidth(adev, &platform_speed_cap,
					&platform_link_width);

	if (adev->pm.pcie_gen_mask == 0) {
		/* asic caps */
		pdev = adev->pdev;
		speed_cap = pcie_get_speed_cap(pdev);
		if (speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
		} else {
			if (speed_cap == PCIE_SPEED_32_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
			else if (speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
		}
		/* platform caps */
		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
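			/* Platform cap unknown: fall back to a conservative Gen1/Gen2 mask */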
adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6076 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6077 } else { 6078 if (platform_speed_cap == PCIE_SPEED_32_0GT) 6079 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6080 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6081 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6082 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6083 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 6084 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 6085 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6086 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6087 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6088 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 6089 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 6090 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6091 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6092 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 6093 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 6094 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6095 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6096 else 6097 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 6098 6099 } 6100 } 6101 if (adev->pm.pcie_mlw_mask == 0) { 6102 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6103 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 6104 } else { 6105 switch (platform_link_width) { 6106 case PCIE_LNK_X32: 6107 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 6108 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6109 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6110 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6111 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6112 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6113 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6114 break; 6115 case PCIE_LNK_X16: 6116 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6117 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6118 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6119 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6120 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6121 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6122 break; 6123 case PCIE_LNK_X12: 6124 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6125 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6126 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6127 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6128 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6129 break; 6130 case PCIE_LNK_X8: 6131 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6132 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6133 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6134 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6135 break; 6136 case PCIE_LNK_X4: 6137 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6138 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6139 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6140 break; 6141 case PCIE_LNK_X2: 6142 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6143 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6144 break; 6145 case PCIE_LNK_X1: 6146 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 6147 break; 6148 default: 6149 break; 6150 } 6151 } 6152 } 6153 } 6154 6155 /** 6156 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 6157 * 6158 * @adev: amdgpu_device pointer 6159 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 6160 * 6161 * Return true if @peer_adev can access (DMA) @adev through the PCIe 6162 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 6163 * @peer_adev. 
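 *
 * Illustrative caller sketch only (not taken from the driver; the helper
 * names are assumptions used for illustration):
 *
 *	if (amdgpu_device_is_peer_accessible(adev, peer_adev))
 *		map_peer_vram_over_pcie();	// hypothetical helper
 *	else
 *		bounce_through_system_memory();	// hypothetical helper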
6164 */ 6165 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 6166 struct amdgpu_device *peer_adev) 6167 { 6168 #ifdef CONFIG_HSA_AMD_P2P 6169 bool p2p_access = 6170 !adev->gmc.xgmi.connected_to_cpu && 6171 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 6172 6173 bool is_large_bar = adev->gmc.visible_vram_size && 6174 adev->gmc.real_vram_size == adev->gmc.visible_vram_size; 6175 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev); 6176 6177 if (!p2p_addressable) { 6178 uint64_t address_mask = peer_adev->dev->dma_mask ? 6179 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 6180 resource_size_t aper_limit = 6181 adev->gmc.aper_base + adev->gmc.aper_size - 1; 6182 6183 p2p_addressable = !(adev->gmc.aper_base & address_mask || 6184 aper_limit & address_mask); 6185 } 6186 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable; 6187 #else 6188 return false; 6189 #endif 6190 } 6191 6192 int amdgpu_device_baco_enter(struct drm_device *dev) 6193 { 6194 struct amdgpu_device *adev = drm_to_adev(dev); 6195 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6196 6197 if (!amdgpu_device_supports_baco(dev)) 6198 return -ENOTSUPP; 6199 6200 if (ras && adev->ras_enabled && 6201 adev->nbio.funcs->enable_doorbell_interrupt) 6202 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 6203 6204 return amdgpu_dpm_baco_enter(adev); 6205 } 6206 6207 int amdgpu_device_baco_exit(struct drm_device *dev) 6208 { 6209 struct amdgpu_device *adev = drm_to_adev(dev); 6210 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6211 int ret = 0; 6212 6213 if (!amdgpu_device_supports_baco(dev)) 6214 return -ENOTSUPP; 6215 6216 ret = amdgpu_dpm_baco_exit(adev); 6217 if (ret) 6218 return ret; 6219 6220 if (ras && adev->ras_enabled && 6221 adev->nbio.funcs->enable_doorbell_interrupt) 6222 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 6223 6224 if (amdgpu_passthrough(adev) && adev->nbio.funcs && 6225 adev->nbio.funcs->clear_doorbell_interrupt) 6226 adev->nbio.funcs->clear_doorbell_interrupt(adev); 6227 6228 return 0; 6229 } 6230 6231 /** 6232 * amdgpu_pci_error_detected - Called when a PCI error is detected. 6233 * @pdev: PCI device struct 6234 * @state: PCI channel state 6235 * 6236 * Description: Called when a PCI error is detected. 6237 * 6238 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
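 *
 * These callbacks are hooked into the PCI core through a struct
 * pci_error_handlers in the driver's pci_driver registration. A sketch of
 * that wiring (the struct identifier is an assumption; the callbacks are the
 * functions defined below):
 *
 *	static const struct pci_error_handlers amdgpu_pci_err_handler = {
 *		.error_detected	= amdgpu_pci_error_detected,
 *		.mmio_enabled	= amdgpu_pci_mmio_enabled,
 *		.slot_reset	= amdgpu_pci_slot_reset,
 *		.resume		= amdgpu_pci_resume,
 *	};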
6239 */ 6240 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 6241 { 6242 struct drm_device *dev = pci_get_drvdata(pdev); 6243 struct amdgpu_device *adev = drm_to_adev(dev); 6244 int i; 6245 6246 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 6247 6248 if (adev->gmc.xgmi.num_physical_nodes > 1) { 6249 DRM_WARN("No support for XGMI hive yet..."); 6250 return PCI_ERS_RESULT_DISCONNECT; 6251 } 6252 6253 adev->pci_channel_state = state; 6254 6255 switch (state) { 6256 case pci_channel_io_normal: 6257 return PCI_ERS_RESULT_CAN_RECOVER; 6258 /* Fatal error, prepare for slot reset */ 6259 case pci_channel_io_frozen: 6260 /* 6261 * Locking adev->reset_domain->sem will prevent any external access 6262 * to GPU during PCI error recovery 6263 */ 6264 amdgpu_device_lock_reset_domain(adev->reset_domain); 6265 amdgpu_device_set_mp1_state(adev); 6266 6267 /* 6268 * Block any work scheduling as we do for regular GPU reset 6269 * for the duration of the recovery 6270 */ 6271 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6272 struct amdgpu_ring *ring = adev->rings[i]; 6273 6274 if (!amdgpu_ring_sched_ready(ring)) 6275 continue; 6276 6277 drm_sched_stop(&ring->sched, NULL); 6278 } 6279 atomic_inc(&adev->gpu_reset_counter); 6280 return PCI_ERS_RESULT_NEED_RESET; 6281 case pci_channel_io_perm_failure: 6282 /* Permanent error, prepare for device removal */ 6283 return PCI_ERS_RESULT_DISCONNECT; 6284 } 6285 6286 return PCI_ERS_RESULT_NEED_RESET; 6287 } 6288 6289 /** 6290 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 6291 * @pdev: pointer to PCI device 6292 */ 6293 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 6294 { 6295 6296 DRM_INFO("PCI error: mmio enabled callback!!\n"); 6297 6298 /* TODO - dump whatever for debugging purposes */ 6299 6300 /* This called only if amdgpu_pci_error_detected returns 6301 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 6302 * works, no need to reset slot. 6303 */ 6304 6305 return PCI_ERS_RESULT_RECOVERED; 6306 } 6307 6308 /** 6309 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 6310 * @pdev: PCI device struct 6311 * 6312 * Description: This routine is called by the pci error recovery 6313 * code after the PCI slot has been reset, just before we 6314 * should resume normal operations. 
6315 */ 6316 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 6317 { 6318 struct drm_device *dev = pci_get_drvdata(pdev); 6319 struct amdgpu_device *adev = drm_to_adev(dev); 6320 int r, i; 6321 struct amdgpu_reset_context reset_context; 6322 u32 memsize; 6323 struct list_head device_list; 6324 6325 /* PCI error slot reset should be skipped During RAS recovery */ 6326 if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 6327 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && 6328 amdgpu_ras_in_recovery(adev)) 6329 return PCI_ERS_RESULT_RECOVERED; 6330 6331 DRM_INFO("PCI error: slot reset callback!!\n"); 6332 6333 memset(&reset_context, 0, sizeof(reset_context)); 6334 6335 INIT_LIST_HEAD(&device_list); 6336 list_add_tail(&adev->reset_list, &device_list); 6337 6338 /* wait for asic to come out of reset */ 6339 msleep(500); 6340 6341 /* Restore PCI confspace */ 6342 amdgpu_device_load_pci_state(pdev); 6343 6344 /* confirm ASIC came out of reset */ 6345 for (i = 0; i < adev->usec_timeout; i++) { 6346 memsize = amdgpu_asic_get_config_memsize(adev); 6347 6348 if (memsize != 0xffffffff) 6349 break; 6350 udelay(1); 6351 } 6352 if (memsize == 0xffffffff) { 6353 r = -ETIME; 6354 goto out; 6355 } 6356 6357 reset_context.method = AMD_RESET_METHOD_NONE; 6358 reset_context.reset_req_dev = adev; 6359 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 6360 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 6361 6362 adev->no_hw_access = true; 6363 r = amdgpu_device_pre_asic_reset(adev, &reset_context); 6364 adev->no_hw_access = false; 6365 if (r) 6366 goto out; 6367 6368 r = amdgpu_do_asic_reset(&device_list, &reset_context); 6369 6370 out: 6371 if (!r) { 6372 if (amdgpu_device_cache_pci_state(adev->pdev)) 6373 pci_restore_state(adev->pdev); 6374 6375 DRM_INFO("PCIe error recovery succeeded\n"); 6376 } else { 6377 DRM_ERROR("PCIe error recovery failed, err:%d", r); 6378 amdgpu_device_unset_mp1_state(adev); 6379 amdgpu_device_unlock_reset_domain(adev->reset_domain); 6380 } 6381 6382 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 6383 } 6384 6385 /** 6386 * amdgpu_pci_resume() - resume normal ops after PCI reset 6387 * @pdev: pointer to PCI device 6388 * 6389 * Called when the error recovery driver tells us that its 6390 * OK to resume normal operation. 
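 *
 * This is the counterpart of the drm_sched_stop() calls made in
 * amdgpu_pci_error_detected(): only the frozen (pci_channel_io_frozen) case
 * stopped the schedulers, so only that case restarts them here.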
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: resume callback!!\n");

	/* Only continue execution for the case of pci_channel_io_frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!amdgpu_ring_sched_ready(ring))
			continue;

		drm_sched_start(&ring->sched);
	}

	amdgpu_device_unset_mp1_state(adev);
	amdgpu_device_unlock_reset_domain(adev->reset_domain);
}

bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}

int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. It will help to maintain error context when an error occurs.
 * Compared to a simple hang, the system will stay stable at least for SSH
 * access. Then it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc.),
 *    clears all CPU mappings to the device, disallows remappings through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5.
amdgpu_device_unmap_mmio() clears all MMIO mappings 6514 * 6. pci_disable_device() and pci_wait_for_pending_transaction() 6515 * flush any in flight DMA operations 6516 */ 6517 void amdgpu_device_halt(struct amdgpu_device *adev) 6518 { 6519 struct pci_dev *pdev = adev->pdev; 6520 struct drm_device *ddev = adev_to_drm(adev); 6521 6522 amdgpu_xcp_dev_unplug(adev); 6523 drm_dev_unplug(ddev); 6524 6525 amdgpu_irq_disable_all(adev); 6526 6527 amdgpu_fence_driver_hw_fini(adev); 6528 6529 adev->no_hw_access = true; 6530 6531 amdgpu_device_unmap_mmio(adev); 6532 6533 pci_disable_device(pdev); 6534 pci_wait_for_pending_transaction(pdev); 6535 } 6536 6537 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev, 6538 u32 reg) 6539 { 6540 unsigned long flags, address, data; 6541 u32 r; 6542 6543 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 6544 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 6545 6546 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 6547 WREG32(address, reg * 4); 6548 (void)RREG32(address); 6549 r = RREG32(data); 6550 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 6551 return r; 6552 } 6553 6554 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev, 6555 u32 reg, u32 v) 6556 { 6557 unsigned long flags, address, data; 6558 6559 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 6560 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 6561 6562 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 6563 WREG32(address, reg * 4); 6564 (void)RREG32(address); 6565 WREG32(data, v); 6566 (void)RREG32(data); 6567 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 6568 } 6569 6570 /** 6571 * amdgpu_device_get_gang - return a reference to the current gang 6572 * @adev: amdgpu_device pointer 6573 * 6574 * Returns: A new reference to the current gang leader. 6575 */ 6576 struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev) 6577 { 6578 struct dma_fence *fence; 6579 6580 rcu_read_lock(); 6581 fence = dma_fence_get_rcu_safe(&adev->gang_submit); 6582 rcu_read_unlock(); 6583 return fence; 6584 } 6585 6586 /** 6587 * amdgpu_device_switch_gang - switch to a new gang 6588 * @adev: amdgpu_device pointer 6589 * @gang: the gang to switch to 6590 * 6591 * Try to switch to a new gang. 6592 * Returns: NULL if we switched to the new gang or a reference to the current 6593 * gang leader. 
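 *
 * Typical caller pattern, shown as an illustrative sketch only (the waiting
 * strategy is up to the caller; dma_fence_wait() is just one option):
 *
 *	old = amdgpu_device_switch_gang(adev, new_gang);
 *	if (old) {
 *		dma_fence_wait(old, false);
 *		dma_fence_put(old);
 *	}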
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	do {
		dma_fence_put(old);
		old = amdgpu_device_get_gang(adev);
		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old))
			return old;

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	dma_fence_put(old);
	return NULL;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
		    uint32_t inst, uint32_t reg_addr, char reg_name[],
		    uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				 inst, reg_name, (uint32_t)expected_value,
				 (uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}
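
/*
 * Example use of amdgpu_device_wait_on_rreg(), as an illustrative sketch only
 * (the register offset, name string, and expected value below are
 * placeholders, not real hardware definitions):
 *
 *	if (amdgpu_device_wait_on_rreg(adev, 0, reg_offset, "STATUS_REG",
 *				       0x1, 0x1))
 *		dev_err(adev->dev, "block did not report ready\n");
 *
 * Note that the helper re-arms its timeout whenever the polled value changes,
 * so it only gives up after the register has been stable for
 * adev->usec_timeout iterations without matching the expected value.
 */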