1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 #include <linux/iommu.h> 34 #include <linux/pci.h> 35 #include <linux/pci-p2pdma.h> 36 #include <linux/apple-gmux.h> 37 38 #include <drm/drm_aperture.h> 39 #include <drm/drm_atomic_helper.h> 40 #include <drm/drm_crtc_helper.h> 41 #include <drm/drm_fb_helper.h> 42 #include <drm/drm_probe_helper.h> 43 #include <drm/amdgpu_drm.h> 44 #include <linux/device.h> 45 #include <linux/vgaarb.h> 46 #include <linux/vga_switcheroo.h> 47 #include <linux/efi.h> 48 #include "amdgpu.h" 49 #include "amdgpu_trace.h" 50 #include "amdgpu_i2c.h" 51 #include "atom.h" 52 #include "amdgpu_atombios.h" 53 #include "amdgpu_atomfirmware.h" 54 #include "amd_pcie.h" 55 #ifdef CONFIG_DRM_AMDGPU_SI 56 #include "si.h" 57 #endif 58 #ifdef CONFIG_DRM_AMDGPU_CIK 59 #include "cik.h" 60 #endif 61 #include "vi.h" 62 #include "soc15.h" 63 #include "nv.h" 64 #include "bif/bif_4_1_d.h" 65 #include <linux/firmware.h> 66 #include "amdgpu_vf_error.h" 67 68 #include "amdgpu_amdkfd.h" 69 #include "amdgpu_pm.h" 70 71 #include "amdgpu_xgmi.h" 72 #include "amdgpu_ras.h" 73 #include "amdgpu_pmu.h" 74 #include "amdgpu_fru_eeprom.h" 75 #include "amdgpu_reset.h" 76 #include "amdgpu_virt.h" 77 #include "amdgpu_dev_coredump.h" 78 79 #include <linux/suspend.h> 80 #include <drm/task_barrier.h> 81 #include <linux/pm_runtime.h> 82 83 #include <drm/drm_drv.h> 84 85 #if IS_ENABLED(CONFIG_X86) 86 #include <asm/intel-family.h> 87 #endif 88 89 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 90 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 91 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 92 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 93 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 94 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 95 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 96 97 #define AMDGPU_RESUME_MS 2000 98 #define AMDGPU_MAX_RETRY_LIMIT 2 99 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 100 #define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2) 101 #define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2) 102 #define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2) 103 104 static const 
struct drm_driver amdgpu_kms_driver; 105 106 const char *amdgpu_asic_name[] = { 107 "TAHITI", 108 "PITCAIRN", 109 "VERDE", 110 "OLAND", 111 "HAINAN", 112 "BONAIRE", 113 "KAVERI", 114 "KABINI", 115 "HAWAII", 116 "MULLINS", 117 "TOPAZ", 118 "TONGA", 119 "FIJI", 120 "CARRIZO", 121 "STONEY", 122 "POLARIS10", 123 "POLARIS11", 124 "POLARIS12", 125 "VEGAM", 126 "VEGA10", 127 "VEGA12", 128 "VEGA20", 129 "RAVEN", 130 "ARCTURUS", 131 "RENOIR", 132 "ALDEBARAN", 133 "NAVI10", 134 "CYAN_SKILLFISH", 135 "NAVI14", 136 "NAVI12", 137 "SIENNA_CICHLID", 138 "NAVY_FLOUNDER", 139 "VANGOGH", 140 "DIMGREY_CAVEFISH", 141 "BEIGE_GOBY", 142 "YELLOW_CARP", 143 "IP DISCOVERY", 144 "LAST", 145 }; 146 147 #define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMDGPU_MAX_IP_NUM - 1, 0) 148 /* 149 * Default init level where all blocks are expected to be initialized. This is 150 * the level of initialization expected by default and also after a full reset 151 * of the device. 152 */ 153 struct amdgpu_init_level amdgpu_init_default = { 154 .level = AMDGPU_INIT_LEVEL_DEFAULT, 155 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL, 156 }; 157 158 /* 159 * Minimal blocks needed to be initialized before a XGMI hive can be reset. This 160 * is used for cases like reset on initialization where the entire hive needs to 161 * be reset before first use. 162 */ 163 struct amdgpu_init_level amdgpu_init_minimal_xgmi = { 164 .level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI, 165 .hwini_ip_block_mask = 166 BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) | 167 BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) | 168 BIT(AMD_IP_BLOCK_TYPE_PSP) 169 }; 170 171 static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev, 172 enum amd_ip_block_type block) 173 { 174 return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0; 175 } 176 177 void amdgpu_set_init_level(struct amdgpu_device *adev, 178 enum amdgpu_init_lvl_id lvl) 179 { 180 switch (lvl) { 181 case AMDGPU_INIT_LEVEL_MINIMAL_XGMI: 182 adev->init_lvl = &amdgpu_init_minimal_xgmi; 183 break; 184 case AMDGPU_INIT_LEVEL_DEFAULT: 185 fallthrough; 186 default: 187 adev->init_lvl = &amdgpu_init_default; 188 break; 189 } 190 } 191 192 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev); 193 194 /** 195 * DOC: pcie_replay_count 196 * 197 * The amdgpu driver provides a sysfs API for reporting the total number 198 * of PCIe replays (NAKs) 199 * The file pcie_replay_count is used for this and returns the total 200 * number of replays as a sum of the NAKs generated and NAKs received 201 */ 202 203 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 204 struct device_attribute *attr, char *buf) 205 { 206 struct drm_device *ddev = dev_get_drvdata(dev); 207 struct amdgpu_device *adev = drm_to_adev(ddev); 208 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 209 210 return sysfs_emit(buf, "%llu\n", cnt); 211 } 212 213 static DEVICE_ATTR(pcie_replay_count, 0444, 214 amdgpu_device_get_pcie_replay_count, NULL); 215 216 static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj, 217 struct bin_attribute *attr, char *buf, 218 loff_t ppos, size_t count) 219 { 220 struct device *dev = kobj_to_dev(kobj); 221 struct drm_device *ddev = dev_get_drvdata(dev); 222 struct amdgpu_device *adev = drm_to_adev(ddev); 223 ssize_t bytes_read; 224 225 switch (ppos) { 226 case AMDGPU_SYS_REG_STATE_XGMI: 227 bytes_read = amdgpu_asic_get_reg_state( 228 adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count); 229 break; 230 case AMDGPU_SYS_REG_STATE_WAFL: 231 
bytes_read = amdgpu_asic_get_reg_state( 232 adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count); 233 break; 234 case AMDGPU_SYS_REG_STATE_PCIE: 235 bytes_read = amdgpu_asic_get_reg_state( 236 adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count); 237 break; 238 case AMDGPU_SYS_REG_STATE_USR: 239 bytes_read = amdgpu_asic_get_reg_state( 240 adev, AMDGPU_REG_STATE_TYPE_USR, buf, count); 241 break; 242 case AMDGPU_SYS_REG_STATE_USR_1: 243 bytes_read = amdgpu_asic_get_reg_state( 244 adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count); 245 break; 246 default: 247 return -EINVAL; 248 } 249 250 return bytes_read; 251 } 252 253 BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL, 254 AMDGPU_SYS_REG_STATE_END); 255 256 int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev) 257 { 258 int ret; 259 260 if (!amdgpu_asic_get_reg_state_supported(adev)) 261 return 0; 262 263 ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 264 265 return ret; 266 } 267 268 void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev) 269 { 270 if (!amdgpu_asic_get_reg_state_supported(adev)) 271 return; 272 sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 273 } 274 275 /** 276 * DOC: board_info 277 * 278 * The amdgpu driver provides a sysfs API for giving board related information. 279 * It provides the form factor information in the format 280 * 281 * type : form factor 282 * 283 * Possible form factor values 284 * 285 * - "cem" - PCIE CEM card 286 * - "oam" - Open Compute Accelerator Module 287 * - "unknown" - Not known 288 * 289 */ 290 291 static ssize_t amdgpu_device_get_board_info(struct device *dev, 292 struct device_attribute *attr, 293 char *buf) 294 { 295 struct drm_device *ddev = dev_get_drvdata(dev); 296 struct amdgpu_device *adev = drm_to_adev(ddev); 297 enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM; 298 const char *pkg; 299 300 if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type) 301 pkg_type = adev->smuio.funcs->get_pkg_type(adev); 302 303 switch (pkg_type) { 304 case AMDGPU_PKG_TYPE_CEM: 305 pkg = "cem"; 306 break; 307 case AMDGPU_PKG_TYPE_OAM: 308 pkg = "oam"; 309 break; 310 default: 311 pkg = "unknown"; 312 break; 313 } 314 315 return sysfs_emit(buf, "%s : %s\n", "type", pkg); 316 } 317 318 static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL); 319 320 static struct attribute *amdgpu_board_attrs[] = { 321 &dev_attr_board_info.attr, 322 NULL, 323 }; 324 325 static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj, 326 struct attribute *attr, int n) 327 { 328 struct device *dev = kobj_to_dev(kobj); 329 struct drm_device *ddev = dev_get_drvdata(dev); 330 struct amdgpu_device *adev = drm_to_adev(ddev); 331 332 if (adev->flags & AMD_IS_APU) 333 return 0; 334 335 return attr->mode; 336 } 337 338 static const struct attribute_group amdgpu_board_attrs_group = { 339 .attrs = amdgpu_board_attrs, 340 .is_visible = amdgpu_board_attrs_is_visible 341 }; 342 343 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 344 345 346 /** 347 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 348 * 349 * @dev: drm_device pointer 350 * 351 * Returns true if the device is a dGPU with ATPX power control, 352 * otherwise return false. 
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Return:
 * 1 if the device supports BACO;
 * 3 if the device supports MACO (only works if BACO is supported)
 * otherwise return 0.
 */
int amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
{
	struct drm_device *dev;
	int bamaco_support;

	dev = adev_to_drm(adev);

	adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
	bamaco_support = amdgpu_device_supports_baco(dev);

	switch (amdgpu_runtime_pm) {
	case 2:
		if (bamaco_support & MACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
			dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
		} else if (bamaco_support == BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Requested mode BAMACO not available, fallback to use BACO\n");
		}
		break;
	case 1:
		if (bamaco_support & BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Forcing BACO for runtime pm\n");
		}
		break;
	case -1:
	case -2:
		if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
			dev_info(adev->dev, "Using ATPX for runtime pm\n");
		} else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
			dev_info(adev->dev, "Using BOCO for runtime pm\n");
		} else {
			if (!bamaco_support)
				goto no_runtime_pm;

			switch (adev->asic_type) {
			case CHIP_VEGA20:
			case CHIP_ARCTURUS:
				/* BACO is not supported on vega20 and arcturus */
				break;
			case CHIP_VEGA10:
				/* enable BACO as runpm mode if noretry=0 */
				if (!adev->gmc.noretry)
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			default:
				/* enable BACO as runpm mode on CI+ */
				adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			}

			if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
				if (bamaco_support & MACO_SUPPORT) {
					adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
					dev_info(adev->dev, "Using BAMACO for runtime pm\n");
				} else {
					dev_info(adev->dev, "Using BACO for runtime pm\n");
				}
			}
		}
		break;
	case 0:
		dev_info(adev->dev, "runtime pm is manually disabled\n");
		break;
	default:
		break;
	}

no_runtime_pm:
	if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
		dev_info(adev->dev, "Runtime PM not available\n");
}
/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size in bytes; @buf must hold at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram through the vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size in bytes; @buf must hold at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * Returns the number of bytes transferred.
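 *
 * Illustrative call pattern only (the offset and buffer below are made-up
 * example values, not something the driver uses): a caller that wants the
 * first 256 bytes of VRAM copied into a local buffer could do
 *
 *   u8 tmp[256];
 *   size_t copied = amdgpu_device_aper_access(adev, 0, tmp, sizeof(tmp), false);
 *
 * and would then hand any uncopied remainder (size - copied) to
 * amdgpu_device_mm_access(), which is exactly what amdgpu_device_vram_access()
 * below does.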
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure the HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over the PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure the HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size in bytes; @buf must hold at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM access for the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore; if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
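 *
 * A minimal illustrative use (the register name is picked only as an example):
 *
 *   u32 val = amdgpu_device_rreg(adev, mmMM_INDEX, AMDGPU_REGS_NO_KIQ);
 *
 * Offsets that fall inside the MMIO window go through readl() (or the KIQ
 * path under SR-IOV runtime); anything beyond the window is read through the
 * indirect PCIe accessor adev->pcie_rreg.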
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}


/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
				uint32_t reg, uint32_t acc_flags,
				uint32_t xcc_id)
{
	uint32_t ret, rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	return ret;
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to write to the register
 */

/**
 * amdgpu_mm_wreg8 - write to a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
757 */ 758 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) 759 { 760 if (amdgpu_device_skip_hw_access(adev)) 761 return; 762 763 if (offset < adev->rmmio_size) 764 writeb(value, adev->rmmio + offset); 765 else 766 BUG(); 767 } 768 769 /** 770 * amdgpu_device_wreg - write to a memory mapped IO or indirect register 771 * 772 * @adev: amdgpu_device pointer 773 * @reg: dword aligned register offset 774 * @v: 32 bit value to write to the register 775 * @acc_flags: access flags which require special behavior 776 * 777 * Writes the value specified to the offset specified. 778 */ 779 void amdgpu_device_wreg(struct amdgpu_device *adev, 780 uint32_t reg, uint32_t v, 781 uint32_t acc_flags) 782 { 783 if (amdgpu_device_skip_hw_access(adev)) 784 return; 785 786 if ((reg * 4) < adev->rmmio_size) { 787 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 788 amdgpu_sriov_runtime(adev) && 789 down_read_trylock(&adev->reset_domain->sem)) { 790 amdgpu_kiq_wreg(adev, reg, v, 0); 791 up_read(&adev->reset_domain->sem); 792 } else { 793 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 794 } 795 } else { 796 adev->pcie_wreg(adev, reg * 4, v); 797 } 798 799 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 800 } 801 802 /** 803 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 804 * 805 * @adev: amdgpu_device pointer 806 * @reg: mmio/rlc register 807 * @v: value to write 808 * @xcc_id: xcc accelerated compute core id 809 * 810 * this function is invoked only for the debugfs register access 811 */ 812 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 813 uint32_t reg, uint32_t v, 814 uint32_t xcc_id) 815 { 816 if (amdgpu_device_skip_hw_access(adev)) 817 return; 818 819 if (amdgpu_sriov_fullaccess(adev) && 820 adev->gfx.rlc.funcs && 821 adev->gfx.rlc.funcs->is_rlcg_access_range) { 822 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 823 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id); 824 } else if ((reg * 4) >= adev->rmmio_size) { 825 adev->pcie_wreg(adev, reg * 4, v); 826 } else { 827 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 828 } 829 } 830 831 /** 832 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC 833 * 834 * @adev: amdgpu_device pointer 835 * @reg: dword aligned register offset 836 * @v: 32 bit value to write to the register 837 * @acc_flags: access flags which require special behavior 838 * @xcc_id: xcc accelerated compute core id 839 * 840 * Writes the value specified to the offset specified. 
841 */ 842 void amdgpu_device_xcc_wreg(struct amdgpu_device *adev, 843 uint32_t reg, uint32_t v, 844 uint32_t acc_flags, uint32_t xcc_id) 845 { 846 uint32_t rlcg_flag; 847 848 if (amdgpu_device_skip_hw_access(adev)) 849 return; 850 851 if ((reg * 4) < adev->rmmio_size) { 852 if (amdgpu_sriov_vf(adev) && 853 !amdgpu_sriov_runtime(adev) && 854 adev->gfx.rlc.rlcg_reg_access_supported && 855 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, 856 GC_HWIP, true, 857 &rlcg_flag)) { 858 amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id)); 859 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 860 amdgpu_sriov_runtime(adev) && 861 down_read_trylock(&adev->reset_domain->sem)) { 862 amdgpu_kiq_wreg(adev, reg, v, xcc_id); 863 up_read(&adev->reset_domain->sem); 864 } else { 865 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 866 } 867 } else { 868 adev->pcie_wreg(adev, reg * 4, v); 869 } 870 } 871 872 /** 873 * amdgpu_device_indirect_rreg - read an indirect register 874 * 875 * @adev: amdgpu_device pointer 876 * @reg_addr: indirect register address to read from 877 * 878 * Returns the value of indirect register @reg_addr 879 */ 880 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 881 u32 reg_addr) 882 { 883 unsigned long flags, pcie_index, pcie_data; 884 void __iomem *pcie_index_offset; 885 void __iomem *pcie_data_offset; 886 u32 r; 887 888 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 889 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 890 891 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 892 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 893 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 894 895 writel(reg_addr, pcie_index_offset); 896 readl(pcie_index_offset); 897 r = readl(pcie_data_offset); 898 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 899 900 return r; 901 } 902 903 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev, 904 u64 reg_addr) 905 { 906 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 907 u32 r; 908 void __iomem *pcie_index_offset; 909 void __iomem *pcie_index_hi_offset; 910 void __iomem *pcie_data_offset; 911 912 if (unlikely(!adev->nbio.funcs)) { 913 pcie_index = AMDGPU_PCIE_INDEX_FALLBACK; 914 pcie_data = AMDGPU_PCIE_DATA_FALLBACK; 915 } else { 916 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 917 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 918 } 919 920 if (reg_addr >> 32) { 921 if (unlikely(!adev->nbio.funcs)) 922 pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK; 923 else 924 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 925 } else { 926 pcie_index_hi = 0; 927 } 928 929 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 930 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 931 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 932 if (pcie_index_hi != 0) 933 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 934 pcie_index_hi * 4; 935 936 writel(reg_addr, pcie_index_offset); 937 readl(pcie_index_offset); 938 if (pcie_index_hi != 0) { 939 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 940 readl(pcie_index_hi_offset); 941 } 942 r = readl(pcie_data_offset); 943 944 /* clear the high bits */ 945 if (pcie_index_hi != 0) { 946 writel(0, pcie_index_hi_offset); 947 readl(pcie_index_hi_offset); 948 } 949 950 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 951 952 return r; 953 } 954 955 /** 956 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 957 * 958 * @adev: 
amdgpu_device pointer 959 * @reg_addr: indirect register address to read from 960 * 961 * Returns the value of indirect register @reg_addr 962 */ 963 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 964 u32 reg_addr) 965 { 966 unsigned long flags, pcie_index, pcie_data; 967 void __iomem *pcie_index_offset; 968 void __iomem *pcie_data_offset; 969 u64 r; 970 971 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 972 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 973 974 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 975 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 976 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 977 978 /* read low 32 bits */ 979 writel(reg_addr, pcie_index_offset); 980 readl(pcie_index_offset); 981 r = readl(pcie_data_offset); 982 /* read high 32 bits */ 983 writel(reg_addr + 4, pcie_index_offset); 984 readl(pcie_index_offset); 985 r |= ((u64)readl(pcie_data_offset) << 32); 986 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 987 988 return r; 989 } 990 991 u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev, 992 u64 reg_addr) 993 { 994 unsigned long flags, pcie_index, pcie_data; 995 unsigned long pcie_index_hi = 0; 996 void __iomem *pcie_index_offset; 997 void __iomem *pcie_index_hi_offset; 998 void __iomem *pcie_data_offset; 999 u64 r; 1000 1001 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1002 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1003 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1004 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1005 1006 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1007 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1008 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1009 if (pcie_index_hi != 0) 1010 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1011 pcie_index_hi * 4; 1012 1013 /* read low 32 bits */ 1014 writel(reg_addr, pcie_index_offset); 1015 readl(pcie_index_offset); 1016 if (pcie_index_hi != 0) { 1017 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1018 readl(pcie_index_hi_offset); 1019 } 1020 r = readl(pcie_data_offset); 1021 /* read high 32 bits */ 1022 writel(reg_addr + 4, pcie_index_offset); 1023 readl(pcie_index_offset); 1024 if (pcie_index_hi != 0) { 1025 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1026 readl(pcie_index_hi_offset); 1027 } 1028 r |= ((u64)readl(pcie_data_offset) << 32); 1029 1030 /* clear the high bits */ 1031 if (pcie_index_hi != 0) { 1032 writel(0, pcie_index_hi_offset); 1033 readl(pcie_index_hi_offset); 1034 } 1035 1036 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1037 1038 return r; 1039 } 1040 1041 /** 1042 * amdgpu_device_indirect_wreg - write an indirect register address 1043 * 1044 * @adev: amdgpu_device pointer 1045 * @reg_addr: indirect register offset 1046 * @reg_data: indirect register data 1047 * 1048 */ 1049 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 1050 u32 reg_addr, u32 reg_data) 1051 { 1052 unsigned long flags, pcie_index, pcie_data; 1053 void __iomem *pcie_index_offset; 1054 void __iomem *pcie_data_offset; 1055 1056 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1057 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1058 1059 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1060 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1061 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1062 1063 
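	/*
	 * Program the index register with the target address, then write the
	 * data register; each writel() is followed by a readl() so the posted
	 * PCIe write is flushed before the next access.
	 */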
writel(reg_addr, pcie_index_offset); 1064 readl(pcie_index_offset); 1065 writel(reg_data, pcie_data_offset); 1066 readl(pcie_data_offset); 1067 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1068 } 1069 1070 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev, 1071 u64 reg_addr, u32 reg_data) 1072 { 1073 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 1074 void __iomem *pcie_index_offset; 1075 void __iomem *pcie_index_hi_offset; 1076 void __iomem *pcie_data_offset; 1077 1078 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1079 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1080 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1081 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1082 else 1083 pcie_index_hi = 0; 1084 1085 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1086 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1087 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1088 if (pcie_index_hi != 0) 1089 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1090 pcie_index_hi * 4; 1091 1092 writel(reg_addr, pcie_index_offset); 1093 readl(pcie_index_offset); 1094 if (pcie_index_hi != 0) { 1095 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1096 readl(pcie_index_hi_offset); 1097 } 1098 writel(reg_data, pcie_data_offset); 1099 readl(pcie_data_offset); 1100 1101 /* clear the high bits */ 1102 if (pcie_index_hi != 0) { 1103 writel(0, pcie_index_hi_offset); 1104 readl(pcie_index_hi_offset); 1105 } 1106 1107 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1108 } 1109 1110 /** 1111 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 1112 * 1113 * @adev: amdgpu_device pointer 1114 * @reg_addr: indirect register offset 1115 * @reg_data: indirect register data 1116 * 1117 */ 1118 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 1119 u32 reg_addr, u64 reg_data) 1120 { 1121 unsigned long flags, pcie_index, pcie_data; 1122 void __iomem *pcie_index_offset; 1123 void __iomem *pcie_data_offset; 1124 1125 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1126 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1127 1128 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1129 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1130 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1131 1132 /* write low 32 bits */ 1133 writel(reg_addr, pcie_index_offset); 1134 readl(pcie_index_offset); 1135 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1136 readl(pcie_data_offset); 1137 /* write high 32 bits */ 1138 writel(reg_addr + 4, pcie_index_offset); 1139 readl(pcie_index_offset); 1140 writel((u32)(reg_data >> 32), pcie_data_offset); 1141 readl(pcie_data_offset); 1142 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1143 } 1144 1145 void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev, 1146 u64 reg_addr, u64 reg_data) 1147 { 1148 unsigned long flags, pcie_index, pcie_data; 1149 unsigned long pcie_index_hi = 0; 1150 void __iomem *pcie_index_offset; 1151 void __iomem *pcie_index_hi_offset; 1152 void __iomem *pcie_data_offset; 1153 1154 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1155 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1156 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1157 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1158 1159 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1160 pcie_index_offset 
= (void __iomem *)adev->rmmio + pcie_index * 4; 1161 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1162 if (pcie_index_hi != 0) 1163 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1164 pcie_index_hi * 4; 1165 1166 /* write low 32 bits */ 1167 writel(reg_addr, pcie_index_offset); 1168 readl(pcie_index_offset); 1169 if (pcie_index_hi != 0) { 1170 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1171 readl(pcie_index_hi_offset); 1172 } 1173 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1174 readl(pcie_data_offset); 1175 /* write high 32 bits */ 1176 writel(reg_addr + 4, pcie_index_offset); 1177 readl(pcie_index_offset); 1178 if (pcie_index_hi != 0) { 1179 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1180 readl(pcie_index_hi_offset); 1181 } 1182 writel((u32)(reg_data >> 32), pcie_data_offset); 1183 readl(pcie_data_offset); 1184 1185 /* clear the high bits */ 1186 if (pcie_index_hi != 0) { 1187 writel(0, pcie_index_hi_offset); 1188 readl(pcie_index_hi_offset); 1189 } 1190 1191 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1192 } 1193 1194 /** 1195 * amdgpu_device_get_rev_id - query device rev_id 1196 * 1197 * @adev: amdgpu_device pointer 1198 * 1199 * Return device rev_id 1200 */ 1201 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev) 1202 { 1203 return adev->nbio.funcs->get_rev_id(adev); 1204 } 1205 1206 /** 1207 * amdgpu_invalid_rreg - dummy reg read function 1208 * 1209 * @adev: amdgpu_device pointer 1210 * @reg: offset of register 1211 * 1212 * Dummy register read function. Used for register blocks 1213 * that certain asics don't have (all asics). 1214 * Returns the value in the register. 1215 */ 1216 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 1217 { 1218 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 1219 BUG(); 1220 return 0; 1221 } 1222 1223 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg) 1224 { 1225 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 1226 BUG(); 1227 return 0; 1228 } 1229 1230 /** 1231 * amdgpu_invalid_wreg - dummy reg write function 1232 * 1233 * @adev: amdgpu_device pointer 1234 * @reg: offset of register 1235 * @v: value to write to the register 1236 * 1237 * Dummy register read function. Used for register blocks 1238 * that certain asics don't have (all asics). 1239 */ 1240 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 1241 { 1242 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 1243 reg, v); 1244 BUG(); 1245 } 1246 1247 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v) 1248 { 1249 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n", 1250 reg, v); 1251 BUG(); 1252 } 1253 1254 /** 1255 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 1256 * 1257 * @adev: amdgpu_device pointer 1258 * @reg: offset of register 1259 * 1260 * Dummy register read function. Used for register blocks 1261 * that certain asics don't have (all asics). 1262 * Returns the value in the register. 
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	int ret;

	amdgpu_asic_pre_asic_init(adev);

	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
		amdgpu_psp_wait_for_bootloader(adev);
		ret = amdgpu_atomfirmware_asic_init(adev, true);
		return ret;
	} else {
		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
	}

	return 0;
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
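 *
 * The scratch buffer is a single GPU page (AMDGPU_GPU_PAGE_SIZE), created
 * with both the VRAM and GTT domains allowed so placement can fall back to
 * GTT, and it stays CPU-mapped through adev->mem_scratch.ptr.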
1375 */ 1376 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev) 1377 { 1378 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE, 1379 AMDGPU_GEM_DOMAIN_VRAM | 1380 AMDGPU_GEM_DOMAIN_GTT, 1381 &adev->mem_scratch.robj, 1382 &adev->mem_scratch.gpu_addr, 1383 (void **)&adev->mem_scratch.ptr); 1384 } 1385 1386 /** 1387 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page 1388 * 1389 * @adev: amdgpu_device pointer 1390 * 1391 * Frees the VRAM scratch page. 1392 */ 1393 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev) 1394 { 1395 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL); 1396 } 1397 1398 /** 1399 * amdgpu_device_program_register_sequence - program an array of registers. 1400 * 1401 * @adev: amdgpu_device pointer 1402 * @registers: pointer to the register array 1403 * @array_size: size of the register array 1404 * 1405 * Programs an array or registers with and or masks. 1406 * This is a helper for setting golden registers. 1407 */ 1408 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 1409 const u32 *registers, 1410 const u32 array_size) 1411 { 1412 u32 tmp, reg, and_mask, or_mask; 1413 int i; 1414 1415 if (array_size % 3) 1416 return; 1417 1418 for (i = 0; i < array_size; i += 3) { 1419 reg = registers[i + 0]; 1420 and_mask = registers[i + 1]; 1421 or_mask = registers[i + 2]; 1422 1423 if (and_mask == 0xffffffff) { 1424 tmp = or_mask; 1425 } else { 1426 tmp = RREG32(reg); 1427 tmp &= ~and_mask; 1428 if (adev->family >= AMDGPU_FAMILY_AI) 1429 tmp |= (or_mask & and_mask); 1430 else 1431 tmp |= or_mask; 1432 } 1433 WREG32(reg, tmp); 1434 } 1435 } 1436 1437 /** 1438 * amdgpu_device_pci_config_reset - reset the GPU 1439 * 1440 * @adev: amdgpu_device pointer 1441 * 1442 * Resets the GPU using the pci config reset sequence. 1443 * Only applicable to asics prior to vega10. 1444 */ 1445 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 1446 { 1447 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 1448 } 1449 1450 /** 1451 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 1452 * 1453 * @adev: amdgpu_device pointer 1454 * 1455 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 1456 */ 1457 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 1458 { 1459 return pci_reset_function(adev->pdev); 1460 } 1461 1462 /* 1463 * amdgpu_device_wb_*() 1464 * Writeback is the method by which the GPU updates special pages in memory 1465 * with the status of certain GPU events (fences, ring pointers,etc.). 1466 */ 1467 1468 /** 1469 * amdgpu_device_wb_fini - Disable Writeback and free memory 1470 * 1471 * @adev: amdgpu_device pointer 1472 * 1473 * Disables Writeback and frees the Writeback memory (all asics). 1474 * Used at driver shutdown. 1475 */ 1476 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1477 { 1478 if (adev->wb.wb_obj) { 1479 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1480 &adev->wb.gpu_addr, 1481 (void **)&adev->wb.wb); 1482 adev->wb.wb_obj = NULL; 1483 } 1484 } 1485 1486 /** 1487 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory 1488 * 1489 * @adev: amdgpu_device pointer 1490 * 1491 * Initializes writeback and allocates writeback memory (all asics). 1492 * Used at driver startup. 1493 * Returns 0 on success or an -error on failure. 
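 *
 * Each writeback entry is a 256-bit (8 dword) slot; amdgpu_device_wb_get()
 * below hands out the slot index converted to a dword offset, which is why
 * it shifts the bit position left by 3.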
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long flags, offset;

	spin_lock_irqsave(&adev->wb.lock, flags);
	offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	unsigned long flags;

	wb >>= 3;
	spin_lock_irqsave(&adev->wb.lock, flags);
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
	spin_unlock_irqrestore(&adev->wb.lock, flags);
}

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
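 *
 * The resize follows the usual PCI flow: memory decoding is turned off, the
 * VRAM and doorbell BARs are released, pci_resize_resource() is called with
 * the requested size limited to what the hardware advertises, and then the
 * bus resources are reassigned and decoding is re-enabled.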
1573 */ 1574 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1575 { 1576 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1577 struct pci_bus *root; 1578 struct resource *res; 1579 unsigned int i; 1580 u16 cmd; 1581 int r; 1582 1583 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) 1584 return 0; 1585 1586 /* Bypass for VF */ 1587 if (amdgpu_sriov_vf(adev)) 1588 return 0; 1589 1590 /* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */ 1591 if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR)) 1592 DRM_WARN("System can't access extended configuration space, please check!!\n"); 1593 1594 /* skip if the bios has already enabled large BAR */ 1595 if (adev->gmc.real_vram_size && 1596 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1597 return 0; 1598 1599 /* Check if the root BUS has 64bit memory resources */ 1600 root = adev->pdev->bus; 1601 while (root->parent) 1602 root = root->parent; 1603 1604 pci_bus_for_each_resource(root, res, i) { 1605 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1606 res->start > 0x100000000ull) 1607 break; 1608 } 1609 1610 /* Trying to resize is pointless without a root hub window above 4GB */ 1611 if (!res) 1612 return 0; 1613 1614 /* Limit the BAR size to what is available */ 1615 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1616 rbar_size); 1617 1618 /* Disable memory decoding while we change the BAR addresses and size */ 1619 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1620 pci_write_config_word(adev->pdev, PCI_COMMAND, 1621 cmd & ~PCI_COMMAND_MEMORY); 1622 1623 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1624 amdgpu_doorbell_fini(adev); 1625 if (adev->asic_type >= CHIP_BONAIRE) 1626 pci_release_resource(adev->pdev, 2); 1627 1628 pci_release_resource(adev->pdev, 0); 1629 1630 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1631 if (r == -ENOSPC) 1632 DRM_INFO("Not enough PCI address space for a large BAR."); 1633 else if (r && r != -ENOTSUPP) 1634 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1635 1636 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1637 1638 /* When the doorbell or fb BAR isn't available we have no chance of 1639 * using the device. 1640 */ 1641 r = amdgpu_doorbell_init(adev); 1642 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1643 return -ENODEV; 1644 1645 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1646 1647 return 0; 1648 } 1649 1650 static bool amdgpu_device_read_bios(struct amdgpu_device *adev) 1651 { 1652 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU)) 1653 return false; 1654 1655 return true; 1656 } 1657 1658 /* 1659 * GPU helpers function. 1660 */ 1661 /** 1662 * amdgpu_device_need_post - check if the hw need post or not 1663 * 1664 * @adev: amdgpu_device pointer 1665 * 1666 * Check if the asic has been initialized (all asics) at driver startup 1667 * or post is needed if hw reset is performed. 1668 * Returns true if need or false if not. 
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (!amdgpu_device_read_bios(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In the whole-GPU pass-through virtualization case, after a VM
		 * reboot some old smc fw still needs the driver to do vPost, otherwise the
		 * gpu hangs. smc fw versions above 22.15 don't have this flaw, so we force
		 * vPost to be executed for smc versions below 22.15
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if an error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			release_firmware(adev->pm.fw);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset the whole hive on init */
	if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/*
 * Check whether seamless boot is supported.
 *
 * So far we only support seamless boot on DCE 3.0 or later.
 * If users report that it works on older ASICS as well, we may
 * loosen this.
 */
bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
{
	switch (amdgpu_seamless) {
	case -1:
		break;
	case 1:
		return true;
	case 0:
		return false;
	default:
		DRM_ERROR("Invalid value for amdgpu.seamless: %d\n",
			  amdgpu_seamless);
		return false;
	}

	if (!(adev->flags & AMD_IS_APU))
		return false;

	if (adev->mman.keep_stolen_vga_memory)
		return false;

	return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
}

/*
 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
 * don't support dynamic speed switching. Until we have confirmation from Intel
 * that a specific host supports it, it's safer that we keep it disabled for all.
 *
 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
 */
static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	/* eGPUs change speeds based on USB4 fabric conditions */
	if (dev_is_removable(adev->dev))
		return true;

	if (c->x86_vendor == X86_VENDOR_INTEL)
		return false;
#endif
	return true;
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
1787 */ 1788 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1789 { 1790 switch (amdgpu_aspm) { 1791 case -1: 1792 break; 1793 case 0: 1794 return false; 1795 case 1: 1796 return true; 1797 default: 1798 return false; 1799 } 1800 if (adev->flags & AMD_IS_APU) 1801 return false; 1802 if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK)) 1803 return false; 1804 return pcie_aspm_enabled(adev->pdev); 1805 } 1806 1807 /* if we get transitioned to only one device, take VGA back */ 1808 /** 1809 * amdgpu_device_vga_set_decode - enable/disable vga decode 1810 * 1811 * @pdev: PCI device pointer 1812 * @state: enable/disable vga decode 1813 * 1814 * Enable/disable vga decode (all asics). 1815 * Returns VGA resource flags. 1816 */ 1817 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1818 bool state) 1819 { 1820 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1821 1822 amdgpu_asic_set_vga_state(adev, state); 1823 if (state) 1824 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1825 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1826 else 1827 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1828 } 1829 1830 /** 1831 * amdgpu_device_check_block_size - validate the vm block size 1832 * 1833 * @adev: amdgpu_device pointer 1834 * 1835 * Validates the vm block size specified via module parameter. 1836 * The vm block size defines number of bits in page table versus page directory, 1837 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1838 * page table and the remaining bits are in the page directory. 1839 */ 1840 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1841 { 1842 /* defines number of bits in page table versus page directory, 1843 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1844 * page table and the remaining bits are in the page directory 1845 */ 1846 if (amdgpu_vm_block_size == -1) 1847 return; 1848 1849 if (amdgpu_vm_block_size < 9) { 1850 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1851 amdgpu_vm_block_size); 1852 amdgpu_vm_block_size = -1; 1853 } 1854 } 1855 1856 /** 1857 * amdgpu_device_check_vm_size - validate the vm size 1858 * 1859 * @adev: amdgpu_device pointer 1860 * 1861 * Validates the vm size in GB specified via module parameter. 1862 * The VM size is the size of the GPU virtual memory space in GB. 
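 *
 * For example, amdgpu.vm_size=0 would be rejected below and reset to the
 * default of -1, while any value of at least 1 GB is accepted as-is.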
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
{
	if (!(adev->flags & AMD_IS_APU) ||
	    adev->asic_type < CHIP_RAVEN)
		return 0;

	switch (adev->asic_type) {
	case CHIP_RAVEN:
		if (adev->pdev->device == 0x15dd)
			adev->apu_flags |= AMD_APU_IS_RAVEN;
		if (adev->pdev->device == 0x15d8)
			adev->apu_flags |= AMD_APU_IS_PICASSO;
		break;
	case CHIP_RENOIR:
		if ((adev->pdev->device == 0x1636) ||
		    (adev->pdev->device == 0x164c))
			adev->apu_flags |= AMD_APU_IS_RENOIR;
		else
			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
		break;
	case CHIP_VANGOGH:
		adev->apu_flags |= AMD_APU_IS_VANGOGH;
		break;
	case CHIP_YELLOW_CARP:
		break;
	case CHIP_CYAN_SKILLFISH:
		if ((adev->pdev->device == 0x13FE) ||
		    (adev->pdev->device == 0x143F))
			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
		break;
	default:
		break;
	}

	return 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
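 *
 * For example, an amdgpu.sched_jobs value below 4 is raised to 4, and a
 * value that is not a power of two (say 5) is rounded up to the next power
 * of two (8).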
1961 */ 1962 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1963 { 1964 int i; 1965 1966 if (amdgpu_sched_jobs < 4) { 1967 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1968 amdgpu_sched_jobs); 1969 amdgpu_sched_jobs = 4; 1970 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 1971 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1972 amdgpu_sched_jobs); 1973 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1974 } 1975 1976 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1977 /* gart size must be greater or equal to 32M */ 1978 dev_warn(adev->dev, "gart size (%d) too small\n", 1979 amdgpu_gart_size); 1980 amdgpu_gart_size = -1; 1981 } 1982 1983 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1984 /* gtt size must be greater or equal to 32M */ 1985 dev_warn(adev->dev, "gtt size (%d) too small\n", 1986 amdgpu_gtt_size); 1987 amdgpu_gtt_size = -1; 1988 } 1989 1990 /* valid range is between 4 and 9 inclusive */ 1991 if (amdgpu_vm_fragment_size != -1 && 1992 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1993 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1994 amdgpu_vm_fragment_size = -1; 1995 } 1996 1997 if (amdgpu_sched_hw_submission < 2) { 1998 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1999 amdgpu_sched_hw_submission); 2000 amdgpu_sched_hw_submission = 2; 2001 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 2002 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 2003 amdgpu_sched_hw_submission); 2004 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 2005 } 2006 2007 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 2008 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 2009 amdgpu_reset_method = -1; 2010 } 2011 2012 amdgpu_device_check_smu_prv_buffer_size(adev); 2013 2014 amdgpu_device_check_vm_size(adev); 2015 2016 amdgpu_device_check_block_size(adev); 2017 2018 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 2019 2020 for (i = 0; i < MAX_XCP; i++) 2021 adev->enforce_isolation[i] = !!enforce_isolation; 2022 2023 return 0; 2024 } 2025 2026 /** 2027 * amdgpu_switcheroo_set_state - set switcheroo state 2028 * 2029 * @pdev: pci dev pointer 2030 * @state: vga_switcheroo state 2031 * 2032 * Callback for the switcheroo driver. Suspends or resumes 2033 * the asics before or after it is powered up using ACPI methods. 
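 *
 * Roughly: switching ON re-enables the PCI device and resumes the asic,
 * while switching OFF suspends it, caches its PCI state and drops it to
 * D3cold. PX platforms skip the OFF path here, presumably because the
 * dGPU is powered down through runtime PM instead.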
2034 */ 2035 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 2036 enum vga_switcheroo_state state) 2037 { 2038 struct drm_device *dev = pci_get_drvdata(pdev); 2039 int r; 2040 2041 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 2042 return; 2043 2044 if (state == VGA_SWITCHEROO_ON) { 2045 pr_info("switched on\n"); 2046 /* don't suspend or resume card normally */ 2047 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2048 2049 pci_set_power_state(pdev, PCI_D0); 2050 amdgpu_device_load_pci_state(pdev); 2051 r = pci_enable_device(pdev); 2052 if (r) 2053 DRM_WARN("pci_enable_device failed (%d)\n", r); 2054 amdgpu_device_resume(dev, true); 2055 2056 dev->switch_power_state = DRM_SWITCH_POWER_ON; 2057 } else { 2058 pr_info("switched off\n"); 2059 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2060 amdgpu_device_prepare(dev); 2061 amdgpu_device_suspend(dev, true); 2062 amdgpu_device_cache_pci_state(pdev); 2063 /* Shut down the device */ 2064 pci_disable_device(pdev); 2065 pci_set_power_state(pdev, PCI_D3cold); 2066 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 2067 } 2068 } 2069 2070 /** 2071 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 2072 * 2073 * @pdev: pci dev pointer 2074 * 2075 * Callback for the switcheroo driver. Check of the switcheroo 2076 * state can be changed. 2077 * Returns true if the state can be changed, false if not. 2078 */ 2079 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 2080 { 2081 struct drm_device *dev = pci_get_drvdata(pdev); 2082 2083 /* 2084 * FIXME: open_count is protected by drm_global_mutex but that would lead to 2085 * locking inversion with the driver load path. And the access here is 2086 * completely racy anyway. So don't bother with locking for now. 2087 */ 2088 return atomic_read(&dev->open_count) == 0; 2089 } 2090 2091 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 2092 .set_gpu_state = amdgpu_switcheroo_set_state, 2093 .reprobe = NULL, 2094 .can_switch = amdgpu_switcheroo_can_switch, 2095 }; 2096 2097 /** 2098 * amdgpu_device_ip_set_clockgating_state - set the CG state 2099 * 2100 * @dev: amdgpu_device pointer 2101 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2102 * @state: clockgating state (gate or ungate) 2103 * 2104 * Sets the requested clockgating state for all instances of 2105 * the hardware IP specified. 2106 * Returns the error code from the last instance. 2107 */ 2108 int amdgpu_device_ip_set_clockgating_state(void *dev, 2109 enum amd_ip_block_type block_type, 2110 enum amd_clockgating_state state) 2111 { 2112 struct amdgpu_device *adev = dev; 2113 int i, r = 0; 2114 2115 for (i = 0; i < adev->num_ip_blocks; i++) { 2116 if (!adev->ip_blocks[i].status.valid) 2117 continue; 2118 if (adev->ip_blocks[i].version->type != block_type) 2119 continue; 2120 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 2121 continue; 2122 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 2123 (void *)adev, state); 2124 if (r) 2125 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 2126 adev->ip_blocks[i].version->funcs->name, r); 2127 } 2128 return r; 2129 } 2130 2131 /** 2132 * amdgpu_device_ip_set_powergating_state - set the PG state 2133 * 2134 * @dev: amdgpu_device pointer 2135 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2136 * @state: powergating state (gate or ungate) 2137 * 2138 * Sets the requested powergating state for all instances of 2139 * the hardware IP specified. 
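 *
 * Illustrative usage (the IP type and state below are arbitrary
 * examples):
 *
 *   amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCN,
 *                                          AMD_PG_STATE_GATE);
 *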
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u64 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle(&adev->ip_blocks[i]);
			if (r)
				return r;
			break;
		}
	}
	return 0;
}

/**
 * amdgpu_device_ip_is_valid - is the hardware IP enabled
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is enabled or not.
 * Returns true if the IP is enabled, false if not.
 */
bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev,
			       enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].status.valid;
	}
	return false;
}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
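 *
 * A short illustrative lookup (GFX is just an example type):
 *
 *   struct amdgpu_ip_block *ip =
 *           amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *   if (ip && ip->status.valid)
 *           DRM_INFO("GFX IP v%u.%u\n", ip->version->major,
 *                    ip->version->minor);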
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * Return 0 if the IP block's version is equal to or greater than the
 * requested @major.@minor version, or 1 if it is smaller or the ip_block
 * doesn't exist.
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	switch (ip_block_version->type) {
	case AMD_IP_BLOCK_TYPE_VCN:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
			return 0;
		break;
	case AMD_IP_BLOCK_TYPE_JPEG:
		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
			return 0;
		break;
	default:
		break;
	}

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		 ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks].adev = adev;

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display. This feature provides virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
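 *
 * Example configuration string, matching the parsing below (the PCI
 * address is illustrative): amdgpu.virtual_display=0000:03:00.0,2
 * enables two virtual CRTCs on that device, "all" selects every device,
 * and the CRTC count is clamped to the 1..6 range.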
2335 */ 2336 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2337 { 2338 adev->enable_virtual_display = false; 2339 2340 if (amdgpu_virtual_display) { 2341 const char *pci_address_name = pci_name(adev->pdev); 2342 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2343 2344 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2345 pciaddstr_tmp = pciaddstr; 2346 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2347 pciaddname = strsep(&pciaddname_tmp, ","); 2348 if (!strcmp("all", pciaddname) 2349 || !strcmp(pci_address_name, pciaddname)) { 2350 long num_crtc; 2351 int res = -1; 2352 2353 adev->enable_virtual_display = true; 2354 2355 if (pciaddname_tmp) 2356 res = kstrtol(pciaddname_tmp, 10, 2357 &num_crtc); 2358 2359 if (!res) { 2360 if (num_crtc < 1) 2361 num_crtc = 1; 2362 if (num_crtc > 6) 2363 num_crtc = 6; 2364 adev->mode_info.num_crtc = num_crtc; 2365 } else { 2366 adev->mode_info.num_crtc = 1; 2367 } 2368 break; 2369 } 2370 } 2371 2372 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2373 amdgpu_virtual_display, pci_address_name, 2374 adev->enable_virtual_display, adev->mode_info.num_crtc); 2375 2376 kfree(pciaddstr); 2377 } 2378 } 2379 2380 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2381 { 2382 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2383 adev->mode_info.num_crtc = 1; 2384 adev->enable_virtual_display = true; 2385 DRM_INFO("virtual_display:%d, num_crtc:%d\n", 2386 adev->enable_virtual_display, adev->mode_info.num_crtc); 2387 } 2388 } 2389 2390 /** 2391 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2392 * 2393 * @adev: amdgpu_device pointer 2394 * 2395 * Parses the asic configuration parameters specified in the gpu info 2396 * firmware and makes them availale to the driver for use in configuring 2397 * the asic. 2398 * Returns 0 on success, -EINVAL on failure. 2399 */ 2400 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2401 { 2402 const char *chip_name; 2403 int err; 2404 const struct gpu_info_firmware_header_v1_0 *hdr; 2405 2406 adev->firmware.gpu_info_fw = NULL; 2407 2408 if (adev->mman.discovery_bin) 2409 return 0; 2410 2411 switch (adev->asic_type) { 2412 default: 2413 return 0; 2414 case CHIP_VEGA10: 2415 chip_name = "vega10"; 2416 break; 2417 case CHIP_VEGA12: 2418 chip_name = "vega12"; 2419 break; 2420 case CHIP_RAVEN: 2421 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2422 chip_name = "raven2"; 2423 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2424 chip_name = "picasso"; 2425 else 2426 chip_name = "raven"; 2427 break; 2428 case CHIP_ARCTURUS: 2429 chip_name = "arcturus"; 2430 break; 2431 case CHIP_NAVI12: 2432 chip_name = "navi12"; 2433 break; 2434 } 2435 2436 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, 2437 "amdgpu/%s_gpu_info.bin", chip_name); 2438 if (err) { 2439 dev_err(adev->dev, 2440 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n", 2441 chip_name); 2442 goto out; 2443 } 2444 2445 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2446 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2447 2448 switch (hdr->version_major) { 2449 case 1: 2450 { 2451 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2452 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2453 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2454 2455 /* 2456 * Should be droped when DAL no longer needs it. 
2457 */ 2458 if (adev->asic_type == CHIP_NAVI12) 2459 goto parse_soc_bounding_box; 2460 2461 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2462 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2463 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2464 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2465 adev->gfx.config.max_texture_channel_caches = 2466 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2467 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2468 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2469 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2470 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2471 adev->gfx.config.double_offchip_lds_buf = 2472 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2473 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2474 adev->gfx.cu_info.max_waves_per_simd = 2475 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2476 adev->gfx.cu_info.max_scratch_slots_per_cu = 2477 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2478 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2479 if (hdr->version_minor >= 1) { 2480 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2481 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2482 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2483 adev->gfx.config.num_sc_per_sh = 2484 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2485 adev->gfx.config.num_packer_per_sc = 2486 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2487 } 2488 2489 parse_soc_bounding_box: 2490 /* 2491 * soc bounding box info is not integrated in disocovery table, 2492 * we always need to parse it from gpu info firmware if needed. 2493 */ 2494 if (hdr->version_minor == 2) { 2495 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2496 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2497 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2498 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2499 } 2500 break; 2501 } 2502 default: 2503 dev_err(adev->dev, 2504 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2505 err = -EINVAL; 2506 goto out; 2507 } 2508 out: 2509 return err; 2510 } 2511 2512 /** 2513 * amdgpu_device_ip_early_init - run early init for hardware IPs 2514 * 2515 * @adev: amdgpu_device pointer 2516 * 2517 * Early initialization pass for hardware IPs. The hardware IPs that make 2518 * up each asic are discovered each IP's early_init callback is run. This 2519 * is the first stage in initializing the asic. 2520 * Returns 0 on success, negative error code on failure. 
2521 */ 2522 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2523 { 2524 struct amdgpu_ip_block *ip_block; 2525 struct pci_dev *parent; 2526 int i, r; 2527 bool total; 2528 2529 amdgpu_device_enable_virtual_display(adev); 2530 2531 if (amdgpu_sriov_vf(adev)) { 2532 r = amdgpu_virt_request_full_gpu(adev, true); 2533 if (r) 2534 return r; 2535 } 2536 2537 switch (adev->asic_type) { 2538 #ifdef CONFIG_DRM_AMDGPU_SI 2539 case CHIP_VERDE: 2540 case CHIP_TAHITI: 2541 case CHIP_PITCAIRN: 2542 case CHIP_OLAND: 2543 case CHIP_HAINAN: 2544 adev->family = AMDGPU_FAMILY_SI; 2545 r = si_set_ip_blocks(adev); 2546 if (r) 2547 return r; 2548 break; 2549 #endif 2550 #ifdef CONFIG_DRM_AMDGPU_CIK 2551 case CHIP_BONAIRE: 2552 case CHIP_HAWAII: 2553 case CHIP_KAVERI: 2554 case CHIP_KABINI: 2555 case CHIP_MULLINS: 2556 if (adev->flags & AMD_IS_APU) 2557 adev->family = AMDGPU_FAMILY_KV; 2558 else 2559 adev->family = AMDGPU_FAMILY_CI; 2560 2561 r = cik_set_ip_blocks(adev); 2562 if (r) 2563 return r; 2564 break; 2565 #endif 2566 case CHIP_TOPAZ: 2567 case CHIP_TONGA: 2568 case CHIP_FIJI: 2569 case CHIP_POLARIS10: 2570 case CHIP_POLARIS11: 2571 case CHIP_POLARIS12: 2572 case CHIP_VEGAM: 2573 case CHIP_CARRIZO: 2574 case CHIP_STONEY: 2575 if (adev->flags & AMD_IS_APU) 2576 adev->family = AMDGPU_FAMILY_CZ; 2577 else 2578 adev->family = AMDGPU_FAMILY_VI; 2579 2580 r = vi_set_ip_blocks(adev); 2581 if (r) 2582 return r; 2583 break; 2584 default: 2585 r = amdgpu_discovery_set_ip_blocks(adev); 2586 if (r) 2587 return r; 2588 break; 2589 } 2590 2591 if (amdgpu_has_atpx() && 2592 (amdgpu_is_atpx_hybrid() || 2593 amdgpu_has_atpx_dgpu_power_cntl()) && 2594 ((adev->flags & AMD_IS_APU) == 0) && 2595 !dev_is_removable(&adev->pdev->dev)) 2596 adev->flags |= AMD_IS_PX; 2597 2598 if (!(adev->flags & AMD_IS_APU)) { 2599 parent = pcie_find_root_port(adev->pdev); 2600 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2601 } 2602 2603 2604 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2605 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2606 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2607 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2608 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2609 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2610 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2611 2612 total = true; 2613 for (i = 0; i < adev->num_ip_blocks; i++) { 2614 ip_block = &adev->ip_blocks[i]; 2615 2616 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2617 DRM_WARN("disabled ip block: %d <%s>\n", 2618 i, adev->ip_blocks[i].version->funcs->name); 2619 adev->ip_blocks[i].status.valid = false; 2620 } else if (ip_block->version->funcs->early_init) { 2621 r = ip_block->version->funcs->early_init(ip_block); 2622 if (r == -ENOENT) { 2623 adev->ip_blocks[i].status.valid = false; 2624 } else if (r) { 2625 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2626 adev->ip_blocks[i].version->funcs->name, r); 2627 total = false; 2628 } else { 2629 adev->ip_blocks[i].status.valid = true; 2630 } 2631 } else { 2632 adev->ip_blocks[i].status.valid = true; 2633 } 2634 /* get the vbios after the asic_funcs are set up */ 2635 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2636 r = amdgpu_device_parse_gpu_info_fw(adev); 2637 if (r) 2638 return r; 2639 2640 /* Read BIOS */ 2641 if (amdgpu_device_read_bios(adev)) { 2642 if (!amdgpu_get_bios(adev)) 2643 return -EINVAL; 2644 2645 r = amdgpu_atombios_init(adev); 2646 if (r) { 2647 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2648 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2649 return r; 2650 } 2651 } 2652 2653 /*get pf2vf msg info at it's earliest time*/ 2654 if (amdgpu_sriov_vf(adev)) 2655 amdgpu_virt_init_data_exchange(adev); 2656 2657 } 2658 } 2659 if (!total) 2660 return -ENODEV; 2661 2662 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 2663 if (ip_block->status.valid != false) 2664 amdgpu_amdkfd_device_probe(adev); 2665 2666 adev->cg_flags &= amdgpu_cg_mask; 2667 adev->pg_flags &= amdgpu_pg_mask; 2668 2669 return 0; 2670 } 2671 2672 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2673 { 2674 int i, r; 2675 2676 for (i = 0; i < adev->num_ip_blocks; i++) { 2677 if (!adev->ip_blocks[i].status.sw) 2678 continue; 2679 if (adev->ip_blocks[i].status.hw) 2680 continue; 2681 if (!amdgpu_ip_member_of_hwini( 2682 adev, adev->ip_blocks[i].version->type)) 2683 continue; 2684 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2685 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2686 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2687 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2688 if (r) { 2689 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2690 adev->ip_blocks[i].version->funcs->name, r); 2691 return r; 2692 } 2693 adev->ip_blocks[i].status.hw = true; 2694 } 2695 } 2696 2697 return 0; 2698 } 2699 2700 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2701 { 2702 int i, r; 2703 2704 for (i = 0; i < adev->num_ip_blocks; i++) { 2705 if (!adev->ip_blocks[i].status.sw) 2706 continue; 2707 if (adev->ip_blocks[i].status.hw) 2708 continue; 2709 if (!amdgpu_ip_member_of_hwini( 2710 adev, adev->ip_blocks[i].version->type)) 2711 continue; 2712 r = 
adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2713 if (r) { 2714 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2715 adev->ip_blocks[i].version->funcs->name, r); 2716 return r; 2717 } 2718 adev->ip_blocks[i].status.hw = true; 2719 } 2720 2721 return 0; 2722 } 2723 2724 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2725 { 2726 int r = 0; 2727 int i; 2728 uint32_t smu_version; 2729 2730 if (adev->asic_type >= CHIP_VEGA10) { 2731 for (i = 0; i < adev->num_ip_blocks; i++) { 2732 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2733 continue; 2734 2735 if (!amdgpu_ip_member_of_hwini(adev, 2736 AMD_IP_BLOCK_TYPE_PSP)) 2737 break; 2738 2739 if (!adev->ip_blocks[i].status.sw) 2740 continue; 2741 2742 /* no need to do the fw loading again if already done*/ 2743 if (adev->ip_blocks[i].status.hw == true) 2744 break; 2745 2746 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2747 r = adev->ip_blocks[i].version->funcs->resume(&adev->ip_blocks[i]); 2748 if (r) { 2749 DRM_ERROR("resume of IP block <%s> failed %d\n", 2750 adev->ip_blocks[i].version->funcs->name, r); 2751 return r; 2752 } 2753 } else { 2754 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2755 if (r) { 2756 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2757 adev->ip_blocks[i].version->funcs->name, r); 2758 return r; 2759 } 2760 } 2761 2762 adev->ip_blocks[i].status.hw = true; 2763 break; 2764 } 2765 } 2766 2767 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2768 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2769 2770 return r; 2771 } 2772 2773 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2774 { 2775 long timeout; 2776 int r, i; 2777 2778 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2779 struct amdgpu_ring *ring = adev->rings[i]; 2780 2781 /* No need to setup the GPU scheduler for rings that don't need it */ 2782 if (!ring || ring->no_scheduler) 2783 continue; 2784 2785 switch (ring->funcs->type) { 2786 case AMDGPU_RING_TYPE_GFX: 2787 timeout = adev->gfx_timeout; 2788 break; 2789 case AMDGPU_RING_TYPE_COMPUTE: 2790 timeout = adev->compute_timeout; 2791 break; 2792 case AMDGPU_RING_TYPE_SDMA: 2793 timeout = adev->sdma_timeout; 2794 break; 2795 default: 2796 timeout = adev->video_timeout; 2797 break; 2798 } 2799 2800 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL, 2801 DRM_SCHED_PRIORITY_COUNT, 2802 ring->num_hw_submission, 0, 2803 timeout, adev->reset_domain->wq, 2804 ring->sched_score, ring->name, 2805 adev->dev); 2806 if (r) { 2807 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2808 ring->name); 2809 return r; 2810 } 2811 r = amdgpu_uvd_entity_init(adev, ring); 2812 if (r) { 2813 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n", 2814 ring->name); 2815 return r; 2816 } 2817 r = amdgpu_vce_entity_init(adev, ring); 2818 if (r) { 2819 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n", 2820 ring->name); 2821 return r; 2822 } 2823 } 2824 2825 amdgpu_xcp_update_partition_sched_list(adev); 2826 2827 return 0; 2828 } 2829 2830 2831 /** 2832 * amdgpu_device_ip_init - run init for hardware IPs 2833 * 2834 * @adev: amdgpu_device pointer 2835 * 2836 * Main initialization pass for hardware IPs. The list of all the hardware 2837 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2838 * are run. sw_init initializes the software state associated with each IP 2839 * and hw_init initializes the hardware associated with each IP. 
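 * The COMMON and GMC blocks are brought up early in this pass so that
 * scratch memory and GPU memory allocations are available, and the
 * remaining blocks are then initialized in two hw_init phases with
 * firmware loading in between.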
2840 * Returns 0 on success, negative error code on failure. 2841 */ 2842 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2843 { 2844 bool init_badpage; 2845 int i, r; 2846 2847 r = amdgpu_ras_init(adev); 2848 if (r) 2849 return r; 2850 2851 for (i = 0; i < adev->num_ip_blocks; i++) { 2852 if (!adev->ip_blocks[i].status.valid) 2853 continue; 2854 if (adev->ip_blocks[i].version->funcs->sw_init) { 2855 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]); 2856 if (r) { 2857 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2858 adev->ip_blocks[i].version->funcs->name, r); 2859 goto init_failed; 2860 } 2861 } 2862 adev->ip_blocks[i].status.sw = true; 2863 2864 if (!amdgpu_ip_member_of_hwini( 2865 adev, adev->ip_blocks[i].version->type)) 2866 continue; 2867 2868 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2869 /* need to do common hw init early so everything is set up for gmc */ 2870 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2871 if (r) { 2872 DRM_ERROR("hw_init %d failed %d\n", i, r); 2873 goto init_failed; 2874 } 2875 adev->ip_blocks[i].status.hw = true; 2876 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2877 /* need to do gmc hw init early so we can allocate gpu mem */ 2878 /* Try to reserve bad pages early */ 2879 if (amdgpu_sriov_vf(adev)) 2880 amdgpu_virt_exchange_data(adev); 2881 2882 r = amdgpu_device_mem_scratch_init(adev); 2883 if (r) { 2884 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2885 goto init_failed; 2886 } 2887 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2888 if (r) { 2889 DRM_ERROR("hw_init %d failed %d\n", i, r); 2890 goto init_failed; 2891 } 2892 r = amdgpu_device_wb_init(adev); 2893 if (r) { 2894 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2895 goto init_failed; 2896 } 2897 adev->ip_blocks[i].status.hw = true; 2898 2899 /* right after GMC hw init, we create CSA */ 2900 if (adev->gfx.mcbp) { 2901 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2902 AMDGPU_GEM_DOMAIN_VRAM | 2903 AMDGPU_GEM_DOMAIN_GTT, 2904 AMDGPU_CSA_SIZE); 2905 if (r) { 2906 DRM_ERROR("allocate CSA failed %d\n", r); 2907 goto init_failed; 2908 } 2909 } 2910 2911 r = amdgpu_seq64_init(adev); 2912 if (r) { 2913 DRM_ERROR("allocate seq64 failed %d\n", r); 2914 goto init_failed; 2915 } 2916 } 2917 } 2918 2919 if (amdgpu_sriov_vf(adev)) 2920 amdgpu_virt_init_data_exchange(adev); 2921 2922 r = amdgpu_ib_pool_init(adev); 2923 if (r) { 2924 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2925 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2926 goto init_failed; 2927 } 2928 2929 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2930 if (r) 2931 goto init_failed; 2932 2933 r = amdgpu_device_ip_hw_init_phase1(adev); 2934 if (r) 2935 goto init_failed; 2936 2937 r = amdgpu_device_fw_loading(adev); 2938 if (r) 2939 goto init_failed; 2940 2941 r = amdgpu_device_ip_hw_init_phase2(adev); 2942 if (r) 2943 goto init_failed; 2944 2945 /* 2946 * retired pages will be loaded from eeprom and reserved here, 2947 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2948 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2949 * for I2C communication which only true at this point. 2950 * 2951 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2952 * failure from bad gpu situation and stop amdgpu init process 2953 * accordingly. 
For other failed cases, it will still release all
 * the resources and print an error message, rather than returning a
 * negative value to the upper level.
 *
 * Note: theoretically, this should be called before all vram allocations
 * to protect retired pages from being abused.
 */
	init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI);
	r = amdgpu_ras_recovery_init(adev, init_badpage);
	if (r)
		goto init_failed;

	/*
	 * In case of XGMI grab extra reference for reset domain for this device
	 */
	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		if (amdgpu_xgmi_add_device(adev) == 0) {
			if (!amdgpu_sriov_vf(adev)) {
				struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);

				if (WARN_ON(!hive)) {
					r = -ENOENT;
					goto init_failed;
				}

				if (!hive->reset_domain ||
				    !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
					r = -ENOENT;
					amdgpu_put_xgmi_hive(hive);
					goto init_failed;
				}

				/* Drop the early temporary reset domain we created for device */
				amdgpu_reset_put_reset_domain(adev->reset_domain);
				adev->reset_domain = hive->reset_domain;
				amdgpu_put_xgmi_hive(hive);
			}
		}
	}

	r = amdgpu_device_init_schedulers(adev);
	if (r)
		goto init_failed;

	if (adev->mman.buffer_funcs_ring->sched.ready)
		amdgpu_ttm_set_buffer_funcs_status(adev, true);

	/* Don't init kfd if the whole hive needs to be reset during init */
	if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
		kgd2kfd_init_zone_device(adev);
		amdgpu_amdkfd_device_init(adev);
	}

	amdgpu_fru_get_product_info(adev);

init_failed:

	return r;
}

/**
 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
 *
 * @adev: amdgpu_device pointer
 *
 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
 * this function before a GPU reset. If the value is retained after a
 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
 */
static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
{
	memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
}

/**
 * amdgpu_device_check_vram_lost - check if vram is valid
 *
 * @adev: amdgpu_device pointer
 *
 * Checks the reset magic value written to the gart pointer in VRAM.
 * The driver calls this after a GPU reset to see if the contents of
 * VRAM are lost or not.
 * Returns true if vram is lost, false if not.
 */
static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
{
	if (memcmp(adev->gart.ptr, adev->reset_magic,
			AMDGPU_RESET_MAGIC_NUM))
		return true;

	if (!amdgpu_in_reset(adev))
		return false;

	/*
	 * For all ASICs with baco/mode1 reset, the VRAM is
	 * always assumed to be lost.
	 */
	switch (amdgpu_asic_reset_method(adev)) {
	case AMD_RESET_METHOD_BACO:
	case AMD_RESET_METHOD_MODE1:
		return true;
	default:
		return false;
	}
}

/**
 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
 *
 * @adev: amdgpu_device pointer
 * @state: clockgating state (gate or ungate)
 *
 * The list of all the hardware IPs that make up the asic is walked and the
 * set_clockgating_state callbacks are run.
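 * The walk direction follows @state: blocks are gated in list order and
 * ungated in reverse order, and GFX/SDMA are skipped while in S0ix.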
3067 * Late initialization pass enabling clockgating for hardware IPs. 3068 * Fini or suspend, pass disabling clockgating for hardware IPs. 3069 * Returns 0 on success, negative error code on failure. 3070 */ 3071 3072 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 3073 enum amd_clockgating_state state) 3074 { 3075 int i, j, r; 3076 3077 if (amdgpu_emu_mode == 1) 3078 return 0; 3079 3080 for (j = 0; j < adev->num_ip_blocks; j++) { 3081 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3082 if (!adev->ip_blocks[i].status.late_initialized) 3083 continue; 3084 /* skip CG for GFX, SDMA on S0ix */ 3085 if (adev->in_s0ix && 3086 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3087 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3088 continue; 3089 /* skip CG for VCE/UVD, it's handled specially */ 3090 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3091 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3092 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3093 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3094 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 3095 /* enable clockgating to save power */ 3096 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 3097 state); 3098 if (r) { 3099 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 3100 adev->ip_blocks[i].version->funcs->name, r); 3101 return r; 3102 } 3103 } 3104 } 3105 3106 return 0; 3107 } 3108 3109 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 3110 enum amd_powergating_state state) 3111 { 3112 int i, j, r; 3113 3114 if (amdgpu_emu_mode == 1) 3115 return 0; 3116 3117 for (j = 0; j < adev->num_ip_blocks; j++) { 3118 i = state == AMD_PG_STATE_GATE ? 
j : adev->num_ip_blocks - j - 1; 3119 if (!adev->ip_blocks[i].status.late_initialized) 3120 continue; 3121 /* skip PG for GFX, SDMA on S0ix */ 3122 if (adev->in_s0ix && 3123 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3124 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3125 continue; 3126 /* skip CG for VCE/UVD, it's handled specially */ 3127 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3128 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3129 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3130 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3131 adev->ip_blocks[i].version->funcs->set_powergating_state) { 3132 /* enable powergating to save power */ 3133 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 3134 state); 3135 if (r) { 3136 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 3137 adev->ip_blocks[i].version->funcs->name, r); 3138 return r; 3139 } 3140 } 3141 } 3142 return 0; 3143 } 3144 3145 static int amdgpu_device_enable_mgpu_fan_boost(void) 3146 { 3147 struct amdgpu_gpu_instance *gpu_ins; 3148 struct amdgpu_device *adev; 3149 int i, ret = 0; 3150 3151 mutex_lock(&mgpu_info.mutex); 3152 3153 /* 3154 * MGPU fan boost feature should be enabled 3155 * only when there are two or more dGPUs in 3156 * the system 3157 */ 3158 if (mgpu_info.num_dgpu < 2) 3159 goto out; 3160 3161 for (i = 0; i < mgpu_info.num_dgpu; i++) { 3162 gpu_ins = &(mgpu_info.gpu_ins[i]); 3163 adev = gpu_ins->adev; 3164 if (!(adev->flags & AMD_IS_APU) && 3165 !gpu_ins->mgpu_fan_enabled) { 3166 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 3167 if (ret) 3168 break; 3169 3170 gpu_ins->mgpu_fan_enabled = 1; 3171 } 3172 } 3173 3174 out: 3175 mutex_unlock(&mgpu_info.mutex); 3176 3177 return ret; 3178 } 3179 3180 /** 3181 * amdgpu_device_ip_late_init - run late init for hardware IPs 3182 * 3183 * @adev: amdgpu_device pointer 3184 * 3185 * Late initialization pass for hardware IPs. The list of all the hardware 3186 * IPs that make up the asic is walked and the late_init callbacks are run. 3187 * late_init covers any special initialization that an IP requires 3188 * after all of the have been initialized or something that needs to happen 3189 * late in the init process. 3190 * Returns 0 on success, negative error code on failure. 
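 *
 * This is also where clock and power gating are enabled, the reset magic
 * is recorded for later VRAM-lost detection and, on XGMI hives, the link
 * p-state is dropped to minimum once all interlinked devices have
 * initialized.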
3191 */ 3192 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 3193 { 3194 struct amdgpu_gpu_instance *gpu_instance; 3195 int i = 0, r; 3196 3197 for (i = 0; i < adev->num_ip_blocks; i++) { 3198 if (!adev->ip_blocks[i].status.hw) 3199 continue; 3200 if (adev->ip_blocks[i].version->funcs->late_init) { 3201 r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]); 3202 if (r) { 3203 DRM_ERROR("late_init of IP block <%s> failed %d\n", 3204 adev->ip_blocks[i].version->funcs->name, r); 3205 return r; 3206 } 3207 } 3208 adev->ip_blocks[i].status.late_initialized = true; 3209 } 3210 3211 r = amdgpu_ras_late_init(adev); 3212 if (r) { 3213 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 3214 return r; 3215 } 3216 3217 if (!amdgpu_in_reset(adev)) 3218 amdgpu_ras_set_error_query_ready(adev, true); 3219 3220 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3221 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3222 3223 amdgpu_device_fill_reset_magic(adev); 3224 3225 r = amdgpu_device_enable_mgpu_fan_boost(); 3226 if (r) 3227 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 3228 3229 /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */ 3230 if (amdgpu_passthrough(adev) && 3231 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 3232 adev->asic_type == CHIP_ALDEBARAN)) 3233 amdgpu_dpm_handle_passthrough_sbr(adev, true); 3234 3235 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3236 mutex_lock(&mgpu_info.mutex); 3237 3238 /* 3239 * Reset device p-state to low as this was booted with high. 3240 * 3241 * This should be performed only after all devices from the same 3242 * hive get initialized. 3243 * 3244 * However, it's unknown how many device in the hive in advance. 3245 * As this is counted one by one during devices initializations. 3246 * 3247 * So, we wait for all XGMI interlinked devices initialized. 3248 * This may bring some delays as those devices may come from 3249 * different hives. But that should be OK. 
3250 */ 3251 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3252 for (i = 0; i < mgpu_info.num_gpu; i++) { 3253 gpu_instance = &(mgpu_info.gpu_ins[i]); 3254 if (gpu_instance->adev->flags & AMD_IS_APU) 3255 continue; 3256 3257 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3258 AMDGPU_XGMI_PSTATE_MIN); 3259 if (r) { 3260 DRM_ERROR("pstate setting failed (%d).\n", r); 3261 break; 3262 } 3263 } 3264 } 3265 3266 mutex_unlock(&mgpu_info.mutex); 3267 } 3268 3269 return 0; 3270 } 3271 3272 /** 3273 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3274 * 3275 * @adev: amdgpu_device pointer 3276 * 3277 * For ASICs need to disable SMC first 3278 */ 3279 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3280 { 3281 int i, r; 3282 3283 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3284 return; 3285 3286 for (i = 0; i < adev->num_ip_blocks; i++) { 3287 if (!adev->ip_blocks[i].status.hw) 3288 continue; 3289 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3290 r = adev->ip_blocks[i].version->funcs->hw_fini(&adev->ip_blocks[i]); 3291 /* XXX handle errors */ 3292 if (r) { 3293 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3294 adev->ip_blocks[i].version->funcs->name, r); 3295 } 3296 adev->ip_blocks[i].status.hw = false; 3297 break; 3298 } 3299 } 3300 } 3301 3302 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3303 { 3304 int i, r; 3305 3306 for (i = 0; i < adev->num_ip_blocks; i++) { 3307 if (!adev->ip_blocks[i].version->funcs->early_fini) 3308 continue; 3309 3310 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]); 3311 if (r) { 3312 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 3313 adev->ip_blocks[i].version->funcs->name, r); 3314 } 3315 } 3316 3317 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3318 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3319 3320 amdgpu_amdkfd_suspend(adev, false); 3321 3322 /* Workaroud for ASICs need to disable SMC first */ 3323 amdgpu_device_smu_fini_early(adev); 3324 3325 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3326 if (!adev->ip_blocks[i].status.hw) 3327 continue; 3328 3329 r = adev->ip_blocks[i].version->funcs->hw_fini(&adev->ip_blocks[i]); 3330 /* XXX handle errors */ 3331 if (r) { 3332 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3333 adev->ip_blocks[i].version->funcs->name, r); 3334 } 3335 3336 adev->ip_blocks[i].status.hw = false; 3337 } 3338 3339 if (amdgpu_sriov_vf(adev)) { 3340 if (amdgpu_virt_release_full_gpu(adev, false)) 3341 DRM_ERROR("failed to release exclusive mode on fini\n"); 3342 } 3343 3344 return 0; 3345 } 3346 3347 /** 3348 * amdgpu_device_ip_fini - run fini for hardware IPs 3349 * 3350 * @adev: amdgpu_device pointer 3351 * 3352 * Main teardown pass for hardware IPs. The list of all the hardware 3353 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3354 * are run. hw_fini tears down the hardware associated with each IP 3355 * and sw_fini tears down any software state associated with each IP. 3356 * Returns 0 on success, negative error code on failure. 
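 *
 * Teardown runs in the reverse order of initialization, and the GMC
 * block additionally releases the shared buffers (ucode BO, static CSA,
 * writeback, scratch memory, IB pool and seq64) created during init.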
3357 */ 3358 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3359 { 3360 int i, r; 3361 3362 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3363 amdgpu_virt_release_ras_err_handler_data(adev); 3364 3365 if (adev->gmc.xgmi.num_physical_nodes > 1) 3366 amdgpu_xgmi_remove_device(adev); 3367 3368 amdgpu_amdkfd_device_fini_sw(adev); 3369 3370 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3371 if (!adev->ip_blocks[i].status.sw) 3372 continue; 3373 3374 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3375 amdgpu_ucode_free_bo(adev); 3376 amdgpu_free_static_csa(&adev->virt.csa_obj); 3377 amdgpu_device_wb_fini(adev); 3378 amdgpu_device_mem_scratch_fini(adev); 3379 amdgpu_ib_pool_fini(adev); 3380 amdgpu_seq64_fini(adev); 3381 } 3382 if (adev->ip_blocks[i].version->funcs->sw_fini) { 3383 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]); 3384 /* XXX handle errors */ 3385 if (r) { 3386 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 3387 adev->ip_blocks[i].version->funcs->name, r); 3388 } 3389 } 3390 adev->ip_blocks[i].status.sw = false; 3391 adev->ip_blocks[i].status.valid = false; 3392 } 3393 3394 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3395 if (!adev->ip_blocks[i].status.late_initialized) 3396 continue; 3397 if (adev->ip_blocks[i].version->funcs->late_fini) 3398 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]); 3399 adev->ip_blocks[i].status.late_initialized = false; 3400 } 3401 3402 amdgpu_ras_fini(adev); 3403 3404 return 0; 3405 } 3406 3407 /** 3408 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3409 * 3410 * @work: work_struct. 3411 */ 3412 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3413 { 3414 struct amdgpu_device *adev = 3415 container_of(work, struct amdgpu_device, delayed_init_work.work); 3416 int r; 3417 3418 r = amdgpu_ib_ring_tests(adev); 3419 if (r) 3420 DRM_ERROR("ib ring test failed (%d).\n", r); 3421 } 3422 3423 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3424 { 3425 struct amdgpu_device *adev = 3426 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3427 3428 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3429 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3430 3431 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 3432 adev->gfx.gfx_off_state = true; 3433 } 3434 3435 /** 3436 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3437 * 3438 * @adev: amdgpu_device pointer 3439 * 3440 * Main suspend function for hardware IPs. The list of all the hardware 3441 * IPs that make up the asic is walked, clockgating is disabled and the 3442 * suspend callbacks are run. suspend puts the hardware and software state 3443 * in each IP into a state suitable for suspend. 3444 * Returns 0 on success, negative error code on failure. 3445 */ 3446 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3447 { 3448 int i, r; 3449 3450 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3451 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3452 3453 /* 3454 * Per PMFW team's suggestion, driver needs to handle gfxoff 3455 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3456 * scenario. Add the missing df cstate disablement here. 
3457 */ 3458 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3459 dev_warn(adev->dev, "Failed to disallow df cstate"); 3460 3461 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3462 if (!adev->ip_blocks[i].status.valid) 3463 continue; 3464 3465 /* displays are handled separately */ 3466 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3467 continue; 3468 3469 /* XXX handle errors */ 3470 r = adev->ip_blocks[i].version->funcs->suspend(&adev->ip_blocks[i]); 3471 /* XXX handle errors */ 3472 if (r) { 3473 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3474 adev->ip_blocks[i].version->funcs->name, r); 3475 return r; 3476 } 3477 3478 adev->ip_blocks[i].status.hw = false; 3479 } 3480 3481 return 0; 3482 } 3483 3484 /** 3485 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3486 * 3487 * @adev: amdgpu_device pointer 3488 * 3489 * Main suspend function for hardware IPs. The list of all the hardware 3490 * IPs that make up the asic is walked, clockgating is disabled and the 3491 * suspend callbacks are run. suspend puts the hardware and software state 3492 * in each IP into a state suitable for suspend. 3493 * Returns 0 on success, negative error code on failure. 3494 */ 3495 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3496 { 3497 int i, r; 3498 3499 if (adev->in_s0ix) 3500 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3501 3502 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3503 if (!adev->ip_blocks[i].status.valid) 3504 continue; 3505 /* displays are handled in phase1 */ 3506 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3507 continue; 3508 /* PSP lost connection when err_event_athub occurs */ 3509 if (amdgpu_ras_intr_triggered() && 3510 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3511 adev->ip_blocks[i].status.hw = false; 3512 continue; 3513 } 3514 3515 /* skip unnecessary suspend if we do not initialize them yet */ 3516 if (!amdgpu_ip_member_of_hwini( 3517 adev, adev->ip_blocks[i].version->type)) 3518 continue; 3519 3520 /* skip suspend of gfx/mes and psp for S0ix 3521 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3522 * like at runtime. PSP is also part of the always on hardware 3523 * so no need to suspend it. 3524 */ 3525 if (adev->in_s0ix && 3526 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3527 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3528 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3529 continue; 3530 3531 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3532 if (adev->in_s0ix && 3533 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3534 IP_VERSION(5, 0, 0)) && 3535 (adev->ip_blocks[i].version->type == 3536 AMD_IP_BLOCK_TYPE_SDMA)) 3537 continue; 3538 3539 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3540 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3541 * from this location and RLC Autoload automatically also gets loaded 3542 * from here based on PMFW -> PSP message during re-init sequence. 3543 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3544 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3545 */ 3546 if (amdgpu_in_reset(adev) && 3547 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3548 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3549 continue; 3550 3551 /* XXX handle errors */ 3552 r = adev->ip_blocks[i].version->funcs->suspend(&adev->ip_blocks[i]); 3553 /* XXX handle errors */ 3554 if (r) { 3555 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3556 adev->ip_blocks[i].version->funcs->name, r); 3557 } 3558 adev->ip_blocks[i].status.hw = false; 3559 /* handle putting the SMC in the appropriate state */ 3560 if (!amdgpu_sriov_vf(adev)) { 3561 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3562 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3563 if (r) { 3564 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3565 adev->mp1_state, r); 3566 return r; 3567 } 3568 } 3569 } 3570 } 3571 3572 return 0; 3573 } 3574 3575 /** 3576 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3577 * 3578 * @adev: amdgpu_device pointer 3579 * 3580 * Main suspend function for hardware IPs. The list of all the hardware 3581 * IPs that make up the asic is walked, clockgating is disabled and the 3582 * suspend callbacks are run. suspend puts the hardware and software state 3583 * in each IP into a state suitable for suspend. 3584 * Returns 0 on success, negative error code on failure. 3585 */ 3586 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3587 { 3588 int r; 3589 3590 if (amdgpu_sriov_vf(adev)) { 3591 amdgpu_virt_fini_data_exchange(adev); 3592 amdgpu_virt_request_full_gpu(adev, false); 3593 } 3594 3595 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3596 3597 r = amdgpu_device_ip_suspend_phase1(adev); 3598 if (r) 3599 return r; 3600 r = amdgpu_device_ip_suspend_phase2(adev); 3601 3602 if (amdgpu_sriov_vf(adev)) 3603 amdgpu_virt_release_full_gpu(adev, false); 3604 3605 return r; 3606 } 3607 3608 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3609 { 3610 int i, r; 3611 3612 static enum amd_ip_block_type ip_order[] = { 3613 AMD_IP_BLOCK_TYPE_COMMON, 3614 AMD_IP_BLOCK_TYPE_GMC, 3615 AMD_IP_BLOCK_TYPE_PSP, 3616 AMD_IP_BLOCK_TYPE_IH, 3617 }; 3618 3619 for (i = 0; i < adev->num_ip_blocks; i++) { 3620 int j; 3621 struct amdgpu_ip_block *block; 3622 3623 block = &adev->ip_blocks[i]; 3624 block->status.hw = false; 3625 3626 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3627 3628 if (block->version->type != ip_order[j] || 3629 !block->status.valid) 3630 continue; 3631 3632 r = block->version->funcs->hw_init(&adev->ip_blocks[i]); 3633 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3634 if (r) 3635 return r; 3636 block->status.hw = true; 3637 } 3638 } 3639 3640 return 0; 3641 } 3642 3643 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3644 { 3645 int i, r; 3646 3647 static enum amd_ip_block_type ip_order[] = { 3648 AMD_IP_BLOCK_TYPE_SMC, 3649 AMD_IP_BLOCK_TYPE_DCE, 3650 AMD_IP_BLOCK_TYPE_GFX, 3651 AMD_IP_BLOCK_TYPE_SDMA, 3652 AMD_IP_BLOCK_TYPE_MES, 3653 AMD_IP_BLOCK_TYPE_UVD, 3654 AMD_IP_BLOCK_TYPE_VCE, 3655 AMD_IP_BLOCK_TYPE_VCN, 3656 AMD_IP_BLOCK_TYPE_JPEG 3657 }; 3658 3659 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3660 int j; 3661 struct amdgpu_ip_block *block; 3662 3663 for (j = 0; j < adev->num_ip_blocks; j++) { 3664 block = &adev->ip_blocks[j]; 3665 3666 if (block->version->type != ip_order[i] || 3667 !block->status.valid || 3668 block->status.hw) 3669 continue; 3670 3671 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3672 r = 
block->version->funcs->resume(&adev->ip_blocks[i]); 3673 else 3674 r = block->version->funcs->hw_init(&adev->ip_blocks[i]); 3675 3676 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3677 if (r) 3678 return r; 3679 block->status.hw = true; 3680 } 3681 } 3682 3683 return 0; 3684 } 3685 3686 /** 3687 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3688 * 3689 * @adev: amdgpu_device pointer 3690 * 3691 * First resume function for hardware IPs. The list of all the hardware 3692 * IPs that make up the asic is walked and the resume callbacks are run for 3693 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3694 * after a suspend and updates the software state as necessary. This 3695 * function is also used for restoring the GPU after a GPU reset. 3696 * Returns 0 on success, negative error code on failure. 3697 */ 3698 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3699 { 3700 int i, r; 3701 3702 for (i = 0; i < adev->num_ip_blocks; i++) { 3703 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3704 continue; 3705 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3706 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3707 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3708 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3709 3710 r = adev->ip_blocks[i].version->funcs->resume(&adev->ip_blocks[i]); 3711 if (r) { 3712 DRM_ERROR("resume of IP block <%s> failed %d\n", 3713 adev->ip_blocks[i].version->funcs->name, r); 3714 return r; 3715 } 3716 adev->ip_blocks[i].status.hw = true; 3717 } 3718 } 3719 3720 return 0; 3721 } 3722 3723 /** 3724 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3725 * 3726 * @adev: amdgpu_device pointer 3727 * 3728 * First resume function for hardware IPs. The list of all the hardware 3729 * IPs that make up the asic is walked and the resume callbacks are run for 3730 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3731 * functional state after a suspend and updates the software state as 3732 * necessary. This function is also used for restoring the GPU after a GPU 3733 * reset. 3734 * Returns 0 on success, negative error code on failure. 3735 */ 3736 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3737 { 3738 int i, r; 3739 3740 for (i = 0; i < adev->num_ip_blocks; i++) { 3741 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3742 continue; 3743 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3744 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3745 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3746 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3747 continue; 3748 r = adev->ip_blocks[i].version->funcs->resume(&adev->ip_blocks[i]); 3749 if (r) { 3750 DRM_ERROR("resume of IP block <%s> failed %d\n", 3751 adev->ip_blocks[i].version->funcs->name, r); 3752 return r; 3753 } 3754 adev->ip_blocks[i].status.hw = true; 3755 } 3756 3757 return 0; 3758 } 3759 3760 /** 3761 * amdgpu_device_ip_resume - run resume for hardware IPs 3762 * 3763 * @adev: amdgpu_device pointer 3764 * 3765 * Main resume function for hardware IPs. The hardware IPs 3766 * are split into two resume functions because they are 3767 * also used in recovering from a GPU reset and some additional 3768 * steps need to be take between them. 
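 * Firmware loading sits between the two phases: phase 1 restores COMMON,
 * GMC and IH (plus PSP when running as an SR-IOV VF), firmware is then
 * loaded, and phase 2 resumes the remaining blocks.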
In this case (S3/S4) they are 3769 * run sequentially. 3770 * Returns 0 on success, negative error code on failure. 3771 */ 3772 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3773 { 3774 int r; 3775 3776 r = amdgpu_device_ip_resume_phase1(adev); 3777 if (r) 3778 return r; 3779 3780 r = amdgpu_device_fw_loading(adev); 3781 if (r) 3782 return r; 3783 3784 r = amdgpu_device_ip_resume_phase2(adev); 3785 3786 if (adev->mman.buffer_funcs_ring->sched.ready) 3787 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3788 3789 return r; 3790 } 3791 3792 /** 3793 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3794 * 3795 * @adev: amdgpu_device pointer 3796 * 3797 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3798 */ 3799 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3800 { 3801 if (amdgpu_sriov_vf(adev)) { 3802 if (adev->is_atom_fw) { 3803 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3804 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3805 } else { 3806 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3807 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3808 } 3809 3810 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3811 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3812 } 3813 } 3814 3815 /** 3816 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3817 * 3818 * @asic_type: AMD asic type 3819 * 3820 * Check if there is DC (new modesetting infrastructre) support for an asic. 3821 * returns true if DC has support, false if not. 3822 */ 3823 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3824 { 3825 switch (asic_type) { 3826 #ifdef CONFIG_DRM_AMDGPU_SI 3827 case CHIP_HAINAN: 3828 #endif 3829 case CHIP_TOPAZ: 3830 /* chips with no display hardware */ 3831 return false; 3832 #if defined(CONFIG_DRM_AMD_DC) 3833 case CHIP_TAHITI: 3834 case CHIP_PITCAIRN: 3835 case CHIP_VERDE: 3836 case CHIP_OLAND: 3837 /* 3838 * We have systems in the wild with these ASICs that require 3839 * LVDS and VGA support which is not supported with DC. 3840 * 3841 * Fallback to the non-DC driver here by default so as not to 3842 * cause regressions. 3843 */ 3844 #if defined(CONFIG_DRM_AMD_DC_SI) 3845 return amdgpu_dc > 0; 3846 #else 3847 return false; 3848 #endif 3849 case CHIP_BONAIRE: 3850 case CHIP_KAVERI: 3851 case CHIP_KABINI: 3852 case CHIP_MULLINS: 3853 /* 3854 * We have systems in the wild with these ASICs that require 3855 * VGA support which is not supported with DC. 3856 * 3857 * Fallback to the non-DC driver here by default so as not to 3858 * cause regressions. 
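		 *
		 * Users who specifically want DC on these parts can still opt
		 * in with the amdgpu.dc=1 module parameter; the
		 * "amdgpu_dc > 0" check below only enables DC on such an
		 * explicit request.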
3859 */ 3860 return amdgpu_dc > 0; 3861 default: 3862 return amdgpu_dc != 0; 3863 #else 3864 default: 3865 if (amdgpu_dc > 0) 3866 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 3867 return false; 3868 #endif 3869 } 3870 } 3871 3872 /** 3873 * amdgpu_device_has_dc_support - check if dc is supported 3874 * 3875 * @adev: amdgpu_device pointer 3876 * 3877 * Returns true for supported, false for not supported 3878 */ 3879 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3880 { 3881 if (adev->enable_virtual_display || 3882 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3883 return false; 3884 3885 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3886 } 3887 3888 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3889 { 3890 struct amdgpu_device *adev = 3891 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3892 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3893 3894 /* It's a bug to not have a hive within this function */ 3895 if (WARN_ON(!hive)) 3896 return; 3897 3898 /* 3899 * Use task barrier to synchronize all xgmi reset works across the 3900 * hive. task_barrier_enter and task_barrier_exit will block 3901 * until all the threads running the xgmi reset works reach 3902 * those points. task_barrier_full will do both blocks. 3903 */ 3904 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3905 3906 task_barrier_enter(&hive->tb); 3907 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3908 3909 if (adev->asic_reset_res) 3910 goto fail; 3911 3912 task_barrier_exit(&hive->tb); 3913 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3914 3915 if (adev->asic_reset_res) 3916 goto fail; 3917 3918 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 3919 } else { 3920 3921 task_barrier_full(&hive->tb); 3922 adev->asic_reset_res = amdgpu_asic_reset(adev); 3923 } 3924 3925 fail: 3926 if (adev->asic_reset_res) 3927 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3928 adev->asic_reset_res, adev_to_drm(adev)->unique); 3929 amdgpu_put_xgmi_hive(hive); 3930 } 3931 3932 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3933 { 3934 char *input = amdgpu_lockup_timeout; 3935 char *timeout_setting = NULL; 3936 int index = 0; 3937 long timeout; 3938 int ret = 0; 3939 3940 /* 3941 * By default timeout for non compute jobs is 10000 3942 * and 60000 for compute jobs. 3943 * In SR-IOV or passthrough mode, timeout for compute 3944 * jobs are 60000 by default. 3945 */ 3946 adev->gfx_timeout = msecs_to_jiffies(10000); 3947 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3948 if (amdgpu_sriov_vf(adev)) 3949 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3950 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3951 else 3952 adev->compute_timeout = msecs_to_jiffies(60000); 3953 3954 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3955 while ((timeout_setting = strsep(&input, ",")) && 3956 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3957 ret = kstrtol(timeout_setting, 0, &timeout); 3958 if (ret) 3959 return ret; 3960 3961 if (timeout == 0) { 3962 index++; 3963 continue; 3964 } else if (timeout < 0) { 3965 timeout = MAX_SCHEDULE_TIMEOUT; 3966 dev_warn(adev->dev, "lockup timeout disabled"); 3967 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3968 } else { 3969 timeout = msecs_to_jiffies(timeout); 3970 } 3971 3972 switch (index++) { 3973 case 0: 3974 adev->gfx_timeout = timeout; 3975 break; 3976 case 1: 3977 adev->compute_timeout = timeout; 3978 break; 3979 case 2: 3980 adev->sdma_timeout = timeout; 3981 break; 3982 case 3: 3983 adev->video_timeout = timeout; 3984 break; 3985 default: 3986 break; 3987 } 3988 } 3989 /* 3990 * There is only one value specified and 3991 * it should apply to all non-compute jobs. 3992 */ 3993 if (index == 1) { 3994 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3995 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3996 adev->compute_timeout = adev->gfx_timeout; 3997 } 3998 } 3999 4000 return ret; 4001 } 4002 4003 /** 4004 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 4005 * 4006 * @adev: amdgpu_device pointer 4007 * 4008 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 4009 */ 4010 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 4011 { 4012 struct iommu_domain *domain; 4013 4014 domain = iommu_get_domain_for_dev(adev->dev); 4015 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 4016 adev->ram_is_direct_mapped = true; 4017 } 4018 4019 #if defined(CONFIG_HSA_AMD_P2P) 4020 /** 4021 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled. 4022 * 4023 * @adev: amdgpu_device pointer 4024 * 4025 * return if IOMMU remapping bar address 4026 */ 4027 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev) 4028 { 4029 struct iommu_domain *domain; 4030 4031 domain = iommu_get_domain_for_dev(adev->dev); 4032 if (domain && (domain->type == IOMMU_DOMAIN_DMA || 4033 domain->type == IOMMU_DOMAIN_DMA_FQ)) 4034 return true; 4035 4036 return false; 4037 } 4038 #endif 4039 4040 static const struct attribute *amdgpu_dev_attributes[] = { 4041 &dev_attr_pcie_replay_count.attr, 4042 NULL 4043 }; 4044 4045 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 4046 { 4047 if (amdgpu_mcbp == 1) 4048 adev->gfx.mcbp = true; 4049 else if (amdgpu_mcbp == 0) 4050 adev->gfx.mcbp = false; 4051 4052 if (amdgpu_sriov_vf(adev)) 4053 adev->gfx.mcbp = true; 4054 4055 if (adev->gfx.mcbp) 4056 DRM_INFO("MCBP is enabled\n"); 4057 } 4058 4059 /** 4060 * amdgpu_device_init - initialize the driver 4061 * 4062 * @adev: amdgpu_device pointer 4063 * @flags: driver flags 4064 * 4065 * Initializes the driver info and hw (all asics). 4066 * Returns 0 for success or an error on failure. 4067 * Called at driver startup. 
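 *
 * Very roughly, initialization below proceeds as: structure/lock setup and
 * MMIO mapping, early IP init, an optional ASIC reset/post, clock and BIOS
 * setup, fence driver and IP hardware init, late init (clockgating, RAS
 * resume, delayed IB tests) and finally sysfs/switcheroo registration.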
4068 */ 4069 int amdgpu_device_init(struct amdgpu_device *adev, 4070 uint32_t flags) 4071 { 4072 struct drm_device *ddev = adev_to_drm(adev); 4073 struct pci_dev *pdev = adev->pdev; 4074 int r, i; 4075 bool px = false; 4076 u32 max_MBps; 4077 int tmp; 4078 4079 adev->shutdown = false; 4080 adev->flags = flags; 4081 4082 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 4083 adev->asic_type = amdgpu_force_asic_type; 4084 else 4085 adev->asic_type = flags & AMD_ASIC_MASK; 4086 4087 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 4088 if (amdgpu_emu_mode == 1) 4089 adev->usec_timeout *= 10; 4090 adev->gmc.gart_size = 512 * 1024 * 1024; 4091 adev->accel_working = false; 4092 adev->num_rings = 0; 4093 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 4094 adev->mman.buffer_funcs = NULL; 4095 adev->mman.buffer_funcs_ring = NULL; 4096 adev->vm_manager.vm_pte_funcs = NULL; 4097 adev->vm_manager.vm_pte_num_scheds = 0; 4098 adev->gmc.gmc_funcs = NULL; 4099 adev->harvest_ip_mask = 0x0; 4100 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 4101 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 4102 4103 adev->smc_rreg = &amdgpu_invalid_rreg; 4104 adev->smc_wreg = &amdgpu_invalid_wreg; 4105 adev->pcie_rreg = &amdgpu_invalid_rreg; 4106 adev->pcie_wreg = &amdgpu_invalid_wreg; 4107 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 4108 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 4109 adev->pciep_rreg = &amdgpu_invalid_rreg; 4110 adev->pciep_wreg = &amdgpu_invalid_wreg; 4111 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 4112 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 4113 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 4114 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 4115 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 4116 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 4117 adev->didt_rreg = &amdgpu_invalid_rreg; 4118 adev->didt_wreg = &amdgpu_invalid_wreg; 4119 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 4120 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 4121 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 4122 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 4123 4124 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 4125 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 4126 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 4127 4128 /* mutex initialization are all done here so we 4129 * can recall function without having locking issues 4130 */ 4131 mutex_init(&adev->firmware.mutex); 4132 mutex_init(&adev->pm.mutex); 4133 mutex_init(&adev->gfx.gpu_clock_mutex); 4134 mutex_init(&adev->srbm_mutex); 4135 mutex_init(&adev->gfx.pipe_reserve_mutex); 4136 mutex_init(&adev->gfx.gfx_off_mutex); 4137 mutex_init(&adev->gfx.partition_mutex); 4138 mutex_init(&adev->grbm_idx_mutex); 4139 mutex_init(&adev->mn_lock); 4140 mutex_init(&adev->virt.vf_errors.lock); 4141 mutex_init(&adev->virt.rlcg_reg_lock); 4142 hash_init(adev->mn_hash); 4143 mutex_init(&adev->psp.mutex); 4144 mutex_init(&adev->notifier_lock); 4145 mutex_init(&adev->pm.stable_pstate_ctx_lock); 4146 mutex_init(&adev->benchmark_mutex); 4147 mutex_init(&adev->gfx.reset_sem_mutex); 4148 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */ 4149 mutex_init(&adev->enforce_isolation_mutex); 4150 mutex_init(&adev->gfx.kfd_sch_mutex); 4151 4152 amdgpu_device_init_apu_flags(adev); 4153 4154 r = amdgpu_device_check_arguments(adev); 4155 if (r) 4156 return r; 4157 4158 
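	/*
	 * Spinlocks protecting the indirect (index/data pair) register
	 * accessors and other small bits of shared state; these must be
	 * ready before any of the register helpers are used.
	 */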
spin_lock_init(&adev->mmio_idx_lock); 4159 spin_lock_init(&adev->smc_idx_lock); 4160 spin_lock_init(&adev->pcie_idx_lock); 4161 spin_lock_init(&adev->uvd_ctx_idx_lock); 4162 spin_lock_init(&adev->didt_idx_lock); 4163 spin_lock_init(&adev->gc_cac_idx_lock); 4164 spin_lock_init(&adev->se_cac_idx_lock); 4165 spin_lock_init(&adev->audio_endpt_idx_lock); 4166 spin_lock_init(&adev->mm_stats.lock); 4167 spin_lock_init(&adev->wb.lock); 4168 4169 INIT_LIST_HEAD(&adev->reset_list); 4170 4171 INIT_LIST_HEAD(&adev->ras_list); 4172 4173 INIT_LIST_HEAD(&adev->pm.od_kobj_list); 4174 4175 INIT_DELAYED_WORK(&adev->delayed_init_work, 4176 amdgpu_device_delayed_init_work_handler); 4177 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 4178 amdgpu_device_delay_enable_gfx_off); 4179 /* 4180 * Initialize the enforce_isolation work structures for each XCP 4181 * partition. This work handler is responsible for enforcing shader 4182 * isolation on AMD GPUs. It counts the number of emitted fences for 4183 * each GFX and compute ring. If there are any fences, it schedules 4184 * the `enforce_isolation_work` to be run after a delay. If there are 4185 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the 4186 * runqueue. 4187 */ 4188 for (i = 0; i < MAX_XCP; i++) { 4189 INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work, 4190 amdgpu_gfx_enforce_isolation_handler); 4191 adev->gfx.enforce_isolation[i].adev = adev; 4192 adev->gfx.enforce_isolation[i].xcp_id = i; 4193 } 4194 4195 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 4196 4197 adev->gfx.gfx_off_req_count = 1; 4198 adev->gfx.gfx_off_residency = 0; 4199 adev->gfx.gfx_off_entrycount = 0; 4200 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 4201 4202 atomic_set(&adev->throttling_logging_enabled, 1); 4203 /* 4204 * If throttling continues, logging will be performed every minute 4205 * to avoid log flooding. "-1" is subtracted since the thermal 4206 * throttling interrupt comes every second. Thus, the total logging 4207 * interval is 59 seconds(retelimited printk interval) + 1(waiting 4208 * for throttling interrupt) = 60 seconds. 4209 */ 4210 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 4211 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 4212 4213 /* Registers mapping */ 4214 /* TODO: block userspace mapping of io register */ 4215 if (adev->asic_type >= CHIP_BONAIRE) { 4216 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 4217 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 4218 } else { 4219 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 4220 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 4221 } 4222 4223 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 4224 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 4225 4226 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 4227 if (!adev->rmmio) 4228 return -ENOMEM; 4229 4230 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 4231 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size); 4232 4233 /* 4234 * Reset domain needs to be present early, before XGMI hive discovered 4235 * (if any) and intitialized to use reset sem and in_gpu reset flag 4236 * early on during init and before calling to RREG32. 
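	 *
	 * (The register access helpers consult the reset domain to decide
	 * whether hardware access has to be skipped while a reset is in
	 * flight, hence the ordering requirement above.)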
4237 */ 4238 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 4239 if (!adev->reset_domain) 4240 return -ENOMEM; 4241 4242 /* detect hw virtualization here */ 4243 amdgpu_detect_virtualization(adev); 4244 4245 amdgpu_device_get_pcie_info(adev); 4246 4247 r = amdgpu_device_get_job_timeout_settings(adev); 4248 if (r) { 4249 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 4250 return r; 4251 } 4252 4253 amdgpu_device_set_mcbp(adev); 4254 4255 /* 4256 * By default, use default mode where all blocks are expected to be 4257 * initialized. At present a 'swinit' of blocks is required to be 4258 * completed before the need for a different level is detected. 4259 */ 4260 amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT); 4261 /* early init functions */ 4262 r = amdgpu_device_ip_early_init(adev); 4263 if (r) 4264 return r; 4265 4266 /* Get rid of things like offb */ 4267 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver); 4268 if (r) 4269 return r; 4270 4271 /* Enable TMZ based on IP_VERSION */ 4272 amdgpu_gmc_tmz_set(adev); 4273 4274 if (amdgpu_sriov_vf(adev) && 4275 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0)) 4276 /* VF MMIO access (except mailbox range) from CPU 4277 * will be blocked during sriov runtime 4278 */ 4279 adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT; 4280 4281 amdgpu_gmc_noretry_set(adev); 4282 /* Need to get xgmi info early to decide the reset behavior*/ 4283 if (adev->gmc.xgmi.supported) { 4284 r = adev->gfxhub.funcs->get_xgmi_info(adev); 4285 if (r) 4286 return r; 4287 } 4288 4289 /* enable PCIE atomic ops */ 4290 if (amdgpu_sriov_vf(adev)) { 4291 if (adev->virt.fw_reserve.p_pf2vf) 4292 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 4293 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 4294 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4295 /* APUs w/ gfx9 onwards doesn't reply on PCIe atomics, rather it is a 4296 * internal path natively support atomics, set have_atomics_support to true. 4297 */ 4298 } else if ((adev->flags & AMD_IS_APU) && 4299 (amdgpu_ip_version(adev, GC_HWIP, 0) > 4300 IP_VERSION(9, 0, 0))) { 4301 adev->have_atomics_support = true; 4302 } else { 4303 adev->have_atomics_support = 4304 !pci_enable_atomic_ops_to_root(adev->pdev, 4305 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 4306 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4307 } 4308 4309 if (!adev->have_atomics_support) 4310 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 4311 4312 /* doorbell bar mapping and doorbell index init*/ 4313 amdgpu_doorbell_init(adev); 4314 4315 if (amdgpu_emu_mode == 1) { 4316 /* post the asic on emulation mode */ 4317 emu_soc_asic_init(adev); 4318 goto fence_driver_init; 4319 } 4320 4321 amdgpu_reset_init(adev); 4322 4323 /* detect if we are with an SRIOV vbios */ 4324 if (adev->bios) 4325 amdgpu_device_detect_sriov_bios(adev); 4326 4327 /* check if we need to reset the asic 4328 * E.g., driver was not cleanly unloaded previously, etc. 
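	 * A typical example is a kexec or a crashed previous driver instance
	 * that left the ASIC running with stale state which has to be
	 * cleared before the hardware is initialized again.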
4329 */ 4330 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4331 if (adev->gmc.xgmi.num_physical_nodes) { 4332 dev_info(adev->dev, "Pending hive reset.\n"); 4333 amdgpu_set_init_level(adev, 4334 AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 4335 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && 4336 !amdgpu_device_has_display_hardware(adev)) { 4337 r = psp_gpu_reset(adev); 4338 } else { 4339 tmp = amdgpu_reset_method; 4340 /* It should do a default reset when loading or reloading the driver, 4341 * regardless of the module parameter reset_method. 4342 */ 4343 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4344 r = amdgpu_asic_reset(adev); 4345 amdgpu_reset_method = tmp; 4346 } 4347 4348 if (r) { 4349 dev_err(adev->dev, "asic reset on init failed\n"); 4350 goto failed; 4351 } 4352 } 4353 4354 /* Post card if necessary */ 4355 if (amdgpu_device_need_post(adev)) { 4356 if (!adev->bios) { 4357 dev_err(adev->dev, "no vBIOS found\n"); 4358 r = -EINVAL; 4359 goto failed; 4360 } 4361 DRM_INFO("GPU posting now...\n"); 4362 r = amdgpu_device_asic_init(adev); 4363 if (r) { 4364 dev_err(adev->dev, "gpu post error!\n"); 4365 goto failed; 4366 } 4367 } 4368 4369 if (adev->bios) { 4370 if (adev->is_atom_fw) { 4371 /* Initialize clocks */ 4372 r = amdgpu_atomfirmware_get_clock_info(adev); 4373 if (r) { 4374 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4375 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4376 goto failed; 4377 } 4378 } else { 4379 /* Initialize clocks */ 4380 r = amdgpu_atombios_get_clock_info(adev); 4381 if (r) { 4382 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4383 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4384 goto failed; 4385 } 4386 /* init i2c buses */ 4387 if (!amdgpu_device_has_dc_support(adev)) 4388 amdgpu_atombios_i2c_init(adev); 4389 } 4390 } 4391 4392 fence_driver_init: 4393 /* Fence driver */ 4394 r = amdgpu_fence_driver_sw_init(adev); 4395 if (r) { 4396 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4397 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4398 goto failed; 4399 } 4400 4401 /* init the mode config */ 4402 drm_mode_config_init(adev_to_drm(adev)); 4403 4404 r = amdgpu_device_ip_init(adev); 4405 if (r) { 4406 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4407 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4408 goto release_ras_con; 4409 } 4410 4411 amdgpu_fence_driver_hw_init(adev); 4412 4413 dev_info(adev->dev, 4414 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4415 adev->gfx.config.max_shader_engines, 4416 adev->gfx.config.max_sh_per_se, 4417 adev->gfx.config.max_cu_per_sh, 4418 adev->gfx.cu_info.number); 4419 4420 adev->accel_working = true; 4421 4422 amdgpu_vm_check_compute_bug(adev); 4423 4424 /* Initialize the buffer migration limit. */ 4425 if (amdgpu_moverate >= 0) 4426 max_MBps = amdgpu_moverate; 4427 else 4428 max_MBps = 8; /* Allow 8 MB/s. */ 4429 /* Get a log2 for easy divisions. */ 4430 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4431 4432 /* 4433 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4434 * Otherwise the mgpu fan boost feature will be skipped due to the 4435 * gpu instance is counted less. 4436 */ 4437 amdgpu_register_gpu_instance(adev); 4438 4439 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4440 * explicit gating rather than handling it automatically. 
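	 * The delayed_init_work queued below also runs the deferred IB ring
	 * tests once the hardware has settled.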
4441 */ 4442 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 4443 r = amdgpu_device_ip_late_init(adev); 4444 if (r) { 4445 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4446 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4447 goto release_ras_con; 4448 } 4449 /* must succeed. */ 4450 amdgpu_ras_resume(adev); 4451 queue_delayed_work(system_wq, &adev->delayed_init_work, 4452 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4453 } 4454 4455 if (amdgpu_sriov_vf(adev)) { 4456 amdgpu_virt_release_full_gpu(adev, true); 4457 flush_delayed_work(&adev->delayed_init_work); 4458 } 4459 4460 /* 4461 * Place those sysfs registering after `late_init`. As some of those 4462 * operations performed in `late_init` might affect the sysfs 4463 * interfaces creating. 4464 */ 4465 r = amdgpu_atombios_sysfs_init(adev); 4466 if (r) 4467 drm_err(&adev->ddev, 4468 "registering atombios sysfs failed (%d).\n", r); 4469 4470 r = amdgpu_pm_sysfs_init(adev); 4471 if (r) 4472 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 4473 4474 r = amdgpu_ucode_sysfs_init(adev); 4475 if (r) { 4476 adev->ucode_sysfs_en = false; 4477 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 4478 } else 4479 adev->ucode_sysfs_en = true; 4480 4481 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 4482 if (r) 4483 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4484 4485 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4486 if (r) 4487 dev_err(adev->dev, 4488 "Could not create amdgpu board attributes\n"); 4489 4490 amdgpu_fru_sysfs_init(adev); 4491 amdgpu_reg_state_sysfs_init(adev); 4492 amdgpu_xcp_cfg_sysfs_init(adev); 4493 4494 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4495 r = amdgpu_pmu_init(adev); 4496 if (r) 4497 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4498 4499 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4500 if (amdgpu_device_cache_pci_state(adev->pdev)) 4501 pci_restore_state(pdev); 4502 4503 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4504 /* this will fail for cards that aren't VGA class devices, just 4505 * ignore it 4506 */ 4507 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4508 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4509 4510 px = amdgpu_device_supports_px(ddev); 4511 4512 if (px || (!dev_is_removable(&adev->pdev->dev) && 4513 apple_gmux_detect(NULL, NULL))) 4514 vga_switcheroo_register_client(adev->pdev, 4515 &amdgpu_switcheroo_ops, px); 4516 4517 if (px) 4518 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4519 4520 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 4521 amdgpu_xgmi_reset_on_init(adev); 4522 4523 amdgpu_device_check_iommu_direct_map(adev); 4524 4525 return 0; 4526 4527 release_ras_con: 4528 if (amdgpu_sriov_vf(adev)) 4529 amdgpu_virt_release_full_gpu(adev, true); 4530 4531 /* failed in exclusive mode due to timeout */ 4532 if (amdgpu_sriov_vf(adev) && 4533 !amdgpu_sriov_runtime(adev) && 4534 amdgpu_virt_mmio_blocked(adev) && 4535 !amdgpu_virt_wait_reset(adev)) { 4536 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4537 /* Don't send request since VF is inactive. 
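		 * Returning -EAGAIN below allows the probe path to retry the
		 * whole initialization once the host has completed its reset
		 * of this VF.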
*/ 4538 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4539 adev->virt.ops = NULL; 4540 r = -EAGAIN; 4541 } 4542 amdgpu_release_ras_context(adev); 4543 4544 failed: 4545 amdgpu_vf_error_trans_all(adev); 4546 4547 return r; 4548 } 4549 4550 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4551 { 4552 4553 /* Clear all CPU mappings pointing to this device */ 4554 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4555 4556 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4557 amdgpu_doorbell_fini(adev); 4558 4559 iounmap(adev->rmmio); 4560 adev->rmmio = NULL; 4561 if (adev->mman.aper_base_kaddr) 4562 iounmap(adev->mman.aper_base_kaddr); 4563 adev->mman.aper_base_kaddr = NULL; 4564 4565 /* Memory manager related */ 4566 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4567 arch_phys_wc_del(adev->gmc.vram_mtrr); 4568 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4569 } 4570 } 4571 4572 /** 4573 * amdgpu_device_fini_hw - tear down the driver 4574 * 4575 * @adev: amdgpu_device pointer 4576 * 4577 * Tear down the driver info (all asics). 4578 * Called at driver shutdown. 4579 */ 4580 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4581 { 4582 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4583 flush_delayed_work(&adev->delayed_init_work); 4584 4585 if (adev->mman.initialized) 4586 drain_workqueue(adev->mman.bdev.wq); 4587 adev->shutdown = true; 4588 4589 /* make sure IB test finished before entering exclusive mode 4590 * to avoid preemption on IB test 4591 */ 4592 if (amdgpu_sriov_vf(adev)) { 4593 amdgpu_virt_request_full_gpu(adev, false); 4594 amdgpu_virt_fini_data_exchange(adev); 4595 } 4596 4597 /* disable all interrupts */ 4598 amdgpu_irq_disable_all(adev); 4599 if (adev->mode_info.mode_config_initialized) { 4600 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4601 drm_helper_force_disable_all(adev_to_drm(adev)); 4602 else 4603 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4604 } 4605 amdgpu_fence_driver_hw_fini(adev); 4606 4607 if (adev->pm.sysfs_initialized) 4608 amdgpu_pm_sysfs_fini(adev); 4609 if (adev->ucode_sysfs_en) 4610 amdgpu_ucode_sysfs_fini(adev); 4611 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4612 amdgpu_fru_sysfs_fini(adev); 4613 4614 amdgpu_reg_state_sysfs_fini(adev); 4615 amdgpu_xcp_cfg_sysfs_fini(adev); 4616 4617 /* disable ras feature must before hw fini */ 4618 amdgpu_ras_pre_fini(adev); 4619 4620 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4621 4622 amdgpu_device_ip_fini_early(adev); 4623 4624 amdgpu_irq_fini_hw(adev); 4625 4626 if (adev->mman.initialized) 4627 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4628 4629 amdgpu_gart_dummy_page_fini(adev); 4630 4631 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4632 amdgpu_device_unmap_mmio(adev); 4633 4634 } 4635 4636 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4637 { 4638 int idx; 4639 bool px; 4640 4641 amdgpu_fence_driver_sw_fini(adev); 4642 amdgpu_device_ip_fini(adev); 4643 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4644 adev->accel_working = false; 4645 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4646 4647 amdgpu_reset_fini(adev); 4648 4649 /* free i2c buses */ 4650 if (!amdgpu_device_has_dc_support(adev)) 4651 amdgpu_i2c_fini(adev); 4652 4653 if (amdgpu_emu_mode != 1) 4654 amdgpu_atombios_fini(adev); 4655 4656 kfree(adev->bios); 4657 adev->bios = NULL; 4658 4659 kfree(adev->fru_info); 4660 adev->fru_info = NULL; 4661 4662 px = 
amdgpu_device_supports_px(adev_to_drm(adev)); 4663 4664 if (px || (!dev_is_removable(&adev->pdev->dev) && 4665 apple_gmux_detect(NULL, NULL))) 4666 vga_switcheroo_unregister_client(adev->pdev); 4667 4668 if (px) 4669 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4670 4671 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4672 vga_client_unregister(adev->pdev); 4673 4674 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4675 4676 iounmap(adev->rmmio); 4677 adev->rmmio = NULL; 4678 amdgpu_doorbell_fini(adev); 4679 drm_dev_exit(idx); 4680 } 4681 4682 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4683 amdgpu_pmu_fini(adev); 4684 if (adev->mman.discovery_bin) 4685 amdgpu_discovery_fini(adev); 4686 4687 amdgpu_reset_put_reset_domain(adev->reset_domain); 4688 adev->reset_domain = NULL; 4689 4690 kfree(adev->pci_state); 4691 4692 } 4693 4694 /** 4695 * amdgpu_device_evict_resources - evict device resources 4696 * @adev: amdgpu device object 4697 * 4698 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4699 * of the vram memory type. Mainly used for evicting device resources 4700 * at suspend time. 4701 * 4702 */ 4703 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4704 { 4705 int ret; 4706 4707 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4708 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4709 return 0; 4710 4711 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4712 if (ret) 4713 DRM_WARN("evicting device resources failed\n"); 4714 return ret; 4715 } 4716 4717 /* 4718 * Suspend & resume. 4719 */ 4720 /** 4721 * amdgpu_device_prepare - prepare for device suspend 4722 * 4723 * @dev: drm dev pointer 4724 * 4725 * Prepare to put the hw in the suspend state (all asics). 4726 * Returns 0 for success or an error on failure. 4727 * Called at driver suspend. 4728 */ 4729 int amdgpu_device_prepare(struct drm_device *dev) 4730 { 4731 struct amdgpu_device *adev = drm_to_adev(dev); 4732 int i, r; 4733 4734 amdgpu_choose_low_power_state(adev); 4735 4736 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4737 return 0; 4738 4739 /* Evict the majority of BOs before starting suspend sequence */ 4740 r = amdgpu_device_evict_resources(adev); 4741 if (r) 4742 goto unprepare; 4743 4744 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 4745 4746 for (i = 0; i < adev->num_ip_blocks; i++) { 4747 if (!adev->ip_blocks[i].status.valid) 4748 continue; 4749 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 4750 continue; 4751 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]); 4752 if (r) 4753 goto unprepare; 4754 } 4755 4756 return 0; 4757 4758 unprepare: 4759 adev->in_s0ix = adev->in_s3 = false; 4760 4761 return r; 4762 } 4763 4764 /** 4765 * amdgpu_device_suspend - initiate device suspend 4766 * 4767 * @dev: drm dev pointer 4768 * @fbcon : notify the fbdev of suspend 4769 * 4770 * Puts the hw in the suspend state (all asics). 4771 * Returns 0 for success or an error on failure. 4772 * Called at driver suspend. 
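 *
 * Ordering matters here: phase 1 (display) is suspended first, then KFD
 * is suspended and VRAM is evicted, and only afterwards are the remaining
 * IP blocks suspended in phase 2.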
4773 */ 4774 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4775 { 4776 struct amdgpu_device *adev = drm_to_adev(dev); 4777 int r = 0; 4778 4779 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4780 return 0; 4781 4782 adev->in_suspend = true; 4783 4784 if (amdgpu_sriov_vf(adev)) { 4785 amdgpu_virt_fini_data_exchange(adev); 4786 r = amdgpu_virt_request_full_gpu(adev, false); 4787 if (r) 4788 return r; 4789 } 4790 4791 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4792 DRM_WARN("smart shift update failed\n"); 4793 4794 if (fbcon) 4795 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4796 4797 cancel_delayed_work_sync(&adev->delayed_init_work); 4798 4799 amdgpu_ras_suspend(adev); 4800 4801 amdgpu_device_ip_suspend_phase1(adev); 4802 4803 if (!adev->in_s0ix) 4804 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4805 4806 r = amdgpu_device_evict_resources(adev); 4807 if (r) 4808 return r; 4809 4810 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4811 4812 amdgpu_fence_driver_hw_fini(adev); 4813 4814 amdgpu_device_ip_suspend_phase2(adev); 4815 4816 if (amdgpu_sriov_vf(adev)) 4817 amdgpu_virt_release_full_gpu(adev, false); 4818 4819 r = amdgpu_dpm_notify_rlc_state(adev, false); 4820 if (r) 4821 return r; 4822 4823 return 0; 4824 } 4825 4826 /** 4827 * amdgpu_device_resume - initiate device resume 4828 * 4829 * @dev: drm dev pointer 4830 * @fbcon : notify the fbdev of resume 4831 * 4832 * Bring the hw back to operating state (all asics). 4833 * Returns 0 for success or an error on failure. 4834 * Called at driver resume. 4835 */ 4836 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4837 { 4838 struct amdgpu_device *adev = drm_to_adev(dev); 4839 int r = 0; 4840 4841 if (amdgpu_sriov_vf(adev)) { 4842 r = amdgpu_virt_request_full_gpu(adev, true); 4843 if (r) 4844 return r; 4845 } 4846 4847 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4848 return 0; 4849 4850 if (adev->in_s0ix) 4851 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4852 4853 /* post card */ 4854 if (amdgpu_device_need_post(adev)) { 4855 r = amdgpu_device_asic_init(adev); 4856 if (r) 4857 dev_err(adev->dev, "amdgpu asic init failed\n"); 4858 } 4859 4860 r = amdgpu_device_ip_resume(adev); 4861 4862 if (r) { 4863 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4864 goto exit; 4865 } 4866 amdgpu_fence_driver_hw_init(adev); 4867 4868 if (!adev->in_s0ix) { 4869 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4870 if (r) 4871 goto exit; 4872 } 4873 4874 r = amdgpu_device_ip_late_init(adev); 4875 if (r) 4876 goto exit; 4877 4878 queue_delayed_work(system_wq, &adev->delayed_init_work, 4879 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4880 exit: 4881 if (amdgpu_sriov_vf(adev)) { 4882 amdgpu_virt_init_data_exchange(adev); 4883 amdgpu_virt_release_full_gpu(adev, true); 4884 } 4885 4886 if (r) 4887 return r; 4888 4889 /* Make sure IB tests flushed */ 4890 flush_delayed_work(&adev->delayed_init_work); 4891 4892 if (fbcon) 4893 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4894 4895 amdgpu_ras_resume(adev); 4896 4897 if (adev->mode_info.num_crtc) { 4898 /* 4899 * Most of the connector probing functions try to acquire runtime pm 4900 * refs to ensure that the GPU is powered on when connector polling is 4901 * performed. Since we're calling this from a runtime PM callback, 4902 * trying to acquire rpm refs will cause us to deadlock. 
4903 * 4904 * Since we're guaranteed to be holding the rpm lock, it's safe to 4905 * temporarily disable the rpm helpers so this doesn't deadlock us. 4906 */ 4907 #ifdef CONFIG_PM 4908 dev->dev->power.disable_depth++; 4909 #endif 4910 if (!adev->dc_enabled) 4911 drm_helper_hpd_irq_event(dev); 4912 else 4913 drm_kms_helper_hotplug_event(dev); 4914 #ifdef CONFIG_PM 4915 dev->dev->power.disable_depth--; 4916 #endif 4917 } 4918 adev->in_suspend = false; 4919 4920 if (adev->enable_mes) 4921 amdgpu_mes_self_test(adev); 4922 4923 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4924 DRM_WARN("smart shift update failed\n"); 4925 4926 return 0; 4927 } 4928 4929 /** 4930 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4931 * 4932 * @adev: amdgpu_device pointer 4933 * 4934 * The list of all the hardware IPs that make up the asic is walked and 4935 * the check_soft_reset callbacks are run. check_soft_reset determines 4936 * if the asic is still hung or not. 4937 * Returns true if any of the IPs are still in a hung state, false if not. 4938 */ 4939 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4940 { 4941 int i; 4942 bool asic_hang = false; 4943 4944 if (amdgpu_sriov_vf(adev)) 4945 return true; 4946 4947 if (amdgpu_asic_need_full_reset(adev)) 4948 return true; 4949 4950 for (i = 0; i < adev->num_ip_blocks; i++) { 4951 if (!adev->ip_blocks[i].status.valid) 4952 continue; 4953 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4954 adev->ip_blocks[i].status.hang = 4955 adev->ip_blocks[i].version->funcs->check_soft_reset( 4956 &adev->ip_blocks[i]); 4957 if (adev->ip_blocks[i].status.hang) { 4958 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4959 asic_hang = true; 4960 } 4961 } 4962 return asic_hang; 4963 } 4964 4965 /** 4966 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4967 * 4968 * @adev: amdgpu_device pointer 4969 * 4970 * The list of all the hardware IPs that make up the asic is walked and the 4971 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4972 * handles any IP specific hardware or software state changes that are 4973 * necessary for a soft reset to succeed. 4974 * Returns 0 on success, negative error code on failure. 4975 */ 4976 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4977 { 4978 int i, r = 0; 4979 4980 for (i = 0; i < adev->num_ip_blocks; i++) { 4981 if (!adev->ip_blocks[i].status.valid) 4982 continue; 4983 if (adev->ip_blocks[i].status.hang && 4984 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4985 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]); 4986 if (r) 4987 return r; 4988 } 4989 } 4990 4991 return 0; 4992 } 4993 4994 /** 4995 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4996 * 4997 * @adev: amdgpu_device pointer 4998 * 4999 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 5000 * reset is necessary to recover. 5001 * Returns true if a full asic reset is required, false if not. 
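 * In practice a hang in GMC, SMC, ACP, DCE or PSP is treated as requiring
 * a full reset, as the checks below show.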
5002 */ 5003 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 5004 { 5005 int i; 5006 5007 if (amdgpu_asic_need_full_reset(adev)) 5008 return true; 5009 5010 for (i = 0; i < adev->num_ip_blocks; i++) { 5011 if (!adev->ip_blocks[i].status.valid) 5012 continue; 5013 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 5014 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 5015 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 5016 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 5017 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 5018 if (adev->ip_blocks[i].status.hang) { 5019 dev_info(adev->dev, "Some block need full reset!\n"); 5020 return true; 5021 } 5022 } 5023 } 5024 return false; 5025 } 5026 5027 /** 5028 * amdgpu_device_ip_soft_reset - do a soft reset 5029 * 5030 * @adev: amdgpu_device pointer 5031 * 5032 * The list of all the hardware IPs that make up the asic is walked and the 5033 * soft_reset callbacks are run if the block is hung. soft_reset handles any 5034 * IP specific hardware or software state changes that are necessary to soft 5035 * reset the IP. 5036 * Returns 0 on success, negative error code on failure. 5037 */ 5038 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 5039 { 5040 int i, r = 0; 5041 5042 for (i = 0; i < adev->num_ip_blocks; i++) { 5043 if (!adev->ip_blocks[i].status.valid) 5044 continue; 5045 if (adev->ip_blocks[i].status.hang && 5046 adev->ip_blocks[i].version->funcs->soft_reset) { 5047 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]); 5048 if (r) 5049 return r; 5050 } 5051 } 5052 5053 return 0; 5054 } 5055 5056 /** 5057 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 5058 * 5059 * @adev: amdgpu_device pointer 5060 * 5061 * The list of all the hardware IPs that make up the asic is walked and the 5062 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 5063 * handles any IP specific hardware or software state changes that are 5064 * necessary after the IP has been soft reset. 5065 * Returns 0 on success, negative error code on failure. 
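 *
 * Together with check_soft_reset, pre_soft_reset and soft_reset above,
 * this forms the soft-reset sequence used by amdgpu_device_pre_asic_reset()
 * before falling back to a full ASIC reset.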
5066 */ 5067 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 5068 { 5069 int i, r = 0; 5070 5071 for (i = 0; i < adev->num_ip_blocks; i++) { 5072 if (!adev->ip_blocks[i].status.valid) 5073 continue; 5074 if (adev->ip_blocks[i].status.hang && 5075 adev->ip_blocks[i].version->funcs->post_soft_reset) 5076 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]); 5077 if (r) 5078 return r; 5079 } 5080 5081 return 0; 5082 } 5083 5084 /** 5085 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 5086 * 5087 * @adev: amdgpu_device pointer 5088 * @reset_context: amdgpu reset context pointer 5089 * 5090 * do VF FLR and reinitialize Asic 5091 * return 0 means succeeded otherwise failed 5092 */ 5093 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 5094 struct amdgpu_reset_context *reset_context) 5095 { 5096 int r; 5097 struct amdgpu_hive_info *hive = NULL; 5098 5099 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { 5100 if (!amdgpu_ras_get_fed_status(adev)) 5101 amdgpu_virt_ready_to_reset(adev); 5102 amdgpu_virt_wait_reset(adev); 5103 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5104 r = amdgpu_virt_request_full_gpu(adev, true); 5105 } else { 5106 r = amdgpu_virt_reset_gpu(adev); 5107 } 5108 if (r) 5109 return r; 5110 5111 amdgpu_ras_set_fed(adev, false); 5112 amdgpu_irq_gpu_reset_resume_helper(adev); 5113 5114 /* some sw clean up VF needs to do before recover */ 5115 amdgpu_virt_post_reset(adev); 5116 5117 /* Resume IP prior to SMC */ 5118 r = amdgpu_device_ip_reinit_early_sriov(adev); 5119 if (r) 5120 return r; 5121 5122 amdgpu_virt_init_data_exchange(adev); 5123 5124 r = amdgpu_device_fw_loading(adev); 5125 if (r) 5126 return r; 5127 5128 /* now we are okay to resume SMC/CP/SDMA */ 5129 r = amdgpu_device_ip_reinit_late_sriov(adev); 5130 if (r) 5131 return r; 5132 5133 hive = amdgpu_get_xgmi_hive(adev); 5134 /* Update PSP FW topology after reset */ 5135 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5136 r = amdgpu_xgmi_update_topology(hive, adev); 5137 if (hive) 5138 amdgpu_put_xgmi_hive(hive); 5139 if (r) 5140 return r; 5141 5142 r = amdgpu_ib_ring_tests(adev); 5143 if (r) 5144 return r; 5145 5146 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) 5147 amdgpu_inc_vram_lost(adev); 5148 5149 /* need to be called during full access so we can't do it later like 5150 * bare-metal does. 
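	 * Once amdgpu_virt_release_full_gpu() below hands the GPU back to
	 * the host, the VF may lose register access again, so the KFD
	 * post-reset work has to happen at this point.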
5151 */ 5152 amdgpu_amdkfd_post_reset(adev); 5153 amdgpu_virt_release_full_gpu(adev, true); 5154 5155 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5156 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) || 5157 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 5158 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 5159 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5160 amdgpu_ras_resume(adev); 5161 return 0; 5162 } 5163 5164 /** 5165 * amdgpu_device_has_job_running - check if there is any job in mirror list 5166 * 5167 * @adev: amdgpu_device pointer 5168 * 5169 * check if there is any job in mirror list 5170 */ 5171 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5172 { 5173 int i; 5174 struct drm_sched_job *job; 5175 5176 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5177 struct amdgpu_ring *ring = adev->rings[i]; 5178 5179 if (!amdgpu_ring_sched_ready(ring)) 5180 continue; 5181 5182 spin_lock(&ring->sched.job_list_lock); 5183 job = list_first_entry_or_null(&ring->sched.pending_list, 5184 struct drm_sched_job, list); 5185 spin_unlock(&ring->sched.job_list_lock); 5186 if (job) 5187 return true; 5188 } 5189 return false; 5190 } 5191 5192 /** 5193 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5194 * 5195 * @adev: amdgpu_device pointer 5196 * 5197 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5198 * a hung GPU. 5199 */ 5200 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5201 { 5202 5203 if (amdgpu_gpu_recovery == 0) 5204 goto disabled; 5205 5206 /* Skip soft reset check in fatal error mode */ 5207 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5208 return true; 5209 5210 if (amdgpu_sriov_vf(adev)) 5211 return true; 5212 5213 if (amdgpu_gpu_recovery == -1) { 5214 switch (adev->asic_type) { 5215 #ifdef CONFIG_DRM_AMDGPU_SI 5216 case CHIP_VERDE: 5217 case CHIP_TAHITI: 5218 case CHIP_PITCAIRN: 5219 case CHIP_OLAND: 5220 case CHIP_HAINAN: 5221 #endif 5222 #ifdef CONFIG_DRM_AMDGPU_CIK 5223 case CHIP_KAVERI: 5224 case CHIP_KABINI: 5225 case CHIP_MULLINS: 5226 #endif 5227 case CHIP_CARRIZO: 5228 case CHIP_STONEY: 5229 case CHIP_CYAN_SKILLFISH: 5230 goto disabled; 5231 default: 5232 break; 5233 } 5234 } 5235 5236 return true; 5237 5238 disabled: 5239 dev_info(adev->dev, "GPU recovery disabled.\n"); 5240 return false; 5241 } 5242 5243 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5244 { 5245 u32 i; 5246 int ret = 0; 5247 5248 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5249 5250 dev_info(adev->dev, "GPU mode1 reset\n"); 5251 5252 /* Cache the state before bus master disable. The saved config space 5253 * values are used in other cases like restore after mode-2 reset. 
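	 * The cached state is also what amdgpu_device_load_pci_state()
	 * restores right after the reset further down in this function.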
5254 */ 5255 amdgpu_device_cache_pci_state(adev->pdev); 5256 5257 /* disable BM */ 5258 pci_clear_master(adev->pdev); 5259 5260 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5261 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5262 ret = amdgpu_dpm_mode1_reset(adev); 5263 } else { 5264 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5265 ret = psp_gpu_reset(adev); 5266 } 5267 5268 if (ret) 5269 goto mode1_reset_failed; 5270 5271 amdgpu_device_load_pci_state(adev->pdev); 5272 ret = amdgpu_psp_wait_for_bootloader(adev); 5273 if (ret) 5274 goto mode1_reset_failed; 5275 5276 /* wait for asic to come out of reset */ 5277 for (i = 0; i < adev->usec_timeout; i++) { 5278 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5279 5280 if (memsize != 0xffffffff) 5281 break; 5282 udelay(1); 5283 } 5284 5285 if (i >= adev->usec_timeout) { 5286 ret = -ETIMEDOUT; 5287 goto mode1_reset_failed; 5288 } 5289 5290 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5291 5292 return 0; 5293 5294 mode1_reset_failed: 5295 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5296 return ret; 5297 } 5298 5299 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5300 struct amdgpu_reset_context *reset_context) 5301 { 5302 int i, r = 0; 5303 struct amdgpu_job *job = NULL; 5304 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; 5305 bool need_full_reset = 5306 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5307 5308 if (reset_context->reset_req_dev == adev) 5309 job = reset_context->job; 5310 5311 if (amdgpu_sriov_vf(adev)) 5312 amdgpu_virt_pre_reset(adev); 5313 5314 amdgpu_fence_driver_isr_toggle(adev, true); 5315 5316 /* block all schedulers and reset given job's ring */ 5317 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5318 struct amdgpu_ring *ring = adev->rings[i]; 5319 5320 if (!amdgpu_ring_sched_ready(ring)) 5321 continue; 5322 5323 /* Clear job fence from fence drv to avoid force_completion 5324 * leave NULL and vm flush fence in fence drv 5325 */ 5326 amdgpu_fence_driver_clear_job_fences(ring); 5327 5328 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5329 amdgpu_fence_driver_force_completion(ring); 5330 } 5331 5332 amdgpu_fence_driver_isr_toggle(adev, false); 5333 5334 if (job && job->vm) 5335 drm_sched_increase_karma(&job->base); 5336 5337 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5338 /* If reset handler not implemented, continue; otherwise return */ 5339 if (r == -EOPNOTSUPP) 5340 r = 0; 5341 else 5342 return r; 5343 5344 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5345 if (!amdgpu_sriov_vf(adev)) { 5346 5347 if (!need_full_reset) 5348 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5349 5350 if (!need_full_reset && amdgpu_gpu_recovery && 5351 amdgpu_device_ip_check_soft_reset(adev)) { 5352 amdgpu_device_ip_pre_soft_reset(adev); 5353 r = amdgpu_device_ip_soft_reset(adev); 5354 amdgpu_device_ip_post_soft_reset(adev); 5355 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5356 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5357 need_full_reset = true; 5358 } 5359 } 5360 5361 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { 5362 dev_info(tmp_adev->dev, "Dumping IP State\n"); 5363 /* Trigger ip dump before we reset the asic */ 5364 for (i = 0; i < tmp_adev->num_ip_blocks; i++) 5365 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) 5366 tmp_adev->ip_blocks[i].version->funcs 5367 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]); 5368 
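			/*
			 * The captured state is expected to end up in the
			 * device coredump generated later during recovery
			 * (see amdgpu_coredump()).
			 */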
dev_info(tmp_adev->dev, "Dumping IP State Completed\n"); 5369 } 5370 5371 if (need_full_reset) 5372 r = amdgpu_device_ip_suspend(adev); 5373 if (need_full_reset) 5374 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5375 else 5376 clear_bit(AMDGPU_NEED_FULL_RESET, 5377 &reset_context->flags); 5378 } 5379 5380 return r; 5381 } 5382 5383 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) 5384 { 5385 struct list_head *device_list_handle; 5386 bool full_reset, vram_lost = false; 5387 struct amdgpu_device *tmp_adev; 5388 int r; 5389 5390 device_list_handle = reset_context->reset_device_list; 5391 5392 if (!device_list_handle) 5393 return -EINVAL; 5394 5395 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5396 5397 r = 0; 5398 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5399 /* After reset, it's default init level */ 5400 amdgpu_set_init_level(tmp_adev, AMDGPU_INIT_LEVEL_DEFAULT); 5401 if (full_reset) { 5402 /* post card */ 5403 amdgpu_ras_set_fed(tmp_adev, false); 5404 r = amdgpu_device_asic_init(tmp_adev); 5405 if (r) { 5406 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5407 } else { 5408 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5409 5410 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5411 if (r) 5412 goto out; 5413 5414 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5415 5416 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 5417 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job); 5418 5419 if (vram_lost) { 5420 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5421 amdgpu_inc_vram_lost(tmp_adev); 5422 } 5423 5424 r = amdgpu_device_fw_loading(tmp_adev); 5425 if (r) 5426 return r; 5427 5428 r = amdgpu_xcp_restore_partition_mode( 5429 tmp_adev->xcp_mgr); 5430 if (r) 5431 goto out; 5432 5433 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5434 if (r) 5435 goto out; 5436 5437 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 5438 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5439 5440 if (vram_lost) 5441 amdgpu_device_fill_reset_magic(tmp_adev); 5442 5443 /* 5444 * Add this ASIC as tracked as reset was already 5445 * complete successfully. 5446 */ 5447 amdgpu_register_gpu_instance(tmp_adev); 5448 5449 if (!reset_context->hive && 5450 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5451 amdgpu_xgmi_add_device(tmp_adev); 5452 5453 r = amdgpu_device_ip_late_init(tmp_adev); 5454 if (r) 5455 goto out; 5456 5457 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 5458 5459 /* 5460 * The GPU enters bad state once faulty pages 5461 * by ECC has reached the threshold, and ras 5462 * recovery is scheduled next. So add one check 5463 * here to break recovery if it indeed exceeds 5464 * bad page threshold, and remind user to 5465 * retire this GPU or setting one bigger 5466 * bad_page_threshold value to fix this once 5467 * probing driver again. 5468 */ 5469 if (!amdgpu_ras_is_rma(tmp_adev)) { 5470 /* must succeed. 
*/ 5471 amdgpu_ras_resume(tmp_adev); 5472 } else { 5473 r = -EINVAL; 5474 goto out; 5475 } 5476 5477 /* Update PSP FW topology after reset */ 5478 if (reset_context->hive && 5479 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5480 r = amdgpu_xgmi_update_topology( 5481 reset_context->hive, tmp_adev); 5482 } 5483 } 5484 5485 out: 5486 if (!r) { 5487 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5488 r = amdgpu_ib_ring_tests(tmp_adev); 5489 if (r) { 5490 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5491 r = -EAGAIN; 5492 goto end; 5493 } 5494 } 5495 5496 if (r) 5497 tmp_adev->asic_reset_res = r; 5498 } 5499 5500 end: 5501 return r; 5502 } 5503 5504 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5505 struct amdgpu_reset_context *reset_context) 5506 { 5507 struct amdgpu_device *tmp_adev = NULL; 5508 bool need_full_reset, skip_hw_reset; 5509 int r = 0; 5510 5511 /* Try reset handler method first */ 5512 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5513 reset_list); 5514 5515 reset_context->reset_device_list = device_list_handle; 5516 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5517 /* If reset handler not implemented, continue; otherwise return */ 5518 if (r == -EOPNOTSUPP) 5519 r = 0; 5520 else 5521 return r; 5522 5523 /* Reset handler not implemented, use the default method */ 5524 need_full_reset = 5525 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5526 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5527 5528 /* 5529 * ASIC reset has to be done on all XGMI hive nodes ASAP 5530 * to allow proper links negotiation in FW (within 1 sec) 5531 */ 5532 if (!skip_hw_reset && need_full_reset) { 5533 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5534 /* For XGMI run all resets in parallel to speed up the process */ 5535 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5536 if (!queue_work(system_unbound_wq, 5537 &tmp_adev->xgmi_reset_work)) 5538 r = -EALREADY; 5539 } else 5540 r = amdgpu_asic_reset(tmp_adev); 5541 5542 if (r) { 5543 dev_err(tmp_adev->dev, 5544 "ASIC reset failed with error, %d for drm dev, %s", 5545 r, adev_to_drm(tmp_adev)->unique); 5546 goto out; 5547 } 5548 } 5549 5550 /* For XGMI wait for all resets to complete before proceed */ 5551 if (!r) { 5552 list_for_each_entry(tmp_adev, device_list_handle, 5553 reset_list) { 5554 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5555 flush_work(&tmp_adev->xgmi_reset_work); 5556 r = tmp_adev->asic_reset_res; 5557 if (r) 5558 break; 5559 } 5560 } 5561 } 5562 } 5563 5564 if (!r && amdgpu_ras_intr_triggered()) { 5565 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5566 amdgpu_ras_reset_error_count(tmp_adev, 5567 AMDGPU_RAS_BLOCK__MMHUB); 5568 } 5569 5570 amdgpu_ras_intr_cleared(); 5571 } 5572 5573 r = amdgpu_device_reinit_after_reset(reset_context); 5574 if (r == -EAGAIN) 5575 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5576 else 5577 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5578 5579 out: 5580 return r; 5581 } 5582 5583 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5584 { 5585 5586 switch (amdgpu_asic_reset_method(adev)) { 5587 case AMD_RESET_METHOD_MODE1: 5588 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5589 break; 5590 case AMD_RESET_METHOD_MODE2: 5591 adev->mp1_state = PP_MP1_STATE_RESET; 5592 break; 5593 default: 5594 adev->mp1_state = PP_MP1_STATE_NONE; 5595 break; 5596 } 5597 } 5598 5599 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5600 
{ 5601 amdgpu_vf_error_trans_all(adev); 5602 adev->mp1_state = PP_MP1_STATE_NONE; 5603 } 5604 5605 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5606 { 5607 struct pci_dev *p = NULL; 5608 5609 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5610 adev->pdev->bus->number, 1); 5611 if (p) { 5612 pm_runtime_enable(&(p->dev)); 5613 pm_runtime_resume(&(p->dev)); 5614 } 5615 5616 pci_dev_put(p); 5617 } 5618 5619 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5620 { 5621 enum amd_reset_method reset_method; 5622 struct pci_dev *p = NULL; 5623 u64 expires; 5624 5625 /* 5626 * For now, only BACO and mode1 reset are confirmed 5627 * to suffer the audio issue without proper suspended. 5628 */ 5629 reset_method = amdgpu_asic_reset_method(adev); 5630 if ((reset_method != AMD_RESET_METHOD_BACO) && 5631 (reset_method != AMD_RESET_METHOD_MODE1)) 5632 return -EINVAL; 5633 5634 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5635 adev->pdev->bus->number, 1); 5636 if (!p) 5637 return -ENODEV; 5638 5639 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5640 if (!expires) 5641 /* 5642 * If we cannot get the audio device autosuspend delay, 5643 * a fixed 4S interval will be used. Considering 3S is 5644 * the audio controller default autosuspend delay setting. 5645 * 4S used here is guaranteed to cover that. 5646 */ 5647 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5648 5649 while (!pm_runtime_status_suspended(&(p->dev))) { 5650 if (!pm_runtime_suspend(&(p->dev))) 5651 break; 5652 5653 if (expires < ktime_get_mono_fast_ns()) { 5654 dev_warn(adev->dev, "failed to suspend display audio\n"); 5655 pci_dev_put(p); 5656 /* TODO: abort the succeeding gpu reset? */ 5657 return -ETIMEDOUT; 5658 } 5659 } 5660 5661 pm_runtime_disable(&(p->dev)); 5662 5663 pci_dev_put(p); 5664 return 0; 5665 } 5666 5667 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5668 { 5669 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5670 5671 #if defined(CONFIG_DEBUG_FS) 5672 if (!amdgpu_sriov_vf(adev)) 5673 cancel_work(&adev->reset_work); 5674 #endif 5675 5676 if (adev->kfd.dev) 5677 cancel_work(&adev->kfd.reset_work); 5678 5679 if (amdgpu_sriov_vf(adev)) 5680 cancel_work(&adev->virt.flr_work); 5681 5682 if (con && adev->ras_enabled) 5683 cancel_work(&con->recovery_work); 5684 5685 } 5686 5687 static int amdgpu_device_health_check(struct list_head *device_list_handle) 5688 { 5689 struct amdgpu_device *tmp_adev; 5690 int ret = 0; 5691 u32 status; 5692 5693 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5694 pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status); 5695 if (PCI_POSSIBLE_ERROR(status)) { 5696 dev_err(tmp_adev->dev, "device lost from bus!"); 5697 ret = -ENODEV; 5698 } 5699 } 5700 5701 return ret; 5702 } 5703 5704 /** 5705 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5706 * 5707 * @adev: amdgpu_device pointer 5708 * @job: which job trigger hang 5709 * @reset_context: amdgpu reset context pointer 5710 * 5711 * Attempt to reset the GPU if it has hung (all asics). 5712 * Attempt to do soft-reset or full-reset and reinitialize Asic 5713 * Returns 0 for success or an error on failure. 
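 *
 * Roughly: display audio is suspended, the schedulers of every device in
 * the (possibly XGMI-wide) reset list are stopped, per-device pre-reset
 * work runs, then either an SR-IOV FLR or a bare-metal ASIC reset is
 * performed, and finally the devices are reinitialized and the schedulers
 * restarted.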
5714 */ 5715 5716 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5717 struct amdgpu_job *job, 5718 struct amdgpu_reset_context *reset_context) 5719 { 5720 struct list_head device_list, *device_list_handle = NULL; 5721 bool job_signaled = false; 5722 struct amdgpu_hive_info *hive = NULL; 5723 struct amdgpu_device *tmp_adev = NULL; 5724 int i, r = 0; 5725 bool need_emergency_restart = false; 5726 bool audio_suspended = false; 5727 int retry_limit = AMDGPU_MAX_RETRY_LIMIT; 5728 5729 /* 5730 * Special case: RAS triggered and full reset isn't supported 5731 */ 5732 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5733 5734 /* 5735 * Flush RAM to disk so that after reboot 5736 * the user can read log and see why the system rebooted. 5737 */ 5738 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 5739 amdgpu_ras_get_context(adev)->reboot) { 5740 DRM_WARN("Emergency reboot."); 5741 5742 ksys_sync_helper(); 5743 emergency_restart(); 5744 } 5745 5746 dev_info(adev->dev, "GPU %s begin!\n", 5747 need_emergency_restart ? "jobs stop":"reset"); 5748 5749 if (!amdgpu_sriov_vf(adev)) 5750 hive = amdgpu_get_xgmi_hive(adev); 5751 if (hive) 5752 mutex_lock(&hive->hive_lock); 5753 5754 reset_context->job = job; 5755 reset_context->hive = hive; 5756 /* 5757 * Build list of devices to reset. 5758 * In case we are in XGMI hive mode, resort the device list 5759 * to put adev in the 1st position. 5760 */ 5761 INIT_LIST_HEAD(&device_list); 5762 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) { 5763 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5764 list_add_tail(&tmp_adev->reset_list, &device_list); 5765 if (adev->shutdown) 5766 tmp_adev->shutdown = true; 5767 } 5768 if (!list_is_first(&adev->reset_list, &device_list)) 5769 list_rotate_to_front(&adev->reset_list, &device_list); 5770 device_list_handle = &device_list; 5771 } else { 5772 list_add_tail(&adev->reset_list, &device_list); 5773 device_list_handle = &device_list; 5774 } 5775 5776 if (!amdgpu_sriov_vf(adev)) { 5777 r = amdgpu_device_health_check(device_list_handle); 5778 if (r) 5779 goto end_reset; 5780 } 5781 5782 /* We need to lock reset domain only once both for XGMI and single device */ 5783 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5784 reset_list); 5785 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5786 5787 /* block all schedulers and reset given job's ring */ 5788 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5789 5790 amdgpu_device_set_mp1_state(tmp_adev); 5791 5792 /* 5793 * Try to put the audio codec into suspend state 5794 * before gpu reset started. 5795 * 5796 * Due to the power domain of the graphics device 5797 * is shared with AZ power domain. Without this, 5798 * we may change the audio hardware from behind 5799 * the audio driver's back. That will trigger 5800 * some audio codec errors. 
5801 */ 5802 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5803 audio_suspended = true; 5804 5805 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5806 5807 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5808 5809 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context); 5810 5811 /* 5812 * Mark these ASICs to be reset as untracked first, 5813 * and add them back after the reset completes. 5814 */ 5815 amdgpu_unregister_gpu_instance(tmp_adev); 5816 5817 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5818 5819 /* disable ras on ALL IPs */ 5820 if (!need_emergency_restart && 5821 amdgpu_device_ip_need_full_reset(tmp_adev)) 5822 amdgpu_ras_suspend(tmp_adev); 5823 5824 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5825 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5826 5827 if (!amdgpu_ring_sched_ready(ring)) 5828 continue; 5829 5830 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5831 5832 if (need_emergency_restart) 5833 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5834 } 5835 atomic_inc(&tmp_adev->gpu_reset_counter); 5836 } 5837 5838 if (need_emergency_restart) 5839 goto skip_sched_resume; 5840 5841 /* 5842 * Must check guilty signal here since after this point all old 5843 * HW fences are force signaled. 5844 * 5845 * job->base holds a reference to parent fence 5846 */ 5847 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5848 job_signaled = true; 5849 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5850 goto skip_hw_reset; 5851 } 5852 5853 retry: /* Pre-ASIC-reset the rest of the adevs from the XGMI hive. */ 5854 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5855 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5856 /* TODO: should we stop here? */ 5857 if (r) { 5858 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5859 r, adev_to_drm(tmp_adev)->unique); 5860 tmp_adev->asic_reset_res = r; 5861 } 5862 } 5863 5864 /* Actual ASIC resets if needed. */ 5865 /* Host driver will handle XGMI hive reset for SRIOV */ 5866 if (amdgpu_sriov_vf(adev)) { 5867 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { 5868 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); 5869 amdgpu_ras_set_fed(adev, true); 5870 set_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5871 } 5872 5873 r = amdgpu_device_reset_sriov(adev, reset_context); 5874 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) { 5875 amdgpu_virt_release_full_gpu(adev, true); 5876 goto retry; 5877 } 5878 if (r) 5879 adev->asic_reset_res = r; 5880 } else { 5881 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5882 if (r && r == -EAGAIN) 5883 goto retry; 5884 } 5885 5886 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5887 /* 5888 * Drop any pending non-scheduler resets queued before the reset is done. 5889 * Any reset scheduled after this point would be valid. Scheduler resets 5890 * were already dropped during drm_sched_stop and no new ones can come 5891 * in before drm_sched_start.
5892 */ 5893 amdgpu_device_stop_pending_resets(tmp_adev); 5894 } 5895 5896 skip_hw_reset: 5897 5898 /* Post ASIC reset for all devs .*/ 5899 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5900 5901 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5902 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5903 5904 if (!amdgpu_ring_sched_ready(ring)) 5905 continue; 5906 5907 drm_sched_start(&ring->sched); 5908 } 5909 5910 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 5911 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5912 5913 if (tmp_adev->asic_reset_res) 5914 r = tmp_adev->asic_reset_res; 5915 5916 tmp_adev->asic_reset_res = 0; 5917 5918 if (r) { 5919 /* bad news, how to tell it to userspace ? 5920 * for ras error, we should report GPU bad status instead of 5921 * reset failure 5922 */ 5923 if (reset_context->src != AMDGPU_RESET_SRC_RAS || 5924 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) 5925 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", 5926 atomic_read(&tmp_adev->gpu_reset_counter)); 5927 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5928 } else { 5929 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5930 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5931 DRM_WARN("smart shift update failed\n"); 5932 } 5933 } 5934 5935 skip_sched_resume: 5936 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5937 /* unlock kfd: SRIOV would do it separately */ 5938 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5939 amdgpu_amdkfd_post_reset(tmp_adev); 5940 5941 /* kfd_post_reset will do nothing if kfd device is not initialized, 5942 * need to bring up kfd here if it's not be initialized before 5943 */ 5944 if (!adev->kfd.init_complete) 5945 amdgpu_amdkfd_device_init(adev); 5946 5947 if (audio_suspended) 5948 amdgpu_device_resume_display_audio(tmp_adev); 5949 5950 amdgpu_device_unset_mp1_state(tmp_adev); 5951 5952 amdgpu_ras_set_error_query_ready(tmp_adev, true); 5953 } 5954 5955 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5956 reset_list); 5957 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 5958 5959 end_reset: 5960 if (hive) { 5961 mutex_unlock(&hive->hive_lock); 5962 amdgpu_put_xgmi_hive(hive); 5963 } 5964 5965 if (r) 5966 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5967 5968 atomic_set(&adev->reset_domain->reset_res, r); 5969 return r; 5970 } 5971 5972 /** 5973 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner 5974 * 5975 * @adev: amdgpu_device pointer 5976 * @speed: pointer to the speed of the link 5977 * @width: pointer to the width of the link 5978 * 5979 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 5980 * first physical partner to an AMD dGPU. 5981 * This will exclude any virtual switches and links. 
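 *
 * Illustrative usage (sketch; mirrors the caller below in
 * amdgpu_device_get_pcie_info()):
 *
 *   enum pci_bus_speed speed;
 *   enum pcie_link_width width;
 *
 *   amdgpu_device_partner_bandwidth(adev, &speed, &width);
 *   if (speed == PCI_SPEED_UNKNOWN || width == PCIE_LNK_WIDTH_UNKNOWN)
 *           dev_dbg(adev->dev, "partner link caps not available\n");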
5982 */ 5983 static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev, 5984 enum pci_bus_speed *speed, 5985 enum pcie_link_width *width) 5986 { 5987 struct pci_dev *parent = adev->pdev; 5988 5989 if (!speed || !width) 5990 return; 5991 5992 *speed = PCI_SPEED_UNKNOWN; 5993 *width = PCIE_LNK_WIDTH_UNKNOWN; 5994 5995 if (amdgpu_device_pcie_dynamic_switching_supported(adev)) { 5996 while ((parent = pci_upstream_bridge(parent))) { 5997 /* skip upstream/downstream switches internal to the dGPU */ 5998 if (parent->vendor == PCI_VENDOR_ID_ATI) 5999 continue; 6000 *speed = pcie_get_speed_cap(parent); 6001 *width = pcie_get_width_cap(parent); 6002 break; 6003 } 6004 } else { 6005 /* use the current speeds rather than max if switching is not supported */ 6006 pcie_bandwidth_available(adev->pdev, NULL, speed, width); 6007 } 6008 } 6009 6010 /** 6011 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot 6012 * 6013 * @adev: amdgpu_device pointer 6014 * 6015 * Fetches and stores in the driver the PCIe capabilities (gen speed 6016 * and lanes) of the slot the device is in. Handles APUs and 6017 * virtualized environments where PCIe config space may not be available. 6018 */ 6019 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 6020 { 6021 struct pci_dev *pdev; 6022 enum pci_bus_speed speed_cap, platform_speed_cap; 6023 enum pcie_link_width platform_link_width; 6024 6025 if (amdgpu_pcie_gen_cap) 6026 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 6027 6028 if (amdgpu_pcie_lane_cap) 6029 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 6030 6031 /* covers APUs as well */ 6032 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 6033 if (adev->pm.pcie_gen_mask == 0) 6034 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 6035 if (adev->pm.pcie_mlw_mask == 0) 6036 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 6037 return; 6038 } 6039 6040 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 6041 return; 6042 6043 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap, 6044 &platform_link_width); 6045 6046 if (adev->pm.pcie_gen_mask == 0) { 6047 /* asic caps */ 6048 pdev = adev->pdev; 6049 speed_cap = pcie_get_speed_cap(pdev); 6050 if (speed_cap == PCI_SPEED_UNKNOWN) { 6051 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6052 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6053 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6054 } else { 6055 if (speed_cap == PCIE_SPEED_32_0GT) 6056 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6057 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6058 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6059 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6060 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 6061 else if (speed_cap == PCIE_SPEED_16_0GT) 6062 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6063 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6064 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6065 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 6066 else if (speed_cap == PCIE_SPEED_8_0GT) 6067 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6068 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6069 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6070 else if (speed_cap == PCIE_SPEED_5_0GT) 6071 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6072 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 6073 else 6074 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 6075 } 6076 /* platform caps */ 6077 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 6078
adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6079 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6080 } else { 6081 if (platform_speed_cap == PCIE_SPEED_32_0GT) 6082 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6083 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6084 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6085 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6086 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 6087 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 6088 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6089 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6090 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6091 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 6092 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 6093 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6094 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6095 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 6096 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 6097 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6098 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6099 else 6100 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 6101 6102 } 6103 } 6104 if (adev->pm.pcie_mlw_mask == 0) { 6105 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6106 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 6107 } else { 6108 switch (platform_link_width) { 6109 case PCIE_LNK_X32: 6110 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 6111 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6112 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6113 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6114 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6115 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6116 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6117 break; 6118 case PCIE_LNK_X16: 6119 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6120 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6121 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6122 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6123 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6124 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6125 break; 6126 case PCIE_LNK_X12: 6127 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6128 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6129 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6130 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6131 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6132 break; 6133 case PCIE_LNK_X8: 6134 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6135 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6136 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6137 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6138 break; 6139 case PCIE_LNK_X4: 6140 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6141 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6142 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6143 break; 6144 case PCIE_LNK_X2: 6145 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6146 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6147 break; 6148 case PCIE_LNK_X1: 6149 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 6150 break; 6151 default: 6152 break; 6153 } 6154 } 6155 } 6156 } 6157 6158 /** 6159 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 6160 * 6161 * @adev: amdgpu_device pointer 6162 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 6163 * 6164 * Return true if @peer_adev can access (DMA) @adev through the PCIe 6165 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 6166 * @peer_adev. 
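 *
 * Illustrative check (sketch): peer-to-peer access is typically only enabled
 * when it works in both directions:
 *
 *   bool can_p2p = amdgpu_device_is_peer_accessible(adev, peer_adev) &&
 *                  amdgpu_device_is_peer_accessible(peer_adev, adev);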
6167 */ 6168 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 6169 struct amdgpu_device *peer_adev) 6170 { 6171 #ifdef CONFIG_HSA_AMD_P2P 6172 bool p2p_access = 6173 !adev->gmc.xgmi.connected_to_cpu && 6174 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 6175 6176 bool is_large_bar = adev->gmc.visible_vram_size && 6177 adev->gmc.real_vram_size == adev->gmc.visible_vram_size; 6178 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev); 6179 6180 if (!p2p_addressable) { 6181 uint64_t address_mask = peer_adev->dev->dma_mask ? 6182 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 6183 resource_size_t aper_limit = 6184 adev->gmc.aper_base + adev->gmc.aper_size - 1; 6185 6186 p2p_addressable = !(adev->gmc.aper_base & address_mask || 6187 aper_limit & address_mask); 6188 } 6189 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable; 6190 #else 6191 return false; 6192 #endif 6193 } 6194 6195 int amdgpu_device_baco_enter(struct drm_device *dev) 6196 { 6197 struct amdgpu_device *adev = drm_to_adev(dev); 6198 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6199 6200 if (!amdgpu_device_supports_baco(dev)) 6201 return -ENOTSUPP; 6202 6203 if (ras && adev->ras_enabled && 6204 adev->nbio.funcs->enable_doorbell_interrupt) 6205 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 6206 6207 return amdgpu_dpm_baco_enter(adev); 6208 } 6209 6210 int amdgpu_device_baco_exit(struct drm_device *dev) 6211 { 6212 struct amdgpu_device *adev = drm_to_adev(dev); 6213 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6214 int ret = 0; 6215 6216 if (!amdgpu_device_supports_baco(dev)) 6217 return -ENOTSUPP; 6218 6219 ret = amdgpu_dpm_baco_exit(adev); 6220 if (ret) 6221 return ret; 6222 6223 if (ras && adev->ras_enabled && 6224 adev->nbio.funcs->enable_doorbell_interrupt) 6225 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 6226 6227 if (amdgpu_passthrough(adev) && adev->nbio.funcs && 6228 adev->nbio.funcs->clear_doorbell_interrupt) 6229 adev->nbio.funcs->clear_doorbell_interrupt(adev); 6230 6231 return 0; 6232 } 6233 6234 /** 6235 * amdgpu_pci_error_detected - Called when a PCI error is detected. 6236 * @pdev: PCI device struct 6237 * @state: PCI channel state 6238 * 6239 * Description: Called when a PCI error is detected. 6240 * 6241 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
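 *
 * These callbacks are wired up through a &struct pci_error_handlers table
 * (illustrative sketch; the table name here is assumed, the real table lives
 * in the driver registration code):
 *
 *   static const struct pci_error_handlers handlers = {
 *           .error_detected = amdgpu_pci_error_detected,
 *           .mmio_enabled   = amdgpu_pci_mmio_enabled,
 *           .slot_reset     = amdgpu_pci_slot_reset,
 *           .resume         = amdgpu_pci_resume,
 *   };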
6242 */ 6243 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 6244 { 6245 struct drm_device *dev = pci_get_drvdata(pdev); 6246 struct amdgpu_device *adev = drm_to_adev(dev); 6247 int i; 6248 6249 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 6250 6251 if (adev->gmc.xgmi.num_physical_nodes > 1) { 6252 DRM_WARN("No support for XGMI hive yet..."); 6253 return PCI_ERS_RESULT_DISCONNECT; 6254 } 6255 6256 adev->pci_channel_state = state; 6257 6258 switch (state) { 6259 case pci_channel_io_normal: 6260 return PCI_ERS_RESULT_CAN_RECOVER; 6261 /* Fatal error, prepare for slot reset */ 6262 case pci_channel_io_frozen: 6263 /* 6264 * Locking adev->reset_domain->sem will prevent any external access 6265 * to GPU during PCI error recovery 6266 */ 6267 amdgpu_device_lock_reset_domain(adev->reset_domain); 6268 amdgpu_device_set_mp1_state(adev); 6269 6270 /* 6271 * Block any work scheduling as we do for regular GPU reset 6272 * for the duration of the recovery 6273 */ 6274 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6275 struct amdgpu_ring *ring = adev->rings[i]; 6276 6277 if (!amdgpu_ring_sched_ready(ring)) 6278 continue; 6279 6280 drm_sched_stop(&ring->sched, NULL); 6281 } 6282 atomic_inc(&adev->gpu_reset_counter); 6283 return PCI_ERS_RESULT_NEED_RESET; 6284 case pci_channel_io_perm_failure: 6285 /* Permanent error, prepare for device removal */ 6286 return PCI_ERS_RESULT_DISCONNECT; 6287 } 6288 6289 return PCI_ERS_RESULT_NEED_RESET; 6290 } 6291 6292 /** 6293 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 6294 * @pdev: pointer to PCI device 6295 */ 6296 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 6297 { 6298 6299 DRM_INFO("PCI error: mmio enabled callback!!\n"); 6300 6301 /* TODO - dump whatever for debugging purposes */ 6302 6303 /* This called only if amdgpu_pci_error_detected returns 6304 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 6305 * works, no need to reset slot. 6306 */ 6307 6308 return PCI_ERS_RESULT_RECOVERED; 6309 } 6310 6311 /** 6312 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 6313 * @pdev: PCI device struct 6314 * 6315 * Description: This routine is called by the pci error recovery 6316 * code after the PCI slot has been reset, just before we 6317 * should resume normal operations. 
6318 */ 6319 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 6320 { 6321 struct drm_device *dev = pci_get_drvdata(pdev); 6322 struct amdgpu_device *adev = drm_to_adev(dev); 6323 int r, i; 6324 struct amdgpu_reset_context reset_context; 6325 u32 memsize; 6326 struct list_head device_list; 6327 6328 /* PCI error slot reset should be skipped during RAS recovery */ 6329 if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 6330 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && 6331 amdgpu_ras_in_recovery(adev)) 6332 return PCI_ERS_RESULT_RECOVERED; 6333 6334 DRM_INFO("PCI error: slot reset callback!!\n"); 6335 6336 memset(&reset_context, 0, sizeof(reset_context)); 6337 6338 INIT_LIST_HEAD(&device_list); 6339 list_add_tail(&adev->reset_list, &device_list); 6340 6341 /* wait for the ASIC to come out of reset */ 6342 msleep(500); 6343 6344 /* Restore the PCI config space */ 6345 amdgpu_device_load_pci_state(pdev); 6346 6347 /* confirm the ASIC came out of reset */ 6348 for (i = 0; i < adev->usec_timeout; i++) { 6349 memsize = amdgpu_asic_get_config_memsize(adev); 6350 6351 if (memsize != 0xffffffff) 6352 break; 6353 udelay(1); 6354 } 6355 if (memsize == 0xffffffff) { 6356 r = -ETIME; 6357 goto out; 6358 } 6359 6360 reset_context.method = AMD_RESET_METHOD_NONE; 6361 reset_context.reset_req_dev = adev; 6362 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 6363 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 6364 6365 adev->no_hw_access = true; 6366 r = amdgpu_device_pre_asic_reset(adev, &reset_context); 6367 adev->no_hw_access = false; 6368 if (r) 6369 goto out; 6370 6371 r = amdgpu_do_asic_reset(&device_list, &reset_context); 6372 6373 out: 6374 if (!r) { 6375 if (amdgpu_device_cache_pci_state(adev->pdev)) 6376 pci_restore_state(adev->pdev); 6377 6378 DRM_INFO("PCIe error recovery succeeded\n"); 6379 } else { 6380 DRM_ERROR("PCIe error recovery failed, err:%d", r); 6381 amdgpu_device_unset_mp1_state(adev); 6382 amdgpu_device_unlock_reset_domain(adev->reset_domain); 6383 } 6384 6385 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 6386 } 6387 6388 /** 6389 * amdgpu_pci_resume() - resume normal ops after PCI reset 6390 * @pdev: pointer to PCI device 6391 * 6392 * Called when the error recovery driver tells us that it's 6393 * OK to resume normal operation.
6394 */ 6395 void amdgpu_pci_resume(struct pci_dev *pdev) 6396 { 6397 struct drm_device *dev = pci_get_drvdata(pdev); 6398 struct amdgpu_device *adev = drm_to_adev(dev); 6399 int i; 6400 6401 6402 DRM_INFO("PCI error: resume callback!!\n"); 6403 6404 /* Only continue execution for the case of pci_channel_io_frozen */ 6405 if (adev->pci_channel_state != pci_channel_io_frozen) 6406 return; 6407 6408 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6409 struct amdgpu_ring *ring = adev->rings[i]; 6410 6411 if (!amdgpu_ring_sched_ready(ring)) 6412 continue; 6413 6414 drm_sched_start(&ring->sched); 6415 } 6416 6417 amdgpu_device_unset_mp1_state(adev); 6418 amdgpu_device_unlock_reset_domain(adev->reset_domain); 6419 } 6420 6421 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 6422 { 6423 struct drm_device *dev = pci_get_drvdata(pdev); 6424 struct amdgpu_device *adev = drm_to_adev(dev); 6425 int r; 6426 6427 r = pci_save_state(pdev); 6428 if (!r) { 6429 kfree(adev->pci_state); 6430 6431 adev->pci_state = pci_store_saved_state(pdev); 6432 6433 if (!adev->pci_state) { 6434 DRM_ERROR("Failed to store PCI saved state"); 6435 return false; 6436 } 6437 } else { 6438 DRM_WARN("Failed to save PCI state, err:%d\n", r); 6439 return false; 6440 } 6441 6442 return true; 6443 } 6444 6445 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 6446 { 6447 struct drm_device *dev = pci_get_drvdata(pdev); 6448 struct amdgpu_device *adev = drm_to_adev(dev); 6449 int r; 6450 6451 if (!adev->pci_state) 6452 return false; 6453 6454 r = pci_load_saved_state(pdev, adev->pci_state); 6455 6456 if (!r) { 6457 pci_restore_state(pdev); 6458 } else { 6459 DRM_WARN("Failed to load PCI state, err:%d\n", r); 6460 return false; 6461 } 6462 6463 return true; 6464 } 6465 6466 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 6467 struct amdgpu_ring *ring) 6468 { 6469 #ifdef CONFIG_X86_64 6470 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 6471 return; 6472 #endif 6473 if (adev->gmc.xgmi.connected_to_cpu) 6474 return; 6475 6476 if (ring && ring->funcs->emit_hdp_flush) 6477 amdgpu_ring_emit_hdp_flush(ring); 6478 else 6479 amdgpu_asic_flush_hdp(adev, ring); 6480 } 6481 6482 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 6483 struct amdgpu_ring *ring) 6484 { 6485 #ifdef CONFIG_X86_64 6486 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 6487 return; 6488 #endif 6489 if (adev->gmc.xgmi.connected_to_cpu) 6490 return; 6491 6492 amdgpu_asic_invalidate_hdp(adev, ring); 6493 } 6494 6495 int amdgpu_in_reset(struct amdgpu_device *adev) 6496 { 6497 return atomic_read(&adev->reset_domain->in_gpu_reset); 6498 } 6499 6500 /** 6501 * amdgpu_device_halt() - bring hardware to some kind of halt state 6502 * 6503 * @adev: amdgpu_device pointer 6504 * 6505 * Bring hardware to some kind of halt state so that no one can touch it 6506 * any more. It helps preserve the error context when an error occurs. 6507 * Compared to a simple hang, the system will remain stable at least for SSH 6508 * access, so it should be trivial to inspect the hardware state and 6509 * see what's going on. Implemented as follows: 6510 * 6511 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc.), 6512 * clears all CPU mappings to the device, disallows remappings through page faults 6513 * 2. amdgpu_irq_disable_all() disables all interrupts 6514 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences 6515 * 4. set adev->no_hw_access to avoid potential crashes after step 5 6516 * 5.
amdgpu_device_unmap_mmio() clears all MMIO mappings 6517 * 6. pci_disable_device() and pci_wait_for_pending_transaction() 6518 * flush any in flight DMA operations 6519 */ 6520 void amdgpu_device_halt(struct amdgpu_device *adev) 6521 { 6522 struct pci_dev *pdev = adev->pdev; 6523 struct drm_device *ddev = adev_to_drm(adev); 6524 6525 amdgpu_xcp_dev_unplug(adev); 6526 drm_dev_unplug(ddev); 6527 6528 amdgpu_irq_disable_all(adev); 6529 6530 amdgpu_fence_driver_hw_fini(adev); 6531 6532 adev->no_hw_access = true; 6533 6534 amdgpu_device_unmap_mmio(adev); 6535 6536 pci_disable_device(pdev); 6537 pci_wait_for_pending_transaction(pdev); 6538 } 6539 6540 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev, 6541 u32 reg) 6542 { 6543 unsigned long flags, address, data; 6544 u32 r; 6545 6546 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 6547 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 6548 6549 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 6550 WREG32(address, reg * 4); 6551 (void)RREG32(address); 6552 r = RREG32(data); 6553 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 6554 return r; 6555 } 6556 6557 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev, 6558 u32 reg, u32 v) 6559 { 6560 unsigned long flags, address, data; 6561 6562 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 6563 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 6564 6565 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 6566 WREG32(address, reg * 4); 6567 (void)RREG32(address); 6568 WREG32(data, v); 6569 (void)RREG32(data); 6570 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 6571 } 6572 6573 /** 6574 * amdgpu_device_get_gang - return a reference to the current gang 6575 * @adev: amdgpu_device pointer 6576 * 6577 * Returns: A new reference to the current gang leader. 6578 */ 6579 struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev) 6580 { 6581 struct dma_fence *fence; 6582 6583 rcu_read_lock(); 6584 fence = dma_fence_get_rcu_safe(&adev->gang_submit); 6585 rcu_read_unlock(); 6586 return fence; 6587 } 6588 6589 /** 6590 * amdgpu_device_switch_gang - switch to a new gang 6591 * @adev: amdgpu_device pointer 6592 * @gang: the gang to switch to 6593 * 6594 * Try to switch to a new gang. 6595 * Returns: NULL if we switched to the new gang or a reference to the current 6596 * gang leader. 
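 *
 * One possible caller pattern (illustrative sketch only, not the in-tree
 * usage): wait for the old gang leader to signal, then retry the switch:
 *
 *   struct dma_fence *old;
 *
 *   while ((old = amdgpu_device_switch_gang(adev, gang))) {
 *           dma_fence_wait(old, false);
 *           dma_fence_put(old);
 *   }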
6597 */ 6598 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev, 6599 struct dma_fence *gang) 6600 { 6601 struct dma_fence *old = NULL; 6602 6603 do { 6604 dma_fence_put(old); 6605 old = amdgpu_device_get_gang(adev); 6606 if (old == gang) 6607 break; 6608 6609 if (!dma_fence_is_signaled(old)) 6610 return old; 6611 6612 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit, 6613 old, gang) != old); 6614 6615 dma_fence_put(old); 6616 return NULL; 6617 } 6618 6619 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev) 6620 { 6621 switch (adev->asic_type) { 6622 #ifdef CONFIG_DRM_AMDGPU_SI 6623 case CHIP_HAINAN: 6624 #endif 6625 case CHIP_TOPAZ: 6626 /* chips with no display hardware */ 6627 return false; 6628 #ifdef CONFIG_DRM_AMDGPU_SI 6629 case CHIP_TAHITI: 6630 case CHIP_PITCAIRN: 6631 case CHIP_VERDE: 6632 case CHIP_OLAND: 6633 #endif 6634 #ifdef CONFIG_DRM_AMDGPU_CIK 6635 case CHIP_BONAIRE: 6636 case CHIP_HAWAII: 6637 case CHIP_KAVERI: 6638 case CHIP_KABINI: 6639 case CHIP_MULLINS: 6640 #endif 6641 case CHIP_TONGA: 6642 case CHIP_FIJI: 6643 case CHIP_POLARIS10: 6644 case CHIP_POLARIS11: 6645 case CHIP_POLARIS12: 6646 case CHIP_VEGAM: 6647 case CHIP_CARRIZO: 6648 case CHIP_STONEY: 6649 /* chips with display hardware */ 6650 return true; 6651 default: 6652 /* IP discovery */ 6653 if (!amdgpu_ip_version(adev, DCE_HWIP, 0) || 6654 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 6655 return false; 6656 return true; 6657 } 6658 } 6659 6660 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev, 6661 uint32_t inst, uint32_t reg_addr, char reg_name[], 6662 uint32_t expected_value, uint32_t mask) 6663 { 6664 uint32_t ret = 0; 6665 uint32_t old_ = 0; 6666 uint32_t tmp_ = RREG32(reg_addr); 6667 uint32_t loop = adev->usec_timeout; 6668 6669 while ((tmp_ & (mask)) != (expected_value)) { 6670 if (old_ != tmp_) { 6671 loop = adev->usec_timeout; 6672 old_ = tmp_; 6673 } else 6674 udelay(1); 6675 tmp_ = RREG32(reg_addr); 6676 loop--; 6677 if (!loop) { 6678 DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n", 6679 inst, reg_name, (uint32_t)expected_value, 6680 (uint32_t)(tmp_ & (mask))); 6681 ret = -ETIMEDOUT; 6682 break; 6683 } 6684 } 6685 return ret; 6686 } 6687
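
/*
 * Illustrative use of amdgpu_device_wait_on_rreg() (sketch only; reg_offset,
 * expected and mask below are placeholder names, not real register
 * definitions):
 *
 *   if (amdgpu_device_wait_on_rreg(adev, 0, reg_offset, "STATUS",
 *                                  expected, mask))
 *           dev_err(adev->dev, "STATUS did not reach the expected value\n");
 */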