/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/pci-p2pdma.h>
#include <linux/apple-gmux.h>

#include <drm/drm_aperture.h>
#include <drm/drm_atomic_helper.h>
#include <drm/drm_crtc_helper.h>
#include <drm/drm_fb_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/device.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"
#include "amdgpu_virt.h"
#include "amdgpu_dev_coredump.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

#if IS_ENABLED(CONFIG_X86)
#include <asm/intel-family.h>
#endif

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
#define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
#define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
#define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)
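/*
 * Note on the *_FALLBACK values above (descriptive): they are dword register
 * offsets (byte offset >> 2) for the PCIE index/data pair and are only used
 * by the indirect register helpers further down when the NBIO callbacks are
 * not available yet.
 */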
static const struct drm_driver amdgpu_kms_driver;

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"ALDEBARAN",
	"NAVI10",
	"CYAN_SKILLFISH",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"VANGOGH",
	"DIMGREY_CAVEFISH",
	"BEIGE_GOBY",
	"YELLOW_CARP",
	"IP DISCOVERY",
	"LAST",
};

#define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMDGPU_MAX_IP_NUM - 1, 0)
/*
 * Default init level where all blocks are expected to be initialized. This is
 * the level of initialization expected by default and also after a full reset
 * of the device.
 */
struct amdgpu_init_level amdgpu_init_default = {
	.level = AMDGPU_INIT_LEVEL_DEFAULT,
	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

/*
 * Minimal blocks needed to be initialized before an XGMI hive can be reset.
 * This is used for cases like reset on initialization where the entire hive
 * needs to be reset before first use.
 */
struct amdgpu_init_level amdgpu_init_minimal_xgmi = {
	.level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI,
	.hwini_ip_block_mask =
		BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) |
		BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) |
		BIT(AMD_IP_BLOCK_TYPE_PSP)
};

static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev,
					     enum amd_ip_block_type block)
{
	return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0;
}

void amdgpu_set_init_level(struct amdgpu_device *adev,
			   enum amdgpu_init_lvl_id lvl)
{
	switch (lvl) {
	case AMDGPU_INIT_LEVEL_MINIMAL_XGMI:
		adev->init_lvl = &amdgpu_init_minimal_xgmi;
		break;
	case AMDGPU_INIT_LEVEL_DEFAULT:
		fallthrough;
	default:
		adev->init_lvl = &amdgpu_init_default;
		break;
	}
}

static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return sysfs_emit(buf, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, 0444,
		amdgpu_device_get_pcie_replay_count, NULL);

static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
					  struct bin_attribute *attr, char *buf,
					  loff_t ppos, size_t count)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	ssize_t bytes_read;

	switch (ppos) {
	case AMDGPU_SYS_REG_STATE_XGMI:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_WAFL:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_PCIE:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR_1:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count);
		break;
	default:
		return -EINVAL;
	}

	return bytes_read;
}

BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL,
	 AMDGPU_SYS_REG_STATE_END);

int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev)
{
	int ret;

	if (!amdgpu_asic_get_reg_state_supported(adev))
		return 0;

	ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state);

	return ret;
}

void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
{
	if (!amdgpu_asic_get_reg_state_supported(adev))
		return;
	sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
}

/**
 * DOC: board_info
 *
 * The amdgpu driver provides a sysfs API for giving board related information.
 * It provides the form factor information in the format
 *
 *   type : form factor
 *
 * Possible form factor values
 *
 * - "cem"     - PCIe CEM card
 * - "oam"     - Open Compute Accelerator Module
 * - "unknown" - Not known
 *
 */

static ssize_t amdgpu_device_get_board_info(struct device *dev,
					    struct device_attribute *attr,
					    char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
	const char *pkg;

	if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
		pkg_type = adev->smuio.funcs->get_pkg_type(adev);

	switch (pkg_type) {
	case AMDGPU_PKG_TYPE_CEM:
		pkg = "cem";
		break;
	case AMDGPU_PKG_TYPE_OAM:
		pkg = "oam";
		break;
	default:
		pkg = "unknown";
		break;
	}

	return sysfs_emit(buf, "%s : %s\n", "type", pkg);
}

static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);

static struct attribute *amdgpu_board_attrs[] = {
	&dev_attr_board_info.attr,
	NULL,
};

static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
					     struct attribute *attr, int n)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	if (adev->flags & AMD_IS_APU)
		return 0;

	return attr->mode;
}

static const struct attribute_group amdgpu_board_attrs_group = {
	.attrs = amdgpu_board_attrs,
	.is_visible = amdgpu_board_attrs_is_visible
};
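/*
 * Example (illustrative): reading the board_info attribute of an OAM module
 * returns a single line such as "type : oam", while CEM add-in cards report
 * "type : cem"; the attribute is hidden on APUs.
 */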
static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);


/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Return:
 * 1 if the device supports BACO;
 * 3 if the device supports MACO (only works if BACO is supported)
 * otherwise returns 0.
 */
int amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
{
	struct drm_device *dev;
	int bamaco_support;

	dev = adev_to_drm(adev);

	adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
	bamaco_support = amdgpu_device_supports_baco(dev);

	switch (amdgpu_runtime_pm) {
	case 2:
		if (bamaco_support & MACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
			dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
		} else if (bamaco_support == BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Requested mode BAMACO not available, fallback to BACO\n");
		}
		break;
	case 1:
		if (bamaco_support & BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Forcing BACO for runtime pm\n");
		}
		break;
	case -1:
	case -2:
		if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
			dev_info(adev->dev, "Using ATPX for runtime pm\n");
		} else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
			dev_info(adev->dev, "Using BOCO for runtime pm\n");
		} else {
			if (!bamaco_support)
				goto no_runtime_pm;

			switch (adev->asic_type) {
			case CHIP_VEGA20:
			case CHIP_ARCTURUS:
				/* BACO is not supported on vega20 and arcturus */
				break;
			case CHIP_VEGA10:
				/* enable BACO as runpm mode if noretry=0 */
				if (!adev->gmc.noretry)
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			default:
				/* enable BACO as runpm mode on CI+ */
				adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			}

			if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
				if (bamaco_support & MACO_SUPPORT) {
					adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
					dev_info(adev->dev, "Using BAMACO for runtime pm\n");
				} else {
					dev_info(adev->dev, "Using BACO for runtime pm\n");
				}
			}
		}
		break;
	case 0:
		dev_info(adev->dev, "runtime pm is manually disabled\n");
		break;
	default:
		break;
	}

no_runtime_pm:
	if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
		dev_info(adev->dev, "Runtime PM not available\n");
}
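/*
 * Summary of the selection above (descriptive only): amdgpu_runtime_pm=2
 * forces BAMACO (falling back to BACO), 1 forces BACO, 0 disables runtime
 * pm, and the -1/-2 auto modes prefer PX (ATPX), then BOCO, then
 * BACO/BAMACO, depending on what the platform and ASIC report.
 */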
/**
 * amdgpu_device_supports_smart_shift - Is the device dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}
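/*
 * Access pattern used above (descriptive): MM_INDEX is programmed with the
 * low 31 bits of the VRAM byte offset plus bit 31 set, MM_INDEX_HI with the
 * offset bits above bit 30, and each MM_DATA access then transfers one
 * dword, all under the mmio_idx_lock.
 */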
/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * Returns the number of bytes that have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try to use the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}


/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
				uint32_t reg, uint32_t acc_flags,
				uint32_t xcc_id)
{
	uint32_t ret, rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	return ret;
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/**
 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 * @xcc_id: xcc accelerated compute core id
 *
 * This function is invoked only for debugfs register access.
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v,
			     uint32_t xcc_id)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}
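/*
 * Note (descriptive, based on the register macros in amdgpu.h): the
 * RREG32()/WREG32() families of macros used throughout the driver funnel
 * into amdgpu_device_rreg()/amdgpu_device_wreg() above (the *_NO_KIQ
 * variants pass AMDGPU_REGS_NO_KIQ), so the SR-IOV/KIQ routing and the
 * out-of-range PCIE indirect fallback apply to all of them.
 */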
/**
 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_xcc_wreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t v,
			    uint32_t acc_flags, uint32_t xcc_id)
{
	uint32_t rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, true,
							 &rlcg_flag)) {
			amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u32 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
				    u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	if (unlikely(!adev->nbio.funcs)) {
		pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
		pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
	} else {
		pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
		pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	}

	if (reg_addr >> 32) {
		if (unlikely(!adev->nbio.funcs))
			pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
		else
			pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	} else {
		pcie_index_hi = 0;
	}

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}
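/*
 * The indirect helpers above and below all follow the same index/data
 * protocol: write the register address to the PCIE index register (and the
 * upper address bits to the hi-index register when one is present), read the
 * index back to flush the posted write, then access the data register. The
 * hi-index register is cleared again before the lock is dropped.
 */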
/**
 * amdgpu_device_indirect_rreg64 - read a 64 bit indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
				      u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r |= ((u64)readl(pcie_data_offset) << 32);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
				     u64 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64 bit indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
				       u64 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_rev_id - query device rev_id
 *
 * @adev: amdgpu_device pointer
 *
 * Return device rev_id
 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
{
	return adev->nbio.funcs->get_rev_id(adev);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	int ret;

	amdgpu_asic_pre_asic_init(adev);

	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
		amdgpu_psp_wait_for_bootloader(adev);
		ret = amdgpu_atomfirmware_asic_init(adev, true);
		return ret;
	} else {
		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
	}

	return 0;
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND/OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long flags, offset;

	spin_lock_irqsave(&adev->wb.lock, flags);
	offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	unsigned long flags;

	wb >>= 3;
	spin_lock_irqsave(&adev->wb.lock, flags);
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
	spin_unlock_irqrestore(&adev->wb.lock, flags);
}
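/*
 * Illustrative usage (sketch only, variable names are hypothetical): a caller
 * such as a ring typically allocates one writeback slot, derives the CPU and
 * GPU addresses from the returned dword index, and frees it on teardown:
 *
 *	u32 index;
 *
 *	r = amdgpu_device_wb_get(adev, &index);
 *	if (r)
 *		return r;
 *	cpu_ptr = &adev->wb.wb[index];
 *	gpu_addr = adev->wb.gpu_addr + (index * 4);
 *	...
 *	amdgpu_device_wb_free(adev, index);
 */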
/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned int i;
	u16 cmd;
	int r;

	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
		return 0;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
	if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
		DRM_WARN("System can't access extended configuration space, please check!!\n");

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
		return false;

	return true;
}

/*
 * GPU helper functions.
 */
/**
 * amdgpu_device_need_post - check if the hw need post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if post is needed or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (!amdgpu_device_read_bios(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
		 * some old smc fw still needs the driver to do a vPost, otherwise the gpu
		 * hangs. smc fw versions above 22.15 don't have this flaw, so we force
		 * vPost to be executed for smc versions below 22.15
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if an error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			release_firmware(adev->pm.fw);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/*
 * Check whether seamless boot is supported.
 *
 * So far we only support seamless boot on DCE 3.0 or later.
 * If users report that it works on older ASICs as well, we may
 * loosen this.
 */
bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
{
	switch (amdgpu_seamless) {
	case -1:
		break;
	case 1:
		return true;
	case 0:
		return false;
	default:
		DRM_ERROR("Invalid value for amdgpu.seamless: %d\n",
			  amdgpu_seamless);
		return false;
	}

	if (!(adev->flags & AMD_IS_APU))
		return false;

	if (adev->mman.keep_stolen_vga_memory)
		return false;

	return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
}

/*
 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
 * don't support dynamic speed switching. Until we have confirmation from Intel
 * that a specific host supports it, it's safer that we keep it disabled for all.
 *
 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
 */
static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	/* eGPU change speeds based on USB4 fabric conditions */
	if (dev_is_removable(adev->dev))
		return true;

	if (c->x86_vendor == X86_VENDOR_INTEL)
		return false;
#endif
	return true;
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
	switch (amdgpu_aspm) {
	case -1:
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	if (adev->flags & AMD_IS_APU)
		return false;
	if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK))
		return false;
	return pcie_aspm_enabled(adev->pdev);
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
		bool state)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));

	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines the number of bits in the page table versus the
 * page directory; a page is 4KB, so we have 12 bits of offset, a minimum of
 * 9 bits in the page table, and the remaining bits are in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory
	 */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}
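/*
 * Worked example (for clarity): with 4KB pages there is a 12 bit in-page
 * offset, so amdgpu_vm_block_size = 9 means each last-level page table maps
 * 2^9 * 4KB = 2MB of address space, and the remaining virtual address bits
 * are translated by the page directory levels.
 */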
/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
	    (amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		   (amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
{
	if (!(adev->flags & AMD_IS_APU) ||
	    adev->asic_type < CHIP_RAVEN)
		return 0;

	switch (adev->asic_type) {
	case CHIP_RAVEN:
		if (adev->pdev->device == 0x15dd)
			adev->apu_flags |= AMD_APU_IS_RAVEN;
		if (adev->pdev->device == 0x15d8)
			adev->apu_flags |= AMD_APU_IS_PICASSO;
		break;
	case CHIP_RENOIR:
		if ((adev->pdev->device == 0x1636) ||
		    (adev->pdev->device == 0x164c))
			adev->apu_flags |= AMD_APU_IS_RENOIR;
		else
			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
		break;
	case CHIP_VANGOGH:
		adev->apu_flags |= AMD_APU_IS_VANGOGH;
		break;
	case CHIP_YELLOW_CARP:
		break;
	case CHIP_CYAN_SKILLFISH:
		if ((adev->pdev->device == 0x13FE) ||
		    (adev->pdev->device == 0x143F))
			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
		break;
	default:
		break;
	}

	return 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
1961 */ 1962 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1963 { 1964 int i; 1965 1966 if (amdgpu_sched_jobs < 4) { 1967 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 1968 amdgpu_sched_jobs); 1969 amdgpu_sched_jobs = 4; 1970 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 1971 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 1972 amdgpu_sched_jobs); 1973 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 1974 } 1975 1976 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 1977 /* gart size must be greater or equal to 32M */ 1978 dev_warn(adev->dev, "gart size (%d) too small\n", 1979 amdgpu_gart_size); 1980 amdgpu_gart_size = -1; 1981 } 1982 1983 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 1984 /* gtt size must be greater or equal to 32M */ 1985 dev_warn(adev->dev, "gtt size (%d) too small\n", 1986 amdgpu_gtt_size); 1987 amdgpu_gtt_size = -1; 1988 } 1989 1990 /* valid range is between 4 and 9 inclusive */ 1991 if (amdgpu_vm_fragment_size != -1 && 1992 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 1993 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 1994 amdgpu_vm_fragment_size = -1; 1995 } 1996 1997 if (amdgpu_sched_hw_submission < 2) { 1998 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 1999 amdgpu_sched_hw_submission); 2000 amdgpu_sched_hw_submission = 2; 2001 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 2002 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 2003 amdgpu_sched_hw_submission); 2004 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 2005 } 2006 2007 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 2008 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 2009 amdgpu_reset_method = -1; 2010 } 2011 2012 amdgpu_device_check_smu_prv_buffer_size(adev); 2013 2014 amdgpu_device_check_vm_size(adev); 2015 2016 amdgpu_device_check_block_size(adev); 2017 2018 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 2019 2020 for (i = 0; i < MAX_XCP; i++) 2021 adev->enforce_isolation[i] = !!enforce_isolation; 2022 2023 return 0; 2024 } 2025 2026 /** 2027 * amdgpu_switcheroo_set_state - set switcheroo state 2028 * 2029 * @pdev: pci dev pointer 2030 * @state: vga_switcheroo state 2031 * 2032 * Callback for the switcheroo driver. Suspends or resumes 2033 * the asics before or after it is powered up using ACPI methods. 
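 *
 * This callback is not invoked directly by the driver; it is plugged into
 * vga_switcheroo through the client ops declared later in this file, e.g.
 *
 *     .set_gpu_state = amdgpu_switcheroo_set_state,
 *
 * and runs when the mux switches which GPU is active.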
2034 */ 2035 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 2036 enum vga_switcheroo_state state) 2037 { 2038 struct drm_device *dev = pci_get_drvdata(pdev); 2039 int r; 2040 2041 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 2042 return; 2043 2044 if (state == VGA_SWITCHEROO_ON) { 2045 pr_info("switched on\n"); 2046 /* don't suspend or resume card normally */ 2047 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2048 2049 pci_set_power_state(pdev, PCI_D0); 2050 amdgpu_device_load_pci_state(pdev); 2051 r = pci_enable_device(pdev); 2052 if (r) 2053 DRM_WARN("pci_enable_device failed (%d)\n", r); 2054 amdgpu_device_resume(dev, true); 2055 2056 dev->switch_power_state = DRM_SWITCH_POWER_ON; 2057 } else { 2058 pr_info("switched off\n"); 2059 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2060 amdgpu_device_prepare(dev); 2061 amdgpu_device_suspend(dev, true); 2062 amdgpu_device_cache_pci_state(pdev); 2063 /* Shut down the device */ 2064 pci_disable_device(pdev); 2065 pci_set_power_state(pdev, PCI_D3cold); 2066 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 2067 } 2068 } 2069 2070 /** 2071 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 2072 * 2073 * @pdev: pci dev pointer 2074 * 2075 * Callback for the switcheroo driver. Check of the switcheroo 2076 * state can be changed. 2077 * Returns true if the state can be changed, false if not. 2078 */ 2079 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 2080 { 2081 struct drm_device *dev = pci_get_drvdata(pdev); 2082 2083 /* 2084 * FIXME: open_count is protected by drm_global_mutex but that would lead to 2085 * locking inversion with the driver load path. And the access here is 2086 * completely racy anyway. So don't bother with locking for now. 2087 */ 2088 return atomic_read(&dev->open_count) == 0; 2089 } 2090 2091 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 2092 .set_gpu_state = amdgpu_switcheroo_set_state, 2093 .reprobe = NULL, 2094 .can_switch = amdgpu_switcheroo_can_switch, 2095 }; 2096 2097 /** 2098 * amdgpu_device_ip_set_clockgating_state - set the CG state 2099 * 2100 * @dev: amdgpu_device pointer 2101 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2102 * @state: clockgating state (gate or ungate) 2103 * 2104 * Sets the requested clockgating state for all instances of 2105 * the hardware IP specified. 2106 * Returns the error code from the last instance. 2107 */ 2108 int amdgpu_device_ip_set_clockgating_state(void *dev, 2109 enum amd_ip_block_type block_type, 2110 enum amd_clockgating_state state) 2111 { 2112 struct amdgpu_device *adev = dev; 2113 int i, r = 0; 2114 2115 for (i = 0; i < adev->num_ip_blocks; i++) { 2116 if (!adev->ip_blocks[i].status.valid) 2117 continue; 2118 if (adev->ip_blocks[i].version->type != block_type) 2119 continue; 2120 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 2121 continue; 2122 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 2123 (void *)adev, state); 2124 if (r) 2125 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 2126 adev->ip_blocks[i].version->funcs->name, r); 2127 } 2128 return r; 2129 } 2130 2131 /** 2132 * amdgpu_device_ip_set_powergating_state - set the PG state 2133 * 2134 * @dev: amdgpu_device pointer 2135 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2136 * @state: powergating state (gate or ungate) 2137 * 2138 * Sets the requested powergating state for all instances of 2139 * the hardware IP specified. 
2140 * Returns the error code from the last instance. 2141 */ 2142 int amdgpu_device_ip_set_powergating_state(void *dev, 2143 enum amd_ip_block_type block_type, 2144 enum amd_powergating_state state) 2145 { 2146 struct amdgpu_device *adev = dev; 2147 int i, r = 0; 2148 2149 for (i = 0; i < adev->num_ip_blocks; i++) { 2150 if (!adev->ip_blocks[i].status.valid) 2151 continue; 2152 if (adev->ip_blocks[i].version->type != block_type) 2153 continue; 2154 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 2155 continue; 2156 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 2157 (void *)adev, state); 2158 if (r) 2159 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 2160 adev->ip_blocks[i].version->funcs->name, r); 2161 } 2162 return r; 2163 } 2164 2165 /** 2166 * amdgpu_device_ip_get_clockgating_state - get the CG state 2167 * 2168 * @adev: amdgpu_device pointer 2169 * @flags: clockgating feature flags 2170 * 2171 * Walks the list of IPs on the device and updates the clockgating 2172 * flags for each IP. 2173 * Updates @flags with the feature flags for each hardware IP where 2174 * clockgating is enabled. 2175 */ 2176 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 2177 u64 *flags) 2178 { 2179 int i; 2180 2181 for (i = 0; i < adev->num_ip_blocks; i++) { 2182 if (!adev->ip_blocks[i].status.valid) 2183 continue; 2184 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 2185 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 2186 } 2187 } 2188 2189 /** 2190 * amdgpu_device_ip_wait_for_idle - wait for idle 2191 * 2192 * @adev: amdgpu_device pointer 2193 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2194 * 2195 * Waits for the requested hardware IP to be idle. 2196 * Returns 0 for success or a negative error code on failure. 2197 */ 2198 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 2199 enum amd_ip_block_type block_type) 2200 { 2201 int i, r; 2202 2203 for (i = 0; i < adev->num_ip_blocks; i++) { 2204 if (!adev->ip_blocks[i].status.valid) 2205 continue; 2206 if (adev->ip_blocks[i].version->type == block_type) { 2207 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev); 2208 if (r) 2209 return r; 2210 break; 2211 } 2212 } 2213 return 0; 2214 2215 } 2216 2217 /** 2218 * amdgpu_device_ip_is_valid - is the hardware IP enabled 2219 * 2220 * @adev: amdgpu_device pointer 2221 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2222 * 2223 * Checks if the hardware IP is enabled or not. 2224 * Returns true if the IP is enabled, false if not. 2225 */ 2226 bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev, 2227 enum amd_ip_block_type block_type) 2228 { 2229 int i; 2230 2231 for (i = 0; i < adev->num_ip_blocks; i++) { 2232 if (adev->ip_blocks[i].version->type == block_type) 2233 return adev->ip_blocks[i].status.valid; 2234 } 2235 return false; 2236 2237 } 2238 2239 /** 2240 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 2241 * 2242 * @adev: amdgpu_device pointer 2243 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 2244 * 2245 * Returns a pointer to the hardware IP block structure 2246 * if it exists for the asic, otherwise NULL.
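 *
 * Example (illustrative sketch of a possible caller, not taken from this
 * file):
 *
 *     struct amdgpu_ip_block *gfx =
 *             amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *     if (gfx && gfx->status.valid)
 *             dev_info(adev->dev, "GFX IP v%u.%u\n",
 *                      gfx->version->major, gfx->version->minor);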
2247 */ 2248 struct amdgpu_ip_block * 2249 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 2250 enum amd_ip_block_type type) 2251 { 2252 int i; 2253 2254 for (i = 0; i < adev->num_ip_blocks; i++) 2255 if (adev->ip_blocks[i].version->type == type) 2256 return &adev->ip_blocks[i]; 2257 2258 return NULL; 2259 } 2260 2261 /** 2262 * amdgpu_device_ip_block_version_cmp 2263 * 2264 * @adev: amdgpu_device pointer 2265 * @type: enum amd_ip_block_type 2266 * @major: major version 2267 * @minor: minor version 2268 * 2269 * Returns 0 if the IP block version is equal to or greater than the given version, 2270 * or 1 if it is smaller or the ip_block doesn't exist. 2271 */ 2272 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 2273 enum amd_ip_block_type type, 2274 u32 major, u32 minor) 2275 { 2276 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 2277 2278 if (ip_block && ((ip_block->version->major > major) || 2279 ((ip_block->version->major == major) && 2280 (ip_block->version->minor >= minor)))) 2281 return 0; 2282 2283 return 1; 2284 } 2285 2286 /** 2287 * amdgpu_device_ip_block_add 2288 * 2289 * @adev: amdgpu_device pointer 2290 * @ip_block_version: pointer to the IP to add 2291 * 2292 * Adds the IP block driver information to the collection of IPs 2293 * on the asic. 2294 */ 2295 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 2296 const struct amdgpu_ip_block_version *ip_block_version) 2297 { 2298 if (!ip_block_version) 2299 return -EINVAL; 2300 2301 switch (ip_block_version->type) { 2302 case AMD_IP_BLOCK_TYPE_VCN: 2303 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2304 return 0; 2305 break; 2306 case AMD_IP_BLOCK_TYPE_JPEG: 2307 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2308 return 0; 2309 break; 2310 default: 2311 break; 2312 } 2313 2314 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, 2315 ip_block_version->funcs->name); 2316 2317 adev->ip_blocks[adev->num_ip_blocks].adev = adev; 2318 2319 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2320 2321 return 0; 2322 } 2323 2324 /** 2325 * amdgpu_device_enable_virtual_display - enable virtual display feature 2326 * 2327 * @adev: amdgpu_device pointer 2328 * 2329 * Enables the virtual display feature if the user has enabled it via 2330 * the module parameter virtual_display. This feature provides virtual 2331 * display hardware on headless boards or in virtualized environments. 2332 * This function parses and validates the configuration string specified by 2333 * the user and configures the virtual display configuration (number of 2334 * virtual connectors, crtcs, etc.) specified.
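 *
 * Example (illustrative; the syntax is assumed from the parsing code below):
 * amdgpu.virtual_display=0000:04:00.0,2 enables two virtual CRTCs on the
 * device at that PCI address, while amdgpu.virtual_display=all,1 enables a
 * single virtual CRTC on every device bound to the driver.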
2335 */ 2336 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2337 { 2338 adev->enable_virtual_display = false; 2339 2340 if (amdgpu_virtual_display) { 2341 const char *pci_address_name = pci_name(adev->pdev); 2342 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2343 2344 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2345 pciaddstr_tmp = pciaddstr; 2346 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2347 pciaddname = strsep(&pciaddname_tmp, ","); 2348 if (!strcmp("all", pciaddname) 2349 || !strcmp(pci_address_name, pciaddname)) { 2350 long num_crtc; 2351 int res = -1; 2352 2353 adev->enable_virtual_display = true; 2354 2355 if (pciaddname_tmp) 2356 res = kstrtol(pciaddname_tmp, 10, 2357 &num_crtc); 2358 2359 if (!res) { 2360 if (num_crtc < 1) 2361 num_crtc = 1; 2362 if (num_crtc > 6) 2363 num_crtc = 6; 2364 adev->mode_info.num_crtc = num_crtc; 2365 } else { 2366 adev->mode_info.num_crtc = 1; 2367 } 2368 break; 2369 } 2370 } 2371 2372 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2373 amdgpu_virtual_display, pci_address_name, 2374 adev->enable_virtual_display, adev->mode_info.num_crtc); 2375 2376 kfree(pciaddstr); 2377 } 2378 } 2379 2380 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2381 { 2382 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2383 adev->mode_info.num_crtc = 1; 2384 adev->enable_virtual_display = true; 2385 DRM_INFO("virtual_display:%d, num_crtc:%d\n", 2386 adev->enable_virtual_display, adev->mode_info.num_crtc); 2387 } 2388 } 2389 2390 /** 2391 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2392 * 2393 * @adev: amdgpu_device pointer 2394 * 2395 * Parses the asic configuration parameters specified in the gpu info 2396 * firmware and makes them availale to the driver for use in configuring 2397 * the asic. 2398 * Returns 0 on success, -EINVAL on failure. 2399 */ 2400 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2401 { 2402 const char *chip_name; 2403 int err; 2404 const struct gpu_info_firmware_header_v1_0 *hdr; 2405 2406 adev->firmware.gpu_info_fw = NULL; 2407 2408 if (adev->mman.discovery_bin) 2409 return 0; 2410 2411 switch (adev->asic_type) { 2412 default: 2413 return 0; 2414 case CHIP_VEGA10: 2415 chip_name = "vega10"; 2416 break; 2417 case CHIP_VEGA12: 2418 chip_name = "vega12"; 2419 break; 2420 case CHIP_RAVEN: 2421 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2422 chip_name = "raven2"; 2423 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2424 chip_name = "picasso"; 2425 else 2426 chip_name = "raven"; 2427 break; 2428 case CHIP_ARCTURUS: 2429 chip_name = "arcturus"; 2430 break; 2431 case CHIP_NAVI12: 2432 chip_name = "navi12"; 2433 break; 2434 } 2435 2436 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, 2437 "amdgpu/%s_gpu_info.bin", chip_name); 2438 if (err) { 2439 dev_err(adev->dev, 2440 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n", 2441 chip_name); 2442 goto out; 2443 } 2444 2445 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2446 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2447 2448 switch (hdr->version_major) { 2449 case 1: 2450 { 2451 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2452 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2453 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2454 2455 /* 2456 * Should be droped when DAL no longer needs it. 
2457 */ 2458 if (adev->asic_type == CHIP_NAVI12) 2459 goto parse_soc_bounding_box; 2460 2461 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2462 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2463 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2464 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2465 adev->gfx.config.max_texture_channel_caches = 2466 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2467 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2468 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2469 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2470 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2471 adev->gfx.config.double_offchip_lds_buf = 2472 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2473 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2474 adev->gfx.cu_info.max_waves_per_simd = 2475 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2476 adev->gfx.cu_info.max_scratch_slots_per_cu = 2477 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2478 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2479 if (hdr->version_minor >= 1) { 2480 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2481 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2482 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2483 adev->gfx.config.num_sc_per_sh = 2484 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2485 adev->gfx.config.num_packer_per_sc = 2486 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2487 } 2488 2489 parse_soc_bounding_box: 2490 /* 2491 * soc bounding box info is not integrated into the discovery table, 2492 * we always need to parse it from gpu info firmware if needed. 2493 */ 2494 if (hdr->version_minor == 2) { 2495 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2496 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2497 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2498 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2499 } 2500 break; 2501 } 2502 default: 2503 dev_err(adev->dev, 2504 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2505 err = -EINVAL; 2506 goto out; 2507 } 2508 out: 2509 return err; 2510 } 2511 2512 /** 2513 * amdgpu_device_ip_early_init - run early init for hardware IPs 2514 * 2515 * @adev: amdgpu_device pointer 2516 * 2517 * Early initialization pass for hardware IPs. The hardware IPs that make 2518 * up each asic are discovered and each IP's early_init callback is run. This 2519 * is the first stage in initializing the asic. 2520 * Returns 0 on success, negative error code on failure.
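 *
 * Example (illustrative; the parameter name is assumed from the
 * amdgpu_ip_block_mask check below): clearing bit i in amdgpu.ip_block_mask
 * disables the i-th discovered IP block and logs "disabled ip block" for it
 * during this pass.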
2521 */ 2522 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2523 { 2524 struct amdgpu_ip_block *ip_block; 2525 struct pci_dev *parent; 2526 int i, r; 2527 bool total; 2528 2529 amdgpu_device_enable_virtual_display(adev); 2530 2531 if (amdgpu_sriov_vf(adev)) { 2532 r = amdgpu_virt_request_full_gpu(adev, true); 2533 if (r) 2534 return r; 2535 } 2536 2537 switch (adev->asic_type) { 2538 #ifdef CONFIG_DRM_AMDGPU_SI 2539 case CHIP_VERDE: 2540 case CHIP_TAHITI: 2541 case CHIP_PITCAIRN: 2542 case CHIP_OLAND: 2543 case CHIP_HAINAN: 2544 adev->family = AMDGPU_FAMILY_SI; 2545 r = si_set_ip_blocks(adev); 2546 if (r) 2547 return r; 2548 break; 2549 #endif 2550 #ifdef CONFIG_DRM_AMDGPU_CIK 2551 case CHIP_BONAIRE: 2552 case CHIP_HAWAII: 2553 case CHIP_KAVERI: 2554 case CHIP_KABINI: 2555 case CHIP_MULLINS: 2556 if (adev->flags & AMD_IS_APU) 2557 adev->family = AMDGPU_FAMILY_KV; 2558 else 2559 adev->family = AMDGPU_FAMILY_CI; 2560 2561 r = cik_set_ip_blocks(adev); 2562 if (r) 2563 return r; 2564 break; 2565 #endif 2566 case CHIP_TOPAZ: 2567 case CHIP_TONGA: 2568 case CHIP_FIJI: 2569 case CHIP_POLARIS10: 2570 case CHIP_POLARIS11: 2571 case CHIP_POLARIS12: 2572 case CHIP_VEGAM: 2573 case CHIP_CARRIZO: 2574 case CHIP_STONEY: 2575 if (adev->flags & AMD_IS_APU) 2576 adev->family = AMDGPU_FAMILY_CZ; 2577 else 2578 adev->family = AMDGPU_FAMILY_VI; 2579 2580 r = vi_set_ip_blocks(adev); 2581 if (r) 2582 return r; 2583 break; 2584 default: 2585 r = amdgpu_discovery_set_ip_blocks(adev); 2586 if (r) 2587 return r; 2588 break; 2589 } 2590 2591 if (amdgpu_has_atpx() && 2592 (amdgpu_is_atpx_hybrid() || 2593 amdgpu_has_atpx_dgpu_power_cntl()) && 2594 ((adev->flags & AMD_IS_APU) == 0) && 2595 !dev_is_removable(&adev->pdev->dev)) 2596 adev->flags |= AMD_IS_PX; 2597 2598 if (!(adev->flags & AMD_IS_APU)) { 2599 parent = pcie_find_root_port(adev->pdev); 2600 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2601 } 2602 2603 2604 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2605 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2606 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2607 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2608 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2609 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2610 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2611 2612 total = true; 2613 for (i = 0; i < adev->num_ip_blocks; i++) { 2614 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2615 DRM_WARN("disabled ip block: %d <%s>\n", 2616 i, adev->ip_blocks[i].version->funcs->name); 2617 adev->ip_blocks[i].status.valid = false; 2618 } else { 2619 if (adev->ip_blocks[i].version->funcs->early_init) { 2620 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev); 2621 if (r == -ENOENT) { 2622 adev->ip_blocks[i].status.valid = false; 2623 } else if (r) { 2624 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2625 adev->ip_blocks[i].version->funcs->name, r); 2626 total = false; 2627 } else { 2628 adev->ip_blocks[i].status.valid = true; 2629 } 2630 } else { 2631 adev->ip_blocks[i].status.valid = true; 2632 } 2633 } 2634 /* get the vbios after the asic_funcs are set up */ 2635 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2636 r = amdgpu_device_parse_gpu_info_fw(adev); 2637 if (r) 2638 return r; 2639 2640 /* Read BIOS */ 2641 if (amdgpu_device_read_bios(adev)) { 2642 if (!amdgpu_get_bios(adev)) 2643 return -EINVAL; 2644 2645 r = amdgpu_atombios_init(adev); 2646 if (r) { 2647 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2648 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2649 return r; 2650 } 2651 } 2652 2653 /*get pf2vf msg info at it's earliest time*/ 2654 if (amdgpu_sriov_vf(adev)) 2655 amdgpu_virt_init_data_exchange(adev); 2656 2657 } 2658 } 2659 if (!total) 2660 return -ENODEV; 2661 2662 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 2663 if (ip_block->status.valid != false) 2664 amdgpu_amdkfd_device_probe(adev); 2665 2666 adev->cg_flags &= amdgpu_cg_mask; 2667 adev->pg_flags &= amdgpu_pg_mask; 2668 2669 return 0; 2670 } 2671 2672 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2673 { 2674 int i, r; 2675 2676 for (i = 0; i < adev->num_ip_blocks; i++) { 2677 if (!adev->ip_blocks[i].status.sw) 2678 continue; 2679 if (adev->ip_blocks[i].status.hw) 2680 continue; 2681 if (!amdgpu_ip_member_of_hwini( 2682 adev, adev->ip_blocks[i].version->type)) 2683 continue; 2684 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2685 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2686 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2687 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2688 if (r) { 2689 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2690 adev->ip_blocks[i].version->funcs->name, r); 2691 return r; 2692 } 2693 adev->ip_blocks[i].status.hw = true; 2694 } 2695 } 2696 2697 return 0; 2698 } 2699 2700 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2701 { 2702 int i, r; 2703 2704 for (i = 0; i < adev->num_ip_blocks; i++) { 2705 if (!adev->ip_blocks[i].status.sw) 2706 continue; 2707 if (adev->ip_blocks[i].status.hw) 2708 continue; 2709 if (!amdgpu_ip_member_of_hwini( 2710 adev, adev->ip_blocks[i].version->type)) 2711 continue; 2712 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2713 if (r) { 2714 
DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2715 adev->ip_blocks[i].version->funcs->name, r); 2716 return r; 2717 } 2718 adev->ip_blocks[i].status.hw = true; 2719 } 2720 2721 return 0; 2722 } 2723 2724 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2725 { 2726 int r = 0; 2727 int i; 2728 uint32_t smu_version; 2729 2730 if (adev->asic_type >= CHIP_VEGA10) { 2731 for (i = 0; i < adev->num_ip_blocks; i++) { 2732 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2733 continue; 2734 2735 if (!amdgpu_ip_member_of_hwini(adev, 2736 AMD_IP_BLOCK_TYPE_PSP)) 2737 break; 2738 2739 if (!adev->ip_blocks[i].status.sw) 2740 continue; 2741 2742 /* no need to do the fw loading again if already done*/ 2743 if (adev->ip_blocks[i].status.hw == true) 2744 break; 2745 2746 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2747 r = adev->ip_blocks[i].version->funcs->resume(adev); 2748 if (r) { 2749 DRM_ERROR("resume of IP block <%s> failed %d\n", 2750 adev->ip_blocks[i].version->funcs->name, r); 2751 return r; 2752 } 2753 } else { 2754 r = adev->ip_blocks[i].version->funcs->hw_init(adev); 2755 if (r) { 2756 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2757 adev->ip_blocks[i].version->funcs->name, r); 2758 return r; 2759 } 2760 } 2761 2762 adev->ip_blocks[i].status.hw = true; 2763 break; 2764 } 2765 } 2766 2767 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2768 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2769 2770 return r; 2771 } 2772 2773 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2774 { 2775 long timeout; 2776 int r, i; 2777 2778 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2779 struct amdgpu_ring *ring = adev->rings[i]; 2780 2781 /* No need to setup the GPU scheduler for rings that don't need it */ 2782 if (!ring || ring->no_scheduler) 2783 continue; 2784 2785 switch (ring->funcs->type) { 2786 case AMDGPU_RING_TYPE_GFX: 2787 timeout = adev->gfx_timeout; 2788 break; 2789 case AMDGPU_RING_TYPE_COMPUTE: 2790 timeout = adev->compute_timeout; 2791 break; 2792 case AMDGPU_RING_TYPE_SDMA: 2793 timeout = adev->sdma_timeout; 2794 break; 2795 default: 2796 timeout = adev->video_timeout; 2797 break; 2798 } 2799 2800 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL, 2801 DRM_SCHED_PRIORITY_COUNT, 2802 ring->num_hw_submission, 0, 2803 timeout, adev->reset_domain->wq, 2804 ring->sched_score, ring->name, 2805 adev->dev); 2806 if (r) { 2807 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2808 ring->name); 2809 return r; 2810 } 2811 r = amdgpu_uvd_entity_init(adev, ring); 2812 if (r) { 2813 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n", 2814 ring->name); 2815 return r; 2816 } 2817 r = amdgpu_vce_entity_init(adev, ring); 2818 if (r) { 2819 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n", 2820 ring->name); 2821 return r; 2822 } 2823 } 2824 2825 amdgpu_xcp_update_partition_sched_list(adev); 2826 2827 return 0; 2828 } 2829 2830 2831 /** 2832 * amdgpu_device_ip_init - run init for hardware IPs 2833 * 2834 * @adev: amdgpu_device pointer 2835 * 2836 * Main initialization pass for hardware IPs. The list of all the hardware 2837 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2838 * are run. sw_init initializes the software state associated with each IP 2839 * and hw_init initializes the hardware associated with each IP. 2840 * Returns 0 on success, negative error code on failure. 
2841 */ 2842 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2843 { 2844 bool init_badpage; 2845 int i, r; 2846 2847 r = amdgpu_ras_init(adev); 2848 if (r) 2849 return r; 2850 2851 for (i = 0; i < adev->num_ip_blocks; i++) { 2852 if (!adev->ip_blocks[i].status.valid) 2853 continue; 2854 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev); 2855 if (r) { 2856 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2857 adev->ip_blocks[i].version->funcs->name, r); 2858 goto init_failed; 2859 } 2860 adev->ip_blocks[i].status.sw = true; 2861 2862 if (!amdgpu_ip_member_of_hwini( 2863 adev, adev->ip_blocks[i].version->type)) 2864 continue; 2865 2866 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2867 /* need to do common hw init early so everything is set up for gmc */ 2868 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2869 if (r) { 2870 DRM_ERROR("hw_init %d failed %d\n", i, r); 2871 goto init_failed; 2872 } 2873 adev->ip_blocks[i].status.hw = true; 2874 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2875 /* need to do gmc hw init early so we can allocate gpu mem */ 2876 /* Try to reserve bad pages early */ 2877 if (amdgpu_sriov_vf(adev)) 2878 amdgpu_virt_exchange_data(adev); 2879 2880 r = amdgpu_device_mem_scratch_init(adev); 2881 if (r) { 2882 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2883 goto init_failed; 2884 } 2885 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); 2886 if (r) { 2887 DRM_ERROR("hw_init %d failed %d\n", i, r); 2888 goto init_failed; 2889 } 2890 r = amdgpu_device_wb_init(adev); 2891 if (r) { 2892 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2893 goto init_failed; 2894 } 2895 adev->ip_blocks[i].status.hw = true; 2896 2897 /* right after GMC hw init, we create CSA */ 2898 if (adev->gfx.mcbp) { 2899 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2900 AMDGPU_GEM_DOMAIN_VRAM | 2901 AMDGPU_GEM_DOMAIN_GTT, 2902 AMDGPU_CSA_SIZE); 2903 if (r) { 2904 DRM_ERROR("allocate CSA failed %d\n", r); 2905 goto init_failed; 2906 } 2907 } 2908 2909 r = amdgpu_seq64_init(adev); 2910 if (r) { 2911 DRM_ERROR("allocate seq64 failed %d\n", r); 2912 goto init_failed; 2913 } 2914 } 2915 } 2916 2917 if (amdgpu_sriov_vf(adev)) 2918 amdgpu_virt_init_data_exchange(adev); 2919 2920 r = amdgpu_ib_pool_init(adev); 2921 if (r) { 2922 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2923 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2924 goto init_failed; 2925 } 2926 2927 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2928 if (r) 2929 goto init_failed; 2930 2931 r = amdgpu_device_ip_hw_init_phase1(adev); 2932 if (r) 2933 goto init_failed; 2934 2935 r = amdgpu_device_fw_loading(adev); 2936 if (r) 2937 goto init_failed; 2938 2939 r = amdgpu_device_ip_hw_init_phase2(adev); 2940 if (r) 2941 goto init_failed; 2942 2943 /* 2944 * retired pages will be loaded from eeprom and reserved here, 2945 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2946 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2947 * for I2C communication which only true at this point. 2948 * 2949 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2950 * failure from bad gpu situation and stop amdgpu init process 2951 * accordingly. For other failed cases, it will still release all 2952 * the resource and print error message, rather than returning one 2953 * negative value to upper level. 
2954 * 2955 * Note: theoretically, this should be called before all vram allocations 2956 * to keep retired pages from being allocated and used again 2957 */ 2958 init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 2959 r = amdgpu_ras_recovery_init(adev, init_badpage); 2960 if (r) 2961 goto init_failed; 2962 2963 /** 2964 * In case of XGMI grab extra reference for reset domain for this device 2965 */ 2966 if (adev->gmc.xgmi.num_physical_nodes > 1) { 2967 if (amdgpu_xgmi_add_device(adev) == 0) { 2968 if (!amdgpu_sriov_vf(adev)) { 2969 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 2970 2971 if (WARN_ON(!hive)) { 2972 r = -ENOENT; 2973 goto init_failed; 2974 } 2975 2976 if (!hive->reset_domain || 2977 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 2978 r = -ENOENT; 2979 amdgpu_put_xgmi_hive(hive); 2980 goto init_failed; 2981 } 2982 2983 /* Drop the early temporary reset domain we created for device */ 2984 amdgpu_reset_put_reset_domain(adev->reset_domain); 2985 adev->reset_domain = hive->reset_domain; 2986 amdgpu_put_xgmi_hive(hive); 2987 } 2988 } 2989 } 2990 2991 r = amdgpu_device_init_schedulers(adev); 2992 if (r) 2993 goto init_failed; 2994 2995 if (adev->mman.buffer_funcs_ring->sched.ready) 2996 amdgpu_ttm_set_buffer_funcs_status(adev, true); 2997 2998 /* Don't init kfd if the whole hive needs to be reset during init */ 2999 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 3000 kgd2kfd_init_zone_device(adev); 3001 amdgpu_amdkfd_device_init(adev); 3002 } 3003 3004 amdgpu_fru_get_product_info(adev); 3005 3006 init_failed: 3007 3008 return r; 3009 } 3010 3011 /** 3012 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 3013 * 3014 * @adev: amdgpu_device pointer 3015 * 3016 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 3017 * this function before a GPU reset. If the value is retained after a 3018 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 3019 */ 3020 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 3021 { 3022 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 3023 } 3024 3025 /** 3026 * amdgpu_device_check_vram_lost - check if vram is valid 3027 * 3028 * @adev: amdgpu_device pointer 3029 * 3030 * Checks the reset magic value written to the gart pointer in VRAM. 3031 * The driver calls this after a GPU reset to see if the contents of 3032 * VRAM are lost or not. 3033 * Returns true if vram is lost, false if not. 3034 */ 3035 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 3036 { 3037 if (memcmp(adev->gart.ptr, adev->reset_magic, 3038 AMDGPU_RESET_MAGIC_NUM)) 3039 return true; 3040 3041 if (!amdgpu_in_reset(adev)) 3042 return false; 3043 3044 /* 3045 * For all ASICs with baco/mode1 reset, the VRAM is 3046 * always assumed to be lost. 3047 */ 3048 switch (amdgpu_asic_reset_method(adev)) { 3049 case AMD_RESET_METHOD_BACO: 3050 case AMD_RESET_METHOD_MODE1: 3051 return true; 3052 default: 3053 return false; 3054 } 3055 } 3056 3057 /** 3058 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 3059 * 3060 * @adev: amdgpu_device pointer 3061 * @state: clockgating state (gate or ungate) 3062 * 3063 * The list of all the hardware IPs that make up the asic is walked and the 3064 * set_clockgating_state callbacks are run. 3065 * During the late initialization pass this enables clockgating for hardware IPs; 3066 * during the fini or suspend pass it disables clockgating. 3067 * Returns 0 on success, negative error code on failure.
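 *
 * Example (taken from the callers later in this file):
 *
 *     amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
 *     amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
 *
 * as done at the end of late init, with the UNGATE variants used in the
 * early fini and suspend paths before hardware teardown.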
3068 */ 3069 3070 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 3071 enum amd_clockgating_state state) 3072 { 3073 int i, j, r; 3074 3075 if (amdgpu_emu_mode == 1) 3076 return 0; 3077 3078 for (j = 0; j < adev->num_ip_blocks; j++) { 3079 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3080 if (!adev->ip_blocks[i].status.late_initialized) 3081 continue; 3082 /* skip CG for GFX, SDMA on S0ix */ 3083 if (adev->in_s0ix && 3084 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3085 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3086 continue; 3087 /* skip CG for VCE/UVD, it's handled specially */ 3088 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3089 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3090 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3091 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3092 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 3093 /* enable clockgating to save power */ 3094 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 3095 state); 3096 if (r) { 3097 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 3098 adev->ip_blocks[i].version->funcs->name, r); 3099 return r; 3100 } 3101 } 3102 } 3103 3104 return 0; 3105 } 3106 3107 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 3108 enum amd_powergating_state state) 3109 { 3110 int i, j, r; 3111 3112 if (amdgpu_emu_mode == 1) 3113 return 0; 3114 3115 for (j = 0; j < adev->num_ip_blocks; j++) { 3116 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3117 if (!adev->ip_blocks[i].status.late_initialized) 3118 continue; 3119 /* skip PG for GFX, SDMA on S0ix */ 3120 if (adev->in_s0ix && 3121 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3122 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3123 continue; 3124 /* skip CG for VCE/UVD, it's handled specially */ 3125 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3126 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3127 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3128 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3129 adev->ip_blocks[i].version->funcs->set_powergating_state) { 3130 /* enable powergating to save power */ 3131 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 3132 state); 3133 if (r) { 3134 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 3135 adev->ip_blocks[i].version->funcs->name, r); 3136 return r; 3137 } 3138 } 3139 } 3140 return 0; 3141 } 3142 3143 static int amdgpu_device_enable_mgpu_fan_boost(void) 3144 { 3145 struct amdgpu_gpu_instance *gpu_ins; 3146 struct amdgpu_device *adev; 3147 int i, ret = 0; 3148 3149 mutex_lock(&mgpu_info.mutex); 3150 3151 /* 3152 * MGPU fan boost feature should be enabled 3153 * only when there are two or more dGPUs in 3154 * the system 3155 */ 3156 if (mgpu_info.num_dgpu < 2) 3157 goto out; 3158 3159 for (i = 0; i < mgpu_info.num_dgpu; i++) { 3160 gpu_ins = &(mgpu_info.gpu_ins[i]); 3161 adev = gpu_ins->adev; 3162 if (!(adev->flags & AMD_IS_APU) && 3163 !gpu_ins->mgpu_fan_enabled) { 3164 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 3165 if (ret) 3166 break; 3167 3168 gpu_ins->mgpu_fan_enabled = 1; 3169 } 3170 } 3171 3172 out: 3173 mutex_unlock(&mgpu_info.mutex); 3174 3175 return ret; 3176 } 3177 3178 /** 3179 * amdgpu_device_ip_late_init - run late init for hardware IPs 3180 * 3181 * @adev: 
amdgpu_device pointer 3182 * 3183 * Late initialization pass for hardware IPs. The list of all the hardware 3184 * IPs that make up the asic is walked and the late_init callbacks are run. 3185 * late_init covers any special initialization that an IP requires 3186 * after all of the IPs have been initialized or something that needs to happen 3187 * late in the init process. 3188 * Returns 0 on success, negative error code on failure. 3189 */ 3190 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 3191 { 3192 struct amdgpu_gpu_instance *gpu_instance; 3193 int i = 0, r; 3194 3195 for (i = 0; i < adev->num_ip_blocks; i++) { 3196 if (!adev->ip_blocks[i].status.hw) 3197 continue; 3198 if (adev->ip_blocks[i].version->funcs->late_init) { 3199 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev); 3200 if (r) { 3201 DRM_ERROR("late_init of IP block <%s> failed %d\n", 3202 adev->ip_blocks[i].version->funcs->name, r); 3203 return r; 3204 } 3205 } 3206 adev->ip_blocks[i].status.late_initialized = true; 3207 } 3208 3209 r = amdgpu_ras_late_init(adev); 3210 if (r) { 3211 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 3212 return r; 3213 } 3214 3215 if (!amdgpu_in_reset(adev)) 3216 amdgpu_ras_set_error_query_ready(adev, true); 3217 3218 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3219 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3220 3221 amdgpu_device_fill_reset_magic(adev); 3222 3223 r = amdgpu_device_enable_mgpu_fan_boost(); 3224 if (r) 3225 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 3226 3227 /* For passthrough configuration on arcturus and aldebaran, enable special SBR handling */ 3228 if (amdgpu_passthrough(adev) && 3229 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 3230 adev->asic_type == CHIP_ALDEBARAN)) 3231 amdgpu_dpm_handle_passthrough_sbr(adev, true); 3232 3233 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3234 mutex_lock(&mgpu_info.mutex); 3235 3236 /* 3237 * Reset the device p-state to low as it was booted high. 3238 * 3239 * This should be performed only after all devices from the same 3240 * hive get initialized. 3241 * 3242 * However, it's unknown in advance how many devices are in the hive, 3243 * as they are counted one by one during device initialization. 3244 * 3245 * So we wait for all XGMI interlinked devices to be initialized. 3246 * This may bring some delays as those devices may come from 3247 * different hives. But that should be OK.
3248 */ 3249 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3250 for (i = 0; i < mgpu_info.num_gpu; i++) { 3251 gpu_instance = &(mgpu_info.gpu_ins[i]); 3252 if (gpu_instance->adev->flags & AMD_IS_APU) 3253 continue; 3254 3255 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3256 AMDGPU_XGMI_PSTATE_MIN); 3257 if (r) { 3258 DRM_ERROR("pstate setting failed (%d).\n", r); 3259 break; 3260 } 3261 } 3262 } 3263 3264 mutex_unlock(&mgpu_info.mutex); 3265 } 3266 3267 return 0; 3268 } 3269 3270 /** 3271 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3272 * 3273 * @adev: amdgpu_device pointer 3274 * 3275 * For ASICs that need to disable the SMC first 3276 */ 3277 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3278 { 3279 int i, r; 3280 3281 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3282 return; 3283 3284 for (i = 0; i < adev->num_ip_blocks; i++) { 3285 if (!adev->ip_blocks[i].status.hw) 3286 continue; 3287 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3288 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 3289 /* XXX handle errors */ 3290 if (r) { 3291 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3292 adev->ip_blocks[i].version->funcs->name, r); 3293 } 3294 adev->ip_blocks[i].status.hw = false; 3295 break; 3296 } 3297 } 3298 } 3299 3300 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3301 { 3302 int i, r; 3303 3304 for (i = 0; i < adev->num_ip_blocks; i++) { 3305 if (!adev->ip_blocks[i].version->funcs->early_fini) 3306 continue; 3307 3308 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev); 3309 if (r) { 3310 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 3311 adev->ip_blocks[i].version->funcs->name, r); 3312 } 3313 } 3314 3315 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3316 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3317 3318 amdgpu_amdkfd_suspend(adev, false); 3319 3320 /* Workaround for ASICs that need to disable the SMC first */ 3321 amdgpu_device_smu_fini_early(adev); 3322 3323 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3324 if (!adev->ip_blocks[i].status.hw) 3325 continue; 3326 3327 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); 3328 /* XXX handle errors */ 3329 if (r) { 3330 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3331 adev->ip_blocks[i].version->funcs->name, r); 3332 } 3333 3334 adev->ip_blocks[i].status.hw = false; 3335 } 3336 3337 if (amdgpu_sriov_vf(adev)) { 3338 if (amdgpu_virt_release_full_gpu(adev, false)) 3339 DRM_ERROR("failed to release exclusive mode on fini\n"); 3340 } 3341 3342 return 0; 3343 } 3344 3345 /** 3346 * amdgpu_device_ip_fini - run fini for hardware IPs 3347 * 3348 * @adev: amdgpu_device pointer 3349 * 3350 * Main teardown pass for hardware IPs. The list of all the hardware 3351 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3352 * are run. hw_fini tears down the hardware associated with each IP 3353 * and sw_fini tears down any software state associated with each IP. 3354 * Returns 0 on success, negative error code on failure.
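 *
 * Teardown order, as a sketch of the code below: XGMI device removal and
 * KFD teardown first, then sw_fini in reverse IP order (the GMC pass also
 * frees the ucode BO, static CSA, writeback buffers, scratch memory, IB
 * pool and seq64), followed by late_fini in reverse order and finally
 * amdgpu_ras_fini().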
3355 */ 3356 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3357 { 3358 int i, r; 3359 3360 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3361 amdgpu_virt_release_ras_err_handler_data(adev); 3362 3363 if (adev->gmc.xgmi.num_physical_nodes > 1) 3364 amdgpu_xgmi_remove_device(adev); 3365 3366 amdgpu_amdkfd_device_fini_sw(adev); 3367 3368 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3369 if (!adev->ip_blocks[i].status.sw) 3370 continue; 3371 3372 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3373 amdgpu_ucode_free_bo(adev); 3374 amdgpu_free_static_csa(&adev->virt.csa_obj); 3375 amdgpu_device_wb_fini(adev); 3376 amdgpu_device_mem_scratch_fini(adev); 3377 amdgpu_ib_pool_fini(adev); 3378 amdgpu_seq64_fini(adev); 3379 } 3380 3381 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); 3382 /* XXX handle errors */ 3383 if (r) { 3384 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 3385 adev->ip_blocks[i].version->funcs->name, r); 3386 } 3387 adev->ip_blocks[i].status.sw = false; 3388 adev->ip_blocks[i].status.valid = false; 3389 } 3390 3391 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3392 if (!adev->ip_blocks[i].status.late_initialized) 3393 continue; 3394 if (adev->ip_blocks[i].version->funcs->late_fini) 3395 adev->ip_blocks[i].version->funcs->late_fini((void *)adev); 3396 adev->ip_blocks[i].status.late_initialized = false; 3397 } 3398 3399 amdgpu_ras_fini(adev); 3400 3401 return 0; 3402 } 3403 3404 /** 3405 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3406 * 3407 * @work: work_struct. 3408 */ 3409 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3410 { 3411 struct amdgpu_device *adev = 3412 container_of(work, struct amdgpu_device, delayed_init_work.work); 3413 int r; 3414 3415 r = amdgpu_ib_ring_tests(adev); 3416 if (r) 3417 DRM_ERROR("ib ring test failed (%d).\n", r); 3418 } 3419 3420 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3421 { 3422 struct amdgpu_device *adev = 3423 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3424 3425 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3426 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3427 3428 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 3429 adev->gfx.gfx_off_state = true; 3430 } 3431 3432 /** 3433 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3434 * 3435 * @adev: amdgpu_device pointer 3436 * 3437 * Main suspend function for hardware IPs. The list of all the hardware 3438 * IPs that make up the asic is walked, clockgating is disabled and the 3439 * suspend callbacks are run. suspend puts the hardware and software state 3440 * in each IP into a state suitable for suspend. 3441 * Returns 0 on success, negative error code on failure. 3442 */ 3443 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3444 { 3445 int i, r; 3446 3447 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3448 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3449 3450 /* 3451 * Per PMFW team's suggestion, driver needs to handle gfxoff 3452 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3453 * scenario. Add the missing df cstate disablement here. 
3454 */ 3455 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3456 dev_warn(adev->dev, "Failed to disallow df cstate"); 3457 3458 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3459 if (!adev->ip_blocks[i].status.valid) 3460 continue; 3461 3462 /* displays are handled separately */ 3463 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3464 continue; 3465 3466 /* XXX handle errors */ 3467 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3468 /* XXX handle errors */ 3469 if (r) { 3470 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3471 adev->ip_blocks[i].version->funcs->name, r); 3472 return r; 3473 } 3474 3475 adev->ip_blocks[i].status.hw = false; 3476 } 3477 3478 return 0; 3479 } 3480 3481 /** 3482 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3483 * 3484 * @adev: amdgpu_device pointer 3485 * 3486 * Main suspend function for hardware IPs. The list of all the hardware 3487 * IPs that make up the asic is walked, clockgating is disabled and the 3488 * suspend callbacks are run. suspend puts the hardware and software state 3489 * in each IP into a state suitable for suspend. 3490 * Returns 0 on success, negative error code on failure. 3491 */ 3492 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3493 { 3494 int i, r; 3495 3496 if (adev->in_s0ix) 3497 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3498 3499 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3500 if (!adev->ip_blocks[i].status.valid) 3501 continue; 3502 /* displays are handled in phase1 */ 3503 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3504 continue; 3505 /* PSP lost connection when err_event_athub occurs */ 3506 if (amdgpu_ras_intr_triggered() && 3507 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3508 adev->ip_blocks[i].status.hw = false; 3509 continue; 3510 } 3511 3512 /* skip unnecessary suspend if we do not initialize them yet */ 3513 if (!amdgpu_ip_member_of_hwini( 3514 adev, adev->ip_blocks[i].version->type)) 3515 continue; 3516 3517 /* skip suspend of gfx/mes and psp for S0ix 3518 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3519 * like at runtime. PSP is also part of the always on hardware 3520 * so no need to suspend it. 3521 */ 3522 if (adev->in_s0ix && 3523 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3524 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3525 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3526 continue; 3527 3528 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3529 if (adev->in_s0ix && 3530 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3531 IP_VERSION(5, 0, 0)) && 3532 (adev->ip_blocks[i].version->type == 3533 AMD_IP_BLOCK_TYPE_SDMA)) 3534 continue; 3535 3536 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3537 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3538 * from this location and RLC Autoload automatically also gets loaded 3539 * from here based on PMFW -> PSP message during re-init sequence. 3540 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3541 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3542 */ 3543 if (amdgpu_in_reset(adev) && 3544 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3545 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3546 continue; 3547 3548 /* XXX handle errors */ 3549 r = adev->ip_blocks[i].version->funcs->suspend(adev); 3550 /* XXX handle errors */ 3551 if (r) { 3552 DRM_ERROR("suspend of IP block <%s> failed %d\n", 3553 adev->ip_blocks[i].version->funcs->name, r); 3554 } 3555 adev->ip_blocks[i].status.hw = false; 3556 /* handle putting the SMC in the appropriate state */ 3557 if (!amdgpu_sriov_vf(adev)) { 3558 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3559 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3560 if (r) { 3561 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3562 adev->mp1_state, r); 3563 return r; 3564 } 3565 } 3566 } 3567 } 3568 3569 return 0; 3570 } 3571 3572 /** 3573 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3574 * 3575 * @adev: amdgpu_device pointer 3576 * 3577 * Main suspend function for hardware IPs. The list of all the hardware 3578 * IPs that make up the asic is walked, clockgating is disabled and the 3579 * suspend callbacks are run. suspend puts the hardware and software state 3580 * in each IP into a state suitable for suspend. 3581 * Returns 0 on success, negative error code on failure. 3582 */ 3583 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3584 { 3585 int r; 3586 3587 if (amdgpu_sriov_vf(adev)) { 3588 amdgpu_virt_fini_data_exchange(adev); 3589 amdgpu_virt_request_full_gpu(adev, false); 3590 } 3591 3592 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3593 3594 r = amdgpu_device_ip_suspend_phase1(adev); 3595 if (r) 3596 return r; 3597 r = amdgpu_device_ip_suspend_phase2(adev); 3598 3599 if (amdgpu_sriov_vf(adev)) 3600 amdgpu_virt_release_full_gpu(adev, false); 3601 3602 return r; 3603 } 3604 3605 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3606 { 3607 int i, r; 3608 3609 static enum amd_ip_block_type ip_order[] = { 3610 AMD_IP_BLOCK_TYPE_COMMON, 3611 AMD_IP_BLOCK_TYPE_GMC, 3612 AMD_IP_BLOCK_TYPE_PSP, 3613 AMD_IP_BLOCK_TYPE_IH, 3614 }; 3615 3616 for (i = 0; i < adev->num_ip_blocks; i++) { 3617 int j; 3618 struct amdgpu_ip_block *block; 3619 3620 block = &adev->ip_blocks[i]; 3621 block->status.hw = false; 3622 3623 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3624 3625 if (block->version->type != ip_order[j] || 3626 !block->status.valid) 3627 continue; 3628 3629 r = block->version->funcs->hw_init(adev); 3630 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3631 if (r) 3632 return r; 3633 block->status.hw = true; 3634 } 3635 } 3636 3637 return 0; 3638 } 3639 3640 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3641 { 3642 int i, r; 3643 3644 static enum amd_ip_block_type ip_order[] = { 3645 AMD_IP_BLOCK_TYPE_SMC, 3646 AMD_IP_BLOCK_TYPE_DCE, 3647 AMD_IP_BLOCK_TYPE_GFX, 3648 AMD_IP_BLOCK_TYPE_SDMA, 3649 AMD_IP_BLOCK_TYPE_MES, 3650 AMD_IP_BLOCK_TYPE_UVD, 3651 AMD_IP_BLOCK_TYPE_VCE, 3652 AMD_IP_BLOCK_TYPE_VCN, 3653 AMD_IP_BLOCK_TYPE_JPEG 3654 }; 3655 3656 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3657 int j; 3658 struct amdgpu_ip_block *block; 3659 3660 for (j = 0; j < adev->num_ip_blocks; j++) { 3661 block = &adev->ip_blocks[j]; 3662 3663 if (block->version->type != ip_order[i] || 3664 !block->status.valid || 3665 block->status.hw) 3666 continue; 3667 3668 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) 3669 r = block->version->funcs->resume(adev); 3670 else 
3671 r = block->version->funcs->hw_init(adev); 3672 3673 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3674 if (r) 3675 return r; 3676 block->status.hw = true; 3677 } 3678 } 3679 3680 return 0; 3681 } 3682 3683 /** 3684 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3685 * 3686 * @adev: amdgpu_device pointer 3687 * 3688 * First resume function for hardware IPs. The list of all the hardware 3689 * IPs that make up the asic is walked and the resume callbacks are run for 3690 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3691 * after a suspend and updates the software state as necessary. This 3692 * function is also used for restoring the GPU after a GPU reset. 3693 * Returns 0 on success, negative error code on failure. 3694 */ 3695 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3696 { 3697 int i, r; 3698 3699 for (i = 0; i < adev->num_ip_blocks; i++) { 3700 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3701 continue; 3702 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3703 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3704 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3705 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3706 3707 r = adev->ip_blocks[i].version->funcs->resume(adev); 3708 if (r) { 3709 DRM_ERROR("resume of IP block <%s> failed %d\n", 3710 adev->ip_blocks[i].version->funcs->name, r); 3711 return r; 3712 } 3713 adev->ip_blocks[i].status.hw = true; 3714 } 3715 } 3716 3717 return 0; 3718 } 3719 3720 /** 3721 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3722 * 3723 * @adev: amdgpu_device pointer 3724 * 3725 * Second resume function for hardware IPs. The list of all the hardware 3726 * IPs that make up the asic is walked and the resume callbacks are run for 3727 * all blocks except COMMON, GMC, IH, and PSP. resume puts the hardware into a 3728 * functional state after a suspend and updates the software state as 3729 * necessary. This function is also used for restoring the GPU after a GPU 3730 * reset. 3731 * Returns 0 on success, negative error code on failure. 3732 */ 3733 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3734 { 3735 int i, r; 3736 3737 for (i = 0; i < adev->num_ip_blocks; i++) { 3738 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3739 continue; 3740 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3741 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3742 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3743 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3744 continue; 3745 r = adev->ip_blocks[i].version->funcs->resume(adev); 3746 if (r) { 3747 DRM_ERROR("resume of IP block <%s> failed %d\n", 3748 adev->ip_blocks[i].version->funcs->name, r); 3749 return r; 3750 } 3751 adev->ip_blocks[i].status.hw = true; 3752 } 3753 3754 return 0; 3755 } 3756 3757 /** 3758 * amdgpu_device_ip_resume - run resume for hardware IPs 3759 * 3760 * @adev: amdgpu_device pointer 3761 * 3762 * Main resume function for hardware IPs. The hardware IPs 3763 * are split into two resume functions because they are 3764 * also used in recovering from a GPU reset and some additional 3765 * steps need to be taken between them. In this case (S3/S4) they are 3766 * run sequentially. 3767 * Returns 0 on success, negative error code on failure.
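 *
 * Sequence, as a sketch of the code below: resume phase 1 (COMMON, GMC, IH),
 * firmware loading via amdgpu_device_fw_loading() (PSP/SMU), resume phase 2
 * for the remaining blocks, then re-enable the TTM buffer functions once the
 * buffer funcs ring scheduler is ready.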
3768 */ 3769 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3770 { 3771 int r; 3772 3773 r = amdgpu_device_ip_resume_phase1(adev); 3774 if (r) 3775 return r; 3776 3777 r = amdgpu_device_fw_loading(adev); 3778 if (r) 3779 return r; 3780 3781 r = amdgpu_device_ip_resume_phase2(adev); 3782 3783 if (adev->mman.buffer_funcs_ring->sched.ready) 3784 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3785 3786 return r; 3787 } 3788 3789 /** 3790 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3791 * 3792 * @adev: amdgpu_device pointer 3793 * 3794 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3795 */ 3796 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3797 { 3798 if (amdgpu_sriov_vf(adev)) { 3799 if (adev->is_atom_fw) { 3800 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3801 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3802 } else { 3803 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3804 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3805 } 3806 3807 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3808 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3809 } 3810 } 3811 3812 /** 3813 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3814 * 3815 * @asic_type: AMD asic type 3816 * 3817 * Check if there is DC (new modesetting infrastructre) support for an asic. 3818 * returns true if DC has support, false if not. 3819 */ 3820 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3821 { 3822 switch (asic_type) { 3823 #ifdef CONFIG_DRM_AMDGPU_SI 3824 case CHIP_HAINAN: 3825 #endif 3826 case CHIP_TOPAZ: 3827 /* chips with no display hardware */ 3828 return false; 3829 #if defined(CONFIG_DRM_AMD_DC) 3830 case CHIP_TAHITI: 3831 case CHIP_PITCAIRN: 3832 case CHIP_VERDE: 3833 case CHIP_OLAND: 3834 /* 3835 * We have systems in the wild with these ASICs that require 3836 * LVDS and VGA support which is not supported with DC. 3837 * 3838 * Fallback to the non-DC driver here by default so as not to 3839 * cause regressions. 3840 */ 3841 #if defined(CONFIG_DRM_AMD_DC_SI) 3842 return amdgpu_dc > 0; 3843 #else 3844 return false; 3845 #endif 3846 case CHIP_BONAIRE: 3847 case CHIP_KAVERI: 3848 case CHIP_KABINI: 3849 case CHIP_MULLINS: 3850 /* 3851 * We have systems in the wild with these ASICs that require 3852 * VGA support which is not supported with DC. 3853 * 3854 * Fallback to the non-DC driver here by default so as not to 3855 * cause regressions. 
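		 *
		 * A positive amdgpu_dc here means DC was explicitly requested
		 * through the amdgpu.dc module parameter, which overrides this
		 * default.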
3856 */ 3857 return amdgpu_dc > 0; 3858 default: 3859 return amdgpu_dc != 0; 3860 #else 3861 default: 3862 if (amdgpu_dc > 0) 3863 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 3864 return false; 3865 #endif 3866 } 3867 } 3868 3869 /** 3870 * amdgpu_device_has_dc_support - check if dc is supported 3871 * 3872 * @adev: amdgpu_device pointer 3873 * 3874 * Returns true for supported, false for not supported 3875 */ 3876 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3877 { 3878 if (adev->enable_virtual_display || 3879 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3880 return false; 3881 3882 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3883 } 3884 3885 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3886 { 3887 struct amdgpu_device *adev = 3888 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3889 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3890 3891 /* It's a bug to not have a hive within this function */ 3892 if (WARN_ON(!hive)) 3893 return; 3894 3895 /* 3896 * Use task barrier to synchronize all xgmi reset works across the 3897 * hive. task_barrier_enter and task_barrier_exit will block 3898 * until all the threads running the xgmi reset works reach 3899 * those points. task_barrier_full will do both blocks. 3900 */ 3901 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3902 3903 task_barrier_enter(&hive->tb); 3904 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3905 3906 if (adev->asic_reset_res) 3907 goto fail; 3908 3909 task_barrier_exit(&hive->tb); 3910 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3911 3912 if (adev->asic_reset_res) 3913 goto fail; 3914 3915 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 3916 } else { 3917 3918 task_barrier_full(&hive->tb); 3919 adev->asic_reset_res = amdgpu_asic_reset(adev); 3920 } 3921 3922 fail: 3923 if (adev->asic_reset_res) 3924 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3925 adev->asic_reset_res, adev_to_drm(adev)->unique); 3926 amdgpu_put_xgmi_hive(hive); 3927 } 3928 3929 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3930 { 3931 char *input = amdgpu_lockup_timeout; 3932 char *timeout_setting = NULL; 3933 int index = 0; 3934 long timeout; 3935 int ret = 0; 3936 3937 /* 3938 * By default timeout for non compute jobs is 10000 3939 * and 60000 for compute jobs. 3940 * In SR-IOV or passthrough mode, timeout for compute 3941 * jobs are 60000 by default. 3942 */ 3943 adev->gfx_timeout = msecs_to_jiffies(10000); 3944 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3945 if (amdgpu_sriov_vf(adev)) 3946 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3947 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3948 else 3949 adev->compute_timeout = msecs_to_jiffies(60000); 3950 3951 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3952 while ((timeout_setting = strsep(&input, ",")) && 3953 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3954 ret = kstrtol(timeout_setting, 0, &timeout); 3955 if (ret) 3956 return ret; 3957 3958 if (timeout == 0) { 3959 index++; 3960 continue; 3961 } else if (timeout < 0) { 3962 timeout = MAX_SCHEDULE_TIMEOUT; 3963 dev_warn(adev->dev, "lockup timeout disabled"); 3964 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3965 } else { 3966 timeout = msecs_to_jiffies(timeout); 3967 } 3968 3969 switch (index++) { 3970 case 0: 3971 adev->gfx_timeout = timeout; 3972 break; 3973 case 1: 3974 adev->compute_timeout = timeout; 3975 break; 3976 case 2: 3977 adev->sdma_timeout = timeout; 3978 break; 3979 case 3: 3980 adev->video_timeout = timeout; 3981 break; 3982 default: 3983 break; 3984 } 3985 } 3986 /* 3987 * There is only one value specified and 3988 * it should apply to all non-compute jobs. 3989 */ 3990 if (index == 1) { 3991 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3992 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 3993 adev->compute_timeout = adev->gfx_timeout; 3994 } 3995 } 3996 3997 return ret; 3998 } 3999 4000 /** 4001 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 4002 * 4003 * @adev: amdgpu_device pointer 4004 * 4005 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 4006 */ 4007 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 4008 { 4009 struct iommu_domain *domain; 4010 4011 domain = iommu_get_domain_for_dev(adev->dev); 4012 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 4013 adev->ram_is_direct_mapped = true; 4014 } 4015 4016 #if defined(CONFIG_HSA_AMD_P2P) 4017 /** 4018 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled. 4019 * 4020 * @adev: amdgpu_device pointer 4021 * 4022 * return if IOMMU remapping bar address 4023 */ 4024 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev) 4025 { 4026 struct iommu_domain *domain; 4027 4028 domain = iommu_get_domain_for_dev(adev->dev); 4029 if (domain && (domain->type == IOMMU_DOMAIN_DMA || 4030 domain->type == IOMMU_DOMAIN_DMA_FQ)) 4031 return true; 4032 4033 return false; 4034 } 4035 #endif 4036 4037 static const struct attribute *amdgpu_dev_attributes[] = { 4038 &dev_attr_pcie_replay_count.attr, 4039 NULL 4040 }; 4041 4042 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 4043 { 4044 if (amdgpu_mcbp == 1) 4045 adev->gfx.mcbp = true; 4046 else if (amdgpu_mcbp == 0) 4047 adev->gfx.mcbp = false; 4048 4049 if (amdgpu_sriov_vf(adev)) 4050 adev->gfx.mcbp = true; 4051 4052 if (adev->gfx.mcbp) 4053 DRM_INFO("MCBP is enabled\n"); 4054 } 4055 4056 /** 4057 * amdgpu_device_init - initialize the driver 4058 * 4059 * @adev: amdgpu_device pointer 4060 * @flags: driver flags 4061 * 4062 * Initializes the driver info and hw (all asics). 4063 * Returns 0 for success or an error on failure. 4064 * Called at driver startup. 
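 *
 * Roughly: default all register accessors to the "invalid" helpers, set up
 * the locks and work items, map the MMIO BAR, run IP early init, reset or
 * post the ASIC if required, initialize the IP blocks and fence driver, and
 * finally register the various sysfs interfaces.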
4065 */ 4066 int amdgpu_device_init(struct amdgpu_device *adev, 4067 uint32_t flags) 4068 { 4069 struct drm_device *ddev = adev_to_drm(adev); 4070 struct pci_dev *pdev = adev->pdev; 4071 int r, i; 4072 bool px = false; 4073 u32 max_MBps; 4074 int tmp; 4075 4076 adev->shutdown = false; 4077 adev->flags = flags; 4078 4079 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 4080 adev->asic_type = amdgpu_force_asic_type; 4081 else 4082 adev->asic_type = flags & AMD_ASIC_MASK; 4083 4084 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 4085 if (amdgpu_emu_mode == 1) 4086 adev->usec_timeout *= 10; 4087 adev->gmc.gart_size = 512 * 1024 * 1024; 4088 adev->accel_working = false; 4089 adev->num_rings = 0; 4090 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 4091 adev->mman.buffer_funcs = NULL; 4092 adev->mman.buffer_funcs_ring = NULL; 4093 adev->vm_manager.vm_pte_funcs = NULL; 4094 adev->vm_manager.vm_pte_num_scheds = 0; 4095 adev->gmc.gmc_funcs = NULL; 4096 adev->harvest_ip_mask = 0x0; 4097 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 4098 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 4099 4100 adev->smc_rreg = &amdgpu_invalid_rreg; 4101 adev->smc_wreg = &amdgpu_invalid_wreg; 4102 adev->pcie_rreg = &amdgpu_invalid_rreg; 4103 adev->pcie_wreg = &amdgpu_invalid_wreg; 4104 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 4105 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 4106 adev->pciep_rreg = &amdgpu_invalid_rreg; 4107 adev->pciep_wreg = &amdgpu_invalid_wreg; 4108 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 4109 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 4110 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 4111 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 4112 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 4113 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 4114 adev->didt_rreg = &amdgpu_invalid_rreg; 4115 adev->didt_wreg = &amdgpu_invalid_wreg; 4116 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 4117 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 4118 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 4119 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 4120 4121 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 4122 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 4123 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 4124 4125 /* mutex initialization are all done here so we 4126 * can recall function without having locking issues 4127 */ 4128 mutex_init(&adev->firmware.mutex); 4129 mutex_init(&adev->pm.mutex); 4130 mutex_init(&adev->gfx.gpu_clock_mutex); 4131 mutex_init(&adev->srbm_mutex); 4132 mutex_init(&adev->gfx.pipe_reserve_mutex); 4133 mutex_init(&adev->gfx.gfx_off_mutex); 4134 mutex_init(&adev->gfx.partition_mutex); 4135 mutex_init(&adev->grbm_idx_mutex); 4136 mutex_init(&adev->mn_lock); 4137 mutex_init(&adev->virt.vf_errors.lock); 4138 mutex_init(&adev->virt.rlcg_reg_lock); 4139 hash_init(adev->mn_hash); 4140 mutex_init(&adev->psp.mutex); 4141 mutex_init(&adev->notifier_lock); 4142 mutex_init(&adev->pm.stable_pstate_ctx_lock); 4143 mutex_init(&adev->benchmark_mutex); 4144 mutex_init(&adev->gfx.reset_sem_mutex); 4145 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */ 4146 mutex_init(&adev->enforce_isolation_mutex); 4147 mutex_init(&adev->gfx.kfd_sch_mutex); 4148 4149 amdgpu_device_init_apu_flags(adev); 4150 4151 r = amdgpu_device_check_arguments(adev); 4152 if (r) 4153 return r; 4154 4155 
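	/*
	 * As with the mutexes above, all spinlocks are initialized up front
	 * so later init code can take them without ordering concerns.
	 */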
	spin_lock_init(&adev->mmio_idx_lock);
	spin_lock_init(&adev->smc_idx_lock);
	spin_lock_init(&adev->pcie_idx_lock);
	spin_lock_init(&adev->uvd_ctx_idx_lock);
	spin_lock_init(&adev->didt_idx_lock);
	spin_lock_init(&adev->gc_cac_idx_lock);
	spin_lock_init(&adev->se_cac_idx_lock);
	spin_lock_init(&adev->audio_endpt_idx_lock);
	spin_lock_init(&adev->mm_stats.lock);
	spin_lock_init(&adev->wb.lock);

	INIT_LIST_HEAD(&adev->reset_list);

	INIT_LIST_HEAD(&adev->ras_list);

	INIT_LIST_HEAD(&adev->pm.od_kobj_list);

	INIT_DELAYED_WORK(&adev->delayed_init_work,
			  amdgpu_device_delayed_init_work_handler);
	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
			  amdgpu_device_delay_enable_gfx_off);
	/*
	 * Initialize the enforce_isolation work structures for each XCP
	 * partition. This work handler is responsible for enforcing shader
	 * isolation on AMD GPUs. It counts the number of emitted fences for
	 * each GFX and compute ring. If there are any fences, it schedules
	 * the `enforce_isolation_work` to be run after a delay. If there are
	 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the
	 * runqueue.
	 */
	for (i = 0; i < MAX_XCP; i++) {
		INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work,
				  amdgpu_gfx_enforce_isolation_handler);
		adev->gfx.enforce_isolation[i].adev = adev;
		adev->gfx.enforce_isolation[i].xcp_id = i;
	}

	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);

	adev->gfx.gfx_off_req_count = 1;
	adev->gfx.gfx_off_residency = 0;
	adev->gfx.gfx_off_entrycount = 0;
	adev->pm.ac_power = power_supply_is_system_supplied() > 0;

	atomic_set(&adev->throttling_logging_enabled, 1);
	/*
	 * If throttling continues, logging will be performed every minute
	 * to avoid log flooding. "-1" is subtracted since the thermal
	 * throttling interrupt comes every second. Thus, the total logging
	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
	 * for the throttling interrupt) = 60 seconds.
	 */
	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);

	/* Registers mapping */
	/* TODO: block userspace mapping of io register */
	if (adev->asic_type >= CHIP_BONAIRE) {
		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
	} else {
		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
	}

	for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
		atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);

	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
	if (!adev->rmmio)
		return -ENOMEM;

	DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
	DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);

	/*
	 * The reset domain needs to be present early, before any XGMI hive is
	 * discovered and initialized, so that the reset semaphore and the
	 * in-GPU-reset state can be used early during init and before any
	 * call to RREG32.
	 */
	adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
	if (!adev->reset_domain)
		return -ENOMEM;

	/* detect hw virtualization here */
	amdgpu_detect_virtualization(adev);

	amdgpu_device_get_pcie_info(adev);

	r = amdgpu_device_get_job_timeout_settings(adev);
	if (r) {
		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
		return r;
	}

	amdgpu_device_set_mcbp(adev);

	/*
	 * By default, use the default init level where all blocks are expected
	 * to be initialized. At present a 'swinit' of blocks is required to be
	 * completed before the need for a different level is detected.
	 */
	amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT);
	/* early init functions */
	r = amdgpu_device_ip_early_init(adev);
	if (r)
		return r;

	/* Get rid of things like offb */
	r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver);
	if (r)
		return r;

	/* Enable TMZ based on IP_VERSION */
	amdgpu_gmc_tmz_set(adev);

	if (amdgpu_sriov_vf(adev) &&
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
		/* VF MMIO access (except the mailbox range) from the CPU
		 * will be blocked during SR-IOV runtime
		 */
		adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;

	amdgpu_gmc_noretry_set(adev);
	/* Need to get xgmi info early to decide the reset behavior */
	if (adev->gmc.xgmi.supported) {
		r = adev->gfxhub.funcs->get_xgmi_info(adev);
		if (r)
			return r;
	}

	/* enable PCIE atomic ops */
	if (amdgpu_sriov_vf(adev)) {
		if (adev->virt.fw_reserve.p_pf2vf)
			adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
						      adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
				(PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
		/* APUs with gfx9 onwards don't rely on PCIe atomics; their
		 * internal path natively supports atomics, so set
		 * have_atomics_support to true.
		 */
	} else if ((adev->flags & AMD_IS_APU) &&
		   (amdgpu_ip_version(adev, GC_HWIP, 0) >
		    IP_VERSION(9, 0, 0))) {
		adev->have_atomics_support = true;
	} else {
		adev->have_atomics_support =
			!pci_enable_atomic_ops_to_root(adev->pdev,
						       PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
						       PCI_EXP_DEVCAP2_ATOMIC_COMP64);
	}

	if (!adev->have_atomics_support)
		dev_info(adev->dev, "PCIE atomic ops is not supported\n");

	/* doorbell bar mapping and doorbell index init */
	amdgpu_doorbell_init(adev);

	if (amdgpu_emu_mode == 1) {
		/* post the asic on emulation mode */
		emu_soc_asic_init(adev);
		goto fence_driver_init;
	}

	amdgpu_reset_init(adev);

	/* detect if we are running with an SR-IOV vBIOS */
	if (adev->bios)
		amdgpu_device_detect_sriov_bios(adev);

	/* check if we need to reset the asic
	 * E.g., driver was not cleanly unloaded previously, etc.
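	 *
	 * For an XGMI hive this only switches to the minimal init level and
	 * defers the reset until the whole hive can be reset together; MP1
	 * 13.0.10 parts without display hardware use a PSP reset, and all
	 * other cases fall back to the default ASIC reset method regardless
	 * of the reset_method module parameter.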
4326 */ 4327 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4328 if (adev->gmc.xgmi.num_physical_nodes) { 4329 dev_info(adev->dev, "Pending hive reset.\n"); 4330 amdgpu_set_init_level(adev, 4331 AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 4332 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && 4333 !amdgpu_device_has_display_hardware(adev)) { 4334 r = psp_gpu_reset(adev); 4335 } else { 4336 tmp = amdgpu_reset_method; 4337 /* It should do a default reset when loading or reloading the driver, 4338 * regardless of the module parameter reset_method. 4339 */ 4340 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4341 r = amdgpu_asic_reset(adev); 4342 amdgpu_reset_method = tmp; 4343 } 4344 4345 if (r) { 4346 dev_err(adev->dev, "asic reset on init failed\n"); 4347 goto failed; 4348 } 4349 } 4350 4351 /* Post card if necessary */ 4352 if (amdgpu_device_need_post(adev)) { 4353 if (!adev->bios) { 4354 dev_err(adev->dev, "no vBIOS found\n"); 4355 r = -EINVAL; 4356 goto failed; 4357 } 4358 DRM_INFO("GPU posting now...\n"); 4359 r = amdgpu_device_asic_init(adev); 4360 if (r) { 4361 dev_err(adev->dev, "gpu post error!\n"); 4362 goto failed; 4363 } 4364 } 4365 4366 if (adev->bios) { 4367 if (adev->is_atom_fw) { 4368 /* Initialize clocks */ 4369 r = amdgpu_atomfirmware_get_clock_info(adev); 4370 if (r) { 4371 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4372 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4373 goto failed; 4374 } 4375 } else { 4376 /* Initialize clocks */ 4377 r = amdgpu_atombios_get_clock_info(adev); 4378 if (r) { 4379 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4380 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4381 goto failed; 4382 } 4383 /* init i2c buses */ 4384 if (!amdgpu_device_has_dc_support(adev)) 4385 amdgpu_atombios_i2c_init(adev); 4386 } 4387 } 4388 4389 fence_driver_init: 4390 /* Fence driver */ 4391 r = amdgpu_fence_driver_sw_init(adev); 4392 if (r) { 4393 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4394 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4395 goto failed; 4396 } 4397 4398 /* init the mode config */ 4399 drm_mode_config_init(adev_to_drm(adev)); 4400 4401 r = amdgpu_device_ip_init(adev); 4402 if (r) { 4403 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4404 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4405 goto release_ras_con; 4406 } 4407 4408 amdgpu_fence_driver_hw_init(adev); 4409 4410 dev_info(adev->dev, 4411 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4412 adev->gfx.config.max_shader_engines, 4413 adev->gfx.config.max_sh_per_se, 4414 adev->gfx.config.max_cu_per_sh, 4415 adev->gfx.cu_info.number); 4416 4417 adev->accel_working = true; 4418 4419 amdgpu_vm_check_compute_bug(adev); 4420 4421 /* Initialize the buffer migration limit. */ 4422 if (amdgpu_moverate >= 0) 4423 max_MBps = amdgpu_moverate; 4424 else 4425 max_MBps = 8; /* Allow 8 MB/s. */ 4426 /* Get a log2 for easy divisions. */ 4427 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4428 4429 /* 4430 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4431 * Otherwise the mgpu fan boost feature will be skipped due to the 4432 * gpu instance is counted less. 4433 */ 4434 amdgpu_register_gpu_instance(adev); 4435 4436 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4437 * explicit gating rather than handling it automatically. 
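	 *
	 * Late init (and RAS resume) is skipped here when only the minimal
	 * XGMI init level is in effect, since the hive still has to go
	 * through a reset before the device is fully usable.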
4438 */ 4439 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 4440 r = amdgpu_device_ip_late_init(adev); 4441 if (r) { 4442 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4443 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4444 goto release_ras_con; 4445 } 4446 /* must succeed. */ 4447 amdgpu_ras_resume(adev); 4448 queue_delayed_work(system_wq, &adev->delayed_init_work, 4449 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4450 } 4451 4452 if (amdgpu_sriov_vf(adev)) { 4453 amdgpu_virt_release_full_gpu(adev, true); 4454 flush_delayed_work(&adev->delayed_init_work); 4455 } 4456 4457 /* 4458 * Place those sysfs registering after `late_init`. As some of those 4459 * operations performed in `late_init` might affect the sysfs 4460 * interfaces creating. 4461 */ 4462 r = amdgpu_atombios_sysfs_init(adev); 4463 if (r) 4464 drm_err(&adev->ddev, 4465 "registering atombios sysfs failed (%d).\n", r); 4466 4467 r = amdgpu_pm_sysfs_init(adev); 4468 if (r) 4469 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 4470 4471 r = amdgpu_ucode_sysfs_init(adev); 4472 if (r) { 4473 adev->ucode_sysfs_en = false; 4474 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 4475 } else 4476 adev->ucode_sysfs_en = true; 4477 4478 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 4479 if (r) 4480 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4481 4482 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4483 if (r) 4484 dev_err(adev->dev, 4485 "Could not create amdgpu board attributes\n"); 4486 4487 amdgpu_fru_sysfs_init(adev); 4488 amdgpu_reg_state_sysfs_init(adev); 4489 amdgpu_xcp_cfg_sysfs_init(adev); 4490 4491 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4492 r = amdgpu_pmu_init(adev); 4493 if (r) 4494 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4495 4496 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4497 if (amdgpu_device_cache_pci_state(adev->pdev)) 4498 pci_restore_state(pdev); 4499 4500 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4501 /* this will fail for cards that aren't VGA class devices, just 4502 * ignore it 4503 */ 4504 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4505 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4506 4507 px = amdgpu_device_supports_px(ddev); 4508 4509 if (px || (!dev_is_removable(&adev->pdev->dev) && 4510 apple_gmux_detect(NULL, NULL))) 4511 vga_switcheroo_register_client(adev->pdev, 4512 &amdgpu_switcheroo_ops, px); 4513 4514 if (px) 4515 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4516 4517 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 4518 amdgpu_xgmi_reset_on_init(adev); 4519 4520 amdgpu_device_check_iommu_direct_map(adev); 4521 4522 return 0; 4523 4524 release_ras_con: 4525 if (amdgpu_sriov_vf(adev)) 4526 amdgpu_virt_release_full_gpu(adev, true); 4527 4528 /* failed in exclusive mode due to timeout */ 4529 if (amdgpu_sriov_vf(adev) && 4530 !amdgpu_sriov_runtime(adev) && 4531 amdgpu_virt_mmio_blocked(adev) && 4532 !amdgpu_virt_wait_reset(adev)) { 4533 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4534 /* Don't send request since VF is inactive. 
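		 *
		 * Drop the runtime cap and the virt ops so nothing else tries
		 * to talk to the host, and fail init with -EAGAIN.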
*/ 4535 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4536 adev->virt.ops = NULL; 4537 r = -EAGAIN; 4538 } 4539 amdgpu_release_ras_context(adev); 4540 4541 failed: 4542 amdgpu_vf_error_trans_all(adev); 4543 4544 return r; 4545 } 4546 4547 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4548 { 4549 4550 /* Clear all CPU mappings pointing to this device */ 4551 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4552 4553 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4554 amdgpu_doorbell_fini(adev); 4555 4556 iounmap(adev->rmmio); 4557 adev->rmmio = NULL; 4558 if (adev->mman.aper_base_kaddr) 4559 iounmap(adev->mman.aper_base_kaddr); 4560 adev->mman.aper_base_kaddr = NULL; 4561 4562 /* Memory manager related */ 4563 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4564 arch_phys_wc_del(adev->gmc.vram_mtrr); 4565 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4566 } 4567 } 4568 4569 /** 4570 * amdgpu_device_fini_hw - tear down the driver 4571 * 4572 * @adev: amdgpu_device pointer 4573 * 4574 * Tear down the driver info (all asics). 4575 * Called at driver shutdown. 4576 */ 4577 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4578 { 4579 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4580 flush_delayed_work(&adev->delayed_init_work); 4581 4582 if (adev->mman.initialized) 4583 drain_workqueue(adev->mman.bdev.wq); 4584 adev->shutdown = true; 4585 4586 /* make sure IB test finished before entering exclusive mode 4587 * to avoid preemption on IB test 4588 */ 4589 if (amdgpu_sriov_vf(adev)) { 4590 amdgpu_virt_request_full_gpu(adev, false); 4591 amdgpu_virt_fini_data_exchange(adev); 4592 } 4593 4594 /* disable all interrupts */ 4595 amdgpu_irq_disable_all(adev); 4596 if (adev->mode_info.mode_config_initialized) { 4597 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4598 drm_helper_force_disable_all(adev_to_drm(adev)); 4599 else 4600 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4601 } 4602 amdgpu_fence_driver_hw_fini(adev); 4603 4604 if (adev->pm.sysfs_initialized) 4605 amdgpu_pm_sysfs_fini(adev); 4606 if (adev->ucode_sysfs_en) 4607 amdgpu_ucode_sysfs_fini(adev); 4608 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4609 amdgpu_fru_sysfs_fini(adev); 4610 4611 amdgpu_reg_state_sysfs_fini(adev); 4612 amdgpu_xcp_cfg_sysfs_fini(adev); 4613 4614 /* disable ras feature must before hw fini */ 4615 amdgpu_ras_pre_fini(adev); 4616 4617 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4618 4619 amdgpu_device_ip_fini_early(adev); 4620 4621 amdgpu_irq_fini_hw(adev); 4622 4623 if (adev->mman.initialized) 4624 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4625 4626 amdgpu_gart_dummy_page_fini(adev); 4627 4628 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4629 amdgpu_device_unmap_mmio(adev); 4630 4631 } 4632 4633 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4634 { 4635 int idx; 4636 bool px; 4637 4638 amdgpu_fence_driver_sw_fini(adev); 4639 amdgpu_device_ip_fini(adev); 4640 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4641 adev->accel_working = false; 4642 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4643 4644 amdgpu_reset_fini(adev); 4645 4646 /* free i2c buses */ 4647 if (!amdgpu_device_has_dc_support(adev)) 4648 amdgpu_i2c_fini(adev); 4649 4650 if (amdgpu_emu_mode != 1) 4651 amdgpu_atombios_fini(adev); 4652 4653 kfree(adev->bios); 4654 adev->bios = NULL; 4655 4656 kfree(adev->fru_info); 4657 adev->fru_info = NULL; 4658 4659 px = 
amdgpu_device_supports_px(adev_to_drm(adev)); 4660 4661 if (px || (!dev_is_removable(&adev->pdev->dev) && 4662 apple_gmux_detect(NULL, NULL))) 4663 vga_switcheroo_unregister_client(adev->pdev); 4664 4665 if (px) 4666 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4667 4668 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4669 vga_client_unregister(adev->pdev); 4670 4671 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4672 4673 iounmap(adev->rmmio); 4674 adev->rmmio = NULL; 4675 amdgpu_doorbell_fini(adev); 4676 drm_dev_exit(idx); 4677 } 4678 4679 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4680 amdgpu_pmu_fini(adev); 4681 if (adev->mman.discovery_bin) 4682 amdgpu_discovery_fini(adev); 4683 4684 amdgpu_reset_put_reset_domain(adev->reset_domain); 4685 adev->reset_domain = NULL; 4686 4687 kfree(adev->pci_state); 4688 4689 } 4690 4691 /** 4692 * amdgpu_device_evict_resources - evict device resources 4693 * @adev: amdgpu device object 4694 * 4695 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4696 * of the vram memory type. Mainly used for evicting device resources 4697 * at suspend time. 4698 * 4699 */ 4700 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4701 { 4702 int ret; 4703 4704 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4705 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4706 return 0; 4707 4708 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4709 if (ret) 4710 DRM_WARN("evicting device resources failed\n"); 4711 return ret; 4712 } 4713 4714 /* 4715 * Suspend & resume. 4716 */ 4717 /** 4718 * amdgpu_device_prepare - prepare for device suspend 4719 * 4720 * @dev: drm dev pointer 4721 * 4722 * Prepare to put the hw in the suspend state (all asics). 4723 * Returns 0 for success or an error on failure. 4724 * Called at driver suspend. 4725 */ 4726 int amdgpu_device_prepare(struct drm_device *dev) 4727 { 4728 struct amdgpu_device *adev = drm_to_adev(dev); 4729 int i, r; 4730 4731 amdgpu_choose_low_power_state(adev); 4732 4733 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4734 return 0; 4735 4736 /* Evict the majority of BOs before starting suspend sequence */ 4737 r = amdgpu_device_evict_resources(adev); 4738 if (r) 4739 goto unprepare; 4740 4741 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 4742 4743 for (i = 0; i < adev->num_ip_blocks; i++) { 4744 if (!adev->ip_blocks[i].status.valid) 4745 continue; 4746 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 4747 continue; 4748 r = adev->ip_blocks[i].version->funcs->prepare_suspend((void *)adev); 4749 if (r) 4750 goto unprepare; 4751 } 4752 4753 return 0; 4754 4755 unprepare: 4756 adev->in_s0ix = adev->in_s3 = false; 4757 4758 return r; 4759 } 4760 4761 /** 4762 * amdgpu_device_suspend - initiate device suspend 4763 * 4764 * @dev: drm dev pointer 4765 * @fbcon : notify the fbdev of suspend 4766 * 4767 * Puts the hw in the suspend state (all asics). 4768 * Returns 0 for success or an error on failure. 4769 * Called at driver suspend. 
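 *
 * Roughly: suspend phase1, then KFD suspend (skipped for S0ix), resource
 * eviction, fence driver HW teardown and finally suspend phase2 for the
 * remaining IP blocks.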
4770 */ 4771 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4772 { 4773 struct amdgpu_device *adev = drm_to_adev(dev); 4774 int r = 0; 4775 4776 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4777 return 0; 4778 4779 adev->in_suspend = true; 4780 4781 if (amdgpu_sriov_vf(adev)) { 4782 amdgpu_virt_fini_data_exchange(adev); 4783 r = amdgpu_virt_request_full_gpu(adev, false); 4784 if (r) 4785 return r; 4786 } 4787 4788 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4789 DRM_WARN("smart shift update failed\n"); 4790 4791 if (fbcon) 4792 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4793 4794 cancel_delayed_work_sync(&adev->delayed_init_work); 4795 4796 amdgpu_ras_suspend(adev); 4797 4798 amdgpu_device_ip_suspend_phase1(adev); 4799 4800 if (!adev->in_s0ix) 4801 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4802 4803 r = amdgpu_device_evict_resources(adev); 4804 if (r) 4805 return r; 4806 4807 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4808 4809 amdgpu_fence_driver_hw_fini(adev); 4810 4811 amdgpu_device_ip_suspend_phase2(adev); 4812 4813 if (amdgpu_sriov_vf(adev)) 4814 amdgpu_virt_release_full_gpu(adev, false); 4815 4816 r = amdgpu_dpm_notify_rlc_state(adev, false); 4817 if (r) 4818 return r; 4819 4820 return 0; 4821 } 4822 4823 /** 4824 * amdgpu_device_resume - initiate device resume 4825 * 4826 * @dev: drm dev pointer 4827 * @fbcon : notify the fbdev of resume 4828 * 4829 * Bring the hw back to operating state (all asics). 4830 * Returns 0 for success or an error on failure. 4831 * Called at driver resume. 4832 */ 4833 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4834 { 4835 struct amdgpu_device *adev = drm_to_adev(dev); 4836 int r = 0; 4837 4838 if (amdgpu_sriov_vf(adev)) { 4839 r = amdgpu_virt_request_full_gpu(adev, true); 4840 if (r) 4841 return r; 4842 } 4843 4844 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4845 return 0; 4846 4847 if (adev->in_s0ix) 4848 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4849 4850 /* post card */ 4851 if (amdgpu_device_need_post(adev)) { 4852 r = amdgpu_device_asic_init(adev); 4853 if (r) 4854 dev_err(adev->dev, "amdgpu asic init failed\n"); 4855 } 4856 4857 r = amdgpu_device_ip_resume(adev); 4858 4859 if (r) { 4860 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4861 goto exit; 4862 } 4863 amdgpu_fence_driver_hw_init(adev); 4864 4865 if (!adev->in_s0ix) { 4866 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4867 if (r) 4868 goto exit; 4869 } 4870 4871 r = amdgpu_device_ip_late_init(adev); 4872 if (r) 4873 goto exit; 4874 4875 queue_delayed_work(system_wq, &adev->delayed_init_work, 4876 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4877 exit: 4878 if (amdgpu_sriov_vf(adev)) { 4879 amdgpu_virt_init_data_exchange(adev); 4880 amdgpu_virt_release_full_gpu(adev, true); 4881 } 4882 4883 if (r) 4884 return r; 4885 4886 /* Make sure IB tests flushed */ 4887 flush_delayed_work(&adev->delayed_init_work); 4888 4889 if (fbcon) 4890 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4891 4892 amdgpu_ras_resume(adev); 4893 4894 if (adev->mode_info.num_crtc) { 4895 /* 4896 * Most of the connector probing functions try to acquire runtime pm 4897 * refs to ensure that the GPU is powered on when connector polling is 4898 * performed. Since we're calling this from a runtime PM callback, 4899 * trying to acquire rpm refs will cause us to deadlock. 
4900 * 4901 * Since we're guaranteed to be holding the rpm lock, it's safe to 4902 * temporarily disable the rpm helpers so this doesn't deadlock us. 4903 */ 4904 #ifdef CONFIG_PM 4905 dev->dev->power.disable_depth++; 4906 #endif 4907 if (!adev->dc_enabled) 4908 drm_helper_hpd_irq_event(dev); 4909 else 4910 drm_kms_helper_hotplug_event(dev); 4911 #ifdef CONFIG_PM 4912 dev->dev->power.disable_depth--; 4913 #endif 4914 } 4915 adev->in_suspend = false; 4916 4917 if (adev->enable_mes) 4918 amdgpu_mes_self_test(adev); 4919 4920 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4921 DRM_WARN("smart shift update failed\n"); 4922 4923 return 0; 4924 } 4925 4926 /** 4927 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4928 * 4929 * @adev: amdgpu_device pointer 4930 * 4931 * The list of all the hardware IPs that make up the asic is walked and 4932 * the check_soft_reset callbacks are run. check_soft_reset determines 4933 * if the asic is still hung or not. 4934 * Returns true if any of the IPs are still in a hung state, false if not. 4935 */ 4936 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4937 { 4938 int i; 4939 bool asic_hang = false; 4940 4941 if (amdgpu_sriov_vf(adev)) 4942 return true; 4943 4944 if (amdgpu_asic_need_full_reset(adev)) 4945 return true; 4946 4947 for (i = 0; i < adev->num_ip_blocks; i++) { 4948 if (!adev->ip_blocks[i].status.valid) 4949 continue; 4950 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4951 adev->ip_blocks[i].status.hang = 4952 adev->ip_blocks[i].version->funcs->check_soft_reset(adev); 4953 if (adev->ip_blocks[i].status.hang) { 4954 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4955 asic_hang = true; 4956 } 4957 } 4958 return asic_hang; 4959 } 4960 4961 /** 4962 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4963 * 4964 * @adev: amdgpu_device pointer 4965 * 4966 * The list of all the hardware IPs that make up the asic is walked and the 4967 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4968 * handles any IP specific hardware or software state changes that are 4969 * necessary for a soft reset to succeed. 4970 * Returns 0 on success, negative error code on failure. 4971 */ 4972 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 4973 { 4974 int i, r = 0; 4975 4976 for (i = 0; i < adev->num_ip_blocks; i++) { 4977 if (!adev->ip_blocks[i].status.valid) 4978 continue; 4979 if (adev->ip_blocks[i].status.hang && 4980 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 4981 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev); 4982 if (r) 4983 return r; 4984 } 4985 } 4986 4987 return 0; 4988 } 4989 4990 /** 4991 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 4992 * 4993 * @adev: amdgpu_device pointer 4994 * 4995 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 4996 * reset is necessary to recover. 4997 * Returns true if a full asic reset is required, false if not. 
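 *
 * In practice a hang in the GMC, SMC, ACP, DCE or PSP block always
 * escalates to a full reset.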
4998 */ 4999 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 5000 { 5001 int i; 5002 5003 if (amdgpu_asic_need_full_reset(adev)) 5004 return true; 5005 5006 for (i = 0; i < adev->num_ip_blocks; i++) { 5007 if (!adev->ip_blocks[i].status.valid) 5008 continue; 5009 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 5010 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 5011 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 5012 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 5013 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 5014 if (adev->ip_blocks[i].status.hang) { 5015 dev_info(adev->dev, "Some block need full reset!\n"); 5016 return true; 5017 } 5018 } 5019 } 5020 return false; 5021 } 5022 5023 /** 5024 * amdgpu_device_ip_soft_reset - do a soft reset 5025 * 5026 * @adev: amdgpu_device pointer 5027 * 5028 * The list of all the hardware IPs that make up the asic is walked and the 5029 * soft_reset callbacks are run if the block is hung. soft_reset handles any 5030 * IP specific hardware or software state changes that are necessary to soft 5031 * reset the IP. 5032 * Returns 0 on success, negative error code on failure. 5033 */ 5034 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 5035 { 5036 int i, r = 0; 5037 5038 for (i = 0; i < adev->num_ip_blocks; i++) { 5039 if (!adev->ip_blocks[i].status.valid) 5040 continue; 5041 if (adev->ip_blocks[i].status.hang && 5042 adev->ip_blocks[i].version->funcs->soft_reset) { 5043 r = adev->ip_blocks[i].version->funcs->soft_reset(adev); 5044 if (r) 5045 return r; 5046 } 5047 } 5048 5049 return 0; 5050 } 5051 5052 /** 5053 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 5054 * 5055 * @adev: amdgpu_device pointer 5056 * 5057 * The list of all the hardware IPs that make up the asic is walked and the 5058 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 5059 * handles any IP specific hardware or software state changes that are 5060 * necessary after the IP has been soft reset. 5061 * Returns 0 on success, negative error code on failure. 
5062 */ 5063 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 5064 { 5065 int i, r = 0; 5066 5067 for (i = 0; i < adev->num_ip_blocks; i++) { 5068 if (!adev->ip_blocks[i].status.valid) 5069 continue; 5070 if (adev->ip_blocks[i].status.hang && 5071 adev->ip_blocks[i].version->funcs->post_soft_reset) 5072 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev); 5073 if (r) 5074 return r; 5075 } 5076 5077 return 0; 5078 } 5079 5080 /** 5081 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 5082 * 5083 * @adev: amdgpu_device pointer 5084 * @reset_context: amdgpu reset context pointer 5085 * 5086 * do VF FLR and reinitialize Asic 5087 * return 0 means succeeded otherwise failed 5088 */ 5089 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 5090 struct amdgpu_reset_context *reset_context) 5091 { 5092 int r; 5093 struct amdgpu_hive_info *hive = NULL; 5094 5095 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { 5096 if (!amdgpu_ras_get_fed_status(adev)) 5097 amdgpu_virt_ready_to_reset(adev); 5098 amdgpu_virt_wait_reset(adev); 5099 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5100 r = amdgpu_virt_request_full_gpu(adev, true); 5101 } else { 5102 r = amdgpu_virt_reset_gpu(adev); 5103 } 5104 if (r) 5105 return r; 5106 5107 amdgpu_ras_set_fed(adev, false); 5108 amdgpu_irq_gpu_reset_resume_helper(adev); 5109 5110 /* some sw clean up VF needs to do before recover */ 5111 amdgpu_virt_post_reset(adev); 5112 5113 /* Resume IP prior to SMC */ 5114 r = amdgpu_device_ip_reinit_early_sriov(adev); 5115 if (r) 5116 return r; 5117 5118 amdgpu_virt_init_data_exchange(adev); 5119 5120 r = amdgpu_device_fw_loading(adev); 5121 if (r) 5122 return r; 5123 5124 /* now we are okay to resume SMC/CP/SDMA */ 5125 r = amdgpu_device_ip_reinit_late_sriov(adev); 5126 if (r) 5127 return r; 5128 5129 hive = amdgpu_get_xgmi_hive(adev); 5130 /* Update PSP FW topology after reset */ 5131 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5132 r = amdgpu_xgmi_update_topology(hive, adev); 5133 if (hive) 5134 amdgpu_put_xgmi_hive(hive); 5135 if (r) 5136 return r; 5137 5138 r = amdgpu_ib_ring_tests(adev); 5139 if (r) 5140 return r; 5141 5142 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) 5143 amdgpu_inc_vram_lost(adev); 5144 5145 /* need to be called during full access so we can't do it later like 5146 * bare-metal does. 
5147 */ 5148 amdgpu_amdkfd_post_reset(adev); 5149 amdgpu_virt_release_full_gpu(adev, true); 5150 5151 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5152 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) || 5153 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 5154 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 5155 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5156 amdgpu_ras_resume(adev); 5157 return 0; 5158 } 5159 5160 /** 5161 * amdgpu_device_has_job_running - check if there is any job in mirror list 5162 * 5163 * @adev: amdgpu_device pointer 5164 * 5165 * check if there is any job in mirror list 5166 */ 5167 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5168 { 5169 int i; 5170 struct drm_sched_job *job; 5171 5172 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5173 struct amdgpu_ring *ring = adev->rings[i]; 5174 5175 if (!amdgpu_ring_sched_ready(ring)) 5176 continue; 5177 5178 spin_lock(&ring->sched.job_list_lock); 5179 job = list_first_entry_or_null(&ring->sched.pending_list, 5180 struct drm_sched_job, list); 5181 spin_unlock(&ring->sched.job_list_lock); 5182 if (job) 5183 return true; 5184 } 5185 return false; 5186 } 5187 5188 /** 5189 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5190 * 5191 * @adev: amdgpu_device pointer 5192 * 5193 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5194 * a hung GPU. 5195 */ 5196 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5197 { 5198 5199 if (amdgpu_gpu_recovery == 0) 5200 goto disabled; 5201 5202 /* Skip soft reset check in fatal error mode */ 5203 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5204 return true; 5205 5206 if (amdgpu_sriov_vf(adev)) 5207 return true; 5208 5209 if (amdgpu_gpu_recovery == -1) { 5210 switch (adev->asic_type) { 5211 #ifdef CONFIG_DRM_AMDGPU_SI 5212 case CHIP_VERDE: 5213 case CHIP_TAHITI: 5214 case CHIP_PITCAIRN: 5215 case CHIP_OLAND: 5216 case CHIP_HAINAN: 5217 #endif 5218 #ifdef CONFIG_DRM_AMDGPU_CIK 5219 case CHIP_KAVERI: 5220 case CHIP_KABINI: 5221 case CHIP_MULLINS: 5222 #endif 5223 case CHIP_CARRIZO: 5224 case CHIP_STONEY: 5225 case CHIP_CYAN_SKILLFISH: 5226 goto disabled; 5227 default: 5228 break; 5229 } 5230 } 5231 5232 return true; 5233 5234 disabled: 5235 dev_info(adev->dev, "GPU recovery disabled.\n"); 5236 return false; 5237 } 5238 5239 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5240 { 5241 u32 i; 5242 int ret = 0; 5243 5244 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5245 5246 dev_info(adev->dev, "GPU mode1 reset\n"); 5247 5248 /* Cache the state before bus master disable. The saved config space 5249 * values are used in other cases like restore after mode-2 reset. 
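	 *
	 * The reset itself then disables bus mastering, triggers mode1 reset
	 * through the SMU (if supported) or the PSP, restores the cached
	 * config space and polls the NBIO memsize register until the ASIC
	 * comes back.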
5250 */ 5251 amdgpu_device_cache_pci_state(adev->pdev); 5252 5253 /* disable BM */ 5254 pci_clear_master(adev->pdev); 5255 5256 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5257 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5258 ret = amdgpu_dpm_mode1_reset(adev); 5259 } else { 5260 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5261 ret = psp_gpu_reset(adev); 5262 } 5263 5264 if (ret) 5265 goto mode1_reset_failed; 5266 5267 amdgpu_device_load_pci_state(adev->pdev); 5268 ret = amdgpu_psp_wait_for_bootloader(adev); 5269 if (ret) 5270 goto mode1_reset_failed; 5271 5272 /* wait for asic to come out of reset */ 5273 for (i = 0; i < adev->usec_timeout; i++) { 5274 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5275 5276 if (memsize != 0xffffffff) 5277 break; 5278 udelay(1); 5279 } 5280 5281 if (i >= adev->usec_timeout) { 5282 ret = -ETIMEDOUT; 5283 goto mode1_reset_failed; 5284 } 5285 5286 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5287 5288 return 0; 5289 5290 mode1_reset_failed: 5291 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5292 return ret; 5293 } 5294 5295 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5296 struct amdgpu_reset_context *reset_context) 5297 { 5298 int i, r = 0; 5299 struct amdgpu_job *job = NULL; 5300 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; 5301 bool need_full_reset = 5302 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5303 5304 if (reset_context->reset_req_dev == adev) 5305 job = reset_context->job; 5306 5307 if (amdgpu_sriov_vf(adev)) 5308 amdgpu_virt_pre_reset(adev); 5309 5310 amdgpu_fence_driver_isr_toggle(adev, true); 5311 5312 /* block all schedulers and reset given job's ring */ 5313 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5314 struct amdgpu_ring *ring = adev->rings[i]; 5315 5316 if (!amdgpu_ring_sched_ready(ring)) 5317 continue; 5318 5319 /* Clear job fence from fence drv to avoid force_completion 5320 * leave NULL and vm flush fence in fence drv 5321 */ 5322 amdgpu_fence_driver_clear_job_fences(ring); 5323 5324 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5325 amdgpu_fence_driver_force_completion(ring); 5326 } 5327 5328 amdgpu_fence_driver_isr_toggle(adev, false); 5329 5330 if (job && job->vm) 5331 drm_sched_increase_karma(&job->base); 5332 5333 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5334 /* If reset handler not implemented, continue; otherwise return */ 5335 if (r == -EOPNOTSUPP) 5336 r = 0; 5337 else 5338 return r; 5339 5340 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5341 if (!amdgpu_sriov_vf(adev)) { 5342 5343 if (!need_full_reset) 5344 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5345 5346 if (!need_full_reset && amdgpu_gpu_recovery && 5347 amdgpu_device_ip_check_soft_reset(adev)) { 5348 amdgpu_device_ip_pre_soft_reset(adev); 5349 r = amdgpu_device_ip_soft_reset(adev); 5350 amdgpu_device_ip_post_soft_reset(adev); 5351 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5352 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5353 need_full_reset = true; 5354 } 5355 } 5356 5357 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { 5358 dev_info(tmp_adev->dev, "Dumping IP State\n"); 5359 /* Trigger ip dump before we reset the asic */ 5360 for (i = 0; i < tmp_adev->num_ip_blocks; i++) 5361 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) 5362 tmp_adev->ip_blocks[i].version->funcs 5363 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]); 5364 
dev_info(tmp_adev->dev, "Dumping IP State Completed\n"); 5365 } 5366 5367 if (need_full_reset) 5368 r = amdgpu_device_ip_suspend(adev); 5369 if (need_full_reset) 5370 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5371 else 5372 clear_bit(AMDGPU_NEED_FULL_RESET, 5373 &reset_context->flags); 5374 } 5375 5376 return r; 5377 } 5378 5379 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) 5380 { 5381 struct list_head *device_list_handle; 5382 bool full_reset, vram_lost = false; 5383 struct amdgpu_device *tmp_adev; 5384 int r; 5385 5386 device_list_handle = reset_context->reset_device_list; 5387 5388 if (!device_list_handle) 5389 return -EINVAL; 5390 5391 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5392 5393 r = 0; 5394 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5395 /* After reset, it's default init level */ 5396 amdgpu_set_init_level(tmp_adev, AMDGPU_INIT_LEVEL_DEFAULT); 5397 if (full_reset) { 5398 /* post card */ 5399 amdgpu_ras_set_fed(tmp_adev, false); 5400 r = amdgpu_device_asic_init(tmp_adev); 5401 if (r) { 5402 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5403 } else { 5404 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5405 5406 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5407 if (r) 5408 goto out; 5409 5410 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5411 5412 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 5413 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job); 5414 5415 if (vram_lost) { 5416 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5417 amdgpu_inc_vram_lost(tmp_adev); 5418 } 5419 5420 r = amdgpu_device_fw_loading(tmp_adev); 5421 if (r) 5422 return r; 5423 5424 r = amdgpu_xcp_restore_partition_mode( 5425 tmp_adev->xcp_mgr); 5426 if (r) 5427 goto out; 5428 5429 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5430 if (r) 5431 goto out; 5432 5433 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 5434 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5435 5436 if (vram_lost) 5437 amdgpu_device_fill_reset_magic(tmp_adev); 5438 5439 /* 5440 * Add this ASIC as tracked as reset was already 5441 * complete successfully. 5442 */ 5443 amdgpu_register_gpu_instance(tmp_adev); 5444 5445 if (!reset_context->hive && 5446 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5447 amdgpu_xgmi_add_device(tmp_adev); 5448 5449 r = amdgpu_device_ip_late_init(tmp_adev); 5450 if (r) 5451 goto out; 5452 5453 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 5454 5455 /* 5456 * The GPU enters bad state once faulty pages 5457 * by ECC has reached the threshold, and ras 5458 * recovery is scheduled next. So add one check 5459 * here to break recovery if it indeed exceeds 5460 * bad page threshold, and remind user to 5461 * retire this GPU or setting one bigger 5462 * bad_page_threshold value to fix this once 5463 * probing driver again. 5464 */ 5465 if (!amdgpu_ras_is_rma(tmp_adev)) { 5466 /* must succeed. 
*/ 5467 amdgpu_ras_resume(tmp_adev); 5468 } else { 5469 r = -EINVAL; 5470 goto out; 5471 } 5472 5473 /* Update PSP FW topology after reset */ 5474 if (reset_context->hive && 5475 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5476 r = amdgpu_xgmi_update_topology( 5477 reset_context->hive, tmp_adev); 5478 } 5479 } 5480 5481 out: 5482 if (!r) { 5483 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5484 r = amdgpu_ib_ring_tests(tmp_adev); 5485 if (r) { 5486 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5487 r = -EAGAIN; 5488 goto end; 5489 } 5490 } 5491 5492 if (r) 5493 tmp_adev->asic_reset_res = r; 5494 } 5495 5496 end: 5497 return r; 5498 } 5499 5500 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5501 struct amdgpu_reset_context *reset_context) 5502 { 5503 struct amdgpu_device *tmp_adev = NULL; 5504 bool need_full_reset, skip_hw_reset; 5505 int r = 0; 5506 5507 /* Try reset handler method first */ 5508 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5509 reset_list); 5510 5511 reset_context->reset_device_list = device_list_handle; 5512 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5513 /* If reset handler not implemented, continue; otherwise return */ 5514 if (r == -EOPNOTSUPP) 5515 r = 0; 5516 else 5517 return r; 5518 5519 /* Reset handler not implemented, use the default method */ 5520 need_full_reset = 5521 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5522 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5523 5524 /* 5525 * ASIC reset has to be done on all XGMI hive nodes ASAP 5526 * to allow proper links negotiation in FW (within 1 sec) 5527 */ 5528 if (!skip_hw_reset && need_full_reset) { 5529 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5530 /* For XGMI run all resets in parallel to speed up the process */ 5531 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5532 if (!queue_work(system_unbound_wq, 5533 &tmp_adev->xgmi_reset_work)) 5534 r = -EALREADY; 5535 } else 5536 r = amdgpu_asic_reset(tmp_adev); 5537 5538 if (r) { 5539 dev_err(tmp_adev->dev, 5540 "ASIC reset failed with error, %d for drm dev, %s", 5541 r, adev_to_drm(tmp_adev)->unique); 5542 goto out; 5543 } 5544 } 5545 5546 /* For XGMI wait for all resets to complete before proceed */ 5547 if (!r) { 5548 list_for_each_entry(tmp_adev, device_list_handle, 5549 reset_list) { 5550 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5551 flush_work(&tmp_adev->xgmi_reset_work); 5552 r = tmp_adev->asic_reset_res; 5553 if (r) 5554 break; 5555 } 5556 } 5557 } 5558 } 5559 5560 if (!r && amdgpu_ras_intr_triggered()) { 5561 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5562 amdgpu_ras_reset_error_count(tmp_adev, 5563 AMDGPU_RAS_BLOCK__MMHUB); 5564 } 5565 5566 amdgpu_ras_intr_cleared(); 5567 } 5568 5569 r = amdgpu_device_reinit_after_reset(reset_context); 5570 if (r == -EAGAIN) 5571 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5572 else 5573 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5574 5575 out: 5576 return r; 5577 } 5578 5579 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5580 { 5581 5582 switch (amdgpu_asic_reset_method(adev)) { 5583 case AMD_RESET_METHOD_MODE1: 5584 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5585 break; 5586 case AMD_RESET_METHOD_MODE2: 5587 adev->mp1_state = PP_MP1_STATE_RESET; 5588 break; 5589 default: 5590 adev->mp1_state = PP_MP1_STATE_NONE; 5591 break; 5592 } 5593 } 5594 5595 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5596 
{ 5597 amdgpu_vf_error_trans_all(adev); 5598 adev->mp1_state = PP_MP1_STATE_NONE; 5599 } 5600 5601 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5602 { 5603 struct pci_dev *p = NULL; 5604 5605 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5606 adev->pdev->bus->number, 1); 5607 if (p) { 5608 pm_runtime_enable(&(p->dev)); 5609 pm_runtime_resume(&(p->dev)); 5610 } 5611 5612 pci_dev_put(p); 5613 } 5614 5615 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5616 { 5617 enum amd_reset_method reset_method; 5618 struct pci_dev *p = NULL; 5619 u64 expires; 5620 5621 /* 5622 * For now, only BACO and mode1 reset are confirmed 5623 * to suffer the audio issue without proper suspended. 5624 */ 5625 reset_method = amdgpu_asic_reset_method(adev); 5626 if ((reset_method != AMD_RESET_METHOD_BACO) && 5627 (reset_method != AMD_RESET_METHOD_MODE1)) 5628 return -EINVAL; 5629 5630 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5631 adev->pdev->bus->number, 1); 5632 if (!p) 5633 return -ENODEV; 5634 5635 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5636 if (!expires) 5637 /* 5638 * If we cannot get the audio device autosuspend delay, 5639 * a fixed 4S interval will be used. Considering 3S is 5640 * the audio controller default autosuspend delay setting. 5641 * 4S used here is guaranteed to cover that. 5642 */ 5643 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5644 5645 while (!pm_runtime_status_suspended(&(p->dev))) { 5646 if (!pm_runtime_suspend(&(p->dev))) 5647 break; 5648 5649 if (expires < ktime_get_mono_fast_ns()) { 5650 dev_warn(adev->dev, "failed to suspend display audio\n"); 5651 pci_dev_put(p); 5652 /* TODO: abort the succeeding gpu reset? */ 5653 return -ETIMEDOUT; 5654 } 5655 } 5656 5657 pm_runtime_disable(&(p->dev)); 5658 5659 pci_dev_put(p); 5660 return 0; 5661 } 5662 5663 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5664 { 5665 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5666 5667 #if defined(CONFIG_DEBUG_FS) 5668 if (!amdgpu_sriov_vf(adev)) 5669 cancel_work(&adev->reset_work); 5670 #endif 5671 5672 if (adev->kfd.dev) 5673 cancel_work(&adev->kfd.reset_work); 5674 5675 if (amdgpu_sriov_vf(adev)) 5676 cancel_work(&adev->virt.flr_work); 5677 5678 if (con && adev->ras_enabled) 5679 cancel_work(&con->recovery_work); 5680 5681 } 5682 5683 static int amdgpu_device_health_check(struct list_head *device_list_handle) 5684 { 5685 struct amdgpu_device *tmp_adev; 5686 int ret = 0; 5687 u32 status; 5688 5689 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5690 pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status); 5691 if (PCI_POSSIBLE_ERROR(status)) { 5692 dev_err(tmp_adev->dev, "device lost from bus!"); 5693 ret = -ENODEV; 5694 } 5695 } 5696 5697 return ret; 5698 } 5699 5700 /** 5701 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5702 * 5703 * @adev: amdgpu_device pointer 5704 * @job: which job trigger hang 5705 * @reset_context: amdgpu reset context pointer 5706 * 5707 * Attempt to reset the GPU if it has hung (all asics). 5708 * Attempt to do soft-reset or full-reset and reinitialize Asic 5709 * Returns 0 for success or an error on failure. 
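 *
 * Recovery builds the list of affected devices (the whole hive for XGMI),
 * suspends display audio, stops the schedulers and runs the per-device
 * pre-reset steps before the actual ASIC reset and re-initialization.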
 */

int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
			      struct amdgpu_job *job,
			      struct amdgpu_reset_context *reset_context)
{
	struct list_head device_list, *device_list_handle = NULL;
	bool job_signaled = false;
	struct amdgpu_hive_info *hive = NULL;
	struct amdgpu_device *tmp_adev = NULL;
	int i, r = 0;
	bool need_emergency_restart = false;
	bool audio_suspended = false;
	int retry_limit = AMDGPU_MAX_RETRY_LIMIT;

	/*
	 * Special case: RAS triggered and full reset isn't supported
	 */
	need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);

	/*
	 * Flush RAM to disk so that after reboot
	 * the user can read log and see why the system rebooted.
	 */
	if (need_emergency_restart && amdgpu_ras_get_context(adev) &&
	    amdgpu_ras_get_context(adev)->reboot) {
		DRM_WARN("Emergency reboot.");

		ksys_sync_helper();
		emergency_restart();
	}

	dev_info(adev->dev, "GPU %s begin!\n",
		need_emergency_restart ? "jobs stop":"reset");

	if (!amdgpu_sriov_vf(adev))
		hive = amdgpu_get_xgmi_hive(adev);
	if (hive)
		mutex_lock(&hive->hive_lock);

	reset_context->job = job;
	reset_context->hive = hive;
	/*
	 * Build list of devices to reset.
	 * In case we are in XGMI hive mode, resort the device list
	 * to put adev in the 1st position.
	 */
	INIT_LIST_HEAD(&device_list);
	if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			list_add_tail(&tmp_adev->reset_list, &device_list);
			if (adev->shutdown)
				tmp_adev->shutdown = true;
		}
		if (!list_is_first(&adev->reset_list, &device_list))
			list_rotate_to_front(&adev->reset_list, &device_list);
		device_list_handle = &device_list;
	} else {
		list_add_tail(&adev->reset_list, &device_list);
		device_list_handle = &device_list;
	}

	if (!amdgpu_sriov_vf(adev)) {
		r = amdgpu_device_health_check(device_list_handle);
		if (r)
			goto end_reset;
	}

	/* We need to lock reset domain only once both for XGMI and single device */
	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
				    reset_list);
	amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);

	/* block all schedulers and reset given job's ring */
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {

		amdgpu_device_set_mp1_state(tmp_adev);

		/*
		 * Try to put the audio codec into suspend state
		 * before the GPU reset starts.
		 *
		 * Because the power domain of the graphics device is
		 * shared with the AZ (audio) power domain, skipping this
		 * would change the audio hardware behind the audio
		 * driver's back and trigger audio codec errors.
		 */
		if (!amdgpu_device_suspend_display_audio(tmp_adev))
			audio_suspended = true;

		amdgpu_ras_set_error_query_ready(tmp_adev, false);

		cancel_delayed_work_sync(&tmp_adev->delayed_init_work);

		amdgpu_amdkfd_pre_reset(tmp_adev, reset_context);

		/*
		 * Mark these ASICs to be reset as untracked first,
		 * and add them back after reset completes.
		 */
		amdgpu_unregister_gpu_instance(tmp_adev);

		drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true);

		/* disable ras on ALL IPs */
		if (!need_emergency_restart &&
		    amdgpu_device_ip_need_full_reset(tmp_adev))
			amdgpu_ras_suspend(tmp_adev);

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!amdgpu_ring_sched_ready(ring))
				continue;

			drm_sched_stop(&ring->sched, job ? &job->base : NULL);

			if (need_emergency_restart)
				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
		}
		atomic_inc(&tmp_adev->gpu_reset_counter);
	}

	if (need_emergency_restart)
		goto skip_sched_resume;

	/*
	 * Must check guilty signal here since after this point all old
	 * HW fences are force signaled.
	 *
	 * job->base holds a reference to parent fence
	 */
	if (job && dma_fence_is_signaled(&job->hw_fence)) {
		job_signaled = true;
		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
		goto skip_hw_reset;
	}

retry:	/* Rest of adevs pre asic reset from XGMI hive. */
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
		/* TODO: should we stop? */
		if (r) {
			dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
				r, adev_to_drm(tmp_adev)->unique);
			tmp_adev->asic_reset_res = r;
		}
	}

	/* Actual ASIC resets if needed. */
	/* Host driver will handle XGMI hive reset for SRIOV */
	if (amdgpu_sriov_vf(adev)) {
		if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) {
			dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n");
			amdgpu_ras_set_fed(adev, true);
			set_bit(AMDGPU_HOST_FLR, &reset_context->flags);
		}

		r = amdgpu_device_reset_sriov(adev, reset_context);
		if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) {
			amdgpu_virt_release_full_gpu(adev, true);
			goto retry;
		}
		if (r)
			adev->asic_reset_res = r;
	} else {
		r = amdgpu_do_asic_reset(device_list_handle, reset_context);
		if (r && r == -EAGAIN)
			goto retry;
	}

	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		/*
		 * Drop any pending non-scheduler resets queued before the reset
		 * is done. Any reset scheduled after this point is valid.
		 * Scheduler resets were already dropped during drm_sched_stop
		 * and no new ones can come in before drm_sched_start.
		 */
		amdgpu_device_stop_pending_resets(tmp_adev);
	}

skip_hw_reset:

	/* Post ASIC reset for all devs. */
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!amdgpu_ring_sched_ready(ring))
				continue;

			drm_sched_start(&ring->sched);
		}

		if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));

		if (tmp_adev->asic_reset_res)
			r = tmp_adev->asic_reset_res;

		tmp_adev->asic_reset_res = 0;

		if (r) {
			/* bad news, how do we tell it to userspace?
			 * for a RAS error, we should report GPU bad status instead of
			 * reset failure
			 */
			if (reset_context->src != AMDGPU_RESET_SRC_RAS ||
			    !amdgpu_ras_eeprom_check_err_threshold(tmp_adev))
				dev_info(tmp_adev->dev, "GPU reset(%d) failed\n",
					atomic_read(&tmp_adev->gpu_reset_counter));
			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
		} else {
			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
			if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
				DRM_WARN("smart shift update failed\n");
		}
	}

skip_sched_resume:
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		/* unlock kfd: SRIOV would do it separately */
		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_post_reset(tmp_adev);

		/* kfd_post_reset will do nothing if the kfd device is not initialized,
		 * so bring up kfd here if it wasn't initialized before
		 */
		if (!adev->kfd.init_complete)
			amdgpu_amdkfd_device_init(adev);

		if (audio_suspended)
			amdgpu_device_resume_display_audio(tmp_adev);

		amdgpu_device_unset_mp1_state(tmp_adev);

		amdgpu_ras_set_error_query_ready(tmp_adev, true);
	}

	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
				    reset_list);
	amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);

end_reset:
	if (hive) {
		mutex_unlock(&hive->hive_lock);
		amdgpu_put_xgmi_hive(hive);
	}

	if (r)
		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);

	atomic_set(&adev->reset_domain->reset_res, r);
	return r;
}

/**
 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner
 *
 * @adev: amdgpu_device pointer
 * @speed: pointer to the speed of the link
 * @width: pointer to the width of the link
 *
 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
 * first physical partner to an AMD dGPU.
 * This will exclude any virtual switches and links.
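 *
 * The capabilities reported here feed the platform masks computed in
 * amdgpu_device_get_pcie_info() below: when the partner cannot be
 * determined, that function falls back to conservative Gen1/Gen2 speed
 * support and the default link-width mask rather than over-reporting
 * the link capability.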
 */
static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
					    enum pci_bus_speed *speed,
					    enum pcie_link_width *width)
{
	struct pci_dev *parent = adev->pdev;

	if (!speed || !width)
		return;

	*speed = PCI_SPEED_UNKNOWN;
	*width = PCIE_LNK_WIDTH_UNKNOWN;

	if (amdgpu_device_pcie_dynamic_switching_supported(adev)) {
		while ((parent = pci_upstream_bridge(parent))) {
			/* skip upstream/downstream switches internal to dGPU */
			if (parent->vendor == PCI_VENDOR_ID_ATI)
				continue;
			*speed = pcie_get_speed_cap(parent);
			*width = pcie_get_width_cap(parent);
			break;
		}
	} else {
		/* use the current speeds rather than max if switching is not supported */
		pcie_bandwidth_available(adev->pdev, NULL, speed, width);
	}
}

/**
 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIE capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIE config space may not be available.
 */
static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
{
	struct pci_dev *pdev;
	enum pci_bus_speed speed_cap, platform_speed_cap;
	enum pcie_link_width platform_link_width;

	if (amdgpu_pcie_gen_cap)
		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;

	if (amdgpu_pcie_lane_cap)
		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;

	/* covers APUs as well */
	if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
		if (adev->pm.pcie_gen_mask == 0)
			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
		if (adev->pm.pcie_mlw_mask == 0)
			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
		return;
	}

	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
		return;

	amdgpu_device_partner_bandwidth(adev, &platform_speed_cap,
					&platform_link_width);

	if (adev->pm.pcie_gen_mask == 0) {
		/* asic caps */
		pdev = adev->pdev;
		speed_cap = pcie_get_speed_cap(pdev);
		if (speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
		} else {
			if (speed_cap == PCIE_SPEED_32_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
			else if (speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
		}
		/* platform caps */
		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
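			/*
			 * The platform (upstream partner) speed could not be
			 * determined, so advertise only the conservative
			 * Gen1/Gen2 support below.
			 */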
adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6075 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6076 } else { 6077 if (platform_speed_cap == PCIE_SPEED_32_0GT) 6078 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6079 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6080 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6081 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6082 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 6083 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 6084 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6085 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6086 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6087 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 6088 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 6089 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6090 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6091 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 6092 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 6093 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6094 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6095 else 6096 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 6097 6098 } 6099 } 6100 if (adev->pm.pcie_mlw_mask == 0) { 6101 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6102 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 6103 } else { 6104 switch (platform_link_width) { 6105 case PCIE_LNK_X32: 6106 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 6107 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6108 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6109 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6110 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6111 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6112 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6113 break; 6114 case PCIE_LNK_X16: 6115 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6116 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6117 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6118 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6119 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6120 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6121 break; 6122 case PCIE_LNK_X12: 6123 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6124 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6125 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6126 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6127 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6128 break; 6129 case PCIE_LNK_X8: 6130 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6131 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6132 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6133 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6134 break; 6135 case PCIE_LNK_X4: 6136 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6137 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6138 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6139 break; 6140 case PCIE_LNK_X2: 6141 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6142 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6143 break; 6144 case PCIE_LNK_X1: 6145 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 6146 break; 6147 default: 6148 break; 6149 } 6150 } 6151 } 6152 } 6153 6154 /** 6155 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 6156 * 6157 * @adev: amdgpu_device pointer 6158 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 6159 * 6160 * Return true if @peer_adev can access (DMA) @adev through the PCIe 6161 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 6162 * @peer_adev. 
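 *
 * As a worked example of the address check below (numbers are illustrative):
 * with a 32-bit @peer_adev DMA mask, address_mask is ~0xffffffffULL, so a
 * VRAM aperture that starts or ends above 4 GiB fails the test and peer
 * access is not reported; the check is skipped entirely when the IOMMU can
 * remap the peer's DMA addresses.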
6163 */ 6164 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 6165 struct amdgpu_device *peer_adev) 6166 { 6167 #ifdef CONFIG_HSA_AMD_P2P 6168 bool p2p_access = 6169 !adev->gmc.xgmi.connected_to_cpu && 6170 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 6171 6172 bool is_large_bar = adev->gmc.visible_vram_size && 6173 adev->gmc.real_vram_size == adev->gmc.visible_vram_size; 6174 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev); 6175 6176 if (!p2p_addressable) { 6177 uint64_t address_mask = peer_adev->dev->dma_mask ? 6178 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 6179 resource_size_t aper_limit = 6180 adev->gmc.aper_base + adev->gmc.aper_size - 1; 6181 6182 p2p_addressable = !(adev->gmc.aper_base & address_mask || 6183 aper_limit & address_mask); 6184 } 6185 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable; 6186 #else 6187 return false; 6188 #endif 6189 } 6190 6191 int amdgpu_device_baco_enter(struct drm_device *dev) 6192 { 6193 struct amdgpu_device *adev = drm_to_adev(dev); 6194 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6195 6196 if (!amdgpu_device_supports_baco(dev)) 6197 return -ENOTSUPP; 6198 6199 if (ras && adev->ras_enabled && 6200 adev->nbio.funcs->enable_doorbell_interrupt) 6201 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 6202 6203 return amdgpu_dpm_baco_enter(adev); 6204 } 6205 6206 int amdgpu_device_baco_exit(struct drm_device *dev) 6207 { 6208 struct amdgpu_device *adev = drm_to_adev(dev); 6209 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6210 int ret = 0; 6211 6212 if (!amdgpu_device_supports_baco(dev)) 6213 return -ENOTSUPP; 6214 6215 ret = amdgpu_dpm_baco_exit(adev); 6216 if (ret) 6217 return ret; 6218 6219 if (ras && adev->ras_enabled && 6220 adev->nbio.funcs->enable_doorbell_interrupt) 6221 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 6222 6223 if (amdgpu_passthrough(adev) && adev->nbio.funcs && 6224 adev->nbio.funcs->clear_doorbell_interrupt) 6225 adev->nbio.funcs->clear_doorbell_interrupt(adev); 6226 6227 return 0; 6228 } 6229 6230 /** 6231 * amdgpu_pci_error_detected - Called when a PCI error is detected. 6232 * @pdev: PCI device struct 6233 * @state: PCI channel state 6234 * 6235 * Description: Called when a PCI error is detected. 6236 * 6237 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
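 *
 * The mapping implemented below is: pci_channel_io_normal returns
 * PCI_ERS_RESULT_CAN_RECOVER, pci_channel_io_frozen locks the reset domain,
 * stops the schedulers and returns PCI_ERS_RESULT_NEED_RESET, and
 * pci_channel_io_perm_failure returns PCI_ERS_RESULT_DISCONNECT.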
6238 */ 6239 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 6240 { 6241 struct drm_device *dev = pci_get_drvdata(pdev); 6242 struct amdgpu_device *adev = drm_to_adev(dev); 6243 int i; 6244 6245 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 6246 6247 if (adev->gmc.xgmi.num_physical_nodes > 1) { 6248 DRM_WARN("No support for XGMI hive yet..."); 6249 return PCI_ERS_RESULT_DISCONNECT; 6250 } 6251 6252 adev->pci_channel_state = state; 6253 6254 switch (state) { 6255 case pci_channel_io_normal: 6256 return PCI_ERS_RESULT_CAN_RECOVER; 6257 /* Fatal error, prepare for slot reset */ 6258 case pci_channel_io_frozen: 6259 /* 6260 * Locking adev->reset_domain->sem will prevent any external access 6261 * to GPU during PCI error recovery 6262 */ 6263 amdgpu_device_lock_reset_domain(adev->reset_domain); 6264 amdgpu_device_set_mp1_state(adev); 6265 6266 /* 6267 * Block any work scheduling as we do for regular GPU reset 6268 * for the duration of the recovery 6269 */ 6270 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6271 struct amdgpu_ring *ring = adev->rings[i]; 6272 6273 if (!amdgpu_ring_sched_ready(ring)) 6274 continue; 6275 6276 drm_sched_stop(&ring->sched, NULL); 6277 } 6278 atomic_inc(&adev->gpu_reset_counter); 6279 return PCI_ERS_RESULT_NEED_RESET; 6280 case pci_channel_io_perm_failure: 6281 /* Permanent error, prepare for device removal */ 6282 return PCI_ERS_RESULT_DISCONNECT; 6283 } 6284 6285 return PCI_ERS_RESULT_NEED_RESET; 6286 } 6287 6288 /** 6289 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 6290 * @pdev: pointer to PCI device 6291 */ 6292 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 6293 { 6294 6295 DRM_INFO("PCI error: mmio enabled callback!!\n"); 6296 6297 /* TODO - dump whatever for debugging purposes */ 6298 6299 /* This called only if amdgpu_pci_error_detected returns 6300 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 6301 * works, no need to reset slot. 6302 */ 6303 6304 return PCI_ERS_RESULT_RECOVERED; 6305 } 6306 6307 /** 6308 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 6309 * @pdev: PCI device struct 6310 * 6311 * Description: This routine is called by the pci error recovery 6312 * code after the PCI slot has been reset, just before we 6313 * should resume normal operations. 
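 *
 * In outline, the handler below restores the saved PCI config space, polls
 * amdgpu_asic_get_config_memsize() until it stops reading back 0xffffffff
 * (i.e. the ASIC responds again), and then reinitializes the device through
 * amdgpu_device_pre_asic_reset() and amdgpu_do_asic_reset() with
 * AMDGPU_SKIP_HW_RESET set.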
6314 */ 6315 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 6316 { 6317 struct drm_device *dev = pci_get_drvdata(pdev); 6318 struct amdgpu_device *adev = drm_to_adev(dev); 6319 int r, i; 6320 struct amdgpu_reset_context reset_context; 6321 u32 memsize; 6322 struct list_head device_list; 6323 6324 /* PCI error slot reset should be skipped During RAS recovery */ 6325 if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 6326 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && 6327 amdgpu_ras_in_recovery(adev)) 6328 return PCI_ERS_RESULT_RECOVERED; 6329 6330 DRM_INFO("PCI error: slot reset callback!!\n"); 6331 6332 memset(&reset_context, 0, sizeof(reset_context)); 6333 6334 INIT_LIST_HEAD(&device_list); 6335 list_add_tail(&adev->reset_list, &device_list); 6336 6337 /* wait for asic to come out of reset */ 6338 msleep(500); 6339 6340 /* Restore PCI confspace */ 6341 amdgpu_device_load_pci_state(pdev); 6342 6343 /* confirm ASIC came out of reset */ 6344 for (i = 0; i < adev->usec_timeout; i++) { 6345 memsize = amdgpu_asic_get_config_memsize(adev); 6346 6347 if (memsize != 0xffffffff) 6348 break; 6349 udelay(1); 6350 } 6351 if (memsize == 0xffffffff) { 6352 r = -ETIME; 6353 goto out; 6354 } 6355 6356 reset_context.method = AMD_RESET_METHOD_NONE; 6357 reset_context.reset_req_dev = adev; 6358 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 6359 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 6360 6361 adev->no_hw_access = true; 6362 r = amdgpu_device_pre_asic_reset(adev, &reset_context); 6363 adev->no_hw_access = false; 6364 if (r) 6365 goto out; 6366 6367 r = amdgpu_do_asic_reset(&device_list, &reset_context); 6368 6369 out: 6370 if (!r) { 6371 if (amdgpu_device_cache_pci_state(adev->pdev)) 6372 pci_restore_state(adev->pdev); 6373 6374 DRM_INFO("PCIe error recovery succeeded\n"); 6375 } else { 6376 DRM_ERROR("PCIe error recovery failed, err:%d", r); 6377 amdgpu_device_unset_mp1_state(adev); 6378 amdgpu_device_unlock_reset_domain(adev->reset_domain); 6379 } 6380 6381 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 6382 } 6383 6384 /** 6385 * amdgpu_pci_resume() - resume normal ops after PCI reset 6386 * @pdev: pointer to PCI device 6387 * 6388 * Called when the error recovery driver tells us that its 6389 * OK to resume normal operation. 
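 *
 * Only the pci_channel_io_frozen path needs work here: the schedulers that
 * were stopped in amdgpu_pci_error_detected() are restarted and the reset
 * domain is unlocked again.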
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: resume callback!!\n");

	/* Only continue execution for the case of pci_channel_io_frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!amdgpu_ring_sched_ready(ring))
			continue;

		drm_sched_start(&ring->sched);
	}

	amdgpu_device_unset_mp1_state(adev);
	amdgpu_device_unlock_reset_domain(adev->reset_domain);
}

bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}

int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. It helps to maintain the error context when an error occurs.
 * Compared to a simple hang, the system will stay stable at least for SSH
 * access. Then it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc),
 * clears all CPU mappings to the device, disallows remappings through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. 
amdgpu_device_unmap_mmio() clears all MMIO mappings 6513 * 6. pci_disable_device() and pci_wait_for_pending_transaction() 6514 * flush any in flight DMA operations 6515 */ 6516 void amdgpu_device_halt(struct amdgpu_device *adev) 6517 { 6518 struct pci_dev *pdev = adev->pdev; 6519 struct drm_device *ddev = adev_to_drm(adev); 6520 6521 amdgpu_xcp_dev_unplug(adev); 6522 drm_dev_unplug(ddev); 6523 6524 amdgpu_irq_disable_all(adev); 6525 6526 amdgpu_fence_driver_hw_fini(adev); 6527 6528 adev->no_hw_access = true; 6529 6530 amdgpu_device_unmap_mmio(adev); 6531 6532 pci_disable_device(pdev); 6533 pci_wait_for_pending_transaction(pdev); 6534 } 6535 6536 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev, 6537 u32 reg) 6538 { 6539 unsigned long flags, address, data; 6540 u32 r; 6541 6542 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 6543 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 6544 6545 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 6546 WREG32(address, reg * 4); 6547 (void)RREG32(address); 6548 r = RREG32(data); 6549 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 6550 return r; 6551 } 6552 6553 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev, 6554 u32 reg, u32 v) 6555 { 6556 unsigned long flags, address, data; 6557 6558 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 6559 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 6560 6561 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 6562 WREG32(address, reg * 4); 6563 (void)RREG32(address); 6564 WREG32(data, v); 6565 (void)RREG32(data); 6566 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 6567 } 6568 6569 /** 6570 * amdgpu_device_get_gang - return a reference to the current gang 6571 * @adev: amdgpu_device pointer 6572 * 6573 * Returns: A new reference to the current gang leader. 6574 */ 6575 struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev) 6576 { 6577 struct dma_fence *fence; 6578 6579 rcu_read_lock(); 6580 fence = dma_fence_get_rcu_safe(&adev->gang_submit); 6581 rcu_read_unlock(); 6582 return fence; 6583 } 6584 6585 /** 6586 * amdgpu_device_switch_gang - switch to a new gang 6587 * @adev: amdgpu_device pointer 6588 * @gang: the gang to switch to 6589 * 6590 * Try to switch to a new gang. 6591 * Returns: NULL if we switched to the new gang or a reference to the current 6592 * gang leader. 
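 *
 * A hedged usage sketch; the calling context is illustrative and the
 * returned fence, if any, is the still-running previous gang leader that
 * the caller has to wait for (or depend on) before switching:
 *
 *   fence = amdgpu_device_switch_gang(adev, new_gang_fence);
 *   if (fence) {
 *           r = dma_fence_wait(fence, true);
 *           dma_fence_put(fence);
 *   }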
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	do {
		dma_fence_put(old);
		old = amdgpu_device_get_gang(adev);
		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old))
			return old;

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	dma_fence_put(old);
	return NULL;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
		uint32_t inst, uint32_t reg_addr, char reg_name[],
		uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				 inst, reg_name, (uint32_t)expected_value,
				 (uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}
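
/*
 * Illustrative sketch only, not part of the driver: shows how the indirect
 * PCIe port accessors above compose into a read-modify-write helper.  The
 * helper name is made up, the register offset and bit masks are whatever the
 * caller passes in, and nothing in this file calls it.
 */
static inline void amdgpu_device_example_pcie_port_rmw(struct amdgpu_device *adev,
						       u32 reg, u32 clear, u32 set)
{
	u32 tmp;

	/* Read the current value through the index/data register pair. */
	tmp = amdgpu_device_pcie_port_rreg(adev, reg);
	/* Clear the requested bits, then set the new ones. */
	tmp &= ~clear;
	tmp |= set;
	/* Write the result back through the same indirect window. */
	amdgpu_device_pcie_port_wreg(adev, reg, tmp);
}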