/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */

#include <linux/aperture.h>
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/pci-p2pdma.h>
#include <linux/apple-gmux.h>

#include <drm/drm_atomic_helper.h>
#include <drm/drm_client_event.h>
#include <drm/drm_crtc_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/device.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"
#include "amdgpu_virt.h"
#include "amdgpu_dev_coredump.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

#if IS_ENABLED(CONFIG_X86)
#include <asm/intel-family.h>
#endif

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
#define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
#define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
#define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)

static const
struct drm_driver amdgpu_kms_driver;

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"ALDEBARAN",
	"NAVI10",
	"CYAN_SKILLFISH",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"VANGOGH",
	"DIMGREY_CAVEFISH",
	"BEIGE_GOBY",
	"YELLOW_CARP",
	"IP DISCOVERY",
	"LAST",
};

#define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0)
/*
 * Default init level where all blocks are expected to be initialized. This is
 * the level of initialization expected by default and also after a full reset
 * of the device.
 */
struct amdgpu_init_level amdgpu_init_default = {
	.level = AMDGPU_INIT_LEVEL_DEFAULT,
	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

struct amdgpu_init_level amdgpu_init_recovery = {
	.level = AMDGPU_INIT_LEVEL_RESET_RECOVERY,
	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

/*
 * Minimal blocks needed to be initialized before a XGMI hive can be reset. This
 * is used for cases like reset on initialization where the entire hive needs to
 * be reset before first use.
 */
struct amdgpu_init_level amdgpu_init_minimal_xgmi = {
	.level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI,
	.hwini_ip_block_mask =
		BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) |
		BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) |
		BIT(AMD_IP_BLOCK_TYPE_PSP)
};

static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev,
					     enum amd_ip_block_type block)
{
	return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0;
}

void amdgpu_set_init_level(struct amdgpu_device *adev,
			   enum amdgpu_init_lvl_id lvl)
{
	switch (lvl) {
	case AMDGPU_INIT_LEVEL_MINIMAL_XGMI:
		adev->init_lvl = &amdgpu_init_minimal_xgmi;
		break;
	case AMDGPU_INIT_LEVEL_RESET_RECOVERY:
		adev->init_lvl = &amdgpu_init_recovery;
		break;
	case AMDGPU_INIT_LEVEL_DEFAULT:
		fallthrough;
	default:
		adev->init_lvl = &amdgpu_init_default;
		break;
	}
}

static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs)
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return sysfs_emit(buf, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, 0444,
		amdgpu_device_get_pcie_replay_count, NULL);

static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
					  struct bin_attribute *attr, char *buf,
					  loff_t ppos, size_t count)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct
amdgpu_device *adev = drm_to_adev(ddev);
	ssize_t bytes_read;

	switch (ppos) {
	case AMDGPU_SYS_REG_STATE_XGMI:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_WAFL:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_PCIE:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR_1:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count);
		break;
	default:
		return -EINVAL;
	}

	return bytes_read;
}

BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL,
	 AMDGPU_SYS_REG_STATE_END);

int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev)
{
	int ret;

	if (!amdgpu_asic_get_reg_state_supported(adev))
		return 0;

	ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state);

	return ret;
}

void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
{
	if (!amdgpu_asic_get_reg_state_supported(adev))
		return;
	sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
}

int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block)
{
	int r;

	if (ip_block->version->funcs->suspend) {
		r = ip_block->version->funcs->suspend(ip_block);
		if (r) {
			dev_err(ip_block->adev->dev,
				"suspend of IP block <%s> failed %d\n",
				ip_block->version->funcs->name, r);
			return r;
		}
	}

	ip_block->status.hw = false;
	return 0;
}

int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block)
{
	int r;

	if (ip_block->version->funcs->resume) {
		r = ip_block->version->funcs->resume(ip_block);
		if (r) {
			dev_err(ip_block->adev->dev,
				"resume of IP block <%s> failed %d\n",
				ip_block->version->funcs->name, r);
			return r;
		}
	}

	ip_block->status.hw = true;
	return 0;
}

/**
 * DOC: board_info
 *
 * The amdgpu driver provides a sysfs API for giving board related information.
 * It provides the form factor information in the format
 *
 *   type : form factor
 *
 * Possible form factor values
 *
 * - "cem"     - PCIE CEM card
 * - "oam"     - Open Compute Accelerator Module
 * - "unknown" - Not known
 *
 */

static ssize_t amdgpu_device_get_board_info(struct device *dev,
					    struct device_attribute *attr,
					    char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
	const char *pkg;

	if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
		pkg_type = adev->smuio.funcs->get_pkg_type(adev);

	switch (pkg_type) {
	case AMDGPU_PKG_TYPE_CEM:
		pkg = "cem";
		break;
	case AMDGPU_PKG_TYPE_OAM:
		pkg = "oam";
		break;
	default:
		pkg = "unknown";
		break;
	}

	return sysfs_emit(buf, "%s : %s\n", "type", pkg);
}

static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);

static struct attribute *amdgpu_board_attrs[] = {
	&dev_attr_board_info.attr,
	NULL,
};

static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
					     struct attribute *attr, int n)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	if (adev->flags & AMD_IS_APU)
		return 0;

	return attr->mode;
}

static const struct attribute_group amdgpu_board_attrs_group = {
	.attrs = amdgpu_board_attrs,
	.is_visible = amdgpu_board_attrs_is_visible
};

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);


/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE))
		return false;

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Return:
 * 1 if the device supports BACO;
 * 3 if the device supports MACO (only works if BACO is supported)
 * otherwise return 0.
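 *
 * The returned value is treated as a bitmask: BACO_SUPPORT and MACO_SUPPORT
 * can both be set, which is how amdgpu_device_detect_runtime_pm_mode() below
 * distinguishes plain BACO from BAMACO capable parts.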
 */
int amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
{
	struct drm_device *dev;
	int bamaco_support;

	dev = adev_to_drm(adev);

	adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
	bamaco_support = amdgpu_device_supports_baco(dev);

	switch (amdgpu_runtime_pm) {
	case 2:
		if (bamaco_support & MACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
			dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
		} else if (bamaco_support == BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Requested mode BAMACO not available, fallback to use BACO\n");
		}
		break;
	case 1:
		if (bamaco_support & BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Forcing BACO for runtime pm\n");
		}
		break;
	case -1:
	case -2:
		if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
			dev_info(adev->dev, "Using ATPX for runtime pm\n");
		} else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
			dev_info(adev->dev, "Using BOCO for runtime pm\n");
		} else {
			if (!bamaco_support)
				goto no_runtime_pm;

			switch (adev->asic_type) {
			case CHIP_VEGA20:
			case CHIP_ARCTURUS:
				/* BACO is not supported on vega20 and arcturus */
				break;
			case CHIP_VEGA10:
				/* enable BACO as runpm mode if noretry=0 */
				if (!adev->gmc.noretry)
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			default:
				/* enable BACO as runpm mode on CI+ */
				adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			}

			if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
				if (bamaco_support & MACO_SUPPORT) {
					adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
					dev_info(adev->dev, "Using BAMACO for runtime pm\n");
				} else {
					dev_info(adev->dev, "Using BACO for runtime pm\n");
				}
			}
		}
		break;
	case 0:
		dev_info(adev->dev, "runtime pm is manually disabled\n");
		break;
	default:
		break;
	}

no_runtime_pm:
	if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
		dev_info(adev->dev, "Runtime PM not available\n");
}
/**
 * amdgpu_device_supports_smart_shift - Is the device dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by the vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value means how many bytes have been transferred.
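 * Only the part of VRAM that is CPU visible through the BAR can be reached
 * this way; amdgpu_device_vram_access() uses the returned byte count to fall
 * back to MM_INDEX/MM_DATA for whatever remains.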
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the VRAM aperture to access VRAM first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of VRAM */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset:bytes offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}


/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
				uint32_t reg, uint32_t acc_flags,
				uint32_t xcc_id)
{
	uint32_t ret, rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	return ret;
}

/*
 * MMIO register write with bytes helper functions
 * @offset:bytes offset from MMIO start
 * @value: the value want to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/**
 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 * @xcc_id: xcc accelerated compute core id
 *
 * This function is invoked only for the debugfs register access.
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v,
			     uint32_t xcc_id)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Writes the value specified to the offset specified.
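 *
 * Under SR-IOV, writes targeting GC registers may be routed through the RLCG
 * interface when the VF is not allowed to access them directly.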
 */
void amdgpu_device_xcc_wreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t v,
			    uint32_t acc_flags, uint32_t xcc_id)
{
	uint32_t rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, true,
							 &rlcg_flag)) {
			amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u32 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
				    u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	if (unlikely(!adev->nbio.funcs)) {
		pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
		pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
	} else {
		pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
		pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	}

	if (reg_addr >> 32) {
		if (unlikely(!adev->nbio.funcs))
			pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
		else
			pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	} else {
		pcie_index_hi = 0;
	}

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64 bit indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
				      u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
			pcie_index_hi * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r |= ((u64)readl(pcie_data_offset) << 32);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
				     u64 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64 bit indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
				       u64 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

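	/*
	 * Indirect access protocol: under pcie_idx_lock, program the PCIE
	 * INDEX register (and INDEX_HI when the address has bits above 31)
	 * with the target address, then access the DATA register.  The
	 * readl() after each writel() flushes the posted write before the
	 * next step.
	 */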
	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_rev_id - query device rev_id
 *
 * @adev: amdgpu_device pointer
 *
 * Return device rev_id
 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
{
	return adev->nbio.funcs->get_rev_id(adev);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	int ret;

	amdgpu_asic_pre_asic_init(adev);

	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
		amdgpu_psp_wait_for_bootloader(adev);
		ret = amdgpu_atomfirmware_asic_init(adev, true);
		return ret;
	} else {
		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
	}

	return 0;
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND/OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or an -error on failure.
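 *
 * Each writeback slot is 256 bits (8 dwords) wide; amdgpu_device_wb_get()
 * hands out slots as dword offsets into this buffer, which is why the index
 * is shifted by 3 on allocation and free.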
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long flags, offset;

	spin_lock_irqsave(&adev->wb.lock, flags);
	offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	unsigned long flags;

	wb >>= 3;
	spin_lock_irqsave(&adev->wb.lock, flags);
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
	spin_unlock_irqrestore(&adev->wb.lock, flags);
}

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned int i;
	u16 cmd;
	int r;

	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
		return 0;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
	if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
		DRM_WARN("System can't access extended configuration space, please check!!\n");

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
		return false;

	return true;
}

/*
 * GPU helper functions.
 */
/**
 * amdgpu_device_need_post - check if the hw need post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if need or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (!amdgpu_device_read_bios(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
		 * some old smc fw still need driver do vPost otherwise gpu hang, while
		 * those smc fw version above 22.15 doesn't have this flaw, so we force
		 * vpost executed for smc version below 22.15
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			release_firmware(adev->pm.fw);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/*
 * Check whether seamless boot is supported.
 *
 * So far we only support seamless boot on DCE 3.0 or later.
 * If users report that it works on older ASICs as well, we may
 * loosen this.
 */
bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
{
	switch (amdgpu_seamless) {
	case -1:
		break;
	case 1:
		return true;
	case 0:
		return false;
	default:
		DRM_ERROR("Invalid value for amdgpu.seamless: %d\n",
			  amdgpu_seamless);
		return false;
	}

	if (!(adev->flags & AMD_IS_APU))
		return false;

	if (adev->mman.keep_stolen_vga_memory)
		return false;

	return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
}

/*
 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
 * don't support dynamic speed switching. Until we have confirmation from Intel
 * that a specific host supports it, it's safer that we keep it disabled for all.
 *
 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
 */
static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	/* eGPU change speeds based on USB4 fabric conditions */
	if (dev_is_removable(adev->dev))
		return true;

	if (c->x86_vendor == X86_VENDOR_INTEL)
		return false;
#endif
	return true;
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
	switch (amdgpu_aspm) {
	case -1:
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	if (adev->flags & AMD_IS_APU)
		return false;
	if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK))
		return false;
	return pcie_aspm_enabled(adev->pdev);
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
		bool state)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));

	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines number of bits in page table versus page directory,
 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
 * page table and the remaining bits are in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory
	 */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}

/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
1911 */ 1912 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1913 { 1914 /* no need to check the default value */ 1915 if (amdgpu_vm_size == -1) 1916 return; 1917 1918 if (amdgpu_vm_size < 1) { 1919 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1920 amdgpu_vm_size); 1921 amdgpu_vm_size = -1; 1922 } 1923 } 1924 1925 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1926 { 1927 struct sysinfo si; 1928 bool is_os_64 = (sizeof(void *) == 8); 1929 uint64_t total_memory; 1930 uint64_t dram_size_seven_GB = 0x1B8000000; 1931 uint64_t dram_size_three_GB = 0xB8000000; 1932 1933 if (amdgpu_smu_memory_pool_size == 0) 1934 return; 1935 1936 if (!is_os_64) { 1937 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1938 goto def_value; 1939 } 1940 si_meminfo(&si); 1941 total_memory = (uint64_t)si.totalram * si.mem_unit; 1942 1943 if ((amdgpu_smu_memory_pool_size == 1) || 1944 (amdgpu_smu_memory_pool_size == 2)) { 1945 if (total_memory < dram_size_three_GB) 1946 goto def_value1; 1947 } else if ((amdgpu_smu_memory_pool_size == 4) || 1948 (amdgpu_smu_memory_pool_size == 8)) { 1949 if (total_memory < dram_size_seven_GB) 1950 goto def_value1; 1951 } else { 1952 DRM_WARN("Smu memory pool size not supported\n"); 1953 goto def_value; 1954 } 1955 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1956 1957 return; 1958 1959 def_value1: 1960 DRM_WARN("No enough system memory\n"); 1961 def_value: 1962 adev->pm.smu_prv_buffer_size = 0; 1963 } 1964 1965 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 1966 { 1967 if (!(adev->flags & AMD_IS_APU) || 1968 adev->asic_type < CHIP_RAVEN) 1969 return 0; 1970 1971 switch (adev->asic_type) { 1972 case CHIP_RAVEN: 1973 if (adev->pdev->device == 0x15dd) 1974 adev->apu_flags |= AMD_APU_IS_RAVEN; 1975 if (adev->pdev->device == 0x15d8) 1976 adev->apu_flags |= AMD_APU_IS_PICASSO; 1977 break; 1978 case CHIP_RENOIR: 1979 if ((adev->pdev->device == 0x1636) || 1980 (adev->pdev->device == 0x164c)) 1981 adev->apu_flags |= AMD_APU_IS_RENOIR; 1982 else 1983 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 1984 break; 1985 case CHIP_VANGOGH: 1986 adev->apu_flags |= AMD_APU_IS_VANGOGH; 1987 break; 1988 case CHIP_YELLOW_CARP: 1989 break; 1990 case CHIP_CYAN_SKILLFISH: 1991 if ((adev->pdev->device == 0x13FE) || 1992 (adev->pdev->device == 0x143F)) 1993 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 1994 break; 1995 default: 1996 break; 1997 } 1998 1999 return 0; 2000 } 2001 2002 /** 2003 * amdgpu_device_check_arguments - validate module params 2004 * 2005 * @adev: amdgpu_device pointer 2006 * 2007 * Validates certain module parameters and updates 2008 * the associated values used by the driver (all asics). 
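 *
 * Invalid values are never fatal here; they are clamped or reset to sane
 * defaults with a warning.  A few illustrative cases, matching the checks
 * below: amdgpu.sched_jobs=3 is raised to 4, sched_jobs=6 is rounded up to
 * the next power of two (8), and gart_size=16 falls back to -1 (automatic).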
2009 */ 2010 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 2011 { 2012 int i; 2013 2014 if (amdgpu_sched_jobs < 4) { 2015 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 2016 amdgpu_sched_jobs); 2017 amdgpu_sched_jobs = 4; 2018 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 2019 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 2020 amdgpu_sched_jobs); 2021 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 2022 } 2023 2024 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 2025 /* gart size must be greater or equal to 32M */ 2026 dev_warn(adev->dev, "gart size (%d) too small\n", 2027 amdgpu_gart_size); 2028 amdgpu_gart_size = -1; 2029 } 2030 2031 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 2032 /* gtt size must be greater or equal to 32M */ 2033 dev_warn(adev->dev, "gtt size (%d) too small\n", 2034 amdgpu_gtt_size); 2035 amdgpu_gtt_size = -1; 2036 } 2037 2038 /* valid range is between 4 and 9 inclusive */ 2039 if (amdgpu_vm_fragment_size != -1 && 2040 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 2041 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 2042 amdgpu_vm_fragment_size = -1; 2043 } 2044 2045 if (amdgpu_sched_hw_submission < 2) { 2046 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 2047 amdgpu_sched_hw_submission); 2048 amdgpu_sched_hw_submission = 2; 2049 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 2050 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 2051 amdgpu_sched_hw_submission); 2052 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 2053 } 2054 2055 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 2056 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 2057 amdgpu_reset_method = -1; 2058 } 2059 2060 amdgpu_device_check_smu_prv_buffer_size(adev); 2061 2062 amdgpu_device_check_vm_size(adev); 2063 2064 amdgpu_device_check_block_size(adev); 2065 2066 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 2067 2068 for (i = 0; i < MAX_XCP; i++) 2069 adev->enforce_isolation[i] = !!enforce_isolation; 2070 2071 return 0; 2072 } 2073 2074 /** 2075 * amdgpu_switcheroo_set_state - set switcheroo state 2076 * 2077 * @pdev: pci dev pointer 2078 * @state: vga_switcheroo state 2079 * 2080 * Callback for the switcheroo driver. Suspends or resumes 2081 * the asics before or after it is powered up using ACPI methods. 
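 *
 * The two branches below roughly mirror each other (simplified sketch of the
 * call sequence, see the body for the exact order):
 *
 *   ON:  D0 -> load cached PCI state -> pci_enable_device() -> amdgpu_device_resume()
 *   OFF: amdgpu_device_prepare()/amdgpu_device_suspend() -> cache PCI state -> D3cold
 *
 * Devices handled by PX ignore the OFF request here and return early.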
2082 */ 2083 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 2084 enum vga_switcheroo_state state) 2085 { 2086 struct drm_device *dev = pci_get_drvdata(pdev); 2087 int r; 2088 2089 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 2090 return; 2091 2092 if (state == VGA_SWITCHEROO_ON) { 2093 pr_info("switched on\n"); 2094 /* don't suspend or resume card normally */ 2095 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2096 2097 pci_set_power_state(pdev, PCI_D0); 2098 amdgpu_device_load_pci_state(pdev); 2099 r = pci_enable_device(pdev); 2100 if (r) 2101 DRM_WARN("pci_enable_device failed (%d)\n", r); 2102 amdgpu_device_resume(dev, true); 2103 2104 dev->switch_power_state = DRM_SWITCH_POWER_ON; 2105 } else { 2106 pr_info("switched off\n"); 2107 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2108 amdgpu_device_prepare(dev); 2109 amdgpu_device_suspend(dev, true); 2110 amdgpu_device_cache_pci_state(pdev); 2111 /* Shut down the device */ 2112 pci_disable_device(pdev); 2113 pci_set_power_state(pdev, PCI_D3cold); 2114 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 2115 } 2116 } 2117 2118 /** 2119 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 2120 * 2121 * @pdev: pci dev pointer 2122 * 2123 * Callback for the switcheroo driver. Check of the switcheroo 2124 * state can be changed. 2125 * Returns true if the state can be changed, false if not. 2126 */ 2127 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 2128 { 2129 struct drm_device *dev = pci_get_drvdata(pdev); 2130 2131 /* 2132 * FIXME: open_count is protected by drm_global_mutex but that would lead to 2133 * locking inversion with the driver load path. And the access here is 2134 * completely racy anyway. So don't bother with locking for now. 2135 */ 2136 return atomic_read(&dev->open_count) == 0; 2137 } 2138 2139 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 2140 .set_gpu_state = amdgpu_switcheroo_set_state, 2141 .reprobe = NULL, 2142 .can_switch = amdgpu_switcheroo_can_switch, 2143 }; 2144 2145 /** 2146 * amdgpu_device_ip_set_clockgating_state - set the CG state 2147 * 2148 * @dev: amdgpu_device pointer 2149 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2150 * @state: clockgating state (gate or ungate) 2151 * 2152 * Sets the requested clockgating state for all instances of 2153 * the hardware IP specified. 2154 * Returns the error code from the last instance. 2155 */ 2156 int amdgpu_device_ip_set_clockgating_state(void *dev, 2157 enum amd_ip_block_type block_type, 2158 enum amd_clockgating_state state) 2159 { 2160 struct amdgpu_device *adev = dev; 2161 int i, r = 0; 2162 2163 for (i = 0; i < adev->num_ip_blocks; i++) { 2164 if (!adev->ip_blocks[i].status.valid) 2165 continue; 2166 if (adev->ip_blocks[i].version->type != block_type) 2167 continue; 2168 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 2169 continue; 2170 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 2171 (void *)adev, state); 2172 if (r) 2173 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 2174 adev->ip_blocks[i].version->funcs->name, r); 2175 } 2176 return r; 2177 } 2178 2179 /** 2180 * amdgpu_device_ip_set_powergating_state - set the PG state 2181 * 2182 * @dev: amdgpu_device pointer 2183 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2184 * @state: powergating state (gate or ungate) 2185 * 2186 * Sets the requested powergating state for all instances of 2187 * the hardware IP specified. 
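 * Only blocks of the matching @block_type that are currently valid and that
 * implement set_powergating_state are touched.  A representative call, with
 * illustrative arguments:
 *
 *   amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCN,
 *                                          AMD_PG_STATE_GATE);
 *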
2188 * Returns the error code from the last instance. 2189 */ 2190 int amdgpu_device_ip_set_powergating_state(void *dev, 2191 enum amd_ip_block_type block_type, 2192 enum amd_powergating_state state) 2193 { 2194 struct amdgpu_device *adev = dev; 2195 int i, r = 0; 2196 2197 for (i = 0; i < adev->num_ip_blocks; i++) { 2198 if (!adev->ip_blocks[i].status.valid) 2199 continue; 2200 if (adev->ip_blocks[i].version->type != block_type) 2201 continue; 2202 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 2203 continue; 2204 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 2205 (void *)adev, state); 2206 if (r) 2207 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 2208 adev->ip_blocks[i].version->funcs->name, r); 2209 } 2210 return r; 2211 } 2212 2213 /** 2214 * amdgpu_device_ip_get_clockgating_state - get the CG state 2215 * 2216 * @adev: amdgpu_device pointer 2217 * @flags: clockgating feature flags 2218 * 2219 * Walks the list of IPs on the device and updates the clockgating 2220 * flags for each IP. 2221 * Updates @flags with the feature flags for each hardware IP where 2222 * clockgating is enabled. 2223 */ 2224 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 2225 u64 *flags) 2226 { 2227 int i; 2228 2229 for (i = 0; i < adev->num_ip_blocks; i++) { 2230 if (!adev->ip_blocks[i].status.valid) 2231 continue; 2232 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 2233 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 2234 } 2235 } 2236 2237 /** 2238 * amdgpu_device_ip_wait_for_idle - wait for idle 2239 * 2240 * @adev: amdgpu_device pointer 2241 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2242 * 2243 * Waits for the request hardware IP to be idle. 2244 * Returns 0 for success or a negative error code on failure. 2245 */ 2246 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 2247 enum amd_ip_block_type block_type) 2248 { 2249 int i, r; 2250 2251 for (i = 0; i < adev->num_ip_blocks; i++) { 2252 if (!adev->ip_blocks[i].status.valid) 2253 continue; 2254 if (adev->ip_blocks[i].version->type == block_type) { 2255 if (adev->ip_blocks[i].version->funcs->wait_for_idle) { 2256 r = adev->ip_blocks[i].version->funcs->wait_for_idle( 2257 &adev->ip_blocks[i]); 2258 if (r) 2259 return r; 2260 } 2261 break; 2262 } 2263 } 2264 return 0; 2265 2266 } 2267 2268 /** 2269 * amdgpu_device_ip_is_valid - is the hardware IP enabled 2270 * 2271 * @adev: amdgpu_device pointer 2272 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2273 * 2274 * Check if the hardware IP is enable or not. 2275 * Returns true if it the IP is enable, false if not. 2276 */ 2277 bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev, 2278 enum amd_ip_block_type block_type) 2279 { 2280 int i; 2281 2282 for (i = 0; i < adev->num_ip_blocks; i++) { 2283 if (adev->ip_blocks[i].version->type == block_type) 2284 return adev->ip_blocks[i].status.valid; 2285 } 2286 return false; 2287 2288 } 2289 2290 /** 2291 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 2292 * 2293 * @adev: amdgpu_device pointer 2294 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 2295 * 2296 * Returns a pointer to the hardware IP block structure 2297 * if it exists for the asic, otherwise NULL. 
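 *
 * Typical usage, mirroring the GFX probe later in this file; callers that
 * cannot assume the block exists should NULL-check the result:
 *
 *   struct amdgpu_ip_block *ip_block =
 *           amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *
 *   if (ip_block && ip_block->status.valid)
 *           amdgpu_amdkfd_device_probe(adev);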
2298 */ 2299 struct amdgpu_ip_block * 2300 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 2301 enum amd_ip_block_type type) 2302 { 2303 int i; 2304 2305 for (i = 0; i < adev->num_ip_blocks; i++) 2306 if (adev->ip_blocks[i].version->type == type) 2307 return &adev->ip_blocks[i]; 2308 2309 return NULL; 2310 } 2311 2312 /** 2313 * amdgpu_device_ip_block_version_cmp 2314 * 2315 * @adev: amdgpu_device pointer 2316 * @type: enum amd_ip_block_type 2317 * @major: major version 2318 * @minor: minor version 2319 * 2320 * return 0 if equal or greater 2321 * return 1 if smaller or the ip_block doesn't exist 2322 */ 2323 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 2324 enum amd_ip_block_type type, 2325 u32 major, u32 minor) 2326 { 2327 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 2328 2329 if (ip_block && ((ip_block->version->major > major) || 2330 ((ip_block->version->major == major) && 2331 (ip_block->version->minor >= minor)))) 2332 return 0; 2333 2334 return 1; 2335 } 2336 2337 /** 2338 * amdgpu_device_ip_block_add 2339 * 2340 * @adev: amdgpu_device pointer 2341 * @ip_block_version: pointer to the IP to add 2342 * 2343 * Adds the IP block driver information to the collection of IPs 2344 * on the asic. 2345 */ 2346 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 2347 const struct amdgpu_ip_block_version *ip_block_version) 2348 { 2349 if (!ip_block_version) 2350 return -EINVAL; 2351 2352 switch (ip_block_version->type) { 2353 case AMD_IP_BLOCK_TYPE_VCN: 2354 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2355 return 0; 2356 break; 2357 case AMD_IP_BLOCK_TYPE_JPEG: 2358 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2359 return 0; 2360 break; 2361 default: 2362 break; 2363 } 2364 2365 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, 2366 ip_block_version->funcs->name); 2367 2368 adev->ip_blocks[adev->num_ip_blocks].adev = adev; 2369 2370 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2371 2372 return 0; 2373 } 2374 2375 /** 2376 * amdgpu_device_enable_virtual_display - enable virtual display feature 2377 * 2378 * @adev: amdgpu_device pointer 2379 * 2380 * Enabled the virtual display feature if the user has enabled it via 2381 * the module parameter virtual_display. This feature provides a virtual 2382 * display hardware on headless boards or in virtualized environments. 2383 * This function parses and validates the configuration string specified by 2384 * the user and configues the virtual display configuration (number of 2385 * virtual connectors, crtcs, etc.) specified. 
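 *
 * The string is a semicolon separated list of <pci-address>[,<num_crtc>]
 * entries, where "all" matches every device and num_crtc is clamped to the
 * 1-6 range by the parser below.  Illustrative values (the PCI address is
 * an example only):
 *
 *   amdgpu.virtual_display=0000:03:00.0,2
 *   amdgpu.virtual_display=all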
2386 */ 2387 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2388 { 2389 adev->enable_virtual_display = false; 2390 2391 if (amdgpu_virtual_display) { 2392 const char *pci_address_name = pci_name(adev->pdev); 2393 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2394 2395 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2396 pciaddstr_tmp = pciaddstr; 2397 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2398 pciaddname = strsep(&pciaddname_tmp, ","); 2399 if (!strcmp("all", pciaddname) 2400 || !strcmp(pci_address_name, pciaddname)) { 2401 long num_crtc; 2402 int res = -1; 2403 2404 adev->enable_virtual_display = true; 2405 2406 if (pciaddname_tmp) 2407 res = kstrtol(pciaddname_tmp, 10, 2408 &num_crtc); 2409 2410 if (!res) { 2411 if (num_crtc < 1) 2412 num_crtc = 1; 2413 if (num_crtc > 6) 2414 num_crtc = 6; 2415 adev->mode_info.num_crtc = num_crtc; 2416 } else { 2417 adev->mode_info.num_crtc = 1; 2418 } 2419 break; 2420 } 2421 } 2422 2423 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2424 amdgpu_virtual_display, pci_address_name, 2425 adev->enable_virtual_display, adev->mode_info.num_crtc); 2426 2427 kfree(pciaddstr); 2428 } 2429 } 2430 2431 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2432 { 2433 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2434 adev->mode_info.num_crtc = 1; 2435 adev->enable_virtual_display = true; 2436 DRM_INFO("virtual_display:%d, num_crtc:%d\n", 2437 adev->enable_virtual_display, adev->mode_info.num_crtc); 2438 } 2439 } 2440 2441 /** 2442 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2443 * 2444 * @adev: amdgpu_device pointer 2445 * 2446 * Parses the asic configuration parameters specified in the gpu info 2447 * firmware and makes them availale to the driver for use in configuring 2448 * the asic. 2449 * Returns 0 on success, -EINVAL on failure. 2450 */ 2451 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2452 { 2453 const char *chip_name; 2454 int err; 2455 const struct gpu_info_firmware_header_v1_0 *hdr; 2456 2457 adev->firmware.gpu_info_fw = NULL; 2458 2459 if (adev->mman.discovery_bin) 2460 return 0; 2461 2462 switch (adev->asic_type) { 2463 default: 2464 return 0; 2465 case CHIP_VEGA10: 2466 chip_name = "vega10"; 2467 break; 2468 case CHIP_VEGA12: 2469 chip_name = "vega12"; 2470 break; 2471 case CHIP_RAVEN: 2472 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2473 chip_name = "raven2"; 2474 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2475 chip_name = "picasso"; 2476 else 2477 chip_name = "raven"; 2478 break; 2479 case CHIP_ARCTURUS: 2480 chip_name = "arcturus"; 2481 break; 2482 case CHIP_NAVI12: 2483 chip_name = "navi12"; 2484 break; 2485 } 2486 2487 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, 2488 "amdgpu/%s_gpu_info.bin", chip_name); 2489 if (err) { 2490 dev_err(adev->dev, 2491 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n", 2492 chip_name); 2493 goto out; 2494 } 2495 2496 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2497 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2498 2499 switch (hdr->version_major) { 2500 case 1: 2501 { 2502 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2503 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2504 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2505 2506 /* 2507 * Should be droped when DAL no longer needs it. 
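 * For NAVI12 only the SOC bounding box parsed further down is consumed from
 * this firmware; the GC configuration fields below are skipped entirely.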
2508 */ 2509 if (adev->asic_type == CHIP_NAVI12) 2510 goto parse_soc_bounding_box; 2511 2512 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2513 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2514 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2515 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2516 adev->gfx.config.max_texture_channel_caches = 2517 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2518 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2519 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2520 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2521 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2522 adev->gfx.config.double_offchip_lds_buf = 2523 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2524 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2525 adev->gfx.cu_info.max_waves_per_simd = 2526 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2527 adev->gfx.cu_info.max_scratch_slots_per_cu = 2528 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2529 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2530 if (hdr->version_minor >= 1) { 2531 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2532 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2533 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2534 adev->gfx.config.num_sc_per_sh = 2535 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2536 adev->gfx.config.num_packer_per_sc = 2537 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2538 } 2539 2540 parse_soc_bounding_box: 2541 /* 2542 * soc bounding box info is not integrated in disocovery table, 2543 * we always need to parse it from gpu info firmware if needed. 2544 */ 2545 if (hdr->version_minor == 2) { 2546 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2547 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2548 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2549 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2550 } 2551 break; 2552 } 2553 default: 2554 dev_err(adev->dev, 2555 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2556 err = -EINVAL; 2557 goto out; 2558 } 2559 out: 2560 return err; 2561 } 2562 2563 /** 2564 * amdgpu_device_ip_early_init - run early init for hardware IPs 2565 * 2566 * @adev: amdgpu_device pointer 2567 * 2568 * Early initialization pass for hardware IPs. The hardware IPs that make 2569 * up each asic are discovered each IP's early_init callback is run. This 2570 * is the first stage in initializing the asic. 2571 * Returns 0 on success, negative error code on failure. 
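 *
 * In outline, the pass below selects the ASIC family and registers its IP
 * block list (falling back to IP discovery for newer parts), applies the
 * amdgpu.ip_block_mask filter, runs each block's early_init callback, and
 * pulls in the gpu_info firmware and VBIOS once the common block has set up
 * the asic_funcs.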
2572 */ 2573 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2574 { 2575 struct amdgpu_ip_block *ip_block; 2576 struct pci_dev *parent; 2577 int i, r; 2578 bool total; 2579 2580 amdgpu_device_enable_virtual_display(adev); 2581 2582 if (amdgpu_sriov_vf(adev)) { 2583 r = amdgpu_virt_request_full_gpu(adev, true); 2584 if (r) 2585 return r; 2586 } 2587 2588 switch (adev->asic_type) { 2589 #ifdef CONFIG_DRM_AMDGPU_SI 2590 case CHIP_VERDE: 2591 case CHIP_TAHITI: 2592 case CHIP_PITCAIRN: 2593 case CHIP_OLAND: 2594 case CHIP_HAINAN: 2595 adev->family = AMDGPU_FAMILY_SI; 2596 r = si_set_ip_blocks(adev); 2597 if (r) 2598 return r; 2599 break; 2600 #endif 2601 #ifdef CONFIG_DRM_AMDGPU_CIK 2602 case CHIP_BONAIRE: 2603 case CHIP_HAWAII: 2604 case CHIP_KAVERI: 2605 case CHIP_KABINI: 2606 case CHIP_MULLINS: 2607 if (adev->flags & AMD_IS_APU) 2608 adev->family = AMDGPU_FAMILY_KV; 2609 else 2610 adev->family = AMDGPU_FAMILY_CI; 2611 2612 r = cik_set_ip_blocks(adev); 2613 if (r) 2614 return r; 2615 break; 2616 #endif 2617 case CHIP_TOPAZ: 2618 case CHIP_TONGA: 2619 case CHIP_FIJI: 2620 case CHIP_POLARIS10: 2621 case CHIP_POLARIS11: 2622 case CHIP_POLARIS12: 2623 case CHIP_VEGAM: 2624 case CHIP_CARRIZO: 2625 case CHIP_STONEY: 2626 if (adev->flags & AMD_IS_APU) 2627 adev->family = AMDGPU_FAMILY_CZ; 2628 else 2629 adev->family = AMDGPU_FAMILY_VI; 2630 2631 r = vi_set_ip_blocks(adev); 2632 if (r) 2633 return r; 2634 break; 2635 default: 2636 r = amdgpu_discovery_set_ip_blocks(adev); 2637 if (r) 2638 return r; 2639 break; 2640 } 2641 2642 if (amdgpu_has_atpx() && 2643 (amdgpu_is_atpx_hybrid() || 2644 amdgpu_has_atpx_dgpu_power_cntl()) && 2645 ((adev->flags & AMD_IS_APU) == 0) && 2646 !dev_is_removable(&adev->pdev->dev)) 2647 adev->flags |= AMD_IS_PX; 2648 2649 if (!(adev->flags & AMD_IS_APU)) { 2650 parent = pcie_find_root_port(adev->pdev); 2651 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2652 } 2653 2654 2655 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2656 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2657 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2658 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2659 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2660 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2661 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2662 2663 total = true; 2664 for (i = 0; i < adev->num_ip_blocks; i++) { 2665 ip_block = &adev->ip_blocks[i]; 2666 2667 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2668 DRM_WARN("disabled ip block: %d <%s>\n", 2669 i, adev->ip_blocks[i].version->funcs->name); 2670 adev->ip_blocks[i].status.valid = false; 2671 } else if (ip_block->version->funcs->early_init) { 2672 r = ip_block->version->funcs->early_init(ip_block); 2673 if (r == -ENOENT) { 2674 adev->ip_blocks[i].status.valid = false; 2675 } else if (r) { 2676 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2677 adev->ip_blocks[i].version->funcs->name, r); 2678 total = false; 2679 } else { 2680 adev->ip_blocks[i].status.valid = true; 2681 } 2682 } else { 2683 adev->ip_blocks[i].status.valid = true; 2684 } 2685 /* get the vbios after the asic_funcs are set up */ 2686 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2687 r = amdgpu_device_parse_gpu_info_fw(adev); 2688 if (r) 2689 return r; 2690 2691 /* Read BIOS */ 2692 if (amdgpu_device_read_bios(adev)) { 2693 if (!amdgpu_get_bios(adev)) 2694 return -EINVAL; 2695 2696 r = amdgpu_atombios_init(adev); 2697 if (r) { 2698 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2699 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2700 return r; 2701 } 2702 } 2703 2704 /*get pf2vf msg info at it's earliest time*/ 2705 if (amdgpu_sriov_vf(adev)) 2706 amdgpu_virt_init_data_exchange(adev); 2707 2708 } 2709 } 2710 if (!total) 2711 return -ENODEV; 2712 2713 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 2714 if (ip_block->status.valid != false) 2715 amdgpu_amdkfd_device_probe(adev); 2716 2717 adev->cg_flags &= amdgpu_cg_mask; 2718 adev->pg_flags &= amdgpu_pg_mask; 2719 2720 return 0; 2721 } 2722 2723 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2724 { 2725 int i, r; 2726 2727 for (i = 0; i < adev->num_ip_blocks; i++) { 2728 if (!adev->ip_blocks[i].status.sw) 2729 continue; 2730 if (adev->ip_blocks[i].status.hw) 2731 continue; 2732 if (!amdgpu_ip_member_of_hwini( 2733 adev, adev->ip_blocks[i].version->type)) 2734 continue; 2735 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2736 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2737 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2738 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2739 if (r) { 2740 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2741 adev->ip_blocks[i].version->funcs->name, r); 2742 return r; 2743 } 2744 adev->ip_blocks[i].status.hw = true; 2745 } 2746 } 2747 2748 return 0; 2749 } 2750 2751 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2752 { 2753 int i, r; 2754 2755 for (i = 0; i < adev->num_ip_blocks; i++) { 2756 if (!adev->ip_blocks[i].status.sw) 2757 continue; 2758 if (adev->ip_blocks[i].status.hw) 2759 continue; 2760 if (!amdgpu_ip_member_of_hwini( 2761 adev, adev->ip_blocks[i].version->type)) 2762 continue; 2763 r = 
adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2764 if (r) { 2765 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2766 adev->ip_blocks[i].version->funcs->name, r); 2767 return r; 2768 } 2769 adev->ip_blocks[i].status.hw = true; 2770 } 2771 2772 return 0; 2773 } 2774 2775 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2776 { 2777 int r = 0; 2778 int i; 2779 uint32_t smu_version; 2780 2781 if (adev->asic_type >= CHIP_VEGA10) { 2782 for (i = 0; i < adev->num_ip_blocks; i++) { 2783 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2784 continue; 2785 2786 if (!amdgpu_ip_member_of_hwini(adev, 2787 AMD_IP_BLOCK_TYPE_PSP)) 2788 break; 2789 2790 if (!adev->ip_blocks[i].status.sw) 2791 continue; 2792 2793 /* no need to do the fw loading again if already done*/ 2794 if (adev->ip_blocks[i].status.hw == true) 2795 break; 2796 2797 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2798 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 2799 if (r) 2800 return r; 2801 } else { 2802 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2803 if (r) { 2804 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2805 adev->ip_blocks[i].version->funcs->name, r); 2806 return r; 2807 } 2808 adev->ip_blocks[i].status.hw = true; 2809 } 2810 break; 2811 } 2812 } 2813 2814 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2815 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2816 2817 return r; 2818 } 2819 2820 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2821 { 2822 long timeout; 2823 int r, i; 2824 2825 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2826 struct amdgpu_ring *ring = adev->rings[i]; 2827 2828 /* No need to setup the GPU scheduler for rings that don't need it */ 2829 if (!ring || ring->no_scheduler) 2830 continue; 2831 2832 switch (ring->funcs->type) { 2833 case AMDGPU_RING_TYPE_GFX: 2834 timeout = adev->gfx_timeout; 2835 break; 2836 case AMDGPU_RING_TYPE_COMPUTE: 2837 timeout = adev->compute_timeout; 2838 break; 2839 case AMDGPU_RING_TYPE_SDMA: 2840 timeout = adev->sdma_timeout; 2841 break; 2842 default: 2843 timeout = adev->video_timeout; 2844 break; 2845 } 2846 2847 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL, 2848 DRM_SCHED_PRIORITY_COUNT, 2849 ring->num_hw_submission, 0, 2850 timeout, adev->reset_domain->wq, 2851 ring->sched_score, ring->name, 2852 adev->dev); 2853 if (r) { 2854 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2855 ring->name); 2856 return r; 2857 } 2858 r = amdgpu_uvd_entity_init(adev, ring); 2859 if (r) { 2860 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n", 2861 ring->name); 2862 return r; 2863 } 2864 r = amdgpu_vce_entity_init(adev, ring); 2865 if (r) { 2866 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n", 2867 ring->name); 2868 return r; 2869 } 2870 } 2871 2872 amdgpu_xcp_update_partition_sched_list(adev); 2873 2874 return 0; 2875 } 2876 2877 2878 /** 2879 * amdgpu_device_ip_init - run init for hardware IPs 2880 * 2881 * @adev: amdgpu_device pointer 2882 * 2883 * Main initialization pass for hardware IPs. The list of all the hardware 2884 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2885 * are run. sw_init initializes the software state associated with each IP 2886 * and hw_init initializes the hardware associated with each IP. 2887 * Returns 0 on success, negative error code on failure. 
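 *
 * Hardware bring-up is deliberately staggered: sw_init runs for every block
 * first, COMMON and GMC are hw-initialized inline so GPU memory can be
 * allocated early, and the remaining blocks follow via hw_init_phase1
 * (IH, and PSP for SR-IOV), firmware loading and finally hw_init_phase2.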
2888 */ 2889 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2890 { 2891 bool init_badpage; 2892 int i, r; 2893 2894 r = amdgpu_ras_init(adev); 2895 if (r) 2896 return r; 2897 2898 for (i = 0; i < adev->num_ip_blocks; i++) { 2899 if (!adev->ip_blocks[i].status.valid) 2900 continue; 2901 if (adev->ip_blocks[i].version->funcs->sw_init) { 2902 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]); 2903 if (r) { 2904 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2905 adev->ip_blocks[i].version->funcs->name, r); 2906 goto init_failed; 2907 } 2908 } 2909 adev->ip_blocks[i].status.sw = true; 2910 2911 if (!amdgpu_ip_member_of_hwini( 2912 adev, adev->ip_blocks[i].version->type)) 2913 continue; 2914 2915 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2916 /* need to do common hw init early so everything is set up for gmc */ 2917 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2918 if (r) { 2919 DRM_ERROR("hw_init %d failed %d\n", i, r); 2920 goto init_failed; 2921 } 2922 adev->ip_blocks[i].status.hw = true; 2923 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2924 /* need to do gmc hw init early so we can allocate gpu mem */ 2925 /* Try to reserve bad pages early */ 2926 if (amdgpu_sriov_vf(adev)) 2927 amdgpu_virt_exchange_data(adev); 2928 2929 r = amdgpu_device_mem_scratch_init(adev); 2930 if (r) { 2931 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2932 goto init_failed; 2933 } 2934 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2935 if (r) { 2936 DRM_ERROR("hw_init %d failed %d\n", i, r); 2937 goto init_failed; 2938 } 2939 r = amdgpu_device_wb_init(adev); 2940 if (r) { 2941 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2942 goto init_failed; 2943 } 2944 adev->ip_blocks[i].status.hw = true; 2945 2946 /* right after GMC hw init, we create CSA */ 2947 if (adev->gfx.mcbp) { 2948 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2949 AMDGPU_GEM_DOMAIN_VRAM | 2950 AMDGPU_GEM_DOMAIN_GTT, 2951 AMDGPU_CSA_SIZE); 2952 if (r) { 2953 DRM_ERROR("allocate CSA failed %d\n", r); 2954 goto init_failed; 2955 } 2956 } 2957 2958 r = amdgpu_seq64_init(adev); 2959 if (r) { 2960 DRM_ERROR("allocate seq64 failed %d\n", r); 2961 goto init_failed; 2962 } 2963 } 2964 } 2965 2966 if (amdgpu_sriov_vf(adev)) 2967 amdgpu_virt_init_data_exchange(adev); 2968 2969 r = amdgpu_ib_pool_init(adev); 2970 if (r) { 2971 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2972 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2973 goto init_failed; 2974 } 2975 2976 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2977 if (r) 2978 goto init_failed; 2979 2980 r = amdgpu_device_ip_hw_init_phase1(adev); 2981 if (r) 2982 goto init_failed; 2983 2984 r = amdgpu_device_fw_loading(adev); 2985 if (r) 2986 goto init_failed; 2987 2988 r = amdgpu_device_ip_hw_init_phase2(adev); 2989 if (r) 2990 goto init_failed; 2991 2992 /* 2993 * retired pages will be loaded from eeprom and reserved here, 2994 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2995 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2996 * for I2C communication which only true at this point. 2997 * 2998 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2999 * failure from bad gpu situation and stop amdgpu init process 3000 * accordingly. 
For other failed cases, it will still release all 3001 * the resource and print error message, rather than returning one 3002 * negative value to upper level. 3003 * 3004 * Note: theoretically, this should be called before all vram allocations 3005 * to protect retired page from abusing 3006 */ 3007 init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 3008 r = amdgpu_ras_recovery_init(adev, init_badpage); 3009 if (r) 3010 goto init_failed; 3011 3012 /** 3013 * In case of XGMI grab extra reference for reset domain for this device 3014 */ 3015 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3016 if (amdgpu_xgmi_add_device(adev) == 0) { 3017 if (!amdgpu_sriov_vf(adev)) { 3018 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3019 3020 if (WARN_ON(!hive)) { 3021 r = -ENOENT; 3022 goto init_failed; 3023 } 3024 3025 if (!hive->reset_domain || 3026 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 3027 r = -ENOENT; 3028 amdgpu_put_xgmi_hive(hive); 3029 goto init_failed; 3030 } 3031 3032 /* Drop the early temporary reset domain we created for device */ 3033 amdgpu_reset_put_reset_domain(adev->reset_domain); 3034 adev->reset_domain = hive->reset_domain; 3035 amdgpu_put_xgmi_hive(hive); 3036 } 3037 } 3038 } 3039 3040 r = amdgpu_device_init_schedulers(adev); 3041 if (r) 3042 goto init_failed; 3043 3044 if (adev->mman.buffer_funcs_ring->sched.ready) 3045 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3046 3047 /* Don't init kfd if whole hive need to be reset during init */ 3048 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 3049 kgd2kfd_init_zone_device(adev); 3050 amdgpu_amdkfd_device_init(adev); 3051 } 3052 3053 amdgpu_fru_get_product_info(adev); 3054 3055 init_failed: 3056 3057 return r; 3058 } 3059 3060 /** 3061 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 3062 * 3063 * @adev: amdgpu_device pointer 3064 * 3065 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 3066 * this function before a GPU reset. If the value is retained after a 3067 * GPU reset, VRAM has not been lost. Some GPU resets may destry VRAM contents. 3068 */ 3069 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 3070 { 3071 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 3072 } 3073 3074 /** 3075 * amdgpu_device_check_vram_lost - check if vram is valid 3076 * 3077 * @adev: amdgpu_device pointer 3078 * 3079 * Checks the reset magic value written to the gart pointer in VRAM. 3080 * The driver calls this after a GPU reset to see if the contents of 3081 * VRAM is lost or now. 3082 * returns true if vram is lost, false if not. 3083 */ 3084 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 3085 { 3086 if (memcmp(adev->gart.ptr, adev->reset_magic, 3087 AMDGPU_RESET_MAGIC_NUM)) 3088 return true; 3089 3090 if (!amdgpu_in_reset(adev)) 3091 return false; 3092 3093 /* 3094 * For all ASICs with baco/mode1 reset, the VRAM is 3095 * always assumed to be lost. 3096 */ 3097 switch (amdgpu_asic_reset_method(adev)) { 3098 case AMD_RESET_METHOD_BACO: 3099 case AMD_RESET_METHOD_MODE1: 3100 return true; 3101 default: 3102 return false; 3103 } 3104 } 3105 3106 /** 3107 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 3108 * 3109 * @adev: amdgpu_device pointer 3110 * @state: clockgating state (gate or ungate) 3111 * 3112 * The list of all the hardware IPs that make up the asic is walked and the 3113 * set_clockgating_state callbacks are run. 
3114 * Late initialization pass enabling clockgating for hardware IPs. 3115 * Fini or suspend, pass disabling clockgating for hardware IPs. 3116 * Returns 0 on success, negative error code on failure. 3117 */ 3118 3119 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 3120 enum amd_clockgating_state state) 3121 { 3122 int i, j, r; 3123 3124 if (amdgpu_emu_mode == 1) 3125 return 0; 3126 3127 for (j = 0; j < adev->num_ip_blocks; j++) { 3128 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3129 if (!adev->ip_blocks[i].status.late_initialized) 3130 continue; 3131 /* skip CG for GFX, SDMA on S0ix */ 3132 if (adev->in_s0ix && 3133 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3134 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3135 continue; 3136 /* skip CG for VCE/UVD, it's handled specially */ 3137 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3138 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3139 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3140 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3141 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 3142 /* enable clockgating to save power */ 3143 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 3144 state); 3145 if (r) { 3146 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 3147 adev->ip_blocks[i].version->funcs->name, r); 3148 return r; 3149 } 3150 } 3151 } 3152 3153 return 0; 3154 } 3155 3156 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 3157 enum amd_powergating_state state) 3158 { 3159 int i, j, r; 3160 3161 if (amdgpu_emu_mode == 1) 3162 return 0; 3163 3164 for (j = 0; j < adev->num_ip_blocks; j++) { 3165 i = state == AMD_PG_STATE_GATE ? 
j : adev->num_ip_blocks - j - 1; 3166 if (!adev->ip_blocks[i].status.late_initialized) 3167 continue; 3168 /* skip PG for GFX, SDMA on S0ix */ 3169 if (adev->in_s0ix && 3170 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3171 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3172 continue; 3173 /* skip CG for VCE/UVD, it's handled specially */ 3174 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3175 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3176 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3177 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3178 adev->ip_blocks[i].version->funcs->set_powergating_state) { 3179 /* enable powergating to save power */ 3180 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 3181 state); 3182 if (r) { 3183 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 3184 adev->ip_blocks[i].version->funcs->name, r); 3185 return r; 3186 } 3187 } 3188 } 3189 return 0; 3190 } 3191 3192 static int amdgpu_device_enable_mgpu_fan_boost(void) 3193 { 3194 struct amdgpu_gpu_instance *gpu_ins; 3195 struct amdgpu_device *adev; 3196 int i, ret = 0; 3197 3198 mutex_lock(&mgpu_info.mutex); 3199 3200 /* 3201 * MGPU fan boost feature should be enabled 3202 * only when there are two or more dGPUs in 3203 * the system 3204 */ 3205 if (mgpu_info.num_dgpu < 2) 3206 goto out; 3207 3208 for (i = 0; i < mgpu_info.num_dgpu; i++) { 3209 gpu_ins = &(mgpu_info.gpu_ins[i]); 3210 adev = gpu_ins->adev; 3211 if (!(adev->flags & AMD_IS_APU) && 3212 !gpu_ins->mgpu_fan_enabled) { 3213 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 3214 if (ret) 3215 break; 3216 3217 gpu_ins->mgpu_fan_enabled = 1; 3218 } 3219 } 3220 3221 out: 3222 mutex_unlock(&mgpu_info.mutex); 3223 3224 return ret; 3225 } 3226 3227 /** 3228 * amdgpu_device_ip_late_init - run late init for hardware IPs 3229 * 3230 * @adev: amdgpu_device pointer 3231 * 3232 * Late initialization pass for hardware IPs. The list of all the hardware 3233 * IPs that make up the asic is walked and the late_init callbacks are run. 3234 * late_init covers any special initialization that an IP requires 3235 * after all of the have been initialized or something that needs to happen 3236 * late in the init process. 3237 * Returns 0 on success, negative error code on failure. 
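 *
 * Beyond the per-IP callbacks, this is also where device-wide late setup
 * lives: RAS late init, entering the default CG/PG gated state, recording
 * the reset magic, mGPU fan boost and the XGMI p-state handling shown below.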
3238 */ 3239 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 3240 { 3241 struct amdgpu_gpu_instance *gpu_instance; 3242 int i = 0, r; 3243 3244 for (i = 0; i < adev->num_ip_blocks; i++) { 3245 if (!adev->ip_blocks[i].status.hw) 3246 continue; 3247 if (adev->ip_blocks[i].version->funcs->late_init) { 3248 r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]); 3249 if (r) { 3250 DRM_ERROR("late_init of IP block <%s> failed %d\n", 3251 adev->ip_blocks[i].version->funcs->name, r); 3252 return r; 3253 } 3254 } 3255 adev->ip_blocks[i].status.late_initialized = true; 3256 } 3257 3258 r = amdgpu_ras_late_init(adev); 3259 if (r) { 3260 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 3261 return r; 3262 } 3263 3264 if (!amdgpu_reset_in_recovery(adev)) 3265 amdgpu_ras_set_error_query_ready(adev, true); 3266 3267 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3268 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3269 3270 amdgpu_device_fill_reset_magic(adev); 3271 3272 r = amdgpu_device_enable_mgpu_fan_boost(); 3273 if (r) 3274 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 3275 3276 /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */ 3277 if (amdgpu_passthrough(adev) && 3278 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 3279 adev->asic_type == CHIP_ALDEBARAN)) 3280 amdgpu_dpm_handle_passthrough_sbr(adev, true); 3281 3282 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3283 mutex_lock(&mgpu_info.mutex); 3284 3285 /* 3286 * Reset device p-state to low as this was booted with high. 3287 * 3288 * This should be performed only after all devices from the same 3289 * hive get initialized. 3290 * 3291 * However, it's unknown how many device in the hive in advance. 3292 * As this is counted one by one during devices initializations. 3293 * 3294 * So, we wait for all XGMI interlinked devices initialized. 3295 * This may bring some delays as those devices may come from 3296 * different hives. But that should be OK. 
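 * (Hence the mgpu_info.num_dgpu == xgmi.num_physical_nodes comparison right
 * below.)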
3297 */ 3298 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3299 for (i = 0; i < mgpu_info.num_gpu; i++) { 3300 gpu_instance = &(mgpu_info.gpu_ins[i]); 3301 if (gpu_instance->adev->flags & AMD_IS_APU) 3302 continue; 3303 3304 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3305 AMDGPU_XGMI_PSTATE_MIN); 3306 if (r) { 3307 DRM_ERROR("pstate setting failed (%d).\n", r); 3308 break; 3309 } 3310 } 3311 } 3312 3313 mutex_unlock(&mgpu_info.mutex); 3314 } 3315 3316 return 0; 3317 } 3318 3319 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block) 3320 { 3321 int r; 3322 3323 if (!ip_block->version->funcs->hw_fini) { 3324 DRM_ERROR("hw_fini of IP block <%s> not defined\n", 3325 ip_block->version->funcs->name); 3326 } else { 3327 r = ip_block->version->funcs->hw_fini(ip_block); 3328 /* XXX handle errors */ 3329 if (r) { 3330 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3331 ip_block->version->funcs->name, r); 3332 } 3333 } 3334 3335 ip_block->status.hw = false; 3336 } 3337 3338 /** 3339 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3340 * 3341 * @adev: amdgpu_device pointer 3342 * 3343 * For ASICs need to disable SMC first 3344 */ 3345 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3346 { 3347 int i; 3348 3349 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3350 return; 3351 3352 for (i = 0; i < adev->num_ip_blocks; i++) { 3353 if (!adev->ip_blocks[i].status.hw) 3354 continue; 3355 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3356 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3357 break; 3358 } 3359 } 3360 } 3361 3362 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3363 { 3364 int i, r; 3365 3366 for (i = 0; i < adev->num_ip_blocks; i++) { 3367 if (!adev->ip_blocks[i].version->funcs->early_fini) 3368 continue; 3369 3370 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]); 3371 if (r) { 3372 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 3373 adev->ip_blocks[i].version->funcs->name, r); 3374 } 3375 } 3376 3377 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3378 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3379 3380 amdgpu_amdkfd_suspend(adev, false); 3381 3382 /* Workaroud for ASICs need to disable SMC first */ 3383 amdgpu_device_smu_fini_early(adev); 3384 3385 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3386 if (!adev->ip_blocks[i].status.hw) 3387 continue; 3388 3389 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3390 } 3391 3392 if (amdgpu_sriov_vf(adev)) { 3393 if (amdgpu_virt_release_full_gpu(adev, false)) 3394 DRM_ERROR("failed to release exclusive mode on fini\n"); 3395 } 3396 3397 return 0; 3398 } 3399 3400 /** 3401 * amdgpu_device_ip_fini - run fini for hardware IPs 3402 * 3403 * @adev: amdgpu_device pointer 3404 * 3405 * Main teardown pass for hardware IPs. The list of all the hardware 3406 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3407 * are run. hw_fini tears down the hardware associated with each IP 3408 * and sw_fini tears down any software state associated with each IP. 3409 * Returns 0 on success, negative error code on failure. 
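 *
 * Teardown runs in reverse IP order and in two sweeps: sw_fini for every
 * block that completed sw_init (the GMC pass also releasing shared helpers
 * such as the IB pool and scratch memory), then late_fini for every block
 * that reached late_initialized.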
3410 */ 3411 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3412 { 3413 int i, r; 3414 3415 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3416 amdgpu_virt_release_ras_err_handler_data(adev); 3417 3418 if (adev->gmc.xgmi.num_physical_nodes > 1) 3419 amdgpu_xgmi_remove_device(adev); 3420 3421 amdgpu_amdkfd_device_fini_sw(adev); 3422 3423 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3424 if (!adev->ip_blocks[i].status.sw) 3425 continue; 3426 3427 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3428 amdgpu_ucode_free_bo(adev); 3429 amdgpu_free_static_csa(&adev->virt.csa_obj); 3430 amdgpu_device_wb_fini(adev); 3431 amdgpu_device_mem_scratch_fini(adev); 3432 amdgpu_ib_pool_fini(adev); 3433 amdgpu_seq64_fini(adev); 3434 } 3435 if (adev->ip_blocks[i].version->funcs->sw_fini) { 3436 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]); 3437 /* XXX handle errors */ 3438 if (r) { 3439 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 3440 adev->ip_blocks[i].version->funcs->name, r); 3441 } 3442 } 3443 adev->ip_blocks[i].status.sw = false; 3444 adev->ip_blocks[i].status.valid = false; 3445 } 3446 3447 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3448 if (!adev->ip_blocks[i].status.late_initialized) 3449 continue; 3450 if (adev->ip_blocks[i].version->funcs->late_fini) 3451 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]); 3452 adev->ip_blocks[i].status.late_initialized = false; 3453 } 3454 3455 amdgpu_ras_fini(adev); 3456 3457 return 0; 3458 } 3459 3460 /** 3461 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3462 * 3463 * @work: work_struct. 3464 */ 3465 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3466 { 3467 struct amdgpu_device *adev = 3468 container_of(work, struct amdgpu_device, delayed_init_work.work); 3469 int r; 3470 3471 r = amdgpu_ib_ring_tests(adev); 3472 if (r) 3473 DRM_ERROR("ib ring test failed (%d).\n", r); 3474 } 3475 3476 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3477 { 3478 struct amdgpu_device *adev = 3479 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3480 3481 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3482 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3483 3484 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 3485 adev->gfx.gfx_off_state = true; 3486 } 3487 3488 /** 3489 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3490 * 3491 * @adev: amdgpu_device pointer 3492 * 3493 * Main suspend function for hardware IPs. The list of all the hardware 3494 * IPs that make up the asic is walked, clockgating is disabled and the 3495 * suspend callbacks are run. suspend puts the hardware and software state 3496 * in each IP into a state suitable for suspend. 3497 * Returns 0 on success, negative error code on failure. 3498 */ 3499 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3500 { 3501 int i, r; 3502 3503 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3504 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3505 3506 /* 3507 * Per PMFW team's suggestion, driver needs to handle gfxoff 3508 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3509 * scenario. Add the missing df cstate disablement here. 
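 * Failing to disallow DF C-states below is only logged with a warning; it is
 * deliberately not treated as fatal for the suspend sequence.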
3510 */ 3511 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3512 dev_warn(adev->dev, "Failed to disallow df cstate"); 3513 3514 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3515 if (!adev->ip_blocks[i].status.valid) 3516 continue; 3517 3518 /* displays are handled separately */ 3519 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3520 continue; 3521 3522 /* XXX handle errors */ 3523 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3524 if (r) 3525 return r; 3526 } 3527 3528 return 0; 3529 } 3530 3531 /** 3532 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3533 * 3534 * @adev: amdgpu_device pointer 3535 * 3536 * Main suspend function for hardware IPs. The list of all the hardware 3537 * IPs that make up the asic is walked, clockgating is disabled and the 3538 * suspend callbacks are run. suspend puts the hardware and software state 3539 * in each IP into a state suitable for suspend. 3540 * Returns 0 on success, negative error code on failure. 3541 */ 3542 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3543 { 3544 int i, r; 3545 3546 if (adev->in_s0ix) 3547 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3548 3549 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3550 if (!adev->ip_blocks[i].status.valid) 3551 continue; 3552 /* displays are handled in phase1 */ 3553 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3554 continue; 3555 /* PSP lost connection when err_event_athub occurs */ 3556 if (amdgpu_ras_intr_triggered() && 3557 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3558 adev->ip_blocks[i].status.hw = false; 3559 continue; 3560 } 3561 3562 /* skip unnecessary suspend if we do not initialize them yet */ 3563 if (!amdgpu_ip_member_of_hwini( 3564 adev, adev->ip_blocks[i].version->type)) 3565 continue; 3566 3567 /* skip suspend of gfx/mes and psp for S0ix 3568 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3569 * like at runtime. PSP is also part of the always on hardware 3570 * so no need to suspend it. 3571 */ 3572 if (adev->in_s0ix && 3573 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3574 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3575 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3576 continue; 3577 3578 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3579 if (adev->in_s0ix && 3580 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3581 IP_VERSION(5, 0, 0)) && 3582 (adev->ip_blocks[i].version->type == 3583 AMD_IP_BLOCK_TYPE_SDMA)) 3584 continue; 3585 3586 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3587 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3588 * from this location and RLC Autoload automatically also gets loaded 3589 * from here based on PMFW -> PSP message during re-init sequence. 3590 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3591 * the TMR and reload FWs again for IMU enabled APU ASICs. 
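 * (That is what the amdgpu_in_reset() && AMD_IS_APU && gfx.imu.funcs check on
 * the PSP block right below implements.)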
3592 */ 3593 if (amdgpu_in_reset(adev) && 3594 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3595 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3596 continue; 3597 3598 /* XXX handle errors */ 3599 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3600 adev->ip_blocks[i].status.hw = false; 3601 3602 /* handle putting the SMC in the appropriate state */ 3603 if (!amdgpu_sriov_vf(adev)) { 3604 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3605 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3606 if (r) { 3607 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3608 adev->mp1_state, r); 3609 return r; 3610 } 3611 } 3612 } 3613 } 3614 3615 return 0; 3616 } 3617 3618 /** 3619 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3620 * 3621 * @adev: amdgpu_device pointer 3622 * 3623 * Main suspend function for hardware IPs. The list of all the hardware 3624 * IPs that make up the asic is walked, clockgating is disabled and the 3625 * suspend callbacks are run. suspend puts the hardware and software state 3626 * in each IP into a state suitable for suspend. 3627 * Returns 0 on success, negative error code on failure. 3628 */ 3629 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3630 { 3631 int r; 3632 3633 if (amdgpu_sriov_vf(adev)) { 3634 amdgpu_virt_fini_data_exchange(adev); 3635 amdgpu_virt_request_full_gpu(adev, false); 3636 } 3637 3638 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3639 3640 r = amdgpu_device_ip_suspend_phase1(adev); 3641 if (r) 3642 return r; 3643 r = amdgpu_device_ip_suspend_phase2(adev); 3644 3645 if (amdgpu_sriov_vf(adev)) 3646 amdgpu_virt_release_full_gpu(adev, false); 3647 3648 return r; 3649 } 3650 3651 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3652 { 3653 int i, r; 3654 3655 static enum amd_ip_block_type ip_order[] = { 3656 AMD_IP_BLOCK_TYPE_COMMON, 3657 AMD_IP_BLOCK_TYPE_GMC, 3658 AMD_IP_BLOCK_TYPE_PSP, 3659 AMD_IP_BLOCK_TYPE_IH, 3660 }; 3661 3662 for (i = 0; i < adev->num_ip_blocks; i++) { 3663 int j; 3664 struct amdgpu_ip_block *block; 3665 3666 block = &adev->ip_blocks[i]; 3667 block->status.hw = false; 3668 3669 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3670 3671 if (block->version->type != ip_order[j] || 3672 !block->status.valid) 3673 continue; 3674 3675 r = block->version->funcs->hw_init(&adev->ip_blocks[i]); 3676 if (r) { 3677 dev_err(adev->dev, "RE-INIT-early: %s failed\n", 3678 block->version->funcs->name); 3679 return r; 3680 } 3681 block->status.hw = true; 3682 } 3683 } 3684 3685 return 0; 3686 } 3687 3688 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3689 { 3690 struct amdgpu_ip_block *block; 3691 int i, r = 0; 3692 3693 static enum amd_ip_block_type ip_order[] = { 3694 AMD_IP_BLOCK_TYPE_SMC, 3695 AMD_IP_BLOCK_TYPE_DCE, 3696 AMD_IP_BLOCK_TYPE_GFX, 3697 AMD_IP_BLOCK_TYPE_SDMA, 3698 AMD_IP_BLOCK_TYPE_MES, 3699 AMD_IP_BLOCK_TYPE_UVD, 3700 AMD_IP_BLOCK_TYPE_VCE, 3701 AMD_IP_BLOCK_TYPE_VCN, 3702 AMD_IP_BLOCK_TYPE_JPEG 3703 }; 3704 3705 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3706 block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]); 3707 3708 if (!block) 3709 continue; 3710 3711 if (block->status.valid && !block->status.hw) { 3712 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) { 3713 r = amdgpu_ip_block_resume(block); 3714 } else { 3715 r = block->version->funcs->hw_init(block); 3716 } 3717 3718 if (r) { 3719 dev_err(adev->dev, "RE-INIT-late: %s failed\n", 3720 block->version->funcs->name); 3721 break; 3722 } 3723 
block->status.hw = true; 3724 } 3725 } 3726 3727 return r; 3728 } 3729 3730 /** 3731 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3732 * 3733 * @adev: amdgpu_device pointer 3734 * 3735 * First resume function for hardware IPs. The list of all the hardware 3736 * IPs that make up the asic is walked and the resume callbacks are run for 3737 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3738 * after a suspend and updates the software state as necessary. This 3739 * function is also used for restoring the GPU after a GPU reset. 3740 * Returns 0 on success, negative error code on failure. 3741 */ 3742 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3743 { 3744 int i, r; 3745 3746 for (i = 0; i < adev->num_ip_blocks; i++) { 3747 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3748 continue; 3749 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3750 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3751 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3752 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3753 3754 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3755 if (r) 3756 return r; 3757 } 3758 } 3759 3760 return 0; 3761 } 3762 3763 /** 3764 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3765 * 3766 * @adev: amdgpu_device pointer 3767 * 3768 * Second resume function for hardware IPs. The list of all the hardware 3769 * IPs that make up the asic is walked and the resume callbacks are run for 3770 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3771 * functional state after a suspend and updates the software state as 3772 * necessary. This function is also used for restoring the GPU after a GPU 3773 * reset. 3774 * Returns 0 on success, negative error code on failure. 3775 */ 3776 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3777 { 3778 int i, r; 3779 3780 for (i = 0; i < adev->num_ip_blocks; i++) { 3781 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3782 continue; 3783 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3784 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3785 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3786 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE || 3787 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3788 continue; 3789 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3790 if (r) 3791 return r; 3792 } 3793 3794 return 0; 3795 } 3796 3797 /** 3798 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs 3799 * 3800 * @adev: amdgpu_device pointer 3801 * 3802 * Third resume function for hardware IPs. The list of all the hardware 3803 * IPs that make up the asic is walked and the resume callbacks are run for 3804 * all DCE. resume puts the hardware into a functional state after a suspend 3805 * and updates the software state as necessary. This function is also used 3806 * for restoring the GPU after a GPU reset. 3807 * 3808 * Returns 0 on success, negative error code on failure. 
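 *
 * Keeping DCE in its own phase lets amdgpu_device_ip_resume() restart the
 * fence driver and the TTM buffer functions before any display block is
 * resumed; see the ordering in that function below.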
3809 */ 3810 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev) 3811 { 3812 int i, r; 3813 3814 for (i = 0; i < adev->num_ip_blocks; i++) { 3815 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3816 continue; 3817 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) { 3818 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3819 if (r) 3820 return r; 3821 } 3822 } 3823 3824 return 0; 3825 } 3826 3827 /** 3828 * amdgpu_device_ip_resume - run resume for hardware IPs 3829 * 3830 * @adev: amdgpu_device pointer 3831 * 3832 * Main resume function for hardware IPs. The hardware IPs 3833 * are split into several resume phases because they are 3834 * also used in recovering from a GPU reset and some additional 3835 * steps need to be taken between them. In this case (S3/S4) they are 3836 * run sequentially. 3837 * Returns 0 on success, negative error code on failure. 3838 */ 3839 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3840 { 3841 int r; 3842 3843 r = amdgpu_device_ip_resume_phase1(adev); 3844 if (r) 3845 return r; 3846 3847 r = amdgpu_device_fw_loading(adev); 3848 if (r) 3849 return r; 3850 3851 r = amdgpu_device_ip_resume_phase2(adev); 3852 3853 if (adev->mman.buffer_funcs_ring->sched.ready) 3854 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3855 3856 if (r) 3857 return r; 3858 3859 amdgpu_fence_driver_hw_init(adev); 3860 3861 r = amdgpu_device_ip_resume_phase3(adev); 3862 3863 return r; 3864 } 3865 3866 /** 3867 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3868 * 3869 * @adev: amdgpu_device pointer 3870 * 3871 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3872 */ 3873 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3874 { 3875 if (amdgpu_sriov_vf(adev)) { 3876 if (adev->is_atom_fw) { 3877 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3878 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3879 } else { 3880 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3881 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3882 } 3883 3884 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3885 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3886 } 3887 } 3888 3889 /** 3890 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3891 * 3892 * @asic_type: AMD asic type 3893 * 3894 * Check if there is DC (new modesetting infrastructure) support for an asic. 3895 * Returns true if DC has support, false if not. 3896 */ 3897 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3898 { 3899 switch (asic_type) { 3900 #ifdef CONFIG_DRM_AMDGPU_SI 3901 case CHIP_HAINAN: 3902 #endif 3903 case CHIP_TOPAZ: 3904 /* chips with no display hardware */ 3905 return false; 3906 #if defined(CONFIG_DRM_AMD_DC) 3907 case CHIP_TAHITI: 3908 case CHIP_PITCAIRN: 3909 case CHIP_VERDE: 3910 case CHIP_OLAND: 3911 /* 3912 * We have systems in the wild with these ASICs that require 3913 * LVDS and VGA support which is not supported with DC. 3914 * 3915 * Fallback to the non-DC driver here by default so as not to 3916 * cause regressions. 3917 */ 3918 #if defined(CONFIG_DRM_AMD_DC_SI) 3919 return amdgpu_dc > 0; 3920 #else 3921 return false; 3922 #endif 3923 case CHIP_BONAIRE: 3924 case CHIP_KAVERI: 3925 case CHIP_KABINI: 3926 case CHIP_MULLINS: 3927 /* 3928 * We have systems in the wild with these ASICs that require 3929 * VGA support which is not supported with DC.
3930 * 3931 * Fallback to the non-DC driver here by default so as not to 3932 * cause regressions. 3933 */ 3934 return amdgpu_dc > 0; 3935 default: 3936 return amdgpu_dc != 0; 3937 #else 3938 default: 3939 if (amdgpu_dc > 0) 3940 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 3941 return false; 3942 #endif 3943 } 3944 } 3945 3946 /** 3947 * amdgpu_device_has_dc_support - check if dc is supported 3948 * 3949 * @adev: amdgpu_device pointer 3950 * 3951 * Returns true for supported, false for not supported 3952 */ 3953 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3954 { 3955 if (adev->enable_virtual_display || 3956 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3957 return false; 3958 3959 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3960 } 3961 3962 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3963 { 3964 struct amdgpu_device *adev = 3965 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3966 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3967 3968 /* It's a bug to not have a hive within this function */ 3969 if (WARN_ON(!hive)) 3970 return; 3971 3972 /* 3973 * Use task barrier to synchronize all xgmi reset works across the 3974 * hive. task_barrier_enter and task_barrier_exit will block 3975 * until all the threads running the xgmi reset works reach 3976 * those points. task_barrier_full will do both blocks. 3977 */ 3978 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3979 3980 task_barrier_enter(&hive->tb); 3981 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3982 3983 if (adev->asic_reset_res) 3984 goto fail; 3985 3986 task_barrier_exit(&hive->tb); 3987 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3988 3989 if (adev->asic_reset_res) 3990 goto fail; 3991 3992 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 3993 } else { 3994 3995 task_barrier_full(&hive->tb); 3996 adev->asic_reset_res = amdgpu_asic_reset(adev); 3997 } 3998 3999 fail: 4000 if (adev->asic_reset_res) 4001 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 4002 adev->asic_reset_res, adev_to_drm(adev)->unique); 4003 amdgpu_put_xgmi_hive(hive); 4004 } 4005 4006 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 4007 { 4008 char *input = amdgpu_lockup_timeout; 4009 char *timeout_setting = NULL; 4010 int index = 0; 4011 long timeout; 4012 int ret = 0; 4013 4014 /* 4015 * By default timeout for non compute jobs is 10000 4016 * and 60000 for compute jobs. 4017 * In SR-IOV or passthrough mode, timeout for compute 4018 * jobs are 60000 by default. 4019 */ 4020 adev->gfx_timeout = msecs_to_jiffies(10000); 4021 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4022 if (amdgpu_sriov_vf(adev)) 4023 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
4024 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 4025 else 4026 adev->compute_timeout = msecs_to_jiffies(60000); 4027 4028 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4029 while ((timeout_setting = strsep(&input, ",")) && 4030 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4031 ret = kstrtol(timeout_setting, 0, &timeout); 4032 if (ret) 4033 return ret; 4034 4035 if (timeout == 0) { 4036 index++; 4037 continue; 4038 } else if (timeout < 0) { 4039 timeout = MAX_SCHEDULE_TIMEOUT; 4040 dev_warn(adev->dev, "lockup timeout disabled"); 4041 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 4042 } else { 4043 timeout = msecs_to_jiffies(timeout); 4044 } 4045 4046 switch (index++) { 4047 case 0: 4048 adev->gfx_timeout = timeout; 4049 break; 4050 case 1: 4051 adev->compute_timeout = timeout; 4052 break; 4053 case 2: 4054 adev->sdma_timeout = timeout; 4055 break; 4056 case 3: 4057 adev->video_timeout = timeout; 4058 break; 4059 default: 4060 break; 4061 } 4062 } 4063 /* 4064 * There is only one value specified and 4065 * it should apply to all non-compute jobs. 4066 */ 4067 if (index == 1) { 4068 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4069 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 4070 adev->compute_timeout = adev->gfx_timeout; 4071 } 4072 } 4073 4074 return ret; 4075 } 4076 4077 /** 4078 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 4079 * 4080 * @adev: amdgpu_device pointer 4081 * 4082 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 4083 */ 4084 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 4085 { 4086 struct iommu_domain *domain; 4087 4088 domain = iommu_get_domain_for_dev(adev->dev); 4089 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 4090 adev->ram_is_direct_mapped = true; 4091 } 4092 4093 #if defined(CONFIG_HSA_AMD_P2P) 4094 /** 4095 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled. 4096 * 4097 * @adev: amdgpu_device pointer 4098 * 4099 * return if IOMMU remapping bar address 4100 */ 4101 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev) 4102 { 4103 struct iommu_domain *domain; 4104 4105 domain = iommu_get_domain_for_dev(adev->dev); 4106 if (domain && (domain->type == IOMMU_DOMAIN_DMA || 4107 domain->type == IOMMU_DOMAIN_DMA_FQ)) 4108 return true; 4109 4110 return false; 4111 } 4112 #endif 4113 4114 static const struct attribute *amdgpu_dev_attributes[] = { 4115 &dev_attr_pcie_replay_count.attr, 4116 NULL 4117 }; 4118 4119 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 4120 { 4121 if (amdgpu_mcbp == 1) 4122 adev->gfx.mcbp = true; 4123 else if (amdgpu_mcbp == 0) 4124 adev->gfx.mcbp = false; 4125 4126 if (amdgpu_sriov_vf(adev)) 4127 adev->gfx.mcbp = true; 4128 4129 if (adev->gfx.mcbp) 4130 DRM_INFO("MCBP is enabled\n"); 4131 } 4132 4133 /** 4134 * amdgpu_device_init - initialize the driver 4135 * 4136 * @adev: amdgpu_device pointer 4137 * @flags: driver flags 4138 * 4139 * Initializes the driver info and hw (all asics). 4140 * Returns 0 for success or an error on failure. 4141 * Called at driver startup. 
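* The ASIC type is derived from @flags (AMD_ASIC_MASK) unless overridden via amdgpu_force_asic_type.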
4142 */ 4143 int amdgpu_device_init(struct amdgpu_device *adev, 4144 uint32_t flags) 4145 { 4146 struct drm_device *ddev = adev_to_drm(adev); 4147 struct pci_dev *pdev = adev->pdev; 4148 int r, i; 4149 bool px = false; 4150 u32 max_MBps; 4151 int tmp; 4152 4153 adev->shutdown = false; 4154 adev->flags = flags; 4155 4156 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 4157 adev->asic_type = amdgpu_force_asic_type; 4158 else 4159 adev->asic_type = flags & AMD_ASIC_MASK; 4160 4161 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 4162 if (amdgpu_emu_mode == 1) 4163 adev->usec_timeout *= 10; 4164 adev->gmc.gart_size = 512 * 1024 * 1024; 4165 adev->accel_working = false; 4166 adev->num_rings = 0; 4167 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 4168 adev->mman.buffer_funcs = NULL; 4169 adev->mman.buffer_funcs_ring = NULL; 4170 adev->vm_manager.vm_pte_funcs = NULL; 4171 adev->vm_manager.vm_pte_num_scheds = 0; 4172 adev->gmc.gmc_funcs = NULL; 4173 adev->harvest_ip_mask = 0x0; 4174 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 4175 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 4176 4177 adev->smc_rreg = &amdgpu_invalid_rreg; 4178 adev->smc_wreg = &amdgpu_invalid_wreg; 4179 adev->pcie_rreg = &amdgpu_invalid_rreg; 4180 adev->pcie_wreg = &amdgpu_invalid_wreg; 4181 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 4182 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 4183 adev->pciep_rreg = &amdgpu_invalid_rreg; 4184 adev->pciep_wreg = &amdgpu_invalid_wreg; 4185 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 4186 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 4187 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 4188 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 4189 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 4190 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 4191 adev->didt_rreg = &amdgpu_invalid_rreg; 4192 adev->didt_wreg = &amdgpu_invalid_wreg; 4193 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 4194 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 4195 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 4196 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 4197 4198 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 4199 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 4200 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 4201 4202 /* mutex initialization are all done here so we 4203 * can recall function without having locking issues 4204 */ 4205 mutex_init(&adev->firmware.mutex); 4206 mutex_init(&adev->pm.mutex); 4207 mutex_init(&adev->gfx.gpu_clock_mutex); 4208 mutex_init(&adev->srbm_mutex); 4209 mutex_init(&adev->gfx.pipe_reserve_mutex); 4210 mutex_init(&adev->gfx.gfx_off_mutex); 4211 mutex_init(&adev->gfx.partition_mutex); 4212 mutex_init(&adev->grbm_idx_mutex); 4213 mutex_init(&adev->mn_lock); 4214 mutex_init(&adev->virt.vf_errors.lock); 4215 mutex_init(&adev->virt.rlcg_reg_lock); 4216 hash_init(adev->mn_hash); 4217 mutex_init(&adev->psp.mutex); 4218 mutex_init(&adev->notifier_lock); 4219 mutex_init(&adev->pm.stable_pstate_ctx_lock); 4220 mutex_init(&adev->benchmark_mutex); 4221 mutex_init(&adev->gfx.reset_sem_mutex); 4222 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */ 4223 mutex_init(&adev->enforce_isolation_mutex); 4224 mutex_init(&adev->gfx.kfd_sch_mutex); 4225 4226 amdgpu_device_init_apu_flags(adev); 4227 4228 r = amdgpu_device_check_arguments(adev); 4229 if (r) 4230 return r; 4231 4232 
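/* spinlock initialization, mostly covering the indirect register index/data pairs */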
spin_lock_init(&adev->mmio_idx_lock); 4233 spin_lock_init(&adev->smc_idx_lock); 4234 spin_lock_init(&adev->pcie_idx_lock); 4235 spin_lock_init(&adev->uvd_ctx_idx_lock); 4236 spin_lock_init(&adev->didt_idx_lock); 4237 spin_lock_init(&adev->gc_cac_idx_lock); 4238 spin_lock_init(&adev->se_cac_idx_lock); 4239 spin_lock_init(&adev->audio_endpt_idx_lock); 4240 spin_lock_init(&adev->mm_stats.lock); 4241 spin_lock_init(&adev->wb.lock); 4242 4243 INIT_LIST_HEAD(&adev->reset_list); 4244 4245 INIT_LIST_HEAD(&adev->ras_list); 4246 4247 INIT_LIST_HEAD(&adev->pm.od_kobj_list); 4248 4249 INIT_DELAYED_WORK(&adev->delayed_init_work, 4250 amdgpu_device_delayed_init_work_handler); 4251 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 4252 amdgpu_device_delay_enable_gfx_off); 4253 /* 4254 * Initialize the enforce_isolation work structures for each XCP 4255 * partition. This work handler is responsible for enforcing shader 4256 * isolation on AMD GPUs. It counts the number of emitted fences for 4257 * each GFX and compute ring. If there are any fences, it schedules 4258 * the `enforce_isolation_work` to be run after a delay. If there are 4259 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the 4260 * runqueue. 4261 */ 4262 for (i = 0; i < MAX_XCP; i++) { 4263 INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work, 4264 amdgpu_gfx_enforce_isolation_handler); 4265 adev->gfx.enforce_isolation[i].adev = adev; 4266 adev->gfx.enforce_isolation[i].xcp_id = i; 4267 } 4268 4269 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 4270 4271 adev->gfx.gfx_off_req_count = 1; 4272 adev->gfx.gfx_off_residency = 0; 4273 adev->gfx.gfx_off_entrycount = 0; 4274 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 4275 4276 atomic_set(&adev->throttling_logging_enabled, 1); 4277 /* 4278 * If throttling continues, logging will be performed every minute 4279 * to avoid log flooding. "-1" is subtracted since the thermal 4280 * throttling interrupt comes every second. Thus, the total logging 4281 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting 4282 * for throttling interrupt) = 60 seconds. 4283 */ 4284 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 4285 ratelimit_state_init(&adev->virt.ras_telemetry_rs, 5 * HZ, 1); 4286 4287 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 4288 ratelimit_set_flags(&adev->virt.ras_telemetry_rs, RATELIMIT_MSG_ON_RELEASE); 4289 4290 /* Registers mapping */ 4291 /* TODO: block userspace mapping of io register */ 4292 if (adev->asic_type >= CHIP_BONAIRE) { 4293 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 4294 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 4295 } else { 4296 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 4297 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 4298 } 4299 4300 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 4301 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 4302 4303 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 4304 if (!adev->rmmio) 4305 return -ENOMEM; 4306 4307 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 4308 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size); 4309 4310 /* 4311 * Reset domain needs to be present early, before XGMI hive discovered 4312 * (if any) and initialized to use reset sem and in_gpu reset flag 4313 * early on during init and before calling to RREG32.
4314 */ 4315 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 4316 if (!adev->reset_domain) 4317 return -ENOMEM; 4318 4319 /* detect hw virtualization here */ 4320 amdgpu_detect_virtualization(adev); 4321 4322 amdgpu_device_get_pcie_info(adev); 4323 4324 r = amdgpu_device_get_job_timeout_settings(adev); 4325 if (r) { 4326 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 4327 return r; 4328 } 4329 4330 amdgpu_device_set_mcbp(adev); 4331 4332 /* 4333 * By default, use default mode where all blocks are expected to be 4334 * initialized. At present a 'swinit' of blocks is required to be 4335 * completed before the need for a different level is detected. 4336 */ 4337 amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT); 4338 /* early init functions */ 4339 r = amdgpu_device_ip_early_init(adev); 4340 if (r) 4341 return r; 4342 4343 /* Get rid of things like offb */ 4344 r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name); 4345 if (r) 4346 return r; 4347 4348 /* Enable TMZ based on IP_VERSION */ 4349 amdgpu_gmc_tmz_set(adev); 4350 4351 if (amdgpu_sriov_vf(adev) && 4352 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0)) 4353 /* VF MMIO access (except mailbox range) from CPU 4354 * will be blocked during sriov runtime 4355 */ 4356 adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT; 4357 4358 amdgpu_gmc_noretry_set(adev); 4359 /* Need to get xgmi info early to decide the reset behavior */ 4360 if (adev->gmc.xgmi.supported) { 4361 r = adev->gfxhub.funcs->get_xgmi_info(adev); 4362 if (r) 4363 return r; 4364 } 4365 4366 /* enable PCIE atomic ops */ 4367 if (amdgpu_sriov_vf(adev)) { 4368 if (adev->virt.fw_reserve.p_pf2vf) 4369 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 4370 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 4371 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4372 /* APUs with gfx9 onwards don't rely on PCIe atomics, rather their 4373 * internal path natively supports atomics, so set have_atomics_support to true. 4374 */ 4375 } else if ((adev->flags & AMD_IS_APU) && 4376 (amdgpu_ip_version(adev, GC_HWIP, 0) > 4377 IP_VERSION(9, 0, 0))) { 4378 adev->have_atomics_support = true; 4379 } else { 4380 adev->have_atomics_support = 4381 !pci_enable_atomic_ops_to_root(adev->pdev, 4382 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 4383 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4384 } 4385 4386 if (!adev->have_atomics_support) 4387 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 4388 4389 /* doorbell bar mapping and doorbell index init */ 4390 amdgpu_doorbell_init(adev); 4391 4392 if (amdgpu_emu_mode == 1) { 4393 /* post the asic on emulation mode */ 4394 emu_soc_asic_init(adev); 4395 goto fence_driver_init; 4396 } 4397 4398 amdgpu_reset_init(adev); 4399 4400 /* detect if we are with an SRIOV vbios */ 4401 if (adev->bios) 4402 amdgpu_device_detect_sriov_bios(adev); 4403 4404 /* check if we need to reset the asic 4405 * E.g., driver was not cleanly unloaded previously, etc.
4406 */ 4407 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4408 if (adev->gmc.xgmi.num_physical_nodes) { 4409 dev_info(adev->dev, "Pending hive reset.\n"); 4410 amdgpu_set_init_level(adev, 4411 AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 4412 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && 4413 !amdgpu_device_has_display_hardware(adev)) { 4414 r = psp_gpu_reset(adev); 4415 } else { 4416 tmp = amdgpu_reset_method; 4417 /* It should do a default reset when loading or reloading the driver, 4418 * regardless of the module parameter reset_method. 4419 */ 4420 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4421 r = amdgpu_asic_reset(adev); 4422 amdgpu_reset_method = tmp; 4423 } 4424 4425 if (r) { 4426 dev_err(adev->dev, "asic reset on init failed\n"); 4427 goto failed; 4428 } 4429 } 4430 4431 /* Post card if necessary */ 4432 if (amdgpu_device_need_post(adev)) { 4433 if (!adev->bios) { 4434 dev_err(adev->dev, "no vBIOS found\n"); 4435 r = -EINVAL; 4436 goto failed; 4437 } 4438 DRM_INFO("GPU posting now...\n"); 4439 r = amdgpu_device_asic_init(adev); 4440 if (r) { 4441 dev_err(adev->dev, "gpu post error!\n"); 4442 goto failed; 4443 } 4444 } 4445 4446 if (adev->bios) { 4447 if (adev->is_atom_fw) { 4448 /* Initialize clocks */ 4449 r = amdgpu_atomfirmware_get_clock_info(adev); 4450 if (r) { 4451 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4452 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4453 goto failed; 4454 } 4455 } else { 4456 /* Initialize clocks */ 4457 r = amdgpu_atombios_get_clock_info(adev); 4458 if (r) { 4459 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4460 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4461 goto failed; 4462 } 4463 /* init i2c buses */ 4464 if (!amdgpu_device_has_dc_support(adev)) 4465 amdgpu_atombios_i2c_init(adev); 4466 } 4467 } 4468 4469 fence_driver_init: 4470 /* Fence driver */ 4471 r = amdgpu_fence_driver_sw_init(adev); 4472 if (r) { 4473 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4474 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4475 goto failed; 4476 } 4477 4478 /* init the mode config */ 4479 drm_mode_config_init(adev_to_drm(adev)); 4480 4481 r = amdgpu_device_ip_init(adev); 4482 if (r) { 4483 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4484 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4485 goto release_ras_con; 4486 } 4487 4488 amdgpu_fence_driver_hw_init(adev); 4489 4490 dev_info(adev->dev, 4491 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4492 adev->gfx.config.max_shader_engines, 4493 adev->gfx.config.max_sh_per_se, 4494 adev->gfx.config.max_cu_per_sh, 4495 adev->gfx.cu_info.number); 4496 4497 adev->accel_working = true; 4498 4499 amdgpu_vm_check_compute_bug(adev); 4500 4501 /* Initialize the buffer migration limit. */ 4502 if (amdgpu_moverate >= 0) 4503 max_MBps = amdgpu_moverate; 4504 else 4505 max_MBps = 8; /* Allow 8 MB/s. */ 4506 /* Get a log2 for easy divisions. */ 4507 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4508 4509 /* 4510 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4511 * Otherwise the mgpu fan boost feature will be skipped due to the 4512 * gpu instance is counted less. 4513 */ 4514 amdgpu_register_gpu_instance(adev); 4515 4516 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4517 * explicit gating rather than handling it automatically. 
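* Skipped while the init level is AMDGPU_INIT_LEVEL_MINIMAL_XGMI, i.e. while a hive reset is still pending.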
4518 */ 4519 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 4520 r = amdgpu_device_ip_late_init(adev); 4521 if (r) { 4522 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4523 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4524 goto release_ras_con; 4525 } 4526 /* must succeed. */ 4527 amdgpu_ras_resume(adev); 4528 queue_delayed_work(system_wq, &adev->delayed_init_work, 4529 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4530 } 4531 4532 if (amdgpu_sriov_vf(adev)) { 4533 amdgpu_virt_release_full_gpu(adev, true); 4534 flush_delayed_work(&adev->delayed_init_work); 4535 } 4536 4537 /* 4538 * Place those sysfs registering after `late_init`. As some of those 4539 * operations performed in `late_init` might affect the sysfs 4540 * interfaces creating. 4541 */ 4542 r = amdgpu_atombios_sysfs_init(adev); 4543 if (r) 4544 drm_err(&adev->ddev, 4545 "registering atombios sysfs failed (%d).\n", r); 4546 4547 r = amdgpu_pm_sysfs_init(adev); 4548 if (r) 4549 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 4550 4551 r = amdgpu_ucode_sysfs_init(adev); 4552 if (r) { 4553 adev->ucode_sysfs_en = false; 4554 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 4555 } else 4556 adev->ucode_sysfs_en = true; 4557 4558 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 4559 if (r) 4560 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4561 4562 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4563 if (r) 4564 dev_err(adev->dev, 4565 "Could not create amdgpu board attributes\n"); 4566 4567 amdgpu_fru_sysfs_init(adev); 4568 amdgpu_reg_state_sysfs_init(adev); 4569 amdgpu_xcp_cfg_sysfs_init(adev); 4570 4571 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4572 r = amdgpu_pmu_init(adev); 4573 if (r) 4574 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4575 4576 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4577 if (amdgpu_device_cache_pci_state(adev->pdev)) 4578 pci_restore_state(pdev); 4579 4580 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4581 /* this will fail for cards that aren't VGA class devices, just 4582 * ignore it 4583 */ 4584 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4585 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4586 4587 px = amdgpu_device_supports_px(ddev); 4588 4589 if (px || (!dev_is_removable(&adev->pdev->dev) && 4590 apple_gmux_detect(NULL, NULL))) 4591 vga_switcheroo_register_client(adev->pdev, 4592 &amdgpu_switcheroo_ops, px); 4593 4594 if (px) 4595 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4596 4597 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 4598 amdgpu_xgmi_reset_on_init(adev); 4599 4600 amdgpu_device_check_iommu_direct_map(adev); 4601 4602 return 0; 4603 4604 release_ras_con: 4605 if (amdgpu_sriov_vf(adev)) 4606 amdgpu_virt_release_full_gpu(adev, true); 4607 4608 /* failed in exclusive mode due to timeout */ 4609 if (amdgpu_sriov_vf(adev) && 4610 !amdgpu_sriov_runtime(adev) && 4611 amdgpu_virt_mmio_blocked(adev) && 4612 !amdgpu_virt_wait_reset(adev)) { 4613 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4614 /* Don't send request since VF is inactive. 
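* Clear the runtime cap and the virt ops instead, so no further requests are sent to the host.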
*/ 4615 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4616 adev->virt.ops = NULL; 4617 r = -EAGAIN; 4618 } 4619 amdgpu_release_ras_context(adev); 4620 4621 failed: 4622 amdgpu_vf_error_trans_all(adev); 4623 4624 return r; 4625 } 4626 4627 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4628 { 4629 4630 /* Clear all CPU mappings pointing to this device */ 4631 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4632 4633 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4634 amdgpu_doorbell_fini(adev); 4635 4636 iounmap(adev->rmmio); 4637 adev->rmmio = NULL; 4638 if (adev->mman.aper_base_kaddr) 4639 iounmap(adev->mman.aper_base_kaddr); 4640 adev->mman.aper_base_kaddr = NULL; 4641 4642 /* Memory manager related */ 4643 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4644 arch_phys_wc_del(adev->gmc.vram_mtrr); 4645 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4646 } 4647 } 4648 4649 /** 4650 * amdgpu_device_fini_hw - tear down the driver 4651 * 4652 * @adev: amdgpu_device pointer 4653 * 4654 * Tear down the driver info (all asics). 4655 * Called at driver shutdown. 4656 */ 4657 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4658 { 4659 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4660 flush_delayed_work(&adev->delayed_init_work); 4661 4662 if (adev->mman.initialized) 4663 drain_workqueue(adev->mman.bdev.wq); 4664 adev->shutdown = true; 4665 4666 /* make sure IB test finished before entering exclusive mode 4667 * to avoid preemption on IB test 4668 */ 4669 if (amdgpu_sriov_vf(adev)) { 4670 amdgpu_virt_request_full_gpu(adev, false); 4671 amdgpu_virt_fini_data_exchange(adev); 4672 } 4673 4674 /* disable all interrupts */ 4675 amdgpu_irq_disable_all(adev); 4676 if (adev->mode_info.mode_config_initialized) { 4677 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4678 drm_helper_force_disable_all(adev_to_drm(adev)); 4679 else 4680 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4681 } 4682 amdgpu_fence_driver_hw_fini(adev); 4683 4684 if (adev->pm.sysfs_initialized) 4685 amdgpu_pm_sysfs_fini(adev); 4686 if (adev->ucode_sysfs_en) 4687 amdgpu_ucode_sysfs_fini(adev); 4688 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4689 amdgpu_fru_sysfs_fini(adev); 4690 4691 amdgpu_reg_state_sysfs_fini(adev); 4692 amdgpu_xcp_cfg_sysfs_fini(adev); 4693 4694 /* disable ras feature must before hw fini */ 4695 amdgpu_ras_pre_fini(adev); 4696 4697 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4698 4699 amdgpu_device_ip_fini_early(adev); 4700 4701 amdgpu_irq_fini_hw(adev); 4702 4703 if (adev->mman.initialized) 4704 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4705 4706 amdgpu_gart_dummy_page_fini(adev); 4707 4708 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4709 amdgpu_device_unmap_mmio(adev); 4710 4711 } 4712 4713 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4714 { 4715 int idx; 4716 bool px; 4717 4718 amdgpu_device_ip_fini(adev); 4719 amdgpu_fence_driver_sw_fini(adev); 4720 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4721 adev->accel_working = false; 4722 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4723 4724 amdgpu_reset_fini(adev); 4725 4726 /* free i2c buses */ 4727 if (!amdgpu_device_has_dc_support(adev)) 4728 amdgpu_i2c_fini(adev); 4729 4730 if (amdgpu_emu_mode != 1) 4731 amdgpu_atombios_fini(adev); 4732 4733 kfree(adev->bios); 4734 adev->bios = NULL; 4735 4736 kfree(adev->fru_info); 4737 adev->fru_info = NULL; 4738 4739 px = 
amdgpu_device_supports_px(adev_to_drm(adev)); 4740 4741 if (px || (!dev_is_removable(&adev->pdev->dev) && 4742 apple_gmux_detect(NULL, NULL))) 4743 vga_switcheroo_unregister_client(adev->pdev); 4744 4745 if (px) 4746 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4747 4748 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4749 vga_client_unregister(adev->pdev); 4750 4751 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4752 4753 iounmap(adev->rmmio); 4754 adev->rmmio = NULL; 4755 amdgpu_doorbell_fini(adev); 4756 drm_dev_exit(idx); 4757 } 4758 4759 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4760 amdgpu_pmu_fini(adev); 4761 if (adev->mman.discovery_bin) 4762 amdgpu_discovery_fini(adev); 4763 4764 amdgpu_reset_put_reset_domain(adev->reset_domain); 4765 adev->reset_domain = NULL; 4766 4767 kfree(adev->pci_state); 4768 4769 } 4770 4771 /** 4772 * amdgpu_device_evict_resources - evict device resources 4773 * @adev: amdgpu device object 4774 * 4775 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4776 * of the vram memory type. Mainly used for evicting device resources 4777 * at suspend time. 4778 * 4779 */ 4780 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4781 { 4782 int ret; 4783 4784 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4785 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4786 return 0; 4787 4788 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4789 if (ret) 4790 DRM_WARN("evicting device resources failed\n"); 4791 return ret; 4792 } 4793 4794 /* 4795 * Suspend & resume. 4796 */ 4797 /** 4798 * amdgpu_device_prepare - prepare for device suspend 4799 * 4800 * @dev: drm dev pointer 4801 * 4802 * Prepare to put the hw in the suspend state (all asics). 4803 * Returns 0 for success or an error on failure. 4804 * Called at driver suspend. 4805 */ 4806 int amdgpu_device_prepare(struct drm_device *dev) 4807 { 4808 struct amdgpu_device *adev = drm_to_adev(dev); 4809 int i, r; 4810 4811 amdgpu_choose_low_power_state(adev); 4812 4813 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4814 return 0; 4815 4816 /* Evict the majority of BOs before starting suspend sequence */ 4817 r = amdgpu_device_evict_resources(adev); 4818 if (r) 4819 goto unprepare; 4820 4821 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 4822 4823 for (i = 0; i < adev->num_ip_blocks; i++) { 4824 if (!adev->ip_blocks[i].status.valid) 4825 continue; 4826 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 4827 continue; 4828 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]); 4829 if (r) 4830 goto unprepare; 4831 } 4832 4833 return 0; 4834 4835 unprepare: 4836 adev->in_s0ix = adev->in_s3 = false; 4837 4838 return r; 4839 } 4840 4841 /** 4842 * amdgpu_device_suspend - initiate device suspend 4843 * 4844 * @dev: drm dev pointer 4845 * @notify_clients: notify in-kernel DRM clients 4846 * 4847 * Puts the hw in the suspend state (all asics). 4848 * Returns 0 for success or an error on failure. 4849 * Called at driver suspend. 
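* Hardware IPs are suspended in two phases, with buffer eviction and fence driver teardown in between.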
4850 */ 4851 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) 4852 { 4853 struct amdgpu_device *adev = drm_to_adev(dev); 4854 int r = 0; 4855 4856 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4857 return 0; 4858 4859 adev->in_suspend = true; 4860 4861 if (amdgpu_sriov_vf(adev)) { 4862 amdgpu_virt_fini_data_exchange(adev); 4863 r = amdgpu_virt_request_full_gpu(adev, false); 4864 if (r) 4865 return r; 4866 } 4867 4868 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4869 DRM_WARN("smart shift update failed\n"); 4870 4871 if (notify_clients) 4872 drm_client_dev_suspend(adev_to_drm(adev), false); 4873 4874 cancel_delayed_work_sync(&adev->delayed_init_work); 4875 4876 amdgpu_ras_suspend(adev); 4877 4878 amdgpu_device_ip_suspend_phase1(adev); 4879 4880 if (!adev->in_s0ix) 4881 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4882 4883 r = amdgpu_device_evict_resources(adev); 4884 if (r) 4885 return r; 4886 4887 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4888 4889 amdgpu_fence_driver_hw_fini(adev); 4890 4891 amdgpu_device_ip_suspend_phase2(adev); 4892 4893 if (amdgpu_sriov_vf(adev)) 4894 amdgpu_virt_release_full_gpu(adev, false); 4895 4896 r = amdgpu_dpm_notify_rlc_state(adev, false); 4897 if (r) 4898 return r; 4899 4900 return 0; 4901 } 4902 4903 /** 4904 * amdgpu_device_resume - initiate device resume 4905 * 4906 * @dev: drm dev pointer 4907 * @notify_clients: notify in-kernel DRM clients 4908 * 4909 * Bring the hw back to operating state (all asics). 4910 * Returns 0 for success or an error on failure. 4911 * Called at driver resume. 4912 */ 4913 int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) 4914 { 4915 struct amdgpu_device *adev = drm_to_adev(dev); 4916 int r = 0; 4917 4918 if (amdgpu_sriov_vf(adev)) { 4919 r = amdgpu_virt_request_full_gpu(adev, true); 4920 if (r) 4921 return r; 4922 } 4923 4924 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4925 return 0; 4926 4927 if (adev->in_s0ix) 4928 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4929 4930 /* post card */ 4931 if (amdgpu_device_need_post(adev)) { 4932 r = amdgpu_device_asic_init(adev); 4933 if (r) 4934 dev_err(adev->dev, "amdgpu asic init failed\n"); 4935 } 4936 4937 r = amdgpu_device_ip_resume(adev); 4938 4939 if (r) { 4940 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4941 goto exit; 4942 } 4943 4944 if (!adev->in_s0ix) { 4945 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4946 if (r) 4947 goto exit; 4948 } 4949 4950 r = amdgpu_device_ip_late_init(adev); 4951 if (r) 4952 goto exit; 4953 4954 queue_delayed_work(system_wq, &adev->delayed_init_work, 4955 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4956 exit: 4957 if (amdgpu_sriov_vf(adev)) { 4958 amdgpu_virt_init_data_exchange(adev); 4959 amdgpu_virt_release_full_gpu(adev, true); 4960 } 4961 4962 if (r) 4963 return r; 4964 4965 /* Make sure IB tests flushed */ 4966 flush_delayed_work(&adev->delayed_init_work); 4967 4968 if (notify_clients) 4969 drm_client_dev_resume(adev_to_drm(adev), false); 4970 4971 amdgpu_ras_resume(adev); 4972 4973 if (adev->mode_info.num_crtc) { 4974 /* 4975 * Most of the connector probing functions try to acquire runtime pm 4976 * refs to ensure that the GPU is powered on when connector polling is 4977 * performed. Since we're calling this from a runtime PM callback, 4978 * trying to acquire rpm refs will cause us to deadlock. 
4979 * 4980 * Since we're guaranteed to be holding the rpm lock, it's safe to 4981 * temporarily disable the rpm helpers so this doesn't deadlock us. 4982 */ 4983 #ifdef CONFIG_PM 4984 dev->dev->power.disable_depth++; 4985 #endif 4986 if (!adev->dc_enabled) 4987 drm_helper_hpd_irq_event(dev); 4988 else 4989 drm_kms_helper_hotplug_event(dev); 4990 #ifdef CONFIG_PM 4991 dev->dev->power.disable_depth--; 4992 #endif 4993 } 4994 adev->in_suspend = false; 4995 4996 if (adev->enable_mes) 4997 amdgpu_mes_self_test(adev); 4998 4999 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 5000 DRM_WARN("smart shift update failed\n"); 5001 5002 return 0; 5003 } 5004 5005 /** 5006 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 5007 * 5008 * @adev: amdgpu_device pointer 5009 * 5010 * The list of all the hardware IPs that make up the asic is walked and 5011 * the check_soft_reset callbacks are run. check_soft_reset determines 5012 * if the asic is still hung or not. 5013 * Returns true if any of the IPs are still in a hung state, false if not. 5014 */ 5015 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 5016 { 5017 int i; 5018 bool asic_hang = false; 5019 5020 if (amdgpu_sriov_vf(adev)) 5021 return true; 5022 5023 if (amdgpu_asic_need_full_reset(adev)) 5024 return true; 5025 5026 for (i = 0; i < adev->num_ip_blocks; i++) { 5027 if (!adev->ip_blocks[i].status.valid) 5028 continue; 5029 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 5030 adev->ip_blocks[i].status.hang = 5031 adev->ip_blocks[i].version->funcs->check_soft_reset( 5032 &adev->ip_blocks[i]); 5033 if (adev->ip_blocks[i].status.hang) { 5034 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 5035 asic_hang = true; 5036 } 5037 } 5038 return asic_hang; 5039 } 5040 5041 /** 5042 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 5043 * 5044 * @adev: amdgpu_device pointer 5045 * 5046 * The list of all the hardware IPs that make up the asic is walked and the 5047 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 5048 * handles any IP specific hardware or software state changes that are 5049 * necessary for a soft reset to succeed. 5050 * Returns 0 on success, negative error code on failure. 5051 */ 5052 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 5053 { 5054 int i, r = 0; 5055 5056 for (i = 0; i < adev->num_ip_blocks; i++) { 5057 if (!adev->ip_blocks[i].status.valid) 5058 continue; 5059 if (adev->ip_blocks[i].status.hang && 5060 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 5061 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]); 5062 if (r) 5063 return r; 5064 } 5065 } 5066 5067 return 0; 5068 } 5069 5070 /** 5071 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 5072 * 5073 * @adev: amdgpu_device pointer 5074 * 5075 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 5076 * reset is necessary to recover. 5077 * Returns true if a full asic reset is required, false if not. 
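* GMC, SMC, ACP, DCE and PSP are currently treated as such blocks.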
5078 */ 5079 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 5080 { 5081 int i; 5082 5083 if (amdgpu_asic_need_full_reset(adev)) 5084 return true; 5085 5086 for (i = 0; i < adev->num_ip_blocks; i++) { 5087 if (!adev->ip_blocks[i].status.valid) 5088 continue; 5089 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 5090 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 5091 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 5092 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 5093 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 5094 if (adev->ip_blocks[i].status.hang) { 5095 dev_info(adev->dev, "Some block need full reset!\n"); 5096 return true; 5097 } 5098 } 5099 } 5100 return false; 5101 } 5102 5103 /** 5104 * amdgpu_device_ip_soft_reset - do a soft reset 5105 * 5106 * @adev: amdgpu_device pointer 5107 * 5108 * The list of all the hardware IPs that make up the asic is walked and the 5109 * soft_reset callbacks are run if the block is hung. soft_reset handles any 5110 * IP specific hardware or software state changes that are necessary to soft 5111 * reset the IP. 5112 * Returns 0 on success, negative error code on failure. 5113 */ 5114 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 5115 { 5116 int i, r = 0; 5117 5118 for (i = 0; i < adev->num_ip_blocks; i++) { 5119 if (!adev->ip_blocks[i].status.valid) 5120 continue; 5121 if (adev->ip_blocks[i].status.hang && 5122 adev->ip_blocks[i].version->funcs->soft_reset) { 5123 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]); 5124 if (r) 5125 return r; 5126 } 5127 } 5128 5129 return 0; 5130 } 5131 5132 /** 5133 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 5134 * 5135 * @adev: amdgpu_device pointer 5136 * 5137 * The list of all the hardware IPs that make up the asic is walked and the 5138 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 5139 * handles any IP specific hardware or software state changes that are 5140 * necessary after the IP has been soft reset. 5141 * Returns 0 on success, negative error code on failure. 
5142 */ 5143 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 5144 { 5145 int i, r = 0; 5146 5147 for (i = 0; i < adev->num_ip_blocks; i++) { 5148 if (!adev->ip_blocks[i].status.valid) 5149 continue; 5150 if (adev->ip_blocks[i].status.hang && 5151 adev->ip_blocks[i].version->funcs->post_soft_reset) 5152 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]); 5153 if (r) 5154 return r; 5155 } 5156 5157 return 0; 5158 } 5159 5160 /** 5161 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 5162 * 5163 * @adev: amdgpu_device pointer 5164 * @reset_context: amdgpu reset context pointer 5165 * 5166 * do VF FLR and reinitialize Asic 5167 * return 0 means succeeded otherwise failed 5168 */ 5169 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 5170 struct amdgpu_reset_context *reset_context) 5171 { 5172 int r; 5173 struct amdgpu_hive_info *hive = NULL; 5174 5175 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { 5176 if (!amdgpu_ras_get_fed_status(adev)) 5177 amdgpu_virt_ready_to_reset(adev); 5178 amdgpu_virt_wait_reset(adev); 5179 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5180 r = amdgpu_virt_request_full_gpu(adev, true); 5181 } else { 5182 r = amdgpu_virt_reset_gpu(adev); 5183 } 5184 if (r) 5185 return r; 5186 5187 amdgpu_ras_set_fed(adev, false); 5188 amdgpu_irq_gpu_reset_resume_helper(adev); 5189 5190 /* some sw clean up VF needs to do before recover */ 5191 amdgpu_virt_post_reset(adev); 5192 5193 /* Resume IP prior to SMC */ 5194 r = amdgpu_device_ip_reinit_early_sriov(adev); 5195 if (r) 5196 return r; 5197 5198 amdgpu_virt_init_data_exchange(adev); 5199 5200 r = amdgpu_device_fw_loading(adev); 5201 if (r) 5202 return r; 5203 5204 /* now we are okay to resume SMC/CP/SDMA */ 5205 r = amdgpu_device_ip_reinit_late_sriov(adev); 5206 if (r) 5207 return r; 5208 5209 hive = amdgpu_get_xgmi_hive(adev); 5210 /* Update PSP FW topology after reset */ 5211 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5212 r = amdgpu_xgmi_update_topology(hive, adev); 5213 if (hive) 5214 amdgpu_put_xgmi_hive(hive); 5215 if (r) 5216 return r; 5217 5218 r = amdgpu_ib_ring_tests(adev); 5219 if (r) 5220 return r; 5221 5222 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) 5223 amdgpu_inc_vram_lost(adev); 5224 5225 /* need to be called during full access so we can't do it later like 5226 * bare-metal does. 
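* i.e. while the VF still holds full GPU access, which is released just below.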
5227 */ 5228 amdgpu_amdkfd_post_reset(adev); 5229 amdgpu_virt_release_full_gpu(adev, true); 5230 5231 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5232 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) || 5233 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 5234 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 5235 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5236 amdgpu_ras_resume(adev); 5237 5238 amdgpu_virt_ras_telemetry_post_reset(adev); 5239 5240 return 0; 5241 } 5242 5243 /** 5244 * amdgpu_device_has_job_running - check if there is any job in mirror list 5245 * 5246 * @adev: amdgpu_device pointer 5247 * 5248 * check if there is any job in mirror list 5249 */ 5250 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5251 { 5252 int i; 5253 struct drm_sched_job *job; 5254 5255 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5256 struct amdgpu_ring *ring = adev->rings[i]; 5257 5258 if (!amdgpu_ring_sched_ready(ring)) 5259 continue; 5260 5261 spin_lock(&ring->sched.job_list_lock); 5262 job = list_first_entry_or_null(&ring->sched.pending_list, 5263 struct drm_sched_job, list); 5264 spin_unlock(&ring->sched.job_list_lock); 5265 if (job) 5266 return true; 5267 } 5268 return false; 5269 } 5270 5271 /** 5272 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5273 * 5274 * @adev: amdgpu_device pointer 5275 * 5276 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5277 * a hung GPU. 5278 */ 5279 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5280 { 5281 5282 if (amdgpu_gpu_recovery == 0) 5283 goto disabled; 5284 5285 /* Skip soft reset check in fatal error mode */ 5286 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5287 return true; 5288 5289 if (amdgpu_sriov_vf(adev)) 5290 return true; 5291 5292 if (amdgpu_gpu_recovery == -1) { 5293 switch (adev->asic_type) { 5294 #ifdef CONFIG_DRM_AMDGPU_SI 5295 case CHIP_VERDE: 5296 case CHIP_TAHITI: 5297 case CHIP_PITCAIRN: 5298 case CHIP_OLAND: 5299 case CHIP_HAINAN: 5300 #endif 5301 #ifdef CONFIG_DRM_AMDGPU_CIK 5302 case CHIP_KAVERI: 5303 case CHIP_KABINI: 5304 case CHIP_MULLINS: 5305 #endif 5306 case CHIP_CARRIZO: 5307 case CHIP_STONEY: 5308 case CHIP_CYAN_SKILLFISH: 5309 goto disabled; 5310 default: 5311 break; 5312 } 5313 } 5314 5315 return true; 5316 5317 disabled: 5318 dev_info(adev->dev, "GPU recovery disabled.\n"); 5319 return false; 5320 } 5321 5322 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5323 { 5324 u32 i; 5325 int ret = 0; 5326 5327 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5328 5329 dev_info(adev->dev, "GPU mode1 reset\n"); 5330 5331 /* Cache the state before bus master disable. The saved config space 5332 * values are used in other cases like restore after mode-2 reset. 
5333 */ 5334 amdgpu_device_cache_pci_state(adev->pdev); 5335 5336 /* disable BM */ 5337 pci_clear_master(adev->pdev); 5338 5339 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5340 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5341 ret = amdgpu_dpm_mode1_reset(adev); 5342 } else { 5343 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5344 ret = psp_gpu_reset(adev); 5345 } 5346 5347 if (ret) 5348 goto mode1_reset_failed; 5349 5350 amdgpu_device_load_pci_state(adev->pdev); 5351 ret = amdgpu_psp_wait_for_bootloader(adev); 5352 if (ret) 5353 goto mode1_reset_failed; 5354 5355 /* wait for asic to come out of reset */ 5356 for (i = 0; i < adev->usec_timeout; i++) { 5357 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5358 5359 if (memsize != 0xffffffff) 5360 break; 5361 udelay(1); 5362 } 5363 5364 if (i >= adev->usec_timeout) { 5365 ret = -ETIMEDOUT; 5366 goto mode1_reset_failed; 5367 } 5368 5369 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5370 5371 return 0; 5372 5373 mode1_reset_failed: 5374 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5375 return ret; 5376 } 5377 5378 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5379 struct amdgpu_reset_context *reset_context) 5380 { 5381 int i, r = 0; 5382 struct amdgpu_job *job = NULL; 5383 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; 5384 bool need_full_reset = 5385 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5386 5387 if (reset_context->reset_req_dev == adev) 5388 job = reset_context->job; 5389 5390 if (amdgpu_sriov_vf(adev)) 5391 amdgpu_virt_pre_reset(adev); 5392 5393 amdgpu_fence_driver_isr_toggle(adev, true); 5394 5395 /* block all schedulers and reset given job's ring */ 5396 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5397 struct amdgpu_ring *ring = adev->rings[i]; 5398 5399 if (!amdgpu_ring_sched_ready(ring)) 5400 continue; 5401 5402 /* Clear job fence from fence drv to avoid force_completion 5403 * leave NULL and vm flush fence in fence drv 5404 */ 5405 amdgpu_fence_driver_clear_job_fences(ring); 5406 5407 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5408 amdgpu_fence_driver_force_completion(ring); 5409 } 5410 5411 amdgpu_fence_driver_isr_toggle(adev, false); 5412 5413 if (job && job->vm) 5414 drm_sched_increase_karma(&job->base); 5415 5416 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5417 /* If reset handler not implemented, continue; otherwise return */ 5418 if (r == -EOPNOTSUPP) 5419 r = 0; 5420 else 5421 return r; 5422 5423 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5424 if (!amdgpu_sriov_vf(adev)) { 5425 5426 if (!need_full_reset) 5427 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5428 5429 if (!need_full_reset && amdgpu_gpu_recovery && 5430 amdgpu_device_ip_check_soft_reset(adev)) { 5431 amdgpu_device_ip_pre_soft_reset(adev); 5432 r = amdgpu_device_ip_soft_reset(adev); 5433 amdgpu_device_ip_post_soft_reset(adev); 5434 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5435 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5436 need_full_reset = true; 5437 } 5438 } 5439 5440 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { 5441 dev_info(tmp_adev->dev, "Dumping IP State\n"); 5442 /* Trigger ip dump before we reset the asic */ 5443 for (i = 0; i < tmp_adev->num_ip_blocks; i++) 5444 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) 5445 tmp_adev->ip_blocks[i].version->funcs 5446 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]); 5447 
dev_info(tmp_adev->dev, "Dumping IP State Completed\n"); 5448 } 5449 5450 if (need_full_reset) 5451 r = amdgpu_device_ip_suspend(adev); 5452 if (need_full_reset) 5453 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5454 else 5455 clear_bit(AMDGPU_NEED_FULL_RESET, 5456 &reset_context->flags); 5457 } 5458 5459 return r; 5460 } 5461 5462 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) 5463 { 5464 struct list_head *device_list_handle; 5465 bool full_reset, vram_lost = false; 5466 struct amdgpu_device *tmp_adev; 5467 int r, init_level; 5468 5469 device_list_handle = reset_context->reset_device_list; 5470 5471 if (!device_list_handle) 5472 return -EINVAL; 5473 5474 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5475 5476 /** 5477 * If it's reset on init, it's default init level, otherwise keep level 5478 * as recovery level. 5479 */ 5480 if (reset_context->method == AMD_RESET_METHOD_ON_INIT) 5481 init_level = AMDGPU_INIT_LEVEL_DEFAULT; 5482 else 5483 init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY; 5484 5485 r = 0; 5486 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5487 amdgpu_set_init_level(tmp_adev, init_level); 5488 if (full_reset) { 5489 /* post card */ 5490 amdgpu_ras_set_fed(tmp_adev, false); 5491 r = amdgpu_device_asic_init(tmp_adev); 5492 if (r) { 5493 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5494 } else { 5495 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5496 5497 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5498 if (r) 5499 goto out; 5500 5501 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5502 5503 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 5504 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job); 5505 5506 if (vram_lost) { 5507 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5508 amdgpu_inc_vram_lost(tmp_adev); 5509 } 5510 5511 r = amdgpu_device_fw_loading(tmp_adev); 5512 if (r) 5513 return r; 5514 5515 r = amdgpu_xcp_restore_partition_mode( 5516 tmp_adev->xcp_mgr); 5517 if (r) 5518 goto out; 5519 5520 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5521 if (r) 5522 goto out; 5523 5524 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 5525 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5526 5527 r = amdgpu_device_ip_resume_phase3(tmp_adev); 5528 if (r) 5529 goto out; 5530 5531 if (vram_lost) 5532 amdgpu_device_fill_reset_magic(tmp_adev); 5533 5534 /* 5535 * Add this ASIC as tracked as reset was already 5536 * complete successfully. 5537 */ 5538 amdgpu_register_gpu_instance(tmp_adev); 5539 5540 if (!reset_context->hive && 5541 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5542 amdgpu_xgmi_add_device(tmp_adev); 5543 5544 r = amdgpu_device_ip_late_init(tmp_adev); 5545 if (r) 5546 goto out; 5547 5548 drm_client_dev_resume(adev_to_drm(tmp_adev), false); 5549 5550 /* 5551 * The GPU enters bad state once faulty pages 5552 * by ECC has reached the threshold, and ras 5553 * recovery is scheduled next. So add one check 5554 * here to break recovery if it indeed exceeds 5555 * bad page threshold, and remind user to 5556 * retire this GPU or setting one bigger 5557 * bad_page_threshold value to fix this once 5558 * probing driver again. 5559 */ 5560 if (!amdgpu_ras_is_rma(tmp_adev)) { 5561 /* must succeed. 
*/ 5562 amdgpu_ras_resume(tmp_adev); 5563 } else { 5564 r = -EINVAL; 5565 goto out; 5566 } 5567 5568 /* Update PSP FW topology after reset */ 5569 if (reset_context->hive && 5570 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5571 r = amdgpu_xgmi_update_topology( 5572 reset_context->hive, tmp_adev); 5573 } 5574 } 5575 5576 out: 5577 if (!r) { 5578 /* IP init is complete now, set level as default */ 5579 amdgpu_set_init_level(tmp_adev, 5580 AMDGPU_INIT_LEVEL_DEFAULT); 5581 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5582 r = amdgpu_ib_ring_tests(tmp_adev); 5583 if (r) { 5584 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5585 r = -EAGAIN; 5586 goto end; 5587 } 5588 } 5589 5590 if (r) 5591 tmp_adev->asic_reset_res = r; 5592 } 5593 5594 end: 5595 return r; 5596 } 5597 5598 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5599 struct amdgpu_reset_context *reset_context) 5600 { 5601 struct amdgpu_device *tmp_adev = NULL; 5602 bool need_full_reset, skip_hw_reset; 5603 int r = 0; 5604 5605 /* Try reset handler method first */ 5606 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5607 reset_list); 5608 5609 reset_context->reset_device_list = device_list_handle; 5610 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5611 /* If reset handler not implemented, continue; otherwise return */ 5612 if (r == -EOPNOTSUPP) 5613 r = 0; 5614 else 5615 return r; 5616 5617 /* Reset handler not implemented, use the default method */ 5618 need_full_reset = 5619 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5620 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5621 5622 /* 5623 * ASIC reset has to be done on all XGMI hive nodes ASAP 5624 * to allow proper links negotiation in FW (within 1 sec) 5625 */ 5626 if (!skip_hw_reset && need_full_reset) { 5627 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5628 /* For XGMI run all resets in parallel to speed up the process */ 5629 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5630 if (!queue_work(system_unbound_wq, 5631 &tmp_adev->xgmi_reset_work)) 5632 r = -EALREADY; 5633 } else 5634 r = amdgpu_asic_reset(tmp_adev); 5635 5636 if (r) { 5637 dev_err(tmp_adev->dev, 5638 "ASIC reset failed with error, %d for drm dev, %s", 5639 r, adev_to_drm(tmp_adev)->unique); 5640 goto out; 5641 } 5642 } 5643 5644 /* For XGMI wait for all resets to complete before proceed */ 5645 if (!r) { 5646 list_for_each_entry(tmp_adev, device_list_handle, 5647 reset_list) { 5648 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5649 flush_work(&tmp_adev->xgmi_reset_work); 5650 r = tmp_adev->asic_reset_res; 5651 if (r) 5652 break; 5653 } 5654 } 5655 } 5656 } 5657 5658 if (!r && amdgpu_ras_intr_triggered()) { 5659 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5660 amdgpu_ras_reset_error_count(tmp_adev, 5661 AMDGPU_RAS_BLOCK__MMHUB); 5662 } 5663 5664 amdgpu_ras_intr_cleared(); 5665 } 5666 5667 r = amdgpu_device_reinit_after_reset(reset_context); 5668 if (r == -EAGAIN) 5669 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5670 else 5671 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5672 5673 out: 5674 return r; 5675 } 5676 5677 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5678 { 5679 5680 switch (amdgpu_asic_reset_method(adev)) { 5681 case AMD_RESET_METHOD_MODE1: 5682 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5683 break; 5684 case AMD_RESET_METHOD_MODE2: 5685 adev->mp1_state = PP_MP1_STATE_RESET; 5686 break; 5687 default: 5688 adev->mp1_state = 
static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
{
	switch (amdgpu_asic_reset_method(adev)) {
	case AMD_RESET_METHOD_MODE1:
		adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
		break;
	case AMD_RESET_METHOD_MODE2:
		adev->mp1_state = PP_MP1_STATE_RESET;
		break;
	default:
		adev->mp1_state = PP_MP1_STATE_NONE;
		break;
	}
}

static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
{
	amdgpu_vf_error_trans_all(adev);
	adev->mp1_state = PP_MP1_STATE_NONE;
}

static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
{
	struct pci_dev *p = NULL;

	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
					adev->pdev->bus->number, 1);
	if (p) {
		pm_runtime_enable(&(p->dev));
		pm_runtime_resume(&(p->dev));
	}

	pci_dev_put(p);
}

static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
{
	enum amd_reset_method reset_method;
	struct pci_dev *p = NULL;
	u64 expires;

	/*
	 * For now, only BACO and mode1 reset are confirmed
	 * to suffer the audio issue if the audio device is not
	 * properly suspended first.
	 */
	reset_method = amdgpu_asic_reset_method(adev);
	if ((reset_method != AMD_RESET_METHOD_BACO) &&
	    (reset_method != AMD_RESET_METHOD_MODE1))
		return -EINVAL;

	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
					adev->pdev->bus->number, 1);
	if (!p)
		return -ENODEV;

	expires = pm_runtime_autosuspend_expiration(&(p->dev));
	if (!expires)
		/*
		 * If we cannot get the audio device's autosuspend delay,
		 * a fixed 4s interval is used. Since 3s is the audio
		 * controller's default autosuspend delay, 4s is
		 * guaranteed to cover it.
		 */
		expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;

	while (!pm_runtime_status_suspended(&(p->dev))) {
		if (!pm_runtime_suspend(&(p->dev)))
			break;

		if (expires < ktime_get_mono_fast_ns()) {
			dev_warn(adev->dev, "failed to suspend display audio\n");
			pci_dev_put(p);
			/* TODO: abort the succeeding gpu reset? */
			return -ETIMEDOUT;
		}
	}

	pm_runtime_disable(&(p->dev));

	pci_dev_put(p);
	return 0;
}

static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

#if defined(CONFIG_DEBUG_FS)
	if (!amdgpu_sriov_vf(adev))
		cancel_work(&adev->reset_work);
#endif

	if (adev->kfd.dev)
		cancel_work(&adev->kfd.reset_work);

	if (amdgpu_sriov_vf(adev))
		cancel_work(&adev->virt.flr_work);

	if (con && adev->ras_enabled)
		cancel_work(&con->recovery_work);
}

static int amdgpu_device_health_check(struct list_head *device_list_handle)
{
	struct amdgpu_device *tmp_adev;
	int ret = 0;
	u32 status;

	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status);
		if (PCI_POSSIBLE_ERROR(status)) {
			dev_err(tmp_adev->dev, "device lost from bus!");
			ret = -ENODEV;
		}
	}

	return ret;
}

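/*
 * GPU recovery sequence (see amdgpu_device_gpu_recover() below): build the
 * list of devices to reset (the whole hive for XGMI), lock the reset domain,
 * suspend display audio, stop the schedulers, run the per-IP pre-reset work,
 * perform the ASIC reset, re-initialize the devices, restart the schedulers
 * and finally release the reset domain again.
 */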
/**
 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
 *
 * @adev: amdgpu_device pointer
 * @job: the hung job which triggered the reset, if any
 * @reset_context: amdgpu reset context pointer
 *
 * Attempt to reset the GPU if it has hung (all asics).
 * Attempt to do a soft reset or a full reset and reinitialize the ASIC.
 * Returns 0 for success or an error on failure.
 */

int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
			      struct amdgpu_job *job,
			      struct amdgpu_reset_context *reset_context)
{
	struct list_head device_list, *device_list_handle = NULL;
	bool job_signaled = false;
	struct amdgpu_hive_info *hive = NULL;
	struct amdgpu_device *tmp_adev = NULL;
	int i, r = 0;
	bool need_emergency_restart = false;
	bool audio_suspended = false;
	int retry_limit = AMDGPU_MAX_RETRY_LIMIT;

	/*
	 * Special case: RAS triggered and full reset isn't supported
	 */
	need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);

	/*
	 * Flush RAM to disk so that after reboot
	 * the user can read the log and see why the system rebooted.
	 */
	if (need_emergency_restart && amdgpu_ras_get_context(adev) &&
	    amdgpu_ras_get_context(adev)->reboot) {
		DRM_WARN("Emergency reboot.");

		ksys_sync_helper();
		emergency_restart();
	}

	dev_info(adev->dev, "GPU %s begin!\n",
		 need_emergency_restart ? "jobs stop" : "reset");

	if (!amdgpu_sriov_vf(adev))
		hive = amdgpu_get_xgmi_hive(adev);
	if (hive)
		mutex_lock(&hive->hive_lock);

	reset_context->job = job;
	reset_context->hive = hive;
	/*
	 * Build list of devices to reset.
	 * In case we are in XGMI hive mode, resort the device list
	 * to put adev in the 1st position.
	 */
	INIT_LIST_HEAD(&device_list);
	if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			list_add_tail(&tmp_adev->reset_list, &device_list);
			if (adev->shutdown)
				tmp_adev->shutdown = true;
		}
		if (!list_is_first(&adev->reset_list, &device_list))
			list_rotate_to_front(&adev->reset_list, &device_list);
		device_list_handle = &device_list;
	} else {
		list_add_tail(&adev->reset_list, &device_list);
		device_list_handle = &device_list;
	}

	if (!amdgpu_sriov_vf(adev)) {
		r = amdgpu_device_health_check(device_list_handle);
		if (r)
			goto end_reset;
	}

	/* We need to lock reset domain only once both for XGMI and single device */
	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
				    reset_list);
	amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);

	/* block all schedulers and reset given job's ring */
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {

		amdgpu_device_set_mp1_state(tmp_adev);

		/*
		 * Try to put the audio codec into suspend state
		 * before the gpu reset starts.
		 *
		 * Because the power domain of the graphics device
		 * is shared with the AZ (audio) power domain, we may
		 * otherwise change the audio hardware from behind
		 * the audio driver's back and trigger audio codec
		 * errors.
		 */
		if (!amdgpu_device_suspend_display_audio(tmp_adev))
			audio_suspended = true;

		amdgpu_ras_set_error_query_ready(tmp_adev, false);

		cancel_delayed_work_sync(&tmp_adev->delayed_init_work);

		amdgpu_amdkfd_pre_reset(tmp_adev, reset_context);

		/*
		 * Mark these ASICs to be reset as untracked first,
		 * and add them back after the reset completes.
		 */
		amdgpu_unregister_gpu_instance(tmp_adev);

		drm_client_dev_suspend(adev_to_drm(tmp_adev), false);

		/* disable ras on ALL IPs */
		if (!need_emergency_restart &&
		    amdgpu_device_ip_need_full_reset(tmp_adev))
			amdgpu_ras_suspend(tmp_adev);

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!amdgpu_ring_sched_ready(ring))
				continue;

			drm_sched_stop(&ring->sched, job ? &job->base : NULL);

			if (need_emergency_restart)
				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
		}
		atomic_inc(&tmp_adev->gpu_reset_counter);
	}

	if (need_emergency_restart)
		goto skip_sched_resume;

	/*
	 * Must check guilty signal here since after this point all old
	 * HW fences are force signaled.
	 *
	 * job->base holds a reference to parent fence
	 */
	if (job && dma_fence_is_signaled(&job->hw_fence)) {
		job_signaled = true;
		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
		goto skip_hw_reset;
	}

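	/*
	 * The retry label below is taken again when an SR-IOV reset asks for
	 * a retry (AMDGPU_RETRY_SRIOV_RESET with retries remaining) or when a
	 * bare-metal reset returns -EAGAIN; every pass re-runs the per-device
	 * pre-reset work before attempting the ASIC reset once more.
	 */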
retry:	/* Rest of adevs pre asic reset from XGMI hive. */
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
		/* TODO: should we stop here? */
		if (r) {
			dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
				r, adev_to_drm(tmp_adev)->unique);
			tmp_adev->asic_reset_res = r;
		}
	}

	/* Actual ASIC resets if needed. */
	/* Host driver will handle XGMI hive reset for SRIOV */
	if (amdgpu_sriov_vf(adev)) {
		if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) {
			dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n");
			amdgpu_ras_set_fed(adev, true);
			set_bit(AMDGPU_HOST_FLR, &reset_context->flags);
		}

		r = amdgpu_device_reset_sriov(adev, reset_context);
		if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) {
			amdgpu_virt_release_full_gpu(adev, true);
			goto retry;
		}
		if (r)
			adev->asic_reset_res = r;
	} else {
		r = amdgpu_do_asic_reset(device_list_handle, reset_context);
		if (r && r == -EAGAIN)
			goto retry;
	}

	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		/*
		 * Drop any pending non scheduler resets queued before reset is done.
		 * Any reset scheduled after this point would be valid. Scheduler resets
		 * were already dropped during drm_sched_stop and no new ones can come
		 * in before drm_sched_start.
		 */
		amdgpu_device_stop_pending_resets(tmp_adev);
	}

skip_hw_reset:

	/* Post ASIC reset for all devs. */
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!amdgpu_ring_sched_ready(ring))
				continue;

			drm_sched_start(&ring->sched, 0);
		}

		if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));

		if (tmp_adev->asic_reset_res)
			r = tmp_adev->asic_reset_res;

		tmp_adev->asic_reset_res = 0;

		if (r) {
			/*
			 * Bad news, how do we tell it to userspace?
			 * For a RAS error we should report the GPU as bad
			 * rather than report a reset failure.
			 */
			if (reset_context->src != AMDGPU_RESET_SRC_RAS ||
			    !amdgpu_ras_eeprom_check_err_threshold(tmp_adev))
				dev_info(tmp_adev->dev, "GPU reset(%d) failed\n",
					 atomic_read(&tmp_adev->gpu_reset_counter));
			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
		} else {
			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
			if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
				DRM_WARN("smart shift update failed\n");
		}
	}

skip_sched_resume:
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		/* unlock kfd: SRIOV would do it separately */
		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_post_reset(tmp_adev);

		/*
		 * kfd_post_reset will do nothing if the kfd device is not
		 * initialized, so bring up kfd here if it was not
		 * initialized before.
		 */
		if (!adev->kfd.init_complete)
			amdgpu_amdkfd_device_init(adev);

		if (audio_suspended)
			amdgpu_device_resume_display_audio(tmp_adev);

		amdgpu_device_unset_mp1_state(tmp_adev);

		amdgpu_ras_set_error_query_ready(tmp_adev, true);
	}

	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
				    reset_list);
	amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);

end_reset:
	if (hive) {
		mutex_unlock(&hive->hive_lock);
		amdgpu_put_xgmi_hive(hive);
	}

	if (r)
		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);

	atomic_set(&adev->reset_domain->reset_res, r);
	return r;
}

/**
 * amdgpu_device_partner_bandwidth - find the bandwidth of the appropriate partner
 *
 * @adev: amdgpu_device pointer
 * @speed: pointer to the speed of the link
 * @width: pointer to the width of the link
 *
 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
 * first physical partner to an AMD dGPU.
 * This will exclude any virtual switches and links.
 */
static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
					    enum pci_bus_speed *speed,
					    enum pcie_link_width *width)
{
	struct pci_dev *parent = adev->pdev;

	if (!speed || !width)
		return;

	*speed = PCI_SPEED_UNKNOWN;
	*width = PCIE_LNK_WIDTH_UNKNOWN;

	if (amdgpu_device_pcie_dynamic_switching_supported(adev)) {
		while ((parent = pci_upstream_bridge(parent))) {
			/* skip upstream/downstream switches internal to dGPU */
			if (parent->vendor == PCI_VENDOR_ID_ATI)
				continue;
			*speed = pcie_get_speed_cap(parent);
			*width = pcie_get_width_cap(parent);
			break;
		}
	} else {
		/* use the current speeds rather than max if switching is not supported */
		pcie_bandwidth_available(adev->pdev, NULL, speed, width);
	}
}

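/*
 * Link capability selection order used below: the amdgpu_pcie_gen_cap /
 * amdgpu_pcie_lane_cap overrides win when set, devices on a root bus (which
 * covers APUs) fall back to the driver defaults, and everything else is
 * derived from the ASIC's own capability combined with that of the first
 * physical upstream partner found above.
 */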
/**
 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIE capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIE config space may not be available.
 */
static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
{
	struct pci_dev *pdev;
	enum pci_bus_speed speed_cap, platform_speed_cap;
	enum pcie_link_width platform_link_width;

	if (amdgpu_pcie_gen_cap)
		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;

	if (amdgpu_pcie_lane_cap)
		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;

	/* covers APUs as well */
	if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
		if (adev->pm.pcie_gen_mask == 0)
			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
		if (adev->pm.pcie_mlw_mask == 0)
			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
		return;
	}

	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
		return;

	amdgpu_device_partner_bandwidth(adev, &platform_speed_cap,
					&platform_link_width);

	if (adev->pm.pcie_gen_mask == 0) {
		/* asic caps */
		pdev = adev->pdev;
		speed_cap = pcie_get_speed_cap(pdev);
		if (speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
		} else {
			if (speed_cap == PCIE_SPEED_32_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
			else if (speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
		}
		/* platform caps */
		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
		} else {
			if (platform_speed_cap == PCIE_SPEED_32_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
			else if (platform_speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (platform_speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (platform_speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
		}
	}
	if (adev->pm.pcie_mlw_mask == 0) {
		if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
		} else {
			switch (platform_link_width) {
			case PCIE_LNK_X32:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X16:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X12:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X8:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X4:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X2:
				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
				break;
			case PCIE_LNK_X1:
				adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
				break;
			default:
				break;
			}
		}
	}
}

/**
 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
 *
 * @adev: amdgpu_device pointer
 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
 *
 * Return true if @peer_adev can access (DMA) @adev through the PCIe
 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
 * @peer_adev.
 */
bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
				      struct amdgpu_device *peer_adev)
{
#ifdef CONFIG_HSA_AMD_P2P
	bool p2p_access =
		!adev->gmc.xgmi.connected_to_cpu &&
		!(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
	if (!p2p_access)
		dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n",
			pci_name(peer_adev->pdev));

	bool is_large_bar = adev->gmc.visible_vram_size &&
			    adev->gmc.real_vram_size == adev->gmc.visible_vram_size;
	bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev);

	if (!p2p_addressable) {
		uint64_t address_mask = peer_adev->dev->dma_mask ?
			~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
		resource_size_t aper_limit =
			adev->gmc.aper_base + adev->gmc.aper_size - 1;

		p2p_addressable = !(adev->gmc.aper_base & address_mask ||
				    aper_limit & address_mask);
	}
	return pcie_p2p && is_large_bar && p2p_access && p2p_addressable;
#else
	return false;
#endif
}

int amdgpu_device_baco_enter(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (!amdgpu_device_supports_baco(dev))
		return -ENOTSUPP;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);

	return amdgpu_dpm_baco_enter(adev);
}

int amdgpu_device_baco_exit(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	int ret = 0;

	if (!amdgpu_device_supports_baco(dev))
		return -ENOTSUPP;

	ret = amdgpu_dpm_baco_exit(adev);
	if (ret)
		return ret;

	if (ras && adev->ras_enabled &&
	    adev->nbio.funcs->enable_doorbell_interrupt)
		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);

	if (amdgpu_passthrough(adev) && adev->nbio.funcs &&
	    adev->nbio.funcs->clear_doorbell_interrupt)
		adev->nbio.funcs->clear_doorbell_interrupt(adev);

	return 0;
}

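/*
 * PCI error recovery callbacks. The PCI core drives them in order:
 * amdgpu_pci_error_detected() on the initial error, amdgpu_pci_mmio_enabled()
 * when MMIO access is re-enabled without a slot reset, amdgpu_pci_slot_reset()
 * after the slot has been reset, and amdgpu_pci_resume() once normal
 * operation may continue.
 */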
/**
 * amdgpu_pci_error_detected - Called when a PCI error is detected.
 * @pdev: PCI device struct
 * @state: PCI channel state
 *
 * Description: Called when a PCI error is detected.
 *
 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		DRM_WARN("No support for XGMI hive yet...");
		return PCI_ERS_RESULT_DISCONNECT;
	}

	adev->pci_channel_state = state;

	switch (state) {
	case pci_channel_io_normal:
		return PCI_ERS_RESULT_CAN_RECOVER;
	/* Fatal error, prepare for slot reset */
	case pci_channel_io_frozen:
		/*
		 * Locking adev->reset_domain->sem will prevent any external access
		 * to GPU during PCI error recovery
		 */
		amdgpu_device_lock_reset_domain(adev->reset_domain);
		amdgpu_device_set_mp1_state(adev);

		/*
		 * Block any work scheduling as we do for regular GPU reset
		 * for the duration of the recovery
		 */
		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = adev->rings[i];

			if (!amdgpu_ring_sched_ready(ring))
				continue;

			drm_sched_stop(&ring->sched, NULL);
		}
		atomic_inc(&adev->gpu_reset_counter);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		/* Permanent error, prepare for device removal */
		return PCI_ERS_RESULT_DISCONNECT;
	}

	return PCI_ERS_RESULT_NEED_RESET;
}

/**
 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
 * @pdev: pointer to PCI device
 */
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
{

	DRM_INFO("PCI error: mmio enabled callback!!\n");

	/* TODO - dump whatever for debugging purposes */

	/*
	 * This is called only if amdgpu_pci_error_detected returns
	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
	 * works, no need to reset the slot.
	 */

	return PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
 * @pdev: PCI device struct
 *
 * Description: This routine is called by the pci error recovery
 * code after the PCI slot has been reset, just before we
 * should resume normal operations.
 */
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r, i;
	struct amdgpu_reset_context reset_context;
	u32 memsize;
	struct list_head device_list;

	/* PCI error slot reset should be skipped during RAS recovery */
	if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	     amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
	    amdgpu_ras_in_recovery(adev))
		return PCI_ERS_RESULT_RECOVERED;

	DRM_INFO("PCI error: slot reset callback!!\n");

	memset(&reset_context, 0, sizeof(reset_context));

	INIT_LIST_HEAD(&device_list);
	list_add_tail(&adev->reset_list, &device_list);

	/* wait for asic to come out of reset */
	msleep(500);

	/* Restore PCI confspace */
	amdgpu_device_load_pci_state(pdev);

	/* confirm ASIC came out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		memsize = amdgpu_asic_get_config_memsize(adev);

		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}
	if (memsize == 0xffffffff) {
		r = -ETIME;
		goto out;
	}

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);

	adev->no_hw_access = true;
	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
	adev->no_hw_access = false;
	if (r)
		goto out;

	r = amdgpu_do_asic_reset(&device_list, &reset_context);

out:
	if (!r) {
		if (amdgpu_device_cache_pci_state(adev->pdev))
			pci_restore_state(adev->pdev);

		DRM_INFO("PCIe error recovery succeeded\n");
	} else {
		DRM_ERROR("PCIe error recovery failed, err:%d", r);
		amdgpu_device_unset_mp1_state(adev);
		amdgpu_device_unlock_reset_domain(adev->reset_domain);
	}

	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it's
 * OK to resume normal operation.
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: resume callback!!\n");

	/* Only continue execution for the case of pci_channel_io_frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!amdgpu_ring_sched_ready(ring))
			continue;

		drm_sched_start(&ring->sched, 0);
	}

	amdgpu_device_unset_mp1_state(adev);
	amdgpu_device_unlock_reset_domain(adev->reset_domain);
}

bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (amdgpu_sriov_vf(adev))
		return false;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

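/*
 * HDP (Host Data Path) flush/invalidate helpers used around CPU access to
 * VRAM through the PCIe BAR. They are skipped where no HDP maintenance is
 * needed: APUs that are not running in passthrough mode and GPUs whose
 * memory is directly connected to the CPU over XGMI.
 */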
void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
			     struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
				  struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}

int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. It will help to maintain the error context when an error
 * occurs. Compared to a simple hang, the system will stay stable at least
 * for SSH access. Then it should be trivial to inspect the hardware state
 * and see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs,
 *    etc.), clears all CPU mappings to the device and disallows remappings
 *    through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	amdgpu_xcp_dev_unplug(adev);
	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				 u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				  u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

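/*
 * Gang submit support: a gang is a set of submissions to different rings
 * that have to run on the hardware together. adev->gang_submit holds the
 * fence of the current gang leader; amdgpu_device_switch_gang() below only
 * installs a new leader once the previous gang's fence has signaled.
 */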
/**
 * amdgpu_device_get_gang - return a reference to the current gang
 * @adev: amdgpu_device pointer
 *
 * Returns: A new reference to the current gang leader.
 */
struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev)
{
	struct dma_fence *fence;

	rcu_read_lock();
	fence = dma_fence_get_rcu_safe(&adev->gang_submit);
	rcu_read_unlock();
	return fence;
}

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	do {
		dma_fence_put(old);
		old = amdgpu_device_get_gang(adev);
		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old))
			return old;

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	dma_fence_put(old);
	return NULL;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
				    uint32_t inst, uint32_t reg_addr, char reg_name[],
				    uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				 inst, reg_name, (uint32_t)expected_value,
				 (uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}

ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
{
	ssize_t size = 0;

	if (!ring || !ring->adev)
		return size;

	if (amdgpu_device_should_recover_gpu(ring->adev))
		size |= AMDGPU_RESET_TYPE_FULL;

	if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
	    !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery)
		size |= AMDGPU_RESET_TYPE_SOFT_RESET;

	return size;
}

ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
{
	ssize_t size = 0;

	if (supported_reset == 0) {
		size += sysfs_emit_at(buf, size, "unsupported");
		size += sysfs_emit_at(buf, size, "\n");
		return size;
	}

	if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
		size += sysfs_emit_at(buf, size, "soft ");

	if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
		size += sysfs_emit_at(buf, size, "queue ");

	if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)
		size += sysfs_emit_at(buf, size, "pipe ");

	if (supported_reset & AMDGPU_RESET_TYPE_FULL)
		size += sysfs_emit_at(buf, size, "full ");

	size += sysfs_emit_at(buf, size, "\n");
	return size;
}