/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */

#include <linux/aperture.h>
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/pci-p2pdma.h>
#include <linux/apple-gmux.h>

#include <drm/drm_atomic_helper.h>
#include <drm/drm_client_event.h>
#include <drm/drm_crtc_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/device.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"
#include "amdgpu_virt.h"
#include "amdgpu_dev_coredump.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

#if IS_ENABLED(CONFIG_X86)
#include <asm/intel-family.h>
#endif

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
#define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
#define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
#define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)

static const struct drm_driver amdgpu_kms_driver;

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"ALDEBARAN",
	"NAVI10",
	"CYAN_SKILLFISH",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"VANGOGH",
	"DIMGREY_CAVEFISH",
	"BEIGE_GOBY",
	"YELLOW_CARP",
	"IP DISCOVERY",
	"LAST",
};

#define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0)
/*
 * Default init level where all blocks are expected to be initialized. This is
 * the level of initialization expected by default and also after a full reset
 * of the device.
 */
struct amdgpu_init_level amdgpu_init_default = {
	.level = AMDGPU_INIT_LEVEL_DEFAULT,
	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

struct amdgpu_init_level amdgpu_init_recovery = {
	.level = AMDGPU_INIT_LEVEL_RESET_RECOVERY,
	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

/*
 * Minimal blocks needed to be initialized before an XGMI hive can be reset.
 * This is used for cases like reset on initialization where the entire hive
 * needs to be reset before first use.
 */
struct amdgpu_init_level amdgpu_init_minimal_xgmi = {
	.level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI,
	.hwini_ip_block_mask =
		BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) |
		BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) |
		BIT(AMD_IP_BLOCK_TYPE_PSP)
};

static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev,
					     enum amd_ip_block_type block)
{
	return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0;
}

void amdgpu_set_init_level(struct amdgpu_device *adev,
			   enum amdgpu_init_lvl_id lvl)
{
	switch (lvl) {
	case AMDGPU_INIT_LEVEL_MINIMAL_XGMI:
		adev->init_lvl = &amdgpu_init_minimal_xgmi;
		break;
	case AMDGPU_INIT_LEVEL_RESET_RECOVERY:
		adev->init_lvl = &amdgpu_init_recovery;
		break;
	case AMDGPU_INIT_LEVEL_DEFAULT:
		fallthrough;
	default:
		adev->init_lvl = &amdgpu_init_default;
		break;
	}
}

static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return sysfs_emit(buf, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, 0444,
		   amdgpu_device_get_pcie_replay_count, NULL);
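
/*
 * Editor's note (illustrative, not part of the driver): user space typically
 * reads this attribute through the PCI device's sysfs directory that the DRM
 * node links to, e.g. (path assumed for illustration):
 *
 *   $ cat /sys/class/drm/card0/device/pcie_replay_count
 */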

static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
					  struct bin_attribute *attr, char *buf,
					  loff_t ppos, size_t count)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	ssize_t bytes_read;

	switch (ppos) {
	case AMDGPU_SYS_REG_STATE_XGMI:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_WAFL:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_PCIE:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR_1:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count);
		break;
	default:
		return -EINVAL;
	}

	return bytes_read;
}

BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL,
	 AMDGPU_SYS_REG_STATE_END);

int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev)
{
	int ret;

	if (!amdgpu_asic_get_reg_state_supported(adev))
		return 0;

	ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state);

	return ret;
}

void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
{
	if (!amdgpu_asic_get_reg_state_supported(adev))
		return;
	sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
}

int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block)
{
	int r;

	if (ip_block->version->funcs->suspend) {
		r = ip_block->version->funcs->suspend(ip_block);
		if (r) {
			dev_err(ip_block->adev->dev,
				"suspend of IP block <%s> failed %d\n",
				ip_block->version->funcs->name, r);
			return r;
		}
	}

	ip_block->status.hw = false;
	return 0;
}

int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block)
{
	int r;

	if (ip_block->version->funcs->resume) {
		r = ip_block->version->funcs->resume(ip_block);
		if (r) {
			dev_err(ip_block->adev->dev,
				"resume of IP block <%s> failed %d\n",
				ip_block->version->funcs->name, r);
			return r;
		}
	}

	ip_block->status.hw = true;
	return 0;
}

/**
 * DOC: board_info
 *
 * The amdgpu driver provides a sysfs API for giving board related information.
 * It provides the form factor information in the format
 *
 *   type : form factor
 *
 * Possible form factor values
 *
 * - "cem"     - PCIE CEM card
 * - "oam"     - Open Compute Accelerator Module
 * - "unknown" - Not known
 *
 */

static ssize_t amdgpu_device_get_board_info(struct device *dev,
					    struct device_attribute *attr,
					    char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
	const char *pkg;

	if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
		pkg_type = adev->smuio.funcs->get_pkg_type(adev);

	switch (pkg_type) {
	case AMDGPU_PKG_TYPE_CEM:
		pkg = "cem";
		break;
	case AMDGPU_PKG_TYPE_OAM:
		pkg = "oam";
		break;
	default:
		pkg = "unknown";
		break;
	}

	return sysfs_emit(buf, "%s : %s\n", "type", pkg);
}

static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);

static struct attribute *amdgpu_board_attrs[] = {
	&dev_attr_board_info.attr,
	NULL,
};

static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
					     struct attribute *attr, int n)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	if (adev->flags & AMD_IS_APU)
		return 0;

	return attr->mode;
}

static const struct attribute_group amdgpu_board_attrs_group = {
	.attrs = amdgpu_board_attrs,
	.is_visible = amdgpu_board_attrs_is_visible
};
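
/*
 * Editor's note (illustrative): on dGPUs the attribute reports the form factor
 * as "type : <form factor>", so reading it might look like (path assumed):
 *
 *   $ cat /sys/class/drm/card0/device/board_info
 *   type : oam
 */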

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);


/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Return:
 * 1 if the device supports BACO;
 * 3 if the device supports MACO (only works if BACO is supported)
 * otherwise return 0.
 */
int amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
{
	struct drm_device *dev;
	int bamaco_support;

	dev = adev_to_drm(adev);

	adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
	bamaco_support = amdgpu_device_supports_baco(dev);

	switch (amdgpu_runtime_pm) {
	case 2:
		if (bamaco_support & MACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
			dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
		} else if (bamaco_support == BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Requested mode BAMACO not available, fallback to use BACO\n");
		}
		break;
	case 1:
		if (bamaco_support & BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Forcing BACO for runtime pm\n");
		}
		break;
	case -1:
	case -2:
		if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
			dev_info(adev->dev, "Using ATPX for runtime pm\n");
		} else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
			dev_info(adev->dev, "Using BOCO for runtime pm\n");
		} else {
			if (!bamaco_support)
				goto no_runtime_pm;

			switch (adev->asic_type) {
			case CHIP_VEGA20:
			case CHIP_ARCTURUS:
				/* BACO is not supported on vega20 and arcturus */
				break;
			case CHIP_VEGA10:
				/* enable BACO as runpm mode if noretry=0 */
				if (!adev->gmc.noretry)
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			default:
				/* enable BACO as runpm mode on CI+ */
				adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			}

			if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
				if (bamaco_support & MACO_SUPPORT) {
					adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
					dev_info(adev->dev, "Using BAMACO for runtime pm\n");
				} else {
					dev_info(adev->dev, "Using BACO for runtime pm\n");
				}
			}
		}
		break;
	case 0:
		dev_info(adev->dev, "runtime pm is manually disabled\n");
		break;
	default:
		break;
	}

no_runtime_pm:
	if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
		dev_info(adev->dev, "Runtime PM not available\n");
}
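
/*
 * Editor's note (illustrative): the switch above is driven by the runtime pm
 * module parameter; assuming the standard amdgpu.runpm parameter name,
 *
 *   modprobe amdgpu runpm=2
 *
 * requests BAMACO (falling back to BACO when MACO is not supported), runpm=1
 * forces BACO, runpm=0 disables runtime power management, and the default of
 * -1/-2 lets the driver pick PX, BOCO or BACO automatically.
 */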

/**
 * amdgpu_device_supports_smart_shift - Is the device dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the size of @buf must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}
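
/*
 * Editor's note (illustrative): in the loop above MM_INDEX carries the low
 * 31 bits of the byte offset (bit 31 is set to select the memory aperture,
 * if this reading of the register layout is correct), while MM_INDEX_HI
 * carries the upper bits; MM_INDEX_HI is therefore only rewritten when the
 * access crosses a 2 GB boundary.
 */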

/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the size of @buf must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value means how many bytes have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the size of @buf must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try to use the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}
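
/*
 * Editor's note (illustrative): most callers do not use the helpers below
 * directly but go through the RREG32()/WREG32() macro family, which, as far
 * as this note goes, expands to these functions; e.g. RREG32(reg) is roughly
 * amdgpu_device_rreg(adev, (reg), 0) and RREG32_NO_KIQ() passes
 * AMDGPU_REGS_NO_KIQ to skip the KIQ path.
 */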

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}


/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
				uint32_t reg, uint32_t acc_flags,
				uint32_t xcc_id)
{
	uint32_t ret, rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	return ret;
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/**
 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 * @xcc_id: xcc accelerated compute core id
 *
 * This function is invoked only for debugfs register access.
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v,
			     uint32_t xcc_id)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_xcc_wreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t v,
			    uint32_t acc_flags, uint32_t xcc_id)
{
	uint32_t rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, true,
							 &rlcg_flag)) {
			amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u32 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
				    u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	if (unlikely(!adev->nbio.funcs)) {
		pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
		pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
	} else {
		pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
		pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	}

	if (reg_addr >> 32) {
		if (unlikely(!adev->nbio.funcs))
			pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
		else
			pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	} else {
		pcie_index_hi = 0;
	}

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}
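
/*
 * Editor's note: in the indirect accessors here, every writel() to an index
 * register is immediately followed by a readl() of the same register. The
 * read-back is understood to flush the posted MMIO write so the index value
 * has reached the device before the data register is accessed.
 */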

/**
 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
				      u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r |= ((u64)readl(pcie_data_offset) << 32);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
				     u64 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
				       u64 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_rev_id - query device rev_id
 *
 * @adev: amdgpu_device pointer
 *
 * Return device rev_id
 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
{
	return adev->nbio.funcs->get_rev_id(adev);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	int ret;

	amdgpu_asic_pre_asic_init(adev);

	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
		amdgpu_psp_wait_for_bootloader(adev);
		ret = amdgpu_atomfirmware_asic_init(adev, true);
		return ret;
	} else {
		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
	}

	return 0;
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or an -error on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long flags, offset;

	spin_lock_irqsave(&adev->wb.lock, flags);
	offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	unsigned long flags;

	wb >>= 3;
	spin_lock_irqsave(&adev->wb.lock, flags);
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
	spin_unlock_irqrestore(&adev->wb.lock, flags);
}
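
/*
 * Editor's sketch (illustrative, not part of the driver): a typical consumer
 * allocates a writeback slot, derives the CPU and GPU views of it from the
 * returned dword index, and frees it again when done. Assuming the adev->wb
 * bookkeeping shown above:
 *
 *	u32 idx;
 *
 *	if (!amdgpu_device_wb_get(adev, &idx)) {
 *		volatile u32 *cpu_ptr = &adev->wb.wb[idx];
 *		u64 gpu_addr = adev->wb.gpu_addr + idx * 4;
 *
 *		// hardware writes status to gpu_addr, the driver polls *cpu_ptr
 *		amdgpu_device_wb_free(adev, idx);
 *	}
 */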

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned int i;
	u16 cmd;
	int r;

	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
		return 0;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
	if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
		DRM_WARN("System can't access extended configuration space, please check!!\n");

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
		return false;

	return true;
}

/*
 * GPU helpers function.
 */
/**
 * amdgpu_device_need_post - check if the hw need post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if post is needed or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (!amdgpu_device_read_bios(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* For FIJI: in the whole-GPU pass-through virtualization case,
		 * after a VM reboot some old SMC firmware still needs the driver
		 * to do a vPost, otherwise the GPU hangs. SMC firmware versions
		 * above 22.15 don't have this flaw, so we force a vPost for SMC
		 * versions below 22.15.
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if an error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			release_firmware(adev->pm.fw);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/*
 * Check whether seamless boot is supported.
 *
 * So far we only support seamless boot on DCE 3.0 or later.
 * If users report that it works on older ASICS as well, we may
 * loosen this.
 */
bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
{
	switch (amdgpu_seamless) {
	case -1:
		break;
	case 1:
		return true;
	case 0:
		return false;
	default:
		DRM_ERROR("Invalid value for amdgpu.seamless: %d\n",
			  amdgpu_seamless);
		return false;
	}

	if (!(adev->flags & AMD_IS_APU))
		return false;

	if (adev->mman.keep_stolen_vga_memory)
		return false;

	return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
}

/*
 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
 * don't support dynamic speed switching. Until we have confirmation from Intel
 * that a specific host supports it, it's safer that we keep it disabled for all.
 *
 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
 */
static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	/* eGPUs change speeds based on USB4 fabric conditions */
	if (dev_is_removable(adev->dev))
		return true;

	if (c->x86_vendor == X86_VENDOR_INTEL)
		return false;
#endif
	return true;
}
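
/*
 * Editor's note (illustrative): the decision below is driven by the amdgpu
 * aspm module parameter; assuming the standard amdgpu.aspm parameter name,
 * aspm=0 disables ASPM, aspm=1 forces it on, and the default of -1 defers to
 * the APU/PCIe-DPM checks and to whether the upstream bridge already has ASPM
 * enabled, e.g.:
 *
 *   modprobe amdgpu aspm=0
 */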

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
	switch (amdgpu_aspm) {
	case -1:
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	if (adev->flags & AMD_IS_APU)
		return false;
	if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK))
		return false;
	return pcie_aspm_enabled(adev->pdev);
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
		bool state)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));

	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines the number of bits in the page table versus the
 * page directory; a page is 4KB, so we have a 12 bit offset, a minimum of
 * 9 bits in the page table and the remaining bits are in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory
	 */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}
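
/*
 * Worked example (editor's note): with 4KB pages there is a 12 bit in-page
 * offset, so amdgpu.vm_block_size=9 gives page tables with 2^9 = 512 entries,
 * i.e. each page directory entry spans 512 * 4KB = 2MB of address space; the
 * remaining virtual address bits are resolved by the page directory level(s).
 */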
1908 */ 1909 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1910 { 1911 /* no need to check the default value */ 1912 if (amdgpu_vm_size == -1) 1913 return; 1914 1915 if (amdgpu_vm_size < 1) { 1916 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1917 amdgpu_vm_size); 1918 amdgpu_vm_size = -1; 1919 } 1920 } 1921 1922 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1923 { 1924 struct sysinfo si; 1925 bool is_os_64 = (sizeof(void *) == 8); 1926 uint64_t total_memory; 1927 uint64_t dram_size_seven_GB = 0x1B8000000; 1928 uint64_t dram_size_three_GB = 0xB8000000; 1929 1930 if (amdgpu_smu_memory_pool_size == 0) 1931 return; 1932 1933 if (!is_os_64) { 1934 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1935 goto def_value; 1936 } 1937 si_meminfo(&si); 1938 total_memory = (uint64_t)si.totalram * si.mem_unit; 1939 1940 if ((amdgpu_smu_memory_pool_size == 1) || 1941 (amdgpu_smu_memory_pool_size == 2)) { 1942 if (total_memory < dram_size_three_GB) 1943 goto def_value1; 1944 } else if ((amdgpu_smu_memory_pool_size == 4) || 1945 (amdgpu_smu_memory_pool_size == 8)) { 1946 if (total_memory < dram_size_seven_GB) 1947 goto def_value1; 1948 } else { 1949 DRM_WARN("Smu memory pool size not supported\n"); 1950 goto def_value; 1951 } 1952 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1953 1954 return; 1955 1956 def_value1: 1957 DRM_WARN("No enough system memory\n"); 1958 def_value: 1959 adev->pm.smu_prv_buffer_size = 0; 1960 } 1961 1962 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 1963 { 1964 if (!(adev->flags & AMD_IS_APU) || 1965 adev->asic_type < CHIP_RAVEN) 1966 return 0; 1967 1968 switch (adev->asic_type) { 1969 case CHIP_RAVEN: 1970 if (adev->pdev->device == 0x15dd) 1971 adev->apu_flags |= AMD_APU_IS_RAVEN; 1972 if (adev->pdev->device == 0x15d8) 1973 adev->apu_flags |= AMD_APU_IS_PICASSO; 1974 break; 1975 case CHIP_RENOIR: 1976 if ((adev->pdev->device == 0x1636) || 1977 (adev->pdev->device == 0x164c)) 1978 adev->apu_flags |= AMD_APU_IS_RENOIR; 1979 else 1980 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 1981 break; 1982 case CHIP_VANGOGH: 1983 adev->apu_flags |= AMD_APU_IS_VANGOGH; 1984 break; 1985 case CHIP_YELLOW_CARP: 1986 break; 1987 case CHIP_CYAN_SKILLFISH: 1988 if ((adev->pdev->device == 0x13FE) || 1989 (adev->pdev->device == 0x143F)) 1990 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 1991 break; 1992 default: 1993 break; 1994 } 1995 1996 return 0; 1997 } 1998 1999 /** 2000 * amdgpu_device_check_arguments - validate module params 2001 * 2002 * @adev: amdgpu_device pointer 2003 * 2004 * Validates certain module parameters and updates 2005 * the associated values used by the driver (all asics). 
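 *
 * For example, amdgpu.sched_jobs=6 is not a power of two and is rounded up
 * to 8 here, while amdgpu.sched_jobs=2 is raised to the minimum of 4; a
 * warning is printed in either case.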
2006 */ 2007 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 2008 { 2009 int i; 2010 2011 if (amdgpu_sched_jobs < 4) { 2012 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 2013 amdgpu_sched_jobs); 2014 amdgpu_sched_jobs = 4; 2015 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 2016 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 2017 amdgpu_sched_jobs); 2018 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 2019 } 2020 2021 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 2022 /* gart size must be greater or equal to 32M */ 2023 dev_warn(adev->dev, "gart size (%d) too small\n", 2024 amdgpu_gart_size); 2025 amdgpu_gart_size = -1; 2026 } 2027 2028 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 2029 /* gtt size must be greater or equal to 32M */ 2030 dev_warn(adev->dev, "gtt size (%d) too small\n", 2031 amdgpu_gtt_size); 2032 amdgpu_gtt_size = -1; 2033 } 2034 2035 /* valid range is between 4 and 9 inclusive */ 2036 if (amdgpu_vm_fragment_size != -1 && 2037 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 2038 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 2039 amdgpu_vm_fragment_size = -1; 2040 } 2041 2042 if (amdgpu_sched_hw_submission < 2) { 2043 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 2044 amdgpu_sched_hw_submission); 2045 amdgpu_sched_hw_submission = 2; 2046 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 2047 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 2048 amdgpu_sched_hw_submission); 2049 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 2050 } 2051 2052 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 2053 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 2054 amdgpu_reset_method = -1; 2055 } 2056 2057 amdgpu_device_check_smu_prv_buffer_size(adev); 2058 2059 amdgpu_device_check_vm_size(adev); 2060 2061 amdgpu_device_check_block_size(adev); 2062 2063 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 2064 2065 for (i = 0; i < MAX_XCP; i++) 2066 adev->enforce_isolation[i] = !!enforce_isolation; 2067 2068 return 0; 2069 } 2070 2071 /** 2072 * amdgpu_switcheroo_set_state - set switcheroo state 2073 * 2074 * @pdev: pci dev pointer 2075 * @state: vga_switcheroo state 2076 * 2077 * Callback for the switcheroo driver. Suspends or resumes 2078 * the asics before or after it is powered up using ACPI methods. 
2079 */ 2080 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 2081 enum vga_switcheroo_state state) 2082 { 2083 struct drm_device *dev = pci_get_drvdata(pdev); 2084 int r; 2085 2086 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 2087 return; 2088 2089 if (state == VGA_SWITCHEROO_ON) { 2090 pr_info("switched on\n"); 2091 /* don't suspend or resume card normally */ 2092 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2093 2094 pci_set_power_state(pdev, PCI_D0); 2095 amdgpu_device_load_pci_state(pdev); 2096 r = pci_enable_device(pdev); 2097 if (r) 2098 DRM_WARN("pci_enable_device failed (%d)\n", r); 2099 amdgpu_device_resume(dev, true); 2100 2101 dev->switch_power_state = DRM_SWITCH_POWER_ON; 2102 } else { 2103 pr_info("switched off\n"); 2104 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2105 amdgpu_device_prepare(dev); 2106 amdgpu_device_suspend(dev, true); 2107 amdgpu_device_cache_pci_state(pdev); 2108 /* Shut down the device */ 2109 pci_disable_device(pdev); 2110 pci_set_power_state(pdev, PCI_D3cold); 2111 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 2112 } 2113 } 2114 2115 /** 2116 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 2117 * 2118 * @pdev: pci dev pointer 2119 * 2120 * Callback for the switcheroo driver. Check of the switcheroo 2121 * state can be changed. 2122 * Returns true if the state can be changed, false if not. 2123 */ 2124 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 2125 { 2126 struct drm_device *dev = pci_get_drvdata(pdev); 2127 2128 /* 2129 * FIXME: open_count is protected by drm_global_mutex but that would lead to 2130 * locking inversion with the driver load path. And the access here is 2131 * completely racy anyway. So don't bother with locking for now. 2132 */ 2133 return atomic_read(&dev->open_count) == 0; 2134 } 2135 2136 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 2137 .set_gpu_state = amdgpu_switcheroo_set_state, 2138 .reprobe = NULL, 2139 .can_switch = amdgpu_switcheroo_can_switch, 2140 }; 2141 2142 /** 2143 * amdgpu_device_ip_set_clockgating_state - set the CG state 2144 * 2145 * @dev: amdgpu_device pointer 2146 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2147 * @state: clockgating state (gate or ungate) 2148 * 2149 * Sets the requested clockgating state for all instances of 2150 * the hardware IP specified. 2151 * Returns the error code from the last instance. 2152 */ 2153 int amdgpu_device_ip_set_clockgating_state(void *dev, 2154 enum amd_ip_block_type block_type, 2155 enum amd_clockgating_state state) 2156 { 2157 struct amdgpu_device *adev = dev; 2158 int i, r = 0; 2159 2160 for (i = 0; i < adev->num_ip_blocks; i++) { 2161 if (!adev->ip_blocks[i].status.valid) 2162 continue; 2163 if (adev->ip_blocks[i].version->type != block_type) 2164 continue; 2165 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 2166 continue; 2167 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 2168 (void *)adev, state); 2169 if (r) 2170 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 2171 adev->ip_blocks[i].version->funcs->name, r); 2172 } 2173 return r; 2174 } 2175 2176 /** 2177 * amdgpu_device_ip_set_powergating_state - set the PG state 2178 * 2179 * @dev: amdgpu_device pointer 2180 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2181 * @state: powergating state (gate or ungate) 2182 * 2183 * Sets the requested powergating state for all instances of 2184 * the hardware IP specified. 
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
                                           enum amd_ip_block_type block_type,
                                           enum amd_powergating_state state)
{
        struct amdgpu_device *adev = dev;
        int i, r = 0;

        for (i = 0; i < adev->num_ip_blocks; i++) {
                if (!adev->ip_blocks[i].status.valid)
                        continue;
                if (adev->ip_blocks[i].version->type != block_type)
                        continue;
                if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
                        continue;
                r = adev->ip_blocks[i].version->funcs->set_powergating_state(
                        (void *)adev, state);
                if (r)
                        DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
                                  adev->ip_blocks[i].version->funcs->name, r);
        }
        return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
                                            u64 *flags)
{
        int i;

        for (i = 0; i < adev->num_ip_blocks; i++) {
                if (!adev->ip_blocks[i].status.valid)
                        continue;
                if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
                        adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
        }
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
                                   enum amd_ip_block_type block_type)
{
        int i, r;

        for (i = 0; i < adev->num_ip_blocks; i++) {
                if (!adev->ip_blocks[i].status.valid)
                        continue;
                if (adev->ip_blocks[i].version->type == block_type) {
                        if (adev->ip_blocks[i].version->funcs->wait_for_idle) {
                                r = adev->ip_blocks[i].version->funcs->wait_for_idle(
                                        &adev->ip_blocks[i]);
                                if (r)
                                        return r;
                        }
                        break;
                }
        }
        return 0;
}

/**
 * amdgpu_device_ip_is_valid - is the hardware IP enabled
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is enabled or not.
 * Returns true if the IP is enabled, false if not.
 */
bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev,
                               enum amd_ip_block_type block_type)
{
        int i;

        for (i = 0; i < adev->num_ip_blocks; i++) {
                if (adev->ip_blocks[i].version->type == block_type)
                        return adev->ip_blocks[i].status.valid;
        }
        return false;
}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
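 *
 * Typical usage sketch (the version check is purely illustrative):
 *
 *   struct amdgpu_ip_block *ip_block =
 *           amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *   if (ip_block && ip_block->version->major >= 9)
 *           ... GFX v9 or newer specific handling ...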
2295 */ 2296 struct amdgpu_ip_block * 2297 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 2298 enum amd_ip_block_type type) 2299 { 2300 int i; 2301 2302 for (i = 0; i < adev->num_ip_blocks; i++) 2303 if (adev->ip_blocks[i].version->type == type) 2304 return &adev->ip_blocks[i]; 2305 2306 return NULL; 2307 } 2308 2309 /** 2310 * amdgpu_device_ip_block_version_cmp 2311 * 2312 * @adev: amdgpu_device pointer 2313 * @type: enum amd_ip_block_type 2314 * @major: major version 2315 * @minor: minor version 2316 * 2317 * return 0 if equal or greater 2318 * return 1 if smaller or the ip_block doesn't exist 2319 */ 2320 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 2321 enum amd_ip_block_type type, 2322 u32 major, u32 minor) 2323 { 2324 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 2325 2326 if (ip_block && ((ip_block->version->major > major) || 2327 ((ip_block->version->major == major) && 2328 (ip_block->version->minor >= minor)))) 2329 return 0; 2330 2331 return 1; 2332 } 2333 2334 /** 2335 * amdgpu_device_ip_block_add 2336 * 2337 * @adev: amdgpu_device pointer 2338 * @ip_block_version: pointer to the IP to add 2339 * 2340 * Adds the IP block driver information to the collection of IPs 2341 * on the asic. 2342 */ 2343 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 2344 const struct amdgpu_ip_block_version *ip_block_version) 2345 { 2346 if (!ip_block_version) 2347 return -EINVAL; 2348 2349 switch (ip_block_version->type) { 2350 case AMD_IP_BLOCK_TYPE_VCN: 2351 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2352 return 0; 2353 break; 2354 case AMD_IP_BLOCK_TYPE_JPEG: 2355 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2356 return 0; 2357 break; 2358 default: 2359 break; 2360 } 2361 2362 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, 2363 ip_block_version->funcs->name); 2364 2365 adev->ip_blocks[adev->num_ip_blocks].adev = adev; 2366 2367 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2368 2369 return 0; 2370 } 2371 2372 /** 2373 * amdgpu_device_enable_virtual_display - enable virtual display feature 2374 * 2375 * @adev: amdgpu_device pointer 2376 * 2377 * Enabled the virtual display feature if the user has enabled it via 2378 * the module parameter virtual_display. This feature provides a virtual 2379 * display hardware on headless boards or in virtualized environments. 2380 * This function parses and validates the configuration string specified by 2381 * the user and configues the virtual display configuration (number of 2382 * virtual connectors, crtcs, etc.) specified. 
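 *
 * The string is a semicolon-separated list of "<pci address>,<num_crtc>"
 * entries, or "all"; e.g. amdgpu.virtual_display=0000:03:00.0,2 (the address
 * is purely illustrative) enables two virtual crtcs on that device, with
 * num_crtc clamped to the 1-6 range.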
2383 */ 2384 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2385 { 2386 adev->enable_virtual_display = false; 2387 2388 if (amdgpu_virtual_display) { 2389 const char *pci_address_name = pci_name(adev->pdev); 2390 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2391 2392 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2393 pciaddstr_tmp = pciaddstr; 2394 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2395 pciaddname = strsep(&pciaddname_tmp, ","); 2396 if (!strcmp("all", pciaddname) 2397 || !strcmp(pci_address_name, pciaddname)) { 2398 long num_crtc; 2399 int res = -1; 2400 2401 adev->enable_virtual_display = true; 2402 2403 if (pciaddname_tmp) 2404 res = kstrtol(pciaddname_tmp, 10, 2405 &num_crtc); 2406 2407 if (!res) { 2408 if (num_crtc < 1) 2409 num_crtc = 1; 2410 if (num_crtc > 6) 2411 num_crtc = 6; 2412 adev->mode_info.num_crtc = num_crtc; 2413 } else { 2414 adev->mode_info.num_crtc = 1; 2415 } 2416 break; 2417 } 2418 } 2419 2420 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2421 amdgpu_virtual_display, pci_address_name, 2422 adev->enable_virtual_display, adev->mode_info.num_crtc); 2423 2424 kfree(pciaddstr); 2425 } 2426 } 2427 2428 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2429 { 2430 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2431 adev->mode_info.num_crtc = 1; 2432 adev->enable_virtual_display = true; 2433 DRM_INFO("virtual_display:%d, num_crtc:%d\n", 2434 adev->enable_virtual_display, adev->mode_info.num_crtc); 2435 } 2436 } 2437 2438 /** 2439 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2440 * 2441 * @adev: amdgpu_device pointer 2442 * 2443 * Parses the asic configuration parameters specified in the gpu info 2444 * firmware and makes them availale to the driver for use in configuring 2445 * the asic. 2446 * Returns 0 on success, -EINVAL on failure. 2447 */ 2448 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2449 { 2450 const char *chip_name; 2451 int err; 2452 const struct gpu_info_firmware_header_v1_0 *hdr; 2453 2454 adev->firmware.gpu_info_fw = NULL; 2455 2456 if (adev->mman.discovery_bin) 2457 return 0; 2458 2459 switch (adev->asic_type) { 2460 default: 2461 return 0; 2462 case CHIP_VEGA10: 2463 chip_name = "vega10"; 2464 break; 2465 case CHIP_VEGA12: 2466 chip_name = "vega12"; 2467 break; 2468 case CHIP_RAVEN: 2469 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2470 chip_name = "raven2"; 2471 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2472 chip_name = "picasso"; 2473 else 2474 chip_name = "raven"; 2475 break; 2476 case CHIP_ARCTURUS: 2477 chip_name = "arcturus"; 2478 break; 2479 case CHIP_NAVI12: 2480 chip_name = "navi12"; 2481 break; 2482 } 2483 2484 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, 2485 "amdgpu/%s_gpu_info.bin", chip_name); 2486 if (err) { 2487 dev_err(adev->dev, 2488 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n", 2489 chip_name); 2490 goto out; 2491 } 2492 2493 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2494 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2495 2496 switch (hdr->version_major) { 2497 case 1: 2498 { 2499 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2500 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2501 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2502 2503 /* 2504 * Should be droped when DAL no longer needs it. 
                 */
                if (adev->asic_type == CHIP_NAVI12)
                        goto parse_soc_bounding_box;

                adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
                adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
                adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
                adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
                adev->gfx.config.max_texture_channel_caches =
                        le32_to_cpu(gpu_info_fw->gc_num_tccs);
                adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
                adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
                adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
                adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
                adev->gfx.config.double_offchip_lds_buf =
                        le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
                adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
                adev->gfx.cu_info.max_waves_per_simd =
                        le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
                adev->gfx.cu_info.max_scratch_slots_per_cu =
                        le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
                adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
                if (hdr->version_minor >= 1) {
                        const struct gpu_info_firmware_v1_1 *gpu_info_fw =
                                (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
                                        le32_to_cpu(hdr->header.ucode_array_offset_bytes));
                        adev->gfx.config.num_sc_per_sh =
                                le32_to_cpu(gpu_info_fw->num_sc_per_sh);
                        adev->gfx.config.num_packer_per_sc =
                                le32_to_cpu(gpu_info_fw->num_packer_per_sc);
                }

parse_soc_bounding_box:
                /*
                 * The soc bounding box info is not integrated in the discovery
                 * table, so we always parse it from the gpu info firmware when
                 * it is needed.
                 */
                if (hdr->version_minor == 2) {
                        const struct gpu_info_firmware_v1_2 *gpu_info_fw =
                                (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
                                        le32_to_cpu(hdr->header.ucode_array_offset_bytes));
                        adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
                }
                break;
        }
        default:
                dev_err(adev->dev,
                        "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
                err = -EINVAL;
                goto out;
        }
out:
        return err;
}

/**
 * amdgpu_device_ip_early_init - run early init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Early initialization pass for hardware IPs. The hardware IPs that make
 * up each asic are discovered and each IP's early_init callback is run. This
 * is the first stage in initializing the asic.
 * Returns 0 on success, negative error code on failure.
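 *
 * Individual IPs can be masked off for debugging via the
 * amdgpu.ip_block_mask module parameter: an IP whose bit is cleared in the
 * mask (for example bit 1 for the second block in the list) is marked
 * invalid here and skipped by all later init stages.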
2569 */ 2570 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2571 { 2572 struct amdgpu_ip_block *ip_block; 2573 struct pci_dev *parent; 2574 int i, r; 2575 bool total; 2576 2577 amdgpu_device_enable_virtual_display(adev); 2578 2579 if (amdgpu_sriov_vf(adev)) { 2580 r = amdgpu_virt_request_full_gpu(adev, true); 2581 if (r) 2582 return r; 2583 } 2584 2585 switch (adev->asic_type) { 2586 #ifdef CONFIG_DRM_AMDGPU_SI 2587 case CHIP_VERDE: 2588 case CHIP_TAHITI: 2589 case CHIP_PITCAIRN: 2590 case CHIP_OLAND: 2591 case CHIP_HAINAN: 2592 adev->family = AMDGPU_FAMILY_SI; 2593 r = si_set_ip_blocks(adev); 2594 if (r) 2595 return r; 2596 break; 2597 #endif 2598 #ifdef CONFIG_DRM_AMDGPU_CIK 2599 case CHIP_BONAIRE: 2600 case CHIP_HAWAII: 2601 case CHIP_KAVERI: 2602 case CHIP_KABINI: 2603 case CHIP_MULLINS: 2604 if (adev->flags & AMD_IS_APU) 2605 adev->family = AMDGPU_FAMILY_KV; 2606 else 2607 adev->family = AMDGPU_FAMILY_CI; 2608 2609 r = cik_set_ip_blocks(adev); 2610 if (r) 2611 return r; 2612 break; 2613 #endif 2614 case CHIP_TOPAZ: 2615 case CHIP_TONGA: 2616 case CHIP_FIJI: 2617 case CHIP_POLARIS10: 2618 case CHIP_POLARIS11: 2619 case CHIP_POLARIS12: 2620 case CHIP_VEGAM: 2621 case CHIP_CARRIZO: 2622 case CHIP_STONEY: 2623 if (adev->flags & AMD_IS_APU) 2624 adev->family = AMDGPU_FAMILY_CZ; 2625 else 2626 adev->family = AMDGPU_FAMILY_VI; 2627 2628 r = vi_set_ip_blocks(adev); 2629 if (r) 2630 return r; 2631 break; 2632 default: 2633 r = amdgpu_discovery_set_ip_blocks(adev); 2634 if (r) 2635 return r; 2636 break; 2637 } 2638 2639 if (amdgpu_has_atpx() && 2640 (amdgpu_is_atpx_hybrid() || 2641 amdgpu_has_atpx_dgpu_power_cntl()) && 2642 ((adev->flags & AMD_IS_APU) == 0) && 2643 !dev_is_removable(&adev->pdev->dev)) 2644 adev->flags |= AMD_IS_PX; 2645 2646 if (!(adev->flags & AMD_IS_APU)) { 2647 parent = pcie_find_root_port(adev->pdev); 2648 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2649 } 2650 2651 2652 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2653 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2654 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2655 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2656 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2657 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2658 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2659 2660 total = true; 2661 for (i = 0; i < adev->num_ip_blocks; i++) { 2662 ip_block = &adev->ip_blocks[i]; 2663 2664 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2665 DRM_WARN("disabled ip block: %d <%s>\n", 2666 i, adev->ip_blocks[i].version->funcs->name); 2667 adev->ip_blocks[i].status.valid = false; 2668 } else if (ip_block->version->funcs->early_init) { 2669 r = ip_block->version->funcs->early_init(ip_block); 2670 if (r == -ENOENT) { 2671 adev->ip_blocks[i].status.valid = false; 2672 } else if (r) { 2673 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2674 adev->ip_blocks[i].version->funcs->name, r); 2675 total = false; 2676 } else { 2677 adev->ip_blocks[i].status.valid = true; 2678 } 2679 } else { 2680 adev->ip_blocks[i].status.valid = true; 2681 } 2682 /* get the vbios after the asic_funcs are set up */ 2683 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2684 r = amdgpu_device_parse_gpu_info_fw(adev); 2685 if (r) 2686 return r; 2687 2688 /* Read BIOS */ 2689 if (amdgpu_device_read_bios(adev)) { 2690 if (!amdgpu_get_bios(adev)) 2691 return -EINVAL; 2692 2693 r = amdgpu_atombios_init(adev); 2694 if (r) { 2695 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2696 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2697 return r; 2698 } 2699 } 2700 2701 /*get pf2vf msg info at it's earliest time*/ 2702 if (amdgpu_sriov_vf(adev)) 2703 amdgpu_virt_init_data_exchange(adev); 2704 2705 } 2706 } 2707 if (!total) 2708 return -ENODEV; 2709 2710 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 2711 if (ip_block->status.valid != false) 2712 amdgpu_amdkfd_device_probe(adev); 2713 2714 adev->cg_flags &= amdgpu_cg_mask; 2715 adev->pg_flags &= amdgpu_pg_mask; 2716 2717 return 0; 2718 } 2719 2720 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2721 { 2722 int i, r; 2723 2724 for (i = 0; i < adev->num_ip_blocks; i++) { 2725 if (!adev->ip_blocks[i].status.sw) 2726 continue; 2727 if (adev->ip_blocks[i].status.hw) 2728 continue; 2729 if (!amdgpu_ip_member_of_hwini( 2730 adev, adev->ip_blocks[i].version->type)) 2731 continue; 2732 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2733 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2734 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2735 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2736 if (r) { 2737 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2738 adev->ip_blocks[i].version->funcs->name, r); 2739 return r; 2740 } 2741 adev->ip_blocks[i].status.hw = true; 2742 } 2743 } 2744 2745 return 0; 2746 } 2747 2748 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2749 { 2750 int i, r; 2751 2752 for (i = 0; i < adev->num_ip_blocks; i++) { 2753 if (!adev->ip_blocks[i].status.sw) 2754 continue; 2755 if (adev->ip_blocks[i].status.hw) 2756 continue; 2757 if (!amdgpu_ip_member_of_hwini( 2758 adev, adev->ip_blocks[i].version->type)) 2759 continue; 2760 r = 
adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2761 if (r) { 2762 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2763 adev->ip_blocks[i].version->funcs->name, r); 2764 return r; 2765 } 2766 adev->ip_blocks[i].status.hw = true; 2767 } 2768 2769 return 0; 2770 } 2771 2772 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2773 { 2774 int r = 0; 2775 int i; 2776 uint32_t smu_version; 2777 2778 if (adev->asic_type >= CHIP_VEGA10) { 2779 for (i = 0; i < adev->num_ip_blocks; i++) { 2780 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2781 continue; 2782 2783 if (!amdgpu_ip_member_of_hwini(adev, 2784 AMD_IP_BLOCK_TYPE_PSP)) 2785 break; 2786 2787 if (!adev->ip_blocks[i].status.sw) 2788 continue; 2789 2790 /* no need to do the fw loading again if already done*/ 2791 if (adev->ip_blocks[i].status.hw == true) 2792 break; 2793 2794 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2795 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 2796 if (r) 2797 return r; 2798 } else { 2799 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2800 if (r) { 2801 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2802 adev->ip_blocks[i].version->funcs->name, r); 2803 return r; 2804 } 2805 adev->ip_blocks[i].status.hw = true; 2806 } 2807 break; 2808 } 2809 } 2810 2811 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2812 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2813 2814 return r; 2815 } 2816 2817 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2818 { 2819 long timeout; 2820 int r, i; 2821 2822 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2823 struct amdgpu_ring *ring = adev->rings[i]; 2824 2825 /* No need to setup the GPU scheduler for rings that don't need it */ 2826 if (!ring || ring->no_scheduler) 2827 continue; 2828 2829 switch (ring->funcs->type) { 2830 case AMDGPU_RING_TYPE_GFX: 2831 timeout = adev->gfx_timeout; 2832 break; 2833 case AMDGPU_RING_TYPE_COMPUTE: 2834 timeout = adev->compute_timeout; 2835 break; 2836 case AMDGPU_RING_TYPE_SDMA: 2837 timeout = adev->sdma_timeout; 2838 break; 2839 default: 2840 timeout = adev->video_timeout; 2841 break; 2842 } 2843 2844 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL, 2845 DRM_SCHED_PRIORITY_COUNT, 2846 ring->num_hw_submission, 0, 2847 timeout, adev->reset_domain->wq, 2848 ring->sched_score, ring->name, 2849 adev->dev); 2850 if (r) { 2851 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2852 ring->name); 2853 return r; 2854 } 2855 r = amdgpu_uvd_entity_init(adev, ring); 2856 if (r) { 2857 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n", 2858 ring->name); 2859 return r; 2860 } 2861 r = amdgpu_vce_entity_init(adev, ring); 2862 if (r) { 2863 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n", 2864 ring->name); 2865 return r; 2866 } 2867 } 2868 2869 amdgpu_xcp_update_partition_sched_list(adev); 2870 2871 return 0; 2872 } 2873 2874 2875 /** 2876 * amdgpu_device_ip_init - run init for hardware IPs 2877 * 2878 * @adev: amdgpu_device pointer 2879 * 2880 * Main initialization pass for hardware IPs. The list of all the hardware 2881 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2882 * are run. sw_init initializes the software state associated with each IP 2883 * and hw_init initializes the hardware associated with each IP. 2884 * Returns 0 on success, negative error code on failure. 
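 *
 * Rough ordering sketch of this function (see the body below): sw_init for
 * every valid IP, with COMMON and GMC hw_init pulled forward so GPU memory
 * is usable, then IB pool and ucode BO setup, hw_init phase 1 (COMMON/IH,
 * plus PSP for SR-IOV), firmware loading, hw_init phase 2 for the remaining
 * IPs, RAS recovery init and finally the ring schedulers.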
2885 */ 2886 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2887 { 2888 bool init_badpage; 2889 int i, r; 2890 2891 r = amdgpu_ras_init(adev); 2892 if (r) 2893 return r; 2894 2895 for (i = 0; i < adev->num_ip_blocks; i++) { 2896 if (!adev->ip_blocks[i].status.valid) 2897 continue; 2898 if (adev->ip_blocks[i].version->funcs->sw_init) { 2899 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]); 2900 if (r) { 2901 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2902 adev->ip_blocks[i].version->funcs->name, r); 2903 goto init_failed; 2904 } 2905 } 2906 adev->ip_blocks[i].status.sw = true; 2907 2908 if (!amdgpu_ip_member_of_hwini( 2909 adev, adev->ip_blocks[i].version->type)) 2910 continue; 2911 2912 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2913 /* need to do common hw init early so everything is set up for gmc */ 2914 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2915 if (r) { 2916 DRM_ERROR("hw_init %d failed %d\n", i, r); 2917 goto init_failed; 2918 } 2919 adev->ip_blocks[i].status.hw = true; 2920 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2921 /* need to do gmc hw init early so we can allocate gpu mem */ 2922 /* Try to reserve bad pages early */ 2923 if (amdgpu_sriov_vf(adev)) 2924 amdgpu_virt_exchange_data(adev); 2925 2926 r = amdgpu_device_mem_scratch_init(adev); 2927 if (r) { 2928 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2929 goto init_failed; 2930 } 2931 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2932 if (r) { 2933 DRM_ERROR("hw_init %d failed %d\n", i, r); 2934 goto init_failed; 2935 } 2936 r = amdgpu_device_wb_init(adev); 2937 if (r) { 2938 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2939 goto init_failed; 2940 } 2941 adev->ip_blocks[i].status.hw = true; 2942 2943 /* right after GMC hw init, we create CSA */ 2944 if (adev->gfx.mcbp) { 2945 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2946 AMDGPU_GEM_DOMAIN_VRAM | 2947 AMDGPU_GEM_DOMAIN_GTT, 2948 AMDGPU_CSA_SIZE); 2949 if (r) { 2950 DRM_ERROR("allocate CSA failed %d\n", r); 2951 goto init_failed; 2952 } 2953 } 2954 2955 r = amdgpu_seq64_init(adev); 2956 if (r) { 2957 DRM_ERROR("allocate seq64 failed %d\n", r); 2958 goto init_failed; 2959 } 2960 } 2961 } 2962 2963 if (amdgpu_sriov_vf(adev)) 2964 amdgpu_virt_init_data_exchange(adev); 2965 2966 r = amdgpu_ib_pool_init(adev); 2967 if (r) { 2968 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2969 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2970 goto init_failed; 2971 } 2972 2973 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2974 if (r) 2975 goto init_failed; 2976 2977 r = amdgpu_device_ip_hw_init_phase1(adev); 2978 if (r) 2979 goto init_failed; 2980 2981 r = amdgpu_device_fw_loading(adev); 2982 if (r) 2983 goto init_failed; 2984 2985 r = amdgpu_device_ip_hw_init_phase2(adev); 2986 if (r) 2987 goto init_failed; 2988 2989 /* 2990 * retired pages will be loaded from eeprom and reserved here, 2991 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2992 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2993 * for I2C communication which only true at this point. 2994 * 2995 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2996 * failure from bad gpu situation and stop amdgpu init process 2997 * accordingly. 
For other failed cases, it will still release all 2998 * the resource and print error message, rather than returning one 2999 * negative value to upper level. 3000 * 3001 * Note: theoretically, this should be called before all vram allocations 3002 * to protect retired page from abusing 3003 */ 3004 init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 3005 r = amdgpu_ras_recovery_init(adev, init_badpage); 3006 if (r) 3007 goto init_failed; 3008 3009 /** 3010 * In case of XGMI grab extra reference for reset domain for this device 3011 */ 3012 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3013 if (amdgpu_xgmi_add_device(adev) == 0) { 3014 if (!amdgpu_sriov_vf(adev)) { 3015 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3016 3017 if (WARN_ON(!hive)) { 3018 r = -ENOENT; 3019 goto init_failed; 3020 } 3021 3022 if (!hive->reset_domain || 3023 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 3024 r = -ENOENT; 3025 amdgpu_put_xgmi_hive(hive); 3026 goto init_failed; 3027 } 3028 3029 /* Drop the early temporary reset domain we created for device */ 3030 amdgpu_reset_put_reset_domain(adev->reset_domain); 3031 adev->reset_domain = hive->reset_domain; 3032 amdgpu_put_xgmi_hive(hive); 3033 } 3034 } 3035 } 3036 3037 r = amdgpu_device_init_schedulers(adev); 3038 if (r) 3039 goto init_failed; 3040 3041 if (adev->mman.buffer_funcs_ring->sched.ready) 3042 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3043 3044 /* Don't init kfd if whole hive need to be reset during init */ 3045 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 3046 kgd2kfd_init_zone_device(adev); 3047 amdgpu_amdkfd_device_init(adev); 3048 } 3049 3050 amdgpu_fru_get_product_info(adev); 3051 3052 init_failed: 3053 3054 return r; 3055 } 3056 3057 /** 3058 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 3059 * 3060 * @adev: amdgpu_device pointer 3061 * 3062 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 3063 * this function before a GPU reset. If the value is retained after a 3064 * GPU reset, VRAM has not been lost. Some GPU resets may destry VRAM contents. 3065 */ 3066 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 3067 { 3068 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 3069 } 3070 3071 /** 3072 * amdgpu_device_check_vram_lost - check if vram is valid 3073 * 3074 * @adev: amdgpu_device pointer 3075 * 3076 * Checks the reset magic value written to the gart pointer in VRAM. 3077 * The driver calls this after a GPU reset to see if the contents of 3078 * VRAM is lost or now. 3079 * returns true if vram is lost, false if not. 3080 */ 3081 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 3082 { 3083 if (memcmp(adev->gart.ptr, adev->reset_magic, 3084 AMDGPU_RESET_MAGIC_NUM)) 3085 return true; 3086 3087 if (!amdgpu_in_reset(adev)) 3088 return false; 3089 3090 /* 3091 * For all ASICs with baco/mode1 reset, the VRAM is 3092 * always assumed to be lost. 3093 */ 3094 switch (amdgpu_asic_reset_method(adev)) { 3095 case AMD_RESET_METHOD_BACO: 3096 case AMD_RESET_METHOD_MODE1: 3097 return true; 3098 default: 3099 return false; 3100 } 3101 } 3102 3103 /** 3104 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 3105 * 3106 * @adev: amdgpu_device pointer 3107 * @state: clockgating state (gate or ungate) 3108 * 3109 * The list of all the hardware IPs that make up the asic is walked and the 3110 * set_clockgating_state callbacks are run. 
3111 * Late initialization pass enabling clockgating for hardware IPs. 3112 * Fini or suspend, pass disabling clockgating for hardware IPs. 3113 * Returns 0 on success, negative error code on failure. 3114 */ 3115 3116 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 3117 enum amd_clockgating_state state) 3118 { 3119 int i, j, r; 3120 3121 if (amdgpu_emu_mode == 1) 3122 return 0; 3123 3124 for (j = 0; j < adev->num_ip_blocks; j++) { 3125 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3126 if (!adev->ip_blocks[i].status.late_initialized) 3127 continue; 3128 /* skip CG for GFX, SDMA on S0ix */ 3129 if (adev->in_s0ix && 3130 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3131 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3132 continue; 3133 /* skip CG for VCE/UVD, it's handled specially */ 3134 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3135 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3136 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3137 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3138 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 3139 /* enable clockgating to save power */ 3140 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 3141 state); 3142 if (r) { 3143 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 3144 adev->ip_blocks[i].version->funcs->name, r); 3145 return r; 3146 } 3147 } 3148 } 3149 3150 return 0; 3151 } 3152 3153 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 3154 enum amd_powergating_state state) 3155 { 3156 int i, j, r; 3157 3158 if (amdgpu_emu_mode == 1) 3159 return 0; 3160 3161 for (j = 0; j < adev->num_ip_blocks; j++) { 3162 i = state == AMD_PG_STATE_GATE ? 
                        j : adev->num_ip_blocks - j - 1;
                if (!adev->ip_blocks[i].status.late_initialized)
                        continue;
                /* skip PG for GFX, SDMA on S0ix */
                if (adev->in_s0ix &&
                    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
                        continue;
                /* skip PG for VCE/UVD, it's handled specially */
                if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
                    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
                    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
                    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
                    adev->ip_blocks[i].version->funcs->set_powergating_state) {
                        /* enable powergating to save power */
                        r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
                                                                                     state);
                        if (r) {
                                DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
                                          adev->ip_blocks[i].version->funcs->name, r);
                                return r;
                        }
                }
        }
        return 0;
}

static int amdgpu_device_enable_mgpu_fan_boost(void)
{
        struct amdgpu_gpu_instance *gpu_ins;
        struct amdgpu_device *adev;
        int i, ret = 0;

        mutex_lock(&mgpu_info.mutex);

        /*
         * The MGPU fan boost feature should be enabled
         * only when there are two or more dGPUs in
         * the system.
         */
        if (mgpu_info.num_dgpu < 2)
                goto out;

        for (i = 0; i < mgpu_info.num_dgpu; i++) {
                gpu_ins = &(mgpu_info.gpu_ins[i]);
                adev = gpu_ins->adev;
                if (!(adev->flags & AMD_IS_APU) &&
                    !gpu_ins->mgpu_fan_enabled) {
                        ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
                        if (ret)
                                break;

                        gpu_ins->mgpu_fan_enabled = 1;
                }
        }

out:
        mutex_unlock(&mgpu_info.mutex);

        return ret;
}

/**
 * amdgpu_device_ip_late_init - run late init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Late initialization pass for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the late_init callbacks are run.
 * late_init covers any special initialization that an IP requires
 * after all of them have been initialized or something that needs to happen
 * late in the init process.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
{
        struct amdgpu_gpu_instance *gpu_instance;
        int i = 0, r;

        for (i = 0; i < adev->num_ip_blocks; i++) {
                if (!adev->ip_blocks[i].status.hw)
                        continue;
                if (adev->ip_blocks[i].version->funcs->late_init) {
                        r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]);
                        if (r) {
                                DRM_ERROR("late_init of IP block <%s> failed %d\n",
                                          adev->ip_blocks[i].version->funcs->name, r);
                                return r;
                        }
                }
                adev->ip_blocks[i].status.late_initialized = true;
        }

        r = amdgpu_ras_late_init(adev);
        if (r) {
                DRM_ERROR("amdgpu_ras_late_init failed %d", r);
                return r;
        }

        if (!amdgpu_reset_in_recovery(adev))
                amdgpu_ras_set_error_query_ready(adev, true);

        amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
        amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);

        amdgpu_device_fill_reset_magic(adev);

        r = amdgpu_device_enable_mgpu_fan_boost();
        if (r)
                DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);

        /* For passthrough configurations on arcturus and aldebaran, enable special SBR handling */
        if (amdgpu_passthrough(adev) &&
            ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
             adev->asic_type == CHIP_ALDEBARAN))
                amdgpu_dpm_handle_passthrough_sbr(adev, true);

        if (adev->gmc.xgmi.num_physical_nodes > 1) {
                mutex_lock(&mgpu_info.mutex);

                /*
                 * Reset the device p-state to low, as it was booted high.
                 *
                 * This should be performed only after all devices from the
                 * same hive have been initialized.
                 *
                 * However, the number of devices in a hive is not known in
                 * advance; it is counted one by one as the devices initialize.
                 *
                 * So we wait until all XGMI interlinked devices are
                 * initialized. This may add some delay, as those devices may
                 * come from different hives. But that should be OK.
3294 */ 3295 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3296 for (i = 0; i < mgpu_info.num_gpu; i++) { 3297 gpu_instance = &(mgpu_info.gpu_ins[i]); 3298 if (gpu_instance->adev->flags & AMD_IS_APU) 3299 continue; 3300 3301 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3302 AMDGPU_XGMI_PSTATE_MIN); 3303 if (r) { 3304 DRM_ERROR("pstate setting failed (%d).\n", r); 3305 break; 3306 } 3307 } 3308 } 3309 3310 mutex_unlock(&mgpu_info.mutex); 3311 } 3312 3313 return 0; 3314 } 3315 3316 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block) 3317 { 3318 int r; 3319 3320 if (!ip_block->version->funcs->hw_fini) { 3321 DRM_ERROR("hw_fini of IP block <%s> not defined\n", 3322 ip_block->version->funcs->name); 3323 } else { 3324 r = ip_block->version->funcs->hw_fini(ip_block); 3325 /* XXX handle errors */ 3326 if (r) { 3327 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3328 ip_block->version->funcs->name, r); 3329 } 3330 } 3331 3332 ip_block->status.hw = false; 3333 } 3334 3335 /** 3336 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3337 * 3338 * @adev: amdgpu_device pointer 3339 * 3340 * For ASICs need to disable SMC first 3341 */ 3342 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3343 { 3344 int i; 3345 3346 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3347 return; 3348 3349 for (i = 0; i < adev->num_ip_blocks; i++) { 3350 if (!adev->ip_blocks[i].status.hw) 3351 continue; 3352 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3353 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3354 break; 3355 } 3356 } 3357 } 3358 3359 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3360 { 3361 int i, r; 3362 3363 for (i = 0; i < adev->num_ip_blocks; i++) { 3364 if (!adev->ip_blocks[i].version->funcs->early_fini) 3365 continue; 3366 3367 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]); 3368 if (r) { 3369 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 3370 adev->ip_blocks[i].version->funcs->name, r); 3371 } 3372 } 3373 3374 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3375 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3376 3377 amdgpu_amdkfd_suspend(adev, false); 3378 3379 /* Workaroud for ASICs need to disable SMC first */ 3380 amdgpu_device_smu_fini_early(adev); 3381 3382 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3383 if (!adev->ip_blocks[i].status.hw) 3384 continue; 3385 3386 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3387 } 3388 3389 if (amdgpu_sriov_vf(adev)) { 3390 if (amdgpu_virt_release_full_gpu(adev, false)) 3391 DRM_ERROR("failed to release exclusive mode on fini\n"); 3392 } 3393 3394 return 0; 3395 } 3396 3397 /** 3398 * amdgpu_device_ip_fini - run fini for hardware IPs 3399 * 3400 * @adev: amdgpu_device pointer 3401 * 3402 * Main teardown pass for hardware IPs. The list of all the hardware 3403 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3404 * are run. hw_fini tears down the hardware associated with each IP 3405 * and sw_fini tears down any software state associated with each IP. 3406 * Returns 0 on success, negative error code on failure. 
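 *
 * Teardown runs in the reverse order of initialization; for example the GMC
 * block's sw_fini pass is where the shared resources created during init
 * (ucode BO, static CSA, writeback, scratch memory, IB pool, seq64) are
 * released.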
3407 */ 3408 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3409 { 3410 int i, r; 3411 3412 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3413 amdgpu_virt_release_ras_err_handler_data(adev); 3414 3415 if (adev->gmc.xgmi.num_physical_nodes > 1) 3416 amdgpu_xgmi_remove_device(adev); 3417 3418 amdgpu_amdkfd_device_fini_sw(adev); 3419 3420 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3421 if (!adev->ip_blocks[i].status.sw) 3422 continue; 3423 3424 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3425 amdgpu_ucode_free_bo(adev); 3426 amdgpu_free_static_csa(&adev->virt.csa_obj); 3427 amdgpu_device_wb_fini(adev); 3428 amdgpu_device_mem_scratch_fini(adev); 3429 amdgpu_ib_pool_fini(adev); 3430 amdgpu_seq64_fini(adev); 3431 } 3432 if (adev->ip_blocks[i].version->funcs->sw_fini) { 3433 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]); 3434 /* XXX handle errors */ 3435 if (r) { 3436 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 3437 adev->ip_blocks[i].version->funcs->name, r); 3438 } 3439 } 3440 adev->ip_blocks[i].status.sw = false; 3441 adev->ip_blocks[i].status.valid = false; 3442 } 3443 3444 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3445 if (!adev->ip_blocks[i].status.late_initialized) 3446 continue; 3447 if (adev->ip_blocks[i].version->funcs->late_fini) 3448 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]); 3449 adev->ip_blocks[i].status.late_initialized = false; 3450 } 3451 3452 amdgpu_ras_fini(adev); 3453 3454 return 0; 3455 } 3456 3457 /** 3458 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3459 * 3460 * @work: work_struct. 3461 */ 3462 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3463 { 3464 struct amdgpu_device *adev = 3465 container_of(work, struct amdgpu_device, delayed_init_work.work); 3466 int r; 3467 3468 r = amdgpu_ib_ring_tests(adev); 3469 if (r) 3470 DRM_ERROR("ib ring test failed (%d).\n", r); 3471 } 3472 3473 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3474 { 3475 struct amdgpu_device *adev = 3476 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3477 3478 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3479 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3480 3481 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 3482 adev->gfx.gfx_off_state = true; 3483 } 3484 3485 /** 3486 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3487 * 3488 * @adev: amdgpu_device pointer 3489 * 3490 * Main suspend function for hardware IPs. The list of all the hardware 3491 * IPs that make up the asic is walked, clockgating is disabled and the 3492 * suspend callbacks are run. suspend puts the hardware and software state 3493 * in each IP into a state suitable for suspend. 3494 * Returns 0 on success, negative error code on failure. 3495 */ 3496 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3497 { 3498 int i, r; 3499 3500 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3501 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3502 3503 /* 3504 * Per PMFW team's suggestion, driver needs to handle gfxoff 3505 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3506 * scenario. Add the missing df cstate disablement here. 
         */
        if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
                dev_warn(adev->dev, "Failed to disallow df cstate");

        for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
                if (!adev->ip_blocks[i].status.valid)
                        continue;

                /* displays are handled separately */
                if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
                        continue;

                /* XXX handle errors */
                r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
                if (r)
                        return r;
        }

        return 0;
}

/**
 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
 *
 * @adev: amdgpu_device pointer
 *
 * Main suspend function for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked, clockgating is disabled and the
 * suspend callbacks are run. suspend puts the hardware and software state
 * in each IP into a state suitable for suspend.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
{
        int i, r;

        if (adev->in_s0ix)
                amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);

        for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
                if (!adev->ip_blocks[i].status.valid)
                        continue;
                /* displays are handled in phase1 */
                if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
                        continue;
                /* PSP lost connection when err_event_athub occurs */
                if (amdgpu_ras_intr_triggered() &&
                    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
                        adev->ip_blocks[i].status.hw = false;
                        continue;
                }

                /* skip unnecessary suspend if we have not initialized them yet */
                if (!amdgpu_ip_member_of_hwini(
                            adev, adev->ip_blocks[i].version->type))
                        continue;

                /* skip suspend of gfx/mes and psp for S0ix
                 * gfx is in gfxoff state, so on resume it will exit gfxoff just
                 * like at runtime. PSP is also part of the always on hardware
                 * so no need to suspend it.
                 */
                if (adev->in_s0ix &&
                    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES))
                        continue;

                /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */
                if (adev->in_s0ix &&
                    (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >=
                     IP_VERSION(5, 0, 0)) &&
                    (adev->ip_blocks[i].version->type ==
                     AMD_IP_BLOCK_TYPE_SDMA))
                        continue;

                /* During cold boot, swPSP provides the IMU and RLC FW binaries to the TOS.
                 * These live in the TMR and are expected to be reused by PSP-TOS to reload
                 * from that location, and RLC autoload is likewise reloaded from there based
                 * on the PMFW -> PSP message during the re-init sequence.
                 * Therefore, PSP suspend & resume should be skipped to avoid destroying
                 * the TMR and reloading the FWs again for IMU-enabled APU ASICs.
3589 */ 3590 if (amdgpu_in_reset(adev) && 3591 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3592 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3593 continue; 3594 3595 /* XXX handle errors */ 3596 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3597 adev->ip_blocks[i].status.hw = false; 3598 3599 /* handle putting the SMC in the appropriate state */ 3600 if (!amdgpu_sriov_vf(adev)) { 3601 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3602 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3603 if (r) { 3604 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3605 adev->mp1_state, r); 3606 return r; 3607 } 3608 } 3609 } 3610 } 3611 3612 return 0; 3613 } 3614 3615 /** 3616 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3617 * 3618 * @adev: amdgpu_device pointer 3619 * 3620 * Main suspend function for hardware IPs. The list of all the hardware 3621 * IPs that make up the asic is walked, clockgating is disabled and the 3622 * suspend callbacks are run. suspend puts the hardware and software state 3623 * in each IP into a state suitable for suspend. 3624 * Returns 0 on success, negative error code on failure. 3625 */ 3626 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3627 { 3628 int r; 3629 3630 if (amdgpu_sriov_vf(adev)) { 3631 amdgpu_virt_fini_data_exchange(adev); 3632 amdgpu_virt_request_full_gpu(adev, false); 3633 } 3634 3635 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3636 3637 r = amdgpu_device_ip_suspend_phase1(adev); 3638 if (r) 3639 return r; 3640 r = amdgpu_device_ip_suspend_phase2(adev); 3641 3642 if (amdgpu_sriov_vf(adev)) 3643 amdgpu_virt_release_full_gpu(adev, false); 3644 3645 return r; 3646 } 3647 3648 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3649 { 3650 int i, r; 3651 3652 static enum amd_ip_block_type ip_order[] = { 3653 AMD_IP_BLOCK_TYPE_COMMON, 3654 AMD_IP_BLOCK_TYPE_GMC, 3655 AMD_IP_BLOCK_TYPE_PSP, 3656 AMD_IP_BLOCK_TYPE_IH, 3657 }; 3658 3659 for (i = 0; i < adev->num_ip_blocks; i++) { 3660 int j; 3661 struct amdgpu_ip_block *block; 3662 3663 block = &adev->ip_blocks[i]; 3664 block->status.hw = false; 3665 3666 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3667 3668 if (block->version->type != ip_order[j] || 3669 !block->status.valid) 3670 continue; 3671 3672 r = block->version->funcs->hw_init(&adev->ip_blocks[i]); 3673 if (r) { 3674 dev_err(adev->dev, "RE-INIT-early: %s failed\n", 3675 block->version->funcs->name); 3676 return r; 3677 } 3678 block->status.hw = true; 3679 } 3680 } 3681 3682 return 0; 3683 } 3684 3685 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3686 { 3687 struct amdgpu_ip_block *block; 3688 int i, r = 0; 3689 3690 static enum amd_ip_block_type ip_order[] = { 3691 AMD_IP_BLOCK_TYPE_SMC, 3692 AMD_IP_BLOCK_TYPE_DCE, 3693 AMD_IP_BLOCK_TYPE_GFX, 3694 AMD_IP_BLOCK_TYPE_SDMA, 3695 AMD_IP_BLOCK_TYPE_MES, 3696 AMD_IP_BLOCK_TYPE_UVD, 3697 AMD_IP_BLOCK_TYPE_VCE, 3698 AMD_IP_BLOCK_TYPE_VCN, 3699 AMD_IP_BLOCK_TYPE_JPEG 3700 }; 3701 3702 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3703 block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]); 3704 3705 if (!block) 3706 continue; 3707 3708 if (block->status.valid && !block->status.hw) { 3709 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) { 3710 r = amdgpu_ip_block_resume(block); 3711 } else { 3712 r = block->version->funcs->hw_init(block); 3713 } 3714 3715 if (r) { 3716 dev_err(adev->dev, "RE-INIT-late: %s failed\n", 3717 block->version->funcs->name); 3718 break; 3719 } 3720 
block->status.hw = true; 3721 } 3722 } 3723 3724 return r; 3725 } 3726 3727 /** 3728 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3729 * 3730 * @adev: amdgpu_device pointer 3731 * 3732 * First resume function for hardware IPs. The list of all the hardware 3733 * IPs that make up the asic is walked and the resume callbacks are run for 3734 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3735 * after a suspend and updates the software state as necessary. This 3736 * function is also used for restoring the GPU after a GPU reset. 3737 * Returns 0 on success, negative error code on failure. 3738 */ 3739 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3740 { 3741 int i, r; 3742 3743 for (i = 0; i < adev->num_ip_blocks; i++) { 3744 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3745 continue; 3746 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3747 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3748 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3749 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3750 3751 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3752 if (r) 3753 return r; 3754 } 3755 } 3756 3757 return 0; 3758 } 3759 3760 /** 3761 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3762 * 3763 * @adev: amdgpu_device pointer 3764 * 3765 * Second resume function for hardware IPs. The list of all the hardware 3766 * IPs that make up the asic is walked and the resume callbacks are run for 3767 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3768 * functional state after a suspend and updates the software state as 3769 * necessary. This function is also used for restoring the GPU after a GPU 3770 * reset. 3771 * Returns 0 on success, negative error code on failure. 3772 */ 3773 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3774 { 3775 int i, r; 3776 3777 for (i = 0; i < adev->num_ip_blocks; i++) { 3778 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3779 continue; 3780 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3781 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3782 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3783 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE || 3784 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3785 continue; 3786 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3787 if (r) 3788 return r; 3789 } 3790 3791 return 0; 3792 } 3793 3794 /** 3795 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs 3796 * 3797 * @adev: amdgpu_device pointer 3798 * 3799 * Third resume function for hardware IPs. The list of all the hardware 3800 * IPs that make up the asic is walked and the resume callbacks are run for 3801 * all DCE. resume puts the hardware into a functional state after a suspend 3802 * and updates the software state as necessary. This function is also used 3803 * for restoring the GPU after a GPU reset. 3804 * 3805 * Returns 0 on success, negative error code on failure. 
3806 */
3807 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev)
3808 {
3809 int i, r;
3810
3811 for (i = 0; i < adev->num_ip_blocks; i++) {
3812 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3813 continue;
3814 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) {
3815 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
3816 if (r)
3817 return r;
3818 }
3819 }
3820
3821 return 0;
3822 }
3823
3824 /**
3825 * amdgpu_device_ip_resume - run resume for hardware IPs
3826 *
3827 * @adev: amdgpu_device pointer
3828 *
3829 * Main resume function for hardware IPs. The hardware IPs
3830 * are split into multiple resume phases because they are
3831 * also used in recovering from a GPU reset and some additional
3832 * steps need to be taken between them. In this case (S3/S4) they are
3833 * run sequentially.
3834 * Returns 0 on success, negative error code on failure.
3835 */
3836 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
3837 {
3838 int r;
3839
3840 r = amdgpu_device_ip_resume_phase1(adev);
3841 if (r)
3842 return r;
3843
3844 r = amdgpu_device_fw_loading(adev);
3845 if (r)
3846 return r;
3847
3848 r = amdgpu_device_ip_resume_phase2(adev);
3849
3850 if (adev->mman.buffer_funcs_ring->sched.ready)
3851 amdgpu_ttm_set_buffer_funcs_status(adev, true);
3852
3853 if (r)
3854 return r;
3855
3856 amdgpu_fence_driver_hw_init(adev);
3857
3858 r = amdgpu_device_ip_resume_phase3(adev);
3859
3860 return r;
3861 }
3862
3863 /**
3864 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3865 *
3866 * @adev: amdgpu_device pointer
3867 *
3868 * Query the VBIOS data tables to determine if the board supports SR-IOV.
3869 */
3870 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
3871 {
3872 if (amdgpu_sriov_vf(adev)) {
3873 if (adev->is_atom_fw) {
3874 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
3875 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3876 } else {
3877 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3878 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3879 }
3880
3881 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3882 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
3883 }
3884 }
3885
3886 /**
3887 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3888 *
3889 * @asic_type: AMD asic type
3890 *
3891 * Check if there is DC (new modesetting infrastructure) support for an asic.
3892 * Returns true if DC has support, false if not.
3893 */
3894 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3895 {
3896 switch (asic_type) {
3897 #ifdef CONFIG_DRM_AMDGPU_SI
3898 case CHIP_HAINAN:
3899 #endif
3900 case CHIP_TOPAZ:
3901 /* chips with no display hardware */
3902 return false;
3903 #if defined(CONFIG_DRM_AMD_DC)
3904 case CHIP_TAHITI:
3905 case CHIP_PITCAIRN:
3906 case CHIP_VERDE:
3907 case CHIP_OLAND:
3908 /*
3909 * We have systems in the wild with these ASICs that require
3910 * LVDS and VGA support which is not supported with DC.
3911 *
3912 * Fallback to the non-DC driver here by default so as not to
3913 * cause regressions.
3914 */
3915 #if defined(CONFIG_DRM_AMD_DC_SI)
3916 return amdgpu_dc > 0;
3917 #else
3918 return false;
3919 #endif
3920 case CHIP_BONAIRE:
3921 case CHIP_KAVERI:
3922 case CHIP_KABINI:
3923 case CHIP_MULLINS:
3924 /*
3925 * We have systems in the wild with these ASICs that require
3926 * VGA support which is not supported with DC.
3927 * 3928 * Fallback to the non-DC driver here by default so as not to 3929 * cause regressions. 3930 */ 3931 return amdgpu_dc > 0; 3932 default: 3933 return amdgpu_dc != 0; 3934 #else 3935 default: 3936 if (amdgpu_dc > 0) 3937 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 3938 return false; 3939 #endif 3940 } 3941 } 3942 3943 /** 3944 * amdgpu_device_has_dc_support - check if dc is supported 3945 * 3946 * @adev: amdgpu_device pointer 3947 * 3948 * Returns true for supported, false for not supported 3949 */ 3950 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3951 { 3952 if (adev->enable_virtual_display || 3953 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3954 return false; 3955 3956 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3957 } 3958 3959 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3960 { 3961 struct amdgpu_device *adev = 3962 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3963 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3964 3965 /* It's a bug to not have a hive within this function */ 3966 if (WARN_ON(!hive)) 3967 return; 3968 3969 /* 3970 * Use task barrier to synchronize all xgmi reset works across the 3971 * hive. task_barrier_enter and task_barrier_exit will block 3972 * until all the threads running the xgmi reset works reach 3973 * those points. task_barrier_full will do both blocks. 3974 */ 3975 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3976 3977 task_barrier_enter(&hive->tb); 3978 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3979 3980 if (adev->asic_reset_res) 3981 goto fail; 3982 3983 task_barrier_exit(&hive->tb); 3984 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3985 3986 if (adev->asic_reset_res) 3987 goto fail; 3988 3989 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 3990 } else { 3991 3992 task_barrier_full(&hive->tb); 3993 adev->asic_reset_res = amdgpu_asic_reset(adev); 3994 } 3995 3996 fail: 3997 if (adev->asic_reset_res) 3998 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3999 adev->asic_reset_res, adev_to_drm(adev)->unique); 4000 amdgpu_put_xgmi_hive(hive); 4001 } 4002 4003 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 4004 { 4005 char *input = amdgpu_lockup_timeout; 4006 char *timeout_setting = NULL; 4007 int index = 0; 4008 long timeout; 4009 int ret = 0; 4010 4011 /* 4012 * By default timeout for non compute jobs is 10000 4013 * and 60000 for compute jobs. 4014 * In SR-IOV or passthrough mode, timeout for compute 4015 * jobs are 60000 by default. 4016 */ 4017 adev->gfx_timeout = msecs_to_jiffies(10000); 4018 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4019 if (amdgpu_sriov_vf(adev)) 4020 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
4021 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 4022 else 4023 adev->compute_timeout = msecs_to_jiffies(60000); 4024 4025 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4026 while ((timeout_setting = strsep(&input, ",")) && 4027 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4028 ret = kstrtol(timeout_setting, 0, &timeout); 4029 if (ret) 4030 return ret; 4031 4032 if (timeout == 0) { 4033 index++; 4034 continue; 4035 } else if (timeout < 0) { 4036 timeout = MAX_SCHEDULE_TIMEOUT; 4037 dev_warn(adev->dev, "lockup timeout disabled"); 4038 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 4039 } else { 4040 timeout = msecs_to_jiffies(timeout); 4041 } 4042 4043 switch (index++) { 4044 case 0: 4045 adev->gfx_timeout = timeout; 4046 break; 4047 case 1: 4048 adev->compute_timeout = timeout; 4049 break; 4050 case 2: 4051 adev->sdma_timeout = timeout; 4052 break; 4053 case 3: 4054 adev->video_timeout = timeout; 4055 break; 4056 default: 4057 break; 4058 } 4059 } 4060 /* 4061 * There is only one value specified and 4062 * it should apply to all non-compute jobs. 4063 */ 4064 if (index == 1) { 4065 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4066 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 4067 adev->compute_timeout = adev->gfx_timeout; 4068 } 4069 } 4070 4071 return ret; 4072 } 4073 4074 /** 4075 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 4076 * 4077 * @adev: amdgpu_device pointer 4078 * 4079 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 4080 */ 4081 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 4082 { 4083 struct iommu_domain *domain; 4084 4085 domain = iommu_get_domain_for_dev(adev->dev); 4086 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 4087 adev->ram_is_direct_mapped = true; 4088 } 4089 4090 #if defined(CONFIG_HSA_AMD_P2P) 4091 /** 4092 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled. 4093 * 4094 * @adev: amdgpu_device pointer 4095 * 4096 * return if IOMMU remapping bar address 4097 */ 4098 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev) 4099 { 4100 struct iommu_domain *domain; 4101 4102 domain = iommu_get_domain_for_dev(adev->dev); 4103 if (domain && (domain->type == IOMMU_DOMAIN_DMA || 4104 domain->type == IOMMU_DOMAIN_DMA_FQ)) 4105 return true; 4106 4107 return false; 4108 } 4109 #endif 4110 4111 static const struct attribute *amdgpu_dev_attributes[] = { 4112 &dev_attr_pcie_replay_count.attr, 4113 NULL 4114 }; 4115 4116 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 4117 { 4118 if (amdgpu_mcbp == 1) 4119 adev->gfx.mcbp = true; 4120 else if (amdgpu_mcbp == 0) 4121 adev->gfx.mcbp = false; 4122 4123 if (amdgpu_sriov_vf(adev)) 4124 adev->gfx.mcbp = true; 4125 4126 if (adev->gfx.mcbp) 4127 DRM_INFO("MCBP is enabled\n"); 4128 } 4129 4130 /** 4131 * amdgpu_device_init - initialize the driver 4132 * 4133 * @adev: amdgpu_device pointer 4134 * @flags: driver flags 4135 * 4136 * Initializes the driver info and hw (all asics). 4137 * Returns 0 for success or an error on failure. 4138 * Called at driver startup. 
4139 */ 4140 int amdgpu_device_init(struct amdgpu_device *adev, 4141 uint32_t flags) 4142 { 4143 struct drm_device *ddev = adev_to_drm(adev); 4144 struct pci_dev *pdev = adev->pdev; 4145 int r, i; 4146 bool px = false; 4147 u32 max_MBps; 4148 int tmp; 4149 4150 adev->shutdown = false; 4151 adev->flags = flags; 4152 4153 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 4154 adev->asic_type = amdgpu_force_asic_type; 4155 else 4156 adev->asic_type = flags & AMD_ASIC_MASK; 4157 4158 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 4159 if (amdgpu_emu_mode == 1) 4160 adev->usec_timeout *= 10; 4161 adev->gmc.gart_size = 512 * 1024 * 1024; 4162 adev->accel_working = false; 4163 adev->num_rings = 0; 4164 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 4165 adev->mman.buffer_funcs = NULL; 4166 adev->mman.buffer_funcs_ring = NULL; 4167 adev->vm_manager.vm_pte_funcs = NULL; 4168 adev->vm_manager.vm_pte_num_scheds = 0; 4169 adev->gmc.gmc_funcs = NULL; 4170 adev->harvest_ip_mask = 0x0; 4171 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 4172 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 4173 4174 adev->smc_rreg = &amdgpu_invalid_rreg; 4175 adev->smc_wreg = &amdgpu_invalid_wreg; 4176 adev->pcie_rreg = &amdgpu_invalid_rreg; 4177 adev->pcie_wreg = &amdgpu_invalid_wreg; 4178 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 4179 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 4180 adev->pciep_rreg = &amdgpu_invalid_rreg; 4181 adev->pciep_wreg = &amdgpu_invalid_wreg; 4182 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 4183 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 4184 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 4185 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 4186 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 4187 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 4188 adev->didt_rreg = &amdgpu_invalid_rreg; 4189 adev->didt_wreg = &amdgpu_invalid_wreg; 4190 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 4191 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 4192 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 4193 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 4194 4195 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 4196 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 4197 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 4198 4199 /* mutex initialization are all done here so we 4200 * can recall function without having locking issues 4201 */ 4202 mutex_init(&adev->firmware.mutex); 4203 mutex_init(&adev->pm.mutex); 4204 mutex_init(&adev->gfx.gpu_clock_mutex); 4205 mutex_init(&adev->srbm_mutex); 4206 mutex_init(&adev->gfx.pipe_reserve_mutex); 4207 mutex_init(&adev->gfx.gfx_off_mutex); 4208 mutex_init(&adev->gfx.partition_mutex); 4209 mutex_init(&adev->grbm_idx_mutex); 4210 mutex_init(&adev->mn_lock); 4211 mutex_init(&adev->virt.vf_errors.lock); 4212 mutex_init(&adev->virt.rlcg_reg_lock); 4213 hash_init(adev->mn_hash); 4214 mutex_init(&adev->psp.mutex); 4215 mutex_init(&adev->notifier_lock); 4216 mutex_init(&adev->pm.stable_pstate_ctx_lock); 4217 mutex_init(&adev->benchmark_mutex); 4218 mutex_init(&adev->gfx.reset_sem_mutex); 4219 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */ 4220 mutex_init(&adev->enforce_isolation_mutex); 4221 mutex_init(&adev->gfx.kfd_sch_mutex); 4222 4223 amdgpu_device_init_apu_flags(adev); 4224 4225 r = amdgpu_device_check_arguments(adev); 4226 if (r) 4227 return r; 4228 4229 
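/* Spinlocks below mostly serialize the indirect (index/data) register
 * access paths; the mm_stats and wb locks protect other small pieces of
 * shared driver state.
 */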
spin_lock_init(&adev->mmio_idx_lock);
4230 spin_lock_init(&adev->smc_idx_lock);
4231 spin_lock_init(&adev->pcie_idx_lock);
4232 spin_lock_init(&adev->uvd_ctx_idx_lock);
4233 spin_lock_init(&adev->didt_idx_lock);
4234 spin_lock_init(&adev->gc_cac_idx_lock);
4235 spin_lock_init(&adev->se_cac_idx_lock);
4236 spin_lock_init(&adev->audio_endpt_idx_lock);
4237 spin_lock_init(&adev->mm_stats.lock);
4238 spin_lock_init(&adev->wb.lock);
4239
4240 INIT_LIST_HEAD(&adev->reset_list);
4241
4242 INIT_LIST_HEAD(&adev->ras_list);
4243
4244 INIT_LIST_HEAD(&adev->pm.od_kobj_list);
4245
4246 INIT_DELAYED_WORK(&adev->delayed_init_work,
4247 amdgpu_device_delayed_init_work_handler);
4248 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
4249 amdgpu_device_delay_enable_gfx_off);
4250 /*
4251 * Initialize the enforce_isolation work structures for each XCP
4252 * partition. This work handler is responsible for enforcing shader
4253 * isolation on AMD GPUs. It counts the number of emitted fences for
4254 * each GFX and compute ring. If there are any fences, it schedules
4255 * the `enforce_isolation_work` to be run after a delay. If there are
4256 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the
4257 * runqueue.
4258 */
4259 for (i = 0; i < MAX_XCP; i++) {
4260 INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work,
4261 amdgpu_gfx_enforce_isolation_handler);
4262 adev->gfx.enforce_isolation[i].adev = adev;
4263 adev->gfx.enforce_isolation[i].xcp_id = i;
4264 }
4265
4266 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
4267
4268 adev->gfx.gfx_off_req_count = 1;
4269 adev->gfx.gfx_off_residency = 0;
4270 adev->gfx.gfx_off_entrycount = 0;
4271 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
4272
4273 atomic_set(&adev->throttling_logging_enabled, 1);
4274 /*
4275 * If throttling continues, logging will be performed every minute
4276 * to avoid log flooding. "-1" is subtracted since the thermal
4277 * throttling interrupt comes every second. Thus, the total logging
4278 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
4279 * for the throttling interrupt) = 60 seconds.
4280 */
4281 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
4282 ratelimit_state_init(&adev->virt.ras_telemetry_rs, 5 * HZ, 1);
4283
4284 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
4285 ratelimit_set_flags(&adev->virt.ras_telemetry_rs, RATELIMIT_MSG_ON_RELEASE);
4286
4287 /* Register mapping */
4288 /* TODO: block userspace mapping of io registers */
4289 if (adev->asic_type >= CHIP_BONAIRE) {
4290 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
4291 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
4292 } else {
4293 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
4294 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
4295 }
4296
4297 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
4298 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
4299
4300 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
4301 if (!adev->rmmio)
4302 return -ENOMEM;
4303
4304 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
4305 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);
4306
4307 /*
4308 * The reset domain needs to be present early, before the XGMI hive is
4309 * discovered (if any) and initialized, so that the reset semaphore and
4310 * in_gpu_reset flag can be used early during init and before calling RREG32.
4311 */
4312 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
4313 if (!adev->reset_domain)
4314 return -ENOMEM;
4315
4316 /* detect hw virtualization here */
4317 amdgpu_detect_virtualization(adev);
4318
4319 amdgpu_device_get_pcie_info(adev);
4320
4321 r = amdgpu_device_get_job_timeout_settings(adev);
4322 if (r) {
4323 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
4324 return r;
4325 }
4326
4327 amdgpu_device_set_mcbp(adev);
4328
4329 /*
4330 * By default, use the default init level where all blocks are expected to
4331 * be initialized. At present a 'swinit' of the blocks is required to be
4332 * completed before the need for a different level is detected.
4333 */
4334 amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT);
4335 /* early init functions */
4336 r = amdgpu_device_ip_early_init(adev);
4337 if (r)
4338 return r;
4339
4340 /* Get rid of things like offb */
4341 r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name);
4342 if (r)
4343 return r;
4344
4345 /* Enable TMZ based on IP_VERSION */
4346 amdgpu_gmc_tmz_set(adev);
4347
4348 if (amdgpu_sriov_vf(adev) &&
4349 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
4350 /* VF MMIO access (except the mailbox range) from the CPU
4351 * will be blocked during SRIOV runtime
4352 */
4353 adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;
4354
4355 amdgpu_gmc_noretry_set(adev);
4356 /* Need to get xgmi info early to decide the reset behavior */
4357 if (adev->gmc.xgmi.supported) {
4358 r = adev->gfxhub.funcs->get_xgmi_info(adev);
4359 if (r)
4360 return r;
4361 }
4362
4363 /* enable PCIE atomic ops */
4364 if (amdgpu_sriov_vf(adev)) {
4365 if (adev->virt.fw_reserve.p_pf2vf)
4366 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
4367 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
4368 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
4369 /* APUs with gfx9 onwards don't rely on PCIe atomics; their
4370 * internal path natively supports atomics, so set have_atomics_support to true.
4371 */
4372 } else if ((adev->flags & AMD_IS_APU) &&
4373 (amdgpu_ip_version(adev, GC_HWIP, 0) >
4374 IP_VERSION(9, 0, 0))) {
4375 adev->have_atomics_support = true;
4376 } else {
4377 adev->have_atomics_support =
4378 !pci_enable_atomic_ops_to_root(adev->pdev,
4379 PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
4380 PCI_EXP_DEVCAP2_ATOMIC_COMP64);
4381 }
4382
4383 if (!adev->have_atomics_support)
4384 dev_info(adev->dev, "PCIE atomic ops are not supported\n");
4385
4386 /* doorbell bar mapping and doorbell index init */
4387 amdgpu_doorbell_init(adev);
4388
4389 if (amdgpu_emu_mode == 1) {
4390 /* post the asic on emulation mode */
4391 emu_soc_asic_init(adev);
4392 goto fence_driver_init;
4393 }
4394
4395 amdgpu_reset_init(adev);
4396
4397 /* detect if we are running with an SRIOV vbios */
4398 if (adev->bios)
4399 amdgpu_device_detect_sriov_bios(adev);
4400
4401 /* check if we need to reset the asic
4402 * E.g., the driver was not cleanly unloaded previously, etc.
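 * For XGMI hives the reset is not performed here per device; the init level
 * is set to AMDGPU_INIT_LEVEL_MINIMAL_XGMI below and the whole hive is reset
 * later via amdgpu_xgmi_reset_on_init().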
4403 */ 4404 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4405 if (adev->gmc.xgmi.num_physical_nodes) { 4406 dev_info(adev->dev, "Pending hive reset.\n"); 4407 amdgpu_set_init_level(adev, 4408 AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 4409 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && 4410 !amdgpu_device_has_display_hardware(adev)) { 4411 r = psp_gpu_reset(adev); 4412 } else { 4413 tmp = amdgpu_reset_method; 4414 /* It should do a default reset when loading or reloading the driver, 4415 * regardless of the module parameter reset_method. 4416 */ 4417 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4418 r = amdgpu_asic_reset(adev); 4419 amdgpu_reset_method = tmp; 4420 } 4421 4422 if (r) { 4423 dev_err(adev->dev, "asic reset on init failed\n"); 4424 goto failed; 4425 } 4426 } 4427 4428 /* Post card if necessary */ 4429 if (amdgpu_device_need_post(adev)) { 4430 if (!adev->bios) { 4431 dev_err(adev->dev, "no vBIOS found\n"); 4432 r = -EINVAL; 4433 goto failed; 4434 } 4435 DRM_INFO("GPU posting now...\n"); 4436 r = amdgpu_device_asic_init(adev); 4437 if (r) { 4438 dev_err(adev->dev, "gpu post error!\n"); 4439 goto failed; 4440 } 4441 } 4442 4443 if (adev->bios) { 4444 if (adev->is_atom_fw) { 4445 /* Initialize clocks */ 4446 r = amdgpu_atomfirmware_get_clock_info(adev); 4447 if (r) { 4448 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4449 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4450 goto failed; 4451 } 4452 } else { 4453 /* Initialize clocks */ 4454 r = amdgpu_atombios_get_clock_info(adev); 4455 if (r) { 4456 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4457 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4458 goto failed; 4459 } 4460 /* init i2c buses */ 4461 if (!amdgpu_device_has_dc_support(adev)) 4462 amdgpu_atombios_i2c_init(adev); 4463 } 4464 } 4465 4466 fence_driver_init: 4467 /* Fence driver */ 4468 r = amdgpu_fence_driver_sw_init(adev); 4469 if (r) { 4470 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4471 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4472 goto failed; 4473 } 4474 4475 /* init the mode config */ 4476 drm_mode_config_init(adev_to_drm(adev)); 4477 4478 r = amdgpu_device_ip_init(adev); 4479 if (r) { 4480 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4481 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4482 goto release_ras_con; 4483 } 4484 4485 amdgpu_fence_driver_hw_init(adev); 4486 4487 dev_info(adev->dev, 4488 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4489 adev->gfx.config.max_shader_engines, 4490 adev->gfx.config.max_sh_per_se, 4491 adev->gfx.config.max_cu_per_sh, 4492 adev->gfx.cu_info.number); 4493 4494 adev->accel_working = true; 4495 4496 amdgpu_vm_check_compute_bug(adev); 4497 4498 /* Initialize the buffer migration limit. */ 4499 if (amdgpu_moverate >= 0) 4500 max_MBps = amdgpu_moverate; 4501 else 4502 max_MBps = 8; /* Allow 8 MB/s. */ 4503 /* Get a log2 for easy divisions. */ 4504 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4505 4506 /* 4507 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4508 * Otherwise the mgpu fan boost feature will be skipped due to the 4509 * gpu instance is counted less. 4510 */ 4511 amdgpu_register_gpu_instance(adev); 4512 4513 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4514 * explicit gating rather than handling it automatically. 
4515 */ 4516 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 4517 r = amdgpu_device_ip_late_init(adev); 4518 if (r) { 4519 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4520 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4521 goto release_ras_con; 4522 } 4523 /* must succeed. */ 4524 amdgpu_ras_resume(adev); 4525 queue_delayed_work(system_wq, &adev->delayed_init_work, 4526 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4527 } 4528 4529 if (amdgpu_sriov_vf(adev)) { 4530 amdgpu_virt_release_full_gpu(adev, true); 4531 flush_delayed_work(&adev->delayed_init_work); 4532 } 4533 4534 /* 4535 * Place those sysfs registering after `late_init`. As some of those 4536 * operations performed in `late_init` might affect the sysfs 4537 * interfaces creating. 4538 */ 4539 r = amdgpu_atombios_sysfs_init(adev); 4540 if (r) 4541 drm_err(&adev->ddev, 4542 "registering atombios sysfs failed (%d).\n", r); 4543 4544 r = amdgpu_pm_sysfs_init(adev); 4545 if (r) 4546 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 4547 4548 r = amdgpu_ucode_sysfs_init(adev); 4549 if (r) { 4550 adev->ucode_sysfs_en = false; 4551 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 4552 } else 4553 adev->ucode_sysfs_en = true; 4554 4555 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 4556 if (r) 4557 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4558 4559 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4560 if (r) 4561 dev_err(adev->dev, 4562 "Could not create amdgpu board attributes\n"); 4563 4564 amdgpu_fru_sysfs_init(adev); 4565 amdgpu_reg_state_sysfs_init(adev); 4566 amdgpu_xcp_cfg_sysfs_init(adev); 4567 4568 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4569 r = amdgpu_pmu_init(adev); 4570 if (r) 4571 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4572 4573 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4574 if (amdgpu_device_cache_pci_state(adev->pdev)) 4575 pci_restore_state(pdev); 4576 4577 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4578 /* this will fail for cards that aren't VGA class devices, just 4579 * ignore it 4580 */ 4581 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4582 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4583 4584 px = amdgpu_device_supports_px(ddev); 4585 4586 if (px || (!dev_is_removable(&adev->pdev->dev) && 4587 apple_gmux_detect(NULL, NULL))) 4588 vga_switcheroo_register_client(adev->pdev, 4589 &amdgpu_switcheroo_ops, px); 4590 4591 if (px) 4592 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4593 4594 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 4595 amdgpu_xgmi_reset_on_init(adev); 4596 4597 amdgpu_device_check_iommu_direct_map(adev); 4598 4599 return 0; 4600 4601 release_ras_con: 4602 if (amdgpu_sriov_vf(adev)) 4603 amdgpu_virt_release_full_gpu(adev, true); 4604 4605 /* failed in exclusive mode due to timeout */ 4606 if (amdgpu_sriov_vf(adev) && 4607 !amdgpu_sriov_runtime(adev) && 4608 amdgpu_virt_mmio_blocked(adev) && 4609 !amdgpu_virt_wait_reset(adev)) { 4610 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4611 /* Don't send request since VF is inactive. 
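 * Clearing the runtime cap and detaching the virt ops below prevents any
 * further requests from being sent to the inactive host; the init path then
 * returns -EAGAIN.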
*/ 4612 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4613 adev->virt.ops = NULL; 4614 r = -EAGAIN; 4615 } 4616 amdgpu_release_ras_context(adev); 4617 4618 failed: 4619 amdgpu_vf_error_trans_all(adev); 4620 4621 return r; 4622 } 4623 4624 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4625 { 4626 4627 /* Clear all CPU mappings pointing to this device */ 4628 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4629 4630 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4631 amdgpu_doorbell_fini(adev); 4632 4633 iounmap(adev->rmmio); 4634 adev->rmmio = NULL; 4635 if (adev->mman.aper_base_kaddr) 4636 iounmap(adev->mman.aper_base_kaddr); 4637 adev->mman.aper_base_kaddr = NULL; 4638 4639 /* Memory manager related */ 4640 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4641 arch_phys_wc_del(adev->gmc.vram_mtrr); 4642 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4643 } 4644 } 4645 4646 /** 4647 * amdgpu_device_fini_hw - tear down the driver 4648 * 4649 * @adev: amdgpu_device pointer 4650 * 4651 * Tear down the driver info (all asics). 4652 * Called at driver shutdown. 4653 */ 4654 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4655 { 4656 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4657 flush_delayed_work(&adev->delayed_init_work); 4658 4659 if (adev->mman.initialized) 4660 drain_workqueue(adev->mman.bdev.wq); 4661 adev->shutdown = true; 4662 4663 /* make sure IB test finished before entering exclusive mode 4664 * to avoid preemption on IB test 4665 */ 4666 if (amdgpu_sriov_vf(adev)) { 4667 amdgpu_virt_request_full_gpu(adev, false); 4668 amdgpu_virt_fini_data_exchange(adev); 4669 } 4670 4671 /* disable all interrupts */ 4672 amdgpu_irq_disable_all(adev); 4673 if (adev->mode_info.mode_config_initialized) { 4674 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4675 drm_helper_force_disable_all(adev_to_drm(adev)); 4676 else 4677 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4678 } 4679 amdgpu_fence_driver_hw_fini(adev); 4680 4681 if (adev->pm.sysfs_initialized) 4682 amdgpu_pm_sysfs_fini(adev); 4683 if (adev->ucode_sysfs_en) 4684 amdgpu_ucode_sysfs_fini(adev); 4685 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4686 amdgpu_fru_sysfs_fini(adev); 4687 4688 amdgpu_reg_state_sysfs_fini(adev); 4689 amdgpu_xcp_cfg_sysfs_fini(adev); 4690 4691 /* disable ras feature must before hw fini */ 4692 amdgpu_ras_pre_fini(adev); 4693 4694 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4695 4696 amdgpu_device_ip_fini_early(adev); 4697 4698 amdgpu_irq_fini_hw(adev); 4699 4700 if (adev->mman.initialized) 4701 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4702 4703 amdgpu_gart_dummy_page_fini(adev); 4704 4705 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4706 amdgpu_device_unmap_mmio(adev); 4707 4708 } 4709 4710 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4711 { 4712 int idx; 4713 bool px; 4714 4715 amdgpu_device_ip_fini(adev); 4716 amdgpu_fence_driver_sw_fini(adev); 4717 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4718 adev->accel_working = false; 4719 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4720 4721 amdgpu_reset_fini(adev); 4722 4723 /* free i2c buses */ 4724 if (!amdgpu_device_has_dc_support(adev)) 4725 amdgpu_i2c_fini(adev); 4726 4727 if (amdgpu_emu_mode != 1) 4728 amdgpu_atombios_fini(adev); 4729 4730 kfree(adev->bios); 4731 adev->bios = NULL; 4732 4733 kfree(adev->fru_info); 4734 adev->fru_info = NULL; 4735 4736 px = 
amdgpu_device_supports_px(adev_to_drm(adev)); 4737 4738 if (px || (!dev_is_removable(&adev->pdev->dev) && 4739 apple_gmux_detect(NULL, NULL))) 4740 vga_switcheroo_unregister_client(adev->pdev); 4741 4742 if (px) 4743 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4744 4745 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4746 vga_client_unregister(adev->pdev); 4747 4748 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4749 4750 iounmap(adev->rmmio); 4751 adev->rmmio = NULL; 4752 amdgpu_doorbell_fini(adev); 4753 drm_dev_exit(idx); 4754 } 4755 4756 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4757 amdgpu_pmu_fini(adev); 4758 if (adev->mman.discovery_bin) 4759 amdgpu_discovery_fini(adev); 4760 4761 amdgpu_reset_put_reset_domain(adev->reset_domain); 4762 adev->reset_domain = NULL; 4763 4764 kfree(adev->pci_state); 4765 4766 } 4767 4768 /** 4769 * amdgpu_device_evict_resources - evict device resources 4770 * @adev: amdgpu device object 4771 * 4772 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4773 * of the vram memory type. Mainly used for evicting device resources 4774 * at suspend time. 4775 * 4776 */ 4777 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4778 { 4779 int ret; 4780 4781 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4782 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4783 return 0; 4784 4785 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4786 if (ret) 4787 DRM_WARN("evicting device resources failed\n"); 4788 return ret; 4789 } 4790 4791 /* 4792 * Suspend & resume. 4793 */ 4794 /** 4795 * amdgpu_device_prepare - prepare for device suspend 4796 * 4797 * @dev: drm dev pointer 4798 * 4799 * Prepare to put the hw in the suspend state (all asics). 4800 * Returns 0 for success or an error on failure. 4801 * Called at driver suspend. 4802 */ 4803 int amdgpu_device_prepare(struct drm_device *dev) 4804 { 4805 struct amdgpu_device *adev = drm_to_adev(dev); 4806 int i, r; 4807 4808 amdgpu_choose_low_power_state(adev); 4809 4810 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4811 return 0; 4812 4813 /* Evict the majority of BOs before starting suspend sequence */ 4814 r = amdgpu_device_evict_resources(adev); 4815 if (r) 4816 goto unprepare; 4817 4818 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 4819 4820 for (i = 0; i < adev->num_ip_blocks; i++) { 4821 if (!adev->ip_blocks[i].status.valid) 4822 continue; 4823 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 4824 continue; 4825 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]); 4826 if (r) 4827 goto unprepare; 4828 } 4829 4830 return 0; 4831 4832 unprepare: 4833 adev->in_s0ix = adev->in_s3 = false; 4834 4835 return r; 4836 } 4837 4838 /** 4839 * amdgpu_device_suspend - initiate device suspend 4840 * 4841 * @dev: drm dev pointer 4842 * @notify_clients: notify in-kernel DRM clients 4843 * 4844 * Puts the hw in the suspend state (all asics). 4845 * Returns 0 for success or an error on failure. 4846 * Called at driver suspend. 
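 * The suspend sequence runs IP suspend phase 1, evicts buffers from VRAM, and
 * then runs IP suspend phase 2 with the fence driver already quiesced.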
4847 */ 4848 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) 4849 { 4850 struct amdgpu_device *adev = drm_to_adev(dev); 4851 int r = 0; 4852 4853 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4854 return 0; 4855 4856 adev->in_suspend = true; 4857 4858 if (amdgpu_sriov_vf(adev)) { 4859 amdgpu_virt_fini_data_exchange(adev); 4860 r = amdgpu_virt_request_full_gpu(adev, false); 4861 if (r) 4862 return r; 4863 } 4864 4865 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4866 DRM_WARN("smart shift update failed\n"); 4867 4868 if (notify_clients) 4869 drm_client_dev_suspend(adev_to_drm(adev), false); 4870 4871 cancel_delayed_work_sync(&adev->delayed_init_work); 4872 4873 amdgpu_ras_suspend(adev); 4874 4875 amdgpu_device_ip_suspend_phase1(adev); 4876 4877 if (!adev->in_s0ix) 4878 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4879 4880 r = amdgpu_device_evict_resources(adev); 4881 if (r) 4882 return r; 4883 4884 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4885 4886 amdgpu_fence_driver_hw_fini(adev); 4887 4888 amdgpu_device_ip_suspend_phase2(adev); 4889 4890 if (amdgpu_sriov_vf(adev)) 4891 amdgpu_virt_release_full_gpu(adev, false); 4892 4893 r = amdgpu_dpm_notify_rlc_state(adev, false); 4894 if (r) 4895 return r; 4896 4897 return 0; 4898 } 4899 4900 /** 4901 * amdgpu_device_resume - initiate device resume 4902 * 4903 * @dev: drm dev pointer 4904 * @notify_clients: notify in-kernel DRM clients 4905 * 4906 * Bring the hw back to operating state (all asics). 4907 * Returns 0 for success or an error on failure. 4908 * Called at driver resume. 4909 */ 4910 int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) 4911 { 4912 struct amdgpu_device *adev = drm_to_adev(dev); 4913 int r = 0; 4914 4915 if (amdgpu_sriov_vf(adev)) { 4916 r = amdgpu_virt_request_full_gpu(adev, true); 4917 if (r) 4918 return r; 4919 } 4920 4921 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4922 return 0; 4923 4924 if (adev->in_s0ix) 4925 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4926 4927 /* post card */ 4928 if (amdgpu_device_need_post(adev)) { 4929 r = amdgpu_device_asic_init(adev); 4930 if (r) 4931 dev_err(adev->dev, "amdgpu asic init failed\n"); 4932 } 4933 4934 r = amdgpu_device_ip_resume(adev); 4935 4936 if (r) { 4937 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4938 goto exit; 4939 } 4940 4941 if (!adev->in_s0ix) { 4942 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4943 if (r) 4944 goto exit; 4945 } 4946 4947 r = amdgpu_device_ip_late_init(adev); 4948 if (r) 4949 goto exit; 4950 4951 queue_delayed_work(system_wq, &adev->delayed_init_work, 4952 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4953 exit: 4954 if (amdgpu_sriov_vf(adev)) { 4955 amdgpu_virt_init_data_exchange(adev); 4956 amdgpu_virt_release_full_gpu(adev, true); 4957 } 4958 4959 if (r) 4960 return r; 4961 4962 /* Make sure IB tests flushed */ 4963 flush_delayed_work(&adev->delayed_init_work); 4964 4965 if (notify_clients) 4966 drm_client_dev_resume(adev_to_drm(adev), false); 4967 4968 amdgpu_ras_resume(adev); 4969 4970 if (adev->mode_info.num_crtc) { 4971 /* 4972 * Most of the connector probing functions try to acquire runtime pm 4973 * refs to ensure that the GPU is powered on when connector polling is 4974 * performed. Since we're calling this from a runtime PM callback, 4975 * trying to acquire rpm refs will cause us to deadlock. 
4976 * 4977 * Since we're guaranteed to be holding the rpm lock, it's safe to 4978 * temporarily disable the rpm helpers so this doesn't deadlock us. 4979 */ 4980 #ifdef CONFIG_PM 4981 dev->dev->power.disable_depth++; 4982 #endif 4983 if (!adev->dc_enabled) 4984 drm_helper_hpd_irq_event(dev); 4985 else 4986 drm_kms_helper_hotplug_event(dev); 4987 #ifdef CONFIG_PM 4988 dev->dev->power.disable_depth--; 4989 #endif 4990 } 4991 adev->in_suspend = false; 4992 4993 if (adev->enable_mes) 4994 amdgpu_mes_self_test(adev); 4995 4996 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4997 DRM_WARN("smart shift update failed\n"); 4998 4999 return 0; 5000 } 5001 5002 /** 5003 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 5004 * 5005 * @adev: amdgpu_device pointer 5006 * 5007 * The list of all the hardware IPs that make up the asic is walked and 5008 * the check_soft_reset callbacks are run. check_soft_reset determines 5009 * if the asic is still hung or not. 5010 * Returns true if any of the IPs are still in a hung state, false if not. 5011 */ 5012 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 5013 { 5014 int i; 5015 bool asic_hang = false; 5016 5017 if (amdgpu_sriov_vf(adev)) 5018 return true; 5019 5020 if (amdgpu_asic_need_full_reset(adev)) 5021 return true; 5022 5023 for (i = 0; i < adev->num_ip_blocks; i++) { 5024 if (!adev->ip_blocks[i].status.valid) 5025 continue; 5026 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 5027 adev->ip_blocks[i].status.hang = 5028 adev->ip_blocks[i].version->funcs->check_soft_reset( 5029 &adev->ip_blocks[i]); 5030 if (adev->ip_blocks[i].status.hang) { 5031 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 5032 asic_hang = true; 5033 } 5034 } 5035 return asic_hang; 5036 } 5037 5038 /** 5039 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 5040 * 5041 * @adev: amdgpu_device pointer 5042 * 5043 * The list of all the hardware IPs that make up the asic is walked and the 5044 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 5045 * handles any IP specific hardware or software state changes that are 5046 * necessary for a soft reset to succeed. 5047 * Returns 0 on success, negative error code on failure. 5048 */ 5049 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 5050 { 5051 int i, r = 0; 5052 5053 for (i = 0; i < adev->num_ip_blocks; i++) { 5054 if (!adev->ip_blocks[i].status.valid) 5055 continue; 5056 if (adev->ip_blocks[i].status.hang && 5057 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 5058 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]); 5059 if (r) 5060 return r; 5061 } 5062 } 5063 5064 return 0; 5065 } 5066 5067 /** 5068 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 5069 * 5070 * @adev: amdgpu_device pointer 5071 * 5072 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 5073 * reset is necessary to recover. 5074 * Returns true if a full asic reset is required, false if not. 
5075 */ 5076 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 5077 { 5078 int i; 5079 5080 if (amdgpu_asic_need_full_reset(adev)) 5081 return true; 5082 5083 for (i = 0; i < adev->num_ip_blocks; i++) { 5084 if (!adev->ip_blocks[i].status.valid) 5085 continue; 5086 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 5087 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 5088 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 5089 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 5090 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 5091 if (adev->ip_blocks[i].status.hang) { 5092 dev_info(adev->dev, "Some block need full reset!\n"); 5093 return true; 5094 } 5095 } 5096 } 5097 return false; 5098 } 5099 5100 /** 5101 * amdgpu_device_ip_soft_reset - do a soft reset 5102 * 5103 * @adev: amdgpu_device pointer 5104 * 5105 * The list of all the hardware IPs that make up the asic is walked and the 5106 * soft_reset callbacks are run if the block is hung. soft_reset handles any 5107 * IP specific hardware or software state changes that are necessary to soft 5108 * reset the IP. 5109 * Returns 0 on success, negative error code on failure. 5110 */ 5111 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 5112 { 5113 int i, r = 0; 5114 5115 for (i = 0; i < adev->num_ip_blocks; i++) { 5116 if (!adev->ip_blocks[i].status.valid) 5117 continue; 5118 if (adev->ip_blocks[i].status.hang && 5119 adev->ip_blocks[i].version->funcs->soft_reset) { 5120 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]); 5121 if (r) 5122 return r; 5123 } 5124 } 5125 5126 return 0; 5127 } 5128 5129 /** 5130 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 5131 * 5132 * @adev: amdgpu_device pointer 5133 * 5134 * The list of all the hardware IPs that make up the asic is walked and the 5135 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 5136 * handles any IP specific hardware or software state changes that are 5137 * necessary after the IP has been soft reset. 5138 * Returns 0 on success, negative error code on failure. 
5139 */ 5140 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 5141 { 5142 int i, r = 0; 5143 5144 for (i = 0; i < adev->num_ip_blocks; i++) { 5145 if (!adev->ip_blocks[i].status.valid) 5146 continue; 5147 if (adev->ip_blocks[i].status.hang && 5148 adev->ip_blocks[i].version->funcs->post_soft_reset) 5149 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]); 5150 if (r) 5151 return r; 5152 } 5153 5154 return 0; 5155 } 5156 5157 /** 5158 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 5159 * 5160 * @adev: amdgpu_device pointer 5161 * @reset_context: amdgpu reset context pointer 5162 * 5163 * do VF FLR and reinitialize Asic 5164 * return 0 means succeeded otherwise failed 5165 */ 5166 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 5167 struct amdgpu_reset_context *reset_context) 5168 { 5169 int r; 5170 struct amdgpu_hive_info *hive = NULL; 5171 5172 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { 5173 if (!amdgpu_ras_get_fed_status(adev)) 5174 amdgpu_virt_ready_to_reset(adev); 5175 amdgpu_virt_wait_reset(adev); 5176 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5177 r = amdgpu_virt_request_full_gpu(adev, true); 5178 } else { 5179 r = amdgpu_virt_reset_gpu(adev); 5180 } 5181 if (r) 5182 return r; 5183 5184 amdgpu_ras_set_fed(adev, false); 5185 amdgpu_irq_gpu_reset_resume_helper(adev); 5186 5187 /* some sw clean up VF needs to do before recover */ 5188 amdgpu_virt_post_reset(adev); 5189 5190 /* Resume IP prior to SMC */ 5191 r = amdgpu_device_ip_reinit_early_sriov(adev); 5192 if (r) 5193 return r; 5194 5195 amdgpu_virt_init_data_exchange(adev); 5196 5197 r = amdgpu_device_fw_loading(adev); 5198 if (r) 5199 return r; 5200 5201 /* now we are okay to resume SMC/CP/SDMA */ 5202 r = amdgpu_device_ip_reinit_late_sriov(adev); 5203 if (r) 5204 return r; 5205 5206 hive = amdgpu_get_xgmi_hive(adev); 5207 /* Update PSP FW topology after reset */ 5208 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5209 r = amdgpu_xgmi_update_topology(hive, adev); 5210 if (hive) 5211 amdgpu_put_xgmi_hive(hive); 5212 if (r) 5213 return r; 5214 5215 r = amdgpu_ib_ring_tests(adev); 5216 if (r) 5217 return r; 5218 5219 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) 5220 amdgpu_inc_vram_lost(adev); 5221 5222 /* need to be called during full access so we can't do it later like 5223 * bare-metal does. 
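 * (i.e. before amdgpu_virt_release_full_gpu() below hands exclusive access
 * back to the host).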
5224 */ 5225 amdgpu_amdkfd_post_reset(adev); 5226 amdgpu_virt_release_full_gpu(adev, true); 5227 5228 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5229 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) || 5230 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 5231 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 5232 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5233 amdgpu_ras_resume(adev); 5234 5235 amdgpu_virt_ras_telemetry_post_reset(adev); 5236 5237 return 0; 5238 } 5239 5240 /** 5241 * amdgpu_device_has_job_running - check if there is any job in mirror list 5242 * 5243 * @adev: amdgpu_device pointer 5244 * 5245 * check if there is any job in mirror list 5246 */ 5247 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5248 { 5249 int i; 5250 struct drm_sched_job *job; 5251 5252 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5253 struct amdgpu_ring *ring = adev->rings[i]; 5254 5255 if (!amdgpu_ring_sched_ready(ring)) 5256 continue; 5257 5258 spin_lock(&ring->sched.job_list_lock); 5259 job = list_first_entry_or_null(&ring->sched.pending_list, 5260 struct drm_sched_job, list); 5261 spin_unlock(&ring->sched.job_list_lock); 5262 if (job) 5263 return true; 5264 } 5265 return false; 5266 } 5267 5268 /** 5269 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5270 * 5271 * @adev: amdgpu_device pointer 5272 * 5273 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5274 * a hung GPU. 5275 */ 5276 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5277 { 5278 5279 if (amdgpu_gpu_recovery == 0) 5280 goto disabled; 5281 5282 /* Skip soft reset check in fatal error mode */ 5283 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5284 return true; 5285 5286 if (amdgpu_sriov_vf(adev)) 5287 return true; 5288 5289 if (amdgpu_gpu_recovery == -1) { 5290 switch (adev->asic_type) { 5291 #ifdef CONFIG_DRM_AMDGPU_SI 5292 case CHIP_VERDE: 5293 case CHIP_TAHITI: 5294 case CHIP_PITCAIRN: 5295 case CHIP_OLAND: 5296 case CHIP_HAINAN: 5297 #endif 5298 #ifdef CONFIG_DRM_AMDGPU_CIK 5299 case CHIP_KAVERI: 5300 case CHIP_KABINI: 5301 case CHIP_MULLINS: 5302 #endif 5303 case CHIP_CARRIZO: 5304 case CHIP_STONEY: 5305 case CHIP_CYAN_SKILLFISH: 5306 goto disabled; 5307 default: 5308 break; 5309 } 5310 } 5311 5312 return true; 5313 5314 disabled: 5315 dev_info(adev->dev, "GPU recovery disabled.\n"); 5316 return false; 5317 } 5318 5319 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5320 { 5321 u32 i; 5322 int ret = 0; 5323 5324 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5325 5326 dev_info(adev->dev, "GPU mode1 reset\n"); 5327 5328 /* Cache the state before bus master disable. The saved config space 5329 * values are used in other cases like restore after mode-2 reset. 
5330 */ 5331 amdgpu_device_cache_pci_state(adev->pdev); 5332 5333 /* disable BM */ 5334 pci_clear_master(adev->pdev); 5335 5336 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5337 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5338 ret = amdgpu_dpm_mode1_reset(adev); 5339 } else { 5340 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5341 ret = psp_gpu_reset(adev); 5342 } 5343 5344 if (ret) 5345 goto mode1_reset_failed; 5346 5347 amdgpu_device_load_pci_state(adev->pdev); 5348 ret = amdgpu_psp_wait_for_bootloader(adev); 5349 if (ret) 5350 goto mode1_reset_failed; 5351 5352 /* wait for asic to come out of reset */ 5353 for (i = 0; i < adev->usec_timeout; i++) { 5354 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5355 5356 if (memsize != 0xffffffff) 5357 break; 5358 udelay(1); 5359 } 5360 5361 if (i >= adev->usec_timeout) { 5362 ret = -ETIMEDOUT; 5363 goto mode1_reset_failed; 5364 } 5365 5366 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5367 5368 return 0; 5369 5370 mode1_reset_failed: 5371 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5372 return ret; 5373 } 5374 5375 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5376 struct amdgpu_reset_context *reset_context) 5377 { 5378 int i, r = 0; 5379 struct amdgpu_job *job = NULL; 5380 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; 5381 bool need_full_reset = 5382 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5383 5384 if (reset_context->reset_req_dev == adev) 5385 job = reset_context->job; 5386 5387 if (amdgpu_sriov_vf(adev)) 5388 amdgpu_virt_pre_reset(adev); 5389 5390 amdgpu_fence_driver_isr_toggle(adev, true); 5391 5392 /* block all schedulers and reset given job's ring */ 5393 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5394 struct amdgpu_ring *ring = adev->rings[i]; 5395 5396 if (!amdgpu_ring_sched_ready(ring)) 5397 continue; 5398 5399 /* Clear job fence from fence drv to avoid force_completion 5400 * leave NULL and vm flush fence in fence drv 5401 */ 5402 amdgpu_fence_driver_clear_job_fences(ring); 5403 5404 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5405 amdgpu_fence_driver_force_completion(ring); 5406 } 5407 5408 amdgpu_fence_driver_isr_toggle(adev, false); 5409 5410 if (job && job->vm) 5411 drm_sched_increase_karma(&job->base); 5412 5413 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5414 /* If reset handler not implemented, continue; otherwise return */ 5415 if (r == -EOPNOTSUPP) 5416 r = 0; 5417 else 5418 return r; 5419 5420 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5421 if (!amdgpu_sriov_vf(adev)) { 5422 5423 if (!need_full_reset) 5424 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5425 5426 if (!need_full_reset && amdgpu_gpu_recovery && 5427 amdgpu_device_ip_check_soft_reset(adev)) { 5428 amdgpu_device_ip_pre_soft_reset(adev); 5429 r = amdgpu_device_ip_soft_reset(adev); 5430 amdgpu_device_ip_post_soft_reset(adev); 5431 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5432 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5433 need_full_reset = true; 5434 } 5435 } 5436 5437 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { 5438 dev_info(tmp_adev->dev, "Dumping IP State\n"); 5439 /* Trigger ip dump before we reset the asic */ 5440 for (i = 0; i < tmp_adev->num_ip_blocks; i++) 5441 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) 5442 tmp_adev->ip_blocks[i].version->funcs 5443 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]); 5444 
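/* The captured IP register state is later included in the devcoredump
 * generated for this reset, for post-mortem analysis.
 */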
dev_info(tmp_adev->dev, "Dumping IP State Completed\n"); 5445 } 5446 5447 if (need_full_reset) 5448 r = amdgpu_device_ip_suspend(adev); 5449 if (need_full_reset) 5450 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5451 else 5452 clear_bit(AMDGPU_NEED_FULL_RESET, 5453 &reset_context->flags); 5454 } 5455 5456 return r; 5457 } 5458 5459 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) 5460 { 5461 struct list_head *device_list_handle; 5462 bool full_reset, vram_lost = false; 5463 struct amdgpu_device *tmp_adev; 5464 int r, init_level; 5465 5466 device_list_handle = reset_context->reset_device_list; 5467 5468 if (!device_list_handle) 5469 return -EINVAL; 5470 5471 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5472 5473 /** 5474 * If it's reset on init, it's default init level, otherwise keep level 5475 * as recovery level. 5476 */ 5477 if (reset_context->method == AMD_RESET_METHOD_ON_INIT) 5478 init_level = AMDGPU_INIT_LEVEL_DEFAULT; 5479 else 5480 init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY; 5481 5482 r = 0; 5483 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5484 amdgpu_set_init_level(tmp_adev, init_level); 5485 if (full_reset) { 5486 /* post card */ 5487 amdgpu_ras_set_fed(tmp_adev, false); 5488 r = amdgpu_device_asic_init(tmp_adev); 5489 if (r) { 5490 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5491 } else { 5492 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5493 5494 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5495 if (r) 5496 goto out; 5497 5498 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5499 5500 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 5501 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job); 5502 5503 if (vram_lost) { 5504 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5505 amdgpu_inc_vram_lost(tmp_adev); 5506 } 5507 5508 r = amdgpu_device_fw_loading(tmp_adev); 5509 if (r) 5510 return r; 5511 5512 r = amdgpu_xcp_restore_partition_mode( 5513 tmp_adev->xcp_mgr); 5514 if (r) 5515 goto out; 5516 5517 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5518 if (r) 5519 goto out; 5520 5521 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 5522 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5523 5524 r = amdgpu_device_ip_resume_phase3(tmp_adev); 5525 if (r) 5526 goto out; 5527 5528 if (vram_lost) 5529 amdgpu_device_fill_reset_magic(tmp_adev); 5530 5531 /* 5532 * Add this ASIC as tracked as reset was already 5533 * complete successfully. 5534 */ 5535 amdgpu_register_gpu_instance(tmp_adev); 5536 5537 if (!reset_context->hive && 5538 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5539 amdgpu_xgmi_add_device(tmp_adev); 5540 5541 r = amdgpu_device_ip_late_init(tmp_adev); 5542 if (r) 5543 goto out; 5544 5545 drm_client_dev_resume(adev_to_drm(tmp_adev), false); 5546 5547 /* 5548 * The GPU enters bad state once faulty pages 5549 * by ECC has reached the threshold, and ras 5550 * recovery is scheduled next. So add one check 5551 * here to break recovery if it indeed exceeds 5552 * bad page threshold, and remind user to 5553 * retire this GPU or setting one bigger 5554 * bad_page_threshold value to fix this once 5555 * probing driver again. 5556 */ 5557 if (!amdgpu_ras_is_rma(tmp_adev)) { 5558 /* must succeed. 
*/ 5559 amdgpu_ras_resume(tmp_adev); 5560 } else { 5561 r = -EINVAL; 5562 goto out; 5563 } 5564 5565 /* Update PSP FW topology after reset */ 5566 if (reset_context->hive && 5567 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5568 r = amdgpu_xgmi_update_topology( 5569 reset_context->hive, tmp_adev); 5570 } 5571 } 5572 5573 out: 5574 if (!r) { 5575 /* IP init is complete now, set level as default */ 5576 amdgpu_set_init_level(tmp_adev, 5577 AMDGPU_INIT_LEVEL_DEFAULT); 5578 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5579 r = amdgpu_ib_ring_tests(tmp_adev); 5580 if (r) { 5581 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5582 r = -EAGAIN; 5583 goto end; 5584 } 5585 } 5586 5587 if (r) 5588 tmp_adev->asic_reset_res = r; 5589 } 5590 5591 end: 5592 return r; 5593 } 5594 5595 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5596 struct amdgpu_reset_context *reset_context) 5597 { 5598 struct amdgpu_device *tmp_adev = NULL; 5599 bool need_full_reset, skip_hw_reset; 5600 int r = 0; 5601 5602 /* Try reset handler method first */ 5603 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5604 reset_list); 5605 5606 reset_context->reset_device_list = device_list_handle; 5607 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5608 /* If reset handler not implemented, continue; otherwise return */ 5609 if (r == -EOPNOTSUPP) 5610 r = 0; 5611 else 5612 return r; 5613 5614 /* Reset handler not implemented, use the default method */ 5615 need_full_reset = 5616 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5617 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5618 5619 /* 5620 * ASIC reset has to be done on all XGMI hive nodes ASAP 5621 * to allow proper links negotiation in FW (within 1 sec) 5622 */ 5623 if (!skip_hw_reset && need_full_reset) { 5624 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5625 /* For XGMI run all resets in parallel to speed up the process */ 5626 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5627 if (!queue_work(system_unbound_wq, 5628 &tmp_adev->xgmi_reset_work)) 5629 r = -EALREADY; 5630 } else 5631 r = amdgpu_asic_reset(tmp_adev); 5632 5633 if (r) { 5634 dev_err(tmp_adev->dev, 5635 "ASIC reset failed with error, %d for drm dev, %s", 5636 r, adev_to_drm(tmp_adev)->unique); 5637 goto out; 5638 } 5639 } 5640 5641 /* For XGMI wait for all resets to complete before proceed */ 5642 if (!r) { 5643 list_for_each_entry(tmp_adev, device_list_handle, 5644 reset_list) { 5645 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5646 flush_work(&tmp_adev->xgmi_reset_work); 5647 r = tmp_adev->asic_reset_res; 5648 if (r) 5649 break; 5650 } 5651 } 5652 } 5653 } 5654 5655 if (!r && amdgpu_ras_intr_triggered()) { 5656 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5657 amdgpu_ras_reset_error_count(tmp_adev, 5658 AMDGPU_RAS_BLOCK__MMHUB); 5659 } 5660 5661 amdgpu_ras_intr_cleared(); 5662 } 5663 5664 r = amdgpu_device_reinit_after_reset(reset_context); 5665 if (r == -EAGAIN) 5666 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5667 else 5668 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5669 5670 out: 5671 return r; 5672 } 5673 5674 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5675 { 5676 5677 switch (amdgpu_asic_reset_method(adev)) { 5678 case AMD_RESET_METHOD_MODE1: 5679 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5680 break; 5681 case AMD_RESET_METHOD_MODE2: 5682 adev->mp1_state = PP_MP1_STATE_RESET; 5683 break; 5684 default: 5685 adev->mp1_state = 
PP_MP1_STATE_NONE; 5686 break; 5687 } 5688 } 5689 5690 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5691 { 5692 amdgpu_vf_error_trans_all(adev); 5693 adev->mp1_state = PP_MP1_STATE_NONE; 5694 } 5695 5696 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5697 { 5698 struct pci_dev *p = NULL; 5699 5700 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5701 adev->pdev->bus->number, 1); 5702 if (p) { 5703 pm_runtime_enable(&(p->dev)); 5704 pm_runtime_resume(&(p->dev)); 5705 } 5706 5707 pci_dev_put(p); 5708 } 5709 5710 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5711 { 5712 enum amd_reset_method reset_method; 5713 struct pci_dev *p = NULL; 5714 u64 expires; 5715 5716 /* 5717 * For now, only BACO and mode1 reset are confirmed 5718 * to suffer the audio issue without proper suspended. 5719 */ 5720 reset_method = amdgpu_asic_reset_method(adev); 5721 if ((reset_method != AMD_RESET_METHOD_BACO) && 5722 (reset_method != AMD_RESET_METHOD_MODE1)) 5723 return -EINVAL; 5724 5725 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5726 adev->pdev->bus->number, 1); 5727 if (!p) 5728 return -ENODEV; 5729 5730 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5731 if (!expires) 5732 /* 5733 * If we cannot get the audio device autosuspend delay, 5734 * a fixed 4S interval will be used. Considering 3S is 5735 * the audio controller default autosuspend delay setting. 5736 * 4S used here is guaranteed to cover that. 5737 */ 5738 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5739 5740 while (!pm_runtime_status_suspended(&(p->dev))) { 5741 if (!pm_runtime_suspend(&(p->dev))) 5742 break; 5743 5744 if (expires < ktime_get_mono_fast_ns()) { 5745 dev_warn(adev->dev, "failed to suspend display audio\n"); 5746 pci_dev_put(p); 5747 /* TODO: abort the succeeding gpu reset? */ 5748 return -ETIMEDOUT; 5749 } 5750 } 5751 5752 pm_runtime_disable(&(p->dev)); 5753 5754 pci_dev_put(p); 5755 return 0; 5756 } 5757 5758 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5759 { 5760 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5761 5762 #if defined(CONFIG_DEBUG_FS) 5763 if (!amdgpu_sriov_vf(adev)) 5764 cancel_work(&adev->reset_work); 5765 #endif 5766 5767 if (adev->kfd.dev) 5768 cancel_work(&adev->kfd.reset_work); 5769 5770 if (amdgpu_sriov_vf(adev)) 5771 cancel_work(&adev->virt.flr_work); 5772 5773 if (con && adev->ras_enabled) 5774 cancel_work(&con->recovery_work); 5775 5776 } 5777 5778 static int amdgpu_device_health_check(struct list_head *device_list_handle) 5779 { 5780 struct amdgpu_device *tmp_adev; 5781 int ret = 0; 5782 u32 status; 5783 5784 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5785 pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status); 5786 if (PCI_POSSIBLE_ERROR(status)) { 5787 dev_err(tmp_adev->dev, "device lost from bus!"); 5788 ret = -ENODEV; 5789 } 5790 } 5791 5792 return ret; 5793 } 5794 5795 /** 5796 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5797 * 5798 * @adev: amdgpu_device pointer 5799 * @job: which job trigger hang 5800 * @reset_context: amdgpu reset context pointer 5801 * 5802 * Attempt to reset the GPU if it has hung (all asics). 5803 * Attempt to do soft-reset or full-reset and reinitialize Asic 5804 * Returns 0 for success or an error on failure. 
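 *
 * A sketch of a typical caller (illustrative only; real callers such as the
 * job-timeout path also fill in the reset source and other context fields):
 *
 *   struct amdgpu_reset_context reset_context;
 *   int r;
 *
 *   memset(&reset_context, 0, sizeof(reset_context));
 *   reset_context.method = AMD_RESET_METHOD_NONE;
 *   reset_context.reset_req_dev = adev;
 *   set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
 *
 *   r = amdgpu_device_gpu_recover(adev, job, &reset_context);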
5805 */ 5806 5807 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5808 struct amdgpu_job *job, 5809 struct amdgpu_reset_context *reset_context) 5810 { 5811 struct list_head device_list, *device_list_handle = NULL; 5812 bool job_signaled = false; 5813 struct amdgpu_hive_info *hive = NULL; 5814 struct amdgpu_device *tmp_adev = NULL; 5815 int i, r = 0; 5816 bool need_emergency_restart = false; 5817 bool audio_suspended = false; 5818 int retry_limit = AMDGPU_MAX_RETRY_LIMIT; 5819 5820 /* 5821 * Special case: RAS triggered and full reset isn't supported 5822 */ 5823 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5824 5825 /* 5826 * Flush RAM to disk so that after reboot 5827 * the user can read log and see why the system rebooted. 5828 */ 5829 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 5830 amdgpu_ras_get_context(adev)->reboot) { 5831 DRM_WARN("Emergency reboot."); 5832 5833 ksys_sync_helper(); 5834 emergency_restart(); 5835 } 5836 5837 dev_info(adev->dev, "GPU %s begin!\n", 5838 need_emergency_restart ? "jobs stop":"reset"); 5839 5840 if (!amdgpu_sriov_vf(adev)) 5841 hive = amdgpu_get_xgmi_hive(adev); 5842 if (hive) 5843 mutex_lock(&hive->hive_lock); 5844 5845 reset_context->job = job; 5846 reset_context->hive = hive; 5847 /* 5848 * Build list of devices to reset. 5849 * In case we are in XGMI hive mode, resort the device list 5850 * to put adev in the 1st position. 5851 */ 5852 INIT_LIST_HEAD(&device_list); 5853 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) { 5854 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5855 list_add_tail(&tmp_adev->reset_list, &device_list); 5856 if (adev->shutdown) 5857 tmp_adev->shutdown = true; 5858 } 5859 if (!list_is_first(&adev->reset_list, &device_list)) 5860 list_rotate_to_front(&adev->reset_list, &device_list); 5861 device_list_handle = &device_list; 5862 } else { 5863 list_add_tail(&adev->reset_list, &device_list); 5864 device_list_handle = &device_list; 5865 } 5866 5867 if (!amdgpu_sriov_vf(adev)) { 5868 r = amdgpu_device_health_check(device_list_handle); 5869 if (r) 5870 goto end_reset; 5871 } 5872 5873 /* We need to lock reset domain only once both for XGMI and single device */ 5874 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5875 reset_list); 5876 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5877 5878 /* block all schedulers and reset given job's ring */ 5879 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5880 5881 amdgpu_device_set_mp1_state(tmp_adev); 5882 5883 /* 5884 * Try to put the audio codec into suspend state 5885 * before gpu reset started. 5886 * 5887 * Due to the power domain of the graphics device 5888 * is shared with AZ power domain. Without this, 5889 * we may change the audio hardware from behind 5890 * the audio driver's back. That will trigger 5891 * some audio codec errors. 
5892 */ 5893 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5894 audio_suspended = true; 5895 5896 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5897 5898 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5899 5900 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context); 5901 5902 /* 5903 * Mark these ASICs to be reseted as untracked first 5904 * And add them back after reset completed 5905 */ 5906 amdgpu_unregister_gpu_instance(tmp_adev); 5907 5908 drm_client_dev_suspend(adev_to_drm(tmp_adev), false); 5909 5910 /* disable ras on ALL IPs */ 5911 if (!need_emergency_restart && 5912 amdgpu_device_ip_need_full_reset(tmp_adev)) 5913 amdgpu_ras_suspend(tmp_adev); 5914 5915 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5916 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5917 5918 if (!amdgpu_ring_sched_ready(ring)) 5919 continue; 5920 5921 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5922 5923 if (need_emergency_restart) 5924 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5925 } 5926 atomic_inc(&tmp_adev->gpu_reset_counter); 5927 } 5928 5929 if (need_emergency_restart) 5930 goto skip_sched_resume; 5931 5932 /* 5933 * Must check guilty signal here since after this point all old 5934 * HW fences are force signaled. 5935 * 5936 * job->base holds a reference to parent fence 5937 */ 5938 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5939 job_signaled = true; 5940 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5941 goto skip_hw_reset; 5942 } 5943 5944 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5945 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5946 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5947 /*TODO Should we stop ?*/ 5948 if (r) { 5949 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5950 r, adev_to_drm(tmp_adev)->unique); 5951 tmp_adev->asic_reset_res = r; 5952 } 5953 } 5954 5955 /* Actual ASIC resets if needed.*/ 5956 /* Host driver will handle XGMI hive reset for SRIOV */ 5957 if (amdgpu_sriov_vf(adev)) { 5958 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { 5959 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); 5960 amdgpu_ras_set_fed(adev, true); 5961 set_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5962 } 5963 5964 r = amdgpu_device_reset_sriov(adev, reset_context); 5965 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) { 5966 amdgpu_virt_release_full_gpu(adev, true); 5967 goto retry; 5968 } 5969 if (r) 5970 adev->asic_reset_res = r; 5971 } else { 5972 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5973 if (r && r == -EAGAIN) 5974 goto retry; 5975 } 5976 5977 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5978 /* 5979 * Drop any pending non scheduler resets queued before reset is done. 5980 * Any reset scheduled after this point would be valid. Scheduler resets 5981 * were already dropped during drm_sched_stop and no new ones can come 5982 * in before drm_sched_start. 
5983 */ 5984 amdgpu_device_stop_pending_resets(tmp_adev); 5985 } 5986 5987 skip_hw_reset: 5988 5989 /* Post ASIC reset for all devs .*/ 5990 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5991 5992 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5993 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5994 5995 if (!amdgpu_ring_sched_ready(ring)) 5996 continue; 5997 5998 drm_sched_start(&ring->sched, 0); 5999 } 6000 6001 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 6002 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 6003 6004 if (tmp_adev->asic_reset_res) 6005 r = tmp_adev->asic_reset_res; 6006 6007 tmp_adev->asic_reset_res = 0; 6008 6009 if (r) { 6010 /* bad news, how to tell it to userspace ? 6011 * for ras error, we should report GPU bad status instead of 6012 * reset failure 6013 */ 6014 if (reset_context->src != AMDGPU_RESET_SRC_RAS || 6015 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) 6016 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", 6017 atomic_read(&tmp_adev->gpu_reset_counter)); 6018 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 6019 } else { 6020 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 6021 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 6022 DRM_WARN("smart shift update failed\n"); 6023 } 6024 } 6025 6026 skip_sched_resume: 6027 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6028 /* unlock kfd: SRIOV would do it separately */ 6029 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 6030 amdgpu_amdkfd_post_reset(tmp_adev); 6031 6032 /* kfd_post_reset will do nothing if kfd device is not initialized, 6033 * need to bring up kfd here if it's not be initialized before 6034 */ 6035 if (!adev->kfd.init_complete) 6036 amdgpu_amdkfd_device_init(adev); 6037 6038 if (audio_suspended) 6039 amdgpu_device_resume_display_audio(tmp_adev); 6040 6041 amdgpu_device_unset_mp1_state(tmp_adev); 6042 6043 amdgpu_ras_set_error_query_ready(tmp_adev, true); 6044 } 6045 6046 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 6047 reset_list); 6048 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 6049 6050 end_reset: 6051 if (hive) { 6052 mutex_unlock(&hive->hive_lock); 6053 amdgpu_put_xgmi_hive(hive); 6054 } 6055 6056 if (r) 6057 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 6058 6059 atomic_set(&adev->reset_domain->reset_res, r); 6060 return r; 6061 } 6062 6063 /** 6064 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner 6065 * 6066 * @adev: amdgpu_device pointer 6067 * @speed: pointer to the speed of the link 6068 * @width: pointer to the width of the link 6069 * 6070 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6071 * first physical partner to an AMD dGPU. 6072 * This will exclude any virtual switches and links. 
6073 */ 6074 static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev, 6075 enum pci_bus_speed *speed, 6076 enum pcie_link_width *width) 6077 { 6078 struct pci_dev *parent = adev->pdev; 6079 6080 if (!speed || !width) 6081 return; 6082 6083 *speed = PCI_SPEED_UNKNOWN; 6084 *width = PCIE_LNK_WIDTH_UNKNOWN; 6085 6086 if (amdgpu_device_pcie_dynamic_switching_supported(adev)) { 6087 while ((parent = pci_upstream_bridge(parent))) { 6088 /* skip upstream/downstream switches internal to dGPU*/ 6089 if (parent->vendor == PCI_VENDOR_ID_ATI) 6090 continue; 6091 *speed = pcie_get_speed_cap(parent); 6092 *width = pcie_get_width_cap(parent); 6093 break; 6094 } 6095 } else { 6096 /* use the current speeds rather than max if switching is not supported */ 6097 pcie_bandwidth_available(adev->pdev, NULL, speed, width); 6098 } 6099 } 6100 6101 /** 6102 * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot 6103 * 6104 * @adev: amdgpu_device pointer 6105 * 6106 * Fetchs and stores in the driver the PCIE capabilities (gen speed 6107 * and lanes) of the slot the device is in. Handles APUs and 6108 * virtualized environments where PCIE config space may not be available. 6109 */ 6110 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 6111 { 6112 struct pci_dev *pdev; 6113 enum pci_bus_speed speed_cap, platform_speed_cap; 6114 enum pcie_link_width platform_link_width; 6115 6116 if (amdgpu_pcie_gen_cap) 6117 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 6118 6119 if (amdgpu_pcie_lane_cap) 6120 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 6121 6122 /* covers APUs as well */ 6123 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 6124 if (adev->pm.pcie_gen_mask == 0) 6125 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 6126 if (adev->pm.pcie_mlw_mask == 0) 6127 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 6128 return; 6129 } 6130 6131 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 6132 return; 6133 6134 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap, 6135 &platform_link_width); 6136 6137 if (adev->pm.pcie_gen_mask == 0) { 6138 /* asic caps */ 6139 pdev = adev->pdev; 6140 speed_cap = pcie_get_speed_cap(pdev); 6141 if (speed_cap == PCI_SPEED_UNKNOWN) { 6142 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6143 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6144 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6145 } else { 6146 if (speed_cap == PCIE_SPEED_32_0GT) 6147 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6148 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6149 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6150 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6151 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 6152 else if (speed_cap == PCIE_SPEED_16_0GT) 6153 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6154 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6155 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6156 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 6157 else if (speed_cap == PCIE_SPEED_8_0GT) 6158 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6159 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6160 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6161 else if (speed_cap == PCIE_SPEED_5_0GT) 6162 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6163 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 6164 else 6165 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 6166 } 6167 /* platform caps */ 6168 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 6169 
adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6170 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6171 } else { 6172 if (platform_speed_cap == PCIE_SPEED_32_0GT) 6173 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6174 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6175 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6176 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6177 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 6178 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 6179 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6180 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6181 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6182 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 6183 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 6184 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6185 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6186 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 6187 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 6188 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6189 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6190 else 6191 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 6192 6193 } 6194 } 6195 if (adev->pm.pcie_mlw_mask == 0) { 6196 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6197 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 6198 } else { 6199 switch (platform_link_width) { 6200 case PCIE_LNK_X32: 6201 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 6202 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6203 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6204 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6205 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6206 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6207 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6208 break; 6209 case PCIE_LNK_X16: 6210 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6211 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6212 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6213 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6214 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6215 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6216 break; 6217 case PCIE_LNK_X12: 6218 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6219 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6220 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6221 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6222 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6223 break; 6224 case PCIE_LNK_X8: 6225 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6226 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6227 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6228 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6229 break; 6230 case PCIE_LNK_X4: 6231 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6232 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6233 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6234 break; 6235 case PCIE_LNK_X2: 6236 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6237 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6238 break; 6239 case PCIE_LNK_X1: 6240 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 6241 break; 6242 default: 6243 break; 6244 } 6245 } 6246 } 6247 } 6248 6249 /** 6250 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 6251 * 6252 * @adev: amdgpu_device pointer 6253 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 6254 * 6255 * Return true if @peer_adev can access (DMA) @adev through the PCIe 6256 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 6257 * @peer_adev. 
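 *
 * A minimal usage sketch (illustrative only; the real callers in the KFD
 * and XGMI paths are more involved):
 *
 *   if (amdgpu_device_is_peer_accessible(adev, peer_adev)) {
 *       // peer can DMA straight into adev's VRAM through the BAR
 *   } else {
 *       // fall back to staging copies through system memory
 *   }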
6258 */ 6259 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 6260 struct amdgpu_device *peer_adev) 6261 { 6262 #ifdef CONFIG_HSA_AMD_P2P 6263 bool p2p_access = 6264 !adev->gmc.xgmi.connected_to_cpu && 6265 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 6266 if (!p2p_access) 6267 dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n", 6268 pci_name(peer_adev->pdev)); 6269 6270 bool is_large_bar = adev->gmc.visible_vram_size && 6271 adev->gmc.real_vram_size == adev->gmc.visible_vram_size; 6272 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev); 6273 6274 if (!p2p_addressable) { 6275 uint64_t address_mask = peer_adev->dev->dma_mask ? 6276 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 6277 resource_size_t aper_limit = 6278 adev->gmc.aper_base + adev->gmc.aper_size - 1; 6279 6280 p2p_addressable = !(adev->gmc.aper_base & address_mask || 6281 aper_limit & address_mask); 6282 } 6283 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable; 6284 #else 6285 return false; 6286 #endif 6287 } 6288 6289 int amdgpu_device_baco_enter(struct drm_device *dev) 6290 { 6291 struct amdgpu_device *adev = drm_to_adev(dev); 6292 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6293 6294 if (!amdgpu_device_supports_baco(dev)) 6295 return -ENOTSUPP; 6296 6297 if (ras && adev->ras_enabled && 6298 adev->nbio.funcs->enable_doorbell_interrupt) 6299 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 6300 6301 return amdgpu_dpm_baco_enter(adev); 6302 } 6303 6304 int amdgpu_device_baco_exit(struct drm_device *dev) 6305 { 6306 struct amdgpu_device *adev = drm_to_adev(dev); 6307 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6308 int ret = 0; 6309 6310 if (!amdgpu_device_supports_baco(dev)) 6311 return -ENOTSUPP; 6312 6313 ret = amdgpu_dpm_baco_exit(adev); 6314 if (ret) 6315 return ret; 6316 6317 if (ras && adev->ras_enabled && 6318 adev->nbio.funcs->enable_doorbell_interrupt) 6319 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 6320 6321 if (amdgpu_passthrough(adev) && adev->nbio.funcs && 6322 adev->nbio.funcs->clear_doorbell_interrupt) 6323 adev->nbio.funcs->clear_doorbell_interrupt(adev); 6324 6325 return 0; 6326 } 6327 6328 /** 6329 * amdgpu_pci_error_detected - Called when a PCI error is detected. 6330 * @pdev: PCI device struct 6331 * @state: PCI channel state 6332 * 6333 * Description: Called when a PCI error is detected. 6334 * 6335 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
6336 */ 6337 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 6338 { 6339 struct drm_device *dev = pci_get_drvdata(pdev); 6340 struct amdgpu_device *adev = drm_to_adev(dev); 6341 int i; 6342 6343 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 6344 6345 if (adev->gmc.xgmi.num_physical_nodes > 1) { 6346 DRM_WARN("No support for XGMI hive yet..."); 6347 return PCI_ERS_RESULT_DISCONNECT; 6348 } 6349 6350 adev->pci_channel_state = state; 6351 6352 switch (state) { 6353 case pci_channel_io_normal: 6354 return PCI_ERS_RESULT_CAN_RECOVER; 6355 /* Fatal error, prepare for slot reset */ 6356 case pci_channel_io_frozen: 6357 /* 6358 * Locking adev->reset_domain->sem will prevent any external access 6359 * to GPU during PCI error recovery 6360 */ 6361 amdgpu_device_lock_reset_domain(adev->reset_domain); 6362 amdgpu_device_set_mp1_state(adev); 6363 6364 /* 6365 * Block any work scheduling as we do for regular GPU reset 6366 * for the duration of the recovery 6367 */ 6368 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6369 struct amdgpu_ring *ring = adev->rings[i]; 6370 6371 if (!amdgpu_ring_sched_ready(ring)) 6372 continue; 6373 6374 drm_sched_stop(&ring->sched, NULL); 6375 } 6376 atomic_inc(&adev->gpu_reset_counter); 6377 return PCI_ERS_RESULT_NEED_RESET; 6378 case pci_channel_io_perm_failure: 6379 /* Permanent error, prepare for device removal */ 6380 return PCI_ERS_RESULT_DISCONNECT; 6381 } 6382 6383 return PCI_ERS_RESULT_NEED_RESET; 6384 } 6385 6386 /** 6387 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 6388 * @pdev: pointer to PCI device 6389 */ 6390 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 6391 { 6392 6393 DRM_INFO("PCI error: mmio enabled callback!!\n"); 6394 6395 /* TODO - dump whatever for debugging purposes */ 6396 6397 /* This called only if amdgpu_pci_error_detected returns 6398 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 6399 * works, no need to reset slot. 6400 */ 6401 6402 return PCI_ERS_RESULT_RECOVERED; 6403 } 6404 6405 /** 6406 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 6407 * @pdev: PCI device struct 6408 * 6409 * Description: This routine is called by the pci error recovery 6410 * code after the PCI slot has been reset, just before we 6411 * should resume normal operations. 
6412 */ 6413 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 6414 { 6415 struct drm_device *dev = pci_get_drvdata(pdev); 6416 struct amdgpu_device *adev = drm_to_adev(dev); 6417 int r, i; 6418 struct amdgpu_reset_context reset_context; 6419 u32 memsize; 6420 struct list_head device_list; 6421 6422 /* PCI error slot reset should be skipped During RAS recovery */ 6423 if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 6424 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && 6425 amdgpu_ras_in_recovery(adev)) 6426 return PCI_ERS_RESULT_RECOVERED; 6427 6428 DRM_INFO("PCI error: slot reset callback!!\n"); 6429 6430 memset(&reset_context, 0, sizeof(reset_context)); 6431 6432 INIT_LIST_HEAD(&device_list); 6433 list_add_tail(&adev->reset_list, &device_list); 6434 6435 /* wait for asic to come out of reset */ 6436 msleep(500); 6437 6438 /* Restore PCI confspace */ 6439 amdgpu_device_load_pci_state(pdev); 6440 6441 /* confirm ASIC came out of reset */ 6442 for (i = 0; i < adev->usec_timeout; i++) { 6443 memsize = amdgpu_asic_get_config_memsize(adev); 6444 6445 if (memsize != 0xffffffff) 6446 break; 6447 udelay(1); 6448 } 6449 if (memsize == 0xffffffff) { 6450 r = -ETIME; 6451 goto out; 6452 } 6453 6454 reset_context.method = AMD_RESET_METHOD_NONE; 6455 reset_context.reset_req_dev = adev; 6456 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 6457 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 6458 6459 adev->no_hw_access = true; 6460 r = amdgpu_device_pre_asic_reset(adev, &reset_context); 6461 adev->no_hw_access = false; 6462 if (r) 6463 goto out; 6464 6465 r = amdgpu_do_asic_reset(&device_list, &reset_context); 6466 6467 out: 6468 if (!r) { 6469 if (amdgpu_device_cache_pci_state(adev->pdev)) 6470 pci_restore_state(adev->pdev); 6471 6472 DRM_INFO("PCIe error recovery succeeded\n"); 6473 } else { 6474 DRM_ERROR("PCIe error recovery failed, err:%d", r); 6475 amdgpu_device_unset_mp1_state(adev); 6476 amdgpu_device_unlock_reset_domain(adev->reset_domain); 6477 } 6478 6479 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 6480 } 6481 6482 /** 6483 * amdgpu_pci_resume() - resume normal ops after PCI reset 6484 * @pdev: pointer to PCI device 6485 * 6486 * Called when the error recovery driver tells us that its 6487 * OK to resume normal operation. 
6488 */ 6489 void amdgpu_pci_resume(struct pci_dev *pdev) 6490 { 6491 struct drm_device *dev = pci_get_drvdata(pdev); 6492 struct amdgpu_device *adev = drm_to_adev(dev); 6493 int i; 6494 6495 6496 DRM_INFO("PCI error: resume callback!!\n"); 6497 6498 /* Only continue execution for the case of pci_channel_io_frozen */ 6499 if (adev->pci_channel_state != pci_channel_io_frozen) 6500 return; 6501 6502 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6503 struct amdgpu_ring *ring = adev->rings[i]; 6504 6505 if (!amdgpu_ring_sched_ready(ring)) 6506 continue; 6507 6508 drm_sched_start(&ring->sched, 0); 6509 } 6510 6511 amdgpu_device_unset_mp1_state(adev); 6512 amdgpu_device_unlock_reset_domain(adev->reset_domain); 6513 } 6514 6515 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 6516 { 6517 struct drm_device *dev = pci_get_drvdata(pdev); 6518 struct amdgpu_device *adev = drm_to_adev(dev); 6519 int r; 6520 6521 if (amdgpu_sriov_vf(adev)) 6522 return false; 6523 6524 r = pci_save_state(pdev); 6525 if (!r) { 6526 kfree(adev->pci_state); 6527 6528 adev->pci_state = pci_store_saved_state(pdev); 6529 6530 if (!adev->pci_state) { 6531 DRM_ERROR("Failed to store PCI saved state"); 6532 return false; 6533 } 6534 } else { 6535 DRM_WARN("Failed to save PCI state, err:%d\n", r); 6536 return false; 6537 } 6538 6539 return true; 6540 } 6541 6542 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 6543 { 6544 struct drm_device *dev = pci_get_drvdata(pdev); 6545 struct amdgpu_device *adev = drm_to_adev(dev); 6546 int r; 6547 6548 if (!adev->pci_state) 6549 return false; 6550 6551 r = pci_load_saved_state(pdev, adev->pci_state); 6552 6553 if (!r) { 6554 pci_restore_state(pdev); 6555 } else { 6556 DRM_WARN("Failed to load PCI state, err:%d\n", r); 6557 return false; 6558 } 6559 6560 return true; 6561 } 6562 6563 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 6564 struct amdgpu_ring *ring) 6565 { 6566 #ifdef CONFIG_X86_64 6567 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 6568 return; 6569 #endif 6570 if (adev->gmc.xgmi.connected_to_cpu) 6571 return; 6572 6573 if (ring && ring->funcs->emit_hdp_flush) 6574 amdgpu_ring_emit_hdp_flush(ring); 6575 else 6576 amdgpu_asic_flush_hdp(adev, ring); 6577 } 6578 6579 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 6580 struct amdgpu_ring *ring) 6581 { 6582 #ifdef CONFIG_X86_64 6583 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 6584 return; 6585 #endif 6586 if (adev->gmc.xgmi.connected_to_cpu) 6587 return; 6588 6589 amdgpu_asic_invalidate_hdp(adev, ring); 6590 } 6591 6592 int amdgpu_in_reset(struct amdgpu_device *adev) 6593 { 6594 return atomic_read(&adev->reset_domain->in_gpu_reset); 6595 } 6596 6597 /** 6598 * amdgpu_device_halt() - bring hardware to some kind of halt state 6599 * 6600 * @adev: amdgpu_device pointer 6601 * 6602 * Bring hardware to some kind of halt state so that no one can touch it 6603 * any more. It will help to maintain error context when error occurred. 6604 * Compare to a simple hang, the system will keep stable at least for SSH 6605 * access. Then it should be trivial to inspect the hardware state and 6606 * see what's going on. Implemented as following: 6607 * 6608 * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc), 6609 * clears all CPU mappings to device, disallows remappings through page faults 6610 * 2. amdgpu_irq_disable_all() disables all interrupts 6611 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences 6612 * 4. 
 *    set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in-flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	amdgpu_xcp_dev_unplug(adev);
	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				 u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				  u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_gang - return a reference to the current gang
 * @adev: amdgpu_device pointer
 *
 * Returns: A new reference to the current gang leader.
 */
struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev)
{
	struct dma_fence *fence;

	rcu_read_lock();
	fence = dma_fence_get_rcu_safe(&adev->gang_submit);
	rcu_read_unlock();
	return fence;
}

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
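 *
 * A usage sketch (illustrative only; the real gang-submit path adds the
 * old leader as a scheduler dependency instead of blocking, and
 * "new_gang_fence" is a stand-in name):
 *
 *   struct dma_fence *old;
 *
 *   old = amdgpu_device_switch_gang(adev, new_gang_fence);
 *   if (old) {
 *       // previous gang still running; wait and retry the switch
 *       dma_fence_wait(old, false);
 *       dma_fence_put(old);
 *   }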
6694 */ 6695 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev, 6696 struct dma_fence *gang) 6697 { 6698 struct dma_fence *old = NULL; 6699 6700 do { 6701 dma_fence_put(old); 6702 old = amdgpu_device_get_gang(adev); 6703 if (old == gang) 6704 break; 6705 6706 if (!dma_fence_is_signaled(old)) 6707 return old; 6708 6709 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit, 6710 old, gang) != old); 6711 6712 dma_fence_put(old); 6713 return NULL; 6714 } 6715 6716 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev) 6717 { 6718 switch (adev->asic_type) { 6719 #ifdef CONFIG_DRM_AMDGPU_SI 6720 case CHIP_HAINAN: 6721 #endif 6722 case CHIP_TOPAZ: 6723 /* chips with no display hardware */ 6724 return false; 6725 #ifdef CONFIG_DRM_AMDGPU_SI 6726 case CHIP_TAHITI: 6727 case CHIP_PITCAIRN: 6728 case CHIP_VERDE: 6729 case CHIP_OLAND: 6730 #endif 6731 #ifdef CONFIG_DRM_AMDGPU_CIK 6732 case CHIP_BONAIRE: 6733 case CHIP_HAWAII: 6734 case CHIP_KAVERI: 6735 case CHIP_KABINI: 6736 case CHIP_MULLINS: 6737 #endif 6738 case CHIP_TONGA: 6739 case CHIP_FIJI: 6740 case CHIP_POLARIS10: 6741 case CHIP_POLARIS11: 6742 case CHIP_POLARIS12: 6743 case CHIP_VEGAM: 6744 case CHIP_CARRIZO: 6745 case CHIP_STONEY: 6746 /* chips with display hardware */ 6747 return true; 6748 default: 6749 /* IP discovery */ 6750 if (!amdgpu_ip_version(adev, DCE_HWIP, 0) || 6751 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 6752 return false; 6753 return true; 6754 } 6755 } 6756 6757 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev, 6758 uint32_t inst, uint32_t reg_addr, char reg_name[], 6759 uint32_t expected_value, uint32_t mask) 6760 { 6761 uint32_t ret = 0; 6762 uint32_t old_ = 0; 6763 uint32_t tmp_ = RREG32(reg_addr); 6764 uint32_t loop = adev->usec_timeout; 6765 6766 while ((tmp_ & (mask)) != (expected_value)) { 6767 if (old_ != tmp_) { 6768 loop = adev->usec_timeout; 6769 old_ = tmp_; 6770 } else 6771 udelay(1); 6772 tmp_ = RREG32(reg_addr); 6773 loop--; 6774 if (!loop) { 6775 DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08xn", 6776 inst, reg_name, (uint32_t)expected_value, 6777 (uint32_t)(tmp_ & (mask))); 6778 ret = -ETIMEDOUT; 6779 break; 6780 } 6781 } 6782 return ret; 6783 } 6784 6785 ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring) 6786 { 6787 ssize_t size = 0; 6788 6789 if (!ring || !ring->adev) 6790 return size; 6791 6792 if (amdgpu_device_should_recover_gpu(ring->adev)) 6793 size |= AMDGPU_RESET_TYPE_FULL; 6794 6795 if (unlikely(!ring->adev->debug_disable_soft_recovery) && 6796 !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery) 6797 size |= AMDGPU_RESET_TYPE_SOFT_RESET; 6798 6799 return size; 6800 } 6801 6802 ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset) 6803 { 6804 ssize_t size = 0; 6805 6806 if (supported_reset == 0) { 6807 size += sysfs_emit_at(buf, size, "unsupported"); 6808 size += sysfs_emit_at(buf, size, "\n"); 6809 return size; 6810 6811 } 6812 6813 if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET) 6814 size += sysfs_emit_at(buf, size, "soft "); 6815 6816 if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE) 6817 size += sysfs_emit_at(buf, size, "queue "); 6818 6819 if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE) 6820 size += sysfs_emit_at(buf, size, "pipe "); 6821 6822 if (supported_reset & AMDGPU_RESET_TYPE_FULL) 6823 size += sysfs_emit_at(buf, size, "full "); 6824 6825 size += sysfs_emit_at(buf, size, "\n"); 6826 return size; 6827 } 6828