/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */

#include <linux/aperture.h>
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/pci-p2pdma.h>
#include <linux/apple-gmux.h>

#include <drm/drm_atomic_helper.h>
#include <drm/drm_client_event.h>
#include <drm/drm_crtc_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/device.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"
#include "amdgpu_virt.h"
#include "amdgpu_dev_coredump.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

#if IS_ENABLED(CONFIG_X86)
#include <asm/intel-family.h>
#include <asm/cpu_device_id.h>
#endif

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
#define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
#define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
#define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)

#define AMDGPU_VBIOS_SKIP (1U << 0)
#define AMDGPU_VBIOS_OPTIONAL (1U << 1)

static const struct drm_driver amdgpu_kms_driver;

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"ALDEBARAN",
	"NAVI10",
	"CYAN_SKILLFISH",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"VANGOGH",
	"DIMGREY_CAVEFISH",
	"BEIGE_GOBY",
	"YELLOW_CARP",
	"IP DISCOVERY",
	"LAST",
};

#define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0)
/*
 * Default init level where all blocks are expected to be initialized. This is
 * the level of initialization expected by default and also after a full reset
 * of the device.
 */
struct amdgpu_init_level amdgpu_init_default = {
	.level = AMDGPU_INIT_LEVEL_DEFAULT,
	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

struct amdgpu_init_level amdgpu_init_recovery = {
	.level = AMDGPU_INIT_LEVEL_RESET_RECOVERY,
	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

/*
 * Minimal blocks needed to be initialized before an XGMI hive can be reset.
 * This is used for cases like reset on initialization where the entire hive
 * needs to be reset before first use.
 */
struct amdgpu_init_level amdgpu_init_minimal_xgmi = {
	.level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI,
	.hwini_ip_block_mask =
		BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) |
		BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) |
		BIT(AMD_IP_BLOCK_TYPE_PSP)
};

static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev,
					     enum amd_ip_block_type block)
{
	return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0;
}

void amdgpu_set_init_level(struct amdgpu_device *adev,
			   enum amdgpu_init_lvl_id lvl)
{
	switch (lvl) {
	case AMDGPU_INIT_LEVEL_MINIMAL_XGMI:
		adev->init_lvl = &amdgpu_init_minimal_xgmi;
		break;
	case AMDGPU_INIT_LEVEL_RESET_RECOVERY:
		adev->init_lvl = &amdgpu_init_recovery;
		break;
	case AMDGPU_INIT_LEVEL_DEFAULT:
		fallthrough;
	default:
		adev->init_lvl = &amdgpu_init_default;
		break;
	}
}

static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
				     void *data);

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
 */
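
/*
 * Example (illustrative, not part of the driver): userspace reads the
 * attribute directly from the PCI device's sysfs directory; the exact path
 * below is an assumption for a card at 0000:03:00.0:
 *
 *   $ cat /sys/bus/pci/devices/0000:03:00.0/pcie_replay_count
 *   42
 */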

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return sysfs_emit(buf, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, 0444,
		   amdgpu_device_get_pcie_replay_count, NULL);

static int amdgpu_device_attr_sysfs_init(struct amdgpu_device *adev)
{
	int ret = 0;

	if (!amdgpu_sriov_vf(adev))
		ret = sysfs_create_file(&adev->dev->kobj,
					&dev_attr_pcie_replay_count.attr);

	return ret;
}

static void amdgpu_device_attr_sysfs_fini(struct amdgpu_device *adev)
{
	if (!amdgpu_sriov_vf(adev))
		sysfs_remove_file(&adev->dev->kobj,
				  &dev_attr_pcie_replay_count.attr);
}

static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
					  struct bin_attribute *attr, char *buf,
					  loff_t ppos, size_t count)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	ssize_t bytes_read;

	switch (ppos) {
	case AMDGPU_SYS_REG_STATE_XGMI:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_WAFL:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_PCIE:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR_1:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count);
		break;
	default:
		return -EINVAL;
	}

	return bytes_read;
}

BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL,
	 AMDGPU_SYS_REG_STATE_END);

int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev)
{
	int ret;

	if (!amdgpu_asic_get_reg_state_supported(adev))
		return 0;

	ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state);

	return ret;
}

void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
{
	if (!amdgpu_asic_get_reg_state_supported(adev))
		return;
	sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
}

int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block)
{
	int r;

	if (ip_block->version->funcs->suspend) {
		r = ip_block->version->funcs->suspend(ip_block);
		if (r) {
			dev_err(ip_block->adev->dev,
				"suspend of IP block <%s> failed %d\n",
				ip_block->version->funcs->name, r);
			return r;
		}
	}

	ip_block->status.hw = false;
	return 0;
}

int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block)
{
	int r;

	if (ip_block->version->funcs->resume) {
		r = ip_block->version->funcs->resume(ip_block);
		if (r) {
			dev_err(ip_block->adev->dev,
				"resume of IP block <%s> failed %d\n",
				ip_block->version->funcs->name, r);
			return r;
		}
	}

	ip_block->status.hw = true;
	return 0;
}
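
/*
 * Example (illustrative, not part of the driver): a caller walking the IP
 * blocks could use the helpers above roughly like this; the loop shape is
 * hypothetical:
 *
 *   for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
 *           r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
 *           if (r)
 *                   return r;   // status.hw stays true on failure
 *   }
 */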

/**
 * DOC: board_info
 *
 * The amdgpu driver provides a sysfs API for giving board related information.
 * It provides the form factor information in the format
 *
 * type : form factor
 *
 * Possible form factor values
 *
 * - "cem" - PCIE CEM card
 * - "oam" - Open Compute Accelerator Module
 * - "unknown" - Not known
 *
 */

static ssize_t amdgpu_device_get_board_info(struct device *dev,
					    struct device_attribute *attr,
					    char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
	const char *pkg;

	if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
		pkg_type = adev->smuio.funcs->get_pkg_type(adev);

	switch (pkg_type) {
	case AMDGPU_PKG_TYPE_CEM:
		pkg = "cem";
		break;
	case AMDGPU_PKG_TYPE_OAM:
		pkg = "oam";
		break;
	default:
		pkg = "unknown";
		break;
	}

	return sysfs_emit(buf, "%s : %s\n", "type", pkg);
}

static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);

static struct attribute *amdgpu_board_attrs[] = {
	&dev_attr_board_info.attr,
	NULL,
};

static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
					     struct attribute *attr, int n)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	if (adev->flags & AMD_IS_APU)
		return 0;

	return attr->mode;
}

static const struct attribute_group amdgpu_board_attrs_group = {
	.attrs = amdgpu_board_attrs,
	.is_visible = amdgpu_board_attrs_is_visible
};

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);


/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE))
		return false;

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}
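
/*
 * Example (illustrative, not part of the driver): how the two checks above
 * are typically consulted when picking a runtime-pm handoff (mirrors the
 * default case in amdgpu_device_detect_runtime_pm_mode() below):
 *
 *   if (amdgpu_device_supports_px(dev))
 *           ... use ATPX (muxed laptop dGPU) ...
 *   else if (amdgpu_device_supports_boco(dev))
 *           ... use ACPI power resources / hybrid ATPX ...
 */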

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Return:
 * 1 if the device supports BACO;
 * 3 if the device supports MACO (only works if BACO is supported);
 * otherwise returns 0.
 */
int amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
{
	struct drm_device *dev;
	int bamaco_support;

	dev = adev_to_drm(adev);

	adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
	bamaco_support = amdgpu_device_supports_baco(dev);

	switch (amdgpu_runtime_pm) {
	case 2:
		if (bamaco_support & MACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
			dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
		} else if (bamaco_support == BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Requested mode BAMACO not available, falling back to BACO\n");
		}
		break;
	case 1:
		if (bamaco_support & BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Forcing BACO for runtime pm\n");
		}
		break;
	case -1:
	case -2:
		if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
			dev_info(adev->dev, "Using ATPX for runtime pm\n");
		} else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
			dev_info(adev->dev, "Using BOCO for runtime pm\n");
		} else {
			if (!bamaco_support)
				goto no_runtime_pm;

			switch (adev->asic_type) {
			case CHIP_VEGA20:
			case CHIP_ARCTURUS:
				/* BACO is not supported on vega20 and arcturus */
				break;
			case CHIP_VEGA10:
				/* enable BACO as runpm mode if noretry=0 */
				if (!adev->gmc.noretry)
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			default:
				/* enable BACO as runpm mode on CI+ */
				adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			}

			if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
				if (bamaco_support & MACO_SUPPORT) {
					adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
					dev_info(adev->dev, "Using BAMACO for runtime pm\n");
				} else {
					dev_info(adev->dev, "Using BACO for runtime pm\n");
				}
			}
		}
		break;
	case 0:
		dev_info(adev->dev, "runtime pm is manually disabled\n");
		break;
	default:
		break;
	}

no_runtime_pm:
	if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
		dev_info(adev->dev, "Runtime PM not available\n");
}
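
/*
 * Example (illustrative, not part of the driver): the return value of
 * amdgpu_device_supports_baco() is treated as a bit mask, as the switch
 * above does:
 *
 *   int s = amdgpu_device_supports_baco(dev);
 *
 *   if (s & MACO_SUPPORT)       // BAMACO possible (implies BACO)
 *           ...
 *   else if (s & BACO_SUPPORT)  // plain BACO only
 *           ...
 */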

/**
 * amdgpu_device_supports_smart_shift - Is the device dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the buffer pointed to by @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}
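
/*
 * Example (illustrative, not part of the driver): read 16 bytes from VRAM
 * offset 0x1000 through the MM_INDEX/MM_DATA window; both the offset and the
 * size must be dword aligned, as asserted above:
 *
 *   uint32_t data[4];
 *
 *   amdgpu_device_mm_access(adev, 0x1000, data, sizeof(data), false);
 */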

/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the buffer pointed to by @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * Returns the number of bytes that have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the buffer pointed to by @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the VRAM aperture to access VRAM first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM access for the rest of VRAM */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}
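
/*
 * Example (illustrative, not part of the driver): amdgpu_device_vram_access()
 * is the general-purpose helper; it copies as much as possible through the
 * CPU-visible aperture and falls back to MM_INDEX/MM_DATA for the remainder.
 * "vram_offset" below is a placeholder:
 *
 *   uint32_t fw_header[8];
 *
 *   amdgpu_device_vram_access(adev, vram_offset, fw_header,
 *                             sizeof(fw_header), false);
 */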

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}


/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
				uint32_t reg, uint32_t acc_flags,
				uint32_t xcc_id)
{
	uint32_t ret, rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	return ret;
}
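
/*
 * Example (illustrative, not part of the driver): most callers do not use
 * amdgpu_device_rreg()/amdgpu_device_wreg() directly but go through the
 * RREG32()/WREG32() style macros from amdgpu.h, roughly:
 *
 *   val = RREG32(reg);          // amdgpu_device_rreg(adev, reg, 0)
 *   val = RREG32_NO_KIQ(reg);   // bypasses the KIQ path by passing
 *                               // AMDGPU_REGS_NO_KIQ in acc_flags
 */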

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/**
 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 * @xcc_id: xcc accelerated compute core id
 *
 * This function is invoked only for debugfs register access.
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v,
			     uint32_t xcc_id)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_xcc_wreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t v,
			    uint32_t acc_flags, uint32_t xcc_id)
{
	uint32_t rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, true,
							 &rlcg_flag)) {
			amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u32 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
				    u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	if (unlikely(!adev->nbio.funcs)) {
		pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
		pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
	} else {
		pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
		pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	}

	if (reg_addr >> 32) {
		if (unlikely(!adev->nbio.funcs))
			pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
		else
			pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	} else {
		pcie_index_hi = 0;
	}

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}
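
/*
 * Illustrative summary (not from the original source) of the index/data
 * protocol used by the indirect accessors above and below:
 *
 *   1. writel(reg_addr, PCIE_INDEX);   // select the register
 *   2. readl(PCIE_INDEX);              // read back to post the write
 *   3. readl/writel on PCIE_DATA;      // access the selected register
 *
 * The _ext variants additionally program PCIE_INDEX_HI with bits [39:32] of
 * the address and clear it again before dropping the lock.
 */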

/**
 * amdgpu_device_indirect_rreg64 - read a 64 bit indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
				      u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
			pcie_index_hi * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r |= ((u64)readl(pcie_data_offset) << 32);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
				     u64 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64 bit indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
				       u64 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_rev_id - query device rev_id
 *
 * @adev: amdgpu_device pointer
 *
 * Return device rev_id
 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
{
	return adev->nbio.funcs->get_rev_id(adev);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

static uint32_t amdgpu_device_get_vbios_flags(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
		return AMDGPU_VBIOS_SKIP;

	if (hweight32(adev->aid_mask) && amdgpu_passthrough(adev))
		return AMDGPU_VBIOS_OPTIONAL;

	return 0;
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	uint32_t flags;
	bool optional;
	int ret;

	amdgpu_asic_pre_asic_init(adev);
	flags = amdgpu_device_get_vbios_flags(adev);
	optional = !!(flags & (AMDGPU_VBIOS_OPTIONAL | AMDGPU_VBIOS_SKIP));

	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
		amdgpu_psp_wait_for_bootloader(adev);
		if (optional && !adev->bios)
			return 0;

		ret = amdgpu_atomfirmware_asic_init(adev, true);
		return ret;
	} else {
		if (optional && !adev->bios)
			return 0;

		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
	}

	return 0;
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND/OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}
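
/*
 * Example (illustrative, not part of the driver): golden register tables are
 * flat arrays of {register, and_mask, or_mask} triplets; an and_mask of
 * 0xffffffff means "write or_mask unconditionally". The register names below
 * are placeholders:
 *
 *   static const u32 example_golden_regs[] = {
 *           mmEXAMPLE_REG_A, 0xffffffff, 0x00000100,  // full overwrite
 *           mmEXAMPLE_REG_B, 0x0000000f, 0x00000002,  // update masked bits
 *   };
 *
 *   amdgpu_device_program_register_sequence(adev, example_golden_regs,
 *                                           ARRAY_SIZE(example_golden_regs));
 */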

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long flags, offset;

	spin_lock_irqsave(&adev->wb.lock, flags);
	offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	unsigned long flags;

	wb >>= 3;
	spin_lock_irqsave(&adev->wb.lock, flags);
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
	spin_unlock_irqrestore(&adev->wb.lock, flags);
}
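
/*
 * Example (illustrative, not part of the driver): a ring/fence user grabs a
 * writeback slot, derives CPU and GPU addresses from it, and releases it on
 * teardown. Each slot is one 256-bit entry, returned as a dword index into
 * adev->wb.wb:
 *
 *   u32 wb;
 *
 *   if (!amdgpu_device_wb_get(adev, &wb)) {
 *           volatile u32 *cpu_addr = &adev->wb.wb[wb];
 *           u64 gpu_addr = adev->wb.gpu_addr + wb * 4;
 *
 *           ... hand gpu_addr to the engine, poll *cpu_addr ...
 *
 *           amdgpu_device_wb_free(adev, wb);
 *   }
 */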

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned int i;
	u16 cmd;
	int r;

	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
		return 0;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	if (!amdgpu_rebar)
		return 0;

	/* resizing on Dell G5 SE platforms causes problems with runtime pm */
	if ((amdgpu_runtime_pm != 0) &&
	    adev->pdev->vendor == PCI_VENDOR_ID_ATI &&
	    adev->pdev->device == 0x731f &&
	    adev->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL)
		return 0;

	/* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
	if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
		DRM_WARN("System can't access extended configuration space, please check!!\n");

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

/*
 * GPU helper functions.
 */
/**
 * amdgpu_device_need_post - check if the hw need post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or if a post is needed because a hw reset was performed.
 * Returns true if post is needed, false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg, flags;

	if (amdgpu_sriov_vf(adev))
		return false;

	flags = amdgpu_device_get_vbios_flags(adev);
	if (flags & AMDGPU_VBIOS_SKIP)
		return false;
	if ((flags & AMDGPU_VBIOS_OPTIONAL) && !adev->bios)
		return false;

	if (amdgpu_passthrough(adev)) {
		/* For FIJI: In the whole-GPU pass-through virtualization case, after a
		 * VM reboot some old SMC firmware still needs the driver to do a vPost,
		 * otherwise the GPU hangs. SMC firmware versions above 22.15 don't have
		 * this flaw, so we force a vPost only for SMC versions below 22.15.
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			release_firmware(adev->pm.fw);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/*
 * Check whether seamless boot is supported.
 *
 * So far we only support seamless boot on DCE 3.0 or later.
 * If users report that it works on older ASICS as well, we may
 * loosen this.
 */
bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
{
	switch (amdgpu_seamless) {
	case -1:
		break;
	case 1:
		return true;
	case 0:
		return false;
	default:
		DRM_ERROR("Invalid value for amdgpu.seamless: %d\n",
			  amdgpu_seamless);
		return false;
	}

	if (!(adev->flags & AMD_IS_APU))
		return false;

	if (adev->mman.keep_stolen_vga_memory)
		return false;

	return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
}

/*
 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
 * don't support dynamic speed switching. Until we have confirmation from Intel
 * that a specific host supports it, it's safer that we keep it disabled for all.
 *
 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
 */
static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	/* eGPUs change speeds based on USB4 fabric conditions */
	if (dev_is_removable(adev->dev))
		return true;

	if (c->x86_vendor == X86_VENDOR_INTEL)
		return false;
#endif
	return true;
}

static bool amdgpu_device_aspm_support_quirk(struct amdgpu_device *adev)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	if (!(amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 0) ||
	      amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 1)))
		return false;

	if (c->x86 == 6 &&
	    adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5) {
		switch (c->x86_model) {
		case VFM_MODEL(INTEL_ALDERLAKE):
		case VFM_MODEL(INTEL_ALDERLAKE_L):
		case VFM_MODEL(INTEL_RAPTORLAKE):
		case VFM_MODEL(INTEL_RAPTORLAKE_P):
		case VFM_MODEL(INTEL_RAPTORLAKE_S):
			return true;
		default:
			return false;
		}
	} else {
		return false;
	}
#else
	return false;
#endif
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
	switch (amdgpu_aspm) {
	case -1:
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	if (adev->flags & AMD_IS_APU)
		return false;
	if (amdgpu_device_aspm_support_quirk(adev))
		return false;
	return pcie_aspm_enabled(adev->pdev);
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
		bool state)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));

	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}
1967 */ 1968 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1969 { 1970 /* defines number of bits in page table versus page directory, 1971 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1972 * page table and the remaining bits are in the page directory 1973 */ 1974 if (amdgpu_vm_block_size == -1) 1975 return; 1976 1977 if (amdgpu_vm_block_size < 9) { 1978 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1979 amdgpu_vm_block_size); 1980 amdgpu_vm_block_size = -1; 1981 } 1982 } 1983 1984 /** 1985 * amdgpu_device_check_vm_size - validate the vm size 1986 * 1987 * @adev: amdgpu_device pointer 1988 * 1989 * Validates the vm size in GB specified via module parameter. 1990 * The VM size is the size of the GPU virtual memory space in GB. 1991 */ 1992 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1993 { 1994 /* no need to check the default value */ 1995 if (amdgpu_vm_size == -1) 1996 return; 1997 1998 if (amdgpu_vm_size < 1) { 1999 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 2000 amdgpu_vm_size); 2001 amdgpu_vm_size = -1; 2002 } 2003 } 2004 2005 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 2006 { 2007 struct sysinfo si; 2008 bool is_os_64 = (sizeof(void *) == 8); 2009 uint64_t total_memory; 2010 uint64_t dram_size_seven_GB = 0x1B8000000; 2011 uint64_t dram_size_three_GB = 0xB8000000; 2012 2013 if (amdgpu_smu_memory_pool_size == 0) 2014 return; 2015 2016 if (!is_os_64) { 2017 DRM_WARN("Not 64-bit OS, feature not supported\n"); 2018 goto def_value; 2019 } 2020 si_meminfo(&si); 2021 total_memory = (uint64_t)si.totalram * si.mem_unit; 2022 2023 if ((amdgpu_smu_memory_pool_size == 1) || 2024 (amdgpu_smu_memory_pool_size == 2)) { 2025 if (total_memory < dram_size_three_GB) 2026 goto def_value1; 2027 } else if ((amdgpu_smu_memory_pool_size == 4) || 2028 (amdgpu_smu_memory_pool_size == 8)) { 2029 if (total_memory < dram_size_seven_GB) 2030 goto def_value1; 2031 } else { 2032 DRM_WARN("Smu memory pool size not supported\n"); 2033 goto def_value; 2034 } 2035 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 2036 2037 return; 2038 2039 def_value1: 2040 DRM_WARN("No enough system memory\n"); 2041 def_value: 2042 adev->pm.smu_prv_buffer_size = 0; 2043 } 2044 2045 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 2046 { 2047 if (!(adev->flags & AMD_IS_APU) || 2048 adev->asic_type < CHIP_RAVEN) 2049 return 0; 2050 2051 switch (adev->asic_type) { 2052 case CHIP_RAVEN: 2053 if (adev->pdev->device == 0x15dd) 2054 adev->apu_flags |= AMD_APU_IS_RAVEN; 2055 if (adev->pdev->device == 0x15d8) 2056 adev->apu_flags |= AMD_APU_IS_PICASSO; 2057 break; 2058 case CHIP_RENOIR: 2059 if ((adev->pdev->device == 0x1636) || 2060 (adev->pdev->device == 0x164c)) 2061 adev->apu_flags |= AMD_APU_IS_RENOIR; 2062 else 2063 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 2064 break; 2065 case CHIP_VANGOGH: 2066 adev->apu_flags |= AMD_APU_IS_VANGOGH; 2067 break; 2068 case CHIP_YELLOW_CARP: 2069 break; 2070 case CHIP_CYAN_SKILLFISH: 2071 if ((adev->pdev->device == 0x13FE) || 2072 (adev->pdev->device == 0x143F)) 2073 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 2074 break; 2075 default: 2076 break; 2077 } 2078 2079 return 0; 2080 } 2081 2082 /** 2083 * amdgpu_device_check_arguments - validate module params 2084 * 2085 * @adev: amdgpu_device pointer 2086 * 2087 * Validates certain module parameters and updates 2088 * the associated values used by the driver (all asics). 
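 *
 * Illustrative examples (module parameter names are assumed to match their
 * definitions in amdgpu_drv.c): booting with amdgpu.sched_jobs=24 is rounded
 * up below to the next power of two (32), and amdgpu.vm_size=0 is rejected
 * and reset to the -1 (auto) default.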
2089 */ 2090 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 2091 { 2092 int i; 2093 2094 if (amdgpu_sched_jobs < 4) { 2095 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 2096 amdgpu_sched_jobs); 2097 amdgpu_sched_jobs = 4; 2098 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 2099 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 2100 amdgpu_sched_jobs); 2101 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 2102 } 2103 2104 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 2105 /* gart size must be greater or equal to 32M */ 2106 dev_warn(adev->dev, "gart size (%d) too small\n", 2107 amdgpu_gart_size); 2108 amdgpu_gart_size = -1; 2109 } 2110 2111 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 2112 /* gtt size must be greater or equal to 32M */ 2113 dev_warn(adev->dev, "gtt size (%d) too small\n", 2114 amdgpu_gtt_size); 2115 amdgpu_gtt_size = -1; 2116 } 2117 2118 /* valid range is between 4 and 9 inclusive */ 2119 if (amdgpu_vm_fragment_size != -1 && 2120 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 2121 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 2122 amdgpu_vm_fragment_size = -1; 2123 } 2124 2125 if (amdgpu_sched_hw_submission < 2) { 2126 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 2127 amdgpu_sched_hw_submission); 2128 amdgpu_sched_hw_submission = 2; 2129 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 2130 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 2131 amdgpu_sched_hw_submission); 2132 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 2133 } 2134 2135 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 2136 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 2137 amdgpu_reset_method = -1; 2138 } 2139 2140 amdgpu_device_check_smu_prv_buffer_size(adev); 2141 2142 amdgpu_device_check_vm_size(adev); 2143 2144 amdgpu_device_check_block_size(adev); 2145 2146 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 2147 2148 for (i = 0; i < MAX_XCP; i++) { 2149 switch (amdgpu_enforce_isolation) { 2150 case -1: 2151 case 0: 2152 default: 2153 /* disable */ 2154 adev->enforce_isolation[i] = AMDGPU_ENFORCE_ISOLATION_DISABLE; 2155 break; 2156 case 1: 2157 /* enable */ 2158 adev->enforce_isolation[i] = 2159 AMDGPU_ENFORCE_ISOLATION_ENABLE; 2160 break; 2161 case 2: 2162 /* enable legacy mode */ 2163 adev->enforce_isolation[i] = 2164 AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY; 2165 break; 2166 } 2167 } 2168 2169 return 0; 2170 } 2171 2172 /** 2173 * amdgpu_switcheroo_set_state - set switcheroo state 2174 * 2175 * @pdev: pci dev pointer 2176 * @state: vga_switcheroo state 2177 * 2178 * Callback for the switcheroo driver. Suspends or resumes 2179 * the asics before or after it is powered up using ACPI methods. 
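 *
 * This callback is invoked by the vga_switcheroo core, for example in
 * response to a switch request written to the vga_switcheroo debugfs
 * interface (see the vga_switcheroo kernel documentation).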
2180 */ 2181 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 2182 enum vga_switcheroo_state state) 2183 { 2184 struct drm_device *dev = pci_get_drvdata(pdev); 2185 int r; 2186 2187 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 2188 return; 2189 2190 if (state == VGA_SWITCHEROO_ON) { 2191 pr_info("switched on\n"); 2192 /* don't suspend or resume card normally */ 2193 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2194 2195 pci_set_power_state(pdev, PCI_D0); 2196 amdgpu_device_load_pci_state(pdev); 2197 r = pci_enable_device(pdev); 2198 if (r) 2199 DRM_WARN("pci_enable_device failed (%d)\n", r); 2200 amdgpu_device_resume(dev, true); 2201 2202 dev->switch_power_state = DRM_SWITCH_POWER_ON; 2203 } else { 2204 pr_info("switched off\n"); 2205 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2206 amdgpu_device_prepare(dev); 2207 amdgpu_device_suspend(dev, true); 2208 amdgpu_device_cache_pci_state(pdev); 2209 /* Shut down the device */ 2210 pci_disable_device(pdev); 2211 pci_set_power_state(pdev, PCI_D3cold); 2212 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 2213 } 2214 } 2215 2216 /** 2217 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 2218 * 2219 * @pdev: pci dev pointer 2220 * 2221 * Callback for the switcheroo driver. Check of the switcheroo 2222 * state can be changed. 2223 * Returns true if the state can be changed, false if not. 2224 */ 2225 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 2226 { 2227 struct drm_device *dev = pci_get_drvdata(pdev); 2228 2229 /* 2230 * FIXME: open_count is protected by drm_global_mutex but that would lead to 2231 * locking inversion with the driver load path. And the access here is 2232 * completely racy anyway. So don't bother with locking for now. 2233 */ 2234 return atomic_read(&dev->open_count) == 0; 2235 } 2236 2237 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 2238 .set_gpu_state = amdgpu_switcheroo_set_state, 2239 .reprobe = NULL, 2240 .can_switch = amdgpu_switcheroo_can_switch, 2241 }; 2242 2243 /** 2244 * amdgpu_device_ip_set_clockgating_state - set the CG state 2245 * 2246 * @dev: amdgpu_device pointer 2247 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2248 * @state: clockgating state (gate or ungate) 2249 * 2250 * Sets the requested clockgating state for all instances of 2251 * the hardware IP specified. 2252 * Returns the error code from the last instance. 2253 */ 2254 int amdgpu_device_ip_set_clockgating_state(void *dev, 2255 enum amd_ip_block_type block_type, 2256 enum amd_clockgating_state state) 2257 { 2258 struct amdgpu_device *adev = dev; 2259 int i, r = 0; 2260 2261 for (i = 0; i < adev->num_ip_blocks; i++) { 2262 if (!adev->ip_blocks[i].status.valid) 2263 continue; 2264 if (adev->ip_blocks[i].version->type != block_type) 2265 continue; 2266 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 2267 continue; 2268 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 2269 &adev->ip_blocks[i], state); 2270 if (r) 2271 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 2272 adev->ip_blocks[i].version->funcs->name, r); 2273 } 2274 return r; 2275 } 2276 2277 /** 2278 * amdgpu_device_ip_set_powergating_state - set the PG state 2279 * 2280 * @dev: amdgpu_device pointer 2281 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2282 * @state: powergating state (gate or ungate) 2283 * 2284 * Sets the requested powergating state for all instances of 2285 * the hardware IP specified. 
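 *
 * Illustrative call (a sketch only, not taken from a specific caller):
 *	amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCN,
 *					       AMD_PG_STATE_GATE);
 *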
2286 * Returns the error code from the last instance. 2287 */ 2288 int amdgpu_device_ip_set_powergating_state(void *dev, 2289 enum amd_ip_block_type block_type, 2290 enum amd_powergating_state state) 2291 { 2292 struct amdgpu_device *adev = dev; 2293 int i, r = 0; 2294 2295 for (i = 0; i < adev->num_ip_blocks; i++) { 2296 if (!adev->ip_blocks[i].status.valid) 2297 continue; 2298 if (adev->ip_blocks[i].version->type != block_type) 2299 continue; 2300 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 2301 continue; 2302 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 2303 &adev->ip_blocks[i], state); 2304 if (r) 2305 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 2306 adev->ip_blocks[i].version->funcs->name, r); 2307 } 2308 return r; 2309 } 2310 2311 /** 2312 * amdgpu_device_ip_get_clockgating_state - get the CG state 2313 * 2314 * @adev: amdgpu_device pointer 2315 * @flags: clockgating feature flags 2316 * 2317 * Walks the list of IPs on the device and updates the clockgating 2318 * flags for each IP. 2319 * Updates @flags with the feature flags for each hardware IP where 2320 * clockgating is enabled. 2321 */ 2322 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 2323 u64 *flags) 2324 { 2325 int i; 2326 2327 for (i = 0; i < adev->num_ip_blocks; i++) { 2328 if (!adev->ip_blocks[i].status.valid) 2329 continue; 2330 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 2331 adev->ip_blocks[i].version->funcs->get_clockgating_state( 2332 &adev->ip_blocks[i], flags); 2333 } 2334 } 2335 2336 /** 2337 * amdgpu_device_ip_wait_for_idle - wait for idle 2338 * 2339 * @adev: amdgpu_device pointer 2340 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2341 * 2342 * Waits for the request hardware IP to be idle. 2343 * Returns 0 for success or a negative error code on failure. 2344 */ 2345 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 2346 enum amd_ip_block_type block_type) 2347 { 2348 int i, r; 2349 2350 for (i = 0; i < adev->num_ip_blocks; i++) { 2351 if (!adev->ip_blocks[i].status.valid) 2352 continue; 2353 if (adev->ip_blocks[i].version->type == block_type) { 2354 if (adev->ip_blocks[i].version->funcs->wait_for_idle) { 2355 r = adev->ip_blocks[i].version->funcs->wait_for_idle( 2356 &adev->ip_blocks[i]); 2357 if (r) 2358 return r; 2359 } 2360 break; 2361 } 2362 } 2363 return 0; 2364 2365 } 2366 2367 /** 2368 * amdgpu_device_ip_is_valid - is the hardware IP enabled 2369 * 2370 * @adev: amdgpu_device pointer 2371 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2372 * 2373 * Check if the hardware IP is enable or not. 2374 * Returns true if it the IP is enable, false if not. 2375 */ 2376 bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev, 2377 enum amd_ip_block_type block_type) 2378 { 2379 int i; 2380 2381 for (i = 0; i < adev->num_ip_blocks; i++) { 2382 if (adev->ip_blocks[i].version->type == block_type) 2383 return adev->ip_blocks[i].status.valid; 2384 } 2385 return false; 2386 2387 } 2388 2389 /** 2390 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 2391 * 2392 * @adev: amdgpu_device pointer 2393 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 2394 * 2395 * Returns a pointer to the hardware IP block structure 2396 * if it exists for the asic, otherwise NULL. 
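 *
 * Typical usage, mirroring the GFX lookup done later in this file:
 *	ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *	if (ip_block && ip_block->status.valid)
 *		...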
2397 */ 2398 struct amdgpu_ip_block * 2399 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 2400 enum amd_ip_block_type type) 2401 { 2402 int i; 2403 2404 for (i = 0; i < adev->num_ip_blocks; i++) 2405 if (adev->ip_blocks[i].version->type == type) 2406 return &adev->ip_blocks[i]; 2407 2408 return NULL; 2409 } 2410 2411 /** 2412 * amdgpu_device_ip_block_version_cmp 2413 * 2414 * @adev: amdgpu_device pointer 2415 * @type: enum amd_ip_block_type 2416 * @major: major version 2417 * @minor: minor version 2418 * 2419 * return 0 if equal or greater 2420 * return 1 if smaller or the ip_block doesn't exist 2421 */ 2422 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 2423 enum amd_ip_block_type type, 2424 u32 major, u32 minor) 2425 { 2426 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 2427 2428 if (ip_block && ((ip_block->version->major > major) || 2429 ((ip_block->version->major == major) && 2430 (ip_block->version->minor >= minor)))) 2431 return 0; 2432 2433 return 1; 2434 } 2435 2436 /** 2437 * amdgpu_device_ip_block_add 2438 * 2439 * @adev: amdgpu_device pointer 2440 * @ip_block_version: pointer to the IP to add 2441 * 2442 * Adds the IP block driver information to the collection of IPs 2443 * on the asic. 2444 */ 2445 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 2446 const struct amdgpu_ip_block_version *ip_block_version) 2447 { 2448 if (!ip_block_version) 2449 return -EINVAL; 2450 2451 switch (ip_block_version->type) { 2452 case AMD_IP_BLOCK_TYPE_VCN: 2453 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2454 return 0; 2455 break; 2456 case AMD_IP_BLOCK_TYPE_JPEG: 2457 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2458 return 0; 2459 break; 2460 default: 2461 break; 2462 } 2463 2464 dev_info(adev->dev, "detected ip block number %d <%s>\n", 2465 adev->num_ip_blocks, ip_block_version->funcs->name); 2466 2467 adev->ip_blocks[adev->num_ip_blocks].adev = adev; 2468 2469 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2470 2471 return 0; 2472 } 2473 2474 /** 2475 * amdgpu_device_enable_virtual_display - enable virtual display feature 2476 * 2477 * @adev: amdgpu_device pointer 2478 * 2479 * Enabled the virtual display feature if the user has enabled it via 2480 * the module parameter virtual_display. This feature provides a virtual 2481 * display hardware on headless boards or in virtualized environments. 2482 * This function parses and validates the configuration string specified by 2483 * the user and configures the virtual display configuration (number of 2484 * virtual connectors, crtcs, etc.) specified. 
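 *
 * The string parsed below is a ';' separated list of entries, each entry
 * being a PCI address (or "all") optionally followed by ",<num_crtc>" with
 * num_crtc clamped to the 1..6 range. Illustrative values (parameter name
 * assumed to match its amdgpu_drv.c definition):
 *	amdgpu.virtual_display=0000:01:00.0,2
 *	amdgpu.virtual_display=all,1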
2485 */ 2486 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2487 { 2488 adev->enable_virtual_display = false; 2489 2490 if (amdgpu_virtual_display) { 2491 const char *pci_address_name = pci_name(adev->pdev); 2492 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2493 2494 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2495 pciaddstr_tmp = pciaddstr; 2496 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2497 pciaddname = strsep(&pciaddname_tmp, ","); 2498 if (!strcmp("all", pciaddname) 2499 || !strcmp(pci_address_name, pciaddname)) { 2500 long num_crtc; 2501 int res = -1; 2502 2503 adev->enable_virtual_display = true; 2504 2505 if (pciaddname_tmp) 2506 res = kstrtol(pciaddname_tmp, 10, 2507 &num_crtc); 2508 2509 if (!res) { 2510 if (num_crtc < 1) 2511 num_crtc = 1; 2512 if (num_crtc > 6) 2513 num_crtc = 6; 2514 adev->mode_info.num_crtc = num_crtc; 2515 } else { 2516 adev->mode_info.num_crtc = 1; 2517 } 2518 break; 2519 } 2520 } 2521 2522 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2523 amdgpu_virtual_display, pci_address_name, 2524 adev->enable_virtual_display, adev->mode_info.num_crtc); 2525 2526 kfree(pciaddstr); 2527 } 2528 } 2529 2530 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2531 { 2532 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2533 adev->mode_info.num_crtc = 1; 2534 adev->enable_virtual_display = true; 2535 DRM_INFO("virtual_display:%d, num_crtc:%d\n", 2536 adev->enable_virtual_display, adev->mode_info.num_crtc); 2537 } 2538 } 2539 2540 /** 2541 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2542 * 2543 * @adev: amdgpu_device pointer 2544 * 2545 * Parses the asic configuration parameters specified in the gpu info 2546 * firmware and makes them available to the driver for use in configuring 2547 * the asic. 2548 * Returns 0 on success, -EINVAL on failure. 2549 */ 2550 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2551 { 2552 const char *chip_name; 2553 int err; 2554 const struct gpu_info_firmware_header_v1_0 *hdr; 2555 2556 adev->firmware.gpu_info_fw = NULL; 2557 2558 if (adev->mman.discovery_bin) 2559 return 0; 2560 2561 switch (adev->asic_type) { 2562 default: 2563 return 0; 2564 case CHIP_VEGA10: 2565 chip_name = "vega10"; 2566 break; 2567 case CHIP_VEGA12: 2568 chip_name = "vega12"; 2569 break; 2570 case CHIP_RAVEN: 2571 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2572 chip_name = "raven2"; 2573 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2574 chip_name = "picasso"; 2575 else 2576 chip_name = "raven"; 2577 break; 2578 case CHIP_ARCTURUS: 2579 chip_name = "arcturus"; 2580 break; 2581 case CHIP_NAVI12: 2582 chip_name = "navi12"; 2583 break; 2584 } 2585 2586 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, 2587 AMDGPU_UCODE_OPTIONAL, 2588 "amdgpu/%s_gpu_info.bin", chip_name); 2589 if (err) { 2590 dev_err(adev->dev, 2591 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n", 2592 chip_name); 2593 goto out; 2594 } 2595 2596 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2597 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2598 2599 switch (hdr->version_major) { 2600 case 1: 2601 { 2602 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2603 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2604 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2605 2606 /* 2607 * Should be dropped when DAL no longer needs it. 
2608 */ 2609 if (adev->asic_type == CHIP_NAVI12) 2610 goto parse_soc_bounding_box; 2611 2612 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2613 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2614 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2615 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2616 adev->gfx.config.max_texture_channel_caches = 2617 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2618 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2619 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2620 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2621 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2622 adev->gfx.config.double_offchip_lds_buf = 2623 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2624 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2625 adev->gfx.cu_info.max_waves_per_simd = 2626 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2627 adev->gfx.cu_info.max_scratch_slots_per_cu = 2628 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2629 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2630 if (hdr->version_minor >= 1) { 2631 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2632 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2633 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2634 adev->gfx.config.num_sc_per_sh = 2635 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2636 adev->gfx.config.num_packer_per_sc = 2637 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2638 } 2639 2640 parse_soc_bounding_box: 2641 /* 2642 * soc bounding box info is not integrated in disocovery table, 2643 * we always need to parse it from gpu info firmware if needed. 2644 */ 2645 if (hdr->version_minor == 2) { 2646 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2647 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2648 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2649 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2650 } 2651 break; 2652 } 2653 default: 2654 dev_err(adev->dev, 2655 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2656 err = -EINVAL; 2657 goto out; 2658 } 2659 out: 2660 return err; 2661 } 2662 2663 /** 2664 * amdgpu_device_ip_early_init - run early init for hardware IPs 2665 * 2666 * @adev: amdgpu_device pointer 2667 * 2668 * Early initialization pass for hardware IPs. The hardware IPs that make 2669 * up each asic are discovered each IP's early_init callback is run. This 2670 * is the first stage in initializing the asic. 2671 * Returns 0 on success, negative error code on failure. 
2672 */ 2673 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2674 { 2675 struct amdgpu_ip_block *ip_block; 2676 struct pci_dev *parent; 2677 bool total, skip_bios; 2678 uint32_t bios_flags; 2679 int i, r; 2680 2681 amdgpu_device_enable_virtual_display(adev); 2682 2683 if (amdgpu_sriov_vf(adev)) { 2684 r = amdgpu_virt_request_full_gpu(adev, true); 2685 if (r) 2686 return r; 2687 } 2688 2689 switch (adev->asic_type) { 2690 #ifdef CONFIG_DRM_AMDGPU_SI 2691 case CHIP_VERDE: 2692 case CHIP_TAHITI: 2693 case CHIP_PITCAIRN: 2694 case CHIP_OLAND: 2695 case CHIP_HAINAN: 2696 adev->family = AMDGPU_FAMILY_SI; 2697 r = si_set_ip_blocks(adev); 2698 if (r) 2699 return r; 2700 break; 2701 #endif 2702 #ifdef CONFIG_DRM_AMDGPU_CIK 2703 case CHIP_BONAIRE: 2704 case CHIP_HAWAII: 2705 case CHIP_KAVERI: 2706 case CHIP_KABINI: 2707 case CHIP_MULLINS: 2708 if (adev->flags & AMD_IS_APU) 2709 adev->family = AMDGPU_FAMILY_KV; 2710 else 2711 adev->family = AMDGPU_FAMILY_CI; 2712 2713 r = cik_set_ip_blocks(adev); 2714 if (r) 2715 return r; 2716 break; 2717 #endif 2718 case CHIP_TOPAZ: 2719 case CHIP_TONGA: 2720 case CHIP_FIJI: 2721 case CHIP_POLARIS10: 2722 case CHIP_POLARIS11: 2723 case CHIP_POLARIS12: 2724 case CHIP_VEGAM: 2725 case CHIP_CARRIZO: 2726 case CHIP_STONEY: 2727 if (adev->flags & AMD_IS_APU) 2728 adev->family = AMDGPU_FAMILY_CZ; 2729 else 2730 adev->family = AMDGPU_FAMILY_VI; 2731 2732 r = vi_set_ip_blocks(adev); 2733 if (r) 2734 return r; 2735 break; 2736 default: 2737 r = amdgpu_discovery_set_ip_blocks(adev); 2738 if (r) 2739 return r; 2740 break; 2741 } 2742 2743 /* Check for IP version 9.4.3 with A0 hardware */ 2744 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) && 2745 !amdgpu_device_get_rev_id(adev)) { 2746 dev_err(adev->dev, "Unsupported A0 hardware\n"); 2747 return -ENODEV; /* device unsupported - no device error */ 2748 } 2749 2750 if (amdgpu_has_atpx() && 2751 (amdgpu_is_atpx_hybrid() || 2752 amdgpu_has_atpx_dgpu_power_cntl()) && 2753 ((adev->flags & AMD_IS_APU) == 0) && 2754 !dev_is_removable(&adev->pdev->dev)) 2755 adev->flags |= AMD_IS_PX; 2756 2757 if (!(adev->flags & AMD_IS_APU)) { 2758 parent = pcie_find_root_port(adev->pdev); 2759 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2760 } 2761 2762 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2763 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2764 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2765 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2766 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2767 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2768 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2769 2770 total = true; 2771 for (i = 0; i < adev->num_ip_blocks; i++) { 2772 ip_block = &adev->ip_blocks[i]; 2773 2774 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2775 DRM_WARN("disabled ip block: %d <%s>\n", 2776 i, adev->ip_blocks[i].version->funcs->name); 2777 adev->ip_blocks[i].status.valid = false; 2778 } else if (ip_block->version->funcs->early_init) { 2779 r = ip_block->version->funcs->early_init(ip_block); 2780 if (r == -ENOENT) { 2781 adev->ip_blocks[i].status.valid = false; 2782 } else if (r) { 2783 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2784 adev->ip_blocks[i].version->funcs->name, r); 2785 total = false; 2786 } else { 2787 adev->ip_blocks[i].status.valid = true; 2788 } 2789 } else { 2790 adev->ip_blocks[i].status.valid = true; 2791 } 2792 /* get the vbios after the asic_funcs are set up */ 2793 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2794 r = amdgpu_device_parse_gpu_info_fw(adev); 2795 if (r) 2796 return r; 2797 2798 bios_flags = amdgpu_device_get_vbios_flags(adev); 2799 skip_bios = !!(bios_flags & AMDGPU_VBIOS_SKIP); 2800 /* Read BIOS */ 2801 if (!skip_bios) { 2802 bool optional = 2803 !!(bios_flags & AMDGPU_VBIOS_OPTIONAL); 2804 if (!amdgpu_get_bios(adev) && !optional) 2805 return -EINVAL; 2806 2807 if (optional && !adev->bios) 2808 dev_info( 2809 adev->dev, 2810 "VBIOS image optional, proceeding without VBIOS image"); 2811 2812 if (adev->bios) { 2813 r = amdgpu_atombios_init(adev); 2814 if (r) { 2815 dev_err(adev->dev, 2816 "amdgpu_atombios_init failed\n"); 2817 amdgpu_vf_error_put( 2818 adev, 2819 AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 2820 0, 0); 2821 return r; 2822 } 2823 } 2824 } 2825 2826 /*get pf2vf msg info at it's earliest time*/ 2827 if (amdgpu_sriov_vf(adev)) 2828 amdgpu_virt_init_data_exchange(adev); 2829 2830 } 2831 } 2832 if (!total) 2833 return -ENODEV; 2834 2835 if (adev->gmc.xgmi.supported) 2836 amdgpu_xgmi_early_init(adev); 2837 2838 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 2839 if (ip_block->status.valid != false) 2840 amdgpu_amdkfd_device_probe(adev); 2841 2842 adev->cg_flags &= amdgpu_cg_mask; 2843 adev->pg_flags &= amdgpu_pg_mask; 2844 2845 return 0; 2846 } 2847 2848 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2849 { 2850 int i, r; 2851 2852 for (i = 0; i < adev->num_ip_blocks; i++) { 2853 if (!adev->ip_blocks[i].status.sw) 2854 continue; 2855 if (adev->ip_blocks[i].status.hw) 2856 continue; 2857 if (!amdgpu_ip_member_of_hwini( 2858 adev, adev->ip_blocks[i].version->type)) 2859 continue; 2860 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2861 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2862 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2863 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2864 if (r) { 2865 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2866 adev->ip_blocks[i].version->funcs->name, r); 2867 return r; 2868 } 2869 adev->ip_blocks[i].status.hw = true; 2870 } 2871 } 2872 2873 return 0; 2874 } 2875 
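/*
 * Phase 2 brings up every remaining IP block that is part of the current
 * hwini mask. amdgpu_device_ip_init() below enforces the ordering:
 * phase 1 (COMMON and IH, plus PSP for SR-IOV), then firmware loading via
 * amdgpu_device_fw_loading(), then this phase for the rest of the blocks.
 */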
2876 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2877 { 2878 int i, r; 2879 2880 for (i = 0; i < adev->num_ip_blocks; i++) { 2881 if (!adev->ip_blocks[i].status.sw) 2882 continue; 2883 if (adev->ip_blocks[i].status.hw) 2884 continue; 2885 if (!amdgpu_ip_member_of_hwini( 2886 adev, adev->ip_blocks[i].version->type)) 2887 continue; 2888 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2889 if (r) { 2890 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2891 adev->ip_blocks[i].version->funcs->name, r); 2892 return r; 2893 } 2894 adev->ip_blocks[i].status.hw = true; 2895 } 2896 2897 return 0; 2898 } 2899 2900 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2901 { 2902 int r = 0; 2903 int i; 2904 uint32_t smu_version; 2905 2906 if (adev->asic_type >= CHIP_VEGA10) { 2907 for (i = 0; i < adev->num_ip_blocks; i++) { 2908 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2909 continue; 2910 2911 if (!amdgpu_ip_member_of_hwini(adev, 2912 AMD_IP_BLOCK_TYPE_PSP)) 2913 break; 2914 2915 if (!adev->ip_blocks[i].status.sw) 2916 continue; 2917 2918 /* no need to do the fw loading again if already done*/ 2919 if (adev->ip_blocks[i].status.hw == true) 2920 break; 2921 2922 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2923 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 2924 if (r) 2925 return r; 2926 } else { 2927 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2928 if (r) { 2929 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2930 adev->ip_blocks[i].version->funcs->name, r); 2931 return r; 2932 } 2933 adev->ip_blocks[i].status.hw = true; 2934 } 2935 break; 2936 } 2937 } 2938 2939 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2940 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2941 2942 return r; 2943 } 2944 2945 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2946 { 2947 struct drm_sched_init_args args = { 2948 .ops = &amdgpu_sched_ops, 2949 .num_rqs = DRM_SCHED_PRIORITY_COUNT, 2950 .timeout_wq = adev->reset_domain->wq, 2951 .dev = adev->dev, 2952 }; 2953 long timeout; 2954 int r, i; 2955 2956 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2957 struct amdgpu_ring *ring = adev->rings[i]; 2958 2959 /* No need to setup the GPU scheduler for rings that don't need it */ 2960 if (!ring || ring->no_scheduler) 2961 continue; 2962 2963 switch (ring->funcs->type) { 2964 case AMDGPU_RING_TYPE_GFX: 2965 timeout = adev->gfx_timeout; 2966 break; 2967 case AMDGPU_RING_TYPE_COMPUTE: 2968 timeout = adev->compute_timeout; 2969 break; 2970 case AMDGPU_RING_TYPE_SDMA: 2971 timeout = adev->sdma_timeout; 2972 break; 2973 default: 2974 timeout = adev->video_timeout; 2975 break; 2976 } 2977 2978 args.timeout = timeout; 2979 args.credit_limit = ring->num_hw_submission; 2980 args.score = ring->sched_score; 2981 args.name = ring->name; 2982 2983 r = drm_sched_init(&ring->sched, &args); 2984 if (r) { 2985 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2986 ring->name); 2987 return r; 2988 } 2989 r = amdgpu_uvd_entity_init(adev, ring); 2990 if (r) { 2991 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n", 2992 ring->name); 2993 return r; 2994 } 2995 r = amdgpu_vce_entity_init(adev, ring); 2996 if (r) { 2997 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n", 2998 ring->name); 2999 return r; 3000 } 3001 } 3002 3003 amdgpu_xcp_update_partition_sched_list(adev); 3004 3005 return 0; 3006 } 3007 3008 3009 /** 3010 * amdgpu_device_ip_init - run init for hardware IPs 
3011 * 3012 * @adev: amdgpu_device pointer 3013 * 3014 * Main initialization pass for hardware IPs. The list of all the hardware 3015 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 3016 * are run. sw_init initializes the software state associated with each IP 3017 * and hw_init initializes the hardware associated with each IP. 3018 * Returns 0 on success, negative error code on failure. 3019 */ 3020 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 3021 { 3022 bool init_badpage; 3023 int i, r; 3024 3025 r = amdgpu_ras_init(adev); 3026 if (r) 3027 return r; 3028 3029 for (i = 0; i < adev->num_ip_blocks; i++) { 3030 if (!adev->ip_blocks[i].status.valid) 3031 continue; 3032 if (adev->ip_blocks[i].version->funcs->sw_init) { 3033 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]); 3034 if (r) { 3035 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 3036 adev->ip_blocks[i].version->funcs->name, r); 3037 goto init_failed; 3038 } 3039 } 3040 adev->ip_blocks[i].status.sw = true; 3041 3042 if (!amdgpu_ip_member_of_hwini( 3043 adev, adev->ip_blocks[i].version->type)) 3044 continue; 3045 3046 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 3047 /* need to do common hw init early so everything is set up for gmc */ 3048 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3049 if (r) { 3050 DRM_ERROR("hw_init %d failed %d\n", i, r); 3051 goto init_failed; 3052 } 3053 adev->ip_blocks[i].status.hw = true; 3054 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3055 /* need to do gmc hw init early so we can allocate gpu mem */ 3056 /* Try to reserve bad pages early */ 3057 if (amdgpu_sriov_vf(adev)) 3058 amdgpu_virt_exchange_data(adev); 3059 3060 r = amdgpu_device_mem_scratch_init(adev); 3061 if (r) { 3062 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 3063 goto init_failed; 3064 } 3065 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3066 if (r) { 3067 DRM_ERROR("hw_init %d failed %d\n", i, r); 3068 goto init_failed; 3069 } 3070 r = amdgpu_device_wb_init(adev); 3071 if (r) { 3072 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 3073 goto init_failed; 3074 } 3075 adev->ip_blocks[i].status.hw = true; 3076 3077 /* right after GMC hw init, we create CSA */ 3078 if (adev->gfx.mcbp) { 3079 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 3080 AMDGPU_GEM_DOMAIN_VRAM | 3081 AMDGPU_GEM_DOMAIN_GTT, 3082 AMDGPU_CSA_SIZE); 3083 if (r) { 3084 DRM_ERROR("allocate CSA failed %d\n", r); 3085 goto init_failed; 3086 } 3087 } 3088 3089 r = amdgpu_seq64_init(adev); 3090 if (r) { 3091 DRM_ERROR("allocate seq64 failed %d\n", r); 3092 goto init_failed; 3093 } 3094 } 3095 } 3096 3097 if (amdgpu_sriov_vf(adev)) 3098 amdgpu_virt_init_data_exchange(adev); 3099 3100 r = amdgpu_ib_pool_init(adev); 3101 if (r) { 3102 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 3103 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 3104 goto init_failed; 3105 } 3106 3107 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 3108 if (r) 3109 goto init_failed; 3110 3111 r = amdgpu_device_ip_hw_init_phase1(adev); 3112 if (r) 3113 goto init_failed; 3114 3115 r = amdgpu_device_fw_loading(adev); 3116 if (r) 3117 goto init_failed; 3118 3119 r = amdgpu_device_ip_hw_init_phase2(adev); 3120 if (r) 3121 goto init_failed; 3122 3123 /* 3124 * retired pages will be loaded from eeprom and reserved here, 3125 * it should be called after amdgpu_device_ip_hw_init_phase2 
since 3126 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 3127 * for I2C communication which only true at this point. 3128 * 3129 * amdgpu_ras_recovery_init may fail, but the upper only cares the 3130 * failure from bad gpu situation and stop amdgpu init process 3131 * accordingly. For other failed cases, it will still release all 3132 * the resource and print error message, rather than returning one 3133 * negative value to upper level. 3134 * 3135 * Note: theoretically, this should be called before all vram allocations 3136 * to protect retired page from abusing 3137 */ 3138 init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 3139 r = amdgpu_ras_recovery_init(adev, init_badpage); 3140 if (r) 3141 goto init_failed; 3142 3143 /** 3144 * In case of XGMI grab extra reference for reset domain for this device 3145 */ 3146 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3147 if (amdgpu_xgmi_add_device(adev) == 0) { 3148 if (!amdgpu_sriov_vf(adev)) { 3149 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3150 3151 if (WARN_ON(!hive)) { 3152 r = -ENOENT; 3153 goto init_failed; 3154 } 3155 3156 if (!hive->reset_domain || 3157 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 3158 r = -ENOENT; 3159 amdgpu_put_xgmi_hive(hive); 3160 goto init_failed; 3161 } 3162 3163 /* Drop the early temporary reset domain we created for device */ 3164 amdgpu_reset_put_reset_domain(adev->reset_domain); 3165 adev->reset_domain = hive->reset_domain; 3166 amdgpu_put_xgmi_hive(hive); 3167 } 3168 } 3169 } 3170 3171 r = amdgpu_device_init_schedulers(adev); 3172 if (r) 3173 goto init_failed; 3174 3175 if (adev->mman.buffer_funcs_ring->sched.ready) 3176 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3177 3178 /* Don't init kfd if whole hive need to be reset during init */ 3179 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 3180 kgd2kfd_init_zone_device(adev); 3181 amdgpu_amdkfd_device_init(adev); 3182 } 3183 3184 amdgpu_fru_get_product_info(adev); 3185 3186 if (!amdgpu_sriov_vf(adev) || amdgpu_sriov_ras_cper_en(adev)) 3187 r = amdgpu_cper_init(adev); 3188 3189 init_failed: 3190 3191 return r; 3192 } 3193 3194 /** 3195 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 3196 * 3197 * @adev: amdgpu_device pointer 3198 * 3199 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 3200 * this function before a GPU reset. If the value is retained after a 3201 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 3202 */ 3203 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 3204 { 3205 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 3206 } 3207 3208 /** 3209 * amdgpu_device_check_vram_lost - check if vram is valid 3210 * 3211 * @adev: amdgpu_device pointer 3212 * 3213 * Checks the reset magic value written to the gart pointer in VRAM. 3214 * The driver calls this after a GPU reset to see if the contents of 3215 * VRAM is lost or now. 3216 * returns true if vram is lost, false if not. 3217 */ 3218 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 3219 { 3220 if (memcmp(adev->gart.ptr, adev->reset_magic, 3221 AMDGPU_RESET_MAGIC_NUM)) 3222 return true; 3223 3224 if (!amdgpu_in_reset(adev)) 3225 return false; 3226 3227 /* 3228 * For all ASICs with baco/mode1 reset, the VRAM is 3229 * always assumed to be lost. 
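 * Link, BACO and mode1 resets therefore return true below; any other
 * reset method (the default case) is treated as preserving VRAM.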
3230 */ 3231 switch (amdgpu_asic_reset_method(adev)) { 3232 case AMD_RESET_METHOD_LINK: 3233 case AMD_RESET_METHOD_BACO: 3234 case AMD_RESET_METHOD_MODE1: 3235 return true; 3236 default: 3237 return false; 3238 } 3239 } 3240 3241 /** 3242 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 3243 * 3244 * @adev: amdgpu_device pointer 3245 * @state: clockgating state (gate or ungate) 3246 * 3247 * The list of all the hardware IPs that make up the asic is walked and the 3248 * set_clockgating_state callbacks are run. 3249 * Late initialization pass enabling clockgating for hardware IPs. 3250 * Fini or suspend, pass disabling clockgating for hardware IPs. 3251 * Returns 0 on success, negative error code on failure. 3252 */ 3253 3254 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 3255 enum amd_clockgating_state state) 3256 { 3257 int i, j, r; 3258 3259 if (amdgpu_emu_mode == 1) 3260 return 0; 3261 3262 for (j = 0; j < adev->num_ip_blocks; j++) { 3263 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3264 if (!adev->ip_blocks[i].status.late_initialized) 3265 continue; 3266 /* skip CG for GFX, SDMA on S0ix */ 3267 if (adev->in_s0ix && 3268 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3269 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3270 continue; 3271 /* skip CG for VCE/UVD, it's handled specially */ 3272 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3273 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3274 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3275 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3276 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 3277 /* enable clockgating to save power */ 3278 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i], 3279 state); 3280 if (r) { 3281 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 3282 adev->ip_blocks[i].version->funcs->name, r); 3283 return r; 3284 } 3285 } 3286 } 3287 3288 return 0; 3289 } 3290 3291 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 3292 enum amd_powergating_state state) 3293 { 3294 int i, j, r; 3295 3296 if (amdgpu_emu_mode == 1) 3297 return 0; 3298 3299 for (j = 0; j < adev->num_ip_blocks; j++) { 3300 i = state == AMD_PG_STATE_GATE ? 
j : adev->num_ip_blocks - j - 1; 3301 if (!adev->ip_blocks[i].status.late_initialized) 3302 continue; 3303 /* skip PG for GFX, SDMA on S0ix */ 3304 if (adev->in_s0ix && 3305 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3306 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3307 continue; 3308 /* skip CG for VCE/UVD, it's handled specially */ 3309 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3310 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3311 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3312 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3313 adev->ip_blocks[i].version->funcs->set_powergating_state) { 3314 /* enable powergating to save power */ 3315 r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i], 3316 state); 3317 if (r) { 3318 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 3319 adev->ip_blocks[i].version->funcs->name, r); 3320 return r; 3321 } 3322 } 3323 } 3324 return 0; 3325 } 3326 3327 static int amdgpu_device_enable_mgpu_fan_boost(void) 3328 { 3329 struct amdgpu_gpu_instance *gpu_ins; 3330 struct amdgpu_device *adev; 3331 int i, ret = 0; 3332 3333 mutex_lock(&mgpu_info.mutex); 3334 3335 /* 3336 * MGPU fan boost feature should be enabled 3337 * only when there are two or more dGPUs in 3338 * the system 3339 */ 3340 if (mgpu_info.num_dgpu < 2) 3341 goto out; 3342 3343 for (i = 0; i < mgpu_info.num_dgpu; i++) { 3344 gpu_ins = &(mgpu_info.gpu_ins[i]); 3345 adev = gpu_ins->adev; 3346 if (!(adev->flags & AMD_IS_APU) && 3347 !gpu_ins->mgpu_fan_enabled) { 3348 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 3349 if (ret) 3350 break; 3351 3352 gpu_ins->mgpu_fan_enabled = 1; 3353 } 3354 } 3355 3356 out: 3357 mutex_unlock(&mgpu_info.mutex); 3358 3359 return ret; 3360 } 3361 3362 /** 3363 * amdgpu_device_ip_late_init - run late init for hardware IPs 3364 * 3365 * @adev: amdgpu_device pointer 3366 * 3367 * Late initialization pass for hardware IPs. The list of all the hardware 3368 * IPs that make up the asic is walked and the late_init callbacks are run. 3369 * late_init covers any special initialization that an IP requires 3370 * after all of the have been initialized or something that needs to happen 3371 * late in the init process. 3372 * Returns 0 on success, negative error code on failure. 
3373 */ 3374 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 3375 { 3376 struct amdgpu_gpu_instance *gpu_instance; 3377 int i = 0, r; 3378 3379 for (i = 0; i < adev->num_ip_blocks; i++) { 3380 if (!adev->ip_blocks[i].status.hw) 3381 continue; 3382 if (adev->ip_blocks[i].version->funcs->late_init) { 3383 r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]); 3384 if (r) { 3385 DRM_ERROR("late_init of IP block <%s> failed %d\n", 3386 adev->ip_blocks[i].version->funcs->name, r); 3387 return r; 3388 } 3389 } 3390 adev->ip_blocks[i].status.late_initialized = true; 3391 } 3392 3393 r = amdgpu_ras_late_init(adev); 3394 if (r) { 3395 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 3396 return r; 3397 } 3398 3399 if (!amdgpu_reset_in_recovery(adev)) 3400 amdgpu_ras_set_error_query_ready(adev, true); 3401 3402 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3403 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3404 3405 amdgpu_device_fill_reset_magic(adev); 3406 3407 r = amdgpu_device_enable_mgpu_fan_boost(); 3408 if (r) 3409 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 3410 3411 /* For passthrough configuration on arcturus and aldebaran, enable special SBR handling */ 3412 if (amdgpu_passthrough(adev) && 3413 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 3414 adev->asic_type == CHIP_ALDEBARAN)) 3415 amdgpu_dpm_handle_passthrough_sbr(adev, true); 3416 3417 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3418 mutex_lock(&mgpu_info.mutex); 3419 3420 /* 3421 * Reset the device p-state to low, as it was booted at a high p-state. 3422 * 3423 * This should be performed only after all devices from the same 3424 * hive have been initialized. 3425 * 3426 * However, the number of devices in the hive is not known in advance; 3427 * it is counted one by one as each device initializes. 3428 * 3429 * So we wait until all XGMI interlinked devices are initialized. 3430 * This may add some delay, as those devices may come from 3431 * different hives. But that should be OK.
3432 */ 3433 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3434 for (i = 0; i < mgpu_info.num_gpu; i++) { 3435 gpu_instance = &(mgpu_info.gpu_ins[i]); 3436 if (gpu_instance->adev->flags & AMD_IS_APU) 3437 continue; 3438 3439 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3440 AMDGPU_XGMI_PSTATE_MIN); 3441 if (r) { 3442 DRM_ERROR("pstate setting failed (%d).\n", r); 3443 break; 3444 } 3445 } 3446 } 3447 3448 mutex_unlock(&mgpu_info.mutex); 3449 } 3450 3451 return 0; 3452 } 3453 3454 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block) 3455 { 3456 int r; 3457 3458 if (!ip_block->version->funcs->hw_fini) { 3459 DRM_ERROR("hw_fini of IP block <%s> not defined\n", 3460 ip_block->version->funcs->name); 3461 } else { 3462 r = ip_block->version->funcs->hw_fini(ip_block); 3463 /* XXX handle errors */ 3464 if (r) { 3465 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3466 ip_block->version->funcs->name, r); 3467 } 3468 } 3469 3470 ip_block->status.hw = false; 3471 } 3472 3473 /** 3474 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3475 * 3476 * @adev: amdgpu_device pointer 3477 * 3478 * For ASICs need to disable SMC first 3479 */ 3480 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3481 { 3482 int i; 3483 3484 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3485 return; 3486 3487 for (i = 0; i < adev->num_ip_blocks; i++) { 3488 if (!adev->ip_blocks[i].status.hw) 3489 continue; 3490 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3491 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3492 break; 3493 } 3494 } 3495 } 3496 3497 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3498 { 3499 int i, r; 3500 3501 for (i = 0; i < adev->num_ip_blocks; i++) { 3502 if (!adev->ip_blocks[i].version->funcs->early_fini) 3503 continue; 3504 3505 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]); 3506 if (r) { 3507 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 3508 adev->ip_blocks[i].version->funcs->name, r); 3509 } 3510 } 3511 3512 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3513 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3514 3515 amdgpu_amdkfd_suspend(adev, false); 3516 #ifdef CONFIG_DRM_AMDGPU_NAVI3X_USERQ 3517 amdgpu_userq_suspend(adev); 3518 #endif 3519 3520 /* Workaround for ASICs need to disable SMC first */ 3521 amdgpu_device_smu_fini_early(adev); 3522 3523 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3524 if (!adev->ip_blocks[i].status.hw) 3525 continue; 3526 3527 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3528 } 3529 3530 if (amdgpu_sriov_vf(adev)) { 3531 if (amdgpu_virt_release_full_gpu(adev, false)) 3532 DRM_ERROR("failed to release exclusive mode on fini\n"); 3533 } 3534 3535 return 0; 3536 } 3537 3538 /** 3539 * amdgpu_device_ip_fini - run fini for hardware IPs 3540 * 3541 * @adev: amdgpu_device pointer 3542 * 3543 * Main teardown pass for hardware IPs. The list of all the hardware 3544 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3545 * are run. hw_fini tears down the hardware associated with each IP 3546 * and sw_fini tears down any software state associated with each IP. 3547 * Returns 0 on success, negative error code on failure. 
3548 */ 3549 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3550 { 3551 int i, r; 3552 3553 amdgpu_cper_fini(adev); 3554 3555 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3556 amdgpu_virt_release_ras_err_handler_data(adev); 3557 3558 if (adev->gmc.xgmi.num_physical_nodes > 1) 3559 amdgpu_xgmi_remove_device(adev); 3560 3561 amdgpu_amdkfd_device_fini_sw(adev); 3562 3563 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3564 if (!adev->ip_blocks[i].status.sw) 3565 continue; 3566 3567 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3568 amdgpu_ucode_free_bo(adev); 3569 amdgpu_free_static_csa(&adev->virt.csa_obj); 3570 amdgpu_device_wb_fini(adev); 3571 amdgpu_device_mem_scratch_fini(adev); 3572 amdgpu_ib_pool_fini(adev); 3573 amdgpu_seq64_fini(adev); 3574 amdgpu_doorbell_fini(adev); 3575 } 3576 if (adev->ip_blocks[i].version->funcs->sw_fini) { 3577 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]); 3578 /* XXX handle errors */ 3579 if (r) { 3580 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 3581 adev->ip_blocks[i].version->funcs->name, r); 3582 } 3583 } 3584 adev->ip_blocks[i].status.sw = false; 3585 adev->ip_blocks[i].status.valid = false; 3586 } 3587 3588 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3589 if (!adev->ip_blocks[i].status.late_initialized) 3590 continue; 3591 if (adev->ip_blocks[i].version->funcs->late_fini) 3592 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]); 3593 adev->ip_blocks[i].status.late_initialized = false; 3594 } 3595 3596 amdgpu_ras_fini(adev); 3597 3598 return 0; 3599 } 3600 3601 /** 3602 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3603 * 3604 * @work: work_struct. 3605 */ 3606 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3607 { 3608 struct amdgpu_device *adev = 3609 container_of(work, struct amdgpu_device, delayed_init_work.work); 3610 int r; 3611 3612 r = amdgpu_ib_ring_tests(adev); 3613 if (r) 3614 DRM_ERROR("ib ring test failed (%d).\n", r); 3615 } 3616 3617 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3618 { 3619 struct amdgpu_device *adev = 3620 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3621 3622 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3623 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3624 3625 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true, 0)) 3626 adev->gfx.gfx_off_state = true; 3627 } 3628 3629 /** 3630 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3631 * 3632 * @adev: amdgpu_device pointer 3633 * 3634 * Main suspend function for hardware IPs. The list of all the hardware 3635 * IPs that make up the asic is walked, clockgating is disabled and the 3636 * suspend callbacks are run. suspend puts the hardware and software state 3637 * in each IP into a state suitable for suspend. 3638 * Returns 0 on success, negative error code on failure. 3639 */ 3640 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3641 { 3642 int i, r; 3643 3644 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3645 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3646 3647 /* 3648 * Per PMFW team's suggestion, driver needs to handle gfxoff 3649 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3650 * scenario. Add the missing df cstate disablement here. 
3651 */ 3652 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3653 dev_warn(adev->dev, "Failed to disallow df cstate"); 3654 3655 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3656 if (!adev->ip_blocks[i].status.valid) 3657 continue; 3658 3659 /* displays are handled separately */ 3660 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3661 continue; 3662 3663 /* XXX handle errors */ 3664 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3665 if (r) 3666 return r; 3667 } 3668 3669 return 0; 3670 } 3671 3672 /** 3673 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3674 * 3675 * @adev: amdgpu_device pointer 3676 * 3677 * Main suspend function for hardware IPs. The list of all the hardware 3678 * IPs that make up the asic is walked, clockgating is disabled and the 3679 * suspend callbacks are run. suspend puts the hardware and software state 3680 * in each IP into a state suitable for suspend. 3681 * Returns 0 on success, negative error code on failure. 3682 */ 3683 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3684 { 3685 int i, r; 3686 3687 if (adev->in_s0ix) 3688 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3689 3690 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3691 if (!adev->ip_blocks[i].status.valid) 3692 continue; 3693 /* displays are handled in phase1 */ 3694 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3695 continue; 3696 /* PSP lost connection when err_event_athub occurs */ 3697 if (amdgpu_ras_intr_triggered() && 3698 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3699 adev->ip_blocks[i].status.hw = false; 3700 continue; 3701 } 3702 3703 /* skip unnecessary suspend if we do not initialize them yet */ 3704 if (!amdgpu_ip_member_of_hwini( 3705 adev, adev->ip_blocks[i].version->type)) 3706 continue; 3707 3708 /* Since we skip suspend for S0i3, we need to cancel the delayed 3709 * idle work here as the suspend callback never gets called. 3710 */ 3711 if (adev->in_s0ix && 3712 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX && 3713 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 0, 0)) 3714 cancel_delayed_work_sync(&adev->gfx.idle_work); 3715 /* skip suspend of gfx/mes and psp for S0ix 3716 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3717 * like at runtime. PSP is also part of the always on hardware 3718 * so no need to suspend it. 3719 */ 3720 if (adev->in_s0ix && 3721 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3722 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3723 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3724 continue; 3725 3726 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3727 if (adev->in_s0ix && 3728 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3729 IP_VERSION(5, 0, 0)) && 3730 (adev->ip_blocks[i].version->type == 3731 AMD_IP_BLOCK_TYPE_SDMA)) 3732 continue; 3733 3734 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3735 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3736 * from this location and RLC Autoload automatically also gets loaded 3737 * from here based on PMFW -> PSP message during re-init sequence. 3738 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3739 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3740 */ 3741 if (amdgpu_in_reset(adev) && 3742 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3743 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3744 continue; 3745 3746 /* XXX handle errors */ 3747 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3748 adev->ip_blocks[i].status.hw = false; 3749 3750 /* handle putting the SMC in the appropriate state */ 3751 if (!amdgpu_sriov_vf(adev)) { 3752 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3753 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3754 if (r) { 3755 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3756 adev->mp1_state, r); 3757 return r; 3758 } 3759 } 3760 } 3761 } 3762 3763 return 0; 3764 } 3765 3766 /** 3767 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3768 * 3769 * @adev: amdgpu_device pointer 3770 * 3771 * Main suspend function for hardware IPs. The list of all the hardware 3772 * IPs that make up the asic is walked, clockgating is disabled and the 3773 * suspend callbacks are run. suspend puts the hardware and software state 3774 * in each IP into a state suitable for suspend. 3775 * Returns 0 on success, negative error code on failure. 3776 */ 3777 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3778 { 3779 int r; 3780 3781 if (amdgpu_sriov_vf(adev)) { 3782 amdgpu_virt_fini_data_exchange(adev); 3783 amdgpu_virt_request_full_gpu(adev, false); 3784 } 3785 3786 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3787 3788 r = amdgpu_device_ip_suspend_phase1(adev); 3789 if (r) 3790 return r; 3791 r = amdgpu_device_ip_suspend_phase2(adev); 3792 3793 if (amdgpu_sriov_vf(adev)) 3794 amdgpu_virt_release_full_gpu(adev, false); 3795 3796 return r; 3797 } 3798 3799 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3800 { 3801 int i, r; 3802 3803 static enum amd_ip_block_type ip_order[] = { 3804 AMD_IP_BLOCK_TYPE_COMMON, 3805 AMD_IP_BLOCK_TYPE_GMC, 3806 AMD_IP_BLOCK_TYPE_PSP, 3807 AMD_IP_BLOCK_TYPE_IH, 3808 }; 3809 3810 for (i = 0; i < adev->num_ip_blocks; i++) { 3811 int j; 3812 struct amdgpu_ip_block *block; 3813 3814 block = &adev->ip_blocks[i]; 3815 block->status.hw = false; 3816 3817 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3818 3819 if (block->version->type != ip_order[j] || 3820 !block->status.valid) 3821 continue; 3822 3823 r = block->version->funcs->hw_init(&adev->ip_blocks[i]); 3824 if (r) { 3825 dev_err(adev->dev, "RE-INIT-early: %s failed\n", 3826 block->version->funcs->name); 3827 return r; 3828 } 3829 block->status.hw = true; 3830 } 3831 } 3832 3833 return 0; 3834 } 3835 3836 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3837 { 3838 struct amdgpu_ip_block *block; 3839 int i, r = 0; 3840 3841 static enum amd_ip_block_type ip_order[] = { 3842 AMD_IP_BLOCK_TYPE_SMC, 3843 AMD_IP_BLOCK_TYPE_DCE, 3844 AMD_IP_BLOCK_TYPE_GFX, 3845 AMD_IP_BLOCK_TYPE_SDMA, 3846 AMD_IP_BLOCK_TYPE_MES, 3847 AMD_IP_BLOCK_TYPE_UVD, 3848 AMD_IP_BLOCK_TYPE_VCE, 3849 AMD_IP_BLOCK_TYPE_VCN, 3850 AMD_IP_BLOCK_TYPE_JPEG 3851 }; 3852 3853 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3854 block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]); 3855 3856 if (!block) 3857 continue; 3858 3859 if (block->status.valid && !block->status.hw) { 3860 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) { 3861 r = amdgpu_ip_block_resume(block); 3862 } else { 3863 r = block->version->funcs->hw_init(block); 3864 } 3865 3866 if (r) { 3867 dev_err(adev->dev, "RE-INIT-late: %s failed\n", 3868 block->version->funcs->name); 3869 break; 3870 } 3871 
block->status.hw = true; 3872 } 3873 } 3874 3875 return r; 3876 } 3877 3878 /** 3879 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3880 * 3881 * @adev: amdgpu_device pointer 3882 * 3883 * First resume function for hardware IPs. The list of all the hardware 3884 * IPs that make up the asic is walked and the resume callbacks are run for 3885 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3886 * after a suspend and updates the software state as necessary. This 3887 * function is also used for restoring the GPU after a GPU reset. 3888 * Returns 0 on success, negative error code on failure. 3889 */ 3890 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3891 { 3892 int i, r; 3893 3894 for (i = 0; i < adev->num_ip_blocks; i++) { 3895 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3896 continue; 3897 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3898 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3899 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3900 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3901 3902 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3903 if (r) 3904 return r; 3905 } 3906 } 3907 3908 return 0; 3909 } 3910 3911 /** 3912 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3913 * 3914 * @adev: amdgpu_device pointer 3915 * 3916 * Second resume function for hardware IPs. The list of all the hardware 3917 * IPs that make up the asic is walked and the resume callbacks are run for 3918 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3919 * functional state after a suspend and updates the software state as 3920 * necessary. This function is also used for restoring the GPU after a GPU 3921 * reset. 3922 * Returns 0 on success, negative error code on failure. 3923 */ 3924 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3925 { 3926 int i, r; 3927 3928 for (i = 0; i < adev->num_ip_blocks; i++) { 3929 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3930 continue; 3931 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3932 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3933 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3934 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE || 3935 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3936 continue; 3937 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3938 if (r) 3939 return r; 3940 } 3941 3942 return 0; 3943 } 3944 3945 /** 3946 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs 3947 * 3948 * @adev: amdgpu_device pointer 3949 * 3950 * Third resume function for hardware IPs. The list of all the hardware 3951 * IPs that make up the asic is walked and the resume callbacks are run for 3952 * all DCE. resume puts the hardware into a functional state after a suspend 3953 * and updates the software state as necessary. This function is also used 3954 * for restoring the GPU after a GPU reset. 3955 * 3956 * Returns 0 on success, negative error code on failure. 
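 *
 * For reference, the three resume phases are normally driven back to
 * back from amdgpu_device_ip_resume() below, roughly:
 *
 *   amdgpu_device_ip_resume_phase1(adev); // COMMON, GMC, IH (PSP on SR-IOV)
 *   amdgpu_device_fw_loading(adev);
 *   amdgpu_device_ip_resume_phase2(adev); // everything else except DCE
 *   amdgpu_fence_driver_hw_init(adev);
 *   amdgpu_device_ip_resume_phase3(adev); // DCE last, once the rest is up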
3957 */ 3958 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev) 3959 { 3960 int i, r; 3961 3962 for (i = 0; i < adev->num_ip_blocks; i++) { 3963 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3964 continue; 3965 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) { 3966 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3967 if (r) 3968 return r; 3969 } 3970 } 3971 3972 return 0; 3973 } 3974 3975 /** 3976 * amdgpu_device_ip_resume - run resume for hardware IPs 3977 * 3978 * @adev: amdgpu_device pointer 3979 * 3980 * Main resume function for hardware IPs. The hardware IPs 3981 * are split into two resume functions because they are 3982 * also used in recovering from a GPU reset and some additional 3983 * steps need to be take between them. In this case (S3/S4) they are 3984 * run sequentially. 3985 * Returns 0 on success, negative error code on failure. 3986 */ 3987 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3988 { 3989 int r; 3990 3991 r = amdgpu_device_ip_resume_phase1(adev); 3992 if (r) 3993 return r; 3994 3995 r = amdgpu_device_fw_loading(adev); 3996 if (r) 3997 return r; 3998 3999 r = amdgpu_device_ip_resume_phase2(adev); 4000 4001 if (adev->mman.buffer_funcs_ring->sched.ready) 4002 amdgpu_ttm_set_buffer_funcs_status(adev, true); 4003 4004 if (r) 4005 return r; 4006 4007 amdgpu_fence_driver_hw_init(adev); 4008 4009 r = amdgpu_device_ip_resume_phase3(adev); 4010 4011 return r; 4012 } 4013 4014 /** 4015 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 4016 * 4017 * @adev: amdgpu_device pointer 4018 * 4019 * Query the VBIOS data tables to determine if the board supports SR-IOV. 4020 */ 4021 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 4022 { 4023 if (amdgpu_sriov_vf(adev)) { 4024 if (adev->is_atom_fw) { 4025 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 4026 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 4027 } else { 4028 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 4029 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 4030 } 4031 4032 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 4033 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 4034 } 4035 } 4036 4037 /** 4038 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 4039 * 4040 * @asic_type: AMD asic type 4041 * 4042 * Check if there is DC (new modesetting infrastructre) support for an asic. 4043 * returns true if DC has support, false if not. 4044 */ 4045 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 4046 { 4047 switch (asic_type) { 4048 #ifdef CONFIG_DRM_AMDGPU_SI 4049 case CHIP_HAINAN: 4050 #endif 4051 case CHIP_TOPAZ: 4052 /* chips with no display hardware */ 4053 return false; 4054 #if defined(CONFIG_DRM_AMD_DC) 4055 case CHIP_TAHITI: 4056 case CHIP_PITCAIRN: 4057 case CHIP_VERDE: 4058 case CHIP_OLAND: 4059 /* 4060 * We have systems in the wild with these ASICs that require 4061 * LVDS and VGA support which is not supported with DC. 4062 * 4063 * Fallback to the non-DC driver here by default so as not to 4064 * cause regressions. 4065 */ 4066 #if defined(CONFIG_DRM_AMD_DC_SI) 4067 return amdgpu_dc > 0; 4068 #else 4069 return false; 4070 #endif 4071 case CHIP_BONAIRE: 4072 case CHIP_KAVERI: 4073 case CHIP_KABINI: 4074 case CHIP_MULLINS: 4075 /* 4076 * We have systems in the wild with these ASICs that require 4077 * VGA support which is not supported with DC. 
4078 * 4079 * Fallback to the non-DC driver here by default so as not to 4080 * cause regressions. 4081 */ 4082 return amdgpu_dc > 0; 4083 default: 4084 return amdgpu_dc != 0; 4085 #else 4086 default: 4087 if (amdgpu_dc > 0) 4088 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 4089 return false; 4090 #endif 4091 } 4092 } 4093 4094 /** 4095 * amdgpu_device_has_dc_support - check if dc is supported 4096 * 4097 * @adev: amdgpu_device pointer 4098 * 4099 * Returns true for supported, false for not supported 4100 */ 4101 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 4102 { 4103 if (adev->enable_virtual_display || 4104 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 4105 return false; 4106 4107 return amdgpu_device_asic_has_dc_support(adev->asic_type); 4108 } 4109 4110 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 4111 { 4112 struct amdgpu_device *adev = 4113 container_of(__work, struct amdgpu_device, xgmi_reset_work); 4114 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 4115 4116 /* It's a bug to not have a hive within this function */ 4117 if (WARN_ON(!hive)) 4118 return; 4119 4120 /* 4121 * Use task barrier to synchronize all xgmi reset works across the 4122 * hive. task_barrier_enter and task_barrier_exit will block 4123 * until all the threads running the xgmi reset works reach 4124 * those points. task_barrier_full will do both blocks. 4125 */ 4126 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 4127 4128 task_barrier_enter(&hive->tb); 4129 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 4130 4131 if (adev->asic_reset_res) 4132 goto fail; 4133 4134 task_barrier_exit(&hive->tb); 4135 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 4136 4137 if (adev->asic_reset_res) 4138 goto fail; 4139 4140 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 4141 } else { 4142 4143 task_barrier_full(&hive->tb); 4144 adev->asic_reset_res = amdgpu_asic_reset(adev); 4145 } 4146 4147 fail: 4148 if (adev->asic_reset_res) 4149 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 4150 adev->asic_reset_res, adev_to_drm(adev)->unique); 4151 amdgpu_put_xgmi_hive(hive); 4152 } 4153 4154 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 4155 { 4156 char *input = amdgpu_lockup_timeout; 4157 char *timeout_setting = NULL; 4158 int index = 0; 4159 long timeout; 4160 int ret = 0; 4161 4162 /* 4163 * By default timeout for non compute jobs is 10000 4164 * and 60000 for compute jobs. 4165 * In SR-IOV or passthrough mode, timeout for compute 4166 * jobs are 60000 by default. 4167 */ 4168 adev->gfx_timeout = msecs_to_jiffies(10000); 4169 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4170 if (amdgpu_sriov_vf(adev)) 4171 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
4172 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 4173 else 4174 adev->compute_timeout = msecs_to_jiffies(60000); 4175 4176 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4177 while ((timeout_setting = strsep(&input, ",")) && 4178 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4179 ret = kstrtol(timeout_setting, 0, &timeout); 4180 if (ret) 4181 return ret; 4182 4183 if (timeout == 0) { 4184 index++; 4185 continue; 4186 } else if (timeout < 0) { 4187 timeout = MAX_SCHEDULE_TIMEOUT; 4188 dev_warn(adev->dev, "lockup timeout disabled"); 4189 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 4190 } else { 4191 timeout = msecs_to_jiffies(timeout); 4192 } 4193 4194 switch (index++) { 4195 case 0: 4196 adev->gfx_timeout = timeout; 4197 break; 4198 case 1: 4199 adev->compute_timeout = timeout; 4200 break; 4201 case 2: 4202 adev->sdma_timeout = timeout; 4203 break; 4204 case 3: 4205 adev->video_timeout = timeout; 4206 break; 4207 default: 4208 break; 4209 } 4210 } 4211 /* 4212 * There is only one value specified and 4213 * it should apply to all non-compute jobs. 4214 */ 4215 if (index == 1) { 4216 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4217 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 4218 adev->compute_timeout = adev->gfx_timeout; 4219 } 4220 } 4221 4222 return ret; 4223 } 4224 4225 /** 4226 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 4227 * 4228 * @adev: amdgpu_device pointer 4229 * 4230 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 4231 */ 4232 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 4233 { 4234 struct iommu_domain *domain; 4235 4236 domain = iommu_get_domain_for_dev(adev->dev); 4237 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 4238 adev->ram_is_direct_mapped = true; 4239 } 4240 4241 #if defined(CONFIG_HSA_AMD_P2P) 4242 /** 4243 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled. 4244 * 4245 * @adev: amdgpu_device pointer 4246 * 4247 * return if IOMMU remapping bar address 4248 */ 4249 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev) 4250 { 4251 struct iommu_domain *domain; 4252 4253 domain = iommu_get_domain_for_dev(adev->dev); 4254 if (domain && (domain->type == IOMMU_DOMAIN_DMA || 4255 domain->type == IOMMU_DOMAIN_DMA_FQ)) 4256 return true; 4257 4258 return false; 4259 } 4260 #endif 4261 4262 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 4263 { 4264 if (amdgpu_mcbp == 1) 4265 adev->gfx.mcbp = true; 4266 else if (amdgpu_mcbp == 0) 4267 adev->gfx.mcbp = false; 4268 4269 if (amdgpu_sriov_vf(adev)) 4270 adev->gfx.mcbp = true; 4271 4272 if (adev->gfx.mcbp) 4273 DRM_INFO("MCBP is enabled\n"); 4274 } 4275 4276 /** 4277 * amdgpu_device_init - initialize the driver 4278 * 4279 * @adev: amdgpu_device pointer 4280 * @flags: driver flags 4281 * 4282 * Initializes the driver info and hw (all asics). 4283 * Returns 0 for success or an error on failure. 4284 * Called at driver startup. 
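 *
 * Illustrative sketch of the probe-time call (the caller and its error
 * handling are simplified here and not taken verbatim from the driver):
 *
 *   flags = ent->driver_data;            // asic type, AMD_IS_APU, ...
 *   r = amdgpu_device_init(adev, flags);
 *   if (r)
 *       return r;                        // device could not be brought up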
4285 */ 4286 int amdgpu_device_init(struct amdgpu_device *adev, 4287 uint32_t flags) 4288 { 4289 struct drm_device *ddev = adev_to_drm(adev); 4290 struct pci_dev *pdev = adev->pdev; 4291 int r, i; 4292 bool px = false; 4293 u32 max_MBps; 4294 int tmp; 4295 4296 adev->shutdown = false; 4297 adev->flags = flags; 4298 4299 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 4300 adev->asic_type = amdgpu_force_asic_type; 4301 else 4302 adev->asic_type = flags & AMD_ASIC_MASK; 4303 4304 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 4305 if (amdgpu_emu_mode == 1) 4306 adev->usec_timeout *= 10; 4307 adev->gmc.gart_size = 512 * 1024 * 1024; 4308 adev->accel_working = false; 4309 adev->num_rings = 0; 4310 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 4311 adev->mman.buffer_funcs = NULL; 4312 adev->mman.buffer_funcs_ring = NULL; 4313 adev->vm_manager.vm_pte_funcs = NULL; 4314 adev->vm_manager.vm_pte_num_scheds = 0; 4315 adev->gmc.gmc_funcs = NULL; 4316 adev->harvest_ip_mask = 0x0; 4317 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 4318 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 4319 4320 adev->smc_rreg = &amdgpu_invalid_rreg; 4321 adev->smc_wreg = &amdgpu_invalid_wreg; 4322 adev->pcie_rreg = &amdgpu_invalid_rreg; 4323 adev->pcie_wreg = &amdgpu_invalid_wreg; 4324 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 4325 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 4326 adev->pciep_rreg = &amdgpu_invalid_rreg; 4327 adev->pciep_wreg = &amdgpu_invalid_wreg; 4328 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 4329 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 4330 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 4331 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 4332 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 4333 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 4334 adev->didt_rreg = &amdgpu_invalid_rreg; 4335 adev->didt_wreg = &amdgpu_invalid_wreg; 4336 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 4337 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 4338 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 4339 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 4340 4341 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 4342 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 4343 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 4344 4345 /* mutex initialization are all done here so we 4346 * can recall function without having locking issues 4347 */ 4348 mutex_init(&adev->firmware.mutex); 4349 mutex_init(&adev->pm.mutex); 4350 mutex_init(&adev->gfx.gpu_clock_mutex); 4351 mutex_init(&adev->srbm_mutex); 4352 mutex_init(&adev->gfx.pipe_reserve_mutex); 4353 mutex_init(&adev->gfx.gfx_off_mutex); 4354 mutex_init(&adev->gfx.partition_mutex); 4355 mutex_init(&adev->grbm_idx_mutex); 4356 mutex_init(&adev->mn_lock); 4357 mutex_init(&adev->virt.vf_errors.lock); 4358 hash_init(adev->mn_hash); 4359 mutex_init(&adev->psp.mutex); 4360 mutex_init(&adev->notifier_lock); 4361 mutex_init(&adev->pm.stable_pstate_ctx_lock); 4362 mutex_init(&adev->benchmark_mutex); 4363 mutex_init(&adev->gfx.reset_sem_mutex); 4364 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */ 4365 mutex_init(&adev->enforce_isolation_mutex); 4366 for (i = 0; i < MAX_XCP; ++i) { 4367 adev->isolation[i].spearhead = dma_fence_get_stub(); 4368 amdgpu_sync_create(&adev->isolation[i].active); 4369 amdgpu_sync_create(&adev->isolation[i].prev); 4370 } 4371 
mutex_init(&adev->gfx.userq_sch_mutex); 4372 mutex_init(&adev->gfx.workload_profile_mutex); 4373 mutex_init(&adev->vcn.workload_profile_mutex); 4374 mutex_init(&adev->userq_mutex); 4375 4376 amdgpu_device_init_apu_flags(adev); 4377 4378 r = amdgpu_device_check_arguments(adev); 4379 if (r) 4380 return r; 4381 4382 spin_lock_init(&adev->mmio_idx_lock); 4383 spin_lock_init(&adev->smc_idx_lock); 4384 spin_lock_init(&adev->pcie_idx_lock); 4385 spin_lock_init(&adev->uvd_ctx_idx_lock); 4386 spin_lock_init(&adev->didt_idx_lock); 4387 spin_lock_init(&adev->gc_cac_idx_lock); 4388 spin_lock_init(&adev->se_cac_idx_lock); 4389 spin_lock_init(&adev->audio_endpt_idx_lock); 4390 spin_lock_init(&adev->mm_stats.lock); 4391 spin_lock_init(&adev->virt.rlcg_reg_lock); 4392 spin_lock_init(&adev->wb.lock); 4393 4394 xa_init_flags(&adev->userq_xa, XA_FLAGS_LOCK_IRQ); 4395 4396 INIT_LIST_HEAD(&adev->reset_list); 4397 4398 INIT_LIST_HEAD(&adev->ras_list); 4399 4400 INIT_LIST_HEAD(&adev->pm.od_kobj_list); 4401 4402 INIT_LIST_HEAD(&adev->userq_mgr_list); 4403 4404 INIT_DELAYED_WORK(&adev->delayed_init_work, 4405 amdgpu_device_delayed_init_work_handler); 4406 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 4407 amdgpu_device_delay_enable_gfx_off); 4408 /* 4409 * Initialize the enforce_isolation work structures for each XCP 4410 * partition. This work handler is responsible for enforcing shader 4411 * isolation on AMD GPUs. It counts the number of emitted fences for 4412 * each GFX and compute ring. If there are any fences, it schedules 4413 * the `enforce_isolation_work` to be run after a delay. If there are 4414 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the 4415 * runqueue. 4416 */ 4417 for (i = 0; i < MAX_XCP; i++) { 4418 INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work, 4419 amdgpu_gfx_enforce_isolation_handler); 4420 adev->gfx.enforce_isolation[i].adev = adev; 4421 adev->gfx.enforce_isolation[i].xcp_id = i; 4422 } 4423 4424 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 4425 4426 adev->gfx.gfx_off_req_count = 1; 4427 adev->gfx.gfx_off_residency = 0; 4428 adev->gfx.gfx_off_entrycount = 0; 4429 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 4430 4431 atomic_set(&adev->throttling_logging_enabled, 1); 4432 /* 4433 * If throttling continues, logging will be performed every minute 4434 * to avoid log flooding. "-1" is subtracted since the thermal 4435 * throttling interrupt comes every second. Thus, the total logging 4436 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting 4437 * for throttling interrupt) = 60 seconds.
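 *
 * As a quick illustration of that arithmetic (matching the call below):
 *
 *   ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
 *   // interval = 59 s, burst = 1 -> at most one message per 59 s window
 *   // + ~1 s until the next throttling interrupt -> ~60 s between logs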
4438 */ 4439 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 4440 4441 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 4442 4443 /* Registers mapping */ 4444 /* TODO: block userspace mapping of io register */ 4445 if (adev->asic_type >= CHIP_BONAIRE) { 4446 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 4447 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 4448 } else { 4449 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 4450 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 4451 } 4452 4453 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 4454 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 4455 4456 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 4457 if (!adev->rmmio) 4458 return -ENOMEM; 4459 4460 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 4461 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size); 4462 4463 /* 4464 * Reset domain needs to be present early, before XGMI hive discovered 4465 * (if any) and initialized to use reset sem and in_gpu reset flag 4466 * early on during init and before calling to RREG32. 4467 */ 4468 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 4469 if (!adev->reset_domain) 4470 return -ENOMEM; 4471 4472 /* detect hw virtualization here */ 4473 amdgpu_virt_init(adev); 4474 4475 amdgpu_device_get_pcie_info(adev); 4476 4477 r = amdgpu_device_get_job_timeout_settings(adev); 4478 if (r) { 4479 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 4480 return r; 4481 } 4482 4483 amdgpu_device_set_mcbp(adev); 4484 4485 /* 4486 * By default, use default mode where all blocks are expected to be 4487 * initialized. At present a 'swinit' of blocks is required to be 4488 * completed before the need for a different level is detected. 4489 */ 4490 amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT); 4491 /* early init functions */ 4492 r = amdgpu_device_ip_early_init(adev); 4493 if (r) 4494 return r; 4495 4496 /* 4497 * No need to remove conflicting FBs for non-display class devices. 4498 * This prevents the sysfb from being freed accidentally. 4499 */ 4500 if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA || 4501 (pdev->class >> 8) == PCI_CLASS_DISPLAY_OTHER) { 4502 /* Get rid of things like offb */ 4503 r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name); 4504 if (r) 4505 return r; 4506 } 4507 4508 /* Enable TMZ based on IP_VERSION */ 4509 amdgpu_gmc_tmz_set(adev); 4510 4511 if (amdgpu_sriov_vf(adev) && 4512 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0)) 4513 /* VF MMIO access (except mailbox range) from CPU 4514 * will be blocked during sriov runtime 4515 */ 4516 adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT; 4517 4518 amdgpu_gmc_noretry_set(adev); 4519 /* Need to get xgmi info early to decide the reset behavior */ 4520 if (adev->gmc.xgmi.supported) { 4521 r = adev->gfxhub.funcs->get_xgmi_info(adev); 4522 if (r) 4523 return r; 4524 } 4525 4526 /* enable PCIE atomic ops */ 4527 if (amdgpu_sriov_vf(adev)) { 4528 if (adev->virt.fw_reserve.p_pf2vf) 4529 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 4530 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 4531 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4532 /* APUs with gfx9 onwards don't rely on PCIe atomics; an internal 4533 * path natively supports atomics, so set have_atomics_support to true.
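 *
 * Summary of the selection logic around this comment:
 *
 *   SR-IOV VF       -> trust pcie_atomic_ops_support_flags from PF2VF info
 *   APU, gfx9+      -> assume supported (internal fabric path, this branch)
 *   everything else -> pci_enable_atomic_ops_to_root() decides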
4534 */ 4535 } else if ((adev->flags & AMD_IS_APU) && 4536 (amdgpu_ip_version(adev, GC_HWIP, 0) > 4537 IP_VERSION(9, 0, 0))) { 4538 adev->have_atomics_support = true; 4539 } else { 4540 adev->have_atomics_support = 4541 !pci_enable_atomic_ops_to_root(adev->pdev, 4542 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 4543 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4544 } 4545 4546 if (!adev->have_atomics_support) 4547 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 4548 4549 /* doorbell bar mapping and doorbell index init*/ 4550 amdgpu_doorbell_init(adev); 4551 4552 if (amdgpu_emu_mode == 1) { 4553 /* post the asic on emulation mode */ 4554 emu_soc_asic_init(adev); 4555 goto fence_driver_init; 4556 } 4557 4558 amdgpu_reset_init(adev); 4559 4560 /* detect if we are with an SRIOV vbios */ 4561 if (adev->bios) 4562 amdgpu_device_detect_sriov_bios(adev); 4563 4564 /* check if we need to reset the asic 4565 * E.g., driver was not cleanly unloaded previously, etc. 4566 */ 4567 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4568 if (adev->gmc.xgmi.num_physical_nodes) { 4569 dev_info(adev->dev, "Pending hive reset.\n"); 4570 amdgpu_set_init_level(adev, 4571 AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 4572 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && 4573 !amdgpu_device_has_display_hardware(adev)) { 4574 r = psp_gpu_reset(adev); 4575 } else { 4576 tmp = amdgpu_reset_method; 4577 /* It should do a default reset when loading or reloading the driver, 4578 * regardless of the module parameter reset_method. 4579 */ 4580 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4581 r = amdgpu_asic_reset(adev); 4582 amdgpu_reset_method = tmp; 4583 } 4584 4585 if (r) { 4586 dev_err(adev->dev, "asic reset on init failed\n"); 4587 goto failed; 4588 } 4589 } 4590 4591 /* Post card if necessary */ 4592 if (amdgpu_device_need_post(adev)) { 4593 if (!adev->bios) { 4594 dev_err(adev->dev, "no vBIOS found\n"); 4595 r = -EINVAL; 4596 goto failed; 4597 } 4598 DRM_INFO("GPU posting now...\n"); 4599 r = amdgpu_device_asic_init(adev); 4600 if (r) { 4601 dev_err(adev->dev, "gpu post error!\n"); 4602 goto failed; 4603 } 4604 } 4605 4606 if (adev->bios) { 4607 if (adev->is_atom_fw) { 4608 /* Initialize clocks */ 4609 r = amdgpu_atomfirmware_get_clock_info(adev); 4610 if (r) { 4611 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4612 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4613 goto failed; 4614 } 4615 } else { 4616 /* Initialize clocks */ 4617 r = amdgpu_atombios_get_clock_info(adev); 4618 if (r) { 4619 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4620 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4621 goto failed; 4622 } 4623 /* init i2c buses */ 4624 amdgpu_i2c_init(adev); 4625 } 4626 } 4627 4628 fence_driver_init: 4629 /* Fence driver */ 4630 r = amdgpu_fence_driver_sw_init(adev); 4631 if (r) { 4632 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4633 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4634 goto failed; 4635 } 4636 4637 /* init the mode config */ 4638 drm_mode_config_init(adev_to_drm(adev)); 4639 4640 r = amdgpu_device_ip_init(adev); 4641 if (r) { 4642 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4643 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4644 goto release_ras_con; 4645 } 4646 4647 amdgpu_fence_driver_hw_init(adev); 4648 4649 dev_info(adev->dev, 4650 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4651 
adev->gfx.config.max_shader_engines, 4652 adev->gfx.config.max_sh_per_se, 4653 adev->gfx.config.max_cu_per_sh, 4654 adev->gfx.cu_info.number); 4655 4656 adev->accel_working = true; 4657 4658 amdgpu_vm_check_compute_bug(adev); 4659 4660 /* Initialize the buffer migration limit. */ 4661 if (amdgpu_moverate >= 0) 4662 max_MBps = amdgpu_moverate; 4663 else 4664 max_MBps = 8; /* Allow 8 MB/s. */ 4665 /* Get a log2 for easy divisions. */ 4666 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4667 4668 /* 4669 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4670 * Otherwise the mgpu fan boost feature will be skipped due to the 4671 * gpu instance is counted less. 4672 */ 4673 amdgpu_register_gpu_instance(adev); 4674 4675 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4676 * explicit gating rather than handling it automatically. 4677 */ 4678 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 4679 r = amdgpu_device_ip_late_init(adev); 4680 if (r) { 4681 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4682 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4683 goto release_ras_con; 4684 } 4685 /* must succeed. */ 4686 amdgpu_ras_resume(adev); 4687 queue_delayed_work(system_wq, &adev->delayed_init_work, 4688 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4689 } 4690 4691 if (amdgpu_sriov_vf(adev)) { 4692 amdgpu_virt_release_full_gpu(adev, true); 4693 flush_delayed_work(&adev->delayed_init_work); 4694 } 4695 4696 /* 4697 * Place those sysfs registering after `late_init`. As some of those 4698 * operations performed in `late_init` might affect the sysfs 4699 * interfaces creating. 4700 */ 4701 r = amdgpu_atombios_sysfs_init(adev); 4702 if (r) 4703 drm_err(&adev->ddev, 4704 "registering atombios sysfs failed (%d).\n", r); 4705 4706 r = amdgpu_pm_sysfs_init(adev); 4707 if (r) 4708 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 4709 4710 r = amdgpu_ucode_sysfs_init(adev); 4711 if (r) { 4712 adev->ucode_sysfs_en = false; 4713 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 4714 } else 4715 adev->ucode_sysfs_en = true; 4716 4717 r = amdgpu_device_attr_sysfs_init(adev); 4718 if (r) 4719 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4720 4721 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4722 if (r) 4723 dev_err(adev->dev, 4724 "Could not create amdgpu board attributes\n"); 4725 4726 amdgpu_fru_sysfs_init(adev); 4727 amdgpu_reg_state_sysfs_init(adev); 4728 amdgpu_xcp_cfg_sysfs_init(adev); 4729 4730 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4731 r = amdgpu_pmu_init(adev); 4732 if (r) 4733 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4734 4735 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4736 if (amdgpu_device_cache_pci_state(adev->pdev)) 4737 pci_restore_state(pdev); 4738 4739 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4740 /* this will fail for cards that aren't VGA class devices, just 4741 * ignore it 4742 */ 4743 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4744 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4745 4746 px = amdgpu_device_supports_px(ddev); 4747 4748 if (px || (!dev_is_removable(&adev->pdev->dev) && 4749 apple_gmux_detect(NULL, NULL))) 4750 vga_switcheroo_register_client(adev->pdev, 4751 &amdgpu_switcheroo_ops, px); 4752 4753 if (px) 4754 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4755 4756 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 4757 
amdgpu_xgmi_reset_on_init(adev); 4758 4759 amdgpu_device_check_iommu_direct_map(adev); 4760 4761 adev->pm_nb.notifier_call = amdgpu_device_pm_notifier; 4762 r = register_pm_notifier(&adev->pm_nb); 4763 if (r) 4764 goto failed; 4765 4766 return 0; 4767 4768 release_ras_con: 4769 if (amdgpu_sriov_vf(adev)) 4770 amdgpu_virt_release_full_gpu(adev, true); 4771 4772 /* failed in exclusive mode due to timeout */ 4773 if (amdgpu_sriov_vf(adev) && 4774 !amdgpu_sriov_runtime(adev) && 4775 amdgpu_virt_mmio_blocked(adev) && 4776 !amdgpu_virt_wait_reset(adev)) { 4777 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4778 /* Don't send request since VF is inactive. */ 4779 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4780 adev->virt.ops = NULL; 4781 r = -EAGAIN; 4782 } 4783 amdgpu_release_ras_context(adev); 4784 4785 failed: 4786 amdgpu_vf_error_trans_all(adev); 4787 4788 return r; 4789 } 4790 4791 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4792 { 4793 4794 /* Clear all CPU mappings pointing to this device */ 4795 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4796 4797 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4798 amdgpu_doorbell_fini(adev); 4799 4800 iounmap(adev->rmmio); 4801 adev->rmmio = NULL; 4802 if (adev->mman.aper_base_kaddr) 4803 iounmap(adev->mman.aper_base_kaddr); 4804 adev->mman.aper_base_kaddr = NULL; 4805 4806 /* Memory manager related */ 4807 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4808 arch_phys_wc_del(adev->gmc.vram_mtrr); 4809 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4810 } 4811 } 4812 4813 /** 4814 * amdgpu_device_fini_hw - tear down the driver 4815 * 4816 * @adev: amdgpu_device pointer 4817 * 4818 * Tear down the driver info (all asics). 4819 * Called at driver shutdown. 
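 *
 * Rough teardown ordering for reference (the callers follow the usual
 * PCI remove / DRM release flow and are only sketched here):
 *
 *   amdgpu_device_fini_hw(adev);  // on remove/unplug: stop hw, irqs, sysfs
 *   ...                           // last reference to the drm device drops
 *   amdgpu_device_fini_sw(adev);  // free the remaining software state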
4820 */ 4821 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4822 { 4823 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4824 flush_delayed_work(&adev->delayed_init_work); 4825 4826 if (adev->mman.initialized) 4827 drain_workqueue(adev->mman.bdev.wq); 4828 adev->shutdown = true; 4829 4830 unregister_pm_notifier(&adev->pm_nb); 4831 4832 /* make sure IB test finished before entering exclusive mode 4833 * to avoid preemption on IB test 4834 */ 4835 if (amdgpu_sriov_vf(adev)) { 4836 amdgpu_virt_request_full_gpu(adev, false); 4837 amdgpu_virt_fini_data_exchange(adev); 4838 } 4839 4840 /* disable all interrupts */ 4841 amdgpu_irq_disable_all(adev); 4842 if (adev->mode_info.mode_config_initialized) { 4843 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4844 drm_helper_force_disable_all(adev_to_drm(adev)); 4845 else 4846 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4847 } 4848 amdgpu_fence_driver_hw_fini(adev); 4849 4850 if (adev->pm.sysfs_initialized) 4851 amdgpu_pm_sysfs_fini(adev); 4852 if (adev->ucode_sysfs_en) 4853 amdgpu_ucode_sysfs_fini(adev); 4854 amdgpu_device_attr_sysfs_fini(adev); 4855 amdgpu_fru_sysfs_fini(adev); 4856 4857 amdgpu_reg_state_sysfs_fini(adev); 4858 amdgpu_xcp_cfg_sysfs_fini(adev); 4859 4860 /* disable ras feature must before hw fini */ 4861 amdgpu_ras_pre_fini(adev); 4862 4863 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4864 4865 amdgpu_device_ip_fini_early(adev); 4866 4867 amdgpu_irq_fini_hw(adev); 4868 4869 if (adev->mman.initialized) 4870 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4871 4872 amdgpu_gart_dummy_page_fini(adev); 4873 4874 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4875 amdgpu_device_unmap_mmio(adev); 4876 4877 } 4878 4879 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4880 { 4881 int i, idx; 4882 bool px; 4883 4884 amdgpu_device_ip_fini(adev); 4885 amdgpu_fence_driver_sw_fini(adev); 4886 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4887 adev->accel_working = false; 4888 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4889 for (i = 0; i < MAX_XCP; ++i) { 4890 dma_fence_put(adev->isolation[i].spearhead); 4891 amdgpu_sync_free(&adev->isolation[i].active); 4892 amdgpu_sync_free(&adev->isolation[i].prev); 4893 } 4894 4895 amdgpu_reset_fini(adev); 4896 4897 /* free i2c buses */ 4898 amdgpu_i2c_fini(adev); 4899 4900 if (adev->bios) { 4901 if (amdgpu_emu_mode != 1) 4902 amdgpu_atombios_fini(adev); 4903 amdgpu_bios_release(adev); 4904 } 4905 4906 kfree(adev->fru_info); 4907 adev->fru_info = NULL; 4908 4909 kfree(adev->xcp_mgr); 4910 adev->xcp_mgr = NULL; 4911 4912 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4913 4914 if (px || (!dev_is_removable(&adev->pdev->dev) && 4915 apple_gmux_detect(NULL, NULL))) 4916 vga_switcheroo_unregister_client(adev->pdev); 4917 4918 if (px) 4919 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4920 4921 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4922 vga_client_unregister(adev->pdev); 4923 4924 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4925 4926 iounmap(adev->rmmio); 4927 adev->rmmio = NULL; 4928 drm_dev_exit(idx); 4929 } 4930 4931 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4932 amdgpu_pmu_fini(adev); 4933 if (adev->mman.discovery_bin) 4934 amdgpu_discovery_fini(adev); 4935 4936 amdgpu_reset_put_reset_domain(adev->reset_domain); 4937 adev->reset_domain = NULL; 4938 4939 kfree(adev->pci_state); 4940 4941 } 4942 4943 /** 4944 * amdgpu_device_evict_resources - evict device resources 4945 * @adev: amdgpu device object 4946 * 4947 * Evicts all ttm device 
resources(vram BOs, gart table) from the lru list 4948 * of the vram memory type. Mainly used for evicting device resources 4949 * at suspend time. 4950 * 4951 */ 4952 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4953 { 4954 int ret; 4955 4956 /* No need to evict vram on APUs unless going to S4 */ 4957 if (!adev->in_s4 && (adev->flags & AMD_IS_APU)) 4958 return 0; 4959 4960 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4961 if (ret) 4962 DRM_WARN("evicting device resources failed\n"); 4963 return ret; 4964 } 4965 4966 /* 4967 * Suspend & resume. 4968 */ 4969 /** 4970 * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events 4971 * @nb: notifier block 4972 * @mode: suspend mode 4973 * @data: data 4974 * 4975 * This function is called when the system is about to suspend or hibernate. 4976 * It is used to evict resources from the device before the system goes to 4977 * sleep while there is still access to swap. 4978 */ 4979 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 4980 void *data) 4981 { 4982 struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb); 4983 int r; 4984 4985 switch (mode) { 4986 case PM_HIBERNATION_PREPARE: 4987 adev->in_s4 = true; 4988 fallthrough; 4989 case PM_SUSPEND_PREPARE: 4990 r = amdgpu_device_evict_resources(adev); 4991 /* 4992 * This is considered non-fatal at this time because 4993 * amdgpu_device_prepare() will also fatally evict resources. 4994 * See https://gitlab.freedesktop.org/drm/amd/-/issues/3781 4995 */ 4996 if (r) 4997 drm_warn(adev_to_drm(adev), "Failed to evict resources, freeze active processes if problems occur: %d\n", r); 4998 break; 4999 } 5000 5001 return NOTIFY_DONE; 5002 } 5003 5004 /** 5005 * amdgpu_device_prepare - prepare for device suspend 5006 * 5007 * @dev: drm dev pointer 5008 * 5009 * Prepare to put the hw in the suspend state (all asics). 5010 * Returns 0 for success or an error on failure. 5011 * Called at driver suspend. 5012 */ 5013 int amdgpu_device_prepare(struct drm_device *dev) 5014 { 5015 struct amdgpu_device *adev = drm_to_adev(dev); 5016 int i, r; 5017 5018 amdgpu_choose_low_power_state(adev); 5019 5020 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5021 return 0; 5022 5023 /* Evict the majority of BOs before starting suspend sequence */ 5024 r = amdgpu_device_evict_resources(adev); 5025 if (r) 5026 goto unprepare; 5027 5028 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 5029 5030 for (i = 0; i < adev->num_ip_blocks; i++) { 5031 if (!adev->ip_blocks[i].status.valid) 5032 continue; 5033 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 5034 continue; 5035 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]); 5036 if (r) 5037 goto unprepare; 5038 } 5039 5040 return 0; 5041 5042 unprepare: 5043 adev->in_s0ix = adev->in_s3 = adev->in_s4 = false; 5044 5045 return r; 5046 } 5047 5048 /** 5049 * amdgpu_device_suspend - initiate device suspend 5050 * 5051 * @dev: drm dev pointer 5052 * @notify_clients: notify in-kernel DRM clients 5053 * 5054 * Puts the hw in the suspend state (all asics). 5055 * Returns 0 for success or an error on failure. 5056 * Called at driver suspend. 
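 *
 * Sketch of the system sleep sequence as driven by the PM callbacks (the
 * wrappers that set adev->in_s0ix/in_s3/in_s4 are assumed, not shown):
 *
 *   amdgpu_device_prepare(dev);        // evict BOs, per-IP prepare_suspend
 *   amdgpu_device_suspend(dev, true);  // phase1, evict, fences, phase2
 *   ...                                // platform sleep state
 *   amdgpu_device_resume(dev, true);   // post, ip resume, late init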
5057 */ 5058 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) 5059 { 5060 struct amdgpu_device *adev = drm_to_adev(dev); 5061 int r = 0; 5062 5063 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5064 return 0; 5065 5066 adev->in_suspend = true; 5067 5068 if (amdgpu_sriov_vf(adev)) { 5069 amdgpu_virt_fini_data_exchange(adev); 5070 r = amdgpu_virt_request_full_gpu(adev, false); 5071 if (r) 5072 return r; 5073 } 5074 5075 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 5076 DRM_WARN("smart shift update failed\n"); 5077 5078 if (notify_clients) 5079 drm_client_dev_suspend(adev_to_drm(adev), false); 5080 5081 cancel_delayed_work_sync(&adev->delayed_init_work); 5082 5083 amdgpu_ras_suspend(adev); 5084 5085 amdgpu_device_ip_suspend_phase1(adev); 5086 5087 if (!adev->in_s0ix) { 5088 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 5089 #ifdef CONFIG_DRM_AMDGPU_NAVI3X_USERQ 5090 amdgpu_userq_suspend(adev); 5091 #endif 5092 } 5093 5094 r = amdgpu_device_evict_resources(adev); 5095 if (r) 5096 return r; 5097 5098 amdgpu_ttm_set_buffer_funcs_status(adev, false); 5099 5100 amdgpu_fence_driver_hw_fini(adev); 5101 5102 amdgpu_device_ip_suspend_phase2(adev); 5103 5104 if (amdgpu_sriov_vf(adev)) 5105 amdgpu_virt_release_full_gpu(adev, false); 5106 5107 r = amdgpu_dpm_notify_rlc_state(adev, false); 5108 if (r) 5109 return r; 5110 5111 return 0; 5112 } 5113 5114 /** 5115 * amdgpu_device_resume - initiate device resume 5116 * 5117 * @dev: drm dev pointer 5118 * @notify_clients: notify in-kernel DRM clients 5119 * 5120 * Bring the hw back to operating state (all asics). 5121 * Returns 0 for success or an error on failure. 5122 * Called at driver resume. 5123 */ 5124 int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) 5125 { 5126 struct amdgpu_device *adev = drm_to_adev(dev); 5127 int r = 0; 5128 5129 if (amdgpu_sriov_vf(adev)) { 5130 r = amdgpu_virt_request_full_gpu(adev, true); 5131 if (r) 5132 return r; 5133 } 5134 5135 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5136 return 0; 5137 5138 if (adev->in_s0ix) 5139 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 5140 5141 /* post card */ 5142 if (amdgpu_device_need_post(adev)) { 5143 r = amdgpu_device_asic_init(adev); 5144 if (r) 5145 dev_err(adev->dev, "amdgpu asic init failed\n"); 5146 } 5147 5148 r = amdgpu_device_ip_resume(adev); 5149 5150 if (r) { 5151 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 5152 goto exit; 5153 } 5154 5155 if (!adev->in_s0ix) { 5156 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 5157 if (r) 5158 goto exit; 5159 #ifdef CONFIG_DRM_AMDGPU_NAVI3X_USERQ 5160 r = amdgpu_userq_resume(adev); 5161 if (r) 5162 goto exit; 5163 #endif 5164 } 5165 5166 r = amdgpu_device_ip_late_init(adev); 5167 if (r) 5168 goto exit; 5169 5170 queue_delayed_work(system_wq, &adev->delayed_init_work, 5171 msecs_to_jiffies(AMDGPU_RESUME_MS)); 5172 exit: 5173 if (amdgpu_sriov_vf(adev)) { 5174 amdgpu_virt_init_data_exchange(adev); 5175 amdgpu_virt_release_full_gpu(adev, true); 5176 } 5177 5178 if (r) 5179 return r; 5180 5181 /* Make sure IB tests flushed */ 5182 flush_delayed_work(&adev->delayed_init_work); 5183 5184 if (notify_clients) 5185 drm_client_dev_resume(adev_to_drm(adev), false); 5186 5187 amdgpu_ras_resume(adev); 5188 5189 if (adev->mode_info.num_crtc) { 5190 /* 5191 * Most of the connector probing functions try to acquire runtime pm 5192 * refs to ensure that the GPU is powered on when connector polling is 5193 * performed. 
Since we're calling this from a runtime PM callback, 5194 * trying to acquire rpm refs will cause us to deadlock. 5195 * 5196 * Since we're guaranteed to be holding the rpm lock, it's safe to 5197 * temporarily disable the rpm helpers so this doesn't deadlock us. 5198 */ 5199 #ifdef CONFIG_PM 5200 dev->dev->power.disable_depth++; 5201 #endif 5202 if (!adev->dc_enabled) 5203 drm_helper_hpd_irq_event(dev); 5204 else 5205 drm_kms_helper_hotplug_event(dev); 5206 #ifdef CONFIG_PM 5207 dev->dev->power.disable_depth--; 5208 #endif 5209 } 5210 adev->in_suspend = false; 5211 5212 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 5213 DRM_WARN("smart shift update failed\n"); 5214 5215 return 0; 5216 } 5217 5218 /** 5219 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 5220 * 5221 * @adev: amdgpu_device pointer 5222 * 5223 * The list of all the hardware IPs that make up the asic is walked and 5224 * the check_soft_reset callbacks are run. check_soft_reset determines 5225 * if the asic is still hung or not. 5226 * Returns true if any of the IPs are still in a hung state, false if not. 5227 */ 5228 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 5229 { 5230 int i; 5231 bool asic_hang = false; 5232 5233 if (amdgpu_sriov_vf(adev)) 5234 return true; 5235 5236 if (amdgpu_asic_need_full_reset(adev)) 5237 return true; 5238 5239 for (i = 0; i < adev->num_ip_blocks; i++) { 5240 if (!adev->ip_blocks[i].status.valid) 5241 continue; 5242 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 5243 adev->ip_blocks[i].status.hang = 5244 adev->ip_blocks[i].version->funcs->check_soft_reset( 5245 &adev->ip_blocks[i]); 5246 if (adev->ip_blocks[i].status.hang) { 5247 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 5248 asic_hang = true; 5249 } 5250 } 5251 return asic_hang; 5252 } 5253 5254 /** 5255 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 5256 * 5257 * @adev: amdgpu_device pointer 5258 * 5259 * The list of all the hardware IPs that make up the asic is walked and the 5260 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 5261 * handles any IP specific hardware or software state changes that are 5262 * necessary for a soft reset to succeed. 5263 * Returns 0 on success, negative error code on failure. 5264 */ 5265 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 5266 { 5267 int i, r = 0; 5268 5269 for (i = 0; i < adev->num_ip_blocks; i++) { 5270 if (!adev->ip_blocks[i].status.valid) 5271 continue; 5272 if (adev->ip_blocks[i].status.hang && 5273 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 5274 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]); 5275 if (r) 5276 return r; 5277 } 5278 } 5279 5280 return 0; 5281 } 5282 5283 /** 5284 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 5285 * 5286 * @adev: amdgpu_device pointer 5287 * 5288 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 5289 * reset is necessary to recover. 5290 * Returns true if a full asic reset is required, false if not. 
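 *
 * For example, a reported hang in GMC, SMC, ACP, DCE or PSP cannot be
 * handled by a soft reset and escalates to a full ASIC reset, as encoded
 * in the type checks below.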
5291 */ 5292 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 5293 { 5294 int i; 5295 5296 if (amdgpu_asic_need_full_reset(adev)) 5297 return true; 5298 5299 for (i = 0; i < adev->num_ip_blocks; i++) { 5300 if (!adev->ip_blocks[i].status.valid) 5301 continue; 5302 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 5303 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 5304 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 5305 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 5306 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 5307 if (adev->ip_blocks[i].status.hang) { 5308 dev_info(adev->dev, "Some block need full reset!\n"); 5309 return true; 5310 } 5311 } 5312 } 5313 return false; 5314 } 5315 5316 /** 5317 * amdgpu_device_ip_soft_reset - do a soft reset 5318 * 5319 * @adev: amdgpu_device pointer 5320 * 5321 * The list of all the hardware IPs that make up the asic is walked and the 5322 * soft_reset callbacks are run if the block is hung. soft_reset handles any 5323 * IP specific hardware or software state changes that are necessary to soft 5324 * reset the IP. 5325 * Returns 0 on success, negative error code on failure. 5326 */ 5327 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 5328 { 5329 int i, r = 0; 5330 5331 for (i = 0; i < adev->num_ip_blocks; i++) { 5332 if (!adev->ip_blocks[i].status.valid) 5333 continue; 5334 if (adev->ip_blocks[i].status.hang && 5335 adev->ip_blocks[i].version->funcs->soft_reset) { 5336 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]); 5337 if (r) 5338 return r; 5339 } 5340 } 5341 5342 return 0; 5343 } 5344 5345 /** 5346 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 5347 * 5348 * @adev: amdgpu_device pointer 5349 * 5350 * The list of all the hardware IPs that make up the asic is walked and the 5351 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 5352 * handles any IP specific hardware or software state changes that are 5353 * necessary after the IP has been soft reset. 5354 * Returns 0 on success, negative error code on failure. 
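 *
 * When a soft reset is attempted at all, the helpers above are driven in
 * this order from amdgpu_device_pre_asic_reset():
 *
 *   amdgpu_device_ip_check_soft_reset(adev); // which blocks are hung?
 *   amdgpu_device_ip_pre_soft_reset(adev);
 *   amdgpu_device_ip_soft_reset(adev);
 *   amdgpu_device_ip_post_soft_reset(adev);
 *   amdgpu_device_ip_check_soft_reset(adev); // still hung -> full reset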
5355 */ 5356 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 5357 { 5358 int i, r = 0; 5359 5360 for (i = 0; i < adev->num_ip_blocks; i++) { 5361 if (!adev->ip_blocks[i].status.valid) 5362 continue; 5363 if (adev->ip_blocks[i].status.hang && 5364 adev->ip_blocks[i].version->funcs->post_soft_reset) 5365 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]); 5366 if (r) 5367 return r; 5368 } 5369 5370 return 0; 5371 } 5372 5373 /** 5374 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 5375 * 5376 * @adev: amdgpu_device pointer 5377 * @reset_context: amdgpu reset context pointer 5378 * 5379 * do VF FLR and reinitialize Asic 5380 * return 0 means succeeded otherwise failed 5381 */ 5382 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 5383 struct amdgpu_reset_context *reset_context) 5384 { 5385 int r; 5386 struct amdgpu_hive_info *hive = NULL; 5387 5388 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { 5389 if (!amdgpu_ras_get_fed_status(adev)) 5390 amdgpu_virt_ready_to_reset(adev); 5391 amdgpu_virt_wait_reset(adev); 5392 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5393 r = amdgpu_virt_request_full_gpu(adev, true); 5394 } else { 5395 r = amdgpu_virt_reset_gpu(adev); 5396 } 5397 if (r) 5398 return r; 5399 5400 amdgpu_ras_clear_err_state(adev); 5401 amdgpu_irq_gpu_reset_resume_helper(adev); 5402 5403 /* some sw clean up VF needs to do before recover */ 5404 amdgpu_virt_post_reset(adev); 5405 5406 /* Resume IP prior to SMC */ 5407 r = amdgpu_device_ip_reinit_early_sriov(adev); 5408 if (r) 5409 return r; 5410 5411 amdgpu_virt_init_data_exchange(adev); 5412 5413 r = amdgpu_device_fw_loading(adev); 5414 if (r) 5415 return r; 5416 5417 /* now we are okay to resume SMC/CP/SDMA */ 5418 r = amdgpu_device_ip_reinit_late_sriov(adev); 5419 if (r) 5420 return r; 5421 5422 hive = amdgpu_get_xgmi_hive(adev); 5423 /* Update PSP FW topology after reset */ 5424 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5425 r = amdgpu_xgmi_update_topology(hive, adev); 5426 if (hive) 5427 amdgpu_put_xgmi_hive(hive); 5428 if (r) 5429 return r; 5430 5431 r = amdgpu_ib_ring_tests(adev); 5432 if (r) 5433 return r; 5434 5435 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) 5436 amdgpu_inc_vram_lost(adev); 5437 5438 /* need to be called during full access so we can't do it later like 5439 * bare-metal does. 5440 */ 5441 amdgpu_amdkfd_post_reset(adev); 5442 amdgpu_virt_release_full_gpu(adev, true); 5443 5444 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5445 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) || 5446 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 5447 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 5448 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) || 5449 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5450 amdgpu_ras_resume(adev); 5451 5452 amdgpu_virt_ras_telemetry_post_reset(adev); 5453 5454 return 0; 5455 } 5456 5457 /** 5458 * amdgpu_device_has_job_running - check if there is any unfinished job 5459 * 5460 * @adev: amdgpu_device pointer 5461 * 5462 * check if there is any job running on the device when guest driver receives 5463 * FLR notification from host driver. If there are still jobs running, then 5464 * the guest driver will not respond the FLR reset. Instead, let the job hit 5465 * the timeout and guest driver then issue the reset request. 
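 *
 * Concretely (see the loop below): any ring whose scheduler is ready and
 * whose amdgpu_fence_count_emitted() is non-zero still has unfinished
 * jobs, so the function returns true.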
5466 */ 5467 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5468 { 5469 int i; 5470 5471 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5472 struct amdgpu_ring *ring = adev->rings[i]; 5473 5474 if (!amdgpu_ring_sched_ready(ring)) 5475 continue; 5476 5477 if (amdgpu_fence_count_emitted(ring)) 5478 return true; 5479 } 5480 return false; 5481 } 5482 5483 /** 5484 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5485 * 5486 * @adev: amdgpu_device pointer 5487 * 5488 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5489 * a hung GPU. 5490 */ 5491 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5492 { 5493 5494 if (amdgpu_gpu_recovery == 0) 5495 goto disabled; 5496 5497 /* Skip soft reset check in fatal error mode */ 5498 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5499 return true; 5500 5501 if (amdgpu_sriov_vf(adev)) 5502 return true; 5503 5504 if (amdgpu_gpu_recovery == -1) { 5505 switch (adev->asic_type) { 5506 #ifdef CONFIG_DRM_AMDGPU_SI 5507 case CHIP_VERDE: 5508 case CHIP_TAHITI: 5509 case CHIP_PITCAIRN: 5510 case CHIP_OLAND: 5511 case CHIP_HAINAN: 5512 #endif 5513 #ifdef CONFIG_DRM_AMDGPU_CIK 5514 case CHIP_KAVERI: 5515 case CHIP_KABINI: 5516 case CHIP_MULLINS: 5517 #endif 5518 case CHIP_CARRIZO: 5519 case CHIP_STONEY: 5520 case CHIP_CYAN_SKILLFISH: 5521 goto disabled; 5522 default: 5523 break; 5524 } 5525 } 5526 5527 return true; 5528 5529 disabled: 5530 dev_info(adev->dev, "GPU recovery disabled.\n"); 5531 return false; 5532 } 5533 5534 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5535 { 5536 u32 i; 5537 int ret = 0; 5538 5539 if (adev->bios) 5540 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5541 5542 dev_info(adev->dev, "GPU mode1 reset\n"); 5543 5544 /* Cache the state before bus master disable. The saved config space 5545 * values are used in other cases like restore after mode-2 reset. 
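 *
 * For reference, the mode1 sequence implemented below is roughly:
 *
 *   amdgpu_device_cache_pci_state(adev->pdev);  // save config space
 *   pci_clear_master(adev->pdev);               // disable bus mastering
 *   amdgpu_dpm_mode1_reset() or psp_gpu_reset();
 *   amdgpu_device_load_pci_state(adev->pdev);   // restore config space
 *   amdgpu_psp_wait_for_bootloader(adev);
 *   poll nbio get_memsize() until it reads back != 0xffffffff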
5546 */ 5547 amdgpu_device_cache_pci_state(adev->pdev); 5548 5549 /* disable BM */ 5550 pci_clear_master(adev->pdev); 5551 5552 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5553 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5554 ret = amdgpu_dpm_mode1_reset(adev); 5555 } else { 5556 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5557 ret = psp_gpu_reset(adev); 5558 } 5559 5560 if (ret) 5561 goto mode1_reset_failed; 5562 5563 amdgpu_device_load_pci_state(adev->pdev); 5564 ret = amdgpu_psp_wait_for_bootloader(adev); 5565 if (ret) 5566 goto mode1_reset_failed; 5567 5568 /* wait for asic to come out of reset */ 5569 for (i = 0; i < adev->usec_timeout; i++) { 5570 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5571 5572 if (memsize != 0xffffffff) 5573 break; 5574 udelay(1); 5575 } 5576 5577 if (i >= adev->usec_timeout) { 5578 ret = -ETIMEDOUT; 5579 goto mode1_reset_failed; 5580 } 5581 5582 if (adev->bios) 5583 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5584 5585 return 0; 5586 5587 mode1_reset_failed: 5588 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5589 return ret; 5590 } 5591 5592 int amdgpu_device_link_reset(struct amdgpu_device *adev) 5593 { 5594 int ret = 0; 5595 5596 dev_info(adev->dev, "GPU link reset\n"); 5597 5598 if (!adev->pcie_reset_ctx.occurs_dpc) 5599 ret = amdgpu_dpm_link_reset(adev); 5600 5601 if (ret) 5602 goto link_reset_failed; 5603 5604 ret = amdgpu_psp_wait_for_bootloader(adev); 5605 if (ret) 5606 goto link_reset_failed; 5607 5608 return 0; 5609 5610 link_reset_failed: 5611 dev_err(adev->dev, "GPU link reset failed\n"); 5612 return ret; 5613 } 5614 5615 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5616 struct amdgpu_reset_context *reset_context) 5617 { 5618 int i, r = 0; 5619 struct amdgpu_job *job = NULL; 5620 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; 5621 bool need_full_reset = 5622 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5623 5624 if (reset_context->reset_req_dev == adev) 5625 job = reset_context->job; 5626 5627 if (amdgpu_sriov_vf(adev)) 5628 amdgpu_virt_pre_reset(adev); 5629 5630 amdgpu_fence_driver_isr_toggle(adev, true); 5631 5632 /* block all schedulers and reset given job's ring */ 5633 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5634 struct amdgpu_ring *ring = adev->rings[i]; 5635 5636 if (!amdgpu_ring_sched_ready(ring)) 5637 continue; 5638 5639 /* Clear job fence from fence drv to avoid force_completion 5640 * leave NULL and vm flush fence in fence drv 5641 */ 5642 amdgpu_fence_driver_clear_job_fences(ring); 5643 5644 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5645 amdgpu_fence_driver_force_completion(ring); 5646 } 5647 5648 amdgpu_fence_driver_isr_toggle(adev, false); 5649 5650 if (job && job->vm) 5651 drm_sched_increase_karma(&job->base); 5652 5653 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5654 /* If reset handler not implemented, continue; otherwise return */ 5655 if (r == -EOPNOTSUPP) 5656 r = 0; 5657 else 5658 return r; 5659 5660 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5661 if (!amdgpu_sriov_vf(adev)) { 5662 5663 if (!need_full_reset) 5664 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5665 5666 if (!need_full_reset && amdgpu_gpu_recovery && 5667 amdgpu_device_ip_check_soft_reset(adev)) { 5668 amdgpu_device_ip_pre_soft_reset(adev); 5669 r = amdgpu_device_ip_soft_reset(adev); 5670 amdgpu_device_ip_post_soft_reset(adev); 5671 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5672 
dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5673 need_full_reset = true; 5674 } 5675 } 5676 5677 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { 5678 dev_info(tmp_adev->dev, "Dumping IP State\n"); 5679 /* Trigger ip dump before we reset the asic */ 5680 for (i = 0; i < tmp_adev->num_ip_blocks; i++) 5681 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) 5682 tmp_adev->ip_blocks[i].version->funcs 5683 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]); 5684 dev_info(tmp_adev->dev, "Dumping IP State Completed\n"); 5685 } 5686 5687 if (need_full_reset) 5688 r = amdgpu_device_ip_suspend(adev); 5689 if (need_full_reset) 5690 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5691 else 5692 clear_bit(AMDGPU_NEED_FULL_RESET, 5693 &reset_context->flags); 5694 } 5695 5696 return r; 5697 } 5698 5699 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) 5700 { 5701 struct list_head *device_list_handle; 5702 bool full_reset, vram_lost = false; 5703 struct amdgpu_device *tmp_adev; 5704 int r, init_level; 5705 5706 device_list_handle = reset_context->reset_device_list; 5707 5708 if (!device_list_handle) 5709 return -EINVAL; 5710 5711 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5712 5713 /** 5714 * If it's reset on init, it's default init level, otherwise keep level 5715 * as recovery level. 5716 */ 5717 if (reset_context->method == AMD_RESET_METHOD_ON_INIT) 5718 init_level = AMDGPU_INIT_LEVEL_DEFAULT; 5719 else 5720 init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY; 5721 5722 r = 0; 5723 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5724 amdgpu_set_init_level(tmp_adev, init_level); 5725 if (full_reset) { 5726 /* post card */ 5727 amdgpu_ras_clear_err_state(tmp_adev); 5728 r = amdgpu_device_asic_init(tmp_adev); 5729 if (r) { 5730 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5731 } else { 5732 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5733 5734 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5735 if (r) 5736 goto out; 5737 5738 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5739 5740 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 5741 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job); 5742 5743 if (vram_lost) { 5744 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5745 amdgpu_inc_vram_lost(tmp_adev); 5746 } 5747 5748 r = amdgpu_device_fw_loading(tmp_adev); 5749 if (r) 5750 return r; 5751 5752 r = amdgpu_xcp_restore_partition_mode( 5753 tmp_adev->xcp_mgr); 5754 if (r) 5755 goto out; 5756 5757 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5758 if (r) 5759 goto out; 5760 5761 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 5762 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5763 5764 r = amdgpu_device_ip_resume_phase3(tmp_adev); 5765 if (r) 5766 goto out; 5767 5768 if (vram_lost) 5769 amdgpu_device_fill_reset_magic(tmp_adev); 5770 5771 /* 5772 * Add this ASIC as tracked as reset was already 5773 * complete successfully. 5774 */ 5775 amdgpu_register_gpu_instance(tmp_adev); 5776 5777 if (!reset_context->hive && 5778 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5779 amdgpu_xgmi_add_device(tmp_adev); 5780 5781 r = amdgpu_device_ip_late_init(tmp_adev); 5782 if (r) 5783 goto out; 5784 5785 drm_client_dev_resume(adev_to_drm(tmp_adev), false); 5786 5787 /* 5788 * The GPU enters bad state once faulty pages 5789 * by ECC has reached the threshold, and ras 5790 * recovery is scheduled next. 
So add one check 5791 * here to break recovery if it indeed exceeds 5792 * bad page threshold, and remind user to 5793 * retire this GPU or setting one bigger 5794 * bad_page_threshold value to fix this once 5795 * probing driver again. 5796 */ 5797 if (!amdgpu_ras_is_rma(tmp_adev)) { 5798 /* must succeed. */ 5799 amdgpu_ras_resume(tmp_adev); 5800 } else { 5801 r = -EINVAL; 5802 goto out; 5803 } 5804 5805 /* Update PSP FW topology after reset */ 5806 if (reset_context->hive && 5807 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5808 r = amdgpu_xgmi_update_topology( 5809 reset_context->hive, tmp_adev); 5810 } 5811 } 5812 5813 out: 5814 if (!r) { 5815 /* IP init is complete now, set level as default */ 5816 amdgpu_set_init_level(tmp_adev, 5817 AMDGPU_INIT_LEVEL_DEFAULT); 5818 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5819 r = amdgpu_ib_ring_tests(tmp_adev); 5820 if (r) { 5821 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5822 r = -EAGAIN; 5823 goto end; 5824 } 5825 } 5826 5827 if (r) 5828 tmp_adev->asic_reset_res = r; 5829 } 5830 5831 end: 5832 return r; 5833 } 5834 5835 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5836 struct amdgpu_reset_context *reset_context) 5837 { 5838 struct amdgpu_device *tmp_adev = NULL; 5839 bool need_full_reset, skip_hw_reset; 5840 int r = 0; 5841 5842 /* Try reset handler method first */ 5843 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5844 reset_list); 5845 5846 reset_context->reset_device_list = device_list_handle; 5847 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5848 /* If reset handler not implemented, continue; otherwise return */ 5849 if (r == -EOPNOTSUPP) 5850 r = 0; 5851 else 5852 return r; 5853 5854 /* Reset handler not implemented, use the default method */ 5855 need_full_reset = 5856 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5857 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5858 5859 /* 5860 * ASIC reset has to be done on all XGMI hive nodes ASAP 5861 * to allow proper links negotiation in FW (within 1 sec) 5862 */ 5863 if (!skip_hw_reset && need_full_reset) { 5864 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5865 /* For XGMI run all resets in parallel to speed up the process */ 5866 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5867 if (!queue_work(system_unbound_wq, 5868 &tmp_adev->xgmi_reset_work)) 5869 r = -EALREADY; 5870 } else 5871 r = amdgpu_asic_reset(tmp_adev); 5872 5873 if (r) { 5874 dev_err(tmp_adev->dev, 5875 "ASIC reset failed with error, %d for drm dev, %s", 5876 r, adev_to_drm(tmp_adev)->unique); 5877 goto out; 5878 } 5879 } 5880 5881 /* For XGMI wait for all resets to complete before proceed */ 5882 if (!r) { 5883 list_for_each_entry(tmp_adev, device_list_handle, 5884 reset_list) { 5885 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5886 flush_work(&tmp_adev->xgmi_reset_work); 5887 r = tmp_adev->asic_reset_res; 5888 if (r) 5889 break; 5890 } 5891 } 5892 } 5893 } 5894 5895 if (!r && amdgpu_ras_intr_triggered()) { 5896 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5897 amdgpu_ras_reset_error_count(tmp_adev, 5898 AMDGPU_RAS_BLOCK__MMHUB); 5899 } 5900 5901 amdgpu_ras_intr_cleared(); 5902 } 5903 5904 r = amdgpu_device_reinit_after_reset(reset_context); 5905 if (r == -EAGAIN) 5906 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5907 else 5908 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5909 5910 out: 5911 return r; 5912 } 5913 5914 static void 
amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5915 { 5916 5917 switch (amdgpu_asic_reset_method(adev)) { 5918 case AMD_RESET_METHOD_MODE1: 5919 case AMD_RESET_METHOD_LINK: 5920 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5921 break; 5922 case AMD_RESET_METHOD_MODE2: 5923 adev->mp1_state = PP_MP1_STATE_RESET; 5924 break; 5925 default: 5926 adev->mp1_state = PP_MP1_STATE_NONE; 5927 break; 5928 } 5929 } 5930 5931 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5932 { 5933 amdgpu_vf_error_trans_all(adev); 5934 adev->mp1_state = PP_MP1_STATE_NONE; 5935 } 5936 5937 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5938 { 5939 struct pci_dev *p = NULL; 5940 5941 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5942 adev->pdev->bus->number, 1); 5943 if (p) { 5944 pm_runtime_enable(&(p->dev)); 5945 pm_runtime_resume(&(p->dev)); 5946 } 5947 5948 pci_dev_put(p); 5949 } 5950 5951 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5952 { 5953 enum amd_reset_method reset_method; 5954 struct pci_dev *p = NULL; 5955 u64 expires; 5956 5957 /* 5958 * For now, only BACO and mode1 reset are confirmed to suffer 5959 * from the audio issue when the audio device is not properly suspended. 5960 */ 5961 reset_method = amdgpu_asic_reset_method(adev); 5962 if ((reset_method != AMD_RESET_METHOD_BACO) && 5963 (reset_method != AMD_RESET_METHOD_MODE1)) 5964 return -EINVAL; 5965 5966 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5967 adev->pdev->bus->number, 1); 5968 if (!p) 5969 return -ENODEV; 5970 5971 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5972 if (!expires) 5973 /* 5974 * If we cannot get the audio device's autosuspend delay, 5975 * use a fixed 4s interval. The audio controller's default 5976 * autosuspend delay is 3s, so the 4s used here is guaranteed 5977 * to cover it. 5978 */ 5979 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5980 5981 while (!pm_runtime_status_suspended(&(p->dev))) { 5982 if (!pm_runtime_suspend(&(p->dev))) 5983 break; 5984 5985 if (expires < ktime_get_mono_fast_ns()) { 5986 dev_warn(adev->dev, "failed to suspend display audio\n"); 5987 pci_dev_put(p); 5988 /* TODO: abort the subsequent gpu reset?
*/ 5989 return -ETIMEDOUT; 5990 } 5991 } 5992 5993 pm_runtime_disable(&(p->dev)); 5994 5995 pci_dev_put(p); 5996 return 0; 5997 } 5998 5999 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 6000 { 6001 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 6002 6003 #if defined(CONFIG_DEBUG_FS) 6004 if (!amdgpu_sriov_vf(adev)) 6005 cancel_work(&adev->reset_work); 6006 #endif 6007 6008 if (adev->kfd.dev) 6009 cancel_work(&adev->kfd.reset_work); 6010 6011 if (amdgpu_sriov_vf(adev)) 6012 cancel_work(&adev->virt.flr_work); 6013 6014 if (con && adev->ras_enabled) 6015 cancel_work(&con->recovery_work); 6016 6017 } 6018 6019 static int amdgpu_device_health_check(struct list_head *device_list_handle) 6020 { 6021 struct amdgpu_device *tmp_adev; 6022 int ret = 0; 6023 u32 status; 6024 6025 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6026 pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status); 6027 if (PCI_POSSIBLE_ERROR(status)) { 6028 dev_err(tmp_adev->dev, "device lost from bus!"); 6029 ret = -ENODEV; 6030 } 6031 } 6032 6033 return ret; 6034 } 6035 6036 static int amdgpu_device_halt_activities(struct amdgpu_device *adev, 6037 struct amdgpu_job *job, 6038 struct amdgpu_reset_context *reset_context, 6039 struct list_head *device_list, 6040 struct amdgpu_hive_info *hive, 6041 bool need_emergency_restart) 6042 { 6043 struct list_head *device_list_handle = NULL; 6044 struct amdgpu_device *tmp_adev = NULL; 6045 int i, r = 0; 6046 6047 /* 6048 * Build list of devices to reset. 6049 * In case we are in XGMI hive mode, resort the device list 6050 * to put adev in the 1st position. 6051 */ 6052 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) { 6053 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 6054 list_add_tail(&tmp_adev->reset_list, device_list); 6055 if (adev->shutdown) 6056 tmp_adev->shutdown = true; 6057 if (adev->pcie_reset_ctx.occurs_dpc) 6058 tmp_adev->pcie_reset_ctx.in_link_reset = true; 6059 } 6060 if (!list_is_first(&adev->reset_list, device_list)) 6061 list_rotate_to_front(&adev->reset_list, device_list); 6062 device_list_handle = device_list; 6063 } else { 6064 list_add_tail(&adev->reset_list, device_list); 6065 device_list_handle = device_list; 6066 } 6067 6068 if (!amdgpu_sriov_vf(adev) && (!adev->pcie_reset_ctx.occurs_dpc)) { 6069 r = amdgpu_device_health_check(device_list_handle); 6070 if (r) 6071 return r; 6072 } 6073 6074 /* We need to lock reset domain only once both for XGMI and single device */ 6075 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 6076 reset_list); 6077 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 6078 6079 /* block all schedulers and reset given job's ring */ 6080 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6081 6082 amdgpu_device_set_mp1_state(tmp_adev); 6083 6084 /* 6085 * Try to put the audio codec into suspend state 6086 * before gpu reset started. 6087 * 6088 * Due to the power domain of the graphics device 6089 * is shared with AZ power domain. Without this, 6090 * we may change the audio hardware from behind 6091 * the audio driver's back. That will trigger 6092 * some audio codec errors. 
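 *
 * The suspend attempt below is best effort: on success
 * audio_suspended is recorded in pcie_reset_ctx so that
 * amdgpu_device_gpu_resume() re-enables and resumes the audio
 * function once recovery has finished.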
6093 */ 6094 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 6095 tmp_adev->pcie_reset_ctx.audio_suspended = true; 6096 6097 amdgpu_ras_set_error_query_ready(tmp_adev, false); 6098 6099 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 6100 6101 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context); 6102 6103 /* 6104 * Mark these ASICs to be reset as untracked first 6105 * And add them back after reset completed 6106 */ 6107 amdgpu_unregister_gpu_instance(tmp_adev); 6108 6109 drm_client_dev_suspend(adev_to_drm(tmp_adev), false); 6110 6111 /* disable ras on ALL IPs */ 6112 if (!need_emergency_restart && 6113 (!adev->pcie_reset_ctx.occurs_dpc) && 6114 amdgpu_device_ip_need_full_reset(tmp_adev)) 6115 amdgpu_ras_suspend(tmp_adev); 6116 6117 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6118 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6119 6120 if (!amdgpu_ring_sched_ready(ring)) 6121 continue; 6122 6123 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 6124 6125 if (need_emergency_restart) 6126 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 6127 } 6128 atomic_inc(&tmp_adev->gpu_reset_counter); 6129 } 6130 6131 return r; 6132 } 6133 6134 static int amdgpu_device_asic_reset(struct amdgpu_device *adev, 6135 struct list_head *device_list, 6136 struct amdgpu_reset_context *reset_context) 6137 { 6138 struct amdgpu_device *tmp_adev = NULL; 6139 int retry_limit = AMDGPU_MAX_RETRY_LIMIT; 6140 int r = 0; 6141 6142 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 6143 list_for_each_entry(tmp_adev, device_list, reset_list) { 6144 if (adev->pcie_reset_ctx.occurs_dpc) 6145 tmp_adev->no_hw_access = true; 6146 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 6147 if (adev->pcie_reset_ctx.occurs_dpc) 6148 tmp_adev->no_hw_access = false; 6149 /*TODO Should we stop ?*/ 6150 if (r) { 6151 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 6152 r, adev_to_drm(tmp_adev)->unique); 6153 tmp_adev->asic_reset_res = r; 6154 } 6155 } 6156 6157 /* Actual ASIC resets if needed.*/ 6158 /* Host driver will handle XGMI hive reset for SRIOV */ 6159 if (amdgpu_sriov_vf(adev)) { 6160 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { 6161 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); 6162 amdgpu_ras_set_fed(adev, true); 6163 set_bit(AMDGPU_HOST_FLR, &reset_context->flags); 6164 } 6165 6166 r = amdgpu_device_reset_sriov(adev, reset_context); 6167 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) { 6168 amdgpu_virt_release_full_gpu(adev, true); 6169 goto retry; 6170 } 6171 if (r) 6172 adev->asic_reset_res = r; 6173 } else { 6174 r = amdgpu_do_asic_reset(device_list, reset_context); 6175 if (r && r == -EAGAIN) 6176 goto retry; 6177 } 6178 6179 list_for_each_entry(tmp_adev, device_list, reset_list) { 6180 /* 6181 * Drop any pending non scheduler resets queued before reset is done. 6182 * Any reset scheduled after this point would be valid. Scheduler resets 6183 * were already dropped during drm_sched_stop and no new ones can come 6184 * in before drm_sched_start. 
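 *
 * Note that amdgpu_device_stop_pending_resets() uses cancel_work(),
 * which only removes work items that have not started executing;
 * it does not wait for an already running handler.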
6185 */ 6186 amdgpu_device_stop_pending_resets(tmp_adev); 6187 } 6188 6189 return r; 6190 } 6191 6192 static int amdgpu_device_sched_resume(struct list_head *device_list, 6193 struct amdgpu_reset_context *reset_context, 6194 bool job_signaled) 6195 { 6196 struct amdgpu_device *tmp_adev = NULL; 6197 int i, r = 0; 6198 6199 /* Post ASIC reset for all devices. */ 6200 list_for_each_entry(tmp_adev, device_list, reset_list) { 6201 6202 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6203 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6204 6205 if (!amdgpu_ring_sched_ready(ring)) 6206 continue; 6207 6208 drm_sched_start(&ring->sched, 0); 6209 } 6210 6211 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 6212 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 6213 6214 if (tmp_adev->asic_reset_res) 6215 r = tmp_adev->asic_reset_res; 6216 6217 tmp_adev->asic_reset_res = 0; 6218 6219 if (r) { 6220 /* bad news, how do we tell it to userspace? 6221 * for a ras error, we should report GPU bad status instead of 6222 * reset failure 6223 */ 6224 if (reset_context->src != AMDGPU_RESET_SRC_RAS || 6225 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) 6226 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", 6227 atomic_read(&tmp_adev->gpu_reset_counter)); 6228 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 6229 } else { 6230 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 6231 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 6232 DRM_WARN("smart shift update failed\n"); 6233 } 6234 } 6235 6236 return r; 6237 } 6238 6239 static void amdgpu_device_gpu_resume(struct amdgpu_device *adev, 6240 struct list_head *device_list, 6241 bool need_emergency_restart) 6242 { 6243 struct amdgpu_device *tmp_adev = NULL; 6244 6245 list_for_each_entry(tmp_adev, device_list, reset_list) { 6246 /* unlock kfd: SRIOV would do it separately */ 6247 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 6248 amdgpu_amdkfd_post_reset(tmp_adev); 6249 6250 /* kfd_post_reset will do nothing if the kfd device is not initialized, 6251 * so bring up kfd here if it wasn't initialized before 6252 */ 6253 if (!adev->kfd.init_complete) 6254 amdgpu_amdkfd_device_init(adev); 6255 6256 if (tmp_adev->pcie_reset_ctx.audio_suspended) 6257 amdgpu_device_resume_display_audio(tmp_adev); 6258 6259 amdgpu_device_unset_mp1_state(tmp_adev); 6260 6261 amdgpu_ras_set_error_query_ready(tmp_adev, true); 6262 6263 } 6264 6265 tmp_adev = list_first_entry(device_list, struct amdgpu_device, 6266 reset_list); 6267 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 6268 6269 } 6270 6271 6272 /** 6273 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 6274 * 6275 * @adev: amdgpu_device pointer 6276 * @job: the job which triggered the hang 6277 * @reset_context: amdgpu reset context pointer 6278 * 6279 * Attempt to reset the GPU if it has hung (all asics). 6280 * Attempt to do a soft reset or full reset and reinitialize the ASIC. 6281 * Returns 0 for success or an error on failure.
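 *
 * A minimal sketch of a caller-provided reset context, modeled on the
 * PCI slot reset path later in this file; real callers (such as the
 * job timeout handler) typically fill in additional fields::
 *
 *	struct amdgpu_reset_context reset_context;
 *
 *	memset(&reset_context, 0, sizeof(reset_context));
 *	reset_context.method = AMD_RESET_METHOD_NONE;
 *	reset_context.reset_req_dev = adev;
 *	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
 *	r = amdgpu_device_gpu_recover(adev, job, &reset_context);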
6282 */ 6283 6284 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 6285 struct amdgpu_job *job, 6286 struct amdgpu_reset_context *reset_context) 6287 { 6288 struct list_head device_list; 6289 bool job_signaled = false; 6290 struct amdgpu_hive_info *hive = NULL; 6291 int r = 0; 6292 bool need_emergency_restart = false; 6293 6294 /* 6295 * If it reaches here because of hang/timeout and a RAS error is 6296 * detected at the same time, let RAS recovery take care of it. 6297 */ 6298 if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) && 6299 !amdgpu_sriov_vf(adev) && 6300 reset_context->src != AMDGPU_RESET_SRC_RAS) { 6301 dev_dbg(adev->dev, 6302 "Gpu recovery from source: %d yielding to RAS error recovery handling", 6303 reset_context->src); 6304 return 0; 6305 } 6306 6307 /* 6308 * Special case: RAS triggered and full reset isn't supported 6309 */ 6310 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 6311 6312 /* 6313 * Flush RAM to disk so that after reboot 6314 * the user can read log and see why the system rebooted. 6315 */ 6316 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 6317 amdgpu_ras_get_context(adev)->reboot) { 6318 DRM_WARN("Emergency reboot."); 6319 6320 ksys_sync_helper(); 6321 emergency_restart(); 6322 } 6323 6324 dev_info(adev->dev, "GPU %s begin!\n", 6325 need_emergency_restart ? "jobs stop":"reset"); 6326 6327 if (!amdgpu_sriov_vf(adev)) 6328 hive = amdgpu_get_xgmi_hive(adev); 6329 if (hive) 6330 mutex_lock(&hive->hive_lock); 6331 6332 reset_context->job = job; 6333 reset_context->hive = hive; 6334 INIT_LIST_HEAD(&device_list); 6335 6336 r = amdgpu_device_halt_activities(adev, job, reset_context, &device_list, 6337 hive, need_emergency_restart); 6338 if (r) 6339 goto end_reset; 6340 6341 if (need_emergency_restart) 6342 goto skip_sched_resume; 6343 /* 6344 * Must check guilty signal here since after this point all old 6345 * HW fences are force signaled. 6346 * 6347 * job->base holds a reference to parent fence 6348 */ 6349 if (job && dma_fence_is_signaled(&job->hw_fence)) { 6350 job_signaled = true; 6351 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 6352 goto skip_hw_reset; 6353 } 6354 6355 r = amdgpu_device_asic_reset(adev, &device_list, reset_context); 6356 if (r) 6357 goto end_reset; 6358 skip_hw_reset: 6359 r = amdgpu_device_sched_resume(&device_list, reset_context, job_signaled); 6360 if (r) 6361 goto end_reset; 6362 skip_sched_resume: 6363 amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart); 6364 end_reset: 6365 if (hive) { 6366 mutex_unlock(&hive->hive_lock); 6367 amdgpu_put_xgmi_hive(hive); 6368 } 6369 6370 if (r) 6371 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 6372 6373 atomic_set(&adev->reset_domain->reset_res, r); 6374 6375 if (!r) 6376 drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE); 6377 6378 return r; 6379 } 6380 6381 /** 6382 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner 6383 * 6384 * @adev: amdgpu_device pointer 6385 * @speed: pointer to the speed of the link 6386 * @width: pointer to the width of the link 6387 * 6388 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6389 * first physical partner to an AMD dGPU. 6390 * This will exclude any virtual switches and links. 
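 *
 * When PCIe dynamic switching is supported, the capability of the first
 * non-AMD upstream bridge is reported; otherwise the currently
 * negotiated speed and width from pcie_bandwidth_available() are used.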
6391 */ 6392 static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev, 6393 enum pci_bus_speed *speed, 6394 enum pcie_link_width *width) 6395 { 6396 struct pci_dev *parent = adev->pdev; 6397 6398 if (!speed || !width) 6399 return; 6400 6401 *speed = PCI_SPEED_UNKNOWN; 6402 *width = PCIE_LNK_WIDTH_UNKNOWN; 6403 6404 if (amdgpu_device_pcie_dynamic_switching_supported(adev)) { 6405 while ((parent = pci_upstream_bridge(parent))) { 6406 /* skip upstream/downstream switches internal to the dGPU */ 6407 if (parent->vendor == PCI_VENDOR_ID_ATI) 6408 continue; 6409 *speed = pcie_get_speed_cap(parent); 6410 *width = pcie_get_width_cap(parent); 6411 break; 6412 } 6413 } else { 6414 /* use the current speeds rather than max if switching is not supported */ 6415 pcie_bandwidth_available(adev->pdev, NULL, speed, width); 6416 } 6417 } 6418 6419 /** 6420 * amdgpu_device_gpu_bandwidth - find the bandwidth of the GPU 6421 * 6422 * @adev: amdgpu_device pointer 6423 * @speed: pointer to the speed of the link 6424 * @width: pointer to the width of the link 6425 * 6426 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6427 * AMD dGPU, which may sit behind a virtual upstream bridge. 6428 */ 6429 static void amdgpu_device_gpu_bandwidth(struct amdgpu_device *adev, 6430 enum pci_bus_speed *speed, 6431 enum pcie_link_width *width) 6432 { 6433 struct pci_dev *parent = adev->pdev; 6434 6435 if (!speed || !width) 6436 return; 6437 6438 parent = pci_upstream_bridge(parent); 6439 if (parent && parent->vendor == PCI_VENDOR_ID_ATI) { 6440 /* use the upstream/downstream switches internal to the dGPU */ 6441 *speed = pcie_get_speed_cap(parent); 6442 *width = pcie_get_width_cap(parent); 6443 while ((parent = pci_upstream_bridge(parent))) { 6444 if (parent->vendor == PCI_VENDOR_ID_ATI) { 6445 /* use the upstream/downstream switches internal to the dGPU */ 6446 *speed = pcie_get_speed_cap(parent); 6447 *width = pcie_get_width_cap(parent); 6448 } 6449 } 6450 } else { 6451 /* use the device itself */ 6452 *speed = pcie_get_speed_cap(adev->pdev); 6453 *width = pcie_get_width_cap(adev->pdev); 6454 } 6455 } 6456 6457 /** 6458 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot 6459 * 6460 * @adev: amdgpu_device pointer 6461 * 6462 * Fetches and stores in the driver the PCIE capabilities (gen speed 6463 * and lanes) of the slot the device is in. Handles APUs and 6464 * virtualized environments where PCIE config space may not be available.
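 *
 * The computed masks can be overridden: when the amdgpu_pcie_gen_cap or
 * amdgpu_pcie_lane_cap module options (checked at the top of the
 * function) are set, those values are used verbatim.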
6465 */ 6466 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 6467 { 6468 enum pci_bus_speed speed_cap, platform_speed_cap; 6469 enum pcie_link_width platform_link_width, link_width; 6470 6471 if (amdgpu_pcie_gen_cap) 6472 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 6473 6474 if (amdgpu_pcie_lane_cap) 6475 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 6476 6477 /* covers APUs as well */ 6478 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 6479 if (adev->pm.pcie_gen_mask == 0) 6480 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 6481 if (adev->pm.pcie_mlw_mask == 0) 6482 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 6483 return; 6484 } 6485 6486 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 6487 return; 6488 6489 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap, 6490 &platform_link_width); 6491 amdgpu_device_gpu_bandwidth(adev, &speed_cap, &link_width); 6492 6493 if (adev->pm.pcie_gen_mask == 0) { 6494 /* asic caps */ 6495 if (speed_cap == PCI_SPEED_UNKNOWN) { 6496 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6497 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6498 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6499 } else { 6500 if (speed_cap == PCIE_SPEED_32_0GT) 6501 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6502 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6503 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6504 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6505 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 6506 else if (speed_cap == PCIE_SPEED_16_0GT) 6507 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6508 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6509 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6510 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 6511 else if (speed_cap == PCIE_SPEED_8_0GT) 6512 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6513 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6514 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6515 else if (speed_cap == PCIE_SPEED_5_0GT) 6516 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6517 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 6518 else 6519 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 6520 } 6521 /* platform caps */ 6522 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 6523 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6524 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6525 } else { 6526 if (platform_speed_cap == PCIE_SPEED_32_0GT) 6527 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6528 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6529 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6530 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6531 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 6532 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 6533 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6534 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6535 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6536 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 6537 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 6538 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6539 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6540 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 6541 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 6542 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6543 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6544 else 6545 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 6546 6547 } 6548 } 6549 if (adev->pm.pcie_mlw_mask == 0) { 6550 /* asic caps */ 6551 if (link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6552 
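/* width capability unknown; fall back to the driver's default ASIC width mask */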
adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_ASIC_PCIE_MLW_MASK; 6553 } else { 6554 switch (link_width) { 6555 case PCIE_LNK_X32: 6556 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X32 | 6557 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6558 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6559 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6560 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6561 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6562 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6563 break; 6564 case PCIE_LNK_X16: 6565 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6566 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6567 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6568 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6569 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6570 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6571 break; 6572 case PCIE_LNK_X12: 6573 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6574 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6575 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6576 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6577 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6578 break; 6579 case PCIE_LNK_X8: 6580 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6581 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6582 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6583 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6584 break; 6585 case PCIE_LNK_X4: 6586 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6587 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6588 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6589 break; 6590 case PCIE_LNK_X2: 6591 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6592 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6593 break; 6594 case PCIE_LNK_X1: 6595 adev->pm.pcie_mlw_mask |= CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1; 6596 break; 6597 default: 6598 break; 6599 } 6600 } 6601 /* platform caps */ 6602 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6603 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 6604 } else { 6605 switch (platform_link_width) { 6606 case PCIE_LNK_X32: 6607 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 6608 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6609 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6610 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6611 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6612 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6613 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6614 break; 6615 case PCIE_LNK_X16: 6616 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6617 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6618 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6619 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6620 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6621 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6622 break; 6623 case PCIE_LNK_X12: 6624 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6625 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6626 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6627 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6628 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6629 break; 6630 case PCIE_LNK_X8: 6631 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6632 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6633 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6634 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6635 break; 6636 case PCIE_LNK_X4: 6637 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6638 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6639 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6640 break; 6641 case PCIE_LNK_X2: 6642 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6643 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6644 break; 6645 case PCIE_LNK_X1: 6646 adev->pm.pcie_mlw_mask |= CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 6647 break; 6648 
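/* any other width encoding adds no platform width bits */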
default: 6649 break; 6650 } 6651 } 6652 } 6653 } 6654 6655 /** 6656 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 6657 * 6658 * @adev: amdgpu_device pointer 6659 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 6660 * 6661 * Return true if @peer_adev can access (DMA) @adev through the PCIe 6662 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 6663 * @peer_adev. 6664 */ 6665 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 6666 struct amdgpu_device *peer_adev) 6667 { 6668 #ifdef CONFIG_HSA_AMD_P2P 6669 bool p2p_access = 6670 !adev->gmc.xgmi.connected_to_cpu && 6671 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 6672 if (!p2p_access) 6673 dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n", 6674 pci_name(peer_adev->pdev)); 6675 6676 bool is_large_bar = adev->gmc.visible_vram_size && 6677 adev->gmc.real_vram_size == adev->gmc.visible_vram_size; 6678 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev); 6679 6680 if (!p2p_addressable) { 6681 uint64_t address_mask = peer_adev->dev->dma_mask ? 6682 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 6683 resource_size_t aper_limit = 6684 adev->gmc.aper_base + adev->gmc.aper_size - 1; 6685 6686 p2p_addressable = !(adev->gmc.aper_base & address_mask || 6687 aper_limit & address_mask); 6688 } 6689 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable; 6690 #else 6691 return false; 6692 #endif 6693 } 6694 6695 int amdgpu_device_baco_enter(struct drm_device *dev) 6696 { 6697 struct amdgpu_device *adev = drm_to_adev(dev); 6698 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6699 6700 if (!amdgpu_device_supports_baco(dev)) 6701 return -ENOTSUPP; 6702 6703 if (ras && adev->ras_enabled && 6704 adev->nbio.funcs->enable_doorbell_interrupt) 6705 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 6706 6707 return amdgpu_dpm_baco_enter(adev); 6708 } 6709 6710 int amdgpu_device_baco_exit(struct drm_device *dev) 6711 { 6712 struct amdgpu_device *adev = drm_to_adev(dev); 6713 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6714 int ret = 0; 6715 6716 if (!amdgpu_device_supports_baco(dev)) 6717 return -ENOTSUPP; 6718 6719 ret = amdgpu_dpm_baco_exit(adev); 6720 if (ret) 6721 return ret; 6722 6723 if (ras && adev->ras_enabled && 6724 adev->nbio.funcs->enable_doorbell_interrupt) 6725 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 6726 6727 if (amdgpu_passthrough(adev) && adev->nbio.funcs && 6728 adev->nbio.funcs->clear_doorbell_interrupt) 6729 adev->nbio.funcs->clear_doorbell_interrupt(adev); 6730 6731 return 0; 6732 } 6733 6734 /** 6735 * amdgpu_pci_error_detected - Called when a PCI error is detected. 6736 * @pdev: PCI device struct 6737 * @state: PCI channel state 6738 * 6739 * Description: Called when a PCI error is detected. 6740 * 6741 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
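 *
 * PCI_ERS_RESULT_CAN_RECOVER is returned for pci_channel_io_normal, in
 * which case amdgpu_pci_mmio_enabled() is invoked next; a frozen channel
 * halts all activity and asks for a slot reset instead.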
6742 */ 6743 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 6744 { 6745 struct drm_device *dev = pci_get_drvdata(pdev); 6746 struct amdgpu_device *adev = drm_to_adev(dev); 6747 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 6748 struct amdgpu_reset_context reset_context; 6749 struct list_head device_list; 6750 int r = 0; 6751 6752 dev_info(adev->dev, "PCI error: detected callback!!\n"); 6753 6754 if (!amdgpu_dpm_is_link_reset_supported(adev)) { 6755 dev_warn(adev->dev, "No support for XGMI hive yet...\n"); 6756 return PCI_ERS_RESULT_DISCONNECT; 6757 } 6758 6759 adev->pci_channel_state = state; 6760 6761 switch (state) { 6762 case pci_channel_io_normal: 6763 dev_info(adev->dev, "pci_channel_io_normal: state(%d)!!\n", state); 6764 return PCI_ERS_RESULT_CAN_RECOVER; 6765 case pci_channel_io_frozen: 6766 /* Fatal error, prepare for slot reset */ 6767 dev_info(adev->dev, "pci_channel_io_frozen: state(%d)!!\n", state); 6768 6769 if (hive) 6770 mutex_lock(&hive->hive_lock); 6771 adev->pcie_reset_ctx.occurs_dpc = true; 6772 memset(&reset_context, 0, sizeof(reset_context)); 6773 INIT_LIST_HEAD(&device_list); 6774 6775 r = amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list, 6776 hive, false); 6777 if (hive) { 6778 mutex_unlock(&hive->hive_lock); 6779 amdgpu_put_xgmi_hive(hive); 6780 } 6781 if (r) 6782 return PCI_ERS_RESULT_DISCONNECT; 6783 return PCI_ERS_RESULT_NEED_RESET; 6784 case pci_channel_io_perm_failure: 6785 /* Permanent error, prepare for device removal */ 6786 dev_info(adev->dev, "pci_channel_io_perm_failure: state(%d)!!\n", state); 6787 return PCI_ERS_RESULT_DISCONNECT; 6788 } 6789 6790 return PCI_ERS_RESULT_NEED_RESET; 6791 } 6792 6793 /** 6794 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 6795 * @pdev: pointer to PCI device 6796 */ 6797 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 6798 { 6799 struct drm_device *dev = pci_get_drvdata(pdev); 6800 struct amdgpu_device *adev = drm_to_adev(dev); 6801 6802 dev_info(adev->dev, "PCI error: mmio enabled callback!!\n"); 6803 6804 /* TODO - dump whatever for debugging purposes */ 6805 6806 /* This called only if amdgpu_pci_error_detected returns 6807 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 6808 * works, no need to reset slot. 6809 */ 6810 6811 return PCI_ERS_RESULT_RECOVERED; 6812 } 6813 6814 /** 6815 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 6816 * @pdev: PCI device struct 6817 * 6818 * Description: This routine is called by the pci error recovery 6819 * code after the PCI slot has been reset, just before we 6820 * should resume normal operations. 
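 *
 * The handler restores the saved PCI config space, waits for the ASIC
 * to become accessible again (by polling the config memsize), and then
 * runs a full reset over the affected device list.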
6821 */ 6822 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 6823 { 6824 struct drm_device *dev = pci_get_drvdata(pdev); 6825 struct amdgpu_device *adev = drm_to_adev(dev); 6826 struct amdgpu_reset_context reset_context; 6827 struct amdgpu_device *tmp_adev; 6828 struct amdgpu_hive_info *hive; 6829 struct list_head device_list; 6830 int r = 0, i; 6831 u32 memsize; 6832 6833 /* PCI error slot reset should be skipped During RAS recovery */ 6834 if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 6835 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && 6836 amdgpu_ras_in_recovery(adev)) 6837 return PCI_ERS_RESULT_RECOVERED; 6838 6839 dev_info(adev->dev, "PCI error: slot reset callback!!\n"); 6840 6841 memset(&reset_context, 0, sizeof(reset_context)); 6842 6843 /* wait for asic to come out of reset */ 6844 msleep(700); 6845 6846 /* Restore PCI confspace */ 6847 amdgpu_device_load_pci_state(pdev); 6848 6849 /* confirm ASIC came out of reset */ 6850 for (i = 0; i < adev->usec_timeout; i++) { 6851 memsize = amdgpu_asic_get_config_memsize(adev); 6852 6853 if (memsize != 0xffffffff) 6854 break; 6855 udelay(1); 6856 } 6857 if (memsize == 0xffffffff) { 6858 r = -ETIME; 6859 goto out; 6860 } 6861 6862 reset_context.method = AMD_RESET_METHOD_NONE; 6863 reset_context.reset_req_dev = adev; 6864 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 6865 set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags); 6866 INIT_LIST_HEAD(&device_list); 6867 6868 hive = amdgpu_get_xgmi_hive(adev); 6869 if (hive) { 6870 mutex_lock(&hive->hive_lock); 6871 reset_context.hive = hive; 6872 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 6873 tmp_adev->pcie_reset_ctx.in_link_reset = true; 6874 list_add_tail(&tmp_adev->reset_list, &device_list); 6875 } 6876 } else { 6877 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 6878 list_add_tail(&adev->reset_list, &device_list); 6879 } 6880 6881 r = amdgpu_device_asic_reset(adev, &device_list, &reset_context); 6882 out: 6883 if (!r) { 6884 if (amdgpu_device_cache_pci_state(adev->pdev)) 6885 pci_restore_state(adev->pdev); 6886 dev_info(adev->dev, "PCIe error recovery succeeded\n"); 6887 } else { 6888 dev_err(adev->dev, "PCIe error recovery failed, err:%d\n", r); 6889 if (hive) { 6890 list_for_each_entry(tmp_adev, &device_list, reset_list) 6891 amdgpu_device_unset_mp1_state(tmp_adev); 6892 amdgpu_device_unlock_reset_domain(adev->reset_domain); 6893 } 6894 } 6895 6896 if (hive) { 6897 mutex_unlock(&hive->hive_lock); 6898 amdgpu_put_xgmi_hive(hive); 6899 } 6900 6901 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 6902 } 6903 6904 /** 6905 * amdgpu_pci_resume() - resume normal ops after PCI reset 6906 * @pdev: pointer to PCI device 6907 * 6908 * Called when the error recovery driver tells us that its 6909 * OK to resume normal operation. 
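 *
 * Only acts when the recorded channel state is pci_channel_io_frozen;
 * it restarts the schedulers and then resumes KFD, display audio and
 * MP1 state via amdgpu_device_gpu_resume().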
6910 */ 6911 void amdgpu_pci_resume(struct pci_dev *pdev) 6912 { 6913 struct drm_device *dev = pci_get_drvdata(pdev); 6914 struct amdgpu_device *adev = drm_to_adev(dev); 6915 struct list_head device_list; 6916 struct amdgpu_hive_info *hive = NULL; 6917 struct amdgpu_device *tmp_adev = NULL; 6918 6919 dev_info(adev->dev, "PCI error: resume callback!!\n"); 6920 6921 /* Only continue execution for the case of pci_channel_io_frozen */ 6922 if (adev->pci_channel_state != pci_channel_io_frozen) 6923 return; 6924 6925 INIT_LIST_HEAD(&device_list); 6926 6927 hive = amdgpu_get_xgmi_hive(adev); 6928 if (hive) { 6929 mutex_lock(&hive->hive_lock); 6930 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 6931 tmp_adev->pcie_reset_ctx.in_link_reset = false; 6932 list_add_tail(&tmp_adev->reset_list, &device_list); 6933 } 6934 } else 6935 list_add_tail(&adev->reset_list, &device_list); 6936 6937 amdgpu_device_sched_resume(&device_list, NULL, NULL); 6938 amdgpu_device_gpu_resume(adev, &device_list, false); 6939 adev->pcie_reset_ctx.occurs_dpc = false; 6940 6941 if (hive) { 6942 mutex_unlock(&hive->hive_lock); 6943 amdgpu_put_xgmi_hive(hive); 6944 } 6945 } 6946 6947 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 6948 { 6949 struct drm_device *dev = pci_get_drvdata(pdev); 6950 struct amdgpu_device *adev = drm_to_adev(dev); 6951 int r; 6952 6953 if (amdgpu_sriov_vf(adev)) 6954 return false; 6955 6956 r = pci_save_state(pdev); 6957 if (!r) { 6958 kfree(adev->pci_state); 6959 6960 adev->pci_state = pci_store_saved_state(pdev); 6961 6962 if (!adev->pci_state) { 6963 DRM_ERROR("Failed to store PCI saved state"); 6964 return false; 6965 } 6966 } else { 6967 DRM_WARN("Failed to save PCI state, err:%d\n", r); 6968 return false; 6969 } 6970 6971 return true; 6972 } 6973 6974 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 6975 { 6976 struct drm_device *dev = pci_get_drvdata(pdev); 6977 struct amdgpu_device *adev = drm_to_adev(dev); 6978 int r; 6979 6980 if (!adev->pci_state) 6981 return false; 6982 6983 r = pci_load_saved_state(pdev, adev->pci_state); 6984 6985 if (!r) { 6986 pci_restore_state(pdev); 6987 } else { 6988 DRM_WARN("Failed to load PCI state, err:%d\n", r); 6989 return false; 6990 } 6991 6992 return true; 6993 } 6994 6995 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 6996 struct amdgpu_ring *ring) 6997 { 6998 #ifdef CONFIG_X86_64 6999 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 7000 return; 7001 #endif 7002 if (adev->gmc.xgmi.connected_to_cpu) 7003 return; 7004 7005 if (ring && ring->funcs->emit_hdp_flush) 7006 amdgpu_ring_emit_hdp_flush(ring); 7007 else 7008 amdgpu_asic_flush_hdp(adev, ring); 7009 } 7010 7011 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 7012 struct amdgpu_ring *ring) 7013 { 7014 #ifdef CONFIG_X86_64 7015 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 7016 return; 7017 #endif 7018 if (adev->gmc.xgmi.connected_to_cpu) 7019 return; 7020 7021 amdgpu_asic_invalidate_hdp(adev, ring); 7022 } 7023 7024 int amdgpu_in_reset(struct amdgpu_device *adev) 7025 { 7026 return atomic_read(&adev->reset_domain->in_gpu_reset); 7027 } 7028 7029 /** 7030 * amdgpu_device_halt() - bring hardware to some kind of halt state 7031 * 7032 * @adev: amdgpu_device pointer 7033 * 7034 * Bring hardware to some kind of halt state so that no one can touch it 7035 * any more. It will help to maintain error context when error occurred. 7036 * Compare to a simple hang, the system will keep stable at least for SSH 7037 * access. 
Then it should be trivial to inspect the hardware state and 7038 * see what's going on. Implemented as following: 7039 * 7040 * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc), 7041 * clears all CPU mappings to device, disallows remappings through page faults 7042 * 2. amdgpu_irq_disable_all() disables all interrupts 7043 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences 7044 * 4. set adev->no_hw_access to avoid potential crashes after setp 5 7045 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings 7046 * 6. pci_disable_device() and pci_wait_for_pending_transaction() 7047 * flush any in flight DMA operations 7048 */ 7049 void amdgpu_device_halt(struct amdgpu_device *adev) 7050 { 7051 struct pci_dev *pdev = adev->pdev; 7052 struct drm_device *ddev = adev_to_drm(adev); 7053 7054 amdgpu_xcp_dev_unplug(adev); 7055 drm_dev_unplug(ddev); 7056 7057 amdgpu_irq_disable_all(adev); 7058 7059 amdgpu_fence_driver_hw_fini(adev); 7060 7061 adev->no_hw_access = true; 7062 7063 amdgpu_device_unmap_mmio(adev); 7064 7065 pci_disable_device(pdev); 7066 pci_wait_for_pending_transaction(pdev); 7067 } 7068 7069 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev, 7070 u32 reg) 7071 { 7072 unsigned long flags, address, data; 7073 u32 r; 7074 7075 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 7076 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 7077 7078 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 7079 WREG32(address, reg * 4); 7080 (void)RREG32(address); 7081 r = RREG32(data); 7082 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 7083 return r; 7084 } 7085 7086 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev, 7087 u32 reg, u32 v) 7088 { 7089 unsigned long flags, address, data; 7090 7091 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 7092 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 7093 7094 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 7095 WREG32(address, reg * 4); 7096 (void)RREG32(address); 7097 WREG32(data, v); 7098 (void)RREG32(data); 7099 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 7100 } 7101 7102 /** 7103 * amdgpu_device_get_gang - return a reference to the current gang 7104 * @adev: amdgpu_device pointer 7105 * 7106 * Returns: A new reference to the current gang leader. 7107 */ 7108 struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev) 7109 { 7110 struct dma_fence *fence; 7111 7112 rcu_read_lock(); 7113 fence = dma_fence_get_rcu_safe(&adev->gang_submit); 7114 rcu_read_unlock(); 7115 return fence; 7116 } 7117 7118 /** 7119 * amdgpu_device_switch_gang - switch to a new gang 7120 * @adev: amdgpu_device pointer 7121 * @gang: the gang to switch to 7122 * 7123 * Try to switch to a new gang. 7124 * Returns: NULL if we switched to the new gang or a reference to the current 7125 * gang leader. 7126 */ 7127 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev, 7128 struct dma_fence *gang) 7129 { 7130 struct dma_fence *old = NULL; 7131 7132 dma_fence_get(gang); 7133 do { 7134 dma_fence_put(old); 7135 old = amdgpu_device_get_gang(adev); 7136 if (old == gang) 7137 break; 7138 7139 if (!dma_fence_is_signaled(old)) { 7140 dma_fence_put(gang); 7141 return old; 7142 } 7143 7144 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit, 7145 old, gang) != old); 7146 7147 /* 7148 * Drop it once for the exchanged reference in adev and once for the 7149 * thread local reference acquired in amdgpu_device_get_gang(). 
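 *
 * On success the reference taken on @gang above is the one that now
 * lives in adev->gang_submit, and NULL is returned to the caller.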
7150 */ 7151 dma_fence_put(old); 7152 dma_fence_put(old); 7153 return NULL; 7154 } 7155 7156 /** 7157 * amdgpu_device_enforce_isolation - enforce HW isolation 7158 * @adev: the amdgpu device pointer 7159 * @ring: the HW ring the job is supposed to run on 7160 * @job: the job which is about to be pushed to the HW ring 7161 * 7162 * Makes sure that only one client at a time can use the GFX block. 7163 * Returns: The dependency to wait on before the job can be pushed to the HW. 7164 * The function is called multiple times until NULL is returned. 7165 */ 7166 struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev, 7167 struct amdgpu_ring *ring, 7168 struct amdgpu_job *job) 7169 { 7170 struct amdgpu_isolation *isolation = &adev->isolation[ring->xcp_id]; 7171 struct drm_sched_fence *f = job->base.s_fence; 7172 struct dma_fence *dep; 7173 void *owner; 7174 int r; 7175 7176 /* 7177 * For now enforce isolation only for the GFX block since we only need 7178 * the cleaner shader on those rings. 7179 */ 7180 if (ring->funcs->type != AMDGPU_RING_TYPE_GFX && 7181 ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE) 7182 return NULL; 7183 7184 /* 7185 * All submissions where enforce isolation is false are handled as if 7186 * they come from a single client. Use ~0l as the owner to distinct it 7187 * from kernel submissions where the owner is NULL. 7188 */ 7189 owner = job->enforce_isolation ? f->owner : (void *)~0l; 7190 7191 mutex_lock(&adev->enforce_isolation_mutex); 7192 7193 /* 7194 * The "spearhead" submission is the first one which changes the 7195 * ownership to its client. We always need to wait for it to be 7196 * pushed to the HW before proceeding with anything. 7197 */ 7198 if (&f->scheduled != isolation->spearhead && 7199 !dma_fence_is_signaled(isolation->spearhead)) { 7200 dep = isolation->spearhead; 7201 goto out_grab_ref; 7202 } 7203 7204 if (isolation->owner != owner) { 7205 7206 /* 7207 * Wait for any gang to be assembled before switching to a 7208 * different owner or otherwise we could deadlock the 7209 * submissions. 7210 */ 7211 if (!job->gang_submit) { 7212 dep = amdgpu_device_get_gang(adev); 7213 if (!dma_fence_is_signaled(dep)) 7214 goto out_return_dep; 7215 dma_fence_put(dep); 7216 } 7217 7218 dma_fence_put(isolation->spearhead); 7219 isolation->spearhead = dma_fence_get(&f->scheduled); 7220 amdgpu_sync_move(&isolation->active, &isolation->prev); 7221 trace_amdgpu_isolation(isolation->owner, owner); 7222 isolation->owner = owner; 7223 } 7224 7225 /* 7226 * Specifying the ring here helps to pipeline submissions even when 7227 * isolation is enabled. If that is not desired for testing NULL can be 7228 * used instead of the ring to enforce a CPU round trip while switching 7229 * between clients. 
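 *
 * amdgpu_sync_peek_fence() may legitimately return NULL when nothing
 * from the previous owner is still pending; dma_fence_get(NULL) at
 * out_grab_ref is a no-op in that case.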
7230 */ 7231 dep = amdgpu_sync_peek_fence(&isolation->prev, ring); 7232 r = amdgpu_sync_fence(&isolation->active, &f->finished, GFP_NOWAIT); 7233 if (r) 7234 DRM_WARN("OOM tracking isolation\n"); 7235 7236 out_grab_ref: 7237 dma_fence_get(dep); 7238 out_return_dep: 7239 mutex_unlock(&adev->enforce_isolation_mutex); 7240 return dep; 7241 } 7242 7243 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev) 7244 { 7245 switch (adev->asic_type) { 7246 #ifdef CONFIG_DRM_AMDGPU_SI 7247 case CHIP_HAINAN: 7248 #endif 7249 case CHIP_TOPAZ: 7250 /* chips with no display hardware */ 7251 return false; 7252 #ifdef CONFIG_DRM_AMDGPU_SI 7253 case CHIP_TAHITI: 7254 case CHIP_PITCAIRN: 7255 case CHIP_VERDE: 7256 case CHIP_OLAND: 7257 #endif 7258 #ifdef CONFIG_DRM_AMDGPU_CIK 7259 case CHIP_BONAIRE: 7260 case CHIP_HAWAII: 7261 case CHIP_KAVERI: 7262 case CHIP_KABINI: 7263 case CHIP_MULLINS: 7264 #endif 7265 case CHIP_TONGA: 7266 case CHIP_FIJI: 7267 case CHIP_POLARIS10: 7268 case CHIP_POLARIS11: 7269 case CHIP_POLARIS12: 7270 case CHIP_VEGAM: 7271 case CHIP_CARRIZO: 7272 case CHIP_STONEY: 7273 /* chips with display hardware */ 7274 return true; 7275 default: 7276 /* IP discovery */ 7277 if (!amdgpu_ip_version(adev, DCE_HWIP, 0) || 7278 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 7279 return false; 7280 return true; 7281 } 7282 } 7283 7284 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev, 7285 uint32_t inst, uint32_t reg_addr, char reg_name[], 7286 uint32_t expected_value, uint32_t mask) 7287 { 7288 uint32_t ret = 0; 7289 uint32_t old_ = 0; 7290 uint32_t tmp_ = RREG32(reg_addr); 7291 uint32_t loop = adev->usec_timeout; 7292 7293 while ((tmp_ & (mask)) != (expected_value)) { 7294 if (old_ != tmp_) { 7295 loop = adev->usec_timeout; 7296 old_ = tmp_; 7297 } else 7298 udelay(1); 7299 tmp_ = RREG32(reg_addr); 7300 loop--; 7301 if (!loop) { 7302 DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n", 7303 inst, reg_name, (uint32_t)expected_value, 7304 (uint32_t)(tmp_ & (mask))); 7305 ret = -ETIMEDOUT; 7306 break; 7307 } 7308 } 7309 return ret; 7310 } 7311 7312 ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring) 7313 { 7314 ssize_t size = 0; 7315 7316 if (!ring || !ring->adev) 7317 return size; 7318 7319 if (amdgpu_device_should_recover_gpu(ring->adev)) 7320 size |= AMDGPU_RESET_TYPE_FULL; 7321 7322 if (unlikely(!ring->adev->debug_disable_soft_recovery) && 7323 !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery) 7324 size |= AMDGPU_RESET_TYPE_SOFT_RESET; 7325 7326 return size; 7327 } 7328 7329 ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset) 7330 { 7331 ssize_t size = 0; 7332 7333 if (supported_reset == 0) { 7334 size += sysfs_emit_at(buf, size, "unsupported"); 7335 size += sysfs_emit_at(buf, size, "\n"); 7336 return size; 7337 7338 } 7339 7340 if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET) 7341 size += sysfs_emit_at(buf, size, "soft "); 7342 7343 if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE) 7344 size += sysfs_emit_at(buf, size, "queue "); 7345 7346 if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE) 7347 size += sysfs_emit_at(buf, size, "pipe "); 7348 7349 if (supported_reset & AMDGPU_RESET_TYPE_FULL) 7350 size += sysfs_emit_at(buf, size, "full "); 7351 7352 size += sysfs_emit_at(buf, size, "\n"); 7353 return size; 7354 } 7355
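/*
 * Example (hypothetical mask): for supported_reset ==
 * (AMDGPU_RESET_TYPE_SOFT_RESET | AMDGPU_RESET_TYPE_FULL) the helper
 * above emits "soft full \n", which is the format used by the
 * reset-mask style sysfs attributes built on top of it.
 */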