/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */

#include <linux/aperture.h>
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/pci-p2pdma.h>
#include <linux/apple-gmux.h>

#include <drm/drm_atomic_helper.h>
#include <drm/drm_client_event.h>
#include <drm/drm_crtc_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/device.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"
#include "amdgpu_virt.h"
#include "amdgpu_dev_coredump.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

#if IS_ENABLED(CONFIG_X86)
#include <asm/intel-family.h>
#include <asm/cpu_device_id.h>
#endif

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
#define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
#define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
#define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)
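
/*
 * The *_FALLBACK values above are dword offsets (byte offset >> 2) into MMIO
 * space for the PCIE index/data register pair. They are only used by the
 * extended indirect accessors further below when the NBIO callbacks have not
 * been populated yet (see amdgpu_device_indirect_rreg_ext()).
 */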

#define AMDGPU_VBIOS_SKIP (1U << 0)
#define AMDGPU_VBIOS_OPTIONAL (1U << 1)

static const struct drm_driver amdgpu_kms_driver;

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"ALDEBARAN",
	"NAVI10",
	"CYAN_SKILLFISH",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"VANGOGH",
	"DIMGREY_CAVEFISH",
	"BEIGE_GOBY",
	"YELLOW_CARP",
	"IP DISCOVERY",
	"LAST",
};

#define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0)
/*
 * Default init level where all blocks are expected to be initialized. This is
 * the level of initialization expected by default and also after a full reset
 * of the device.
 */
struct amdgpu_init_level amdgpu_init_default = {
	.level = AMDGPU_INIT_LEVEL_DEFAULT,
	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

struct amdgpu_init_level amdgpu_init_recovery = {
	.level = AMDGPU_INIT_LEVEL_RESET_RECOVERY,
	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

/*
 * Minimal blocks needed to be initialized before an XGMI hive can be reset.
 * This is used for cases like reset on initialization where the entire hive
 * needs to be reset before first use.
 */
struct amdgpu_init_level amdgpu_init_minimal_xgmi = {
	.level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI,
	.hwini_ip_block_mask =
		BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) |
		BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) |
		BIT(AMD_IP_BLOCK_TYPE_PSP)
};

static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev,
					     enum amd_ip_block_type block)
{
	return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0;
}

void amdgpu_set_init_level(struct amdgpu_device *adev,
			   enum amdgpu_init_lvl_id lvl)
{
	switch (lvl) {
	case AMDGPU_INIT_LEVEL_MINIMAL_XGMI:
		adev->init_lvl = &amdgpu_init_minimal_xgmi;
		break;
	case AMDGPU_INIT_LEVEL_RESET_RECOVERY:
		adev->init_lvl = &amdgpu_init_recovery;
		break;
	case AMDGPU_INIT_LEVEL_DEFAULT:
		fallthrough;
	default:
		adev->init_lvl = &amdgpu_init_default;
		break;
	}
}

static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
				     void *data);

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return sysfs_emit(buf, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, 0444,
		   amdgpu_device_get_pcie_replay_count, NULL);

static int amdgpu_device_attr_sysfs_init(struct amdgpu_device *adev)
{
	int ret = 0;

	if (!amdgpu_sriov_vf(adev))
		ret = sysfs_create_file(&adev->dev->kobj,
					&dev_attr_pcie_replay_count.attr);

	return ret;
}

static void amdgpu_device_attr_sysfs_fini(struct amdgpu_device *adev)
{
	if (!amdgpu_sriov_vf(adev))
		sysfs_remove_file(&adev->dev->kobj,
				  &dev_attr_pcie_replay_count.attr);
}

static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
					  struct bin_attribute *attr, char *buf,
					  loff_t ppos, size_t count)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	ssize_t bytes_read;

	switch (ppos) {
	case AMDGPU_SYS_REG_STATE_XGMI:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_WAFL:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_PCIE:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR_1:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count);
		break;
	default:
		return -EINVAL;
	}

	return bytes_read;
}

BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL,
	 AMDGPU_SYS_REG_STATE_END);

int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev)
{
	int ret;

	if (!amdgpu_asic_get_reg_state_supported(adev))
		return 0;

	ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state);

	return ret;
}

void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
{
	if (!amdgpu_asic_get_reg_state_supported(adev))
		return;
	sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
}

int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block)
{
	int r;

	if (ip_block->version->funcs->suspend) {
		r = ip_block->version->funcs->suspend(ip_block);
		if (r) {
			dev_err(ip_block->adev->dev,
				"suspend of IP block <%s> failed %d\n",
				ip_block->version->funcs->name, r);
			return r;
		}
	}

	ip_block->status.hw = false;
	return 0;
}

int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block)
{
	int r;

	if (ip_block->version->funcs->resume) {
		r = ip_block->version->funcs->resume(ip_block);
		if (r) {
			dev_err(ip_block->adev->dev,
				"resume of IP block <%s> failed %d\n",
				ip_block->version->funcs->name, r);
			return r;
		}
	}

	ip_block->status.hw = true;
	return 0;
}

/**
 * DOC: board_info
 *
 * The amdgpu driver provides a sysfs API for giving board related information.
 * It provides the form factor information in the format
 *
 *   type : form factor
 *
 * Possible form factor values
 *
 * - "cem"	- PCIE CEM card
 * - "oam"	- Open Compute Accelerator Module
 * - "unknown"	- Not known
 *
 */

static ssize_t amdgpu_device_get_board_info(struct device *dev,
					    struct device_attribute *attr,
					    char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
	const char *pkg;

	if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
		pkg_type = adev->smuio.funcs->get_pkg_type(adev);

	switch (pkg_type) {
	case AMDGPU_PKG_TYPE_CEM:
		pkg = "cem";
		break;
	case AMDGPU_PKG_TYPE_OAM:
		pkg = "oam";
		break;
	default:
		pkg = "unknown";
		break;
	}

	return sysfs_emit(buf, "%s : %s\n", "type", pkg);
}

static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);

static struct attribute *amdgpu_board_attrs[] = {
	&dev_attr_board_info.attr,
	NULL,
};

static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
					     struct attribute *attr, int n)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	if (adev->flags & AMD_IS_APU)
		return 0;

	return attr->mode;
}

static const struct attribute_group amdgpu_board_attrs_group = {
	.attrs = amdgpu_board_attrs,
	.is_visible = amdgpu_board_attrs_is_visible
};

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);


/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE))
		return false;

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Return:
 * 1 if the device supports BACO;
 * 3 if the device supports MACO (only works if BACO is supported)
 * otherwise return 0.
 */
int amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
{
	struct drm_device *dev;
	int bamaco_support;

	dev = adev_to_drm(adev);

	adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
	bamaco_support = amdgpu_device_supports_baco(dev);

	switch (amdgpu_runtime_pm) {
	case 2:
		if (bamaco_support & MACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
			dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
		} else if (bamaco_support == BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Requested mode BAMACO not available, falling back to BACO\n");
		}
		break;
	case 1:
		if (bamaco_support & BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Forcing BACO for runtime pm\n");
		}
		break;
	case -1:
	case -2:
		if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
			dev_info(adev->dev, "Using ATPX for runtime pm\n");
		} else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
			dev_info(adev->dev, "Using BOCO for runtime pm\n");
		} else {
			if (!bamaco_support)
				goto no_runtime_pm;

			switch (adev->asic_type) {
			case CHIP_VEGA20:
			case CHIP_ARCTURUS:
				/* BACO is not supported on vega20 and arcturus */
				break;
			case CHIP_VEGA10:
				/* enable BACO as runpm mode if noretry=0 */
				if (!adev->gmc.noretry)
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			default:
				/* enable BACO as runpm mode on CI+ */
				adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			}

			if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
				if (bamaco_support & MACO_SUPPORT) {
					adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
					dev_info(adev->dev, "Using BAMACO for runtime pm\n");
				} else {
					dev_info(adev->dev, "Using BACO for runtime pm\n");
				}
			}
		}
		break;
	case 0:
		dev_info(adev->dev, "runtime pm is manually disabled\n");
		break;
	default:
		break;
	}

no_runtime_pm:
	if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
		dev_info(adev->dev, "Runtime PM not available\n");
}

/**
 * amdgpu_device_supports_smart_shift - Is the device dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value means how many bytes have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the VRAM aperture to access VRAM first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM to access the rest of VRAM */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}


/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
				uint32_t reg, uint32_t acc_flags,
				uint32_t xcc_id)
{
	uint32_t ret, rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	return ret;
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/**
 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 * @xcc_id: xcc accelerated compute core id
 *
 * this function is invoked only for the debugfs register access
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v,
			     uint32_t xcc_id)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_xcc_wreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t v,
			    uint32_t acc_flags, uint32_t xcc_id)
{
	uint32_t rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, true,
							 &rlcg_flag)) {
			amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u32 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
				    u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	if (unlikely(!adev->nbio.funcs)) {
		pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
		pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
	} else {
		pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
		pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	}

	if (reg_addr >> 32) {
		if (unlikely(!adev->nbio.funcs))
			pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
		else
			pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	} else {
		pcie_index_hi = 0;
	}

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}
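
/*
 * Note on the indirect accessors above and below: the PCIE index/data pair is
 * shared state, so every access is serialized under pcie_idx_lock. Each
 * writel() to an index register is followed by a readl() of the same register
 * to flush the posted write before the data register is touched, and when the
 * high index register is used it is written back to 0 before the lock is
 * dropped so no stale upper address bits are left behind.
 */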

/**
 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
				      u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
			pcie_index_hi * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r |= ((u64)readl(pcie_data_offset) << 32);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
				     u64 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
				       u64 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_rev_id - query device rev_id
 *
 * @adev: amdgpu_device pointer
 *
 * Return device rev_id
 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
{
	return adev->nbio.funcs->get_rev_id(adev);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

static uint32_t amdgpu_device_get_vbios_flags(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
		return AMDGPU_VBIOS_SKIP;

	if (hweight32(adev->aid_mask) && amdgpu_passthrough(adev))
		return AMDGPU_VBIOS_OPTIONAL;

	return 0;
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	uint32_t flags;
	bool optional;
	int ret;

	amdgpu_asic_pre_asic_init(adev);
	flags = amdgpu_device_get_vbios_flags(adev);
	optional = !!(flags & (AMDGPU_VBIOS_OPTIONAL | AMDGPU_VBIOS_SKIP));

	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
		amdgpu_psp_wait_for_bootloader(adev);
		if (optional && !adev->bios)
			return 0;

		ret = amdgpu_atomfirmware_asic_init(adev, true);
		return ret;
	} else {
		if (optional && !adev->bios)
			return 0;

		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
	}

	return 0;
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}
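
/*
 * The register array consumed above is laid out as consecutive
 * (register, and_mask, or_mask) triplets; a golden-register table for a
 * hypothetical ASIC would therefore look like:
 *
 *	static const u32 golden_settings[] = {
 *		mmSOME_REG,  0x0000ffff, 0x00001234,
 *		mmOTHER_REG, 0xffffffff, 0x00000001,
 *	};
 *	amdgpu_device_program_register_sequence(adev, golden_settings,
 *						ARRAY_SIZE(golden_settings));
 *
 * mmSOME_REG/mmOTHER_REG are placeholders. On AMDGPU_FAMILY_AI and newer only
 * the bits covered by and_mask are replaced with or_mask; on older families
 * or_mask is ORed in after the masked bits have been cleared.
 */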

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or an -error on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long flags, offset;

	spin_lock_irqsave(&adev->wb.lock, flags);
	offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	unsigned long flags;

	wb >>= 3;
	spin_lock_irqsave(&adev->wb.lock, flags);
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
	spin_unlock_irqrestore(&adev->wb.lock, flags);
}
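
/*
 * A minimal usage sketch for the writeback helpers above, mirroring what the
 * ring code does with the returned index. Each allocated slot is 256 bits
 * wide, which is why wb_get() hands back the slot number converted to a
 * dword offset:
 *
 *	u32 wb;
 *
 *	if (!amdgpu_device_wb_get(adev, &wb)) {
 *		u64 gpu_addr = adev->wb.gpu_addr + (wb * 4);	// GPU address of the slot
 *		volatile u32 *cpu_ptr = &adev->wb.wb[wb];	// CPU view of the same dword
 *		...
 *		amdgpu_device_wb_free(adev, wb);
 *	}
 */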

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned int i;
	u16 cmd;
	int r;

	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
		return 0;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	if (!amdgpu_rebar)
		return 0;

	/* resizing on Dell G5 SE platforms causes problems with runtime pm */
	if ((amdgpu_runtime_pm != 0) &&
	    adev->pdev->vendor == PCI_VENDOR_ID_ATI &&
	    adev->pdev->device == 0x731f &&
	    adev->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL)
		return 0;

	/* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
	if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
		DRM_WARN("System can't access extended configuration space, please check!!\n");

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

/*
 * GPU helper functions.
 */
/**
 * amdgpu_device_need_post - check if the hw need post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if need or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg, flags;

	if (amdgpu_sriov_vf(adev))
		return false;

	flags = amdgpu_device_get_vbios_flags(adev);
	if (flags & AMDGPU_VBIOS_SKIP)
		return false;
	if ((flags & AMDGPU_VBIOS_OPTIONAL) && !adev->bios)
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
		 * some old SMC firmware still needs the driver to do vPost, otherwise the
		 * GPU hangs, while SMC firmware versions above 22.15 don't have this flaw,
		 * so we force vPost to be executed for SMC versions below 22.15
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			release_firmware(adev->pm.fw);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/*
 * Check whether seamless boot is supported.
 *
 * So far we only support seamless boot on DCE 3.0 or later.
 * If users report that it works on older ASICS as well, we may
 * loosen this.
 */
bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
{
	switch (amdgpu_seamless) {
	case -1:
		break;
	case 1:
		return true;
	case 0:
		return false;
	default:
		DRM_ERROR("Invalid value for amdgpu.seamless: %d\n",
			  amdgpu_seamless);
		return false;
	}

	if (!(adev->flags & AMD_IS_APU))
		return false;

	if (adev->mman.keep_stolen_vga_memory)
		return false;

	return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
}

/*
 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
 * don't support dynamic speed switching. Until we have confirmation from Intel
 * that a specific host supports it, it's safer that we keep it disabled for all.
 *
 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
 */
static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	/* eGPU change speeds based on USB4 fabric conditions */
	if (dev_is_removable(adev->dev))
		return true;

	if (c->x86_vendor == X86_VENDOR_INTEL)
		return false;
#endif
	return true;
}

static bool amdgpu_device_aspm_support_quirk(struct amdgpu_device *adev)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	if (!(amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 0) ||
	      amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 1)))
		return false;

	if (c->x86 == 6 &&
	    adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5) {
		switch (c->x86_model) {
		case VFM_MODEL(INTEL_ALDERLAKE):
		case VFM_MODEL(INTEL_ALDERLAKE_L):
		case VFM_MODEL(INTEL_RAPTORLAKE):
		case VFM_MODEL(INTEL_RAPTORLAKE_P):
		case VFM_MODEL(INTEL_RAPTORLAKE_S):
			return true;
		default:
			return false;
		}
	} else {
		return false;
	}
#else
	return false;
#endif
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
	switch (amdgpu_aspm) {
	case -1:
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	if (adev->flags & AMD_IS_APU)
		return false;
	if (amdgpu_device_aspm_support_quirk(adev))
		return false;
	return pcie_aspm_enabled(adev->pdev);
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
		bool state)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));

	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines number of bits in page table versus page directory,
 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
 * page table and the remaining bits are in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory
	 */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}
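
/*
 * Worked example for the check above: with a 4KB page there are 12 offset
 * bits, and amdgpu_vm_block_size selects how many of the remaining virtual
 * address bits index the last-level page table. The minimum of 9 bits means
 * one page table covers 2^9 entries * 4KB = 2MB of address space; anything
 * smaller is rejected and the module parameter falls back to -1 so the
 * driver picks its own default.
 */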
1967 */ 1968 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1969 { 1970 /* defines number of bits in page table versus page directory, 1971 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1972 * page table and the remaining bits are in the page directory 1973 */ 1974 if (amdgpu_vm_block_size == -1) 1975 return; 1976 1977 if (amdgpu_vm_block_size < 9) { 1978 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1979 amdgpu_vm_block_size); 1980 amdgpu_vm_block_size = -1; 1981 } 1982 } 1983 1984 /** 1985 * amdgpu_device_check_vm_size - validate the vm size 1986 * 1987 * @adev: amdgpu_device pointer 1988 * 1989 * Validates the vm size in GB specified via module parameter. 1990 * The VM size is the size of the GPU virtual memory space in GB. 1991 */ 1992 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1993 { 1994 /* no need to check the default value */ 1995 if (amdgpu_vm_size == -1) 1996 return; 1997 1998 if (amdgpu_vm_size < 1) { 1999 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 2000 amdgpu_vm_size); 2001 amdgpu_vm_size = -1; 2002 } 2003 } 2004 2005 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 2006 { 2007 struct sysinfo si; 2008 bool is_os_64 = (sizeof(void *) == 8); 2009 uint64_t total_memory; 2010 uint64_t dram_size_seven_GB = 0x1B8000000; 2011 uint64_t dram_size_three_GB = 0xB8000000; 2012 2013 if (amdgpu_smu_memory_pool_size == 0) 2014 return; 2015 2016 if (!is_os_64) { 2017 DRM_WARN("Not 64-bit OS, feature not supported\n"); 2018 goto def_value; 2019 } 2020 si_meminfo(&si); 2021 total_memory = (uint64_t)si.totalram * si.mem_unit; 2022 2023 if ((amdgpu_smu_memory_pool_size == 1) || 2024 (amdgpu_smu_memory_pool_size == 2)) { 2025 if (total_memory < dram_size_three_GB) 2026 goto def_value1; 2027 } else if ((amdgpu_smu_memory_pool_size == 4) || 2028 (amdgpu_smu_memory_pool_size == 8)) { 2029 if (total_memory < dram_size_seven_GB) 2030 goto def_value1; 2031 } else { 2032 DRM_WARN("Smu memory pool size not supported\n"); 2033 goto def_value; 2034 } 2035 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 2036 2037 return; 2038 2039 def_value1: 2040 DRM_WARN("No enough system memory\n"); 2041 def_value: 2042 adev->pm.smu_prv_buffer_size = 0; 2043 } 2044 2045 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 2046 { 2047 if (!(adev->flags & AMD_IS_APU) || 2048 adev->asic_type < CHIP_RAVEN) 2049 return 0; 2050 2051 switch (adev->asic_type) { 2052 case CHIP_RAVEN: 2053 if (adev->pdev->device == 0x15dd) 2054 adev->apu_flags |= AMD_APU_IS_RAVEN; 2055 if (adev->pdev->device == 0x15d8) 2056 adev->apu_flags |= AMD_APU_IS_PICASSO; 2057 break; 2058 case CHIP_RENOIR: 2059 if ((adev->pdev->device == 0x1636) || 2060 (adev->pdev->device == 0x164c)) 2061 adev->apu_flags |= AMD_APU_IS_RENOIR; 2062 else 2063 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 2064 break; 2065 case CHIP_VANGOGH: 2066 adev->apu_flags |= AMD_APU_IS_VANGOGH; 2067 break; 2068 case CHIP_YELLOW_CARP: 2069 break; 2070 case CHIP_CYAN_SKILLFISH: 2071 if ((adev->pdev->device == 0x13FE) || 2072 (adev->pdev->device == 0x143F)) 2073 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 2074 break; 2075 default: 2076 break; 2077 } 2078 2079 return 0; 2080 } 2081 2082 /** 2083 * amdgpu_device_check_arguments - validate module params 2084 * 2085 * @adev: amdgpu_device pointer 2086 * 2087 * Validates certain module parameters and updates 2088 * the associated values used by the driver (all asics). 
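 *
 * Example (editor's illustration of the checks below): booting with
 * amdgpu.sched_jobs=6 rounds the value up to the next power of two (8),
 * while amdgpu.gart_size=16 is rejected and reset to -1 (auto) because the
 * GART must be at least 32MB.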
2089 */ 2090 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 2091 { 2092 int i; 2093 2094 if (amdgpu_sched_jobs < 4) { 2095 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 2096 amdgpu_sched_jobs); 2097 amdgpu_sched_jobs = 4; 2098 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 2099 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 2100 amdgpu_sched_jobs); 2101 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 2102 } 2103 2104 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 2105 /* gart size must be greater or equal to 32M */ 2106 dev_warn(adev->dev, "gart size (%d) too small\n", 2107 amdgpu_gart_size); 2108 amdgpu_gart_size = -1; 2109 } 2110 2111 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 2112 /* gtt size must be greater or equal to 32M */ 2113 dev_warn(adev->dev, "gtt size (%d) too small\n", 2114 amdgpu_gtt_size); 2115 amdgpu_gtt_size = -1; 2116 } 2117 2118 /* valid range is between 4 and 9 inclusive */ 2119 if (amdgpu_vm_fragment_size != -1 && 2120 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 2121 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 2122 amdgpu_vm_fragment_size = -1; 2123 } 2124 2125 if (amdgpu_sched_hw_submission < 2) { 2126 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 2127 amdgpu_sched_hw_submission); 2128 amdgpu_sched_hw_submission = 2; 2129 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 2130 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 2131 amdgpu_sched_hw_submission); 2132 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 2133 } 2134 2135 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 2136 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 2137 amdgpu_reset_method = -1; 2138 } 2139 2140 amdgpu_device_check_smu_prv_buffer_size(adev); 2141 2142 amdgpu_device_check_vm_size(adev); 2143 2144 amdgpu_device_check_block_size(adev); 2145 2146 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 2147 2148 for (i = 0; i < MAX_XCP; i++) { 2149 switch (amdgpu_enforce_isolation) { 2150 case -1: 2151 case 0: 2152 default: 2153 /* disable */ 2154 adev->enforce_isolation[i] = AMDGPU_ENFORCE_ISOLATION_DISABLE; 2155 break; 2156 case 1: 2157 /* enable */ 2158 adev->enforce_isolation[i] = 2159 AMDGPU_ENFORCE_ISOLATION_ENABLE; 2160 break; 2161 case 2: 2162 /* enable legacy mode */ 2163 adev->enforce_isolation[i] = 2164 AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY; 2165 break; 2166 } 2167 } 2168 2169 return 0; 2170 } 2171 2172 /** 2173 * amdgpu_switcheroo_set_state - set switcheroo state 2174 * 2175 * @pdev: pci dev pointer 2176 * @state: vga_switcheroo state 2177 * 2178 * Callback for the switcheroo driver. Suspends or resumes 2179 * the asics before or after it is powered up using ACPI methods. 
2180 */ 2181 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 2182 enum vga_switcheroo_state state) 2183 { 2184 struct drm_device *dev = pci_get_drvdata(pdev); 2185 int r; 2186 2187 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 2188 return; 2189 2190 if (state == VGA_SWITCHEROO_ON) { 2191 pr_info("switched on\n"); 2192 /* don't suspend or resume card normally */ 2193 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2194 2195 pci_set_power_state(pdev, PCI_D0); 2196 amdgpu_device_load_pci_state(pdev); 2197 r = pci_enable_device(pdev); 2198 if (r) 2199 DRM_WARN("pci_enable_device failed (%d)\n", r); 2200 amdgpu_device_resume(dev, true); 2201 2202 dev->switch_power_state = DRM_SWITCH_POWER_ON; 2203 } else { 2204 pr_info("switched off\n"); 2205 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2206 amdgpu_device_prepare(dev); 2207 amdgpu_device_suspend(dev, true); 2208 amdgpu_device_cache_pci_state(pdev); 2209 /* Shut down the device */ 2210 pci_disable_device(pdev); 2211 pci_set_power_state(pdev, PCI_D3cold); 2212 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 2213 } 2214 } 2215 2216 /** 2217 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 2218 * 2219 * @pdev: pci dev pointer 2220 * 2221 * Callback for the switcheroo driver. Check of the switcheroo 2222 * state can be changed. 2223 * Returns true if the state can be changed, false if not. 2224 */ 2225 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 2226 { 2227 struct drm_device *dev = pci_get_drvdata(pdev); 2228 2229 /* 2230 * FIXME: open_count is protected by drm_global_mutex but that would lead to 2231 * locking inversion with the driver load path. And the access here is 2232 * completely racy anyway. So don't bother with locking for now. 2233 */ 2234 return atomic_read(&dev->open_count) == 0; 2235 } 2236 2237 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 2238 .set_gpu_state = amdgpu_switcheroo_set_state, 2239 .reprobe = NULL, 2240 .can_switch = amdgpu_switcheroo_can_switch, 2241 }; 2242 2243 /** 2244 * amdgpu_device_ip_set_clockgating_state - set the CG state 2245 * 2246 * @dev: amdgpu_device pointer 2247 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2248 * @state: clockgating state (gate or ungate) 2249 * 2250 * Sets the requested clockgating state for all instances of 2251 * the hardware IP specified. 2252 * Returns the error code from the last instance. 2253 */ 2254 int amdgpu_device_ip_set_clockgating_state(void *dev, 2255 enum amd_ip_block_type block_type, 2256 enum amd_clockgating_state state) 2257 { 2258 struct amdgpu_device *adev = dev; 2259 int i, r = 0; 2260 2261 for (i = 0; i < adev->num_ip_blocks; i++) { 2262 if (!adev->ip_blocks[i].status.valid) 2263 continue; 2264 if (adev->ip_blocks[i].version->type != block_type) 2265 continue; 2266 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 2267 continue; 2268 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 2269 &adev->ip_blocks[i], state); 2270 if (r) 2271 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 2272 adev->ip_blocks[i].version->funcs->name, r); 2273 } 2274 return r; 2275 } 2276 2277 /** 2278 * amdgpu_device_ip_set_powergating_state - set the PG state 2279 * 2280 * @dev: amdgpu_device pointer 2281 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2282 * @state: powergating state (gate or ungate) 2283 * 2284 * Sets the requested powergating state for all instances of 2285 * the hardware IP specified. 
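 *
 * Illustrative usage (editor's sketch, not taken from this section): a
 * caller that wants to gate all VCN instances could use
 *
 *   amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCN,
 *                                          AMD_PG_STATE_GATE);
 *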
2286 * Returns the error code from the last instance. 2287 */ 2288 int amdgpu_device_ip_set_powergating_state(void *dev, 2289 enum amd_ip_block_type block_type, 2290 enum amd_powergating_state state) 2291 { 2292 struct amdgpu_device *adev = dev; 2293 int i, r = 0; 2294 2295 for (i = 0; i < adev->num_ip_blocks; i++) { 2296 if (!adev->ip_blocks[i].status.valid) 2297 continue; 2298 if (adev->ip_blocks[i].version->type != block_type) 2299 continue; 2300 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 2301 continue; 2302 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 2303 &adev->ip_blocks[i], state); 2304 if (r) 2305 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 2306 adev->ip_blocks[i].version->funcs->name, r); 2307 } 2308 return r; 2309 } 2310 2311 /** 2312 * amdgpu_device_ip_get_clockgating_state - get the CG state 2313 * 2314 * @adev: amdgpu_device pointer 2315 * @flags: clockgating feature flags 2316 * 2317 * Walks the list of IPs on the device and updates the clockgating 2318 * flags for each IP. 2319 * Updates @flags with the feature flags for each hardware IP where 2320 * clockgating is enabled. 2321 */ 2322 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 2323 u64 *flags) 2324 { 2325 int i; 2326 2327 for (i = 0; i < adev->num_ip_blocks; i++) { 2328 if (!adev->ip_blocks[i].status.valid) 2329 continue; 2330 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 2331 adev->ip_blocks[i].version->funcs->get_clockgating_state( 2332 &adev->ip_blocks[i], flags); 2333 } 2334 } 2335 2336 /** 2337 * amdgpu_device_ip_wait_for_idle - wait for idle 2338 * 2339 * @adev: amdgpu_device pointer 2340 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2341 * 2342 * Waits for the requested hardware IP to be idle. 2343 * Returns 0 for success or a negative error code on failure. 2344 */ 2345 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 2346 enum amd_ip_block_type block_type) 2347 { 2348 int i, r; 2349 2350 for (i = 0; i < adev->num_ip_blocks; i++) { 2351 if (!adev->ip_blocks[i].status.valid) 2352 continue; 2353 if (adev->ip_blocks[i].version->type == block_type) { 2354 if (adev->ip_blocks[i].version->funcs->wait_for_idle) { 2355 r = adev->ip_blocks[i].version->funcs->wait_for_idle( 2356 &adev->ip_blocks[i]); 2357 if (r) 2358 return r; 2359 } 2360 break; 2361 } 2362 } 2363 return 0; 2364 2365 } 2366 2367 /** 2368 * amdgpu_device_ip_is_valid - is the hardware IP enabled 2369 * 2370 * @adev: amdgpu_device pointer 2371 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2372 * 2373 * Check if the hardware IP is enabled or not. 2374 * Returns true if the IP is enabled, false if not. 2375 */ 2376 bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev, 2377 enum amd_ip_block_type block_type) 2378 { 2379 int i; 2380 2381 for (i = 0; i < adev->num_ip_blocks; i++) { 2382 if (adev->ip_blocks[i].version->type == block_type) 2383 return adev->ip_blocks[i].status.valid; 2384 } 2385 return false; 2386 2387 } 2388 2389 /** 2390 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 2391 * 2392 * @adev: amdgpu_device pointer 2393 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 2394 * 2395 * Returns a pointer to the hardware IP block structure 2396 * if it exists for the asic, otherwise NULL.
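 *
 * Illustrative usage (editor's sketch, mirroring the early-init code later
 * in this file):
 *
 *   struct amdgpu_ip_block *ip_block =
 *           amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *   if (ip_block && ip_block->status.valid)
 *           amdgpu_amdkfd_device_probe(adev);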
2397 */ 2398 struct amdgpu_ip_block * 2399 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 2400 enum amd_ip_block_type type) 2401 { 2402 int i; 2403 2404 for (i = 0; i < adev->num_ip_blocks; i++) 2405 if (adev->ip_blocks[i].version->type == type) 2406 return &adev->ip_blocks[i]; 2407 2408 return NULL; 2409 } 2410 2411 /** 2412 * amdgpu_device_ip_block_version_cmp 2413 * 2414 * @adev: amdgpu_device pointer 2415 * @type: enum amd_ip_block_type 2416 * @major: major version 2417 * @minor: minor version 2418 * 2419 * return 0 if equal or greater 2420 * return 1 if smaller or the ip_block doesn't exist 2421 */ 2422 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 2423 enum amd_ip_block_type type, 2424 u32 major, u32 minor) 2425 { 2426 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 2427 2428 if (ip_block && ((ip_block->version->major > major) || 2429 ((ip_block->version->major == major) && 2430 (ip_block->version->minor >= minor)))) 2431 return 0; 2432 2433 return 1; 2434 } 2435 2436 /** 2437 * amdgpu_device_ip_block_add 2438 * 2439 * @adev: amdgpu_device pointer 2440 * @ip_block_version: pointer to the IP to add 2441 * 2442 * Adds the IP block driver information to the collection of IPs 2443 * on the asic. 2444 */ 2445 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 2446 const struct amdgpu_ip_block_version *ip_block_version) 2447 { 2448 if (!ip_block_version) 2449 return -EINVAL; 2450 2451 switch (ip_block_version->type) { 2452 case AMD_IP_BLOCK_TYPE_VCN: 2453 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2454 return 0; 2455 break; 2456 case AMD_IP_BLOCK_TYPE_JPEG: 2457 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2458 return 0; 2459 break; 2460 default: 2461 break; 2462 } 2463 2464 dev_info(adev->dev, "detected ip block number %d <%s>\n", 2465 adev->num_ip_blocks, ip_block_version->funcs->name); 2466 2467 adev->ip_blocks[adev->num_ip_blocks].adev = adev; 2468 2469 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2470 2471 return 0; 2472 } 2473 2474 /** 2475 * amdgpu_device_enable_virtual_display - enable virtual display feature 2476 * 2477 * @adev: amdgpu_device pointer 2478 * 2479 * Enabled the virtual display feature if the user has enabled it via 2480 * the module parameter virtual_display. This feature provides a virtual 2481 * display hardware on headless boards or in virtualized environments. 2482 * This function parses and validates the configuration string specified by 2483 * the user and configures the virtual display configuration (number of 2484 * virtual connectors, crtcs, etc.) specified. 
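 *
 * Example (editor's illustration of the format parsed below):
 * amdgpu.virtual_display=0000:04:00.0,2 enables two virtual CRTCs on that
 * device only, while amdgpu.virtual_display=all,1 enables one virtual CRTC
 * on every amdgpu device; the CRTC count is clamped to the range 1-6.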
2485 */ 2486 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2487 { 2488 adev->enable_virtual_display = false; 2489 2490 if (amdgpu_virtual_display) { 2491 const char *pci_address_name = pci_name(adev->pdev); 2492 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2493 2494 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2495 pciaddstr_tmp = pciaddstr; 2496 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2497 pciaddname = strsep(&pciaddname_tmp, ","); 2498 if (!strcmp("all", pciaddname) 2499 || !strcmp(pci_address_name, pciaddname)) { 2500 long num_crtc; 2501 int res = -1; 2502 2503 adev->enable_virtual_display = true; 2504 2505 if (pciaddname_tmp) 2506 res = kstrtol(pciaddname_tmp, 10, 2507 &num_crtc); 2508 2509 if (!res) { 2510 if (num_crtc < 1) 2511 num_crtc = 1; 2512 if (num_crtc > 6) 2513 num_crtc = 6; 2514 adev->mode_info.num_crtc = num_crtc; 2515 } else { 2516 adev->mode_info.num_crtc = 1; 2517 } 2518 break; 2519 } 2520 } 2521 2522 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2523 amdgpu_virtual_display, pci_address_name, 2524 adev->enable_virtual_display, adev->mode_info.num_crtc); 2525 2526 kfree(pciaddstr); 2527 } 2528 } 2529 2530 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2531 { 2532 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2533 adev->mode_info.num_crtc = 1; 2534 adev->enable_virtual_display = true; 2535 DRM_INFO("virtual_display:%d, num_crtc:%d\n", 2536 adev->enable_virtual_display, adev->mode_info.num_crtc); 2537 } 2538 } 2539 2540 /** 2541 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2542 * 2543 * @adev: amdgpu_device pointer 2544 * 2545 * Parses the asic configuration parameters specified in the gpu info 2546 * firmware and makes them available to the driver for use in configuring 2547 * the asic. 2548 * Returns 0 on success, -EINVAL on failure. 2549 */ 2550 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2551 { 2552 const char *chip_name; 2553 int err; 2554 const struct gpu_info_firmware_header_v1_0 *hdr; 2555 2556 adev->firmware.gpu_info_fw = NULL; 2557 2558 if (adev->mman.discovery_bin) 2559 return 0; 2560 2561 switch (adev->asic_type) { 2562 default: 2563 return 0; 2564 case CHIP_VEGA10: 2565 chip_name = "vega10"; 2566 break; 2567 case CHIP_VEGA12: 2568 chip_name = "vega12"; 2569 break; 2570 case CHIP_RAVEN: 2571 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2572 chip_name = "raven2"; 2573 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2574 chip_name = "picasso"; 2575 else 2576 chip_name = "raven"; 2577 break; 2578 case CHIP_ARCTURUS: 2579 chip_name = "arcturus"; 2580 break; 2581 case CHIP_NAVI12: 2582 chip_name = "navi12"; 2583 break; 2584 } 2585 2586 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, 2587 AMDGPU_UCODE_OPTIONAL, 2588 "amdgpu/%s_gpu_info.bin", chip_name); 2589 if (err) { 2590 dev_err(adev->dev, 2591 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n", 2592 chip_name); 2593 goto out; 2594 } 2595 2596 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2597 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2598 2599 switch (hdr->version_major) { 2600 case 1: 2601 { 2602 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2603 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2604 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2605 2606 /* 2607 * Should be dropped when DAL no longer needs it. 
2608 */ 2609 if (adev->asic_type == CHIP_NAVI12) 2610 goto parse_soc_bounding_box; 2611 2612 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2613 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2614 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2615 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2616 adev->gfx.config.max_texture_channel_caches = 2617 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2618 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2619 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2620 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2621 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2622 adev->gfx.config.double_offchip_lds_buf = 2623 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2624 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2625 adev->gfx.cu_info.max_waves_per_simd = 2626 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2627 adev->gfx.cu_info.max_scratch_slots_per_cu = 2628 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2629 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2630 if (hdr->version_minor >= 1) { 2631 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2632 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2633 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2634 adev->gfx.config.num_sc_per_sh = 2635 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2636 adev->gfx.config.num_packer_per_sc = 2637 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2638 } 2639 2640 parse_soc_bounding_box: 2641 /* 2642 * soc bounding box info is not integrated in the discovery table, 2643 * so we always parse it from the gpu info firmware when needed. 2644 */ 2645 if (hdr->version_minor == 2) { 2646 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2647 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2648 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2649 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2650 } 2651 break; 2652 } 2653 default: 2654 dev_err(adev->dev, 2655 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2656 err = -EINVAL; 2657 goto out; 2658 } 2659 out: 2660 return err; 2661 } 2662 2663 /** 2664 * amdgpu_device_ip_early_init - run early init for hardware IPs 2665 * 2666 * @adev: amdgpu_device pointer 2667 * 2668 * Early initialization pass for hardware IPs. The hardware IPs that make 2669 * up each asic are discovered and each IP's early_init callback is run. This 2670 * is the first stage in initializing the asic. 2671 * Returns 0 on success, negative error code on failure.
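 *
 * Editor's note (ordering as implemented later in this file):
 *
 *   amdgpu_device_ip_early_init()  - this function
 *   amdgpu_device_ip_init()        - sw_init plus the two hw_init phases
 *   amdgpu_device_ip_late_init()   - late_init, then CG/PG gating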
2672 */ 2673 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2674 { 2675 struct amdgpu_ip_block *ip_block; 2676 struct pci_dev *parent; 2677 bool total, skip_bios; 2678 uint32_t bios_flags; 2679 int i, r; 2680 2681 amdgpu_device_enable_virtual_display(adev); 2682 2683 if (amdgpu_sriov_vf(adev)) { 2684 r = amdgpu_virt_request_full_gpu(adev, true); 2685 if (r) 2686 return r; 2687 } 2688 2689 switch (adev->asic_type) { 2690 #ifdef CONFIG_DRM_AMDGPU_SI 2691 case CHIP_VERDE: 2692 case CHIP_TAHITI: 2693 case CHIP_PITCAIRN: 2694 case CHIP_OLAND: 2695 case CHIP_HAINAN: 2696 adev->family = AMDGPU_FAMILY_SI; 2697 r = si_set_ip_blocks(adev); 2698 if (r) 2699 return r; 2700 break; 2701 #endif 2702 #ifdef CONFIG_DRM_AMDGPU_CIK 2703 case CHIP_BONAIRE: 2704 case CHIP_HAWAII: 2705 case CHIP_KAVERI: 2706 case CHIP_KABINI: 2707 case CHIP_MULLINS: 2708 if (adev->flags & AMD_IS_APU) 2709 adev->family = AMDGPU_FAMILY_KV; 2710 else 2711 adev->family = AMDGPU_FAMILY_CI; 2712 2713 r = cik_set_ip_blocks(adev); 2714 if (r) 2715 return r; 2716 break; 2717 #endif 2718 case CHIP_TOPAZ: 2719 case CHIP_TONGA: 2720 case CHIP_FIJI: 2721 case CHIP_POLARIS10: 2722 case CHIP_POLARIS11: 2723 case CHIP_POLARIS12: 2724 case CHIP_VEGAM: 2725 case CHIP_CARRIZO: 2726 case CHIP_STONEY: 2727 if (adev->flags & AMD_IS_APU) 2728 adev->family = AMDGPU_FAMILY_CZ; 2729 else 2730 adev->family = AMDGPU_FAMILY_VI; 2731 2732 r = vi_set_ip_blocks(adev); 2733 if (r) 2734 return r; 2735 break; 2736 default: 2737 r = amdgpu_discovery_set_ip_blocks(adev); 2738 if (r) 2739 return r; 2740 break; 2741 } 2742 2743 /* Check for IP version 9.4.3 with A0 hardware */ 2744 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) && 2745 !amdgpu_device_get_rev_id(adev)) { 2746 dev_err(adev->dev, "Unsupported A0 hardware\n"); 2747 return -ENODEV; /* device unsupported - no device error */ 2748 } 2749 2750 if (amdgpu_has_atpx() && 2751 (amdgpu_is_atpx_hybrid() || 2752 amdgpu_has_atpx_dgpu_power_cntl()) && 2753 ((adev->flags & AMD_IS_APU) == 0) && 2754 !dev_is_removable(&adev->pdev->dev)) 2755 adev->flags |= AMD_IS_PX; 2756 2757 if (!(adev->flags & AMD_IS_APU)) { 2758 parent = pcie_find_root_port(adev->pdev); 2759 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2760 } 2761 2762 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2763 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2764 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2765 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2766 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2767 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2768 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2769 2770 total = true; 2771 for (i = 0; i < adev->num_ip_blocks; i++) { 2772 ip_block = &adev->ip_blocks[i]; 2773 2774 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2775 DRM_WARN("disabled ip block: %d <%s>\n", 2776 i, adev->ip_blocks[i].version->funcs->name); 2777 adev->ip_blocks[i].status.valid = false; 2778 } else if (ip_block->version->funcs->early_init) { 2779 r = ip_block->version->funcs->early_init(ip_block); 2780 if (r == -ENOENT) { 2781 adev->ip_blocks[i].status.valid = false; 2782 } else if (r) { 2783 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2784 adev->ip_blocks[i].version->funcs->name, r); 2785 total = false; 2786 } else { 2787 adev->ip_blocks[i].status.valid = true; 2788 } 2789 } else { 2790 adev->ip_blocks[i].status.valid = true; 2791 } 2792 /* get the vbios after the asic_funcs are set up */ 2793 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2794 r = amdgpu_device_parse_gpu_info_fw(adev); 2795 if (r) 2796 return r; 2797 2798 bios_flags = amdgpu_device_get_vbios_flags(adev); 2799 skip_bios = !!(bios_flags & AMDGPU_VBIOS_SKIP); 2800 /* Read BIOS */ 2801 if (!skip_bios) { 2802 bool optional = 2803 !!(bios_flags & AMDGPU_VBIOS_OPTIONAL); 2804 if (!amdgpu_get_bios(adev) && !optional) 2805 return -EINVAL; 2806 2807 if (optional && !adev->bios) 2808 dev_info( 2809 adev->dev, 2810 "VBIOS image optional, proceeding without VBIOS image"); 2811 2812 if (adev->bios) { 2813 r = amdgpu_atombios_init(adev); 2814 if (r) { 2815 dev_err(adev->dev, 2816 "amdgpu_atombios_init failed\n"); 2817 amdgpu_vf_error_put( 2818 adev, 2819 AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 2820 0, 0); 2821 return r; 2822 } 2823 } 2824 } 2825 2826 /*get pf2vf msg info at it's earliest time*/ 2827 if (amdgpu_sriov_vf(adev)) 2828 amdgpu_virt_init_data_exchange(adev); 2829 2830 } 2831 } 2832 if (!total) 2833 return -ENODEV; 2834 2835 if (adev->gmc.xgmi.supported) 2836 amdgpu_xgmi_early_init(adev); 2837 2838 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 2839 if (ip_block->status.valid != false) 2840 amdgpu_amdkfd_device_probe(adev); 2841 2842 adev->cg_flags &= amdgpu_cg_mask; 2843 adev->pg_flags &= amdgpu_pg_mask; 2844 2845 return 0; 2846 } 2847 2848 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2849 { 2850 int i, r; 2851 2852 for (i = 0; i < adev->num_ip_blocks; i++) { 2853 if (!adev->ip_blocks[i].status.sw) 2854 continue; 2855 if (adev->ip_blocks[i].status.hw) 2856 continue; 2857 if (!amdgpu_ip_member_of_hwini( 2858 adev, adev->ip_blocks[i].version->type)) 2859 continue; 2860 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2861 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2862 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2863 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2864 if (r) { 2865 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2866 adev->ip_blocks[i].version->funcs->name, r); 2867 return r; 2868 } 2869 adev->ip_blocks[i].status.hw = true; 2870 } 2871 } 2872 2873 return 0; 2874 } 2875 
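
/*
 * Editor's note (illustrative): phase 1 above only brings up the blocks
 * everything else depends on (COMMON, IH, and PSP when running as an
 * SR-IOV VF); phase 2 below then walks the remaining blocks. The two are
 * always paired, roughly as amdgpu_device_ip_init() does later in this
 * file:
 *
 *   r = amdgpu_device_ip_hw_init_phase1(adev);
 *   if (!r)
 *           r = amdgpu_device_fw_loading(adev);
 *   if (!r)
 *           r = amdgpu_device_ip_hw_init_phase2(adev);
 */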
2876 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2877 { 2878 int i, r; 2879 2880 for (i = 0; i < adev->num_ip_blocks; i++) { 2881 if (!adev->ip_blocks[i].status.sw) 2882 continue; 2883 if (adev->ip_blocks[i].status.hw) 2884 continue; 2885 if (!amdgpu_ip_member_of_hwini( 2886 adev, adev->ip_blocks[i].version->type)) 2887 continue; 2888 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2889 if (r) { 2890 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2891 adev->ip_blocks[i].version->funcs->name, r); 2892 return r; 2893 } 2894 adev->ip_blocks[i].status.hw = true; 2895 } 2896 2897 return 0; 2898 } 2899 2900 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2901 { 2902 int r = 0; 2903 int i; 2904 uint32_t smu_version; 2905 2906 if (adev->asic_type >= CHIP_VEGA10) { 2907 for (i = 0; i < adev->num_ip_blocks; i++) { 2908 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2909 continue; 2910 2911 if (!amdgpu_ip_member_of_hwini(adev, 2912 AMD_IP_BLOCK_TYPE_PSP)) 2913 break; 2914 2915 if (!adev->ip_blocks[i].status.sw) 2916 continue; 2917 2918 /* no need to do the fw loading again if already done*/ 2919 if (adev->ip_blocks[i].status.hw == true) 2920 break; 2921 2922 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2923 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 2924 if (r) 2925 return r; 2926 } else { 2927 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2928 if (r) { 2929 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2930 adev->ip_blocks[i].version->funcs->name, r); 2931 return r; 2932 } 2933 adev->ip_blocks[i].status.hw = true; 2934 } 2935 break; 2936 } 2937 } 2938 2939 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2940 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2941 2942 return r; 2943 } 2944 2945 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2946 { 2947 struct drm_sched_init_args args = { 2948 .ops = &amdgpu_sched_ops, 2949 .num_rqs = DRM_SCHED_PRIORITY_COUNT, 2950 .timeout_wq = adev->reset_domain->wq, 2951 .dev = adev->dev, 2952 }; 2953 long timeout; 2954 int r, i; 2955 2956 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2957 struct amdgpu_ring *ring = adev->rings[i]; 2958 2959 /* No need to setup the GPU scheduler for rings that don't need it */ 2960 if (!ring || ring->no_scheduler) 2961 continue; 2962 2963 switch (ring->funcs->type) { 2964 case AMDGPU_RING_TYPE_GFX: 2965 timeout = adev->gfx_timeout; 2966 break; 2967 case AMDGPU_RING_TYPE_COMPUTE: 2968 timeout = adev->compute_timeout; 2969 break; 2970 case AMDGPU_RING_TYPE_SDMA: 2971 timeout = adev->sdma_timeout; 2972 break; 2973 default: 2974 timeout = adev->video_timeout; 2975 break; 2976 } 2977 2978 args.timeout = timeout; 2979 args.credit_limit = ring->num_hw_submission; 2980 args.score = ring->sched_score; 2981 args.name = ring->name; 2982 2983 r = drm_sched_init(&ring->sched, &args); 2984 if (r) { 2985 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2986 ring->name); 2987 return r; 2988 } 2989 r = amdgpu_uvd_entity_init(adev, ring); 2990 if (r) { 2991 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n", 2992 ring->name); 2993 return r; 2994 } 2995 r = amdgpu_vce_entity_init(adev, ring); 2996 if (r) { 2997 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n", 2998 ring->name); 2999 return r; 3000 } 3001 } 3002 3003 amdgpu_xcp_update_partition_sched_list(adev); 3004 3005 return 0; 3006 } 3007 3008 3009 /** 3010 * amdgpu_device_ip_init - run init for hardware IPs 
3011 * 3012 * @adev: amdgpu_device pointer 3013 * 3014 * Main initialization pass for hardware IPs. The list of all the hardware 3015 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 3016 * are run. sw_init initializes the software state associated with each IP 3017 * and hw_init initializes the hardware associated with each IP. 3018 * Returns 0 on success, negative error code on failure. 3019 */ 3020 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 3021 { 3022 bool init_badpage; 3023 int i, r; 3024 3025 r = amdgpu_ras_init(adev); 3026 if (r) 3027 return r; 3028 3029 for (i = 0; i < adev->num_ip_blocks; i++) { 3030 if (!adev->ip_blocks[i].status.valid) 3031 continue; 3032 if (adev->ip_blocks[i].version->funcs->sw_init) { 3033 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]); 3034 if (r) { 3035 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 3036 adev->ip_blocks[i].version->funcs->name, r); 3037 goto init_failed; 3038 } 3039 } 3040 adev->ip_blocks[i].status.sw = true; 3041 3042 if (!amdgpu_ip_member_of_hwini( 3043 adev, adev->ip_blocks[i].version->type)) 3044 continue; 3045 3046 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 3047 /* need to do common hw init early so everything is set up for gmc */ 3048 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3049 if (r) { 3050 DRM_ERROR("hw_init %d failed %d\n", i, r); 3051 goto init_failed; 3052 } 3053 adev->ip_blocks[i].status.hw = true; 3054 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3055 /* need to do gmc hw init early so we can allocate gpu mem */ 3056 /* Try to reserve bad pages early */ 3057 if (amdgpu_sriov_vf(adev)) 3058 amdgpu_virt_exchange_data(adev); 3059 3060 r = amdgpu_device_mem_scratch_init(adev); 3061 if (r) { 3062 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 3063 goto init_failed; 3064 } 3065 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3066 if (r) { 3067 DRM_ERROR("hw_init %d failed %d\n", i, r); 3068 goto init_failed; 3069 } 3070 r = amdgpu_device_wb_init(adev); 3071 if (r) { 3072 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 3073 goto init_failed; 3074 } 3075 adev->ip_blocks[i].status.hw = true; 3076 3077 /* right after GMC hw init, we create CSA */ 3078 if (adev->gfx.mcbp) { 3079 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 3080 AMDGPU_GEM_DOMAIN_VRAM | 3081 AMDGPU_GEM_DOMAIN_GTT, 3082 AMDGPU_CSA_SIZE); 3083 if (r) { 3084 DRM_ERROR("allocate CSA failed %d\n", r); 3085 goto init_failed; 3086 } 3087 } 3088 3089 r = amdgpu_seq64_init(adev); 3090 if (r) { 3091 DRM_ERROR("allocate seq64 failed %d\n", r); 3092 goto init_failed; 3093 } 3094 } 3095 } 3096 3097 if (amdgpu_sriov_vf(adev)) 3098 amdgpu_virt_init_data_exchange(adev); 3099 3100 r = amdgpu_ib_pool_init(adev); 3101 if (r) { 3102 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 3103 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 3104 goto init_failed; 3105 } 3106 3107 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 3108 if (r) 3109 goto init_failed; 3110 3111 r = amdgpu_device_ip_hw_init_phase1(adev); 3112 if (r) 3113 goto init_failed; 3114 3115 r = amdgpu_device_fw_loading(adev); 3116 if (r) 3117 goto init_failed; 3118 3119 r = amdgpu_device_ip_hw_init_phase2(adev); 3120 if (r) 3121 goto init_failed; 3122 3123 /* 3124 * retired pages will be loaded from eeprom and reserved here, 3125 * it should be called after amdgpu_device_ip_hw_init_phase2 
since 3126 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 3127 * for I2C communication, which is only true at this point. 3128 * 3129 * amdgpu_ras_recovery_init may fail, but the caller only cares about 3130 * failures caused by a bad gpu situation and stops the amdgpu init 3131 * process accordingly. For other failures, it still releases all 3132 * the resources and prints an error message, rather than returning a 3133 * negative value to the upper level. 3134 * 3135 * Note: theoretically, this should be called before all vram allocations 3136 * to protect retired pages from being reused. 3137 */ 3138 init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 3139 r = amdgpu_ras_recovery_init(adev, init_badpage); 3140 if (r) 3141 goto init_failed; 3142 3143 /* 3144 * In case of XGMI, grab an extra reference on the reset domain for this device 3145 */ 3146 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3147 if (amdgpu_xgmi_add_device(adev) == 0) { 3148 if (!amdgpu_sriov_vf(adev)) { 3149 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3150 3151 if (WARN_ON(!hive)) { 3152 r = -ENOENT; 3153 goto init_failed; 3154 } 3155 3156 if (!hive->reset_domain || 3157 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 3158 r = -ENOENT; 3159 amdgpu_put_xgmi_hive(hive); 3160 goto init_failed; 3161 } 3162 3163 /* Drop the early temporary reset domain we created for this device */ 3164 amdgpu_reset_put_reset_domain(adev->reset_domain); 3165 adev->reset_domain = hive->reset_domain; 3166 amdgpu_put_xgmi_hive(hive); 3167 } 3168 } 3169 } 3170 3171 r = amdgpu_device_init_schedulers(adev); 3172 if (r) 3173 goto init_failed; 3174 3175 if (adev->mman.buffer_funcs_ring->sched.ready) 3176 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3177 3178 /* Don't init kfd if the whole hive needs to be reset during init */ 3179 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 3180 kgd2kfd_init_zone_device(adev); 3181 amdgpu_amdkfd_device_init(adev); 3182 } 3183 3184 amdgpu_fru_get_product_info(adev); 3185 3186 if (!amdgpu_sriov_vf(adev) || amdgpu_sriov_ras_cper_en(adev)) 3187 r = amdgpu_cper_init(adev); 3188 3189 init_failed: 3190 3191 return r; 3192 } 3193 3194 /** 3195 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 3196 * 3197 * @adev: amdgpu_device pointer 3198 * 3199 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 3200 * this function before a GPU reset. If the value is retained after a 3201 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 3202 */ 3203 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 3204 { 3205 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 3206 } 3207 3208 /** 3209 * amdgpu_device_check_vram_lost - check if vram is valid 3210 * 3211 * @adev: amdgpu_device pointer 3212 * 3213 * Checks the reset magic value written to the gart pointer in VRAM. 3214 * The driver calls this after a GPU reset to see if the contents of 3215 * VRAM are lost or not. 3216 * Returns true if vram is lost, false if not. 3217 */ 3218 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 3219 { 3220 if (memcmp(adev->gart.ptr, adev->reset_magic, 3221 AMDGPU_RESET_MAGIC_NUM)) 3222 return true; 3223 3224 if (!amdgpu_in_reset(adev)) 3225 return false; 3226 3227 /* 3228 * For all ASICs with baco/mode1 reset, the VRAM is 3229 * always assumed to be lost.
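 *
 * Editor's note (illustrative): this pairs with
 * amdgpu_device_fill_reset_magic() above, which snapshots the first
 * AMDGPU_RESET_MAGIC_NUM bytes of the GART-mapped page before a reset; if
 * the memcmp() above no longer matches afterwards, VRAM contents were lost.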
3230 */ 3231 switch (amdgpu_asic_reset_method(adev)) { 3232 case AMD_RESET_METHOD_LINK: 3233 case AMD_RESET_METHOD_BACO: 3234 case AMD_RESET_METHOD_MODE1: 3235 return true; 3236 default: 3237 return false; 3238 } 3239 } 3240 3241 /** 3242 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 3243 * 3244 * @adev: amdgpu_device pointer 3245 * @state: clockgating state (gate or ungate) 3246 * 3247 * The list of all the hardware IPs that make up the asic is walked and the 3248 * set_clockgating_state callbacks are run. 3249 * Late initialization pass enabling clockgating for hardware IPs. 3250 * Fini or suspend, pass disabling clockgating for hardware IPs. 3251 * Returns 0 on success, negative error code on failure. 3252 */ 3253 3254 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 3255 enum amd_clockgating_state state) 3256 { 3257 int i, j, r; 3258 3259 if (amdgpu_emu_mode == 1) 3260 return 0; 3261 3262 for (j = 0; j < adev->num_ip_blocks; j++) { 3263 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3264 if (!adev->ip_blocks[i].status.late_initialized) 3265 continue; 3266 /* skip CG for GFX, SDMA on S0ix */ 3267 if (adev->in_s0ix && 3268 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3269 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3270 continue; 3271 /* skip CG for VCE/UVD, it's handled specially */ 3272 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3273 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3274 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3275 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3276 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 3277 /* enable clockgating to save power */ 3278 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i], 3279 state); 3280 if (r) { 3281 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 3282 adev->ip_blocks[i].version->funcs->name, r); 3283 return r; 3284 } 3285 } 3286 } 3287 3288 return 0; 3289 } 3290 3291 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 3292 enum amd_powergating_state state) 3293 { 3294 int i, j, r; 3295 3296 if (amdgpu_emu_mode == 1) 3297 return 0; 3298 3299 for (j = 0; j < adev->num_ip_blocks; j++) { 3300 i = state == AMD_PG_STATE_GATE ? 
j : adev->num_ip_blocks - j - 1; 3301 if (!adev->ip_blocks[i].status.late_initialized) 3302 continue; 3303 /* skip PG for GFX, SDMA on S0ix */ 3304 if (adev->in_s0ix && 3305 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3306 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3307 continue; 3308 /* skip PG for VCE/UVD, it's handled specially */ 3309 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3310 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3311 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3312 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3313 adev->ip_blocks[i].version->funcs->set_powergating_state) { 3314 /* enable powergating to save power */ 3315 r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i], 3316 state); 3317 if (r) { 3318 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 3319 adev->ip_blocks[i].version->funcs->name, r); 3320 return r; 3321 } 3322 } 3323 } 3324 return 0; 3325 } 3326 3327 static int amdgpu_device_enable_mgpu_fan_boost(void) 3328 { 3329 struct amdgpu_gpu_instance *gpu_ins; 3330 struct amdgpu_device *adev; 3331 int i, ret = 0; 3332 3333 mutex_lock(&mgpu_info.mutex); 3334 3335 /* 3336 * MGPU fan boost feature should be enabled 3337 * only when there are two or more dGPUs in 3338 * the system 3339 */ 3340 if (mgpu_info.num_dgpu < 2) 3341 goto out; 3342 3343 for (i = 0; i < mgpu_info.num_dgpu; i++) { 3344 gpu_ins = &(mgpu_info.gpu_ins[i]); 3345 adev = gpu_ins->adev; 3346 if (!(adev->flags & AMD_IS_APU) && 3347 !gpu_ins->mgpu_fan_enabled) { 3348 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 3349 if (ret) 3350 break; 3351 3352 gpu_ins->mgpu_fan_enabled = 1; 3353 } 3354 } 3355 3356 out: 3357 mutex_unlock(&mgpu_info.mutex); 3358 3359 return ret; 3360 } 3361 3362 /** 3363 * amdgpu_device_ip_late_init - run late init for hardware IPs 3364 * 3365 * @adev: amdgpu_device pointer 3366 * 3367 * Late initialization pass for hardware IPs. The list of all the hardware 3368 * IPs that make up the asic is walked and the late_init callbacks are run. 3369 * late_init covers any special initialization that an IP requires 3370 * after all of them have been initialized or something that needs to happen 3371 * late in the init process. 3372 * Returns 0 on success, negative error code on failure.
3373 */ 3374 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 3375 { 3376 struct amdgpu_gpu_instance *gpu_instance; 3377 int i = 0, r; 3378 3379 for (i = 0; i < adev->num_ip_blocks; i++) { 3380 if (!adev->ip_blocks[i].status.hw) 3381 continue; 3382 if (adev->ip_blocks[i].version->funcs->late_init) { 3383 r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]); 3384 if (r) { 3385 DRM_ERROR("late_init of IP block <%s> failed %d\n", 3386 adev->ip_blocks[i].version->funcs->name, r); 3387 return r; 3388 } 3389 } 3390 adev->ip_blocks[i].status.late_initialized = true; 3391 } 3392 3393 r = amdgpu_ras_late_init(adev); 3394 if (r) { 3395 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 3396 return r; 3397 } 3398 3399 if (!amdgpu_reset_in_recovery(adev)) 3400 amdgpu_ras_set_error_query_ready(adev, true); 3401 3402 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3403 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3404 3405 amdgpu_device_fill_reset_magic(adev); 3406 3407 r = amdgpu_device_enable_mgpu_fan_boost(); 3408 if (r) 3409 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 3410 3411 /* For passthrough configuration on arcturus and aldebaran, enable special handling for SBR */ 3412 if (amdgpu_passthrough(adev) && 3413 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 3414 adev->asic_type == CHIP_ALDEBARAN)) 3415 amdgpu_dpm_handle_passthrough_sbr(adev, true); 3416 3417 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3418 mutex_lock(&mgpu_info.mutex); 3419 3420 /* 3421 * Reset device p-state to low as it was booted with high. 3422 * 3423 * This should be performed only after all devices from the same 3424 * hive get initialized. 3425 * 3426 * However, the number of devices in the hive is not known in 3427 * advance; it is counted one by one as the devices initialize. 3428 * 3429 * So, we wait for all XGMI interlinked devices to be initialized. 3430 * This may bring some delays as those devices may come from 3431 * different hives. But that should be OK.
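 *
 * Editor's example (illustrative): on a 4-GPU hive, the pstate request
 * below only runs once the fourth device reaches this point, i.e. when
 * mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes == 4.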
3432 */ 3433 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3434 for (i = 0; i < mgpu_info.num_gpu; i++) { 3435 gpu_instance = &(mgpu_info.gpu_ins[i]); 3436 if (gpu_instance->adev->flags & AMD_IS_APU) 3437 continue; 3438 3439 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3440 AMDGPU_XGMI_PSTATE_MIN); 3441 if (r) { 3442 DRM_ERROR("pstate setting failed (%d).\n", r); 3443 break; 3444 } 3445 } 3446 } 3447 3448 mutex_unlock(&mgpu_info.mutex); 3449 } 3450 3451 return 0; 3452 } 3453 3454 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block) 3455 { 3456 int r; 3457 3458 if (!ip_block->version->funcs->hw_fini) { 3459 DRM_ERROR("hw_fini of IP block <%s> not defined\n", 3460 ip_block->version->funcs->name); 3461 } else { 3462 r = ip_block->version->funcs->hw_fini(ip_block); 3463 /* XXX handle errors */ 3464 if (r) { 3465 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3466 ip_block->version->funcs->name, r); 3467 } 3468 } 3469 3470 ip_block->status.hw = false; 3471 } 3472 3473 /** 3474 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3475 * 3476 * @adev: amdgpu_device pointer 3477 * 3478 * For ASICs need to disable SMC first 3479 */ 3480 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3481 { 3482 int i; 3483 3484 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3485 return; 3486 3487 for (i = 0; i < adev->num_ip_blocks; i++) { 3488 if (!adev->ip_blocks[i].status.hw) 3489 continue; 3490 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3491 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3492 break; 3493 } 3494 } 3495 } 3496 3497 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3498 { 3499 int i, r; 3500 3501 for (i = 0; i < adev->num_ip_blocks; i++) { 3502 if (!adev->ip_blocks[i].version->funcs->early_fini) 3503 continue; 3504 3505 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]); 3506 if (r) { 3507 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 3508 adev->ip_blocks[i].version->funcs->name, r); 3509 } 3510 } 3511 3512 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3513 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3514 3515 amdgpu_amdkfd_suspend(adev, false); 3516 3517 /* Workaround for ASICs need to disable SMC first */ 3518 amdgpu_device_smu_fini_early(adev); 3519 3520 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3521 if (!adev->ip_blocks[i].status.hw) 3522 continue; 3523 3524 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3525 } 3526 3527 if (amdgpu_sriov_vf(adev)) { 3528 if (amdgpu_virt_release_full_gpu(adev, false)) 3529 DRM_ERROR("failed to release exclusive mode on fini\n"); 3530 } 3531 3532 return 0; 3533 } 3534 3535 /** 3536 * amdgpu_device_ip_fini - run fini for hardware IPs 3537 * 3538 * @adev: amdgpu_device pointer 3539 * 3540 * Main teardown pass for hardware IPs. The list of all the hardware 3541 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3542 * are run. hw_fini tears down the hardware associated with each IP 3543 * and sw_fini tears down any software state associated with each IP. 3544 * Returns 0 on success, negative error code on failure. 
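 *
 * Editor's note (illustrative): unlike the init paths, the loops below walk
 * adev->ip_blocks from the last registered block down to index 0, so IP
 * blocks are torn down in the reverse of their registration order.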
3545 */ 3546 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3547 { 3548 int i, r; 3549 3550 amdgpu_cper_fini(adev); 3551 3552 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3553 amdgpu_virt_release_ras_err_handler_data(adev); 3554 3555 if (adev->gmc.xgmi.num_physical_nodes > 1) 3556 amdgpu_xgmi_remove_device(adev); 3557 3558 amdgpu_amdkfd_device_fini_sw(adev); 3559 3560 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3561 if (!adev->ip_blocks[i].status.sw) 3562 continue; 3563 3564 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3565 amdgpu_ucode_free_bo(adev); 3566 amdgpu_free_static_csa(&adev->virt.csa_obj); 3567 amdgpu_device_wb_fini(adev); 3568 amdgpu_device_mem_scratch_fini(adev); 3569 amdgpu_ib_pool_fini(adev); 3570 amdgpu_seq64_fini(adev); 3571 amdgpu_doorbell_fini(adev); 3572 } 3573 if (adev->ip_blocks[i].version->funcs->sw_fini) { 3574 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]); 3575 /* XXX handle errors */ 3576 if (r) { 3577 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 3578 adev->ip_blocks[i].version->funcs->name, r); 3579 } 3580 } 3581 adev->ip_blocks[i].status.sw = false; 3582 adev->ip_blocks[i].status.valid = false; 3583 } 3584 3585 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3586 if (!adev->ip_blocks[i].status.late_initialized) 3587 continue; 3588 if (adev->ip_blocks[i].version->funcs->late_fini) 3589 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]); 3590 adev->ip_blocks[i].status.late_initialized = false; 3591 } 3592 3593 amdgpu_ras_fini(adev); 3594 3595 return 0; 3596 } 3597 3598 /** 3599 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3600 * 3601 * @work: work_struct. 3602 */ 3603 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3604 { 3605 struct amdgpu_device *adev = 3606 container_of(work, struct amdgpu_device, delayed_init_work.work); 3607 int r; 3608 3609 r = amdgpu_ib_ring_tests(adev); 3610 if (r) 3611 DRM_ERROR("ib ring test failed (%d).\n", r); 3612 } 3613 3614 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3615 { 3616 struct amdgpu_device *adev = 3617 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3618 3619 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3620 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3621 3622 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true, 0)) 3623 adev->gfx.gfx_off_state = true; 3624 } 3625 3626 /** 3627 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3628 * 3629 * @adev: amdgpu_device pointer 3630 * 3631 * Main suspend function for hardware IPs. The list of all the hardware 3632 * IPs that make up the asic is walked, clockgating is disabled and the 3633 * suspend callbacks are run. suspend puts the hardware and software state 3634 * in each IP into a state suitable for suspend. 3635 * Returns 0 on success, negative error code on failure. 3636 */ 3637 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3638 { 3639 int i, r; 3640 3641 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3642 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3643 3644 /* 3645 * Per PMFW team's suggestion, driver needs to handle gfxoff 3646 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3647 * scenario. Add the missing df cstate disablement here. 
3648 */ 3649 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3650 dev_warn(adev->dev, "Failed to disallow df cstate"); 3651 3652 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3653 if (!adev->ip_blocks[i].status.valid) 3654 continue; 3655 3656 /* displays are handled separately */ 3657 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3658 continue; 3659 3660 /* XXX handle errors */ 3661 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3662 if (r) 3663 return r; 3664 } 3665 3666 return 0; 3667 } 3668 3669 /** 3670 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3671 * 3672 * @adev: amdgpu_device pointer 3673 * 3674 * Main suspend function for hardware IPs. The list of all the hardware 3675 * IPs that make up the asic is walked, clockgating is disabled and the 3676 * suspend callbacks are run. suspend puts the hardware and software state 3677 * in each IP into a state suitable for suspend. 3678 * Returns 0 on success, negative error code on failure. 3679 */ 3680 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3681 { 3682 int i, r; 3683 3684 if (adev->in_s0ix) 3685 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3686 3687 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3688 if (!adev->ip_blocks[i].status.valid) 3689 continue; 3690 /* displays are handled in phase1 */ 3691 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3692 continue; 3693 /* PSP lost connection when err_event_athub occurs */ 3694 if (amdgpu_ras_intr_triggered() && 3695 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3696 adev->ip_blocks[i].status.hw = false; 3697 continue; 3698 } 3699 3700 /* skip unnecessary suspend if we do not initialize them yet */ 3701 if (!amdgpu_ip_member_of_hwini( 3702 adev, adev->ip_blocks[i].version->type)) 3703 continue; 3704 3705 /* Since we skip suspend for S0i3, we need to cancel the delayed 3706 * idle work here as the suspend callback never gets called. 3707 */ 3708 if (adev->in_s0ix && 3709 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX && 3710 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 0, 0)) 3711 cancel_delayed_work_sync(&adev->gfx.idle_work); 3712 /* skip suspend of gfx/mes and psp for S0ix 3713 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3714 * like at runtime. PSP is also part of the always on hardware 3715 * so no need to suspend it. 3716 */ 3717 if (adev->in_s0ix && 3718 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3719 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3720 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3721 continue; 3722 3723 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3724 if (adev->in_s0ix && 3725 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3726 IP_VERSION(5, 0, 0)) && 3727 (adev->ip_blocks[i].version->type == 3728 AMD_IP_BLOCK_TYPE_SDMA)) 3729 continue; 3730 3731 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3732 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3733 * from this location and RLC Autoload automatically also gets loaded 3734 * from here based on PMFW -> PSP message during re-init sequence. 3735 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3736 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3737 */ 3738 if (amdgpu_in_reset(adev) && 3739 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3740 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3741 continue; 3742 3743 /* XXX handle errors */ 3744 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3745 adev->ip_blocks[i].status.hw = false; 3746 3747 /* handle putting the SMC in the appropriate state */ 3748 if (!amdgpu_sriov_vf(adev)) { 3749 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3750 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3751 if (r) { 3752 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3753 adev->mp1_state, r); 3754 return r; 3755 } 3756 } 3757 } 3758 } 3759 3760 return 0; 3761 } 3762 3763 /** 3764 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3765 * 3766 * @adev: amdgpu_device pointer 3767 * 3768 * Main suspend function for hardware IPs. The list of all the hardware 3769 * IPs that make up the asic is walked, clockgating is disabled and the 3770 * suspend callbacks are run. suspend puts the hardware and software state 3771 * in each IP into a state suitable for suspend. 3772 * Returns 0 on success, negative error code on failure. 3773 */ 3774 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3775 { 3776 int r; 3777 3778 if (amdgpu_sriov_vf(adev)) { 3779 amdgpu_virt_fini_data_exchange(adev); 3780 amdgpu_virt_request_full_gpu(adev, false); 3781 } 3782 3783 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3784 3785 r = amdgpu_device_ip_suspend_phase1(adev); 3786 if (r) 3787 return r; 3788 r = amdgpu_device_ip_suspend_phase2(adev); 3789 3790 if (amdgpu_sriov_vf(adev)) 3791 amdgpu_virt_release_full_gpu(adev, false); 3792 3793 return r; 3794 } 3795 3796 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3797 { 3798 int i, r; 3799 3800 static enum amd_ip_block_type ip_order[] = { 3801 AMD_IP_BLOCK_TYPE_COMMON, 3802 AMD_IP_BLOCK_TYPE_GMC, 3803 AMD_IP_BLOCK_TYPE_PSP, 3804 AMD_IP_BLOCK_TYPE_IH, 3805 }; 3806 3807 for (i = 0; i < adev->num_ip_blocks; i++) { 3808 int j; 3809 struct amdgpu_ip_block *block; 3810 3811 block = &adev->ip_blocks[i]; 3812 block->status.hw = false; 3813 3814 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3815 3816 if (block->version->type != ip_order[j] || 3817 !block->status.valid) 3818 continue; 3819 3820 r = block->version->funcs->hw_init(&adev->ip_blocks[i]); 3821 if (r) { 3822 dev_err(adev->dev, "RE-INIT-early: %s failed\n", 3823 block->version->funcs->name); 3824 return r; 3825 } 3826 block->status.hw = true; 3827 } 3828 } 3829 3830 return 0; 3831 } 3832 3833 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3834 { 3835 struct amdgpu_ip_block *block; 3836 int i, r = 0; 3837 3838 static enum amd_ip_block_type ip_order[] = { 3839 AMD_IP_BLOCK_TYPE_SMC, 3840 AMD_IP_BLOCK_TYPE_DCE, 3841 AMD_IP_BLOCK_TYPE_GFX, 3842 AMD_IP_BLOCK_TYPE_SDMA, 3843 AMD_IP_BLOCK_TYPE_MES, 3844 AMD_IP_BLOCK_TYPE_UVD, 3845 AMD_IP_BLOCK_TYPE_VCE, 3846 AMD_IP_BLOCK_TYPE_VCN, 3847 AMD_IP_BLOCK_TYPE_JPEG 3848 }; 3849 3850 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3851 block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]); 3852 3853 if (!block) 3854 continue; 3855 3856 if (block->status.valid && !block->status.hw) { 3857 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) { 3858 r = amdgpu_ip_block_resume(block); 3859 } else { 3860 r = block->version->funcs->hw_init(block); 3861 } 3862 3863 if (r) { 3864 dev_err(adev->dev, "RE-INIT-late: %s failed\n", 3865 block->version->funcs->name); 3866 break; 3867 } 3868 
block->status.hw = true; 3869 } 3870 } 3871 3872 return r; 3873 } 3874 3875 /** 3876 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3877 * 3878 * @adev: amdgpu_device pointer 3879 * 3880 * First resume function for hardware IPs. The list of all the hardware 3881 * IPs that make up the asic is walked and the resume callbacks are run for 3882 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3883 * after a suspend and updates the software state as necessary. This 3884 * function is also used for restoring the GPU after a GPU reset. 3885 * Returns 0 on success, negative error code on failure. 3886 */ 3887 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3888 { 3889 int i, r; 3890 3891 for (i = 0; i < adev->num_ip_blocks; i++) { 3892 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3893 continue; 3894 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3895 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3896 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3897 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3898 3899 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3900 if (r) 3901 return r; 3902 } 3903 } 3904 3905 return 0; 3906 } 3907 3908 /** 3909 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3910 * 3911 * @adev: amdgpu_device pointer 3912 * 3913 * Second resume function for hardware IPs. The list of all the hardware 3914 * IPs that make up the asic is walked and the resume callbacks are run for 3915 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3916 * functional state after a suspend and updates the software state as 3917 * necessary. This function is also used for restoring the GPU after a GPU 3918 * reset. 3919 * Returns 0 on success, negative error code on failure. 3920 */ 3921 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3922 { 3923 int i, r; 3924 3925 for (i = 0; i < adev->num_ip_blocks; i++) { 3926 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3927 continue; 3928 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3929 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3930 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3931 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE || 3932 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3933 continue; 3934 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3935 if (r) 3936 return r; 3937 } 3938 3939 return 0; 3940 } 3941 3942 /** 3943 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs 3944 * 3945 * @adev: amdgpu_device pointer 3946 * 3947 * Third resume function for hardware IPs. The list of all the hardware 3948 * IPs that make up the asic is walked and the resume callbacks are run for 3949 * all DCE. resume puts the hardware into a functional state after a suspend 3950 * and updates the software state as necessary. This function is also used 3951 * for restoring the GPU after a GPU reset. 3952 * 3953 * Returns 0 on success, negative error code on failure. 
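 *
 * Illustrative sketch of where phase3 sits in the overall resume flow
 * (simplified from amdgpu_device_ip_resume() below; error handling and
 * the TTM buffer funcs update are omitted):
 *
 *   amdgpu_device_ip_resume_phase1(adev);   - COMMON, GMC, IH (and PSP on SR-IOV)
 *   amdgpu_device_fw_loading(adev);
 *   amdgpu_device_ip_resume_phase2(adev);   - all remaining blocks except DCE and PSP
 *   amdgpu_fence_driver_hw_init(adev);
 *   amdgpu_device_ip_resume_phase3(adev);   - DCE (displays) last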
3954 */ 3955 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev) 3956 { 3957 int i, r; 3958 3959 for (i = 0; i < adev->num_ip_blocks; i++) { 3960 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3961 continue; 3962 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) { 3963 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3964 if (r) 3965 return r; 3966 } 3967 } 3968 3969 return 0; 3970 } 3971 3972 /** 3973 * amdgpu_device_ip_resume - run resume for hardware IPs 3974 * 3975 * @adev: amdgpu_device pointer 3976 * 3977 * Main resume function for hardware IPs. The hardware IPs 3978 * are split into multiple resume functions because they are 3979 * also used in recovering from a GPU reset and some additional 3980 * steps need to be taken between them. In this case (S3/S4) they are 3981 * run sequentially. 3982 * Returns 0 on success, negative error code on failure. 3983 */ 3984 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3985 { 3986 int r; 3987 3988 r = amdgpu_device_ip_resume_phase1(adev); 3989 if (r) 3990 return r; 3991 3992 r = amdgpu_device_fw_loading(adev); 3993 if (r) 3994 return r; 3995 3996 r = amdgpu_device_ip_resume_phase2(adev); 3997 3998 if (adev->mman.buffer_funcs_ring->sched.ready) 3999 amdgpu_ttm_set_buffer_funcs_status(adev, true); 4000 4001 if (r) 4002 return r; 4003 4004 amdgpu_fence_driver_hw_init(adev); 4005 4006 r = amdgpu_device_ip_resume_phase3(adev); 4007 4008 return r; 4009 } 4010 4011 /** 4012 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 4013 * 4014 * @adev: amdgpu_device pointer 4015 * 4016 * Query the VBIOS data tables to determine if the board supports SR-IOV. 4017 */ 4018 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 4019 { 4020 if (amdgpu_sriov_vf(adev)) { 4021 if (adev->is_atom_fw) { 4022 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 4023 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 4024 } else { 4025 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 4026 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 4027 } 4028 4029 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 4030 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 4031 } 4032 } 4033 4034 /** 4035 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 4036 * 4037 * @asic_type: AMD asic type 4038 * 4039 * Check if there is DC (new modesetting infrastructure) support for an asic. 4040 * Returns true if DC has support, false if not. 4041 */ 4042 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 4043 { 4044 switch (asic_type) { 4045 #ifdef CONFIG_DRM_AMDGPU_SI 4046 case CHIP_HAINAN: 4047 #endif 4048 case CHIP_TOPAZ: 4049 /* chips with no display hardware */ 4050 return false; 4051 #if defined(CONFIG_DRM_AMD_DC) 4052 case CHIP_TAHITI: 4053 case CHIP_PITCAIRN: 4054 case CHIP_VERDE: 4055 case CHIP_OLAND: 4056 /* 4057 * We have systems in the wild with these ASICs that require 4058 * LVDS and VGA support which is not supported with DC. 4059 * 4060 * Fallback to the non-DC driver here by default so as not to 4061 * cause regressions. 4062 */ 4063 #if defined(CONFIG_DRM_AMD_DC_SI) 4064 return amdgpu_dc > 0; 4065 #else 4066 return false; 4067 #endif 4068 case CHIP_BONAIRE: 4069 case CHIP_KAVERI: 4070 case CHIP_KABINI: 4071 case CHIP_MULLINS: 4072 /* 4073 * We have systems in the wild with these ASICs that require 4074 * VGA support which is not supported with DC.
4075 * 4076 * Fallback to the non-DC driver here by default so as not to 4077 * cause regressions. 4078 */ 4079 return amdgpu_dc > 0; 4080 default: 4081 return amdgpu_dc != 0; 4082 #else 4083 default: 4084 if (amdgpu_dc > 0) 4085 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 4086 return false; 4087 #endif 4088 } 4089 } 4090 4091 /** 4092 * amdgpu_device_has_dc_support - check if dc is supported 4093 * 4094 * @adev: amdgpu_device pointer 4095 * 4096 * Returns true for supported, false for not supported 4097 */ 4098 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 4099 { 4100 if (adev->enable_virtual_display || 4101 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 4102 return false; 4103 4104 return amdgpu_device_asic_has_dc_support(adev->asic_type); 4105 } 4106 4107 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 4108 { 4109 struct amdgpu_device *adev = 4110 container_of(__work, struct amdgpu_device, xgmi_reset_work); 4111 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 4112 4113 /* It's a bug to not have a hive within this function */ 4114 if (WARN_ON(!hive)) 4115 return; 4116 4117 /* 4118 * Use task barrier to synchronize all xgmi reset works across the 4119 * hive. task_barrier_enter and task_barrier_exit will block 4120 * until all the threads running the xgmi reset works reach 4121 * those points. task_barrier_full will do both blocks. 4122 */ 4123 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 4124 4125 task_barrier_enter(&hive->tb); 4126 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 4127 4128 if (adev->asic_reset_res) 4129 goto fail; 4130 4131 task_barrier_exit(&hive->tb); 4132 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 4133 4134 if (adev->asic_reset_res) 4135 goto fail; 4136 4137 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 4138 } else { 4139 4140 task_barrier_full(&hive->tb); 4141 adev->asic_reset_res = amdgpu_asic_reset(adev); 4142 } 4143 4144 fail: 4145 if (adev->asic_reset_res) 4146 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 4147 adev->asic_reset_res, adev_to_drm(adev)->unique); 4148 amdgpu_put_xgmi_hive(hive); 4149 } 4150 4151 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 4152 { 4153 char *input = amdgpu_lockup_timeout; 4154 char *timeout_setting = NULL; 4155 int index = 0; 4156 long timeout; 4157 int ret = 0; 4158 4159 /* 4160 * By default timeout for non compute jobs is 10000 4161 * and 60000 for compute jobs. 4162 * In SR-IOV or passthrough mode, timeout for compute 4163 * jobs are 60000 by default. 4164 */ 4165 adev->gfx_timeout = msecs_to_jiffies(10000); 4166 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4167 if (amdgpu_sriov_vf(adev)) 4168 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
4169 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 4170 else 4171 adev->compute_timeout = msecs_to_jiffies(60000); 4172 4173 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4174 while ((timeout_setting = strsep(&input, ",")) && 4175 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4176 ret = kstrtol(timeout_setting, 0, &timeout); 4177 if (ret) 4178 return ret; 4179 4180 if (timeout == 0) { 4181 index++; 4182 continue; 4183 } else if (timeout < 0) { 4184 timeout = MAX_SCHEDULE_TIMEOUT; 4185 dev_warn(adev->dev, "lockup timeout disabled"); 4186 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 4187 } else { 4188 timeout = msecs_to_jiffies(timeout); 4189 } 4190 4191 switch (index++) { 4192 case 0: 4193 adev->gfx_timeout = timeout; 4194 break; 4195 case 1: 4196 adev->compute_timeout = timeout; 4197 break; 4198 case 2: 4199 adev->sdma_timeout = timeout; 4200 break; 4201 case 3: 4202 adev->video_timeout = timeout; 4203 break; 4204 default: 4205 break; 4206 } 4207 } 4208 /* 4209 * There is only one value specified and 4210 * it should apply to all non-compute jobs. 4211 */ 4212 if (index == 1) { 4213 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4214 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 4215 adev->compute_timeout = adev->gfx_timeout; 4216 } 4217 } 4218 4219 return ret; 4220 } 4221 4222 /** 4223 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 4224 * 4225 * @adev: amdgpu_device pointer 4226 * 4227 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 4228 */ 4229 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 4230 { 4231 struct iommu_domain *domain; 4232 4233 domain = iommu_get_domain_for_dev(adev->dev); 4234 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 4235 adev->ram_is_direct_mapped = true; 4236 } 4237 4238 #if defined(CONFIG_HSA_AMD_P2P) 4239 /** 4240 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled. 4241 * 4242 * @adev: amdgpu_device pointer 4243 * 4244 * return if IOMMU remapping bar address 4245 */ 4246 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev) 4247 { 4248 struct iommu_domain *domain; 4249 4250 domain = iommu_get_domain_for_dev(adev->dev); 4251 if (domain && (domain->type == IOMMU_DOMAIN_DMA || 4252 domain->type == IOMMU_DOMAIN_DMA_FQ)) 4253 return true; 4254 4255 return false; 4256 } 4257 #endif 4258 4259 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 4260 { 4261 if (amdgpu_mcbp == 1) 4262 adev->gfx.mcbp = true; 4263 else if (amdgpu_mcbp == 0) 4264 adev->gfx.mcbp = false; 4265 4266 if (amdgpu_sriov_vf(adev)) 4267 adev->gfx.mcbp = true; 4268 4269 if (adev->gfx.mcbp) 4270 DRM_INFO("MCBP is enabled\n"); 4271 } 4272 4273 /** 4274 * amdgpu_device_init - initialize the driver 4275 * 4276 * @adev: amdgpu_device pointer 4277 * @flags: driver flags 4278 * 4279 * Initializes the driver info and hw (all asics). 4280 * Returns 0 for success or an error on failure. 4281 * Called at driver startup. 
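 *
 * Rough, non-exhaustive summary of the initialization order implemented
 * below:
 *   1. init locks, work items and default register accessors
 *   2. map MMIO registers and create the reset domain
 *   3. early IP init, optional ASIC reset on init, vBIOS post and clocks
 *   4. fence driver sw init, mode config and amdgpu_device_ip_init()
 *   5. late init, RAS resume and sysfs/interface registration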
4282 */ 4283 int amdgpu_device_init(struct amdgpu_device *adev, 4284 uint32_t flags) 4285 { 4286 struct drm_device *ddev = adev_to_drm(adev); 4287 struct pci_dev *pdev = adev->pdev; 4288 int r, i; 4289 bool px = false; 4290 u32 max_MBps; 4291 int tmp; 4292 4293 adev->shutdown = false; 4294 adev->flags = flags; 4295 4296 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 4297 adev->asic_type = amdgpu_force_asic_type; 4298 else 4299 adev->asic_type = flags & AMD_ASIC_MASK; 4300 4301 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 4302 if (amdgpu_emu_mode == 1) 4303 adev->usec_timeout *= 10; 4304 adev->gmc.gart_size = 512 * 1024 * 1024; 4305 adev->accel_working = false; 4306 adev->num_rings = 0; 4307 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 4308 adev->mman.buffer_funcs = NULL; 4309 adev->mman.buffer_funcs_ring = NULL; 4310 adev->vm_manager.vm_pte_funcs = NULL; 4311 adev->vm_manager.vm_pte_num_scheds = 0; 4312 adev->gmc.gmc_funcs = NULL; 4313 adev->harvest_ip_mask = 0x0; 4314 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 4315 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 4316 4317 adev->smc_rreg = &amdgpu_invalid_rreg; 4318 adev->smc_wreg = &amdgpu_invalid_wreg; 4319 adev->pcie_rreg = &amdgpu_invalid_rreg; 4320 adev->pcie_wreg = &amdgpu_invalid_wreg; 4321 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 4322 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 4323 adev->pciep_rreg = &amdgpu_invalid_rreg; 4324 adev->pciep_wreg = &amdgpu_invalid_wreg; 4325 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 4326 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 4327 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 4328 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 4329 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 4330 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 4331 adev->didt_rreg = &amdgpu_invalid_rreg; 4332 adev->didt_wreg = &amdgpu_invalid_wreg; 4333 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 4334 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 4335 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 4336 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 4337 4338 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 4339 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 4340 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 4341 4342 /* mutex initialization are all done here so we 4343 * can recall function without having locking issues 4344 */ 4345 mutex_init(&adev->firmware.mutex); 4346 mutex_init(&adev->pm.mutex); 4347 mutex_init(&adev->gfx.gpu_clock_mutex); 4348 mutex_init(&adev->srbm_mutex); 4349 mutex_init(&adev->gfx.pipe_reserve_mutex); 4350 mutex_init(&adev->gfx.gfx_off_mutex); 4351 mutex_init(&adev->gfx.partition_mutex); 4352 mutex_init(&adev->grbm_idx_mutex); 4353 mutex_init(&adev->mn_lock); 4354 mutex_init(&adev->virt.vf_errors.lock); 4355 hash_init(adev->mn_hash); 4356 mutex_init(&adev->psp.mutex); 4357 mutex_init(&adev->notifier_lock); 4358 mutex_init(&adev->pm.stable_pstate_ctx_lock); 4359 mutex_init(&adev->benchmark_mutex); 4360 mutex_init(&adev->gfx.reset_sem_mutex); 4361 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */ 4362 mutex_init(&adev->enforce_isolation_mutex); 4363 for (i = 0; i < MAX_XCP; ++i) { 4364 adev->isolation[i].spearhead = dma_fence_get_stub(); 4365 amdgpu_sync_create(&adev->isolation[i].active); 4366 amdgpu_sync_create(&adev->isolation[i].prev); 4367 } 4368 mutex_init(&adev->gfx.kfd_sch_mutex); 
4369 mutex_init(&adev->gfx.workload_profile_mutex); 4370 mutex_init(&adev->vcn.workload_profile_mutex); 4371 mutex_init(&adev->userq_mutex); 4372 4373 amdgpu_device_init_apu_flags(adev); 4374 4375 r = amdgpu_device_check_arguments(adev); 4376 if (r) 4377 return r; 4378 4379 spin_lock_init(&adev->mmio_idx_lock); 4380 spin_lock_init(&adev->smc_idx_lock); 4381 spin_lock_init(&adev->pcie_idx_lock); 4382 spin_lock_init(&adev->uvd_ctx_idx_lock); 4383 spin_lock_init(&adev->didt_idx_lock); 4384 spin_lock_init(&adev->gc_cac_idx_lock); 4385 spin_lock_init(&adev->se_cac_idx_lock); 4386 spin_lock_init(&adev->audio_endpt_idx_lock); 4387 spin_lock_init(&adev->mm_stats.lock); 4388 spin_lock_init(&adev->virt.rlcg_reg_lock); 4389 spin_lock_init(&adev->wb.lock); 4390 4391 xa_init_flags(&adev->userq_xa, XA_FLAGS_LOCK_IRQ); 4392 4393 INIT_LIST_HEAD(&adev->reset_list); 4394 4395 INIT_LIST_HEAD(&adev->ras_list); 4396 4397 INIT_LIST_HEAD(&adev->pm.od_kobj_list); 4398 4399 INIT_LIST_HEAD(&adev->userq_mgr_list); 4400 4401 INIT_DELAYED_WORK(&adev->delayed_init_work, 4402 amdgpu_device_delayed_init_work_handler); 4403 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 4404 amdgpu_device_delay_enable_gfx_off); 4405 /* 4406 * Initialize the enforce_isolation work structures for each XCP 4407 * partition. This work handler is responsible for enforcing shader 4408 * isolation on AMD GPUs. It counts the number of emitted fences for 4409 * each GFX and compute ring. If there are any fences, it schedules 4410 * the `enforce_isolation_work` to be run after a delay. If there are 4411 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the 4412 * runqueue. 4413 */ 4414 for (i = 0; i < MAX_XCP; i++) { 4415 INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work, 4416 amdgpu_gfx_enforce_isolation_handler); 4417 adev->gfx.enforce_isolation[i].adev = adev; 4418 adev->gfx.enforce_isolation[i].xcp_id = i; 4419 } 4420 4421 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 4422 4423 adev->gfx.gfx_off_req_count = 1; 4424 adev->gfx.gfx_off_residency = 0; 4425 adev->gfx.gfx_off_entrycount = 0; 4426 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 4427 4428 atomic_set(&adev->throttling_logging_enabled, 1); 4429 /* 4430 * If throttling continues, logging will be performed every minute 4431 * to avoid log flooding. "-1" is subtracted since the thermal 4432 * throttling interrupt comes every second. Thus, the total logging 4433 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting 4434 * for throttling interrupt) = 60 seconds.
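 *
 * Equivalently, the call below configures a ratelimit window of
 * (60 - 1) * HZ jiffies with a burst of 1, i.e. at most one throttling
 * message per roughly 60 seconds.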
4435 */ 4436 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 4437 4438 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 4439 4440 /* Registers mapping */ 4441 /* TODO: block userspace mapping of io register */ 4442 if (adev->asic_type >= CHIP_BONAIRE) { 4443 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 4444 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 4445 } else { 4446 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 4447 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 4448 } 4449 4450 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 4451 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 4452 4453 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 4454 if (!adev->rmmio) 4455 return -ENOMEM; 4456 4457 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 4458 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size); 4459 4460 /* 4461 * Reset domain needs to be present early, before the XGMI hive is 4462 * discovered (if any) and initialized, so the reset sem and in_gpu reset 4463 * flag can be used early on during init and before calling RREG32. 4464 */ 4465 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 4466 if (!adev->reset_domain) 4467 return -ENOMEM; 4468 4469 /* detect hw virtualization here */ 4470 amdgpu_virt_init(adev); 4471 4472 amdgpu_device_get_pcie_info(adev); 4473 4474 r = amdgpu_device_get_job_timeout_settings(adev); 4475 if (r) { 4476 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 4477 return r; 4478 } 4479 4480 amdgpu_device_set_mcbp(adev); 4481 4482 /* 4483 * By default, use default mode where all blocks are expected to be 4484 * initialized. At present a 'swinit' of blocks is required to be 4485 * completed before the need for a different level is detected. 4486 */ 4487 amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT); 4488 /* early init functions */ 4489 r = amdgpu_device_ip_early_init(adev); 4490 if (r) 4491 return r; 4492 4493 /* 4494 * No need to remove conflicting FBs for non-display class devices. 4495 * This prevents the sysfb from being freed accidentally. 4496 */ 4497 if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA || 4498 (pdev->class >> 8) == PCI_CLASS_DISPLAY_OTHER) { 4499 /* Get rid of things like offb */ 4500 r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name); 4501 if (r) 4502 return r; 4503 } 4504 4505 /* Enable TMZ based on IP_VERSION */ 4506 amdgpu_gmc_tmz_set(adev); 4507 4508 if (amdgpu_sriov_vf(adev) && 4509 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0)) 4510 /* VF MMIO access (except mailbox range) from CPU 4511 * will be blocked during sriov runtime 4512 */ 4513 adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT; 4514 4515 amdgpu_gmc_noretry_set(adev); 4516 /* Need to get xgmi info early to decide the reset behavior */ 4517 if (adev->gmc.xgmi.supported) { 4518 r = adev->gfxhub.funcs->get_xgmi_info(adev); 4519 if (r) 4520 return r; 4521 } 4522 4523 /* enable PCIE atomic ops */ 4524 if (amdgpu_sriov_vf(adev)) { 4525 if (adev->virt.fw_reserve.p_pf2vf) 4526 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 4527 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 4528 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4529 /* APUs w/ gfx9 onwards don't rely on PCIe atomics; rather, an 4530 * internal path natively supports atomics, so set have_atomics_support to true.
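 *
 * For the SR-IOV and bare-metal paths the requirement is the same: both
 * 32-bit and 64-bit atomic completer support (PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
 * PCI_EXP_DEVCAP2_ATOMIC_COMP64), checked via the pf2vf flags above or via
 * pci_enable_atomic_ops_to_root() below.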
4531 */ 4532 } else if ((adev->flags & AMD_IS_APU) && 4533 (amdgpu_ip_version(adev, GC_HWIP, 0) > 4534 IP_VERSION(9, 0, 0))) { 4535 adev->have_atomics_support = true; 4536 } else { 4537 adev->have_atomics_support = 4538 !pci_enable_atomic_ops_to_root(adev->pdev, 4539 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 4540 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4541 } 4542 4543 if (!adev->have_atomics_support) 4544 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 4545 4546 /* doorbell bar mapping and doorbell index init*/ 4547 amdgpu_doorbell_init(adev); 4548 4549 if (amdgpu_emu_mode == 1) { 4550 /* post the asic on emulation mode */ 4551 emu_soc_asic_init(adev); 4552 goto fence_driver_init; 4553 } 4554 4555 amdgpu_reset_init(adev); 4556 4557 /* detect if we are with an SRIOV vbios */ 4558 if (adev->bios) 4559 amdgpu_device_detect_sriov_bios(adev); 4560 4561 /* check if we need to reset the asic 4562 * E.g., driver was not cleanly unloaded previously, etc. 4563 */ 4564 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4565 if (adev->gmc.xgmi.num_physical_nodes) { 4566 dev_info(adev->dev, "Pending hive reset.\n"); 4567 amdgpu_set_init_level(adev, 4568 AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 4569 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && 4570 !amdgpu_device_has_display_hardware(adev)) { 4571 r = psp_gpu_reset(adev); 4572 } else { 4573 tmp = amdgpu_reset_method; 4574 /* It should do a default reset when loading or reloading the driver, 4575 * regardless of the module parameter reset_method. 4576 */ 4577 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4578 r = amdgpu_asic_reset(adev); 4579 amdgpu_reset_method = tmp; 4580 } 4581 4582 if (r) { 4583 dev_err(adev->dev, "asic reset on init failed\n"); 4584 goto failed; 4585 } 4586 } 4587 4588 /* Post card if necessary */ 4589 if (amdgpu_device_need_post(adev)) { 4590 if (!adev->bios) { 4591 dev_err(adev->dev, "no vBIOS found\n"); 4592 r = -EINVAL; 4593 goto failed; 4594 } 4595 DRM_INFO("GPU posting now...\n"); 4596 r = amdgpu_device_asic_init(adev); 4597 if (r) { 4598 dev_err(adev->dev, "gpu post error!\n"); 4599 goto failed; 4600 } 4601 } 4602 4603 if (adev->bios) { 4604 if (adev->is_atom_fw) { 4605 /* Initialize clocks */ 4606 r = amdgpu_atomfirmware_get_clock_info(adev); 4607 if (r) { 4608 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4609 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4610 goto failed; 4611 } 4612 } else { 4613 /* Initialize clocks */ 4614 r = amdgpu_atombios_get_clock_info(adev); 4615 if (r) { 4616 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4617 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4618 goto failed; 4619 } 4620 /* init i2c buses */ 4621 amdgpu_i2c_init(adev); 4622 } 4623 } 4624 4625 fence_driver_init: 4626 /* Fence driver */ 4627 r = amdgpu_fence_driver_sw_init(adev); 4628 if (r) { 4629 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4630 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4631 goto failed; 4632 } 4633 4634 /* init the mode config */ 4635 drm_mode_config_init(adev_to_drm(adev)); 4636 4637 r = amdgpu_device_ip_init(adev); 4638 if (r) { 4639 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4640 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4641 goto release_ras_con; 4642 } 4643 4644 amdgpu_fence_driver_hw_init(adev); 4645 4646 dev_info(adev->dev, 4647 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4648 
adev->gfx.config.max_shader_engines, 4649 adev->gfx.config.max_sh_per_se, 4650 adev->gfx.config.max_cu_per_sh, 4651 adev->gfx.cu_info.number); 4652 4653 adev->accel_working = true; 4654 4655 amdgpu_vm_check_compute_bug(adev); 4656 4657 /* Initialize the buffer migration limit. */ 4658 if (amdgpu_moverate >= 0) 4659 max_MBps = amdgpu_moverate; 4660 else 4661 max_MBps = 8; /* Allow 8 MB/s. */ 4662 /* Get a log2 for easy divisions. */ 4663 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4664 4665 /* 4666 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4667 * Otherwise the mgpu fan boost feature will be skipped due to the 4668 * gpu instance is counted less. 4669 */ 4670 amdgpu_register_gpu_instance(adev); 4671 4672 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4673 * explicit gating rather than handling it automatically. 4674 */ 4675 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 4676 r = amdgpu_device_ip_late_init(adev); 4677 if (r) { 4678 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4679 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4680 goto release_ras_con; 4681 } 4682 /* must succeed. */ 4683 amdgpu_ras_resume(adev); 4684 queue_delayed_work(system_wq, &adev->delayed_init_work, 4685 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4686 } 4687 4688 if (amdgpu_sriov_vf(adev)) { 4689 amdgpu_virt_release_full_gpu(adev, true); 4690 flush_delayed_work(&adev->delayed_init_work); 4691 } 4692 4693 /* 4694 * Place those sysfs registering after `late_init`. As some of those 4695 * operations performed in `late_init` might affect the sysfs 4696 * interfaces creating. 4697 */ 4698 r = amdgpu_atombios_sysfs_init(adev); 4699 if (r) 4700 drm_err(&adev->ddev, 4701 "registering atombios sysfs failed (%d).\n", r); 4702 4703 r = amdgpu_pm_sysfs_init(adev); 4704 if (r) 4705 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 4706 4707 r = amdgpu_ucode_sysfs_init(adev); 4708 if (r) { 4709 adev->ucode_sysfs_en = false; 4710 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 4711 } else 4712 adev->ucode_sysfs_en = true; 4713 4714 r = amdgpu_device_attr_sysfs_init(adev); 4715 if (r) 4716 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4717 4718 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4719 if (r) 4720 dev_err(adev->dev, 4721 "Could not create amdgpu board attributes\n"); 4722 4723 amdgpu_fru_sysfs_init(adev); 4724 amdgpu_reg_state_sysfs_init(adev); 4725 amdgpu_xcp_cfg_sysfs_init(adev); 4726 4727 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4728 r = amdgpu_pmu_init(adev); 4729 if (r) 4730 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4731 4732 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4733 if (amdgpu_device_cache_pci_state(adev->pdev)) 4734 pci_restore_state(pdev); 4735 4736 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4737 /* this will fail for cards that aren't VGA class devices, just 4738 * ignore it 4739 */ 4740 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4741 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4742 4743 px = amdgpu_device_supports_px(ddev); 4744 4745 if (px || (!dev_is_removable(&adev->pdev->dev) && 4746 apple_gmux_detect(NULL, NULL))) 4747 vga_switcheroo_register_client(adev->pdev, 4748 &amdgpu_switcheroo_ops, px); 4749 4750 if (px) 4751 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4752 4753 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 4754 
amdgpu_xgmi_reset_on_init(adev); 4755 4756 amdgpu_device_check_iommu_direct_map(adev); 4757 4758 adev->pm_nb.notifier_call = amdgpu_device_pm_notifier; 4759 r = register_pm_notifier(&adev->pm_nb); 4760 if (r) 4761 goto failed; 4762 4763 return 0; 4764 4765 release_ras_con: 4766 if (amdgpu_sriov_vf(adev)) 4767 amdgpu_virt_release_full_gpu(adev, true); 4768 4769 /* failed in exclusive mode due to timeout */ 4770 if (amdgpu_sriov_vf(adev) && 4771 !amdgpu_sriov_runtime(adev) && 4772 amdgpu_virt_mmio_blocked(adev) && 4773 !amdgpu_virt_wait_reset(adev)) { 4774 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4775 /* Don't send request since VF is inactive. */ 4776 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4777 adev->virt.ops = NULL; 4778 r = -EAGAIN; 4779 } 4780 amdgpu_release_ras_context(adev); 4781 4782 failed: 4783 amdgpu_vf_error_trans_all(adev); 4784 4785 return r; 4786 } 4787 4788 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4789 { 4790 4791 /* Clear all CPU mappings pointing to this device */ 4792 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4793 4794 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4795 amdgpu_doorbell_fini(adev); 4796 4797 iounmap(adev->rmmio); 4798 adev->rmmio = NULL; 4799 if (adev->mman.aper_base_kaddr) 4800 iounmap(adev->mman.aper_base_kaddr); 4801 adev->mman.aper_base_kaddr = NULL; 4802 4803 /* Memory manager related */ 4804 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4805 arch_phys_wc_del(adev->gmc.vram_mtrr); 4806 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4807 } 4808 } 4809 4810 /** 4811 * amdgpu_device_fini_hw - tear down the driver 4812 * 4813 * @adev: amdgpu_device pointer 4814 * 4815 * Tear down the driver info (all asics). 4816 * Called at driver shutdown. 
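 *
 * Note: this tears down the hardware-facing state (interrupts, displays,
 * fence hardware, sysfs interfaces); the remaining software structures
 * are released later in amdgpu_device_fini_sw().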
4817 */ 4818 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4819 { 4820 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4821 flush_delayed_work(&adev->delayed_init_work); 4822 4823 if (adev->mman.initialized) 4824 drain_workqueue(adev->mman.bdev.wq); 4825 adev->shutdown = true; 4826 4827 unregister_pm_notifier(&adev->pm_nb); 4828 4829 /* make sure IB test finished before entering exclusive mode 4830 * to avoid preemption on IB test 4831 */ 4832 if (amdgpu_sriov_vf(adev)) { 4833 amdgpu_virt_request_full_gpu(adev, false); 4834 amdgpu_virt_fini_data_exchange(adev); 4835 } 4836 4837 /* disable all interrupts */ 4838 amdgpu_irq_disable_all(adev); 4839 if (adev->mode_info.mode_config_initialized) { 4840 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4841 drm_helper_force_disable_all(adev_to_drm(adev)); 4842 else 4843 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4844 } 4845 amdgpu_fence_driver_hw_fini(adev); 4846 4847 if (adev->pm.sysfs_initialized) 4848 amdgpu_pm_sysfs_fini(adev); 4849 if (adev->ucode_sysfs_en) 4850 amdgpu_ucode_sysfs_fini(adev); 4851 amdgpu_device_attr_sysfs_fini(adev); 4852 amdgpu_fru_sysfs_fini(adev); 4853 4854 amdgpu_reg_state_sysfs_fini(adev); 4855 amdgpu_xcp_cfg_sysfs_fini(adev); 4856 4857 /* disable ras feature must before hw fini */ 4858 amdgpu_ras_pre_fini(adev); 4859 4860 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4861 4862 amdgpu_device_ip_fini_early(adev); 4863 4864 amdgpu_irq_fini_hw(adev); 4865 4866 if (adev->mman.initialized) 4867 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4868 4869 amdgpu_gart_dummy_page_fini(adev); 4870 4871 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4872 amdgpu_device_unmap_mmio(adev); 4873 4874 } 4875 4876 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4877 { 4878 int i, idx; 4879 bool px; 4880 4881 amdgpu_device_ip_fini(adev); 4882 amdgpu_fence_driver_sw_fini(adev); 4883 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4884 adev->accel_working = false; 4885 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4886 for (i = 0; i < MAX_XCP; ++i) { 4887 dma_fence_put(adev->isolation[i].spearhead); 4888 amdgpu_sync_free(&adev->isolation[i].active); 4889 amdgpu_sync_free(&adev->isolation[i].prev); 4890 } 4891 4892 amdgpu_reset_fini(adev); 4893 4894 /* free i2c buses */ 4895 amdgpu_i2c_fini(adev); 4896 4897 if (adev->bios) { 4898 if (amdgpu_emu_mode != 1) 4899 amdgpu_atombios_fini(adev); 4900 amdgpu_bios_release(adev); 4901 } 4902 4903 kfree(adev->fru_info); 4904 adev->fru_info = NULL; 4905 4906 kfree(adev->xcp_mgr); 4907 adev->xcp_mgr = NULL; 4908 4909 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4910 4911 if (px || (!dev_is_removable(&adev->pdev->dev) && 4912 apple_gmux_detect(NULL, NULL))) 4913 vga_switcheroo_unregister_client(adev->pdev); 4914 4915 if (px) 4916 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4917 4918 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4919 vga_client_unregister(adev->pdev); 4920 4921 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4922 4923 iounmap(adev->rmmio); 4924 adev->rmmio = NULL; 4925 drm_dev_exit(idx); 4926 } 4927 4928 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4929 amdgpu_pmu_fini(adev); 4930 if (adev->mman.discovery_bin) 4931 amdgpu_discovery_fini(adev); 4932 4933 amdgpu_reset_put_reset_domain(adev->reset_domain); 4934 adev->reset_domain = NULL; 4935 4936 kfree(adev->pci_state); 4937 4938 } 4939 4940 /** 4941 * amdgpu_device_evict_resources - evict device resources 4942 * @adev: amdgpu device object 4943 * 4944 * Evicts all ttm device 
resources(vram BOs, gart table) from the lru list 4945 * of the vram memory type. Mainly used for evicting device resources 4946 * at suspend time. 4947 * 4948 */ 4949 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4950 { 4951 int ret; 4952 4953 /* No need to evict vram on APUs unless going to S4 */ 4954 if (!adev->in_s4 && (adev->flags & AMD_IS_APU)) 4955 return 0; 4956 4957 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4958 if (ret) 4959 DRM_WARN("evicting device resources failed\n"); 4960 return ret; 4961 } 4962 4963 /* 4964 * Suspend & resume. 4965 */ 4966 /** 4967 * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events 4968 * @nb: notifier block 4969 * @mode: suspend mode 4970 * @data: data 4971 * 4972 * This function is called when the system is about to suspend or hibernate. 4973 * It is used to evict resources from the device before the system goes to 4974 * sleep while there is still access to swap. 4975 */ 4976 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 4977 void *data) 4978 { 4979 struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb); 4980 int r; 4981 4982 switch (mode) { 4983 case PM_HIBERNATION_PREPARE: 4984 adev->in_s4 = true; 4985 fallthrough; 4986 case PM_SUSPEND_PREPARE: 4987 r = amdgpu_device_evict_resources(adev); 4988 /* 4989 * This is considered non-fatal at this time because 4990 * amdgpu_device_prepare() will also fatally evict resources. 4991 * See https://gitlab.freedesktop.org/drm/amd/-/issues/3781 4992 */ 4993 if (r) 4994 drm_warn(adev_to_drm(adev), "Failed to evict resources, freeze active processes if problems occur: %d\n", r); 4995 break; 4996 } 4997 4998 return NOTIFY_DONE; 4999 } 5000 5001 /** 5002 * amdgpu_device_prepare - prepare for device suspend 5003 * 5004 * @dev: drm dev pointer 5005 * 5006 * Prepare to put the hw in the suspend state (all asics). 5007 * Returns 0 for success or an error on failure. 5008 * Called at driver suspend. 5009 */ 5010 int amdgpu_device_prepare(struct drm_device *dev) 5011 { 5012 struct amdgpu_device *adev = drm_to_adev(dev); 5013 int i, r; 5014 5015 amdgpu_choose_low_power_state(adev); 5016 5017 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5018 return 0; 5019 5020 /* Evict the majority of BOs before starting suspend sequence */ 5021 r = amdgpu_device_evict_resources(adev); 5022 if (r) 5023 goto unprepare; 5024 5025 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 5026 5027 for (i = 0; i < adev->num_ip_blocks; i++) { 5028 if (!adev->ip_blocks[i].status.valid) 5029 continue; 5030 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 5031 continue; 5032 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]); 5033 if (r) 5034 goto unprepare; 5035 } 5036 5037 return 0; 5038 5039 unprepare: 5040 adev->in_s0ix = adev->in_s3 = adev->in_s4 = false; 5041 5042 return r; 5043 } 5044 5045 /** 5046 * amdgpu_device_suspend - initiate device suspend 5047 * 5048 * @dev: drm dev pointer 5049 * @notify_clients: notify in-kernel DRM clients 5050 * 5051 * Puts the hw in the suspend state (all asics). 5052 * Returns 0 for success or an error on failure. 5053 * Called at driver suspend. 
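 *
 * Simplified ordering of the suspend sequence implemented below (SR-IOV
 * and S0ix special cases omitted):
 *
 *   amdgpu_device_ip_suspend_phase1(adev);  - displays first
 *   amdgpu_amdkfd_suspend(adev, adev->in_runpm);
 *   amdgpu_device_evict_resources(adev);
 *   amdgpu_fence_driver_hw_fini(adev);
 *   amdgpu_device_ip_suspend_phase2(adev);  - remaining IP blocks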
5054 */ 5055 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) 5056 { 5057 struct amdgpu_device *adev = drm_to_adev(dev); 5058 int r = 0; 5059 5060 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5061 return 0; 5062 5063 adev->in_suspend = true; 5064 5065 if (amdgpu_sriov_vf(adev)) { 5066 amdgpu_virt_fini_data_exchange(adev); 5067 r = amdgpu_virt_request_full_gpu(adev, false); 5068 if (r) 5069 return r; 5070 } 5071 5072 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 5073 DRM_WARN("smart shift update failed\n"); 5074 5075 if (notify_clients) 5076 drm_client_dev_suspend(adev_to_drm(adev), false); 5077 5078 cancel_delayed_work_sync(&adev->delayed_init_work); 5079 5080 amdgpu_ras_suspend(adev); 5081 5082 amdgpu_device_ip_suspend_phase1(adev); 5083 5084 if (!adev->in_s0ix) 5085 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 5086 5087 r = amdgpu_device_evict_resources(adev); 5088 if (r) 5089 return r; 5090 5091 amdgpu_ttm_set_buffer_funcs_status(adev, false); 5092 5093 amdgpu_fence_driver_hw_fini(adev); 5094 5095 amdgpu_device_ip_suspend_phase2(adev); 5096 5097 if (amdgpu_sriov_vf(adev)) 5098 amdgpu_virt_release_full_gpu(adev, false); 5099 5100 r = amdgpu_dpm_notify_rlc_state(adev, false); 5101 if (r) 5102 return r; 5103 5104 return 0; 5105 } 5106 5107 /** 5108 * amdgpu_device_resume - initiate device resume 5109 * 5110 * @dev: drm dev pointer 5111 * @notify_clients: notify in-kernel DRM clients 5112 * 5113 * Bring the hw back to operating state (all asics). 5114 * Returns 0 for success or an error on failure. 5115 * Called at driver resume. 5116 */ 5117 int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) 5118 { 5119 struct amdgpu_device *adev = drm_to_adev(dev); 5120 int r = 0; 5121 5122 if (amdgpu_sriov_vf(adev)) { 5123 r = amdgpu_virt_request_full_gpu(adev, true); 5124 if (r) 5125 return r; 5126 } 5127 5128 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5129 return 0; 5130 5131 if (adev->in_s0ix) 5132 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 5133 5134 /* post card */ 5135 if (amdgpu_device_need_post(adev)) { 5136 r = amdgpu_device_asic_init(adev); 5137 if (r) 5138 dev_err(adev->dev, "amdgpu asic init failed\n"); 5139 } 5140 5141 r = amdgpu_device_ip_resume(adev); 5142 5143 if (r) { 5144 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 5145 goto exit; 5146 } 5147 5148 if (!adev->in_s0ix) { 5149 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 5150 if (r) 5151 goto exit; 5152 } 5153 5154 r = amdgpu_device_ip_late_init(adev); 5155 if (r) 5156 goto exit; 5157 5158 queue_delayed_work(system_wq, &adev->delayed_init_work, 5159 msecs_to_jiffies(AMDGPU_RESUME_MS)); 5160 exit: 5161 if (amdgpu_sriov_vf(adev)) { 5162 amdgpu_virt_init_data_exchange(adev); 5163 amdgpu_virt_release_full_gpu(adev, true); 5164 } 5165 5166 if (r) 5167 return r; 5168 5169 /* Make sure IB tests flushed */ 5170 flush_delayed_work(&adev->delayed_init_work); 5171 5172 if (notify_clients) 5173 drm_client_dev_resume(adev_to_drm(adev), false); 5174 5175 amdgpu_ras_resume(adev); 5176 5177 if (adev->mode_info.num_crtc) { 5178 /* 5179 * Most of the connector probing functions try to acquire runtime pm 5180 * refs to ensure that the GPU is powered on when connector polling is 5181 * performed. Since we're calling this from a runtime PM callback, 5182 * trying to acquire rpm refs will cause us to deadlock. 
5183 * 5184 * Since we're guaranteed to be holding the rpm lock, it's safe to 5185 * temporarily disable the rpm helpers so this doesn't deadlock us. 5186 */ 5187 #ifdef CONFIG_PM 5188 dev->dev->power.disable_depth++; 5189 #endif 5190 if (!adev->dc_enabled) 5191 drm_helper_hpd_irq_event(dev); 5192 else 5193 drm_kms_helper_hotplug_event(dev); 5194 #ifdef CONFIG_PM 5195 dev->dev->power.disable_depth--; 5196 #endif 5197 } 5198 adev->in_suspend = false; 5199 5200 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 5201 DRM_WARN("smart shift update failed\n"); 5202 5203 return 0; 5204 } 5205 5206 /** 5207 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 5208 * 5209 * @adev: amdgpu_device pointer 5210 * 5211 * The list of all the hardware IPs that make up the asic is walked and 5212 * the check_soft_reset callbacks are run. check_soft_reset determines 5213 * if the asic is still hung or not. 5214 * Returns true if any of the IPs are still in a hung state, false if not. 5215 */ 5216 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 5217 { 5218 int i; 5219 bool asic_hang = false; 5220 5221 if (amdgpu_sriov_vf(adev)) 5222 return true; 5223 5224 if (amdgpu_asic_need_full_reset(adev)) 5225 return true; 5226 5227 for (i = 0; i < adev->num_ip_blocks; i++) { 5228 if (!adev->ip_blocks[i].status.valid) 5229 continue; 5230 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 5231 adev->ip_blocks[i].status.hang = 5232 adev->ip_blocks[i].version->funcs->check_soft_reset( 5233 &adev->ip_blocks[i]); 5234 if (adev->ip_blocks[i].status.hang) { 5235 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 5236 asic_hang = true; 5237 } 5238 } 5239 return asic_hang; 5240 } 5241 5242 /** 5243 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 5244 * 5245 * @adev: amdgpu_device pointer 5246 * 5247 * The list of all the hardware IPs that make up the asic is walked and the 5248 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 5249 * handles any IP specific hardware or software state changes that are 5250 * necessary for a soft reset to succeed. 5251 * Returns 0 on success, negative error code on failure. 5252 */ 5253 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 5254 { 5255 int i, r = 0; 5256 5257 for (i = 0; i < adev->num_ip_blocks; i++) { 5258 if (!adev->ip_blocks[i].status.valid) 5259 continue; 5260 if (adev->ip_blocks[i].status.hang && 5261 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 5262 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]); 5263 if (r) 5264 return r; 5265 } 5266 } 5267 5268 return 0; 5269 } 5270 5271 /** 5272 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 5273 * 5274 * @adev: amdgpu_device pointer 5275 * 5276 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 5277 * reset is necessary to recover. 5278 * Returns true if a full asic reset is required, false if not. 
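 *
 * In practice a hang in any of the GMC, SMC, ACP, DCE or PSP blocks (or
 * an ASIC-level need_full_reset) escalates to a full reset, as those
 * blocks cannot be recovered by a soft reset alone.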
5279 */ 5280 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 5281 { 5282 int i; 5283 5284 if (amdgpu_asic_need_full_reset(adev)) 5285 return true; 5286 5287 for (i = 0; i < adev->num_ip_blocks; i++) { 5288 if (!adev->ip_blocks[i].status.valid) 5289 continue; 5290 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 5291 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 5292 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 5293 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 5294 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 5295 if (adev->ip_blocks[i].status.hang) { 5296 dev_info(adev->dev, "Some block need full reset!\n"); 5297 return true; 5298 } 5299 } 5300 } 5301 return false; 5302 } 5303 5304 /** 5305 * amdgpu_device_ip_soft_reset - do a soft reset 5306 * 5307 * @adev: amdgpu_device pointer 5308 * 5309 * The list of all the hardware IPs that make up the asic is walked and the 5310 * soft_reset callbacks are run if the block is hung. soft_reset handles any 5311 * IP specific hardware or software state changes that are necessary to soft 5312 * reset the IP. 5313 * Returns 0 on success, negative error code on failure. 5314 */ 5315 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 5316 { 5317 int i, r = 0; 5318 5319 for (i = 0; i < adev->num_ip_blocks; i++) { 5320 if (!adev->ip_blocks[i].status.valid) 5321 continue; 5322 if (adev->ip_blocks[i].status.hang && 5323 adev->ip_blocks[i].version->funcs->soft_reset) { 5324 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]); 5325 if (r) 5326 return r; 5327 } 5328 } 5329 5330 return 0; 5331 } 5332 5333 /** 5334 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 5335 * 5336 * @adev: amdgpu_device pointer 5337 * 5338 * The list of all the hardware IPs that make up the asic is walked and the 5339 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 5340 * handles any IP specific hardware or software state changes that are 5341 * necessary after the IP has been soft reset. 5342 * Returns 0 on success, negative error code on failure. 
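 *
 * Together with the helpers above, the soft reset path roughly looks
 * like the following (see amdgpu_device_pre_asic_reset()):
 *
 *   if (amdgpu_device_ip_check_soft_reset(adev)) {
 *           amdgpu_device_ip_pre_soft_reset(adev);
 *           amdgpu_device_ip_soft_reset(adev);
 *           amdgpu_device_ip_post_soft_reset(adev);
 *   }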
5343 */ 5344 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 5345 { 5346 int i, r = 0; 5347 5348 for (i = 0; i < adev->num_ip_blocks; i++) { 5349 if (!adev->ip_blocks[i].status.valid) 5350 continue; 5351 if (adev->ip_blocks[i].status.hang && 5352 adev->ip_blocks[i].version->funcs->post_soft_reset) 5353 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]); 5354 if (r) 5355 return r; 5356 } 5357 5358 return 0; 5359 } 5360 5361 /** 5362 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 5363 * 5364 * @adev: amdgpu_device pointer 5365 * @reset_context: amdgpu reset context pointer 5366 * 5367 * do VF FLR and reinitialize Asic 5368 * return 0 means succeeded otherwise failed 5369 */ 5370 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 5371 struct amdgpu_reset_context *reset_context) 5372 { 5373 int r; 5374 struct amdgpu_hive_info *hive = NULL; 5375 5376 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { 5377 if (!amdgpu_ras_get_fed_status(adev)) 5378 amdgpu_virt_ready_to_reset(adev); 5379 amdgpu_virt_wait_reset(adev); 5380 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5381 r = amdgpu_virt_request_full_gpu(adev, true); 5382 } else { 5383 r = amdgpu_virt_reset_gpu(adev); 5384 } 5385 if (r) 5386 return r; 5387 5388 amdgpu_ras_clear_err_state(adev); 5389 amdgpu_irq_gpu_reset_resume_helper(adev); 5390 5391 /* some sw clean up VF needs to do before recover */ 5392 amdgpu_virt_post_reset(adev); 5393 5394 /* Resume IP prior to SMC */ 5395 r = amdgpu_device_ip_reinit_early_sriov(adev); 5396 if (r) 5397 return r; 5398 5399 amdgpu_virt_init_data_exchange(adev); 5400 5401 r = amdgpu_device_fw_loading(adev); 5402 if (r) 5403 return r; 5404 5405 /* now we are okay to resume SMC/CP/SDMA */ 5406 r = amdgpu_device_ip_reinit_late_sriov(adev); 5407 if (r) 5408 return r; 5409 5410 hive = amdgpu_get_xgmi_hive(adev); 5411 /* Update PSP FW topology after reset */ 5412 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5413 r = amdgpu_xgmi_update_topology(hive, adev); 5414 if (hive) 5415 amdgpu_put_xgmi_hive(hive); 5416 if (r) 5417 return r; 5418 5419 r = amdgpu_ib_ring_tests(adev); 5420 if (r) 5421 return r; 5422 5423 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) 5424 amdgpu_inc_vram_lost(adev); 5425 5426 /* need to be called during full access so we can't do it later like 5427 * bare-metal does. 5428 */ 5429 amdgpu_amdkfd_post_reset(adev); 5430 amdgpu_virt_release_full_gpu(adev, true); 5431 5432 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5433 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) || 5434 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 5435 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 5436 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) || 5437 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5438 amdgpu_ras_resume(adev); 5439 5440 amdgpu_virt_ras_telemetry_post_reset(adev); 5441 5442 return 0; 5443 } 5444 5445 /** 5446 * amdgpu_device_has_job_running - check if there is any unfinished job 5447 * 5448 * @adev: amdgpu_device pointer 5449 * 5450 * check if there is any job running on the device when guest driver receives 5451 * FLR notification from host driver. If there are still jobs running, then 5452 * the guest driver will not respond the FLR reset. Instead, let the job hit 5453 * the timeout and guest driver then issue the reset request. 
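 *
 * Returns true if any ready ring still has emitted fences outstanding,
 * false otherwise.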
5454 */ 5455 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5456 { 5457 int i; 5458 5459 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5460 struct amdgpu_ring *ring = adev->rings[i]; 5461 5462 if (!amdgpu_ring_sched_ready(ring)) 5463 continue; 5464 5465 if (amdgpu_fence_count_emitted(ring)) 5466 return true; 5467 } 5468 return false; 5469 } 5470 5471 /** 5472 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5473 * 5474 * @adev: amdgpu_device pointer 5475 * 5476 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5477 * a hung GPU. 5478 */ 5479 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5480 { 5481 5482 if (amdgpu_gpu_recovery == 0) 5483 goto disabled; 5484 5485 /* Skip soft reset check in fatal error mode */ 5486 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5487 return true; 5488 5489 if (amdgpu_sriov_vf(adev)) 5490 return true; 5491 5492 if (amdgpu_gpu_recovery == -1) { 5493 switch (adev->asic_type) { 5494 #ifdef CONFIG_DRM_AMDGPU_SI 5495 case CHIP_VERDE: 5496 case CHIP_TAHITI: 5497 case CHIP_PITCAIRN: 5498 case CHIP_OLAND: 5499 case CHIP_HAINAN: 5500 #endif 5501 #ifdef CONFIG_DRM_AMDGPU_CIK 5502 case CHIP_KAVERI: 5503 case CHIP_KABINI: 5504 case CHIP_MULLINS: 5505 #endif 5506 case CHIP_CARRIZO: 5507 case CHIP_STONEY: 5508 case CHIP_CYAN_SKILLFISH: 5509 goto disabled; 5510 default: 5511 break; 5512 } 5513 } 5514 5515 return true; 5516 5517 disabled: 5518 dev_info(adev->dev, "GPU recovery disabled.\n"); 5519 return false; 5520 } 5521 5522 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5523 { 5524 u32 i; 5525 int ret = 0; 5526 5527 if (adev->bios) 5528 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5529 5530 dev_info(adev->dev, "GPU mode1 reset\n"); 5531 5532 /* Cache the state before bus master disable. The saved config space 5533 * values are used in other cases like restore after mode-2 reset. 
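 *
 * The cached state is restored below via amdgpu_device_load_pci_state()
 * once the SMU or PSP mode1 reset has been issued.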
5534 */ 5535 amdgpu_device_cache_pci_state(adev->pdev); 5536 5537 /* disable BM */ 5538 pci_clear_master(adev->pdev); 5539 5540 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5541 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5542 ret = amdgpu_dpm_mode1_reset(adev); 5543 } else { 5544 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5545 ret = psp_gpu_reset(adev); 5546 } 5547 5548 if (ret) 5549 goto mode1_reset_failed; 5550 5551 amdgpu_device_load_pci_state(adev->pdev); 5552 ret = amdgpu_psp_wait_for_bootloader(adev); 5553 if (ret) 5554 goto mode1_reset_failed; 5555 5556 /* wait for asic to come out of reset */ 5557 for (i = 0; i < adev->usec_timeout; i++) { 5558 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5559 5560 if (memsize != 0xffffffff) 5561 break; 5562 udelay(1); 5563 } 5564 5565 if (i >= adev->usec_timeout) { 5566 ret = -ETIMEDOUT; 5567 goto mode1_reset_failed; 5568 } 5569 5570 if (adev->bios) 5571 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5572 5573 return 0; 5574 5575 mode1_reset_failed: 5576 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5577 return ret; 5578 } 5579 5580 int amdgpu_device_link_reset(struct amdgpu_device *adev) 5581 { 5582 int ret = 0; 5583 5584 dev_info(adev->dev, "GPU link reset\n"); 5585 5586 if (!adev->pcie_reset_ctx.occurs_dpc) 5587 ret = amdgpu_dpm_link_reset(adev); 5588 5589 if (ret) 5590 goto link_reset_failed; 5591 5592 ret = amdgpu_psp_wait_for_bootloader(adev); 5593 if (ret) 5594 goto link_reset_failed; 5595 5596 return 0; 5597 5598 link_reset_failed: 5599 dev_err(adev->dev, "GPU link reset failed\n"); 5600 return ret; 5601 } 5602 5603 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5604 struct amdgpu_reset_context *reset_context) 5605 { 5606 int i, r = 0; 5607 struct amdgpu_job *job = NULL; 5608 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; 5609 bool need_full_reset = 5610 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5611 5612 if (reset_context->reset_req_dev == adev) 5613 job = reset_context->job; 5614 5615 if (amdgpu_sriov_vf(adev)) 5616 amdgpu_virt_pre_reset(adev); 5617 5618 amdgpu_fence_driver_isr_toggle(adev, true); 5619 5620 /* block all schedulers and reset given job's ring */ 5621 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5622 struct amdgpu_ring *ring = adev->rings[i]; 5623 5624 if (!amdgpu_ring_sched_ready(ring)) 5625 continue; 5626 5627 /* Clear job fence from fence drv to avoid force_completion 5628 * leave NULL and vm flush fence in fence drv 5629 */ 5630 amdgpu_fence_driver_clear_job_fences(ring); 5631 5632 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5633 amdgpu_fence_driver_force_completion(ring); 5634 } 5635 5636 amdgpu_fence_driver_isr_toggle(adev, false); 5637 5638 if (job && job->vm) 5639 drm_sched_increase_karma(&job->base); 5640 5641 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5642 /* If reset handler not implemented, continue; otherwise return */ 5643 if (r == -EOPNOTSUPP) 5644 r = 0; 5645 else 5646 return r; 5647 5648 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5649 if (!amdgpu_sriov_vf(adev)) { 5650 5651 if (!need_full_reset) 5652 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5653 5654 if (!need_full_reset && amdgpu_gpu_recovery && 5655 amdgpu_device_ip_check_soft_reset(adev)) { 5656 amdgpu_device_ip_pre_soft_reset(adev); 5657 r = amdgpu_device_ip_soft_reset(adev); 5658 amdgpu_device_ip_post_soft_reset(adev); 5659 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5660 
dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5661 need_full_reset = true; 5662 } 5663 } 5664 5665 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { 5666 dev_info(tmp_adev->dev, "Dumping IP State\n"); 5667 /* Trigger ip dump before we reset the asic */ 5668 for (i = 0; i < tmp_adev->num_ip_blocks; i++) 5669 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) 5670 tmp_adev->ip_blocks[i].version->funcs 5671 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]); 5672 dev_info(tmp_adev->dev, "Dumping IP State Completed\n"); 5673 } 5674 5675 if (need_full_reset) 5676 r = amdgpu_device_ip_suspend(adev); 5677 if (need_full_reset) 5678 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5679 else 5680 clear_bit(AMDGPU_NEED_FULL_RESET, 5681 &reset_context->flags); 5682 } 5683 5684 return r; 5685 } 5686 5687 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) 5688 { 5689 struct list_head *device_list_handle; 5690 bool full_reset, vram_lost = false; 5691 struct amdgpu_device *tmp_adev; 5692 int r, init_level; 5693 5694 device_list_handle = reset_context->reset_device_list; 5695 5696 if (!device_list_handle) 5697 return -EINVAL; 5698 5699 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5700 5701 /** 5702 * If it's reset on init, it's default init level, otherwise keep level 5703 * as recovery level. 5704 */ 5705 if (reset_context->method == AMD_RESET_METHOD_ON_INIT) 5706 init_level = AMDGPU_INIT_LEVEL_DEFAULT; 5707 else 5708 init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY; 5709 5710 r = 0; 5711 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5712 amdgpu_set_init_level(tmp_adev, init_level); 5713 if (full_reset) { 5714 /* post card */ 5715 amdgpu_ras_clear_err_state(tmp_adev); 5716 r = amdgpu_device_asic_init(tmp_adev); 5717 if (r) { 5718 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5719 } else { 5720 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5721 5722 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5723 if (r) 5724 goto out; 5725 5726 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5727 5728 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 5729 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job); 5730 5731 if (vram_lost) { 5732 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5733 amdgpu_inc_vram_lost(tmp_adev); 5734 } 5735 5736 r = amdgpu_device_fw_loading(tmp_adev); 5737 if (r) 5738 return r; 5739 5740 r = amdgpu_xcp_restore_partition_mode( 5741 tmp_adev->xcp_mgr); 5742 if (r) 5743 goto out; 5744 5745 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5746 if (r) 5747 goto out; 5748 5749 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 5750 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5751 5752 r = amdgpu_device_ip_resume_phase3(tmp_adev); 5753 if (r) 5754 goto out; 5755 5756 if (vram_lost) 5757 amdgpu_device_fill_reset_magic(tmp_adev); 5758 5759 /* 5760 * Add this ASIC as tracked as reset was already 5761 * complete successfully. 5762 */ 5763 amdgpu_register_gpu_instance(tmp_adev); 5764 5765 if (!reset_context->hive && 5766 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5767 amdgpu_xgmi_add_device(tmp_adev); 5768 5769 r = amdgpu_device_ip_late_init(tmp_adev); 5770 if (r) 5771 goto out; 5772 5773 drm_client_dev_resume(adev_to_drm(tmp_adev), false); 5774 5775 /* 5776 * The GPU enters bad state once faulty pages 5777 * by ECC has reached the threshold, and ras 5778 * recovery is scheduled next. 
So add one check 5779 * here to break recovery if it indeed exceeds 5780 * bad page threshold, and remind user to 5781 * retire this GPU or setting one bigger 5782 * bad_page_threshold value to fix this once 5783 * probing driver again. 5784 */ 5785 if (!amdgpu_ras_is_rma(tmp_adev)) { 5786 /* must succeed. */ 5787 amdgpu_ras_resume(tmp_adev); 5788 } else { 5789 r = -EINVAL; 5790 goto out; 5791 } 5792 5793 /* Update PSP FW topology after reset */ 5794 if (reset_context->hive && 5795 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5796 r = amdgpu_xgmi_update_topology( 5797 reset_context->hive, tmp_adev); 5798 } 5799 } 5800 5801 out: 5802 if (!r) { 5803 /* IP init is complete now, set level as default */ 5804 amdgpu_set_init_level(tmp_adev, 5805 AMDGPU_INIT_LEVEL_DEFAULT); 5806 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5807 r = amdgpu_ib_ring_tests(tmp_adev); 5808 if (r) { 5809 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5810 r = -EAGAIN; 5811 goto end; 5812 } 5813 } 5814 5815 if (r) 5816 tmp_adev->asic_reset_res = r; 5817 } 5818 5819 end: 5820 return r; 5821 } 5822 5823 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5824 struct amdgpu_reset_context *reset_context) 5825 { 5826 struct amdgpu_device *tmp_adev = NULL; 5827 bool need_full_reset, skip_hw_reset; 5828 int r = 0; 5829 5830 /* Try reset handler method first */ 5831 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5832 reset_list); 5833 5834 reset_context->reset_device_list = device_list_handle; 5835 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5836 /* If reset handler not implemented, continue; otherwise return */ 5837 if (r == -EOPNOTSUPP) 5838 r = 0; 5839 else 5840 return r; 5841 5842 /* Reset handler not implemented, use the default method */ 5843 need_full_reset = 5844 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5845 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5846 5847 /* 5848 * ASIC reset has to be done on all XGMI hive nodes ASAP 5849 * to allow proper links negotiation in FW (within 1 sec) 5850 */ 5851 if (!skip_hw_reset && need_full_reset) { 5852 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5853 /* For XGMI run all resets in parallel to speed up the process */ 5854 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5855 if (!queue_work(system_unbound_wq, 5856 &tmp_adev->xgmi_reset_work)) 5857 r = -EALREADY; 5858 } else 5859 r = amdgpu_asic_reset(tmp_adev); 5860 5861 if (r) { 5862 dev_err(tmp_adev->dev, 5863 "ASIC reset failed with error, %d for drm dev, %s", 5864 r, adev_to_drm(tmp_adev)->unique); 5865 goto out; 5866 } 5867 } 5868 5869 /* For XGMI wait for all resets to complete before proceed */ 5870 if (!r) { 5871 list_for_each_entry(tmp_adev, device_list_handle, 5872 reset_list) { 5873 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5874 flush_work(&tmp_adev->xgmi_reset_work); 5875 r = tmp_adev->asic_reset_res; 5876 if (r) 5877 break; 5878 } 5879 } 5880 } 5881 } 5882 5883 if (!r && amdgpu_ras_intr_triggered()) { 5884 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5885 amdgpu_ras_reset_error_count(tmp_adev, 5886 AMDGPU_RAS_BLOCK__MMHUB); 5887 } 5888 5889 amdgpu_ras_intr_cleared(); 5890 } 5891 5892 r = amdgpu_device_reinit_after_reset(reset_context); 5893 if (r == -EAGAIN) 5894 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5895 else 5896 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5897 5898 out: 5899 return r; 5900 } 5901 5902 static void 
amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
{

	switch (amdgpu_asic_reset_method(adev)) {
	case AMD_RESET_METHOD_MODE1:
	case AMD_RESET_METHOD_LINK:
		adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
		break;
	case AMD_RESET_METHOD_MODE2:
		adev->mp1_state = PP_MP1_STATE_RESET;
		break;
	default:
		adev->mp1_state = PP_MP1_STATE_NONE;
		break;
	}
}

static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
{
	amdgpu_vf_error_trans_all(adev);
	adev->mp1_state = PP_MP1_STATE_NONE;
}

static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
{
	struct pci_dev *p = NULL;

	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
			adev->pdev->bus->number, 1);
	if (p) {
		pm_runtime_enable(&(p->dev));
		pm_runtime_resume(&(p->dev));
	}

	pci_dev_put(p);
}

static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
{
	enum amd_reset_method reset_method;
	struct pci_dev *p = NULL;
	u64 expires;

	/*
	 * For now, only BACO and mode1 reset are confirmed to suffer the
	 * audio issue if the audio device is not properly suspended first.
	 */
	reset_method = amdgpu_asic_reset_method(adev);
	if ((reset_method != AMD_RESET_METHOD_BACO) &&
	    (reset_method != AMD_RESET_METHOD_MODE1))
		return -EINVAL;

	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
			adev->pdev->bus->number, 1);
	if (!p)
		return -ENODEV;

	expires = pm_runtime_autosuspend_expiration(&(p->dev));
	if (!expires)
		/*
		 * If we cannot get the audio device autosuspend delay,
		 * use a fixed 4s interval. The audio controller's default
		 * autosuspend delay is 3s, so 4s is guaranteed to cover it.
		 */
		expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;

	while (!pm_runtime_status_suspended(&(p->dev))) {
		if (!pm_runtime_suspend(&(p->dev)))
			break;

		if (expires < ktime_get_mono_fast_ns()) {
			dev_warn(adev->dev, "failed to suspend display audio\n");
			pci_dev_put(p);
			/* TODO: abort the succeeding gpu reset?
*/ 5977 return -ETIMEDOUT; 5978 } 5979 } 5980 5981 pm_runtime_disable(&(p->dev)); 5982 5983 pci_dev_put(p); 5984 return 0; 5985 } 5986 5987 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5988 { 5989 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5990 5991 #if defined(CONFIG_DEBUG_FS) 5992 if (!amdgpu_sriov_vf(adev)) 5993 cancel_work(&adev->reset_work); 5994 #endif 5995 5996 if (adev->kfd.dev) 5997 cancel_work(&adev->kfd.reset_work); 5998 5999 if (amdgpu_sriov_vf(adev)) 6000 cancel_work(&adev->virt.flr_work); 6001 6002 if (con && adev->ras_enabled) 6003 cancel_work(&con->recovery_work); 6004 6005 } 6006 6007 static int amdgpu_device_health_check(struct list_head *device_list_handle) 6008 { 6009 struct amdgpu_device *tmp_adev; 6010 int ret = 0; 6011 u32 status; 6012 6013 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6014 pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status); 6015 if (PCI_POSSIBLE_ERROR(status)) { 6016 dev_err(tmp_adev->dev, "device lost from bus!"); 6017 ret = -ENODEV; 6018 } 6019 } 6020 6021 return ret; 6022 } 6023 6024 static int amdgpu_device_halt_activities(struct amdgpu_device *adev, 6025 struct amdgpu_job *job, 6026 struct amdgpu_reset_context *reset_context, 6027 struct list_head *device_list, 6028 struct amdgpu_hive_info *hive, 6029 bool need_emergency_restart) 6030 { 6031 struct list_head *device_list_handle = NULL; 6032 struct amdgpu_device *tmp_adev = NULL; 6033 int i, r = 0; 6034 6035 /* 6036 * Build list of devices to reset. 6037 * In case we are in XGMI hive mode, resort the device list 6038 * to put adev in the 1st position. 6039 */ 6040 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) { 6041 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 6042 list_add_tail(&tmp_adev->reset_list, device_list); 6043 if (adev->shutdown) 6044 tmp_adev->shutdown = true; 6045 if (adev->pcie_reset_ctx.occurs_dpc) 6046 tmp_adev->pcie_reset_ctx.in_link_reset = true; 6047 } 6048 if (!list_is_first(&adev->reset_list, device_list)) 6049 list_rotate_to_front(&adev->reset_list, device_list); 6050 device_list_handle = device_list; 6051 } else { 6052 list_add_tail(&adev->reset_list, device_list); 6053 device_list_handle = device_list; 6054 } 6055 6056 if (!amdgpu_sriov_vf(adev) && (!adev->pcie_reset_ctx.occurs_dpc)) { 6057 r = amdgpu_device_health_check(device_list_handle); 6058 if (r) 6059 return r; 6060 } 6061 6062 /* We need to lock reset domain only once both for XGMI and single device */ 6063 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 6064 reset_list); 6065 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 6066 6067 /* block all schedulers and reset given job's ring */ 6068 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6069 6070 amdgpu_device_set_mp1_state(tmp_adev); 6071 6072 /* 6073 * Try to put the audio codec into suspend state 6074 * before gpu reset started. 6075 * 6076 * Due to the power domain of the graphics device 6077 * is shared with AZ power domain. Without this, 6078 * we may change the audio hardware from behind 6079 * the audio driver's back. That will trigger 6080 * some audio codec errors. 
6081 */ 6082 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 6083 tmp_adev->pcie_reset_ctx.audio_suspended = true; 6084 6085 amdgpu_ras_set_error_query_ready(tmp_adev, false); 6086 6087 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 6088 6089 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context); 6090 6091 /* 6092 * Mark these ASICs to be reset as untracked first 6093 * And add them back after reset completed 6094 */ 6095 amdgpu_unregister_gpu_instance(tmp_adev); 6096 6097 drm_client_dev_suspend(adev_to_drm(tmp_adev), false); 6098 6099 /* disable ras on ALL IPs */ 6100 if (!need_emergency_restart && 6101 (!adev->pcie_reset_ctx.occurs_dpc) && 6102 amdgpu_device_ip_need_full_reset(tmp_adev)) 6103 amdgpu_ras_suspend(tmp_adev); 6104 6105 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6106 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6107 6108 if (!amdgpu_ring_sched_ready(ring)) 6109 continue; 6110 6111 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 6112 6113 if (need_emergency_restart) 6114 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 6115 } 6116 atomic_inc(&tmp_adev->gpu_reset_counter); 6117 } 6118 6119 return r; 6120 } 6121 6122 static int amdgpu_device_asic_reset(struct amdgpu_device *adev, 6123 struct list_head *device_list, 6124 struct amdgpu_reset_context *reset_context) 6125 { 6126 struct amdgpu_device *tmp_adev = NULL; 6127 int retry_limit = AMDGPU_MAX_RETRY_LIMIT; 6128 int r = 0; 6129 6130 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 6131 list_for_each_entry(tmp_adev, device_list, reset_list) { 6132 if (adev->pcie_reset_ctx.occurs_dpc) 6133 tmp_adev->no_hw_access = true; 6134 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 6135 if (adev->pcie_reset_ctx.occurs_dpc) 6136 tmp_adev->no_hw_access = false; 6137 /*TODO Should we stop ?*/ 6138 if (r) { 6139 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 6140 r, adev_to_drm(tmp_adev)->unique); 6141 tmp_adev->asic_reset_res = r; 6142 } 6143 } 6144 6145 /* Actual ASIC resets if needed.*/ 6146 /* Host driver will handle XGMI hive reset for SRIOV */ 6147 if (amdgpu_sriov_vf(adev)) { 6148 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { 6149 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); 6150 amdgpu_ras_set_fed(adev, true); 6151 set_bit(AMDGPU_HOST_FLR, &reset_context->flags); 6152 } 6153 6154 r = amdgpu_device_reset_sriov(adev, reset_context); 6155 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) { 6156 amdgpu_virt_release_full_gpu(adev, true); 6157 goto retry; 6158 } 6159 if (r) 6160 adev->asic_reset_res = r; 6161 } else { 6162 r = amdgpu_do_asic_reset(device_list, reset_context); 6163 if (r && r == -EAGAIN) 6164 goto retry; 6165 } 6166 6167 list_for_each_entry(tmp_adev, device_list, reset_list) { 6168 /* 6169 * Drop any pending non scheduler resets queued before reset is done. 6170 * Any reset scheduled after this point would be valid. Scheduler resets 6171 * were already dropped during drm_sched_stop and no new ones can come 6172 * in before drm_sched_start. 
6173 */ 6174 amdgpu_device_stop_pending_resets(tmp_adev); 6175 } 6176 6177 return r; 6178 } 6179 6180 static int amdgpu_device_sched_resume(struct list_head *device_list, 6181 struct amdgpu_reset_context *reset_context, 6182 bool job_signaled) 6183 { 6184 struct amdgpu_device *tmp_adev = NULL; 6185 int i, r = 0; 6186 6187 /* Post ASIC reset for all devs .*/ 6188 list_for_each_entry(tmp_adev, device_list, reset_list) { 6189 6190 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6191 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6192 6193 if (!amdgpu_ring_sched_ready(ring)) 6194 continue; 6195 6196 drm_sched_start(&ring->sched, 0); 6197 } 6198 6199 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 6200 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 6201 6202 if (tmp_adev->asic_reset_res) 6203 r = tmp_adev->asic_reset_res; 6204 6205 tmp_adev->asic_reset_res = 0; 6206 6207 if (r) { 6208 /* bad news, how to tell it to userspace ? 6209 * for ras error, we should report GPU bad status instead of 6210 * reset failure 6211 */ 6212 if (reset_context->src != AMDGPU_RESET_SRC_RAS || 6213 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) 6214 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", 6215 atomic_read(&tmp_adev->gpu_reset_counter)); 6216 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 6217 } else { 6218 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 6219 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 6220 DRM_WARN("smart shift update failed\n"); 6221 } 6222 } 6223 6224 return r; 6225 } 6226 6227 static void amdgpu_device_gpu_resume(struct amdgpu_device *adev, 6228 struct list_head *device_list, 6229 bool need_emergency_restart) 6230 { 6231 struct amdgpu_device *tmp_adev = NULL; 6232 6233 list_for_each_entry(tmp_adev, device_list, reset_list) { 6234 /* unlock kfd: SRIOV would do it separately */ 6235 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 6236 amdgpu_amdkfd_post_reset(tmp_adev); 6237 6238 /* kfd_post_reset will do nothing if kfd device is not initialized, 6239 * need to bring up kfd here if it's not be initialized before 6240 */ 6241 if (!adev->kfd.init_complete) 6242 amdgpu_amdkfd_device_init(adev); 6243 6244 if (tmp_adev->pcie_reset_ctx.audio_suspended) 6245 amdgpu_device_resume_display_audio(tmp_adev); 6246 6247 amdgpu_device_unset_mp1_state(tmp_adev); 6248 6249 amdgpu_ras_set_error_query_ready(tmp_adev, true); 6250 6251 } 6252 6253 tmp_adev = list_first_entry(device_list, struct amdgpu_device, 6254 reset_list); 6255 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 6256 6257 } 6258 6259 6260 /** 6261 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 6262 * 6263 * @adev: amdgpu_device pointer 6264 * @job: which job trigger hang 6265 * @reset_context: amdgpu reset context pointer 6266 * 6267 * Attempt to reset the GPU if it has hung (all asics). 6268 * Attempt to do soft-reset or full-reset and reinitialize Asic 6269 * Returns 0 for success or an error on failure. 
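 * For devices in an XGMI hive the reset is coordinated across all nodes of
 * the hive.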
6270 */ 6271 6272 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 6273 struct amdgpu_job *job, 6274 struct amdgpu_reset_context *reset_context) 6275 { 6276 struct list_head device_list; 6277 bool job_signaled = false; 6278 struct amdgpu_hive_info *hive = NULL; 6279 int r = 0; 6280 bool need_emergency_restart = false; 6281 6282 /* 6283 * If it reaches here because of hang/timeout and a RAS error is 6284 * detected at the same time, let RAS recovery take care of it. 6285 */ 6286 if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) && 6287 !amdgpu_sriov_vf(adev) && 6288 reset_context->src != AMDGPU_RESET_SRC_RAS) { 6289 dev_dbg(adev->dev, 6290 "Gpu recovery from source: %d yielding to RAS error recovery handling", 6291 reset_context->src); 6292 return 0; 6293 } 6294 6295 /* 6296 * Special case: RAS triggered and full reset isn't supported 6297 */ 6298 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 6299 6300 /* 6301 * Flush RAM to disk so that after reboot 6302 * the user can read log and see why the system rebooted. 6303 */ 6304 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 6305 amdgpu_ras_get_context(adev)->reboot) { 6306 DRM_WARN("Emergency reboot."); 6307 6308 ksys_sync_helper(); 6309 emergency_restart(); 6310 } 6311 6312 dev_info(adev->dev, "GPU %s begin!\n", 6313 need_emergency_restart ? "jobs stop":"reset"); 6314 6315 if (!amdgpu_sriov_vf(adev)) 6316 hive = amdgpu_get_xgmi_hive(adev); 6317 if (hive) 6318 mutex_lock(&hive->hive_lock); 6319 6320 reset_context->job = job; 6321 reset_context->hive = hive; 6322 INIT_LIST_HEAD(&device_list); 6323 6324 r = amdgpu_device_halt_activities(adev, job, reset_context, &device_list, 6325 hive, need_emergency_restart); 6326 if (r) 6327 goto end_reset; 6328 6329 if (need_emergency_restart) 6330 goto skip_sched_resume; 6331 /* 6332 * Must check guilty signal here since after this point all old 6333 * HW fences are force signaled. 6334 * 6335 * job->base holds a reference to parent fence 6336 */ 6337 if (job && dma_fence_is_signaled(&job->hw_fence)) { 6338 job_signaled = true; 6339 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 6340 goto skip_hw_reset; 6341 } 6342 6343 r = amdgpu_device_asic_reset(adev, &device_list, reset_context); 6344 if (r) 6345 goto end_reset; 6346 skip_hw_reset: 6347 r = amdgpu_device_sched_resume(&device_list, reset_context, job_signaled); 6348 if (r) 6349 goto end_reset; 6350 skip_sched_resume: 6351 amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart); 6352 end_reset: 6353 if (hive) { 6354 mutex_unlock(&hive->hive_lock); 6355 amdgpu_put_xgmi_hive(hive); 6356 } 6357 6358 if (r) 6359 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 6360 6361 atomic_set(&adev->reset_domain->reset_res, r); 6362 6363 if (!r) 6364 drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE); 6365 6366 return r; 6367 } 6368 6369 /** 6370 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner 6371 * 6372 * @adev: amdgpu_device pointer 6373 * @speed: pointer to the speed of the link 6374 * @width: pointer to the width of the link 6375 * 6376 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6377 * first physical partner to an AMD dGPU. 6378 * This will exclude any virtual switches and links. 
 */
static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
					    enum pci_bus_speed *speed,
					    enum pcie_link_width *width)
{
	struct pci_dev *parent = adev->pdev;

	if (!speed || !width)
		return;

	*speed = PCI_SPEED_UNKNOWN;
	*width = PCIE_LNK_WIDTH_UNKNOWN;

	if (amdgpu_device_pcie_dynamic_switching_supported(adev)) {
		while ((parent = pci_upstream_bridge(parent))) {
			/* skip upstream/downstream switches internal to dGPU */
			if (parent->vendor == PCI_VENDOR_ID_ATI)
				continue;
			*speed = pcie_get_speed_cap(parent);
			*width = pcie_get_width_cap(parent);
			break;
		}
	} else {
		/* use the current speeds rather than max if switching is not supported */
		pcie_bandwidth_available(adev->pdev, NULL, speed, width);
	}
}

/**
 * amdgpu_device_gpu_bandwidth - find the bandwidth of the GPU
 *
 * @adev: amdgpu_device pointer
 * @speed: pointer to the speed of the link
 * @width: pointer to the width of the link
 *
 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
 * AMD dGPU which may be a virtual upstream bridge.
 */
static void amdgpu_device_gpu_bandwidth(struct amdgpu_device *adev,
					enum pci_bus_speed *speed,
					enum pcie_link_width *width)
{
	struct pci_dev *parent = adev->pdev;

	if (!speed || !width)
		return;

	parent = pci_upstream_bridge(parent);
	if (parent && parent->vendor == PCI_VENDOR_ID_ATI) {
		/* use the upstream/downstream switches internal to dGPU */
		*speed = pcie_get_speed_cap(parent);
		*width = pcie_get_width_cap(parent);
		while ((parent = pci_upstream_bridge(parent))) {
			if (parent->vendor == PCI_VENDOR_ID_ATI) {
				/* use the upstream/downstream switches internal to dGPU */
				*speed = pcie_get_speed_cap(parent);
				*width = pcie_get_width_cap(parent);
			}
		}
	} else {
		/* use the device itself */
		*speed = pcie_get_speed_cap(adev->pdev);
		*width = pcie_get_width_cap(adev->pdev);
	}
}

/**
 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIE capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIE config space may not be available.
6453 */ 6454 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 6455 { 6456 enum pci_bus_speed speed_cap, platform_speed_cap; 6457 enum pcie_link_width platform_link_width, link_width; 6458 6459 if (amdgpu_pcie_gen_cap) 6460 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 6461 6462 if (amdgpu_pcie_lane_cap) 6463 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 6464 6465 /* covers APUs as well */ 6466 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 6467 if (adev->pm.pcie_gen_mask == 0) 6468 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 6469 if (adev->pm.pcie_mlw_mask == 0) 6470 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 6471 return; 6472 } 6473 6474 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 6475 return; 6476 6477 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap, 6478 &platform_link_width); 6479 amdgpu_device_gpu_bandwidth(adev, &speed_cap, &link_width); 6480 6481 if (adev->pm.pcie_gen_mask == 0) { 6482 /* asic caps */ 6483 if (speed_cap == PCI_SPEED_UNKNOWN) { 6484 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6485 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6486 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6487 } else { 6488 if (speed_cap == PCIE_SPEED_32_0GT) 6489 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6490 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6491 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6492 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6493 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 6494 else if (speed_cap == PCIE_SPEED_16_0GT) 6495 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6496 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6497 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6498 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 6499 else if (speed_cap == PCIE_SPEED_8_0GT) 6500 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6501 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6502 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6503 else if (speed_cap == PCIE_SPEED_5_0GT) 6504 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6505 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 6506 else 6507 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 6508 } 6509 /* platform caps */ 6510 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 6511 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6512 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6513 } else { 6514 if (platform_speed_cap == PCIE_SPEED_32_0GT) 6515 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6516 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6517 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6518 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6519 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 6520 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 6521 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6522 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6523 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6524 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 6525 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 6526 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6527 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6528 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 6529 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 6530 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6531 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6532 else 6533 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 6534 6535 } 6536 } 6537 if (adev->pm.pcie_mlw_mask == 0) { 6538 /* asic caps */ 6539 if (link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6540 
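			/* ASIC width cap unknown; advertise the default ASIC link width mask. */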
adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_ASIC_PCIE_MLW_MASK; 6541 } else { 6542 switch (link_width) { 6543 case PCIE_LNK_X32: 6544 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X32 | 6545 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6546 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6547 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6548 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6549 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6550 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6551 break; 6552 case PCIE_LNK_X16: 6553 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6554 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6555 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6556 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6557 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6558 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6559 break; 6560 case PCIE_LNK_X12: 6561 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6562 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6563 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6564 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6565 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6566 break; 6567 case PCIE_LNK_X8: 6568 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6569 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6570 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6571 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6572 break; 6573 case PCIE_LNK_X4: 6574 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6575 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6576 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6577 break; 6578 case PCIE_LNK_X2: 6579 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6580 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6581 break; 6582 case PCIE_LNK_X1: 6583 adev->pm.pcie_mlw_mask |= CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1; 6584 break; 6585 default: 6586 break; 6587 } 6588 } 6589 /* platform caps */ 6590 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6591 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 6592 } else { 6593 switch (platform_link_width) { 6594 case PCIE_LNK_X32: 6595 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 6596 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6597 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6598 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6599 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6600 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6601 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6602 break; 6603 case PCIE_LNK_X16: 6604 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6605 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6606 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6607 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6608 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6609 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6610 break; 6611 case PCIE_LNK_X12: 6612 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6613 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6614 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6615 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6616 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6617 break; 6618 case PCIE_LNK_X8: 6619 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6620 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6621 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6622 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6623 break; 6624 case PCIE_LNK_X4: 6625 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6626 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6627 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6628 break; 6629 case PCIE_LNK_X2: 6630 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6631 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6632 break; 6633 case PCIE_LNK_X1: 6634 adev->pm.pcie_mlw_mask |= CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 6635 break; 6636 
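			/* Any other reported width: leave the ASIC width mask unset. */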
default: 6637 break; 6638 } 6639 } 6640 } 6641 } 6642 6643 /** 6644 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 6645 * 6646 * @adev: amdgpu_device pointer 6647 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 6648 * 6649 * Return true if @peer_adev can access (DMA) @adev through the PCIe 6650 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 6651 * @peer_adev. 6652 */ 6653 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 6654 struct amdgpu_device *peer_adev) 6655 { 6656 #ifdef CONFIG_HSA_AMD_P2P 6657 bool p2p_access = 6658 !adev->gmc.xgmi.connected_to_cpu && 6659 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 6660 if (!p2p_access) 6661 dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n", 6662 pci_name(peer_adev->pdev)); 6663 6664 bool is_large_bar = adev->gmc.visible_vram_size && 6665 adev->gmc.real_vram_size == adev->gmc.visible_vram_size; 6666 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev); 6667 6668 if (!p2p_addressable) { 6669 uint64_t address_mask = peer_adev->dev->dma_mask ? 6670 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 6671 resource_size_t aper_limit = 6672 adev->gmc.aper_base + adev->gmc.aper_size - 1; 6673 6674 p2p_addressable = !(adev->gmc.aper_base & address_mask || 6675 aper_limit & address_mask); 6676 } 6677 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable; 6678 #else 6679 return false; 6680 #endif 6681 } 6682 6683 int amdgpu_device_baco_enter(struct drm_device *dev) 6684 { 6685 struct amdgpu_device *adev = drm_to_adev(dev); 6686 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6687 6688 if (!amdgpu_device_supports_baco(dev)) 6689 return -ENOTSUPP; 6690 6691 if (ras && adev->ras_enabled && 6692 adev->nbio.funcs->enable_doorbell_interrupt) 6693 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 6694 6695 return amdgpu_dpm_baco_enter(adev); 6696 } 6697 6698 int amdgpu_device_baco_exit(struct drm_device *dev) 6699 { 6700 struct amdgpu_device *adev = drm_to_adev(dev); 6701 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6702 int ret = 0; 6703 6704 if (!amdgpu_device_supports_baco(dev)) 6705 return -ENOTSUPP; 6706 6707 ret = amdgpu_dpm_baco_exit(adev); 6708 if (ret) 6709 return ret; 6710 6711 if (ras && adev->ras_enabled && 6712 adev->nbio.funcs->enable_doorbell_interrupt) 6713 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 6714 6715 if (amdgpu_passthrough(adev) && adev->nbio.funcs && 6716 adev->nbio.funcs->clear_doorbell_interrupt) 6717 adev->nbio.funcs->clear_doorbell_interrupt(adev); 6718 6719 return 0; 6720 } 6721 6722 /** 6723 * amdgpu_pci_error_detected - Called when a PCI error is detected. 6724 * @pdev: PCI device struct 6725 * @state: PCI channel state 6726 * 6727 * Description: Called when a PCI error is detected. 6728 * 6729 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
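 * PCI_ERS_RESULT_CAN_RECOVER may also be returned when the channel state is
 * pci_channel_io_normal.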
6730 */ 6731 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 6732 { 6733 struct drm_device *dev = pci_get_drvdata(pdev); 6734 struct amdgpu_device *adev = drm_to_adev(dev); 6735 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 6736 struct amdgpu_reset_context reset_context; 6737 struct list_head device_list; 6738 int r = 0; 6739 6740 dev_info(adev->dev, "PCI error: detected callback!!\n"); 6741 6742 if (!amdgpu_dpm_is_link_reset_supported(adev)) { 6743 dev_warn(adev->dev, "No support for XGMI hive yet...\n"); 6744 return PCI_ERS_RESULT_DISCONNECT; 6745 } 6746 6747 adev->pci_channel_state = state; 6748 6749 switch (state) { 6750 case pci_channel_io_normal: 6751 dev_info(adev->dev, "pci_channel_io_normal: state(%d)!!\n", state); 6752 return PCI_ERS_RESULT_CAN_RECOVER; 6753 case pci_channel_io_frozen: 6754 /* Fatal error, prepare for slot reset */ 6755 dev_info(adev->dev, "pci_channel_io_frozen: state(%d)!!\n", state); 6756 6757 if (hive) 6758 mutex_lock(&hive->hive_lock); 6759 adev->pcie_reset_ctx.occurs_dpc = true; 6760 memset(&reset_context, 0, sizeof(reset_context)); 6761 INIT_LIST_HEAD(&device_list); 6762 6763 r = amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list, 6764 hive, false); 6765 if (hive) { 6766 mutex_unlock(&hive->hive_lock); 6767 amdgpu_put_xgmi_hive(hive); 6768 } 6769 if (r) 6770 return PCI_ERS_RESULT_DISCONNECT; 6771 return PCI_ERS_RESULT_NEED_RESET; 6772 case pci_channel_io_perm_failure: 6773 /* Permanent error, prepare for device removal */ 6774 dev_info(adev->dev, "pci_channel_io_perm_failure: state(%d)!!\n", state); 6775 return PCI_ERS_RESULT_DISCONNECT; 6776 } 6777 6778 return PCI_ERS_RESULT_NEED_RESET; 6779 } 6780 6781 /** 6782 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 6783 * @pdev: pointer to PCI device 6784 */ 6785 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 6786 { 6787 struct drm_device *dev = pci_get_drvdata(pdev); 6788 struct amdgpu_device *adev = drm_to_adev(dev); 6789 6790 dev_info(adev->dev, "PCI error: mmio enabled callback!!\n"); 6791 6792 /* TODO - dump whatever for debugging purposes */ 6793 6794 /* This called only if amdgpu_pci_error_detected returns 6795 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 6796 * works, no need to reset slot. 6797 */ 6798 6799 return PCI_ERS_RESULT_RECOVERED; 6800 } 6801 6802 /** 6803 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 6804 * @pdev: PCI device struct 6805 * 6806 * Description: This routine is called by the pci error recovery 6807 * code after the PCI slot has been reset, just before we 6808 * should resume normal operations. 
6809 */ 6810 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 6811 { 6812 struct drm_device *dev = pci_get_drvdata(pdev); 6813 struct amdgpu_device *adev = drm_to_adev(dev); 6814 struct amdgpu_reset_context reset_context; 6815 struct amdgpu_device *tmp_adev; 6816 struct amdgpu_hive_info *hive; 6817 struct list_head device_list; 6818 int r = 0, i; 6819 u32 memsize; 6820 6821 /* PCI error slot reset should be skipped During RAS recovery */ 6822 if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 6823 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && 6824 amdgpu_ras_in_recovery(adev)) 6825 return PCI_ERS_RESULT_RECOVERED; 6826 6827 dev_info(adev->dev, "PCI error: slot reset callback!!\n"); 6828 6829 memset(&reset_context, 0, sizeof(reset_context)); 6830 6831 /* wait for asic to come out of reset */ 6832 msleep(700); 6833 6834 /* Restore PCI confspace */ 6835 amdgpu_device_load_pci_state(pdev); 6836 6837 /* confirm ASIC came out of reset */ 6838 for (i = 0; i < adev->usec_timeout; i++) { 6839 memsize = amdgpu_asic_get_config_memsize(adev); 6840 6841 if (memsize != 0xffffffff) 6842 break; 6843 udelay(1); 6844 } 6845 if (memsize == 0xffffffff) { 6846 r = -ETIME; 6847 goto out; 6848 } 6849 6850 reset_context.method = AMD_RESET_METHOD_NONE; 6851 reset_context.reset_req_dev = adev; 6852 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 6853 set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags); 6854 INIT_LIST_HEAD(&device_list); 6855 6856 hive = amdgpu_get_xgmi_hive(adev); 6857 if (hive) { 6858 mutex_lock(&hive->hive_lock); 6859 reset_context.hive = hive; 6860 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 6861 tmp_adev->pcie_reset_ctx.in_link_reset = true; 6862 list_add_tail(&tmp_adev->reset_list, &device_list); 6863 } 6864 } else { 6865 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 6866 list_add_tail(&adev->reset_list, &device_list); 6867 } 6868 6869 r = amdgpu_device_asic_reset(adev, &device_list, &reset_context); 6870 out: 6871 if (!r) { 6872 if (amdgpu_device_cache_pci_state(adev->pdev)) 6873 pci_restore_state(adev->pdev); 6874 dev_info(adev->dev, "PCIe error recovery succeeded\n"); 6875 } else { 6876 dev_err(adev->dev, "PCIe error recovery failed, err:%d\n", r); 6877 if (hive) { 6878 list_for_each_entry(tmp_adev, &device_list, reset_list) 6879 amdgpu_device_unset_mp1_state(tmp_adev); 6880 amdgpu_device_unlock_reset_domain(adev->reset_domain); 6881 } 6882 } 6883 6884 if (hive) { 6885 mutex_unlock(&hive->hive_lock); 6886 amdgpu_put_xgmi_hive(hive); 6887 } 6888 6889 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 6890 } 6891 6892 /** 6893 * amdgpu_pci_resume() - resume normal ops after PCI reset 6894 * @pdev: pointer to PCI device 6895 * 6896 * Called when the error recovery driver tells us that its 6897 * OK to resume normal operation. 
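 * Restarts the schedulers and restores MP1 and audio state for every device
 * that was halted for the reset.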
6898 */ 6899 void amdgpu_pci_resume(struct pci_dev *pdev) 6900 { 6901 struct drm_device *dev = pci_get_drvdata(pdev); 6902 struct amdgpu_device *adev = drm_to_adev(dev); 6903 struct list_head device_list; 6904 struct amdgpu_hive_info *hive = NULL; 6905 struct amdgpu_device *tmp_adev = NULL; 6906 6907 dev_info(adev->dev, "PCI error: resume callback!!\n"); 6908 6909 /* Only continue execution for the case of pci_channel_io_frozen */ 6910 if (adev->pci_channel_state != pci_channel_io_frozen) 6911 return; 6912 6913 INIT_LIST_HEAD(&device_list); 6914 6915 hive = amdgpu_get_xgmi_hive(adev); 6916 if (hive) { 6917 mutex_lock(&hive->hive_lock); 6918 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 6919 tmp_adev->pcie_reset_ctx.in_link_reset = false; 6920 list_add_tail(&tmp_adev->reset_list, &device_list); 6921 } 6922 } else 6923 list_add_tail(&adev->reset_list, &device_list); 6924 6925 amdgpu_device_sched_resume(&device_list, NULL, NULL); 6926 amdgpu_device_gpu_resume(adev, &device_list, false); 6927 adev->pcie_reset_ctx.occurs_dpc = false; 6928 6929 if (hive) { 6930 mutex_unlock(&hive->hive_lock); 6931 amdgpu_put_xgmi_hive(hive); 6932 } 6933 } 6934 6935 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 6936 { 6937 struct drm_device *dev = pci_get_drvdata(pdev); 6938 struct amdgpu_device *adev = drm_to_adev(dev); 6939 int r; 6940 6941 if (amdgpu_sriov_vf(adev)) 6942 return false; 6943 6944 r = pci_save_state(pdev); 6945 if (!r) { 6946 kfree(adev->pci_state); 6947 6948 adev->pci_state = pci_store_saved_state(pdev); 6949 6950 if (!adev->pci_state) { 6951 DRM_ERROR("Failed to store PCI saved state"); 6952 return false; 6953 } 6954 } else { 6955 DRM_WARN("Failed to save PCI state, err:%d\n", r); 6956 return false; 6957 } 6958 6959 return true; 6960 } 6961 6962 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 6963 { 6964 struct drm_device *dev = pci_get_drvdata(pdev); 6965 struct amdgpu_device *adev = drm_to_adev(dev); 6966 int r; 6967 6968 if (!adev->pci_state) 6969 return false; 6970 6971 r = pci_load_saved_state(pdev, adev->pci_state); 6972 6973 if (!r) { 6974 pci_restore_state(pdev); 6975 } else { 6976 DRM_WARN("Failed to load PCI state, err:%d\n", r); 6977 return false; 6978 } 6979 6980 return true; 6981 } 6982 6983 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 6984 struct amdgpu_ring *ring) 6985 { 6986 #ifdef CONFIG_X86_64 6987 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 6988 return; 6989 #endif 6990 if (adev->gmc.xgmi.connected_to_cpu) 6991 return; 6992 6993 if (ring && ring->funcs->emit_hdp_flush) 6994 amdgpu_ring_emit_hdp_flush(ring); 6995 else 6996 amdgpu_asic_flush_hdp(adev, ring); 6997 } 6998 6999 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 7000 struct amdgpu_ring *ring) 7001 { 7002 #ifdef CONFIG_X86_64 7003 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 7004 return; 7005 #endif 7006 if (adev->gmc.xgmi.connected_to_cpu) 7007 return; 7008 7009 amdgpu_asic_invalidate_hdp(adev, ring); 7010 } 7011 7012 int amdgpu_in_reset(struct amdgpu_device *adev) 7013 { 7014 return atomic_read(&adev->reset_domain->in_gpu_reset); 7015 } 7016 7017 /** 7018 * amdgpu_device_halt() - bring hardware to some kind of halt state 7019 * 7020 * @adev: amdgpu_device pointer 7021 * 7022 * Bring hardware to some kind of halt state so that no one can touch it 7023 * any more. It will help to maintain error context when error occurred. 7024 * Compare to a simple hang, the system will keep stable at least for SSH 7025 * access. 
 * Then it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes device inaccessible to user space (IOCTLs, etc),
 * clears all CPU mappings to device, disallows remappings through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 * flush any in flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	amdgpu_xcp_dev_unplug(adev);
	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				 u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				  u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_gang - return a reference to the current gang
 * @adev: amdgpu_device pointer
 *
 * Returns: A new reference to the current gang leader.
 */
struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev)
{
	struct dma_fence *fence;

	rcu_read_lock();
	fence = dma_fence_get_rcu_safe(&adev->gang_submit);
	rcu_read_unlock();
	return fence;
}

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	dma_fence_get(gang);
	do {
		dma_fence_put(old);
		old = amdgpu_device_get_gang(adev);
		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old)) {
			dma_fence_put(gang);
			return old;
		}

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	/*
	 * Drop it once for the exchanged reference in adev and once for the
	 * thread local reference acquired in amdgpu_device_get_gang().
	 */
	dma_fence_put(old);
	dma_fence_put(old);
	return NULL;
}

/**
 * amdgpu_device_enforce_isolation - enforce HW isolation
 * @adev: the amdgpu device pointer
 * @ring: the HW ring the job is supposed to run on
 * @job: the job which is about to be pushed to the HW ring
 *
 * Makes sure that only one client at a time can use the GFX block.
 * Returns: The dependency to wait on before the job can be pushed to the HW.
 * The function is called multiple times until NULL is returned.
 */
struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev,
						  struct amdgpu_ring *ring,
						  struct amdgpu_job *job)
{
	struct amdgpu_isolation *isolation = &adev->isolation[ring->xcp_id];
	struct drm_sched_fence *f = job->base.s_fence;
	struct dma_fence *dep;
	void *owner;
	int r;

	/*
	 * For now enforce isolation only for the GFX block (gfx and compute
	 * rings) since we only need the cleaner shader on those rings.
	 */
	if (ring->funcs->type != AMDGPU_RING_TYPE_GFX &&
	    ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE)
		return NULL;

	/*
	 * All submissions where enforce isolation is false are handled as if
	 * they come from a single client. Use ~0l as the owner to distinguish
	 * it from kernel submissions where the owner is NULL.
	 */
	owner = job->enforce_isolation ? f->owner : (void *)~0l;

	mutex_lock(&adev->enforce_isolation_mutex);

	/*
	 * The "spearhead" submission is the first one which changes the
	 * ownership to its client. We always need to wait for it to be
	 * pushed to the HW before proceeding with anything.
	 */
	if (&f->scheduled != isolation->spearhead &&
	    !dma_fence_is_signaled(isolation->spearhead)) {
		dep = isolation->spearhead;
		goto out_grab_ref;
	}

	if (isolation->owner != owner) {

		/*
		 * Wait for any gang to be assembled before switching to a
		 * different owner or otherwise we could deadlock the
		 * submissions.
		 */
		if (!job->gang_submit) {
			dep = amdgpu_device_get_gang(adev);
			if (!dma_fence_is_signaled(dep))
				goto out_return_dep;
			dma_fence_put(dep);
		}

		dma_fence_put(isolation->spearhead);
		isolation->spearhead = dma_fence_get(&f->scheduled);
		amdgpu_sync_move(&isolation->active, &isolation->prev);
		trace_amdgpu_isolation(isolation->owner, owner);
		isolation->owner = owner;
	}

	/*
	 * Specifying the ring here helps to pipeline submissions even when
	 * isolation is enabled. If that is not desired for testing NULL can be
	 * used instead of the ring to enforce a CPU round trip while switching
	 * between clients.
	 */
	dep = amdgpu_sync_peek_fence(&isolation->prev, ring);
	r = amdgpu_sync_fence(&isolation->active, &f->finished, GFP_NOWAIT);
	if (r)
		DRM_WARN("OOM tracking isolation\n");

out_grab_ref:
	dma_fence_get(dep);
out_return_dep:
	mutex_unlock(&adev->enforce_isolation_mutex);
	return dep;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
		uint32_t inst, uint32_t reg_addr, char reg_name[],
		uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				 inst, reg_name, (uint32_t)expected_value,
				 (uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}

ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
{
	ssize_t size = 0;

	if (!ring || !ring->adev)
		return size;

	if (amdgpu_device_should_recover_gpu(ring->adev))
		size |= AMDGPU_RESET_TYPE_FULL;

	if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
	    !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery)
		size |= AMDGPU_RESET_TYPE_SOFT_RESET;

	return size;
}

ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
{
	ssize_t size = 0;

	if (supported_reset == 0) {
		size += sysfs_emit_at(buf, size, "unsupported");
		size += sysfs_emit_at(buf, size, "\n");
		return size;
	}

	if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
		size += sysfs_emit_at(buf, size, "soft ");

	if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
		size += sysfs_emit_at(buf, size, "queue ");

	if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)
		size += sysfs_emit_at(buf, size, "pipe ");

	if (supported_reset & AMDGPU_RESET_TYPE_FULL)
		size += sysfs_emit_at(buf, size, "full ");

	size += sysfs_emit_at(buf, size, "\n");
	return size;
}