1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 29 #include <linux/aperture.h> 30 #include <linux/power_supply.h> 31 #include <linux/kthread.h> 32 #include <linux/module.h> 33 #include <linux/console.h> 34 #include <linux/slab.h> 35 #include <linux/iommu.h> 36 #include <linux/pci.h> 37 #include <linux/pci-p2pdma.h> 38 #include <linux/apple-gmux.h> 39 40 #include <drm/drm_atomic_helper.h> 41 #include <drm/drm_client_event.h> 42 #include <drm/drm_crtc_helper.h> 43 #include <drm/drm_probe_helper.h> 44 #include <drm/amdgpu_drm.h> 45 #include <linux/device.h> 46 #include <linux/vgaarb.h> 47 #include <linux/vga_switcheroo.h> 48 #include <linux/efi.h> 49 #include "amdgpu.h" 50 #include "amdgpu_trace.h" 51 #include "amdgpu_i2c.h" 52 #include "atom.h" 53 #include "amdgpu_atombios.h" 54 #include "amdgpu_atomfirmware.h" 55 #include "amd_pcie.h" 56 #ifdef CONFIG_DRM_AMDGPU_SI 57 #include "si.h" 58 #endif 59 #ifdef CONFIG_DRM_AMDGPU_CIK 60 #include "cik.h" 61 #endif 62 #include "vi.h" 63 #include "soc15.h" 64 #include "nv.h" 65 #include "bif/bif_4_1_d.h" 66 #include <linux/firmware.h> 67 #include "amdgpu_vf_error.h" 68 69 #include "amdgpu_amdkfd.h" 70 #include "amdgpu_pm.h" 71 72 #include "amdgpu_xgmi.h" 73 #include "amdgpu_ras.h" 74 #include "amdgpu_pmu.h" 75 #include "amdgpu_fru_eeprom.h" 76 #include "amdgpu_reset.h" 77 #include "amdgpu_virt.h" 78 #include "amdgpu_dev_coredump.h" 79 80 #include <linux/suspend.h> 81 #include <drm/task_barrier.h> 82 #include <linux/pm_runtime.h> 83 84 #include <drm/drm_drv.h> 85 86 #if IS_ENABLED(CONFIG_X86) 87 #include <asm/intel-family.h> 88 #include <asm/cpu_device_id.h> 89 #endif 90 91 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 92 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 93 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 94 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 95 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 96 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 97 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 98 99 #define AMDGPU_RESUME_MS 2000 100 #define AMDGPU_MAX_RETRY_LIMIT 2 101 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 102 #define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2) 103 #define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2) 104 #define 
AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2) 105 106 #define AMDGPU_VBIOS_SKIP (1U << 0) 107 #define AMDGPU_VBIOS_OPTIONAL (1U << 1) 108 109 static const struct drm_driver amdgpu_kms_driver; 110 111 const char *amdgpu_asic_name[] = { 112 "TAHITI", 113 "PITCAIRN", 114 "VERDE", 115 "OLAND", 116 "HAINAN", 117 "BONAIRE", 118 "KAVERI", 119 "KABINI", 120 "HAWAII", 121 "MULLINS", 122 "TOPAZ", 123 "TONGA", 124 "FIJI", 125 "CARRIZO", 126 "STONEY", 127 "POLARIS10", 128 "POLARIS11", 129 "POLARIS12", 130 "VEGAM", 131 "VEGA10", 132 "VEGA12", 133 "VEGA20", 134 "RAVEN", 135 "ARCTURUS", 136 "RENOIR", 137 "ALDEBARAN", 138 "NAVI10", 139 "CYAN_SKILLFISH", 140 "NAVI14", 141 "NAVI12", 142 "SIENNA_CICHLID", 143 "NAVY_FLOUNDER", 144 "VANGOGH", 145 "DIMGREY_CAVEFISH", 146 "BEIGE_GOBY", 147 "YELLOW_CARP", 148 "IP DISCOVERY", 149 "LAST", 150 }; 151 152 #define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0) 153 /* 154 * Default init level where all blocks are expected to be initialized. This is 155 * the level of initialization expected by default and also after a full reset 156 * of the device. 157 */ 158 struct amdgpu_init_level amdgpu_init_default = { 159 .level = AMDGPU_INIT_LEVEL_DEFAULT, 160 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL, 161 }; 162 163 struct amdgpu_init_level amdgpu_init_recovery = { 164 .level = AMDGPU_INIT_LEVEL_RESET_RECOVERY, 165 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL, 166 }; 167 168 /* 169 * Minimal blocks needed to be initialized before a XGMI hive can be reset. This 170 * is used for cases like reset on initialization where the entire hive needs to 171 * be reset before first use. 172 */ 173 struct amdgpu_init_level amdgpu_init_minimal_xgmi = { 174 .level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI, 175 .hwini_ip_block_mask = 176 BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) | 177 BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) | 178 BIT(AMD_IP_BLOCK_TYPE_PSP) 179 }; 180 181 static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev, 182 enum amd_ip_block_type block) 183 { 184 return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0; 185 } 186 187 void amdgpu_set_init_level(struct amdgpu_device *adev, 188 enum amdgpu_init_lvl_id lvl) 189 { 190 switch (lvl) { 191 case AMDGPU_INIT_LEVEL_MINIMAL_XGMI: 192 adev->init_lvl = &amdgpu_init_minimal_xgmi; 193 break; 194 case AMDGPU_INIT_LEVEL_RESET_RECOVERY: 195 adev->init_lvl = &amdgpu_init_recovery; 196 break; 197 case AMDGPU_INIT_LEVEL_DEFAULT: 198 fallthrough; 199 default: 200 adev->init_lvl = &amdgpu_init_default; 201 break; 202 } 203 } 204 205 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev); 206 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 207 void *data); 208 209 /** 210 * DOC: pcie_replay_count 211 * 212 * The amdgpu driver provides a sysfs API for reporting the total number 213 * of PCIe replays (NAKs). 214 * The file pcie_replay_count is used for this and returns the total 215 * number of replays as a sum of the NAKs generated and NAKs received. 
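 *
 * A minimal userspace sketch for reading the count; the sysfs path below is
 * an assumption based on the attribute being attached to the PCI device and
 * may differ per system:
 *
 *   unsigned long long replays = 0;
 *   FILE *f = fopen("/sys/class/drm/card0/device/pcie_replay_count", "r");
 *
 *   if (f) {
 *           if (fscanf(f, "%llu", &replays) != 1)
 *                   replays = 0;
 *           fclose(f);
 *   }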
216 */ 217 218 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 219 struct device_attribute *attr, char *buf) 220 { 221 struct drm_device *ddev = dev_get_drvdata(dev); 222 struct amdgpu_device *adev = drm_to_adev(ddev); 223 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 224 225 return sysfs_emit(buf, "%llu\n", cnt); 226 } 227 228 static DEVICE_ATTR(pcie_replay_count, 0444, 229 amdgpu_device_get_pcie_replay_count, NULL); 230 231 static int amdgpu_device_attr_sysfs_init(struct amdgpu_device *adev) 232 { 233 int ret = 0; 234 235 if (!amdgpu_sriov_vf(adev)) 236 ret = sysfs_create_file(&adev->dev->kobj, 237 &dev_attr_pcie_replay_count.attr); 238 239 return ret; 240 } 241 242 static void amdgpu_device_attr_sysfs_fini(struct amdgpu_device *adev) 243 { 244 if (!amdgpu_sriov_vf(adev)) 245 sysfs_remove_file(&adev->dev->kobj, 246 &dev_attr_pcie_replay_count.attr); 247 } 248 249 static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj, 250 struct bin_attribute *attr, char *buf, 251 loff_t ppos, size_t count) 252 { 253 struct device *dev = kobj_to_dev(kobj); 254 struct drm_device *ddev = dev_get_drvdata(dev); 255 struct amdgpu_device *adev = drm_to_adev(ddev); 256 ssize_t bytes_read; 257 258 switch (ppos) { 259 case AMDGPU_SYS_REG_STATE_XGMI: 260 bytes_read = amdgpu_asic_get_reg_state( 261 adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count); 262 break; 263 case AMDGPU_SYS_REG_STATE_WAFL: 264 bytes_read = amdgpu_asic_get_reg_state( 265 adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count); 266 break; 267 case AMDGPU_SYS_REG_STATE_PCIE: 268 bytes_read = amdgpu_asic_get_reg_state( 269 adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count); 270 break; 271 case AMDGPU_SYS_REG_STATE_USR: 272 bytes_read = amdgpu_asic_get_reg_state( 273 adev, AMDGPU_REG_STATE_TYPE_USR, buf, count); 274 break; 275 case AMDGPU_SYS_REG_STATE_USR_1: 276 bytes_read = amdgpu_asic_get_reg_state( 277 adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count); 278 break; 279 default: 280 return -EINVAL; 281 } 282 283 return bytes_read; 284 } 285 286 BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL, 287 AMDGPU_SYS_REG_STATE_END); 288 289 int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev) 290 { 291 int ret; 292 293 if (!amdgpu_asic_get_reg_state_supported(adev)) 294 return 0; 295 296 ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 297 298 return ret; 299 } 300 301 void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev) 302 { 303 if (!amdgpu_asic_get_reg_state_supported(adev)) 304 return; 305 sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 306 } 307 308 int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block) 309 { 310 int r; 311 312 if (ip_block->version->funcs->suspend) { 313 r = ip_block->version->funcs->suspend(ip_block); 314 if (r) { 315 dev_err(ip_block->adev->dev, 316 "suspend of IP block <%s> failed %d\n", 317 ip_block->version->funcs->name, r); 318 return r; 319 } 320 } 321 322 ip_block->status.hw = false; 323 return 0; 324 } 325 326 int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block) 327 { 328 int r; 329 330 if (ip_block->version->funcs->resume) { 331 r = ip_block->version->funcs->resume(ip_block); 332 if (r) { 333 dev_err(ip_block->adev->dev, 334 "resume of IP block <%s> failed %d\n", 335 ip_block->version->funcs->name, r); 336 return r; 337 } 338 } 339 340 ip_block->status.hw = true; 341 return 0; 342 } 343 344 /** 345 * DOC: board_info 346 * 347 * The amdgpu driver provides a sysfs API for giving board related information. 
348 * It provides the form factor information in the format 349 * 350 * type : form factor 351 * 352 * Possible form factor values 353 * 354 * - "cem" - PCIE CEM card 355 * - "oam" - Open Compute Accelerator Module 356 * - "unknown" - Not known 357 * 358 */ 359 360 static ssize_t amdgpu_device_get_board_info(struct device *dev, 361 struct device_attribute *attr, 362 char *buf) 363 { 364 struct drm_device *ddev = dev_get_drvdata(dev); 365 struct amdgpu_device *adev = drm_to_adev(ddev); 366 enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM; 367 const char *pkg; 368 369 if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type) 370 pkg_type = adev->smuio.funcs->get_pkg_type(adev); 371 372 switch (pkg_type) { 373 case AMDGPU_PKG_TYPE_CEM: 374 pkg = "cem"; 375 break; 376 case AMDGPU_PKG_TYPE_OAM: 377 pkg = "oam"; 378 break; 379 default: 380 pkg = "unknown"; 381 break; 382 } 383 384 return sysfs_emit(buf, "%s : %s\n", "type", pkg); 385 } 386 387 static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL); 388 389 static struct attribute *amdgpu_board_attrs[] = { 390 &dev_attr_board_info.attr, 391 NULL, 392 }; 393 394 static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj, 395 struct attribute *attr, int n) 396 { 397 struct device *dev = kobj_to_dev(kobj); 398 struct drm_device *ddev = dev_get_drvdata(dev); 399 struct amdgpu_device *adev = drm_to_adev(ddev); 400 401 if (adev->flags & AMD_IS_APU) 402 return 0; 403 404 return attr->mode; 405 } 406 407 static const struct attribute_group amdgpu_board_attrs_group = { 408 .attrs = amdgpu_board_attrs, 409 .is_visible = amdgpu_board_attrs_is_visible 410 }; 411 412 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 413 414 415 /** 416 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 417 * 418 * @dev: drm_device pointer 419 * 420 * Returns true if the device is a dGPU with ATPX power control, 421 * otherwise return false. 422 */ 423 bool amdgpu_device_supports_px(struct drm_device *dev) 424 { 425 struct amdgpu_device *adev = drm_to_adev(dev); 426 427 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 428 return true; 429 return false; 430 } 431 432 /** 433 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 434 * 435 * @dev: drm_device pointer 436 * 437 * Returns true if the device is a dGPU with ACPI power control, 438 * otherwise return false. 439 */ 440 bool amdgpu_device_supports_boco(struct drm_device *dev) 441 { 442 struct amdgpu_device *adev = drm_to_adev(dev); 443 444 if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE)) 445 return false; 446 447 if (adev->has_pr3 || 448 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 449 return true; 450 return false; 451 } 452 453 /** 454 * amdgpu_device_supports_baco - Does the device support BACO 455 * 456 * @dev: drm_device pointer 457 * 458 * Return: 459 * 1 if the device supports BACO; 460 * 3 if the device supports MACO (only works if BACO is supported) 461 * otherwise return 0. 
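 *
 * Callers treat the return value as a bit mask; a sketch mirroring what
 * amdgpu_device_detect_runtime_pm_mode() below does:
 *
 *   int support = amdgpu_device_supports_baco(dev);
 *
 *   if (support & MACO_SUPPORT)
 *           rpm_mode = AMDGPU_RUNPM_BAMACO;
 *   else if (support == BACO_SUPPORT)
 *           rpm_mode = AMDGPU_RUNPM_BACO;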
 */
int amdgpu_device_supports_baco(struct drm_device *dev)
{
        struct amdgpu_device *adev = drm_to_adev(dev);

        return amdgpu_asic_supports_baco(adev);
}

void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
{
        struct drm_device *dev;
        int bamaco_support;

        dev = adev_to_drm(adev);

        adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
        bamaco_support = amdgpu_device_supports_baco(dev);

        switch (amdgpu_runtime_pm) {
        case 2:
                if (bamaco_support & MACO_SUPPORT) {
                        adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
                        dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
                } else if (bamaco_support == BACO_SUPPORT) {
                        adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
                        dev_info(adev->dev, "Requested mode BAMACO not available, falling back to BACO\n");
                }
                break;
        case 1:
                if (bamaco_support & BACO_SUPPORT) {
                        adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
                        dev_info(adev->dev, "Forcing BACO for runtime pm\n");
                }
                break;
        case -1:
        case -2:
                if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
                        adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
                        dev_info(adev->dev, "Using ATPX for runtime pm\n");
                } else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
                        adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
                        dev_info(adev->dev, "Using BOCO for runtime pm\n");
                } else {
                        if (!bamaco_support)
                                goto no_runtime_pm;

                        switch (adev->asic_type) {
                        case CHIP_VEGA20:
                        case CHIP_ARCTURUS:
                                /* BACO is not supported on vega20 and arcturus */
                                break;
                        case CHIP_VEGA10:
                                /* enable BACO as runpm mode if noretry=0 */
                                if (!adev->gmc.noretry)
                                        adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
                                break;
                        default:
                                /* enable BACO as runpm mode on CI+ */
                                adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
                                break;
                        }

                        if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
                                if (bamaco_support & MACO_SUPPORT) {
                                        adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
                                        dev_info(adev->dev, "Using BAMACO for runtime pm\n");
                                } else {
                                        dev_info(adev->dev, "Using BACO for runtime pm\n");
                                }
                        }
                }
                break;
        case 0:
                dev_info(adev->dev, "runtime pm is manually disabled\n");
                break;
        default:
                break;
        }

no_runtime_pm:
        if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
                dev_info(adev->dev, "Runtime PM not available\n");
}

/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * Smart Shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
553 */ 554 bool amdgpu_device_supports_smart_shift(struct drm_device *dev) 555 { 556 return (amdgpu_device_supports_boco(dev) && 557 amdgpu_acpi_is_power_shift_control_supported()); 558 } 559 560 /* 561 * VRAM access helper functions 562 */ 563 564 /** 565 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA 566 * 567 * @adev: amdgpu_device pointer 568 * @pos: offset of the buffer in vram 569 * @buf: virtual address of the buffer in system memory 570 * @size: read/write size, sizeof(@buf) must > @size 571 * @write: true - write to vram, otherwise - read from vram 572 */ 573 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos, 574 void *buf, size_t size, bool write) 575 { 576 unsigned long flags; 577 uint32_t hi = ~0, tmp = 0; 578 uint32_t *data = buf; 579 uint64_t last; 580 int idx; 581 582 if (!drm_dev_enter(adev_to_drm(adev), &idx)) 583 return; 584 585 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4)); 586 587 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 588 for (last = pos + size; pos < last; pos += 4) { 589 tmp = pos >> 31; 590 591 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000); 592 if (tmp != hi) { 593 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp); 594 hi = tmp; 595 } 596 if (write) 597 WREG32_NO_KIQ(mmMM_DATA, *data++); 598 else 599 *data++ = RREG32_NO_KIQ(mmMM_DATA); 600 } 601 602 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 603 drm_dev_exit(idx); 604 } 605 606 /** 607 * amdgpu_device_aper_access - access vram by vram aperture 608 * 609 * @adev: amdgpu_device pointer 610 * @pos: offset of the buffer in vram 611 * @buf: virtual address of the buffer in system memory 612 * @size: read/write size, sizeof(@buf) must > @size 613 * @write: true - write to vram, otherwise - read from vram 614 * 615 * The return value means how many bytes have been transferred. 
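 *
 * Callers are expected to handle a short transfer; a sketch of the
 * fallback pattern used by amdgpu_device_vram_access() below:
 *
 *   size_t done = amdgpu_device_aper_access(adev, pos, buf, size, write);
 *
 *   if (done < size)
 *           amdgpu_device_mm_access(adev, pos + done, buf + done,
 *                                   size - done, write);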
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
                                 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
        void __iomem *addr;
        size_t count = 0;
        uint64_t last;

        if (!adev->mman.aper_base_kaddr)
                return 0;

        last = min(pos + size, adev->gmc.visible_vram_size);
        if (last > pos) {
                addr = adev->mman.aper_base_kaddr + pos;
                count = last - pos;

                if (write) {
                        memcpy_toio(addr, buf, count);
                        /* Make sure the HDP write cache flush happens without any
                         * reordering after the system memory contents are sent over
                         * PCIe to the device
                         */
                        mb();
                        amdgpu_device_flush_hdp(adev, NULL);
                } else {
                        amdgpu_device_invalidate_hdp(adev, NULL);
                        /* Make sure the HDP read cache is invalidated before issuing
                         * a read to the PCIe device
                         */
                        mb();
                        memcpy_fromio(buf, addr, count);
                }

        }

        return count;
#else
        return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size in bytes; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
                               void *buf, size_t size, bool write)
{
        size_t count;

        /* try using the VRAM aperture to access VRAM first */
        count = amdgpu_device_aper_access(adev, pos, buf, size, write);
        size -= count;
        if (size) {
                /* use MM_INDEX/MM_DATA to access the rest of VRAM */
                pos += count;
                buf += count;
                amdgpu_device_mm_access(adev, pos, buf, size, write);
        }
}

/*
 * Register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
        if (adev->no_hw_access)
                return true;

#ifdef CONFIG_LOCKDEP
        /*
         * This is a bit complicated to understand, so worth a comment. What we assert
         * here is that the GPU reset is not running on another thread in parallel.
         *
         * For this we trylock the read side of the reset semaphore; if that succeeds
         * we know that the reset is not running in parallel.
         *
         * If the trylock fails we assert that we are either already holding the read
         * side of the lock or are the reset thread itself and hold the write side of
         * the lock.
         */
        if (in_task()) {
                if (down_read_trylock(&adev->reset_domain->sem))
                        up_read(&adev->reset_domain->sem);
                else
                        lockdep_assert_held(&adev->reset_domain->sem);
        }
#endif
        return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
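 *
 * Illustrative usage sketch; @reg stands for a dword-aligned register
 * offset. Passing AMDGPU_REGS_NO_KIQ skips the KIQ path under SR-IOV:
 *
 *   u32 val = amdgpu_device_rreg(adev, reg, 0);
 *   u32 raw = amdgpu_device_rreg(adev, reg, AMDGPU_REGS_NO_KIQ);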
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
                            uint32_t reg, uint32_t acc_flags)
{
        uint32_t ret;

        if (amdgpu_device_skip_hw_access(adev))
                return 0;

        if ((reg * 4) < adev->rmmio_size) {
                if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
                    amdgpu_sriov_runtime(adev) &&
                    down_read_trylock(&adev->reset_domain->sem)) {
                        ret = amdgpu_kiq_rreg(adev, reg, 0);
                        up_read(&adev->reset_domain->sem);
                } else {
                        ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
                }
        } else {
                ret = adev->pcie_rreg(adev, reg * 4);
        }

        trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

        return ret;
}

/*
 * MMIO register read helper with byte granularity.
 * @offset: byte offset from the start of the MMIO space
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
        if (amdgpu_device_skip_hw_access(adev))
                return 0;

        if (offset < adev->rmmio_size)
                return (readb(adev->rmmio + offset));
        BUG();
}

/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
                                uint32_t reg, uint32_t acc_flags,
                                uint32_t xcc_id)
{
        uint32_t ret, rlcg_flag;

        if (amdgpu_device_skip_hw_access(adev))
                return 0;

        if ((reg * 4) < adev->rmmio_size) {
                if (amdgpu_sriov_vf(adev) &&
                    !amdgpu_sriov_runtime(adev) &&
                    adev->gfx.rlc.rlcg_reg_access_supported &&
                    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
                                                         GC_HWIP, false,
                                                         &rlcg_flag)) {
                        ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
                } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
                           amdgpu_sriov_runtime(adev) &&
                           down_read_trylock(&adev->reset_domain->sem)) {
                        ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
                        up_read(&adev->reset_domain->sem);
                } else {
                        ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
                }
        } else {
                ret = adev->pcie_rreg(adev, reg * 4);
        }

        return ret;
}

/*
 * MMIO register write helper with byte granularity.
 * @offset: byte offset from the start of the MMIO space
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
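 *
 * A read-modify-write sketch for a byte-wide register; byte_offset and
 * FIELD_MASK below are placeholders, not names from this driver:
 *
 *   u8 tmp = amdgpu_mm_rreg8(adev, byte_offset);
 *
 *   tmp &= ~FIELD_MASK;
 *   tmp |= value & FIELD_MASK;
 *   amdgpu_mm_wreg8(adev, byte_offset, tmp);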
829 */ 830 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) 831 { 832 if (amdgpu_device_skip_hw_access(adev)) 833 return; 834 835 if (offset < adev->rmmio_size) 836 writeb(value, adev->rmmio + offset); 837 else 838 BUG(); 839 } 840 841 /** 842 * amdgpu_device_wreg - write to a memory mapped IO or indirect register 843 * 844 * @adev: amdgpu_device pointer 845 * @reg: dword aligned register offset 846 * @v: 32 bit value to write to the register 847 * @acc_flags: access flags which require special behavior 848 * 849 * Writes the value specified to the offset specified. 850 */ 851 void amdgpu_device_wreg(struct amdgpu_device *adev, 852 uint32_t reg, uint32_t v, 853 uint32_t acc_flags) 854 { 855 if (amdgpu_device_skip_hw_access(adev)) 856 return; 857 858 if ((reg * 4) < adev->rmmio_size) { 859 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 860 amdgpu_sriov_runtime(adev) && 861 down_read_trylock(&adev->reset_domain->sem)) { 862 amdgpu_kiq_wreg(adev, reg, v, 0); 863 up_read(&adev->reset_domain->sem); 864 } else { 865 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 866 } 867 } else { 868 adev->pcie_wreg(adev, reg * 4, v); 869 } 870 871 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 872 } 873 874 /** 875 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 876 * 877 * @adev: amdgpu_device pointer 878 * @reg: mmio/rlc register 879 * @v: value to write 880 * @xcc_id: xcc accelerated compute core id 881 * 882 * this function is invoked only for the debugfs register access 883 */ 884 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 885 uint32_t reg, uint32_t v, 886 uint32_t xcc_id) 887 { 888 if (amdgpu_device_skip_hw_access(adev)) 889 return; 890 891 if (amdgpu_sriov_fullaccess(adev) && 892 adev->gfx.rlc.funcs && 893 adev->gfx.rlc.funcs->is_rlcg_access_range) { 894 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 895 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id); 896 } else if ((reg * 4) >= adev->rmmio_size) { 897 adev->pcie_wreg(adev, reg * 4, v); 898 } else { 899 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 900 } 901 } 902 903 /** 904 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC 905 * 906 * @adev: amdgpu_device pointer 907 * @reg: dword aligned register offset 908 * @v: 32 bit value to write to the register 909 * @acc_flags: access flags which require special behavior 910 * @xcc_id: xcc accelerated compute core id 911 * 912 * Writes the value specified to the offset specified. 
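 *
 * Sketch of a per-XCC broadcast write; num_xcc is a placeholder for however
 * the caller determines the number of accelerated compute cores:
 *
 *   int i;
 *
 *   for (i = 0; i < num_xcc; i++)
 *           amdgpu_device_xcc_wreg(adev, reg, v, 0, i);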
913 */ 914 void amdgpu_device_xcc_wreg(struct amdgpu_device *adev, 915 uint32_t reg, uint32_t v, 916 uint32_t acc_flags, uint32_t xcc_id) 917 { 918 uint32_t rlcg_flag; 919 920 if (amdgpu_device_skip_hw_access(adev)) 921 return; 922 923 if ((reg * 4) < adev->rmmio_size) { 924 if (amdgpu_sriov_vf(adev) && 925 !amdgpu_sriov_runtime(adev) && 926 adev->gfx.rlc.rlcg_reg_access_supported && 927 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, 928 GC_HWIP, true, 929 &rlcg_flag)) { 930 amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id)); 931 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 932 amdgpu_sriov_runtime(adev) && 933 down_read_trylock(&adev->reset_domain->sem)) { 934 amdgpu_kiq_wreg(adev, reg, v, xcc_id); 935 up_read(&adev->reset_domain->sem); 936 } else { 937 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 938 } 939 } else { 940 adev->pcie_wreg(adev, reg * 4, v); 941 } 942 } 943 944 /** 945 * amdgpu_device_indirect_rreg - read an indirect register 946 * 947 * @adev: amdgpu_device pointer 948 * @reg_addr: indirect register address to read from 949 * 950 * Returns the value of indirect register @reg_addr 951 */ 952 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 953 u32 reg_addr) 954 { 955 unsigned long flags, pcie_index, pcie_data; 956 void __iomem *pcie_index_offset; 957 void __iomem *pcie_data_offset; 958 u32 r; 959 960 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 961 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 962 963 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 964 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 965 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 966 967 writel(reg_addr, pcie_index_offset); 968 readl(pcie_index_offset); 969 r = readl(pcie_data_offset); 970 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 971 972 return r; 973 } 974 975 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev, 976 u64 reg_addr) 977 { 978 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 979 u32 r; 980 void __iomem *pcie_index_offset; 981 void __iomem *pcie_index_hi_offset; 982 void __iomem *pcie_data_offset; 983 984 if (unlikely(!adev->nbio.funcs)) { 985 pcie_index = AMDGPU_PCIE_INDEX_FALLBACK; 986 pcie_data = AMDGPU_PCIE_DATA_FALLBACK; 987 } else { 988 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 989 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 990 } 991 992 if (reg_addr >> 32) { 993 if (unlikely(!adev->nbio.funcs)) 994 pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK; 995 else 996 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 997 } else { 998 pcie_index_hi = 0; 999 } 1000 1001 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1002 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1003 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1004 if (pcie_index_hi != 0) 1005 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1006 pcie_index_hi * 4; 1007 1008 writel(reg_addr, pcie_index_offset); 1009 readl(pcie_index_offset); 1010 if (pcie_index_hi != 0) { 1011 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1012 readl(pcie_index_hi_offset); 1013 } 1014 r = readl(pcie_data_offset); 1015 1016 /* clear the high bits */ 1017 if (pcie_index_hi != 0) { 1018 writel(0, pcie_index_hi_offset); 1019 readl(pcie_index_hi_offset); 1020 } 1021 1022 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1023 1024 return r; 1025 } 1026 1027 /** 1028 * amdgpu_device_indirect_rreg64 - read a 64bits indirect 
register 1029 * 1030 * @adev: amdgpu_device pointer 1031 * @reg_addr: indirect register address to read from 1032 * 1033 * Returns the value of indirect register @reg_addr 1034 */ 1035 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 1036 u32 reg_addr) 1037 { 1038 unsigned long flags, pcie_index, pcie_data; 1039 void __iomem *pcie_index_offset; 1040 void __iomem *pcie_data_offset; 1041 u64 r; 1042 1043 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1044 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1045 1046 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1047 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1048 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1049 1050 /* read low 32 bits */ 1051 writel(reg_addr, pcie_index_offset); 1052 readl(pcie_index_offset); 1053 r = readl(pcie_data_offset); 1054 /* read high 32 bits */ 1055 writel(reg_addr + 4, pcie_index_offset); 1056 readl(pcie_index_offset); 1057 r |= ((u64)readl(pcie_data_offset) << 32); 1058 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1059 1060 return r; 1061 } 1062 1063 u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev, 1064 u64 reg_addr) 1065 { 1066 unsigned long flags, pcie_index, pcie_data; 1067 unsigned long pcie_index_hi = 0; 1068 void __iomem *pcie_index_offset; 1069 void __iomem *pcie_index_hi_offset; 1070 void __iomem *pcie_data_offset; 1071 u64 r; 1072 1073 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1074 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1075 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1076 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1077 1078 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1079 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1080 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1081 if (pcie_index_hi != 0) 1082 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1083 pcie_index_hi * 4; 1084 1085 /* read low 32 bits */ 1086 writel(reg_addr, pcie_index_offset); 1087 readl(pcie_index_offset); 1088 if (pcie_index_hi != 0) { 1089 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1090 readl(pcie_index_hi_offset); 1091 } 1092 r = readl(pcie_data_offset); 1093 /* read high 32 bits */ 1094 writel(reg_addr + 4, pcie_index_offset); 1095 readl(pcie_index_offset); 1096 if (pcie_index_hi != 0) { 1097 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1098 readl(pcie_index_hi_offset); 1099 } 1100 r |= ((u64)readl(pcie_data_offset) << 32); 1101 1102 /* clear the high bits */ 1103 if (pcie_index_hi != 0) { 1104 writel(0, pcie_index_hi_offset); 1105 readl(pcie_index_hi_offset); 1106 } 1107 1108 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1109 1110 return r; 1111 } 1112 1113 /** 1114 * amdgpu_device_indirect_wreg - write an indirect register address 1115 * 1116 * @adev: amdgpu_device pointer 1117 * @reg_addr: indirect register offset 1118 * @reg_data: indirect register data 1119 * 1120 */ 1121 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 1122 u32 reg_addr, u32 reg_data) 1123 { 1124 unsigned long flags, pcie_index, pcie_data; 1125 void __iomem *pcie_index_offset; 1126 void __iomem *pcie_data_offset; 1127 1128 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1129 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1130 1131 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1132 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1133 
pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1134 1135 writel(reg_addr, pcie_index_offset); 1136 readl(pcie_index_offset); 1137 writel(reg_data, pcie_data_offset); 1138 readl(pcie_data_offset); 1139 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1140 } 1141 1142 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev, 1143 u64 reg_addr, u32 reg_data) 1144 { 1145 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 1146 void __iomem *pcie_index_offset; 1147 void __iomem *pcie_index_hi_offset; 1148 void __iomem *pcie_data_offset; 1149 1150 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1151 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1152 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1153 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1154 else 1155 pcie_index_hi = 0; 1156 1157 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1158 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1159 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1160 if (pcie_index_hi != 0) 1161 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1162 pcie_index_hi * 4; 1163 1164 writel(reg_addr, pcie_index_offset); 1165 readl(pcie_index_offset); 1166 if (pcie_index_hi != 0) { 1167 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1168 readl(pcie_index_hi_offset); 1169 } 1170 writel(reg_data, pcie_data_offset); 1171 readl(pcie_data_offset); 1172 1173 /* clear the high bits */ 1174 if (pcie_index_hi != 0) { 1175 writel(0, pcie_index_hi_offset); 1176 readl(pcie_index_hi_offset); 1177 } 1178 1179 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1180 } 1181 1182 /** 1183 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 1184 * 1185 * @adev: amdgpu_device pointer 1186 * @reg_addr: indirect register offset 1187 * @reg_data: indirect register data 1188 * 1189 */ 1190 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 1191 u32 reg_addr, u64 reg_data) 1192 { 1193 unsigned long flags, pcie_index, pcie_data; 1194 void __iomem *pcie_index_offset; 1195 void __iomem *pcie_data_offset; 1196 1197 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1198 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1199 1200 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1201 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1202 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1203 1204 /* write low 32 bits */ 1205 writel(reg_addr, pcie_index_offset); 1206 readl(pcie_index_offset); 1207 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1208 readl(pcie_data_offset); 1209 /* write high 32 bits */ 1210 writel(reg_addr + 4, pcie_index_offset); 1211 readl(pcie_index_offset); 1212 writel((u32)(reg_data >> 32), pcie_data_offset); 1213 readl(pcie_data_offset); 1214 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1215 } 1216 1217 void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev, 1218 u64 reg_addr, u64 reg_data) 1219 { 1220 unsigned long flags, pcie_index, pcie_data; 1221 unsigned long pcie_index_hi = 0; 1222 void __iomem *pcie_index_offset; 1223 void __iomem *pcie_index_hi_offset; 1224 void __iomem *pcie_data_offset; 1225 1226 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1227 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1228 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1229 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1230 
1231 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1232 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1233 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1234 if (pcie_index_hi != 0) 1235 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1236 pcie_index_hi * 4; 1237 1238 /* write low 32 bits */ 1239 writel(reg_addr, pcie_index_offset); 1240 readl(pcie_index_offset); 1241 if (pcie_index_hi != 0) { 1242 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1243 readl(pcie_index_hi_offset); 1244 } 1245 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1246 readl(pcie_data_offset); 1247 /* write high 32 bits */ 1248 writel(reg_addr + 4, pcie_index_offset); 1249 readl(pcie_index_offset); 1250 if (pcie_index_hi != 0) { 1251 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1252 readl(pcie_index_hi_offset); 1253 } 1254 writel((u32)(reg_data >> 32), pcie_data_offset); 1255 readl(pcie_data_offset); 1256 1257 /* clear the high bits */ 1258 if (pcie_index_hi != 0) { 1259 writel(0, pcie_index_hi_offset); 1260 readl(pcie_index_hi_offset); 1261 } 1262 1263 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1264 } 1265 1266 /** 1267 * amdgpu_device_get_rev_id - query device rev_id 1268 * 1269 * @adev: amdgpu_device pointer 1270 * 1271 * Return device rev_id 1272 */ 1273 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev) 1274 { 1275 return adev->nbio.funcs->get_rev_id(adev); 1276 } 1277 1278 /** 1279 * amdgpu_invalid_rreg - dummy reg read function 1280 * 1281 * @adev: amdgpu_device pointer 1282 * @reg: offset of register 1283 * 1284 * Dummy register read function. Used for register blocks 1285 * that certain asics don't have (all asics). 1286 * Returns the value in the register. 1287 */ 1288 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 1289 { 1290 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 1291 BUG(); 1292 return 0; 1293 } 1294 1295 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg) 1296 { 1297 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 1298 BUG(); 1299 return 0; 1300 } 1301 1302 /** 1303 * amdgpu_invalid_wreg - dummy reg write function 1304 * 1305 * @adev: amdgpu_device pointer 1306 * @reg: offset of register 1307 * @v: value to write to the register 1308 * 1309 * Dummy register read function. Used for register blocks 1310 * that certain asics don't have (all asics). 1311 */ 1312 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 1313 { 1314 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 1315 reg, v); 1316 BUG(); 1317 } 1318 1319 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v) 1320 { 1321 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n", 1322 reg, v); 1323 BUG(); 1324 } 1325 1326 /** 1327 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 1328 * 1329 * @adev: amdgpu_device pointer 1330 * @reg: offset of register 1331 * 1332 * Dummy register read function. Used for register blocks 1333 * that certain asics don't have (all asics). 1334 * Returns the value in the register. 
1335 */ 1336 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 1337 { 1338 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 1339 BUG(); 1340 return 0; 1341 } 1342 1343 static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg) 1344 { 1345 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 1346 BUG(); 1347 return 0; 1348 } 1349 1350 /** 1351 * amdgpu_invalid_wreg64 - dummy reg write function 1352 * 1353 * @adev: amdgpu_device pointer 1354 * @reg: offset of register 1355 * @v: value to write to the register 1356 * 1357 * Dummy register read function. Used for register blocks 1358 * that certain asics don't have (all asics). 1359 */ 1360 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 1361 { 1362 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 1363 reg, v); 1364 BUG(); 1365 } 1366 1367 static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v) 1368 { 1369 DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n", 1370 reg, v); 1371 BUG(); 1372 } 1373 1374 /** 1375 * amdgpu_block_invalid_rreg - dummy reg read function 1376 * 1377 * @adev: amdgpu_device pointer 1378 * @block: offset of instance 1379 * @reg: offset of register 1380 * 1381 * Dummy register read function. Used for register blocks 1382 * that certain asics don't have (all asics). 1383 * Returns the value in the register. 1384 */ 1385 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 1386 uint32_t block, uint32_t reg) 1387 { 1388 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 1389 reg, block); 1390 BUG(); 1391 return 0; 1392 } 1393 1394 /** 1395 * amdgpu_block_invalid_wreg - dummy reg write function 1396 * 1397 * @adev: amdgpu_device pointer 1398 * @block: offset of instance 1399 * @reg: offset of register 1400 * @v: value to write to the register 1401 * 1402 * Dummy register read function. Used for register blocks 1403 * that certain asics don't have (all asics). 1404 */ 1405 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 1406 uint32_t block, 1407 uint32_t reg, uint32_t v) 1408 { 1409 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 1410 reg, block, v); 1411 BUG(); 1412 } 1413 1414 static uint32_t amdgpu_device_get_vbios_flags(struct amdgpu_device *adev) 1415 { 1416 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU)) 1417 return AMDGPU_VBIOS_SKIP; 1418 1419 if (hweight32(adev->aid_mask) && amdgpu_passthrough(adev)) 1420 return AMDGPU_VBIOS_OPTIONAL; 1421 1422 return 0; 1423 } 1424 1425 /** 1426 * amdgpu_device_asic_init - Wrapper for atom asic_init 1427 * 1428 * @adev: amdgpu_device pointer 1429 * 1430 * Does any asic specific work and then calls atom asic init. 
1431 */ 1432 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 1433 { 1434 uint32_t flags; 1435 bool optional; 1436 int ret; 1437 1438 amdgpu_asic_pre_asic_init(adev); 1439 flags = amdgpu_device_get_vbios_flags(adev); 1440 optional = !!(flags & (AMDGPU_VBIOS_OPTIONAL | AMDGPU_VBIOS_SKIP)); 1441 1442 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 1443 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 1444 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) || 1445 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) { 1446 amdgpu_psp_wait_for_bootloader(adev); 1447 if (optional && !adev->bios) 1448 return 0; 1449 1450 ret = amdgpu_atomfirmware_asic_init(adev, true); 1451 return ret; 1452 } else { 1453 if (optional && !adev->bios) 1454 return 0; 1455 1456 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 1457 } 1458 1459 return 0; 1460 } 1461 1462 /** 1463 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page 1464 * 1465 * @adev: amdgpu_device pointer 1466 * 1467 * Allocates a scratch page of VRAM for use by various things in the 1468 * driver. 1469 */ 1470 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev) 1471 { 1472 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE, 1473 AMDGPU_GEM_DOMAIN_VRAM | 1474 AMDGPU_GEM_DOMAIN_GTT, 1475 &adev->mem_scratch.robj, 1476 &adev->mem_scratch.gpu_addr, 1477 (void **)&adev->mem_scratch.ptr); 1478 } 1479 1480 /** 1481 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page 1482 * 1483 * @adev: amdgpu_device pointer 1484 * 1485 * Frees the VRAM scratch page. 1486 */ 1487 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev) 1488 { 1489 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL); 1490 } 1491 1492 /** 1493 * amdgpu_device_program_register_sequence - program an array of registers. 1494 * 1495 * @adev: amdgpu_device pointer 1496 * @registers: pointer to the register array 1497 * @array_size: size of the register array 1498 * 1499 * Programs an array or registers with and or masks. 1500 * This is a helper for setting golden registers. 1501 */ 1502 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 1503 const u32 *registers, 1504 const u32 array_size) 1505 { 1506 u32 tmp, reg, and_mask, or_mask; 1507 int i; 1508 1509 if (array_size % 3) 1510 return; 1511 1512 for (i = 0; i < array_size; i += 3) { 1513 reg = registers[i + 0]; 1514 and_mask = registers[i + 1]; 1515 or_mask = registers[i + 2]; 1516 1517 if (and_mask == 0xffffffff) { 1518 tmp = or_mask; 1519 } else { 1520 tmp = RREG32(reg); 1521 tmp &= ~and_mask; 1522 if (adev->family >= AMDGPU_FAMILY_AI) 1523 tmp |= (or_mask & and_mask); 1524 else 1525 tmp |= or_mask; 1526 } 1527 WREG32(reg, tmp); 1528 } 1529 } 1530 1531 /** 1532 * amdgpu_device_pci_config_reset - reset the GPU 1533 * 1534 * @adev: amdgpu_device pointer 1535 * 1536 * Resets the GPU using the pci config reset sequence. 1537 * Only applicable to asics prior to vega10. 1538 */ 1539 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 1540 { 1541 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 1542 } 1543 1544 /** 1545 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 1546 * 1547 * @adev: amdgpu_device pointer 1548 * 1549 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 
1550 */ 1551 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 1552 { 1553 return pci_reset_function(adev->pdev); 1554 } 1555 1556 /* 1557 * amdgpu_device_wb_*() 1558 * Writeback is the method by which the GPU updates special pages in memory 1559 * with the status of certain GPU events (fences, ring pointers,etc.). 1560 */ 1561 1562 /** 1563 * amdgpu_device_wb_fini - Disable Writeback and free memory 1564 * 1565 * @adev: amdgpu_device pointer 1566 * 1567 * Disables Writeback and frees the Writeback memory (all asics). 1568 * Used at driver shutdown. 1569 */ 1570 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1571 { 1572 if (adev->wb.wb_obj) { 1573 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1574 &adev->wb.gpu_addr, 1575 (void **)&adev->wb.wb); 1576 adev->wb.wb_obj = NULL; 1577 } 1578 } 1579 1580 /** 1581 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory 1582 * 1583 * @adev: amdgpu_device pointer 1584 * 1585 * Initializes writeback and allocates writeback memory (all asics). 1586 * Used at driver startup. 1587 * Returns 0 on success or an -error on failure. 1588 */ 1589 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1590 { 1591 int r; 1592 1593 if (adev->wb.wb_obj == NULL) { 1594 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1595 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1596 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1597 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1598 (void **)&adev->wb.wb); 1599 if (r) { 1600 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1601 return r; 1602 } 1603 1604 adev->wb.num_wb = AMDGPU_MAX_WB; 1605 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1606 1607 /* clear wb memory */ 1608 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1609 } 1610 1611 return 0; 1612 } 1613 1614 /** 1615 * amdgpu_device_wb_get - Allocate a wb entry 1616 * 1617 * @adev: amdgpu_device pointer 1618 * @wb: wb index 1619 * 1620 * Allocate a wb slot for use by the driver (all asics). 1621 * Returns 0 on success or -EINVAL on failure. 1622 */ 1623 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1624 { 1625 unsigned long flags, offset; 1626 1627 spin_lock_irqsave(&adev->wb.lock, flags); 1628 offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1629 if (offset < adev->wb.num_wb) { 1630 __set_bit(offset, adev->wb.used); 1631 spin_unlock_irqrestore(&adev->wb.lock, flags); 1632 *wb = offset << 3; /* convert to dw offset */ 1633 return 0; 1634 } else { 1635 spin_unlock_irqrestore(&adev->wb.lock, flags); 1636 return -EINVAL; 1637 } 1638 } 1639 1640 /** 1641 * amdgpu_device_wb_free - Free a wb entry 1642 * 1643 * @adev: amdgpu_device pointer 1644 * @wb: wb index 1645 * 1646 * Free a wb slot allocated for use by the driver (all asics) 1647 */ 1648 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1649 { 1650 unsigned long flags; 1651 1652 wb >>= 3; 1653 spin_lock_irqsave(&adev->wb.lock, flags); 1654 if (wb < adev->wb.num_wb) 1655 __clear_bit(wb, adev->wb.used); 1656 spin_unlock_irqrestore(&adev->wb.lock, flags); 1657 } 1658 1659 /** 1660 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1661 * 1662 * @adev: amdgpu_device pointer 1663 * 1664 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1665 * to fail, but if any of the BARs is not accessible after the size we abort 1666 * driver loading by returning -ENODEV. 
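 *
 * Caller sketch (error handling only; a simplified, not verbatim, call
 * site from the GMC init paths):
 *
 *   r = amdgpu_device_resize_fb_bar(adev);
 *   if (r)
 *           return r;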
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
        int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
        struct pci_bus *root;
        struct resource *res;
        unsigned int i;
        u16 cmd;
        int r;

        if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
                return 0;

        /* Bypass for VF */
        if (amdgpu_sriov_vf(adev))
                return 0;

        if (!amdgpu_rebar)
                return 0;

        /* resizing on Dell G5 SE platforms causes problems with runtime pm */
        if ((amdgpu_runtime_pm != 0) &&
            adev->pdev->vendor == PCI_VENDOR_ID_ATI &&
            adev->pdev->device == 0x731f &&
            adev->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL)
                return 0;

        /* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
        if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
                DRM_WARN("System can't access extended configuration space, please check!!\n");

        /* skip if the bios has already enabled large BAR */
        if (adev->gmc.real_vram_size &&
            (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
                return 0;

        /* Check if the root bus has 64-bit memory resources */
        root = adev->pdev->bus;
        while (root->parent)
                root = root->parent;

        pci_bus_for_each_resource(root, res, i) {
                if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
                    res->start > 0x100000000ull)
                        break;
        }

        /* Trying to resize is pointless without a root hub window above 4GB */
        if (!res)
                return 0;

        /* Limit the BAR size to what is available */
        rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
                        rbar_size);

        /* Disable memory decoding while we change the BAR addresses and size */
        pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
        pci_write_config_word(adev->pdev, PCI_COMMAND,
                              cmd & ~PCI_COMMAND_MEMORY);

        /* Free the VRAM and doorbell BAR, we most likely need to move both. */
        amdgpu_doorbell_fini(adev);
        if (adev->asic_type >= CHIP_BONAIRE)
                pci_release_resource(adev->pdev, 2);

        pci_release_resource(adev->pdev, 0);

        r = pci_resize_resource(adev->pdev, 0, rbar_size);
        if (r == -ENOSPC)
                DRM_INFO("Not enough PCI address space for a large BAR.");
        else if (r && r != -ENOTSUPP)
                DRM_ERROR("Problem resizing BAR0 (%d).", r);

        pci_assign_unassigned_bus_resources(adev->pdev->bus);

        /* When the doorbell or fb BAR isn't available we have no chance of
         * using the device.
         */
        r = amdgpu_doorbell_init(adev);
        if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
                return -ENODEV;

        pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

        return 0;
}

/*
 * GPU helper functions.
 */
/**
 * amdgpu_device_need_post - check if the hw needs to be posted or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup,
 * or if a post is needed because a hw reset was performed.
 * Returns true if a post is needed, false if not.
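 *
 * Typical init-time pairing, shown as a sketch rather than a verbatim
 * call site:
 *
 *   if (amdgpu_device_need_post(adev)) {
 *           r = amdgpu_device_asic_init(adev);
 *           if (r)
 *                   return r;
 *   }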
1765 */ 1766 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1767 { 1768 uint32_t reg, flags; 1769 1770 if (amdgpu_sriov_vf(adev)) 1771 return false; 1772 1773 flags = amdgpu_device_get_vbios_flags(adev); 1774 if (flags & AMDGPU_VBIOS_SKIP) 1775 return false; 1776 if ((flags & AMDGPU_VBIOS_OPTIONAL) && !adev->bios) 1777 return false; 1778 1779 if (amdgpu_passthrough(adev)) { 1780 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1781 * some old smc fw still need driver do vPost otherwise gpu hang, while 1782 * those smc fw version above 22.15 doesn't have this flaw, so we force 1783 * vpost executed for smc version below 22.15 1784 */ 1785 if (adev->asic_type == CHIP_FIJI) { 1786 int err; 1787 uint32_t fw_ver; 1788 1789 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1790 /* force vPost if error occurred */ 1791 if (err) 1792 return true; 1793 1794 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1795 release_firmware(adev->pm.fw); 1796 if (fw_ver < 0x00160e00) 1797 return true; 1798 } 1799 } 1800 1801 /* Don't post if we need to reset whole hive on init */ 1802 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 1803 return false; 1804 1805 if (adev->has_hw_reset) { 1806 adev->has_hw_reset = false; 1807 return true; 1808 } 1809 1810 /* bios scratch used on CIK+ */ 1811 if (adev->asic_type >= CHIP_BONAIRE) 1812 return amdgpu_atombios_scratch_need_asic_init(adev); 1813 1814 /* check MEM_SIZE for older asics */ 1815 reg = amdgpu_asic_get_config_memsize(adev); 1816 1817 if ((reg != 0) && (reg != 0xffffffff)) 1818 return false; 1819 1820 return true; 1821 } 1822 1823 /* 1824 * Check whether seamless boot is supported. 1825 * 1826 * So far we only support seamless boot on DCE 3.0 or later. 1827 * If users report that it works on older ASICS as well, we may 1828 * loosen this. 1829 */ 1830 bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev) 1831 { 1832 switch (amdgpu_seamless) { 1833 case -1: 1834 break; 1835 case 1: 1836 return true; 1837 case 0: 1838 return false; 1839 default: 1840 DRM_ERROR("Invalid value for amdgpu.seamless: %d\n", 1841 amdgpu_seamless); 1842 return false; 1843 } 1844 1845 if (!(adev->flags & AMD_IS_APU)) 1846 return false; 1847 1848 if (adev->mman.keep_stolen_vga_memory) 1849 return false; 1850 1851 return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0); 1852 } 1853 1854 /* 1855 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids 1856 * don't support dynamic speed switching. Until we have confirmation from Intel 1857 * that a specific host supports it, it's safer that we keep it disabled for all. 
1858 * 1859 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/ 1860 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663 1861 */ 1862 static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev) 1863 { 1864 #if IS_ENABLED(CONFIG_X86) 1865 struct cpuinfo_x86 *c = &cpu_data(0); 1866 1867 /* eGPU change speeds based on USB4 fabric conditions */ 1868 if (dev_is_removable(adev->dev)) 1869 return true; 1870 1871 if (c->x86_vendor == X86_VENDOR_INTEL) 1872 return false; 1873 #endif 1874 return true; 1875 } 1876 1877 static bool amdgpu_device_aspm_support_quirk(struct amdgpu_device *adev) 1878 { 1879 #if IS_ENABLED(CONFIG_X86) 1880 struct cpuinfo_x86 *c = &cpu_data(0); 1881 1882 if (!(amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 0) || 1883 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 1))) 1884 return false; 1885 1886 if (c->x86 == 6 && 1887 adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5) { 1888 switch (c->x86_model) { 1889 case VFM_MODEL(INTEL_ALDERLAKE): 1890 case VFM_MODEL(INTEL_ALDERLAKE_L): 1891 case VFM_MODEL(INTEL_RAPTORLAKE): 1892 case VFM_MODEL(INTEL_RAPTORLAKE_P): 1893 case VFM_MODEL(INTEL_RAPTORLAKE_S): 1894 return true; 1895 default: 1896 return false; 1897 } 1898 } else { 1899 return false; 1900 } 1901 #else 1902 return false; 1903 #endif 1904 } 1905 1906 /** 1907 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1908 * 1909 * @adev: amdgpu_device pointer 1910 * 1911 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1912 * be set for this device. 1913 * 1914 * Returns true if it should be used or false if not. 1915 */ 1916 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1917 { 1918 switch (amdgpu_aspm) { 1919 case -1: 1920 break; 1921 case 0: 1922 return false; 1923 case 1: 1924 return true; 1925 default: 1926 return false; 1927 } 1928 if (adev->flags & AMD_IS_APU) 1929 return false; 1930 if (amdgpu_device_aspm_support_quirk(adev)) 1931 return false; 1932 return pcie_aspm_enabled(adev->pdev); 1933 } 1934 1935 /* if we get transitioned to only one device, take VGA back */ 1936 /** 1937 * amdgpu_device_vga_set_decode - enable/disable vga decode 1938 * 1939 * @pdev: PCI device pointer 1940 * @state: enable/disable vga decode 1941 * 1942 * Enable/disable vga decode (all asics). 1943 * Returns VGA resource flags. 1944 */ 1945 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1946 bool state) 1947 { 1948 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1949 1950 amdgpu_asic_set_vga_state(adev, state); 1951 if (state) 1952 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1953 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1954 else 1955 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1956 } 1957 1958 /** 1959 * amdgpu_device_check_block_size - validate the vm block size 1960 * 1961 * @adev: amdgpu_device pointer 1962 * 1963 * Validates the vm block size specified via module parameter. 1964 * The vm block size defines number of bits in page table versus page directory, 1965 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1966 * page table and the remaining bits are in the page directory. 
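 *
 * Worked example: with 4KB pages there are 12 offset bits, so
 * amdgpu_vm_block_size = 9 gives a page table of 2^9 = 512 entries
 * covering 512 * 4KB = 2MB, and the remaining upper address bits are
 * resolved by the page directory.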
1967 */ 1968 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1969 { 1970 /* defines number of bits in page table versus page directory, 1971 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1972 * page table and the remaining bits are in the page directory 1973 */ 1974 if (amdgpu_vm_block_size == -1) 1975 return; 1976 1977 if (amdgpu_vm_block_size < 9) { 1978 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1979 amdgpu_vm_block_size); 1980 amdgpu_vm_block_size = -1; 1981 } 1982 } 1983 1984 /** 1985 * amdgpu_device_check_vm_size - validate the vm size 1986 * 1987 * @adev: amdgpu_device pointer 1988 * 1989 * Validates the vm size in GB specified via module parameter. 1990 * The VM size is the size of the GPU virtual memory space in GB. 1991 */ 1992 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1993 { 1994 /* no need to check the default value */ 1995 if (amdgpu_vm_size == -1) 1996 return; 1997 1998 if (amdgpu_vm_size < 1) { 1999 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 2000 amdgpu_vm_size); 2001 amdgpu_vm_size = -1; 2002 } 2003 } 2004 2005 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 2006 { 2007 struct sysinfo si; 2008 bool is_os_64 = (sizeof(void *) == 8); 2009 uint64_t total_memory; 2010 uint64_t dram_size_seven_GB = 0x1B8000000; 2011 uint64_t dram_size_three_GB = 0xB8000000; 2012 2013 if (amdgpu_smu_memory_pool_size == 0) 2014 return; 2015 2016 if (!is_os_64) { 2017 DRM_WARN("Not 64-bit OS, feature not supported\n"); 2018 goto def_value; 2019 } 2020 si_meminfo(&si); 2021 total_memory = (uint64_t)si.totalram * si.mem_unit; 2022 2023 if ((amdgpu_smu_memory_pool_size == 1) || 2024 (amdgpu_smu_memory_pool_size == 2)) { 2025 if (total_memory < dram_size_three_GB) 2026 goto def_value1; 2027 } else if ((amdgpu_smu_memory_pool_size == 4) || 2028 (amdgpu_smu_memory_pool_size == 8)) { 2029 if (total_memory < dram_size_seven_GB) 2030 goto def_value1; 2031 } else { 2032 DRM_WARN("Smu memory pool size not supported\n"); 2033 goto def_value; 2034 } 2035 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 2036 2037 return; 2038 2039 def_value1: 2040 DRM_WARN("No enough system memory\n"); 2041 def_value: 2042 adev->pm.smu_prv_buffer_size = 0; 2043 } 2044 2045 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 2046 { 2047 if (!(adev->flags & AMD_IS_APU) || 2048 adev->asic_type < CHIP_RAVEN) 2049 return 0; 2050 2051 switch (adev->asic_type) { 2052 case CHIP_RAVEN: 2053 if (adev->pdev->device == 0x15dd) 2054 adev->apu_flags |= AMD_APU_IS_RAVEN; 2055 if (adev->pdev->device == 0x15d8) 2056 adev->apu_flags |= AMD_APU_IS_PICASSO; 2057 break; 2058 case CHIP_RENOIR: 2059 if ((adev->pdev->device == 0x1636) || 2060 (adev->pdev->device == 0x164c)) 2061 adev->apu_flags |= AMD_APU_IS_RENOIR; 2062 else 2063 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 2064 break; 2065 case CHIP_VANGOGH: 2066 adev->apu_flags |= AMD_APU_IS_VANGOGH; 2067 break; 2068 case CHIP_YELLOW_CARP: 2069 break; 2070 case CHIP_CYAN_SKILLFISH: 2071 if ((adev->pdev->device == 0x13FE) || 2072 (adev->pdev->device == 0x143F)) 2073 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 2074 break; 2075 default: 2076 break; 2077 } 2078 2079 return 0; 2080 } 2081 2082 /** 2083 * amdgpu_device_check_arguments - validate module params 2084 * 2085 * @adev: amdgpu_device pointer 2086 * 2087 * Validates certain module parameters and updates 2088 * the associated values used by the driver (all asics). 
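 *
 * For example (illustrative values, editorial addition): amdgpu.sched_jobs=5
 * is rounded up to the next power of two (8) and anything below 4 is raised
 * to 4, while a gart size below 32 (MB) is rejected and reset to the
 * default of -1.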
2089 */ 2090 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 2091 { 2092 int i; 2093 2094 if (amdgpu_sched_jobs < 4) { 2095 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 2096 amdgpu_sched_jobs); 2097 amdgpu_sched_jobs = 4; 2098 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 2099 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 2100 amdgpu_sched_jobs); 2101 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 2102 } 2103 2104 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 2105 /* gart size must be greater or equal to 32M */ 2106 dev_warn(adev->dev, "gart size (%d) too small\n", 2107 amdgpu_gart_size); 2108 amdgpu_gart_size = -1; 2109 } 2110 2111 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 2112 /* gtt size must be greater or equal to 32M */ 2113 dev_warn(adev->dev, "gtt size (%d) too small\n", 2114 amdgpu_gtt_size); 2115 amdgpu_gtt_size = -1; 2116 } 2117 2118 /* valid range is between 4 and 9 inclusive */ 2119 if (amdgpu_vm_fragment_size != -1 && 2120 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 2121 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 2122 amdgpu_vm_fragment_size = -1; 2123 } 2124 2125 if (amdgpu_sched_hw_submission < 2) { 2126 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 2127 amdgpu_sched_hw_submission); 2128 amdgpu_sched_hw_submission = 2; 2129 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 2130 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 2131 amdgpu_sched_hw_submission); 2132 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 2133 } 2134 2135 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 2136 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 2137 amdgpu_reset_method = -1; 2138 } 2139 2140 amdgpu_device_check_smu_prv_buffer_size(adev); 2141 2142 amdgpu_device_check_vm_size(adev); 2143 2144 amdgpu_device_check_block_size(adev); 2145 2146 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 2147 2148 for (i = 0; i < MAX_XCP; i++) { 2149 switch (amdgpu_enforce_isolation) { 2150 case -1: 2151 case 0: 2152 default: 2153 /* disable */ 2154 adev->enforce_isolation[i] = AMDGPU_ENFORCE_ISOLATION_DISABLE; 2155 break; 2156 case 1: 2157 /* enable */ 2158 adev->enforce_isolation[i] = 2159 AMDGPU_ENFORCE_ISOLATION_ENABLE; 2160 break; 2161 case 2: 2162 /* enable legacy mode */ 2163 adev->enforce_isolation[i] = 2164 AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY; 2165 break; 2166 } 2167 } 2168 2169 return 0; 2170 } 2171 2172 /** 2173 * amdgpu_switcheroo_set_state - set switcheroo state 2174 * 2175 * @pdev: pci dev pointer 2176 * @state: vga_switcheroo state 2177 * 2178 * Callback for the switcheroo driver. Suspends or resumes 2179 * the asics before or after it is powered up using ACPI methods. 
2180 */ 2181 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 2182 enum vga_switcheroo_state state) 2183 { 2184 struct drm_device *dev = pci_get_drvdata(pdev); 2185 int r; 2186 2187 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 2188 return; 2189 2190 if (state == VGA_SWITCHEROO_ON) { 2191 pr_info("switched on\n"); 2192 /* don't suspend or resume card normally */ 2193 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2194 2195 pci_set_power_state(pdev, PCI_D0); 2196 amdgpu_device_load_pci_state(pdev); 2197 r = pci_enable_device(pdev); 2198 if (r) 2199 DRM_WARN("pci_enable_device failed (%d)\n", r); 2200 amdgpu_device_resume(dev, true); 2201 2202 dev->switch_power_state = DRM_SWITCH_POWER_ON; 2203 } else { 2204 pr_info("switched off\n"); 2205 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2206 amdgpu_device_prepare(dev); 2207 amdgpu_device_suspend(dev, true); 2208 amdgpu_device_cache_pci_state(pdev); 2209 /* Shut down the device */ 2210 pci_disable_device(pdev); 2211 pci_set_power_state(pdev, PCI_D3cold); 2212 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 2213 } 2214 } 2215 2216 /** 2217 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 2218 * 2219 * @pdev: pci dev pointer 2220 * 2221 * Callback for the switcheroo driver. Check of the switcheroo 2222 * state can be changed. 2223 * Returns true if the state can be changed, false if not. 2224 */ 2225 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 2226 { 2227 struct drm_device *dev = pci_get_drvdata(pdev); 2228 2229 /* 2230 * FIXME: open_count is protected by drm_global_mutex but that would lead to 2231 * locking inversion with the driver load path. And the access here is 2232 * completely racy anyway. So don't bother with locking for now. 2233 */ 2234 return atomic_read(&dev->open_count) == 0; 2235 } 2236 2237 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 2238 .set_gpu_state = amdgpu_switcheroo_set_state, 2239 .reprobe = NULL, 2240 .can_switch = amdgpu_switcheroo_can_switch, 2241 }; 2242 2243 /** 2244 * amdgpu_device_ip_set_clockgating_state - set the CG state 2245 * 2246 * @dev: amdgpu_device pointer 2247 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2248 * @state: clockgating state (gate or ungate) 2249 * 2250 * Sets the requested clockgating state for all instances of 2251 * the hardware IP specified. 2252 * Returns the error code from the last instance. 2253 */ 2254 int amdgpu_device_ip_set_clockgating_state(void *dev, 2255 enum amd_ip_block_type block_type, 2256 enum amd_clockgating_state state) 2257 { 2258 struct amdgpu_device *adev = dev; 2259 int i, r = 0; 2260 2261 for (i = 0; i < adev->num_ip_blocks; i++) { 2262 if (!adev->ip_blocks[i].status.valid) 2263 continue; 2264 if (adev->ip_blocks[i].version->type != block_type) 2265 continue; 2266 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 2267 continue; 2268 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 2269 &adev->ip_blocks[i], state); 2270 if (r) 2271 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 2272 adev->ip_blocks[i].version->funcs->name, r); 2273 } 2274 return r; 2275 } 2276 2277 /** 2278 * amdgpu_device_ip_set_powergating_state - set the PG state 2279 * 2280 * @dev: amdgpu_device pointer 2281 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2282 * @state: powergating state (gate or ungate) 2283 * 2284 * Sets the requested powergating state for all instances of 2285 * the hardware IP specified. 
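 *
 * Illustrative example (editor's sketch, not taken from the original file):
 * gating VCN power could look roughly like
 *   amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCN,
 *                                          AMD_PG_STATE_GATE);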
2286 * Returns the error code from the last instance. 2287 */ 2288 int amdgpu_device_ip_set_powergating_state(void *dev, 2289 enum amd_ip_block_type block_type, 2290 enum amd_powergating_state state) 2291 { 2292 struct amdgpu_device *adev = dev; 2293 int i, r = 0; 2294 2295 for (i = 0; i < adev->num_ip_blocks; i++) { 2296 if (!adev->ip_blocks[i].status.valid) 2297 continue; 2298 if (adev->ip_blocks[i].version->type != block_type) 2299 continue; 2300 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 2301 continue; 2302 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 2303 &adev->ip_blocks[i], state); 2304 if (r) 2305 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 2306 adev->ip_blocks[i].version->funcs->name, r); 2307 } 2308 return r; 2309 } 2310 2311 /** 2312 * amdgpu_device_ip_get_clockgating_state - get the CG state 2313 * 2314 * @adev: amdgpu_device pointer 2315 * @flags: clockgating feature flags 2316 * 2317 * Walks the list of IPs on the device and updates the clockgating 2318 * flags for each IP. 2319 * Updates @flags with the feature flags for each hardware IP where 2320 * clockgating is enabled. 2321 */ 2322 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 2323 u64 *flags) 2324 { 2325 int i; 2326 2327 for (i = 0; i < adev->num_ip_blocks; i++) { 2328 if (!adev->ip_blocks[i].status.valid) 2329 continue; 2330 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 2331 adev->ip_blocks[i].version->funcs->get_clockgating_state( 2332 &adev->ip_blocks[i], flags); 2333 } 2334 } 2335 2336 /** 2337 * amdgpu_device_ip_wait_for_idle - wait for idle 2338 * 2339 * @adev: amdgpu_device pointer 2340 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2341 * 2342 * Waits for the requested hardware IP to be idle. 2343 * Returns 0 for success or a negative error code on failure. 2344 */ 2345 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 2346 enum amd_ip_block_type block_type) 2347 { 2348 int i, r; 2349 2350 for (i = 0; i < adev->num_ip_blocks; i++) { 2351 if (!adev->ip_blocks[i].status.valid) 2352 continue; 2353 if (adev->ip_blocks[i].version->type == block_type) { 2354 if (adev->ip_blocks[i].version->funcs->wait_for_idle) { 2355 r = adev->ip_blocks[i].version->funcs->wait_for_idle( 2356 &adev->ip_blocks[i]); 2357 if (r) 2358 return r; 2359 } 2360 break; 2361 } 2362 } 2363 return 0; 2364 2365 } 2366 2367 /** 2368 * amdgpu_device_ip_is_valid - is the hardware IP enabled 2369 * 2370 * @adev: amdgpu_device pointer 2371 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2372 * 2373 * Check if the hardware IP is enabled or not. 2374 * Returns true if the IP is enabled, false if not. 2375 */ 2376 bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev, 2377 enum amd_ip_block_type block_type) 2378 { 2379 int i; 2380 2381 for (i = 0; i < adev->num_ip_blocks; i++) { 2382 if (adev->ip_blocks[i].version->type == block_type) 2383 return adev->ip_blocks[i].status.valid; 2384 } 2385 return false; 2386 2387 } 2388 2389 /** 2390 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 2391 * 2392 * @adev: amdgpu_device pointer 2393 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 2394 * 2395 * Returns a pointer to the hardware IP block structure 2396 * if it exists for the asic, otherwise NULL.
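 *
 * Illustrative usage (editor's sketch): callers typically look up the block
 * and check the result before using it, e.g.
 *   ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *   if (ip_block && ip_block->status.valid)
 *           amdgpu_amdkfd_device_probe(adev);
 * which mirrors the lookup done in amdgpu_device_ip_early_init() below.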
2397 */ 2398 struct amdgpu_ip_block * 2399 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 2400 enum amd_ip_block_type type) 2401 { 2402 int i; 2403 2404 for (i = 0; i < adev->num_ip_blocks; i++) 2405 if (adev->ip_blocks[i].version->type == type) 2406 return &adev->ip_blocks[i]; 2407 2408 return NULL; 2409 } 2410 2411 /** 2412 * amdgpu_device_ip_block_version_cmp 2413 * 2414 * @adev: amdgpu_device pointer 2415 * @type: enum amd_ip_block_type 2416 * @major: major version 2417 * @minor: minor version 2418 * 2419 * return 0 if equal or greater 2420 * return 1 if smaller or the ip_block doesn't exist 2421 */ 2422 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 2423 enum amd_ip_block_type type, 2424 u32 major, u32 minor) 2425 { 2426 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 2427 2428 if (ip_block && ((ip_block->version->major > major) || 2429 ((ip_block->version->major == major) && 2430 (ip_block->version->minor >= minor)))) 2431 return 0; 2432 2433 return 1; 2434 } 2435 2436 /** 2437 * amdgpu_device_ip_block_add 2438 * 2439 * @adev: amdgpu_device pointer 2440 * @ip_block_version: pointer to the IP to add 2441 * 2442 * Adds the IP block driver information to the collection of IPs 2443 * on the asic. 2444 */ 2445 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 2446 const struct amdgpu_ip_block_version *ip_block_version) 2447 { 2448 if (!ip_block_version) 2449 return -EINVAL; 2450 2451 switch (ip_block_version->type) { 2452 case AMD_IP_BLOCK_TYPE_VCN: 2453 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2454 return 0; 2455 break; 2456 case AMD_IP_BLOCK_TYPE_JPEG: 2457 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2458 return 0; 2459 break; 2460 default: 2461 break; 2462 } 2463 2464 dev_info(adev->dev, "detected ip block number %d <%s>\n", 2465 adev->num_ip_blocks, ip_block_version->funcs->name); 2466 2467 adev->ip_blocks[adev->num_ip_blocks].adev = adev; 2468 2469 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2470 2471 return 0; 2472 } 2473 2474 /** 2475 * amdgpu_device_enable_virtual_display - enable virtual display feature 2476 * 2477 * @adev: amdgpu_device pointer 2478 * 2479 * Enabled the virtual display feature if the user has enabled it via 2480 * the module parameter virtual_display. This feature provides a virtual 2481 * display hardware on headless boards or in virtualized environments. 2482 * This function parses and validates the configuration string specified by 2483 * the user and configures the virtual display configuration (number of 2484 * virtual connectors, crtcs, etc.) specified. 
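 *
 * Example (illustrative, with a made-up PCI address): passing
 * amdgpu.virtual_display=0000:01:00.0,2 on the kernel command line enables
 * two virtual CRTCs on that device only, while virtual_display=all,1 enables
 * a single virtual CRTC on every amdgpu device; the parser below clamps the
 * CRTC count to the 1..6 range and defaults to 1 when no count is given.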
2485 */ 2486 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2487 { 2488 adev->enable_virtual_display = false; 2489 2490 if (amdgpu_virtual_display) { 2491 const char *pci_address_name = pci_name(adev->pdev); 2492 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2493 2494 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2495 pciaddstr_tmp = pciaddstr; 2496 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2497 pciaddname = strsep(&pciaddname_tmp, ","); 2498 if (!strcmp("all", pciaddname) 2499 || !strcmp(pci_address_name, pciaddname)) { 2500 long num_crtc; 2501 int res = -1; 2502 2503 adev->enable_virtual_display = true; 2504 2505 if (pciaddname_tmp) 2506 res = kstrtol(pciaddname_tmp, 10, 2507 &num_crtc); 2508 2509 if (!res) { 2510 if (num_crtc < 1) 2511 num_crtc = 1; 2512 if (num_crtc > 6) 2513 num_crtc = 6; 2514 adev->mode_info.num_crtc = num_crtc; 2515 } else { 2516 adev->mode_info.num_crtc = 1; 2517 } 2518 break; 2519 } 2520 } 2521 2522 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2523 amdgpu_virtual_display, pci_address_name, 2524 adev->enable_virtual_display, adev->mode_info.num_crtc); 2525 2526 kfree(pciaddstr); 2527 } 2528 } 2529 2530 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2531 { 2532 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2533 adev->mode_info.num_crtc = 1; 2534 adev->enable_virtual_display = true; 2535 DRM_INFO("virtual_display:%d, num_crtc:%d\n", 2536 adev->enable_virtual_display, adev->mode_info.num_crtc); 2537 } 2538 } 2539 2540 /** 2541 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2542 * 2543 * @adev: amdgpu_device pointer 2544 * 2545 * Parses the asic configuration parameters specified in the gpu info 2546 * firmware and makes them available to the driver for use in configuring 2547 * the asic. 2548 * Returns 0 on success, -EINVAL on failure. 2549 */ 2550 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2551 { 2552 const char *chip_name; 2553 int err; 2554 const struct gpu_info_firmware_header_v1_0 *hdr; 2555 2556 adev->firmware.gpu_info_fw = NULL; 2557 2558 if (adev->mman.discovery_bin) 2559 return 0; 2560 2561 switch (adev->asic_type) { 2562 default: 2563 return 0; 2564 case CHIP_VEGA10: 2565 chip_name = "vega10"; 2566 break; 2567 case CHIP_VEGA12: 2568 chip_name = "vega12"; 2569 break; 2570 case CHIP_RAVEN: 2571 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2572 chip_name = "raven2"; 2573 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2574 chip_name = "picasso"; 2575 else 2576 chip_name = "raven"; 2577 break; 2578 case CHIP_ARCTURUS: 2579 chip_name = "arcturus"; 2580 break; 2581 case CHIP_NAVI12: 2582 chip_name = "navi12"; 2583 break; 2584 } 2585 2586 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, 2587 AMDGPU_UCODE_OPTIONAL, 2588 "amdgpu/%s_gpu_info.bin", chip_name); 2589 if (err) { 2590 dev_err(adev->dev, 2591 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n", 2592 chip_name); 2593 goto out; 2594 } 2595 2596 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2597 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2598 2599 switch (hdr->version_major) { 2600 case 1: 2601 { 2602 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2603 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2604 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2605 2606 /* 2607 * Should be dropped when DAL no longer needs it. 
2608 */ 2609 if (adev->asic_type == CHIP_NAVI12) 2610 goto parse_soc_bounding_box; 2611 2612 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2613 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2614 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2615 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2616 adev->gfx.config.max_texture_channel_caches = 2617 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2618 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2619 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2620 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2621 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2622 adev->gfx.config.double_offchip_lds_buf = 2623 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2624 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2625 adev->gfx.cu_info.max_waves_per_simd = 2626 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2627 adev->gfx.cu_info.max_scratch_slots_per_cu = 2628 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2629 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2630 if (hdr->version_minor >= 1) { 2631 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2632 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2633 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2634 adev->gfx.config.num_sc_per_sh = 2635 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2636 adev->gfx.config.num_packer_per_sc = 2637 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2638 } 2639 2640 parse_soc_bounding_box: 2641 /* 2642 * soc bounding box info is not integrated into the discovery table, so 2643 * we always need to parse it from the gpu info firmware if needed. 2644 */ 2645 if (hdr->version_minor == 2) { 2646 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2647 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2648 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2649 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2650 } 2651 break; 2652 } 2653 default: 2654 dev_err(adev->dev, 2655 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2656 err = -EINVAL; 2657 goto out; 2658 } 2659 out: 2660 return err; 2661 } 2662 2663 /** 2664 * amdgpu_device_ip_early_init - run early init for hardware IPs 2665 * 2666 * @adev: amdgpu_device pointer 2667 * 2668 * Early initialization pass for hardware IPs. The hardware IPs that make 2669 * up each asic are discovered and each IP's early_init callback is run. This 2670 * is the first stage in initializing the asic. 2671 * Returns 0 on success, negative error code on failure.
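 *
 * Note (editorial): an IP block whose early_init callback returns -ENOENT is
 * simply marked invalid and skipped, whereas any other error is recorded and
 * causes this pass to fail with -ENODEV once all blocks have been visited.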
2672 */ 2673 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2674 { 2675 struct amdgpu_ip_block *ip_block; 2676 struct pci_dev *parent; 2677 bool total, skip_bios; 2678 uint32_t bios_flags; 2679 int i, r; 2680 2681 amdgpu_device_enable_virtual_display(adev); 2682 2683 if (amdgpu_sriov_vf(adev)) { 2684 r = amdgpu_virt_request_full_gpu(adev, true); 2685 if (r) 2686 return r; 2687 } 2688 2689 switch (adev->asic_type) { 2690 #ifdef CONFIG_DRM_AMDGPU_SI 2691 case CHIP_VERDE: 2692 case CHIP_TAHITI: 2693 case CHIP_PITCAIRN: 2694 case CHIP_OLAND: 2695 case CHIP_HAINAN: 2696 adev->family = AMDGPU_FAMILY_SI; 2697 r = si_set_ip_blocks(adev); 2698 if (r) 2699 return r; 2700 break; 2701 #endif 2702 #ifdef CONFIG_DRM_AMDGPU_CIK 2703 case CHIP_BONAIRE: 2704 case CHIP_HAWAII: 2705 case CHIP_KAVERI: 2706 case CHIP_KABINI: 2707 case CHIP_MULLINS: 2708 if (adev->flags & AMD_IS_APU) 2709 adev->family = AMDGPU_FAMILY_KV; 2710 else 2711 adev->family = AMDGPU_FAMILY_CI; 2712 2713 r = cik_set_ip_blocks(adev); 2714 if (r) 2715 return r; 2716 break; 2717 #endif 2718 case CHIP_TOPAZ: 2719 case CHIP_TONGA: 2720 case CHIP_FIJI: 2721 case CHIP_POLARIS10: 2722 case CHIP_POLARIS11: 2723 case CHIP_POLARIS12: 2724 case CHIP_VEGAM: 2725 case CHIP_CARRIZO: 2726 case CHIP_STONEY: 2727 if (adev->flags & AMD_IS_APU) 2728 adev->family = AMDGPU_FAMILY_CZ; 2729 else 2730 adev->family = AMDGPU_FAMILY_VI; 2731 2732 r = vi_set_ip_blocks(adev); 2733 if (r) 2734 return r; 2735 break; 2736 default: 2737 r = amdgpu_discovery_set_ip_blocks(adev); 2738 if (r) 2739 return r; 2740 break; 2741 } 2742 2743 /* Check for IP version 9.4.3 with A0 hardware */ 2744 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) && 2745 !amdgpu_device_get_rev_id(adev)) { 2746 dev_err(adev->dev, "Unsupported A0 hardware\n"); 2747 return -ENODEV; /* device unsupported - no device error */ 2748 } 2749 2750 if (amdgpu_has_atpx() && 2751 (amdgpu_is_atpx_hybrid() || 2752 amdgpu_has_atpx_dgpu_power_cntl()) && 2753 ((adev->flags & AMD_IS_APU) == 0) && 2754 !dev_is_removable(&adev->pdev->dev)) 2755 adev->flags |= AMD_IS_PX; 2756 2757 if (!(adev->flags & AMD_IS_APU)) { 2758 parent = pcie_find_root_port(adev->pdev); 2759 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2760 } 2761 2762 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2763 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2764 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2765 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2766 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2767 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2768 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2769 2770 total = true; 2771 for (i = 0; i < adev->num_ip_blocks; i++) { 2772 ip_block = &adev->ip_blocks[i]; 2773 2774 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2775 DRM_WARN("disabled ip block: %d <%s>\n", 2776 i, adev->ip_blocks[i].version->funcs->name); 2777 adev->ip_blocks[i].status.valid = false; 2778 } else if (ip_block->version->funcs->early_init) { 2779 r = ip_block->version->funcs->early_init(ip_block); 2780 if (r == -ENOENT) { 2781 adev->ip_blocks[i].status.valid = false; 2782 } else if (r) { 2783 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2784 adev->ip_blocks[i].version->funcs->name, r); 2785 total = false; 2786 } else { 2787 adev->ip_blocks[i].status.valid = true; 2788 } 2789 } else { 2790 adev->ip_blocks[i].status.valid = true; 2791 } 2792 /* get the vbios after the asic_funcs are set up */ 2793 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2794 r = amdgpu_device_parse_gpu_info_fw(adev); 2795 if (r) 2796 return r; 2797 2798 bios_flags = amdgpu_device_get_vbios_flags(adev); 2799 skip_bios = !!(bios_flags & AMDGPU_VBIOS_SKIP); 2800 /* Read BIOS */ 2801 if (!skip_bios) { 2802 bool optional = 2803 !!(bios_flags & AMDGPU_VBIOS_OPTIONAL); 2804 if (!amdgpu_get_bios(adev) && !optional) 2805 return -EINVAL; 2806 2807 if (optional && !adev->bios) 2808 dev_info( 2809 adev->dev, 2810 "VBIOS image optional, proceeding without VBIOS image"); 2811 2812 if (adev->bios) { 2813 r = amdgpu_atombios_init(adev); 2814 if (r) { 2815 dev_err(adev->dev, 2816 "amdgpu_atombios_init failed\n"); 2817 amdgpu_vf_error_put( 2818 adev, 2819 AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 2820 0, 0); 2821 return r; 2822 } 2823 } 2824 } 2825 2826 /*get pf2vf msg info at it's earliest time*/ 2827 if (amdgpu_sriov_vf(adev)) 2828 amdgpu_virt_init_data_exchange(adev); 2829 2830 } 2831 } 2832 if (!total) 2833 return -ENODEV; 2834 2835 if (adev->gmc.xgmi.supported) 2836 amdgpu_xgmi_early_init(adev); 2837 2838 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 2839 if (ip_block->status.valid != false) 2840 amdgpu_amdkfd_device_probe(adev); 2841 2842 adev->cg_flags &= amdgpu_cg_mask; 2843 adev->pg_flags &= amdgpu_pg_mask; 2844 2845 return 0; 2846 } 2847 2848 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2849 { 2850 int i, r; 2851 2852 for (i = 0; i < adev->num_ip_blocks; i++) { 2853 if (!adev->ip_blocks[i].status.sw) 2854 continue; 2855 if (adev->ip_blocks[i].status.hw) 2856 continue; 2857 if (!amdgpu_ip_member_of_hwini( 2858 adev, adev->ip_blocks[i].version->type)) 2859 continue; 2860 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2861 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2862 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2863 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2864 if (r) { 2865 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2866 adev->ip_blocks[i].version->funcs->name, r); 2867 return r; 2868 } 2869 adev->ip_blocks[i].status.hw = true; 2870 } 2871 } 2872 2873 return 0; 2874 } 2875 
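/*
 * Editorial note (not from the original source): hardware init is split into
 * two phases.  Phase 1 above brings up COMMON and IH (plus PSP when running
 * as an SR-IOV VF) so that firmware can then be loaded via
 * amdgpu_device_fw_loading(); phase 2 below runs hw_init for the remaining
 * blocks.  amdgpu_device_ip_init() drives this sequence.
 */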
2876 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2877 { 2878 int i, r; 2879 2880 for (i = 0; i < adev->num_ip_blocks; i++) { 2881 if (!adev->ip_blocks[i].status.sw) 2882 continue; 2883 if (adev->ip_blocks[i].status.hw) 2884 continue; 2885 if (!amdgpu_ip_member_of_hwini( 2886 adev, adev->ip_blocks[i].version->type)) 2887 continue; 2888 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2889 if (r) { 2890 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2891 adev->ip_blocks[i].version->funcs->name, r); 2892 return r; 2893 } 2894 adev->ip_blocks[i].status.hw = true; 2895 } 2896 2897 return 0; 2898 } 2899 2900 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2901 { 2902 int r = 0; 2903 int i; 2904 uint32_t smu_version; 2905 2906 if (adev->asic_type >= CHIP_VEGA10) { 2907 for (i = 0; i < adev->num_ip_blocks; i++) { 2908 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2909 continue; 2910 2911 if (!amdgpu_ip_member_of_hwini(adev, 2912 AMD_IP_BLOCK_TYPE_PSP)) 2913 break; 2914 2915 if (!adev->ip_blocks[i].status.sw) 2916 continue; 2917 2918 /* no need to do the fw loading again if already done*/ 2919 if (adev->ip_blocks[i].status.hw == true) 2920 break; 2921 2922 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2923 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 2924 if (r) 2925 return r; 2926 } else { 2927 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2928 if (r) { 2929 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2930 adev->ip_blocks[i].version->funcs->name, r); 2931 return r; 2932 } 2933 adev->ip_blocks[i].status.hw = true; 2934 } 2935 break; 2936 } 2937 } 2938 2939 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2940 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2941 2942 return r; 2943 } 2944 2945 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2946 { 2947 struct drm_sched_init_args args = { 2948 .ops = &amdgpu_sched_ops, 2949 .num_rqs = DRM_SCHED_PRIORITY_COUNT, 2950 .timeout_wq = adev->reset_domain->wq, 2951 .dev = adev->dev, 2952 }; 2953 long timeout; 2954 int r, i; 2955 2956 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2957 struct amdgpu_ring *ring = adev->rings[i]; 2958 2959 /* No need to setup the GPU scheduler for rings that don't need it */ 2960 if (!ring || ring->no_scheduler) 2961 continue; 2962 2963 switch (ring->funcs->type) { 2964 case AMDGPU_RING_TYPE_GFX: 2965 timeout = adev->gfx_timeout; 2966 break; 2967 case AMDGPU_RING_TYPE_COMPUTE: 2968 timeout = adev->compute_timeout; 2969 break; 2970 case AMDGPU_RING_TYPE_SDMA: 2971 timeout = adev->sdma_timeout; 2972 break; 2973 default: 2974 timeout = adev->video_timeout; 2975 break; 2976 } 2977 2978 args.timeout = timeout; 2979 args.credit_limit = ring->num_hw_submission; 2980 args.score = ring->sched_score; 2981 args.name = ring->name; 2982 2983 r = drm_sched_init(&ring->sched, &args); 2984 if (r) { 2985 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2986 ring->name); 2987 return r; 2988 } 2989 r = amdgpu_uvd_entity_init(adev, ring); 2990 if (r) { 2991 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n", 2992 ring->name); 2993 return r; 2994 } 2995 r = amdgpu_vce_entity_init(adev, ring); 2996 if (r) { 2997 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n", 2998 ring->name); 2999 return r; 3000 } 3001 } 3002 3003 amdgpu_xcp_update_partition_sched_list(adev); 3004 3005 return 0; 3006 } 3007 3008 3009 /** 3010 * amdgpu_device_ip_init - run init for hardware IPs 
3011 * 3012 * @adev: amdgpu_device pointer 3013 * 3014 * Main initialization pass for hardware IPs. The list of all the hardware 3015 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 3016 * are run. sw_init initializes the software state associated with each IP 3017 * and hw_init initializes the hardware associated with each IP. 3018 * Returns 0 on success, negative error code on failure. 3019 */ 3020 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 3021 { 3022 bool init_badpage; 3023 int i, r; 3024 3025 r = amdgpu_ras_init(adev); 3026 if (r) 3027 return r; 3028 3029 for (i = 0; i < adev->num_ip_blocks; i++) { 3030 if (!adev->ip_blocks[i].status.valid) 3031 continue; 3032 if (adev->ip_blocks[i].version->funcs->sw_init) { 3033 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]); 3034 if (r) { 3035 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 3036 adev->ip_blocks[i].version->funcs->name, r); 3037 goto init_failed; 3038 } 3039 } 3040 adev->ip_blocks[i].status.sw = true; 3041 3042 if (!amdgpu_ip_member_of_hwini( 3043 adev, adev->ip_blocks[i].version->type)) 3044 continue; 3045 3046 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 3047 /* need to do common hw init early so everything is set up for gmc */ 3048 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3049 if (r) { 3050 DRM_ERROR("hw_init %d failed %d\n", i, r); 3051 goto init_failed; 3052 } 3053 adev->ip_blocks[i].status.hw = true; 3054 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3055 /* need to do gmc hw init early so we can allocate gpu mem */ 3056 /* Try to reserve bad pages early */ 3057 if (amdgpu_sriov_vf(adev)) 3058 amdgpu_virt_exchange_data(adev); 3059 3060 r = amdgpu_device_mem_scratch_init(adev); 3061 if (r) { 3062 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 3063 goto init_failed; 3064 } 3065 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3066 if (r) { 3067 DRM_ERROR("hw_init %d failed %d\n", i, r); 3068 goto init_failed; 3069 } 3070 r = amdgpu_device_wb_init(adev); 3071 if (r) { 3072 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 3073 goto init_failed; 3074 } 3075 adev->ip_blocks[i].status.hw = true; 3076 3077 /* right after GMC hw init, we create CSA */ 3078 if (adev->gfx.mcbp) { 3079 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 3080 AMDGPU_GEM_DOMAIN_VRAM | 3081 AMDGPU_GEM_DOMAIN_GTT, 3082 AMDGPU_CSA_SIZE); 3083 if (r) { 3084 DRM_ERROR("allocate CSA failed %d\n", r); 3085 goto init_failed; 3086 } 3087 } 3088 3089 r = amdgpu_seq64_init(adev); 3090 if (r) { 3091 DRM_ERROR("allocate seq64 failed %d\n", r); 3092 goto init_failed; 3093 } 3094 } 3095 } 3096 3097 if (amdgpu_sriov_vf(adev)) 3098 amdgpu_virt_init_data_exchange(adev); 3099 3100 r = amdgpu_ib_pool_init(adev); 3101 if (r) { 3102 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 3103 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 3104 goto init_failed; 3105 } 3106 3107 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 3108 if (r) 3109 goto init_failed; 3110 3111 r = amdgpu_device_ip_hw_init_phase1(adev); 3112 if (r) 3113 goto init_failed; 3114 3115 r = amdgpu_device_fw_loading(adev); 3116 if (r) 3117 goto init_failed; 3118 3119 r = amdgpu_device_ip_hw_init_phase2(adev); 3120 if (r) 3121 goto init_failed; 3122 3123 /* 3124 * retired pages will be loaded from eeprom and reserved here, 3125 * it should be called after amdgpu_device_ip_hw_init_phase2 
since 3126 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 3127 * for I2C communication, which is only true at this point. 3128 * 3129 * amdgpu_ras_recovery_init may fail, but the upper layers only care about 3130 * failures caused by a bad gpu situation and stop the amdgpu init process 3131 * accordingly. For other failure cases it will still release all 3132 * the resources and print an error message, rather than returning a 3133 * negative value to the upper level. 3134 * 3135 * Note: theoretically, this should be called before all vram allocations 3136 * to protect retired pages from being reused 3137 */ 3138 init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 3139 r = amdgpu_ras_recovery_init(adev, init_badpage); 3140 if (r) 3141 goto init_failed; 3142 3143 /* 3144 * In case of XGMI, grab an extra reference to the reset domain for this device 3145 */ 3146 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3147 if (amdgpu_xgmi_add_device(adev) == 0) { 3148 if (!amdgpu_sriov_vf(adev)) { 3149 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3150 3151 if (WARN_ON(!hive)) { 3152 r = -ENOENT; 3153 goto init_failed; 3154 } 3155 3156 if (!hive->reset_domain || 3157 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 3158 r = -ENOENT; 3159 amdgpu_put_xgmi_hive(hive); 3160 goto init_failed; 3161 } 3162 3163 /* Drop the early temporary reset domain we created for device */ 3164 amdgpu_reset_put_reset_domain(adev->reset_domain); 3165 adev->reset_domain = hive->reset_domain; 3166 amdgpu_put_xgmi_hive(hive); 3167 } 3168 } 3169 } 3170 3171 r = amdgpu_device_init_schedulers(adev); 3172 if (r) 3173 goto init_failed; 3174 3175 if (adev->mman.buffer_funcs_ring->sched.ready) 3176 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3177 3178 /* Don't init kfd if the whole hive needs to be reset during init */ 3179 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 3180 kgd2kfd_init_zone_device(adev); 3181 amdgpu_amdkfd_device_init(adev); 3182 } 3183 3184 amdgpu_fru_get_product_info(adev); 3185 3186 if (!amdgpu_sriov_vf(adev) || amdgpu_sriov_ras_cper_en(adev)) 3187 r = amdgpu_cper_init(adev); 3188 3189 init_failed: 3190 3191 return r; 3192 } 3193 3194 /** 3195 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 3196 * 3197 * @adev: amdgpu_device pointer 3198 * 3199 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 3200 * this function before a GPU reset. If the value is retained after a 3201 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 3202 */ 3203 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 3204 { 3205 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 3206 } 3207 3208 /** 3209 * amdgpu_device_check_vram_lost - check if vram is valid 3210 * 3211 * @adev: amdgpu_device pointer 3212 * 3213 * Checks the reset magic value written to the gart pointer in VRAM. 3214 * The driver calls this after a GPU reset to see if the contents of 3215 * VRAM is lost or not. 3216 * Returns true if vram is lost, false if not. 3217 */ 3218 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 3219 { 3220 if (memcmp(adev->gart.ptr, adev->reset_magic, 3221 AMDGPU_RESET_MAGIC_NUM)) 3222 return true; 3223 3224 if (!amdgpu_in_reset(adev)) 3225 return false; 3226 3227 /* 3228 * For all ASICs with baco/mode1 reset, the VRAM is 3229 * always assumed to be lost.
3230 */ 3231 switch (amdgpu_asic_reset_method(adev)) { 3232 case AMD_RESET_METHOD_LINK: 3233 case AMD_RESET_METHOD_BACO: 3234 case AMD_RESET_METHOD_MODE1: 3235 return true; 3236 default: 3237 return false; 3238 } 3239 } 3240 3241 /** 3242 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 3243 * 3244 * @adev: amdgpu_device pointer 3245 * @state: clockgating state (gate or ungate) 3246 * 3247 * The list of all the hardware IPs that make up the asic is walked and the 3248 * set_clockgating_state callbacks are run. 3249 * Late initialization pass enabling clockgating for hardware IPs. 3250 * Fini or suspend, pass disabling clockgating for hardware IPs. 3251 * Returns 0 on success, negative error code on failure. 3252 */ 3253 3254 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 3255 enum amd_clockgating_state state) 3256 { 3257 int i, j, r; 3258 3259 if (amdgpu_emu_mode == 1) 3260 return 0; 3261 3262 for (j = 0; j < adev->num_ip_blocks; j++) { 3263 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3264 if (!adev->ip_blocks[i].status.late_initialized) 3265 continue; 3266 /* skip CG for GFX, SDMA on S0ix */ 3267 if (adev->in_s0ix && 3268 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3269 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3270 continue; 3271 /* skip CG for VCE/UVD, it's handled specially */ 3272 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3273 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3274 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3275 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3276 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 3277 /* enable clockgating to save power */ 3278 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i], 3279 state); 3280 if (r) { 3281 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 3282 adev->ip_blocks[i].version->funcs->name, r); 3283 return r; 3284 } 3285 } 3286 } 3287 3288 return 0; 3289 } 3290 3291 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 3292 enum amd_powergating_state state) 3293 { 3294 int i, j, r; 3295 3296 if (amdgpu_emu_mode == 1) 3297 return 0; 3298 3299 for (j = 0; j < adev->num_ip_blocks; j++) { 3300 i = state == AMD_PG_STATE_GATE ? 
j : adev->num_ip_blocks - j - 1; 3301 if (!adev->ip_blocks[i].status.late_initialized) 3302 continue; 3303 /* skip PG for GFX, SDMA on S0ix */ 3304 if (adev->in_s0ix && 3305 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3306 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3307 continue; 3308 /* skip PG for VCE/UVD, it's handled specially */ 3309 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3310 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3311 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3312 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3313 adev->ip_blocks[i].version->funcs->set_powergating_state) { 3314 /* enable powergating to save power */ 3315 r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i], 3316 state); 3317 if (r) { 3318 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 3319 adev->ip_blocks[i].version->funcs->name, r); 3320 return r; 3321 } 3322 } 3323 } 3324 return 0; 3325 } 3326 3327 static int amdgpu_device_enable_mgpu_fan_boost(void) 3328 { 3329 struct amdgpu_gpu_instance *gpu_ins; 3330 struct amdgpu_device *adev; 3331 int i, ret = 0; 3332 3333 mutex_lock(&mgpu_info.mutex); 3334 3335 /* 3336 * MGPU fan boost feature should be enabled 3337 * only when there are two or more dGPUs in 3338 * the system 3339 */ 3340 if (mgpu_info.num_dgpu < 2) 3341 goto out; 3342 3343 for (i = 0; i < mgpu_info.num_dgpu; i++) { 3344 gpu_ins = &(mgpu_info.gpu_ins[i]); 3345 adev = gpu_ins->adev; 3346 if (!(adev->flags & AMD_IS_APU) && 3347 !gpu_ins->mgpu_fan_enabled) { 3348 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 3349 if (ret) 3350 break; 3351 3352 gpu_ins->mgpu_fan_enabled = 1; 3353 } 3354 } 3355 3356 out: 3357 mutex_unlock(&mgpu_info.mutex); 3358 3359 return ret; 3360 } 3361 3362 /** 3363 * amdgpu_device_ip_late_init - run late init for hardware IPs 3364 * 3365 * @adev: amdgpu_device pointer 3366 * 3367 * Late initialization pass for hardware IPs. The list of all the hardware 3368 * IPs that make up the asic is walked and the late_init callbacks are run. 3369 * late_init covers any special initialization that an IP requires 3370 * after all of them have been initialized or something that needs to happen 3371 * late in the init process. 3372 * Returns 0 on success, negative error code on failure.
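 *
 * Note (editorial): besides the per-IP late_init callbacks, this pass also
 * enables clockgating and powergating, writes the reset magic used for VRAM
 * loss detection, and enables the multi-GPU fan boost where applicable.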
3373 */ 3374 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 3375 { 3376 struct amdgpu_gpu_instance *gpu_instance; 3377 int i = 0, r; 3378 3379 for (i = 0; i < adev->num_ip_blocks; i++) { 3380 if (!adev->ip_blocks[i].status.hw) 3381 continue; 3382 if (adev->ip_blocks[i].version->funcs->late_init) { 3383 r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]); 3384 if (r) { 3385 DRM_ERROR("late_init of IP block <%s> failed %d\n", 3386 adev->ip_blocks[i].version->funcs->name, r); 3387 return r; 3388 } 3389 } 3390 adev->ip_blocks[i].status.late_initialized = true; 3391 } 3392 3393 r = amdgpu_ras_late_init(adev); 3394 if (r) { 3395 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 3396 return r; 3397 } 3398 3399 if (!amdgpu_reset_in_recovery(adev)) 3400 amdgpu_ras_set_error_query_ready(adev, true); 3401 3402 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3403 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3404 3405 amdgpu_device_fill_reset_magic(adev); 3406 3407 r = amdgpu_device_enable_mgpu_fan_boost(); 3408 if (r) 3409 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 3410 3411 /* For passthrough configurations on arcturus and aldebaran, enable special SBR handling */ 3412 if (amdgpu_passthrough(adev) && 3413 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 3414 adev->asic_type == CHIP_ALDEBARAN)) 3415 amdgpu_dpm_handle_passthrough_sbr(adev, true); 3416 3417 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3418 mutex_lock(&mgpu_info.mutex); 3419 3420 /* 3421 * Reset the device p-state to low, as it was booted with high. 3422 * 3423 * This should be performed only after all devices from the same 3424 * hive get initialized. 3425 * 3426 * However, it's not known in advance how many devices are in the 3427 * hive; they are counted one by one as each device initializes. 3428 * 3429 * So we wait for all XGMI interlinked devices to be initialized. 3430 * This may bring some delays as those devices may come from 3431 * different hives. But that should be OK.
3432 */ 3433 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3434 for (i = 0; i < mgpu_info.num_gpu; i++) { 3435 gpu_instance = &(mgpu_info.gpu_ins[i]); 3436 if (gpu_instance->adev->flags & AMD_IS_APU) 3437 continue; 3438 3439 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3440 AMDGPU_XGMI_PSTATE_MIN); 3441 if (r) { 3442 DRM_ERROR("pstate setting failed (%d).\n", r); 3443 break; 3444 } 3445 } 3446 } 3447 3448 mutex_unlock(&mgpu_info.mutex); 3449 } 3450 3451 return 0; 3452 } 3453 3454 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block) 3455 { 3456 int r; 3457 3458 if (!ip_block->version->funcs->hw_fini) { 3459 DRM_ERROR("hw_fini of IP block <%s> not defined\n", 3460 ip_block->version->funcs->name); 3461 } else { 3462 r = ip_block->version->funcs->hw_fini(ip_block); 3463 /* XXX handle errors */ 3464 if (r) { 3465 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3466 ip_block->version->funcs->name, r); 3467 } 3468 } 3469 3470 ip_block->status.hw = false; 3471 } 3472 3473 /** 3474 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3475 * 3476 * @adev: amdgpu_device pointer 3477 * 3478 * For ASICs need to disable SMC first 3479 */ 3480 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3481 { 3482 int i; 3483 3484 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3485 return; 3486 3487 for (i = 0; i < adev->num_ip_blocks; i++) { 3488 if (!adev->ip_blocks[i].status.hw) 3489 continue; 3490 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3491 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3492 break; 3493 } 3494 } 3495 } 3496 3497 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3498 { 3499 int i, r; 3500 3501 for (i = 0; i < adev->num_ip_blocks; i++) { 3502 if (!adev->ip_blocks[i].version->funcs->early_fini) 3503 continue; 3504 3505 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]); 3506 if (r) { 3507 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 3508 adev->ip_blocks[i].version->funcs->name, r); 3509 } 3510 } 3511 3512 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3513 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3514 3515 amdgpu_amdkfd_suspend(adev, false); 3516 amdgpu_userq_suspend(adev); 3517 3518 /* Workaround for ASICs need to disable SMC first */ 3519 amdgpu_device_smu_fini_early(adev); 3520 3521 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3522 if (!adev->ip_blocks[i].status.hw) 3523 continue; 3524 3525 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3526 } 3527 3528 if (amdgpu_sriov_vf(adev)) { 3529 if (amdgpu_virt_release_full_gpu(adev, false)) 3530 DRM_ERROR("failed to release exclusive mode on fini\n"); 3531 } 3532 3533 return 0; 3534 } 3535 3536 /** 3537 * amdgpu_device_ip_fini - run fini for hardware IPs 3538 * 3539 * @adev: amdgpu_device pointer 3540 * 3541 * Main teardown pass for hardware IPs. The list of all the hardware 3542 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3543 * are run. hw_fini tears down the hardware associated with each IP 3544 * and sw_fini tears down any software state associated with each IP. 3545 * Returns 0 on success, negative error code on failure. 
3546 */ 3547 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3548 { 3549 int i, r; 3550 3551 amdgpu_cper_fini(adev); 3552 3553 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3554 amdgpu_virt_release_ras_err_handler_data(adev); 3555 3556 if (adev->gmc.xgmi.num_physical_nodes > 1) 3557 amdgpu_xgmi_remove_device(adev); 3558 3559 amdgpu_amdkfd_device_fini_sw(adev); 3560 3561 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3562 if (!adev->ip_blocks[i].status.sw) 3563 continue; 3564 3565 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3566 amdgpu_ucode_free_bo(adev); 3567 amdgpu_free_static_csa(&adev->virt.csa_obj); 3568 amdgpu_device_wb_fini(adev); 3569 amdgpu_device_mem_scratch_fini(adev); 3570 amdgpu_ib_pool_fini(adev); 3571 amdgpu_seq64_fini(adev); 3572 amdgpu_doorbell_fini(adev); 3573 } 3574 if (adev->ip_blocks[i].version->funcs->sw_fini) { 3575 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]); 3576 /* XXX handle errors */ 3577 if (r) { 3578 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 3579 adev->ip_blocks[i].version->funcs->name, r); 3580 } 3581 } 3582 adev->ip_blocks[i].status.sw = false; 3583 adev->ip_blocks[i].status.valid = false; 3584 } 3585 3586 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3587 if (!adev->ip_blocks[i].status.late_initialized) 3588 continue; 3589 if (adev->ip_blocks[i].version->funcs->late_fini) 3590 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]); 3591 adev->ip_blocks[i].status.late_initialized = false; 3592 } 3593 3594 amdgpu_ras_fini(adev); 3595 3596 return 0; 3597 } 3598 3599 /** 3600 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3601 * 3602 * @work: work_struct. 3603 */ 3604 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3605 { 3606 struct amdgpu_device *adev = 3607 container_of(work, struct amdgpu_device, delayed_init_work.work); 3608 int r; 3609 3610 r = amdgpu_ib_ring_tests(adev); 3611 if (r) 3612 DRM_ERROR("ib ring test failed (%d).\n", r); 3613 } 3614 3615 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3616 { 3617 struct amdgpu_device *adev = 3618 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3619 3620 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3621 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3622 3623 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true, 0)) 3624 adev->gfx.gfx_off_state = true; 3625 } 3626 3627 /** 3628 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3629 * 3630 * @adev: amdgpu_device pointer 3631 * 3632 * Main suspend function for hardware IPs. The list of all the hardware 3633 * IPs that make up the asic is walked, clockgating is disabled and the 3634 * suspend callbacks are run. suspend puts the hardware and software state 3635 * in each IP into a state suitable for suspend. 3636 * Returns 0 on success, negative error code on failure. 3637 */ 3638 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3639 { 3640 int i, r; 3641 3642 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3643 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3644 3645 /* 3646 * Per PMFW team's suggestion, driver needs to handle gfxoff 3647 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3648 * scenario. Add the missing df cstate disablement here. 
3649 */ 3650 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3651 dev_warn(adev->dev, "Failed to disallow df cstate"); 3652 3653 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3654 if (!adev->ip_blocks[i].status.valid) 3655 continue; 3656 3657 /* displays are handled separately */ 3658 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3659 continue; 3660 3661 /* XXX handle errors */ 3662 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3663 if (r) 3664 return r; 3665 } 3666 3667 return 0; 3668 } 3669 3670 /** 3671 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3672 * 3673 * @adev: amdgpu_device pointer 3674 * 3675 * Main suspend function for hardware IPs. The list of all the hardware 3676 * IPs that make up the asic is walked, clockgating is disabled and the 3677 * suspend callbacks are run. suspend puts the hardware and software state 3678 * in each IP into a state suitable for suspend. 3679 * Returns 0 on success, negative error code on failure. 3680 */ 3681 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3682 { 3683 int i, r; 3684 3685 if (adev->in_s0ix) 3686 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3687 3688 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3689 if (!adev->ip_blocks[i].status.valid) 3690 continue; 3691 /* displays are handled in phase1 */ 3692 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3693 continue; 3694 /* PSP lost connection when err_event_athub occurs */ 3695 if (amdgpu_ras_intr_triggered() && 3696 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3697 adev->ip_blocks[i].status.hw = false; 3698 continue; 3699 } 3700 3701 /* skip unnecessary suspend if we do not initialize them yet */ 3702 if (!amdgpu_ip_member_of_hwini( 3703 adev, adev->ip_blocks[i].version->type)) 3704 continue; 3705 3706 /* Since we skip suspend for S0i3, we need to cancel the delayed 3707 * idle work here as the suspend callback never gets called. 3708 */ 3709 if (adev->in_s0ix && 3710 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX && 3711 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 0, 0)) 3712 cancel_delayed_work_sync(&adev->gfx.idle_work); 3713 /* skip suspend of gfx/mes and psp for S0ix 3714 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3715 * like at runtime. PSP is also part of the always on hardware 3716 * so no need to suspend it. 3717 */ 3718 if (adev->in_s0ix && 3719 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3720 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3721 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3722 continue; 3723 3724 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3725 if (adev->in_s0ix && 3726 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3727 IP_VERSION(5, 0, 0)) && 3728 (adev->ip_blocks[i].version->type == 3729 AMD_IP_BLOCK_TYPE_SDMA)) 3730 continue; 3731 3732 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3733 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3734 * from this location and RLC Autoload automatically also gets loaded 3735 * from here based on PMFW -> PSP message during re-init sequence. 3736 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3737 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3738 */ 3739 if (amdgpu_in_reset(adev) && 3740 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3741 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3742 continue; 3743 3744 /* XXX handle errors */ 3745 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3746 adev->ip_blocks[i].status.hw = false; 3747 3748 /* handle putting the SMC in the appropriate state */ 3749 if (!amdgpu_sriov_vf(adev)) { 3750 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3751 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3752 if (r) { 3753 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3754 adev->mp1_state, r); 3755 return r; 3756 } 3757 } 3758 } 3759 } 3760 3761 return 0; 3762 } 3763 3764 /** 3765 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3766 * 3767 * @adev: amdgpu_device pointer 3768 * 3769 * Main suspend function for hardware IPs. The list of all the hardware 3770 * IPs that make up the asic is walked, clockgating is disabled and the 3771 * suspend callbacks are run. suspend puts the hardware and software state 3772 * in each IP into a state suitable for suspend. 3773 * Returns 0 on success, negative error code on failure. 3774 */ 3775 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3776 { 3777 int r; 3778 3779 if (amdgpu_sriov_vf(adev)) { 3780 amdgpu_virt_fini_data_exchange(adev); 3781 amdgpu_virt_request_full_gpu(adev, false); 3782 } 3783 3784 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3785 3786 r = amdgpu_device_ip_suspend_phase1(adev); 3787 if (r) 3788 return r; 3789 r = amdgpu_device_ip_suspend_phase2(adev); 3790 3791 if (amdgpu_sriov_vf(adev)) 3792 amdgpu_virt_release_full_gpu(adev, false); 3793 3794 return r; 3795 } 3796 3797 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3798 { 3799 int i, r; 3800 3801 static enum amd_ip_block_type ip_order[] = { 3802 AMD_IP_BLOCK_TYPE_COMMON, 3803 AMD_IP_BLOCK_TYPE_GMC, 3804 AMD_IP_BLOCK_TYPE_PSP, 3805 AMD_IP_BLOCK_TYPE_IH, 3806 }; 3807 3808 for (i = 0; i < adev->num_ip_blocks; i++) { 3809 int j; 3810 struct amdgpu_ip_block *block; 3811 3812 block = &adev->ip_blocks[i]; 3813 block->status.hw = false; 3814 3815 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3816 3817 if (block->version->type != ip_order[j] || 3818 !block->status.valid) 3819 continue; 3820 3821 r = block->version->funcs->hw_init(&adev->ip_blocks[i]); 3822 if (r) { 3823 dev_err(adev->dev, "RE-INIT-early: %s failed\n", 3824 block->version->funcs->name); 3825 return r; 3826 } 3827 block->status.hw = true; 3828 } 3829 } 3830 3831 return 0; 3832 } 3833 3834 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3835 { 3836 struct amdgpu_ip_block *block; 3837 int i, r = 0; 3838 3839 static enum amd_ip_block_type ip_order[] = { 3840 AMD_IP_BLOCK_TYPE_SMC, 3841 AMD_IP_BLOCK_TYPE_DCE, 3842 AMD_IP_BLOCK_TYPE_GFX, 3843 AMD_IP_BLOCK_TYPE_SDMA, 3844 AMD_IP_BLOCK_TYPE_MES, 3845 AMD_IP_BLOCK_TYPE_UVD, 3846 AMD_IP_BLOCK_TYPE_VCE, 3847 AMD_IP_BLOCK_TYPE_VCN, 3848 AMD_IP_BLOCK_TYPE_JPEG 3849 }; 3850 3851 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3852 block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]); 3853 3854 if (!block) 3855 continue; 3856 3857 if (block->status.valid && !block->status.hw) { 3858 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) { 3859 r = amdgpu_ip_block_resume(block); 3860 } else { 3861 r = block->version->funcs->hw_init(block); 3862 } 3863 3864 if (r) { 3865 dev_err(adev->dev, "RE-INIT-late: %s failed\n", 3866 block->version->funcs->name); 3867 break; 3868 } 3869 
block->status.hw = true; 3870 } 3871 } 3872 3873 return r; 3874 } 3875 3876 /** 3877 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3878 * 3879 * @adev: amdgpu_device pointer 3880 * 3881 * First resume function for hardware IPs. The list of all the hardware 3882 * IPs that make up the asic is walked and the resume callbacks are run for 3883 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3884 * after a suspend and updates the software state as necessary. This 3885 * function is also used for restoring the GPU after a GPU reset. 3886 * Returns 0 on success, negative error code on failure. 3887 */ 3888 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3889 { 3890 int i, r; 3891 3892 for (i = 0; i < adev->num_ip_blocks; i++) { 3893 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3894 continue; 3895 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3896 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3897 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3898 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3899 3900 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3901 if (r) 3902 return r; 3903 } 3904 } 3905 3906 return 0; 3907 } 3908 3909 /** 3910 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3911 * 3912 * @adev: amdgpu_device pointer 3913 * 3914 * Second resume function for hardware IPs. The list of all the hardware 3915 * IPs that make up the asic is walked and the resume callbacks are run for 3916 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3917 * functional state after a suspend and updates the software state as 3918 * necessary. This function is also used for restoring the GPU after a GPU 3919 * reset. 3920 * Returns 0 on success, negative error code on failure. 3921 */ 3922 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3923 { 3924 int i, r; 3925 3926 for (i = 0; i < adev->num_ip_blocks; i++) { 3927 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3928 continue; 3929 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3930 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3931 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3932 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE || 3933 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3934 continue; 3935 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3936 if (r) 3937 return r; 3938 } 3939 3940 return 0; 3941 } 3942 3943 /** 3944 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs 3945 * 3946 * @adev: amdgpu_device pointer 3947 * 3948 * Third resume function for hardware IPs. The list of all the hardware 3949 * IPs that make up the asic is walked and the resume callbacks are run for 3950 * all DCE. resume puts the hardware into a functional state after a suspend 3951 * and updates the software state as necessary. This function is also used 3952 * for restoring the GPU after a GPU reset. 3953 * 3954 * Returns 0 on success, negative error code on failure. 
3955 */ 3956 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev) 3957 { 3958 int i, r; 3959 3960 for (i = 0; i < adev->num_ip_blocks; i++) { 3961 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3962 continue; 3963 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) { 3964 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3965 if (r) 3966 return r; 3967 } 3968 } 3969 3970 return 0; 3971 } 3972 3973 /** 3974 * amdgpu_device_ip_resume - run resume for hardware IPs 3975 * 3976 * @adev: amdgpu_device pointer 3977 * 3978 * Main resume function for hardware IPs. The hardware IPs 3979 * are split into two resume functions because they are 3980 * also used in recovering from a GPU reset and some additional 3981 * steps need to be take between them. In this case (S3/S4) they are 3982 * run sequentially. 3983 * Returns 0 on success, negative error code on failure. 3984 */ 3985 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3986 { 3987 int r; 3988 3989 r = amdgpu_device_ip_resume_phase1(adev); 3990 if (r) 3991 return r; 3992 3993 r = amdgpu_device_fw_loading(adev); 3994 if (r) 3995 return r; 3996 3997 r = amdgpu_device_ip_resume_phase2(adev); 3998 3999 if (adev->mman.buffer_funcs_ring->sched.ready) 4000 amdgpu_ttm_set_buffer_funcs_status(adev, true); 4001 4002 if (r) 4003 return r; 4004 4005 amdgpu_fence_driver_hw_init(adev); 4006 4007 r = amdgpu_device_ip_resume_phase3(adev); 4008 4009 return r; 4010 } 4011 4012 /** 4013 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 4014 * 4015 * @adev: amdgpu_device pointer 4016 * 4017 * Query the VBIOS data tables to determine if the board supports SR-IOV. 4018 */ 4019 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 4020 { 4021 if (amdgpu_sriov_vf(adev)) { 4022 if (adev->is_atom_fw) { 4023 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 4024 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 4025 } else { 4026 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 4027 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 4028 } 4029 4030 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 4031 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 4032 } 4033 } 4034 4035 /** 4036 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 4037 * 4038 * @asic_type: AMD asic type 4039 * 4040 * Check if there is DC (new modesetting infrastructre) support for an asic. 4041 * returns true if DC has support, false if not. 4042 */ 4043 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 4044 { 4045 switch (asic_type) { 4046 #ifdef CONFIG_DRM_AMDGPU_SI 4047 case CHIP_HAINAN: 4048 #endif 4049 case CHIP_TOPAZ: 4050 /* chips with no display hardware */ 4051 return false; 4052 #if defined(CONFIG_DRM_AMD_DC) 4053 case CHIP_TAHITI: 4054 case CHIP_PITCAIRN: 4055 case CHIP_VERDE: 4056 case CHIP_OLAND: 4057 /* 4058 * We have systems in the wild with these ASICs that require 4059 * LVDS and VGA support which is not supported with DC. 4060 * 4061 * Fallback to the non-DC driver here by default so as not to 4062 * cause regressions. 4063 */ 4064 #if defined(CONFIG_DRM_AMD_DC_SI) 4065 return amdgpu_dc > 0; 4066 #else 4067 return false; 4068 #endif 4069 case CHIP_BONAIRE: 4070 case CHIP_KAVERI: 4071 case CHIP_KABINI: 4072 case CHIP_MULLINS: 4073 /* 4074 * We have systems in the wild with these ASICs that require 4075 * VGA support which is not supported with DC. 
4076 * 4077 * Fallback to the non-DC driver here by default so as not to 4078 * cause regressions. 4079 */ 4080 return amdgpu_dc > 0; 4081 default: 4082 return amdgpu_dc != 0; 4083 #else 4084 default: 4085 if (amdgpu_dc > 0) 4086 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 4087 return false; 4088 #endif 4089 } 4090 } 4091 4092 /** 4093 * amdgpu_device_has_dc_support - check if dc is supported 4094 * 4095 * @adev: amdgpu_device pointer 4096 * 4097 * Returns true for supported, false for not supported 4098 */ 4099 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 4100 { 4101 if (adev->enable_virtual_display || 4102 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 4103 return false; 4104 4105 return amdgpu_device_asic_has_dc_support(adev->asic_type); 4106 } 4107 4108 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 4109 { 4110 struct amdgpu_device *adev = 4111 container_of(__work, struct amdgpu_device, xgmi_reset_work); 4112 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 4113 4114 /* It's a bug to not have a hive within this function */ 4115 if (WARN_ON(!hive)) 4116 return; 4117 4118 /* 4119 * Use task barrier to synchronize all xgmi reset works across the 4120 * hive. task_barrier_enter and task_barrier_exit will block 4121 * until all the threads running the xgmi reset works reach 4122 * those points. task_barrier_full will do both blocks. 4123 */ 4124 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 4125 4126 task_barrier_enter(&hive->tb); 4127 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 4128 4129 if (adev->asic_reset_res) 4130 goto fail; 4131 4132 task_barrier_exit(&hive->tb); 4133 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 4134 4135 if (adev->asic_reset_res) 4136 goto fail; 4137 4138 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 4139 } else { 4140 4141 task_barrier_full(&hive->tb); 4142 adev->asic_reset_res = amdgpu_asic_reset(adev); 4143 } 4144 4145 fail: 4146 if (adev->asic_reset_res) 4147 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 4148 adev->asic_reset_res, adev_to_drm(adev)->unique); 4149 amdgpu_put_xgmi_hive(hive); 4150 } 4151 4152 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 4153 { 4154 char *input = amdgpu_lockup_timeout; 4155 char *timeout_setting = NULL; 4156 int index = 0; 4157 long timeout; 4158 int ret = 0; 4159 4160 /* 4161 * By default timeout for non compute jobs is 10000 4162 * and 60000 for compute jobs. 4163 * In SR-IOV or passthrough mode, timeout for compute 4164 * jobs are 60000 by default. 4165 */ 4166 adev->gfx_timeout = msecs_to_jiffies(10000); 4167 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4168 if (amdgpu_sriov_vf(adev)) 4169 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
4170 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 4171 else 4172 adev->compute_timeout = msecs_to_jiffies(60000); 4173 4174 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4175 while ((timeout_setting = strsep(&input, ",")) && 4176 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4177 ret = kstrtol(timeout_setting, 0, &timeout); 4178 if (ret) 4179 return ret; 4180 4181 if (timeout == 0) { 4182 index++; 4183 continue; 4184 } else if (timeout < 0) { 4185 timeout = MAX_SCHEDULE_TIMEOUT; 4186 dev_warn(adev->dev, "lockup timeout disabled"); 4187 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 4188 } else { 4189 timeout = msecs_to_jiffies(timeout); 4190 } 4191 4192 switch (index++) { 4193 case 0: 4194 adev->gfx_timeout = timeout; 4195 break; 4196 case 1: 4197 adev->compute_timeout = timeout; 4198 break; 4199 case 2: 4200 adev->sdma_timeout = timeout; 4201 break; 4202 case 3: 4203 adev->video_timeout = timeout; 4204 break; 4205 default: 4206 break; 4207 } 4208 } 4209 /* 4210 * There is only one value specified and 4211 * it should apply to all non-compute jobs. 4212 */ 4213 if (index == 1) { 4214 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4215 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 4216 adev->compute_timeout = adev->gfx_timeout; 4217 } 4218 } 4219 4220 return ret; 4221 } 4222 4223 /** 4224 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 4225 * 4226 * @adev: amdgpu_device pointer 4227 * 4228 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 4229 */ 4230 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 4231 { 4232 struct iommu_domain *domain; 4233 4234 domain = iommu_get_domain_for_dev(adev->dev); 4235 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 4236 adev->ram_is_direct_mapped = true; 4237 } 4238 4239 #if defined(CONFIG_HSA_AMD_P2P) 4240 /** 4241 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled. 4242 * 4243 * @adev: amdgpu_device pointer 4244 * 4245 * return if IOMMU remapping bar address 4246 */ 4247 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev) 4248 { 4249 struct iommu_domain *domain; 4250 4251 domain = iommu_get_domain_for_dev(adev->dev); 4252 if (domain && (domain->type == IOMMU_DOMAIN_DMA || 4253 domain->type == IOMMU_DOMAIN_DMA_FQ)) 4254 return true; 4255 4256 return false; 4257 } 4258 #endif 4259 4260 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 4261 { 4262 if (amdgpu_mcbp == 1) 4263 adev->gfx.mcbp = true; 4264 else if (amdgpu_mcbp == 0) 4265 adev->gfx.mcbp = false; 4266 4267 if (amdgpu_sriov_vf(adev)) 4268 adev->gfx.mcbp = true; 4269 4270 if (adev->gfx.mcbp) 4271 DRM_INFO("MCBP is enabled\n"); 4272 } 4273 4274 /** 4275 * amdgpu_device_init - initialize the driver 4276 * 4277 * @adev: amdgpu_device pointer 4278 * @flags: driver flags 4279 * 4280 * Initializes the driver info and hw (all asics). 4281 * Returns 0 for success or an error on failure. 4282 * Called at driver startup. 
4283 */ 4284 int amdgpu_device_init(struct amdgpu_device *adev, 4285 uint32_t flags) 4286 { 4287 struct drm_device *ddev = adev_to_drm(adev); 4288 struct pci_dev *pdev = adev->pdev; 4289 int r, i; 4290 bool px = false; 4291 u32 max_MBps; 4292 int tmp; 4293 4294 adev->shutdown = false; 4295 adev->flags = flags; 4296 4297 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 4298 adev->asic_type = amdgpu_force_asic_type; 4299 else 4300 adev->asic_type = flags & AMD_ASIC_MASK; 4301 4302 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 4303 if (amdgpu_emu_mode == 1) 4304 adev->usec_timeout *= 10; 4305 adev->gmc.gart_size = 512 * 1024 * 1024; 4306 adev->accel_working = false; 4307 adev->num_rings = 0; 4308 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 4309 adev->mman.buffer_funcs = NULL; 4310 adev->mman.buffer_funcs_ring = NULL; 4311 adev->vm_manager.vm_pte_funcs = NULL; 4312 adev->vm_manager.vm_pte_num_scheds = 0; 4313 adev->gmc.gmc_funcs = NULL; 4314 adev->harvest_ip_mask = 0x0; 4315 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 4316 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 4317 4318 adev->smc_rreg = &amdgpu_invalid_rreg; 4319 adev->smc_wreg = &amdgpu_invalid_wreg; 4320 adev->pcie_rreg = &amdgpu_invalid_rreg; 4321 adev->pcie_wreg = &amdgpu_invalid_wreg; 4322 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 4323 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 4324 adev->pciep_rreg = &amdgpu_invalid_rreg; 4325 adev->pciep_wreg = &amdgpu_invalid_wreg; 4326 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 4327 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 4328 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 4329 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 4330 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 4331 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 4332 adev->didt_rreg = &amdgpu_invalid_rreg; 4333 adev->didt_wreg = &amdgpu_invalid_wreg; 4334 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 4335 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 4336 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 4337 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 4338 4339 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 4340 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 4341 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 4342 4343 /* mutex initialization are all done here so we 4344 * can recall function without having locking issues 4345 */ 4346 mutex_init(&adev->firmware.mutex); 4347 mutex_init(&adev->pm.mutex); 4348 mutex_init(&adev->gfx.gpu_clock_mutex); 4349 mutex_init(&adev->srbm_mutex); 4350 mutex_init(&adev->gfx.pipe_reserve_mutex); 4351 mutex_init(&adev->gfx.gfx_off_mutex); 4352 mutex_init(&adev->gfx.partition_mutex); 4353 mutex_init(&adev->grbm_idx_mutex); 4354 mutex_init(&adev->mn_lock); 4355 mutex_init(&adev->virt.vf_errors.lock); 4356 hash_init(adev->mn_hash); 4357 mutex_init(&adev->psp.mutex); 4358 mutex_init(&adev->notifier_lock); 4359 mutex_init(&adev->pm.stable_pstate_ctx_lock); 4360 mutex_init(&adev->benchmark_mutex); 4361 mutex_init(&adev->gfx.reset_sem_mutex); 4362 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */ 4363 mutex_init(&adev->enforce_isolation_mutex); 4364 for (i = 0; i < MAX_XCP; ++i) { 4365 adev->isolation[i].spearhead = dma_fence_get_stub(); 4366 amdgpu_sync_create(&adev->isolation[i].active); 4367 amdgpu_sync_create(&adev->isolation[i].prev); 4368 } 4369 
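/* The loop above seeds each XCP's isolation state with a stub spearhead fence and empty active/prev sync containers. */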
mutex_init(&adev->gfx.userq_sch_mutex); 4370 mutex_init(&adev->gfx.workload_profile_mutex); 4371 mutex_init(&adev->vcn.workload_profile_mutex); 4372 mutex_init(&adev->userq_mutex); 4373 4374 amdgpu_device_init_apu_flags(adev); 4375 4376 r = amdgpu_device_check_arguments(adev); 4377 if (r) 4378 return r; 4379 4380 spin_lock_init(&adev->mmio_idx_lock); 4381 spin_lock_init(&adev->smc_idx_lock); 4382 spin_lock_init(&adev->pcie_idx_lock); 4383 spin_lock_init(&adev->uvd_ctx_idx_lock); 4384 spin_lock_init(&adev->didt_idx_lock); 4385 spin_lock_init(&adev->gc_cac_idx_lock); 4386 spin_lock_init(&adev->se_cac_idx_lock); 4387 spin_lock_init(&adev->audio_endpt_idx_lock); 4388 spin_lock_init(&adev->mm_stats.lock); 4389 spin_lock_init(&adev->virt.rlcg_reg_lock); 4390 spin_lock_init(&adev->wb.lock); 4391 4392 xa_init_flags(&adev->userq_xa, XA_FLAGS_LOCK_IRQ); 4393 4394 INIT_LIST_HEAD(&adev->reset_list); 4395 4396 INIT_LIST_HEAD(&adev->ras_list); 4397 4398 INIT_LIST_HEAD(&adev->pm.od_kobj_list); 4399 4400 INIT_LIST_HEAD(&adev->userq_mgr_list); 4401 4402 INIT_DELAYED_WORK(&adev->delayed_init_work, 4403 amdgpu_device_delayed_init_work_handler); 4404 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 4405 amdgpu_device_delay_enable_gfx_off); 4406 /* 4407 * Initialize the enforce_isolation work structures for each XCP 4408 * partition. This work handler is responsible for enforcing shader 4409 * isolation on AMD GPUs. It counts the number of emitted fences for 4410 * each GFX and compute ring. If there are any fences, it schedules 4411 * the `enforce_isolation_work` to be run after a delay. If there are 4412 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the 4413 * runqueue. 4414 */ 4415 for (i = 0; i < MAX_XCP; i++) { 4416 INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work, 4417 amdgpu_gfx_enforce_isolation_handler); 4418 adev->gfx.enforce_isolation[i].adev = adev; 4419 adev->gfx.enforce_isolation[i].xcp_id = i; 4420 } 4421 4422 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 4423 4424 adev->gfx.gfx_off_req_count = 1; 4425 adev->gfx.gfx_off_residency = 0; 4426 adev->gfx.gfx_off_entrycount = 0; 4427 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 4428 4429 atomic_set(&adev->throttling_logging_enabled, 1); 4430 /* 4431 * If throttling continues, logging will be performed every minute 4432 * to avoid log flooding. "-1" is subtracted since the thermal 4433 * throttling interrupt comes every second. Thus, the total logging 4434 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting 4435 * for throttling interrupt) = 60 seconds.
4436 */ 4437 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 4438 4439 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 4440 4441 /* Registers mapping */ 4442 /* TODO: block userspace mapping of io register */ 4443 if (adev->asic_type >= CHIP_BONAIRE) { 4444 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 4445 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 4446 } else { 4447 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 4448 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 4449 } 4450 4451 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 4452 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 4453 4454 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 4455 if (!adev->rmmio) 4456 return -ENOMEM; 4457 4458 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 4459 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size); 4460 4461 /* 4462 * Reset domain needs to be present early, before the XGMI hive is 4463 * discovered (if any) and initialized, to use the reset sem and in_gpu reset flag 4464 * early on during init and before calling RREG32. 4465 */ 4466 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 4467 if (!adev->reset_domain) 4468 return -ENOMEM; 4469 4470 /* detect hw virtualization here */ 4471 amdgpu_virt_init(adev); 4472 4473 amdgpu_device_get_pcie_info(adev); 4474 4475 r = amdgpu_device_get_job_timeout_settings(adev); 4476 if (r) { 4477 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 4478 return r; 4479 } 4480 4481 amdgpu_device_set_mcbp(adev); 4482 4483 /* 4484 * By default, use default mode where all blocks are expected to be 4485 * initialized. At present a 'swinit' of blocks is required to be 4486 * completed before the need for a different level is detected. 4487 */ 4488 amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT); 4489 /* early init functions */ 4490 r = amdgpu_device_ip_early_init(adev); 4491 if (r) 4492 return r; 4493 4494 /* 4495 * No need to remove conflicting FBs for non-display class devices. 4496 * This prevents the sysfb from being freed accidentally. 4497 */ 4498 if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA || 4499 (pdev->class >> 8) == PCI_CLASS_DISPLAY_OTHER) { 4500 /* Get rid of things like offb */ 4501 r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name); 4502 if (r) 4503 return r; 4504 } 4505 4506 /* Enable TMZ based on IP_VERSION */ 4507 amdgpu_gmc_tmz_set(adev); 4508 4509 if (amdgpu_sriov_vf(adev) && 4510 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0)) 4511 /* VF MMIO access (except mailbox range) from CPU 4512 * will be blocked during sriov runtime 4513 */ 4514 adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT; 4515 4516 amdgpu_gmc_noretry_set(adev); 4517 /* Need to get xgmi info early to decide the reset behavior */ 4518 if (adev->gmc.xgmi.supported) { 4519 r = adev->gfxhub.funcs->get_xgmi_info(adev); 4520 if (r) 4521 return r; 4522 } 4523 4524 /* enable PCIE atomic ops */ 4525 if (amdgpu_sriov_vf(adev)) { 4526 if (adev->virt.fw_reserve.p_pf2vf) 4527 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 4528 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 4529 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4530 /* APUs with gfx9 onwards don't rely on PCIe atomics; rather, an 4531 * internal path natively supports atomics, so set have_atomics_support to true.
4532 */ 4533 } else if ((adev->flags & AMD_IS_APU) && 4534 (amdgpu_ip_version(adev, GC_HWIP, 0) > 4535 IP_VERSION(9, 0, 0))) { 4536 adev->have_atomics_support = true; 4537 } else { 4538 adev->have_atomics_support = 4539 !pci_enable_atomic_ops_to_root(adev->pdev, 4540 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 4541 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4542 } 4543 4544 if (!adev->have_atomics_support) 4545 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 4546 4547 /* doorbell bar mapping and doorbell index init*/ 4548 amdgpu_doorbell_init(adev); 4549 4550 if (amdgpu_emu_mode == 1) { 4551 /* post the asic on emulation mode */ 4552 emu_soc_asic_init(adev); 4553 goto fence_driver_init; 4554 } 4555 4556 amdgpu_reset_init(adev); 4557 4558 /* detect if we are with an SRIOV vbios */ 4559 if (adev->bios) 4560 amdgpu_device_detect_sriov_bios(adev); 4561 4562 /* check if we need to reset the asic 4563 * E.g., driver was not cleanly unloaded previously, etc. 4564 */ 4565 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4566 if (adev->gmc.xgmi.num_physical_nodes) { 4567 dev_info(adev->dev, "Pending hive reset.\n"); 4568 amdgpu_set_init_level(adev, 4569 AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 4570 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && 4571 !amdgpu_device_has_display_hardware(adev)) { 4572 r = psp_gpu_reset(adev); 4573 } else { 4574 tmp = amdgpu_reset_method; 4575 /* It should do a default reset when loading or reloading the driver, 4576 * regardless of the module parameter reset_method. 4577 */ 4578 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4579 r = amdgpu_asic_reset(adev); 4580 amdgpu_reset_method = tmp; 4581 } 4582 4583 if (r) { 4584 dev_err(adev->dev, "asic reset on init failed\n"); 4585 goto failed; 4586 } 4587 } 4588 4589 /* Post card if necessary */ 4590 if (amdgpu_device_need_post(adev)) { 4591 if (!adev->bios) { 4592 dev_err(adev->dev, "no vBIOS found\n"); 4593 r = -EINVAL; 4594 goto failed; 4595 } 4596 DRM_INFO("GPU posting now...\n"); 4597 r = amdgpu_device_asic_init(adev); 4598 if (r) { 4599 dev_err(adev->dev, "gpu post error!\n"); 4600 goto failed; 4601 } 4602 } 4603 4604 if (adev->bios) { 4605 if (adev->is_atom_fw) { 4606 /* Initialize clocks */ 4607 r = amdgpu_atomfirmware_get_clock_info(adev); 4608 if (r) { 4609 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4610 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4611 goto failed; 4612 } 4613 } else { 4614 /* Initialize clocks */ 4615 r = amdgpu_atombios_get_clock_info(adev); 4616 if (r) { 4617 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4618 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4619 goto failed; 4620 } 4621 /* init i2c buses */ 4622 amdgpu_i2c_init(adev); 4623 } 4624 } 4625 4626 fence_driver_init: 4627 /* Fence driver */ 4628 r = amdgpu_fence_driver_sw_init(adev); 4629 if (r) { 4630 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4631 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4632 goto failed; 4633 } 4634 4635 /* init the mode config */ 4636 drm_mode_config_init(adev_to_drm(adev)); 4637 4638 r = amdgpu_device_ip_init(adev); 4639 if (r) { 4640 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4641 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4642 goto release_ras_con; 4643 } 4644 4645 amdgpu_fence_driver_hw_init(adev); 4646 4647 dev_info(adev->dev, 4648 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4649 
adev->gfx.config.max_shader_engines, 4650 adev->gfx.config.max_sh_per_se, 4651 adev->gfx.config.max_cu_per_sh, 4652 adev->gfx.cu_info.number); 4653 4654 adev->accel_working = true; 4655 4656 amdgpu_vm_check_compute_bug(adev); 4657 4658 /* Initialize the buffer migration limit. */ 4659 if (amdgpu_moverate >= 0) 4660 max_MBps = amdgpu_moverate; 4661 else 4662 max_MBps = 8; /* Allow 8 MB/s. */ 4663 /* Get a log2 for easy divisions. */ 4664 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4665 4666 /* 4667 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4668 * Otherwise the mgpu fan boost feature will be skipped because the 4669 * gpu instance count would be too low. 4670 */ 4671 amdgpu_register_gpu_instance(adev); 4672 4673 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4674 * explicit gating rather than handling it automatically. 4675 */ 4676 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 4677 r = amdgpu_device_ip_late_init(adev); 4678 if (r) { 4679 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4680 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4681 goto release_ras_con; 4682 } 4683 /* must succeed. */ 4684 amdgpu_ras_resume(adev); 4685 queue_delayed_work(system_wq, &adev->delayed_init_work, 4686 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4687 } 4688 4689 if (amdgpu_sriov_vf(adev)) { 4690 amdgpu_virt_release_full_gpu(adev, true); 4691 flush_delayed_work(&adev->delayed_init_work); 4692 } 4693 4694 /* 4695 * Place the sysfs registration after `late_init`, as some of the 4696 * operations performed in `late_init` might affect the creation of 4697 * the sysfs interfaces. 4698 */ 4699 r = amdgpu_atombios_sysfs_init(adev); 4700 if (r) 4701 drm_err(&adev->ddev, 4702 "registering atombios sysfs failed (%d).\n", r); 4703 4704 r = amdgpu_pm_sysfs_init(adev); 4705 if (r) 4706 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 4707 4708 r = amdgpu_ucode_sysfs_init(adev); 4709 if (r) { 4710 adev->ucode_sysfs_en = false; 4711 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 4712 } else 4713 adev->ucode_sysfs_en = true; 4714 4715 r = amdgpu_device_attr_sysfs_init(adev); 4716 if (r) 4717 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4718 4719 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4720 if (r) 4721 dev_err(adev->dev, 4722 "Could not create amdgpu board attributes\n"); 4723 4724 amdgpu_fru_sysfs_init(adev); 4725 amdgpu_reg_state_sysfs_init(adev); 4726 amdgpu_xcp_cfg_sysfs_init(adev); 4727 4728 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4729 r = amdgpu_pmu_init(adev); 4730 if (r) 4731 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4732 4733 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4734 if (amdgpu_device_cache_pci_state(adev->pdev)) 4735 pci_restore_state(pdev); 4736 4737 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4738 /* this will fail for cards that aren't VGA class devices, just 4739 * ignore it 4740 */ 4741 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4742 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4743 4744 px = amdgpu_device_supports_px(ddev); 4745 4746 if (px || (!dev_is_removable(&adev->pdev->dev) && 4747 apple_gmux_detect(NULL, NULL))) 4748 vga_switcheroo_register_client(adev->pdev, 4749 &amdgpu_switcheroo_ops, px); 4750 4751 if (px) 4752 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4753 4754 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 4755
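/* The XGMI hive reset was deferred during early init (minimal init level); perform it now that the required IP blocks are up. */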
amdgpu_xgmi_reset_on_init(adev); 4756 4757 amdgpu_device_check_iommu_direct_map(adev); 4758 4759 adev->pm_nb.notifier_call = amdgpu_device_pm_notifier; 4760 r = register_pm_notifier(&adev->pm_nb); 4761 if (r) 4762 goto failed; 4763 4764 return 0; 4765 4766 release_ras_con: 4767 if (amdgpu_sriov_vf(adev)) 4768 amdgpu_virt_release_full_gpu(adev, true); 4769 4770 /* failed in exclusive mode due to timeout */ 4771 if (amdgpu_sriov_vf(adev) && 4772 !amdgpu_sriov_runtime(adev) && 4773 amdgpu_virt_mmio_blocked(adev) && 4774 !amdgpu_virt_wait_reset(adev)) { 4775 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4776 /* Don't send request since VF is inactive. */ 4777 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4778 adev->virt.ops = NULL; 4779 r = -EAGAIN; 4780 } 4781 amdgpu_release_ras_context(adev); 4782 4783 failed: 4784 amdgpu_vf_error_trans_all(adev); 4785 4786 return r; 4787 } 4788 4789 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4790 { 4791 4792 /* Clear all CPU mappings pointing to this device */ 4793 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4794 4795 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4796 amdgpu_doorbell_fini(adev); 4797 4798 iounmap(adev->rmmio); 4799 adev->rmmio = NULL; 4800 if (adev->mman.aper_base_kaddr) 4801 iounmap(adev->mman.aper_base_kaddr); 4802 adev->mman.aper_base_kaddr = NULL; 4803 4804 /* Memory manager related */ 4805 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4806 arch_phys_wc_del(adev->gmc.vram_mtrr); 4807 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4808 } 4809 } 4810 4811 /** 4812 * amdgpu_device_fini_hw - tear down the driver 4813 * 4814 * @adev: amdgpu_device pointer 4815 * 4816 * Tear down the driver info (all asics). 4817 * Called at driver shutdown. 
4818 */ 4819 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4820 { 4821 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4822 flush_delayed_work(&adev->delayed_init_work); 4823 4824 if (adev->mman.initialized) 4825 drain_workqueue(adev->mman.bdev.wq); 4826 adev->shutdown = true; 4827 4828 unregister_pm_notifier(&adev->pm_nb); 4829 4830 /* make sure IB test finished before entering exclusive mode 4831 * to avoid preemption on IB test 4832 */ 4833 if (amdgpu_sriov_vf(adev)) { 4834 amdgpu_virt_request_full_gpu(adev, false); 4835 amdgpu_virt_fini_data_exchange(adev); 4836 } 4837 4838 /* disable all interrupts */ 4839 amdgpu_irq_disable_all(adev); 4840 if (adev->mode_info.mode_config_initialized) { 4841 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4842 drm_helper_force_disable_all(adev_to_drm(adev)); 4843 else 4844 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4845 } 4846 amdgpu_fence_driver_hw_fini(adev); 4847 4848 if (adev->pm.sysfs_initialized) 4849 amdgpu_pm_sysfs_fini(adev); 4850 if (adev->ucode_sysfs_en) 4851 amdgpu_ucode_sysfs_fini(adev); 4852 amdgpu_device_attr_sysfs_fini(adev); 4853 amdgpu_fru_sysfs_fini(adev); 4854 4855 amdgpu_reg_state_sysfs_fini(adev); 4856 amdgpu_xcp_cfg_sysfs_fini(adev); 4857 4858 /* disable ras feature must before hw fini */ 4859 amdgpu_ras_pre_fini(adev); 4860 4861 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4862 4863 amdgpu_device_ip_fini_early(adev); 4864 4865 amdgpu_irq_fini_hw(adev); 4866 4867 if (adev->mman.initialized) 4868 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4869 4870 amdgpu_gart_dummy_page_fini(adev); 4871 4872 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4873 amdgpu_device_unmap_mmio(adev); 4874 4875 } 4876 4877 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4878 { 4879 int i, idx; 4880 bool px; 4881 4882 amdgpu_device_ip_fini(adev); 4883 amdgpu_fence_driver_sw_fini(adev); 4884 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4885 adev->accel_working = false; 4886 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4887 for (i = 0; i < MAX_XCP; ++i) { 4888 dma_fence_put(adev->isolation[i].spearhead); 4889 amdgpu_sync_free(&adev->isolation[i].active); 4890 amdgpu_sync_free(&adev->isolation[i].prev); 4891 } 4892 4893 amdgpu_reset_fini(adev); 4894 4895 /* free i2c buses */ 4896 amdgpu_i2c_fini(adev); 4897 4898 if (adev->bios) { 4899 if (amdgpu_emu_mode != 1) 4900 amdgpu_atombios_fini(adev); 4901 amdgpu_bios_release(adev); 4902 } 4903 4904 kfree(adev->fru_info); 4905 adev->fru_info = NULL; 4906 4907 kfree(adev->xcp_mgr); 4908 adev->xcp_mgr = NULL; 4909 4910 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4911 4912 if (px || (!dev_is_removable(&adev->pdev->dev) && 4913 apple_gmux_detect(NULL, NULL))) 4914 vga_switcheroo_unregister_client(adev->pdev); 4915 4916 if (px) 4917 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4918 4919 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4920 vga_client_unregister(adev->pdev); 4921 4922 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4923 4924 iounmap(adev->rmmio); 4925 adev->rmmio = NULL; 4926 drm_dev_exit(idx); 4927 } 4928 4929 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4930 amdgpu_pmu_fini(adev); 4931 if (adev->mman.discovery_bin) 4932 amdgpu_discovery_fini(adev); 4933 4934 amdgpu_reset_put_reset_domain(adev->reset_domain); 4935 adev->reset_domain = NULL; 4936 4937 kfree(adev->pci_state); 4938 4939 } 4940 4941 /** 4942 * amdgpu_device_evict_resources - evict device resources 4943 * @adev: amdgpu device object 4944 * 4945 * Evicts all ttm device 
resources(vram BOs, gart table) from the lru list 4946 * of the vram memory type. Mainly used for evicting device resources 4947 * at suspend time. 4948 * 4949 */ 4950 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4951 { 4952 int ret; 4953 4954 /* No need to evict vram on APUs unless going to S4 */ 4955 if (!adev->in_s4 && (adev->flags & AMD_IS_APU)) 4956 return 0; 4957 4958 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4959 if (ret) 4960 DRM_WARN("evicting device resources failed\n"); 4961 return ret; 4962 } 4963 4964 /* 4965 * Suspend & resume. 4966 */ 4967 /** 4968 * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events 4969 * @nb: notifier block 4970 * @mode: suspend mode 4971 * @data: data 4972 * 4973 * This function is called when the system is about to suspend or hibernate. 4974 * It is used to evict resources from the device before the system goes to 4975 * sleep while there is still access to swap. 4976 */ 4977 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 4978 void *data) 4979 { 4980 struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb); 4981 int r; 4982 4983 switch (mode) { 4984 case PM_HIBERNATION_PREPARE: 4985 adev->in_s4 = true; 4986 fallthrough; 4987 case PM_SUSPEND_PREPARE: 4988 r = amdgpu_device_evict_resources(adev); 4989 /* 4990 * This is considered non-fatal at this time because 4991 * amdgpu_device_prepare() will also fatally evict resources. 4992 * See https://gitlab.freedesktop.org/drm/amd/-/issues/3781 4993 */ 4994 if (r) 4995 drm_warn(adev_to_drm(adev), "Failed to evict resources, freeze active processes if problems occur: %d\n", r); 4996 break; 4997 } 4998 4999 return NOTIFY_DONE; 5000 } 5001 5002 /** 5003 * amdgpu_device_prepare - prepare for device suspend 5004 * 5005 * @dev: drm dev pointer 5006 * 5007 * Prepare to put the hw in the suspend state (all asics). 5008 * Returns 0 for success or an error on failure. 5009 * Called at driver suspend. 5010 */ 5011 int amdgpu_device_prepare(struct drm_device *dev) 5012 { 5013 struct amdgpu_device *adev = drm_to_adev(dev); 5014 int i, r; 5015 5016 amdgpu_choose_low_power_state(adev); 5017 5018 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5019 return 0; 5020 5021 /* Evict the majority of BOs before starting suspend sequence */ 5022 r = amdgpu_device_evict_resources(adev); 5023 if (r) 5024 goto unprepare; 5025 5026 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 5027 5028 for (i = 0; i < adev->num_ip_blocks; i++) { 5029 if (!adev->ip_blocks[i].status.valid) 5030 continue; 5031 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 5032 continue; 5033 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]); 5034 if (r) 5035 goto unprepare; 5036 } 5037 5038 return 0; 5039 5040 unprepare: 5041 adev->in_s0ix = adev->in_s3 = adev->in_s4 = false; 5042 5043 return r; 5044 } 5045 5046 /** 5047 * amdgpu_device_suspend - initiate device suspend 5048 * 5049 * @dev: drm dev pointer 5050 * @notify_clients: notify in-kernel DRM clients 5051 * 5052 * Puts the hw in the suspend state (all asics). 5053 * Returns 0 for success or an error on failure. 5054 * Called at driver suspend. 
5055 */ 5056 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) 5057 { 5058 struct amdgpu_device *adev = drm_to_adev(dev); 5059 int r = 0; 5060 5061 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5062 return 0; 5063 5064 adev->in_suspend = true; 5065 5066 if (amdgpu_sriov_vf(adev)) { 5067 amdgpu_virt_fini_data_exchange(adev); 5068 r = amdgpu_virt_request_full_gpu(adev, false); 5069 if (r) 5070 return r; 5071 } 5072 5073 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 5074 DRM_WARN("smart shift update failed\n"); 5075 5076 if (notify_clients) 5077 drm_client_dev_suspend(adev_to_drm(adev), false); 5078 5079 cancel_delayed_work_sync(&adev->delayed_init_work); 5080 5081 amdgpu_ras_suspend(adev); 5082 5083 amdgpu_device_ip_suspend_phase1(adev); 5084 5085 if (!adev->in_s0ix) { 5086 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 5087 amdgpu_userq_suspend(adev); 5088 } 5089 5090 r = amdgpu_device_evict_resources(adev); 5091 if (r) 5092 return r; 5093 5094 amdgpu_ttm_set_buffer_funcs_status(adev, false); 5095 5096 amdgpu_fence_driver_hw_fini(adev); 5097 5098 amdgpu_device_ip_suspend_phase2(adev); 5099 5100 if (amdgpu_sriov_vf(adev)) 5101 amdgpu_virt_release_full_gpu(adev, false); 5102 5103 r = amdgpu_dpm_notify_rlc_state(adev, false); 5104 if (r) 5105 return r; 5106 5107 return 0; 5108 } 5109 5110 /** 5111 * amdgpu_device_resume - initiate device resume 5112 * 5113 * @dev: drm dev pointer 5114 * @notify_clients: notify in-kernel DRM clients 5115 * 5116 * Bring the hw back to operating state (all asics). 5117 * Returns 0 for success or an error on failure. 5118 * Called at driver resume. 5119 */ 5120 int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) 5121 { 5122 struct amdgpu_device *adev = drm_to_adev(dev); 5123 int r = 0; 5124 5125 if (amdgpu_sriov_vf(adev)) { 5126 r = amdgpu_virt_request_full_gpu(adev, true); 5127 if (r) 5128 return r; 5129 } 5130 5131 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5132 return 0; 5133 5134 if (adev->in_s0ix) 5135 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 5136 5137 /* post card */ 5138 if (amdgpu_device_need_post(adev)) { 5139 r = amdgpu_device_asic_init(adev); 5140 if (r) 5141 dev_err(adev->dev, "amdgpu asic init failed\n"); 5142 } 5143 5144 r = amdgpu_device_ip_resume(adev); 5145 5146 if (r) { 5147 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 5148 goto exit; 5149 } 5150 5151 if (!adev->in_s0ix) { 5152 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 5153 if (r) 5154 goto exit; 5155 5156 r = amdgpu_userq_resume(adev); 5157 if (r) 5158 goto exit; 5159 } 5160 5161 r = amdgpu_device_ip_late_init(adev); 5162 if (r) 5163 goto exit; 5164 5165 queue_delayed_work(system_wq, &adev->delayed_init_work, 5166 msecs_to_jiffies(AMDGPU_RESUME_MS)); 5167 exit: 5168 if (amdgpu_sriov_vf(adev)) { 5169 amdgpu_virt_init_data_exchange(adev); 5170 amdgpu_virt_release_full_gpu(adev, true); 5171 } 5172 5173 if (r) 5174 return r; 5175 5176 /* Make sure IB tests flushed */ 5177 flush_delayed_work(&adev->delayed_init_work); 5178 5179 if (notify_clients) 5180 drm_client_dev_resume(adev_to_drm(adev), false); 5181 5182 amdgpu_ras_resume(adev); 5183 5184 if (adev->mode_info.num_crtc) { 5185 /* 5186 * Most of the connector probing functions try to acquire runtime pm 5187 * refs to ensure that the GPU is powered on when connector polling is 5188 * performed. Since we're calling this from a runtime PM callback, 5189 * trying to acquire rpm refs will cause us to deadlock. 
5190 * 5191 * Since we're guaranteed to be holding the rpm lock, it's safe to 5192 * temporarily disable the rpm helpers so this doesn't deadlock us. 5193 */ 5194 #ifdef CONFIG_PM 5195 dev->dev->power.disable_depth++; 5196 #endif 5197 if (!adev->dc_enabled) 5198 drm_helper_hpd_irq_event(dev); 5199 else 5200 drm_kms_helper_hotplug_event(dev); 5201 #ifdef CONFIG_PM 5202 dev->dev->power.disable_depth--; 5203 #endif 5204 } 5205 adev->in_suspend = false; 5206 5207 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 5208 DRM_WARN("smart shift update failed\n"); 5209 5210 return 0; 5211 } 5212 5213 /** 5214 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 5215 * 5216 * @adev: amdgpu_device pointer 5217 * 5218 * The list of all the hardware IPs that make up the asic is walked and 5219 * the check_soft_reset callbacks are run. check_soft_reset determines 5220 * if the asic is still hung or not. 5221 * Returns true if any of the IPs are still in a hung state, false if not. 5222 */ 5223 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 5224 { 5225 int i; 5226 bool asic_hang = false; 5227 5228 if (amdgpu_sriov_vf(adev)) 5229 return true; 5230 5231 if (amdgpu_asic_need_full_reset(adev)) 5232 return true; 5233 5234 for (i = 0; i < adev->num_ip_blocks; i++) { 5235 if (!adev->ip_blocks[i].status.valid) 5236 continue; 5237 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 5238 adev->ip_blocks[i].status.hang = 5239 adev->ip_blocks[i].version->funcs->check_soft_reset( 5240 &adev->ip_blocks[i]); 5241 if (adev->ip_blocks[i].status.hang) { 5242 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 5243 asic_hang = true; 5244 } 5245 } 5246 return asic_hang; 5247 } 5248 5249 /** 5250 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 5251 * 5252 * @adev: amdgpu_device pointer 5253 * 5254 * The list of all the hardware IPs that make up the asic is walked and the 5255 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 5256 * handles any IP specific hardware or software state changes that are 5257 * necessary for a soft reset to succeed. 5258 * Returns 0 on success, negative error code on failure. 5259 */ 5260 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 5261 { 5262 int i, r = 0; 5263 5264 for (i = 0; i < adev->num_ip_blocks; i++) { 5265 if (!adev->ip_blocks[i].status.valid) 5266 continue; 5267 if (adev->ip_blocks[i].status.hang && 5268 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 5269 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]); 5270 if (r) 5271 return r; 5272 } 5273 } 5274 5275 return 0; 5276 } 5277 5278 /** 5279 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 5280 * 5281 * @adev: amdgpu_device pointer 5282 * 5283 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 5284 * reset is necessary to recover. 5285 * Returns true if a full asic reset is required, false if not. 
5286 */ 5287 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 5288 { 5289 int i; 5290 5291 if (amdgpu_asic_need_full_reset(adev)) 5292 return true; 5293 5294 for (i = 0; i < adev->num_ip_blocks; i++) { 5295 if (!adev->ip_blocks[i].status.valid) 5296 continue; 5297 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 5298 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 5299 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 5300 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 5301 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 5302 if (adev->ip_blocks[i].status.hang) { 5303 dev_info(adev->dev, "Some block need full reset!\n"); 5304 return true; 5305 } 5306 } 5307 } 5308 return false; 5309 } 5310 5311 /** 5312 * amdgpu_device_ip_soft_reset - do a soft reset 5313 * 5314 * @adev: amdgpu_device pointer 5315 * 5316 * The list of all the hardware IPs that make up the asic is walked and the 5317 * soft_reset callbacks are run if the block is hung. soft_reset handles any 5318 * IP specific hardware or software state changes that are necessary to soft 5319 * reset the IP. 5320 * Returns 0 on success, negative error code on failure. 5321 */ 5322 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 5323 { 5324 int i, r = 0; 5325 5326 for (i = 0; i < adev->num_ip_blocks; i++) { 5327 if (!adev->ip_blocks[i].status.valid) 5328 continue; 5329 if (adev->ip_blocks[i].status.hang && 5330 adev->ip_blocks[i].version->funcs->soft_reset) { 5331 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]); 5332 if (r) 5333 return r; 5334 } 5335 } 5336 5337 return 0; 5338 } 5339 5340 /** 5341 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 5342 * 5343 * @adev: amdgpu_device pointer 5344 * 5345 * The list of all the hardware IPs that make up the asic is walked and the 5346 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 5347 * handles any IP specific hardware or software state changes that are 5348 * necessary after the IP has been soft reset. 5349 * Returns 0 on success, negative error code on failure. 
5350 */ 5351 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 5352 { 5353 int i, r = 0; 5354 5355 for (i = 0; i < adev->num_ip_blocks; i++) { 5356 if (!adev->ip_blocks[i].status.valid) 5357 continue; 5358 if (adev->ip_blocks[i].status.hang && 5359 adev->ip_blocks[i].version->funcs->post_soft_reset) 5360 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]); 5361 if (r) 5362 return r; 5363 } 5364 5365 return 0; 5366 } 5367 5368 /** 5369 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 5370 * 5371 * @adev: amdgpu_device pointer 5372 * @reset_context: amdgpu reset context pointer 5373 * 5374 * Do a VF FLR and reinitialize the ASIC. 5375 * Returns 0 on success, a negative error code otherwise. 5376 */ 5377 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 5378 struct amdgpu_reset_context *reset_context) 5379 { 5380 int r; 5381 struct amdgpu_hive_info *hive = NULL; 5382 5383 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { 5384 if (!amdgpu_ras_get_fed_status(adev)) 5385 amdgpu_virt_ready_to_reset(adev); 5386 amdgpu_virt_wait_reset(adev); 5387 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5388 r = amdgpu_virt_request_full_gpu(adev, true); 5389 } else { 5390 r = amdgpu_virt_reset_gpu(adev); 5391 } 5392 if (r) 5393 return r; 5394 5395 amdgpu_ras_clear_err_state(adev); 5396 amdgpu_irq_gpu_reset_resume_helper(adev); 5397 5398 /* some sw cleanup the VF needs to do before recovery */ 5399 amdgpu_virt_post_reset(adev); 5400 5401 /* Resume IP prior to SMC */ 5402 r = amdgpu_device_ip_reinit_early_sriov(adev); 5403 if (r) 5404 return r; 5405 5406 amdgpu_virt_init_data_exchange(adev); 5407 5408 r = amdgpu_device_fw_loading(adev); 5409 if (r) 5410 return r; 5411 5412 /* now we are okay to resume SMC/CP/SDMA */ 5413 r = amdgpu_device_ip_reinit_late_sriov(adev); 5414 if (r) 5415 return r; 5416 5417 hive = amdgpu_get_xgmi_hive(adev); 5418 /* Update PSP FW topology after reset */ 5419 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5420 r = amdgpu_xgmi_update_topology(hive, adev); 5421 if (hive) 5422 amdgpu_put_xgmi_hive(hive); 5423 if (r) 5424 return r; 5425 5426 r = amdgpu_ib_ring_tests(adev); 5427 if (r) 5428 return r; 5429 5430 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) 5431 amdgpu_inc_vram_lost(adev); 5432 5433 /* needs to be called during full access so we can't do it later like 5434 * bare-metal does. 5435 */ 5436 amdgpu_amdkfd_post_reset(adev); 5437 amdgpu_virt_release_full_gpu(adev, true); 5438 5439 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so RAS needs to be resumed during reset */ 5440 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) || 5441 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 5442 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 5443 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) || 5444 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5445 amdgpu_ras_resume(adev); 5446 5447 amdgpu_virt_ras_telemetry_post_reset(adev); 5448 5449 return 0; 5450 } 5451 5452 /** 5453 * amdgpu_device_has_job_running - check if there is any unfinished job 5454 * 5455 * @adev: amdgpu_device pointer 5456 * 5457 * Check if there is any job running on the device when the guest driver receives 5458 * an FLR notification from the host driver. If there are still jobs running, then 5459 * the guest driver will not respond to the FLR reset. Instead, it lets the job hit 5460 * the timeout and then issues the reset request.
5461 */ 5462 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5463 { 5464 int i; 5465 5466 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5467 struct amdgpu_ring *ring = adev->rings[i]; 5468 5469 if (!amdgpu_ring_sched_ready(ring)) 5470 continue; 5471 5472 if (amdgpu_fence_count_emitted(ring)) 5473 return true; 5474 } 5475 return false; 5476 } 5477 5478 /** 5479 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5480 * 5481 * @adev: amdgpu_device pointer 5482 * 5483 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5484 * a hung GPU. 5485 */ 5486 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5487 { 5488 5489 if (amdgpu_gpu_recovery == 0) 5490 goto disabled; 5491 5492 /* Skip soft reset check in fatal error mode */ 5493 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5494 return true; 5495 5496 if (amdgpu_sriov_vf(adev)) 5497 return true; 5498 5499 if (amdgpu_gpu_recovery == -1) { 5500 switch (adev->asic_type) { 5501 #ifdef CONFIG_DRM_AMDGPU_SI 5502 case CHIP_VERDE: 5503 case CHIP_TAHITI: 5504 case CHIP_PITCAIRN: 5505 case CHIP_OLAND: 5506 case CHIP_HAINAN: 5507 #endif 5508 #ifdef CONFIG_DRM_AMDGPU_CIK 5509 case CHIP_KAVERI: 5510 case CHIP_KABINI: 5511 case CHIP_MULLINS: 5512 #endif 5513 case CHIP_CARRIZO: 5514 case CHIP_STONEY: 5515 case CHIP_CYAN_SKILLFISH: 5516 goto disabled; 5517 default: 5518 break; 5519 } 5520 } 5521 5522 return true; 5523 5524 disabled: 5525 dev_info(adev->dev, "GPU recovery disabled.\n"); 5526 return false; 5527 } 5528 5529 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5530 { 5531 u32 i; 5532 int ret = 0; 5533 5534 if (adev->bios) 5535 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5536 5537 dev_info(adev->dev, "GPU mode1 reset\n"); 5538 5539 /* Cache the state before bus master disable. The saved config space 5540 * values are used in other cases like restore after mode-2 reset. 
5541 */ 5542 amdgpu_device_cache_pci_state(adev->pdev); 5543 5544 /* disable BM */ 5545 pci_clear_master(adev->pdev); 5546 5547 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5548 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5549 ret = amdgpu_dpm_mode1_reset(adev); 5550 } else { 5551 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5552 ret = psp_gpu_reset(adev); 5553 } 5554 5555 if (ret) 5556 goto mode1_reset_failed; 5557 5558 amdgpu_device_load_pci_state(adev->pdev); 5559 ret = amdgpu_psp_wait_for_bootloader(adev); 5560 if (ret) 5561 goto mode1_reset_failed; 5562 5563 /* wait for asic to come out of reset */ 5564 for (i = 0; i < adev->usec_timeout; i++) { 5565 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5566 5567 if (memsize != 0xffffffff) 5568 break; 5569 udelay(1); 5570 } 5571 5572 if (i >= adev->usec_timeout) { 5573 ret = -ETIMEDOUT; 5574 goto mode1_reset_failed; 5575 } 5576 5577 if (adev->bios) 5578 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5579 5580 return 0; 5581 5582 mode1_reset_failed: 5583 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5584 return ret; 5585 } 5586 5587 int amdgpu_device_link_reset(struct amdgpu_device *adev) 5588 { 5589 int ret = 0; 5590 5591 dev_info(adev->dev, "GPU link reset\n"); 5592 5593 if (!adev->pcie_reset_ctx.occurs_dpc) 5594 ret = amdgpu_dpm_link_reset(adev); 5595 5596 if (ret) 5597 goto link_reset_failed; 5598 5599 ret = amdgpu_psp_wait_for_bootloader(adev); 5600 if (ret) 5601 goto link_reset_failed; 5602 5603 return 0; 5604 5605 link_reset_failed: 5606 dev_err(adev->dev, "GPU link reset failed\n"); 5607 return ret; 5608 } 5609 5610 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5611 struct amdgpu_reset_context *reset_context) 5612 { 5613 int i, r = 0; 5614 struct amdgpu_job *job = NULL; 5615 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; 5616 bool need_full_reset = 5617 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5618 5619 if (reset_context->reset_req_dev == adev) 5620 job = reset_context->job; 5621 5622 if (amdgpu_sriov_vf(adev)) 5623 amdgpu_virt_pre_reset(adev); 5624 5625 amdgpu_fence_driver_isr_toggle(adev, true); 5626 5627 /* block all schedulers and reset given job's ring */ 5628 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5629 struct amdgpu_ring *ring = adev->rings[i]; 5630 5631 if (!amdgpu_ring_sched_ready(ring)) 5632 continue; 5633 5634 /* Clear job fence from fence drv to avoid force_completion 5635 * leave NULL and vm flush fence in fence drv 5636 */ 5637 amdgpu_fence_driver_clear_job_fences(ring); 5638 5639 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5640 amdgpu_fence_driver_force_completion(ring); 5641 } 5642 5643 amdgpu_fence_driver_isr_toggle(adev, false); 5644 5645 if (job && job->vm) 5646 drm_sched_increase_karma(&job->base); 5647 5648 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5649 /* If reset handler not implemented, continue; otherwise return */ 5650 if (r == -EOPNOTSUPP) 5651 r = 0; 5652 else 5653 return r; 5654 5655 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5656 if (!amdgpu_sriov_vf(adev)) { 5657 5658 if (!need_full_reset) 5659 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5660 5661 if (!need_full_reset && amdgpu_gpu_recovery && 5662 amdgpu_device_ip_check_soft_reset(adev)) { 5663 amdgpu_device_ip_pre_soft_reset(adev); 5664 r = amdgpu_device_ip_soft_reset(adev); 5665 amdgpu_device_ip_post_soft_reset(adev); 5666 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5667 
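/* A nonzero r means the soft reset itself failed; the second check_soft_reset pass catches blocks that are still hung afterwards. */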
dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5668 need_full_reset = true; 5669 } 5670 } 5671 5672 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { 5673 dev_info(tmp_adev->dev, "Dumping IP State\n"); 5674 /* Trigger ip dump before we reset the asic */ 5675 for (i = 0; i < tmp_adev->num_ip_blocks; i++) 5676 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) 5677 tmp_adev->ip_blocks[i].version->funcs 5678 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]); 5679 dev_info(tmp_adev->dev, "Dumping IP State Completed\n"); 5680 } 5681 5682 if (need_full_reset) 5683 r = amdgpu_device_ip_suspend(adev); 5684 if (need_full_reset) 5685 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5686 else 5687 clear_bit(AMDGPU_NEED_FULL_RESET, 5688 &reset_context->flags); 5689 } 5690 5691 return r; 5692 } 5693 5694 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) 5695 { 5696 struct list_head *device_list_handle; 5697 bool full_reset, vram_lost = false; 5698 struct amdgpu_device *tmp_adev; 5699 int r, init_level; 5700 5701 device_list_handle = reset_context->reset_device_list; 5702 5703 if (!device_list_handle) 5704 return -EINVAL; 5705 5706 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5707 5708 /** 5709 * If it's reset on init, it's default init level, otherwise keep level 5710 * as recovery level. 5711 */ 5712 if (reset_context->method == AMD_RESET_METHOD_ON_INIT) 5713 init_level = AMDGPU_INIT_LEVEL_DEFAULT; 5714 else 5715 init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY; 5716 5717 r = 0; 5718 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5719 amdgpu_set_init_level(tmp_adev, init_level); 5720 if (full_reset) { 5721 /* post card */ 5722 amdgpu_ras_clear_err_state(tmp_adev); 5723 r = amdgpu_device_asic_init(tmp_adev); 5724 if (r) { 5725 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5726 } else { 5727 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5728 5729 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5730 if (r) 5731 goto out; 5732 5733 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5734 5735 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 5736 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job); 5737 5738 if (vram_lost) { 5739 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5740 amdgpu_inc_vram_lost(tmp_adev); 5741 } 5742 5743 r = amdgpu_device_fw_loading(tmp_adev); 5744 if (r) 5745 return r; 5746 5747 r = amdgpu_xcp_restore_partition_mode( 5748 tmp_adev->xcp_mgr); 5749 if (r) 5750 goto out; 5751 5752 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5753 if (r) 5754 goto out; 5755 5756 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 5757 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5758 5759 r = amdgpu_device_ip_resume_phase3(tmp_adev); 5760 if (r) 5761 goto out; 5762 5763 if (vram_lost) 5764 amdgpu_device_fill_reset_magic(tmp_adev); 5765 5766 /* 5767 * Add this ASIC as tracked as reset was already 5768 * complete successfully. 5769 */ 5770 amdgpu_register_gpu_instance(tmp_adev); 5771 5772 if (!reset_context->hive && 5773 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5774 amdgpu_xgmi_add_device(tmp_adev); 5775 5776 r = amdgpu_device_ip_late_init(tmp_adev); 5777 if (r) 5778 goto out; 5779 5780 drm_client_dev_resume(adev_to_drm(tmp_adev), false); 5781 5782 /* 5783 * The GPU enters bad state once faulty pages 5784 * by ECC has reached the threshold, and ras 5785 * recovery is scheduled next. 
So add one check 5786 * here to break recovery if it indeed exceeds 5787 * bad page threshold, and remind user to 5788 * retire this GPU or setting one bigger 5789 * bad_page_threshold value to fix this once 5790 * probing driver again. 5791 */ 5792 if (!amdgpu_ras_is_rma(tmp_adev)) { 5793 /* must succeed. */ 5794 amdgpu_ras_resume(tmp_adev); 5795 } else { 5796 r = -EINVAL; 5797 goto out; 5798 } 5799 5800 /* Update PSP FW topology after reset */ 5801 if (reset_context->hive && 5802 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5803 r = amdgpu_xgmi_update_topology( 5804 reset_context->hive, tmp_adev); 5805 } 5806 } 5807 5808 out: 5809 if (!r) { 5810 /* IP init is complete now, set level as default */ 5811 amdgpu_set_init_level(tmp_adev, 5812 AMDGPU_INIT_LEVEL_DEFAULT); 5813 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5814 r = amdgpu_ib_ring_tests(tmp_adev); 5815 if (r) { 5816 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5817 r = -EAGAIN; 5818 goto end; 5819 } 5820 } 5821 5822 if (r) 5823 tmp_adev->asic_reset_res = r; 5824 } 5825 5826 end: 5827 return r; 5828 } 5829 5830 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5831 struct amdgpu_reset_context *reset_context) 5832 { 5833 struct amdgpu_device *tmp_adev = NULL; 5834 bool need_full_reset, skip_hw_reset; 5835 int r = 0; 5836 5837 /* Try reset handler method first */ 5838 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5839 reset_list); 5840 5841 reset_context->reset_device_list = device_list_handle; 5842 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5843 /* If reset handler not implemented, continue; otherwise return */ 5844 if (r == -EOPNOTSUPP) 5845 r = 0; 5846 else 5847 return r; 5848 5849 /* Reset handler not implemented, use the default method */ 5850 need_full_reset = 5851 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5852 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5853 5854 /* 5855 * ASIC reset has to be done on all XGMI hive nodes ASAP 5856 * to allow proper links negotiation in FW (within 1 sec) 5857 */ 5858 if (!skip_hw_reset && need_full_reset) { 5859 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5860 /* For XGMI run all resets in parallel to speed up the process */ 5861 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5862 if (!queue_work(system_unbound_wq, 5863 &tmp_adev->xgmi_reset_work)) 5864 r = -EALREADY; 5865 } else 5866 r = amdgpu_asic_reset(tmp_adev); 5867 5868 if (r) { 5869 dev_err(tmp_adev->dev, 5870 "ASIC reset failed with error, %d for drm dev, %s", 5871 r, adev_to_drm(tmp_adev)->unique); 5872 goto out; 5873 } 5874 } 5875 5876 /* For XGMI wait for all resets to complete before proceed */ 5877 if (!r) { 5878 list_for_each_entry(tmp_adev, device_list_handle, 5879 reset_list) { 5880 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5881 flush_work(&tmp_adev->xgmi_reset_work); 5882 r = tmp_adev->asic_reset_res; 5883 if (r) 5884 break; 5885 } 5886 } 5887 } 5888 } 5889 5890 if (!r && amdgpu_ras_intr_triggered()) { 5891 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5892 amdgpu_ras_reset_error_count(tmp_adev, 5893 AMDGPU_RAS_BLOCK__MMHUB); 5894 } 5895 5896 amdgpu_ras_intr_cleared(); 5897 } 5898 5899 r = amdgpu_device_reinit_after_reset(reset_context); 5900 if (r == -EAGAIN) 5901 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5902 else 5903 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5904 5905 out: 5906 return r; 5907 } 5908 5909 static void 
amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5910 { 5911 5912 switch (amdgpu_asic_reset_method(adev)) { 5913 case AMD_RESET_METHOD_MODE1: 5914 case AMD_RESET_METHOD_LINK: 5915 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5916 break; 5917 case AMD_RESET_METHOD_MODE2: 5918 adev->mp1_state = PP_MP1_STATE_RESET; 5919 break; 5920 default: 5921 adev->mp1_state = PP_MP1_STATE_NONE; 5922 break; 5923 } 5924 } 5925 5926 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5927 { 5928 amdgpu_vf_error_trans_all(adev); 5929 adev->mp1_state = PP_MP1_STATE_NONE; 5930 } 5931 5932 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5933 { 5934 struct pci_dev *p = NULL; 5935 5936 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5937 adev->pdev->bus->number, 1); 5938 if (p) { 5939 pm_runtime_enable(&(p->dev)); 5940 pm_runtime_resume(&(p->dev)); 5941 } 5942 5943 pci_dev_put(p); 5944 } 5945 5946 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5947 { 5948 enum amd_reset_method reset_method; 5949 struct pci_dev *p = NULL; 5950 u64 expires; 5951 5952 /* 5953 * For now, only BACO and mode1 reset are confirmed 5954 * to suffer the audio issue without proper suspended. 5955 */ 5956 reset_method = amdgpu_asic_reset_method(adev); 5957 if ((reset_method != AMD_RESET_METHOD_BACO) && 5958 (reset_method != AMD_RESET_METHOD_MODE1)) 5959 return -EINVAL; 5960 5961 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5962 adev->pdev->bus->number, 1); 5963 if (!p) 5964 return -ENODEV; 5965 5966 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5967 if (!expires) 5968 /* 5969 * If we cannot get the audio device autosuspend delay, 5970 * a fixed 4S interval will be used. Considering 3S is 5971 * the audio controller default autosuspend delay setting. 5972 * 4S used here is guaranteed to cover that. 5973 */ 5974 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5975 5976 while (!pm_runtime_status_suspended(&(p->dev))) { 5977 if (!pm_runtime_suspend(&(p->dev))) 5978 break; 5979 5980 if (expires < ktime_get_mono_fast_ns()) { 5981 dev_warn(adev->dev, "failed to suspend display audio\n"); 5982 pci_dev_put(p); 5983 /* TODO: abort the succeeding gpu reset? 
*/ 5984 return -ETIMEDOUT; 5985 } 5986 } 5987 5988 pm_runtime_disable(&(p->dev)); 5989 5990 pci_dev_put(p); 5991 return 0; 5992 } 5993 5994 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5995 { 5996 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5997 5998 #if defined(CONFIG_DEBUG_FS) 5999 if (!amdgpu_sriov_vf(adev)) 6000 cancel_work(&adev->reset_work); 6001 #endif 6002 6003 if (adev->kfd.dev) 6004 cancel_work(&adev->kfd.reset_work); 6005 6006 if (amdgpu_sriov_vf(adev)) 6007 cancel_work(&adev->virt.flr_work); 6008 6009 if (con && adev->ras_enabled) 6010 cancel_work(&con->recovery_work); 6011 6012 } 6013 6014 static int amdgpu_device_health_check(struct list_head *device_list_handle) 6015 { 6016 struct amdgpu_device *tmp_adev; 6017 int ret = 0; 6018 u32 status; 6019 6020 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6021 pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status); 6022 if (PCI_POSSIBLE_ERROR(status)) { 6023 dev_err(tmp_adev->dev, "device lost from bus!"); 6024 ret = -ENODEV; 6025 } 6026 } 6027 6028 return ret; 6029 } 6030 6031 static int amdgpu_device_halt_activities(struct amdgpu_device *adev, 6032 struct amdgpu_job *job, 6033 struct amdgpu_reset_context *reset_context, 6034 struct list_head *device_list, 6035 struct amdgpu_hive_info *hive, 6036 bool need_emergency_restart) 6037 { 6038 struct list_head *device_list_handle = NULL; 6039 struct amdgpu_device *tmp_adev = NULL; 6040 int i, r = 0; 6041 6042 /* 6043 * Build list of devices to reset. 6044 * In case we are in XGMI hive mode, resort the device list 6045 * to put adev in the 1st position. 6046 */ 6047 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) { 6048 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 6049 list_add_tail(&tmp_adev->reset_list, device_list); 6050 if (adev->shutdown) 6051 tmp_adev->shutdown = true; 6052 if (adev->pcie_reset_ctx.occurs_dpc) 6053 tmp_adev->pcie_reset_ctx.in_link_reset = true; 6054 } 6055 if (!list_is_first(&adev->reset_list, device_list)) 6056 list_rotate_to_front(&adev->reset_list, device_list); 6057 device_list_handle = device_list; 6058 } else { 6059 list_add_tail(&adev->reset_list, device_list); 6060 device_list_handle = device_list; 6061 } 6062 6063 if (!amdgpu_sriov_vf(adev) && (!adev->pcie_reset_ctx.occurs_dpc)) { 6064 r = amdgpu_device_health_check(device_list_handle); 6065 if (r) 6066 return r; 6067 } 6068 6069 /* We need to lock reset domain only once both for XGMI and single device */ 6070 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 6071 reset_list); 6072 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 6073 6074 /* block all schedulers and reset given job's ring */ 6075 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6076 6077 amdgpu_device_set_mp1_state(tmp_adev); 6078 6079 /* 6080 * Try to put the audio codec into suspend state 6081 * before gpu reset started. 6082 * 6083 * Due to the power domain of the graphics device 6084 * is shared with AZ power domain. Without this, 6085 * we may change the audio hardware from behind 6086 * the audio driver's back. That will trigger 6087 * some audio codec errors. 
6088 */ 6089 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 6090 tmp_adev->pcie_reset_ctx.audio_suspended = true; 6091 6092 amdgpu_ras_set_error_query_ready(tmp_adev, false); 6093 6094 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 6095 6096 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context); 6097 6098 /* 6099 * Mark these ASICs to be reset as untracked first 6100 * And add them back after reset completed 6101 */ 6102 amdgpu_unregister_gpu_instance(tmp_adev); 6103 6104 drm_client_dev_suspend(adev_to_drm(tmp_adev), false); 6105 6106 /* disable ras on ALL IPs */ 6107 if (!need_emergency_restart && 6108 (!adev->pcie_reset_ctx.occurs_dpc) && 6109 amdgpu_device_ip_need_full_reset(tmp_adev)) 6110 amdgpu_ras_suspend(tmp_adev); 6111 6112 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6113 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6114 6115 if (!amdgpu_ring_sched_ready(ring)) 6116 continue; 6117 6118 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 6119 6120 if (need_emergency_restart) 6121 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 6122 } 6123 atomic_inc(&tmp_adev->gpu_reset_counter); 6124 } 6125 6126 return r; 6127 } 6128 6129 static int amdgpu_device_asic_reset(struct amdgpu_device *adev, 6130 struct list_head *device_list, 6131 struct amdgpu_reset_context *reset_context) 6132 { 6133 struct amdgpu_device *tmp_adev = NULL; 6134 int retry_limit = AMDGPU_MAX_RETRY_LIMIT; 6135 int r = 0; 6136 6137 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 6138 list_for_each_entry(tmp_adev, device_list, reset_list) { 6139 if (adev->pcie_reset_ctx.occurs_dpc) 6140 tmp_adev->no_hw_access = true; 6141 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 6142 if (adev->pcie_reset_ctx.occurs_dpc) 6143 tmp_adev->no_hw_access = false; 6144 /*TODO Should we stop ?*/ 6145 if (r) { 6146 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 6147 r, adev_to_drm(tmp_adev)->unique); 6148 tmp_adev->asic_reset_res = r; 6149 } 6150 } 6151 6152 /* Actual ASIC resets if needed.*/ 6153 /* Host driver will handle XGMI hive reset for SRIOV */ 6154 if (amdgpu_sriov_vf(adev)) { 6155 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { 6156 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); 6157 amdgpu_ras_set_fed(adev, true); 6158 set_bit(AMDGPU_HOST_FLR, &reset_context->flags); 6159 } 6160 6161 r = amdgpu_device_reset_sriov(adev, reset_context); 6162 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) { 6163 amdgpu_virt_release_full_gpu(adev, true); 6164 goto retry; 6165 } 6166 if (r) 6167 adev->asic_reset_res = r; 6168 } else { 6169 r = amdgpu_do_asic_reset(device_list, reset_context); 6170 if (r && r == -EAGAIN) 6171 goto retry; 6172 } 6173 6174 list_for_each_entry(tmp_adev, device_list, reset_list) { 6175 /* 6176 * Drop any pending non scheduler resets queued before reset is done. 6177 * Any reset scheduled after this point would be valid. Scheduler resets 6178 * were already dropped during drm_sched_stop and no new ones can come 6179 * in before drm_sched_start. 
6180          */
6181         amdgpu_device_stop_pending_resets(tmp_adev);
6182     }
6183 
6184     return r;
6185 }
6186 
6187 static int amdgpu_device_sched_resume(struct list_head *device_list,
6188                                       struct amdgpu_reset_context *reset_context,
6189                                       bool job_signaled)
6190 {
6191     struct amdgpu_device *tmp_adev = NULL;
6192     int i, r = 0;
6193 
6194     /* Post ASIC reset for all devs. */
6195     list_for_each_entry(tmp_adev, device_list, reset_list) {
6196 
6197         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
6198             struct amdgpu_ring *ring = tmp_adev->rings[i];
6199 
6200             if (!amdgpu_ring_sched_ready(ring))
6201                 continue;
6202 
6203             drm_sched_start(&ring->sched, 0);
6204         }
6205 
6206         if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
6207             drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
6208 
6209         if (tmp_adev->asic_reset_res)
6210             r = tmp_adev->asic_reset_res;
6211 
6212         tmp_adev->asic_reset_res = 0;
6213 
6214         if (r) {
6215             /* bad news, how do we tell it to userspace?
6216              * for a RAS error, report GPU bad status instead of
6217              * reset failure
6218              */
6219             if (reset_context->src != AMDGPU_RESET_SRC_RAS ||
6220                 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev))
6221                 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n",
6222                          atomic_read(&tmp_adev->gpu_reset_counter));
6223             amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
6224         } else {
6225             dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
6226             if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
6227                 DRM_WARN("smart shift update failed\n");
6228         }
6229     }
6230 
6231     return r;
6232 }
6233 
6234 static void amdgpu_device_gpu_resume(struct amdgpu_device *adev,
6235                                      struct list_head *device_list,
6236                                      bool need_emergency_restart)
6237 {
6238     struct amdgpu_device *tmp_adev = NULL;
6239 
6240     list_for_each_entry(tmp_adev, device_list, reset_list) {
6241         /* unlock kfd: SRIOV would do it separately */
6242         if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
6243             amdgpu_amdkfd_post_reset(tmp_adev);
6244 
6245         /* kfd_post_reset will do nothing if the kfd device is not initialized,
6246          * so bring up kfd here if it was not initialized before
6247          */
6248         if (!adev->kfd.init_complete)
6249             amdgpu_amdkfd_device_init(adev);
6250 
6251         if (tmp_adev->pcie_reset_ctx.audio_suspended)
6252             amdgpu_device_resume_display_audio(tmp_adev);
6253 
6254         amdgpu_device_unset_mp1_state(tmp_adev);
6255 
6256         amdgpu_ras_set_error_query_ready(tmp_adev, true);
6257 
6258     }
6259 
6260     tmp_adev = list_first_entry(device_list, struct amdgpu_device,
6261                                 reset_list);
6262     amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
6263 
6264 }
6265 
6266 
6267 /**
6268  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
6269  *
6270  * @adev: amdgpu_device pointer
6271  * @job: the job which triggered the hang
6272  * @reset_context: amdgpu reset context pointer
6273  *
6274  * Attempt to reset the GPU if it has hung (all asics).
6275  * Attempt to do a soft reset or a full reset and reinitialize the ASIC.
6276  * Returns 0 for success or an error on failure.
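 *
 * As an illustrative sketch only (modelled on a typical caller such as a
 * job timeout handler; the reset source and flags below are assumptions
 * that depend on the actual caller):
 *
 *     struct amdgpu_reset_context reset_context;
 *     int r;
 *
 *     memset(&reset_context, 0, sizeof(reset_context));
 *     reset_context.method = AMD_RESET_METHOD_NONE;
 *     reset_context.reset_req_dev = adev;
 *     reset_context.src = AMDGPU_RESET_SRC_JOB;
 *     clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
 *
 *     r = amdgpu_device_gpu_recover(adev, job, &reset_context);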
6277 */ 6278 6279 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 6280 struct amdgpu_job *job, 6281 struct amdgpu_reset_context *reset_context) 6282 { 6283 struct list_head device_list; 6284 bool job_signaled = false; 6285 struct amdgpu_hive_info *hive = NULL; 6286 int r = 0; 6287 bool need_emergency_restart = false; 6288 6289 /* 6290 * If it reaches here because of hang/timeout and a RAS error is 6291 * detected at the same time, let RAS recovery take care of it. 6292 */ 6293 if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) && 6294 !amdgpu_sriov_vf(adev) && 6295 reset_context->src != AMDGPU_RESET_SRC_RAS) { 6296 dev_dbg(adev->dev, 6297 "Gpu recovery from source: %d yielding to RAS error recovery handling", 6298 reset_context->src); 6299 return 0; 6300 } 6301 6302 /* 6303 * Special case: RAS triggered and full reset isn't supported 6304 */ 6305 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 6306 6307 /* 6308 * Flush RAM to disk so that after reboot 6309 * the user can read log and see why the system rebooted. 6310 */ 6311 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 6312 amdgpu_ras_get_context(adev)->reboot) { 6313 DRM_WARN("Emergency reboot."); 6314 6315 ksys_sync_helper(); 6316 emergency_restart(); 6317 } 6318 6319 dev_info(adev->dev, "GPU %s begin!\n", 6320 need_emergency_restart ? "jobs stop":"reset"); 6321 6322 if (!amdgpu_sriov_vf(adev)) 6323 hive = amdgpu_get_xgmi_hive(adev); 6324 if (hive) 6325 mutex_lock(&hive->hive_lock); 6326 6327 reset_context->job = job; 6328 reset_context->hive = hive; 6329 INIT_LIST_HEAD(&device_list); 6330 6331 r = amdgpu_device_halt_activities(adev, job, reset_context, &device_list, 6332 hive, need_emergency_restart); 6333 if (r) 6334 goto end_reset; 6335 6336 if (need_emergency_restart) 6337 goto skip_sched_resume; 6338 /* 6339 * Must check guilty signal here since after this point all old 6340 * HW fences are force signaled. 6341 * 6342 * job->base holds a reference to parent fence 6343 */ 6344 if (job && dma_fence_is_signaled(&job->hw_fence)) { 6345 job_signaled = true; 6346 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 6347 goto skip_hw_reset; 6348 } 6349 6350 r = amdgpu_device_asic_reset(adev, &device_list, reset_context); 6351 if (r) 6352 goto end_reset; 6353 skip_hw_reset: 6354 r = amdgpu_device_sched_resume(&device_list, reset_context, job_signaled); 6355 if (r) 6356 goto end_reset; 6357 skip_sched_resume: 6358 amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart); 6359 end_reset: 6360 if (hive) { 6361 mutex_unlock(&hive->hive_lock); 6362 amdgpu_put_xgmi_hive(hive); 6363 } 6364 6365 if (r) 6366 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 6367 6368 atomic_set(&adev->reset_domain->reset_res, r); 6369 6370 if (!r) 6371 drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE); 6372 6373 return r; 6374 } 6375 6376 /** 6377 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner 6378 * 6379 * @adev: amdgpu_device pointer 6380 * @speed: pointer to the speed of the link 6381 * @width: pointer to the width of the link 6382 * 6383 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6384 * first physical partner to an AMD dGPU. 6385 * This will exclude any virtual switches and links. 
6386  */
6387 static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
6388                                             enum pci_bus_speed *speed,
6389                                             enum pcie_link_width *width)
6390 {
6391     struct pci_dev *parent = adev->pdev;
6392 
6393     if (!speed || !width)
6394         return;
6395 
6396     *speed = PCI_SPEED_UNKNOWN;
6397     *width = PCIE_LNK_WIDTH_UNKNOWN;
6398 
6399     if (amdgpu_device_pcie_dynamic_switching_supported(adev)) {
6400         while ((parent = pci_upstream_bridge(parent))) {
6401             /* skip upstream/downstream switches internal to dGPU */
6402             if (parent->vendor == PCI_VENDOR_ID_ATI)
6403                 continue;
6404             *speed = pcie_get_speed_cap(parent);
6405             *width = pcie_get_width_cap(parent);
6406             break;
6407         }
6408     } else {
6409         /* use the current speeds rather than max if switching is not supported */
6410         pcie_bandwidth_available(adev->pdev, NULL, speed, width);
6411     }
6412 }
6413 
6414 /**
6415  * amdgpu_device_gpu_bandwidth - find the bandwidth of the GPU
6416  *
6417  * @adev: amdgpu_device pointer
6418  * @speed: pointer to the speed of the link
6419  * @width: pointer to the width of the link
6420  *
6421  * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
6422  * AMD dGPU which may be a virtual upstream bridge.
6423  */
6424 static void amdgpu_device_gpu_bandwidth(struct amdgpu_device *adev,
6425                                         enum pci_bus_speed *speed,
6426                                         enum pcie_link_width *width)
6427 {
6428     struct pci_dev *parent = adev->pdev;
6429 
6430     if (!speed || !width)
6431         return;
6432 
6433     parent = pci_upstream_bridge(parent);
6434     if (parent && parent->vendor == PCI_VENDOR_ID_ATI) {
6435         /* use the upstream/downstream switches internal to dGPU */
6436         *speed = pcie_get_speed_cap(parent);
6437         *width = pcie_get_width_cap(parent);
6438         while ((parent = pci_upstream_bridge(parent))) {
6439             if (parent->vendor == PCI_VENDOR_ID_ATI) {
6440                 /* use the upstream/downstream switches internal to dGPU */
6441                 *speed = pcie_get_speed_cap(parent);
6442                 *width = pcie_get_width_cap(parent);
6443             }
6444         }
6445     } else {
6446         /* use the device itself */
6447         *speed = pcie_get_speed_cap(adev->pdev);
6448         *width = pcie_get_width_cap(adev->pdev);
6449     }
6450 }
6451 
6452 /**
6453  * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
6454  *
6455  * @adev: amdgpu_device pointer
6456  *
6457  * Fetches and stores in the driver the PCIE capabilities (gen speed
6458  * and lanes) of the slot the device is in. Handles APUs and
6459  * virtualized environments where PCIE config space may not be available.
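 *
 * The masks can also be forced through the amdgpu_pcie_gen_cap and
 * amdgpu_pcie_lane_cap module parameters checked at the top of this
 * function. As an illustrative sketch only, built from the CAIL defines
 * this function works with, capping both the ASIC and the platform side
 * at Gen3 would amount to:
 *
 *     adev->pm.pcie_gen_mask = CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
 *                              CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
 *                              CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
 *                              CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
 *                              CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
 *                              CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3;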
6460 */ 6461 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 6462 { 6463 enum pci_bus_speed speed_cap, platform_speed_cap; 6464 enum pcie_link_width platform_link_width, link_width; 6465 6466 if (amdgpu_pcie_gen_cap) 6467 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 6468 6469 if (amdgpu_pcie_lane_cap) 6470 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 6471 6472 /* covers APUs as well */ 6473 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 6474 if (adev->pm.pcie_gen_mask == 0) 6475 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 6476 if (adev->pm.pcie_mlw_mask == 0) 6477 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 6478 return; 6479 } 6480 6481 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 6482 return; 6483 6484 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap, 6485 &platform_link_width); 6486 amdgpu_device_gpu_bandwidth(adev, &speed_cap, &link_width); 6487 6488 if (adev->pm.pcie_gen_mask == 0) { 6489 /* asic caps */ 6490 if (speed_cap == PCI_SPEED_UNKNOWN) { 6491 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6492 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6493 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6494 } else { 6495 if (speed_cap == PCIE_SPEED_32_0GT) 6496 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6497 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6498 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6499 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6500 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 6501 else if (speed_cap == PCIE_SPEED_16_0GT) 6502 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6503 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6504 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6505 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 6506 else if (speed_cap == PCIE_SPEED_8_0GT) 6507 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6508 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6509 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6510 else if (speed_cap == PCIE_SPEED_5_0GT) 6511 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6512 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 6513 else 6514 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 6515 } 6516 /* platform caps */ 6517 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 6518 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6519 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6520 } else { 6521 if (platform_speed_cap == PCIE_SPEED_32_0GT) 6522 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6523 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6524 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6525 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6526 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 6527 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 6528 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6529 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6530 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6531 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 6532 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 6533 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6534 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6535 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 6536 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 6537 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6538 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6539 else 6540 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 6541 6542 } 6543 } 6544 if (adev->pm.pcie_mlw_mask == 0) { 6545 /* asic caps */ 6546 if (link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6547 
adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_ASIC_PCIE_MLW_MASK; 6548 } else { 6549 switch (link_width) { 6550 case PCIE_LNK_X32: 6551 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X32 | 6552 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6553 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6554 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6555 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6556 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6557 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6558 break; 6559 case PCIE_LNK_X16: 6560 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6561 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6562 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6563 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6564 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6565 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6566 break; 6567 case PCIE_LNK_X12: 6568 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6569 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6570 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6571 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6572 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6573 break; 6574 case PCIE_LNK_X8: 6575 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6576 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6577 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6578 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6579 break; 6580 case PCIE_LNK_X4: 6581 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6582 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6583 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6584 break; 6585 case PCIE_LNK_X2: 6586 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6587 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6588 break; 6589 case PCIE_LNK_X1: 6590 adev->pm.pcie_mlw_mask |= CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1; 6591 break; 6592 default: 6593 break; 6594 } 6595 } 6596 /* platform caps */ 6597 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6598 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 6599 } else { 6600 switch (platform_link_width) { 6601 case PCIE_LNK_X32: 6602 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 6603 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6604 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6605 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6606 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6607 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6608 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6609 break; 6610 case PCIE_LNK_X16: 6611 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6612 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6613 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6614 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6615 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6616 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6617 break; 6618 case PCIE_LNK_X12: 6619 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6620 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6621 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6622 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6623 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6624 break; 6625 case PCIE_LNK_X8: 6626 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6627 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6628 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6629 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6630 break; 6631 case PCIE_LNK_X4: 6632 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6633 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6634 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6635 break; 6636 case PCIE_LNK_X2: 6637 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6638 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6639 break; 6640 case PCIE_LNK_X1: 6641 adev->pm.pcie_mlw_mask |= CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 6642 break; 6643 
default: 6644 break; 6645 } 6646 } 6647 } 6648 } 6649 6650 /** 6651 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 6652 * 6653 * @adev: amdgpu_device pointer 6654 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 6655 * 6656 * Return true if @peer_adev can access (DMA) @adev through the PCIe 6657 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 6658 * @peer_adev. 6659 */ 6660 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 6661 struct amdgpu_device *peer_adev) 6662 { 6663 #ifdef CONFIG_HSA_AMD_P2P 6664 bool p2p_access = 6665 !adev->gmc.xgmi.connected_to_cpu && 6666 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 6667 if (!p2p_access) 6668 dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n", 6669 pci_name(peer_adev->pdev)); 6670 6671 bool is_large_bar = adev->gmc.visible_vram_size && 6672 adev->gmc.real_vram_size == adev->gmc.visible_vram_size; 6673 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev); 6674 6675 if (!p2p_addressable) { 6676 uint64_t address_mask = peer_adev->dev->dma_mask ? 6677 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 6678 resource_size_t aper_limit = 6679 adev->gmc.aper_base + adev->gmc.aper_size - 1; 6680 6681 p2p_addressable = !(adev->gmc.aper_base & address_mask || 6682 aper_limit & address_mask); 6683 } 6684 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable; 6685 #else 6686 return false; 6687 #endif 6688 } 6689 6690 int amdgpu_device_baco_enter(struct drm_device *dev) 6691 { 6692 struct amdgpu_device *adev = drm_to_adev(dev); 6693 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6694 6695 if (!amdgpu_device_supports_baco(dev)) 6696 return -ENOTSUPP; 6697 6698 if (ras && adev->ras_enabled && 6699 adev->nbio.funcs->enable_doorbell_interrupt) 6700 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 6701 6702 return amdgpu_dpm_baco_enter(adev); 6703 } 6704 6705 int amdgpu_device_baco_exit(struct drm_device *dev) 6706 { 6707 struct amdgpu_device *adev = drm_to_adev(dev); 6708 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6709 int ret = 0; 6710 6711 if (!amdgpu_device_supports_baco(dev)) 6712 return -ENOTSUPP; 6713 6714 ret = amdgpu_dpm_baco_exit(adev); 6715 if (ret) 6716 return ret; 6717 6718 if (ras && adev->ras_enabled && 6719 adev->nbio.funcs->enable_doorbell_interrupt) 6720 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 6721 6722 if (amdgpu_passthrough(adev) && adev->nbio.funcs && 6723 adev->nbio.funcs->clear_doorbell_interrupt) 6724 adev->nbio.funcs->clear_doorbell_interrupt(adev); 6725 6726 return 0; 6727 } 6728 6729 /** 6730 * amdgpu_pci_error_detected - Called when a PCI error is detected. 6731 * @pdev: PCI device struct 6732 * @state: PCI channel state 6733 * 6734 * Description: Called when a PCI error is detected. 6735 * 6736 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
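 *
 * These PCI error callbacks (error_detected, mmio_enabled, slot_reset and
 * resume) are hooked into the PCI core via a struct pci_error_handlers
 * instance referenced from the driver's struct pci_driver. A minimal
 * sketch of that wiring (the instance name here is illustrative; the
 * actual registration lives in amdgpu_drv.c):
 *
 *     static const struct pci_error_handlers amdgpu_pci_err_handler = {
 *         .error_detected = amdgpu_pci_error_detected,
 *         .mmio_enabled   = amdgpu_pci_mmio_enabled,
 *         .slot_reset     = amdgpu_pci_slot_reset,
 *         .resume         = amdgpu_pci_resume,
 *     };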
6737 */ 6738 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 6739 { 6740 struct drm_device *dev = pci_get_drvdata(pdev); 6741 struct amdgpu_device *adev = drm_to_adev(dev); 6742 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 6743 struct amdgpu_reset_context reset_context; 6744 struct list_head device_list; 6745 int r = 0; 6746 6747 dev_info(adev->dev, "PCI error: detected callback!!\n"); 6748 6749 if (!amdgpu_dpm_is_link_reset_supported(adev)) { 6750 dev_warn(adev->dev, "No support for XGMI hive yet...\n"); 6751 return PCI_ERS_RESULT_DISCONNECT; 6752 } 6753 6754 adev->pci_channel_state = state; 6755 6756 switch (state) { 6757 case pci_channel_io_normal: 6758 dev_info(adev->dev, "pci_channel_io_normal: state(%d)!!\n", state); 6759 return PCI_ERS_RESULT_CAN_RECOVER; 6760 case pci_channel_io_frozen: 6761 /* Fatal error, prepare for slot reset */ 6762 dev_info(adev->dev, "pci_channel_io_frozen: state(%d)!!\n", state); 6763 6764 if (hive) 6765 mutex_lock(&hive->hive_lock); 6766 adev->pcie_reset_ctx.occurs_dpc = true; 6767 memset(&reset_context, 0, sizeof(reset_context)); 6768 INIT_LIST_HEAD(&device_list); 6769 6770 r = amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list, 6771 hive, false); 6772 if (hive) { 6773 mutex_unlock(&hive->hive_lock); 6774 amdgpu_put_xgmi_hive(hive); 6775 } 6776 if (r) 6777 return PCI_ERS_RESULT_DISCONNECT; 6778 return PCI_ERS_RESULT_NEED_RESET; 6779 case pci_channel_io_perm_failure: 6780 /* Permanent error, prepare for device removal */ 6781 dev_info(adev->dev, "pci_channel_io_perm_failure: state(%d)!!\n", state); 6782 return PCI_ERS_RESULT_DISCONNECT; 6783 } 6784 6785 return PCI_ERS_RESULT_NEED_RESET; 6786 } 6787 6788 /** 6789 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 6790 * @pdev: pointer to PCI device 6791 */ 6792 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 6793 { 6794 struct drm_device *dev = pci_get_drvdata(pdev); 6795 struct amdgpu_device *adev = drm_to_adev(dev); 6796 6797 dev_info(adev->dev, "PCI error: mmio enabled callback!!\n"); 6798 6799 /* TODO - dump whatever for debugging purposes */ 6800 6801 /* This called only if amdgpu_pci_error_detected returns 6802 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 6803 * works, no need to reset slot. 6804 */ 6805 6806 return PCI_ERS_RESULT_RECOVERED; 6807 } 6808 6809 /** 6810 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 6811 * @pdev: PCI device struct 6812 * 6813 * Description: This routine is called by the pci error recovery 6814 * code after the PCI slot has been reset, just before we 6815 * should resume normal operations. 
6816 */ 6817 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 6818 { 6819 struct drm_device *dev = pci_get_drvdata(pdev); 6820 struct amdgpu_device *adev = drm_to_adev(dev); 6821 struct amdgpu_reset_context reset_context; 6822 struct amdgpu_device *tmp_adev; 6823 struct amdgpu_hive_info *hive; 6824 struct list_head device_list; 6825 int r = 0, i; 6826 u32 memsize; 6827 6828 /* PCI error slot reset should be skipped During RAS recovery */ 6829 if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 6830 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && 6831 amdgpu_ras_in_recovery(adev)) 6832 return PCI_ERS_RESULT_RECOVERED; 6833 6834 dev_info(adev->dev, "PCI error: slot reset callback!!\n"); 6835 6836 memset(&reset_context, 0, sizeof(reset_context)); 6837 6838 /* wait for asic to come out of reset */ 6839 msleep(700); 6840 6841 /* Restore PCI confspace */ 6842 amdgpu_device_load_pci_state(pdev); 6843 6844 /* confirm ASIC came out of reset */ 6845 for (i = 0; i < adev->usec_timeout; i++) { 6846 memsize = amdgpu_asic_get_config_memsize(adev); 6847 6848 if (memsize != 0xffffffff) 6849 break; 6850 udelay(1); 6851 } 6852 if (memsize == 0xffffffff) { 6853 r = -ETIME; 6854 goto out; 6855 } 6856 6857 reset_context.method = AMD_RESET_METHOD_NONE; 6858 reset_context.reset_req_dev = adev; 6859 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 6860 set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags); 6861 INIT_LIST_HEAD(&device_list); 6862 6863 hive = amdgpu_get_xgmi_hive(adev); 6864 if (hive) { 6865 mutex_lock(&hive->hive_lock); 6866 reset_context.hive = hive; 6867 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 6868 tmp_adev->pcie_reset_ctx.in_link_reset = true; 6869 list_add_tail(&tmp_adev->reset_list, &device_list); 6870 } 6871 } else { 6872 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 6873 list_add_tail(&adev->reset_list, &device_list); 6874 } 6875 6876 r = amdgpu_device_asic_reset(adev, &device_list, &reset_context); 6877 out: 6878 if (!r) { 6879 if (amdgpu_device_cache_pci_state(adev->pdev)) 6880 pci_restore_state(adev->pdev); 6881 dev_info(adev->dev, "PCIe error recovery succeeded\n"); 6882 } else { 6883 dev_err(adev->dev, "PCIe error recovery failed, err:%d\n", r); 6884 if (hive) { 6885 list_for_each_entry(tmp_adev, &device_list, reset_list) 6886 amdgpu_device_unset_mp1_state(tmp_adev); 6887 amdgpu_device_unlock_reset_domain(adev->reset_domain); 6888 } 6889 } 6890 6891 if (hive) { 6892 mutex_unlock(&hive->hive_lock); 6893 amdgpu_put_xgmi_hive(hive); 6894 } 6895 6896 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 6897 } 6898 6899 /** 6900 * amdgpu_pci_resume() - resume normal ops after PCI reset 6901 * @pdev: pointer to PCI device 6902 * 6903 * Called when the error recovery driver tells us that its 6904 * OK to resume normal operation. 
6905 */ 6906 void amdgpu_pci_resume(struct pci_dev *pdev) 6907 { 6908 struct drm_device *dev = pci_get_drvdata(pdev); 6909 struct amdgpu_device *adev = drm_to_adev(dev); 6910 struct list_head device_list; 6911 struct amdgpu_hive_info *hive = NULL; 6912 struct amdgpu_device *tmp_adev = NULL; 6913 6914 dev_info(adev->dev, "PCI error: resume callback!!\n"); 6915 6916 /* Only continue execution for the case of pci_channel_io_frozen */ 6917 if (adev->pci_channel_state != pci_channel_io_frozen) 6918 return; 6919 6920 INIT_LIST_HEAD(&device_list); 6921 6922 hive = amdgpu_get_xgmi_hive(adev); 6923 if (hive) { 6924 mutex_lock(&hive->hive_lock); 6925 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 6926 tmp_adev->pcie_reset_ctx.in_link_reset = false; 6927 list_add_tail(&tmp_adev->reset_list, &device_list); 6928 } 6929 } else 6930 list_add_tail(&adev->reset_list, &device_list); 6931 6932 amdgpu_device_sched_resume(&device_list, NULL, NULL); 6933 amdgpu_device_gpu_resume(adev, &device_list, false); 6934 adev->pcie_reset_ctx.occurs_dpc = false; 6935 6936 if (hive) { 6937 mutex_unlock(&hive->hive_lock); 6938 amdgpu_put_xgmi_hive(hive); 6939 } 6940 } 6941 6942 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 6943 { 6944 struct drm_device *dev = pci_get_drvdata(pdev); 6945 struct amdgpu_device *adev = drm_to_adev(dev); 6946 int r; 6947 6948 if (amdgpu_sriov_vf(adev)) 6949 return false; 6950 6951 r = pci_save_state(pdev); 6952 if (!r) { 6953 kfree(adev->pci_state); 6954 6955 adev->pci_state = pci_store_saved_state(pdev); 6956 6957 if (!adev->pci_state) { 6958 DRM_ERROR("Failed to store PCI saved state"); 6959 return false; 6960 } 6961 } else { 6962 DRM_WARN("Failed to save PCI state, err:%d\n", r); 6963 return false; 6964 } 6965 6966 return true; 6967 } 6968 6969 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 6970 { 6971 struct drm_device *dev = pci_get_drvdata(pdev); 6972 struct amdgpu_device *adev = drm_to_adev(dev); 6973 int r; 6974 6975 if (!adev->pci_state) 6976 return false; 6977 6978 r = pci_load_saved_state(pdev, adev->pci_state); 6979 6980 if (!r) { 6981 pci_restore_state(pdev); 6982 } else { 6983 DRM_WARN("Failed to load PCI state, err:%d\n", r); 6984 return false; 6985 } 6986 6987 return true; 6988 } 6989 6990 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 6991 struct amdgpu_ring *ring) 6992 { 6993 #ifdef CONFIG_X86_64 6994 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 6995 return; 6996 #endif 6997 if (adev->gmc.xgmi.connected_to_cpu) 6998 return; 6999 7000 if (ring && ring->funcs->emit_hdp_flush) 7001 amdgpu_ring_emit_hdp_flush(ring); 7002 else 7003 amdgpu_asic_flush_hdp(adev, ring); 7004 } 7005 7006 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 7007 struct amdgpu_ring *ring) 7008 { 7009 #ifdef CONFIG_X86_64 7010 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 7011 return; 7012 #endif 7013 if (adev->gmc.xgmi.connected_to_cpu) 7014 return; 7015 7016 amdgpu_asic_invalidate_hdp(adev, ring); 7017 } 7018 7019 int amdgpu_in_reset(struct amdgpu_device *adev) 7020 { 7021 return atomic_read(&adev->reset_domain->in_gpu_reset); 7022 } 7023 7024 /** 7025 * amdgpu_device_halt() - bring hardware to some kind of halt state 7026 * 7027 * @adev: amdgpu_device pointer 7028 * 7029 * Bring hardware to some kind of halt state so that no one can touch it 7030 * any more. It will help to maintain error context when error occurred. 7031 * Compare to a simple hang, the system will keep stable at least for SSH 7032 * access. 
Then it should be trivial to inspect the hardware state and
7033  * see what's going on. Implemented as follows:
7034  *
7035  * 1. drm_dev_unplug() makes device inaccessible to user space (IOCTLs, etc.),
7036  * clears all CPU mappings to device, disallows remappings through page faults
7037  * 2. amdgpu_irq_disable_all() disables all interrupts
7038  * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
7039  * 4. set adev->no_hw_access to avoid potential crashes after step 5
7040  * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
7041  * 6. pci_disable_device() and pci_wait_for_pending_transaction()
7042  * flush any in flight DMA operations
7043  */
7044 void amdgpu_device_halt(struct amdgpu_device *adev)
7045 {
7046     struct pci_dev *pdev = adev->pdev;
7047     struct drm_device *ddev = adev_to_drm(adev);
7048 
7049     amdgpu_xcp_dev_unplug(adev);
7050     drm_dev_unplug(ddev);
7051 
7052     amdgpu_irq_disable_all(adev);
7053 
7054     amdgpu_fence_driver_hw_fini(adev);
7055 
7056     adev->no_hw_access = true;
7057 
7058     amdgpu_device_unmap_mmio(adev);
7059 
7060     pci_disable_device(pdev);
7061     pci_wait_for_pending_transaction(pdev);
7062 }
7063 
7064 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
7065                                  u32 reg)
7066 {
7067     unsigned long flags, address, data;
7068     u32 r;
7069 
7070     address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
7071     data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
7072 
7073     spin_lock_irqsave(&adev->pcie_idx_lock, flags);
7074     WREG32(address, reg * 4);
7075     (void)RREG32(address);
7076     r = RREG32(data);
7077     spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
7078     return r;
7079 }
7080 
7081 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
7082                                   u32 reg, u32 v)
7083 {
7084     unsigned long flags, address, data;
7085 
7086     address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
7087     data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
7088 
7089     spin_lock_irqsave(&adev->pcie_idx_lock, flags);
7090     WREG32(address, reg * 4);
7091     (void)RREG32(address);
7092     WREG32(data, v);
7093     (void)RREG32(data);
7094     spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
7095 }
7096 
7097 /**
7098  * amdgpu_device_get_gang - return a reference to the current gang
7099  * @adev: amdgpu_device pointer
7100  *
7101  * Returns: A new reference to the current gang leader.
7102  */
7103 struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev)
7104 {
7105     struct dma_fence *fence;
7106 
7107     rcu_read_lock();
7108     fence = dma_fence_get_rcu_safe(&adev->gang_submit);
7109     rcu_read_unlock();
7110     return fence;
7111 }
7112 
7113 /**
7114  * amdgpu_device_switch_gang - switch to a new gang
7115  * @adev: amdgpu_device pointer
7116  * @gang: the gang to switch to
7117  *
7118  * Try to switch to a new gang.
7119  * Returns: NULL if we switched to the new gang or a reference to the current
7120  * gang leader.
7121  */
7122 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
7123                                             struct dma_fence *gang)
7124 {
7125     struct dma_fence *old = NULL;
7126 
7127     dma_fence_get(gang);
7128     do {
7129         dma_fence_put(old);
7130         old = amdgpu_device_get_gang(adev);
7131         if (old == gang)
7132             break;
7133 
7134         if (!dma_fence_is_signaled(old)) {
7135             dma_fence_put(gang);
7136             return old;
7137         }
7138 
7139     } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
7140                      old, gang) != old);
7141 
7142     /*
7143      * Drop it once for the exchanged reference in adev and once for the
7144      * thread local reference acquired in amdgpu_device_get_gang().
7145 */ 7146 dma_fence_put(old); 7147 dma_fence_put(old); 7148 return NULL; 7149 } 7150 7151 /** 7152 * amdgpu_device_enforce_isolation - enforce HW isolation 7153 * @adev: the amdgpu device pointer 7154 * @ring: the HW ring the job is supposed to run on 7155 * @job: the job which is about to be pushed to the HW ring 7156 * 7157 * Makes sure that only one client at a time can use the GFX block. 7158 * Returns: The dependency to wait on before the job can be pushed to the HW. 7159 * The function is called multiple times until NULL is returned. 7160 */ 7161 struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev, 7162 struct amdgpu_ring *ring, 7163 struct amdgpu_job *job) 7164 { 7165 struct amdgpu_isolation *isolation = &adev->isolation[ring->xcp_id]; 7166 struct drm_sched_fence *f = job->base.s_fence; 7167 struct dma_fence *dep; 7168 void *owner; 7169 int r; 7170 7171 /* 7172 * For now enforce isolation only for the GFX block since we only need 7173 * the cleaner shader on those rings. 7174 */ 7175 if (ring->funcs->type != AMDGPU_RING_TYPE_GFX && 7176 ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE) 7177 return NULL; 7178 7179 /* 7180 * All submissions where enforce isolation is false are handled as if 7181 * they come from a single client. Use ~0l as the owner to distinct it 7182 * from kernel submissions where the owner is NULL. 7183 */ 7184 owner = job->enforce_isolation ? f->owner : (void *)~0l; 7185 7186 mutex_lock(&adev->enforce_isolation_mutex); 7187 7188 /* 7189 * The "spearhead" submission is the first one which changes the 7190 * ownership to its client. We always need to wait for it to be 7191 * pushed to the HW before proceeding with anything. 7192 */ 7193 if (&f->scheduled != isolation->spearhead && 7194 !dma_fence_is_signaled(isolation->spearhead)) { 7195 dep = isolation->spearhead; 7196 goto out_grab_ref; 7197 } 7198 7199 if (isolation->owner != owner) { 7200 7201 /* 7202 * Wait for any gang to be assembled before switching to a 7203 * different owner or otherwise we could deadlock the 7204 * submissions. 7205 */ 7206 if (!job->gang_submit) { 7207 dep = amdgpu_device_get_gang(adev); 7208 if (!dma_fence_is_signaled(dep)) 7209 goto out_return_dep; 7210 dma_fence_put(dep); 7211 } 7212 7213 dma_fence_put(isolation->spearhead); 7214 isolation->spearhead = dma_fence_get(&f->scheduled); 7215 amdgpu_sync_move(&isolation->active, &isolation->prev); 7216 trace_amdgpu_isolation(isolation->owner, owner); 7217 isolation->owner = owner; 7218 } 7219 7220 /* 7221 * Specifying the ring here helps to pipeline submissions even when 7222 * isolation is enabled. If that is not desired for testing NULL can be 7223 * used instead of the ring to enforce a CPU round trip while switching 7224 * between clients. 
7225      */
7226     dep = amdgpu_sync_peek_fence(&isolation->prev, ring);
7227     r = amdgpu_sync_fence(&isolation->active, &f->finished, GFP_NOWAIT);
7228     if (r)
7229         DRM_WARN("OOM tracking isolation\n");
7230 
7231 out_grab_ref:
7232     dma_fence_get(dep);
7233 out_return_dep:
7234     mutex_unlock(&adev->enforce_isolation_mutex);
7235     return dep;
7236 }
7237 
7238 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
7239 {
7240     switch (adev->asic_type) {
7241 #ifdef CONFIG_DRM_AMDGPU_SI
7242     case CHIP_HAINAN:
7243 #endif
7244     case CHIP_TOPAZ:
7245         /* chips with no display hardware */
7246         return false;
7247 #ifdef CONFIG_DRM_AMDGPU_SI
7248     case CHIP_TAHITI:
7249     case CHIP_PITCAIRN:
7250     case CHIP_VERDE:
7251     case CHIP_OLAND:
7252 #endif
7253 #ifdef CONFIG_DRM_AMDGPU_CIK
7254     case CHIP_BONAIRE:
7255     case CHIP_HAWAII:
7256     case CHIP_KAVERI:
7257     case CHIP_KABINI:
7258     case CHIP_MULLINS:
7259 #endif
7260     case CHIP_TONGA:
7261     case CHIP_FIJI:
7262     case CHIP_POLARIS10:
7263     case CHIP_POLARIS11:
7264     case CHIP_POLARIS12:
7265     case CHIP_VEGAM:
7266     case CHIP_CARRIZO:
7267     case CHIP_STONEY:
7268         /* chips with display hardware */
7269         return true;
7270     default:
7271         /* IP discovery */
7272         if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
7273             (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
7274             return false;
7275         return true;
7276     }
7277 }
7278 
7279 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
7280             uint32_t inst, uint32_t reg_addr, char reg_name[],
7281             uint32_t expected_value, uint32_t mask)
7282 {
7283     uint32_t ret = 0;
7284     uint32_t old_ = 0;
7285     uint32_t tmp_ = RREG32(reg_addr);
7286     uint32_t loop = adev->usec_timeout;
7287 
7288     while ((tmp_ & (mask)) != (expected_value)) {
7289         if (old_ != tmp_) {
7290             loop = adev->usec_timeout;
7291             old_ = tmp_;
7292         } else
7293             udelay(1);
7294         tmp_ = RREG32(reg_addr);
7295         loop--;
7296         if (!loop) {
7297             DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
7298                       inst, reg_name, (uint32_t)expected_value,
7299                       (uint32_t)(tmp_ & (mask)));
7300             ret = -ETIMEDOUT;
7301             break;
7302         }
7303     }
7304     return ret;
7305 }
7306 
7307 ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
7308 {
7309     ssize_t size = 0;
7310 
7311     if (!ring || !ring->adev)
7312         return size;
7313 
7314     if (amdgpu_device_should_recover_gpu(ring->adev))
7315         size |= AMDGPU_RESET_TYPE_FULL;
7316 
7317     if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
7318         !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery)
7319         size |= AMDGPU_RESET_TYPE_SOFT_RESET;
7320 
7321     return size;
7322 }
7323 
7324 ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
7325 {
7326     ssize_t size = 0;
7327 
7328     if (supported_reset == 0) {
7329         size += sysfs_emit_at(buf, size, "unsupported");
7330         size += sysfs_emit_at(buf, size, "\n");
7331         return size;
7332 
7333     }
7334 
7335     if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
7336         size += sysfs_emit_at(buf, size, "soft ");
7337 
7338     if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
7339         size += sysfs_emit_at(buf, size, "queue ");
7340 
7341     if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)
7342         size += sysfs_emit_at(buf, size, "pipe ");
7343 
7344     if (supported_reset & AMDGPU_RESET_TYPE_FULL)
7345         size += sysfs_emit_at(buf, size, "full ");
7346 
7347     size += sysfs_emit_at(buf, size, "\n");
7348     return size;
7349 }
7350 
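/*
 * Illustrative sketch only: an IP block would typically expose the reset
 * types computed by the helpers above through a sysfs "show" callback.
 * The function and field names below are hypothetical and only demonstrate
 * the intended pattern:
 *
 *     static ssize_t amdgpu_gfx_get_reset_mask(struct device *dev,
 *                                              struct device_attribute *attr,
 *                                              char *buf)
 *     {
 *         struct drm_device *ddev = dev_get_drvdata(dev);
 *         struct amdgpu_device *adev = drm_to_adev(ddev);
 *
 *         if (!adev)
 *             return -ENODEV;
 *
 *         return amdgpu_show_reset_mask(buf, adev->gfx.gfx_supported_reset);
 *     }
 */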