/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */

#include <linux/aperture.h>
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/pci-p2pdma.h>
#include <linux/apple-gmux.h>

#include <drm/drm_atomic_helper.h>
#include <drm/drm_client_event.h>
#include <drm/drm_crtc_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/device.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"
#include "amdgpu_virt.h"
#include "amdgpu_dev_coredump.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

#if IS_ENABLED(CONFIG_X86)
#include <asm/intel-family.h>
#include <asm/cpu_device_id.h>
#endif

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
#define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
#define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
#define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)

#define AMDGPU_VBIOS_SKIP (1U << 0)
#define AMDGPU_VBIOS_OPTIONAL (1U << 1)

static const struct drm_driver amdgpu_kms_driver;

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"ALDEBARAN",
	"NAVI10",
	"CYAN_SKILLFISH",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"VANGOGH",
	"DIMGREY_CAVEFISH",
	"BEIGE_GOBY",
	"YELLOW_CARP",
	"IP DISCOVERY",
	"LAST",
};

#define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0)
/*
 * Default init level where all blocks are expected to be initialized. This is
 * the level of initialization expected by default and also after a full reset
 * of the device.
 */
struct amdgpu_init_level amdgpu_init_default = {
	.level = AMDGPU_INIT_LEVEL_DEFAULT,
	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

struct amdgpu_init_level amdgpu_init_recovery = {
	.level = AMDGPU_INIT_LEVEL_RESET_RECOVERY,
	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

/*
 * Minimal blocks needed to be initialized before a XGMI hive can be reset. This
 * is used for cases like reset on initialization where the entire hive needs to
 * be reset before first use.
 */
struct amdgpu_init_level amdgpu_init_minimal_xgmi = {
	.level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI,
	.hwini_ip_block_mask =
		BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) |
		BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) |
		BIT(AMD_IP_BLOCK_TYPE_PSP)
};

static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev,
					     enum amd_ip_block_type block)
{
	return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0;
}

void amdgpu_set_init_level(struct amdgpu_device *adev,
			   enum amdgpu_init_lvl_id lvl)
{
	switch (lvl) {
	case AMDGPU_INIT_LEVEL_MINIMAL_XGMI:
		adev->init_lvl = &amdgpu_init_minimal_xgmi;
		break;
	case AMDGPU_INIT_LEVEL_RESET_RECOVERY:
		adev->init_lvl = &amdgpu_init_recovery;
		break;
	case AMDGPU_INIT_LEVEL_DEFAULT:
		fallthrough;
	default:
		adev->init_lvl = &amdgpu_init_default;
		break;
	}
}

static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
				     void *data);

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
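 *
 * A typical way to read it from user space (the card index in the path is
 * illustrative and depends on the system):
 *
 *   cat /sys/class/drm/card0/device/pcie_replay_count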
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return sysfs_emit(buf, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, 0444,
		   amdgpu_device_get_pcie_replay_count, NULL);

static int amdgpu_device_attr_sysfs_init(struct amdgpu_device *adev)
{
	int ret = 0;

	if (!amdgpu_sriov_vf(adev))
		ret = sysfs_create_file(&adev->dev->kobj,
					&dev_attr_pcie_replay_count.attr);

	return ret;
}

static void amdgpu_device_attr_sysfs_fini(struct amdgpu_device *adev)
{
	if (!amdgpu_sriov_vf(adev))
		sysfs_remove_file(&adev->dev->kobj,
				  &dev_attr_pcie_replay_count.attr);
}

static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
					  const struct bin_attribute *attr, char *buf,
					  loff_t ppos, size_t count)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	ssize_t bytes_read;

	switch (ppos) {
	case AMDGPU_SYS_REG_STATE_XGMI:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_WAFL:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_PCIE:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR_1:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count);
		break;
	default:
		return -EINVAL;
	}

	return bytes_read;
}

static const BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL,
		      AMDGPU_SYS_REG_STATE_END);

int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev)
{
	int ret;

	if (!amdgpu_asic_get_reg_state_supported(adev))
		return 0;

	ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state);

	return ret;
}

void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
{
	if (!amdgpu_asic_get_reg_state_supported(adev))
		return;
	sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
}

int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block)
{
	int r;

	if (ip_block->version->funcs->suspend) {
		r = ip_block->version->funcs->suspend(ip_block);
		if (r) {
			dev_err(ip_block->adev->dev,
				"suspend of IP block <%s> failed %d\n",
				ip_block->version->funcs->name, r);
			return r;
		}
	}

	ip_block->status.hw = false;
	return 0;
}

int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block)
{
	int r;

	if (ip_block->version->funcs->resume) {
		r = ip_block->version->funcs->resume(ip_block);
		if (r) {
			dev_err(ip_block->adev->dev,
				"resume of IP block <%s> failed %d\n",
				ip_block->version->funcs->name, r);
			return r;
		}
	}

	ip_block->status.hw = true;
	return 0;
}

/**
 * DOC: board_info
 *
 * The amdgpu driver provides a sysfs API for giving board related
 * information.
 * It provides the form factor information in the format
 *
 *   type : form factor
 *
 * Possible form factor values
 *
 * - "cem"     - PCIE CEM card
 * - "oam"     - Open Compute Accelerator Module
 * - "unknown" - Not known
 *
 */

static ssize_t amdgpu_device_get_board_info(struct device *dev,
					    struct device_attribute *attr,
					    char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
	const char *pkg;

	if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
		pkg_type = adev->smuio.funcs->get_pkg_type(adev);

	switch (pkg_type) {
	case AMDGPU_PKG_TYPE_CEM:
		pkg = "cem";
		break;
	case AMDGPU_PKG_TYPE_OAM:
		pkg = "oam";
		break;
	default:
		pkg = "unknown";
		break;
	}

	return sysfs_emit(buf, "%s : %s\n", "type", pkg);
}

static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);

static struct attribute *amdgpu_board_attrs[] = {
	&dev_attr_board_info.attr,
	NULL,
};

static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
					     struct attribute *attr, int n)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	if (adev->flags & AMD_IS_APU)
		return 0;

	return attr->mode;
}

static const struct attribute_group amdgpu_board_attrs_group = {
	.attrs = amdgpu_board_attrs,
	.is_visible = amdgpu_board_attrs_is_visible
};

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);


/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE))
		return false;

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Return:
 * 1 if the device supports BACO;
 * 3 if the device supports MACO (only works if BACO is supported)
 * otherwise return 0.
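 *
 * The returned value behaves like a bitmask of the BACO_SUPPORT and
 * MACO_SUPPORT flags (1, or 3 when MACO is stacked on top of BACO), so a
 * caller can distinguish the modes roughly the way
 * amdgpu_device_detect_runtime_pm_mode() below does:
 *
 *   int s = amdgpu_device_supports_baco(dev);
 *
 *   if (s & MACO_SUPPORT)
 *           // BAMACO is usable
 *   else if (s == BACO_SUPPORT)
 *           // plain BACO only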
 */
int amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
{
	struct drm_device *dev;
	int bamaco_support;

	dev = adev_to_drm(adev);

	adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
	bamaco_support = amdgpu_device_supports_baco(dev);

	switch (amdgpu_runtime_pm) {
	case 2:
		if (bamaco_support & MACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
			dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
		} else if (bamaco_support == BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Requested mode BAMACO not available, falling back to BACO\n");
		}
		break;
	case 1:
		if (bamaco_support & BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Forcing BACO for runtime pm\n");
		}
		break;
	case -1:
	case -2:
		if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
			dev_info(adev->dev, "Using ATPX for runtime pm\n");
		} else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
			dev_info(adev->dev, "Using BOCO for runtime pm\n");
		} else {
			if (!bamaco_support)
				goto no_runtime_pm;

			switch (adev->asic_type) {
			case CHIP_VEGA20:
			case CHIP_ARCTURUS:
				/* BACO is not supported on vega20 and arcturus */
				break;
			case CHIP_VEGA10:
				/* enable BACO as runpm mode if noretry=0 */
				if (!adev->gmc.noretry)
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			default:
				/* enable BACO as runpm mode on CI+ */
				adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			}

			if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
				if (bamaco_support & MACO_SUPPORT) {
					adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
					dev_info(adev->dev, "Using BAMACO for runtime pm\n");
				} else {
					dev_info(adev->dev, "Using BACO for runtime pm\n");
				}
			}
		}
		break;
	case 0:
		dev_info(adev->dev, "runtime pm is manually disabled\n");
		break;
	default:
		break;
	}

no_runtime_pm:
	if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
		dev_info(adev->dev, "Runtime PM not available\n");
}
/**
 * amdgpu_device_supports_smart_shift - Is the device dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, @buf must point to a buffer of at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, @buf must point to a buffer of at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value means how many bytes have been transferred.
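 *
 * A short return (fewer bytes than @size) is not an error; it only means the
 * rest of the range is not CPU visible through the aperture. Callers fall
 * back to MM_INDEX/MM_DATA access for the remainder, as
 * amdgpu_device_vram_access() below does:
 *
 *   count = amdgpu_device_aper_access(adev, pos, buf, size, write);
 *   if (count < size)
 *           amdgpu_device_mm_access(adev, pos + count, buf + count,
 *                                   size - count, write);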
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, @buf must point to a buffer of at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the VRAM aperture to access VRAM first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of VRAM */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
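 *
 * Most code does not call this directly; it is normally reached through the
 * register access macros in amdgpu.h (RREG32() and friends), which pass the
 * @adev in scope, e.g. (register name purely illustrative):
 *
 *   tmp = RREG32(mmSOME_REGISTER);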
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}


/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
				uint32_t reg, uint32_t acc_flags,
				uint32_t xcc_id)
{
	uint32_t ret, rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	return ret;
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/**
 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 * @xcc_id: xcc accelerated compute core id
 *
 * This function is invoked only for the debugfs register access.
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v,
			     uint32_t xcc_id)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_xcc_wreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t v,
			    uint32_t acc_flags, uint32_t xcc_id)
{
	uint32_t rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, true,
							 &rlcg_flag)) {
			amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u32 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
				    u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	if (unlikely(!adev->nbio.funcs)) {
		pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
		pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
	} else {
		pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
		pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	}

	if (reg_addr >> 32) {
		if (unlikely(!adev->nbio.funcs))
			pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
		else
			pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	} else {
		pcie_index_hi = 0;
	}

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64 bit indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
				      u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
			pcie_index_hi * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r |= ((u64)readl(pcie_data_offset) << 32);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
				     u64 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64 bit indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
				       u64 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_rev_id - query device rev_id
 *
 * @adev: amdgpu_device pointer
 *
 * Return device rev_id
 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
{
	return adev->nbio.funcs->get_rev_id(adev);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

static uint32_t amdgpu_device_get_vbios_flags(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
		return AMDGPU_VBIOS_SKIP;

	if (hweight32(adev->aid_mask) && amdgpu_passthrough(adev))
		return AMDGPU_VBIOS_OPTIONAL;

	return 0;
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	uint32_t flags;
	bool optional;
	int ret;

	amdgpu_asic_pre_asic_init(adev);
	flags = amdgpu_device_get_vbios_flags(adev);
	optional = !!(flags & (AMDGPU_VBIOS_OPTIONAL | AMDGPU_VBIOS_SKIP));

	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
		amdgpu_psp_wait_for_bootloader(adev);
		if (optional && !adev->bios)
			return 0;

		ret = amdgpu_atomfirmware_asic_init(adev, true);
		return ret;
	} else {
		if (optional && !adev->bios)
			return 0;

		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
	}

	return 0;
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND/OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or an -error on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long flags, offset;

	spin_lock_irqsave(&adev->wb.lock, flags);
	offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	unsigned long flags;

	wb >>= 3;
	spin_lock_irqsave(&adev->wb.lock, flags);
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
	spin_unlock_irqrestore(&adev->wb.lock, flags);
}

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned int i;
	u16 cmd;
	int r;

	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
		return 0;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	if (!amdgpu_rebar)
		return 0;

	/* resizing on Dell G5 SE platforms causes problems with runtime pm */
	if ((amdgpu_runtime_pm != 0) &&
	    adev->pdev->vendor == PCI_VENDOR_ID_ATI &&
	    adev->pdev->device == 0x731f &&
	    adev->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL)
		return 0;

	/* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
	if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
		DRM_WARN("System can't access extended configuration space, please check!!\n");

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

/*
 * GPU helper functions.
 */
/**
 * amdgpu_device_need_post - check if the hw needs post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if needed or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg, flags;

	if (amdgpu_sriov_vf(adev))
		return false;

	flags = amdgpu_device_get_vbios_flags(adev);
	if (flags & AMDGPU_VBIOS_SKIP)
		return false;
	if ((flags & AMDGPU_VBIOS_OPTIONAL) && !adev->bios)
		return false;

	if (amdgpu_passthrough(adev)) {
		/* For FIJI: in the whole GPU pass-through virtualization case, after a VM
		 * reboot some old SMC firmware still needs the driver to do a vPost,
		 * otherwise the GPU hangs. SMC firmware versions above 22.15 don't have
		 * this flaw, so we force vPost for SMC versions below 22.15.
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			release_firmware(adev->pm.fw);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/*
 * Check whether seamless boot is supported.
 *
 * So far we only support seamless boot on DCE 3.0 or later.
 * If users report that it works on older ASICS as well, we may
 * loosen this.
 */
bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
{
	switch (amdgpu_seamless) {
	case -1:
		break;
	case 1:
		return true;
	case 0:
		return false;
	default:
		DRM_ERROR("Invalid value for amdgpu.seamless: %d\n",
			  amdgpu_seamless);
		return false;
	}

	if (!(adev->flags & AMD_IS_APU))
		return false;

	if (adev->mman.keep_stolen_vga_memory)
		return false;

	return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
}

/*
 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
 * don't support dynamic speed switching. Until we have confirmation from Intel
 * that a specific host supports it, it's safer that we keep it disabled for all.
 *
 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
 */
static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	/* eGPU change speeds based on USB4 fabric conditions */
	if (dev_is_removable(adev->dev))
		return true;

	if (c->x86_vendor == X86_VENDOR_INTEL)
		return false;
#endif
	return true;
}

static bool amdgpu_device_aspm_support_quirk(struct amdgpu_device *adev)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	if (!(amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 0) ||
	      amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 1)))
		return false;

	if (c->x86 == 6 &&
	    adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5) {
		switch (c->x86_model) {
		case VFM_MODEL(INTEL_ALDERLAKE):
		case VFM_MODEL(INTEL_ALDERLAKE_L):
		case VFM_MODEL(INTEL_RAPTORLAKE):
		case VFM_MODEL(INTEL_RAPTORLAKE_P):
		case VFM_MODEL(INTEL_RAPTORLAKE_S):
			return true;
		default:
			return false;
		}
	} else {
		return false;
	}
#else
	return false;
#endif
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
	switch (amdgpu_aspm) {
	case -1:
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	if (adev->flags & AMD_IS_APU)
		return false;
	if (amdgpu_device_aspm_support_quirk(adev))
		return false;
	return pcie_aspm_enabled(adev->pdev);
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
						 bool state)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));

	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines number of bits in page table versus page directory,
 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
 * page table and the remaining bits are in the page directory.
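 *
 * Worked example (using the 4KB page size above): with the minimum block
 * size of 9, one page table spans 2^9 pages, i.e. 2^(9+12) bytes = 2MB of
 * address space per page directory entry.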
1967 */ 1968 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1969 { 1970 /* defines number of bits in page table versus page directory, 1971 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1972 * page table and the remaining bits are in the page directory 1973 */ 1974 if (amdgpu_vm_block_size == -1) 1975 return; 1976 1977 if (amdgpu_vm_block_size < 9) { 1978 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1979 amdgpu_vm_block_size); 1980 amdgpu_vm_block_size = -1; 1981 } 1982 } 1983 1984 /** 1985 * amdgpu_device_check_vm_size - validate the vm size 1986 * 1987 * @adev: amdgpu_device pointer 1988 * 1989 * Validates the vm size in GB specified via module parameter. 1990 * The VM size is the size of the GPU virtual memory space in GB. 1991 */ 1992 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1993 { 1994 /* no need to check the default value */ 1995 if (amdgpu_vm_size == -1) 1996 return; 1997 1998 if (amdgpu_vm_size < 1) { 1999 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 2000 amdgpu_vm_size); 2001 amdgpu_vm_size = -1; 2002 } 2003 } 2004 2005 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 2006 { 2007 struct sysinfo si; 2008 bool is_os_64 = (sizeof(void *) == 8); 2009 uint64_t total_memory; 2010 uint64_t dram_size_seven_GB = 0x1B8000000; 2011 uint64_t dram_size_three_GB = 0xB8000000; 2012 2013 if (amdgpu_smu_memory_pool_size == 0) 2014 return; 2015 2016 if (!is_os_64) { 2017 DRM_WARN("Not 64-bit OS, feature not supported\n"); 2018 goto def_value; 2019 } 2020 si_meminfo(&si); 2021 total_memory = (uint64_t)si.totalram * si.mem_unit; 2022 2023 if ((amdgpu_smu_memory_pool_size == 1) || 2024 (amdgpu_smu_memory_pool_size == 2)) { 2025 if (total_memory < dram_size_three_GB) 2026 goto def_value1; 2027 } else if ((amdgpu_smu_memory_pool_size == 4) || 2028 (amdgpu_smu_memory_pool_size == 8)) { 2029 if (total_memory < dram_size_seven_GB) 2030 goto def_value1; 2031 } else { 2032 DRM_WARN("Smu memory pool size not supported\n"); 2033 goto def_value; 2034 } 2035 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 2036 2037 return; 2038 2039 def_value1: 2040 DRM_WARN("No enough system memory\n"); 2041 def_value: 2042 adev->pm.smu_prv_buffer_size = 0; 2043 } 2044 2045 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 2046 { 2047 if (!(adev->flags & AMD_IS_APU) || 2048 adev->asic_type < CHIP_RAVEN) 2049 return 0; 2050 2051 switch (adev->asic_type) { 2052 case CHIP_RAVEN: 2053 if (adev->pdev->device == 0x15dd) 2054 adev->apu_flags |= AMD_APU_IS_RAVEN; 2055 if (adev->pdev->device == 0x15d8) 2056 adev->apu_flags |= AMD_APU_IS_PICASSO; 2057 break; 2058 case CHIP_RENOIR: 2059 if ((adev->pdev->device == 0x1636) || 2060 (adev->pdev->device == 0x164c)) 2061 adev->apu_flags |= AMD_APU_IS_RENOIR; 2062 else 2063 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 2064 break; 2065 case CHIP_VANGOGH: 2066 adev->apu_flags |= AMD_APU_IS_VANGOGH; 2067 break; 2068 case CHIP_YELLOW_CARP: 2069 break; 2070 case CHIP_CYAN_SKILLFISH: 2071 if ((adev->pdev->device == 0x13FE) || 2072 (adev->pdev->device == 0x143F)) 2073 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 2074 break; 2075 default: 2076 break; 2077 } 2078 2079 return 0; 2080 } 2081 2082 /** 2083 * amdgpu_device_check_arguments - validate module params 2084 * 2085 * @adev: amdgpu_device pointer 2086 * 2087 * Validates certain module parameters and updates 2088 * the associated values used by the driver (all asics). 
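 * Invalid values are clamped or reset to their defaults.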
2089 */ 2090 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 2091 { 2092 int i; 2093 2094 if (amdgpu_sched_jobs < 4) { 2095 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 2096 amdgpu_sched_jobs); 2097 amdgpu_sched_jobs = 4; 2098 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 2099 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 2100 amdgpu_sched_jobs); 2101 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 2102 } 2103 2104 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 2105 /* gart size must be greater or equal to 32M */ 2106 dev_warn(adev->dev, "gart size (%d) too small\n", 2107 amdgpu_gart_size); 2108 amdgpu_gart_size = -1; 2109 } 2110 2111 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 2112 /* gtt size must be greater or equal to 32M */ 2113 dev_warn(adev->dev, "gtt size (%d) too small\n", 2114 amdgpu_gtt_size); 2115 amdgpu_gtt_size = -1; 2116 } 2117 2118 /* valid range is between 4 and 9 inclusive */ 2119 if (amdgpu_vm_fragment_size != -1 && 2120 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 2121 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 2122 amdgpu_vm_fragment_size = -1; 2123 } 2124 2125 if (amdgpu_sched_hw_submission < 2) { 2126 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 2127 amdgpu_sched_hw_submission); 2128 amdgpu_sched_hw_submission = 2; 2129 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 2130 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 2131 amdgpu_sched_hw_submission); 2132 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 2133 } 2134 2135 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 2136 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 2137 amdgpu_reset_method = -1; 2138 } 2139 2140 amdgpu_device_check_smu_prv_buffer_size(adev); 2141 2142 amdgpu_device_check_vm_size(adev); 2143 2144 amdgpu_device_check_block_size(adev); 2145 2146 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 2147 2148 for (i = 0; i < MAX_XCP; i++) { 2149 switch (amdgpu_enforce_isolation) { 2150 case -1: 2151 case 0: 2152 default: 2153 /* disable */ 2154 adev->enforce_isolation[i] = AMDGPU_ENFORCE_ISOLATION_DISABLE; 2155 break; 2156 case 1: 2157 /* enable */ 2158 adev->enforce_isolation[i] = 2159 AMDGPU_ENFORCE_ISOLATION_ENABLE; 2160 break; 2161 case 2: 2162 /* enable legacy mode */ 2163 adev->enforce_isolation[i] = 2164 AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY; 2165 break; 2166 case 3: 2167 /* enable only process isolation without submitting cleaner shader */ 2168 adev->enforce_isolation[i] = 2169 AMDGPU_ENFORCE_ISOLATION_NO_CLEANER_SHADER; 2170 break; 2171 } 2172 } 2173 2174 return 0; 2175 } 2176 2177 /** 2178 * amdgpu_switcheroo_set_state - set switcheroo state 2179 * 2180 * @pdev: pci dev pointer 2181 * @state: vga_switcheroo state 2182 * 2183 * Callback for the switcheroo driver. Suspends or resumes 2184 * the asics before or after it is powered up using ACPI methods. 
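 * Note that the OFF transition is ignored on PX platforms.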
2185 */ 2186 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 2187 enum vga_switcheroo_state state) 2188 { 2189 struct drm_device *dev = pci_get_drvdata(pdev); 2190 int r; 2191 2192 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 2193 return; 2194 2195 if (state == VGA_SWITCHEROO_ON) { 2196 pr_info("switched on\n"); 2197 /* don't suspend or resume card normally */ 2198 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2199 2200 pci_set_power_state(pdev, PCI_D0); 2201 amdgpu_device_load_pci_state(pdev); 2202 r = pci_enable_device(pdev); 2203 if (r) 2204 DRM_WARN("pci_enable_device failed (%d)\n", r); 2205 amdgpu_device_resume(dev, true); 2206 2207 dev->switch_power_state = DRM_SWITCH_POWER_ON; 2208 } else { 2209 pr_info("switched off\n"); 2210 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2211 amdgpu_device_prepare(dev); 2212 amdgpu_device_suspend(dev, true); 2213 amdgpu_device_cache_pci_state(pdev); 2214 /* Shut down the device */ 2215 pci_disable_device(pdev); 2216 pci_set_power_state(pdev, PCI_D3cold); 2217 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 2218 } 2219 } 2220 2221 /** 2222 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 2223 * 2224 * @pdev: pci dev pointer 2225 * 2226 * Callback for the switcheroo driver. Check of the switcheroo 2227 * state can be changed. 2228 * Returns true if the state can be changed, false if not. 2229 */ 2230 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 2231 { 2232 struct drm_device *dev = pci_get_drvdata(pdev); 2233 2234 /* 2235 * FIXME: open_count is protected by drm_global_mutex but that would lead to 2236 * locking inversion with the driver load path. And the access here is 2237 * completely racy anyway. So don't bother with locking for now. 2238 */ 2239 return atomic_read(&dev->open_count) == 0; 2240 } 2241 2242 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 2243 .set_gpu_state = amdgpu_switcheroo_set_state, 2244 .reprobe = NULL, 2245 .can_switch = amdgpu_switcheroo_can_switch, 2246 }; 2247 2248 /** 2249 * amdgpu_device_ip_set_clockgating_state - set the CG state 2250 * 2251 * @dev: amdgpu_device pointer 2252 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2253 * @state: clockgating state (gate or ungate) 2254 * 2255 * Sets the requested clockgating state for all instances of 2256 * the hardware IP specified. 2257 * Returns the error code from the last instance. 2258 */ 2259 int amdgpu_device_ip_set_clockgating_state(void *dev, 2260 enum amd_ip_block_type block_type, 2261 enum amd_clockgating_state state) 2262 { 2263 struct amdgpu_device *adev = dev; 2264 int i, r = 0; 2265 2266 for (i = 0; i < adev->num_ip_blocks; i++) { 2267 if (!adev->ip_blocks[i].status.valid) 2268 continue; 2269 if (adev->ip_blocks[i].version->type != block_type) 2270 continue; 2271 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 2272 continue; 2273 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 2274 &adev->ip_blocks[i], state); 2275 if (r) 2276 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 2277 adev->ip_blocks[i].version->funcs->name, r); 2278 } 2279 return r; 2280 } 2281 2282 /** 2283 * amdgpu_device_ip_set_powergating_state - set the PG state 2284 * 2285 * @dev: amdgpu_device pointer 2286 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2287 * @state: powergating state (gate or ungate) 2288 * 2289 * Sets the requested powergating state for all instances of 2290 * the hardware IP specified. 
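 * Instances that are not valid or do not implement the callback are skipped.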
2291 * Returns the error code from the last instance. 2292 */ 2293 int amdgpu_device_ip_set_powergating_state(void *dev, 2294 enum amd_ip_block_type block_type, 2295 enum amd_powergating_state state) 2296 { 2297 struct amdgpu_device *adev = dev; 2298 int i, r = 0; 2299 2300 for (i = 0; i < adev->num_ip_blocks; i++) { 2301 if (!adev->ip_blocks[i].status.valid) 2302 continue; 2303 if (adev->ip_blocks[i].version->type != block_type) 2304 continue; 2305 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 2306 continue; 2307 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 2308 &adev->ip_blocks[i], state); 2309 if (r) 2310 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 2311 adev->ip_blocks[i].version->funcs->name, r); 2312 } 2313 return r; 2314 } 2315 2316 /** 2317 * amdgpu_device_ip_get_clockgating_state - get the CG state 2318 * 2319 * @adev: amdgpu_device pointer 2320 * @flags: clockgating feature flags 2321 * 2322 * Walks the list of IPs on the device and updates the clockgating 2323 * flags for each IP. 2324 * Updates @flags with the feature flags for each hardware IP where 2325 * clockgating is enabled. 2326 */ 2327 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 2328 u64 *flags) 2329 { 2330 int i; 2331 2332 for (i = 0; i < adev->num_ip_blocks; i++) { 2333 if (!adev->ip_blocks[i].status.valid) 2334 continue; 2335 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 2336 adev->ip_blocks[i].version->funcs->get_clockgating_state( 2337 &adev->ip_blocks[i], flags); 2338 } 2339 } 2340 2341 /** 2342 * amdgpu_device_ip_wait_for_idle - wait for idle 2343 * 2344 * @adev: amdgpu_device pointer 2345 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2346 * 2347 * Waits for the request hardware IP to be idle. 2348 * Returns 0 for success or a negative error code on failure. 2349 */ 2350 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 2351 enum amd_ip_block_type block_type) 2352 { 2353 int i, r; 2354 2355 for (i = 0; i < adev->num_ip_blocks; i++) { 2356 if (!adev->ip_blocks[i].status.valid) 2357 continue; 2358 if (adev->ip_blocks[i].version->type == block_type) { 2359 if (adev->ip_blocks[i].version->funcs->wait_for_idle) { 2360 r = adev->ip_blocks[i].version->funcs->wait_for_idle( 2361 &adev->ip_blocks[i]); 2362 if (r) 2363 return r; 2364 } 2365 break; 2366 } 2367 } 2368 return 0; 2369 2370 } 2371 2372 /** 2373 * amdgpu_device_ip_is_valid - is the hardware IP enabled 2374 * 2375 * @adev: amdgpu_device pointer 2376 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2377 * 2378 * Check if the hardware IP is enable or not. 2379 * Returns true if it the IP is enable, false if not. 2380 */ 2381 bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev, 2382 enum amd_ip_block_type block_type) 2383 { 2384 int i; 2385 2386 for (i = 0; i < adev->num_ip_blocks; i++) { 2387 if (adev->ip_blocks[i].version->type == block_type) 2388 return adev->ip_blocks[i].status.valid; 2389 } 2390 return false; 2391 2392 } 2393 2394 /** 2395 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 2396 * 2397 * @adev: amdgpu_device pointer 2398 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 2399 * 2400 * Returns a pointer to the hardware IP block structure 2401 * if it exists for the asic, otherwise NULL. 
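 * Only the first matching block is returned.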
2402 */ 2403 struct amdgpu_ip_block * 2404 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 2405 enum amd_ip_block_type type) 2406 { 2407 int i; 2408 2409 for (i = 0; i < adev->num_ip_blocks; i++) 2410 if (adev->ip_blocks[i].version->type == type) 2411 return &adev->ip_blocks[i]; 2412 2413 return NULL; 2414 } 2415 2416 /** 2417 * amdgpu_device_ip_block_version_cmp 2418 * 2419 * @adev: amdgpu_device pointer 2420 * @type: enum amd_ip_block_type 2421 * @major: major version 2422 * @minor: minor version 2423 * 2424 * return 0 if equal or greater 2425 * return 1 if smaller or the ip_block doesn't exist 2426 */ 2427 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 2428 enum amd_ip_block_type type, 2429 u32 major, u32 minor) 2430 { 2431 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 2432 2433 if (ip_block && ((ip_block->version->major > major) || 2434 ((ip_block->version->major == major) && 2435 (ip_block->version->minor >= minor)))) 2436 return 0; 2437 2438 return 1; 2439 } 2440 2441 /** 2442 * amdgpu_device_ip_block_add 2443 * 2444 * @adev: amdgpu_device pointer 2445 * @ip_block_version: pointer to the IP to add 2446 * 2447 * Adds the IP block driver information to the collection of IPs 2448 * on the asic. 2449 */ 2450 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 2451 const struct amdgpu_ip_block_version *ip_block_version) 2452 { 2453 if (!ip_block_version) 2454 return -EINVAL; 2455 2456 switch (ip_block_version->type) { 2457 case AMD_IP_BLOCK_TYPE_VCN: 2458 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2459 return 0; 2460 break; 2461 case AMD_IP_BLOCK_TYPE_JPEG: 2462 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2463 return 0; 2464 break; 2465 default: 2466 break; 2467 } 2468 2469 dev_info(adev->dev, "detected ip block number %d <%s>\n", 2470 adev->num_ip_blocks, ip_block_version->funcs->name); 2471 2472 adev->ip_blocks[adev->num_ip_blocks].adev = adev; 2473 2474 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2475 2476 return 0; 2477 } 2478 2479 /** 2480 * amdgpu_device_enable_virtual_display - enable virtual display feature 2481 * 2482 * @adev: amdgpu_device pointer 2483 * 2484 * Enabled the virtual display feature if the user has enabled it via 2485 * the module parameter virtual_display. This feature provides a virtual 2486 * display hardware on headless boards or in virtualized environments. 2487 * This function parses and validates the configuration string specified by 2488 * the user and configures the virtual display configuration (number of 2489 * virtual connectors, crtcs, etc.) specified. 
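 * The requested number of virtual crtcs is clamped to the range 1-6.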
2490 */ 2491 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2492 { 2493 adev->enable_virtual_display = false; 2494 2495 if (amdgpu_virtual_display) { 2496 const char *pci_address_name = pci_name(adev->pdev); 2497 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2498 2499 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2500 pciaddstr_tmp = pciaddstr; 2501 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2502 pciaddname = strsep(&pciaddname_tmp, ","); 2503 if (!strcmp("all", pciaddname) 2504 || !strcmp(pci_address_name, pciaddname)) { 2505 long num_crtc; 2506 int res = -1; 2507 2508 adev->enable_virtual_display = true; 2509 2510 if (pciaddname_tmp) 2511 res = kstrtol(pciaddname_tmp, 10, 2512 &num_crtc); 2513 2514 if (!res) { 2515 if (num_crtc < 1) 2516 num_crtc = 1; 2517 if (num_crtc > 6) 2518 num_crtc = 6; 2519 adev->mode_info.num_crtc = num_crtc; 2520 } else { 2521 adev->mode_info.num_crtc = 1; 2522 } 2523 break; 2524 } 2525 } 2526 2527 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2528 amdgpu_virtual_display, pci_address_name, 2529 adev->enable_virtual_display, adev->mode_info.num_crtc); 2530 2531 kfree(pciaddstr); 2532 } 2533 } 2534 2535 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2536 { 2537 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2538 adev->mode_info.num_crtc = 1; 2539 adev->enable_virtual_display = true; 2540 DRM_INFO("virtual_display:%d, num_crtc:%d\n", 2541 adev->enable_virtual_display, adev->mode_info.num_crtc); 2542 } 2543 } 2544 2545 /** 2546 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2547 * 2548 * @adev: amdgpu_device pointer 2549 * 2550 * Parses the asic configuration parameters specified in the gpu info 2551 * firmware and makes them available to the driver for use in configuring 2552 * the asic. 2553 * Returns 0 on success, -EINVAL on failure. 2554 */ 2555 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2556 { 2557 const char *chip_name; 2558 int err; 2559 const struct gpu_info_firmware_header_v1_0 *hdr; 2560 2561 adev->firmware.gpu_info_fw = NULL; 2562 2563 if (adev->mman.discovery_bin) 2564 return 0; 2565 2566 switch (adev->asic_type) { 2567 default: 2568 return 0; 2569 case CHIP_VEGA10: 2570 chip_name = "vega10"; 2571 break; 2572 case CHIP_VEGA12: 2573 chip_name = "vega12"; 2574 break; 2575 case CHIP_RAVEN: 2576 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2577 chip_name = "raven2"; 2578 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2579 chip_name = "picasso"; 2580 else 2581 chip_name = "raven"; 2582 break; 2583 case CHIP_ARCTURUS: 2584 chip_name = "arcturus"; 2585 break; 2586 case CHIP_NAVI12: 2587 chip_name = "navi12"; 2588 break; 2589 } 2590 2591 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, 2592 AMDGPU_UCODE_OPTIONAL, 2593 "amdgpu/%s_gpu_info.bin", chip_name); 2594 if (err) { 2595 dev_err(adev->dev, 2596 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n", 2597 chip_name); 2598 goto out; 2599 } 2600 2601 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2602 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2603 2604 switch (hdr->version_major) { 2605 case 1: 2606 { 2607 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2608 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2609 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2610 2611 /* 2612 * Should be dropped when DAL no longer needs it. 
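 * (NAVI12 only needs the SOC bounding box from this firmware.)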
 */
		if (adev->asic_type == CHIP_NAVI12)
			goto parse_soc_bounding_box;

		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
		adev->gfx.config.max_texture_channel_caches =
			le32_to_cpu(gpu_info_fw->gc_num_tccs);
		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
		adev->gfx.config.double_offchip_lds_buf =
			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
		adev->gfx.cu_info.max_waves_per_simd =
			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
		adev->gfx.cu_info.max_scratch_slots_per_cu =
			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
		if (hdr->version_minor >= 1) {
			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
					le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->gfx.config.num_sc_per_sh =
				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
			adev->gfx.config.num_packer_per_sc =
				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
		}

parse_soc_bounding_box:
		/*
		 * soc bounding box info is not integrated in the discovery table,
		 * so we always need to parse it from the gpu info firmware if needed.
		 */
		if (hdr->version_minor == 2) {
			const struct gpu_info_firmware_v1_2 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
					le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
		}
		break;
	}
	default:
		dev_err(adev->dev,
			"Unsupported gpu_info table %d\n", hdr->header.ucode_version);
		err = -EINVAL;
		goto out;
	}
out:
	return err;
}

/**
 * amdgpu_device_ip_early_init - run early init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Early initialization pass for hardware IPs. The hardware IPs that make
 * up each asic are discovered and each IP's early_init callback is run. This
 * is the first stage in initializing the asic.
 * Returns 0 on success, negative error code on failure.
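 * IP blocks masked out via amdgpu_ip_block_mask are marked as invalid here.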
2677 */ 2678 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2679 { 2680 struct amdgpu_ip_block *ip_block; 2681 struct pci_dev *parent; 2682 bool total, skip_bios; 2683 uint32_t bios_flags; 2684 int i, r; 2685 2686 amdgpu_device_enable_virtual_display(adev); 2687 2688 if (amdgpu_sriov_vf(adev)) { 2689 r = amdgpu_virt_request_full_gpu(adev, true); 2690 if (r) 2691 return r; 2692 } 2693 2694 switch (adev->asic_type) { 2695 #ifdef CONFIG_DRM_AMDGPU_SI 2696 case CHIP_VERDE: 2697 case CHIP_TAHITI: 2698 case CHIP_PITCAIRN: 2699 case CHIP_OLAND: 2700 case CHIP_HAINAN: 2701 adev->family = AMDGPU_FAMILY_SI; 2702 r = si_set_ip_blocks(adev); 2703 if (r) 2704 return r; 2705 break; 2706 #endif 2707 #ifdef CONFIG_DRM_AMDGPU_CIK 2708 case CHIP_BONAIRE: 2709 case CHIP_HAWAII: 2710 case CHIP_KAVERI: 2711 case CHIP_KABINI: 2712 case CHIP_MULLINS: 2713 if (adev->flags & AMD_IS_APU) 2714 adev->family = AMDGPU_FAMILY_KV; 2715 else 2716 adev->family = AMDGPU_FAMILY_CI; 2717 2718 r = cik_set_ip_blocks(adev); 2719 if (r) 2720 return r; 2721 break; 2722 #endif 2723 case CHIP_TOPAZ: 2724 case CHIP_TONGA: 2725 case CHIP_FIJI: 2726 case CHIP_POLARIS10: 2727 case CHIP_POLARIS11: 2728 case CHIP_POLARIS12: 2729 case CHIP_VEGAM: 2730 case CHIP_CARRIZO: 2731 case CHIP_STONEY: 2732 if (adev->flags & AMD_IS_APU) 2733 adev->family = AMDGPU_FAMILY_CZ; 2734 else 2735 adev->family = AMDGPU_FAMILY_VI; 2736 2737 r = vi_set_ip_blocks(adev); 2738 if (r) 2739 return r; 2740 break; 2741 default: 2742 r = amdgpu_discovery_set_ip_blocks(adev); 2743 if (r) 2744 return r; 2745 break; 2746 } 2747 2748 /* Check for IP version 9.4.3 with A0 hardware */ 2749 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) && 2750 !amdgpu_device_get_rev_id(adev)) { 2751 dev_err(adev->dev, "Unsupported A0 hardware\n"); 2752 return -ENODEV; /* device unsupported - no device error */ 2753 } 2754 2755 if (amdgpu_has_atpx() && 2756 (amdgpu_is_atpx_hybrid() || 2757 amdgpu_has_atpx_dgpu_power_cntl()) && 2758 ((adev->flags & AMD_IS_APU) == 0) && 2759 !dev_is_removable(&adev->pdev->dev)) 2760 adev->flags |= AMD_IS_PX; 2761 2762 if (!(adev->flags & AMD_IS_APU)) { 2763 parent = pcie_find_root_port(adev->pdev); 2764 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2765 } 2766 2767 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2768 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2769 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2770 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2771 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2772 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2773 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2774 2775 total = true; 2776 for (i = 0; i < adev->num_ip_blocks; i++) { 2777 ip_block = &adev->ip_blocks[i]; 2778 2779 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2780 DRM_WARN("disabled ip block: %d <%s>\n", 2781 i, adev->ip_blocks[i].version->funcs->name); 2782 adev->ip_blocks[i].status.valid = false; 2783 } else if (ip_block->version->funcs->early_init) { 2784 r = ip_block->version->funcs->early_init(ip_block); 2785 if (r == -ENOENT) { 2786 adev->ip_blocks[i].status.valid = false; 2787 } else if (r) { 2788 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2789 adev->ip_blocks[i].version->funcs->name, r); 2790 total = false; 2791 } else { 2792 adev->ip_blocks[i].status.valid = true; 2793 } 2794 } else { 2795 adev->ip_blocks[i].status.valid = true; 2796 } 2797 /* get the vbios after the asic_funcs are set up */ 2798 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2799 r = amdgpu_device_parse_gpu_info_fw(adev); 2800 if (r) 2801 return r; 2802 2803 bios_flags = amdgpu_device_get_vbios_flags(adev); 2804 skip_bios = !!(bios_flags & AMDGPU_VBIOS_SKIP); 2805 /* Read BIOS */ 2806 if (!skip_bios) { 2807 bool optional = 2808 !!(bios_flags & AMDGPU_VBIOS_OPTIONAL); 2809 if (!amdgpu_get_bios(adev) && !optional) 2810 return -EINVAL; 2811 2812 if (optional && !adev->bios) 2813 dev_info( 2814 adev->dev, 2815 "VBIOS image optional, proceeding without VBIOS image"); 2816 2817 if (adev->bios) { 2818 r = amdgpu_atombios_init(adev); 2819 if (r) { 2820 dev_err(adev->dev, 2821 "amdgpu_atombios_init failed\n"); 2822 amdgpu_vf_error_put( 2823 adev, 2824 AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 2825 0, 0); 2826 return r; 2827 } 2828 } 2829 } 2830 2831 /*get pf2vf msg info at it's earliest time*/ 2832 if (amdgpu_sriov_vf(adev)) 2833 amdgpu_virt_init_data_exchange(adev); 2834 2835 } 2836 } 2837 if (!total) 2838 return -ENODEV; 2839 2840 if (adev->gmc.xgmi.supported) 2841 amdgpu_xgmi_early_init(adev); 2842 2843 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 2844 if (ip_block->status.valid != false) 2845 amdgpu_amdkfd_device_probe(adev); 2846 2847 adev->cg_flags &= amdgpu_cg_mask; 2848 adev->pg_flags &= amdgpu_pg_mask; 2849 2850 return 0; 2851 } 2852 2853 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2854 { 2855 int i, r; 2856 2857 for (i = 0; i < adev->num_ip_blocks; i++) { 2858 if (!adev->ip_blocks[i].status.sw) 2859 continue; 2860 if (adev->ip_blocks[i].status.hw) 2861 continue; 2862 if (!amdgpu_ip_member_of_hwini( 2863 adev, adev->ip_blocks[i].version->type)) 2864 continue; 2865 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2866 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2867 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2868 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2869 if (r) { 2870 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2871 adev->ip_blocks[i].version->funcs->name, r); 2872 return r; 2873 } 2874 adev->ip_blocks[i].status.hw = true; 2875 } 2876 } 2877 2878 return 0; 2879 } 2880 
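/**
 * amdgpu_device_ip_hw_init_phase2 - run hw init for the remaining IP blocks
 *
 * @adev: amdgpu_device pointer
 *
 * Runs the hw_init callback for each IP block that has completed sw_init,
 * is part of the current hw init mask and was not already brought up in
 * phase 1 or during firmware loading.
 * Returns 0 on success, negative error code on failure.
 */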
2881 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2882 { 2883 int i, r; 2884 2885 for (i = 0; i < adev->num_ip_blocks; i++) { 2886 if (!adev->ip_blocks[i].status.sw) 2887 continue; 2888 if (adev->ip_blocks[i].status.hw) 2889 continue; 2890 if (!amdgpu_ip_member_of_hwini( 2891 adev, adev->ip_blocks[i].version->type)) 2892 continue; 2893 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2894 if (r) { 2895 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2896 adev->ip_blocks[i].version->funcs->name, r); 2897 return r; 2898 } 2899 adev->ip_blocks[i].status.hw = true; 2900 } 2901 2902 return 0; 2903 } 2904 2905 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2906 { 2907 int r = 0; 2908 int i; 2909 uint32_t smu_version; 2910 2911 if (adev->asic_type >= CHIP_VEGA10) { 2912 for (i = 0; i < adev->num_ip_blocks; i++) { 2913 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2914 continue; 2915 2916 if (!amdgpu_ip_member_of_hwini(adev, 2917 AMD_IP_BLOCK_TYPE_PSP)) 2918 break; 2919 2920 if (!adev->ip_blocks[i].status.sw) 2921 continue; 2922 2923 /* no need to do the fw loading again if already done*/ 2924 if (adev->ip_blocks[i].status.hw == true) 2925 break; 2926 2927 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2928 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 2929 if (r) 2930 return r; 2931 } else { 2932 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2933 if (r) { 2934 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2935 adev->ip_blocks[i].version->funcs->name, r); 2936 return r; 2937 } 2938 adev->ip_blocks[i].status.hw = true; 2939 } 2940 break; 2941 } 2942 } 2943 2944 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2945 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2946 2947 return r; 2948 } 2949 2950 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2951 { 2952 struct drm_sched_init_args args = { 2953 .ops = &amdgpu_sched_ops, 2954 .num_rqs = DRM_SCHED_PRIORITY_COUNT, 2955 .timeout_wq = adev->reset_domain->wq, 2956 .dev = adev->dev, 2957 }; 2958 long timeout; 2959 int r, i; 2960 2961 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2962 struct amdgpu_ring *ring = adev->rings[i]; 2963 2964 /* No need to setup the GPU scheduler for rings that don't need it */ 2965 if (!ring || ring->no_scheduler) 2966 continue; 2967 2968 switch (ring->funcs->type) { 2969 case AMDGPU_RING_TYPE_GFX: 2970 timeout = adev->gfx_timeout; 2971 break; 2972 case AMDGPU_RING_TYPE_COMPUTE: 2973 timeout = adev->compute_timeout; 2974 break; 2975 case AMDGPU_RING_TYPE_SDMA: 2976 timeout = adev->sdma_timeout; 2977 break; 2978 default: 2979 timeout = adev->video_timeout; 2980 break; 2981 } 2982 2983 args.timeout = timeout; 2984 args.credit_limit = ring->num_hw_submission; 2985 args.score = ring->sched_score; 2986 args.name = ring->name; 2987 2988 r = drm_sched_init(&ring->sched, &args); 2989 if (r) { 2990 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2991 ring->name); 2992 return r; 2993 } 2994 r = amdgpu_uvd_entity_init(adev, ring); 2995 if (r) { 2996 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n", 2997 ring->name); 2998 return r; 2999 } 3000 r = amdgpu_vce_entity_init(adev, ring); 3001 if (r) { 3002 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n", 3003 ring->name); 3004 return r; 3005 } 3006 } 3007 3008 amdgpu_xcp_update_partition_sched_list(adev); 3009 3010 return 0; 3011 } 3012 3013 3014 /** 3015 * amdgpu_device_ip_init - run init for hardware IPs 
3016 * 3017 * @adev: amdgpu_device pointer 3018 * 3019 * Main initialization pass for hardware IPs. The list of all the hardware 3020 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 3021 * are run. sw_init initializes the software state associated with each IP 3022 * and hw_init initializes the hardware associated with each IP. 3023 * Returns 0 on success, negative error code on failure. 3024 */ 3025 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 3026 { 3027 bool init_badpage; 3028 int i, r; 3029 3030 r = amdgpu_ras_init(adev); 3031 if (r) 3032 return r; 3033 3034 for (i = 0; i < adev->num_ip_blocks; i++) { 3035 if (!adev->ip_blocks[i].status.valid) 3036 continue; 3037 if (adev->ip_blocks[i].version->funcs->sw_init) { 3038 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]); 3039 if (r) { 3040 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 3041 adev->ip_blocks[i].version->funcs->name, r); 3042 goto init_failed; 3043 } 3044 } 3045 adev->ip_blocks[i].status.sw = true; 3046 3047 if (!amdgpu_ip_member_of_hwini( 3048 adev, adev->ip_blocks[i].version->type)) 3049 continue; 3050 3051 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 3052 /* need to do common hw init early so everything is set up for gmc */ 3053 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3054 if (r) { 3055 DRM_ERROR("hw_init %d failed %d\n", i, r); 3056 goto init_failed; 3057 } 3058 adev->ip_blocks[i].status.hw = true; 3059 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3060 /* need to do gmc hw init early so we can allocate gpu mem */ 3061 /* Try to reserve bad pages early */ 3062 if (amdgpu_sriov_vf(adev)) 3063 amdgpu_virt_exchange_data(adev); 3064 3065 r = amdgpu_device_mem_scratch_init(adev); 3066 if (r) { 3067 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 3068 goto init_failed; 3069 } 3070 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3071 if (r) { 3072 DRM_ERROR("hw_init %d failed %d\n", i, r); 3073 goto init_failed; 3074 } 3075 r = amdgpu_device_wb_init(adev); 3076 if (r) { 3077 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 3078 goto init_failed; 3079 } 3080 adev->ip_blocks[i].status.hw = true; 3081 3082 /* right after GMC hw init, we create CSA */ 3083 if (adev->gfx.mcbp) { 3084 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 3085 AMDGPU_GEM_DOMAIN_VRAM | 3086 AMDGPU_GEM_DOMAIN_GTT, 3087 AMDGPU_CSA_SIZE); 3088 if (r) { 3089 DRM_ERROR("allocate CSA failed %d\n", r); 3090 goto init_failed; 3091 } 3092 } 3093 3094 r = amdgpu_seq64_init(adev); 3095 if (r) { 3096 DRM_ERROR("allocate seq64 failed %d\n", r); 3097 goto init_failed; 3098 } 3099 } 3100 } 3101 3102 if (amdgpu_sriov_vf(adev)) 3103 amdgpu_virt_init_data_exchange(adev); 3104 3105 r = amdgpu_ib_pool_init(adev); 3106 if (r) { 3107 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 3108 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 3109 goto init_failed; 3110 } 3111 3112 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 3113 if (r) 3114 goto init_failed; 3115 3116 r = amdgpu_device_ip_hw_init_phase1(adev); 3117 if (r) 3118 goto init_failed; 3119 3120 r = amdgpu_device_fw_loading(adev); 3121 if (r) 3122 goto init_failed; 3123 3124 r = amdgpu_device_ip_hw_init_phase2(adev); 3125 if (r) 3126 goto init_failed; 3127 3128 /* 3129 * retired pages will be loaded from eeprom and reserved here, 3130 * it should be called after amdgpu_device_ip_hw_init_phase2 
since
 * for some ASICs the RAS EEPROM code relies on SMU fully functioning
 * for I2C communication, which is only true at this point.
 *
 * amdgpu_ras_recovery_init may fail, but the caller only cares about
 * failures caused by a bad GPU state and stops the amdgpu init process
 * accordingly. For other failures it still releases all the resources
 * and prints an error message, rather than returning a negative value
 * to the upper level.
 *
 * Note: in theory this should be called before any VRAM allocations
 * to keep retired pages from being reused.
 */
	init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI);
	r = amdgpu_ras_recovery_init(adev, init_badpage);
	if (r)
		goto init_failed;

	/*
	 * In case of XGMI, grab an extra reference for the reset domain of this device
	 */
	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		if (amdgpu_xgmi_add_device(adev) == 0) {
			if (!amdgpu_sriov_vf(adev)) {
				struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);

				if (WARN_ON(!hive)) {
					r = -ENOENT;
					goto init_failed;
				}

				if (!hive->reset_domain ||
				    !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
					r = -ENOENT;
					amdgpu_put_xgmi_hive(hive);
					goto init_failed;
				}

				/* Drop the early temporary reset domain we created for device */
				amdgpu_reset_put_reset_domain(adev->reset_domain);
				adev->reset_domain = hive->reset_domain;
				amdgpu_put_xgmi_hive(hive);
			}
		}
	}

	r = amdgpu_device_init_schedulers(adev);
	if (r)
		goto init_failed;

	if (adev->mman.buffer_funcs_ring->sched.ready)
		amdgpu_ttm_set_buffer_funcs_status(adev, true);

	/* Don't init kfd if the whole hive needs to be reset during init */
	if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
		kgd2kfd_init_zone_device(adev);
		amdgpu_amdkfd_device_init(adev);
	}

	amdgpu_fru_get_product_info(adev);

	if (!amdgpu_sriov_vf(adev) || amdgpu_sriov_ras_cper_en(adev))
		r = amdgpu_cper_init(adev);

init_failed:

	return r;
}

/**
 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
 *
 * @adev: amdgpu_device pointer
 *
 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
 * this function before a GPU reset. If the value is retained after a
 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
 */
static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
{
	memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
}

/**
 * amdgpu_device_check_vram_lost - check if vram is valid
 *
 * @adev: amdgpu_device pointer
 *
 * Checks the reset magic value written to the gart pointer in VRAM.
 * The driver calls this after a GPU reset to see if the contents of
 * VRAM are lost or not.
 * Returns true if vram is lost, false if not.
 */
static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
{
	if (memcmp(adev->gart.ptr, adev->reset_magic,
		   AMDGPU_RESET_MAGIC_NUM))
		return true;

	if (!amdgpu_in_reset(adev))
		return false;

	/*
	 * For all ASICs with baco/mode1 reset, the VRAM is
	 * always assumed to be lost.
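 * Link reset is treated the same way.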
3235 */ 3236 switch (amdgpu_asic_reset_method(adev)) { 3237 case AMD_RESET_METHOD_LINK: 3238 case AMD_RESET_METHOD_BACO: 3239 case AMD_RESET_METHOD_MODE1: 3240 return true; 3241 default: 3242 return false; 3243 } 3244 } 3245 3246 /** 3247 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 3248 * 3249 * @adev: amdgpu_device pointer 3250 * @state: clockgating state (gate or ungate) 3251 * 3252 * The list of all the hardware IPs that make up the asic is walked and the 3253 * set_clockgating_state callbacks are run. 3254 * Late initialization pass enabling clockgating for hardware IPs. 3255 * Fini or suspend, pass disabling clockgating for hardware IPs. 3256 * Returns 0 on success, negative error code on failure. 3257 */ 3258 3259 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 3260 enum amd_clockgating_state state) 3261 { 3262 int i, j, r; 3263 3264 if (amdgpu_emu_mode == 1) 3265 return 0; 3266 3267 for (j = 0; j < adev->num_ip_blocks; j++) { 3268 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3269 if (!adev->ip_blocks[i].status.late_initialized) 3270 continue; 3271 /* skip CG for GFX, SDMA on S0ix */ 3272 if (adev->in_s0ix && 3273 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3274 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3275 continue; 3276 /* skip CG for VCE/UVD, it's handled specially */ 3277 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3278 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3279 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3280 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3281 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 3282 /* enable clockgating to save power */ 3283 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i], 3284 state); 3285 if (r) { 3286 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 3287 adev->ip_blocks[i].version->funcs->name, r); 3288 return r; 3289 } 3290 } 3291 } 3292 3293 return 0; 3294 } 3295 3296 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 3297 enum amd_powergating_state state) 3298 { 3299 int i, j, r; 3300 3301 if (amdgpu_emu_mode == 1) 3302 return 0; 3303 3304 for (j = 0; j < adev->num_ip_blocks; j++) { 3305 i = state == AMD_PG_STATE_GATE ? 
j : adev->num_ip_blocks - j - 1; 3306 if (!adev->ip_blocks[i].status.late_initialized) 3307 continue; 3308 /* skip PG for GFX, SDMA on S0ix */ 3309 if (adev->in_s0ix && 3310 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3311 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3312 continue; 3313 /* skip CG for VCE/UVD, it's handled specially */ 3314 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3315 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3316 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3317 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3318 adev->ip_blocks[i].version->funcs->set_powergating_state) { 3319 /* enable powergating to save power */ 3320 r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i], 3321 state); 3322 if (r) { 3323 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 3324 adev->ip_blocks[i].version->funcs->name, r); 3325 return r; 3326 } 3327 } 3328 } 3329 return 0; 3330 } 3331 3332 static int amdgpu_device_enable_mgpu_fan_boost(void) 3333 { 3334 struct amdgpu_gpu_instance *gpu_ins; 3335 struct amdgpu_device *adev; 3336 int i, ret = 0; 3337 3338 mutex_lock(&mgpu_info.mutex); 3339 3340 /* 3341 * MGPU fan boost feature should be enabled 3342 * only when there are two or more dGPUs in 3343 * the system 3344 */ 3345 if (mgpu_info.num_dgpu < 2) 3346 goto out; 3347 3348 for (i = 0; i < mgpu_info.num_dgpu; i++) { 3349 gpu_ins = &(mgpu_info.gpu_ins[i]); 3350 adev = gpu_ins->adev; 3351 if (!(adev->flags & AMD_IS_APU) && 3352 !gpu_ins->mgpu_fan_enabled) { 3353 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 3354 if (ret) 3355 break; 3356 3357 gpu_ins->mgpu_fan_enabled = 1; 3358 } 3359 } 3360 3361 out: 3362 mutex_unlock(&mgpu_info.mutex); 3363 3364 return ret; 3365 } 3366 3367 /** 3368 * amdgpu_device_ip_late_init - run late init for hardware IPs 3369 * 3370 * @adev: amdgpu_device pointer 3371 * 3372 * Late initialization pass for hardware IPs. The list of all the hardware 3373 * IPs that make up the asic is walked and the late_init callbacks are run. 3374 * late_init covers any special initialization that an IP requires 3375 * after all of the have been initialized or something that needs to happen 3376 * late in the init process. 3377 * Returns 0 on success, negative error code on failure. 
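 * Clockgating and powergating are also enabled at this point.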
 */
static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
{
	struct amdgpu_gpu_instance *gpu_instance;
	int i = 0, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->funcs->late_init) {
			r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]);
			if (r) {
				DRM_ERROR("late_init of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
		}
		adev->ip_blocks[i].status.late_initialized = true;
	}

	r = amdgpu_ras_late_init(adev);
	if (r) {
		DRM_ERROR("amdgpu_ras_late_init failed %d", r);
		return r;
	}

	if (!amdgpu_reset_in_recovery(adev))
		amdgpu_ras_set_error_query_ready(adev, true);

	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);

	amdgpu_device_fill_reset_magic(adev);

	r = amdgpu_device_enable_mgpu_fan_boost();
	if (r)
		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);

	/* For passthrough configurations on arcturus and aldebaran, enable special SBR handling */
	if (amdgpu_passthrough(adev) &&
	    ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
	     adev->asic_type == CHIP_ALDEBARAN))
		amdgpu_dpm_handle_passthrough_sbr(adev, true);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		mutex_lock(&mgpu_info.mutex);

		/*
		 * Reset the device p-state to low, as it was booted in the high state.
		 *
		 * This should be performed only after all devices from the same
		 * hive have been initialized.
		 *
		 * However, the number of devices in a hive is not known in
		 * advance; it is counted one by one as the devices initialize.
		 *
		 * So we wait until all XGMI-interlinked devices have initialized.
		 * This may add some delay, as those devices may come from
		 * different hives. But that should be OK.
3437 */ 3438 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3439 for (i = 0; i < mgpu_info.num_gpu; i++) { 3440 gpu_instance = &(mgpu_info.gpu_ins[i]); 3441 if (gpu_instance->adev->flags & AMD_IS_APU) 3442 continue; 3443 3444 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3445 AMDGPU_XGMI_PSTATE_MIN); 3446 if (r) { 3447 DRM_ERROR("pstate setting failed (%d).\n", r); 3448 break; 3449 } 3450 } 3451 } 3452 3453 mutex_unlock(&mgpu_info.mutex); 3454 } 3455 3456 return 0; 3457 } 3458 3459 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block) 3460 { 3461 int r; 3462 3463 if (!ip_block->version->funcs->hw_fini) { 3464 DRM_ERROR("hw_fini of IP block <%s> not defined\n", 3465 ip_block->version->funcs->name); 3466 } else { 3467 r = ip_block->version->funcs->hw_fini(ip_block); 3468 /* XXX handle errors */ 3469 if (r) { 3470 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3471 ip_block->version->funcs->name, r); 3472 } 3473 } 3474 3475 ip_block->status.hw = false; 3476 } 3477 3478 /** 3479 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3480 * 3481 * @adev: amdgpu_device pointer 3482 * 3483 * For ASICs need to disable SMC first 3484 */ 3485 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3486 { 3487 int i; 3488 3489 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3490 return; 3491 3492 for (i = 0; i < adev->num_ip_blocks; i++) { 3493 if (!adev->ip_blocks[i].status.hw) 3494 continue; 3495 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3496 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3497 break; 3498 } 3499 } 3500 } 3501 3502 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3503 { 3504 int i, r; 3505 3506 for (i = 0; i < adev->num_ip_blocks; i++) { 3507 if (!adev->ip_blocks[i].version->funcs->early_fini) 3508 continue; 3509 3510 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]); 3511 if (r) { 3512 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 3513 adev->ip_blocks[i].version->funcs->name, r); 3514 } 3515 } 3516 3517 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3518 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3519 3520 amdgpu_amdkfd_suspend(adev, false); 3521 amdgpu_userq_suspend(adev); 3522 3523 /* Workaround for ASICs need to disable SMC first */ 3524 amdgpu_device_smu_fini_early(adev); 3525 3526 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3527 if (!adev->ip_blocks[i].status.hw) 3528 continue; 3529 3530 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3531 } 3532 3533 if (amdgpu_sriov_vf(adev)) { 3534 if (amdgpu_virt_release_full_gpu(adev, false)) 3535 DRM_ERROR("failed to release exclusive mode on fini\n"); 3536 } 3537 3538 return 0; 3539 } 3540 3541 /** 3542 * amdgpu_device_ip_fini - run fini for hardware IPs 3543 * 3544 * @adev: amdgpu_device pointer 3545 * 3546 * Main teardown pass for hardware IPs. The list of all the hardware 3547 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3548 * are run. hw_fini tears down the hardware associated with each IP 3549 * and sw_fini tears down any software state associated with each IP. 3550 * Returns 0 on success, negative error code on failure. 
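 * Hardware teardown (hw_fini) is handled separately, in amdgpu_device_ip_fini_early().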
3551 */ 3552 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3553 { 3554 int i, r; 3555 3556 amdgpu_cper_fini(adev); 3557 3558 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3559 amdgpu_virt_release_ras_err_handler_data(adev); 3560 3561 if (adev->gmc.xgmi.num_physical_nodes > 1) 3562 amdgpu_xgmi_remove_device(adev); 3563 3564 amdgpu_amdkfd_device_fini_sw(adev); 3565 3566 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3567 if (!adev->ip_blocks[i].status.sw) 3568 continue; 3569 3570 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3571 amdgpu_ucode_free_bo(adev); 3572 amdgpu_free_static_csa(&adev->virt.csa_obj); 3573 amdgpu_device_wb_fini(adev); 3574 amdgpu_device_mem_scratch_fini(adev); 3575 amdgpu_ib_pool_fini(adev); 3576 amdgpu_seq64_fini(adev); 3577 amdgpu_doorbell_fini(adev); 3578 } 3579 if (adev->ip_blocks[i].version->funcs->sw_fini) { 3580 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]); 3581 /* XXX handle errors */ 3582 if (r) { 3583 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 3584 adev->ip_blocks[i].version->funcs->name, r); 3585 } 3586 } 3587 adev->ip_blocks[i].status.sw = false; 3588 adev->ip_blocks[i].status.valid = false; 3589 } 3590 3591 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3592 if (!adev->ip_blocks[i].status.late_initialized) 3593 continue; 3594 if (adev->ip_blocks[i].version->funcs->late_fini) 3595 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]); 3596 adev->ip_blocks[i].status.late_initialized = false; 3597 } 3598 3599 amdgpu_ras_fini(adev); 3600 3601 return 0; 3602 } 3603 3604 /** 3605 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3606 * 3607 * @work: work_struct. 3608 */ 3609 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3610 { 3611 struct amdgpu_device *adev = 3612 container_of(work, struct amdgpu_device, delayed_init_work.work); 3613 int r; 3614 3615 r = amdgpu_ib_ring_tests(adev); 3616 if (r) 3617 DRM_ERROR("ib ring test failed (%d).\n", r); 3618 } 3619 3620 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3621 { 3622 struct amdgpu_device *adev = 3623 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3624 3625 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3626 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3627 3628 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true, 0)) 3629 adev->gfx.gfx_off_state = true; 3630 } 3631 3632 /** 3633 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3634 * 3635 * @adev: amdgpu_device pointer 3636 * 3637 * Main suspend function for hardware IPs. The list of all the hardware 3638 * IPs that make up the asic is walked, clockgating is disabled and the 3639 * suspend callbacks are run. suspend puts the hardware and software state 3640 * in each IP into a state suitable for suspend. 3641 * Returns 0 on success, negative error code on failure. 3642 */ 3643 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3644 { 3645 int i, r; 3646 3647 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3648 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3649 3650 /* 3651 * Per PMFW team's suggestion, driver needs to handle gfxoff 3652 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3653 * scenario. Add the missing df cstate disablement here. 
3654 */ 3655 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3656 dev_warn(adev->dev, "Failed to disallow df cstate"); 3657 3658 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3659 if (!adev->ip_blocks[i].status.valid) 3660 continue; 3661 3662 /* displays are handled separately */ 3663 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3664 continue; 3665 3666 /* XXX handle errors */ 3667 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3668 if (r) 3669 return r; 3670 } 3671 3672 return 0; 3673 } 3674 3675 /** 3676 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3677 * 3678 * @adev: amdgpu_device pointer 3679 * 3680 * Main suspend function for hardware IPs. The list of all the hardware 3681 * IPs that make up the asic is walked, clockgating is disabled and the 3682 * suspend callbacks are run. suspend puts the hardware and software state 3683 * in each IP into a state suitable for suspend. 3684 * Returns 0 on success, negative error code on failure. 3685 */ 3686 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3687 { 3688 int i, r; 3689 3690 if (adev->in_s0ix) 3691 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3692 3693 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3694 if (!adev->ip_blocks[i].status.valid) 3695 continue; 3696 /* displays are handled in phase1 */ 3697 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3698 continue; 3699 /* PSP lost connection when err_event_athub occurs */ 3700 if (amdgpu_ras_intr_triggered() && 3701 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3702 adev->ip_blocks[i].status.hw = false; 3703 continue; 3704 } 3705 3706 /* skip unnecessary suspend if we do not initialize them yet */ 3707 if (!amdgpu_ip_member_of_hwini( 3708 adev, adev->ip_blocks[i].version->type)) 3709 continue; 3710 3711 /* Since we skip suspend for S0i3, we need to cancel the delayed 3712 * idle work here as the suspend callback never gets called. 3713 */ 3714 if (adev->in_s0ix && 3715 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX && 3716 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 0, 0)) 3717 cancel_delayed_work_sync(&adev->gfx.idle_work); 3718 /* skip suspend of gfx/mes and psp for S0ix 3719 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3720 * like at runtime. PSP is also part of the always on hardware 3721 * so no need to suspend it. 3722 */ 3723 if (adev->in_s0ix && 3724 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3725 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3726 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3727 continue; 3728 3729 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3730 if (adev->in_s0ix && 3731 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3732 IP_VERSION(5, 0, 0)) && 3733 (adev->ip_blocks[i].version->type == 3734 AMD_IP_BLOCK_TYPE_SDMA)) 3735 continue; 3736 3737 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3738 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3739 * from this location and RLC Autoload automatically also gets loaded 3740 * from here based on PMFW -> PSP message during re-init sequence. 3741 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3742 * the TMR and reload FWs again for IMU enabled APU ASICs. 
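 * (Blocks skipped here keep their status.hw flag set.)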
3743 */ 3744 if (amdgpu_in_reset(adev) && 3745 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3746 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3747 continue; 3748 3749 /* XXX handle errors */ 3750 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3751 adev->ip_blocks[i].status.hw = false; 3752 3753 /* handle putting the SMC in the appropriate state */ 3754 if (!amdgpu_sriov_vf(adev)) { 3755 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3756 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3757 if (r) { 3758 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3759 adev->mp1_state, r); 3760 return r; 3761 } 3762 } 3763 } 3764 } 3765 3766 return 0; 3767 } 3768 3769 /** 3770 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3771 * 3772 * @adev: amdgpu_device pointer 3773 * 3774 * Main suspend function for hardware IPs. The list of all the hardware 3775 * IPs that make up the asic is walked, clockgating is disabled and the 3776 * suspend callbacks are run. suspend puts the hardware and software state 3777 * in each IP into a state suitable for suspend. 3778 * Returns 0 on success, negative error code on failure. 3779 */ 3780 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3781 { 3782 int r; 3783 3784 if (amdgpu_sriov_vf(adev)) { 3785 amdgpu_virt_fini_data_exchange(adev); 3786 amdgpu_virt_request_full_gpu(adev, false); 3787 } 3788 3789 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3790 3791 r = amdgpu_device_ip_suspend_phase1(adev); 3792 if (r) 3793 return r; 3794 r = amdgpu_device_ip_suspend_phase2(adev); 3795 3796 if (amdgpu_sriov_vf(adev)) 3797 amdgpu_virt_release_full_gpu(adev, false); 3798 3799 return r; 3800 } 3801 3802 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3803 { 3804 int i, r; 3805 3806 static enum amd_ip_block_type ip_order[] = { 3807 AMD_IP_BLOCK_TYPE_COMMON, 3808 AMD_IP_BLOCK_TYPE_GMC, 3809 AMD_IP_BLOCK_TYPE_PSP, 3810 AMD_IP_BLOCK_TYPE_IH, 3811 }; 3812 3813 for (i = 0; i < adev->num_ip_blocks; i++) { 3814 int j; 3815 struct amdgpu_ip_block *block; 3816 3817 block = &adev->ip_blocks[i]; 3818 block->status.hw = false; 3819 3820 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3821 3822 if (block->version->type != ip_order[j] || 3823 !block->status.valid) 3824 continue; 3825 3826 r = block->version->funcs->hw_init(&adev->ip_blocks[i]); 3827 if (r) { 3828 dev_err(adev->dev, "RE-INIT-early: %s failed\n", 3829 block->version->funcs->name); 3830 return r; 3831 } 3832 block->status.hw = true; 3833 } 3834 } 3835 3836 return 0; 3837 } 3838 3839 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3840 { 3841 struct amdgpu_ip_block *block; 3842 int i, r = 0; 3843 3844 static enum amd_ip_block_type ip_order[] = { 3845 AMD_IP_BLOCK_TYPE_SMC, 3846 AMD_IP_BLOCK_TYPE_DCE, 3847 AMD_IP_BLOCK_TYPE_GFX, 3848 AMD_IP_BLOCK_TYPE_SDMA, 3849 AMD_IP_BLOCK_TYPE_MES, 3850 AMD_IP_BLOCK_TYPE_UVD, 3851 AMD_IP_BLOCK_TYPE_VCE, 3852 AMD_IP_BLOCK_TYPE_VCN, 3853 AMD_IP_BLOCK_TYPE_JPEG 3854 }; 3855 3856 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3857 block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]); 3858 3859 if (!block) 3860 continue; 3861 3862 if (block->status.valid && !block->status.hw) { 3863 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) { 3864 r = amdgpu_ip_block_resume(block); 3865 } else { 3866 r = block->version->funcs->hw_init(block); 3867 } 3868 3869 if (r) { 3870 dev_err(adev->dev, "RE-INIT-late: %s failed\n", 3871 block->version->funcs->name); 3872 break; 3873 } 3874 
block->status.hw = true; 3875 } 3876 } 3877 3878 return r; 3879 } 3880 3881 /** 3882 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3883 * 3884 * @adev: amdgpu_device pointer 3885 * 3886 * First resume function for hardware IPs. The list of all the hardware 3887 * IPs that make up the asic is walked and the resume callbacks are run for 3888 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3889 * after a suspend and updates the software state as necessary. This 3890 * function is also used for restoring the GPU after a GPU reset. 3891 * Returns 0 on success, negative error code on failure. 3892 */ 3893 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3894 { 3895 int i, r; 3896 3897 for (i = 0; i < adev->num_ip_blocks; i++) { 3898 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3899 continue; 3900 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3901 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3902 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3903 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3904 3905 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3906 if (r) 3907 return r; 3908 } 3909 } 3910 3911 return 0; 3912 } 3913 3914 /** 3915 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3916 * 3917 * @adev: amdgpu_device pointer 3918 * 3919 * Second resume function for hardware IPs. The list of all the hardware 3920 * IPs that make up the asic is walked and the resume callbacks are run for 3921 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3922 * functional state after a suspend and updates the software state as 3923 * necessary. This function is also used for restoring the GPU after a GPU 3924 * reset. 3925 * Returns 0 on success, negative error code on failure. 3926 */ 3927 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3928 { 3929 int i, r; 3930 3931 for (i = 0; i < adev->num_ip_blocks; i++) { 3932 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3933 continue; 3934 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3935 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3936 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3937 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE || 3938 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3939 continue; 3940 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3941 if (r) 3942 return r; 3943 } 3944 3945 return 0; 3946 } 3947 3948 /** 3949 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs 3950 * 3951 * @adev: amdgpu_device pointer 3952 * 3953 * Third resume function for hardware IPs. The list of all the hardware 3954 * IPs that make up the asic is walked and the resume callbacks are run for 3955 * all DCE. resume puts the hardware into a functional state after a suspend 3956 * and updates the software state as necessary. This function is also used 3957 * for restoring the GPU after a GPU reset. 3958 * 3959 * Returns 0 on success, negative error code on failure. 
 */
static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) {
			r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
			if (r)
				return r;
		}
	}

	return 0;
}

/**
 * amdgpu_device_ip_resume - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main resume function for hardware IPs. The hardware IPs
 * are split into multiple resume phases because they are
 * also used in recovering from a GPU reset and some additional
 * steps need to be taken between them. In this case (S3/S4) they are
 * run sequentially.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
{
	int r;

	r = amdgpu_device_ip_resume_phase1(adev);
	if (r)
		return r;

	r = amdgpu_device_fw_loading(adev);
	if (r)
		return r;

	r = amdgpu_device_ip_resume_phase2(adev);

	if (adev->mman.buffer_funcs_ring->sched.ready)
		amdgpu_ttm_set_buffer_funcs_status(adev, true);

	if (r)
		return r;

	amdgpu_fence_driver_hw_init(adev);

	r = amdgpu_device_ip_resume_phase3(adev);

	return r;
}

/**
 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
 *
 * @adev: amdgpu_device pointer
 *
 * Query the VBIOS data tables to determine if the board supports SR-IOV.
 */
static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
{
	if (amdgpu_sriov_vf(adev)) {
		if (adev->is_atom_fw) {
			if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
		} else {
			if (amdgpu_atombios_has_gpu_virtualization_table(adev))
				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
		}

		if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
	}
}

/**
 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
 *
 * @asic_type: AMD asic type
 *
 * Check if there is DC (new modesetting infrastructure) support for an asic.
 * Returns true if DC has support, false if not.
 */
bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
{
	switch (asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#if defined(CONFIG_DRM_AMD_DC)
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
		/*
		 * We have systems in the wild with these ASICs that require
		 * LVDS and VGA support which is not supported with DC.
		 *
		 * Fallback to the non-DC driver here by default so as not to
		 * cause regressions.
		 */
#if defined(CONFIG_DRM_AMD_DC_SI)
		return amdgpu_dc > 0;
#else
		return false;
#endif
	case CHIP_BONAIRE:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
		/*
		 * We have systems in the wild with these ASICs that require
		 * VGA support which is not supported with DC.
4081 * 4082 * Fallback to the non-DC driver here by default so as not to 4083 * cause regressions. 4084 */ 4085 return amdgpu_dc > 0; 4086 default: 4087 return amdgpu_dc != 0; 4088 #else 4089 default: 4090 if (amdgpu_dc > 0) 4091 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 4092 return false; 4093 #endif 4094 } 4095 } 4096 4097 /** 4098 * amdgpu_device_has_dc_support - check if dc is supported 4099 * 4100 * @adev: amdgpu_device pointer 4101 * 4102 * Returns true for supported, false for not supported 4103 */ 4104 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 4105 { 4106 if (adev->enable_virtual_display || 4107 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 4108 return false; 4109 4110 return amdgpu_device_asic_has_dc_support(adev->asic_type); 4111 } 4112 4113 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 4114 { 4115 struct amdgpu_device *adev = 4116 container_of(__work, struct amdgpu_device, xgmi_reset_work); 4117 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 4118 4119 /* It's a bug to not have a hive within this function */ 4120 if (WARN_ON(!hive)) 4121 return; 4122 4123 /* 4124 * Use task barrier to synchronize all xgmi reset works across the 4125 * hive. task_barrier_enter and task_barrier_exit will block 4126 * until all the threads running the xgmi reset works reach 4127 * those points. task_barrier_full will do both blocks. 4128 */ 4129 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 4130 4131 task_barrier_enter(&hive->tb); 4132 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 4133 4134 if (adev->asic_reset_res) 4135 goto fail; 4136 4137 task_barrier_exit(&hive->tb); 4138 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 4139 4140 if (adev->asic_reset_res) 4141 goto fail; 4142 4143 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 4144 } else { 4145 4146 task_barrier_full(&hive->tb); 4147 adev->asic_reset_res = amdgpu_asic_reset(adev); 4148 } 4149 4150 fail: 4151 if (adev->asic_reset_res) 4152 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 4153 adev->asic_reset_res, adev_to_drm(adev)->unique); 4154 amdgpu_put_xgmi_hive(hive); 4155 } 4156 4157 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 4158 { 4159 char *input = amdgpu_lockup_timeout; 4160 char *timeout_setting = NULL; 4161 int index = 0; 4162 long timeout; 4163 int ret = 0; 4164 4165 /* 4166 * By default timeout for non compute jobs is 10000 4167 * and 60000 for compute jobs. 4168 * In SR-IOV or passthrough mode, timeout for compute 4169 * jobs are 60000 by default. 4170 */ 4171 adev->gfx_timeout = msecs_to_jiffies(10000); 4172 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4173 if (amdgpu_sriov_vf(adev)) 4174 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
4175 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 4176 else 4177 adev->compute_timeout = msecs_to_jiffies(60000); 4178 4179 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4180 while ((timeout_setting = strsep(&input, ",")) && 4181 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4182 ret = kstrtol(timeout_setting, 0, &timeout); 4183 if (ret) 4184 return ret; 4185 4186 if (timeout == 0) { 4187 index++; 4188 continue; 4189 } else if (timeout < 0) { 4190 timeout = MAX_SCHEDULE_TIMEOUT; 4191 dev_warn(adev->dev, "lockup timeout disabled"); 4192 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 4193 } else { 4194 timeout = msecs_to_jiffies(timeout); 4195 } 4196 4197 switch (index++) { 4198 case 0: 4199 adev->gfx_timeout = timeout; 4200 break; 4201 case 1: 4202 adev->compute_timeout = timeout; 4203 break; 4204 case 2: 4205 adev->sdma_timeout = timeout; 4206 break; 4207 case 3: 4208 adev->video_timeout = timeout; 4209 break; 4210 default: 4211 break; 4212 } 4213 } 4214 /* 4215 * There is only one value specified and 4216 * it should apply to all non-compute jobs. 4217 */ 4218 if (index == 1) { 4219 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4220 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 4221 adev->compute_timeout = adev->gfx_timeout; 4222 } 4223 } 4224 4225 return ret; 4226 } 4227 4228 /** 4229 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 4230 * 4231 * @adev: amdgpu_device pointer 4232 * 4233 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 4234 */ 4235 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 4236 { 4237 struct iommu_domain *domain; 4238 4239 domain = iommu_get_domain_for_dev(adev->dev); 4240 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 4241 adev->ram_is_direct_mapped = true; 4242 } 4243 4244 #if defined(CONFIG_HSA_AMD_P2P) 4245 /** 4246 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled. 4247 * 4248 * @adev: amdgpu_device pointer 4249 * 4250 * return if IOMMU remapping bar address 4251 */ 4252 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev) 4253 { 4254 struct iommu_domain *domain; 4255 4256 domain = iommu_get_domain_for_dev(adev->dev); 4257 if (domain && (domain->type == IOMMU_DOMAIN_DMA || 4258 domain->type == IOMMU_DOMAIN_DMA_FQ)) 4259 return true; 4260 4261 return false; 4262 } 4263 #endif 4264 4265 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 4266 { 4267 if (amdgpu_mcbp == 1) 4268 adev->gfx.mcbp = true; 4269 else if (amdgpu_mcbp == 0) 4270 adev->gfx.mcbp = false; 4271 4272 if (amdgpu_sriov_vf(adev)) 4273 adev->gfx.mcbp = true; 4274 4275 if (adev->gfx.mcbp) 4276 DRM_INFO("MCBP is enabled\n"); 4277 } 4278 4279 /** 4280 * amdgpu_device_init - initialize the driver 4281 * 4282 * @adev: amdgpu_device pointer 4283 * @flags: driver flags 4284 * 4285 * Initializes the driver info and hw (all asics). 4286 * Returns 0 for success or an error on failure. 4287 * Called at driver startup. 
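 * The init sequence covers lock and work-queue setup, MMIO mapping, IP block
 * early init, an optional ASIC reset/post, and finally full IP init plus
 * sysfs registration.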
4288 */ 4289 int amdgpu_device_init(struct amdgpu_device *adev, 4290 uint32_t flags) 4291 { 4292 struct drm_device *ddev = adev_to_drm(adev); 4293 struct pci_dev *pdev = adev->pdev; 4294 int r, i; 4295 bool px = false; 4296 u32 max_MBps; 4297 int tmp; 4298 4299 adev->shutdown = false; 4300 adev->flags = flags; 4301 4302 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 4303 adev->asic_type = amdgpu_force_asic_type; 4304 else 4305 adev->asic_type = flags & AMD_ASIC_MASK; 4306 4307 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 4308 if (amdgpu_emu_mode == 1) 4309 adev->usec_timeout *= 10; 4310 adev->gmc.gart_size = 512 * 1024 * 1024; 4311 adev->accel_working = false; 4312 adev->num_rings = 0; 4313 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 4314 adev->mman.buffer_funcs = NULL; 4315 adev->mman.buffer_funcs_ring = NULL; 4316 adev->vm_manager.vm_pte_funcs = NULL; 4317 adev->vm_manager.vm_pte_num_scheds = 0; 4318 adev->gmc.gmc_funcs = NULL; 4319 adev->harvest_ip_mask = 0x0; 4320 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 4321 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 4322 4323 adev->smc_rreg = &amdgpu_invalid_rreg; 4324 adev->smc_wreg = &amdgpu_invalid_wreg; 4325 adev->pcie_rreg = &amdgpu_invalid_rreg; 4326 adev->pcie_wreg = &amdgpu_invalid_wreg; 4327 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 4328 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 4329 adev->pciep_rreg = &amdgpu_invalid_rreg; 4330 adev->pciep_wreg = &amdgpu_invalid_wreg; 4331 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 4332 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 4333 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 4334 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 4335 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 4336 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 4337 adev->didt_rreg = &amdgpu_invalid_rreg; 4338 adev->didt_wreg = &amdgpu_invalid_wreg; 4339 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 4340 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 4341 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 4342 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 4343 4344 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 4345 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 4346 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 4347 4348 /* mutex initialization are all done here so we 4349 * can recall function without having locking issues 4350 */ 4351 mutex_init(&adev->firmware.mutex); 4352 mutex_init(&adev->pm.mutex); 4353 mutex_init(&adev->gfx.gpu_clock_mutex); 4354 mutex_init(&adev->srbm_mutex); 4355 mutex_init(&adev->gfx.pipe_reserve_mutex); 4356 mutex_init(&adev->gfx.gfx_off_mutex); 4357 mutex_init(&adev->gfx.partition_mutex); 4358 mutex_init(&adev->grbm_idx_mutex); 4359 mutex_init(&adev->mn_lock); 4360 mutex_init(&adev->virt.vf_errors.lock); 4361 hash_init(adev->mn_hash); 4362 mutex_init(&adev->psp.mutex); 4363 mutex_init(&adev->notifier_lock); 4364 mutex_init(&adev->pm.stable_pstate_ctx_lock); 4365 mutex_init(&adev->benchmark_mutex); 4366 mutex_init(&adev->gfx.reset_sem_mutex); 4367 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */ 4368 mutex_init(&adev->enforce_isolation_mutex); 4369 for (i = 0; i < MAX_XCP; ++i) { 4370 adev->isolation[i].spearhead = dma_fence_get_stub(); 4371 amdgpu_sync_create(&adev->isolation[i].active); 4372 amdgpu_sync_create(&adev->isolation[i].prev); 4373 } 4374 
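	/* Locks for user queue scheduling and workload profile state */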
	mutex_init(&adev->gfx.userq_sch_mutex);
	mutex_init(&adev->gfx.workload_profile_mutex);
	mutex_init(&adev->vcn.workload_profile_mutex);
	mutex_init(&adev->userq_mutex);

	amdgpu_device_init_apu_flags(adev);

	r = amdgpu_device_check_arguments(adev);
	if (r)
		return r;

	spin_lock_init(&adev->mmio_idx_lock);
	spin_lock_init(&adev->smc_idx_lock);
	spin_lock_init(&adev->pcie_idx_lock);
	spin_lock_init(&adev->uvd_ctx_idx_lock);
	spin_lock_init(&adev->didt_idx_lock);
	spin_lock_init(&adev->gc_cac_idx_lock);
	spin_lock_init(&adev->se_cac_idx_lock);
	spin_lock_init(&adev->audio_endpt_idx_lock);
	spin_lock_init(&adev->mm_stats.lock);
	spin_lock_init(&adev->virt.rlcg_reg_lock);
	spin_lock_init(&adev->wb.lock);

	xa_init_flags(&adev->userq_xa, XA_FLAGS_LOCK_IRQ);

	INIT_LIST_HEAD(&adev->reset_list);

	INIT_LIST_HEAD(&adev->ras_list);

	INIT_LIST_HEAD(&adev->pm.od_kobj_list);

	INIT_LIST_HEAD(&adev->userq_mgr_list);

	INIT_DELAYED_WORK(&adev->delayed_init_work,
			  amdgpu_device_delayed_init_work_handler);
	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
			  amdgpu_device_delay_enable_gfx_off);
	/*
	 * Initialize the enforce_isolation work structures for each XCP
	 * partition. This work handler is responsible for enforcing shader
	 * isolation on AMD GPUs. It counts the number of emitted fences for
	 * each GFX and compute ring. If there are any fences, it schedules
	 * the `enforce_isolation_work` to be run after a delay. If there are
	 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the
	 * runqueue.
	 */
	for (i = 0; i < MAX_XCP; i++) {
		INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work,
				  amdgpu_gfx_enforce_isolation_handler);
		adev->gfx.enforce_isolation[i].adev = adev;
		adev->gfx.enforce_isolation[i].xcp_id = i;
	}

	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);

	adev->gfx.gfx_off_req_count = 1;
	adev->gfx.gfx_off_residency = 0;
	adev->gfx.gfx_off_entrycount = 0;
	adev->pm.ac_power = power_supply_is_system_supplied() > 0;

	atomic_set(&adev->throttling_logging_enabled, 1);
	/*
	 * If throttling continues, logging will be performed every minute
	 * to avoid log flooding. "-1" is subtracted since the thermal
	 * throttling interrupt comes every second. Thus, the total logging
	 * interval is 59 seconds (ratelimited printk interval) + 1 second
	 * (waiting for the throttling interrupt) = 60 seconds.
	 */
	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);

	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);

	/* Registers mapping */
	/* TODO: block userspace mapping of io register */
	if (adev->asic_type >= CHIP_BONAIRE) {
		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
	} else {
		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
	}

	for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
		atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);

	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
	if (!adev->rmmio)
		return -ENOMEM;

	DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
	DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);

	/*
	 * Reset domain needs to be present early, before XGMI hive discovered
	 * (if any) and initialized to use reset sem and in_gpu reset flag
	 * early on during init and before calling to RREG32.
	 */
	adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
	if (!adev->reset_domain)
		return -ENOMEM;

	/* detect hw virtualization here */
	amdgpu_virt_init(adev);

	amdgpu_device_get_pcie_info(adev);

	r = amdgpu_device_get_job_timeout_settings(adev);
	if (r) {
		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
		return r;
	}

	amdgpu_device_set_mcbp(adev);

	/*
	 * By default, use default mode where all blocks are expected to be
	 * initialized. At present a 'swinit' of blocks is required to be
	 * completed before the need for a different level is detected.
	 */
	amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT);
	/* early init functions */
	r = amdgpu_device_ip_early_init(adev);
	if (r)
		return r;

	/*
	 * No need to remove conflicting FBs for non-display class devices.
	 * This prevents the sysfb from being freed accidentally.
	 */
	if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA ||
	    (pdev->class >> 8) == PCI_CLASS_DISPLAY_OTHER) {
		/* Get rid of things like offb */
		r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name);
		if (r)
			return r;
	}

	/* Enable TMZ based on IP_VERSION */
	amdgpu_gmc_tmz_set(adev);

	if (amdgpu_sriov_vf(adev) &&
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
		/* VF MMIO access (except mailbox range) from CPU
		 * will be blocked during sriov runtime
		 */
		adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;

	amdgpu_gmc_noretry_set(adev);
	/* Need to get xgmi info early to decide the reset behavior */
	if (adev->gmc.xgmi.supported) {
		r = adev->gfxhub.funcs->get_xgmi_info(adev);
		if (r)
			return r;
	}

	/* enable PCIE atomic ops */
	if (amdgpu_sriov_vf(adev)) {
		if (adev->virt.fw_reserve.p_pf2vf)
			adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
						      adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
				(PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
		/* APUs with gfx9 and newer don't rely on PCIe atomics; an internal
		 * path natively supports atomics, so set have_atomics_support to true.
4537 */ 4538 } else if ((adev->flags & AMD_IS_APU) && 4539 (amdgpu_ip_version(adev, GC_HWIP, 0) > 4540 IP_VERSION(9, 0, 0))) { 4541 adev->have_atomics_support = true; 4542 } else { 4543 adev->have_atomics_support = 4544 !pci_enable_atomic_ops_to_root(adev->pdev, 4545 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 4546 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4547 } 4548 4549 if (!adev->have_atomics_support) 4550 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 4551 4552 /* doorbell bar mapping and doorbell index init*/ 4553 amdgpu_doorbell_init(adev); 4554 4555 if (amdgpu_emu_mode == 1) { 4556 /* post the asic on emulation mode */ 4557 emu_soc_asic_init(adev); 4558 goto fence_driver_init; 4559 } 4560 4561 amdgpu_reset_init(adev); 4562 4563 /* detect if we are with an SRIOV vbios */ 4564 if (adev->bios) 4565 amdgpu_device_detect_sriov_bios(adev); 4566 4567 /* check if we need to reset the asic 4568 * E.g., driver was not cleanly unloaded previously, etc. 4569 */ 4570 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4571 if (adev->gmc.xgmi.num_physical_nodes) { 4572 dev_info(adev->dev, "Pending hive reset.\n"); 4573 amdgpu_set_init_level(adev, 4574 AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 4575 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && 4576 !amdgpu_device_has_display_hardware(adev)) { 4577 r = psp_gpu_reset(adev); 4578 } else { 4579 tmp = amdgpu_reset_method; 4580 /* It should do a default reset when loading or reloading the driver, 4581 * regardless of the module parameter reset_method. 4582 */ 4583 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4584 r = amdgpu_asic_reset(adev); 4585 amdgpu_reset_method = tmp; 4586 } 4587 4588 if (r) { 4589 dev_err(adev->dev, "asic reset on init failed\n"); 4590 goto failed; 4591 } 4592 } 4593 4594 /* Post card if necessary */ 4595 if (amdgpu_device_need_post(adev)) { 4596 if (!adev->bios) { 4597 dev_err(adev->dev, "no vBIOS found\n"); 4598 r = -EINVAL; 4599 goto failed; 4600 } 4601 DRM_INFO("GPU posting now...\n"); 4602 r = amdgpu_device_asic_init(adev); 4603 if (r) { 4604 dev_err(adev->dev, "gpu post error!\n"); 4605 goto failed; 4606 } 4607 } 4608 4609 if (adev->bios) { 4610 if (adev->is_atom_fw) { 4611 /* Initialize clocks */ 4612 r = amdgpu_atomfirmware_get_clock_info(adev); 4613 if (r) { 4614 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4615 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4616 goto failed; 4617 } 4618 } else { 4619 /* Initialize clocks */ 4620 r = amdgpu_atombios_get_clock_info(adev); 4621 if (r) { 4622 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4623 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4624 goto failed; 4625 } 4626 /* init i2c buses */ 4627 amdgpu_i2c_init(adev); 4628 } 4629 } 4630 4631 fence_driver_init: 4632 /* Fence driver */ 4633 r = amdgpu_fence_driver_sw_init(adev); 4634 if (r) { 4635 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4636 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4637 goto failed; 4638 } 4639 4640 /* init the mode config */ 4641 drm_mode_config_init(adev_to_drm(adev)); 4642 4643 r = amdgpu_device_ip_init(adev); 4644 if (r) { 4645 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4646 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4647 goto release_ras_con; 4648 } 4649 4650 amdgpu_fence_driver_hw_init(adev); 4651 4652 dev_info(adev->dev, 4653 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4654 
		 adev->gfx.config.max_shader_engines,
		 adev->gfx.config.max_sh_per_se,
		 adev->gfx.config.max_cu_per_sh,
		 adev->gfx.cu_info.number);

	adev->accel_working = true;

	amdgpu_vm_check_compute_bug(adev);

	/* Initialize the buffer migration limit. */
	if (amdgpu_moverate >= 0)
		max_MBps = amdgpu_moverate;
	else
		max_MBps = 8; /* Allow 8 MB/s. */
	/* Get a log2 for easy divisions. */
	adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));

	/*
	 * Register the gpu instance before amdgpu_device_enable_mgpu_fan_boost.
	 * Otherwise the mgpu fan boost feature will be skipped because the
	 * gpu instance count would be too low.
	 */
	amdgpu_register_gpu_instance(adev);

	/* enable clockgating, etc. after ib tests, etc. since some blocks require
	 * explicit gating rather than handling it automatically.
	 */
	if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
		r = amdgpu_device_ip_late_init(adev);
		if (r) {
			dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
			goto release_ras_con;
		}
		/* must succeed. */
		amdgpu_ras_resume(adev);
		queue_delayed_work(system_wq, &adev->delayed_init_work,
				   msecs_to_jiffies(AMDGPU_RESUME_MS));
	}

	if (amdgpu_sriov_vf(adev)) {
		amdgpu_virt_release_full_gpu(adev, true);
		flush_delayed_work(&adev->delayed_init_work);
	}

	/*
	 * Register these sysfs interfaces after `late_init`, since some of the
	 * operations performed in `late_init` may affect how the sysfs
	 * interfaces are created.
	 */
	r = amdgpu_atombios_sysfs_init(adev);
	if (r)
		drm_err(&adev->ddev,
			"registering atombios sysfs failed (%d).\n", r);

	r = amdgpu_pm_sysfs_init(adev);
	if (r)
		DRM_ERROR("registering pm sysfs failed (%d).\n", r);

	r = amdgpu_ucode_sysfs_init(adev);
	if (r) {
		adev->ucode_sysfs_en = false;
		DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
	} else
		adev->ucode_sysfs_en = true;

	r = amdgpu_device_attr_sysfs_init(adev);
	if (r)
		dev_err(adev->dev, "Could not create amdgpu device attr\n");

	r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group);
	if (r)
		dev_err(adev->dev,
			"Could not create amdgpu board attributes\n");

	amdgpu_fru_sysfs_init(adev);
	amdgpu_reg_state_sysfs_init(adev);
	amdgpu_xcp_cfg_sysfs_init(adev);

	if (IS_ENABLED(CONFIG_PERF_EVENTS))
		r = amdgpu_pmu_init(adev);
	if (r)
		dev_err(adev->dev, "amdgpu_pmu_init failed\n");

	/* Keep the stored PCI config space at hand for restore after a sudden PCI error */
	if (amdgpu_device_cache_pci_state(adev->pdev))
		pci_restore_state(pdev);

	/* if we have more than one VGA card, then disable the amdgpu VGA resources */
	/* this will fail for cards that aren't VGA class devices, just
	 * ignore it
	 */
	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
		vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);

	px = amdgpu_device_supports_px(ddev);

	if (px || (!dev_is_removable(&adev->pdev->dev) &&
		   apple_gmux_detect(NULL, NULL)))
		vga_switcheroo_register_client(adev->pdev,
					       &amdgpu_switcheroo_ops, px);

	if (px)
		vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);

	if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
amdgpu_xgmi_reset_on_init(adev); 4761 4762 amdgpu_device_check_iommu_direct_map(adev); 4763 4764 adev->pm_nb.notifier_call = amdgpu_device_pm_notifier; 4765 r = register_pm_notifier(&adev->pm_nb); 4766 if (r) 4767 goto failed; 4768 4769 return 0; 4770 4771 release_ras_con: 4772 if (amdgpu_sriov_vf(adev)) 4773 amdgpu_virt_release_full_gpu(adev, true); 4774 4775 /* failed in exclusive mode due to timeout */ 4776 if (amdgpu_sriov_vf(adev) && 4777 !amdgpu_sriov_runtime(adev) && 4778 amdgpu_virt_mmio_blocked(adev) && 4779 !amdgpu_virt_wait_reset(adev)) { 4780 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4781 /* Don't send request since VF is inactive. */ 4782 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4783 adev->virt.ops = NULL; 4784 r = -EAGAIN; 4785 } 4786 amdgpu_release_ras_context(adev); 4787 4788 failed: 4789 amdgpu_vf_error_trans_all(adev); 4790 4791 return r; 4792 } 4793 4794 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4795 { 4796 4797 /* Clear all CPU mappings pointing to this device */ 4798 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4799 4800 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4801 amdgpu_doorbell_fini(adev); 4802 4803 iounmap(adev->rmmio); 4804 adev->rmmio = NULL; 4805 if (adev->mman.aper_base_kaddr) 4806 iounmap(adev->mman.aper_base_kaddr); 4807 adev->mman.aper_base_kaddr = NULL; 4808 4809 /* Memory manager related */ 4810 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4811 arch_phys_wc_del(adev->gmc.vram_mtrr); 4812 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4813 } 4814 } 4815 4816 /** 4817 * amdgpu_device_fini_hw - tear down the driver 4818 * 4819 * @adev: amdgpu_device pointer 4820 * 4821 * Tear down the driver info (all asics). 4822 * Called at driver shutdown. 
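 * This is the hardware-facing half of teardown (interrupts, display, fences,
 * sysfs, MMIO unmap on unplug); amdgpu_device_fini_sw() releases the
 * remaining software state afterwards.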
4823 */ 4824 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4825 { 4826 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4827 flush_delayed_work(&adev->delayed_init_work); 4828 4829 if (adev->mman.initialized) 4830 drain_workqueue(adev->mman.bdev.wq); 4831 adev->shutdown = true; 4832 4833 unregister_pm_notifier(&adev->pm_nb); 4834 4835 /* make sure IB test finished before entering exclusive mode 4836 * to avoid preemption on IB test 4837 */ 4838 if (amdgpu_sriov_vf(adev)) { 4839 amdgpu_virt_request_full_gpu(adev, false); 4840 amdgpu_virt_fini_data_exchange(adev); 4841 } 4842 4843 /* disable all interrupts */ 4844 amdgpu_irq_disable_all(adev); 4845 if (adev->mode_info.mode_config_initialized) { 4846 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4847 drm_helper_force_disable_all(adev_to_drm(adev)); 4848 else 4849 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4850 } 4851 amdgpu_fence_driver_hw_fini(adev); 4852 4853 if (adev->pm.sysfs_initialized) 4854 amdgpu_pm_sysfs_fini(adev); 4855 if (adev->ucode_sysfs_en) 4856 amdgpu_ucode_sysfs_fini(adev); 4857 amdgpu_device_attr_sysfs_fini(adev); 4858 amdgpu_fru_sysfs_fini(adev); 4859 4860 amdgpu_reg_state_sysfs_fini(adev); 4861 amdgpu_xcp_cfg_sysfs_fini(adev); 4862 4863 /* disable ras feature must before hw fini */ 4864 amdgpu_ras_pre_fini(adev); 4865 4866 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4867 4868 amdgpu_device_ip_fini_early(adev); 4869 4870 amdgpu_irq_fini_hw(adev); 4871 4872 if (adev->mman.initialized) 4873 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4874 4875 amdgpu_gart_dummy_page_fini(adev); 4876 4877 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4878 amdgpu_device_unmap_mmio(adev); 4879 4880 } 4881 4882 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4883 { 4884 int i, idx; 4885 bool px; 4886 4887 amdgpu_device_ip_fini(adev); 4888 amdgpu_fence_driver_sw_fini(adev); 4889 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4890 adev->accel_working = false; 4891 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4892 for (i = 0; i < MAX_XCP; ++i) { 4893 dma_fence_put(adev->isolation[i].spearhead); 4894 amdgpu_sync_free(&adev->isolation[i].active); 4895 amdgpu_sync_free(&adev->isolation[i].prev); 4896 } 4897 4898 amdgpu_reset_fini(adev); 4899 4900 /* free i2c buses */ 4901 amdgpu_i2c_fini(adev); 4902 4903 if (adev->bios) { 4904 if (amdgpu_emu_mode != 1) 4905 amdgpu_atombios_fini(adev); 4906 amdgpu_bios_release(adev); 4907 } 4908 4909 kfree(adev->fru_info); 4910 adev->fru_info = NULL; 4911 4912 kfree(adev->xcp_mgr); 4913 adev->xcp_mgr = NULL; 4914 4915 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4916 4917 if (px || (!dev_is_removable(&adev->pdev->dev) && 4918 apple_gmux_detect(NULL, NULL))) 4919 vga_switcheroo_unregister_client(adev->pdev); 4920 4921 if (px) 4922 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4923 4924 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4925 vga_client_unregister(adev->pdev); 4926 4927 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4928 4929 iounmap(adev->rmmio); 4930 adev->rmmio = NULL; 4931 drm_dev_exit(idx); 4932 } 4933 4934 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4935 amdgpu_pmu_fini(adev); 4936 if (adev->mman.discovery_bin) 4937 amdgpu_discovery_fini(adev); 4938 4939 amdgpu_reset_put_reset_domain(adev->reset_domain); 4940 adev->reset_domain = NULL; 4941 4942 kfree(adev->pci_state); 4943 4944 } 4945 4946 /** 4947 * amdgpu_device_evict_resources - evict device resources 4948 * @adev: amdgpu device object 4949 * 4950 * Evicts all ttm device 
resources(vram BOs, gart table) from the lru list 4951 * of the vram memory type. Mainly used for evicting device resources 4952 * at suspend time. 4953 * 4954 */ 4955 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4956 { 4957 int ret; 4958 4959 /* No need to evict vram on APUs unless going to S4 */ 4960 if (!adev->in_s4 && (adev->flags & AMD_IS_APU)) 4961 return 0; 4962 4963 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4964 if (ret) 4965 DRM_WARN("evicting device resources failed\n"); 4966 return ret; 4967 } 4968 4969 /* 4970 * Suspend & resume. 4971 */ 4972 /** 4973 * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events 4974 * @nb: notifier block 4975 * @mode: suspend mode 4976 * @data: data 4977 * 4978 * This function is called when the system is about to suspend or hibernate. 4979 * It is used to set the appropriate flags so that eviction can be optimized 4980 * in the pm prepare callback. 4981 */ 4982 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 4983 void *data) 4984 { 4985 struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb); 4986 4987 switch (mode) { 4988 case PM_HIBERNATION_PREPARE: 4989 adev->in_s4 = true; 4990 break; 4991 case PM_POST_HIBERNATION: 4992 adev->in_s4 = false; 4993 break; 4994 } 4995 4996 return NOTIFY_DONE; 4997 } 4998 4999 /** 5000 * amdgpu_device_prepare - prepare for device suspend 5001 * 5002 * @dev: drm dev pointer 5003 * 5004 * Prepare to put the hw in the suspend state (all asics). 5005 * Returns 0 for success or an error on failure. 5006 * Called at driver suspend. 5007 */ 5008 int amdgpu_device_prepare(struct drm_device *dev) 5009 { 5010 struct amdgpu_device *adev = drm_to_adev(dev); 5011 int i, r; 5012 5013 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5014 return 0; 5015 5016 /* Evict the majority of BOs before starting suspend sequence */ 5017 r = amdgpu_device_evict_resources(adev); 5018 if (r) 5019 return r; 5020 5021 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 5022 5023 for (i = 0; i < adev->num_ip_blocks; i++) { 5024 if (!adev->ip_blocks[i].status.valid) 5025 continue; 5026 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 5027 continue; 5028 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]); 5029 if (r) 5030 return r; 5031 } 5032 5033 return 0; 5034 } 5035 5036 /** 5037 * amdgpu_device_suspend - initiate device suspend 5038 * 5039 * @dev: drm dev pointer 5040 * @notify_clients: notify in-kernel DRM clients 5041 * 5042 * Puts the hw in the suspend state (all asics). 5043 * Returns 0 for success or an error on failure. 5044 * Called at driver suspend. 
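 * Suspend runs in two IP phases: phase 1 parks the displays, then KFD and
 * user queues are suspended and resources evicted before phase 2 quiesces
 * the remaining IP blocks.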
5045 */ 5046 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) 5047 { 5048 struct amdgpu_device *adev = drm_to_adev(dev); 5049 int r = 0; 5050 5051 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5052 return 0; 5053 5054 adev->in_suspend = true; 5055 5056 if (amdgpu_sriov_vf(adev)) { 5057 amdgpu_virt_fini_data_exchange(adev); 5058 r = amdgpu_virt_request_full_gpu(adev, false); 5059 if (r) 5060 return r; 5061 } 5062 5063 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 5064 DRM_WARN("smart shift update failed\n"); 5065 5066 if (notify_clients) 5067 drm_client_dev_suspend(adev_to_drm(adev), false); 5068 5069 cancel_delayed_work_sync(&adev->delayed_init_work); 5070 5071 amdgpu_ras_suspend(adev); 5072 5073 amdgpu_device_ip_suspend_phase1(adev); 5074 5075 if (!adev->in_s0ix) { 5076 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 5077 amdgpu_userq_suspend(adev); 5078 } 5079 5080 r = amdgpu_device_evict_resources(adev); 5081 if (r) 5082 return r; 5083 5084 amdgpu_ttm_set_buffer_funcs_status(adev, false); 5085 5086 amdgpu_fence_driver_hw_fini(adev); 5087 5088 amdgpu_device_ip_suspend_phase2(adev); 5089 5090 if (amdgpu_sriov_vf(adev)) 5091 amdgpu_virt_release_full_gpu(adev, false); 5092 5093 r = amdgpu_dpm_notify_rlc_state(adev, false); 5094 if (r) 5095 return r; 5096 5097 return 0; 5098 } 5099 5100 /** 5101 * amdgpu_device_resume - initiate device resume 5102 * 5103 * @dev: drm dev pointer 5104 * @notify_clients: notify in-kernel DRM clients 5105 * 5106 * Bring the hw back to operating state (all asics). 5107 * Returns 0 for success or an error on failure. 5108 * Called at driver resume. 5109 */ 5110 int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) 5111 { 5112 struct amdgpu_device *adev = drm_to_adev(dev); 5113 int r = 0; 5114 5115 if (amdgpu_sriov_vf(adev)) { 5116 r = amdgpu_virt_request_full_gpu(adev, true); 5117 if (r) 5118 return r; 5119 } 5120 5121 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5122 return 0; 5123 5124 if (adev->in_s0ix) 5125 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 5126 5127 /* post card */ 5128 if (amdgpu_device_need_post(adev)) { 5129 r = amdgpu_device_asic_init(adev); 5130 if (r) 5131 dev_err(adev->dev, "amdgpu asic init failed\n"); 5132 } 5133 5134 r = amdgpu_device_ip_resume(adev); 5135 5136 if (r) { 5137 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 5138 goto exit; 5139 } 5140 5141 if (!adev->in_s0ix) { 5142 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 5143 if (r) 5144 goto exit; 5145 5146 r = amdgpu_userq_resume(adev); 5147 if (r) 5148 goto exit; 5149 } 5150 5151 r = amdgpu_device_ip_late_init(adev); 5152 if (r) 5153 goto exit; 5154 5155 queue_delayed_work(system_wq, &adev->delayed_init_work, 5156 msecs_to_jiffies(AMDGPU_RESUME_MS)); 5157 exit: 5158 if (amdgpu_sriov_vf(adev)) { 5159 amdgpu_virt_init_data_exchange(adev); 5160 amdgpu_virt_release_full_gpu(adev, true); 5161 } 5162 5163 if (r) 5164 return r; 5165 5166 /* Make sure IB tests flushed */ 5167 flush_delayed_work(&adev->delayed_init_work); 5168 5169 if (notify_clients) 5170 drm_client_dev_resume(adev_to_drm(adev), false); 5171 5172 amdgpu_ras_resume(adev); 5173 5174 if (adev->mode_info.num_crtc) { 5175 /* 5176 * Most of the connector probing functions try to acquire runtime pm 5177 * refs to ensure that the GPU is powered on when connector polling is 5178 * performed. Since we're calling this from a runtime PM callback, 5179 * trying to acquire rpm refs will cause us to deadlock. 
5180 * 5181 * Since we're guaranteed to be holding the rpm lock, it's safe to 5182 * temporarily disable the rpm helpers so this doesn't deadlock us. 5183 */ 5184 #ifdef CONFIG_PM 5185 dev->dev->power.disable_depth++; 5186 #endif 5187 if (!adev->dc_enabled) 5188 drm_helper_hpd_irq_event(dev); 5189 else 5190 drm_kms_helper_hotplug_event(dev); 5191 #ifdef CONFIG_PM 5192 dev->dev->power.disable_depth--; 5193 #endif 5194 } 5195 adev->in_suspend = false; 5196 5197 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 5198 DRM_WARN("smart shift update failed\n"); 5199 5200 return 0; 5201 } 5202 5203 /** 5204 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 5205 * 5206 * @adev: amdgpu_device pointer 5207 * 5208 * The list of all the hardware IPs that make up the asic is walked and 5209 * the check_soft_reset callbacks are run. check_soft_reset determines 5210 * if the asic is still hung or not. 5211 * Returns true if any of the IPs are still in a hung state, false if not. 5212 */ 5213 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 5214 { 5215 int i; 5216 bool asic_hang = false; 5217 5218 if (amdgpu_sriov_vf(adev)) 5219 return true; 5220 5221 if (amdgpu_asic_need_full_reset(adev)) 5222 return true; 5223 5224 for (i = 0; i < adev->num_ip_blocks; i++) { 5225 if (!adev->ip_blocks[i].status.valid) 5226 continue; 5227 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 5228 adev->ip_blocks[i].status.hang = 5229 adev->ip_blocks[i].version->funcs->check_soft_reset( 5230 &adev->ip_blocks[i]); 5231 if (adev->ip_blocks[i].status.hang) { 5232 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 5233 asic_hang = true; 5234 } 5235 } 5236 return asic_hang; 5237 } 5238 5239 /** 5240 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 5241 * 5242 * @adev: amdgpu_device pointer 5243 * 5244 * The list of all the hardware IPs that make up the asic is walked and the 5245 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 5246 * handles any IP specific hardware or software state changes that are 5247 * necessary for a soft reset to succeed. 5248 * Returns 0 on success, negative error code on failure. 5249 */ 5250 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 5251 { 5252 int i, r = 0; 5253 5254 for (i = 0; i < adev->num_ip_blocks; i++) { 5255 if (!adev->ip_blocks[i].status.valid) 5256 continue; 5257 if (adev->ip_blocks[i].status.hang && 5258 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 5259 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]); 5260 if (r) 5261 return r; 5262 } 5263 } 5264 5265 return 0; 5266 } 5267 5268 /** 5269 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 5270 * 5271 * @adev: amdgpu_device pointer 5272 * 5273 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 5274 * reset is necessary to recover. 5275 * Returns true if a full asic reset is required, false if not. 
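 * A hang in GMC, SMC, ACP, DCE or PSP is treated as requiring a full reset.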
5276 */ 5277 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 5278 { 5279 int i; 5280 5281 if (amdgpu_asic_need_full_reset(adev)) 5282 return true; 5283 5284 for (i = 0; i < adev->num_ip_blocks; i++) { 5285 if (!adev->ip_blocks[i].status.valid) 5286 continue; 5287 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 5288 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 5289 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 5290 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 5291 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 5292 if (adev->ip_blocks[i].status.hang) { 5293 dev_info(adev->dev, "Some block need full reset!\n"); 5294 return true; 5295 } 5296 } 5297 } 5298 return false; 5299 } 5300 5301 /** 5302 * amdgpu_device_ip_soft_reset - do a soft reset 5303 * 5304 * @adev: amdgpu_device pointer 5305 * 5306 * The list of all the hardware IPs that make up the asic is walked and the 5307 * soft_reset callbacks are run if the block is hung. soft_reset handles any 5308 * IP specific hardware or software state changes that are necessary to soft 5309 * reset the IP. 5310 * Returns 0 on success, negative error code on failure. 5311 */ 5312 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 5313 { 5314 int i, r = 0; 5315 5316 for (i = 0; i < adev->num_ip_blocks; i++) { 5317 if (!adev->ip_blocks[i].status.valid) 5318 continue; 5319 if (adev->ip_blocks[i].status.hang && 5320 adev->ip_blocks[i].version->funcs->soft_reset) { 5321 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]); 5322 if (r) 5323 return r; 5324 } 5325 } 5326 5327 return 0; 5328 } 5329 5330 /** 5331 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 5332 * 5333 * @adev: amdgpu_device pointer 5334 * 5335 * The list of all the hardware IPs that make up the asic is walked and the 5336 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 5337 * handles any IP specific hardware or software state changes that are 5338 * necessary after the IP has been soft reset. 5339 * Returns 0 on success, negative error code on failure. 
5340 */ 5341 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 5342 { 5343 int i, r = 0; 5344 5345 for (i = 0; i < adev->num_ip_blocks; i++) { 5346 if (!adev->ip_blocks[i].status.valid) 5347 continue; 5348 if (adev->ip_blocks[i].status.hang && 5349 adev->ip_blocks[i].version->funcs->post_soft_reset) 5350 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]); 5351 if (r) 5352 return r; 5353 } 5354 5355 return 0; 5356 } 5357 5358 /** 5359 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 5360 * 5361 * @adev: amdgpu_device pointer 5362 * @reset_context: amdgpu reset context pointer 5363 * 5364 * do VF FLR and reinitialize Asic 5365 * return 0 means succeeded otherwise failed 5366 */ 5367 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 5368 struct amdgpu_reset_context *reset_context) 5369 { 5370 int r; 5371 struct amdgpu_hive_info *hive = NULL; 5372 5373 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { 5374 if (!amdgpu_ras_get_fed_status(adev)) 5375 amdgpu_virt_ready_to_reset(adev); 5376 amdgpu_virt_wait_reset(adev); 5377 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5378 r = amdgpu_virt_request_full_gpu(adev, true); 5379 } else { 5380 r = amdgpu_virt_reset_gpu(adev); 5381 } 5382 if (r) 5383 return r; 5384 5385 amdgpu_ras_clear_err_state(adev); 5386 amdgpu_irq_gpu_reset_resume_helper(adev); 5387 5388 /* some sw clean up VF needs to do before recover */ 5389 amdgpu_virt_post_reset(adev); 5390 5391 /* Resume IP prior to SMC */ 5392 r = amdgpu_device_ip_reinit_early_sriov(adev); 5393 if (r) 5394 return r; 5395 5396 amdgpu_virt_init_data_exchange(adev); 5397 5398 r = amdgpu_device_fw_loading(adev); 5399 if (r) 5400 return r; 5401 5402 /* now we are okay to resume SMC/CP/SDMA */ 5403 r = amdgpu_device_ip_reinit_late_sriov(adev); 5404 if (r) 5405 return r; 5406 5407 hive = amdgpu_get_xgmi_hive(adev); 5408 /* Update PSP FW topology after reset */ 5409 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5410 r = amdgpu_xgmi_update_topology(hive, adev); 5411 if (hive) 5412 amdgpu_put_xgmi_hive(hive); 5413 if (r) 5414 return r; 5415 5416 r = amdgpu_ib_ring_tests(adev); 5417 if (r) 5418 return r; 5419 5420 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) 5421 amdgpu_inc_vram_lost(adev); 5422 5423 /* need to be called during full access so we can't do it later like 5424 * bare-metal does. 5425 */ 5426 amdgpu_amdkfd_post_reset(adev); 5427 amdgpu_virt_release_full_gpu(adev, true); 5428 5429 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5430 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) || 5431 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 5432 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 5433 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) || 5434 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5435 amdgpu_ras_resume(adev); 5436 5437 amdgpu_virt_ras_telemetry_post_reset(adev); 5438 5439 return 0; 5440 } 5441 5442 /** 5443 * amdgpu_device_has_job_running - check if there is any unfinished job 5444 * 5445 * @adev: amdgpu_device pointer 5446 * 5447 * check if there is any job running on the device when guest driver receives 5448 * FLR notification from host driver. If there are still jobs running, then 5449 * the guest driver will not respond the FLR reset. Instead, let the job hit 5450 * the timeout and guest driver then issue the reset request. 
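 * Returns true if any ring still has emitted fences that have not yet
 * completed.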
5451 */ 5452 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5453 { 5454 int i; 5455 5456 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5457 struct amdgpu_ring *ring = adev->rings[i]; 5458 5459 if (!amdgpu_ring_sched_ready(ring)) 5460 continue; 5461 5462 if (amdgpu_fence_count_emitted(ring)) 5463 return true; 5464 } 5465 return false; 5466 } 5467 5468 /** 5469 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5470 * 5471 * @adev: amdgpu_device pointer 5472 * 5473 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5474 * a hung GPU. 5475 */ 5476 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5477 { 5478 5479 if (amdgpu_gpu_recovery == 0) 5480 goto disabled; 5481 5482 /* Skip soft reset check in fatal error mode */ 5483 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5484 return true; 5485 5486 if (amdgpu_sriov_vf(adev)) 5487 return true; 5488 5489 if (amdgpu_gpu_recovery == -1) { 5490 switch (adev->asic_type) { 5491 #ifdef CONFIG_DRM_AMDGPU_SI 5492 case CHIP_VERDE: 5493 case CHIP_TAHITI: 5494 case CHIP_PITCAIRN: 5495 case CHIP_OLAND: 5496 case CHIP_HAINAN: 5497 #endif 5498 #ifdef CONFIG_DRM_AMDGPU_CIK 5499 case CHIP_KAVERI: 5500 case CHIP_KABINI: 5501 case CHIP_MULLINS: 5502 #endif 5503 case CHIP_CARRIZO: 5504 case CHIP_STONEY: 5505 case CHIP_CYAN_SKILLFISH: 5506 goto disabled; 5507 default: 5508 break; 5509 } 5510 } 5511 5512 return true; 5513 5514 disabled: 5515 dev_info(adev->dev, "GPU recovery disabled.\n"); 5516 return false; 5517 } 5518 5519 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5520 { 5521 u32 i; 5522 int ret = 0; 5523 5524 if (adev->bios) 5525 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5526 5527 dev_info(adev->dev, "GPU mode1 reset\n"); 5528 5529 /* Cache the state before bus master disable. The saved config space 5530 * values are used in other cases like restore after mode-2 reset. 
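	 * The cached state is restored with amdgpu_device_load_pci_state() once
	 * the reset has completed.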
5531 */ 5532 amdgpu_device_cache_pci_state(adev->pdev); 5533 5534 /* disable BM */ 5535 pci_clear_master(adev->pdev); 5536 5537 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5538 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5539 ret = amdgpu_dpm_mode1_reset(adev); 5540 } else { 5541 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5542 ret = psp_gpu_reset(adev); 5543 } 5544 5545 if (ret) 5546 goto mode1_reset_failed; 5547 5548 amdgpu_device_load_pci_state(adev->pdev); 5549 ret = amdgpu_psp_wait_for_bootloader(adev); 5550 if (ret) 5551 goto mode1_reset_failed; 5552 5553 /* wait for asic to come out of reset */ 5554 for (i = 0; i < adev->usec_timeout; i++) { 5555 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5556 5557 if (memsize != 0xffffffff) 5558 break; 5559 udelay(1); 5560 } 5561 5562 if (i >= adev->usec_timeout) { 5563 ret = -ETIMEDOUT; 5564 goto mode1_reset_failed; 5565 } 5566 5567 if (adev->bios) 5568 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5569 5570 return 0; 5571 5572 mode1_reset_failed: 5573 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5574 return ret; 5575 } 5576 5577 int amdgpu_device_link_reset(struct amdgpu_device *adev) 5578 { 5579 int ret = 0; 5580 5581 dev_info(adev->dev, "GPU link reset\n"); 5582 5583 if (!adev->pcie_reset_ctx.occurs_dpc) 5584 ret = amdgpu_dpm_link_reset(adev); 5585 5586 if (ret) 5587 goto link_reset_failed; 5588 5589 ret = amdgpu_psp_wait_for_bootloader(adev); 5590 if (ret) 5591 goto link_reset_failed; 5592 5593 return 0; 5594 5595 link_reset_failed: 5596 dev_err(adev->dev, "GPU link reset failed\n"); 5597 return ret; 5598 } 5599 5600 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5601 struct amdgpu_reset_context *reset_context) 5602 { 5603 int i, r = 0; 5604 struct amdgpu_job *job = NULL; 5605 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; 5606 bool need_full_reset = 5607 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5608 5609 if (reset_context->reset_req_dev == adev) 5610 job = reset_context->job; 5611 5612 if (amdgpu_sriov_vf(adev)) 5613 amdgpu_virt_pre_reset(adev); 5614 5615 amdgpu_fence_driver_isr_toggle(adev, true); 5616 5617 /* block all schedulers and reset given job's ring */ 5618 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5619 struct amdgpu_ring *ring = adev->rings[i]; 5620 5621 if (!amdgpu_ring_sched_ready(ring)) 5622 continue; 5623 5624 /* Clear job fence from fence drv to avoid force_completion 5625 * leave NULL and vm flush fence in fence drv 5626 */ 5627 amdgpu_fence_driver_clear_job_fences(ring); 5628 5629 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5630 amdgpu_fence_driver_force_completion(ring); 5631 } 5632 5633 amdgpu_fence_driver_isr_toggle(adev, false); 5634 5635 if (job && job->vm) 5636 drm_sched_increase_karma(&job->base); 5637 5638 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5639 /* If reset handler not implemented, continue; otherwise return */ 5640 if (r == -EOPNOTSUPP) 5641 r = 0; 5642 else 5643 return r; 5644 5645 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5646 if (!amdgpu_sriov_vf(adev)) { 5647 5648 if (!need_full_reset) 5649 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5650 5651 if (!need_full_reset && amdgpu_gpu_recovery && 5652 amdgpu_device_ip_check_soft_reset(adev)) { 5653 amdgpu_device_ip_pre_soft_reset(adev); 5654 r = amdgpu_device_ip_soft_reset(adev); 5655 amdgpu_device_ip_post_soft_reset(adev); 5656 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5657 
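			/* Either the soft reset itself failed or a re-check still reports
			 * a hung IP block; fall back to a full ASIC reset.
			 */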
dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5658 need_full_reset = true; 5659 } 5660 } 5661 5662 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { 5663 dev_info(tmp_adev->dev, "Dumping IP State\n"); 5664 /* Trigger ip dump before we reset the asic */ 5665 for (i = 0; i < tmp_adev->num_ip_blocks; i++) 5666 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) 5667 tmp_adev->ip_blocks[i].version->funcs 5668 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]); 5669 dev_info(tmp_adev->dev, "Dumping IP State Completed\n"); 5670 } 5671 5672 if (need_full_reset) 5673 r = amdgpu_device_ip_suspend(adev); 5674 if (need_full_reset) 5675 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5676 else 5677 clear_bit(AMDGPU_NEED_FULL_RESET, 5678 &reset_context->flags); 5679 } 5680 5681 return r; 5682 } 5683 5684 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) 5685 { 5686 struct list_head *device_list_handle; 5687 bool full_reset, vram_lost = false; 5688 struct amdgpu_device *tmp_adev; 5689 int r, init_level; 5690 5691 device_list_handle = reset_context->reset_device_list; 5692 5693 if (!device_list_handle) 5694 return -EINVAL; 5695 5696 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5697 5698 /** 5699 * If it's reset on init, it's default init level, otherwise keep level 5700 * as recovery level. 5701 */ 5702 if (reset_context->method == AMD_RESET_METHOD_ON_INIT) 5703 init_level = AMDGPU_INIT_LEVEL_DEFAULT; 5704 else 5705 init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY; 5706 5707 r = 0; 5708 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5709 amdgpu_set_init_level(tmp_adev, init_level); 5710 if (full_reset) { 5711 /* post card */ 5712 amdgpu_ras_clear_err_state(tmp_adev); 5713 r = amdgpu_device_asic_init(tmp_adev); 5714 if (r) { 5715 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5716 } else { 5717 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5718 5719 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5720 if (r) 5721 goto out; 5722 5723 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5724 5725 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 5726 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job); 5727 5728 if (vram_lost) { 5729 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5730 amdgpu_inc_vram_lost(tmp_adev); 5731 } 5732 5733 r = amdgpu_device_fw_loading(tmp_adev); 5734 if (r) 5735 return r; 5736 5737 r = amdgpu_xcp_restore_partition_mode( 5738 tmp_adev->xcp_mgr); 5739 if (r) 5740 goto out; 5741 5742 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5743 if (r) 5744 goto out; 5745 5746 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 5747 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5748 5749 r = amdgpu_device_ip_resume_phase3(tmp_adev); 5750 if (r) 5751 goto out; 5752 5753 if (vram_lost) 5754 amdgpu_device_fill_reset_magic(tmp_adev); 5755 5756 /* 5757 * Add this ASIC as tracked as reset was already 5758 * complete successfully. 5759 */ 5760 amdgpu_register_gpu_instance(tmp_adev); 5761 5762 if (!reset_context->hive && 5763 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5764 amdgpu_xgmi_add_device(tmp_adev); 5765 5766 r = amdgpu_device_ip_late_init(tmp_adev); 5767 if (r) 5768 goto out; 5769 5770 drm_client_dev_resume(adev_to_drm(tmp_adev), false); 5771 5772 /* 5773 * The GPU enters bad state once faulty pages 5774 * by ECC has reached the threshold, and ras 5775 * recovery is scheduled next. 
So add one check 5776 * here to break recovery if it indeed exceeds 5777 * bad page threshold, and remind user to 5778 * retire this GPU or setting one bigger 5779 * bad_page_threshold value to fix this once 5780 * probing driver again. 5781 */ 5782 if (!amdgpu_ras_is_rma(tmp_adev)) { 5783 /* must succeed. */ 5784 amdgpu_ras_resume(tmp_adev); 5785 } else { 5786 r = -EINVAL; 5787 goto out; 5788 } 5789 5790 /* Update PSP FW topology after reset */ 5791 if (reset_context->hive && 5792 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5793 r = amdgpu_xgmi_update_topology( 5794 reset_context->hive, tmp_adev); 5795 } 5796 } 5797 5798 out: 5799 if (!r) { 5800 /* IP init is complete now, set level as default */ 5801 amdgpu_set_init_level(tmp_adev, 5802 AMDGPU_INIT_LEVEL_DEFAULT); 5803 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5804 r = amdgpu_ib_ring_tests(tmp_adev); 5805 if (r) { 5806 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5807 r = -EAGAIN; 5808 goto end; 5809 } 5810 } 5811 5812 if (r) 5813 tmp_adev->asic_reset_res = r; 5814 } 5815 5816 end: 5817 return r; 5818 } 5819 5820 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5821 struct amdgpu_reset_context *reset_context) 5822 { 5823 struct amdgpu_device *tmp_adev = NULL; 5824 bool need_full_reset, skip_hw_reset; 5825 int r = 0; 5826 5827 /* Try reset handler method first */ 5828 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5829 reset_list); 5830 5831 reset_context->reset_device_list = device_list_handle; 5832 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5833 /* If reset handler not implemented, continue; otherwise return */ 5834 if (r == -EOPNOTSUPP) 5835 r = 0; 5836 else 5837 return r; 5838 5839 /* Reset handler not implemented, use the default method */ 5840 need_full_reset = 5841 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5842 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5843 5844 /* 5845 * ASIC reset has to be done on all XGMI hive nodes ASAP 5846 * to allow proper links negotiation in FW (within 1 sec) 5847 */ 5848 if (!skip_hw_reset && need_full_reset) { 5849 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5850 /* For XGMI run all resets in parallel to speed up the process */ 5851 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5852 if (!queue_work(system_unbound_wq, 5853 &tmp_adev->xgmi_reset_work)) 5854 r = -EALREADY; 5855 } else 5856 r = amdgpu_asic_reset(tmp_adev); 5857 5858 if (r) { 5859 dev_err(tmp_adev->dev, 5860 "ASIC reset failed with error, %d for drm dev, %s", 5861 r, adev_to_drm(tmp_adev)->unique); 5862 goto out; 5863 } 5864 } 5865 5866 /* For XGMI wait for all resets to complete before proceed */ 5867 if (!r) { 5868 list_for_each_entry(tmp_adev, device_list_handle, 5869 reset_list) { 5870 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5871 flush_work(&tmp_adev->xgmi_reset_work); 5872 r = tmp_adev->asic_reset_res; 5873 if (r) 5874 break; 5875 } 5876 } 5877 } 5878 } 5879 5880 if (!r && amdgpu_ras_intr_triggered()) { 5881 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5882 amdgpu_ras_reset_error_count(tmp_adev, 5883 AMDGPU_RAS_BLOCK__MMHUB); 5884 } 5885 5886 amdgpu_ras_intr_cleared(); 5887 } 5888 5889 r = amdgpu_device_reinit_after_reset(reset_context); 5890 if (r == -EAGAIN) 5891 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5892 else 5893 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5894 5895 out: 5896 return r; 5897 } 5898 5899 static void 
amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5900 { 5901 5902 switch (amdgpu_asic_reset_method(adev)) { 5903 case AMD_RESET_METHOD_MODE1: 5904 case AMD_RESET_METHOD_LINK: 5905 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5906 break; 5907 case AMD_RESET_METHOD_MODE2: 5908 adev->mp1_state = PP_MP1_STATE_RESET; 5909 break; 5910 default: 5911 adev->mp1_state = PP_MP1_STATE_NONE; 5912 break; 5913 } 5914 } 5915 5916 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5917 { 5918 amdgpu_vf_error_trans_all(adev); 5919 adev->mp1_state = PP_MP1_STATE_NONE; 5920 } 5921 5922 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5923 { 5924 struct pci_dev *p = NULL; 5925 5926 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5927 adev->pdev->bus->number, 1); 5928 if (p) { 5929 pm_runtime_enable(&(p->dev)); 5930 pm_runtime_resume(&(p->dev)); 5931 } 5932 5933 pci_dev_put(p); 5934 } 5935 5936 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5937 { 5938 enum amd_reset_method reset_method; 5939 struct pci_dev *p = NULL; 5940 u64 expires; 5941 5942 /* 5943 * For now, only BACO and mode1 reset are confirmed 5944 * to suffer the audio issue without proper suspended. 5945 */ 5946 reset_method = amdgpu_asic_reset_method(adev); 5947 if ((reset_method != AMD_RESET_METHOD_BACO) && 5948 (reset_method != AMD_RESET_METHOD_MODE1)) 5949 return -EINVAL; 5950 5951 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5952 adev->pdev->bus->number, 1); 5953 if (!p) 5954 return -ENODEV; 5955 5956 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5957 if (!expires) 5958 /* 5959 * If we cannot get the audio device autosuspend delay, 5960 * a fixed 4S interval will be used. Considering 3S is 5961 * the audio controller default autosuspend delay setting. 5962 * 4S used here is guaranteed to cover that. 5963 */ 5964 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5965 5966 while (!pm_runtime_status_suspended(&(p->dev))) { 5967 if (!pm_runtime_suspend(&(p->dev))) 5968 break; 5969 5970 if (expires < ktime_get_mono_fast_ns()) { 5971 dev_warn(adev->dev, "failed to suspend display audio\n"); 5972 pci_dev_put(p); 5973 /* TODO: abort the succeeding gpu reset? 
*/ 5974 return -ETIMEDOUT; 5975 } 5976 } 5977 5978 pm_runtime_disable(&(p->dev)); 5979 5980 pci_dev_put(p); 5981 return 0; 5982 } 5983 5984 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5985 { 5986 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5987 5988 #if defined(CONFIG_DEBUG_FS) 5989 if (!amdgpu_sriov_vf(adev)) 5990 cancel_work(&adev->reset_work); 5991 #endif 5992 5993 if (adev->kfd.dev) 5994 cancel_work(&adev->kfd.reset_work); 5995 5996 if (amdgpu_sriov_vf(adev)) 5997 cancel_work(&adev->virt.flr_work); 5998 5999 if (con && adev->ras_enabled) 6000 cancel_work(&con->recovery_work); 6001 6002 } 6003 6004 static int amdgpu_device_health_check(struct list_head *device_list_handle) 6005 { 6006 struct amdgpu_device *tmp_adev; 6007 int ret = 0; 6008 u32 status; 6009 6010 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6011 pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status); 6012 if (PCI_POSSIBLE_ERROR(status)) { 6013 dev_err(tmp_adev->dev, "device lost from bus!"); 6014 ret = -ENODEV; 6015 } 6016 } 6017 6018 return ret; 6019 } 6020 6021 static int amdgpu_device_halt_activities(struct amdgpu_device *adev, 6022 struct amdgpu_job *job, 6023 struct amdgpu_reset_context *reset_context, 6024 struct list_head *device_list, 6025 struct amdgpu_hive_info *hive, 6026 bool need_emergency_restart) 6027 { 6028 struct list_head *device_list_handle = NULL; 6029 struct amdgpu_device *tmp_adev = NULL; 6030 int i, r = 0; 6031 6032 /* 6033 * Build list of devices to reset. 6034 * In case we are in XGMI hive mode, resort the device list 6035 * to put adev in the 1st position. 6036 */ 6037 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) { 6038 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 6039 list_add_tail(&tmp_adev->reset_list, device_list); 6040 if (adev->shutdown) 6041 tmp_adev->shutdown = true; 6042 if (adev->pcie_reset_ctx.occurs_dpc) 6043 tmp_adev->pcie_reset_ctx.in_link_reset = true; 6044 } 6045 if (!list_is_first(&adev->reset_list, device_list)) 6046 list_rotate_to_front(&adev->reset_list, device_list); 6047 device_list_handle = device_list; 6048 } else { 6049 list_add_tail(&adev->reset_list, device_list); 6050 device_list_handle = device_list; 6051 } 6052 6053 if (!amdgpu_sriov_vf(adev) && (!adev->pcie_reset_ctx.occurs_dpc)) { 6054 r = amdgpu_device_health_check(device_list_handle); 6055 if (r) 6056 return r; 6057 } 6058 6059 /* We need to lock reset domain only once both for XGMI and single device */ 6060 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 6061 reset_list); 6062 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 6063 6064 /* block all schedulers and reset given job's ring */ 6065 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6066 6067 amdgpu_device_set_mp1_state(tmp_adev); 6068 6069 /* 6070 * Try to put the audio codec into suspend state 6071 * before gpu reset started. 6072 * 6073 * Due to the power domain of the graphics device 6074 * is shared with AZ power domain. Without this, 6075 * we may change the audio hardware from behind 6076 * the audio driver's back. That will trigger 6077 * some audio codec errors. 
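 *
 * Suspend is attempted through runtime PM on PCI function 1 of the GPU
 * (the audio function); amdgpu_device_suspend_display_audio() above polls
 * pm_runtime_suspend() with an upper bound of roughly 4 seconds. A hedged
 * sketch of that polling pattern (mirroring the helper above):
 *
 *   p = pci_get_domain_bus_and_slot(domain, bus, 1);
 *   while (!pm_runtime_status_suspended(&p->dev))
 *           if (!pm_runtime_suspend(&p->dev))
 *                   break;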
6078 */ 6079 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 6080 tmp_adev->pcie_reset_ctx.audio_suspended = true; 6081 6082 amdgpu_ras_set_error_query_ready(tmp_adev, false); 6083 6084 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 6085 6086 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context); 6087 6088 /* 6089 * Mark these ASICs to be reset as untracked first 6090 * And add them back after reset completed 6091 */ 6092 amdgpu_unregister_gpu_instance(tmp_adev); 6093 6094 drm_client_dev_suspend(adev_to_drm(tmp_adev), false); 6095 6096 /* disable ras on ALL IPs */ 6097 if (!need_emergency_restart && 6098 (!adev->pcie_reset_ctx.occurs_dpc) && 6099 amdgpu_device_ip_need_full_reset(tmp_adev)) 6100 amdgpu_ras_suspend(tmp_adev); 6101 6102 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6103 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6104 6105 if (!amdgpu_ring_sched_ready(ring)) 6106 continue; 6107 6108 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 6109 6110 if (need_emergency_restart) 6111 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 6112 } 6113 atomic_inc(&tmp_adev->gpu_reset_counter); 6114 } 6115 6116 return r; 6117 } 6118 6119 static int amdgpu_device_asic_reset(struct amdgpu_device *adev, 6120 struct list_head *device_list, 6121 struct amdgpu_reset_context *reset_context) 6122 { 6123 struct amdgpu_device *tmp_adev = NULL; 6124 int retry_limit = AMDGPU_MAX_RETRY_LIMIT; 6125 int r = 0; 6126 6127 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 6128 list_for_each_entry(tmp_adev, device_list, reset_list) { 6129 if (adev->pcie_reset_ctx.occurs_dpc) 6130 tmp_adev->no_hw_access = true; 6131 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 6132 if (adev->pcie_reset_ctx.occurs_dpc) 6133 tmp_adev->no_hw_access = false; 6134 /*TODO Should we stop ?*/ 6135 if (r) { 6136 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 6137 r, adev_to_drm(tmp_adev)->unique); 6138 tmp_adev->asic_reset_res = r; 6139 } 6140 } 6141 6142 /* Actual ASIC resets if needed.*/ 6143 /* Host driver will handle XGMI hive reset for SRIOV */ 6144 if (amdgpu_sriov_vf(adev)) { 6145 6146 /* Bail out of reset early */ 6147 if (amdgpu_ras_is_rma(adev)) 6148 return -ENODEV; 6149 6150 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { 6151 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); 6152 amdgpu_ras_set_fed(adev, true); 6153 set_bit(AMDGPU_HOST_FLR, &reset_context->flags); 6154 } 6155 6156 r = amdgpu_device_reset_sriov(adev, reset_context); 6157 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) { 6158 amdgpu_virt_release_full_gpu(adev, true); 6159 goto retry; 6160 } 6161 if (r) 6162 adev->asic_reset_res = r; 6163 } else { 6164 r = amdgpu_do_asic_reset(device_list, reset_context); 6165 if (r && r == -EAGAIN) 6166 goto retry; 6167 } 6168 6169 list_for_each_entry(tmp_adev, device_list, reset_list) { 6170 /* 6171 * Drop any pending non scheduler resets queued before reset is done. 6172 * Any reset scheduled after this point would be valid. Scheduler resets 6173 * were already dropped during drm_sched_stop and no new ones can come 6174 * in before drm_sched_start. 
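 *
 * Concretely, the helper cancels the debugfs reset_work, the KFD
 * reset_work, the SR-IOV FLR work and the RAS recovery_work (see
 * amdgpu_device_stop_pending_resets() above).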
6175 */ 6176 amdgpu_device_stop_pending_resets(tmp_adev); 6177 } 6178 6179 return r; 6180 } 6181 6182 static int amdgpu_device_sched_resume(struct list_head *device_list, 6183 struct amdgpu_reset_context *reset_context, 6184 bool job_signaled) 6185 { 6186 struct amdgpu_device *tmp_adev = NULL; 6187 int i, r = 0; 6188 6189 /* Post ASIC reset for all devs. */ 6190 list_for_each_entry(tmp_adev, device_list, reset_list) { 6191 6192 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6193 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6194 6195 if (!amdgpu_ring_sched_ready(ring)) 6196 continue; 6197 6198 drm_sched_start(&ring->sched, 0); 6199 } 6200 6201 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 6202 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 6203 6204 if (tmp_adev->asic_reset_res) 6205 r = tmp_adev->asic_reset_res; 6206 6207 tmp_adev->asic_reset_res = 0; 6208 6209 if (r) { 6210 /* Bad news, how do we tell it to userspace? 6211 * For a RAS error, we should report GPU bad status instead of 6212 * reset failure. 6213 */ 6214 if (reset_context->src != AMDGPU_RESET_SRC_RAS || 6215 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) 6216 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", 6217 atomic_read(&tmp_adev->gpu_reset_counter)); 6218 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 6219 } else { 6220 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 6221 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 6222 DRM_WARN("smart shift update failed\n"); 6223 } 6224 } 6225 6226 return r; 6227 } 6228 6229 static void amdgpu_device_gpu_resume(struct amdgpu_device *adev, 6230 struct list_head *device_list, 6231 bool need_emergency_restart) 6232 { 6233 struct amdgpu_device *tmp_adev = NULL; 6234 6235 list_for_each_entry(tmp_adev, device_list, reset_list) { 6236 /* unlock kfd: SRIOV would do it separately */ 6237 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 6238 amdgpu_amdkfd_post_reset(tmp_adev); 6239 6240 /* kfd_post_reset will do nothing if the kfd device is not initialized, 6241 * so bring up kfd here if it was not initialized before 6242 */ 6243 if (!adev->kfd.init_complete) 6244 amdgpu_amdkfd_device_init(adev); 6245 6246 if (tmp_adev->pcie_reset_ctx.audio_suspended) 6247 amdgpu_device_resume_display_audio(tmp_adev); 6248 6249 amdgpu_device_unset_mp1_state(tmp_adev); 6250 6251 amdgpu_ras_set_error_query_ready(tmp_adev, true); 6252 6253 } 6254 6255 tmp_adev = list_first_entry(device_list, struct amdgpu_device, 6256 reset_list); 6257 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 6258 6259 } 6260 6261 6262 /** 6263 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 6264 * 6265 * @adev: amdgpu_device pointer 6266 * @job: the job which triggered the hang 6267 * @reset_context: amdgpu reset context pointer 6268 * 6269 * Attempt to reset the GPU if it has hung (all asics). 6270 * Attempt to do a soft reset or full reset and reinitialize the ASIC. 6271 * Returns 0 for success or an error on failure.
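 *
 * Illustrative caller sketch (hedged; the reset_context fields mirror the
 * way this file fills them in amdgpu_pci_slot_reset(), this is not a
 * verbatim in-tree call site):
 *
 *   struct amdgpu_reset_context reset_context;
 *
 *   memset(&reset_context, 0, sizeof(reset_context));
 *   reset_context.method = AMD_RESET_METHOD_NONE;
 *   reset_context.reset_req_dev = adev;
 *   set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
 *   r = amdgpu_device_gpu_recover(adev, job, &reset_context);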
6272 */ 6273 6274 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 6275 struct amdgpu_job *job, 6276 struct amdgpu_reset_context *reset_context) 6277 { 6278 struct list_head device_list; 6279 bool job_signaled = false; 6280 struct amdgpu_hive_info *hive = NULL; 6281 int r = 0; 6282 bool need_emergency_restart = false; 6283 6284 /* 6285 * If it reaches here because of hang/timeout and a RAS error is 6286 * detected at the same time, let RAS recovery take care of it. 6287 */ 6288 if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) && 6289 !amdgpu_sriov_vf(adev) && 6290 reset_context->src != AMDGPU_RESET_SRC_RAS) { 6291 dev_dbg(adev->dev, 6292 "Gpu recovery from source: %d yielding to RAS error recovery handling", 6293 reset_context->src); 6294 return 0; 6295 } 6296 6297 /* 6298 * Special case: RAS triggered and full reset isn't supported 6299 */ 6300 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 6301 6302 /* 6303 * Flush RAM to disk so that after reboot 6304 * the user can read log and see why the system rebooted. 6305 */ 6306 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 6307 amdgpu_ras_get_context(adev)->reboot) { 6308 DRM_WARN("Emergency reboot."); 6309 6310 ksys_sync_helper(); 6311 emergency_restart(); 6312 } 6313 6314 dev_info(adev->dev, "GPU %s begin!\n", 6315 need_emergency_restart ? "jobs stop":"reset"); 6316 6317 if (!amdgpu_sriov_vf(adev)) 6318 hive = amdgpu_get_xgmi_hive(adev); 6319 if (hive) 6320 mutex_lock(&hive->hive_lock); 6321 6322 reset_context->job = job; 6323 reset_context->hive = hive; 6324 INIT_LIST_HEAD(&device_list); 6325 6326 r = amdgpu_device_halt_activities(adev, job, reset_context, &device_list, 6327 hive, need_emergency_restart); 6328 if (r) 6329 goto end_reset; 6330 6331 if (need_emergency_restart) 6332 goto skip_sched_resume; 6333 /* 6334 * Must check guilty signal here since after this point all old 6335 * HW fences are force signaled. 6336 * 6337 * job->base holds a reference to parent fence 6338 */ 6339 if (job && dma_fence_is_signaled(&job->hw_fence)) { 6340 job_signaled = true; 6341 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 6342 goto skip_hw_reset; 6343 } 6344 6345 r = amdgpu_device_asic_reset(adev, &device_list, reset_context); 6346 if (r) 6347 goto end_reset; 6348 skip_hw_reset: 6349 r = amdgpu_device_sched_resume(&device_list, reset_context, job_signaled); 6350 if (r) 6351 goto end_reset; 6352 skip_sched_resume: 6353 amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart); 6354 end_reset: 6355 if (hive) { 6356 mutex_unlock(&hive->hive_lock); 6357 amdgpu_put_xgmi_hive(hive); 6358 } 6359 6360 if (r) 6361 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 6362 6363 atomic_set(&adev->reset_domain->reset_res, r); 6364 6365 if (!r) 6366 drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE); 6367 6368 return r; 6369 } 6370 6371 /** 6372 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner 6373 * 6374 * @adev: amdgpu_device pointer 6375 * @speed: pointer to the speed of the link 6376 * @width: pointer to the width of the link 6377 * 6378 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6379 * first physical partner to an AMD dGPU. 6380 * This will exclude any virtual switches and links. 
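 *
 * For example, on a hypothetical topology of
 *   root port -> platform switch -> dGPU-internal bridge (vendor ATI) -> GPU
 * the dGPU-internal bridge is skipped and the platform switch is the partner
 * whose capabilities are reported (illustrative topology, not a specific
 * product).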
6381 */ 6382 static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev, 6383 enum pci_bus_speed *speed, 6384 enum pcie_link_width *width) 6385 { 6386 struct pci_dev *parent = adev->pdev; 6387 6388 if (!speed || !width) 6389 return; 6390 6391 *speed = PCI_SPEED_UNKNOWN; 6392 *width = PCIE_LNK_WIDTH_UNKNOWN; 6393 6394 if (amdgpu_device_pcie_dynamic_switching_supported(adev)) { 6395 while ((parent = pci_upstream_bridge(parent))) { 6396 /* skip upstream/downstream switches internal to dGPU */ 6397 if (parent->vendor == PCI_VENDOR_ID_ATI) 6398 continue; 6399 *speed = pcie_get_speed_cap(parent); 6400 *width = pcie_get_width_cap(parent); 6401 break; 6402 } 6403 } else { 6404 /* use the current speeds rather than max if switching is not supported */ 6405 pcie_bandwidth_available(adev->pdev, NULL, speed, width); 6406 } 6407 } 6408 6409 /** 6410 * amdgpu_device_gpu_bandwidth - find the bandwidth of the GPU 6411 * 6412 * @adev: amdgpu_device pointer 6413 * @speed: pointer to the speed of the link 6414 * @width: pointer to the width of the link 6415 * 6416 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6417 * AMD dGPU which may be a virtual upstream bridge. 6418 */ 6419 static void amdgpu_device_gpu_bandwidth(struct amdgpu_device *adev, 6420 enum pci_bus_speed *speed, 6421 enum pcie_link_width *width) 6422 { 6423 struct pci_dev *parent = adev->pdev; 6424 6425 if (!speed || !width) 6426 return; 6427 6428 parent = pci_upstream_bridge(parent); 6429 if (parent && parent->vendor == PCI_VENDOR_ID_ATI) { 6430 /* use the upstream/downstream switches internal to dGPU */ 6431 *speed = pcie_get_speed_cap(parent); 6432 *width = pcie_get_width_cap(parent); 6433 while ((parent = pci_upstream_bridge(parent))) { 6434 if (parent->vendor == PCI_VENDOR_ID_ATI) { 6435 /* use the upstream/downstream switches internal to dGPU */ 6436 *speed = pcie_get_speed_cap(parent); 6437 *width = pcie_get_width_cap(parent); 6438 } 6439 } 6440 } else { 6441 /* use the device itself */ 6442 *speed = pcie_get_speed_cap(adev->pdev); 6443 *width = pcie_get_width_cap(adev->pdev); 6444 } 6445 } 6446 6447 /** 6448 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot 6449 * 6450 * @adev: amdgpu_device pointer 6451 * 6452 * Fetches and stores in the driver the PCIE capabilities (gen speed 6453 * and lanes) of the slot the device is in. Handles APUs and 6454 * virtualized environments where PCIE config space may not be available.
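 *
 * Worked example based on the switch statements below: a Gen4-capable ASIC
 * sitting in a Gen3-limited slot ends up with
 * CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1..GEN4 set in pcie_gen_mask for the
 * ASIC side and only CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1..GEN3 set for the
 * platform side.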
6455 */ 6456 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 6457 { 6458 enum pci_bus_speed speed_cap, platform_speed_cap; 6459 enum pcie_link_width platform_link_width, link_width; 6460 6461 if (amdgpu_pcie_gen_cap) 6462 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 6463 6464 if (amdgpu_pcie_lane_cap) 6465 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 6466 6467 /* covers APUs as well */ 6468 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 6469 if (adev->pm.pcie_gen_mask == 0) 6470 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 6471 if (adev->pm.pcie_mlw_mask == 0) 6472 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 6473 return; 6474 } 6475 6476 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 6477 return; 6478 6479 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap, 6480 &platform_link_width); 6481 amdgpu_device_gpu_bandwidth(adev, &speed_cap, &link_width); 6482 6483 if (adev->pm.pcie_gen_mask == 0) { 6484 /* asic caps */ 6485 if (speed_cap == PCI_SPEED_UNKNOWN) { 6486 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6487 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6488 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6489 } else { 6490 if (speed_cap == PCIE_SPEED_32_0GT) 6491 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6492 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6493 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6494 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6495 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 6496 else if (speed_cap == PCIE_SPEED_16_0GT) 6497 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6498 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6499 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6500 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 6501 else if (speed_cap == PCIE_SPEED_8_0GT) 6502 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6503 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6504 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6505 else if (speed_cap == PCIE_SPEED_5_0GT) 6506 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6507 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 6508 else 6509 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 6510 } 6511 /* platform caps */ 6512 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 6513 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6514 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6515 } else { 6516 if (platform_speed_cap == PCIE_SPEED_32_0GT) 6517 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6518 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6519 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6520 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6521 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 6522 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 6523 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6524 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6525 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6526 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 6527 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 6528 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6529 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6530 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 6531 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 6532 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6533 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6534 else 6535 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 6536 6537 } 6538 } 6539 if (adev->pm.pcie_mlw_mask == 0) { 6540 /* asic caps */ 6541 if (link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6542 
adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_ASIC_PCIE_MLW_MASK; 6543 } else { 6544 switch (link_width) { 6545 case PCIE_LNK_X32: 6546 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X32 | 6547 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6548 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6549 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6550 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6551 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6552 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6553 break; 6554 case PCIE_LNK_X16: 6555 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6556 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6557 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6558 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6559 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6560 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6561 break; 6562 case PCIE_LNK_X12: 6563 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6564 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6565 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6566 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6567 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6568 break; 6569 case PCIE_LNK_X8: 6570 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6571 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6572 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6573 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6574 break; 6575 case PCIE_LNK_X4: 6576 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6577 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6578 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6579 break; 6580 case PCIE_LNK_X2: 6581 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6582 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6583 break; 6584 case PCIE_LNK_X1: 6585 adev->pm.pcie_mlw_mask |= CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1; 6586 break; 6587 default: 6588 break; 6589 } 6590 } 6591 /* platform caps */ 6592 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6593 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 6594 } else { 6595 switch (platform_link_width) { 6596 case PCIE_LNK_X32: 6597 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 6598 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6599 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6600 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6601 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6602 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6603 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6604 break; 6605 case PCIE_LNK_X16: 6606 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6607 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6608 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6609 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6610 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6611 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6612 break; 6613 case PCIE_LNK_X12: 6614 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6615 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6616 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6617 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6618 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6619 break; 6620 case PCIE_LNK_X8: 6621 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6622 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6623 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6624 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6625 break; 6626 case PCIE_LNK_X4: 6627 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6628 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6629 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6630 break; 6631 case PCIE_LNK_X2: 6632 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6633 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6634 break; 6635 case PCIE_LNK_X1: 6636 adev->pm.pcie_mlw_mask |= CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 6637 break; 6638 
default: 6639 break; 6640 } 6641 } 6642 } 6643 } 6644 6645 /** 6646 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 6647 * 6648 * @adev: amdgpu_device pointer 6649 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 6650 * 6651 * Return true if @peer_adev can access (DMA) @adev through the PCIe 6652 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 6653 * @peer_adev. 6654 */ 6655 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 6656 struct amdgpu_device *peer_adev) 6657 { 6658 #ifdef CONFIG_HSA_AMD_P2P 6659 bool p2p_access = 6660 !adev->gmc.xgmi.connected_to_cpu && 6661 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 6662 if (!p2p_access) 6663 dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n", 6664 pci_name(peer_adev->pdev)); 6665 6666 bool is_large_bar = adev->gmc.visible_vram_size && 6667 adev->gmc.real_vram_size == adev->gmc.visible_vram_size; 6668 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev); 6669 6670 if (!p2p_addressable) { 6671 uint64_t address_mask = peer_adev->dev->dma_mask ? 6672 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 6673 resource_size_t aper_limit = 6674 adev->gmc.aper_base + adev->gmc.aper_size - 1; 6675 6676 p2p_addressable = !(adev->gmc.aper_base & address_mask || 6677 aper_limit & address_mask); 6678 } 6679 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable; 6680 #else 6681 return false; 6682 #endif 6683 } 6684 6685 int amdgpu_device_baco_enter(struct drm_device *dev) 6686 { 6687 struct amdgpu_device *adev = drm_to_adev(dev); 6688 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6689 6690 if (!amdgpu_device_supports_baco(dev)) 6691 return -ENOTSUPP; 6692 6693 if (ras && adev->ras_enabled && 6694 adev->nbio.funcs->enable_doorbell_interrupt) 6695 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 6696 6697 return amdgpu_dpm_baco_enter(adev); 6698 } 6699 6700 int amdgpu_device_baco_exit(struct drm_device *dev) 6701 { 6702 struct amdgpu_device *adev = drm_to_adev(dev); 6703 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6704 int ret = 0; 6705 6706 if (!amdgpu_device_supports_baco(dev)) 6707 return -ENOTSUPP; 6708 6709 ret = amdgpu_dpm_baco_exit(adev); 6710 if (ret) 6711 return ret; 6712 6713 if (ras && adev->ras_enabled && 6714 adev->nbio.funcs->enable_doorbell_interrupt) 6715 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 6716 6717 if (amdgpu_passthrough(adev) && adev->nbio.funcs && 6718 adev->nbio.funcs->clear_doorbell_interrupt) 6719 adev->nbio.funcs->clear_doorbell_interrupt(adev); 6720 6721 return 0; 6722 } 6723 6724 /** 6725 * amdgpu_pci_error_detected - Called when a PCI error is detected. 6726 * @pdev: PCI device struct 6727 * @state: PCI channel state 6728 * 6729 * Description: Called when a PCI error is detected. 6730 * 6731 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
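 *
 * Channel state handling, summarized from the switch below:
 *   pci_channel_io_normal       -> PCI_ERS_RESULT_CAN_RECOVER
 *   pci_channel_io_frozen       -> halt device activities, then
 *                                  PCI_ERS_RESULT_NEED_RESET
 *   pci_channel_io_perm_failure -> PCI_ERS_RESULT_DISCONNECT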
6732 */ 6733 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 6734 { 6735 struct drm_device *dev = pci_get_drvdata(pdev); 6736 struct amdgpu_device *adev = drm_to_adev(dev); 6737 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 6738 struct amdgpu_reset_context reset_context; 6739 struct list_head device_list; 6740 int r = 0; 6741 6742 dev_info(adev->dev, "PCI error: detected callback!!\n"); 6743 6744 if (!amdgpu_dpm_is_link_reset_supported(adev)) { 6745 dev_warn(adev->dev, "No support for XGMI hive yet...\n"); 6746 return PCI_ERS_RESULT_DISCONNECT; 6747 } 6748 6749 adev->pci_channel_state = state; 6750 6751 switch (state) { 6752 case pci_channel_io_normal: 6753 dev_info(adev->dev, "pci_channel_io_normal: state(%d)!!\n", state); 6754 return PCI_ERS_RESULT_CAN_RECOVER; 6755 case pci_channel_io_frozen: 6756 /* Fatal error, prepare for slot reset */ 6757 dev_info(adev->dev, "pci_channel_io_frozen: state(%d)!!\n", state); 6758 6759 if (hive) 6760 mutex_lock(&hive->hive_lock); 6761 adev->pcie_reset_ctx.occurs_dpc = true; 6762 memset(&reset_context, 0, sizeof(reset_context)); 6763 INIT_LIST_HEAD(&device_list); 6764 6765 r = amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list, 6766 hive, false); 6767 if (hive) { 6768 mutex_unlock(&hive->hive_lock); 6769 amdgpu_put_xgmi_hive(hive); 6770 } 6771 if (r) 6772 return PCI_ERS_RESULT_DISCONNECT; 6773 return PCI_ERS_RESULT_NEED_RESET; 6774 case pci_channel_io_perm_failure: 6775 /* Permanent error, prepare for device removal */ 6776 dev_info(adev->dev, "pci_channel_io_perm_failure: state(%d)!!\n", state); 6777 return PCI_ERS_RESULT_DISCONNECT; 6778 } 6779 6780 return PCI_ERS_RESULT_NEED_RESET; 6781 } 6782 6783 /** 6784 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 6785 * @pdev: pointer to PCI device 6786 */ 6787 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 6788 { 6789 struct drm_device *dev = pci_get_drvdata(pdev); 6790 struct amdgpu_device *adev = drm_to_adev(dev); 6791 6792 dev_info(adev->dev, "PCI error: mmio enabled callback!!\n"); 6793 6794 /* TODO - dump whatever for debugging purposes */ 6795 6796 /* This called only if amdgpu_pci_error_detected returns 6797 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 6798 * works, no need to reset slot. 6799 */ 6800 6801 return PCI_ERS_RESULT_RECOVERED; 6802 } 6803 6804 /** 6805 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 6806 * @pdev: PCI device struct 6807 * 6808 * Description: This routine is called by the pci error recovery 6809 * code after the PCI slot has been reset, just before we 6810 * should resume normal operations. 
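 *
 * Recovery sequence, summarized from the body below: restore the cached PCI
 * config space, poll the config-space memsize register until the ASIC
 * responds, then run a full ASIC reset for the device (or for every node of
 * an XGMI hive).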
6811 */ 6812 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 6813 { 6814 struct drm_device *dev = pci_get_drvdata(pdev); 6815 struct amdgpu_device *adev = drm_to_adev(dev); 6816 struct amdgpu_reset_context reset_context; 6817 struct amdgpu_device *tmp_adev; 6818 struct amdgpu_hive_info *hive; 6819 struct list_head device_list; 6820 int r = 0, i; 6821 u32 memsize; 6822 6823 /* PCI error slot reset should be skipped During RAS recovery */ 6824 if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 6825 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && 6826 amdgpu_ras_in_recovery(adev)) 6827 return PCI_ERS_RESULT_RECOVERED; 6828 6829 dev_info(adev->dev, "PCI error: slot reset callback!!\n"); 6830 6831 memset(&reset_context, 0, sizeof(reset_context)); 6832 6833 /* wait for asic to come out of reset */ 6834 msleep(700); 6835 6836 /* Restore PCI confspace */ 6837 amdgpu_device_load_pci_state(pdev); 6838 6839 /* confirm ASIC came out of reset */ 6840 for (i = 0; i < adev->usec_timeout; i++) { 6841 memsize = amdgpu_asic_get_config_memsize(adev); 6842 6843 if (memsize != 0xffffffff) 6844 break; 6845 udelay(1); 6846 } 6847 if (memsize == 0xffffffff) { 6848 r = -ETIME; 6849 goto out; 6850 } 6851 6852 reset_context.method = AMD_RESET_METHOD_NONE; 6853 reset_context.reset_req_dev = adev; 6854 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 6855 set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags); 6856 INIT_LIST_HEAD(&device_list); 6857 6858 hive = amdgpu_get_xgmi_hive(adev); 6859 if (hive) { 6860 mutex_lock(&hive->hive_lock); 6861 reset_context.hive = hive; 6862 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 6863 tmp_adev->pcie_reset_ctx.in_link_reset = true; 6864 list_add_tail(&tmp_adev->reset_list, &device_list); 6865 } 6866 } else { 6867 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 6868 list_add_tail(&adev->reset_list, &device_list); 6869 } 6870 6871 r = amdgpu_device_asic_reset(adev, &device_list, &reset_context); 6872 out: 6873 if (!r) { 6874 if (amdgpu_device_cache_pci_state(adev->pdev)) 6875 pci_restore_state(adev->pdev); 6876 dev_info(adev->dev, "PCIe error recovery succeeded\n"); 6877 } else { 6878 dev_err(adev->dev, "PCIe error recovery failed, err:%d\n", r); 6879 if (hive) { 6880 list_for_each_entry(tmp_adev, &device_list, reset_list) 6881 amdgpu_device_unset_mp1_state(tmp_adev); 6882 amdgpu_device_unlock_reset_domain(adev->reset_domain); 6883 } 6884 } 6885 6886 if (hive) { 6887 mutex_unlock(&hive->hive_lock); 6888 amdgpu_put_xgmi_hive(hive); 6889 } 6890 6891 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 6892 } 6893 6894 /** 6895 * amdgpu_pci_resume() - resume normal ops after PCI reset 6896 * @pdev: pointer to PCI device 6897 * 6898 * Called when the error recovery driver tells us that its 6899 * OK to resume normal operation. 
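 *
 * Only acts when the recorded channel state is pci_channel_io_frozen;
 * restarts the schedulers and resumes the devices that were halted in
 * amdgpu_pci_error_detected().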
6900 */ 6901 void amdgpu_pci_resume(struct pci_dev *pdev) 6902 { 6903 struct drm_device *dev = pci_get_drvdata(pdev); 6904 struct amdgpu_device *adev = drm_to_adev(dev); 6905 struct list_head device_list; 6906 struct amdgpu_hive_info *hive = NULL; 6907 struct amdgpu_device *tmp_adev = NULL; 6908 6909 dev_info(adev->dev, "PCI error: resume callback!!\n"); 6910 6911 /* Only continue execution for the case of pci_channel_io_frozen */ 6912 if (adev->pci_channel_state != pci_channel_io_frozen) 6913 return; 6914 6915 INIT_LIST_HEAD(&device_list); 6916 6917 hive = amdgpu_get_xgmi_hive(adev); 6918 if (hive) { 6919 mutex_lock(&hive->hive_lock); 6920 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 6921 tmp_adev->pcie_reset_ctx.in_link_reset = false; 6922 list_add_tail(&tmp_adev->reset_list, &device_list); 6923 } 6924 } else 6925 list_add_tail(&adev->reset_list, &device_list); 6926 6927 amdgpu_device_sched_resume(&device_list, NULL, NULL); 6928 amdgpu_device_gpu_resume(adev, &device_list, false); 6929 adev->pcie_reset_ctx.occurs_dpc = false; 6930 6931 if (hive) { 6932 mutex_unlock(&hive->hive_lock); 6933 amdgpu_put_xgmi_hive(hive); 6934 } 6935 } 6936 6937 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 6938 { 6939 struct drm_device *dev = pci_get_drvdata(pdev); 6940 struct amdgpu_device *adev = drm_to_adev(dev); 6941 int r; 6942 6943 if (amdgpu_sriov_vf(adev)) 6944 return false; 6945 6946 r = pci_save_state(pdev); 6947 if (!r) { 6948 kfree(adev->pci_state); 6949 6950 adev->pci_state = pci_store_saved_state(pdev); 6951 6952 if (!adev->pci_state) { 6953 DRM_ERROR("Failed to store PCI saved state"); 6954 return false; 6955 } 6956 } else { 6957 DRM_WARN("Failed to save PCI state, err:%d\n", r); 6958 return false; 6959 } 6960 6961 return true; 6962 } 6963 6964 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 6965 { 6966 struct drm_device *dev = pci_get_drvdata(pdev); 6967 struct amdgpu_device *adev = drm_to_adev(dev); 6968 int r; 6969 6970 if (!adev->pci_state) 6971 return false; 6972 6973 r = pci_load_saved_state(pdev, adev->pci_state); 6974 6975 if (!r) { 6976 pci_restore_state(pdev); 6977 } else { 6978 DRM_WARN("Failed to load PCI state, err:%d\n", r); 6979 return false; 6980 } 6981 6982 return true; 6983 } 6984 6985 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 6986 struct amdgpu_ring *ring) 6987 { 6988 #ifdef CONFIG_X86_64 6989 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 6990 return; 6991 #endif 6992 if (adev->gmc.xgmi.connected_to_cpu) 6993 return; 6994 6995 if (ring && ring->funcs->emit_hdp_flush) 6996 amdgpu_ring_emit_hdp_flush(ring); 6997 else 6998 amdgpu_asic_flush_hdp(adev, ring); 6999 } 7000 7001 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 7002 struct amdgpu_ring *ring) 7003 { 7004 #ifdef CONFIG_X86_64 7005 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 7006 return; 7007 #endif 7008 if (adev->gmc.xgmi.connected_to_cpu) 7009 return; 7010 7011 amdgpu_asic_invalidate_hdp(adev, ring); 7012 } 7013 7014 int amdgpu_in_reset(struct amdgpu_device *adev) 7015 { 7016 return atomic_read(&adev->reset_domain->in_gpu_reset); 7017 } 7018 7019 /** 7020 * amdgpu_device_halt() - bring hardware to some kind of halt state 7021 * 7022 * @adev: amdgpu_device pointer 7023 * 7024 * Bring hardware to some kind of halt state so that no one can touch it 7025 * any more. It will help to maintain error context when error occurred. 7026 * Compare to a simple hang, the system will keep stable at least for SSH 7027 * access. 
Then it should be trivial to inspect the hardware state and 7028 * see what's going on. Implemented as follows: 7029 * 7030 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc.), 7031 * clears all CPU mappings to the device, disallows remappings through page faults 7032 * 2. amdgpu_irq_disable_all() disables all interrupts 7033 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences 7034 * 4. set adev->no_hw_access to avoid potential crashes after step 5 7035 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings 7036 * 6. pci_disable_device() and pci_wait_for_pending_transaction() 7037 * flush any in-flight DMA operations 7038 */ 7039 void amdgpu_device_halt(struct amdgpu_device *adev) 7040 { 7041 struct pci_dev *pdev = adev->pdev; 7042 struct drm_device *ddev = adev_to_drm(adev); 7043 7044 amdgpu_xcp_dev_unplug(adev); 7045 drm_dev_unplug(ddev); 7046 7047 amdgpu_irq_disable_all(adev); 7048 7049 amdgpu_fence_driver_hw_fini(adev); 7050 7051 adev->no_hw_access = true; 7052 7053 amdgpu_device_unmap_mmio(adev); 7054 7055 pci_disable_device(pdev); 7056 pci_wait_for_pending_transaction(pdev); 7057 } 7058 7059 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev, 7060 u32 reg) 7061 { 7062 unsigned long flags, address, data; 7063 u32 r; 7064 7065 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 7066 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 7067 7068 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 7069 WREG32(address, reg * 4); 7070 (void)RREG32(address); 7071 r = RREG32(data); 7072 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 7073 return r; 7074 } 7075 7076 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev, 7077 u32 reg, u32 v) 7078 { 7079 unsigned long flags, address, data; 7080 7081 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 7082 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 7083 7084 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 7085 WREG32(address, reg * 4); 7086 (void)RREG32(address); 7087 WREG32(data, v); 7088 (void)RREG32(data); 7089 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 7090 } 7091 7092 /** 7093 * amdgpu_device_get_gang - return a reference to the current gang 7094 * @adev: amdgpu_device pointer 7095 * 7096 * Returns: A new reference to the current gang leader. 7097 */ 7098 struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev) 7099 { 7100 struct dma_fence *fence; 7101 7102 rcu_read_lock(); 7103 fence = dma_fence_get_rcu_safe(&adev->gang_submit); 7104 rcu_read_unlock(); 7105 return fence; 7106 } 7107 7108 /** 7109 * amdgpu_device_switch_gang - switch to a new gang 7110 * @adev: amdgpu_device pointer 7111 * @gang: the gang to switch to 7112 * 7113 * Try to switch to a new gang. 7114 * Returns: NULL if we switched to the new gang or a reference to the current 7115 * gang leader. 7116 */ 7117 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev, 7118 struct dma_fence *gang) 7119 { 7120 struct dma_fence *old = NULL; 7121 7122 dma_fence_get(gang); 7123 do { 7124 dma_fence_put(old); 7125 old = amdgpu_device_get_gang(adev); 7126 if (old == gang) 7127 break; 7128 7129 if (!dma_fence_is_signaled(old)) { 7130 dma_fence_put(gang); 7131 return old; 7132 } 7133 7134 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit, 7135 old, gang) != old); 7136 7137 /* 7138 * Drop it once for the exchanged reference in adev and once for the 7139 * thread-local reference acquired in amdgpu_device_get_gang().
7140 */ 7141 dma_fence_put(old); 7142 dma_fence_put(old); 7143 return NULL; 7144 } 7145 7146 /** 7147 * amdgpu_device_enforce_isolation - enforce HW isolation 7148 * @adev: the amdgpu device pointer 7149 * @ring: the HW ring the job is supposed to run on 7150 * @job: the job which is about to be pushed to the HW ring 7151 * 7152 * Makes sure that only one client at a time can use the GFX block. 7153 * Returns: The dependency to wait on before the job can be pushed to the HW. 7154 * The function is called multiple times until NULL is returned. 7155 */ 7156 struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev, 7157 struct amdgpu_ring *ring, 7158 struct amdgpu_job *job) 7159 { 7160 struct amdgpu_isolation *isolation = &adev->isolation[ring->xcp_id]; 7161 struct drm_sched_fence *f = job->base.s_fence; 7162 struct dma_fence *dep; 7163 void *owner; 7164 int r; 7165 7166 /* 7167 * For now enforce isolation only for the GFX block since we only need 7168 * the cleaner shader on those rings. 7169 */ 7170 if (ring->funcs->type != AMDGPU_RING_TYPE_GFX && 7171 ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE) 7172 return NULL; 7173 7174 /* 7175 * All submissions where enforce isolation is false are handled as if 7176 * they come from a single client. Use ~0l as the owner to distinct it 7177 * from kernel submissions where the owner is NULL. 7178 */ 7179 owner = job->enforce_isolation ? f->owner : (void *)~0l; 7180 7181 mutex_lock(&adev->enforce_isolation_mutex); 7182 7183 /* 7184 * The "spearhead" submission is the first one which changes the 7185 * ownership to its client. We always need to wait for it to be 7186 * pushed to the HW before proceeding with anything. 7187 */ 7188 if (&f->scheduled != isolation->spearhead && 7189 !dma_fence_is_signaled(isolation->spearhead)) { 7190 dep = isolation->spearhead; 7191 goto out_grab_ref; 7192 } 7193 7194 if (isolation->owner != owner) { 7195 7196 /* 7197 * Wait for any gang to be assembled before switching to a 7198 * different owner or otherwise we could deadlock the 7199 * submissions. 7200 */ 7201 if (!job->gang_submit) { 7202 dep = amdgpu_device_get_gang(adev); 7203 if (!dma_fence_is_signaled(dep)) 7204 goto out_return_dep; 7205 dma_fence_put(dep); 7206 } 7207 7208 dma_fence_put(isolation->spearhead); 7209 isolation->spearhead = dma_fence_get(&f->scheduled); 7210 amdgpu_sync_move(&isolation->active, &isolation->prev); 7211 trace_amdgpu_isolation(isolation->owner, owner); 7212 isolation->owner = owner; 7213 } 7214 7215 /* 7216 * Specifying the ring here helps to pipeline submissions even when 7217 * isolation is enabled. If that is not desired for testing NULL can be 7218 * used instead of the ring to enforce a CPU round trip while switching 7219 * between clients. 
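 *
 * A hypothetical debugging variant (not wired up anywhere in this file)
 * would pass NULL instead of the ring:
 *
 *   dep = amdgpu_sync_peek_fence(&isolation->prev, NULL);
 *
 * forcing the CPU round trip described above on every client switch.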
7220 */ 7221 dep = amdgpu_sync_peek_fence(&isolation->prev, ring); 7222 r = amdgpu_sync_fence(&isolation->active, &f->finished, GFP_NOWAIT); 7223 if (r) 7224 DRM_WARN("OOM tracking isolation\n"); 7225 7226 out_grab_ref: 7227 dma_fence_get(dep); 7228 out_return_dep: 7229 mutex_unlock(&adev->enforce_isolation_mutex); 7230 return dep; 7231 } 7232 7233 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev) 7234 { 7235 switch (adev->asic_type) { 7236 #ifdef CONFIG_DRM_AMDGPU_SI 7237 case CHIP_HAINAN: 7238 #endif 7239 case CHIP_TOPAZ: 7240 /* chips with no display hardware */ 7241 return false; 7242 #ifdef CONFIG_DRM_AMDGPU_SI 7243 case CHIP_TAHITI: 7244 case CHIP_PITCAIRN: 7245 case CHIP_VERDE: 7246 case CHIP_OLAND: 7247 #endif 7248 #ifdef CONFIG_DRM_AMDGPU_CIK 7249 case CHIP_BONAIRE: 7250 case CHIP_HAWAII: 7251 case CHIP_KAVERI: 7252 case CHIP_KABINI: 7253 case CHIP_MULLINS: 7254 #endif 7255 case CHIP_TONGA: 7256 case CHIP_FIJI: 7257 case CHIP_POLARIS10: 7258 case CHIP_POLARIS11: 7259 case CHIP_POLARIS12: 7260 case CHIP_VEGAM: 7261 case CHIP_CARRIZO: 7262 case CHIP_STONEY: 7263 /* chips with display hardware */ 7264 return true; 7265 default: 7266 /* IP discovery */ 7267 if (!amdgpu_ip_version(adev, DCE_HWIP, 0) || 7268 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 7269 return false; 7270 return true; 7271 } 7272 } 7273 7274 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev, 7275 uint32_t inst, uint32_t reg_addr, char reg_name[], 7276 uint32_t expected_value, uint32_t mask) 7277 { 7278 uint32_t ret = 0; 7279 uint32_t old_ = 0; 7280 uint32_t tmp_ = RREG32(reg_addr); 7281 uint32_t loop = adev->usec_timeout; 7282 7283 while ((tmp_ & (mask)) != (expected_value)) { 7284 if (old_ != tmp_) { 7285 loop = adev->usec_timeout; 7286 old_ = tmp_; 7287 } else 7288 udelay(1); 7289 tmp_ = RREG32(reg_addr); 7290 loop--; 7291 if (!loop) { 7292 DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n", 7293 inst, reg_name, (uint32_t)expected_value, 7294 (uint32_t)(tmp_ & (mask))); 7295 ret = -ETIMEDOUT; 7296 break; 7297 } 7298 } 7299 return ret; 7300 } 7301 7302 ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring) 7303 { 7304 ssize_t size = 0; 7305 7306 if (!ring || !ring->adev) 7307 return size; 7308 7309 if (amdgpu_device_should_recover_gpu(ring->adev)) 7310 size |= AMDGPU_RESET_TYPE_FULL; 7311 7312 if (unlikely(!ring->adev->debug_disable_soft_recovery) && 7313 !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery) 7314 size |= AMDGPU_RESET_TYPE_SOFT_RESET; 7315 7316 return size; 7317 } 7318 7319 ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset) 7320 { 7321 ssize_t size = 0; 7322 7323 if (supported_reset == 0) { 7324 size += sysfs_emit_at(buf, size, "unsupported"); 7325 size += sysfs_emit_at(buf, size, "\n"); 7326 return size; 7327 7328 } 7329 7330 if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET) 7331 size += sysfs_emit_at(buf, size, "soft "); 7332 7333 if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE) 7334 size += sysfs_emit_at(buf, size, "queue "); 7335 7336 if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE) 7337 size += sysfs_emit_at(buf, size, "pipe "); 7338 7339 if (supported_reset & AMDGPU_RESET_TYPE_FULL) 7340 size += sysfs_emit_at(buf, size, "full "); 7341 7342 size += sysfs_emit_at(buf, size, "\n"); 7343 return size; 7344 } 7345
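/*
 * Example sysfs output produced by amdgpu_show_reset_mask() above for a mask
 * with soft and full reset supported (illustrative only):
 *
 *   soft full
 */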