1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 29 #include <linux/aperture.h> 30 #include <linux/power_supply.h> 31 #include <linux/kthread.h> 32 #include <linux/module.h> 33 #include <linux/console.h> 34 #include <linux/slab.h> 35 #include <linux/iommu.h> 36 #include <linux/pci.h> 37 #include <linux/pci-p2pdma.h> 38 #include <linux/apple-gmux.h> 39 40 #include <drm/drm_atomic_helper.h> 41 #include <drm/drm_client_event.h> 42 #include <drm/drm_crtc_helper.h> 43 #include <drm/drm_probe_helper.h> 44 #include <drm/amdgpu_drm.h> 45 #include <linux/device.h> 46 #include <linux/vgaarb.h> 47 #include <linux/vga_switcheroo.h> 48 #include <linux/efi.h> 49 #include "amdgpu.h" 50 #include "amdgpu_trace.h" 51 #include "amdgpu_i2c.h" 52 #include "atom.h" 53 #include "amdgpu_atombios.h" 54 #include "amdgpu_atomfirmware.h" 55 #include "amd_pcie.h" 56 #ifdef CONFIG_DRM_AMDGPU_SI 57 #include "si.h" 58 #endif 59 #ifdef CONFIG_DRM_AMDGPU_CIK 60 #include "cik.h" 61 #endif 62 #include "vi.h" 63 #include "soc15.h" 64 #include "nv.h" 65 #include "bif/bif_4_1_d.h" 66 #include <linux/firmware.h> 67 #include "amdgpu_vf_error.h" 68 69 #include "amdgpu_amdkfd.h" 70 #include "amdgpu_pm.h" 71 72 #include "amdgpu_xgmi.h" 73 #include "amdgpu_ras.h" 74 #include "amdgpu_pmu.h" 75 #include "amdgpu_fru_eeprom.h" 76 #include "amdgpu_reset.h" 77 #include "amdgpu_virt.h" 78 #include "amdgpu_dev_coredump.h" 79 80 #include <linux/suspend.h> 81 #include <drm/task_barrier.h> 82 #include <linux/pm_runtime.h> 83 84 #include <drm/drm_drv.h> 85 86 #if IS_ENABLED(CONFIG_X86) 87 #include <asm/intel-family.h> 88 #endif 89 90 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 91 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 92 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 93 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 94 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 95 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 96 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 97 98 #define AMDGPU_RESUME_MS 2000 99 #define AMDGPU_MAX_RETRY_LIMIT 2 100 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 101 #define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2) 102 #define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2) 103 #define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2) 104 105 static const struct drm_driver amdgpu_kms_driver; 106 107 const char *amdgpu_asic_name[] = { 108 "TAHITI", 109 "PITCAIRN", 110 "VERDE", 111 "OLAND", 112 "HAINAN", 113 "BONAIRE", 114 "KAVERI", 115 "KABINI", 116 "HAWAII", 117 "MULLINS", 118 "TOPAZ", 119 "TONGA", 120 "FIJI", 121 "CARRIZO", 122 "STONEY", 123 "POLARIS10", 124 "POLARIS11", 125 "POLARIS12", 126 "VEGAM", 127 "VEGA10", 128 "VEGA12", 129 "VEGA20", 130 "RAVEN", 131 "ARCTURUS", 132 "RENOIR", 133 "ALDEBARAN", 134 "NAVI10", 135 "CYAN_SKILLFISH", 136 "NAVI14", 137 "NAVI12", 138 "SIENNA_CICHLID", 139 "NAVY_FLOUNDER", 140 "VANGOGH", 141 "DIMGREY_CAVEFISH", 142 "BEIGE_GOBY", 143 "YELLOW_CARP", 144 "IP DISCOVERY", 145 "LAST", 146 }; 147 148 #define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMDGPU_MAX_IP_NUM - 1, 0) 149 /* 150 * Default init level where all blocks are expected to be initialized. This is 151 * the level of initialization expected by default and also after a full reset 152 * of the device. 153 */ 154 struct amdgpu_init_level amdgpu_init_default = { 155 .level = AMDGPU_INIT_LEVEL_DEFAULT, 156 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL, 157 }; 158 159 /* 160 * Minimal blocks needed to be initialized before a XGMI hive can be reset. This 161 * is used for cases like reset on initialization where the entire hive needs to 162 * be reset before first use. 163 */ 164 struct amdgpu_init_level amdgpu_init_minimal_xgmi = { 165 .level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI, 166 .hwini_ip_block_mask = 167 BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) | 168 BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) | 169 BIT(AMD_IP_BLOCK_TYPE_PSP) 170 }; 171 172 static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev, 173 enum amd_ip_block_type block) 174 { 175 return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0; 176 } 177 178 void amdgpu_set_init_level(struct amdgpu_device *adev, 179 enum amdgpu_init_lvl_id lvl) 180 { 181 switch (lvl) { 182 case AMDGPU_INIT_LEVEL_MINIMAL_XGMI: 183 adev->init_lvl = &amdgpu_init_minimal_xgmi; 184 break; 185 case AMDGPU_INIT_LEVEL_DEFAULT: 186 fallthrough; 187 default: 188 adev->init_lvl = &amdgpu_init_default; 189 break; 190 } 191 } 192 193 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev); 194 195 /** 196 * DOC: pcie_replay_count 197 * 198 * The amdgpu driver provides a sysfs API for reporting the total number 199 * of PCIe replays (NAKs) 200 * The file pcie_replay_count is used for this and returns the total 201 * number of replays as a sum of the NAKs generated and NAKs received 202 */ 203 204 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 205 struct device_attribute *attr, char *buf) 206 { 207 struct drm_device *ddev = dev_get_drvdata(dev); 208 struct amdgpu_device *adev = drm_to_adev(ddev); 209 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 210 211 return sysfs_emit(buf, "%llu\n", cnt); 212 } 213 214 static DEVICE_ATTR(pcie_replay_count, 0444, 215 amdgpu_device_get_pcie_replay_count, NULL); 216 217 static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj, 218 struct bin_attribute *attr, char *buf, 219 loff_t ppos, size_t count) 220 { 221 struct device *dev = kobj_to_dev(kobj); 222 struct drm_device *ddev = dev_get_drvdata(dev); 223 struct amdgpu_device *adev = drm_to_adev(ddev); 224 ssize_t bytes_read; 225 226 switch (ppos) { 227 case AMDGPU_SYS_REG_STATE_XGMI: 228 bytes_read = amdgpu_asic_get_reg_state( 229 adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count); 230 break; 231 case AMDGPU_SYS_REG_STATE_WAFL: 232 bytes_read = amdgpu_asic_get_reg_state( 233 adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count); 234 break; 235 case AMDGPU_SYS_REG_STATE_PCIE: 236 bytes_read = amdgpu_asic_get_reg_state( 237 adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count); 238 break; 239 case AMDGPU_SYS_REG_STATE_USR: 240 bytes_read = amdgpu_asic_get_reg_state( 241 adev, AMDGPU_REG_STATE_TYPE_USR, buf, count); 242 break; 243 case AMDGPU_SYS_REG_STATE_USR_1: 244 bytes_read = amdgpu_asic_get_reg_state( 245 adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count); 246 break; 247 default: 248 return -EINVAL; 249 } 250 251 return bytes_read; 252 } 253 254 BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL, 255 AMDGPU_SYS_REG_STATE_END); 256 257 int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev) 258 { 259 int ret; 260 261 if (!amdgpu_asic_get_reg_state_supported(adev)) 262 return 0; 263 264 ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 265 266 return ret; 267 } 268 269 void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev) 270 { 271 if (!amdgpu_asic_get_reg_state_supported(adev)) 272 return; 273 sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 274 } 275 276 int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block) 277 { 278 int r; 279 280 if (ip_block->version->funcs->suspend) { 281 r = ip_block->version->funcs->suspend(ip_block); 282 if (r) { 283 dev_err(ip_block->adev->dev, 284 "suspend of IP block <%s> failed %d\n", 285 ip_block->version->funcs->name, r); 286 return r; 287 } 288 } 289 290 ip_block->status.hw = false; 291 return 0; 292 } 293 294 int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block) 295 { 296 int r; 297 298 if (ip_block->version->funcs->resume) { 299 r = ip_block->version->funcs->resume(ip_block); 300 if (r) { 301 dev_err(ip_block->adev->dev, 302 "resume of IP block <%s> failed %d\n", 303 ip_block->version->funcs->name, r); 304 return r; 305 } 306 } 307 308 ip_block->status.hw = true; 309 return 0; 310 } 311 312 /** 313 * DOC: board_info 314 * 315 * The amdgpu driver provides a sysfs API for giving board related information. 316 * It provides the form factor information in the format 317 * 318 * type : form factor 319 * 320 * Possible form factor values 321 * 322 * - "cem" - PCIE CEM card 323 * - "oam" - Open Compute Accelerator Module 324 * - "unknown" - Not known 325 * 326 */ 327 328 static ssize_t amdgpu_device_get_board_info(struct device *dev, 329 struct device_attribute *attr, 330 char *buf) 331 { 332 struct drm_device *ddev = dev_get_drvdata(dev); 333 struct amdgpu_device *adev = drm_to_adev(ddev); 334 enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM; 335 const char *pkg; 336 337 if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type) 338 pkg_type = adev->smuio.funcs->get_pkg_type(adev); 339 340 switch (pkg_type) { 341 case AMDGPU_PKG_TYPE_CEM: 342 pkg = "cem"; 343 break; 344 case AMDGPU_PKG_TYPE_OAM: 345 pkg = "oam"; 346 break; 347 default: 348 pkg = "unknown"; 349 break; 350 } 351 352 return sysfs_emit(buf, "%s : %s\n", "type", pkg); 353 } 354 355 static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL); 356 357 static struct attribute *amdgpu_board_attrs[] = { 358 &dev_attr_board_info.attr, 359 NULL, 360 }; 361 362 static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj, 363 struct attribute *attr, int n) 364 { 365 struct device *dev = kobj_to_dev(kobj); 366 struct drm_device *ddev = dev_get_drvdata(dev); 367 struct amdgpu_device *adev = drm_to_adev(ddev); 368 369 if (adev->flags & AMD_IS_APU) 370 return 0; 371 372 return attr->mode; 373 } 374 375 static const struct attribute_group amdgpu_board_attrs_group = { 376 .attrs = amdgpu_board_attrs, 377 .is_visible = amdgpu_board_attrs_is_visible 378 }; 379 380 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 381 382 383 /** 384 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 385 * 386 * @dev: drm_device pointer 387 * 388 * Returns true if the device is a dGPU with ATPX power control, 389 * otherwise return false. 390 */ 391 bool amdgpu_device_supports_px(struct drm_device *dev) 392 { 393 struct amdgpu_device *adev = drm_to_adev(dev); 394 395 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 396 return true; 397 return false; 398 } 399 400 /** 401 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 402 * 403 * @dev: drm_device pointer 404 * 405 * Returns true if the device is a dGPU with ACPI power control, 406 * otherwise return false. 407 */ 408 bool amdgpu_device_supports_boco(struct drm_device *dev) 409 { 410 struct amdgpu_device *adev = drm_to_adev(dev); 411 412 if (adev->has_pr3 || 413 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 414 return true; 415 return false; 416 } 417 418 /** 419 * amdgpu_device_supports_baco - Does the device support BACO 420 * 421 * @dev: drm_device pointer 422 * 423 * Return: 424 * 1 if the device supporte BACO; 425 * 3 if the device support MACO (only works if BACO is supported) 426 * otherwise return 0. 427 */ 428 int amdgpu_device_supports_baco(struct drm_device *dev) 429 { 430 struct amdgpu_device *adev = drm_to_adev(dev); 431 432 return amdgpu_asic_supports_baco(adev); 433 } 434 435 void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev) 436 { 437 struct drm_device *dev; 438 int bamaco_support; 439 440 dev = adev_to_drm(adev); 441 442 adev->pm.rpm_mode = AMDGPU_RUNPM_NONE; 443 bamaco_support = amdgpu_device_supports_baco(dev); 444 445 switch (amdgpu_runtime_pm) { 446 case 2: 447 if (bamaco_support & MACO_SUPPORT) { 448 adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO; 449 dev_info(adev->dev, "Forcing BAMACO for runtime pm\n"); 450 } else if (bamaco_support == BACO_SUPPORT) { 451 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO; 452 dev_info(adev->dev, "Requested mode BAMACO not available,fallback to use BACO\n"); 453 } 454 break; 455 case 1: 456 if (bamaco_support & BACO_SUPPORT) { 457 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO; 458 dev_info(adev->dev, "Forcing BACO for runtime pm\n"); 459 } 460 break; 461 case -1: 462 case -2: 463 if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */ 464 adev->pm.rpm_mode = AMDGPU_RUNPM_PX; 465 dev_info(adev->dev, "Using ATPX for runtime pm\n"); 466 } else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */ 467 adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO; 468 dev_info(adev->dev, "Using BOCO for runtime pm\n"); 469 } else { 470 if (!bamaco_support) 471 goto no_runtime_pm; 472 473 switch (adev->asic_type) { 474 case CHIP_VEGA20: 475 case CHIP_ARCTURUS: 476 /* BACO are not supported on vega20 and arctrus */ 477 break; 478 case CHIP_VEGA10: 479 /* enable BACO as runpm mode if noretry=0 */ 480 if (!adev->gmc.noretry) 481 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO; 482 break; 483 default: 484 /* enable BACO as runpm mode on CI+ */ 485 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO; 486 break; 487 } 488 489 if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) { 490 if (bamaco_support & MACO_SUPPORT) { 491 adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO; 492 dev_info(adev->dev, "Using BAMACO for runtime pm\n"); 493 } else { 494 dev_info(adev->dev, "Using BACO for runtime pm\n"); 495 } 496 } 497 } 498 break; 499 case 0: 500 dev_info(adev->dev, "runtime pm is manually disabled\n"); 501 break; 502 default: 503 break; 504 } 505 506 no_runtime_pm: 507 if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE) 508 dev_info(adev->dev, "Runtime PM not available\n"); 509 } 510 /** 511 * amdgpu_device_supports_smart_shift - Is the device dGPU with 512 * smart shift support 513 * 514 * @dev: drm_device pointer 515 * 516 * Returns true if the device is a dGPU with Smart Shift support, 517 * otherwise returns false. 518 */ 519 bool amdgpu_device_supports_smart_shift(struct drm_device *dev) 520 { 521 return (amdgpu_device_supports_boco(dev) && 522 amdgpu_acpi_is_power_shift_control_supported()); 523 } 524 525 /* 526 * VRAM access helper functions 527 */ 528 529 /** 530 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA 531 * 532 * @adev: amdgpu_device pointer 533 * @pos: offset of the buffer in vram 534 * @buf: virtual address of the buffer in system memory 535 * @size: read/write size, sizeof(@buf) must > @size 536 * @write: true - write to vram, otherwise - read from vram 537 */ 538 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos, 539 void *buf, size_t size, bool write) 540 { 541 unsigned long flags; 542 uint32_t hi = ~0, tmp = 0; 543 uint32_t *data = buf; 544 uint64_t last; 545 int idx; 546 547 if (!drm_dev_enter(adev_to_drm(adev), &idx)) 548 return; 549 550 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4)); 551 552 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 553 for (last = pos + size; pos < last; pos += 4) { 554 tmp = pos >> 31; 555 556 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000); 557 if (tmp != hi) { 558 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp); 559 hi = tmp; 560 } 561 if (write) 562 WREG32_NO_KIQ(mmMM_DATA, *data++); 563 else 564 *data++ = RREG32_NO_KIQ(mmMM_DATA); 565 } 566 567 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 568 drm_dev_exit(idx); 569 } 570 571 /** 572 * amdgpu_device_aper_access - access vram by vram aperature 573 * 574 * @adev: amdgpu_device pointer 575 * @pos: offset of the buffer in vram 576 * @buf: virtual address of the buffer in system memory 577 * @size: read/write size, sizeof(@buf) must > @size 578 * @write: true - write to vram, otherwise - read from vram 579 * 580 * The return value means how many bytes have been transferred. 581 */ 582 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos, 583 void *buf, size_t size, bool write) 584 { 585 #ifdef CONFIG_64BIT 586 void __iomem *addr; 587 size_t count = 0; 588 uint64_t last; 589 590 if (!adev->mman.aper_base_kaddr) 591 return 0; 592 593 last = min(pos + size, adev->gmc.visible_vram_size); 594 if (last > pos) { 595 addr = adev->mman.aper_base_kaddr + pos; 596 count = last - pos; 597 598 if (write) { 599 memcpy_toio(addr, buf, count); 600 /* Make sure HDP write cache flush happens without any reordering 601 * after the system memory contents are sent over PCIe device 602 */ 603 mb(); 604 amdgpu_device_flush_hdp(adev, NULL); 605 } else { 606 amdgpu_device_invalidate_hdp(adev, NULL); 607 /* Make sure HDP read cache is invalidated before issuing a read 608 * to the PCIe device 609 */ 610 mb(); 611 memcpy_fromio(buf, addr, count); 612 } 613 614 } 615 616 return count; 617 #else 618 return 0; 619 #endif 620 } 621 622 /** 623 * amdgpu_device_vram_access - read/write a buffer in vram 624 * 625 * @adev: amdgpu_device pointer 626 * @pos: offset of the buffer in vram 627 * @buf: virtual address of the buffer in system memory 628 * @size: read/write size, sizeof(@buf) must > @size 629 * @write: true - write to vram, otherwise - read from vram 630 */ 631 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos, 632 void *buf, size_t size, bool write) 633 { 634 size_t count; 635 636 /* try to using vram apreature to access vram first */ 637 count = amdgpu_device_aper_access(adev, pos, buf, size, write); 638 size -= count; 639 if (size) { 640 /* using MM to access rest vram */ 641 pos += count; 642 buf += count; 643 amdgpu_device_mm_access(adev, pos, buf, size, write); 644 } 645 } 646 647 /* 648 * register access helper functions. 649 */ 650 651 /* Check if hw access should be skipped because of hotplug or device error */ 652 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev) 653 { 654 if (adev->no_hw_access) 655 return true; 656 657 #ifdef CONFIG_LOCKDEP 658 /* 659 * This is a bit complicated to understand, so worth a comment. What we assert 660 * here is that the GPU reset is not running on another thread in parallel. 661 * 662 * For this we trylock the read side of the reset semaphore, if that succeeds 663 * we know that the reset is not running in paralell. 664 * 665 * If the trylock fails we assert that we are either already holding the read 666 * side of the lock or are the reset thread itself and hold the write side of 667 * the lock. 668 */ 669 if (in_task()) { 670 if (down_read_trylock(&adev->reset_domain->sem)) 671 up_read(&adev->reset_domain->sem); 672 else 673 lockdep_assert_held(&adev->reset_domain->sem); 674 } 675 #endif 676 return false; 677 } 678 679 /** 680 * amdgpu_device_rreg - read a memory mapped IO or indirect register 681 * 682 * @adev: amdgpu_device pointer 683 * @reg: dword aligned register offset 684 * @acc_flags: access flags which require special behavior 685 * 686 * Returns the 32 bit value from the offset specified. 687 */ 688 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, 689 uint32_t reg, uint32_t acc_flags) 690 { 691 uint32_t ret; 692 693 if (amdgpu_device_skip_hw_access(adev)) 694 return 0; 695 696 if ((reg * 4) < adev->rmmio_size) { 697 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 698 amdgpu_sriov_runtime(adev) && 699 down_read_trylock(&adev->reset_domain->sem)) { 700 ret = amdgpu_kiq_rreg(adev, reg, 0); 701 up_read(&adev->reset_domain->sem); 702 } else { 703 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); 704 } 705 } else { 706 ret = adev->pcie_rreg(adev, reg * 4); 707 } 708 709 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret); 710 711 return ret; 712 } 713 714 /* 715 * MMIO register read with bytes helper functions 716 * @offset:bytes offset from MMIO start 717 */ 718 719 /** 720 * amdgpu_mm_rreg8 - read a memory mapped IO register 721 * 722 * @adev: amdgpu_device pointer 723 * @offset: byte aligned register offset 724 * 725 * Returns the 8 bit value from the offset specified. 726 */ 727 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) 728 { 729 if (amdgpu_device_skip_hw_access(adev)) 730 return 0; 731 732 if (offset < adev->rmmio_size) 733 return (readb(adev->rmmio + offset)); 734 BUG(); 735 } 736 737 738 /** 739 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC 740 * 741 * @adev: amdgpu_device pointer 742 * @reg: dword aligned register offset 743 * @acc_flags: access flags which require special behavior 744 * @xcc_id: xcc accelerated compute core id 745 * 746 * Returns the 32 bit value from the offset specified. 747 */ 748 uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev, 749 uint32_t reg, uint32_t acc_flags, 750 uint32_t xcc_id) 751 { 752 uint32_t ret, rlcg_flag; 753 754 if (amdgpu_device_skip_hw_access(adev)) 755 return 0; 756 757 if ((reg * 4) < adev->rmmio_size) { 758 if (amdgpu_sriov_vf(adev) && 759 !amdgpu_sriov_runtime(adev) && 760 adev->gfx.rlc.rlcg_reg_access_supported && 761 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, 762 GC_HWIP, false, 763 &rlcg_flag)) { 764 ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id)); 765 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 766 amdgpu_sriov_runtime(adev) && 767 down_read_trylock(&adev->reset_domain->sem)) { 768 ret = amdgpu_kiq_rreg(adev, reg, xcc_id); 769 up_read(&adev->reset_domain->sem); 770 } else { 771 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); 772 } 773 } else { 774 ret = adev->pcie_rreg(adev, reg * 4); 775 } 776 777 return ret; 778 } 779 780 /* 781 * MMIO register write with bytes helper functions 782 * @offset:bytes offset from MMIO start 783 * @value: the value want to be written to the register 784 */ 785 786 /** 787 * amdgpu_mm_wreg8 - read a memory mapped IO register 788 * 789 * @adev: amdgpu_device pointer 790 * @offset: byte aligned register offset 791 * @value: 8 bit value to write 792 * 793 * Writes the value specified to the offset specified. 794 */ 795 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) 796 { 797 if (amdgpu_device_skip_hw_access(adev)) 798 return; 799 800 if (offset < adev->rmmio_size) 801 writeb(value, adev->rmmio + offset); 802 else 803 BUG(); 804 } 805 806 /** 807 * amdgpu_device_wreg - write to a memory mapped IO or indirect register 808 * 809 * @adev: amdgpu_device pointer 810 * @reg: dword aligned register offset 811 * @v: 32 bit value to write to the register 812 * @acc_flags: access flags which require special behavior 813 * 814 * Writes the value specified to the offset specified. 815 */ 816 void amdgpu_device_wreg(struct amdgpu_device *adev, 817 uint32_t reg, uint32_t v, 818 uint32_t acc_flags) 819 { 820 if (amdgpu_device_skip_hw_access(adev)) 821 return; 822 823 if ((reg * 4) < adev->rmmio_size) { 824 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 825 amdgpu_sriov_runtime(adev) && 826 down_read_trylock(&adev->reset_domain->sem)) { 827 amdgpu_kiq_wreg(adev, reg, v, 0); 828 up_read(&adev->reset_domain->sem); 829 } else { 830 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 831 } 832 } else { 833 adev->pcie_wreg(adev, reg * 4, v); 834 } 835 836 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 837 } 838 839 /** 840 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 841 * 842 * @adev: amdgpu_device pointer 843 * @reg: mmio/rlc register 844 * @v: value to write 845 * @xcc_id: xcc accelerated compute core id 846 * 847 * this function is invoked only for the debugfs register access 848 */ 849 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 850 uint32_t reg, uint32_t v, 851 uint32_t xcc_id) 852 { 853 if (amdgpu_device_skip_hw_access(adev)) 854 return; 855 856 if (amdgpu_sriov_fullaccess(adev) && 857 adev->gfx.rlc.funcs && 858 adev->gfx.rlc.funcs->is_rlcg_access_range) { 859 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 860 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id); 861 } else if ((reg * 4) >= adev->rmmio_size) { 862 adev->pcie_wreg(adev, reg * 4, v); 863 } else { 864 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 865 } 866 } 867 868 /** 869 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC 870 * 871 * @adev: amdgpu_device pointer 872 * @reg: dword aligned register offset 873 * @v: 32 bit value to write to the register 874 * @acc_flags: access flags which require special behavior 875 * @xcc_id: xcc accelerated compute core id 876 * 877 * Writes the value specified to the offset specified. 878 */ 879 void amdgpu_device_xcc_wreg(struct amdgpu_device *adev, 880 uint32_t reg, uint32_t v, 881 uint32_t acc_flags, uint32_t xcc_id) 882 { 883 uint32_t rlcg_flag; 884 885 if (amdgpu_device_skip_hw_access(adev)) 886 return; 887 888 if ((reg * 4) < adev->rmmio_size) { 889 if (amdgpu_sriov_vf(adev) && 890 !amdgpu_sriov_runtime(adev) && 891 adev->gfx.rlc.rlcg_reg_access_supported && 892 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, 893 GC_HWIP, true, 894 &rlcg_flag)) { 895 amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id)); 896 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 897 amdgpu_sriov_runtime(adev) && 898 down_read_trylock(&adev->reset_domain->sem)) { 899 amdgpu_kiq_wreg(adev, reg, v, xcc_id); 900 up_read(&adev->reset_domain->sem); 901 } else { 902 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 903 } 904 } else { 905 adev->pcie_wreg(adev, reg * 4, v); 906 } 907 } 908 909 /** 910 * amdgpu_device_indirect_rreg - read an indirect register 911 * 912 * @adev: amdgpu_device pointer 913 * @reg_addr: indirect register address to read from 914 * 915 * Returns the value of indirect register @reg_addr 916 */ 917 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 918 u32 reg_addr) 919 { 920 unsigned long flags, pcie_index, pcie_data; 921 void __iomem *pcie_index_offset; 922 void __iomem *pcie_data_offset; 923 u32 r; 924 925 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 926 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 927 928 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 929 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 930 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 931 932 writel(reg_addr, pcie_index_offset); 933 readl(pcie_index_offset); 934 r = readl(pcie_data_offset); 935 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 936 937 return r; 938 } 939 940 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev, 941 u64 reg_addr) 942 { 943 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 944 u32 r; 945 void __iomem *pcie_index_offset; 946 void __iomem *pcie_index_hi_offset; 947 void __iomem *pcie_data_offset; 948 949 if (unlikely(!adev->nbio.funcs)) { 950 pcie_index = AMDGPU_PCIE_INDEX_FALLBACK; 951 pcie_data = AMDGPU_PCIE_DATA_FALLBACK; 952 } else { 953 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 954 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 955 } 956 957 if (reg_addr >> 32) { 958 if (unlikely(!adev->nbio.funcs)) 959 pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK; 960 else 961 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 962 } else { 963 pcie_index_hi = 0; 964 } 965 966 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 967 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 968 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 969 if (pcie_index_hi != 0) 970 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 971 pcie_index_hi * 4; 972 973 writel(reg_addr, pcie_index_offset); 974 readl(pcie_index_offset); 975 if (pcie_index_hi != 0) { 976 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 977 readl(pcie_index_hi_offset); 978 } 979 r = readl(pcie_data_offset); 980 981 /* clear the high bits */ 982 if (pcie_index_hi != 0) { 983 writel(0, pcie_index_hi_offset); 984 readl(pcie_index_hi_offset); 985 } 986 987 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 988 989 return r; 990 } 991 992 /** 993 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 994 * 995 * @adev: amdgpu_device pointer 996 * @reg_addr: indirect register address to read from 997 * 998 * Returns the value of indirect register @reg_addr 999 */ 1000 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 1001 u32 reg_addr) 1002 { 1003 unsigned long flags, pcie_index, pcie_data; 1004 void __iomem *pcie_index_offset; 1005 void __iomem *pcie_data_offset; 1006 u64 r; 1007 1008 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1009 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1010 1011 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1012 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1013 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1014 1015 /* read low 32 bits */ 1016 writel(reg_addr, pcie_index_offset); 1017 readl(pcie_index_offset); 1018 r = readl(pcie_data_offset); 1019 /* read high 32 bits */ 1020 writel(reg_addr + 4, pcie_index_offset); 1021 readl(pcie_index_offset); 1022 r |= ((u64)readl(pcie_data_offset) << 32); 1023 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1024 1025 return r; 1026 } 1027 1028 u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev, 1029 u64 reg_addr) 1030 { 1031 unsigned long flags, pcie_index, pcie_data; 1032 unsigned long pcie_index_hi = 0; 1033 void __iomem *pcie_index_offset; 1034 void __iomem *pcie_index_hi_offset; 1035 void __iomem *pcie_data_offset; 1036 u64 r; 1037 1038 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1039 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1040 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1041 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1042 1043 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1044 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1045 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1046 if (pcie_index_hi != 0) 1047 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1048 pcie_index_hi * 4; 1049 1050 /* read low 32 bits */ 1051 writel(reg_addr, pcie_index_offset); 1052 readl(pcie_index_offset); 1053 if (pcie_index_hi != 0) { 1054 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1055 readl(pcie_index_hi_offset); 1056 } 1057 r = readl(pcie_data_offset); 1058 /* read high 32 bits */ 1059 writel(reg_addr + 4, pcie_index_offset); 1060 readl(pcie_index_offset); 1061 if (pcie_index_hi != 0) { 1062 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1063 readl(pcie_index_hi_offset); 1064 } 1065 r |= ((u64)readl(pcie_data_offset) << 32); 1066 1067 /* clear the high bits */ 1068 if (pcie_index_hi != 0) { 1069 writel(0, pcie_index_hi_offset); 1070 readl(pcie_index_hi_offset); 1071 } 1072 1073 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1074 1075 return r; 1076 } 1077 1078 /** 1079 * amdgpu_device_indirect_wreg - write an indirect register address 1080 * 1081 * @adev: amdgpu_device pointer 1082 * @reg_addr: indirect register offset 1083 * @reg_data: indirect register data 1084 * 1085 */ 1086 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 1087 u32 reg_addr, u32 reg_data) 1088 { 1089 unsigned long flags, pcie_index, pcie_data; 1090 void __iomem *pcie_index_offset; 1091 void __iomem *pcie_data_offset; 1092 1093 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1094 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1095 1096 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1097 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1098 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1099 1100 writel(reg_addr, pcie_index_offset); 1101 readl(pcie_index_offset); 1102 writel(reg_data, pcie_data_offset); 1103 readl(pcie_data_offset); 1104 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1105 } 1106 1107 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev, 1108 u64 reg_addr, u32 reg_data) 1109 { 1110 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 1111 void __iomem *pcie_index_offset; 1112 void __iomem *pcie_index_hi_offset; 1113 void __iomem *pcie_data_offset; 1114 1115 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1116 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1117 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1118 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1119 else 1120 pcie_index_hi = 0; 1121 1122 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1123 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1124 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1125 if (pcie_index_hi != 0) 1126 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1127 pcie_index_hi * 4; 1128 1129 writel(reg_addr, pcie_index_offset); 1130 readl(pcie_index_offset); 1131 if (pcie_index_hi != 0) { 1132 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1133 readl(pcie_index_hi_offset); 1134 } 1135 writel(reg_data, pcie_data_offset); 1136 readl(pcie_data_offset); 1137 1138 /* clear the high bits */ 1139 if (pcie_index_hi != 0) { 1140 writel(0, pcie_index_hi_offset); 1141 readl(pcie_index_hi_offset); 1142 } 1143 1144 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1145 } 1146 1147 /** 1148 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 1149 * 1150 * @adev: amdgpu_device pointer 1151 * @reg_addr: indirect register offset 1152 * @reg_data: indirect register data 1153 * 1154 */ 1155 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 1156 u32 reg_addr, u64 reg_data) 1157 { 1158 unsigned long flags, pcie_index, pcie_data; 1159 void __iomem *pcie_index_offset; 1160 void __iomem *pcie_data_offset; 1161 1162 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1163 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1164 1165 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1166 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1167 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1168 1169 /* write low 32 bits */ 1170 writel(reg_addr, pcie_index_offset); 1171 readl(pcie_index_offset); 1172 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1173 readl(pcie_data_offset); 1174 /* write high 32 bits */ 1175 writel(reg_addr + 4, pcie_index_offset); 1176 readl(pcie_index_offset); 1177 writel((u32)(reg_data >> 32), pcie_data_offset); 1178 readl(pcie_data_offset); 1179 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1180 } 1181 1182 void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev, 1183 u64 reg_addr, u64 reg_data) 1184 { 1185 unsigned long flags, pcie_index, pcie_data; 1186 unsigned long pcie_index_hi = 0; 1187 void __iomem *pcie_index_offset; 1188 void __iomem *pcie_index_hi_offset; 1189 void __iomem *pcie_data_offset; 1190 1191 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1192 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1193 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1194 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1195 1196 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1197 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1198 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1199 if (pcie_index_hi != 0) 1200 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1201 pcie_index_hi * 4; 1202 1203 /* write low 32 bits */ 1204 writel(reg_addr, pcie_index_offset); 1205 readl(pcie_index_offset); 1206 if (pcie_index_hi != 0) { 1207 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1208 readl(pcie_index_hi_offset); 1209 } 1210 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1211 readl(pcie_data_offset); 1212 /* write high 32 bits */ 1213 writel(reg_addr + 4, pcie_index_offset); 1214 readl(pcie_index_offset); 1215 if (pcie_index_hi != 0) { 1216 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1217 readl(pcie_index_hi_offset); 1218 } 1219 writel((u32)(reg_data >> 32), pcie_data_offset); 1220 readl(pcie_data_offset); 1221 1222 /* clear the high bits */ 1223 if (pcie_index_hi != 0) { 1224 writel(0, pcie_index_hi_offset); 1225 readl(pcie_index_hi_offset); 1226 } 1227 1228 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1229 } 1230 1231 /** 1232 * amdgpu_device_get_rev_id - query device rev_id 1233 * 1234 * @adev: amdgpu_device pointer 1235 * 1236 * Return device rev_id 1237 */ 1238 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev) 1239 { 1240 return adev->nbio.funcs->get_rev_id(adev); 1241 } 1242 1243 /** 1244 * amdgpu_invalid_rreg - dummy reg read function 1245 * 1246 * @adev: amdgpu_device pointer 1247 * @reg: offset of register 1248 * 1249 * Dummy register read function. Used for register blocks 1250 * that certain asics don't have (all asics). 1251 * Returns the value in the register. 1252 */ 1253 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 1254 { 1255 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 1256 BUG(); 1257 return 0; 1258 } 1259 1260 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg) 1261 { 1262 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 1263 BUG(); 1264 return 0; 1265 } 1266 1267 /** 1268 * amdgpu_invalid_wreg - dummy reg write function 1269 * 1270 * @adev: amdgpu_device pointer 1271 * @reg: offset of register 1272 * @v: value to write to the register 1273 * 1274 * Dummy register read function. Used for register blocks 1275 * that certain asics don't have (all asics). 1276 */ 1277 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 1278 { 1279 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 1280 reg, v); 1281 BUG(); 1282 } 1283 1284 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v) 1285 { 1286 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n", 1287 reg, v); 1288 BUG(); 1289 } 1290 1291 /** 1292 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 1293 * 1294 * @adev: amdgpu_device pointer 1295 * @reg: offset of register 1296 * 1297 * Dummy register read function. Used for register blocks 1298 * that certain asics don't have (all asics). 1299 * Returns the value in the register. 1300 */ 1301 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 1302 { 1303 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 1304 BUG(); 1305 return 0; 1306 } 1307 1308 static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg) 1309 { 1310 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 1311 BUG(); 1312 return 0; 1313 } 1314 1315 /** 1316 * amdgpu_invalid_wreg64 - dummy reg write function 1317 * 1318 * @adev: amdgpu_device pointer 1319 * @reg: offset of register 1320 * @v: value to write to the register 1321 * 1322 * Dummy register read function. Used for register blocks 1323 * that certain asics don't have (all asics). 1324 */ 1325 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 1326 { 1327 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 1328 reg, v); 1329 BUG(); 1330 } 1331 1332 static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v) 1333 { 1334 DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n", 1335 reg, v); 1336 BUG(); 1337 } 1338 1339 /** 1340 * amdgpu_block_invalid_rreg - dummy reg read function 1341 * 1342 * @adev: amdgpu_device pointer 1343 * @block: offset of instance 1344 * @reg: offset of register 1345 * 1346 * Dummy register read function. Used for register blocks 1347 * that certain asics don't have (all asics). 1348 * Returns the value in the register. 1349 */ 1350 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 1351 uint32_t block, uint32_t reg) 1352 { 1353 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 1354 reg, block); 1355 BUG(); 1356 return 0; 1357 } 1358 1359 /** 1360 * amdgpu_block_invalid_wreg - dummy reg write function 1361 * 1362 * @adev: amdgpu_device pointer 1363 * @block: offset of instance 1364 * @reg: offset of register 1365 * @v: value to write to the register 1366 * 1367 * Dummy register read function. Used for register blocks 1368 * that certain asics don't have (all asics). 1369 */ 1370 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 1371 uint32_t block, 1372 uint32_t reg, uint32_t v) 1373 { 1374 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 1375 reg, block, v); 1376 BUG(); 1377 } 1378 1379 /** 1380 * amdgpu_device_asic_init - Wrapper for atom asic_init 1381 * 1382 * @adev: amdgpu_device pointer 1383 * 1384 * Does any asic specific work and then calls atom asic init. 1385 */ 1386 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 1387 { 1388 int ret; 1389 1390 amdgpu_asic_pre_asic_init(adev); 1391 1392 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 1393 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 1394 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) { 1395 amdgpu_psp_wait_for_bootloader(adev); 1396 ret = amdgpu_atomfirmware_asic_init(adev, true); 1397 return ret; 1398 } else { 1399 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 1400 } 1401 1402 return 0; 1403 } 1404 1405 /** 1406 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page 1407 * 1408 * @adev: amdgpu_device pointer 1409 * 1410 * Allocates a scratch page of VRAM for use by various things in the 1411 * driver. 1412 */ 1413 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev) 1414 { 1415 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE, 1416 AMDGPU_GEM_DOMAIN_VRAM | 1417 AMDGPU_GEM_DOMAIN_GTT, 1418 &adev->mem_scratch.robj, 1419 &adev->mem_scratch.gpu_addr, 1420 (void **)&adev->mem_scratch.ptr); 1421 } 1422 1423 /** 1424 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page 1425 * 1426 * @adev: amdgpu_device pointer 1427 * 1428 * Frees the VRAM scratch page. 1429 */ 1430 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev) 1431 { 1432 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL); 1433 } 1434 1435 /** 1436 * amdgpu_device_program_register_sequence - program an array of registers. 1437 * 1438 * @adev: amdgpu_device pointer 1439 * @registers: pointer to the register array 1440 * @array_size: size of the register array 1441 * 1442 * Programs an array or registers with and or masks. 1443 * This is a helper for setting golden registers. 1444 */ 1445 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 1446 const u32 *registers, 1447 const u32 array_size) 1448 { 1449 u32 tmp, reg, and_mask, or_mask; 1450 int i; 1451 1452 if (array_size % 3) 1453 return; 1454 1455 for (i = 0; i < array_size; i += 3) { 1456 reg = registers[i + 0]; 1457 and_mask = registers[i + 1]; 1458 or_mask = registers[i + 2]; 1459 1460 if (and_mask == 0xffffffff) { 1461 tmp = or_mask; 1462 } else { 1463 tmp = RREG32(reg); 1464 tmp &= ~and_mask; 1465 if (adev->family >= AMDGPU_FAMILY_AI) 1466 tmp |= (or_mask & and_mask); 1467 else 1468 tmp |= or_mask; 1469 } 1470 WREG32(reg, tmp); 1471 } 1472 } 1473 1474 /** 1475 * amdgpu_device_pci_config_reset - reset the GPU 1476 * 1477 * @adev: amdgpu_device pointer 1478 * 1479 * Resets the GPU using the pci config reset sequence. 1480 * Only applicable to asics prior to vega10. 1481 */ 1482 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 1483 { 1484 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 1485 } 1486 1487 /** 1488 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 1489 * 1490 * @adev: amdgpu_device pointer 1491 * 1492 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 1493 */ 1494 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 1495 { 1496 return pci_reset_function(adev->pdev); 1497 } 1498 1499 /* 1500 * amdgpu_device_wb_*() 1501 * Writeback is the method by which the GPU updates special pages in memory 1502 * with the status of certain GPU events (fences, ring pointers,etc.). 1503 */ 1504 1505 /** 1506 * amdgpu_device_wb_fini - Disable Writeback and free memory 1507 * 1508 * @adev: amdgpu_device pointer 1509 * 1510 * Disables Writeback and frees the Writeback memory (all asics). 1511 * Used at driver shutdown. 1512 */ 1513 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1514 { 1515 if (adev->wb.wb_obj) { 1516 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1517 &adev->wb.gpu_addr, 1518 (void **)&adev->wb.wb); 1519 adev->wb.wb_obj = NULL; 1520 } 1521 } 1522 1523 /** 1524 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory 1525 * 1526 * @adev: amdgpu_device pointer 1527 * 1528 * Initializes writeback and allocates writeback memory (all asics). 1529 * Used at driver startup. 1530 * Returns 0 on success or an -error on failure. 1531 */ 1532 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1533 { 1534 int r; 1535 1536 if (adev->wb.wb_obj == NULL) { 1537 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1538 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1539 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1540 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1541 (void **)&adev->wb.wb); 1542 if (r) { 1543 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1544 return r; 1545 } 1546 1547 adev->wb.num_wb = AMDGPU_MAX_WB; 1548 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1549 1550 /* clear wb memory */ 1551 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1552 } 1553 1554 return 0; 1555 } 1556 1557 /** 1558 * amdgpu_device_wb_get - Allocate a wb entry 1559 * 1560 * @adev: amdgpu_device pointer 1561 * @wb: wb index 1562 * 1563 * Allocate a wb slot for use by the driver (all asics). 1564 * Returns 0 on success or -EINVAL on failure. 1565 */ 1566 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1567 { 1568 unsigned long flags, offset; 1569 1570 spin_lock_irqsave(&adev->wb.lock, flags); 1571 offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1572 if (offset < adev->wb.num_wb) { 1573 __set_bit(offset, adev->wb.used); 1574 spin_unlock_irqrestore(&adev->wb.lock, flags); 1575 *wb = offset << 3; /* convert to dw offset */ 1576 return 0; 1577 } else { 1578 spin_unlock_irqrestore(&adev->wb.lock, flags); 1579 return -EINVAL; 1580 } 1581 } 1582 1583 /** 1584 * amdgpu_device_wb_free - Free a wb entry 1585 * 1586 * @adev: amdgpu_device pointer 1587 * @wb: wb index 1588 * 1589 * Free a wb slot allocated for use by the driver (all asics) 1590 */ 1591 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1592 { 1593 unsigned long flags; 1594 1595 wb >>= 3; 1596 spin_lock_irqsave(&adev->wb.lock, flags); 1597 if (wb < adev->wb.num_wb) 1598 __clear_bit(wb, adev->wb.used); 1599 spin_unlock_irqrestore(&adev->wb.lock, flags); 1600 } 1601 1602 /** 1603 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1604 * 1605 * @adev: amdgpu_device pointer 1606 * 1607 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1608 * to fail, but if any of the BARs is not accessible after the size we abort 1609 * driver loading by returning -ENODEV. 1610 */ 1611 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1612 { 1613 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1614 struct pci_bus *root; 1615 struct resource *res; 1616 unsigned int i; 1617 u16 cmd; 1618 int r; 1619 1620 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) 1621 return 0; 1622 1623 /* Bypass for VF */ 1624 if (amdgpu_sriov_vf(adev)) 1625 return 0; 1626 1627 /* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */ 1628 if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR)) 1629 DRM_WARN("System can't access extended configuration space, please check!!\n"); 1630 1631 /* skip if the bios has already enabled large BAR */ 1632 if (adev->gmc.real_vram_size && 1633 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1634 return 0; 1635 1636 /* Check if the root BUS has 64bit memory resources */ 1637 root = adev->pdev->bus; 1638 while (root->parent) 1639 root = root->parent; 1640 1641 pci_bus_for_each_resource(root, res, i) { 1642 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1643 res->start > 0x100000000ull) 1644 break; 1645 } 1646 1647 /* Trying to resize is pointless without a root hub window above 4GB */ 1648 if (!res) 1649 return 0; 1650 1651 /* Limit the BAR size to what is available */ 1652 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1653 rbar_size); 1654 1655 /* Disable memory decoding while we change the BAR addresses and size */ 1656 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1657 pci_write_config_word(adev->pdev, PCI_COMMAND, 1658 cmd & ~PCI_COMMAND_MEMORY); 1659 1660 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1661 amdgpu_doorbell_fini(adev); 1662 if (adev->asic_type >= CHIP_BONAIRE) 1663 pci_release_resource(adev->pdev, 2); 1664 1665 pci_release_resource(adev->pdev, 0); 1666 1667 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1668 if (r == -ENOSPC) 1669 DRM_INFO("Not enough PCI address space for a large BAR."); 1670 else if (r && r != -ENOTSUPP) 1671 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1672 1673 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1674 1675 /* When the doorbell or fb BAR isn't available we have no chance of 1676 * using the device. 1677 */ 1678 r = amdgpu_doorbell_init(adev); 1679 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1680 return -ENODEV; 1681 1682 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1683 1684 return 0; 1685 } 1686 1687 static bool amdgpu_device_read_bios(struct amdgpu_device *adev) 1688 { 1689 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU)) 1690 return false; 1691 1692 return true; 1693 } 1694 1695 /* 1696 * GPU helpers function. 1697 */ 1698 /** 1699 * amdgpu_device_need_post - check if the hw need post or not 1700 * 1701 * @adev: amdgpu_device pointer 1702 * 1703 * Check if the asic has been initialized (all asics) at driver startup 1704 * or post is needed if hw reset is performed. 1705 * Returns true if need or false if not. 1706 */ 1707 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1708 { 1709 uint32_t reg; 1710 1711 if (amdgpu_sriov_vf(adev)) 1712 return false; 1713 1714 if (!amdgpu_device_read_bios(adev)) 1715 return false; 1716 1717 if (amdgpu_passthrough(adev)) { 1718 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1719 * some old smc fw still need driver do vPost otherwise gpu hang, while 1720 * those smc fw version above 22.15 doesn't have this flaw, so we force 1721 * vpost executed for smc version below 22.15 1722 */ 1723 if (adev->asic_type == CHIP_FIJI) { 1724 int err; 1725 uint32_t fw_ver; 1726 1727 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1728 /* force vPost if error occured */ 1729 if (err) 1730 return true; 1731 1732 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1733 release_firmware(adev->pm.fw); 1734 if (fw_ver < 0x00160e00) 1735 return true; 1736 } 1737 } 1738 1739 /* Don't post if we need to reset whole hive on init */ 1740 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 1741 return false; 1742 1743 if (adev->has_hw_reset) { 1744 adev->has_hw_reset = false; 1745 return true; 1746 } 1747 1748 /* bios scratch used on CIK+ */ 1749 if (adev->asic_type >= CHIP_BONAIRE) 1750 return amdgpu_atombios_scratch_need_asic_init(adev); 1751 1752 /* check MEM_SIZE for older asics */ 1753 reg = amdgpu_asic_get_config_memsize(adev); 1754 1755 if ((reg != 0) && (reg != 0xffffffff)) 1756 return false; 1757 1758 return true; 1759 } 1760 1761 /* 1762 * Check whether seamless boot is supported. 1763 * 1764 * So far we only support seamless boot on DCE 3.0 or later. 1765 * If users report that it works on older ASICS as well, we may 1766 * loosen this. 1767 */ 1768 bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev) 1769 { 1770 switch (amdgpu_seamless) { 1771 case -1: 1772 break; 1773 case 1: 1774 return true; 1775 case 0: 1776 return false; 1777 default: 1778 DRM_ERROR("Invalid value for amdgpu.seamless: %d\n", 1779 amdgpu_seamless); 1780 return false; 1781 } 1782 1783 if (!(adev->flags & AMD_IS_APU)) 1784 return false; 1785 1786 if (adev->mman.keep_stolen_vga_memory) 1787 return false; 1788 1789 return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0); 1790 } 1791 1792 /* 1793 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids 1794 * don't support dynamic speed switching. Until we have confirmation from Intel 1795 * that a specific host supports it, it's safer that we keep it disabled for all. 1796 * 1797 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/ 1798 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663 1799 */ 1800 static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev) 1801 { 1802 #if IS_ENABLED(CONFIG_X86) 1803 struct cpuinfo_x86 *c = &cpu_data(0); 1804 1805 /* eGPU change speeds based on USB4 fabric conditions */ 1806 if (dev_is_removable(adev->dev)) 1807 return true; 1808 1809 if (c->x86_vendor == X86_VENDOR_INTEL) 1810 return false; 1811 #endif 1812 return true; 1813 } 1814 1815 /** 1816 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1817 * 1818 * @adev: amdgpu_device pointer 1819 * 1820 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1821 * be set for this device. 1822 * 1823 * Returns true if it should be used or false if not. 1824 */ 1825 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1826 { 1827 switch (amdgpu_aspm) { 1828 case -1: 1829 break; 1830 case 0: 1831 return false; 1832 case 1: 1833 return true; 1834 default: 1835 return false; 1836 } 1837 if (adev->flags & AMD_IS_APU) 1838 return false; 1839 if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK)) 1840 return false; 1841 return pcie_aspm_enabled(adev->pdev); 1842 } 1843 1844 /* if we get transitioned to only one device, take VGA back */ 1845 /** 1846 * amdgpu_device_vga_set_decode - enable/disable vga decode 1847 * 1848 * @pdev: PCI device pointer 1849 * @state: enable/disable vga decode 1850 * 1851 * Enable/disable vga decode (all asics). 1852 * Returns VGA resource flags. 1853 */ 1854 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1855 bool state) 1856 { 1857 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1858 1859 amdgpu_asic_set_vga_state(adev, state); 1860 if (state) 1861 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1862 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1863 else 1864 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1865 } 1866 1867 /** 1868 * amdgpu_device_check_block_size - validate the vm block size 1869 * 1870 * @adev: amdgpu_device pointer 1871 * 1872 * Validates the vm block size specified via module parameter. 1873 * The vm block size defines number of bits in page table versus page directory, 1874 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1875 * page table and the remaining bits are in the page directory. 1876 */ 1877 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1878 { 1879 /* defines number of bits in page table versus page directory, 1880 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1881 * page table and the remaining bits are in the page directory 1882 */ 1883 if (amdgpu_vm_block_size == -1) 1884 return; 1885 1886 if (amdgpu_vm_block_size < 9) { 1887 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1888 amdgpu_vm_block_size); 1889 amdgpu_vm_block_size = -1; 1890 } 1891 } 1892 1893 /** 1894 * amdgpu_device_check_vm_size - validate the vm size 1895 * 1896 * @adev: amdgpu_device pointer 1897 * 1898 * Validates the vm size in GB specified via module parameter. 1899 * The VM size is the size of the GPU virtual memory space in GB. 1900 */ 1901 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1902 { 1903 /* no need to check the default value */ 1904 if (amdgpu_vm_size == -1) 1905 return; 1906 1907 if (amdgpu_vm_size < 1) { 1908 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1909 amdgpu_vm_size); 1910 amdgpu_vm_size = -1; 1911 } 1912 } 1913 1914 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1915 { 1916 struct sysinfo si; 1917 bool is_os_64 = (sizeof(void *) == 8); 1918 uint64_t total_memory; 1919 uint64_t dram_size_seven_GB = 0x1B8000000; 1920 uint64_t dram_size_three_GB = 0xB8000000; 1921 1922 if (amdgpu_smu_memory_pool_size == 0) 1923 return; 1924 1925 if (!is_os_64) { 1926 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1927 goto def_value; 1928 } 1929 si_meminfo(&si); 1930 total_memory = (uint64_t)si.totalram * si.mem_unit; 1931 1932 if ((amdgpu_smu_memory_pool_size == 1) || 1933 (amdgpu_smu_memory_pool_size == 2)) { 1934 if (total_memory < dram_size_three_GB) 1935 goto def_value1; 1936 } else if ((amdgpu_smu_memory_pool_size == 4) || 1937 (amdgpu_smu_memory_pool_size == 8)) { 1938 if (total_memory < dram_size_seven_GB) 1939 goto def_value1; 1940 } else { 1941 DRM_WARN("Smu memory pool size not supported\n"); 1942 goto def_value; 1943 } 1944 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1945 1946 return; 1947 1948 def_value1: 1949 DRM_WARN("No enough system memory\n"); 1950 def_value: 1951 adev->pm.smu_prv_buffer_size = 0; 1952 } 1953 1954 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 1955 { 1956 if (!(adev->flags & AMD_IS_APU) || 1957 adev->asic_type < CHIP_RAVEN) 1958 return 0; 1959 1960 switch (adev->asic_type) { 1961 case CHIP_RAVEN: 1962 if (adev->pdev->device == 0x15dd) 1963 adev->apu_flags |= AMD_APU_IS_RAVEN; 1964 if (adev->pdev->device == 0x15d8) 1965 adev->apu_flags |= AMD_APU_IS_PICASSO; 1966 break; 1967 case CHIP_RENOIR: 1968 if ((adev->pdev->device == 0x1636) || 1969 (adev->pdev->device == 0x164c)) 1970 adev->apu_flags |= AMD_APU_IS_RENOIR; 1971 else 1972 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 1973 break; 1974 case CHIP_VANGOGH: 1975 adev->apu_flags |= AMD_APU_IS_VANGOGH; 1976 break; 1977 case CHIP_YELLOW_CARP: 1978 break; 1979 case CHIP_CYAN_SKILLFISH: 1980 if ((adev->pdev->device == 0x13FE) || 1981 (adev->pdev->device == 0x143F)) 1982 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 1983 break; 1984 default: 1985 break; 1986 } 1987 1988 return 0; 1989 } 1990 1991 /** 1992 * amdgpu_device_check_arguments - validate module params 1993 * 1994 * @adev: amdgpu_device pointer 1995 * 1996 * Validates certain module parameters and updates 1997 * the associated values used by the driver (all asics). 1998 */ 1999 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 2000 { 2001 int i; 2002 2003 if (amdgpu_sched_jobs < 4) { 2004 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 2005 amdgpu_sched_jobs); 2006 amdgpu_sched_jobs = 4; 2007 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 2008 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 2009 amdgpu_sched_jobs); 2010 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 2011 } 2012 2013 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 2014 /* gart size must be greater or equal to 32M */ 2015 dev_warn(adev->dev, "gart size (%d) too small\n", 2016 amdgpu_gart_size); 2017 amdgpu_gart_size = -1; 2018 } 2019 2020 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 2021 /* gtt size must be greater or equal to 32M */ 2022 dev_warn(adev->dev, "gtt size (%d) too small\n", 2023 amdgpu_gtt_size); 2024 amdgpu_gtt_size = -1; 2025 } 2026 2027 /* valid range is between 4 and 9 inclusive */ 2028 if (amdgpu_vm_fragment_size != -1 && 2029 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 2030 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 2031 amdgpu_vm_fragment_size = -1; 2032 } 2033 2034 if (amdgpu_sched_hw_submission < 2) { 2035 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 2036 amdgpu_sched_hw_submission); 2037 amdgpu_sched_hw_submission = 2; 2038 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 2039 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 2040 amdgpu_sched_hw_submission); 2041 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 2042 } 2043 2044 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 2045 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 2046 amdgpu_reset_method = -1; 2047 } 2048 2049 amdgpu_device_check_smu_prv_buffer_size(adev); 2050 2051 amdgpu_device_check_vm_size(adev); 2052 2053 amdgpu_device_check_block_size(adev); 2054 2055 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 2056 2057 for (i = 0; i < MAX_XCP; i++) 2058 adev->enforce_isolation[i] = !!enforce_isolation; 2059 2060 return 0; 2061 } 2062 2063 /** 2064 * amdgpu_switcheroo_set_state - set switcheroo state 2065 * 2066 * @pdev: pci dev pointer 2067 * @state: vga_switcheroo state 2068 * 2069 * Callback for the switcheroo driver. Suspends or resumes 2070 * the asics before or after it is powered up using ACPI methods. 2071 */ 2072 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 2073 enum vga_switcheroo_state state) 2074 { 2075 struct drm_device *dev = pci_get_drvdata(pdev); 2076 int r; 2077 2078 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 2079 return; 2080 2081 if (state == VGA_SWITCHEROO_ON) { 2082 pr_info("switched on\n"); 2083 /* don't suspend or resume card normally */ 2084 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2085 2086 pci_set_power_state(pdev, PCI_D0); 2087 amdgpu_device_load_pci_state(pdev); 2088 r = pci_enable_device(pdev); 2089 if (r) 2090 DRM_WARN("pci_enable_device failed (%d)\n", r); 2091 amdgpu_device_resume(dev, true); 2092 2093 dev->switch_power_state = DRM_SWITCH_POWER_ON; 2094 } else { 2095 pr_info("switched off\n"); 2096 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2097 amdgpu_device_prepare(dev); 2098 amdgpu_device_suspend(dev, true); 2099 amdgpu_device_cache_pci_state(pdev); 2100 /* Shut down the device */ 2101 pci_disable_device(pdev); 2102 pci_set_power_state(pdev, PCI_D3cold); 2103 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 2104 } 2105 } 2106 2107 /** 2108 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 2109 * 2110 * @pdev: pci dev pointer 2111 * 2112 * Callback for the switcheroo driver. Check of the switcheroo 2113 * state can be changed. 2114 * Returns true if the state can be changed, false if not. 2115 */ 2116 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 2117 { 2118 struct drm_device *dev = pci_get_drvdata(pdev); 2119 2120 /* 2121 * FIXME: open_count is protected by drm_global_mutex but that would lead to 2122 * locking inversion with the driver load path. And the access here is 2123 * completely racy anyway. So don't bother with locking for now. 2124 */ 2125 return atomic_read(&dev->open_count) == 0; 2126 } 2127 2128 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 2129 .set_gpu_state = amdgpu_switcheroo_set_state, 2130 .reprobe = NULL, 2131 .can_switch = amdgpu_switcheroo_can_switch, 2132 }; 2133 2134 /** 2135 * amdgpu_device_ip_set_clockgating_state - set the CG state 2136 * 2137 * @dev: amdgpu_device pointer 2138 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2139 * @state: clockgating state (gate or ungate) 2140 * 2141 * Sets the requested clockgating state for all instances of 2142 * the hardware IP specified. 2143 * Returns the error code from the last instance. 2144 */ 2145 int amdgpu_device_ip_set_clockgating_state(void *dev, 2146 enum amd_ip_block_type block_type, 2147 enum amd_clockgating_state state) 2148 { 2149 struct amdgpu_device *adev = dev; 2150 int i, r = 0; 2151 2152 for (i = 0; i < adev->num_ip_blocks; i++) { 2153 if (!adev->ip_blocks[i].status.valid) 2154 continue; 2155 if (adev->ip_blocks[i].version->type != block_type) 2156 continue; 2157 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 2158 continue; 2159 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 2160 (void *)adev, state); 2161 if (r) 2162 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 2163 adev->ip_blocks[i].version->funcs->name, r); 2164 } 2165 return r; 2166 } 2167 2168 /** 2169 * amdgpu_device_ip_set_powergating_state - set the PG state 2170 * 2171 * @dev: amdgpu_device pointer 2172 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2173 * @state: powergating state (gate or ungate) 2174 * 2175 * Sets the requested powergating state for all instances of 2176 * the hardware IP specified. 2177 * Returns the error code from the last instance. 2178 */ 2179 int amdgpu_device_ip_set_powergating_state(void *dev, 2180 enum amd_ip_block_type block_type, 2181 enum amd_powergating_state state) 2182 { 2183 struct amdgpu_device *adev = dev; 2184 int i, r = 0; 2185 2186 for (i = 0; i < adev->num_ip_blocks; i++) { 2187 if (!adev->ip_blocks[i].status.valid) 2188 continue; 2189 if (adev->ip_blocks[i].version->type != block_type) 2190 continue; 2191 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 2192 continue; 2193 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 2194 (void *)adev, state); 2195 if (r) 2196 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 2197 adev->ip_blocks[i].version->funcs->name, r); 2198 } 2199 return r; 2200 } 2201 2202 /** 2203 * amdgpu_device_ip_get_clockgating_state - get the CG state 2204 * 2205 * @adev: amdgpu_device pointer 2206 * @flags: clockgating feature flags 2207 * 2208 * Walks the list of IPs on the device and updates the clockgating 2209 * flags for each IP. 2210 * Updates @flags with the feature flags for each hardware IP where 2211 * clockgating is enabled. 2212 */ 2213 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 2214 u64 *flags) 2215 { 2216 int i; 2217 2218 for (i = 0; i < adev->num_ip_blocks; i++) { 2219 if (!adev->ip_blocks[i].status.valid) 2220 continue; 2221 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 2222 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 2223 } 2224 } 2225 2226 /** 2227 * amdgpu_device_ip_wait_for_idle - wait for idle 2228 * 2229 * @adev: amdgpu_device pointer 2230 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2231 * 2232 * Waits for the request hardware IP to be idle. 2233 * Returns 0 for success or a negative error code on failure. 2234 */ 2235 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 2236 enum amd_ip_block_type block_type) 2237 { 2238 int i, r; 2239 2240 for (i = 0; i < adev->num_ip_blocks; i++) { 2241 if (!adev->ip_blocks[i].status.valid) 2242 continue; 2243 if (adev->ip_blocks[i].version->type == block_type) { 2244 if (adev->ip_blocks[i].version->funcs->wait_for_idle) { 2245 r = adev->ip_blocks[i].version->funcs->wait_for_idle( 2246 &adev->ip_blocks[i]); 2247 if (r) 2248 return r; 2249 } 2250 break; 2251 } 2252 } 2253 return 0; 2254 2255 } 2256 2257 /** 2258 * amdgpu_device_ip_is_valid - is the hardware IP enabled 2259 * 2260 * @adev: amdgpu_device pointer 2261 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2262 * 2263 * Check if the hardware IP is enable or not. 2264 * Returns true if it the IP is enable, false if not. 2265 */ 2266 bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev, 2267 enum amd_ip_block_type block_type) 2268 { 2269 int i; 2270 2271 for (i = 0; i < adev->num_ip_blocks; i++) { 2272 if (adev->ip_blocks[i].version->type == block_type) 2273 return adev->ip_blocks[i].status.valid; 2274 } 2275 return false; 2276 2277 } 2278 2279 /** 2280 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 2281 * 2282 * @adev: amdgpu_device pointer 2283 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 2284 * 2285 * Returns a pointer to the hardware IP block structure 2286 * if it exists for the asic, otherwise NULL. 2287 */ 2288 struct amdgpu_ip_block * 2289 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 2290 enum amd_ip_block_type type) 2291 { 2292 int i; 2293 2294 for (i = 0; i < adev->num_ip_blocks; i++) 2295 if (adev->ip_blocks[i].version->type == type) 2296 return &adev->ip_blocks[i]; 2297 2298 return NULL; 2299 } 2300 2301 /** 2302 * amdgpu_device_ip_block_version_cmp 2303 * 2304 * @adev: amdgpu_device pointer 2305 * @type: enum amd_ip_block_type 2306 * @major: major version 2307 * @minor: minor version 2308 * 2309 * return 0 if equal or greater 2310 * return 1 if smaller or the ip_block doesn't exist 2311 */ 2312 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 2313 enum amd_ip_block_type type, 2314 u32 major, u32 minor) 2315 { 2316 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 2317 2318 if (ip_block && ((ip_block->version->major > major) || 2319 ((ip_block->version->major == major) && 2320 (ip_block->version->minor >= minor)))) 2321 return 0; 2322 2323 return 1; 2324 } 2325 2326 /** 2327 * amdgpu_device_ip_block_add 2328 * 2329 * @adev: amdgpu_device pointer 2330 * @ip_block_version: pointer to the IP to add 2331 * 2332 * Adds the IP block driver information to the collection of IPs 2333 * on the asic. 2334 */ 2335 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 2336 const struct amdgpu_ip_block_version *ip_block_version) 2337 { 2338 if (!ip_block_version) 2339 return -EINVAL; 2340 2341 switch (ip_block_version->type) { 2342 case AMD_IP_BLOCK_TYPE_VCN: 2343 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2344 return 0; 2345 break; 2346 case AMD_IP_BLOCK_TYPE_JPEG: 2347 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2348 return 0; 2349 break; 2350 default: 2351 break; 2352 } 2353 2354 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, 2355 ip_block_version->funcs->name); 2356 2357 adev->ip_blocks[adev->num_ip_blocks].adev = adev; 2358 2359 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2360 2361 return 0; 2362 } 2363 2364 /** 2365 * amdgpu_device_enable_virtual_display - enable virtual display feature 2366 * 2367 * @adev: amdgpu_device pointer 2368 * 2369 * Enabled the virtual display feature if the user has enabled it via 2370 * the module parameter virtual_display. This feature provides a virtual 2371 * display hardware on headless boards or in virtualized environments. 2372 * This function parses and validates the configuration string specified by 2373 * the user and configues the virtual display configuration (number of 2374 * virtual connectors, crtcs, etc.) specified. 2375 */ 2376 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2377 { 2378 adev->enable_virtual_display = false; 2379 2380 if (amdgpu_virtual_display) { 2381 const char *pci_address_name = pci_name(adev->pdev); 2382 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2383 2384 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2385 pciaddstr_tmp = pciaddstr; 2386 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2387 pciaddname = strsep(&pciaddname_tmp, ","); 2388 if (!strcmp("all", pciaddname) 2389 || !strcmp(pci_address_name, pciaddname)) { 2390 long num_crtc; 2391 int res = -1; 2392 2393 adev->enable_virtual_display = true; 2394 2395 if (pciaddname_tmp) 2396 res = kstrtol(pciaddname_tmp, 10, 2397 &num_crtc); 2398 2399 if (!res) { 2400 if (num_crtc < 1) 2401 num_crtc = 1; 2402 if (num_crtc > 6) 2403 num_crtc = 6; 2404 adev->mode_info.num_crtc = num_crtc; 2405 } else { 2406 adev->mode_info.num_crtc = 1; 2407 } 2408 break; 2409 } 2410 } 2411 2412 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2413 amdgpu_virtual_display, pci_address_name, 2414 adev->enable_virtual_display, adev->mode_info.num_crtc); 2415 2416 kfree(pciaddstr); 2417 } 2418 } 2419 2420 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2421 { 2422 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2423 adev->mode_info.num_crtc = 1; 2424 adev->enable_virtual_display = true; 2425 DRM_INFO("virtual_display:%d, num_crtc:%d\n", 2426 adev->enable_virtual_display, adev->mode_info.num_crtc); 2427 } 2428 } 2429 2430 /** 2431 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2432 * 2433 * @adev: amdgpu_device pointer 2434 * 2435 * Parses the asic configuration parameters specified in the gpu info 2436 * firmware and makes them availale to the driver for use in configuring 2437 * the asic. 2438 * Returns 0 on success, -EINVAL on failure. 2439 */ 2440 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2441 { 2442 const char *chip_name; 2443 int err; 2444 const struct gpu_info_firmware_header_v1_0 *hdr; 2445 2446 adev->firmware.gpu_info_fw = NULL; 2447 2448 if (adev->mman.discovery_bin) 2449 return 0; 2450 2451 switch (adev->asic_type) { 2452 default: 2453 return 0; 2454 case CHIP_VEGA10: 2455 chip_name = "vega10"; 2456 break; 2457 case CHIP_VEGA12: 2458 chip_name = "vega12"; 2459 break; 2460 case CHIP_RAVEN: 2461 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2462 chip_name = "raven2"; 2463 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2464 chip_name = "picasso"; 2465 else 2466 chip_name = "raven"; 2467 break; 2468 case CHIP_ARCTURUS: 2469 chip_name = "arcturus"; 2470 break; 2471 case CHIP_NAVI12: 2472 chip_name = "navi12"; 2473 break; 2474 } 2475 2476 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, 2477 "amdgpu/%s_gpu_info.bin", chip_name); 2478 if (err) { 2479 dev_err(adev->dev, 2480 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n", 2481 chip_name); 2482 goto out; 2483 } 2484 2485 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2486 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2487 2488 switch (hdr->version_major) { 2489 case 1: 2490 { 2491 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2492 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2493 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2494 2495 /* 2496 * Should be droped when DAL no longer needs it. 2497 */ 2498 if (adev->asic_type == CHIP_NAVI12) 2499 goto parse_soc_bounding_box; 2500 2501 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2502 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2503 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2504 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2505 adev->gfx.config.max_texture_channel_caches = 2506 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2507 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2508 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2509 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2510 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2511 adev->gfx.config.double_offchip_lds_buf = 2512 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2513 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2514 adev->gfx.cu_info.max_waves_per_simd = 2515 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2516 adev->gfx.cu_info.max_scratch_slots_per_cu = 2517 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2518 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2519 if (hdr->version_minor >= 1) { 2520 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2521 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2522 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2523 adev->gfx.config.num_sc_per_sh = 2524 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2525 adev->gfx.config.num_packer_per_sc = 2526 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2527 } 2528 2529 parse_soc_bounding_box: 2530 /* 2531 * soc bounding box info is not integrated in disocovery table, 2532 * we always need to parse it from gpu info firmware if needed. 2533 */ 2534 if (hdr->version_minor == 2) { 2535 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2536 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2537 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2538 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2539 } 2540 break; 2541 } 2542 default: 2543 dev_err(adev->dev, 2544 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2545 err = -EINVAL; 2546 goto out; 2547 } 2548 out: 2549 return err; 2550 } 2551 2552 /** 2553 * amdgpu_device_ip_early_init - run early init for hardware IPs 2554 * 2555 * @adev: amdgpu_device pointer 2556 * 2557 * Early initialization pass for hardware IPs. The hardware IPs that make 2558 * up each asic are discovered each IP's early_init callback is run. This 2559 * is the first stage in initializing the asic. 2560 * Returns 0 on success, negative error code on failure. 2561 */ 2562 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2563 { 2564 struct amdgpu_ip_block *ip_block; 2565 struct pci_dev *parent; 2566 int i, r; 2567 bool total; 2568 2569 amdgpu_device_enable_virtual_display(adev); 2570 2571 if (amdgpu_sriov_vf(adev)) { 2572 r = amdgpu_virt_request_full_gpu(adev, true); 2573 if (r) 2574 return r; 2575 } 2576 2577 switch (adev->asic_type) { 2578 #ifdef CONFIG_DRM_AMDGPU_SI 2579 case CHIP_VERDE: 2580 case CHIP_TAHITI: 2581 case CHIP_PITCAIRN: 2582 case CHIP_OLAND: 2583 case CHIP_HAINAN: 2584 adev->family = AMDGPU_FAMILY_SI; 2585 r = si_set_ip_blocks(adev); 2586 if (r) 2587 return r; 2588 break; 2589 #endif 2590 #ifdef CONFIG_DRM_AMDGPU_CIK 2591 case CHIP_BONAIRE: 2592 case CHIP_HAWAII: 2593 case CHIP_KAVERI: 2594 case CHIP_KABINI: 2595 case CHIP_MULLINS: 2596 if (adev->flags & AMD_IS_APU) 2597 adev->family = AMDGPU_FAMILY_KV; 2598 else 2599 adev->family = AMDGPU_FAMILY_CI; 2600 2601 r = cik_set_ip_blocks(adev); 2602 if (r) 2603 return r; 2604 break; 2605 #endif 2606 case CHIP_TOPAZ: 2607 case CHIP_TONGA: 2608 case CHIP_FIJI: 2609 case CHIP_POLARIS10: 2610 case CHIP_POLARIS11: 2611 case CHIP_POLARIS12: 2612 case CHIP_VEGAM: 2613 case CHIP_CARRIZO: 2614 case CHIP_STONEY: 2615 if (adev->flags & AMD_IS_APU) 2616 adev->family = AMDGPU_FAMILY_CZ; 2617 else 2618 adev->family = AMDGPU_FAMILY_VI; 2619 2620 r = vi_set_ip_blocks(adev); 2621 if (r) 2622 return r; 2623 break; 2624 default: 2625 r = amdgpu_discovery_set_ip_blocks(adev); 2626 if (r) 2627 return r; 2628 break; 2629 } 2630 2631 if (amdgpu_has_atpx() && 2632 (amdgpu_is_atpx_hybrid() || 2633 amdgpu_has_atpx_dgpu_power_cntl()) && 2634 ((adev->flags & AMD_IS_APU) == 0) && 2635 !dev_is_removable(&adev->pdev->dev)) 2636 adev->flags |= AMD_IS_PX; 2637 2638 if (!(adev->flags & AMD_IS_APU)) { 2639 parent = pcie_find_root_port(adev->pdev); 2640 adev->has_pr3 = parent ? pci_pr3_present(parent) : false; 2641 } 2642 2643 2644 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2645 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2646 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2647 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2648 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2649 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2650 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2651 2652 total = true; 2653 for (i = 0; i < adev->num_ip_blocks; i++) { 2654 ip_block = &adev->ip_blocks[i]; 2655 2656 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2657 DRM_WARN("disabled ip block: %d <%s>\n", 2658 i, adev->ip_blocks[i].version->funcs->name); 2659 adev->ip_blocks[i].status.valid = false; 2660 } else if (ip_block->version->funcs->early_init) { 2661 r = ip_block->version->funcs->early_init(ip_block); 2662 if (r == -ENOENT) { 2663 adev->ip_blocks[i].status.valid = false; 2664 } else if (r) { 2665 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2666 adev->ip_blocks[i].version->funcs->name, r); 2667 total = false; 2668 } else { 2669 adev->ip_blocks[i].status.valid = true; 2670 } 2671 } else { 2672 adev->ip_blocks[i].status.valid = true; 2673 } 2674 /* get the vbios after the asic_funcs are set up */ 2675 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2676 r = amdgpu_device_parse_gpu_info_fw(adev); 2677 if (r) 2678 return r; 2679 2680 /* Read BIOS */ 2681 if (amdgpu_device_read_bios(adev)) { 2682 if (!amdgpu_get_bios(adev)) 2683 return -EINVAL; 2684 2685 r = amdgpu_atombios_init(adev); 2686 if (r) { 2687 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2688 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2689 return r; 2690 } 2691 } 2692 2693 /*get pf2vf msg info at it's earliest time*/ 2694 if (amdgpu_sriov_vf(adev)) 2695 amdgpu_virt_init_data_exchange(adev); 2696 2697 } 2698 } 2699 if (!total) 2700 return -ENODEV; 2701 2702 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 2703 if (ip_block->status.valid != false) 2704 amdgpu_amdkfd_device_probe(adev); 2705 2706 adev->cg_flags &= amdgpu_cg_mask; 2707 adev->pg_flags &= amdgpu_pg_mask; 2708 2709 return 0; 2710 } 2711 2712 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2713 { 2714 int i, r; 2715 2716 for (i = 0; i < adev->num_ip_blocks; i++) { 2717 if (!adev->ip_blocks[i].status.sw) 2718 continue; 2719 if (adev->ip_blocks[i].status.hw) 2720 continue; 2721 if (!amdgpu_ip_member_of_hwini( 2722 adev, adev->ip_blocks[i].version->type)) 2723 continue; 2724 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2725 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2726 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2727 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2728 if (r) { 2729 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2730 adev->ip_blocks[i].version->funcs->name, r); 2731 return r; 2732 } 2733 adev->ip_blocks[i].status.hw = true; 2734 } 2735 } 2736 2737 return 0; 2738 } 2739 2740 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2741 { 2742 int i, r; 2743 2744 for (i = 0; i < adev->num_ip_blocks; i++) { 2745 if (!adev->ip_blocks[i].status.sw) 2746 continue; 2747 if (adev->ip_blocks[i].status.hw) 2748 continue; 2749 if (!amdgpu_ip_member_of_hwini( 2750 adev, adev->ip_blocks[i].version->type)) 2751 continue; 2752 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2753 if (r) { 2754 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2755 adev->ip_blocks[i].version->funcs->name, r); 2756 return r; 2757 } 2758 adev->ip_blocks[i].status.hw = true; 2759 } 2760 2761 return 0; 2762 } 2763 2764 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2765 { 2766 int r = 0; 2767 int i; 2768 uint32_t smu_version; 2769 2770 if (adev->asic_type >= CHIP_VEGA10) { 2771 for (i = 0; i < adev->num_ip_blocks; i++) { 2772 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2773 continue; 2774 2775 if (!amdgpu_ip_member_of_hwini(adev, 2776 AMD_IP_BLOCK_TYPE_PSP)) 2777 break; 2778 2779 if (!adev->ip_blocks[i].status.sw) 2780 continue; 2781 2782 /* no need to do the fw loading again if already done*/ 2783 if (adev->ip_blocks[i].status.hw == true) 2784 break; 2785 2786 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2787 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 2788 if (r) 2789 return r; 2790 } else { 2791 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2792 if (r) { 2793 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2794 adev->ip_blocks[i].version->funcs->name, r); 2795 return r; 2796 } 2797 adev->ip_blocks[i].status.hw = true; 2798 } 2799 break; 2800 } 2801 } 2802 2803 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2804 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2805 2806 return r; 2807 } 2808 2809 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2810 { 2811 long timeout; 2812 int r, i; 2813 2814 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2815 struct amdgpu_ring *ring = adev->rings[i]; 2816 2817 /* No need to setup the GPU scheduler for rings that don't need it */ 2818 if (!ring || ring->no_scheduler) 2819 continue; 2820 2821 switch (ring->funcs->type) { 2822 case AMDGPU_RING_TYPE_GFX: 2823 timeout = adev->gfx_timeout; 2824 break; 2825 case AMDGPU_RING_TYPE_COMPUTE: 2826 timeout = adev->compute_timeout; 2827 break; 2828 case AMDGPU_RING_TYPE_SDMA: 2829 timeout = adev->sdma_timeout; 2830 break; 2831 default: 2832 timeout = adev->video_timeout; 2833 break; 2834 } 2835 2836 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL, 2837 DRM_SCHED_PRIORITY_COUNT, 2838 ring->num_hw_submission, 0, 2839 timeout, adev->reset_domain->wq, 2840 ring->sched_score, ring->name, 2841 adev->dev); 2842 if (r) { 2843 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2844 ring->name); 2845 return r; 2846 } 2847 r = amdgpu_uvd_entity_init(adev, ring); 2848 if (r) { 2849 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n", 2850 ring->name); 2851 return r; 2852 } 2853 r = amdgpu_vce_entity_init(adev, ring); 2854 if (r) { 2855 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n", 2856 ring->name); 2857 return r; 2858 } 2859 } 2860 2861 amdgpu_xcp_update_partition_sched_list(adev); 2862 2863 return 0; 2864 } 2865 2866 2867 /** 2868 * amdgpu_device_ip_init - run init for hardware IPs 2869 * 2870 * @adev: amdgpu_device pointer 2871 * 2872 * Main initialization pass for hardware IPs. The list of all the hardware 2873 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2874 * are run. sw_init initializes the software state associated with each IP 2875 * and hw_init initializes the hardware associated with each IP. 2876 * Returns 0 on success, negative error code on failure. 2877 */ 2878 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2879 { 2880 bool init_badpage; 2881 int i, r; 2882 2883 r = amdgpu_ras_init(adev); 2884 if (r) 2885 return r; 2886 2887 for (i = 0; i < adev->num_ip_blocks; i++) { 2888 if (!adev->ip_blocks[i].status.valid) 2889 continue; 2890 if (adev->ip_blocks[i].version->funcs->sw_init) { 2891 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]); 2892 if (r) { 2893 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2894 adev->ip_blocks[i].version->funcs->name, r); 2895 goto init_failed; 2896 } 2897 } 2898 adev->ip_blocks[i].status.sw = true; 2899 2900 if (!amdgpu_ip_member_of_hwini( 2901 adev, adev->ip_blocks[i].version->type)) 2902 continue; 2903 2904 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2905 /* need to do common hw init early so everything is set up for gmc */ 2906 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2907 if (r) { 2908 DRM_ERROR("hw_init %d failed %d\n", i, r); 2909 goto init_failed; 2910 } 2911 adev->ip_blocks[i].status.hw = true; 2912 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2913 /* need to do gmc hw init early so we can allocate gpu mem */ 2914 /* Try to reserve bad pages early */ 2915 if (amdgpu_sriov_vf(adev)) 2916 amdgpu_virt_exchange_data(adev); 2917 2918 r = amdgpu_device_mem_scratch_init(adev); 2919 if (r) { 2920 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2921 goto init_failed; 2922 } 2923 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2924 if (r) { 2925 DRM_ERROR("hw_init %d failed %d\n", i, r); 2926 goto init_failed; 2927 } 2928 r = amdgpu_device_wb_init(adev); 2929 if (r) { 2930 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2931 goto init_failed; 2932 } 2933 adev->ip_blocks[i].status.hw = true; 2934 2935 /* right after GMC hw init, we create CSA */ 2936 if (adev->gfx.mcbp) { 2937 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2938 AMDGPU_GEM_DOMAIN_VRAM | 2939 AMDGPU_GEM_DOMAIN_GTT, 2940 AMDGPU_CSA_SIZE); 2941 if (r) { 2942 DRM_ERROR("allocate CSA failed %d\n", r); 2943 goto init_failed; 2944 } 2945 } 2946 2947 r = amdgpu_seq64_init(adev); 2948 if (r) { 2949 DRM_ERROR("allocate seq64 failed %d\n", r); 2950 goto init_failed; 2951 } 2952 } 2953 } 2954 2955 if (amdgpu_sriov_vf(adev)) 2956 amdgpu_virt_init_data_exchange(adev); 2957 2958 r = amdgpu_ib_pool_init(adev); 2959 if (r) { 2960 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2961 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2962 goto init_failed; 2963 } 2964 2965 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2966 if (r) 2967 goto init_failed; 2968 2969 r = amdgpu_device_ip_hw_init_phase1(adev); 2970 if (r) 2971 goto init_failed; 2972 2973 r = amdgpu_device_fw_loading(adev); 2974 if (r) 2975 goto init_failed; 2976 2977 r = amdgpu_device_ip_hw_init_phase2(adev); 2978 if (r) 2979 goto init_failed; 2980 2981 /* 2982 * retired pages will be loaded from eeprom and reserved here, 2983 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2984 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2985 * for I2C communication which only true at this point. 2986 * 2987 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2988 * failure from bad gpu situation and stop amdgpu init process 2989 * accordingly. For other failed cases, it will still release all 2990 * the resource and print error message, rather than returning one 2991 * negative value to upper level. 2992 * 2993 * Note: theoretically, this should be called before all vram allocations 2994 * to protect retired page from abusing 2995 */ 2996 init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 2997 r = amdgpu_ras_recovery_init(adev, init_badpage); 2998 if (r) 2999 goto init_failed; 3000 3001 /** 3002 * In case of XGMI grab extra reference for reset domain for this device 3003 */ 3004 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3005 if (amdgpu_xgmi_add_device(adev) == 0) { 3006 if (!amdgpu_sriov_vf(adev)) { 3007 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3008 3009 if (WARN_ON(!hive)) { 3010 r = -ENOENT; 3011 goto init_failed; 3012 } 3013 3014 if (!hive->reset_domain || 3015 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 3016 r = -ENOENT; 3017 amdgpu_put_xgmi_hive(hive); 3018 goto init_failed; 3019 } 3020 3021 /* Drop the early temporary reset domain we created for device */ 3022 amdgpu_reset_put_reset_domain(adev->reset_domain); 3023 adev->reset_domain = hive->reset_domain; 3024 amdgpu_put_xgmi_hive(hive); 3025 } 3026 } 3027 } 3028 3029 r = amdgpu_device_init_schedulers(adev); 3030 if (r) 3031 goto init_failed; 3032 3033 if (adev->mman.buffer_funcs_ring->sched.ready) 3034 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3035 3036 /* Don't init kfd if whole hive need to be reset during init */ 3037 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 3038 kgd2kfd_init_zone_device(adev); 3039 amdgpu_amdkfd_device_init(adev); 3040 } 3041 3042 amdgpu_fru_get_product_info(adev); 3043 3044 init_failed: 3045 3046 return r; 3047 } 3048 3049 /** 3050 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 3051 * 3052 * @adev: amdgpu_device pointer 3053 * 3054 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 3055 * this function before a GPU reset. If the value is retained after a 3056 * GPU reset, VRAM has not been lost. Some GPU resets may destry VRAM contents. 3057 */ 3058 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 3059 { 3060 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 3061 } 3062 3063 /** 3064 * amdgpu_device_check_vram_lost - check if vram is valid 3065 * 3066 * @adev: amdgpu_device pointer 3067 * 3068 * Checks the reset magic value written to the gart pointer in VRAM. 3069 * The driver calls this after a GPU reset to see if the contents of 3070 * VRAM is lost or now. 3071 * returns true if vram is lost, false if not. 3072 */ 3073 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 3074 { 3075 if (memcmp(adev->gart.ptr, adev->reset_magic, 3076 AMDGPU_RESET_MAGIC_NUM)) 3077 return true; 3078 3079 if (!amdgpu_in_reset(adev)) 3080 return false; 3081 3082 /* 3083 * For all ASICs with baco/mode1 reset, the VRAM is 3084 * always assumed to be lost. 3085 */ 3086 switch (amdgpu_asic_reset_method(adev)) { 3087 case AMD_RESET_METHOD_BACO: 3088 case AMD_RESET_METHOD_MODE1: 3089 return true; 3090 default: 3091 return false; 3092 } 3093 } 3094 3095 /** 3096 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 3097 * 3098 * @adev: amdgpu_device pointer 3099 * @state: clockgating state (gate or ungate) 3100 * 3101 * The list of all the hardware IPs that make up the asic is walked and the 3102 * set_clockgating_state callbacks are run. 3103 * Late initialization pass enabling clockgating for hardware IPs. 3104 * Fini or suspend, pass disabling clockgating for hardware IPs. 3105 * Returns 0 on success, negative error code on failure. 3106 */ 3107 3108 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 3109 enum amd_clockgating_state state) 3110 { 3111 int i, j, r; 3112 3113 if (amdgpu_emu_mode == 1) 3114 return 0; 3115 3116 for (j = 0; j < adev->num_ip_blocks; j++) { 3117 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3118 if (!adev->ip_blocks[i].status.late_initialized) 3119 continue; 3120 /* skip CG for GFX, SDMA on S0ix */ 3121 if (adev->in_s0ix && 3122 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3123 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3124 continue; 3125 /* skip CG for VCE/UVD, it's handled specially */ 3126 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3127 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3128 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3129 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3130 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 3131 /* enable clockgating to save power */ 3132 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 3133 state); 3134 if (r) { 3135 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 3136 adev->ip_blocks[i].version->funcs->name, r); 3137 return r; 3138 } 3139 } 3140 } 3141 3142 return 0; 3143 } 3144 3145 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 3146 enum amd_powergating_state state) 3147 { 3148 int i, j, r; 3149 3150 if (amdgpu_emu_mode == 1) 3151 return 0; 3152 3153 for (j = 0; j < adev->num_ip_blocks; j++) { 3154 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3155 if (!adev->ip_blocks[i].status.late_initialized) 3156 continue; 3157 /* skip PG for GFX, SDMA on S0ix */ 3158 if (adev->in_s0ix && 3159 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3160 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3161 continue; 3162 /* skip CG for VCE/UVD, it's handled specially */ 3163 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3164 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3165 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3166 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3167 adev->ip_blocks[i].version->funcs->set_powergating_state) { 3168 /* enable powergating to save power */ 3169 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev, 3170 state); 3171 if (r) { 3172 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 3173 adev->ip_blocks[i].version->funcs->name, r); 3174 return r; 3175 } 3176 } 3177 } 3178 return 0; 3179 } 3180 3181 static int amdgpu_device_enable_mgpu_fan_boost(void) 3182 { 3183 struct amdgpu_gpu_instance *gpu_ins; 3184 struct amdgpu_device *adev; 3185 int i, ret = 0; 3186 3187 mutex_lock(&mgpu_info.mutex); 3188 3189 /* 3190 * MGPU fan boost feature should be enabled 3191 * only when there are two or more dGPUs in 3192 * the system 3193 */ 3194 if (mgpu_info.num_dgpu < 2) 3195 goto out; 3196 3197 for (i = 0; i < mgpu_info.num_dgpu; i++) { 3198 gpu_ins = &(mgpu_info.gpu_ins[i]); 3199 adev = gpu_ins->adev; 3200 if (!(adev->flags & AMD_IS_APU) && 3201 !gpu_ins->mgpu_fan_enabled) { 3202 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 3203 if (ret) 3204 break; 3205 3206 gpu_ins->mgpu_fan_enabled = 1; 3207 } 3208 } 3209 3210 out: 3211 mutex_unlock(&mgpu_info.mutex); 3212 3213 return ret; 3214 } 3215 3216 /** 3217 * amdgpu_device_ip_late_init - run late init for hardware IPs 3218 * 3219 * @adev: amdgpu_device pointer 3220 * 3221 * Late initialization pass for hardware IPs. The list of all the hardware 3222 * IPs that make up the asic is walked and the late_init callbacks are run. 3223 * late_init covers any special initialization that an IP requires 3224 * after all of the have been initialized or something that needs to happen 3225 * late in the init process. 3226 * Returns 0 on success, negative error code on failure. 3227 */ 3228 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 3229 { 3230 struct amdgpu_gpu_instance *gpu_instance; 3231 int i = 0, r; 3232 3233 for (i = 0; i < adev->num_ip_blocks; i++) { 3234 if (!adev->ip_blocks[i].status.hw) 3235 continue; 3236 if (adev->ip_blocks[i].version->funcs->late_init) { 3237 r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]); 3238 if (r) { 3239 DRM_ERROR("late_init of IP block <%s> failed %d\n", 3240 adev->ip_blocks[i].version->funcs->name, r); 3241 return r; 3242 } 3243 } 3244 adev->ip_blocks[i].status.late_initialized = true; 3245 } 3246 3247 r = amdgpu_ras_late_init(adev); 3248 if (r) { 3249 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 3250 return r; 3251 } 3252 3253 if (!amdgpu_in_reset(adev)) 3254 amdgpu_ras_set_error_query_ready(adev, true); 3255 3256 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3257 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3258 3259 amdgpu_device_fill_reset_magic(adev); 3260 3261 r = amdgpu_device_enable_mgpu_fan_boost(); 3262 if (r) 3263 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 3264 3265 /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */ 3266 if (amdgpu_passthrough(adev) && 3267 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 3268 adev->asic_type == CHIP_ALDEBARAN)) 3269 amdgpu_dpm_handle_passthrough_sbr(adev, true); 3270 3271 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3272 mutex_lock(&mgpu_info.mutex); 3273 3274 /* 3275 * Reset device p-state to low as this was booted with high. 3276 * 3277 * This should be performed only after all devices from the same 3278 * hive get initialized. 3279 * 3280 * However, it's unknown how many device in the hive in advance. 3281 * As this is counted one by one during devices initializations. 3282 * 3283 * So, we wait for all XGMI interlinked devices initialized. 3284 * This may bring some delays as those devices may come from 3285 * different hives. But that should be OK. 3286 */ 3287 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3288 for (i = 0; i < mgpu_info.num_gpu; i++) { 3289 gpu_instance = &(mgpu_info.gpu_ins[i]); 3290 if (gpu_instance->adev->flags & AMD_IS_APU) 3291 continue; 3292 3293 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3294 AMDGPU_XGMI_PSTATE_MIN); 3295 if (r) { 3296 DRM_ERROR("pstate setting failed (%d).\n", r); 3297 break; 3298 } 3299 } 3300 } 3301 3302 mutex_unlock(&mgpu_info.mutex); 3303 } 3304 3305 return 0; 3306 } 3307 3308 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block) 3309 { 3310 int r; 3311 3312 if (!ip_block->version->funcs->hw_fini) { 3313 DRM_ERROR("hw_fini of IP block <%s> not defined\n", 3314 ip_block->version->funcs->name); 3315 } else { 3316 r = ip_block->version->funcs->hw_fini(ip_block); 3317 /* XXX handle errors */ 3318 if (r) { 3319 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3320 ip_block->version->funcs->name, r); 3321 } 3322 } 3323 3324 ip_block->status.hw = false; 3325 } 3326 3327 /** 3328 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3329 * 3330 * @adev: amdgpu_device pointer 3331 * 3332 * For ASICs need to disable SMC first 3333 */ 3334 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3335 { 3336 int i; 3337 3338 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3339 return; 3340 3341 for (i = 0; i < adev->num_ip_blocks; i++) { 3342 if (!adev->ip_blocks[i].status.hw) 3343 continue; 3344 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3345 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3346 break; 3347 } 3348 } 3349 } 3350 3351 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3352 { 3353 int i, r; 3354 3355 for (i = 0; i < adev->num_ip_blocks; i++) { 3356 if (!adev->ip_blocks[i].version->funcs->early_fini) 3357 continue; 3358 3359 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]); 3360 if (r) { 3361 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 3362 adev->ip_blocks[i].version->funcs->name, r); 3363 } 3364 } 3365 3366 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3367 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3368 3369 amdgpu_amdkfd_suspend(adev, false); 3370 3371 /* Workaroud for ASICs need to disable SMC first */ 3372 amdgpu_device_smu_fini_early(adev); 3373 3374 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3375 if (!adev->ip_blocks[i].status.hw) 3376 continue; 3377 3378 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3379 } 3380 3381 if (amdgpu_sriov_vf(adev)) { 3382 if (amdgpu_virt_release_full_gpu(adev, false)) 3383 DRM_ERROR("failed to release exclusive mode on fini\n"); 3384 } 3385 3386 return 0; 3387 } 3388 3389 /** 3390 * amdgpu_device_ip_fini - run fini for hardware IPs 3391 * 3392 * @adev: amdgpu_device pointer 3393 * 3394 * Main teardown pass for hardware IPs. The list of all the hardware 3395 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3396 * are run. hw_fini tears down the hardware associated with each IP 3397 * and sw_fini tears down any software state associated with each IP. 3398 * Returns 0 on success, negative error code on failure. 3399 */ 3400 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3401 { 3402 int i, r; 3403 3404 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3405 amdgpu_virt_release_ras_err_handler_data(adev); 3406 3407 if (adev->gmc.xgmi.num_physical_nodes > 1) 3408 amdgpu_xgmi_remove_device(adev); 3409 3410 amdgpu_amdkfd_device_fini_sw(adev); 3411 3412 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3413 if (!adev->ip_blocks[i].status.sw) 3414 continue; 3415 3416 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3417 amdgpu_ucode_free_bo(adev); 3418 amdgpu_free_static_csa(&adev->virt.csa_obj); 3419 amdgpu_device_wb_fini(adev); 3420 amdgpu_device_mem_scratch_fini(adev); 3421 amdgpu_ib_pool_fini(adev); 3422 amdgpu_seq64_fini(adev); 3423 } 3424 if (adev->ip_blocks[i].version->funcs->sw_fini) { 3425 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]); 3426 /* XXX handle errors */ 3427 if (r) { 3428 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 3429 adev->ip_blocks[i].version->funcs->name, r); 3430 } 3431 } 3432 adev->ip_blocks[i].status.sw = false; 3433 adev->ip_blocks[i].status.valid = false; 3434 } 3435 3436 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3437 if (!adev->ip_blocks[i].status.late_initialized) 3438 continue; 3439 if (adev->ip_blocks[i].version->funcs->late_fini) 3440 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]); 3441 adev->ip_blocks[i].status.late_initialized = false; 3442 } 3443 3444 amdgpu_ras_fini(adev); 3445 3446 return 0; 3447 } 3448 3449 /** 3450 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3451 * 3452 * @work: work_struct. 3453 */ 3454 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3455 { 3456 struct amdgpu_device *adev = 3457 container_of(work, struct amdgpu_device, delayed_init_work.work); 3458 int r; 3459 3460 r = amdgpu_ib_ring_tests(adev); 3461 if (r) 3462 DRM_ERROR("ib ring test failed (%d).\n", r); 3463 } 3464 3465 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3466 { 3467 struct amdgpu_device *adev = 3468 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3469 3470 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3471 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3472 3473 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 3474 adev->gfx.gfx_off_state = true; 3475 } 3476 3477 /** 3478 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3479 * 3480 * @adev: amdgpu_device pointer 3481 * 3482 * Main suspend function for hardware IPs. The list of all the hardware 3483 * IPs that make up the asic is walked, clockgating is disabled and the 3484 * suspend callbacks are run. suspend puts the hardware and software state 3485 * in each IP into a state suitable for suspend. 3486 * Returns 0 on success, negative error code on failure. 3487 */ 3488 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3489 { 3490 int i, r; 3491 3492 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3493 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3494 3495 /* 3496 * Per PMFW team's suggestion, driver needs to handle gfxoff 3497 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3498 * scenario. Add the missing df cstate disablement here. 3499 */ 3500 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3501 dev_warn(adev->dev, "Failed to disallow df cstate"); 3502 3503 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3504 if (!adev->ip_blocks[i].status.valid) 3505 continue; 3506 3507 /* displays are handled separately */ 3508 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3509 continue; 3510 3511 /* XXX handle errors */ 3512 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3513 if (r) 3514 return r; 3515 } 3516 3517 return 0; 3518 } 3519 3520 /** 3521 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3522 * 3523 * @adev: amdgpu_device pointer 3524 * 3525 * Main suspend function for hardware IPs. The list of all the hardware 3526 * IPs that make up the asic is walked, clockgating is disabled and the 3527 * suspend callbacks are run. suspend puts the hardware and software state 3528 * in each IP into a state suitable for suspend. 3529 * Returns 0 on success, negative error code on failure. 3530 */ 3531 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3532 { 3533 int i, r; 3534 3535 if (adev->in_s0ix) 3536 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3537 3538 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3539 if (!adev->ip_blocks[i].status.valid) 3540 continue; 3541 /* displays are handled in phase1 */ 3542 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3543 continue; 3544 /* PSP lost connection when err_event_athub occurs */ 3545 if (amdgpu_ras_intr_triggered() && 3546 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3547 adev->ip_blocks[i].status.hw = false; 3548 continue; 3549 } 3550 3551 /* skip unnecessary suspend if we do not initialize them yet */ 3552 if (!amdgpu_ip_member_of_hwini( 3553 adev, adev->ip_blocks[i].version->type)) 3554 continue; 3555 3556 /* skip suspend of gfx/mes and psp for S0ix 3557 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3558 * like at runtime. PSP is also part of the always on hardware 3559 * so no need to suspend it. 3560 */ 3561 if (adev->in_s0ix && 3562 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3563 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3564 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3565 continue; 3566 3567 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3568 if (adev->in_s0ix && 3569 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3570 IP_VERSION(5, 0, 0)) && 3571 (adev->ip_blocks[i].version->type == 3572 AMD_IP_BLOCK_TYPE_SDMA)) 3573 continue; 3574 3575 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3576 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3577 * from this location and RLC Autoload automatically also gets loaded 3578 * from here based on PMFW -> PSP message during re-init sequence. 3579 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3580 * the TMR and reload FWs again for IMU enabled APU ASICs. 3581 */ 3582 if (amdgpu_in_reset(adev) && 3583 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3584 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3585 continue; 3586 3587 /* XXX handle errors */ 3588 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3589 adev->ip_blocks[i].status.hw = false; 3590 3591 /* handle putting the SMC in the appropriate state */ 3592 if (!amdgpu_sriov_vf(adev)) { 3593 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3594 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3595 if (r) { 3596 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3597 adev->mp1_state, r); 3598 return r; 3599 } 3600 } 3601 } 3602 } 3603 3604 return 0; 3605 } 3606 3607 /** 3608 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3609 * 3610 * @adev: amdgpu_device pointer 3611 * 3612 * Main suspend function for hardware IPs. The list of all the hardware 3613 * IPs that make up the asic is walked, clockgating is disabled and the 3614 * suspend callbacks are run. suspend puts the hardware and software state 3615 * in each IP into a state suitable for suspend. 3616 * Returns 0 on success, negative error code on failure. 3617 */ 3618 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3619 { 3620 int r; 3621 3622 if (amdgpu_sriov_vf(adev)) { 3623 amdgpu_virt_fini_data_exchange(adev); 3624 amdgpu_virt_request_full_gpu(adev, false); 3625 } 3626 3627 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3628 3629 r = amdgpu_device_ip_suspend_phase1(adev); 3630 if (r) 3631 return r; 3632 r = amdgpu_device_ip_suspend_phase2(adev); 3633 3634 if (amdgpu_sriov_vf(adev)) 3635 amdgpu_virt_release_full_gpu(adev, false); 3636 3637 return r; 3638 } 3639 3640 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3641 { 3642 int i, r; 3643 3644 static enum amd_ip_block_type ip_order[] = { 3645 AMD_IP_BLOCK_TYPE_COMMON, 3646 AMD_IP_BLOCK_TYPE_GMC, 3647 AMD_IP_BLOCK_TYPE_PSP, 3648 AMD_IP_BLOCK_TYPE_IH, 3649 }; 3650 3651 for (i = 0; i < adev->num_ip_blocks; i++) { 3652 int j; 3653 struct amdgpu_ip_block *block; 3654 3655 block = &adev->ip_blocks[i]; 3656 block->status.hw = false; 3657 3658 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3659 3660 if (block->version->type != ip_order[j] || 3661 !block->status.valid) 3662 continue; 3663 3664 r = block->version->funcs->hw_init(&adev->ip_blocks[i]); 3665 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3666 if (r) 3667 return r; 3668 block->status.hw = true; 3669 } 3670 } 3671 3672 return 0; 3673 } 3674 3675 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3676 { 3677 int i, r; 3678 3679 static enum amd_ip_block_type ip_order[] = { 3680 AMD_IP_BLOCK_TYPE_SMC, 3681 AMD_IP_BLOCK_TYPE_DCE, 3682 AMD_IP_BLOCK_TYPE_GFX, 3683 AMD_IP_BLOCK_TYPE_SDMA, 3684 AMD_IP_BLOCK_TYPE_MES, 3685 AMD_IP_BLOCK_TYPE_UVD, 3686 AMD_IP_BLOCK_TYPE_VCE, 3687 AMD_IP_BLOCK_TYPE_VCN, 3688 AMD_IP_BLOCK_TYPE_JPEG 3689 }; 3690 3691 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3692 int j; 3693 struct amdgpu_ip_block *block; 3694 3695 for (j = 0; j < adev->num_ip_blocks; j++) { 3696 block = &adev->ip_blocks[j]; 3697 3698 if (block->version->type != ip_order[i] || 3699 !block->status.valid || 3700 block->status.hw) 3701 continue; 3702 3703 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) { 3704 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3705 if (r) 3706 return r; 3707 } else { 3708 r = block->version->funcs->hw_init(&adev->ip_blocks[i]); 3709 if (r) { 3710 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 3711 adev->ip_blocks[i].version->funcs->name, r); 3712 return r; 3713 } 3714 block->status.hw = true; 3715 } 3716 } 3717 } 3718 3719 return 0; 3720 } 3721 3722 /** 3723 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3724 * 3725 * @adev: amdgpu_device pointer 3726 * 3727 * First resume function for hardware IPs. The list of all the hardware 3728 * IPs that make up the asic is walked and the resume callbacks are run for 3729 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3730 * after a suspend and updates the software state as necessary. This 3731 * function is also used for restoring the GPU after a GPU reset. 3732 * Returns 0 on success, negative error code on failure. 3733 */ 3734 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3735 { 3736 int i, r; 3737 3738 for (i = 0; i < adev->num_ip_blocks; i++) { 3739 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3740 continue; 3741 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3742 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3743 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3744 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3745 3746 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3747 if (r) 3748 return r; 3749 } 3750 } 3751 3752 return 0; 3753 } 3754 3755 /** 3756 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3757 * 3758 * @adev: amdgpu_device pointer 3759 * 3760 * First resume function for hardware IPs. The list of all the hardware 3761 * IPs that make up the asic is walked and the resume callbacks are run for 3762 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3763 * functional state after a suspend and updates the software state as 3764 * necessary. This function is also used for restoring the GPU after a GPU 3765 * reset. 3766 * Returns 0 on success, negative error code on failure. 3767 */ 3768 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3769 { 3770 int i, r; 3771 3772 for (i = 0; i < adev->num_ip_blocks; i++) { 3773 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3774 continue; 3775 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3776 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3777 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3778 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3779 continue; 3780 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3781 if (r) 3782 return r; 3783 } 3784 3785 return 0; 3786 } 3787 3788 /** 3789 * amdgpu_device_ip_resume - run resume for hardware IPs 3790 * 3791 * @adev: amdgpu_device pointer 3792 * 3793 * Main resume function for hardware IPs. The hardware IPs 3794 * are split into two resume functions because they are 3795 * also used in recovering from a GPU reset and some additional 3796 * steps need to be take between them. In this case (S3/S4) they are 3797 * run sequentially. 3798 * Returns 0 on success, negative error code on failure. 3799 */ 3800 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3801 { 3802 int r; 3803 3804 r = amdgpu_device_ip_resume_phase1(adev); 3805 if (r) 3806 return r; 3807 3808 r = amdgpu_device_fw_loading(adev); 3809 if (r) 3810 return r; 3811 3812 r = amdgpu_device_ip_resume_phase2(adev); 3813 3814 if (adev->mman.buffer_funcs_ring->sched.ready) 3815 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3816 3817 return r; 3818 } 3819 3820 /** 3821 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3822 * 3823 * @adev: amdgpu_device pointer 3824 * 3825 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3826 */ 3827 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3828 { 3829 if (amdgpu_sriov_vf(adev)) { 3830 if (adev->is_atom_fw) { 3831 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3832 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3833 } else { 3834 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3835 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3836 } 3837 3838 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3839 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3840 } 3841 } 3842 3843 /** 3844 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3845 * 3846 * @asic_type: AMD asic type 3847 * 3848 * Check if there is DC (new modesetting infrastructre) support for an asic. 3849 * returns true if DC has support, false if not. 3850 */ 3851 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3852 { 3853 switch (asic_type) { 3854 #ifdef CONFIG_DRM_AMDGPU_SI 3855 case CHIP_HAINAN: 3856 #endif 3857 case CHIP_TOPAZ: 3858 /* chips with no display hardware */ 3859 return false; 3860 #if defined(CONFIG_DRM_AMD_DC) 3861 case CHIP_TAHITI: 3862 case CHIP_PITCAIRN: 3863 case CHIP_VERDE: 3864 case CHIP_OLAND: 3865 /* 3866 * We have systems in the wild with these ASICs that require 3867 * LVDS and VGA support which is not supported with DC. 3868 * 3869 * Fallback to the non-DC driver here by default so as not to 3870 * cause regressions. 3871 */ 3872 #if defined(CONFIG_DRM_AMD_DC_SI) 3873 return amdgpu_dc > 0; 3874 #else 3875 return false; 3876 #endif 3877 case CHIP_BONAIRE: 3878 case CHIP_KAVERI: 3879 case CHIP_KABINI: 3880 case CHIP_MULLINS: 3881 /* 3882 * We have systems in the wild with these ASICs that require 3883 * VGA support which is not supported with DC. 3884 * 3885 * Fallback to the non-DC driver here by default so as not to 3886 * cause regressions. 3887 */ 3888 return amdgpu_dc > 0; 3889 default: 3890 return amdgpu_dc != 0; 3891 #else 3892 default: 3893 if (amdgpu_dc > 0) 3894 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 3895 return false; 3896 #endif 3897 } 3898 } 3899 3900 /** 3901 * amdgpu_device_has_dc_support - check if dc is supported 3902 * 3903 * @adev: amdgpu_device pointer 3904 * 3905 * Returns true for supported, false for not supported 3906 */ 3907 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3908 { 3909 if (adev->enable_virtual_display || 3910 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3911 return false; 3912 3913 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3914 } 3915 3916 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3917 { 3918 struct amdgpu_device *adev = 3919 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3920 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3921 3922 /* It's a bug to not have a hive within this function */ 3923 if (WARN_ON(!hive)) 3924 return; 3925 3926 /* 3927 * Use task barrier to synchronize all xgmi reset works across the 3928 * hive. task_barrier_enter and task_barrier_exit will block 3929 * until all the threads running the xgmi reset works reach 3930 * those points. task_barrier_full will do both blocks. 3931 */ 3932 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3933 3934 task_barrier_enter(&hive->tb); 3935 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3936 3937 if (adev->asic_reset_res) 3938 goto fail; 3939 3940 task_barrier_exit(&hive->tb); 3941 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3942 3943 if (adev->asic_reset_res) 3944 goto fail; 3945 3946 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 3947 } else { 3948 3949 task_barrier_full(&hive->tb); 3950 adev->asic_reset_res = amdgpu_asic_reset(adev); 3951 } 3952 3953 fail: 3954 if (adev->asic_reset_res) 3955 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3956 adev->asic_reset_res, adev_to_drm(adev)->unique); 3957 amdgpu_put_xgmi_hive(hive); 3958 } 3959 3960 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3961 { 3962 char *input = amdgpu_lockup_timeout; 3963 char *timeout_setting = NULL; 3964 int index = 0; 3965 long timeout; 3966 int ret = 0; 3967 3968 /* 3969 * By default timeout for non compute jobs is 10000 3970 * and 60000 for compute jobs. 3971 * In SR-IOV or passthrough mode, timeout for compute 3972 * jobs are 60000 by default. 3973 */ 3974 adev->gfx_timeout = msecs_to_jiffies(10000); 3975 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3976 if (amdgpu_sriov_vf(adev)) 3977 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 3978 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3979 else 3980 adev->compute_timeout = msecs_to_jiffies(60000); 3981 3982 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3983 while ((timeout_setting = strsep(&input, ",")) && 3984 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3985 ret = kstrtol(timeout_setting, 0, &timeout); 3986 if (ret) 3987 return ret; 3988 3989 if (timeout == 0) { 3990 index++; 3991 continue; 3992 } else if (timeout < 0) { 3993 timeout = MAX_SCHEDULE_TIMEOUT; 3994 dev_warn(adev->dev, "lockup timeout disabled"); 3995 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3996 } else { 3997 timeout = msecs_to_jiffies(timeout); 3998 } 3999 4000 switch (index++) { 4001 case 0: 4002 adev->gfx_timeout = timeout; 4003 break; 4004 case 1: 4005 adev->compute_timeout = timeout; 4006 break; 4007 case 2: 4008 adev->sdma_timeout = timeout; 4009 break; 4010 case 3: 4011 adev->video_timeout = timeout; 4012 break; 4013 default: 4014 break; 4015 } 4016 } 4017 /* 4018 * There is only one value specified and 4019 * it should apply to all non-compute jobs. 4020 */ 4021 if (index == 1) { 4022 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4023 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 4024 adev->compute_timeout = adev->gfx_timeout; 4025 } 4026 } 4027 4028 return ret; 4029 } 4030 4031 /** 4032 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 4033 * 4034 * @adev: amdgpu_device pointer 4035 * 4036 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 4037 */ 4038 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 4039 { 4040 struct iommu_domain *domain; 4041 4042 domain = iommu_get_domain_for_dev(adev->dev); 4043 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 4044 adev->ram_is_direct_mapped = true; 4045 } 4046 4047 #if defined(CONFIG_HSA_AMD_P2P) 4048 /** 4049 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled. 4050 * 4051 * @adev: amdgpu_device pointer 4052 * 4053 * return if IOMMU remapping bar address 4054 */ 4055 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev) 4056 { 4057 struct iommu_domain *domain; 4058 4059 domain = iommu_get_domain_for_dev(adev->dev); 4060 if (domain && (domain->type == IOMMU_DOMAIN_DMA || 4061 domain->type == IOMMU_DOMAIN_DMA_FQ)) 4062 return true; 4063 4064 return false; 4065 } 4066 #endif 4067 4068 static const struct attribute *amdgpu_dev_attributes[] = { 4069 &dev_attr_pcie_replay_count.attr, 4070 NULL 4071 }; 4072 4073 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 4074 { 4075 if (amdgpu_mcbp == 1) 4076 adev->gfx.mcbp = true; 4077 else if (amdgpu_mcbp == 0) 4078 adev->gfx.mcbp = false; 4079 4080 if (amdgpu_sriov_vf(adev)) 4081 adev->gfx.mcbp = true; 4082 4083 if (adev->gfx.mcbp) 4084 DRM_INFO("MCBP is enabled\n"); 4085 } 4086 4087 /** 4088 * amdgpu_device_init - initialize the driver 4089 * 4090 * @adev: amdgpu_device pointer 4091 * @flags: driver flags 4092 * 4093 * Initializes the driver info and hw (all asics). 4094 * Returns 0 for success or an error on failure. 4095 * Called at driver startup. 4096 */ 4097 int amdgpu_device_init(struct amdgpu_device *adev, 4098 uint32_t flags) 4099 { 4100 struct drm_device *ddev = adev_to_drm(adev); 4101 struct pci_dev *pdev = adev->pdev; 4102 int r, i; 4103 bool px = false; 4104 u32 max_MBps; 4105 int tmp; 4106 4107 adev->shutdown = false; 4108 adev->flags = flags; 4109 4110 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 4111 adev->asic_type = amdgpu_force_asic_type; 4112 else 4113 adev->asic_type = flags & AMD_ASIC_MASK; 4114 4115 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 4116 if (amdgpu_emu_mode == 1) 4117 adev->usec_timeout *= 10; 4118 adev->gmc.gart_size = 512 * 1024 * 1024; 4119 adev->accel_working = false; 4120 adev->num_rings = 0; 4121 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 4122 adev->mman.buffer_funcs = NULL; 4123 adev->mman.buffer_funcs_ring = NULL; 4124 adev->vm_manager.vm_pte_funcs = NULL; 4125 adev->vm_manager.vm_pte_num_scheds = 0; 4126 adev->gmc.gmc_funcs = NULL; 4127 adev->harvest_ip_mask = 0x0; 4128 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 4129 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 4130 4131 adev->smc_rreg = &amdgpu_invalid_rreg; 4132 adev->smc_wreg = &amdgpu_invalid_wreg; 4133 adev->pcie_rreg = &amdgpu_invalid_rreg; 4134 adev->pcie_wreg = &amdgpu_invalid_wreg; 4135 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 4136 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 4137 adev->pciep_rreg = &amdgpu_invalid_rreg; 4138 adev->pciep_wreg = &amdgpu_invalid_wreg; 4139 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 4140 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 4141 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 4142 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 4143 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 4144 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 4145 adev->didt_rreg = &amdgpu_invalid_rreg; 4146 adev->didt_wreg = &amdgpu_invalid_wreg; 4147 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 4148 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 4149 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 4150 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 4151 4152 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 4153 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 4154 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 4155 4156 /* mutex initialization are all done here so we 4157 * can recall function without having locking issues 4158 */ 4159 mutex_init(&adev->firmware.mutex); 4160 mutex_init(&adev->pm.mutex); 4161 mutex_init(&adev->gfx.gpu_clock_mutex); 4162 mutex_init(&adev->srbm_mutex); 4163 mutex_init(&adev->gfx.pipe_reserve_mutex); 4164 mutex_init(&adev->gfx.gfx_off_mutex); 4165 mutex_init(&adev->gfx.partition_mutex); 4166 mutex_init(&adev->grbm_idx_mutex); 4167 mutex_init(&adev->mn_lock); 4168 mutex_init(&adev->virt.vf_errors.lock); 4169 mutex_init(&adev->virt.rlcg_reg_lock); 4170 hash_init(adev->mn_hash); 4171 mutex_init(&adev->psp.mutex); 4172 mutex_init(&adev->notifier_lock); 4173 mutex_init(&adev->pm.stable_pstate_ctx_lock); 4174 mutex_init(&adev->benchmark_mutex); 4175 mutex_init(&adev->gfx.reset_sem_mutex); 4176 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */ 4177 mutex_init(&adev->enforce_isolation_mutex); 4178 mutex_init(&adev->gfx.kfd_sch_mutex); 4179 4180 amdgpu_device_init_apu_flags(adev); 4181 4182 r = amdgpu_device_check_arguments(adev); 4183 if (r) 4184 return r; 4185 4186 spin_lock_init(&adev->mmio_idx_lock); 4187 spin_lock_init(&adev->smc_idx_lock); 4188 spin_lock_init(&adev->pcie_idx_lock); 4189 spin_lock_init(&adev->uvd_ctx_idx_lock); 4190 spin_lock_init(&adev->didt_idx_lock); 4191 spin_lock_init(&adev->gc_cac_idx_lock); 4192 spin_lock_init(&adev->se_cac_idx_lock); 4193 spin_lock_init(&adev->audio_endpt_idx_lock); 4194 spin_lock_init(&adev->mm_stats.lock); 4195 spin_lock_init(&adev->wb.lock); 4196 4197 INIT_LIST_HEAD(&adev->reset_list); 4198 4199 INIT_LIST_HEAD(&adev->ras_list); 4200 4201 INIT_LIST_HEAD(&adev->pm.od_kobj_list); 4202 4203 INIT_DELAYED_WORK(&adev->delayed_init_work, 4204 amdgpu_device_delayed_init_work_handler); 4205 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 4206 amdgpu_device_delay_enable_gfx_off); 4207 /* 4208 * Initialize the enforce_isolation work structures for each XCP 4209 * partition. This work handler is responsible for enforcing shader 4210 * isolation on AMD GPUs. It counts the number of emitted fences for 4211 * each GFX and compute ring. If there are any fences, it schedules 4212 * the `enforce_isolation_work` to be run after a delay. If there are 4213 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the 4214 * runqueue. 4215 */ 4216 for (i = 0; i < MAX_XCP; i++) { 4217 INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work, 4218 amdgpu_gfx_enforce_isolation_handler); 4219 adev->gfx.enforce_isolation[i].adev = adev; 4220 adev->gfx.enforce_isolation[i].xcp_id = i; 4221 } 4222 4223 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 4224 4225 adev->gfx.gfx_off_req_count = 1; 4226 adev->gfx.gfx_off_residency = 0; 4227 adev->gfx.gfx_off_entrycount = 0; 4228 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 4229 4230 atomic_set(&adev->throttling_logging_enabled, 1); 4231 /* 4232 * If throttling continues, logging will be performed every minute 4233 * to avoid log flooding. "-1" is subtracted since the thermal 4234 * throttling interrupt comes every second. Thus, the total logging 4235 * interval is 59 seconds(retelimited printk interval) + 1(waiting 4236 * for throttling interrupt) = 60 seconds. 4237 */ 4238 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 4239 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 4240 4241 /* Registers mapping */ 4242 /* TODO: block userspace mapping of io register */ 4243 if (adev->asic_type >= CHIP_BONAIRE) { 4244 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 4245 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 4246 } else { 4247 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 4248 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 4249 } 4250 4251 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 4252 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 4253 4254 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 4255 if (!adev->rmmio) 4256 return -ENOMEM; 4257 4258 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 4259 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size); 4260 4261 /* 4262 * Reset domain needs to be present early, before XGMI hive discovered 4263 * (if any) and intitialized to use reset sem and in_gpu reset flag 4264 * early on during init and before calling to RREG32. 4265 */ 4266 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 4267 if (!adev->reset_domain) 4268 return -ENOMEM; 4269 4270 /* detect hw virtualization here */ 4271 amdgpu_detect_virtualization(adev); 4272 4273 amdgpu_device_get_pcie_info(adev); 4274 4275 r = amdgpu_device_get_job_timeout_settings(adev); 4276 if (r) { 4277 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 4278 return r; 4279 } 4280 4281 amdgpu_device_set_mcbp(adev); 4282 4283 /* 4284 * By default, use default mode where all blocks are expected to be 4285 * initialized. At present a 'swinit' of blocks is required to be 4286 * completed before the need for a different level is detected. 4287 */ 4288 amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT); 4289 /* early init functions */ 4290 r = amdgpu_device_ip_early_init(adev); 4291 if (r) 4292 return r; 4293 4294 /* Get rid of things like offb */ 4295 r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name); 4296 if (r) 4297 return r; 4298 4299 /* Enable TMZ based on IP_VERSION */ 4300 amdgpu_gmc_tmz_set(adev); 4301 4302 if (amdgpu_sriov_vf(adev) && 4303 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0)) 4304 /* VF MMIO access (except mailbox range) from CPU 4305 * will be blocked during sriov runtime 4306 */ 4307 adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT; 4308 4309 amdgpu_gmc_noretry_set(adev); 4310 /* Need to get xgmi info early to decide the reset behavior*/ 4311 if (adev->gmc.xgmi.supported) { 4312 r = adev->gfxhub.funcs->get_xgmi_info(adev); 4313 if (r) 4314 return r; 4315 } 4316 4317 /* enable PCIE atomic ops */ 4318 if (amdgpu_sriov_vf(adev)) { 4319 if (adev->virt.fw_reserve.p_pf2vf) 4320 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 4321 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 4322 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4323 /* APUs w/ gfx9 onwards doesn't reply on PCIe atomics, rather it is a 4324 * internal path natively support atomics, set have_atomics_support to true. 4325 */ 4326 } else if ((adev->flags & AMD_IS_APU) && 4327 (amdgpu_ip_version(adev, GC_HWIP, 0) > 4328 IP_VERSION(9, 0, 0))) { 4329 adev->have_atomics_support = true; 4330 } else { 4331 adev->have_atomics_support = 4332 !pci_enable_atomic_ops_to_root(adev->pdev, 4333 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 4334 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4335 } 4336 4337 if (!adev->have_atomics_support) 4338 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 4339 4340 /* doorbell bar mapping and doorbell index init*/ 4341 amdgpu_doorbell_init(adev); 4342 4343 if (amdgpu_emu_mode == 1) { 4344 /* post the asic on emulation mode */ 4345 emu_soc_asic_init(adev); 4346 goto fence_driver_init; 4347 } 4348 4349 amdgpu_reset_init(adev); 4350 4351 /* detect if we are with an SRIOV vbios */ 4352 if (adev->bios) 4353 amdgpu_device_detect_sriov_bios(adev); 4354 4355 /* check if we need to reset the asic 4356 * E.g., driver was not cleanly unloaded previously, etc. 4357 */ 4358 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4359 if (adev->gmc.xgmi.num_physical_nodes) { 4360 dev_info(adev->dev, "Pending hive reset.\n"); 4361 amdgpu_set_init_level(adev, 4362 AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 4363 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && 4364 !amdgpu_device_has_display_hardware(adev)) { 4365 r = psp_gpu_reset(adev); 4366 } else { 4367 tmp = amdgpu_reset_method; 4368 /* It should do a default reset when loading or reloading the driver, 4369 * regardless of the module parameter reset_method. 4370 */ 4371 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4372 r = amdgpu_asic_reset(adev); 4373 amdgpu_reset_method = tmp; 4374 } 4375 4376 if (r) { 4377 dev_err(adev->dev, "asic reset on init failed\n"); 4378 goto failed; 4379 } 4380 } 4381 4382 /* Post card if necessary */ 4383 if (amdgpu_device_need_post(adev)) { 4384 if (!adev->bios) { 4385 dev_err(adev->dev, "no vBIOS found\n"); 4386 r = -EINVAL; 4387 goto failed; 4388 } 4389 DRM_INFO("GPU posting now...\n"); 4390 r = amdgpu_device_asic_init(adev); 4391 if (r) { 4392 dev_err(adev->dev, "gpu post error!\n"); 4393 goto failed; 4394 } 4395 } 4396 4397 if (adev->bios) { 4398 if (adev->is_atom_fw) { 4399 /* Initialize clocks */ 4400 r = amdgpu_atomfirmware_get_clock_info(adev); 4401 if (r) { 4402 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4403 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4404 goto failed; 4405 } 4406 } else { 4407 /* Initialize clocks */ 4408 r = amdgpu_atombios_get_clock_info(adev); 4409 if (r) { 4410 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4411 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4412 goto failed; 4413 } 4414 /* init i2c buses */ 4415 if (!amdgpu_device_has_dc_support(adev)) 4416 amdgpu_atombios_i2c_init(adev); 4417 } 4418 } 4419 4420 fence_driver_init: 4421 /* Fence driver */ 4422 r = amdgpu_fence_driver_sw_init(adev); 4423 if (r) { 4424 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4425 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4426 goto failed; 4427 } 4428 4429 /* init the mode config */ 4430 drm_mode_config_init(adev_to_drm(adev)); 4431 4432 r = amdgpu_device_ip_init(adev); 4433 if (r) { 4434 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4435 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4436 goto release_ras_con; 4437 } 4438 4439 amdgpu_fence_driver_hw_init(adev); 4440 4441 dev_info(adev->dev, 4442 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4443 adev->gfx.config.max_shader_engines, 4444 adev->gfx.config.max_sh_per_se, 4445 adev->gfx.config.max_cu_per_sh, 4446 adev->gfx.cu_info.number); 4447 4448 adev->accel_working = true; 4449 4450 amdgpu_vm_check_compute_bug(adev); 4451 4452 /* Initialize the buffer migration limit. */ 4453 if (amdgpu_moverate >= 0) 4454 max_MBps = amdgpu_moverate; 4455 else 4456 max_MBps = 8; /* Allow 8 MB/s. */ 4457 /* Get a log2 for easy divisions. */ 4458 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4459 4460 /* 4461 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4462 * Otherwise the mgpu fan boost feature will be skipped due to the 4463 * gpu instance is counted less. 4464 */ 4465 amdgpu_register_gpu_instance(adev); 4466 4467 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4468 * explicit gating rather than handling it automatically. 4469 */ 4470 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 4471 r = amdgpu_device_ip_late_init(adev); 4472 if (r) { 4473 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4474 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4475 goto release_ras_con; 4476 } 4477 /* must succeed. */ 4478 amdgpu_ras_resume(adev); 4479 queue_delayed_work(system_wq, &adev->delayed_init_work, 4480 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4481 } 4482 4483 if (amdgpu_sriov_vf(adev)) { 4484 amdgpu_virt_release_full_gpu(adev, true); 4485 flush_delayed_work(&adev->delayed_init_work); 4486 } 4487 4488 /* 4489 * Place those sysfs registering after `late_init`. As some of those 4490 * operations performed in `late_init` might affect the sysfs 4491 * interfaces creating. 4492 */ 4493 r = amdgpu_atombios_sysfs_init(adev); 4494 if (r) 4495 drm_err(&adev->ddev, 4496 "registering atombios sysfs failed (%d).\n", r); 4497 4498 r = amdgpu_pm_sysfs_init(adev); 4499 if (r) 4500 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 4501 4502 r = amdgpu_ucode_sysfs_init(adev); 4503 if (r) { 4504 adev->ucode_sysfs_en = false; 4505 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 4506 } else 4507 adev->ucode_sysfs_en = true; 4508 4509 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 4510 if (r) 4511 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4512 4513 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4514 if (r) 4515 dev_err(adev->dev, 4516 "Could not create amdgpu board attributes\n"); 4517 4518 amdgpu_fru_sysfs_init(adev); 4519 amdgpu_reg_state_sysfs_init(adev); 4520 amdgpu_xcp_cfg_sysfs_init(adev); 4521 4522 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4523 r = amdgpu_pmu_init(adev); 4524 if (r) 4525 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4526 4527 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4528 if (amdgpu_device_cache_pci_state(adev->pdev)) 4529 pci_restore_state(pdev); 4530 4531 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4532 /* this will fail for cards that aren't VGA class devices, just 4533 * ignore it 4534 */ 4535 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4536 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4537 4538 px = amdgpu_device_supports_px(ddev); 4539 4540 if (px || (!dev_is_removable(&adev->pdev->dev) && 4541 apple_gmux_detect(NULL, NULL))) 4542 vga_switcheroo_register_client(adev->pdev, 4543 &amdgpu_switcheroo_ops, px); 4544 4545 if (px) 4546 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4547 4548 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 4549 amdgpu_xgmi_reset_on_init(adev); 4550 4551 amdgpu_device_check_iommu_direct_map(adev); 4552 4553 return 0; 4554 4555 release_ras_con: 4556 if (amdgpu_sriov_vf(adev)) 4557 amdgpu_virt_release_full_gpu(adev, true); 4558 4559 /* failed in exclusive mode due to timeout */ 4560 if (amdgpu_sriov_vf(adev) && 4561 !amdgpu_sriov_runtime(adev) && 4562 amdgpu_virt_mmio_blocked(adev) && 4563 !amdgpu_virt_wait_reset(adev)) { 4564 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4565 /* Don't send request since VF is inactive. */ 4566 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4567 adev->virt.ops = NULL; 4568 r = -EAGAIN; 4569 } 4570 amdgpu_release_ras_context(adev); 4571 4572 failed: 4573 amdgpu_vf_error_trans_all(adev); 4574 4575 return r; 4576 } 4577 4578 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4579 { 4580 4581 /* Clear all CPU mappings pointing to this device */ 4582 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4583 4584 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4585 amdgpu_doorbell_fini(adev); 4586 4587 iounmap(adev->rmmio); 4588 adev->rmmio = NULL; 4589 if (adev->mman.aper_base_kaddr) 4590 iounmap(adev->mman.aper_base_kaddr); 4591 adev->mman.aper_base_kaddr = NULL; 4592 4593 /* Memory manager related */ 4594 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4595 arch_phys_wc_del(adev->gmc.vram_mtrr); 4596 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4597 } 4598 } 4599 4600 /** 4601 * amdgpu_device_fini_hw - tear down the driver 4602 * 4603 * @adev: amdgpu_device pointer 4604 * 4605 * Tear down the driver info (all asics). 4606 * Called at driver shutdown. 4607 */ 4608 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4609 { 4610 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4611 flush_delayed_work(&adev->delayed_init_work); 4612 4613 if (adev->mman.initialized) 4614 drain_workqueue(adev->mman.bdev.wq); 4615 adev->shutdown = true; 4616 4617 /* make sure IB test finished before entering exclusive mode 4618 * to avoid preemption on IB test 4619 */ 4620 if (amdgpu_sriov_vf(adev)) { 4621 amdgpu_virt_request_full_gpu(adev, false); 4622 amdgpu_virt_fini_data_exchange(adev); 4623 } 4624 4625 /* disable all interrupts */ 4626 amdgpu_irq_disable_all(adev); 4627 if (adev->mode_info.mode_config_initialized) { 4628 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4629 drm_helper_force_disable_all(adev_to_drm(adev)); 4630 else 4631 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4632 } 4633 amdgpu_fence_driver_hw_fini(adev); 4634 4635 if (adev->pm.sysfs_initialized) 4636 amdgpu_pm_sysfs_fini(adev); 4637 if (adev->ucode_sysfs_en) 4638 amdgpu_ucode_sysfs_fini(adev); 4639 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4640 amdgpu_fru_sysfs_fini(adev); 4641 4642 amdgpu_reg_state_sysfs_fini(adev); 4643 amdgpu_xcp_cfg_sysfs_fini(adev); 4644 4645 /* disable ras feature must before hw fini */ 4646 amdgpu_ras_pre_fini(adev); 4647 4648 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4649 4650 amdgpu_device_ip_fini_early(adev); 4651 4652 amdgpu_irq_fini_hw(adev); 4653 4654 if (adev->mman.initialized) 4655 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4656 4657 amdgpu_gart_dummy_page_fini(adev); 4658 4659 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4660 amdgpu_device_unmap_mmio(adev); 4661 4662 } 4663 4664 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4665 { 4666 int idx; 4667 bool px; 4668 4669 amdgpu_fence_driver_sw_fini(adev); 4670 amdgpu_device_ip_fini(adev); 4671 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4672 adev->accel_working = false; 4673 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4674 4675 amdgpu_reset_fini(adev); 4676 4677 /* free i2c buses */ 4678 if (!amdgpu_device_has_dc_support(adev)) 4679 amdgpu_i2c_fini(adev); 4680 4681 if (amdgpu_emu_mode != 1) 4682 amdgpu_atombios_fini(adev); 4683 4684 kfree(adev->bios); 4685 adev->bios = NULL; 4686 4687 kfree(adev->fru_info); 4688 adev->fru_info = NULL; 4689 4690 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4691 4692 if (px || (!dev_is_removable(&adev->pdev->dev) && 4693 apple_gmux_detect(NULL, NULL))) 4694 vga_switcheroo_unregister_client(adev->pdev); 4695 4696 if (px) 4697 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4698 4699 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4700 vga_client_unregister(adev->pdev); 4701 4702 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4703 4704 iounmap(adev->rmmio); 4705 adev->rmmio = NULL; 4706 amdgpu_doorbell_fini(adev); 4707 drm_dev_exit(idx); 4708 } 4709 4710 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4711 amdgpu_pmu_fini(adev); 4712 if (adev->mman.discovery_bin) 4713 amdgpu_discovery_fini(adev); 4714 4715 amdgpu_reset_put_reset_domain(adev->reset_domain); 4716 adev->reset_domain = NULL; 4717 4718 kfree(adev->pci_state); 4719 4720 } 4721 4722 /** 4723 * amdgpu_device_evict_resources - evict device resources 4724 * @adev: amdgpu device object 4725 * 4726 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4727 * of the vram memory type. Mainly used for evicting device resources 4728 * at suspend time. 4729 * 4730 */ 4731 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4732 { 4733 int ret; 4734 4735 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4736 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4737 return 0; 4738 4739 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4740 if (ret) 4741 DRM_WARN("evicting device resources failed\n"); 4742 return ret; 4743 } 4744 4745 /* 4746 * Suspend & resume. 4747 */ 4748 /** 4749 * amdgpu_device_prepare - prepare for device suspend 4750 * 4751 * @dev: drm dev pointer 4752 * 4753 * Prepare to put the hw in the suspend state (all asics). 4754 * Returns 0 for success or an error on failure. 4755 * Called at driver suspend. 4756 */ 4757 int amdgpu_device_prepare(struct drm_device *dev) 4758 { 4759 struct amdgpu_device *adev = drm_to_adev(dev); 4760 int i, r; 4761 4762 amdgpu_choose_low_power_state(adev); 4763 4764 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4765 return 0; 4766 4767 /* Evict the majority of BOs before starting suspend sequence */ 4768 r = amdgpu_device_evict_resources(adev); 4769 if (r) 4770 goto unprepare; 4771 4772 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 4773 4774 for (i = 0; i < adev->num_ip_blocks; i++) { 4775 if (!adev->ip_blocks[i].status.valid) 4776 continue; 4777 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 4778 continue; 4779 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]); 4780 if (r) 4781 goto unprepare; 4782 } 4783 4784 return 0; 4785 4786 unprepare: 4787 adev->in_s0ix = adev->in_s3 = false; 4788 4789 return r; 4790 } 4791 4792 /** 4793 * amdgpu_device_suspend - initiate device suspend 4794 * 4795 * @dev: drm dev pointer 4796 * @notify_clients: notify in-kernel DRM clients 4797 * 4798 * Puts the hw in the suspend state (all asics). 4799 * Returns 0 for success or an error on failure. 4800 * Called at driver suspend. 4801 */ 4802 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) 4803 { 4804 struct amdgpu_device *adev = drm_to_adev(dev); 4805 int r = 0; 4806 4807 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4808 return 0; 4809 4810 adev->in_suspend = true; 4811 4812 if (amdgpu_sriov_vf(adev)) { 4813 amdgpu_virt_fini_data_exchange(adev); 4814 r = amdgpu_virt_request_full_gpu(adev, false); 4815 if (r) 4816 return r; 4817 } 4818 4819 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4820 DRM_WARN("smart shift update failed\n"); 4821 4822 if (notify_clients) 4823 drm_client_dev_suspend(adev_to_drm(adev), false); 4824 4825 cancel_delayed_work_sync(&adev->delayed_init_work); 4826 4827 amdgpu_ras_suspend(adev); 4828 4829 amdgpu_device_ip_suspend_phase1(adev); 4830 4831 if (!adev->in_s0ix) 4832 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4833 4834 r = amdgpu_device_evict_resources(adev); 4835 if (r) 4836 return r; 4837 4838 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4839 4840 amdgpu_fence_driver_hw_fini(adev); 4841 4842 amdgpu_device_ip_suspend_phase2(adev); 4843 4844 if (amdgpu_sriov_vf(adev)) 4845 amdgpu_virt_release_full_gpu(adev, false); 4846 4847 r = amdgpu_dpm_notify_rlc_state(adev, false); 4848 if (r) 4849 return r; 4850 4851 return 0; 4852 } 4853 4854 /** 4855 * amdgpu_device_resume - initiate device resume 4856 * 4857 * @dev: drm dev pointer 4858 * @notify_clients: notify in-kernel DRM clients 4859 * 4860 * Bring the hw back to operating state (all asics). 4861 * Returns 0 for success or an error on failure. 4862 * Called at driver resume. 4863 */ 4864 int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) 4865 { 4866 struct amdgpu_device *adev = drm_to_adev(dev); 4867 int r = 0; 4868 4869 if (amdgpu_sriov_vf(adev)) { 4870 r = amdgpu_virt_request_full_gpu(adev, true); 4871 if (r) 4872 return r; 4873 } 4874 4875 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4876 return 0; 4877 4878 if (adev->in_s0ix) 4879 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4880 4881 /* post card */ 4882 if (amdgpu_device_need_post(adev)) { 4883 r = amdgpu_device_asic_init(adev); 4884 if (r) 4885 dev_err(adev->dev, "amdgpu asic init failed\n"); 4886 } 4887 4888 r = amdgpu_device_ip_resume(adev); 4889 4890 if (r) { 4891 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4892 goto exit; 4893 } 4894 amdgpu_fence_driver_hw_init(adev); 4895 4896 if (!adev->in_s0ix) { 4897 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4898 if (r) 4899 goto exit; 4900 } 4901 4902 r = amdgpu_device_ip_late_init(adev); 4903 if (r) 4904 goto exit; 4905 4906 queue_delayed_work(system_wq, &adev->delayed_init_work, 4907 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4908 exit: 4909 if (amdgpu_sriov_vf(adev)) { 4910 amdgpu_virt_init_data_exchange(adev); 4911 amdgpu_virt_release_full_gpu(adev, true); 4912 } 4913 4914 if (r) 4915 return r; 4916 4917 /* Make sure IB tests flushed */ 4918 flush_delayed_work(&adev->delayed_init_work); 4919 4920 if (notify_clients) 4921 drm_client_dev_resume(adev_to_drm(adev), false); 4922 4923 amdgpu_ras_resume(adev); 4924 4925 if (adev->mode_info.num_crtc) { 4926 /* 4927 * Most of the connector probing functions try to acquire runtime pm 4928 * refs to ensure that the GPU is powered on when connector polling is 4929 * performed. Since we're calling this from a runtime PM callback, 4930 * trying to acquire rpm refs will cause us to deadlock. 4931 * 4932 * Since we're guaranteed to be holding the rpm lock, it's safe to 4933 * temporarily disable the rpm helpers so this doesn't deadlock us. 4934 */ 4935 #ifdef CONFIG_PM 4936 dev->dev->power.disable_depth++; 4937 #endif 4938 if (!adev->dc_enabled) 4939 drm_helper_hpd_irq_event(dev); 4940 else 4941 drm_kms_helper_hotplug_event(dev); 4942 #ifdef CONFIG_PM 4943 dev->dev->power.disable_depth--; 4944 #endif 4945 } 4946 adev->in_suspend = false; 4947 4948 if (adev->enable_mes) 4949 amdgpu_mes_self_test(adev); 4950 4951 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4952 DRM_WARN("smart shift update failed\n"); 4953 4954 return 0; 4955 } 4956 4957 /** 4958 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4959 * 4960 * @adev: amdgpu_device pointer 4961 * 4962 * The list of all the hardware IPs that make up the asic is walked and 4963 * the check_soft_reset callbacks are run. check_soft_reset determines 4964 * if the asic is still hung or not. 4965 * Returns true if any of the IPs are still in a hung state, false if not. 4966 */ 4967 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4968 { 4969 int i; 4970 bool asic_hang = false; 4971 4972 if (amdgpu_sriov_vf(adev)) 4973 return true; 4974 4975 if (amdgpu_asic_need_full_reset(adev)) 4976 return true; 4977 4978 for (i = 0; i < adev->num_ip_blocks; i++) { 4979 if (!adev->ip_blocks[i].status.valid) 4980 continue; 4981 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4982 adev->ip_blocks[i].status.hang = 4983 adev->ip_blocks[i].version->funcs->check_soft_reset( 4984 &adev->ip_blocks[i]); 4985 if (adev->ip_blocks[i].status.hang) { 4986 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4987 asic_hang = true; 4988 } 4989 } 4990 return asic_hang; 4991 } 4992 4993 /** 4994 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4995 * 4996 * @adev: amdgpu_device pointer 4997 * 4998 * The list of all the hardware IPs that make up the asic is walked and the 4999 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 5000 * handles any IP specific hardware or software state changes that are 5001 * necessary for a soft reset to succeed. 5002 * Returns 0 on success, negative error code on failure. 5003 */ 5004 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 5005 { 5006 int i, r = 0; 5007 5008 for (i = 0; i < adev->num_ip_blocks; i++) { 5009 if (!adev->ip_blocks[i].status.valid) 5010 continue; 5011 if (adev->ip_blocks[i].status.hang && 5012 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 5013 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]); 5014 if (r) 5015 return r; 5016 } 5017 } 5018 5019 return 0; 5020 } 5021 5022 /** 5023 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 5024 * 5025 * @adev: amdgpu_device pointer 5026 * 5027 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 5028 * reset is necessary to recover. 5029 * Returns true if a full asic reset is required, false if not. 5030 */ 5031 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 5032 { 5033 int i; 5034 5035 if (amdgpu_asic_need_full_reset(adev)) 5036 return true; 5037 5038 for (i = 0; i < adev->num_ip_blocks; i++) { 5039 if (!adev->ip_blocks[i].status.valid) 5040 continue; 5041 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 5042 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 5043 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 5044 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 5045 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 5046 if (adev->ip_blocks[i].status.hang) { 5047 dev_info(adev->dev, "Some block need full reset!\n"); 5048 return true; 5049 } 5050 } 5051 } 5052 return false; 5053 } 5054 5055 /** 5056 * amdgpu_device_ip_soft_reset - do a soft reset 5057 * 5058 * @adev: amdgpu_device pointer 5059 * 5060 * The list of all the hardware IPs that make up the asic is walked and the 5061 * soft_reset callbacks are run if the block is hung. soft_reset handles any 5062 * IP specific hardware or software state changes that are necessary to soft 5063 * reset the IP. 5064 * Returns 0 on success, negative error code on failure. 5065 */ 5066 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 5067 { 5068 int i, r = 0; 5069 5070 for (i = 0; i < adev->num_ip_blocks; i++) { 5071 if (!adev->ip_blocks[i].status.valid) 5072 continue; 5073 if (adev->ip_blocks[i].status.hang && 5074 adev->ip_blocks[i].version->funcs->soft_reset) { 5075 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]); 5076 if (r) 5077 return r; 5078 } 5079 } 5080 5081 return 0; 5082 } 5083 5084 /** 5085 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 5086 * 5087 * @adev: amdgpu_device pointer 5088 * 5089 * The list of all the hardware IPs that make up the asic is walked and the 5090 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 5091 * handles any IP specific hardware or software state changes that are 5092 * necessary after the IP has been soft reset. 5093 * Returns 0 on success, negative error code on failure. 5094 */ 5095 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 5096 { 5097 int i, r = 0; 5098 5099 for (i = 0; i < adev->num_ip_blocks; i++) { 5100 if (!adev->ip_blocks[i].status.valid) 5101 continue; 5102 if (adev->ip_blocks[i].status.hang && 5103 adev->ip_blocks[i].version->funcs->post_soft_reset) 5104 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]); 5105 if (r) 5106 return r; 5107 } 5108 5109 return 0; 5110 } 5111 5112 /** 5113 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 5114 * 5115 * @adev: amdgpu_device pointer 5116 * @reset_context: amdgpu reset context pointer 5117 * 5118 * do VF FLR and reinitialize Asic 5119 * return 0 means succeeded otherwise failed 5120 */ 5121 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 5122 struct amdgpu_reset_context *reset_context) 5123 { 5124 int r; 5125 struct amdgpu_hive_info *hive = NULL; 5126 5127 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { 5128 if (!amdgpu_ras_get_fed_status(adev)) 5129 amdgpu_virt_ready_to_reset(adev); 5130 amdgpu_virt_wait_reset(adev); 5131 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5132 r = amdgpu_virt_request_full_gpu(adev, true); 5133 } else { 5134 r = amdgpu_virt_reset_gpu(adev); 5135 } 5136 if (r) 5137 return r; 5138 5139 amdgpu_ras_set_fed(adev, false); 5140 amdgpu_irq_gpu_reset_resume_helper(adev); 5141 5142 /* some sw clean up VF needs to do before recover */ 5143 amdgpu_virt_post_reset(adev); 5144 5145 /* Resume IP prior to SMC */ 5146 r = amdgpu_device_ip_reinit_early_sriov(adev); 5147 if (r) 5148 return r; 5149 5150 amdgpu_virt_init_data_exchange(adev); 5151 5152 r = amdgpu_device_fw_loading(adev); 5153 if (r) 5154 return r; 5155 5156 /* now we are okay to resume SMC/CP/SDMA */ 5157 r = amdgpu_device_ip_reinit_late_sriov(adev); 5158 if (r) 5159 return r; 5160 5161 hive = amdgpu_get_xgmi_hive(adev); 5162 /* Update PSP FW topology after reset */ 5163 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5164 r = amdgpu_xgmi_update_topology(hive, adev); 5165 if (hive) 5166 amdgpu_put_xgmi_hive(hive); 5167 if (r) 5168 return r; 5169 5170 r = amdgpu_ib_ring_tests(adev); 5171 if (r) 5172 return r; 5173 5174 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) 5175 amdgpu_inc_vram_lost(adev); 5176 5177 /* need to be called during full access so we can't do it later like 5178 * bare-metal does. 5179 */ 5180 amdgpu_amdkfd_post_reset(adev); 5181 amdgpu_virt_release_full_gpu(adev, true); 5182 5183 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5184 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) || 5185 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 5186 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 5187 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5188 amdgpu_ras_resume(adev); 5189 return 0; 5190 } 5191 5192 /** 5193 * amdgpu_device_has_job_running - check if there is any job in mirror list 5194 * 5195 * @adev: amdgpu_device pointer 5196 * 5197 * check if there is any job in mirror list 5198 */ 5199 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5200 { 5201 int i; 5202 struct drm_sched_job *job; 5203 5204 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5205 struct amdgpu_ring *ring = adev->rings[i]; 5206 5207 if (!amdgpu_ring_sched_ready(ring)) 5208 continue; 5209 5210 spin_lock(&ring->sched.job_list_lock); 5211 job = list_first_entry_or_null(&ring->sched.pending_list, 5212 struct drm_sched_job, list); 5213 spin_unlock(&ring->sched.job_list_lock); 5214 if (job) 5215 return true; 5216 } 5217 return false; 5218 } 5219 5220 /** 5221 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5222 * 5223 * @adev: amdgpu_device pointer 5224 * 5225 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5226 * a hung GPU. 5227 */ 5228 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5229 { 5230 5231 if (amdgpu_gpu_recovery == 0) 5232 goto disabled; 5233 5234 /* Skip soft reset check in fatal error mode */ 5235 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5236 return true; 5237 5238 if (amdgpu_sriov_vf(adev)) 5239 return true; 5240 5241 if (amdgpu_gpu_recovery == -1) { 5242 switch (adev->asic_type) { 5243 #ifdef CONFIG_DRM_AMDGPU_SI 5244 case CHIP_VERDE: 5245 case CHIP_TAHITI: 5246 case CHIP_PITCAIRN: 5247 case CHIP_OLAND: 5248 case CHIP_HAINAN: 5249 #endif 5250 #ifdef CONFIG_DRM_AMDGPU_CIK 5251 case CHIP_KAVERI: 5252 case CHIP_KABINI: 5253 case CHIP_MULLINS: 5254 #endif 5255 case CHIP_CARRIZO: 5256 case CHIP_STONEY: 5257 case CHIP_CYAN_SKILLFISH: 5258 goto disabled; 5259 default: 5260 break; 5261 } 5262 } 5263 5264 return true; 5265 5266 disabled: 5267 dev_info(adev->dev, "GPU recovery disabled.\n"); 5268 return false; 5269 } 5270 5271 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5272 { 5273 u32 i; 5274 int ret = 0; 5275 5276 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5277 5278 dev_info(adev->dev, "GPU mode1 reset\n"); 5279 5280 /* Cache the state before bus master disable. The saved config space 5281 * values are used in other cases like restore after mode-2 reset. 5282 */ 5283 amdgpu_device_cache_pci_state(adev->pdev); 5284 5285 /* disable BM */ 5286 pci_clear_master(adev->pdev); 5287 5288 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5289 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5290 ret = amdgpu_dpm_mode1_reset(adev); 5291 } else { 5292 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5293 ret = psp_gpu_reset(adev); 5294 } 5295 5296 if (ret) 5297 goto mode1_reset_failed; 5298 5299 amdgpu_device_load_pci_state(adev->pdev); 5300 ret = amdgpu_psp_wait_for_bootloader(adev); 5301 if (ret) 5302 goto mode1_reset_failed; 5303 5304 /* wait for asic to come out of reset */ 5305 for (i = 0; i < adev->usec_timeout; i++) { 5306 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5307 5308 if (memsize != 0xffffffff) 5309 break; 5310 udelay(1); 5311 } 5312 5313 if (i >= adev->usec_timeout) { 5314 ret = -ETIMEDOUT; 5315 goto mode1_reset_failed; 5316 } 5317 5318 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5319 5320 return 0; 5321 5322 mode1_reset_failed: 5323 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5324 return ret; 5325 } 5326 5327 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5328 struct amdgpu_reset_context *reset_context) 5329 { 5330 int i, r = 0; 5331 struct amdgpu_job *job = NULL; 5332 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; 5333 bool need_full_reset = 5334 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5335 5336 if (reset_context->reset_req_dev == adev) 5337 job = reset_context->job; 5338 5339 if (amdgpu_sriov_vf(adev)) 5340 amdgpu_virt_pre_reset(adev); 5341 5342 amdgpu_fence_driver_isr_toggle(adev, true); 5343 5344 /* block all schedulers and reset given job's ring */ 5345 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5346 struct amdgpu_ring *ring = adev->rings[i]; 5347 5348 if (!amdgpu_ring_sched_ready(ring)) 5349 continue; 5350 5351 /* Clear job fence from fence drv to avoid force_completion 5352 * leave NULL and vm flush fence in fence drv 5353 */ 5354 amdgpu_fence_driver_clear_job_fences(ring); 5355 5356 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5357 amdgpu_fence_driver_force_completion(ring); 5358 } 5359 5360 amdgpu_fence_driver_isr_toggle(adev, false); 5361 5362 if (job && job->vm) 5363 drm_sched_increase_karma(&job->base); 5364 5365 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5366 /* If reset handler not implemented, continue; otherwise return */ 5367 if (r == -EOPNOTSUPP) 5368 r = 0; 5369 else 5370 return r; 5371 5372 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5373 if (!amdgpu_sriov_vf(adev)) { 5374 5375 if (!need_full_reset) 5376 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5377 5378 if (!need_full_reset && amdgpu_gpu_recovery && 5379 amdgpu_device_ip_check_soft_reset(adev)) { 5380 amdgpu_device_ip_pre_soft_reset(adev); 5381 r = amdgpu_device_ip_soft_reset(adev); 5382 amdgpu_device_ip_post_soft_reset(adev); 5383 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5384 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5385 need_full_reset = true; 5386 } 5387 } 5388 5389 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { 5390 dev_info(tmp_adev->dev, "Dumping IP State\n"); 5391 /* Trigger ip dump before we reset the asic */ 5392 for (i = 0; i < tmp_adev->num_ip_blocks; i++) 5393 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) 5394 tmp_adev->ip_blocks[i].version->funcs 5395 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]); 5396 dev_info(tmp_adev->dev, "Dumping IP State Completed\n"); 5397 } 5398 5399 if (need_full_reset) 5400 r = amdgpu_device_ip_suspend(adev); 5401 if (need_full_reset) 5402 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5403 else 5404 clear_bit(AMDGPU_NEED_FULL_RESET, 5405 &reset_context->flags); 5406 } 5407 5408 return r; 5409 } 5410 5411 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) 5412 { 5413 struct list_head *device_list_handle; 5414 bool full_reset, vram_lost = false; 5415 struct amdgpu_device *tmp_adev; 5416 int r; 5417 5418 device_list_handle = reset_context->reset_device_list; 5419 5420 if (!device_list_handle) 5421 return -EINVAL; 5422 5423 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5424 5425 r = 0; 5426 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5427 /* After reset, it's default init level */ 5428 amdgpu_set_init_level(tmp_adev, AMDGPU_INIT_LEVEL_DEFAULT); 5429 if (full_reset) { 5430 /* post card */ 5431 amdgpu_ras_set_fed(tmp_adev, false); 5432 r = amdgpu_device_asic_init(tmp_adev); 5433 if (r) { 5434 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5435 } else { 5436 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5437 5438 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5439 if (r) 5440 goto out; 5441 5442 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5443 5444 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 5445 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job); 5446 5447 if (vram_lost) { 5448 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5449 amdgpu_inc_vram_lost(tmp_adev); 5450 } 5451 5452 r = amdgpu_device_fw_loading(tmp_adev); 5453 if (r) 5454 return r; 5455 5456 r = amdgpu_xcp_restore_partition_mode( 5457 tmp_adev->xcp_mgr); 5458 if (r) 5459 goto out; 5460 5461 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5462 if (r) 5463 goto out; 5464 5465 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 5466 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5467 5468 if (vram_lost) 5469 amdgpu_device_fill_reset_magic(tmp_adev); 5470 5471 /* 5472 * Add this ASIC as tracked as reset was already 5473 * complete successfully. 5474 */ 5475 amdgpu_register_gpu_instance(tmp_adev); 5476 5477 if (!reset_context->hive && 5478 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5479 amdgpu_xgmi_add_device(tmp_adev); 5480 5481 r = amdgpu_device_ip_late_init(tmp_adev); 5482 if (r) 5483 goto out; 5484 5485 drm_client_dev_resume(adev_to_drm(tmp_adev), false); 5486 5487 /* 5488 * The GPU enters bad state once faulty pages 5489 * by ECC has reached the threshold, and ras 5490 * recovery is scheduled next. So add one check 5491 * here to break recovery if it indeed exceeds 5492 * bad page threshold, and remind user to 5493 * retire this GPU or setting one bigger 5494 * bad_page_threshold value to fix this once 5495 * probing driver again. 5496 */ 5497 if (!amdgpu_ras_is_rma(tmp_adev)) { 5498 /* must succeed. */ 5499 amdgpu_ras_resume(tmp_adev); 5500 } else { 5501 r = -EINVAL; 5502 goto out; 5503 } 5504 5505 /* Update PSP FW topology after reset */ 5506 if (reset_context->hive && 5507 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5508 r = amdgpu_xgmi_update_topology( 5509 reset_context->hive, tmp_adev); 5510 } 5511 } 5512 5513 out: 5514 if (!r) { 5515 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5516 r = amdgpu_ib_ring_tests(tmp_adev); 5517 if (r) { 5518 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5519 r = -EAGAIN; 5520 goto end; 5521 } 5522 } 5523 5524 if (r) 5525 tmp_adev->asic_reset_res = r; 5526 } 5527 5528 end: 5529 return r; 5530 } 5531 5532 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5533 struct amdgpu_reset_context *reset_context) 5534 { 5535 struct amdgpu_device *tmp_adev = NULL; 5536 bool need_full_reset, skip_hw_reset; 5537 int r = 0; 5538 5539 /* Try reset handler method first */ 5540 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5541 reset_list); 5542 5543 reset_context->reset_device_list = device_list_handle; 5544 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5545 /* If reset handler not implemented, continue; otherwise return */ 5546 if (r == -EOPNOTSUPP) 5547 r = 0; 5548 else 5549 return r; 5550 5551 /* Reset handler not implemented, use the default method */ 5552 need_full_reset = 5553 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5554 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5555 5556 /* 5557 * ASIC reset has to be done on all XGMI hive nodes ASAP 5558 * to allow proper links negotiation in FW (within 1 sec) 5559 */ 5560 if (!skip_hw_reset && need_full_reset) { 5561 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5562 /* For XGMI run all resets in parallel to speed up the process */ 5563 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5564 if (!queue_work(system_unbound_wq, 5565 &tmp_adev->xgmi_reset_work)) 5566 r = -EALREADY; 5567 } else 5568 r = amdgpu_asic_reset(tmp_adev); 5569 5570 if (r) { 5571 dev_err(tmp_adev->dev, 5572 "ASIC reset failed with error, %d for drm dev, %s", 5573 r, adev_to_drm(tmp_adev)->unique); 5574 goto out; 5575 } 5576 } 5577 5578 /* For XGMI wait for all resets to complete before proceed */ 5579 if (!r) { 5580 list_for_each_entry(tmp_adev, device_list_handle, 5581 reset_list) { 5582 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5583 flush_work(&tmp_adev->xgmi_reset_work); 5584 r = tmp_adev->asic_reset_res; 5585 if (r) 5586 break; 5587 } 5588 } 5589 } 5590 } 5591 5592 if (!r && amdgpu_ras_intr_triggered()) { 5593 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5594 amdgpu_ras_reset_error_count(tmp_adev, 5595 AMDGPU_RAS_BLOCK__MMHUB); 5596 } 5597 5598 amdgpu_ras_intr_cleared(); 5599 } 5600 5601 r = amdgpu_device_reinit_after_reset(reset_context); 5602 if (r == -EAGAIN) 5603 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5604 else 5605 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5606 5607 out: 5608 return r; 5609 } 5610 5611 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5612 { 5613 5614 switch (amdgpu_asic_reset_method(adev)) { 5615 case AMD_RESET_METHOD_MODE1: 5616 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5617 break; 5618 case AMD_RESET_METHOD_MODE2: 5619 adev->mp1_state = PP_MP1_STATE_RESET; 5620 break; 5621 default: 5622 adev->mp1_state = PP_MP1_STATE_NONE; 5623 break; 5624 } 5625 } 5626 5627 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5628 { 5629 amdgpu_vf_error_trans_all(adev); 5630 adev->mp1_state = PP_MP1_STATE_NONE; 5631 } 5632 5633 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5634 { 5635 struct pci_dev *p = NULL; 5636 5637 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5638 adev->pdev->bus->number, 1); 5639 if (p) { 5640 pm_runtime_enable(&(p->dev)); 5641 pm_runtime_resume(&(p->dev)); 5642 } 5643 5644 pci_dev_put(p); 5645 } 5646 5647 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5648 { 5649 enum amd_reset_method reset_method; 5650 struct pci_dev *p = NULL; 5651 u64 expires; 5652 5653 /* 5654 * For now, only BACO and mode1 reset are confirmed 5655 * to suffer the audio issue without proper suspended. 5656 */ 5657 reset_method = amdgpu_asic_reset_method(adev); 5658 if ((reset_method != AMD_RESET_METHOD_BACO) && 5659 (reset_method != AMD_RESET_METHOD_MODE1)) 5660 return -EINVAL; 5661 5662 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5663 adev->pdev->bus->number, 1); 5664 if (!p) 5665 return -ENODEV; 5666 5667 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5668 if (!expires) 5669 /* 5670 * If we cannot get the audio device autosuspend delay, 5671 * a fixed 4S interval will be used. Considering 3S is 5672 * the audio controller default autosuspend delay setting. 5673 * 4S used here is guaranteed to cover that. 5674 */ 5675 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5676 5677 while (!pm_runtime_status_suspended(&(p->dev))) { 5678 if (!pm_runtime_suspend(&(p->dev))) 5679 break; 5680 5681 if (expires < ktime_get_mono_fast_ns()) { 5682 dev_warn(adev->dev, "failed to suspend display audio\n"); 5683 pci_dev_put(p); 5684 /* TODO: abort the succeeding gpu reset? */ 5685 return -ETIMEDOUT; 5686 } 5687 } 5688 5689 pm_runtime_disable(&(p->dev)); 5690 5691 pci_dev_put(p); 5692 return 0; 5693 } 5694 5695 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5696 { 5697 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5698 5699 #if defined(CONFIG_DEBUG_FS) 5700 if (!amdgpu_sriov_vf(adev)) 5701 cancel_work(&adev->reset_work); 5702 #endif 5703 5704 if (adev->kfd.dev) 5705 cancel_work(&adev->kfd.reset_work); 5706 5707 if (amdgpu_sriov_vf(adev)) 5708 cancel_work(&adev->virt.flr_work); 5709 5710 if (con && adev->ras_enabled) 5711 cancel_work(&con->recovery_work); 5712 5713 } 5714 5715 static int amdgpu_device_health_check(struct list_head *device_list_handle) 5716 { 5717 struct amdgpu_device *tmp_adev; 5718 int ret = 0; 5719 u32 status; 5720 5721 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5722 pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status); 5723 if (PCI_POSSIBLE_ERROR(status)) { 5724 dev_err(tmp_adev->dev, "device lost from bus!"); 5725 ret = -ENODEV; 5726 } 5727 } 5728 5729 return ret; 5730 } 5731 5732 /** 5733 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5734 * 5735 * @adev: amdgpu_device pointer 5736 * @job: which job trigger hang 5737 * @reset_context: amdgpu reset context pointer 5738 * 5739 * Attempt to reset the GPU if it has hung (all asics). 5740 * Attempt to do soft-reset or full-reset and reinitialize Asic 5741 * Returns 0 for success or an error on failure. 5742 */ 5743 5744 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5745 struct amdgpu_job *job, 5746 struct amdgpu_reset_context *reset_context) 5747 { 5748 struct list_head device_list, *device_list_handle = NULL; 5749 bool job_signaled = false; 5750 struct amdgpu_hive_info *hive = NULL; 5751 struct amdgpu_device *tmp_adev = NULL; 5752 int i, r = 0; 5753 bool need_emergency_restart = false; 5754 bool audio_suspended = false; 5755 int retry_limit = AMDGPU_MAX_RETRY_LIMIT; 5756 5757 /* 5758 * Special case: RAS triggered and full reset isn't supported 5759 */ 5760 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5761 5762 /* 5763 * Flush RAM to disk so that after reboot 5764 * the user can read log and see why the system rebooted. 5765 */ 5766 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 5767 amdgpu_ras_get_context(adev)->reboot) { 5768 DRM_WARN("Emergency reboot."); 5769 5770 ksys_sync_helper(); 5771 emergency_restart(); 5772 } 5773 5774 dev_info(adev->dev, "GPU %s begin!\n", 5775 need_emergency_restart ? "jobs stop":"reset"); 5776 5777 if (!amdgpu_sriov_vf(adev)) 5778 hive = amdgpu_get_xgmi_hive(adev); 5779 if (hive) 5780 mutex_lock(&hive->hive_lock); 5781 5782 reset_context->job = job; 5783 reset_context->hive = hive; 5784 /* 5785 * Build list of devices to reset. 5786 * In case we are in XGMI hive mode, resort the device list 5787 * to put adev in the 1st position. 5788 */ 5789 INIT_LIST_HEAD(&device_list); 5790 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) { 5791 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5792 list_add_tail(&tmp_adev->reset_list, &device_list); 5793 if (adev->shutdown) 5794 tmp_adev->shutdown = true; 5795 } 5796 if (!list_is_first(&adev->reset_list, &device_list)) 5797 list_rotate_to_front(&adev->reset_list, &device_list); 5798 device_list_handle = &device_list; 5799 } else { 5800 list_add_tail(&adev->reset_list, &device_list); 5801 device_list_handle = &device_list; 5802 } 5803 5804 if (!amdgpu_sriov_vf(adev)) { 5805 r = amdgpu_device_health_check(device_list_handle); 5806 if (r) 5807 goto end_reset; 5808 } 5809 5810 /* We need to lock reset domain only once both for XGMI and single device */ 5811 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5812 reset_list); 5813 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5814 5815 /* block all schedulers and reset given job's ring */ 5816 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5817 5818 amdgpu_device_set_mp1_state(tmp_adev); 5819 5820 /* 5821 * Try to put the audio codec into suspend state 5822 * before gpu reset started. 5823 * 5824 * Due to the power domain of the graphics device 5825 * is shared with AZ power domain. Without this, 5826 * we may change the audio hardware from behind 5827 * the audio driver's back. That will trigger 5828 * some audio codec errors. 5829 */ 5830 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5831 audio_suspended = true; 5832 5833 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5834 5835 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5836 5837 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context); 5838 5839 /* 5840 * Mark these ASICs to be reseted as untracked first 5841 * And add them back after reset completed 5842 */ 5843 amdgpu_unregister_gpu_instance(tmp_adev); 5844 5845 drm_client_dev_suspend(adev_to_drm(tmp_adev), false); 5846 5847 /* disable ras on ALL IPs */ 5848 if (!need_emergency_restart && 5849 amdgpu_device_ip_need_full_reset(tmp_adev)) 5850 amdgpu_ras_suspend(tmp_adev); 5851 5852 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5853 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5854 5855 if (!amdgpu_ring_sched_ready(ring)) 5856 continue; 5857 5858 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5859 5860 if (need_emergency_restart) 5861 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5862 } 5863 atomic_inc(&tmp_adev->gpu_reset_counter); 5864 } 5865 5866 if (need_emergency_restart) 5867 goto skip_sched_resume; 5868 5869 /* 5870 * Must check guilty signal here since after this point all old 5871 * HW fences are force signaled. 5872 * 5873 * job->base holds a reference to parent fence 5874 */ 5875 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5876 job_signaled = true; 5877 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5878 goto skip_hw_reset; 5879 } 5880 5881 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5882 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5883 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5884 /*TODO Should we stop ?*/ 5885 if (r) { 5886 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5887 r, adev_to_drm(tmp_adev)->unique); 5888 tmp_adev->asic_reset_res = r; 5889 } 5890 } 5891 5892 /* Actual ASIC resets if needed.*/ 5893 /* Host driver will handle XGMI hive reset for SRIOV */ 5894 if (amdgpu_sriov_vf(adev)) { 5895 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { 5896 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); 5897 amdgpu_ras_set_fed(adev, true); 5898 set_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5899 } 5900 5901 r = amdgpu_device_reset_sriov(adev, reset_context); 5902 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) { 5903 amdgpu_virt_release_full_gpu(adev, true); 5904 goto retry; 5905 } 5906 if (r) 5907 adev->asic_reset_res = r; 5908 } else { 5909 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5910 if (r && r == -EAGAIN) 5911 goto retry; 5912 } 5913 5914 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5915 /* 5916 * Drop any pending non scheduler resets queued before reset is done. 5917 * Any reset scheduled after this point would be valid. Scheduler resets 5918 * were already dropped during drm_sched_stop and no new ones can come 5919 * in before drm_sched_start. 5920 */ 5921 amdgpu_device_stop_pending_resets(tmp_adev); 5922 } 5923 5924 skip_hw_reset: 5925 5926 /* Post ASIC reset for all devs .*/ 5927 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5928 5929 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5930 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5931 5932 if (!amdgpu_ring_sched_ready(ring)) 5933 continue; 5934 5935 drm_sched_start(&ring->sched, 0); 5936 } 5937 5938 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 5939 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5940 5941 if (tmp_adev->asic_reset_res) 5942 r = tmp_adev->asic_reset_res; 5943 5944 tmp_adev->asic_reset_res = 0; 5945 5946 if (r) { 5947 /* bad news, how to tell it to userspace ? 5948 * for ras error, we should report GPU bad status instead of 5949 * reset failure 5950 */ 5951 if (reset_context->src != AMDGPU_RESET_SRC_RAS || 5952 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) 5953 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", 5954 atomic_read(&tmp_adev->gpu_reset_counter)); 5955 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5956 } else { 5957 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5958 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5959 DRM_WARN("smart shift update failed\n"); 5960 } 5961 } 5962 5963 skip_sched_resume: 5964 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5965 /* unlock kfd: SRIOV would do it separately */ 5966 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5967 amdgpu_amdkfd_post_reset(tmp_adev); 5968 5969 /* kfd_post_reset will do nothing if kfd device is not initialized, 5970 * need to bring up kfd here if it's not be initialized before 5971 */ 5972 if (!adev->kfd.init_complete) 5973 amdgpu_amdkfd_device_init(adev); 5974 5975 if (audio_suspended) 5976 amdgpu_device_resume_display_audio(tmp_adev); 5977 5978 amdgpu_device_unset_mp1_state(tmp_adev); 5979 5980 amdgpu_ras_set_error_query_ready(tmp_adev, true); 5981 } 5982 5983 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5984 reset_list); 5985 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 5986 5987 end_reset: 5988 if (hive) { 5989 mutex_unlock(&hive->hive_lock); 5990 amdgpu_put_xgmi_hive(hive); 5991 } 5992 5993 if (r) 5994 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5995 5996 atomic_set(&adev->reset_domain->reset_res, r); 5997 return r; 5998 } 5999 6000 /** 6001 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner 6002 * 6003 * @adev: amdgpu_device pointer 6004 * @speed: pointer to the speed of the link 6005 * @width: pointer to the width of the link 6006 * 6007 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6008 * first physical partner to an AMD dGPU. 6009 * This will exclude any virtual switches and links. 6010 */ 6011 static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev, 6012 enum pci_bus_speed *speed, 6013 enum pcie_link_width *width) 6014 { 6015 struct pci_dev *parent = adev->pdev; 6016 6017 if (!speed || !width) 6018 return; 6019 6020 *speed = PCI_SPEED_UNKNOWN; 6021 *width = PCIE_LNK_WIDTH_UNKNOWN; 6022 6023 if (amdgpu_device_pcie_dynamic_switching_supported(adev)) { 6024 while ((parent = pci_upstream_bridge(parent))) { 6025 /* skip upstream/downstream switches internal to dGPU*/ 6026 if (parent->vendor == PCI_VENDOR_ID_ATI) 6027 continue; 6028 *speed = pcie_get_speed_cap(parent); 6029 *width = pcie_get_width_cap(parent); 6030 break; 6031 } 6032 } else { 6033 /* use the current speeds rather than max if switching is not supported */ 6034 pcie_bandwidth_available(adev->pdev, NULL, speed, width); 6035 } 6036 } 6037 6038 /** 6039 * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot 6040 * 6041 * @adev: amdgpu_device pointer 6042 * 6043 * Fetchs and stores in the driver the PCIE capabilities (gen speed 6044 * and lanes) of the slot the device is in. Handles APUs and 6045 * virtualized environments where PCIE config space may not be available. 6046 */ 6047 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 6048 { 6049 struct pci_dev *pdev; 6050 enum pci_bus_speed speed_cap, platform_speed_cap; 6051 enum pcie_link_width platform_link_width; 6052 6053 if (amdgpu_pcie_gen_cap) 6054 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 6055 6056 if (amdgpu_pcie_lane_cap) 6057 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 6058 6059 /* covers APUs as well */ 6060 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 6061 if (adev->pm.pcie_gen_mask == 0) 6062 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 6063 if (adev->pm.pcie_mlw_mask == 0) 6064 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 6065 return; 6066 } 6067 6068 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 6069 return; 6070 6071 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap, 6072 &platform_link_width); 6073 6074 if (adev->pm.pcie_gen_mask == 0) { 6075 /* asic caps */ 6076 pdev = adev->pdev; 6077 speed_cap = pcie_get_speed_cap(pdev); 6078 if (speed_cap == PCI_SPEED_UNKNOWN) { 6079 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6080 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6081 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6082 } else { 6083 if (speed_cap == PCIE_SPEED_32_0GT) 6084 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6085 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6086 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6087 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6088 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 6089 else if (speed_cap == PCIE_SPEED_16_0GT) 6090 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6091 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6092 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6093 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 6094 else if (speed_cap == PCIE_SPEED_8_0GT) 6095 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6096 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6097 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6098 else if (speed_cap == PCIE_SPEED_5_0GT) 6099 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6100 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 6101 else 6102 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 6103 } 6104 /* platform caps */ 6105 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 6106 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6107 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6108 } else { 6109 if (platform_speed_cap == PCIE_SPEED_32_0GT) 6110 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6111 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6112 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6113 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6114 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 6115 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 6116 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6117 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6118 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6119 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 6120 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 6121 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6122 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6123 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 6124 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 6125 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6126 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6127 else 6128 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 6129 6130 } 6131 } 6132 if (adev->pm.pcie_mlw_mask == 0) { 6133 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6134 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 6135 } else { 6136 switch (platform_link_width) { 6137 case PCIE_LNK_X32: 6138 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 6139 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6140 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6141 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6142 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6143 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6144 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6145 break; 6146 case PCIE_LNK_X16: 6147 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6148 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6149 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6150 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6151 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6152 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6153 break; 6154 case PCIE_LNK_X12: 6155 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6156 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6157 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6158 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6159 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6160 break; 6161 case PCIE_LNK_X8: 6162 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6163 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6164 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6165 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6166 break; 6167 case PCIE_LNK_X4: 6168 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6169 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6170 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6171 break; 6172 case PCIE_LNK_X2: 6173 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6174 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6175 break; 6176 case PCIE_LNK_X1: 6177 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 6178 break; 6179 default: 6180 break; 6181 } 6182 } 6183 } 6184 } 6185 6186 /** 6187 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 6188 * 6189 * @adev: amdgpu_device pointer 6190 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 6191 * 6192 * Return true if @peer_adev can access (DMA) @adev through the PCIe 6193 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 6194 * @peer_adev. 6195 */ 6196 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 6197 struct amdgpu_device *peer_adev) 6198 { 6199 #ifdef CONFIG_HSA_AMD_P2P 6200 bool p2p_access = 6201 !adev->gmc.xgmi.connected_to_cpu && 6202 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 6203 if (!p2p_access) 6204 dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n", 6205 pci_name(peer_adev->pdev)); 6206 6207 bool is_large_bar = adev->gmc.visible_vram_size && 6208 adev->gmc.real_vram_size == adev->gmc.visible_vram_size; 6209 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev); 6210 6211 if (!p2p_addressable) { 6212 uint64_t address_mask = peer_adev->dev->dma_mask ? 6213 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 6214 resource_size_t aper_limit = 6215 adev->gmc.aper_base + adev->gmc.aper_size - 1; 6216 6217 p2p_addressable = !(adev->gmc.aper_base & address_mask || 6218 aper_limit & address_mask); 6219 } 6220 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable; 6221 #else 6222 return false; 6223 #endif 6224 } 6225 6226 int amdgpu_device_baco_enter(struct drm_device *dev) 6227 { 6228 struct amdgpu_device *adev = drm_to_adev(dev); 6229 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6230 6231 if (!amdgpu_device_supports_baco(dev)) 6232 return -ENOTSUPP; 6233 6234 if (ras && adev->ras_enabled && 6235 adev->nbio.funcs->enable_doorbell_interrupt) 6236 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 6237 6238 return amdgpu_dpm_baco_enter(adev); 6239 } 6240 6241 int amdgpu_device_baco_exit(struct drm_device *dev) 6242 { 6243 struct amdgpu_device *adev = drm_to_adev(dev); 6244 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6245 int ret = 0; 6246 6247 if (!amdgpu_device_supports_baco(dev)) 6248 return -ENOTSUPP; 6249 6250 ret = amdgpu_dpm_baco_exit(adev); 6251 if (ret) 6252 return ret; 6253 6254 if (ras && adev->ras_enabled && 6255 adev->nbio.funcs->enable_doorbell_interrupt) 6256 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 6257 6258 if (amdgpu_passthrough(adev) && adev->nbio.funcs && 6259 adev->nbio.funcs->clear_doorbell_interrupt) 6260 adev->nbio.funcs->clear_doorbell_interrupt(adev); 6261 6262 return 0; 6263 } 6264 6265 /** 6266 * amdgpu_pci_error_detected - Called when a PCI error is detected. 6267 * @pdev: PCI device struct 6268 * @state: PCI channel state 6269 * 6270 * Description: Called when a PCI error is detected. 6271 * 6272 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 6273 */ 6274 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 6275 { 6276 struct drm_device *dev = pci_get_drvdata(pdev); 6277 struct amdgpu_device *adev = drm_to_adev(dev); 6278 int i; 6279 6280 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 6281 6282 if (adev->gmc.xgmi.num_physical_nodes > 1) { 6283 DRM_WARN("No support for XGMI hive yet..."); 6284 return PCI_ERS_RESULT_DISCONNECT; 6285 } 6286 6287 adev->pci_channel_state = state; 6288 6289 switch (state) { 6290 case pci_channel_io_normal: 6291 return PCI_ERS_RESULT_CAN_RECOVER; 6292 /* Fatal error, prepare for slot reset */ 6293 case pci_channel_io_frozen: 6294 /* 6295 * Locking adev->reset_domain->sem will prevent any external access 6296 * to GPU during PCI error recovery 6297 */ 6298 amdgpu_device_lock_reset_domain(adev->reset_domain); 6299 amdgpu_device_set_mp1_state(adev); 6300 6301 /* 6302 * Block any work scheduling as we do for regular GPU reset 6303 * for the duration of the recovery 6304 */ 6305 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6306 struct amdgpu_ring *ring = adev->rings[i]; 6307 6308 if (!amdgpu_ring_sched_ready(ring)) 6309 continue; 6310 6311 drm_sched_stop(&ring->sched, NULL); 6312 } 6313 atomic_inc(&adev->gpu_reset_counter); 6314 return PCI_ERS_RESULT_NEED_RESET; 6315 case pci_channel_io_perm_failure: 6316 /* Permanent error, prepare for device removal */ 6317 return PCI_ERS_RESULT_DISCONNECT; 6318 } 6319 6320 return PCI_ERS_RESULT_NEED_RESET; 6321 } 6322 6323 /** 6324 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 6325 * @pdev: pointer to PCI device 6326 */ 6327 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 6328 { 6329 6330 DRM_INFO("PCI error: mmio enabled callback!!\n"); 6331 6332 /* TODO - dump whatever for debugging purposes */ 6333 6334 /* This called only if amdgpu_pci_error_detected returns 6335 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 6336 * works, no need to reset slot. 6337 */ 6338 6339 return PCI_ERS_RESULT_RECOVERED; 6340 } 6341 6342 /** 6343 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 6344 * @pdev: PCI device struct 6345 * 6346 * Description: This routine is called by the pci error recovery 6347 * code after the PCI slot has been reset, just before we 6348 * should resume normal operations. 6349 */ 6350 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 6351 { 6352 struct drm_device *dev = pci_get_drvdata(pdev); 6353 struct amdgpu_device *adev = drm_to_adev(dev); 6354 int r, i; 6355 struct amdgpu_reset_context reset_context; 6356 u32 memsize; 6357 struct list_head device_list; 6358 6359 /* PCI error slot reset should be skipped During RAS recovery */ 6360 if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 6361 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && 6362 amdgpu_ras_in_recovery(adev)) 6363 return PCI_ERS_RESULT_RECOVERED; 6364 6365 DRM_INFO("PCI error: slot reset callback!!\n"); 6366 6367 memset(&reset_context, 0, sizeof(reset_context)); 6368 6369 INIT_LIST_HEAD(&device_list); 6370 list_add_tail(&adev->reset_list, &device_list); 6371 6372 /* wait for asic to come out of reset */ 6373 msleep(500); 6374 6375 /* Restore PCI confspace */ 6376 amdgpu_device_load_pci_state(pdev); 6377 6378 /* confirm ASIC came out of reset */ 6379 for (i = 0; i < adev->usec_timeout; i++) { 6380 memsize = amdgpu_asic_get_config_memsize(adev); 6381 6382 if (memsize != 0xffffffff) 6383 break; 6384 udelay(1); 6385 } 6386 if (memsize == 0xffffffff) { 6387 r = -ETIME; 6388 goto out; 6389 } 6390 6391 reset_context.method = AMD_RESET_METHOD_NONE; 6392 reset_context.reset_req_dev = adev; 6393 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 6394 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 6395 6396 adev->no_hw_access = true; 6397 r = amdgpu_device_pre_asic_reset(adev, &reset_context); 6398 adev->no_hw_access = false; 6399 if (r) 6400 goto out; 6401 6402 r = amdgpu_do_asic_reset(&device_list, &reset_context); 6403 6404 out: 6405 if (!r) { 6406 if (amdgpu_device_cache_pci_state(adev->pdev)) 6407 pci_restore_state(adev->pdev); 6408 6409 DRM_INFO("PCIe error recovery succeeded\n"); 6410 } else { 6411 DRM_ERROR("PCIe error recovery failed, err:%d", r); 6412 amdgpu_device_unset_mp1_state(adev); 6413 amdgpu_device_unlock_reset_domain(adev->reset_domain); 6414 } 6415 6416 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 6417 } 6418 6419 /** 6420 * amdgpu_pci_resume() - resume normal ops after PCI reset 6421 * @pdev: pointer to PCI device 6422 * 6423 * Called when the error recovery driver tells us that its 6424 * OK to resume normal operation. 6425 */ 6426 void amdgpu_pci_resume(struct pci_dev *pdev) 6427 { 6428 struct drm_device *dev = pci_get_drvdata(pdev); 6429 struct amdgpu_device *adev = drm_to_adev(dev); 6430 int i; 6431 6432 6433 DRM_INFO("PCI error: resume callback!!\n"); 6434 6435 /* Only continue execution for the case of pci_channel_io_frozen */ 6436 if (adev->pci_channel_state != pci_channel_io_frozen) 6437 return; 6438 6439 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6440 struct amdgpu_ring *ring = adev->rings[i]; 6441 6442 if (!amdgpu_ring_sched_ready(ring)) 6443 continue; 6444 6445 drm_sched_start(&ring->sched, 0); 6446 } 6447 6448 amdgpu_device_unset_mp1_state(adev); 6449 amdgpu_device_unlock_reset_domain(adev->reset_domain); 6450 } 6451 6452 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 6453 { 6454 struct drm_device *dev = pci_get_drvdata(pdev); 6455 struct amdgpu_device *adev = drm_to_adev(dev); 6456 int r; 6457 6458 if (amdgpu_sriov_vf(adev)) 6459 return false; 6460 6461 r = pci_save_state(pdev); 6462 if (!r) { 6463 kfree(adev->pci_state); 6464 6465 adev->pci_state = pci_store_saved_state(pdev); 6466 6467 if (!adev->pci_state) { 6468 DRM_ERROR("Failed to store PCI saved state"); 6469 return false; 6470 } 6471 } else { 6472 DRM_WARN("Failed to save PCI state, err:%d\n", r); 6473 return false; 6474 } 6475 6476 return true; 6477 } 6478 6479 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 6480 { 6481 struct drm_device *dev = pci_get_drvdata(pdev); 6482 struct amdgpu_device *adev = drm_to_adev(dev); 6483 int r; 6484 6485 if (!adev->pci_state) 6486 return false; 6487 6488 r = pci_load_saved_state(pdev, adev->pci_state); 6489 6490 if (!r) { 6491 pci_restore_state(pdev); 6492 } else { 6493 DRM_WARN("Failed to load PCI state, err:%d\n", r); 6494 return false; 6495 } 6496 6497 return true; 6498 } 6499 6500 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 6501 struct amdgpu_ring *ring) 6502 { 6503 #ifdef CONFIG_X86_64 6504 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 6505 return; 6506 #endif 6507 if (adev->gmc.xgmi.connected_to_cpu) 6508 return; 6509 6510 if (ring && ring->funcs->emit_hdp_flush) 6511 amdgpu_ring_emit_hdp_flush(ring); 6512 else 6513 amdgpu_asic_flush_hdp(adev, ring); 6514 } 6515 6516 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 6517 struct amdgpu_ring *ring) 6518 { 6519 #ifdef CONFIG_X86_64 6520 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 6521 return; 6522 #endif 6523 if (adev->gmc.xgmi.connected_to_cpu) 6524 return; 6525 6526 amdgpu_asic_invalidate_hdp(adev, ring); 6527 } 6528 6529 int amdgpu_in_reset(struct amdgpu_device *adev) 6530 { 6531 return atomic_read(&adev->reset_domain->in_gpu_reset); 6532 } 6533 6534 /** 6535 * amdgpu_device_halt() - bring hardware to some kind of halt state 6536 * 6537 * @adev: amdgpu_device pointer 6538 * 6539 * Bring hardware to some kind of halt state so that no one can touch it 6540 * any more. It will help to maintain error context when error occurred. 6541 * Compare to a simple hang, the system will keep stable at least for SSH 6542 * access. Then it should be trivial to inspect the hardware state and 6543 * see what's going on. Implemented as following: 6544 * 6545 * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc), 6546 * clears all CPU mappings to device, disallows remappings through page faults 6547 * 2. amdgpu_irq_disable_all() disables all interrupts 6548 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences 6549 * 4. set adev->no_hw_access to avoid potential crashes after setp 5 6550 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings 6551 * 6. pci_disable_device() and pci_wait_for_pending_transaction() 6552 * flush any in flight DMA operations 6553 */ 6554 void amdgpu_device_halt(struct amdgpu_device *adev) 6555 { 6556 struct pci_dev *pdev = adev->pdev; 6557 struct drm_device *ddev = adev_to_drm(adev); 6558 6559 amdgpu_xcp_dev_unplug(adev); 6560 drm_dev_unplug(ddev); 6561 6562 amdgpu_irq_disable_all(adev); 6563 6564 amdgpu_fence_driver_hw_fini(adev); 6565 6566 adev->no_hw_access = true; 6567 6568 amdgpu_device_unmap_mmio(adev); 6569 6570 pci_disable_device(pdev); 6571 pci_wait_for_pending_transaction(pdev); 6572 } 6573 6574 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev, 6575 u32 reg) 6576 { 6577 unsigned long flags, address, data; 6578 u32 r; 6579 6580 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 6581 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 6582 6583 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 6584 WREG32(address, reg * 4); 6585 (void)RREG32(address); 6586 r = RREG32(data); 6587 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 6588 return r; 6589 } 6590 6591 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev, 6592 u32 reg, u32 v) 6593 { 6594 unsigned long flags, address, data; 6595 6596 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 6597 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 6598 6599 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 6600 WREG32(address, reg * 4); 6601 (void)RREG32(address); 6602 WREG32(data, v); 6603 (void)RREG32(data); 6604 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 6605 } 6606 6607 /** 6608 * amdgpu_device_get_gang - return a reference to the current gang 6609 * @adev: amdgpu_device pointer 6610 * 6611 * Returns: A new reference to the current gang leader. 6612 */ 6613 struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev) 6614 { 6615 struct dma_fence *fence; 6616 6617 rcu_read_lock(); 6618 fence = dma_fence_get_rcu_safe(&adev->gang_submit); 6619 rcu_read_unlock(); 6620 return fence; 6621 } 6622 6623 /** 6624 * amdgpu_device_switch_gang - switch to a new gang 6625 * @adev: amdgpu_device pointer 6626 * @gang: the gang to switch to 6627 * 6628 * Try to switch to a new gang. 6629 * Returns: NULL if we switched to the new gang or a reference to the current 6630 * gang leader. 6631 */ 6632 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev, 6633 struct dma_fence *gang) 6634 { 6635 struct dma_fence *old = NULL; 6636 6637 do { 6638 dma_fence_put(old); 6639 old = amdgpu_device_get_gang(adev); 6640 if (old == gang) 6641 break; 6642 6643 if (!dma_fence_is_signaled(old)) 6644 return old; 6645 6646 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit, 6647 old, gang) != old); 6648 6649 dma_fence_put(old); 6650 return NULL; 6651 } 6652 6653 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev) 6654 { 6655 switch (adev->asic_type) { 6656 #ifdef CONFIG_DRM_AMDGPU_SI 6657 case CHIP_HAINAN: 6658 #endif 6659 case CHIP_TOPAZ: 6660 /* chips with no display hardware */ 6661 return false; 6662 #ifdef CONFIG_DRM_AMDGPU_SI 6663 case CHIP_TAHITI: 6664 case CHIP_PITCAIRN: 6665 case CHIP_VERDE: 6666 case CHIP_OLAND: 6667 #endif 6668 #ifdef CONFIG_DRM_AMDGPU_CIK 6669 case CHIP_BONAIRE: 6670 case CHIP_HAWAII: 6671 case CHIP_KAVERI: 6672 case CHIP_KABINI: 6673 case CHIP_MULLINS: 6674 #endif 6675 case CHIP_TONGA: 6676 case CHIP_FIJI: 6677 case CHIP_POLARIS10: 6678 case CHIP_POLARIS11: 6679 case CHIP_POLARIS12: 6680 case CHIP_VEGAM: 6681 case CHIP_CARRIZO: 6682 case CHIP_STONEY: 6683 /* chips with display hardware */ 6684 return true; 6685 default: 6686 /* IP discovery */ 6687 if (!amdgpu_ip_version(adev, DCE_HWIP, 0) || 6688 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 6689 return false; 6690 return true; 6691 } 6692 } 6693 6694 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev, 6695 uint32_t inst, uint32_t reg_addr, char reg_name[], 6696 uint32_t expected_value, uint32_t mask) 6697 { 6698 uint32_t ret = 0; 6699 uint32_t old_ = 0; 6700 uint32_t tmp_ = RREG32(reg_addr); 6701 uint32_t loop = adev->usec_timeout; 6702 6703 while ((tmp_ & (mask)) != (expected_value)) { 6704 if (old_ != tmp_) { 6705 loop = adev->usec_timeout; 6706 old_ = tmp_; 6707 } else 6708 udelay(1); 6709 tmp_ = RREG32(reg_addr); 6710 loop--; 6711 if (!loop) { 6712 DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08xn", 6713 inst, reg_name, (uint32_t)expected_value, 6714 (uint32_t)(tmp_ & (mask))); 6715 ret = -ETIMEDOUT; 6716 break; 6717 } 6718 } 6719 return ret; 6720 } 6721 6722 ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring) 6723 { 6724 ssize_t size = 0; 6725 6726 if (!ring || !ring->adev) 6727 return size; 6728 6729 if (amdgpu_device_should_recover_gpu(ring->adev)) 6730 size |= AMDGPU_RESET_TYPE_FULL; 6731 6732 if (unlikely(!ring->adev->debug_disable_soft_recovery) && 6733 !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery) 6734 size |= AMDGPU_RESET_TYPE_SOFT_RESET; 6735 6736 return size; 6737 } 6738 6739 ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset) 6740 { 6741 ssize_t size = 0; 6742 6743 if (supported_reset == 0) { 6744 size += sysfs_emit_at(buf, size, "unsupported"); 6745 size += sysfs_emit_at(buf, size, "\n"); 6746 return size; 6747 6748 } 6749 6750 if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET) 6751 size += sysfs_emit_at(buf, size, "soft "); 6752 6753 if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE) 6754 size += sysfs_emit_at(buf, size, "queue "); 6755 6756 if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE) 6757 size += sysfs_emit_at(buf, size, "pipe "); 6758 6759 if (supported_reset & AMDGPU_RESET_TYPE_FULL) 6760 size += sysfs_emit_at(buf, size, "full "); 6761 6762 size += sysfs_emit_at(buf, size, "\n"); 6763 return size; 6764 } 6765