1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 29 #include <linux/aperture.h> 30 #include <linux/power_supply.h> 31 #include <linux/kthread.h> 32 #include <linux/module.h> 33 #include <linux/console.h> 34 #include <linux/slab.h> 35 #include <linux/iommu.h> 36 #include <linux/pci.h> 37 #include <linux/pci-p2pdma.h> 38 #include <linux/apple-gmux.h> 39 40 #include <drm/drm_atomic_helper.h> 41 #include <drm/drm_client_event.h> 42 #include <drm/drm_crtc_helper.h> 43 #include <drm/drm_probe_helper.h> 44 #include <drm/amdgpu_drm.h> 45 #include <linux/device.h> 46 #include <linux/vgaarb.h> 47 #include <linux/vga_switcheroo.h> 48 #include <linux/efi.h> 49 #include "amdgpu.h" 50 #include "amdgpu_trace.h" 51 #include "amdgpu_i2c.h" 52 #include "atom.h" 53 #include "amdgpu_atombios.h" 54 #include "amdgpu_atomfirmware.h" 55 #include "amd_pcie.h" 56 #ifdef CONFIG_DRM_AMDGPU_SI 57 #include "si.h" 58 #endif 59 #ifdef CONFIG_DRM_AMDGPU_CIK 60 #include "cik.h" 61 #endif 62 #include "vi.h" 63 #include "soc15.h" 64 #include "nv.h" 65 #include "bif/bif_4_1_d.h" 66 #include <linux/firmware.h> 67 #include "amdgpu_vf_error.h" 68 69 #include "amdgpu_amdkfd.h" 70 #include "amdgpu_pm.h" 71 72 #include "amdgpu_xgmi.h" 73 #include "amdgpu_ras.h" 74 #include "amdgpu_pmu.h" 75 #include "amdgpu_fru_eeprom.h" 76 #include "amdgpu_reset.h" 77 #include "amdgpu_virt.h" 78 #include "amdgpu_dev_coredump.h" 79 80 #include <linux/suspend.h> 81 #include <drm/task_barrier.h> 82 #include <linux/pm_runtime.h> 83 84 #include <drm/drm_drv.h> 85 86 #if IS_ENABLED(CONFIG_X86) 87 #include <asm/intel-family.h> 88 #endif 89 90 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 91 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 92 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 93 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 94 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 95 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 96 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 97 98 #define AMDGPU_RESUME_MS 2000 99 #define AMDGPU_MAX_RETRY_LIMIT 2 100 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 101 #define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2) 102 #define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2) 103 #define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2) 104 105 static const 
struct drm_driver amdgpu_kms_driver; 106 107 const char *amdgpu_asic_name[] = { 108 "TAHITI", 109 "PITCAIRN", 110 "VERDE", 111 "OLAND", 112 "HAINAN", 113 "BONAIRE", 114 "KAVERI", 115 "KABINI", 116 "HAWAII", 117 "MULLINS", 118 "TOPAZ", 119 "TONGA", 120 "FIJI", 121 "CARRIZO", 122 "STONEY", 123 "POLARIS10", 124 "POLARIS11", 125 "POLARIS12", 126 "VEGAM", 127 "VEGA10", 128 "VEGA12", 129 "VEGA20", 130 "RAVEN", 131 "ARCTURUS", 132 "RENOIR", 133 "ALDEBARAN", 134 "NAVI10", 135 "CYAN_SKILLFISH", 136 "NAVI14", 137 "NAVI12", 138 "SIENNA_CICHLID", 139 "NAVY_FLOUNDER", 140 "VANGOGH", 141 "DIMGREY_CAVEFISH", 142 "BEIGE_GOBY", 143 "YELLOW_CARP", 144 "IP DISCOVERY", 145 "LAST", 146 }; 147 148 #define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMDGPU_MAX_IP_NUM - 1, 0) 149 /* 150 * Default init level where all blocks are expected to be initialized. This is 151 * the level of initialization expected by default and also after a full reset 152 * of the device. 153 */ 154 struct amdgpu_init_level amdgpu_init_default = { 155 .level = AMDGPU_INIT_LEVEL_DEFAULT, 156 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL, 157 }; 158 159 /* 160 * Minimal blocks needed to be initialized before a XGMI hive can be reset. This 161 * is used for cases like reset on initialization where the entire hive needs to 162 * be reset before first use. 163 */ 164 struct amdgpu_init_level amdgpu_init_minimal_xgmi = { 165 .level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI, 166 .hwini_ip_block_mask = 167 BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) | 168 BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) | 169 BIT(AMD_IP_BLOCK_TYPE_PSP) 170 }; 171 172 static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev, 173 enum amd_ip_block_type block) 174 { 175 return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0; 176 } 177 178 void amdgpu_set_init_level(struct amdgpu_device *adev, 179 enum amdgpu_init_lvl_id lvl) 180 { 181 switch (lvl) { 182 case AMDGPU_INIT_LEVEL_MINIMAL_XGMI: 183 adev->init_lvl = &amdgpu_init_minimal_xgmi; 184 break; 185 case AMDGPU_INIT_LEVEL_DEFAULT: 186 fallthrough; 187 default: 188 adev->init_lvl = &amdgpu_init_default; 189 break; 190 } 191 } 192 193 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev); 194 195 /** 196 * DOC: pcie_replay_count 197 * 198 * The amdgpu driver provides a sysfs API for reporting the total number 199 * of PCIe replays (NAKs) 200 * The file pcie_replay_count is used for this and returns the total 201 * number of replays as a sum of the NAKs generated and NAKs received 202 */ 203 204 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 205 struct device_attribute *attr, char *buf) 206 { 207 struct drm_device *ddev = dev_get_drvdata(dev); 208 struct amdgpu_device *adev = drm_to_adev(ddev); 209 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 210 211 return sysfs_emit(buf, "%llu\n", cnt); 212 } 213 214 static DEVICE_ATTR(pcie_replay_count, 0444, 215 amdgpu_device_get_pcie_replay_count, NULL); 216 217 static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj, 218 struct bin_attribute *attr, char *buf, 219 loff_t ppos, size_t count) 220 { 221 struct device *dev = kobj_to_dev(kobj); 222 struct drm_device *ddev = dev_get_drvdata(dev); 223 struct amdgpu_device *adev = drm_to_adev(ddev); 224 ssize_t bytes_read; 225 226 switch (ppos) { 227 case AMDGPU_SYS_REG_STATE_XGMI: 228 bytes_read = amdgpu_asic_get_reg_state( 229 adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count); 230 break; 231 case AMDGPU_SYS_REG_STATE_WAFL: 232 
bytes_read = amdgpu_asic_get_reg_state( 233 adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count); 234 break; 235 case AMDGPU_SYS_REG_STATE_PCIE: 236 bytes_read = amdgpu_asic_get_reg_state( 237 adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count); 238 break; 239 case AMDGPU_SYS_REG_STATE_USR: 240 bytes_read = amdgpu_asic_get_reg_state( 241 adev, AMDGPU_REG_STATE_TYPE_USR, buf, count); 242 break; 243 case AMDGPU_SYS_REG_STATE_USR_1: 244 bytes_read = amdgpu_asic_get_reg_state( 245 adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count); 246 break; 247 default: 248 return -EINVAL; 249 } 250 251 return bytes_read; 252 } 253 254 BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL, 255 AMDGPU_SYS_REG_STATE_END); 256 257 int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev) 258 { 259 int ret; 260 261 if (!amdgpu_asic_get_reg_state_supported(adev)) 262 return 0; 263 264 ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 265 266 return ret; 267 } 268 269 void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev) 270 { 271 if (!amdgpu_asic_get_reg_state_supported(adev)) 272 return; 273 sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 274 } 275 276 int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block) 277 { 278 int r; 279 280 if (ip_block->version->funcs->suspend) { 281 r = ip_block->version->funcs->suspend(ip_block); 282 if (r) { 283 dev_err(ip_block->adev->dev, 284 "suspend of IP block <%s> failed %d\n", 285 ip_block->version->funcs->name, r); 286 return r; 287 } 288 } 289 290 ip_block->status.hw = false; 291 return 0; 292 } 293 294 int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block) 295 { 296 int r; 297 298 if (ip_block->version->funcs->resume) { 299 r = ip_block->version->funcs->resume(ip_block); 300 if (r) { 301 dev_err(ip_block->adev->dev, 302 "resume of IP block <%s> failed %d\n", 303 ip_block->version->funcs->name, r); 304 return r; 305 } 306 } 307 308 ip_block->status.hw = true; 309 return 0; 310 } 311 312 /** 313 * DOC: board_info 314 * 315 * The amdgpu driver provides a sysfs API for giving board related information. 
316 * It provides the form factor information in the format 317 * 318 * type : form factor 319 * 320 * Possible form factor values 321 * 322 * - "cem" - PCIE CEM card 323 * - "oam" - Open Compute Accelerator Module 324 * - "unknown" - Not known 325 * 326 */ 327 328 static ssize_t amdgpu_device_get_board_info(struct device *dev, 329 struct device_attribute *attr, 330 char *buf) 331 { 332 struct drm_device *ddev = dev_get_drvdata(dev); 333 struct amdgpu_device *adev = drm_to_adev(ddev); 334 enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM; 335 const char *pkg; 336 337 if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type) 338 pkg_type = adev->smuio.funcs->get_pkg_type(adev); 339 340 switch (pkg_type) { 341 case AMDGPU_PKG_TYPE_CEM: 342 pkg = "cem"; 343 break; 344 case AMDGPU_PKG_TYPE_OAM: 345 pkg = "oam"; 346 break; 347 default: 348 pkg = "unknown"; 349 break; 350 } 351 352 return sysfs_emit(buf, "%s : %s\n", "type", pkg); 353 } 354 355 static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL); 356 357 static struct attribute *amdgpu_board_attrs[] = { 358 &dev_attr_board_info.attr, 359 NULL, 360 }; 361 362 static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj, 363 struct attribute *attr, int n) 364 { 365 struct device *dev = kobj_to_dev(kobj); 366 struct drm_device *ddev = dev_get_drvdata(dev); 367 struct amdgpu_device *adev = drm_to_adev(ddev); 368 369 if (adev->flags & AMD_IS_APU) 370 return 0; 371 372 return attr->mode; 373 } 374 375 static const struct attribute_group amdgpu_board_attrs_group = { 376 .attrs = amdgpu_board_attrs, 377 .is_visible = amdgpu_board_attrs_is_visible 378 }; 379 380 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 381 382 383 /** 384 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 385 * 386 * @dev: drm_device pointer 387 * 388 * Returns true if the device is a dGPU with ATPX power control, 389 * otherwise return false. 390 */ 391 bool amdgpu_device_supports_px(struct drm_device *dev) 392 { 393 struct amdgpu_device *adev = drm_to_adev(dev); 394 395 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 396 return true; 397 return false; 398 } 399 400 /** 401 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 402 * 403 * @dev: drm_device pointer 404 * 405 * Returns true if the device is a dGPU with ACPI power control, 406 * otherwise return false. 407 */ 408 bool amdgpu_device_supports_boco(struct drm_device *dev) 409 { 410 struct amdgpu_device *adev = drm_to_adev(dev); 411 412 if (adev->has_pr3 || 413 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 414 return true; 415 return false; 416 } 417 418 /** 419 * amdgpu_device_supports_baco - Does the device support BACO 420 * 421 * @dev: drm_device pointer 422 * 423 * Return: 424 * 1 if the device supporte BACO; 425 * 3 if the device support MACO (only works if BACO is supported) 426 * otherwise return 0. 
427 */ 428 int amdgpu_device_supports_baco(struct drm_device *dev) 429 { 430 struct amdgpu_device *adev = drm_to_adev(dev); 431 432 return amdgpu_asic_supports_baco(adev); 433 } 434 435 void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev) 436 { 437 struct drm_device *dev; 438 int bamaco_support; 439 440 dev = adev_to_drm(adev); 441 442 adev->pm.rpm_mode = AMDGPU_RUNPM_NONE; 443 bamaco_support = amdgpu_device_supports_baco(dev); 444 445 switch (amdgpu_runtime_pm) { 446 case 2: 447 if (bamaco_support & MACO_SUPPORT) { 448 adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO; 449 dev_info(adev->dev, "Forcing BAMACO for runtime pm\n"); 450 } else if (bamaco_support == BACO_SUPPORT) { 451 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO; 452 dev_info(adev->dev, "Requested mode BAMACO not available,fallback to use BACO\n"); 453 } 454 break; 455 case 1: 456 if (bamaco_support & BACO_SUPPORT) { 457 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO; 458 dev_info(adev->dev, "Forcing BACO for runtime pm\n"); 459 } 460 break; 461 case -1: 462 case -2: 463 if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */ 464 adev->pm.rpm_mode = AMDGPU_RUNPM_PX; 465 dev_info(adev->dev, "Using ATPX for runtime pm\n"); 466 } else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */ 467 adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO; 468 dev_info(adev->dev, "Using BOCO for runtime pm\n"); 469 } else { 470 if (!bamaco_support) 471 goto no_runtime_pm; 472 473 switch (adev->asic_type) { 474 case CHIP_VEGA20: 475 case CHIP_ARCTURUS: 476 /* BACO are not supported on vega20 and arctrus */ 477 break; 478 case CHIP_VEGA10: 479 /* enable BACO as runpm mode if noretry=0 */ 480 if (!adev->gmc.noretry) 481 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO; 482 break; 483 default: 484 /* enable BACO as runpm mode on CI+ */ 485 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO; 486 break; 487 } 488 489 if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) { 490 if (bamaco_support & MACO_SUPPORT) { 491 adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO; 492 dev_info(adev->dev, "Using BAMACO for runtime pm\n"); 493 } else { 494 dev_info(adev->dev, "Using BACO for runtime pm\n"); 495 } 496 } 497 } 498 break; 499 case 0: 500 dev_info(adev->dev, "runtime pm is manually disabled\n"); 501 break; 502 default: 503 break; 504 } 505 506 no_runtime_pm: 507 if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE) 508 dev_info(adev->dev, "Runtime PM not available\n"); 509 } 510 /** 511 * amdgpu_device_supports_smart_shift - Is the device dGPU with 512 * smart shift support 513 * 514 * @dev: drm_device pointer 515 * 516 * Returns true if the device is a dGPU with Smart Shift support, 517 * otherwise returns false. 
518 */ 519 bool amdgpu_device_supports_smart_shift(struct drm_device *dev) 520 { 521 return (amdgpu_device_supports_boco(dev) && 522 amdgpu_acpi_is_power_shift_control_supported()); 523 } 524 525 /* 526 * VRAM access helper functions 527 */ 528 529 /** 530 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA 531 * 532 * @adev: amdgpu_device pointer 533 * @pos: offset of the buffer in vram 534 * @buf: virtual address of the buffer in system memory 535 * @size: read/write size, sizeof(@buf) must > @size 536 * @write: true - write to vram, otherwise - read from vram 537 */ 538 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos, 539 void *buf, size_t size, bool write) 540 { 541 unsigned long flags; 542 uint32_t hi = ~0, tmp = 0; 543 uint32_t *data = buf; 544 uint64_t last; 545 int idx; 546 547 if (!drm_dev_enter(adev_to_drm(adev), &idx)) 548 return; 549 550 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4)); 551 552 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 553 for (last = pos + size; pos < last; pos += 4) { 554 tmp = pos >> 31; 555 556 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000); 557 if (tmp != hi) { 558 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp); 559 hi = tmp; 560 } 561 if (write) 562 WREG32_NO_KIQ(mmMM_DATA, *data++); 563 else 564 *data++ = RREG32_NO_KIQ(mmMM_DATA); 565 } 566 567 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 568 drm_dev_exit(idx); 569 } 570 571 /** 572 * amdgpu_device_aper_access - access vram by vram aperature 573 * 574 * @adev: amdgpu_device pointer 575 * @pos: offset of the buffer in vram 576 * @buf: virtual address of the buffer in system memory 577 * @size: read/write size, sizeof(@buf) must > @size 578 * @write: true - write to vram, otherwise - read from vram 579 * 580 * The return value means how many bytes have been transferred. 
581 */ 582 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos, 583 void *buf, size_t size, bool write) 584 { 585 #ifdef CONFIG_64BIT 586 void __iomem *addr; 587 size_t count = 0; 588 uint64_t last; 589 590 if (!adev->mman.aper_base_kaddr) 591 return 0; 592 593 last = min(pos + size, adev->gmc.visible_vram_size); 594 if (last > pos) { 595 addr = adev->mman.aper_base_kaddr + pos; 596 count = last - pos; 597 598 if (write) { 599 memcpy_toio(addr, buf, count); 600 /* Make sure HDP write cache flush happens without any reordering 601 * after the system memory contents are sent over PCIe device 602 */ 603 mb(); 604 amdgpu_device_flush_hdp(adev, NULL); 605 } else { 606 amdgpu_device_invalidate_hdp(adev, NULL); 607 /* Make sure HDP read cache is invalidated before issuing a read 608 * to the PCIe device 609 */ 610 mb(); 611 memcpy_fromio(buf, addr, count); 612 } 613 614 } 615 616 return count; 617 #else 618 return 0; 619 #endif 620 } 621 622 /** 623 * amdgpu_device_vram_access - read/write a buffer in vram 624 * 625 * @adev: amdgpu_device pointer 626 * @pos: offset of the buffer in vram 627 * @buf: virtual address of the buffer in system memory 628 * @size: read/write size, sizeof(@buf) must > @size 629 * @write: true - write to vram, otherwise - read from vram 630 */ 631 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos, 632 void *buf, size_t size, bool write) 633 { 634 size_t count; 635 636 /* try to using vram apreature to access vram first */ 637 count = amdgpu_device_aper_access(adev, pos, buf, size, write); 638 size -= count; 639 if (size) { 640 /* using MM to access rest vram */ 641 pos += count; 642 buf += count; 643 amdgpu_device_mm_access(adev, pos, buf, size, write); 644 } 645 } 646 647 /* 648 * register access helper functions. 649 */ 650 651 /* Check if hw access should be skipped because of hotplug or device error */ 652 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev) 653 { 654 if (adev->no_hw_access) 655 return true; 656 657 #ifdef CONFIG_LOCKDEP 658 /* 659 * This is a bit complicated to understand, so worth a comment. What we assert 660 * here is that the GPU reset is not running on another thread in parallel. 661 * 662 * For this we trylock the read side of the reset semaphore, if that succeeds 663 * we know that the reset is not running in paralell. 664 * 665 * If the trylock fails we assert that we are either already holding the read 666 * side of the lock or are the reset thread itself and hold the write side of 667 * the lock. 668 */ 669 if (in_task()) { 670 if (down_read_trylock(&adev->reset_domain->sem)) 671 up_read(&adev->reset_domain->sem); 672 else 673 lockdep_assert_held(&adev->reset_domain->sem); 674 } 675 #endif 676 return false; 677 } 678 679 /** 680 * amdgpu_device_rreg - read a memory mapped IO or indirect register 681 * 682 * @adev: amdgpu_device pointer 683 * @reg: dword aligned register offset 684 * @acc_flags: access flags which require special behavior 685 * 686 * Returns the 32 bit value from the offset specified. 
687 */ 688 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, 689 uint32_t reg, uint32_t acc_flags) 690 { 691 uint32_t ret; 692 693 if (amdgpu_device_skip_hw_access(adev)) 694 return 0; 695 696 if ((reg * 4) < adev->rmmio_size) { 697 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 698 amdgpu_sriov_runtime(adev) && 699 down_read_trylock(&adev->reset_domain->sem)) { 700 ret = amdgpu_kiq_rreg(adev, reg, 0); 701 up_read(&adev->reset_domain->sem); 702 } else { 703 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); 704 } 705 } else { 706 ret = adev->pcie_rreg(adev, reg * 4); 707 } 708 709 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret); 710 711 return ret; 712 } 713 714 /* 715 * MMIO register read with bytes helper functions 716 * @offset:bytes offset from MMIO start 717 */ 718 719 /** 720 * amdgpu_mm_rreg8 - read a memory mapped IO register 721 * 722 * @adev: amdgpu_device pointer 723 * @offset: byte aligned register offset 724 * 725 * Returns the 8 bit value from the offset specified. 726 */ 727 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) 728 { 729 if (amdgpu_device_skip_hw_access(adev)) 730 return 0; 731 732 if (offset < adev->rmmio_size) 733 return (readb(adev->rmmio + offset)); 734 BUG(); 735 } 736 737 738 /** 739 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC 740 * 741 * @adev: amdgpu_device pointer 742 * @reg: dword aligned register offset 743 * @acc_flags: access flags which require special behavior 744 * @xcc_id: xcc accelerated compute core id 745 * 746 * Returns the 32 bit value from the offset specified. 747 */ 748 uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev, 749 uint32_t reg, uint32_t acc_flags, 750 uint32_t xcc_id) 751 { 752 uint32_t ret, rlcg_flag; 753 754 if (amdgpu_device_skip_hw_access(adev)) 755 return 0; 756 757 if ((reg * 4) < adev->rmmio_size) { 758 if (amdgpu_sriov_vf(adev) && 759 !amdgpu_sriov_runtime(adev) && 760 adev->gfx.rlc.rlcg_reg_access_supported && 761 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, 762 GC_HWIP, false, 763 &rlcg_flag)) { 764 ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id)); 765 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 766 amdgpu_sriov_runtime(adev) && 767 down_read_trylock(&adev->reset_domain->sem)) { 768 ret = amdgpu_kiq_rreg(adev, reg, xcc_id); 769 up_read(&adev->reset_domain->sem); 770 } else { 771 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); 772 } 773 } else { 774 ret = adev->pcie_rreg(adev, reg * 4); 775 } 776 777 return ret; 778 } 779 780 /* 781 * MMIO register write with bytes helper functions 782 * @offset:bytes offset from MMIO start 783 * @value: the value want to be written to the register 784 */ 785 786 /** 787 * amdgpu_mm_wreg8 - read a memory mapped IO register 788 * 789 * @adev: amdgpu_device pointer 790 * @offset: byte aligned register offset 791 * @value: 8 bit value to write 792 * 793 * Writes the value specified to the offset specified. 
794 */ 795 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) 796 { 797 if (amdgpu_device_skip_hw_access(adev)) 798 return; 799 800 if (offset < adev->rmmio_size) 801 writeb(value, adev->rmmio + offset); 802 else 803 BUG(); 804 } 805 806 /** 807 * amdgpu_device_wreg - write to a memory mapped IO or indirect register 808 * 809 * @adev: amdgpu_device pointer 810 * @reg: dword aligned register offset 811 * @v: 32 bit value to write to the register 812 * @acc_flags: access flags which require special behavior 813 * 814 * Writes the value specified to the offset specified. 815 */ 816 void amdgpu_device_wreg(struct amdgpu_device *adev, 817 uint32_t reg, uint32_t v, 818 uint32_t acc_flags) 819 { 820 if (amdgpu_device_skip_hw_access(adev)) 821 return; 822 823 if ((reg * 4) < adev->rmmio_size) { 824 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 825 amdgpu_sriov_runtime(adev) && 826 down_read_trylock(&adev->reset_domain->sem)) { 827 amdgpu_kiq_wreg(adev, reg, v, 0); 828 up_read(&adev->reset_domain->sem); 829 } else { 830 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 831 } 832 } else { 833 adev->pcie_wreg(adev, reg * 4, v); 834 } 835 836 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 837 } 838 839 /** 840 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 841 * 842 * @adev: amdgpu_device pointer 843 * @reg: mmio/rlc register 844 * @v: value to write 845 * @xcc_id: xcc accelerated compute core id 846 * 847 * this function is invoked only for the debugfs register access 848 */ 849 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 850 uint32_t reg, uint32_t v, 851 uint32_t xcc_id) 852 { 853 if (amdgpu_device_skip_hw_access(adev)) 854 return; 855 856 if (amdgpu_sriov_fullaccess(adev) && 857 adev->gfx.rlc.funcs && 858 adev->gfx.rlc.funcs->is_rlcg_access_range) { 859 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 860 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id); 861 } else if ((reg * 4) >= adev->rmmio_size) { 862 adev->pcie_wreg(adev, reg * 4, v); 863 } else { 864 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 865 } 866 } 867 868 /** 869 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC 870 * 871 * @adev: amdgpu_device pointer 872 * @reg: dword aligned register offset 873 * @v: 32 bit value to write to the register 874 * @acc_flags: access flags which require special behavior 875 * @xcc_id: xcc accelerated compute core id 876 * 877 * Writes the value specified to the offset specified. 
878 */ 879 void amdgpu_device_xcc_wreg(struct amdgpu_device *adev, 880 uint32_t reg, uint32_t v, 881 uint32_t acc_flags, uint32_t xcc_id) 882 { 883 uint32_t rlcg_flag; 884 885 if (amdgpu_device_skip_hw_access(adev)) 886 return; 887 888 if ((reg * 4) < adev->rmmio_size) { 889 if (amdgpu_sriov_vf(adev) && 890 !amdgpu_sriov_runtime(adev) && 891 adev->gfx.rlc.rlcg_reg_access_supported && 892 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, 893 GC_HWIP, true, 894 &rlcg_flag)) { 895 amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id)); 896 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 897 amdgpu_sriov_runtime(adev) && 898 down_read_trylock(&adev->reset_domain->sem)) { 899 amdgpu_kiq_wreg(adev, reg, v, xcc_id); 900 up_read(&adev->reset_domain->sem); 901 } else { 902 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 903 } 904 } else { 905 adev->pcie_wreg(adev, reg * 4, v); 906 } 907 } 908 909 /** 910 * amdgpu_device_indirect_rreg - read an indirect register 911 * 912 * @adev: amdgpu_device pointer 913 * @reg_addr: indirect register address to read from 914 * 915 * Returns the value of indirect register @reg_addr 916 */ 917 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 918 u32 reg_addr) 919 { 920 unsigned long flags, pcie_index, pcie_data; 921 void __iomem *pcie_index_offset; 922 void __iomem *pcie_data_offset; 923 u32 r; 924 925 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 926 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 927 928 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 929 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 930 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 931 932 writel(reg_addr, pcie_index_offset); 933 readl(pcie_index_offset); 934 r = readl(pcie_data_offset); 935 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 936 937 return r; 938 } 939 940 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev, 941 u64 reg_addr) 942 { 943 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 944 u32 r; 945 void __iomem *pcie_index_offset; 946 void __iomem *pcie_index_hi_offset; 947 void __iomem *pcie_data_offset; 948 949 if (unlikely(!adev->nbio.funcs)) { 950 pcie_index = AMDGPU_PCIE_INDEX_FALLBACK; 951 pcie_data = AMDGPU_PCIE_DATA_FALLBACK; 952 } else { 953 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 954 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 955 } 956 957 if (reg_addr >> 32) { 958 if (unlikely(!adev->nbio.funcs)) 959 pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK; 960 else 961 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 962 } else { 963 pcie_index_hi = 0; 964 } 965 966 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 967 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 968 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 969 if (pcie_index_hi != 0) 970 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 971 pcie_index_hi * 4; 972 973 writel(reg_addr, pcie_index_offset); 974 readl(pcie_index_offset); 975 if (pcie_index_hi != 0) { 976 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 977 readl(pcie_index_hi_offset); 978 } 979 r = readl(pcie_data_offset); 980 981 /* clear the high bits */ 982 if (pcie_index_hi != 0) { 983 writel(0, pcie_index_hi_offset); 984 readl(pcie_index_hi_offset); 985 } 986 987 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 988 989 return r; 990 } 991 992 /** 993 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 994 * 995 * @adev: 
amdgpu_device pointer 996 * @reg_addr: indirect register address to read from 997 * 998 * Returns the value of indirect register @reg_addr 999 */ 1000 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 1001 u32 reg_addr) 1002 { 1003 unsigned long flags, pcie_index, pcie_data; 1004 void __iomem *pcie_index_offset; 1005 void __iomem *pcie_data_offset; 1006 u64 r; 1007 1008 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1009 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1010 1011 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1012 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1013 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1014 1015 /* read low 32 bits */ 1016 writel(reg_addr, pcie_index_offset); 1017 readl(pcie_index_offset); 1018 r = readl(pcie_data_offset); 1019 /* read high 32 bits */ 1020 writel(reg_addr + 4, pcie_index_offset); 1021 readl(pcie_index_offset); 1022 r |= ((u64)readl(pcie_data_offset) << 32); 1023 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1024 1025 return r; 1026 } 1027 1028 u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev, 1029 u64 reg_addr) 1030 { 1031 unsigned long flags, pcie_index, pcie_data; 1032 unsigned long pcie_index_hi = 0; 1033 void __iomem *pcie_index_offset; 1034 void __iomem *pcie_index_hi_offset; 1035 void __iomem *pcie_data_offset; 1036 u64 r; 1037 1038 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1039 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1040 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1041 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1042 1043 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1044 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1045 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1046 if (pcie_index_hi != 0) 1047 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1048 pcie_index_hi * 4; 1049 1050 /* read low 32 bits */ 1051 writel(reg_addr, pcie_index_offset); 1052 readl(pcie_index_offset); 1053 if (pcie_index_hi != 0) { 1054 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1055 readl(pcie_index_hi_offset); 1056 } 1057 r = readl(pcie_data_offset); 1058 /* read high 32 bits */ 1059 writel(reg_addr + 4, pcie_index_offset); 1060 readl(pcie_index_offset); 1061 if (pcie_index_hi != 0) { 1062 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1063 readl(pcie_index_hi_offset); 1064 } 1065 r |= ((u64)readl(pcie_data_offset) << 32); 1066 1067 /* clear the high bits */ 1068 if (pcie_index_hi != 0) { 1069 writel(0, pcie_index_hi_offset); 1070 readl(pcie_index_hi_offset); 1071 } 1072 1073 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1074 1075 return r; 1076 } 1077 1078 /** 1079 * amdgpu_device_indirect_wreg - write an indirect register address 1080 * 1081 * @adev: amdgpu_device pointer 1082 * @reg_addr: indirect register offset 1083 * @reg_data: indirect register data 1084 * 1085 */ 1086 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 1087 u32 reg_addr, u32 reg_data) 1088 { 1089 unsigned long flags, pcie_index, pcie_data; 1090 void __iomem *pcie_index_offset; 1091 void __iomem *pcie_data_offset; 1092 1093 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1094 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1095 1096 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1097 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1098 pcie_data_offset = (void __iomem 
*)adev->rmmio + pcie_data * 4; 1099 1100 writel(reg_addr, pcie_index_offset); 1101 readl(pcie_index_offset); 1102 writel(reg_data, pcie_data_offset); 1103 readl(pcie_data_offset); 1104 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1105 } 1106 1107 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev, 1108 u64 reg_addr, u32 reg_data) 1109 { 1110 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 1111 void __iomem *pcie_index_offset; 1112 void __iomem *pcie_index_hi_offset; 1113 void __iomem *pcie_data_offset; 1114 1115 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1116 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1117 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1118 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1119 else 1120 pcie_index_hi = 0; 1121 1122 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1123 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1124 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1125 if (pcie_index_hi != 0) 1126 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1127 pcie_index_hi * 4; 1128 1129 writel(reg_addr, pcie_index_offset); 1130 readl(pcie_index_offset); 1131 if (pcie_index_hi != 0) { 1132 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1133 readl(pcie_index_hi_offset); 1134 } 1135 writel(reg_data, pcie_data_offset); 1136 readl(pcie_data_offset); 1137 1138 /* clear the high bits */ 1139 if (pcie_index_hi != 0) { 1140 writel(0, pcie_index_hi_offset); 1141 readl(pcie_index_hi_offset); 1142 } 1143 1144 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1145 } 1146 1147 /** 1148 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 1149 * 1150 * @adev: amdgpu_device pointer 1151 * @reg_addr: indirect register offset 1152 * @reg_data: indirect register data 1153 * 1154 */ 1155 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 1156 u32 reg_addr, u64 reg_data) 1157 { 1158 unsigned long flags, pcie_index, pcie_data; 1159 void __iomem *pcie_index_offset; 1160 void __iomem *pcie_data_offset; 1161 1162 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1163 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1164 1165 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1166 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1167 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1168 1169 /* write low 32 bits */ 1170 writel(reg_addr, pcie_index_offset); 1171 readl(pcie_index_offset); 1172 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1173 readl(pcie_data_offset); 1174 /* write high 32 bits */ 1175 writel(reg_addr + 4, pcie_index_offset); 1176 readl(pcie_index_offset); 1177 writel((u32)(reg_data >> 32), pcie_data_offset); 1178 readl(pcie_data_offset); 1179 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1180 } 1181 1182 void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev, 1183 u64 reg_addr, u64 reg_data) 1184 { 1185 unsigned long flags, pcie_index, pcie_data; 1186 unsigned long pcie_index_hi = 0; 1187 void __iomem *pcie_index_offset; 1188 void __iomem *pcie_index_hi_offset; 1189 void __iomem *pcie_data_offset; 1190 1191 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1192 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1193 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1194 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1195 1196 
spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1197 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1198 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1199 if (pcie_index_hi != 0) 1200 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1201 pcie_index_hi * 4; 1202 1203 /* write low 32 bits */ 1204 writel(reg_addr, pcie_index_offset); 1205 readl(pcie_index_offset); 1206 if (pcie_index_hi != 0) { 1207 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1208 readl(pcie_index_hi_offset); 1209 } 1210 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1211 readl(pcie_data_offset); 1212 /* write high 32 bits */ 1213 writel(reg_addr + 4, pcie_index_offset); 1214 readl(pcie_index_offset); 1215 if (pcie_index_hi != 0) { 1216 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1217 readl(pcie_index_hi_offset); 1218 } 1219 writel((u32)(reg_data >> 32), pcie_data_offset); 1220 readl(pcie_data_offset); 1221 1222 /* clear the high bits */ 1223 if (pcie_index_hi != 0) { 1224 writel(0, pcie_index_hi_offset); 1225 readl(pcie_index_hi_offset); 1226 } 1227 1228 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1229 } 1230 1231 /** 1232 * amdgpu_device_get_rev_id - query device rev_id 1233 * 1234 * @adev: amdgpu_device pointer 1235 * 1236 * Return device rev_id 1237 */ 1238 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev) 1239 { 1240 return adev->nbio.funcs->get_rev_id(adev); 1241 } 1242 1243 /** 1244 * amdgpu_invalid_rreg - dummy reg read function 1245 * 1246 * @adev: amdgpu_device pointer 1247 * @reg: offset of register 1248 * 1249 * Dummy register read function. Used for register blocks 1250 * that certain asics don't have (all asics). 1251 * Returns the value in the register. 1252 */ 1253 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 1254 { 1255 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 1256 BUG(); 1257 return 0; 1258 } 1259 1260 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg) 1261 { 1262 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 1263 BUG(); 1264 return 0; 1265 } 1266 1267 /** 1268 * amdgpu_invalid_wreg - dummy reg write function 1269 * 1270 * @adev: amdgpu_device pointer 1271 * @reg: offset of register 1272 * @v: value to write to the register 1273 * 1274 * Dummy register read function. Used for register blocks 1275 * that certain asics don't have (all asics). 1276 */ 1277 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 1278 { 1279 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 1280 reg, v); 1281 BUG(); 1282 } 1283 1284 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v) 1285 { 1286 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n", 1287 reg, v); 1288 BUG(); 1289 } 1290 1291 /** 1292 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 1293 * 1294 * @adev: amdgpu_device pointer 1295 * @reg: offset of register 1296 * 1297 * Dummy register read function. Used for register blocks 1298 * that certain asics don't have (all asics). 1299 * Returns the value in the register. 
1300 */ 1301 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 1302 { 1303 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 1304 BUG(); 1305 return 0; 1306 } 1307 1308 static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg) 1309 { 1310 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 1311 BUG(); 1312 return 0; 1313 } 1314 1315 /** 1316 * amdgpu_invalid_wreg64 - dummy reg write function 1317 * 1318 * @adev: amdgpu_device pointer 1319 * @reg: offset of register 1320 * @v: value to write to the register 1321 * 1322 * Dummy register read function. Used for register blocks 1323 * that certain asics don't have (all asics). 1324 */ 1325 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 1326 { 1327 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 1328 reg, v); 1329 BUG(); 1330 } 1331 1332 static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v) 1333 { 1334 DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n", 1335 reg, v); 1336 BUG(); 1337 } 1338 1339 /** 1340 * amdgpu_block_invalid_rreg - dummy reg read function 1341 * 1342 * @adev: amdgpu_device pointer 1343 * @block: offset of instance 1344 * @reg: offset of register 1345 * 1346 * Dummy register read function. Used for register blocks 1347 * that certain asics don't have (all asics). 1348 * Returns the value in the register. 1349 */ 1350 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 1351 uint32_t block, uint32_t reg) 1352 { 1353 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 1354 reg, block); 1355 BUG(); 1356 return 0; 1357 } 1358 1359 /** 1360 * amdgpu_block_invalid_wreg - dummy reg write function 1361 * 1362 * @adev: amdgpu_device pointer 1363 * @block: offset of instance 1364 * @reg: offset of register 1365 * @v: value to write to the register 1366 * 1367 * Dummy register read function. Used for register blocks 1368 * that certain asics don't have (all asics). 1369 */ 1370 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 1371 uint32_t block, 1372 uint32_t reg, uint32_t v) 1373 { 1374 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 1375 reg, block, v); 1376 BUG(); 1377 } 1378 1379 /** 1380 * amdgpu_device_asic_init - Wrapper for atom asic_init 1381 * 1382 * @adev: amdgpu_device pointer 1383 * 1384 * Does any asic specific work and then calls atom asic init. 1385 */ 1386 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 1387 { 1388 int ret; 1389 1390 amdgpu_asic_pre_asic_init(adev); 1391 1392 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 1393 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 1394 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) { 1395 amdgpu_psp_wait_for_bootloader(adev); 1396 ret = amdgpu_atomfirmware_asic_init(adev, true); 1397 return ret; 1398 } else { 1399 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 1400 } 1401 1402 return 0; 1403 } 1404 1405 /** 1406 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page 1407 * 1408 * @adev: amdgpu_device pointer 1409 * 1410 * Allocates a scratch page of VRAM for use by various things in the 1411 * driver. 
1412 */ 1413 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev) 1414 { 1415 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE, 1416 AMDGPU_GEM_DOMAIN_VRAM | 1417 AMDGPU_GEM_DOMAIN_GTT, 1418 &adev->mem_scratch.robj, 1419 &adev->mem_scratch.gpu_addr, 1420 (void **)&adev->mem_scratch.ptr); 1421 } 1422 1423 /** 1424 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page 1425 * 1426 * @adev: amdgpu_device pointer 1427 * 1428 * Frees the VRAM scratch page. 1429 */ 1430 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev) 1431 { 1432 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL); 1433 } 1434 1435 /** 1436 * amdgpu_device_program_register_sequence - program an array of registers. 1437 * 1438 * @adev: amdgpu_device pointer 1439 * @registers: pointer to the register array 1440 * @array_size: size of the register array 1441 * 1442 * Programs an array or registers with and or masks. 1443 * This is a helper for setting golden registers. 1444 */ 1445 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 1446 const u32 *registers, 1447 const u32 array_size) 1448 { 1449 u32 tmp, reg, and_mask, or_mask; 1450 int i; 1451 1452 if (array_size % 3) 1453 return; 1454 1455 for (i = 0; i < array_size; i += 3) { 1456 reg = registers[i + 0]; 1457 and_mask = registers[i + 1]; 1458 or_mask = registers[i + 2]; 1459 1460 if (and_mask == 0xffffffff) { 1461 tmp = or_mask; 1462 } else { 1463 tmp = RREG32(reg); 1464 tmp &= ~and_mask; 1465 if (adev->family >= AMDGPU_FAMILY_AI) 1466 tmp |= (or_mask & and_mask); 1467 else 1468 tmp |= or_mask; 1469 } 1470 WREG32(reg, tmp); 1471 } 1472 } 1473 1474 /** 1475 * amdgpu_device_pci_config_reset - reset the GPU 1476 * 1477 * @adev: amdgpu_device pointer 1478 * 1479 * Resets the GPU using the pci config reset sequence. 1480 * Only applicable to asics prior to vega10. 1481 */ 1482 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 1483 { 1484 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 1485 } 1486 1487 /** 1488 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 1489 * 1490 * @adev: amdgpu_device pointer 1491 * 1492 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 1493 */ 1494 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 1495 { 1496 return pci_reset_function(adev->pdev); 1497 } 1498 1499 /* 1500 * amdgpu_device_wb_*() 1501 * Writeback is the method by which the GPU updates special pages in memory 1502 * with the status of certain GPU events (fences, ring pointers,etc.). 1503 */ 1504 1505 /** 1506 * amdgpu_device_wb_fini - Disable Writeback and free memory 1507 * 1508 * @adev: amdgpu_device pointer 1509 * 1510 * Disables Writeback and frees the Writeback memory (all asics). 1511 * Used at driver shutdown. 1512 */ 1513 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1514 { 1515 if (adev->wb.wb_obj) { 1516 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1517 &adev->wb.gpu_addr, 1518 (void **)&adev->wb.wb); 1519 adev->wb.wb_obj = NULL; 1520 } 1521 } 1522 1523 /** 1524 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory 1525 * 1526 * @adev: amdgpu_device pointer 1527 * 1528 * Initializes writeback and allocates writeback memory (all asics). 1529 * Used at driver startup. 1530 * Returns 0 on success or an -error on failure. 
1531 */ 1532 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1533 { 1534 int r; 1535 1536 if (adev->wb.wb_obj == NULL) { 1537 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1538 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1539 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1540 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1541 (void **)&adev->wb.wb); 1542 if (r) { 1543 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1544 return r; 1545 } 1546 1547 adev->wb.num_wb = AMDGPU_MAX_WB; 1548 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1549 1550 /* clear wb memory */ 1551 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1552 } 1553 1554 return 0; 1555 } 1556 1557 /** 1558 * amdgpu_device_wb_get - Allocate a wb entry 1559 * 1560 * @adev: amdgpu_device pointer 1561 * @wb: wb index 1562 * 1563 * Allocate a wb slot for use by the driver (all asics). 1564 * Returns 0 on success or -EINVAL on failure. 1565 */ 1566 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1567 { 1568 unsigned long flags, offset; 1569 1570 spin_lock_irqsave(&adev->wb.lock, flags); 1571 offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1572 if (offset < adev->wb.num_wb) { 1573 __set_bit(offset, adev->wb.used); 1574 spin_unlock_irqrestore(&adev->wb.lock, flags); 1575 *wb = offset << 3; /* convert to dw offset */ 1576 return 0; 1577 } else { 1578 spin_unlock_irqrestore(&adev->wb.lock, flags); 1579 return -EINVAL; 1580 } 1581 } 1582 1583 /** 1584 * amdgpu_device_wb_free - Free a wb entry 1585 * 1586 * @adev: amdgpu_device pointer 1587 * @wb: wb index 1588 * 1589 * Free a wb slot allocated for use by the driver (all asics) 1590 */ 1591 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1592 { 1593 unsigned long flags; 1594 1595 wb >>= 3; 1596 spin_lock_irqsave(&adev->wb.lock, flags); 1597 if (wb < adev->wb.num_wb) 1598 __clear_bit(wb, adev->wb.used); 1599 spin_unlock_irqrestore(&adev->wb.lock, flags); 1600 } 1601 1602 /** 1603 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1604 * 1605 * @adev: amdgpu_device pointer 1606 * 1607 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1608 * to fail, but if any of the BARs is not accessible after the size we abort 1609 * driver loading by returning -ENODEV. 
1610 */ 1611 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1612 { 1613 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1614 struct pci_bus *root; 1615 struct resource *res; 1616 unsigned int i; 1617 u16 cmd; 1618 int r; 1619 1620 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) 1621 return 0; 1622 1623 /* Bypass for VF */ 1624 if (amdgpu_sriov_vf(adev)) 1625 return 0; 1626 1627 /* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */ 1628 if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR)) 1629 DRM_WARN("System can't access extended configuration space, please check!!\n"); 1630 1631 /* skip if the bios has already enabled large BAR */ 1632 if (adev->gmc.real_vram_size && 1633 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1634 return 0; 1635 1636 /* Check if the root BUS has 64bit memory resources */ 1637 root = adev->pdev->bus; 1638 while (root->parent) 1639 root = root->parent; 1640 1641 pci_bus_for_each_resource(root, res, i) { 1642 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1643 res->start > 0x100000000ull) 1644 break; 1645 } 1646 1647 /* Trying to resize is pointless without a root hub window above 4GB */ 1648 if (!res) 1649 return 0; 1650 1651 /* Limit the BAR size to what is available */ 1652 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1653 rbar_size); 1654 1655 /* Disable memory decoding while we change the BAR addresses and size */ 1656 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1657 pci_write_config_word(adev->pdev, PCI_COMMAND, 1658 cmd & ~PCI_COMMAND_MEMORY); 1659 1660 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1661 amdgpu_doorbell_fini(adev); 1662 if (adev->asic_type >= CHIP_BONAIRE) 1663 pci_release_resource(adev->pdev, 2); 1664 1665 pci_release_resource(adev->pdev, 0); 1666 1667 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1668 if (r == -ENOSPC) 1669 DRM_INFO("Not enough PCI address space for a large BAR."); 1670 else if (r && r != -ENOTSUPP) 1671 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1672 1673 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1674 1675 /* When the doorbell or fb BAR isn't available we have no chance of 1676 * using the device. 1677 */ 1678 r = amdgpu_doorbell_init(adev); 1679 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1680 return -ENODEV; 1681 1682 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1683 1684 return 0; 1685 } 1686 1687 static bool amdgpu_device_read_bios(struct amdgpu_device *adev) 1688 { 1689 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU)) 1690 return false; 1691 1692 return true; 1693 } 1694 1695 /* 1696 * GPU helpers function. 1697 */ 1698 /** 1699 * amdgpu_device_need_post - check if the hw need post or not 1700 * 1701 * @adev: amdgpu_device pointer 1702 * 1703 * Check if the asic has been initialized (all asics) at driver startup 1704 * or post is needed if hw reset is performed. 1705 * Returns true if need or false if not. 
1706 */ 1707 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1708 { 1709 uint32_t reg; 1710 1711 if (amdgpu_sriov_vf(adev)) 1712 return false; 1713 1714 if (!amdgpu_device_read_bios(adev)) 1715 return false; 1716 1717 if (amdgpu_passthrough(adev)) { 1718 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1719 * some old smc fw still need driver do vPost otherwise gpu hang, while 1720 * those smc fw version above 22.15 doesn't have this flaw, so we force 1721 * vpost executed for smc version below 22.15 1722 */ 1723 if (adev->asic_type == CHIP_FIJI) { 1724 int err; 1725 uint32_t fw_ver; 1726 1727 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1728 /* force vPost if error occured */ 1729 if (err) 1730 return true; 1731 1732 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1733 release_firmware(adev->pm.fw); 1734 if (fw_ver < 0x00160e00) 1735 return true; 1736 } 1737 } 1738 1739 /* Don't post if we need to reset whole hive on init */ 1740 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 1741 return false; 1742 1743 if (adev->has_hw_reset) { 1744 adev->has_hw_reset = false; 1745 return true; 1746 } 1747 1748 /* bios scratch used on CIK+ */ 1749 if (adev->asic_type >= CHIP_BONAIRE) 1750 return amdgpu_atombios_scratch_need_asic_init(adev); 1751 1752 /* check MEM_SIZE for older asics */ 1753 reg = amdgpu_asic_get_config_memsize(adev); 1754 1755 if ((reg != 0) && (reg != 0xffffffff)) 1756 return false; 1757 1758 return true; 1759 } 1760 1761 /* 1762 * Check whether seamless boot is supported. 1763 * 1764 * So far we only support seamless boot on DCE 3.0 or later. 1765 * If users report that it works on older ASICS as well, we may 1766 * loosen this. 1767 */ 1768 bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev) 1769 { 1770 switch (amdgpu_seamless) { 1771 case -1: 1772 break; 1773 case 1: 1774 return true; 1775 case 0: 1776 return false; 1777 default: 1778 DRM_ERROR("Invalid value for amdgpu.seamless: %d\n", 1779 amdgpu_seamless); 1780 return false; 1781 } 1782 1783 if (!(adev->flags & AMD_IS_APU)) 1784 return false; 1785 1786 if (adev->mman.keep_stolen_vga_memory) 1787 return false; 1788 1789 return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0); 1790 } 1791 1792 /* 1793 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids 1794 * don't support dynamic speed switching. Until we have confirmation from Intel 1795 * that a specific host supports it, it's safer that we keep it disabled for all. 1796 * 1797 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/ 1798 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663 1799 */ 1800 static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev) 1801 { 1802 #if IS_ENABLED(CONFIG_X86) 1803 struct cpuinfo_x86 *c = &cpu_data(0); 1804 1805 /* eGPU change speeds based on USB4 fabric conditions */ 1806 if (dev_is_removable(adev->dev)) 1807 return true; 1808 1809 if (c->x86_vendor == X86_VENDOR_INTEL) 1810 return false; 1811 #endif 1812 return true; 1813 } 1814 1815 /** 1816 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1817 * 1818 * @adev: amdgpu_device pointer 1819 * 1820 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1821 * be set for this device. 1822 * 1823 * Returns true if it should be used or false if not. 
1824 */ 1825 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1826 { 1827 switch (amdgpu_aspm) { 1828 case -1: 1829 break; 1830 case 0: 1831 return false; 1832 case 1: 1833 return true; 1834 default: 1835 return false; 1836 } 1837 if (adev->flags & AMD_IS_APU) 1838 return false; 1839 if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK)) 1840 return false; 1841 return pcie_aspm_enabled(adev->pdev); 1842 } 1843 1844 /* if we get transitioned to only one device, take VGA back */ 1845 /** 1846 * amdgpu_device_vga_set_decode - enable/disable vga decode 1847 * 1848 * @pdev: PCI device pointer 1849 * @state: enable/disable vga decode 1850 * 1851 * Enable/disable vga decode (all asics). 1852 * Returns VGA resource flags. 1853 */ 1854 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1855 bool state) 1856 { 1857 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1858 1859 amdgpu_asic_set_vga_state(adev, state); 1860 if (state) 1861 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1862 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1863 else 1864 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1865 } 1866 1867 /** 1868 * amdgpu_device_check_block_size - validate the vm block size 1869 * 1870 * @adev: amdgpu_device pointer 1871 * 1872 * Validates the vm block size specified via module parameter. 1873 * The vm block size defines number of bits in page table versus page directory, 1874 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1875 * page table and the remaining bits are in the page directory. 1876 */ 1877 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1878 { 1879 /* defines number of bits in page table versus page directory, 1880 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1881 * page table and the remaining bits are in the page directory 1882 */ 1883 if (amdgpu_vm_block_size == -1) 1884 return; 1885 1886 if (amdgpu_vm_block_size < 9) { 1887 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1888 amdgpu_vm_block_size); 1889 amdgpu_vm_block_size = -1; 1890 } 1891 } 1892 1893 /** 1894 * amdgpu_device_check_vm_size - validate the vm size 1895 * 1896 * @adev: amdgpu_device pointer 1897 * 1898 * Validates the vm size in GB specified via module parameter. 1899 * The VM size is the size of the GPU virtual memory space in GB. 
1900 */ 1901 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1902 { 1903 /* no need to check the default value */ 1904 if (amdgpu_vm_size == -1) 1905 return; 1906 1907 if (amdgpu_vm_size < 1) { 1908 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1909 amdgpu_vm_size); 1910 amdgpu_vm_size = -1; 1911 } 1912 } 1913 1914 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1915 { 1916 struct sysinfo si; 1917 bool is_os_64 = (sizeof(void *) == 8); 1918 uint64_t total_memory; 1919 uint64_t dram_size_seven_GB = 0x1B8000000; 1920 uint64_t dram_size_three_GB = 0xB8000000; 1921 1922 if (amdgpu_smu_memory_pool_size == 0) 1923 return; 1924 1925 if (!is_os_64) { 1926 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1927 goto def_value; 1928 } 1929 si_meminfo(&si); 1930 total_memory = (uint64_t)si.totalram * si.mem_unit; 1931 1932 if ((amdgpu_smu_memory_pool_size == 1) || 1933 (amdgpu_smu_memory_pool_size == 2)) { 1934 if (total_memory < dram_size_three_GB) 1935 goto def_value1; 1936 } else if ((amdgpu_smu_memory_pool_size == 4) || 1937 (amdgpu_smu_memory_pool_size == 8)) { 1938 if (total_memory < dram_size_seven_GB) 1939 goto def_value1; 1940 } else { 1941 DRM_WARN("Smu memory pool size not supported\n"); 1942 goto def_value; 1943 } 1944 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1945 1946 return; 1947 1948 def_value1: 1949 DRM_WARN("No enough system memory\n"); 1950 def_value: 1951 adev->pm.smu_prv_buffer_size = 0; 1952 } 1953 1954 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 1955 { 1956 if (!(adev->flags & AMD_IS_APU) || 1957 adev->asic_type < CHIP_RAVEN) 1958 return 0; 1959 1960 switch (adev->asic_type) { 1961 case CHIP_RAVEN: 1962 if (adev->pdev->device == 0x15dd) 1963 adev->apu_flags |= AMD_APU_IS_RAVEN; 1964 if (adev->pdev->device == 0x15d8) 1965 adev->apu_flags |= AMD_APU_IS_PICASSO; 1966 break; 1967 case CHIP_RENOIR: 1968 if ((adev->pdev->device == 0x1636) || 1969 (adev->pdev->device == 0x164c)) 1970 adev->apu_flags |= AMD_APU_IS_RENOIR; 1971 else 1972 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 1973 break; 1974 case CHIP_VANGOGH: 1975 adev->apu_flags |= AMD_APU_IS_VANGOGH; 1976 break; 1977 case CHIP_YELLOW_CARP: 1978 break; 1979 case CHIP_CYAN_SKILLFISH: 1980 if ((adev->pdev->device == 0x13FE) || 1981 (adev->pdev->device == 0x143F)) 1982 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 1983 break; 1984 default: 1985 break; 1986 } 1987 1988 return 0; 1989 } 1990 1991 /** 1992 * amdgpu_device_check_arguments - validate module params 1993 * 1994 * @adev: amdgpu_device pointer 1995 * 1996 * Validates certain module parameters and updates 1997 * the associated values used by the driver (all asics). 
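 *
 * Worked example (illustrative, not part of the original comment):
 * amdgpu.sched_jobs=2 is below the minimum and is raised to 4, while
 * amdgpu.sched_jobs=6 is not a power of two and is rounded up to 8.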
1998 */ 1999 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 2000 { 2001 int i; 2002 2003 if (amdgpu_sched_jobs < 4) { 2004 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 2005 amdgpu_sched_jobs); 2006 amdgpu_sched_jobs = 4; 2007 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 2008 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 2009 amdgpu_sched_jobs); 2010 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 2011 } 2012 2013 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 2014 /* gart size must be greater or equal to 32M */ 2015 dev_warn(adev->dev, "gart size (%d) too small\n", 2016 amdgpu_gart_size); 2017 amdgpu_gart_size = -1; 2018 } 2019 2020 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 2021 /* gtt size must be greater or equal to 32M */ 2022 dev_warn(adev->dev, "gtt size (%d) too small\n", 2023 amdgpu_gtt_size); 2024 amdgpu_gtt_size = -1; 2025 } 2026 2027 /* valid range is between 4 and 9 inclusive */ 2028 if (amdgpu_vm_fragment_size != -1 && 2029 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 2030 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 2031 amdgpu_vm_fragment_size = -1; 2032 } 2033 2034 if (amdgpu_sched_hw_submission < 2) { 2035 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 2036 amdgpu_sched_hw_submission); 2037 amdgpu_sched_hw_submission = 2; 2038 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 2039 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 2040 amdgpu_sched_hw_submission); 2041 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 2042 } 2043 2044 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 2045 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 2046 amdgpu_reset_method = -1; 2047 } 2048 2049 amdgpu_device_check_smu_prv_buffer_size(adev); 2050 2051 amdgpu_device_check_vm_size(adev); 2052 2053 amdgpu_device_check_block_size(adev); 2054 2055 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 2056 2057 for (i = 0; i < MAX_XCP; i++) 2058 adev->enforce_isolation[i] = !!enforce_isolation; 2059 2060 return 0; 2061 } 2062 2063 /** 2064 * amdgpu_switcheroo_set_state - set switcheroo state 2065 * 2066 * @pdev: pci dev pointer 2067 * @state: vga_switcheroo state 2068 * 2069 * Callback for the switcheroo driver. Suspends or resumes 2070 * the asics before or after it is powered up using ACPI methods. 
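 *
 * Minimal sketch (assumption; registration is not shown in this excerpt):
 * this callback only runs once the driver has registered its client ops
 * with the vga_switcheroo core, roughly
 *
 *   vga_switcheroo_register_client(pdev, &amdgpu_switcheroo_ops, px);
 *
 * where px indicates whether PX runtime power control is supported.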
2071 */ 2072 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 2073 enum vga_switcheroo_state state) 2074 { 2075 struct drm_device *dev = pci_get_drvdata(pdev); 2076 int r; 2077 2078 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 2079 return; 2080 2081 if (state == VGA_SWITCHEROO_ON) { 2082 pr_info("switched on\n"); 2083 /* don't suspend or resume card normally */ 2084 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2085 2086 pci_set_power_state(pdev, PCI_D0); 2087 amdgpu_device_load_pci_state(pdev); 2088 r = pci_enable_device(pdev); 2089 if (r) 2090 DRM_WARN("pci_enable_device failed (%d)\n", r); 2091 amdgpu_device_resume(dev, true); 2092 2093 dev->switch_power_state = DRM_SWITCH_POWER_ON; 2094 } else { 2095 pr_info("switched off\n"); 2096 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2097 amdgpu_device_prepare(dev); 2098 amdgpu_device_suspend(dev, true); 2099 amdgpu_device_cache_pci_state(pdev); 2100 /* Shut down the device */ 2101 pci_disable_device(pdev); 2102 pci_set_power_state(pdev, PCI_D3cold); 2103 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 2104 } 2105 } 2106 2107 /** 2108 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 2109 * 2110 * @pdev: pci dev pointer 2111 * 2112 * Callback for the switcheroo driver. Check of the switcheroo 2113 * state can be changed. 2114 * Returns true if the state can be changed, false if not. 2115 */ 2116 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 2117 { 2118 struct drm_device *dev = pci_get_drvdata(pdev); 2119 2120 /* 2121 * FIXME: open_count is protected by drm_global_mutex but that would lead to 2122 * locking inversion with the driver load path. And the access here is 2123 * completely racy anyway. So don't bother with locking for now. 2124 */ 2125 return atomic_read(&dev->open_count) == 0; 2126 } 2127 2128 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 2129 .set_gpu_state = amdgpu_switcheroo_set_state, 2130 .reprobe = NULL, 2131 .can_switch = amdgpu_switcheroo_can_switch, 2132 }; 2133 2134 /** 2135 * amdgpu_device_ip_set_clockgating_state - set the CG state 2136 * 2137 * @dev: amdgpu_device pointer 2138 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2139 * @state: clockgating state (gate or ungate) 2140 * 2141 * Sets the requested clockgating state for all instances of 2142 * the hardware IP specified. 2143 * Returns the error code from the last instance. 2144 */ 2145 int amdgpu_device_ip_set_clockgating_state(void *dev, 2146 enum amd_ip_block_type block_type, 2147 enum amd_clockgating_state state) 2148 { 2149 struct amdgpu_device *adev = dev; 2150 int i, r = 0; 2151 2152 for (i = 0; i < adev->num_ip_blocks; i++) { 2153 if (!adev->ip_blocks[i].status.valid) 2154 continue; 2155 if (adev->ip_blocks[i].version->type != block_type) 2156 continue; 2157 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 2158 continue; 2159 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 2160 (void *)adev, state); 2161 if (r) 2162 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 2163 adev->ip_blocks[i].version->funcs->name, r); 2164 } 2165 return r; 2166 } 2167 2168 /** 2169 * amdgpu_device_ip_set_powergating_state - set the PG state 2170 * 2171 * @dev: amdgpu_device pointer 2172 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2173 * @state: powergating state (gate or ungate) 2174 * 2175 * Sets the requested powergating state for all instances of 2176 * the hardware IP specified. 
2177 * Returns the error code from the last instance. 2178 */ 2179 int amdgpu_device_ip_set_powergating_state(void *dev, 2180 enum amd_ip_block_type block_type, 2181 enum amd_powergating_state state) 2182 { 2183 struct amdgpu_device *adev = dev; 2184 int i, r = 0; 2185 2186 for (i = 0; i < adev->num_ip_blocks; i++) { 2187 if (!adev->ip_blocks[i].status.valid) 2188 continue; 2189 if (adev->ip_blocks[i].version->type != block_type) 2190 continue; 2191 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 2192 continue; 2193 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 2194 (void *)adev, state); 2195 if (r) 2196 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 2197 adev->ip_blocks[i].version->funcs->name, r); 2198 } 2199 return r; 2200 } 2201 2202 /** 2203 * amdgpu_device_ip_get_clockgating_state - get the CG state 2204 * 2205 * @adev: amdgpu_device pointer 2206 * @flags: clockgating feature flags 2207 * 2208 * Walks the list of IPs on the device and updates the clockgating 2209 * flags for each IP. 2210 * Updates @flags with the feature flags for each hardware IP where 2211 * clockgating is enabled. 2212 */ 2213 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 2214 u64 *flags) 2215 { 2216 int i; 2217 2218 for (i = 0; i < adev->num_ip_blocks; i++) { 2219 if (!adev->ip_blocks[i].status.valid) 2220 continue; 2221 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 2222 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 2223 } 2224 } 2225 2226 /** 2227 * amdgpu_device_ip_wait_for_idle - wait for idle 2228 * 2229 * @adev: amdgpu_device pointer 2230 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2231 * 2232 * Waits for the request hardware IP to be idle. 2233 * Returns 0 for success or a negative error code on failure. 2234 */ 2235 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 2236 enum amd_ip_block_type block_type) 2237 { 2238 int i, r; 2239 2240 for (i = 0; i < adev->num_ip_blocks; i++) { 2241 if (!adev->ip_blocks[i].status.valid) 2242 continue; 2243 if (adev->ip_blocks[i].version->type == block_type) { 2244 if (adev->ip_blocks[i].version->funcs->wait_for_idle) { 2245 r = adev->ip_blocks[i].version->funcs->wait_for_idle( 2246 &adev->ip_blocks[i]); 2247 if (r) 2248 return r; 2249 } 2250 break; 2251 } 2252 } 2253 return 0; 2254 2255 } 2256 2257 /** 2258 * amdgpu_device_ip_is_valid - is the hardware IP enabled 2259 * 2260 * @adev: amdgpu_device pointer 2261 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2262 * 2263 * Check if the hardware IP is enable or not. 2264 * Returns true if it the IP is enable, false if not. 2265 */ 2266 bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev, 2267 enum amd_ip_block_type block_type) 2268 { 2269 int i; 2270 2271 for (i = 0; i < adev->num_ip_blocks; i++) { 2272 if (adev->ip_blocks[i].version->type == block_type) 2273 return adev->ip_blocks[i].status.valid; 2274 } 2275 return false; 2276 2277 } 2278 2279 /** 2280 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 2281 * 2282 * @adev: amdgpu_device pointer 2283 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 2284 * 2285 * Returns a pointer to the hardware IP block structure 2286 * if it exists for the asic, otherwise NULL. 
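 *
 * Illustrative example (not part of the original documentation): a caller
 * that wants the GFX block's version could do
 *
 *   struct amdgpu_ip_block *ip =
 *           amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *
 *   if (ip)
 *           DRM_INFO("GFX IP v%d.%d\n", ip->version->major,
 *                    ip->version->minor);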
2287 */ 2288 struct amdgpu_ip_block * 2289 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 2290 enum amd_ip_block_type type) 2291 { 2292 int i; 2293 2294 for (i = 0; i < adev->num_ip_blocks; i++) 2295 if (adev->ip_blocks[i].version->type == type) 2296 return &adev->ip_blocks[i]; 2297 2298 return NULL; 2299 } 2300 2301 /** 2302 * amdgpu_device_ip_block_version_cmp 2303 * 2304 * @adev: amdgpu_device pointer 2305 * @type: enum amd_ip_block_type 2306 * @major: major version 2307 * @minor: minor version 2308 * 2309 * return 0 if equal or greater 2310 * return 1 if smaller or the ip_block doesn't exist 2311 */ 2312 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 2313 enum amd_ip_block_type type, 2314 u32 major, u32 minor) 2315 { 2316 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 2317 2318 if (ip_block && ((ip_block->version->major > major) || 2319 ((ip_block->version->major == major) && 2320 (ip_block->version->minor >= minor)))) 2321 return 0; 2322 2323 return 1; 2324 } 2325 2326 /** 2327 * amdgpu_device_ip_block_add 2328 * 2329 * @adev: amdgpu_device pointer 2330 * @ip_block_version: pointer to the IP to add 2331 * 2332 * Adds the IP block driver information to the collection of IPs 2333 * on the asic. 2334 */ 2335 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 2336 const struct amdgpu_ip_block_version *ip_block_version) 2337 { 2338 if (!ip_block_version) 2339 return -EINVAL; 2340 2341 switch (ip_block_version->type) { 2342 case AMD_IP_BLOCK_TYPE_VCN: 2343 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2344 return 0; 2345 break; 2346 case AMD_IP_BLOCK_TYPE_JPEG: 2347 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2348 return 0; 2349 break; 2350 default: 2351 break; 2352 } 2353 2354 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, 2355 ip_block_version->funcs->name); 2356 2357 adev->ip_blocks[adev->num_ip_blocks].adev = adev; 2358 2359 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2360 2361 return 0; 2362 } 2363 2364 /** 2365 * amdgpu_device_enable_virtual_display - enable virtual display feature 2366 * 2367 * @adev: amdgpu_device pointer 2368 * 2369 * Enabled the virtual display feature if the user has enabled it via 2370 * the module parameter virtual_display. This feature provides a virtual 2371 * display hardware on headless boards or in virtualized environments. 2372 * This function parses and validates the configuration string specified by 2373 * the user and configues the virtual display configuration (number of 2374 * virtual connectors, crtcs, etc.) specified. 
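 *
 * Example (inferred from the parsing below; the values are illustrative):
 * amdgpu.virtual_display=0000:03:00.0,2 enables two virtual CRTCs on that
 * PCI device, amdgpu.virtual_display=all,1 enables one virtual CRTC on
 * every amdgpu device, and multiple entries can be separated with ';'.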
2375 */ 2376 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2377 { 2378 adev->enable_virtual_display = false; 2379 2380 if (amdgpu_virtual_display) { 2381 const char *pci_address_name = pci_name(adev->pdev); 2382 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2383 2384 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2385 pciaddstr_tmp = pciaddstr; 2386 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2387 pciaddname = strsep(&pciaddname_tmp, ","); 2388 if (!strcmp("all", pciaddname) 2389 || !strcmp(pci_address_name, pciaddname)) { 2390 long num_crtc; 2391 int res = -1; 2392 2393 adev->enable_virtual_display = true; 2394 2395 if (pciaddname_tmp) 2396 res = kstrtol(pciaddname_tmp, 10, 2397 &num_crtc); 2398 2399 if (!res) { 2400 if (num_crtc < 1) 2401 num_crtc = 1; 2402 if (num_crtc > 6) 2403 num_crtc = 6; 2404 adev->mode_info.num_crtc = num_crtc; 2405 } else { 2406 adev->mode_info.num_crtc = 1; 2407 } 2408 break; 2409 } 2410 } 2411 2412 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2413 amdgpu_virtual_display, pci_address_name, 2414 adev->enable_virtual_display, adev->mode_info.num_crtc); 2415 2416 kfree(pciaddstr); 2417 } 2418 } 2419 2420 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2421 { 2422 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2423 adev->mode_info.num_crtc = 1; 2424 adev->enable_virtual_display = true; 2425 DRM_INFO("virtual_display:%d, num_crtc:%d\n", 2426 adev->enable_virtual_display, adev->mode_info.num_crtc); 2427 } 2428 } 2429 2430 /** 2431 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2432 * 2433 * @adev: amdgpu_device pointer 2434 * 2435 * Parses the asic configuration parameters specified in the gpu info 2436 * firmware and makes them availale to the driver for use in configuring 2437 * the asic. 2438 * Returns 0 on success, -EINVAL on failure. 2439 */ 2440 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2441 { 2442 const char *chip_name; 2443 int err; 2444 const struct gpu_info_firmware_header_v1_0 *hdr; 2445 2446 adev->firmware.gpu_info_fw = NULL; 2447 2448 if (adev->mman.discovery_bin) 2449 return 0; 2450 2451 switch (adev->asic_type) { 2452 default: 2453 return 0; 2454 case CHIP_VEGA10: 2455 chip_name = "vega10"; 2456 break; 2457 case CHIP_VEGA12: 2458 chip_name = "vega12"; 2459 break; 2460 case CHIP_RAVEN: 2461 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2462 chip_name = "raven2"; 2463 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2464 chip_name = "picasso"; 2465 else 2466 chip_name = "raven"; 2467 break; 2468 case CHIP_ARCTURUS: 2469 chip_name = "arcturus"; 2470 break; 2471 case CHIP_NAVI12: 2472 chip_name = "navi12"; 2473 break; 2474 } 2475 2476 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, 2477 "amdgpu/%s_gpu_info.bin", chip_name); 2478 if (err) { 2479 dev_err(adev->dev, 2480 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n", 2481 chip_name); 2482 goto out; 2483 } 2484 2485 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2486 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2487 2488 switch (hdr->version_major) { 2489 case 1: 2490 { 2491 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2492 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2493 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2494 2495 /* 2496 * Should be droped when DAL no longer needs it. 
		 */
		if (adev->asic_type == CHIP_NAVI12)
			goto parse_soc_bounding_box;

		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
		adev->gfx.config.max_texture_channel_caches =
			le32_to_cpu(gpu_info_fw->gc_num_tccs);
		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
		adev->gfx.config.double_offchip_lds_buf =
			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
		adev->gfx.cu_info.max_waves_per_simd =
			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
		adev->gfx.cu_info.max_scratch_slots_per_cu =
			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
		if (hdr->version_minor >= 1) {
			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
					le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->gfx.config.num_sc_per_sh =
				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
			adev->gfx.config.num_packer_per_sc =
				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
		}

parse_soc_bounding_box:
		/*
		 * soc bounding box info is not integrated in discovery table,
		 * we always need to parse it from gpu info firmware if needed.
		 */
		if (hdr->version_minor == 2) {
			const struct gpu_info_firmware_v1_2 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
					le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
		}
		break;
	}
	default:
		dev_err(adev->dev,
			"Unsupported gpu_info table %d\n", hdr->header.ucode_version);
		err = -EINVAL;
		goto out;
	}
out:
	return err;
}

/**
 * amdgpu_device_ip_early_init - run early init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Early initialization pass for hardware IPs. The hardware IPs that make
 * up each asic are discovered and each IP's early_init callback is run. This
 * is the first stage in initializing the asic.
 * Returns 0 on success, negative error code on failure.
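 *
 * Illustrative note (inferred from the code below, not part of the original
 * comment): individual blocks can be masked off with the
 * amdgpu.ip_block_mask module parameter; e.g. a mask of 0xfffffffe marks
 * the IP block at index 0 invalid so the later init phases skip it.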
2561 */ 2562 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2563 { 2564 struct amdgpu_ip_block *ip_block; 2565 struct pci_dev *parent; 2566 int i, r; 2567 bool total; 2568 2569 amdgpu_device_enable_virtual_display(adev); 2570 2571 if (amdgpu_sriov_vf(adev)) { 2572 r = amdgpu_virt_request_full_gpu(adev, true); 2573 if (r) 2574 return r; 2575 } 2576 2577 switch (adev->asic_type) { 2578 #ifdef CONFIG_DRM_AMDGPU_SI 2579 case CHIP_VERDE: 2580 case CHIP_TAHITI: 2581 case CHIP_PITCAIRN: 2582 case CHIP_OLAND: 2583 case CHIP_HAINAN: 2584 adev->family = AMDGPU_FAMILY_SI; 2585 r = si_set_ip_blocks(adev); 2586 if (r) 2587 return r; 2588 break; 2589 #endif 2590 #ifdef CONFIG_DRM_AMDGPU_CIK 2591 case CHIP_BONAIRE: 2592 case CHIP_HAWAII: 2593 case CHIP_KAVERI: 2594 case CHIP_KABINI: 2595 case CHIP_MULLINS: 2596 if (adev->flags & AMD_IS_APU) 2597 adev->family = AMDGPU_FAMILY_KV; 2598 else 2599 adev->family = AMDGPU_FAMILY_CI; 2600 2601 r = cik_set_ip_blocks(adev); 2602 if (r) 2603 return r; 2604 break; 2605 #endif 2606 case CHIP_TOPAZ: 2607 case CHIP_TONGA: 2608 case CHIP_FIJI: 2609 case CHIP_POLARIS10: 2610 case CHIP_POLARIS11: 2611 case CHIP_POLARIS12: 2612 case CHIP_VEGAM: 2613 case CHIP_CARRIZO: 2614 case CHIP_STONEY: 2615 if (adev->flags & AMD_IS_APU) 2616 adev->family = AMDGPU_FAMILY_CZ; 2617 else 2618 adev->family = AMDGPU_FAMILY_VI; 2619 2620 r = vi_set_ip_blocks(adev); 2621 if (r) 2622 return r; 2623 break; 2624 default: 2625 r = amdgpu_discovery_set_ip_blocks(adev); 2626 if (r) 2627 return r; 2628 break; 2629 } 2630 2631 if (amdgpu_has_atpx() && 2632 (amdgpu_is_atpx_hybrid() || 2633 amdgpu_has_atpx_dgpu_power_cntl()) && 2634 ((adev->flags & AMD_IS_APU) == 0) && 2635 !dev_is_removable(&adev->pdev->dev)) 2636 adev->flags |= AMD_IS_PX; 2637 2638 if (!(adev->flags & AMD_IS_APU)) { 2639 parent = pcie_find_root_port(adev->pdev); 2640 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2641 } 2642 2643 2644 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2645 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2646 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2647 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2648 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2649 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2650 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2651 2652 total = true; 2653 for (i = 0; i < adev->num_ip_blocks; i++) { 2654 ip_block = &adev->ip_blocks[i]; 2655 2656 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2657 DRM_WARN("disabled ip block: %d <%s>\n", 2658 i, adev->ip_blocks[i].version->funcs->name); 2659 adev->ip_blocks[i].status.valid = false; 2660 } else if (ip_block->version->funcs->early_init) { 2661 r = ip_block->version->funcs->early_init(ip_block); 2662 if (r == -ENOENT) { 2663 adev->ip_blocks[i].status.valid = false; 2664 } else if (r) { 2665 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2666 adev->ip_blocks[i].version->funcs->name, r); 2667 total = false; 2668 } else { 2669 adev->ip_blocks[i].status.valid = true; 2670 } 2671 } else { 2672 adev->ip_blocks[i].status.valid = true; 2673 } 2674 /* get the vbios after the asic_funcs are set up */ 2675 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2676 r = amdgpu_device_parse_gpu_info_fw(adev); 2677 if (r) 2678 return r; 2679 2680 /* Read BIOS */ 2681 if (amdgpu_device_read_bios(adev)) { 2682 if (!amdgpu_get_bios(adev)) 2683 return -EINVAL; 2684 2685 r = amdgpu_atombios_init(adev); 2686 if (r) { 2687 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2688 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2689 return r; 2690 } 2691 } 2692 2693 /*get pf2vf msg info at it's earliest time*/ 2694 if (amdgpu_sriov_vf(adev)) 2695 amdgpu_virt_init_data_exchange(adev); 2696 2697 } 2698 } 2699 if (!total) 2700 return -ENODEV; 2701 2702 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 2703 if (ip_block->status.valid != false) 2704 amdgpu_amdkfd_device_probe(adev); 2705 2706 adev->cg_flags &= amdgpu_cg_mask; 2707 adev->pg_flags &= amdgpu_pg_mask; 2708 2709 return 0; 2710 } 2711 2712 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2713 { 2714 int i, r; 2715 2716 for (i = 0; i < adev->num_ip_blocks; i++) { 2717 if (!adev->ip_blocks[i].status.sw) 2718 continue; 2719 if (adev->ip_blocks[i].status.hw) 2720 continue; 2721 if (!amdgpu_ip_member_of_hwini( 2722 adev, adev->ip_blocks[i].version->type)) 2723 continue; 2724 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2725 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2726 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2727 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2728 if (r) { 2729 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2730 adev->ip_blocks[i].version->funcs->name, r); 2731 return r; 2732 } 2733 adev->ip_blocks[i].status.hw = true; 2734 } 2735 } 2736 2737 return 0; 2738 } 2739 2740 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2741 { 2742 int i, r; 2743 2744 for (i = 0; i < adev->num_ip_blocks; i++) { 2745 if (!adev->ip_blocks[i].status.sw) 2746 continue; 2747 if (adev->ip_blocks[i].status.hw) 2748 continue; 2749 if (!amdgpu_ip_member_of_hwini( 2750 adev, adev->ip_blocks[i].version->type)) 2751 continue; 2752 r = 
adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2753 if (r) { 2754 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2755 adev->ip_blocks[i].version->funcs->name, r); 2756 return r; 2757 } 2758 adev->ip_blocks[i].status.hw = true; 2759 } 2760 2761 return 0; 2762 } 2763 2764 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2765 { 2766 int r = 0; 2767 int i; 2768 uint32_t smu_version; 2769 2770 if (adev->asic_type >= CHIP_VEGA10) { 2771 for (i = 0; i < adev->num_ip_blocks; i++) { 2772 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2773 continue; 2774 2775 if (!amdgpu_ip_member_of_hwini(adev, 2776 AMD_IP_BLOCK_TYPE_PSP)) 2777 break; 2778 2779 if (!adev->ip_blocks[i].status.sw) 2780 continue; 2781 2782 /* no need to do the fw loading again if already done*/ 2783 if (adev->ip_blocks[i].status.hw == true) 2784 break; 2785 2786 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2787 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 2788 if (r) 2789 return r; 2790 } else { 2791 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2792 if (r) { 2793 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2794 adev->ip_blocks[i].version->funcs->name, r); 2795 return r; 2796 } 2797 adev->ip_blocks[i].status.hw = true; 2798 } 2799 break; 2800 } 2801 } 2802 2803 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2804 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2805 2806 return r; 2807 } 2808 2809 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2810 { 2811 long timeout; 2812 int r, i; 2813 2814 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2815 struct amdgpu_ring *ring = adev->rings[i]; 2816 2817 /* No need to setup the GPU scheduler for rings that don't need it */ 2818 if (!ring || ring->no_scheduler) 2819 continue; 2820 2821 switch (ring->funcs->type) { 2822 case AMDGPU_RING_TYPE_GFX: 2823 timeout = adev->gfx_timeout; 2824 break; 2825 case AMDGPU_RING_TYPE_COMPUTE: 2826 timeout = adev->compute_timeout; 2827 break; 2828 case AMDGPU_RING_TYPE_SDMA: 2829 timeout = adev->sdma_timeout; 2830 break; 2831 default: 2832 timeout = adev->video_timeout; 2833 break; 2834 } 2835 2836 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL, 2837 DRM_SCHED_PRIORITY_COUNT, 2838 ring->num_hw_submission, 0, 2839 timeout, adev->reset_domain->wq, 2840 ring->sched_score, ring->name, 2841 adev->dev); 2842 if (r) { 2843 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2844 ring->name); 2845 return r; 2846 } 2847 r = amdgpu_uvd_entity_init(adev, ring); 2848 if (r) { 2849 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n", 2850 ring->name); 2851 return r; 2852 } 2853 r = amdgpu_vce_entity_init(adev, ring); 2854 if (r) { 2855 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n", 2856 ring->name); 2857 return r; 2858 } 2859 } 2860 2861 amdgpu_xcp_update_partition_sched_list(adev); 2862 2863 return 0; 2864 } 2865 2866 2867 /** 2868 * amdgpu_device_ip_init - run init for hardware IPs 2869 * 2870 * @adev: amdgpu_device pointer 2871 * 2872 * Main initialization pass for hardware IPs. The list of all the hardware 2873 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2874 * are run. sw_init initializes the software state associated with each IP 2875 * and hw_init initializes the hardware associated with each IP. 2876 * Returns 0 on success, negative error code on failure. 
2877 */ 2878 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2879 { 2880 bool init_badpage; 2881 int i, r; 2882 2883 r = amdgpu_ras_init(adev); 2884 if (r) 2885 return r; 2886 2887 for (i = 0; i < adev->num_ip_blocks; i++) { 2888 if (!adev->ip_blocks[i].status.valid) 2889 continue; 2890 if (adev->ip_blocks[i].version->funcs->sw_init) { 2891 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]); 2892 if (r) { 2893 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2894 adev->ip_blocks[i].version->funcs->name, r); 2895 goto init_failed; 2896 } 2897 } 2898 adev->ip_blocks[i].status.sw = true; 2899 2900 if (!amdgpu_ip_member_of_hwini( 2901 adev, adev->ip_blocks[i].version->type)) 2902 continue; 2903 2904 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2905 /* need to do common hw init early so everything is set up for gmc */ 2906 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2907 if (r) { 2908 DRM_ERROR("hw_init %d failed %d\n", i, r); 2909 goto init_failed; 2910 } 2911 adev->ip_blocks[i].status.hw = true; 2912 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2913 /* need to do gmc hw init early so we can allocate gpu mem */ 2914 /* Try to reserve bad pages early */ 2915 if (amdgpu_sriov_vf(adev)) 2916 amdgpu_virt_exchange_data(adev); 2917 2918 r = amdgpu_device_mem_scratch_init(adev); 2919 if (r) { 2920 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2921 goto init_failed; 2922 } 2923 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2924 if (r) { 2925 DRM_ERROR("hw_init %d failed %d\n", i, r); 2926 goto init_failed; 2927 } 2928 r = amdgpu_device_wb_init(adev); 2929 if (r) { 2930 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2931 goto init_failed; 2932 } 2933 adev->ip_blocks[i].status.hw = true; 2934 2935 /* right after GMC hw init, we create CSA */ 2936 if (adev->gfx.mcbp) { 2937 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2938 AMDGPU_GEM_DOMAIN_VRAM | 2939 AMDGPU_GEM_DOMAIN_GTT, 2940 AMDGPU_CSA_SIZE); 2941 if (r) { 2942 DRM_ERROR("allocate CSA failed %d\n", r); 2943 goto init_failed; 2944 } 2945 } 2946 2947 r = amdgpu_seq64_init(adev); 2948 if (r) { 2949 DRM_ERROR("allocate seq64 failed %d\n", r); 2950 goto init_failed; 2951 } 2952 } 2953 } 2954 2955 if (amdgpu_sriov_vf(adev)) 2956 amdgpu_virt_init_data_exchange(adev); 2957 2958 r = amdgpu_ib_pool_init(adev); 2959 if (r) { 2960 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2961 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2962 goto init_failed; 2963 } 2964 2965 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2966 if (r) 2967 goto init_failed; 2968 2969 r = amdgpu_device_ip_hw_init_phase1(adev); 2970 if (r) 2971 goto init_failed; 2972 2973 r = amdgpu_device_fw_loading(adev); 2974 if (r) 2975 goto init_failed; 2976 2977 r = amdgpu_device_ip_hw_init_phase2(adev); 2978 if (r) 2979 goto init_failed; 2980 2981 /* 2982 * retired pages will be loaded from eeprom and reserved here, 2983 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2984 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2985 * for I2C communication which only true at this point. 2986 * 2987 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2988 * failure from bad gpu situation and stop amdgpu init process 2989 * accordingly. 
For other failed cases, it will still release all 2990 * the resource and print error message, rather than returning one 2991 * negative value to upper level. 2992 * 2993 * Note: theoretically, this should be called before all vram allocations 2994 * to protect retired page from abusing 2995 */ 2996 init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 2997 r = amdgpu_ras_recovery_init(adev, init_badpage); 2998 if (r) 2999 goto init_failed; 3000 3001 /** 3002 * In case of XGMI grab extra reference for reset domain for this device 3003 */ 3004 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3005 if (amdgpu_xgmi_add_device(adev) == 0) { 3006 if (!amdgpu_sriov_vf(adev)) { 3007 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3008 3009 if (WARN_ON(!hive)) { 3010 r = -ENOENT; 3011 goto init_failed; 3012 } 3013 3014 if (!hive->reset_domain || 3015 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 3016 r = -ENOENT; 3017 amdgpu_put_xgmi_hive(hive); 3018 goto init_failed; 3019 } 3020 3021 /* Drop the early temporary reset domain we created for device */ 3022 amdgpu_reset_put_reset_domain(adev->reset_domain); 3023 adev->reset_domain = hive->reset_domain; 3024 amdgpu_put_xgmi_hive(hive); 3025 } 3026 } 3027 } 3028 3029 r = amdgpu_device_init_schedulers(adev); 3030 if (r) 3031 goto init_failed; 3032 3033 if (adev->mman.buffer_funcs_ring->sched.ready) 3034 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3035 3036 /* Don't init kfd if whole hive need to be reset during init */ 3037 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 3038 kgd2kfd_init_zone_device(adev); 3039 amdgpu_amdkfd_device_init(adev); 3040 } 3041 3042 amdgpu_fru_get_product_info(adev); 3043 3044 init_failed: 3045 3046 return r; 3047 } 3048 3049 /** 3050 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 3051 * 3052 * @adev: amdgpu_device pointer 3053 * 3054 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 3055 * this function before a GPU reset. If the value is retained after a 3056 * GPU reset, VRAM has not been lost. Some GPU resets may destry VRAM contents. 3057 */ 3058 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 3059 { 3060 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 3061 } 3062 3063 /** 3064 * amdgpu_device_check_vram_lost - check if vram is valid 3065 * 3066 * @adev: amdgpu_device pointer 3067 * 3068 * Checks the reset magic value written to the gart pointer in VRAM. 3069 * The driver calls this after a GPU reset to see if the contents of 3070 * VRAM is lost or now. 3071 * returns true if vram is lost, false if not. 3072 */ 3073 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 3074 { 3075 if (memcmp(adev->gart.ptr, adev->reset_magic, 3076 AMDGPU_RESET_MAGIC_NUM)) 3077 return true; 3078 3079 if (!amdgpu_in_reset(adev)) 3080 return false; 3081 3082 /* 3083 * For all ASICs with baco/mode1 reset, the VRAM is 3084 * always assumed to be lost. 3085 */ 3086 switch (amdgpu_asic_reset_method(adev)) { 3087 case AMD_RESET_METHOD_BACO: 3088 case AMD_RESET_METHOD_MODE1: 3089 return true; 3090 default: 3091 return false; 3092 } 3093 } 3094 3095 /** 3096 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 3097 * 3098 * @adev: amdgpu_device pointer 3099 * @state: clockgating state (gate or ungate) 3100 * 3101 * The list of all the hardware IPs that make up the asic is walked and the 3102 * set_clockgating_state callbacks are run. 
3103 * Late initialization pass enabling clockgating for hardware IPs. 3104 * Fini or suspend, pass disabling clockgating for hardware IPs. 3105 * Returns 0 on success, negative error code on failure. 3106 */ 3107 3108 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 3109 enum amd_clockgating_state state) 3110 { 3111 int i, j, r; 3112 3113 if (amdgpu_emu_mode == 1) 3114 return 0; 3115 3116 for (j = 0; j < adev->num_ip_blocks; j++) { 3117 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3118 if (!adev->ip_blocks[i].status.late_initialized) 3119 continue; 3120 /* skip CG for GFX, SDMA on S0ix */ 3121 if (adev->in_s0ix && 3122 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3123 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3124 continue; 3125 /* skip CG for VCE/UVD, it's handled specially */ 3126 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3127 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3128 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3129 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3130 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 3131 /* enable clockgating to save power */ 3132 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 3133 state); 3134 if (r) { 3135 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 3136 adev->ip_blocks[i].version->funcs->name, r); 3137 return r; 3138 } 3139 } 3140 } 3141 3142 return 0; 3143 } 3144 3145 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 3146 enum amd_powergating_state state) 3147 { 3148 int i, j, r; 3149 3150 if (amdgpu_emu_mode == 1) 3151 return 0; 3152 3153 for (j = 0; j < adev->num_ip_blocks; j++) { 3154 i = state == AMD_PG_STATE_GATE ? 
			j : adev->num_ip_blocks - j - 1;
		if (!adev->ip_blocks[i].status.late_initialized)
			continue;
		/* skip PG for GFX, SDMA on S0ix */
		if (adev->in_s0ix &&
		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
			continue;
		/* skip PG for VCE/UVD, it's handled specially */
		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
		    adev->ip_blocks[i].version->funcs->set_powergating_state) {
			/* enable powergating to save power */
			r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
											state);
			if (r) {
				DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
		}
	}
	return 0;
}

static int amdgpu_device_enable_mgpu_fan_boost(void)
{
	struct amdgpu_gpu_instance *gpu_ins;
	struct amdgpu_device *adev;
	int i, ret = 0;

	mutex_lock(&mgpu_info.mutex);

	/*
	 * MGPU fan boost feature should be enabled
	 * only when there are two or more dGPUs in
	 * the system
	 */
	if (mgpu_info.num_dgpu < 2)
		goto out;

	for (i = 0; i < mgpu_info.num_dgpu; i++) {
		gpu_ins = &(mgpu_info.gpu_ins[i]);
		adev = gpu_ins->adev;
		if (!(adev->flags & AMD_IS_APU) &&
		    !gpu_ins->mgpu_fan_enabled) {
			ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
			if (ret)
				break;

			gpu_ins->mgpu_fan_enabled = 1;
		}
	}

out:
	mutex_unlock(&mgpu_info.mutex);

	return ret;
}

/**
 * amdgpu_device_ip_late_init - run late init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Late initialization pass for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the late_init callbacks are run.
 * late_init covers any special initialization that an IP requires
 * after all of the IPs have been initialized or something that needs to happen
 * late in the init process.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
{
	struct amdgpu_gpu_instance *gpu_instance;
	int i = 0, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->funcs->late_init) {
			r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]);
			if (r) {
				DRM_ERROR("late_init of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
		}
		adev->ip_blocks[i].status.late_initialized = true;
	}

	r = amdgpu_ras_late_init(adev);
	if (r) {
		DRM_ERROR("amdgpu_ras_late_init failed %d", r);
		return r;
	}

	if (!amdgpu_in_reset(adev))
		amdgpu_ras_set_error_query_ready(adev, true);

	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);

	amdgpu_device_fill_reset_magic(adev);

	r = amdgpu_device_enable_mgpu_fan_boost();
	if (r)
		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);

	/* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */
	if (amdgpu_passthrough(adev) &&
	    ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
	     adev->asic_type == CHIP_ALDEBARAN))
		amdgpu_dpm_handle_passthrough_sbr(adev, true);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		mutex_lock(&mgpu_info.mutex);

		/*
		 * Reset device p-state to low as this was booted with high.
		 *
		 * This should be performed only after all devices from the same
		 * hive get initialized.
		 *
		 * However, the number of devices in a hive is not known in
		 * advance; it is counted one by one as each device initializes.
		 *
		 * So we wait until all XGMI interlinked devices have
		 * initialized. This may add some delay as those devices may
		 * come from different hives, but that should be OK.
3286 */ 3287 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3288 for (i = 0; i < mgpu_info.num_gpu; i++) { 3289 gpu_instance = &(mgpu_info.gpu_ins[i]); 3290 if (gpu_instance->adev->flags & AMD_IS_APU) 3291 continue; 3292 3293 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3294 AMDGPU_XGMI_PSTATE_MIN); 3295 if (r) { 3296 DRM_ERROR("pstate setting failed (%d).\n", r); 3297 break; 3298 } 3299 } 3300 } 3301 3302 mutex_unlock(&mgpu_info.mutex); 3303 } 3304 3305 return 0; 3306 } 3307 3308 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block) 3309 { 3310 int r; 3311 3312 if (!ip_block->version->funcs->hw_fini) { 3313 DRM_ERROR("hw_fini of IP block <%s> not defined\n", 3314 ip_block->version->funcs->name); 3315 } else { 3316 r = ip_block->version->funcs->hw_fini(ip_block); 3317 /* XXX handle errors */ 3318 if (r) { 3319 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3320 ip_block->version->funcs->name, r); 3321 } 3322 } 3323 3324 ip_block->status.hw = false; 3325 } 3326 3327 /** 3328 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3329 * 3330 * @adev: amdgpu_device pointer 3331 * 3332 * For ASICs need to disable SMC first 3333 */ 3334 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3335 { 3336 int i; 3337 3338 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3339 return; 3340 3341 for (i = 0; i < adev->num_ip_blocks; i++) { 3342 if (!adev->ip_blocks[i].status.hw) 3343 continue; 3344 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3345 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3346 break; 3347 } 3348 } 3349 } 3350 3351 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3352 { 3353 int i, r; 3354 3355 for (i = 0; i < adev->num_ip_blocks; i++) { 3356 if (!adev->ip_blocks[i].version->funcs->early_fini) 3357 continue; 3358 3359 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]); 3360 if (r) { 3361 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 3362 adev->ip_blocks[i].version->funcs->name, r); 3363 } 3364 } 3365 3366 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3367 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3368 3369 amdgpu_amdkfd_suspend(adev, false); 3370 3371 /* Workaroud for ASICs need to disable SMC first */ 3372 amdgpu_device_smu_fini_early(adev); 3373 3374 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3375 if (!adev->ip_blocks[i].status.hw) 3376 continue; 3377 3378 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3379 } 3380 3381 if (amdgpu_sriov_vf(adev)) { 3382 if (amdgpu_virt_release_full_gpu(adev, false)) 3383 DRM_ERROR("failed to release exclusive mode on fini\n"); 3384 } 3385 3386 return 0; 3387 } 3388 3389 /** 3390 * amdgpu_device_ip_fini - run fini for hardware IPs 3391 * 3392 * @adev: amdgpu_device pointer 3393 * 3394 * Main teardown pass for hardware IPs. The list of all the hardware 3395 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3396 * are run. hw_fini tears down the hardware associated with each IP 3397 * and sw_fini tears down any software state associated with each IP. 3398 * Returns 0 on success, negative error code on failure. 
3399 */ 3400 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3401 { 3402 int i, r; 3403 3404 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3405 amdgpu_virt_release_ras_err_handler_data(adev); 3406 3407 if (adev->gmc.xgmi.num_physical_nodes > 1) 3408 amdgpu_xgmi_remove_device(adev); 3409 3410 amdgpu_amdkfd_device_fini_sw(adev); 3411 3412 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3413 if (!adev->ip_blocks[i].status.sw) 3414 continue; 3415 3416 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3417 amdgpu_ucode_free_bo(adev); 3418 amdgpu_free_static_csa(&adev->virt.csa_obj); 3419 amdgpu_device_wb_fini(adev); 3420 amdgpu_device_mem_scratch_fini(adev); 3421 amdgpu_ib_pool_fini(adev); 3422 amdgpu_seq64_fini(adev); 3423 } 3424 if (adev->ip_blocks[i].version->funcs->sw_fini) { 3425 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]); 3426 /* XXX handle errors */ 3427 if (r) { 3428 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 3429 adev->ip_blocks[i].version->funcs->name, r); 3430 } 3431 } 3432 adev->ip_blocks[i].status.sw = false; 3433 adev->ip_blocks[i].status.valid = false; 3434 } 3435 3436 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3437 if (!adev->ip_blocks[i].status.late_initialized) 3438 continue; 3439 if (adev->ip_blocks[i].version->funcs->late_fini) 3440 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]); 3441 adev->ip_blocks[i].status.late_initialized = false; 3442 } 3443 3444 amdgpu_ras_fini(adev); 3445 3446 return 0; 3447 } 3448 3449 /** 3450 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3451 * 3452 * @work: work_struct. 3453 */ 3454 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3455 { 3456 struct amdgpu_device *adev = 3457 container_of(work, struct amdgpu_device, delayed_init_work.work); 3458 int r; 3459 3460 r = amdgpu_ib_ring_tests(adev); 3461 if (r) 3462 DRM_ERROR("ib ring test failed (%d).\n", r); 3463 } 3464 3465 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3466 { 3467 struct amdgpu_device *adev = 3468 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3469 3470 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3471 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3472 3473 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 3474 adev->gfx.gfx_off_state = true; 3475 } 3476 3477 /** 3478 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3479 * 3480 * @adev: amdgpu_device pointer 3481 * 3482 * Main suspend function for hardware IPs. The list of all the hardware 3483 * IPs that make up the asic is walked, clockgating is disabled and the 3484 * suspend callbacks are run. suspend puts the hardware and software state 3485 * in each IP into a state suitable for suspend. 3486 * Returns 0 on success, negative error code on failure. 3487 */ 3488 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3489 { 3490 int i, r; 3491 3492 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3493 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3494 3495 /* 3496 * Per PMFW team's suggestion, driver needs to handle gfxoff 3497 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3498 * scenario. Add the missing df cstate disablement here. 
3499 */ 3500 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3501 dev_warn(adev->dev, "Failed to disallow df cstate"); 3502 3503 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3504 if (!adev->ip_blocks[i].status.valid) 3505 continue; 3506 3507 /* displays are handled separately */ 3508 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3509 continue; 3510 3511 /* XXX handle errors */ 3512 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3513 if (r) 3514 return r; 3515 } 3516 3517 return 0; 3518 } 3519 3520 /** 3521 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3522 * 3523 * @adev: amdgpu_device pointer 3524 * 3525 * Main suspend function for hardware IPs. The list of all the hardware 3526 * IPs that make up the asic is walked, clockgating is disabled and the 3527 * suspend callbacks are run. suspend puts the hardware and software state 3528 * in each IP into a state suitable for suspend. 3529 * Returns 0 on success, negative error code on failure. 3530 */ 3531 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3532 { 3533 int i, r; 3534 3535 if (adev->in_s0ix) 3536 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3537 3538 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3539 if (!adev->ip_blocks[i].status.valid) 3540 continue; 3541 /* displays are handled in phase1 */ 3542 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3543 continue; 3544 /* PSP lost connection when err_event_athub occurs */ 3545 if (amdgpu_ras_intr_triggered() && 3546 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3547 adev->ip_blocks[i].status.hw = false; 3548 continue; 3549 } 3550 3551 /* skip unnecessary suspend if we do not initialize them yet */ 3552 if (!amdgpu_ip_member_of_hwini( 3553 adev, adev->ip_blocks[i].version->type)) 3554 continue; 3555 3556 /* skip suspend of gfx/mes and psp for S0ix 3557 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3558 * like at runtime. PSP is also part of the always on hardware 3559 * so no need to suspend it. 3560 */ 3561 if (adev->in_s0ix && 3562 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3563 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3564 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3565 continue; 3566 3567 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3568 if (adev->in_s0ix && 3569 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3570 IP_VERSION(5, 0, 0)) && 3571 (adev->ip_blocks[i].version->type == 3572 AMD_IP_BLOCK_TYPE_SDMA)) 3573 continue; 3574 3575 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3576 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3577 * from this location and RLC Autoload automatically also gets loaded 3578 * from here based on PMFW -> PSP message during re-init sequence. 3579 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3580 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3581 */ 3582 if (amdgpu_in_reset(adev) && 3583 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3584 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3585 continue; 3586 3587 /* XXX handle errors */ 3588 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3589 adev->ip_blocks[i].status.hw = false; 3590 3591 /* handle putting the SMC in the appropriate state */ 3592 if (!amdgpu_sriov_vf(adev)) { 3593 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3594 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3595 if (r) { 3596 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3597 adev->mp1_state, r); 3598 return r; 3599 } 3600 } 3601 } 3602 } 3603 3604 return 0; 3605 } 3606 3607 /** 3608 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3609 * 3610 * @adev: amdgpu_device pointer 3611 * 3612 * Main suspend function for hardware IPs. The list of all the hardware 3613 * IPs that make up the asic is walked, clockgating is disabled and the 3614 * suspend callbacks are run. suspend puts the hardware and software state 3615 * in each IP into a state suitable for suspend. 3616 * Returns 0 on success, negative error code on failure. 3617 */ 3618 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3619 { 3620 int r; 3621 3622 if (amdgpu_sriov_vf(adev)) { 3623 amdgpu_virt_fini_data_exchange(adev); 3624 amdgpu_virt_request_full_gpu(adev, false); 3625 } 3626 3627 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3628 3629 r = amdgpu_device_ip_suspend_phase1(adev); 3630 if (r) 3631 return r; 3632 r = amdgpu_device_ip_suspend_phase2(adev); 3633 3634 if (amdgpu_sriov_vf(adev)) 3635 amdgpu_virt_release_full_gpu(adev, false); 3636 3637 return r; 3638 } 3639 3640 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3641 { 3642 int i, r; 3643 3644 static enum amd_ip_block_type ip_order[] = { 3645 AMD_IP_BLOCK_TYPE_COMMON, 3646 AMD_IP_BLOCK_TYPE_GMC, 3647 AMD_IP_BLOCK_TYPE_PSP, 3648 AMD_IP_BLOCK_TYPE_IH, 3649 }; 3650 3651 for (i = 0; i < adev->num_ip_blocks; i++) { 3652 int j; 3653 struct amdgpu_ip_block *block; 3654 3655 block = &adev->ip_blocks[i]; 3656 block->status.hw = false; 3657 3658 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3659 3660 if (block->version->type != ip_order[j] || 3661 !block->status.valid) 3662 continue; 3663 3664 r = block->version->funcs->hw_init(&adev->ip_blocks[i]); 3665 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3666 if (r) 3667 return r; 3668 block->status.hw = true; 3669 } 3670 } 3671 3672 return 0; 3673 } 3674 3675 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3676 { 3677 int i, r; 3678 3679 static enum amd_ip_block_type ip_order[] = { 3680 AMD_IP_BLOCK_TYPE_SMC, 3681 AMD_IP_BLOCK_TYPE_DCE, 3682 AMD_IP_BLOCK_TYPE_GFX, 3683 AMD_IP_BLOCK_TYPE_SDMA, 3684 AMD_IP_BLOCK_TYPE_MES, 3685 AMD_IP_BLOCK_TYPE_UVD, 3686 AMD_IP_BLOCK_TYPE_VCE, 3687 AMD_IP_BLOCK_TYPE_VCN, 3688 AMD_IP_BLOCK_TYPE_JPEG 3689 }; 3690 3691 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3692 int j; 3693 struct amdgpu_ip_block *block; 3694 3695 for (j = 0; j < adev->num_ip_blocks; j++) { 3696 block = &adev->ip_blocks[j]; 3697 3698 if (block->version->type != ip_order[i] || 3699 !block->status.valid || 3700 block->status.hw) 3701 continue; 3702 3703 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) { 3704 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3705 if (r) 3706 return r; 3707 } else { 3708 r = block->version->funcs->hw_init(&adev->ip_blocks[i]); 3709 if (r) { 3710 
DRM_ERROR("hw_init of IP block <%s> failed %d\n", 3711 adev->ip_blocks[i].version->funcs->name, r); 3712 return r; 3713 } 3714 block->status.hw = true; 3715 } 3716 } 3717 } 3718 3719 return 0; 3720 } 3721 3722 /** 3723 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3724 * 3725 * @adev: amdgpu_device pointer 3726 * 3727 * First resume function for hardware IPs. The list of all the hardware 3728 * IPs that make up the asic is walked and the resume callbacks are run for 3729 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3730 * after a suspend and updates the software state as necessary. This 3731 * function is also used for restoring the GPU after a GPU reset. 3732 * Returns 0 on success, negative error code on failure. 3733 */ 3734 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3735 { 3736 int i, r; 3737 3738 for (i = 0; i < adev->num_ip_blocks; i++) { 3739 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3740 continue; 3741 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3742 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3743 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3744 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3745 3746 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3747 if (r) 3748 return r; 3749 } 3750 } 3751 3752 return 0; 3753 } 3754 3755 /** 3756 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3757 * 3758 * @adev: amdgpu_device pointer 3759 * 3760 * First resume function for hardware IPs. The list of all the hardware 3761 * IPs that make up the asic is walked and the resume callbacks are run for 3762 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3763 * functional state after a suspend and updates the software state as 3764 * necessary. This function is also used for restoring the GPU after a GPU 3765 * reset. 3766 * Returns 0 on success, negative error code on failure. 3767 */ 3768 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3769 { 3770 int i, r; 3771 3772 for (i = 0; i < adev->num_ip_blocks; i++) { 3773 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3774 continue; 3775 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3776 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3777 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3778 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3779 continue; 3780 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3781 if (r) 3782 return r; 3783 } 3784 3785 return 0; 3786 } 3787 3788 /** 3789 * amdgpu_device_ip_resume - run resume for hardware IPs 3790 * 3791 * @adev: amdgpu_device pointer 3792 * 3793 * Main resume function for hardware IPs. The hardware IPs 3794 * are split into two resume functions because they are 3795 * also used in recovering from a GPU reset and some additional 3796 * steps need to be take between them. In this case (S3/S4) they are 3797 * run sequentially. 3798 * Returns 0 on success, negative error code on failure. 
3799 */ 3800 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3801 { 3802 int r; 3803 3804 r = amdgpu_device_ip_resume_phase1(adev); 3805 if (r) 3806 return r; 3807 3808 r = amdgpu_device_fw_loading(adev); 3809 if (r) 3810 return r; 3811 3812 r = amdgpu_device_ip_resume_phase2(adev); 3813 3814 if (adev->mman.buffer_funcs_ring->sched.ready) 3815 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3816 3817 return r; 3818 } 3819 3820 /** 3821 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3822 * 3823 * @adev: amdgpu_device pointer 3824 * 3825 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3826 */ 3827 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3828 { 3829 if (amdgpu_sriov_vf(adev)) { 3830 if (adev->is_atom_fw) { 3831 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3832 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3833 } else { 3834 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3835 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3836 } 3837 3838 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3839 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3840 } 3841 } 3842 3843 /** 3844 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3845 * 3846 * @asic_type: AMD asic type 3847 * 3848 * Check if there is DC (new modesetting infrastructre) support for an asic. 3849 * returns true if DC has support, false if not. 3850 */ 3851 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3852 { 3853 switch (asic_type) { 3854 #ifdef CONFIG_DRM_AMDGPU_SI 3855 case CHIP_HAINAN: 3856 #endif 3857 case CHIP_TOPAZ: 3858 /* chips with no display hardware */ 3859 return false; 3860 #if defined(CONFIG_DRM_AMD_DC) 3861 case CHIP_TAHITI: 3862 case CHIP_PITCAIRN: 3863 case CHIP_VERDE: 3864 case CHIP_OLAND: 3865 /* 3866 * We have systems in the wild with these ASICs that require 3867 * LVDS and VGA support which is not supported with DC. 3868 * 3869 * Fallback to the non-DC driver here by default so as not to 3870 * cause regressions. 3871 */ 3872 #if defined(CONFIG_DRM_AMD_DC_SI) 3873 return amdgpu_dc > 0; 3874 #else 3875 return false; 3876 #endif 3877 case CHIP_BONAIRE: 3878 case CHIP_KAVERI: 3879 case CHIP_KABINI: 3880 case CHIP_MULLINS: 3881 /* 3882 * We have systems in the wild with these ASICs that require 3883 * VGA support which is not supported with DC. 3884 * 3885 * Fallback to the non-DC driver here by default so as not to 3886 * cause regressions. 
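 *
 * Users who do not depend on VGA can still opt in explicitly via the
 * module parameter, e.g. by booting with:
 *
 *   amdgpu.dc=1
 *
 * which makes the "amdgpu_dc > 0" check below select DC, while the
 * default auto setting (amdgpu_dc == -1) keeps the legacy display path.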
3887 */ 3888 return amdgpu_dc > 0; 3889 default: 3890 return amdgpu_dc != 0; 3891 #else 3892 default: 3893 if (amdgpu_dc > 0) 3894 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 3895 return false; 3896 #endif 3897 } 3898 } 3899 3900 /** 3901 * amdgpu_device_has_dc_support - check if dc is supported 3902 * 3903 * @adev: amdgpu_device pointer 3904 * 3905 * Returns true for supported, false for not supported 3906 */ 3907 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3908 { 3909 if (adev->enable_virtual_display || 3910 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3911 return false; 3912 3913 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3914 } 3915 3916 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3917 { 3918 struct amdgpu_device *adev = 3919 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3920 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3921 3922 /* It's a bug to not have a hive within this function */ 3923 if (WARN_ON(!hive)) 3924 return; 3925 3926 /* 3927 * Use task barrier to synchronize all xgmi reset works across the 3928 * hive. task_barrier_enter and task_barrier_exit will block 3929 * until all the threads running the xgmi reset works reach 3930 * those points. task_barrier_full will do both blocks. 3931 */ 3932 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3933 3934 task_barrier_enter(&hive->tb); 3935 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3936 3937 if (adev->asic_reset_res) 3938 goto fail; 3939 3940 task_barrier_exit(&hive->tb); 3941 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3942 3943 if (adev->asic_reset_res) 3944 goto fail; 3945 3946 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 3947 } else { 3948 3949 task_barrier_full(&hive->tb); 3950 adev->asic_reset_res = amdgpu_asic_reset(adev); 3951 } 3952 3953 fail: 3954 if (adev->asic_reset_res) 3955 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3956 adev->asic_reset_res, adev_to_drm(adev)->unique); 3957 amdgpu_put_xgmi_hive(hive); 3958 } 3959 3960 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3961 { 3962 char *input = amdgpu_lockup_timeout; 3963 char *timeout_setting = NULL; 3964 int index = 0; 3965 long timeout; 3966 int ret = 0; 3967 3968 /* 3969 * By default timeout for non compute jobs is 10000 3970 * and 60000 for compute jobs. 3971 * In SR-IOV or passthrough mode, timeout for compute 3972 * jobs are 60000 by default. 3973 */ 3974 adev->gfx_timeout = msecs_to_jiffies(10000); 3975 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3976 if (amdgpu_sriov_vf(adev)) 3977 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3978 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3979 else 3980 adev->compute_timeout = msecs_to_jiffies(60000); 3981 3982 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3983 while ((timeout_setting = strsep(&input, ",")) && 3984 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3985 ret = kstrtol(timeout_setting, 0, &timeout); 3986 if (ret) 3987 return ret; 3988 3989 if (timeout == 0) { 3990 index++; 3991 continue; 3992 } else if (timeout < 0) { 3993 timeout = MAX_SCHEDULE_TIMEOUT; 3994 dev_warn(adev->dev, "lockup timeout disabled"); 3995 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3996 } else { 3997 timeout = msecs_to_jiffies(timeout); 3998 } 3999 4000 switch (index++) { 4001 case 0: 4002 adev->gfx_timeout = timeout; 4003 break; 4004 case 1: 4005 adev->compute_timeout = timeout; 4006 break; 4007 case 2: 4008 adev->sdma_timeout = timeout; 4009 break; 4010 case 3: 4011 adev->video_timeout = timeout; 4012 break; 4013 default: 4014 break; 4015 } 4016 } 4017 /* 4018 * There is only one value specified and 4019 * it should apply to all non-compute jobs. 4020 */ 4021 if (index == 1) { 4022 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4023 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 4024 adev->compute_timeout = adev->gfx_timeout; 4025 } 4026 } 4027 4028 return ret; 4029 } 4030 4031 /** 4032 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 4033 * 4034 * @adev: amdgpu_device pointer 4035 * 4036 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 4037 */ 4038 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 4039 { 4040 struct iommu_domain *domain; 4041 4042 domain = iommu_get_domain_for_dev(adev->dev); 4043 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 4044 adev->ram_is_direct_mapped = true; 4045 } 4046 4047 #if defined(CONFIG_HSA_AMD_P2P) 4048 /** 4049 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled. 4050 * 4051 * @adev: amdgpu_device pointer 4052 * 4053 * return if IOMMU remapping bar address 4054 */ 4055 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev) 4056 { 4057 struct iommu_domain *domain; 4058 4059 domain = iommu_get_domain_for_dev(adev->dev); 4060 if (domain && (domain->type == IOMMU_DOMAIN_DMA || 4061 domain->type == IOMMU_DOMAIN_DMA_FQ)) 4062 return true; 4063 4064 return false; 4065 } 4066 #endif 4067 4068 static const struct attribute *amdgpu_dev_attributes[] = { 4069 &dev_attr_pcie_replay_count.attr, 4070 NULL 4071 }; 4072 4073 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 4074 { 4075 if (amdgpu_mcbp == 1) 4076 adev->gfx.mcbp = true; 4077 else if (amdgpu_mcbp == 0) 4078 adev->gfx.mcbp = false; 4079 4080 if (amdgpu_sriov_vf(adev)) 4081 adev->gfx.mcbp = true; 4082 4083 if (adev->gfx.mcbp) 4084 DRM_INFO("MCBP is enabled\n"); 4085 } 4086 4087 /** 4088 * amdgpu_device_init - initialize the driver 4089 * 4090 * @adev: amdgpu_device pointer 4091 * @flags: driver flags 4092 * 4093 * Initializes the driver info and hw (all asics). 4094 * Returns 0 for success or an error on failure. 4095 * Called at driver startup. 
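 *
 * Minimal sketch of the expected call site (illustrative only; the real
 * probe path lives in the PCI/KMS glue code):
 *
 *   adev->dev = &pdev->dev;
 *   adev->pdev = pdev;
 *   r = amdgpu_device_init(adev, flags);
 *   if (r)
 *           return r;
 *
 * where @flags carries the ASIC type (AMD_ASIC_MASK) plus feature bits
 * such as AMD_IS_APU, as decoded at the top of this function.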
4096 */ 4097 int amdgpu_device_init(struct amdgpu_device *adev, 4098 uint32_t flags) 4099 { 4100 struct drm_device *ddev = adev_to_drm(adev); 4101 struct pci_dev *pdev = adev->pdev; 4102 int r, i; 4103 bool px = false; 4104 u32 max_MBps; 4105 int tmp; 4106 4107 adev->shutdown = false; 4108 adev->flags = flags; 4109 4110 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 4111 adev->asic_type = amdgpu_force_asic_type; 4112 else 4113 adev->asic_type = flags & AMD_ASIC_MASK; 4114 4115 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 4116 if (amdgpu_emu_mode == 1) 4117 adev->usec_timeout *= 10; 4118 adev->gmc.gart_size = 512 * 1024 * 1024; 4119 adev->accel_working = false; 4120 adev->num_rings = 0; 4121 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 4122 adev->mman.buffer_funcs = NULL; 4123 adev->mman.buffer_funcs_ring = NULL; 4124 adev->vm_manager.vm_pte_funcs = NULL; 4125 adev->vm_manager.vm_pte_num_scheds = 0; 4126 adev->gmc.gmc_funcs = NULL; 4127 adev->harvest_ip_mask = 0x0; 4128 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 4129 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 4130 4131 adev->smc_rreg = &amdgpu_invalid_rreg; 4132 adev->smc_wreg = &amdgpu_invalid_wreg; 4133 adev->pcie_rreg = &amdgpu_invalid_rreg; 4134 adev->pcie_wreg = &amdgpu_invalid_wreg; 4135 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 4136 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 4137 adev->pciep_rreg = &amdgpu_invalid_rreg; 4138 adev->pciep_wreg = &amdgpu_invalid_wreg; 4139 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 4140 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 4141 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 4142 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 4143 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 4144 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 4145 adev->didt_rreg = &amdgpu_invalid_rreg; 4146 adev->didt_wreg = &amdgpu_invalid_wreg; 4147 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 4148 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 4149 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 4150 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 4151 4152 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 4153 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 4154 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 4155 4156 /* mutex initialization are all done here so we 4157 * can recall function without having locking issues 4158 */ 4159 mutex_init(&adev->firmware.mutex); 4160 mutex_init(&adev->pm.mutex); 4161 mutex_init(&adev->gfx.gpu_clock_mutex); 4162 mutex_init(&adev->srbm_mutex); 4163 mutex_init(&adev->gfx.pipe_reserve_mutex); 4164 mutex_init(&adev->gfx.gfx_off_mutex); 4165 mutex_init(&adev->gfx.partition_mutex); 4166 mutex_init(&adev->grbm_idx_mutex); 4167 mutex_init(&adev->mn_lock); 4168 mutex_init(&adev->virt.vf_errors.lock); 4169 mutex_init(&adev->virt.rlcg_reg_lock); 4170 hash_init(adev->mn_hash); 4171 mutex_init(&adev->psp.mutex); 4172 mutex_init(&adev->notifier_lock); 4173 mutex_init(&adev->pm.stable_pstate_ctx_lock); 4174 mutex_init(&adev->benchmark_mutex); 4175 mutex_init(&adev->gfx.reset_sem_mutex); 4176 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */ 4177 mutex_init(&adev->enforce_isolation_mutex); 4178 mutex_init(&adev->gfx.kfd_sch_mutex); 4179 4180 amdgpu_device_init_apu_flags(adev); 4181 4182 r = amdgpu_device_check_arguments(adev); 4183 if (r) 4184 return r; 4185 4186 
spin_lock_init(&adev->mmio_idx_lock); 4187 spin_lock_init(&adev->smc_idx_lock); 4188 spin_lock_init(&adev->pcie_idx_lock); 4189 spin_lock_init(&adev->uvd_ctx_idx_lock); 4190 spin_lock_init(&adev->didt_idx_lock); 4191 spin_lock_init(&adev->gc_cac_idx_lock); 4192 spin_lock_init(&adev->se_cac_idx_lock); 4193 spin_lock_init(&adev->audio_endpt_idx_lock); 4194 spin_lock_init(&adev->mm_stats.lock); 4195 spin_lock_init(&adev->wb.lock); 4196 4197 INIT_LIST_HEAD(&adev->reset_list); 4198 4199 INIT_LIST_HEAD(&adev->ras_list); 4200 4201 INIT_LIST_HEAD(&adev->pm.od_kobj_list); 4202 4203 INIT_DELAYED_WORK(&adev->delayed_init_work, 4204 amdgpu_device_delayed_init_work_handler); 4205 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 4206 amdgpu_device_delay_enable_gfx_off); 4207 /* 4208 * Initialize the enforce_isolation work structures for each XCP 4209 * partition. This work handler is responsible for enforcing shader 4210 * isolation on AMD GPUs. It counts the number of emitted fences for 4211 * each GFX and compute ring. If there are any fences, it schedules 4212 * the `enforce_isolation_work` to be run after a delay. If there are 4213 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the 4214 * runqueue. 4215 */ 4216 for (i = 0; i < MAX_XCP; i++) { 4217 INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work, 4218 amdgpu_gfx_enforce_isolation_handler); 4219 adev->gfx.enforce_isolation[i].adev = adev; 4220 adev->gfx.enforce_isolation[i].xcp_id = i; 4221 } 4222 4223 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 4224 4225 adev->gfx.gfx_off_req_count = 1; 4226 adev->gfx.gfx_off_residency = 0; 4227 adev->gfx.gfx_off_entrycount = 0; 4228 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 4229 4230 atomic_set(&adev->throttling_logging_enabled, 1); 4231 /* 4232 * If throttling continues, logging will be performed every minute 4233 * to avoid log flooding. "-1" is subtracted since the thermal 4234 * throttling interrupt comes every second. Thus, the total logging 4235 * interval is 59 seconds(retelimited printk interval) + 1(waiting 4236 * for throttling interrupt) = 60 seconds. 4237 */ 4238 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 4239 ratelimit_state_init(&adev->virt.ras_telemetry_rs, 5 * HZ, 1); 4240 4241 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 4242 ratelimit_set_flags(&adev->virt.ras_telemetry_rs, RATELIMIT_MSG_ON_RELEASE); 4243 4244 /* Registers mapping */ 4245 /* TODO: block userspace mapping of io register */ 4246 if (adev->asic_type >= CHIP_BONAIRE) { 4247 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 4248 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 4249 } else { 4250 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 4251 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 4252 } 4253 4254 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 4255 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 4256 4257 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 4258 if (!adev->rmmio) 4259 return -ENOMEM; 4260 4261 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 4262 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size); 4263 4264 /* 4265 * Reset domain needs to be present early, before XGMI hive discovered 4266 * (if any) and intitialized to use reset sem and in_gpu reset flag 4267 * early on during init and before calling to RREG32. 
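 *
 * As an illustration (hedged sketch of the register accessors elsewhere
 * in this file): indirect register reads under SR-IOV take the domain's
 * recovery semaphore, roughly
 *
 *   if (down_read_trylock(&adev->reset_domain->sem)) {
 *           ret = amdgpu_kiq_rreg(adev, reg, 0);
 *           up_read(&adev->reset_domain->sem);
 *   }
 *
 * so the domain and its semaphore must exist before the first access.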
4268 */ 4269 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 4270 if (!adev->reset_domain) 4271 return -ENOMEM; 4272 4273 /* detect hw virtualization here */ 4274 amdgpu_detect_virtualization(adev); 4275 4276 amdgpu_device_get_pcie_info(adev); 4277 4278 r = amdgpu_device_get_job_timeout_settings(adev); 4279 if (r) { 4280 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 4281 return r; 4282 } 4283 4284 amdgpu_device_set_mcbp(adev); 4285 4286 /* 4287 * By default, use default mode where all blocks are expected to be 4288 * initialized. At present a 'swinit' of blocks is required to be 4289 * completed before the need for a different level is detected. 4290 */ 4291 amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT); 4292 /* early init functions */ 4293 r = amdgpu_device_ip_early_init(adev); 4294 if (r) 4295 return r; 4296 4297 /* Get rid of things like offb */ 4298 r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name); 4299 if (r) 4300 return r; 4301 4302 /* Enable TMZ based on IP_VERSION */ 4303 amdgpu_gmc_tmz_set(adev); 4304 4305 if (amdgpu_sriov_vf(adev) && 4306 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0)) 4307 /* VF MMIO access (except mailbox range) from CPU 4308 * will be blocked during sriov runtime 4309 */ 4310 adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT; 4311 4312 amdgpu_gmc_noretry_set(adev); 4313 /* Need to get xgmi info early to decide the reset behavior*/ 4314 if (adev->gmc.xgmi.supported) { 4315 r = adev->gfxhub.funcs->get_xgmi_info(adev); 4316 if (r) 4317 return r; 4318 } 4319 4320 /* enable PCIE atomic ops */ 4321 if (amdgpu_sriov_vf(adev)) { 4322 if (adev->virt.fw_reserve.p_pf2vf) 4323 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 4324 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 4325 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4326 /* APUs w/ gfx9 onwards doesn't reply on PCIe atomics, rather it is a 4327 * internal path natively support atomics, set have_atomics_support to true. 4328 */ 4329 } else if ((adev->flags & AMD_IS_APU) && 4330 (amdgpu_ip_version(adev, GC_HWIP, 0) > 4331 IP_VERSION(9, 0, 0))) { 4332 adev->have_atomics_support = true; 4333 } else { 4334 adev->have_atomics_support = 4335 !pci_enable_atomic_ops_to_root(adev->pdev, 4336 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 4337 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4338 } 4339 4340 if (!adev->have_atomics_support) 4341 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 4342 4343 /* doorbell bar mapping and doorbell index init*/ 4344 amdgpu_doorbell_init(adev); 4345 4346 if (amdgpu_emu_mode == 1) { 4347 /* post the asic on emulation mode */ 4348 emu_soc_asic_init(adev); 4349 goto fence_driver_init; 4350 } 4351 4352 amdgpu_reset_init(adev); 4353 4354 /* detect if we are with an SRIOV vbios */ 4355 if (adev->bios) 4356 amdgpu_device_detect_sriov_bios(adev); 4357 4358 /* check if we need to reset the asic 4359 * E.g., driver was not cleanly unloaded previously, etc. 
4360 */ 4361 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4362 if (adev->gmc.xgmi.num_physical_nodes) { 4363 dev_info(adev->dev, "Pending hive reset.\n"); 4364 amdgpu_set_init_level(adev, 4365 AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 4366 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && 4367 !amdgpu_device_has_display_hardware(adev)) { 4368 r = psp_gpu_reset(adev); 4369 } else { 4370 tmp = amdgpu_reset_method; 4371 /* It should do a default reset when loading or reloading the driver, 4372 * regardless of the module parameter reset_method. 4373 */ 4374 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4375 r = amdgpu_asic_reset(adev); 4376 amdgpu_reset_method = tmp; 4377 } 4378 4379 if (r) { 4380 dev_err(adev->dev, "asic reset on init failed\n"); 4381 goto failed; 4382 } 4383 } 4384 4385 /* Post card if necessary */ 4386 if (amdgpu_device_need_post(adev)) { 4387 if (!adev->bios) { 4388 dev_err(adev->dev, "no vBIOS found\n"); 4389 r = -EINVAL; 4390 goto failed; 4391 } 4392 DRM_INFO("GPU posting now...\n"); 4393 r = amdgpu_device_asic_init(adev); 4394 if (r) { 4395 dev_err(adev->dev, "gpu post error!\n"); 4396 goto failed; 4397 } 4398 } 4399 4400 if (adev->bios) { 4401 if (adev->is_atom_fw) { 4402 /* Initialize clocks */ 4403 r = amdgpu_atomfirmware_get_clock_info(adev); 4404 if (r) { 4405 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4406 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4407 goto failed; 4408 } 4409 } else { 4410 /* Initialize clocks */ 4411 r = amdgpu_atombios_get_clock_info(adev); 4412 if (r) { 4413 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4414 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4415 goto failed; 4416 } 4417 /* init i2c buses */ 4418 if (!amdgpu_device_has_dc_support(adev)) 4419 amdgpu_atombios_i2c_init(adev); 4420 } 4421 } 4422 4423 fence_driver_init: 4424 /* Fence driver */ 4425 r = amdgpu_fence_driver_sw_init(adev); 4426 if (r) { 4427 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4428 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4429 goto failed; 4430 } 4431 4432 /* init the mode config */ 4433 drm_mode_config_init(adev_to_drm(adev)); 4434 4435 r = amdgpu_device_ip_init(adev); 4436 if (r) { 4437 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4438 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4439 goto release_ras_con; 4440 } 4441 4442 amdgpu_fence_driver_hw_init(adev); 4443 4444 dev_info(adev->dev, 4445 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4446 adev->gfx.config.max_shader_engines, 4447 adev->gfx.config.max_sh_per_se, 4448 adev->gfx.config.max_cu_per_sh, 4449 adev->gfx.cu_info.number); 4450 4451 adev->accel_working = true; 4452 4453 amdgpu_vm_check_compute_bug(adev); 4454 4455 /* Initialize the buffer migration limit. */ 4456 if (amdgpu_moverate >= 0) 4457 max_MBps = amdgpu_moverate; 4458 else 4459 max_MBps = 8; /* Allow 8 MB/s. */ 4460 /* Get a log2 for easy divisions. */ 4461 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4462 4463 /* 4464 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4465 * Otherwise the mgpu fan boost feature will be skipped due to the 4466 * gpu instance is counted less. 4467 */ 4468 amdgpu_register_gpu_instance(adev); 4469 4470 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4471 * explicit gating rather than handling it automatically. 
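 * Concretely, amdgpu_device_ip_late_init() below is (at the time of
 * writing) what ends up calling amdgpu_device_set_cg_state() and
 * amdgpu_device_set_pg_state() with AMD_CG_STATE_GATE/AMD_PG_STATE_GATE.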
4472 */ 4473 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 4474 r = amdgpu_device_ip_late_init(adev); 4475 if (r) { 4476 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4477 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4478 goto release_ras_con; 4479 } 4480 /* must succeed. */ 4481 amdgpu_ras_resume(adev); 4482 queue_delayed_work(system_wq, &adev->delayed_init_work, 4483 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4484 } 4485 4486 if (amdgpu_sriov_vf(adev)) { 4487 amdgpu_virt_release_full_gpu(adev, true); 4488 flush_delayed_work(&adev->delayed_init_work); 4489 } 4490 4491 /* 4492 * Place those sysfs registering after `late_init`. As some of those 4493 * operations performed in `late_init` might affect the sysfs 4494 * interfaces creating. 4495 */ 4496 r = amdgpu_atombios_sysfs_init(adev); 4497 if (r) 4498 drm_err(&adev->ddev, 4499 "registering atombios sysfs failed (%d).\n", r); 4500 4501 r = amdgpu_pm_sysfs_init(adev); 4502 if (r) 4503 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 4504 4505 r = amdgpu_ucode_sysfs_init(adev); 4506 if (r) { 4507 adev->ucode_sysfs_en = false; 4508 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 4509 } else 4510 adev->ucode_sysfs_en = true; 4511 4512 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 4513 if (r) 4514 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4515 4516 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4517 if (r) 4518 dev_err(adev->dev, 4519 "Could not create amdgpu board attributes\n"); 4520 4521 amdgpu_fru_sysfs_init(adev); 4522 amdgpu_reg_state_sysfs_init(adev); 4523 amdgpu_xcp_cfg_sysfs_init(adev); 4524 4525 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4526 r = amdgpu_pmu_init(adev); 4527 if (r) 4528 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4529 4530 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4531 if (amdgpu_device_cache_pci_state(adev->pdev)) 4532 pci_restore_state(pdev); 4533 4534 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4535 /* this will fail for cards that aren't VGA class devices, just 4536 * ignore it 4537 */ 4538 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4539 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4540 4541 px = amdgpu_device_supports_px(ddev); 4542 4543 if (px || (!dev_is_removable(&adev->pdev->dev) && 4544 apple_gmux_detect(NULL, NULL))) 4545 vga_switcheroo_register_client(adev->pdev, 4546 &amdgpu_switcheroo_ops, px); 4547 4548 if (px) 4549 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4550 4551 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 4552 amdgpu_xgmi_reset_on_init(adev); 4553 4554 amdgpu_device_check_iommu_direct_map(adev); 4555 4556 return 0; 4557 4558 release_ras_con: 4559 if (amdgpu_sriov_vf(adev)) 4560 amdgpu_virt_release_full_gpu(adev, true); 4561 4562 /* failed in exclusive mode due to timeout */ 4563 if (amdgpu_sriov_vf(adev) && 4564 !amdgpu_sriov_runtime(adev) && 4565 amdgpu_virt_mmio_blocked(adev) && 4566 !amdgpu_virt_wait_reset(adev)) { 4567 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4568 /* Don't send request since VF is inactive. 
*/ 4569 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4570 adev->virt.ops = NULL; 4571 r = -EAGAIN; 4572 } 4573 amdgpu_release_ras_context(adev); 4574 4575 failed: 4576 amdgpu_vf_error_trans_all(adev); 4577 4578 return r; 4579 } 4580 4581 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4582 { 4583 4584 /* Clear all CPU mappings pointing to this device */ 4585 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4586 4587 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4588 amdgpu_doorbell_fini(adev); 4589 4590 iounmap(adev->rmmio); 4591 adev->rmmio = NULL; 4592 if (adev->mman.aper_base_kaddr) 4593 iounmap(adev->mman.aper_base_kaddr); 4594 adev->mman.aper_base_kaddr = NULL; 4595 4596 /* Memory manager related */ 4597 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4598 arch_phys_wc_del(adev->gmc.vram_mtrr); 4599 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4600 } 4601 } 4602 4603 /** 4604 * amdgpu_device_fini_hw - tear down the driver 4605 * 4606 * @adev: amdgpu_device pointer 4607 * 4608 * Tear down the driver info (all asics). 4609 * Called at driver shutdown. 4610 */ 4611 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4612 { 4613 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4614 flush_delayed_work(&adev->delayed_init_work); 4615 4616 if (adev->mman.initialized) 4617 drain_workqueue(adev->mman.bdev.wq); 4618 adev->shutdown = true; 4619 4620 /* make sure IB test finished before entering exclusive mode 4621 * to avoid preemption on IB test 4622 */ 4623 if (amdgpu_sriov_vf(adev)) { 4624 amdgpu_virt_request_full_gpu(adev, false); 4625 amdgpu_virt_fini_data_exchange(adev); 4626 } 4627 4628 /* disable all interrupts */ 4629 amdgpu_irq_disable_all(adev); 4630 if (adev->mode_info.mode_config_initialized) { 4631 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4632 drm_helper_force_disable_all(adev_to_drm(adev)); 4633 else 4634 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4635 } 4636 amdgpu_fence_driver_hw_fini(adev); 4637 4638 if (adev->pm.sysfs_initialized) 4639 amdgpu_pm_sysfs_fini(adev); 4640 if (adev->ucode_sysfs_en) 4641 amdgpu_ucode_sysfs_fini(adev); 4642 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4643 amdgpu_fru_sysfs_fini(adev); 4644 4645 amdgpu_reg_state_sysfs_fini(adev); 4646 amdgpu_xcp_cfg_sysfs_fini(adev); 4647 4648 /* disable ras feature must before hw fini */ 4649 amdgpu_ras_pre_fini(adev); 4650 4651 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4652 4653 amdgpu_device_ip_fini_early(adev); 4654 4655 amdgpu_irq_fini_hw(adev); 4656 4657 if (adev->mman.initialized) 4658 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4659 4660 amdgpu_gart_dummy_page_fini(adev); 4661 4662 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4663 amdgpu_device_unmap_mmio(adev); 4664 4665 } 4666 4667 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4668 { 4669 int idx; 4670 bool px; 4671 4672 amdgpu_fence_driver_sw_fini(adev); 4673 amdgpu_device_ip_fini(adev); 4674 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4675 adev->accel_working = false; 4676 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4677 4678 amdgpu_reset_fini(adev); 4679 4680 /* free i2c buses */ 4681 if (!amdgpu_device_has_dc_support(adev)) 4682 amdgpu_i2c_fini(adev); 4683 4684 if (amdgpu_emu_mode != 1) 4685 amdgpu_atombios_fini(adev); 4686 4687 kfree(adev->bios); 4688 adev->bios = NULL; 4689 4690 kfree(adev->fru_info); 4691 adev->fru_info = NULL; 4692 4693 px = 
amdgpu_device_supports_px(adev_to_drm(adev)); 4694 4695 if (px || (!dev_is_removable(&adev->pdev->dev) && 4696 apple_gmux_detect(NULL, NULL))) 4697 vga_switcheroo_unregister_client(adev->pdev); 4698 4699 if (px) 4700 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4701 4702 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4703 vga_client_unregister(adev->pdev); 4704 4705 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4706 4707 iounmap(adev->rmmio); 4708 adev->rmmio = NULL; 4709 amdgpu_doorbell_fini(adev); 4710 drm_dev_exit(idx); 4711 } 4712 4713 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4714 amdgpu_pmu_fini(adev); 4715 if (adev->mman.discovery_bin) 4716 amdgpu_discovery_fini(adev); 4717 4718 amdgpu_reset_put_reset_domain(adev->reset_domain); 4719 adev->reset_domain = NULL; 4720 4721 kfree(adev->pci_state); 4722 4723 } 4724 4725 /** 4726 * amdgpu_device_evict_resources - evict device resources 4727 * @adev: amdgpu device object 4728 * 4729 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4730 * of the vram memory type. Mainly used for evicting device resources 4731 * at suspend time. 4732 * 4733 */ 4734 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4735 { 4736 int ret; 4737 4738 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4739 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4740 return 0; 4741 4742 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4743 if (ret) 4744 DRM_WARN("evicting device resources failed\n"); 4745 return ret; 4746 } 4747 4748 /* 4749 * Suspend & resume. 4750 */ 4751 /** 4752 * amdgpu_device_prepare - prepare for device suspend 4753 * 4754 * @dev: drm dev pointer 4755 * 4756 * Prepare to put the hw in the suspend state (all asics). 4757 * Returns 0 for success or an error on failure. 4758 * Called at driver suspend. 4759 */ 4760 int amdgpu_device_prepare(struct drm_device *dev) 4761 { 4762 struct amdgpu_device *adev = drm_to_adev(dev); 4763 int i, r; 4764 4765 amdgpu_choose_low_power_state(adev); 4766 4767 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4768 return 0; 4769 4770 /* Evict the majority of BOs before starting suspend sequence */ 4771 r = amdgpu_device_evict_resources(adev); 4772 if (r) 4773 goto unprepare; 4774 4775 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 4776 4777 for (i = 0; i < adev->num_ip_blocks; i++) { 4778 if (!adev->ip_blocks[i].status.valid) 4779 continue; 4780 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 4781 continue; 4782 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]); 4783 if (r) 4784 goto unprepare; 4785 } 4786 4787 return 0; 4788 4789 unprepare: 4790 adev->in_s0ix = adev->in_s3 = false; 4791 4792 return r; 4793 } 4794 4795 /** 4796 * amdgpu_device_suspend - initiate device suspend 4797 * 4798 * @dev: drm dev pointer 4799 * @notify_clients: notify in-kernel DRM clients 4800 * 4801 * Puts the hw in the suspend state (all asics). 4802 * Returns 0 for success or an error on failure. 4803 * Called at driver suspend. 
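 *
 * Trimmed, illustrative caller sketch (the real PM callbacks live in
 * amdgpu_drv.c and also handle the S0ix/S3 bookkeeping):
 *
 *   static int amdgpu_pmops_suspend(struct device *dev)
 *   {
 *           struct drm_device *drm_dev = dev_get_drvdata(dev);
 *
 *           return amdgpu_device_suspend(drm_dev, true);
 *   }
 *
 * with @notify_clients set so in-kernel DRM clients (e.g. fbdev
 * emulation) are told to stop touching the device.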
4804 */ 4805 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) 4806 { 4807 struct amdgpu_device *adev = drm_to_adev(dev); 4808 int r = 0; 4809 4810 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4811 return 0; 4812 4813 adev->in_suspend = true; 4814 4815 if (amdgpu_sriov_vf(adev)) { 4816 amdgpu_virt_fini_data_exchange(adev); 4817 r = amdgpu_virt_request_full_gpu(adev, false); 4818 if (r) 4819 return r; 4820 } 4821 4822 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4823 DRM_WARN("smart shift update failed\n"); 4824 4825 if (notify_clients) 4826 drm_client_dev_suspend(adev_to_drm(adev), false); 4827 4828 cancel_delayed_work_sync(&adev->delayed_init_work); 4829 4830 amdgpu_ras_suspend(adev); 4831 4832 amdgpu_device_ip_suspend_phase1(adev); 4833 4834 if (!adev->in_s0ix) 4835 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4836 4837 r = amdgpu_device_evict_resources(adev); 4838 if (r) 4839 return r; 4840 4841 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4842 4843 amdgpu_fence_driver_hw_fini(adev); 4844 4845 amdgpu_device_ip_suspend_phase2(adev); 4846 4847 if (amdgpu_sriov_vf(adev)) 4848 amdgpu_virt_release_full_gpu(adev, false); 4849 4850 r = amdgpu_dpm_notify_rlc_state(adev, false); 4851 if (r) 4852 return r; 4853 4854 return 0; 4855 } 4856 4857 /** 4858 * amdgpu_device_resume - initiate device resume 4859 * 4860 * @dev: drm dev pointer 4861 * @notify_clients: notify in-kernel DRM clients 4862 * 4863 * Bring the hw back to operating state (all asics). 4864 * Returns 0 for success or an error on failure. 4865 * Called at driver resume. 4866 */ 4867 int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) 4868 { 4869 struct amdgpu_device *adev = drm_to_adev(dev); 4870 int r = 0; 4871 4872 if (amdgpu_sriov_vf(adev)) { 4873 r = amdgpu_virt_request_full_gpu(adev, true); 4874 if (r) 4875 return r; 4876 } 4877 4878 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4879 return 0; 4880 4881 if (adev->in_s0ix) 4882 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4883 4884 /* post card */ 4885 if (amdgpu_device_need_post(adev)) { 4886 r = amdgpu_device_asic_init(adev); 4887 if (r) 4888 dev_err(adev->dev, "amdgpu asic init failed\n"); 4889 } 4890 4891 r = amdgpu_device_ip_resume(adev); 4892 4893 if (r) { 4894 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4895 goto exit; 4896 } 4897 amdgpu_fence_driver_hw_init(adev); 4898 4899 if (!adev->in_s0ix) { 4900 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4901 if (r) 4902 goto exit; 4903 } 4904 4905 r = amdgpu_device_ip_late_init(adev); 4906 if (r) 4907 goto exit; 4908 4909 queue_delayed_work(system_wq, &adev->delayed_init_work, 4910 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4911 exit: 4912 if (amdgpu_sriov_vf(adev)) { 4913 amdgpu_virt_init_data_exchange(adev); 4914 amdgpu_virt_release_full_gpu(adev, true); 4915 } 4916 4917 if (r) 4918 return r; 4919 4920 /* Make sure IB tests flushed */ 4921 flush_delayed_work(&adev->delayed_init_work); 4922 4923 if (notify_clients) 4924 drm_client_dev_resume(adev_to_drm(adev), false); 4925 4926 amdgpu_ras_resume(adev); 4927 4928 if (adev->mode_info.num_crtc) { 4929 /* 4930 * Most of the connector probing functions try to acquire runtime pm 4931 * refs to ensure that the GPU is powered on when connector polling is 4932 * performed. Since we're calling this from a runtime PM callback, 4933 * trying to acquire rpm refs will cause us to deadlock. 
4934 * 4935 * Since we're guaranteed to be holding the rpm lock, it's safe to 4936 * temporarily disable the rpm helpers so this doesn't deadlock us. 4937 */ 4938 #ifdef CONFIG_PM 4939 dev->dev->power.disable_depth++; 4940 #endif 4941 if (!adev->dc_enabled) 4942 drm_helper_hpd_irq_event(dev); 4943 else 4944 drm_kms_helper_hotplug_event(dev); 4945 #ifdef CONFIG_PM 4946 dev->dev->power.disable_depth--; 4947 #endif 4948 } 4949 adev->in_suspend = false; 4950 4951 if (adev->enable_mes) 4952 amdgpu_mes_self_test(adev); 4953 4954 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4955 DRM_WARN("smart shift update failed\n"); 4956 4957 return 0; 4958 } 4959 4960 /** 4961 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4962 * 4963 * @adev: amdgpu_device pointer 4964 * 4965 * The list of all the hardware IPs that make up the asic is walked and 4966 * the check_soft_reset callbacks are run. check_soft_reset determines 4967 * if the asic is still hung or not. 4968 * Returns true if any of the IPs are still in a hung state, false if not. 4969 */ 4970 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4971 { 4972 int i; 4973 bool asic_hang = false; 4974 4975 if (amdgpu_sriov_vf(adev)) 4976 return true; 4977 4978 if (amdgpu_asic_need_full_reset(adev)) 4979 return true; 4980 4981 for (i = 0; i < adev->num_ip_blocks; i++) { 4982 if (!adev->ip_blocks[i].status.valid) 4983 continue; 4984 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4985 adev->ip_blocks[i].status.hang = 4986 adev->ip_blocks[i].version->funcs->check_soft_reset( 4987 &adev->ip_blocks[i]); 4988 if (adev->ip_blocks[i].status.hang) { 4989 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4990 asic_hang = true; 4991 } 4992 } 4993 return asic_hang; 4994 } 4995 4996 /** 4997 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4998 * 4999 * @adev: amdgpu_device pointer 5000 * 5001 * The list of all the hardware IPs that make up the asic is walked and the 5002 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 5003 * handles any IP specific hardware or software state changes that are 5004 * necessary for a soft reset to succeed. 5005 * Returns 0 on success, negative error code on failure. 5006 */ 5007 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 5008 { 5009 int i, r = 0; 5010 5011 for (i = 0; i < adev->num_ip_blocks; i++) { 5012 if (!adev->ip_blocks[i].status.valid) 5013 continue; 5014 if (adev->ip_blocks[i].status.hang && 5015 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 5016 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]); 5017 if (r) 5018 return r; 5019 } 5020 } 5021 5022 return 0; 5023 } 5024 5025 /** 5026 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 5027 * 5028 * @adev: amdgpu_device pointer 5029 * 5030 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 5031 * reset is necessary to recover. 5032 * Returns true if a full asic reset is required, false if not. 
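 *
 * At the time of writing the blocks treated this way are GMC, SMC, ACP,
 * DCE and PSP, as enumerated in the body below.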
5033 */ 5034 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 5035 { 5036 int i; 5037 5038 if (amdgpu_asic_need_full_reset(adev)) 5039 return true; 5040 5041 for (i = 0; i < adev->num_ip_blocks; i++) { 5042 if (!adev->ip_blocks[i].status.valid) 5043 continue; 5044 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 5045 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 5046 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 5047 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 5048 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 5049 if (adev->ip_blocks[i].status.hang) { 5050 dev_info(adev->dev, "Some block need full reset!\n"); 5051 return true; 5052 } 5053 } 5054 } 5055 return false; 5056 } 5057 5058 /** 5059 * amdgpu_device_ip_soft_reset - do a soft reset 5060 * 5061 * @adev: amdgpu_device pointer 5062 * 5063 * The list of all the hardware IPs that make up the asic is walked and the 5064 * soft_reset callbacks are run if the block is hung. soft_reset handles any 5065 * IP specific hardware or software state changes that are necessary to soft 5066 * reset the IP. 5067 * Returns 0 on success, negative error code on failure. 5068 */ 5069 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 5070 { 5071 int i, r = 0; 5072 5073 for (i = 0; i < adev->num_ip_blocks; i++) { 5074 if (!adev->ip_blocks[i].status.valid) 5075 continue; 5076 if (adev->ip_blocks[i].status.hang && 5077 adev->ip_blocks[i].version->funcs->soft_reset) { 5078 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]); 5079 if (r) 5080 return r; 5081 } 5082 } 5083 5084 return 0; 5085 } 5086 5087 /** 5088 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 5089 * 5090 * @adev: amdgpu_device pointer 5091 * 5092 * The list of all the hardware IPs that make up the asic is walked and the 5093 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 5094 * handles any IP specific hardware or software state changes that are 5095 * necessary after the IP has been soft reset. 5096 * Returns 0 on success, negative error code on failure. 
5097 */ 5098 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 5099 { 5100 int i, r = 0; 5101 5102 for (i = 0; i < adev->num_ip_blocks; i++) { 5103 if (!adev->ip_blocks[i].status.valid) 5104 continue; 5105 if (adev->ip_blocks[i].status.hang && 5106 adev->ip_blocks[i].version->funcs->post_soft_reset) 5107 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]); 5108 if (r) 5109 return r; 5110 } 5111 5112 return 0; 5113 } 5114 5115 /** 5116 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 5117 * 5118 * @adev: amdgpu_device pointer 5119 * @reset_context: amdgpu reset context pointer 5120 * 5121 * do VF FLR and reinitialize Asic 5122 * return 0 means succeeded otherwise failed 5123 */ 5124 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 5125 struct amdgpu_reset_context *reset_context) 5126 { 5127 int r; 5128 struct amdgpu_hive_info *hive = NULL; 5129 5130 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { 5131 if (!amdgpu_ras_get_fed_status(adev)) 5132 amdgpu_virt_ready_to_reset(adev); 5133 amdgpu_virt_wait_reset(adev); 5134 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5135 r = amdgpu_virt_request_full_gpu(adev, true); 5136 } else { 5137 r = amdgpu_virt_reset_gpu(adev); 5138 } 5139 if (r) 5140 return r; 5141 5142 amdgpu_ras_set_fed(adev, false); 5143 amdgpu_irq_gpu_reset_resume_helper(adev); 5144 5145 /* some sw clean up VF needs to do before recover */ 5146 amdgpu_virt_post_reset(adev); 5147 5148 /* Resume IP prior to SMC */ 5149 r = amdgpu_device_ip_reinit_early_sriov(adev); 5150 if (r) 5151 return r; 5152 5153 amdgpu_virt_init_data_exchange(adev); 5154 5155 r = amdgpu_device_fw_loading(adev); 5156 if (r) 5157 return r; 5158 5159 /* now we are okay to resume SMC/CP/SDMA */ 5160 r = amdgpu_device_ip_reinit_late_sriov(adev); 5161 if (r) 5162 return r; 5163 5164 hive = amdgpu_get_xgmi_hive(adev); 5165 /* Update PSP FW topology after reset */ 5166 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5167 r = amdgpu_xgmi_update_topology(hive, adev); 5168 if (hive) 5169 amdgpu_put_xgmi_hive(hive); 5170 if (r) 5171 return r; 5172 5173 r = amdgpu_ib_ring_tests(adev); 5174 if (r) 5175 return r; 5176 5177 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) 5178 amdgpu_inc_vram_lost(adev); 5179 5180 /* need to be called during full access so we can't do it later like 5181 * bare-metal does. 
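 * ("Full access" here is the exclusive access window the host granted
 * for this reset; it ends with the amdgpu_virt_release_full_gpu() call
 * right below.)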
5182 */ 5183 amdgpu_amdkfd_post_reset(adev); 5184 amdgpu_virt_release_full_gpu(adev, true); 5185 5186 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5187 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) || 5188 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 5189 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 5190 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5191 amdgpu_ras_resume(adev); 5192 5193 amdgpu_virt_ras_telemetry_post_reset(adev); 5194 5195 return 0; 5196 } 5197 5198 /** 5199 * amdgpu_device_has_job_running - check if there is any job in mirror list 5200 * 5201 * @adev: amdgpu_device pointer 5202 * 5203 * check if there is any job in mirror list 5204 */ 5205 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5206 { 5207 int i; 5208 struct drm_sched_job *job; 5209 5210 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5211 struct amdgpu_ring *ring = adev->rings[i]; 5212 5213 if (!amdgpu_ring_sched_ready(ring)) 5214 continue; 5215 5216 spin_lock(&ring->sched.job_list_lock); 5217 job = list_first_entry_or_null(&ring->sched.pending_list, 5218 struct drm_sched_job, list); 5219 spin_unlock(&ring->sched.job_list_lock); 5220 if (job) 5221 return true; 5222 } 5223 return false; 5224 } 5225 5226 /** 5227 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5228 * 5229 * @adev: amdgpu_device pointer 5230 * 5231 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5232 * a hung GPU. 5233 */ 5234 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5235 { 5236 5237 if (amdgpu_gpu_recovery == 0) 5238 goto disabled; 5239 5240 /* Skip soft reset check in fatal error mode */ 5241 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5242 return true; 5243 5244 if (amdgpu_sriov_vf(adev)) 5245 return true; 5246 5247 if (amdgpu_gpu_recovery == -1) { 5248 switch (adev->asic_type) { 5249 #ifdef CONFIG_DRM_AMDGPU_SI 5250 case CHIP_VERDE: 5251 case CHIP_TAHITI: 5252 case CHIP_PITCAIRN: 5253 case CHIP_OLAND: 5254 case CHIP_HAINAN: 5255 #endif 5256 #ifdef CONFIG_DRM_AMDGPU_CIK 5257 case CHIP_KAVERI: 5258 case CHIP_KABINI: 5259 case CHIP_MULLINS: 5260 #endif 5261 case CHIP_CARRIZO: 5262 case CHIP_STONEY: 5263 case CHIP_CYAN_SKILLFISH: 5264 goto disabled; 5265 default: 5266 break; 5267 } 5268 } 5269 5270 return true; 5271 5272 disabled: 5273 dev_info(adev->dev, "GPU recovery disabled.\n"); 5274 return false; 5275 } 5276 5277 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5278 { 5279 u32 i; 5280 int ret = 0; 5281 5282 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5283 5284 dev_info(adev->dev, "GPU mode1 reset\n"); 5285 5286 /* Cache the state before bus master disable. The saved config space 5287 * values are used in other cases like restore after mode-2 reset. 
5288 */ 5289 amdgpu_device_cache_pci_state(adev->pdev); 5290 5291 /* disable BM */ 5292 pci_clear_master(adev->pdev); 5293 5294 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5295 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5296 ret = amdgpu_dpm_mode1_reset(adev); 5297 } else { 5298 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5299 ret = psp_gpu_reset(adev); 5300 } 5301 5302 if (ret) 5303 goto mode1_reset_failed; 5304 5305 amdgpu_device_load_pci_state(adev->pdev); 5306 ret = amdgpu_psp_wait_for_bootloader(adev); 5307 if (ret) 5308 goto mode1_reset_failed; 5309 5310 /* wait for asic to come out of reset */ 5311 for (i = 0; i < adev->usec_timeout; i++) { 5312 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5313 5314 if (memsize != 0xffffffff) 5315 break; 5316 udelay(1); 5317 } 5318 5319 if (i >= adev->usec_timeout) { 5320 ret = -ETIMEDOUT; 5321 goto mode1_reset_failed; 5322 } 5323 5324 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5325 5326 return 0; 5327 5328 mode1_reset_failed: 5329 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5330 return ret; 5331 } 5332 5333 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5334 struct amdgpu_reset_context *reset_context) 5335 { 5336 int i, r = 0; 5337 struct amdgpu_job *job = NULL; 5338 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; 5339 bool need_full_reset = 5340 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5341 5342 if (reset_context->reset_req_dev == adev) 5343 job = reset_context->job; 5344 5345 if (amdgpu_sriov_vf(adev)) 5346 amdgpu_virt_pre_reset(adev); 5347 5348 amdgpu_fence_driver_isr_toggle(adev, true); 5349 5350 /* block all schedulers and reset given job's ring */ 5351 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5352 struct amdgpu_ring *ring = adev->rings[i]; 5353 5354 if (!amdgpu_ring_sched_ready(ring)) 5355 continue; 5356 5357 /* Clear job fence from fence drv to avoid force_completion 5358 * leave NULL and vm flush fence in fence drv 5359 */ 5360 amdgpu_fence_driver_clear_job_fences(ring); 5361 5362 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5363 amdgpu_fence_driver_force_completion(ring); 5364 } 5365 5366 amdgpu_fence_driver_isr_toggle(adev, false); 5367 5368 if (job && job->vm) 5369 drm_sched_increase_karma(&job->base); 5370 5371 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5372 /* If reset handler not implemented, continue; otherwise return */ 5373 if (r == -EOPNOTSUPP) 5374 r = 0; 5375 else 5376 return r; 5377 5378 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5379 if (!amdgpu_sriov_vf(adev)) { 5380 5381 if (!need_full_reset) 5382 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5383 5384 if (!need_full_reset && amdgpu_gpu_recovery && 5385 amdgpu_device_ip_check_soft_reset(adev)) { 5386 amdgpu_device_ip_pre_soft_reset(adev); 5387 r = amdgpu_device_ip_soft_reset(adev); 5388 amdgpu_device_ip_post_soft_reset(adev); 5389 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5390 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5391 need_full_reset = true; 5392 } 5393 } 5394 5395 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { 5396 dev_info(tmp_adev->dev, "Dumping IP State\n"); 5397 /* Trigger ip dump before we reset the asic */ 5398 for (i = 0; i < tmp_adev->num_ip_blocks; i++) 5399 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) 5400 tmp_adev->ip_blocks[i].version->funcs 5401 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]); 5402 
dev_info(tmp_adev->dev, "Dumping IP State Completed\n"); 5403 } 5404 5405 if (need_full_reset) 5406 r = amdgpu_device_ip_suspend(adev); 5407 if (need_full_reset) 5408 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5409 else 5410 clear_bit(AMDGPU_NEED_FULL_RESET, 5411 &reset_context->flags); 5412 } 5413 5414 return r; 5415 } 5416 5417 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) 5418 { 5419 struct list_head *device_list_handle; 5420 bool full_reset, vram_lost = false; 5421 struct amdgpu_device *tmp_adev; 5422 int r; 5423 5424 device_list_handle = reset_context->reset_device_list; 5425 5426 if (!device_list_handle) 5427 return -EINVAL; 5428 5429 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5430 5431 r = 0; 5432 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5433 /* After reset, it's default init level */ 5434 amdgpu_set_init_level(tmp_adev, AMDGPU_INIT_LEVEL_DEFAULT); 5435 if (full_reset) { 5436 /* post card */ 5437 amdgpu_ras_set_fed(tmp_adev, false); 5438 r = amdgpu_device_asic_init(tmp_adev); 5439 if (r) { 5440 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5441 } else { 5442 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5443 5444 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5445 if (r) 5446 goto out; 5447 5448 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5449 5450 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 5451 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job); 5452 5453 if (vram_lost) { 5454 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5455 amdgpu_inc_vram_lost(tmp_adev); 5456 } 5457 5458 r = amdgpu_device_fw_loading(tmp_adev); 5459 if (r) 5460 return r; 5461 5462 r = amdgpu_xcp_restore_partition_mode( 5463 tmp_adev->xcp_mgr); 5464 if (r) 5465 goto out; 5466 5467 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5468 if (r) 5469 goto out; 5470 5471 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 5472 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5473 5474 if (vram_lost) 5475 amdgpu_device_fill_reset_magic(tmp_adev); 5476 5477 /* 5478 * Add this ASIC as tracked as reset was already 5479 * complete successfully. 5480 */ 5481 amdgpu_register_gpu_instance(tmp_adev); 5482 5483 if (!reset_context->hive && 5484 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5485 amdgpu_xgmi_add_device(tmp_adev); 5486 5487 r = amdgpu_device_ip_late_init(tmp_adev); 5488 if (r) 5489 goto out; 5490 5491 drm_client_dev_resume(adev_to_drm(tmp_adev), false); 5492 5493 /* 5494 * The GPU enters bad state once faulty pages 5495 * by ECC has reached the threshold, and ras 5496 * recovery is scheduled next. So add one check 5497 * here to break recovery if it indeed exceeds 5498 * bad page threshold, and remind user to 5499 * retire this GPU or setting one bigger 5500 * bad_page_threshold value to fix this once 5501 * probing driver again. 5502 */ 5503 if (!amdgpu_ras_is_rma(tmp_adev)) { 5504 /* must succeed. 
*/ 5505 amdgpu_ras_resume(tmp_adev); 5506 } else { 5507 r = -EINVAL; 5508 goto out; 5509 } 5510 5511 /* Update PSP FW topology after reset */ 5512 if (reset_context->hive && 5513 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5514 r = amdgpu_xgmi_update_topology( 5515 reset_context->hive, tmp_adev); 5516 } 5517 } 5518 5519 out: 5520 if (!r) { 5521 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5522 r = amdgpu_ib_ring_tests(tmp_adev); 5523 if (r) { 5524 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5525 r = -EAGAIN; 5526 goto end; 5527 } 5528 } 5529 5530 if (r) 5531 tmp_adev->asic_reset_res = r; 5532 } 5533 5534 end: 5535 return r; 5536 } 5537 5538 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5539 struct amdgpu_reset_context *reset_context) 5540 { 5541 struct amdgpu_device *tmp_adev = NULL; 5542 bool need_full_reset, skip_hw_reset; 5543 int r = 0; 5544 5545 /* Try reset handler method first */ 5546 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5547 reset_list); 5548 5549 reset_context->reset_device_list = device_list_handle; 5550 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5551 /* If reset handler not implemented, continue; otherwise return */ 5552 if (r == -EOPNOTSUPP) 5553 r = 0; 5554 else 5555 return r; 5556 5557 /* Reset handler not implemented, use the default method */ 5558 need_full_reset = 5559 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5560 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5561 5562 /* 5563 * ASIC reset has to be done on all XGMI hive nodes ASAP 5564 * to allow proper links negotiation in FW (within 1 sec) 5565 */ 5566 if (!skip_hw_reset && need_full_reset) { 5567 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5568 /* For XGMI run all resets in parallel to speed up the process */ 5569 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5570 if (!queue_work(system_unbound_wq, 5571 &tmp_adev->xgmi_reset_work)) 5572 r = -EALREADY; 5573 } else 5574 r = amdgpu_asic_reset(tmp_adev); 5575 5576 if (r) { 5577 dev_err(tmp_adev->dev, 5578 "ASIC reset failed with error, %d for drm dev, %s", 5579 r, adev_to_drm(tmp_adev)->unique); 5580 goto out; 5581 } 5582 } 5583 5584 /* For XGMI wait for all resets to complete before proceed */ 5585 if (!r) { 5586 list_for_each_entry(tmp_adev, device_list_handle, 5587 reset_list) { 5588 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5589 flush_work(&tmp_adev->xgmi_reset_work); 5590 r = tmp_adev->asic_reset_res; 5591 if (r) 5592 break; 5593 } 5594 } 5595 } 5596 } 5597 5598 if (!r && amdgpu_ras_intr_triggered()) { 5599 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5600 amdgpu_ras_reset_error_count(tmp_adev, 5601 AMDGPU_RAS_BLOCK__MMHUB); 5602 } 5603 5604 amdgpu_ras_intr_cleared(); 5605 } 5606 5607 r = amdgpu_device_reinit_after_reset(reset_context); 5608 if (r == -EAGAIN) 5609 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5610 else 5611 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5612 5613 out: 5614 return r; 5615 } 5616 5617 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5618 { 5619 5620 switch (amdgpu_asic_reset_method(adev)) { 5621 case AMD_RESET_METHOD_MODE1: 5622 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5623 break; 5624 case AMD_RESET_METHOD_MODE2: 5625 adev->mp1_state = PP_MP1_STATE_RESET; 5626 break; 5627 default: 5628 adev->mp1_state = PP_MP1_STATE_NONE; 5629 break; 5630 } 5631 } 5632 5633 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5634 
{ 5635 amdgpu_vf_error_trans_all(adev); 5636 adev->mp1_state = PP_MP1_STATE_NONE; 5637 } 5638 5639 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5640 { 5641 struct pci_dev *p = NULL; 5642 5643 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5644 adev->pdev->bus->number, 1); 5645 if (p) { 5646 pm_runtime_enable(&(p->dev)); 5647 pm_runtime_resume(&(p->dev)); 5648 } 5649 5650 pci_dev_put(p); 5651 } 5652 5653 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5654 { 5655 enum amd_reset_method reset_method; 5656 struct pci_dev *p = NULL; 5657 u64 expires; 5658 5659 /* 5660 * For now, only BACO and mode1 reset are confirmed 5661 * to suffer the audio issue without proper suspended. 5662 */ 5663 reset_method = amdgpu_asic_reset_method(adev); 5664 if ((reset_method != AMD_RESET_METHOD_BACO) && 5665 (reset_method != AMD_RESET_METHOD_MODE1)) 5666 return -EINVAL; 5667 5668 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5669 adev->pdev->bus->number, 1); 5670 if (!p) 5671 return -ENODEV; 5672 5673 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5674 if (!expires) 5675 /* 5676 * If we cannot get the audio device autosuspend delay, 5677 * a fixed 4S interval will be used. Considering 3S is 5678 * the audio controller default autosuspend delay setting. 5679 * 4S used here is guaranteed to cover that. 5680 */ 5681 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5682 5683 while (!pm_runtime_status_suspended(&(p->dev))) { 5684 if (!pm_runtime_suspend(&(p->dev))) 5685 break; 5686 5687 if (expires < ktime_get_mono_fast_ns()) { 5688 dev_warn(adev->dev, "failed to suspend display audio\n"); 5689 pci_dev_put(p); 5690 /* TODO: abort the succeeding gpu reset? */ 5691 return -ETIMEDOUT; 5692 } 5693 } 5694 5695 pm_runtime_disable(&(p->dev)); 5696 5697 pci_dev_put(p); 5698 return 0; 5699 } 5700 5701 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5702 { 5703 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5704 5705 #if defined(CONFIG_DEBUG_FS) 5706 if (!amdgpu_sriov_vf(adev)) 5707 cancel_work(&adev->reset_work); 5708 #endif 5709 5710 if (adev->kfd.dev) 5711 cancel_work(&adev->kfd.reset_work); 5712 5713 if (amdgpu_sriov_vf(adev)) 5714 cancel_work(&adev->virt.flr_work); 5715 5716 if (con && adev->ras_enabled) 5717 cancel_work(&con->recovery_work); 5718 5719 } 5720 5721 static int amdgpu_device_health_check(struct list_head *device_list_handle) 5722 { 5723 struct amdgpu_device *tmp_adev; 5724 int ret = 0; 5725 u32 status; 5726 5727 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5728 pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status); 5729 if (PCI_POSSIBLE_ERROR(status)) { 5730 dev_err(tmp_adev->dev, "device lost from bus!"); 5731 ret = -ENODEV; 5732 } 5733 } 5734 5735 return ret; 5736 } 5737 5738 /** 5739 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5740 * 5741 * @adev: amdgpu_device pointer 5742 * @job: which job trigger hang 5743 * @reset_context: amdgpu reset context pointer 5744 * 5745 * Attempt to reset the GPU if it has hung (all asics). 5746 * Attempt to do soft-reset or full-reset and reinitialize Asic 5747 * Returns 0 for success or an error on failure. 
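 *
 * Minimal caller sketch, modelled on how the job timeout path builds its
 * reset context (illustrative; only the fields used by the reset code
 * are shown):
 *
 *   struct amdgpu_reset_context reset_context;
 *
 *   memset(&reset_context, 0, sizeof(reset_context));
 *   reset_context.method = AMD_RESET_METHOD_NONE;
 *   reset_context.reset_req_dev = adev;
 *   clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
 *
 *   r = amdgpu_device_gpu_recover(adev, job, &reset_context);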
5748 */ 5749 5750 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5751 struct amdgpu_job *job, 5752 struct amdgpu_reset_context *reset_context) 5753 { 5754 struct list_head device_list, *device_list_handle = NULL; 5755 bool job_signaled = false; 5756 struct amdgpu_hive_info *hive = NULL; 5757 struct amdgpu_device *tmp_adev = NULL; 5758 int i, r = 0; 5759 bool need_emergency_restart = false; 5760 bool audio_suspended = false; 5761 int retry_limit = AMDGPU_MAX_RETRY_LIMIT; 5762 5763 /* 5764 * Special case: RAS triggered and full reset isn't supported 5765 */ 5766 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5767 5768 /* 5769 * Flush RAM to disk so that after reboot 5770 * the user can read log and see why the system rebooted. 5771 */ 5772 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 5773 amdgpu_ras_get_context(adev)->reboot) { 5774 DRM_WARN("Emergency reboot."); 5775 5776 ksys_sync_helper(); 5777 emergency_restart(); 5778 } 5779 5780 dev_info(adev->dev, "GPU %s begin!\n", 5781 need_emergency_restart ? "jobs stop":"reset"); 5782 5783 if (!amdgpu_sriov_vf(adev)) 5784 hive = amdgpu_get_xgmi_hive(adev); 5785 if (hive) 5786 mutex_lock(&hive->hive_lock); 5787 5788 reset_context->job = job; 5789 reset_context->hive = hive; 5790 /* 5791 * Build list of devices to reset. 5792 * In case we are in XGMI hive mode, resort the device list 5793 * to put adev in the 1st position. 5794 */ 5795 INIT_LIST_HEAD(&device_list); 5796 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) { 5797 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5798 list_add_tail(&tmp_adev->reset_list, &device_list); 5799 if (adev->shutdown) 5800 tmp_adev->shutdown = true; 5801 } 5802 if (!list_is_first(&adev->reset_list, &device_list)) 5803 list_rotate_to_front(&adev->reset_list, &device_list); 5804 device_list_handle = &device_list; 5805 } else { 5806 list_add_tail(&adev->reset_list, &device_list); 5807 device_list_handle = &device_list; 5808 } 5809 5810 if (!amdgpu_sriov_vf(adev)) { 5811 r = amdgpu_device_health_check(device_list_handle); 5812 if (r) 5813 goto end_reset; 5814 } 5815 5816 /* We need to lock reset domain only once both for XGMI and single device */ 5817 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5818 reset_list); 5819 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5820 5821 /* block all schedulers and reset given job's ring */ 5822 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5823 5824 amdgpu_device_set_mp1_state(tmp_adev); 5825 5826 /* 5827 * Try to put the audio codec into suspend state 5828 * before gpu reset started. 5829 * 5830 * Due to the power domain of the graphics device 5831 * is shared with AZ power domain. Without this, 5832 * we may change the audio hardware from behind 5833 * the audio driver's back. That will trigger 5834 * some audio codec errors. 
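 *
 * A minimal sketch of what amdgpu_device_suspend_display_audio() above does
 * for the audio function (PCI function 1 of the same slot), with the error
 * handling and the roughly 4 second timeout omitted; "domain" and "bus"
 * stand for the GPU's PCI domain and bus number:
 *
 *      p = pci_get_domain_bus_and_slot(domain, bus, 1);
 *      while (!pm_runtime_status_suspended(&p->dev))
 *              pm_runtime_suspend(&p->dev);
 *      pm_runtime_disable(&p->dev);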
5835 */ 5836 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5837 audio_suspended = true; 5838 5839 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5840 5841 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5842 5843 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context); 5844 5845 /* 5846 * Mark these ASICs to be reseted as untracked first 5847 * And add them back after reset completed 5848 */ 5849 amdgpu_unregister_gpu_instance(tmp_adev); 5850 5851 drm_client_dev_suspend(adev_to_drm(tmp_adev), false); 5852 5853 /* disable ras on ALL IPs */ 5854 if (!need_emergency_restart && 5855 amdgpu_device_ip_need_full_reset(tmp_adev)) 5856 amdgpu_ras_suspend(tmp_adev); 5857 5858 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5859 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5860 5861 if (!amdgpu_ring_sched_ready(ring)) 5862 continue; 5863 5864 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5865 5866 if (need_emergency_restart) 5867 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5868 } 5869 atomic_inc(&tmp_adev->gpu_reset_counter); 5870 } 5871 5872 if (need_emergency_restart) 5873 goto skip_sched_resume; 5874 5875 /* 5876 * Must check guilty signal here since after this point all old 5877 * HW fences are force signaled. 5878 * 5879 * job->base holds a reference to parent fence 5880 */ 5881 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5882 job_signaled = true; 5883 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5884 goto skip_hw_reset; 5885 } 5886 5887 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5888 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5889 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5890 /*TODO Should we stop ?*/ 5891 if (r) { 5892 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5893 r, adev_to_drm(tmp_adev)->unique); 5894 tmp_adev->asic_reset_res = r; 5895 } 5896 } 5897 5898 /* Actual ASIC resets if needed.*/ 5899 /* Host driver will handle XGMI hive reset for SRIOV */ 5900 if (amdgpu_sriov_vf(adev)) { 5901 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { 5902 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); 5903 amdgpu_ras_set_fed(adev, true); 5904 set_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5905 } 5906 5907 r = amdgpu_device_reset_sriov(adev, reset_context); 5908 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) { 5909 amdgpu_virt_release_full_gpu(adev, true); 5910 goto retry; 5911 } 5912 if (r) 5913 adev->asic_reset_res = r; 5914 } else { 5915 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5916 if (r && r == -EAGAIN) 5917 goto retry; 5918 } 5919 5920 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5921 /* 5922 * Drop any pending non scheduler resets queued before reset is done. 5923 * Any reset scheduled after this point would be valid. Scheduler resets 5924 * were already dropped during drm_sched_stop and no new ones can come 5925 * in before drm_sched_start. 
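 *
 * Concretely, amdgpu_device_stop_pending_resets() cancels whichever of the
 * debugfs reset_work, the KFD reset_work, the SR-IOV FLR work and the RAS
 * recovery_work exist for this device, e.g.:
 *
 *      cancel_work(&adev->kfd.reset_work);
 *      cancel_work(&con->recovery_work);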
5926 */ 5927 amdgpu_device_stop_pending_resets(tmp_adev); 5928 } 5929 5930 skip_hw_reset: 5931 5932 /* Post ASIC reset for all devs .*/ 5933 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5934 5935 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5936 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5937 5938 if (!amdgpu_ring_sched_ready(ring)) 5939 continue; 5940 5941 drm_sched_start(&ring->sched, 0); 5942 } 5943 5944 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 5945 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5946 5947 if (tmp_adev->asic_reset_res) 5948 r = tmp_adev->asic_reset_res; 5949 5950 tmp_adev->asic_reset_res = 0; 5951 5952 if (r) { 5953 /* bad news, how to tell it to userspace ? 5954 * for ras error, we should report GPU bad status instead of 5955 * reset failure 5956 */ 5957 if (reset_context->src != AMDGPU_RESET_SRC_RAS || 5958 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) 5959 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", 5960 atomic_read(&tmp_adev->gpu_reset_counter)); 5961 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5962 } else { 5963 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5964 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5965 DRM_WARN("smart shift update failed\n"); 5966 } 5967 } 5968 5969 skip_sched_resume: 5970 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5971 /* unlock kfd: SRIOV would do it separately */ 5972 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5973 amdgpu_amdkfd_post_reset(tmp_adev); 5974 5975 /* kfd_post_reset will do nothing if kfd device is not initialized, 5976 * need to bring up kfd here if it's not be initialized before 5977 */ 5978 if (!adev->kfd.init_complete) 5979 amdgpu_amdkfd_device_init(adev); 5980 5981 if (audio_suspended) 5982 amdgpu_device_resume_display_audio(tmp_adev); 5983 5984 amdgpu_device_unset_mp1_state(tmp_adev); 5985 5986 amdgpu_ras_set_error_query_ready(tmp_adev, true); 5987 } 5988 5989 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5990 reset_list); 5991 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 5992 5993 end_reset: 5994 if (hive) { 5995 mutex_unlock(&hive->hive_lock); 5996 amdgpu_put_xgmi_hive(hive); 5997 } 5998 5999 if (r) 6000 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 6001 6002 atomic_set(&adev->reset_domain->reset_res, r); 6003 return r; 6004 } 6005 6006 /** 6007 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner 6008 * 6009 * @adev: amdgpu_device pointer 6010 * @speed: pointer to the speed of the link 6011 * @width: pointer to the width of the link 6012 * 6013 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6014 * first physical partner to an AMD dGPU. 6015 * This will exclude any virtual switches and links. 
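 *
 * Usage sketch (this is how amdgpu_device_get_pcie_info() below uses it):
 *
 *      enum pci_bus_speed speed;
 *      enum pcie_link_width width;
 *
 *      amdgpu_device_partner_bandwidth(adev, &speed, &width);
 *
 * speed and width stay PCI_SPEED_UNKNOWN / PCIE_LNK_WIDTH_UNKNOWN if no
 * suitable partner is found.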
6016 */ 6017 static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev, 6018 enum pci_bus_speed *speed, 6019 enum pcie_link_width *width) 6020 { 6021 struct pci_dev *parent = adev->pdev; 6022 6023 if (!speed || !width) 6024 return; 6025 6026 *speed = PCI_SPEED_UNKNOWN; 6027 *width = PCIE_LNK_WIDTH_UNKNOWN; 6028 6029 if (amdgpu_device_pcie_dynamic_switching_supported(adev)) { 6030 while ((parent = pci_upstream_bridge(parent))) { 6031 /* skip upstream/downstream switches internal to dGPU*/ 6032 if (parent->vendor == PCI_VENDOR_ID_ATI) 6033 continue; 6034 *speed = pcie_get_speed_cap(parent); 6035 *width = pcie_get_width_cap(parent); 6036 break; 6037 } 6038 } else { 6039 /* use the current speeds rather than max if switching is not supported */ 6040 pcie_bandwidth_available(adev->pdev, NULL, speed, width); 6041 } 6042 } 6043 6044 /** 6045 * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot 6046 * 6047 * @adev: amdgpu_device pointer 6048 * 6049 * Fetchs and stores in the driver the PCIE capabilities (gen speed 6050 * and lanes) of the slot the device is in. Handles APUs and 6051 * virtualized environments where PCIE config space may not be available. 6052 */ 6053 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 6054 { 6055 struct pci_dev *pdev; 6056 enum pci_bus_speed speed_cap, platform_speed_cap; 6057 enum pcie_link_width platform_link_width; 6058 6059 if (amdgpu_pcie_gen_cap) 6060 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 6061 6062 if (amdgpu_pcie_lane_cap) 6063 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 6064 6065 /* covers APUs as well */ 6066 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 6067 if (adev->pm.pcie_gen_mask == 0) 6068 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 6069 if (adev->pm.pcie_mlw_mask == 0) 6070 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 6071 return; 6072 } 6073 6074 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 6075 return; 6076 6077 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap, 6078 &platform_link_width); 6079 6080 if (adev->pm.pcie_gen_mask == 0) { 6081 /* asic caps */ 6082 pdev = adev->pdev; 6083 speed_cap = pcie_get_speed_cap(pdev); 6084 if (speed_cap == PCI_SPEED_UNKNOWN) { 6085 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6086 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6087 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6088 } else { 6089 if (speed_cap == PCIE_SPEED_32_0GT) 6090 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6091 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6092 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6093 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6094 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 6095 else if (speed_cap == PCIE_SPEED_16_0GT) 6096 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6097 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6098 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6099 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 6100 else if (speed_cap == PCIE_SPEED_8_0GT) 6101 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6102 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6103 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6104 else if (speed_cap == PCIE_SPEED_5_0GT) 6105 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6106 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 6107 else 6108 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 6109 } 6110 /* platform caps */ 6111 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 6112 
adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6113 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6114 } else { 6115 if (platform_speed_cap == PCIE_SPEED_32_0GT) 6116 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6117 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6118 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6119 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6120 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 6121 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 6122 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6123 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6124 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6125 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 6126 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 6127 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6128 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6129 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 6130 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 6131 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6132 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6133 else 6134 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 6135 6136 } 6137 } 6138 if (adev->pm.pcie_mlw_mask == 0) { 6139 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6140 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 6141 } else { 6142 switch (platform_link_width) { 6143 case PCIE_LNK_X32: 6144 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 6145 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6146 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6147 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6148 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6149 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6150 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6151 break; 6152 case PCIE_LNK_X16: 6153 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6154 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6155 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6156 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6157 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6158 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6159 break; 6160 case PCIE_LNK_X12: 6161 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6162 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6163 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6164 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6165 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6166 break; 6167 case PCIE_LNK_X8: 6168 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6169 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6170 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6171 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6172 break; 6173 case PCIE_LNK_X4: 6174 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6175 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6176 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6177 break; 6178 case PCIE_LNK_X2: 6179 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6180 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6181 break; 6182 case PCIE_LNK_X1: 6183 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 6184 break; 6185 default: 6186 break; 6187 } 6188 } 6189 } 6190 } 6191 6192 /** 6193 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 6194 * 6195 * @adev: amdgpu_device pointer 6196 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 6197 * 6198 * Return true if @peer_adev can access (DMA) @adev through the PCIe 6199 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 6200 * @peer_adev. 
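 *
 * Roughly, the checks below (CONFIG_HSA_AMD_P2P builds only) are:
 *
 *      large BAR:      real_vram_size == visible_vram_size
 *      p2p access:     pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) >= 0
 *      addressable:    the whole VRAM aperture fits within @peer_adev's DMA
 *                      mask, or the IOMMU remaps it
 *
 * combined with the global pcie_p2p module option.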
6201 */ 6202 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 6203 struct amdgpu_device *peer_adev) 6204 { 6205 #ifdef CONFIG_HSA_AMD_P2P 6206 bool p2p_access = 6207 !adev->gmc.xgmi.connected_to_cpu && 6208 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 6209 if (!p2p_access) 6210 dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n", 6211 pci_name(peer_adev->pdev)); 6212 6213 bool is_large_bar = adev->gmc.visible_vram_size && 6214 adev->gmc.real_vram_size == adev->gmc.visible_vram_size; 6215 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev); 6216 6217 if (!p2p_addressable) { 6218 uint64_t address_mask = peer_adev->dev->dma_mask ? 6219 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 6220 resource_size_t aper_limit = 6221 adev->gmc.aper_base + adev->gmc.aper_size - 1; 6222 6223 p2p_addressable = !(adev->gmc.aper_base & address_mask || 6224 aper_limit & address_mask); 6225 } 6226 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable; 6227 #else 6228 return false; 6229 #endif 6230 } 6231 6232 int amdgpu_device_baco_enter(struct drm_device *dev) 6233 { 6234 struct amdgpu_device *adev = drm_to_adev(dev); 6235 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6236 6237 if (!amdgpu_device_supports_baco(dev)) 6238 return -ENOTSUPP; 6239 6240 if (ras && adev->ras_enabled && 6241 adev->nbio.funcs->enable_doorbell_interrupt) 6242 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 6243 6244 return amdgpu_dpm_baco_enter(adev); 6245 } 6246 6247 int amdgpu_device_baco_exit(struct drm_device *dev) 6248 { 6249 struct amdgpu_device *adev = drm_to_adev(dev); 6250 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6251 int ret = 0; 6252 6253 if (!amdgpu_device_supports_baco(dev)) 6254 return -ENOTSUPP; 6255 6256 ret = amdgpu_dpm_baco_exit(adev); 6257 if (ret) 6258 return ret; 6259 6260 if (ras && adev->ras_enabled && 6261 adev->nbio.funcs->enable_doorbell_interrupt) 6262 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 6263 6264 if (amdgpu_passthrough(adev) && adev->nbio.funcs && 6265 adev->nbio.funcs->clear_doorbell_interrupt) 6266 adev->nbio.funcs->clear_doorbell_interrupt(adev); 6267 6268 return 0; 6269 } 6270 6271 /** 6272 * amdgpu_pci_error_detected - Called when a PCI error is detected. 6273 * @pdev: PCI device struct 6274 * @state: PCI channel state 6275 * 6276 * Description: Called when a PCI error is detected. 6277 * 6278 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
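 *
 * This callback is normally wired up together with the other PCI error
 * callbacks in this file through a struct pci_error_handlers, roughly
 * (sketch; the actual table lives with the pci_driver definition outside
 * this file):
 *
 *      .error_detected = amdgpu_pci_error_detected,
 *      .mmio_enabled   = amdgpu_pci_mmio_enabled,
 *      .slot_reset     = amdgpu_pci_slot_reset,
 *      .resume         = amdgpu_pci_resume,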
6279 */ 6280 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 6281 { 6282 struct drm_device *dev = pci_get_drvdata(pdev); 6283 struct amdgpu_device *adev = drm_to_adev(dev); 6284 int i; 6285 6286 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 6287 6288 if (adev->gmc.xgmi.num_physical_nodes > 1) { 6289 DRM_WARN("No support for XGMI hive yet..."); 6290 return PCI_ERS_RESULT_DISCONNECT; 6291 } 6292 6293 adev->pci_channel_state = state; 6294 6295 switch (state) { 6296 case pci_channel_io_normal: 6297 return PCI_ERS_RESULT_CAN_RECOVER; 6298 /* Fatal error, prepare for slot reset */ 6299 case pci_channel_io_frozen: 6300 /* 6301 * Locking adev->reset_domain->sem will prevent any external access 6302 * to GPU during PCI error recovery 6303 */ 6304 amdgpu_device_lock_reset_domain(adev->reset_domain); 6305 amdgpu_device_set_mp1_state(adev); 6306 6307 /* 6308 * Block any work scheduling as we do for regular GPU reset 6309 * for the duration of the recovery 6310 */ 6311 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6312 struct amdgpu_ring *ring = adev->rings[i]; 6313 6314 if (!amdgpu_ring_sched_ready(ring)) 6315 continue; 6316 6317 drm_sched_stop(&ring->sched, NULL); 6318 } 6319 atomic_inc(&adev->gpu_reset_counter); 6320 return PCI_ERS_RESULT_NEED_RESET; 6321 case pci_channel_io_perm_failure: 6322 /* Permanent error, prepare for device removal */ 6323 return PCI_ERS_RESULT_DISCONNECT; 6324 } 6325 6326 return PCI_ERS_RESULT_NEED_RESET; 6327 } 6328 6329 /** 6330 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 6331 * @pdev: pointer to PCI device 6332 */ 6333 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 6334 { 6335 6336 DRM_INFO("PCI error: mmio enabled callback!!\n"); 6337 6338 /* TODO - dump whatever for debugging purposes */ 6339 6340 /* This called only if amdgpu_pci_error_detected returns 6341 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 6342 * works, no need to reset slot. 6343 */ 6344 6345 return PCI_ERS_RESULT_RECOVERED; 6346 } 6347 6348 /** 6349 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 6350 * @pdev: PCI device struct 6351 * 6352 * Description: This routine is called by the pci error recovery 6353 * code after the PCI slot has been reset, just before we 6354 * should resume normal operations. 
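 *
 * The recovery below restores the saved PCI config space, waits for
 * amdgpu_asic_get_config_memsize() to stop returning 0xffffffff, and then
 * performs a software-only full reinit:
 *
 *      set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
 *      set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
 *      r = amdgpu_do_asic_reset(&device_list, &reset_context);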
6355 */ 6356 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 6357 { 6358 struct drm_device *dev = pci_get_drvdata(pdev); 6359 struct amdgpu_device *adev = drm_to_adev(dev); 6360 int r, i; 6361 struct amdgpu_reset_context reset_context; 6362 u32 memsize; 6363 struct list_head device_list; 6364 6365 /* PCI error slot reset should be skipped During RAS recovery */ 6366 if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 6367 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && 6368 amdgpu_ras_in_recovery(adev)) 6369 return PCI_ERS_RESULT_RECOVERED; 6370 6371 DRM_INFO("PCI error: slot reset callback!!\n"); 6372 6373 memset(&reset_context, 0, sizeof(reset_context)); 6374 6375 INIT_LIST_HEAD(&device_list); 6376 list_add_tail(&adev->reset_list, &device_list); 6377 6378 /* wait for asic to come out of reset */ 6379 msleep(500); 6380 6381 /* Restore PCI confspace */ 6382 amdgpu_device_load_pci_state(pdev); 6383 6384 /* confirm ASIC came out of reset */ 6385 for (i = 0; i < adev->usec_timeout; i++) { 6386 memsize = amdgpu_asic_get_config_memsize(adev); 6387 6388 if (memsize != 0xffffffff) 6389 break; 6390 udelay(1); 6391 } 6392 if (memsize == 0xffffffff) { 6393 r = -ETIME; 6394 goto out; 6395 } 6396 6397 reset_context.method = AMD_RESET_METHOD_NONE; 6398 reset_context.reset_req_dev = adev; 6399 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 6400 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 6401 6402 adev->no_hw_access = true; 6403 r = amdgpu_device_pre_asic_reset(adev, &reset_context); 6404 adev->no_hw_access = false; 6405 if (r) 6406 goto out; 6407 6408 r = amdgpu_do_asic_reset(&device_list, &reset_context); 6409 6410 out: 6411 if (!r) { 6412 if (amdgpu_device_cache_pci_state(adev->pdev)) 6413 pci_restore_state(adev->pdev); 6414 6415 DRM_INFO("PCIe error recovery succeeded\n"); 6416 } else { 6417 DRM_ERROR("PCIe error recovery failed, err:%d", r); 6418 amdgpu_device_unset_mp1_state(adev); 6419 amdgpu_device_unlock_reset_domain(adev->reset_domain); 6420 } 6421 6422 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 6423 } 6424 6425 /** 6426 * amdgpu_pci_resume() - resume normal ops after PCI reset 6427 * @pdev: pointer to PCI device 6428 * 6429 * Called when the error recovery driver tells us that its 6430 * OK to resume normal operation. 
6431 */ 6432 void amdgpu_pci_resume(struct pci_dev *pdev) 6433 { 6434 struct drm_device *dev = pci_get_drvdata(pdev); 6435 struct amdgpu_device *adev = drm_to_adev(dev); 6436 int i; 6437 6438 6439 DRM_INFO("PCI error: resume callback!!\n"); 6440 6441 /* Only continue execution for the case of pci_channel_io_frozen */ 6442 if (adev->pci_channel_state != pci_channel_io_frozen) 6443 return; 6444 6445 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6446 struct amdgpu_ring *ring = adev->rings[i]; 6447 6448 if (!amdgpu_ring_sched_ready(ring)) 6449 continue; 6450 6451 drm_sched_start(&ring->sched, 0); 6452 } 6453 6454 amdgpu_device_unset_mp1_state(adev); 6455 amdgpu_device_unlock_reset_domain(adev->reset_domain); 6456 } 6457 6458 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 6459 { 6460 struct drm_device *dev = pci_get_drvdata(pdev); 6461 struct amdgpu_device *adev = drm_to_adev(dev); 6462 int r; 6463 6464 if (amdgpu_sriov_vf(adev)) 6465 return false; 6466 6467 r = pci_save_state(pdev); 6468 if (!r) { 6469 kfree(adev->pci_state); 6470 6471 adev->pci_state = pci_store_saved_state(pdev); 6472 6473 if (!adev->pci_state) { 6474 DRM_ERROR("Failed to store PCI saved state"); 6475 return false; 6476 } 6477 } else { 6478 DRM_WARN("Failed to save PCI state, err:%d\n", r); 6479 return false; 6480 } 6481 6482 return true; 6483 } 6484 6485 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 6486 { 6487 struct drm_device *dev = pci_get_drvdata(pdev); 6488 struct amdgpu_device *adev = drm_to_adev(dev); 6489 int r; 6490 6491 if (!adev->pci_state) 6492 return false; 6493 6494 r = pci_load_saved_state(pdev, adev->pci_state); 6495 6496 if (!r) { 6497 pci_restore_state(pdev); 6498 } else { 6499 DRM_WARN("Failed to load PCI state, err:%d\n", r); 6500 return false; 6501 } 6502 6503 return true; 6504 } 6505 6506 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 6507 struct amdgpu_ring *ring) 6508 { 6509 #ifdef CONFIG_X86_64 6510 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 6511 return; 6512 #endif 6513 if (adev->gmc.xgmi.connected_to_cpu) 6514 return; 6515 6516 if (ring && ring->funcs->emit_hdp_flush) 6517 amdgpu_ring_emit_hdp_flush(ring); 6518 else 6519 amdgpu_asic_flush_hdp(adev, ring); 6520 } 6521 6522 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 6523 struct amdgpu_ring *ring) 6524 { 6525 #ifdef CONFIG_X86_64 6526 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 6527 return; 6528 #endif 6529 if (adev->gmc.xgmi.connected_to_cpu) 6530 return; 6531 6532 amdgpu_asic_invalidate_hdp(adev, ring); 6533 } 6534 6535 int amdgpu_in_reset(struct amdgpu_device *adev) 6536 { 6537 return atomic_read(&adev->reset_domain->in_gpu_reset); 6538 } 6539 6540 /** 6541 * amdgpu_device_halt() - bring hardware to some kind of halt state 6542 * 6543 * @adev: amdgpu_device pointer 6544 * 6545 * Bring hardware to some kind of halt state so that no one can touch it 6546 * any more. It will help to maintain error context when error occurred. 6547 * Compare to a simple hang, the system will keep stable at least for SSH 6548 * access. Then it should be trivial to inspect the hardware state and 6549 * see what's going on. Implemented as following: 6550 * 6551 * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc), 6552 * clears all CPU mappings to device, disallows remappings through page faults 6553 * 2. amdgpu_irq_disable_all() disables all interrupts 6554 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences 6555 * 4. 
 *    set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
        struct pci_dev *pdev = adev->pdev;
        struct drm_device *ddev = adev_to_drm(adev);

        amdgpu_xcp_dev_unplug(adev);
        drm_dev_unplug(ddev);

        amdgpu_irq_disable_all(adev);

        amdgpu_fence_driver_hw_fini(adev);

        adev->no_hw_access = true;

        amdgpu_device_unmap_mmio(adev);

        pci_disable_device(pdev);
        pci_wait_for_pending_transaction(pdev);
}

u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
                                 u32 reg)
{
        unsigned long flags, address, data;
        u32 r;

        address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
        data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

        spin_lock_irqsave(&adev->pcie_idx_lock, flags);
        WREG32(address, reg * 4);
        (void)RREG32(address);
        r = RREG32(data);
        spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
        return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
                                  u32 reg, u32 v)
{
        unsigned long flags, address, data;

        address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
        data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

        spin_lock_irqsave(&adev->pcie_idx_lock, flags);
        WREG32(address, reg * 4);
        (void)RREG32(address);
        WREG32(data, v);
        (void)RREG32(data);
        spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_gang - return a reference to the current gang
 * @adev: amdgpu_device pointer
 *
 * Returns: A new reference to the current gang leader.
 */
struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev)
{
        struct dma_fence *fence;

        rcu_read_lock();
        fence = dma_fence_get_rcu_safe(&adev->gang_submit);
        rcu_read_unlock();
        return fence;
}

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
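 *
 * One possible caller pattern (illustrative sketch only):
 *
 *      fence = amdgpu_device_switch_gang(adev, new_gang);
 *      if (fence) {
 *              ...treat fence as a dependency of the new submission
 *                 (or wait on it), then drop the reference...
 *              dma_fence_put(fence);
 *      }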
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
                                            struct dma_fence *gang)
{
        struct dma_fence *old = NULL;

        do {
                dma_fence_put(old);
                old = amdgpu_device_get_gang(adev);
                if (old == gang)
                        break;

                if (!dma_fence_is_signaled(old))
                        return old;

        } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
                         old, gang) != old);

        dma_fence_put(old);
        return NULL;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
        switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
        case CHIP_HAINAN:
#endif
        case CHIP_TOPAZ:
                /* chips with no display hardware */
                return false;
#ifdef CONFIG_DRM_AMDGPU_SI
        case CHIP_TAHITI:
        case CHIP_PITCAIRN:
        case CHIP_VERDE:
        case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
        case CHIP_BONAIRE:
        case CHIP_HAWAII:
        case CHIP_KAVERI:
        case CHIP_KABINI:
        case CHIP_MULLINS:
#endif
        case CHIP_TONGA:
        case CHIP_FIJI:
        case CHIP_POLARIS10:
        case CHIP_POLARIS11:
        case CHIP_POLARIS12:
        case CHIP_VEGAM:
        case CHIP_CARRIZO:
        case CHIP_STONEY:
                /* chips with display hardware */
                return true;
        default:
                /* IP discovery */
                if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
                    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
                        return false;
                return true;
        }
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
                uint32_t inst, uint32_t reg_addr, char reg_name[],
                uint32_t expected_value, uint32_t mask)
{
        uint32_t ret = 0;
        uint32_t old_ = 0;
        uint32_t tmp_ = RREG32(reg_addr);
        uint32_t loop = adev->usec_timeout;

        while ((tmp_ & (mask)) != (expected_value)) {
                if (old_ != tmp_) {
                        loop = adev->usec_timeout;
                        old_ = tmp_;
                } else
                        udelay(1);
                tmp_ = RREG32(reg_addr);
                loop--;
                if (!loop) {
                        DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
                                 inst, reg_name, (uint32_t)expected_value,
                                 (uint32_t)(tmp_ & (mask)));
                        ret = -ETIMEDOUT;
                        break;
                }
        }
        return ret;
}

ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
{
        ssize_t size = 0;

        if (!ring || !ring->adev)
                return size;

        if (amdgpu_device_should_recover_gpu(ring->adev))
                size |= AMDGPU_RESET_TYPE_FULL;

        if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
            !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery)
                size |= AMDGPU_RESET_TYPE_SOFT_RESET;

        return size;
}

ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
{
        ssize_t size = 0;

        if (supported_reset == 0) {
                size += sysfs_emit_at(buf, size, "unsupported");
                size += sysfs_emit_at(buf, size, "\n");
                return size;
        }

        if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
                size += sysfs_emit_at(buf, size, "soft ");

        if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
                size += sysfs_emit_at(buf, size, "queue ");

        if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)
                size += sysfs_emit_at(buf, size, "pipe ");

        if (supported_reset & AMDGPU_RESET_TYPE_FULL)
                size += sysfs_emit_at(buf, size, "full ");

        size += sysfs_emit_at(buf, size, "\n");
        return size;
}
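/*
 * Illustrative sketch (hypothetical sysfs "show" callback, not part of this
 * file) of one way the two helpers above could be combined:
 *
 *      supported = amdgpu_get_soft_full_reset_mask(ring);
 *      return amdgpu_show_reset_mask(buf, supported);
 */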