1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 #include <linux/power_supply.h> 29 #include <linux/kthread.h> 30 #include <linux/module.h> 31 #include <linux/console.h> 32 #include <linux/slab.h> 33 #include <linux/iommu.h> 34 #include <linux/pci.h> 35 #include <linux/pci-p2pdma.h> 36 #include <linux/apple-gmux.h> 37 38 #include <drm/drm_aperture.h> 39 #include <drm/drm_atomic_helper.h> 40 #include <drm/drm_crtc_helper.h> 41 #include <drm/drm_fb_helper.h> 42 #include <drm/drm_probe_helper.h> 43 #include <drm/amdgpu_drm.h> 44 #include <linux/device.h> 45 #include <linux/vgaarb.h> 46 #include <linux/vga_switcheroo.h> 47 #include <linux/efi.h> 48 #include "amdgpu.h" 49 #include "amdgpu_trace.h" 50 #include "amdgpu_i2c.h" 51 #include "atom.h" 52 #include "amdgpu_atombios.h" 53 #include "amdgpu_atomfirmware.h" 54 #include "amd_pcie.h" 55 #ifdef CONFIG_DRM_AMDGPU_SI 56 #include "si.h" 57 #endif 58 #ifdef CONFIG_DRM_AMDGPU_CIK 59 #include "cik.h" 60 #endif 61 #include "vi.h" 62 #include "soc15.h" 63 #include "nv.h" 64 #include "bif/bif_4_1_d.h" 65 #include <linux/firmware.h> 66 #include "amdgpu_vf_error.h" 67 68 #include "amdgpu_amdkfd.h" 69 #include "amdgpu_pm.h" 70 71 #include "amdgpu_xgmi.h" 72 #include "amdgpu_ras.h" 73 #include "amdgpu_pmu.h" 74 #include "amdgpu_fru_eeprom.h" 75 #include "amdgpu_reset.h" 76 #include "amdgpu_virt.h" 77 #include "amdgpu_dev_coredump.h" 78 79 #include <linux/suspend.h> 80 #include <drm/task_barrier.h> 81 #include <linux/pm_runtime.h> 82 83 #include <drm/drm_drv.h> 84 85 #if IS_ENABLED(CONFIG_X86) 86 #include <asm/intel-family.h> 87 #endif 88 89 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 90 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 91 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 92 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 93 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 94 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 95 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 96 97 #define AMDGPU_RESUME_MS 2000 98 #define AMDGPU_MAX_RETRY_LIMIT 2 99 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 100 #define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2) 101 #define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2) 102 #define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2) 103 104 static const 
struct drm_driver amdgpu_kms_driver; 105 106 const char *amdgpu_asic_name[] = { 107 "TAHITI", 108 "PITCAIRN", 109 "VERDE", 110 "OLAND", 111 "HAINAN", 112 "BONAIRE", 113 "KAVERI", 114 "KABINI", 115 "HAWAII", 116 "MULLINS", 117 "TOPAZ", 118 "TONGA", 119 "FIJI", 120 "CARRIZO", 121 "STONEY", 122 "POLARIS10", 123 "POLARIS11", 124 "POLARIS12", 125 "VEGAM", 126 "VEGA10", 127 "VEGA12", 128 "VEGA20", 129 "RAVEN", 130 "ARCTURUS", 131 "RENOIR", 132 "ALDEBARAN", 133 "NAVI10", 134 "CYAN_SKILLFISH", 135 "NAVI14", 136 "NAVI12", 137 "SIENNA_CICHLID", 138 "NAVY_FLOUNDER", 139 "VANGOGH", 140 "DIMGREY_CAVEFISH", 141 "BEIGE_GOBY", 142 "YELLOW_CARP", 143 "IP DISCOVERY", 144 "LAST", 145 }; 146 147 #define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMDGPU_MAX_IP_NUM - 1, 0) 148 /* 149 * Default init level where all blocks are expected to be initialized. This is 150 * the level of initialization expected by default and also after a full reset 151 * of the device. 152 */ 153 struct amdgpu_init_level amdgpu_init_default = { 154 .level = AMDGPU_INIT_LEVEL_DEFAULT, 155 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL, 156 }; 157 158 /* 159 * Minimal blocks needed to be initialized before a XGMI hive can be reset. This 160 * is used for cases like reset on initialization where the entire hive needs to 161 * be reset before first use. 162 */ 163 struct amdgpu_init_level amdgpu_init_minimal_xgmi = { 164 .level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI, 165 .hwini_ip_block_mask = 166 BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) | 167 BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) | 168 BIT(AMD_IP_BLOCK_TYPE_PSP) 169 }; 170 171 static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev, 172 enum amd_ip_block_type block) 173 { 174 return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0; 175 } 176 177 void amdgpu_set_init_level(struct amdgpu_device *adev, 178 enum amdgpu_init_lvl_id lvl) 179 { 180 switch (lvl) { 181 case AMDGPU_INIT_LEVEL_MINIMAL_XGMI: 182 adev->init_lvl = &amdgpu_init_minimal_xgmi; 183 break; 184 case AMDGPU_INIT_LEVEL_DEFAULT: 185 fallthrough; 186 default: 187 adev->init_lvl = &amdgpu_init_default; 188 break; 189 } 190 } 191 192 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev); 193 194 /** 195 * DOC: pcie_replay_count 196 * 197 * The amdgpu driver provides a sysfs API for reporting the total number 198 * of PCIe replays (NAKs) 199 * The file pcie_replay_count is used for this and returns the total 200 * number of replays as a sum of the NAKs generated and NAKs received 201 */ 202 203 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 204 struct device_attribute *attr, char *buf) 205 { 206 struct drm_device *ddev = dev_get_drvdata(dev); 207 struct amdgpu_device *adev = drm_to_adev(ddev); 208 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 209 210 return sysfs_emit(buf, "%llu\n", cnt); 211 } 212 213 static DEVICE_ATTR(pcie_replay_count, 0444, 214 amdgpu_device_get_pcie_replay_count, NULL); 215 216 static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj, 217 struct bin_attribute *attr, char *buf, 218 loff_t ppos, size_t count) 219 { 220 struct device *dev = kobj_to_dev(kobj); 221 struct drm_device *ddev = dev_get_drvdata(dev); 222 struct amdgpu_device *adev = drm_to_adev(ddev); 223 ssize_t bytes_read; 224 225 switch (ppos) { 226 case AMDGPU_SYS_REG_STATE_XGMI: 227 bytes_read = amdgpu_asic_get_reg_state( 228 adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count); 229 break; 230 case AMDGPU_SYS_REG_STATE_WAFL: 231 
bytes_read = amdgpu_asic_get_reg_state( 232 adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count); 233 break; 234 case AMDGPU_SYS_REG_STATE_PCIE: 235 bytes_read = amdgpu_asic_get_reg_state( 236 adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count); 237 break; 238 case AMDGPU_SYS_REG_STATE_USR: 239 bytes_read = amdgpu_asic_get_reg_state( 240 adev, AMDGPU_REG_STATE_TYPE_USR, buf, count); 241 break; 242 case AMDGPU_SYS_REG_STATE_USR_1: 243 bytes_read = amdgpu_asic_get_reg_state( 244 adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count); 245 break; 246 default: 247 return -EINVAL; 248 } 249 250 return bytes_read; 251 } 252 253 BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL, 254 AMDGPU_SYS_REG_STATE_END); 255 256 int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev) 257 { 258 int ret; 259 260 if (!amdgpu_asic_get_reg_state_supported(adev)) 261 return 0; 262 263 ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 264 265 return ret; 266 } 267 268 void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev) 269 { 270 if (!amdgpu_asic_get_reg_state_supported(adev)) 271 return; 272 sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 273 } 274 275 int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block) 276 { 277 int r; 278 279 if (ip_block->version->funcs->suspend) { 280 r = ip_block->version->funcs->suspend(ip_block); 281 if (r) { 282 dev_err(ip_block->adev->dev, 283 "suspend of IP block <%s> failed %d\n", 284 ip_block->version->funcs->name, r); 285 return r; 286 } 287 } 288 289 ip_block->status.hw = false; 290 return 0; 291 } 292 293 int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block) 294 { 295 int r; 296 297 if (ip_block->version->funcs->resume) { 298 r = ip_block->version->funcs->resume(ip_block); 299 if (r) { 300 dev_err(ip_block->adev->dev, 301 "resume of IP block <%s> failed %d\n", 302 ip_block->version->funcs->name, r); 303 return r; 304 } 305 } 306 307 ip_block->status.hw = true; 308 return 0; 309 } 310 311 /** 312 * DOC: board_info 313 * 314 * The amdgpu driver provides a sysfs API for giving board related information. 
315 * It provides the form factor information in the format 316 * 317 * type : form factor 318 * 319 * Possible form factor values 320 * 321 * - "cem" - PCIE CEM card 322 * - "oam" - Open Compute Accelerator Module 323 * - "unknown" - Not known 324 * 325 */ 326 327 static ssize_t amdgpu_device_get_board_info(struct device *dev, 328 struct device_attribute *attr, 329 char *buf) 330 { 331 struct drm_device *ddev = dev_get_drvdata(dev); 332 struct amdgpu_device *adev = drm_to_adev(ddev); 333 enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM; 334 const char *pkg; 335 336 if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type) 337 pkg_type = adev->smuio.funcs->get_pkg_type(adev); 338 339 switch (pkg_type) { 340 case AMDGPU_PKG_TYPE_CEM: 341 pkg = "cem"; 342 break; 343 case AMDGPU_PKG_TYPE_OAM: 344 pkg = "oam"; 345 break; 346 default: 347 pkg = "unknown"; 348 break; 349 } 350 351 return sysfs_emit(buf, "%s : %s\n", "type", pkg); 352 } 353 354 static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL); 355 356 static struct attribute *amdgpu_board_attrs[] = { 357 &dev_attr_board_info.attr, 358 NULL, 359 }; 360 361 static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj, 362 struct attribute *attr, int n) 363 { 364 struct device *dev = kobj_to_dev(kobj); 365 struct drm_device *ddev = dev_get_drvdata(dev); 366 struct amdgpu_device *adev = drm_to_adev(ddev); 367 368 if (adev->flags & AMD_IS_APU) 369 return 0; 370 371 return attr->mode; 372 } 373 374 static const struct attribute_group amdgpu_board_attrs_group = { 375 .attrs = amdgpu_board_attrs, 376 .is_visible = amdgpu_board_attrs_is_visible 377 }; 378 379 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 380 381 382 /** 383 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 384 * 385 * @dev: drm_device pointer 386 * 387 * Returns true if the device is a dGPU with ATPX power control, 388 * otherwise return false. 389 */ 390 bool amdgpu_device_supports_px(struct drm_device *dev) 391 { 392 struct amdgpu_device *adev = drm_to_adev(dev); 393 394 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 395 return true; 396 return false; 397 } 398 399 /** 400 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 401 * 402 * @dev: drm_device pointer 403 * 404 * Returns true if the device is a dGPU with ACPI power control, 405 * otherwise return false. 406 */ 407 bool amdgpu_device_supports_boco(struct drm_device *dev) 408 { 409 struct amdgpu_device *adev = drm_to_adev(dev); 410 411 if (adev->has_pr3 || 412 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 413 return true; 414 return false; 415 } 416 417 /** 418 * amdgpu_device_supports_baco - Does the device support BACO 419 * 420 * @dev: drm_device pointer 421 * 422 * Return: 423 * 1 if the device supporte BACO; 424 * 3 if the device support MACO (only works if BACO is supported) 425 * otherwise return 0. 
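 *
 * Purely illustrative sketch (not a definitive usage contract): callers such
 * as amdgpu_device_detect_runtime_pm_mode() below treat the returned value as
 * a mask of BACO_SUPPORT/MACO_SUPPORT bits, roughly like this, where rpm_mode
 * stands for adev->pm.rpm_mode:
 *
 *   support = amdgpu_device_supports_baco(dev);
 *   if (support & MACO_SUPPORT)
 *           rpm_mode = AMDGPU_RUNPM_BAMACO;
 *   else if (support & BACO_SUPPORT)
 *           rpm_mode = AMDGPU_RUNPM_BACO;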
 */
int amdgpu_device_supports_baco(struct drm_device *dev)
{
        struct amdgpu_device *adev = drm_to_adev(dev);

        return amdgpu_asic_supports_baco(adev);
}

void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
{
        struct drm_device *dev;
        int bamaco_support;

        dev = adev_to_drm(adev);

        adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
        bamaco_support = amdgpu_device_supports_baco(dev);

        switch (amdgpu_runtime_pm) {
        case 2:
                if (bamaco_support & MACO_SUPPORT) {
                        adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
                        dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
                } else if (bamaco_support == BACO_SUPPORT) {
                        adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
                        dev_info(adev->dev, "Requested mode BAMACO not available, falling back to BACO\n");
                }
                break;
        case 1:
                if (bamaco_support & BACO_SUPPORT) {
                        adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
                        dev_info(adev->dev, "Forcing BACO for runtime pm\n");
                }
                break;
        case -1:
        case -2:
                if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
                        adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
                        dev_info(adev->dev, "Using ATPX for runtime pm\n");
                } else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
                        adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
                        dev_info(adev->dev, "Using BOCO for runtime pm\n");
                } else {
                        if (!bamaco_support)
                                goto no_runtime_pm;

                        switch (adev->asic_type) {
                        case CHIP_VEGA20:
                        case CHIP_ARCTURUS:
                                /* BACO is not supported on vega20 and arcturus */
                                break;
                        case CHIP_VEGA10:
                                /* enable BACO as runpm mode if noretry=0 */
                                if (!adev->gmc.noretry)
                                        adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
                                break;
                        default:
                                /* enable BACO as runpm mode on CI+ */
                                adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
                                break;
                        }

                        if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
                                if (bamaco_support & MACO_SUPPORT) {
                                        adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
                                        dev_info(adev->dev, "Using BAMACO for runtime pm\n");
                                } else {
                                        dev_info(adev->dev, "Using BACO for runtime pm\n");
                                }
                        }
                }
                break;
        case 0:
                dev_info(adev->dev, "runtime pm is manually disabled\n");
                break;
        default:
                break;
        }

no_runtime_pm:
        if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
                dev_info(adev->dev, "Runtime PM not available\n");
}

/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * Smart Shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
        return (amdgpu_device_supports_boco(dev) &&
                amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access VRAM through MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
                             void *buf, size_t size, bool write)
{
        unsigned long flags;
        uint32_t hi = ~0, tmp = 0;
        uint32_t *data = buf;
        uint64_t last;
        int idx;

        if (!drm_dev_enter(adev_to_drm(adev), &idx))
                return;

        BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

        spin_lock_irqsave(&adev->mmio_idx_lock, flags);
        for (last = pos + size; pos < last; pos += 4) {
                tmp = pos >> 31;

                WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
                if (tmp != hi) {
                        WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
                        hi = tmp;
                }
                if (write)
                        WREG32_NO_KIQ(mmMM_DATA, *data++);
                else
                        *data++ = RREG32_NO_KIQ(mmMM_DATA);
        }

        spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
        drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access VRAM through the VRAM aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * Returns the number of bytes that have been transferred.
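 *
 * A short usage sketch (this mirrors how amdgpu_device_vram_access() below
 * combines the two helpers; shown here only for illustration):
 *
 *   bytes = amdgpu_device_aper_access(adev, pos, buf, size, false);
 *   if (bytes < size)
 *           amdgpu_device_mm_access(adev, pos + bytes, buf + bytes,
 *                                   size - bytes, false);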
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
                                 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
        void __iomem *addr;
        size_t count = 0;
        uint64_t last;

        if (!adev->mman.aper_base_kaddr)
                return 0;

        last = min(pos + size, adev->gmc.visible_vram_size);
        if (last > pos) {
                addr = adev->mman.aper_base_kaddr + pos;
                count = last - pos;

                if (write) {
                        memcpy_toio(addr, buf, count);
                        /* Make sure HDP write cache flush happens without any reordering
                         * after the system memory contents are sent over PCIe device
                         */
                        mb();
                        amdgpu_device_flush_hdp(adev, NULL);
                } else {
                        amdgpu_device_invalidate_hdp(adev, NULL);
                        /* Make sure HDP read cache is invalidated before issuing a read
                         * to the PCIe device
                         */
                        mb();
                        memcpy_fromio(buf, addr, count);
                }

        }

        return count;
#else
        return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
                               void *buf, size_t size, bool write)
{
        size_t count;

        /* try using the VRAM aperture to access VRAM first */
        count = amdgpu_device_aper_access(adev, pos, buf, size, write);
        size -= count;
        if (size) {
                /* use MM_INDEX/MM_DATA to access the rest of VRAM */
                pos += count;
                buf += count;
                amdgpu_device_mm_access(adev, pos, buf, size, write);
        }
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
        if (adev->no_hw_access)
                return true;

#ifdef CONFIG_LOCKDEP
        /*
         * This is a bit complicated to understand, so worth a comment. What we assert
         * here is that the GPU reset is not running on another thread in parallel.
         *
         * For this we trylock the read side of the reset semaphore; if that succeeds
         * we know that the reset is not running in parallel.
         *
         * If the trylock fails we assert that we are either already holding the read
         * side of the lock or are the reset thread itself and hold the write side of
         * the lock.
         */
        if (in_task()) {
                if (down_read_trylock(&adev->reset_domain->sem))
                        up_read(&adev->reset_domain->sem);
                else
                        lockdep_assert_held(&adev->reset_domain->sem);
        }
#endif
        return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
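 *
 * Most callers reach this through the RREG32()/RREG32_NO_KIQ() family of
 * macros rather than calling it directly; a direct call would simply look
 * like (illustrative only):
 *
 *   val = amdgpu_device_rreg(adev, reg, AMDGPU_REGS_NO_KIQ);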
686 */ 687 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, 688 uint32_t reg, uint32_t acc_flags) 689 { 690 uint32_t ret; 691 692 if (amdgpu_device_skip_hw_access(adev)) 693 return 0; 694 695 if ((reg * 4) < adev->rmmio_size) { 696 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 697 amdgpu_sriov_runtime(adev) && 698 down_read_trylock(&adev->reset_domain->sem)) { 699 ret = amdgpu_kiq_rreg(adev, reg, 0); 700 up_read(&adev->reset_domain->sem); 701 } else { 702 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); 703 } 704 } else { 705 ret = adev->pcie_rreg(adev, reg * 4); 706 } 707 708 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret); 709 710 return ret; 711 } 712 713 /* 714 * MMIO register read with bytes helper functions 715 * @offset:bytes offset from MMIO start 716 */ 717 718 /** 719 * amdgpu_mm_rreg8 - read a memory mapped IO register 720 * 721 * @adev: amdgpu_device pointer 722 * @offset: byte aligned register offset 723 * 724 * Returns the 8 bit value from the offset specified. 725 */ 726 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) 727 { 728 if (amdgpu_device_skip_hw_access(adev)) 729 return 0; 730 731 if (offset < adev->rmmio_size) 732 return (readb(adev->rmmio + offset)); 733 BUG(); 734 } 735 736 737 /** 738 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC 739 * 740 * @adev: amdgpu_device pointer 741 * @reg: dword aligned register offset 742 * @acc_flags: access flags which require special behavior 743 * @xcc_id: xcc accelerated compute core id 744 * 745 * Returns the 32 bit value from the offset specified. 746 */ 747 uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev, 748 uint32_t reg, uint32_t acc_flags, 749 uint32_t xcc_id) 750 { 751 uint32_t ret, rlcg_flag; 752 753 if (amdgpu_device_skip_hw_access(adev)) 754 return 0; 755 756 if ((reg * 4) < adev->rmmio_size) { 757 if (amdgpu_sriov_vf(adev) && 758 !amdgpu_sriov_runtime(adev) && 759 adev->gfx.rlc.rlcg_reg_access_supported && 760 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, 761 GC_HWIP, false, 762 &rlcg_flag)) { 763 ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id)); 764 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 765 amdgpu_sriov_runtime(adev) && 766 down_read_trylock(&adev->reset_domain->sem)) { 767 ret = amdgpu_kiq_rreg(adev, reg, xcc_id); 768 up_read(&adev->reset_domain->sem); 769 } else { 770 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); 771 } 772 } else { 773 ret = adev->pcie_rreg(adev, reg * 4); 774 } 775 776 return ret; 777 } 778 779 /* 780 * MMIO register write with bytes helper functions 781 * @offset:bytes offset from MMIO start 782 * @value: the value want to be written to the register 783 */ 784 785 /** 786 * amdgpu_mm_wreg8 - read a memory mapped IO register 787 * 788 * @adev: amdgpu_device pointer 789 * @offset: byte aligned register offset 790 * @value: 8 bit value to write 791 * 792 * Writes the value specified to the offset specified. 
793 */ 794 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) 795 { 796 if (amdgpu_device_skip_hw_access(adev)) 797 return; 798 799 if (offset < adev->rmmio_size) 800 writeb(value, adev->rmmio + offset); 801 else 802 BUG(); 803 } 804 805 /** 806 * amdgpu_device_wreg - write to a memory mapped IO or indirect register 807 * 808 * @adev: amdgpu_device pointer 809 * @reg: dword aligned register offset 810 * @v: 32 bit value to write to the register 811 * @acc_flags: access flags which require special behavior 812 * 813 * Writes the value specified to the offset specified. 814 */ 815 void amdgpu_device_wreg(struct amdgpu_device *adev, 816 uint32_t reg, uint32_t v, 817 uint32_t acc_flags) 818 { 819 if (amdgpu_device_skip_hw_access(adev)) 820 return; 821 822 if ((reg * 4) < adev->rmmio_size) { 823 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 824 amdgpu_sriov_runtime(adev) && 825 down_read_trylock(&adev->reset_domain->sem)) { 826 amdgpu_kiq_wreg(adev, reg, v, 0); 827 up_read(&adev->reset_domain->sem); 828 } else { 829 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 830 } 831 } else { 832 adev->pcie_wreg(adev, reg * 4, v); 833 } 834 835 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 836 } 837 838 /** 839 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 840 * 841 * @adev: amdgpu_device pointer 842 * @reg: mmio/rlc register 843 * @v: value to write 844 * @xcc_id: xcc accelerated compute core id 845 * 846 * this function is invoked only for the debugfs register access 847 */ 848 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 849 uint32_t reg, uint32_t v, 850 uint32_t xcc_id) 851 { 852 if (amdgpu_device_skip_hw_access(adev)) 853 return; 854 855 if (amdgpu_sriov_fullaccess(adev) && 856 adev->gfx.rlc.funcs && 857 adev->gfx.rlc.funcs->is_rlcg_access_range) { 858 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 859 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id); 860 } else if ((reg * 4) >= adev->rmmio_size) { 861 adev->pcie_wreg(adev, reg * 4, v); 862 } else { 863 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 864 } 865 } 866 867 /** 868 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC 869 * 870 * @adev: amdgpu_device pointer 871 * @reg: dword aligned register offset 872 * @v: 32 bit value to write to the register 873 * @acc_flags: access flags which require special behavior 874 * @xcc_id: xcc accelerated compute core id 875 * 876 * Writes the value specified to the offset specified. 
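 *
 * Illustrative note: for SR-IOV VFs with RLCG register access support this
 * takes the RLCG path; otherwise it falls back to KIQ or plain MMIO, similar
 * to amdgpu_device_wreg() above.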
877 */ 878 void amdgpu_device_xcc_wreg(struct amdgpu_device *adev, 879 uint32_t reg, uint32_t v, 880 uint32_t acc_flags, uint32_t xcc_id) 881 { 882 uint32_t rlcg_flag; 883 884 if (amdgpu_device_skip_hw_access(adev)) 885 return; 886 887 if ((reg * 4) < adev->rmmio_size) { 888 if (amdgpu_sriov_vf(adev) && 889 !amdgpu_sriov_runtime(adev) && 890 adev->gfx.rlc.rlcg_reg_access_supported && 891 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, 892 GC_HWIP, true, 893 &rlcg_flag)) { 894 amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id)); 895 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 896 amdgpu_sriov_runtime(adev) && 897 down_read_trylock(&adev->reset_domain->sem)) { 898 amdgpu_kiq_wreg(adev, reg, v, xcc_id); 899 up_read(&adev->reset_domain->sem); 900 } else { 901 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 902 } 903 } else { 904 adev->pcie_wreg(adev, reg * 4, v); 905 } 906 } 907 908 /** 909 * amdgpu_device_indirect_rreg - read an indirect register 910 * 911 * @adev: amdgpu_device pointer 912 * @reg_addr: indirect register address to read from 913 * 914 * Returns the value of indirect register @reg_addr 915 */ 916 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 917 u32 reg_addr) 918 { 919 unsigned long flags, pcie_index, pcie_data; 920 void __iomem *pcie_index_offset; 921 void __iomem *pcie_data_offset; 922 u32 r; 923 924 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 925 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 926 927 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 928 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 929 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 930 931 writel(reg_addr, pcie_index_offset); 932 readl(pcie_index_offset); 933 r = readl(pcie_data_offset); 934 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 935 936 return r; 937 } 938 939 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev, 940 u64 reg_addr) 941 { 942 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 943 u32 r; 944 void __iomem *pcie_index_offset; 945 void __iomem *pcie_index_hi_offset; 946 void __iomem *pcie_data_offset; 947 948 if (unlikely(!adev->nbio.funcs)) { 949 pcie_index = AMDGPU_PCIE_INDEX_FALLBACK; 950 pcie_data = AMDGPU_PCIE_DATA_FALLBACK; 951 } else { 952 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 953 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 954 } 955 956 if (reg_addr >> 32) { 957 if (unlikely(!adev->nbio.funcs)) 958 pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK; 959 else 960 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 961 } else { 962 pcie_index_hi = 0; 963 } 964 965 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 966 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 967 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 968 if (pcie_index_hi != 0) 969 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 970 pcie_index_hi * 4; 971 972 writel(reg_addr, pcie_index_offset); 973 readl(pcie_index_offset); 974 if (pcie_index_hi != 0) { 975 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 976 readl(pcie_index_hi_offset); 977 } 978 r = readl(pcie_data_offset); 979 980 /* clear the high bits */ 981 if (pcie_index_hi != 0) { 982 writel(0, pcie_index_hi_offset); 983 readl(pcie_index_hi_offset); 984 } 985 986 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 987 988 return r; 989 } 990 991 /** 992 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 993 * 994 * @adev: 
amdgpu_device pointer 995 * @reg_addr: indirect register address to read from 996 * 997 * Returns the value of indirect register @reg_addr 998 */ 999 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 1000 u32 reg_addr) 1001 { 1002 unsigned long flags, pcie_index, pcie_data; 1003 void __iomem *pcie_index_offset; 1004 void __iomem *pcie_data_offset; 1005 u64 r; 1006 1007 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1008 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1009 1010 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1011 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1012 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1013 1014 /* read low 32 bits */ 1015 writel(reg_addr, pcie_index_offset); 1016 readl(pcie_index_offset); 1017 r = readl(pcie_data_offset); 1018 /* read high 32 bits */ 1019 writel(reg_addr + 4, pcie_index_offset); 1020 readl(pcie_index_offset); 1021 r |= ((u64)readl(pcie_data_offset) << 32); 1022 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1023 1024 return r; 1025 } 1026 1027 u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev, 1028 u64 reg_addr) 1029 { 1030 unsigned long flags, pcie_index, pcie_data; 1031 unsigned long pcie_index_hi = 0; 1032 void __iomem *pcie_index_offset; 1033 void __iomem *pcie_index_hi_offset; 1034 void __iomem *pcie_data_offset; 1035 u64 r; 1036 1037 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1038 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1039 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1040 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1041 1042 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1043 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1044 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1045 if (pcie_index_hi != 0) 1046 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1047 pcie_index_hi * 4; 1048 1049 /* read low 32 bits */ 1050 writel(reg_addr, pcie_index_offset); 1051 readl(pcie_index_offset); 1052 if (pcie_index_hi != 0) { 1053 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1054 readl(pcie_index_hi_offset); 1055 } 1056 r = readl(pcie_data_offset); 1057 /* read high 32 bits */ 1058 writel(reg_addr + 4, pcie_index_offset); 1059 readl(pcie_index_offset); 1060 if (pcie_index_hi != 0) { 1061 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1062 readl(pcie_index_hi_offset); 1063 } 1064 r |= ((u64)readl(pcie_data_offset) << 32); 1065 1066 /* clear the high bits */ 1067 if (pcie_index_hi != 0) { 1068 writel(0, pcie_index_hi_offset); 1069 readl(pcie_index_hi_offset); 1070 } 1071 1072 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1073 1074 return r; 1075 } 1076 1077 /** 1078 * amdgpu_device_indirect_wreg - write an indirect register address 1079 * 1080 * @adev: amdgpu_device pointer 1081 * @reg_addr: indirect register offset 1082 * @reg_data: indirect register data 1083 * 1084 */ 1085 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 1086 u32 reg_addr, u32 reg_data) 1087 { 1088 unsigned long flags, pcie_index, pcie_data; 1089 void __iomem *pcie_index_offset; 1090 void __iomem *pcie_data_offset; 1091 1092 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1093 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1094 1095 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1096 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1097 pcie_data_offset = (void __iomem *)adev->rmmio 
+ pcie_data * 4; 1098 1099 writel(reg_addr, pcie_index_offset); 1100 readl(pcie_index_offset); 1101 writel(reg_data, pcie_data_offset); 1102 readl(pcie_data_offset); 1103 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1104 } 1105 1106 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev, 1107 u64 reg_addr, u32 reg_data) 1108 { 1109 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 1110 void __iomem *pcie_index_offset; 1111 void __iomem *pcie_index_hi_offset; 1112 void __iomem *pcie_data_offset; 1113 1114 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1115 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1116 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1117 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1118 else 1119 pcie_index_hi = 0; 1120 1121 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1122 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1123 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1124 if (pcie_index_hi != 0) 1125 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1126 pcie_index_hi * 4; 1127 1128 writel(reg_addr, pcie_index_offset); 1129 readl(pcie_index_offset); 1130 if (pcie_index_hi != 0) { 1131 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1132 readl(pcie_index_hi_offset); 1133 } 1134 writel(reg_data, pcie_data_offset); 1135 readl(pcie_data_offset); 1136 1137 /* clear the high bits */ 1138 if (pcie_index_hi != 0) { 1139 writel(0, pcie_index_hi_offset); 1140 readl(pcie_index_hi_offset); 1141 } 1142 1143 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1144 } 1145 1146 /** 1147 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 1148 * 1149 * @adev: amdgpu_device pointer 1150 * @reg_addr: indirect register offset 1151 * @reg_data: indirect register data 1152 * 1153 */ 1154 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 1155 u32 reg_addr, u64 reg_data) 1156 { 1157 unsigned long flags, pcie_index, pcie_data; 1158 void __iomem *pcie_index_offset; 1159 void __iomem *pcie_data_offset; 1160 1161 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1162 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1163 1164 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1165 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1166 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1167 1168 /* write low 32 bits */ 1169 writel(reg_addr, pcie_index_offset); 1170 readl(pcie_index_offset); 1171 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1172 readl(pcie_data_offset); 1173 /* write high 32 bits */ 1174 writel(reg_addr + 4, pcie_index_offset); 1175 readl(pcie_index_offset); 1176 writel((u32)(reg_data >> 32), pcie_data_offset); 1177 readl(pcie_data_offset); 1178 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1179 } 1180 1181 void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev, 1182 u64 reg_addr, u64 reg_data) 1183 { 1184 unsigned long flags, pcie_index, pcie_data; 1185 unsigned long pcie_index_hi = 0; 1186 void __iomem *pcie_index_offset; 1187 void __iomem *pcie_index_hi_offset; 1188 void __iomem *pcie_data_offset; 1189 1190 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1191 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1192 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1193 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1194 1195 spin_lock_irqsave(&adev->pcie_idx_lock, 
flags); 1196 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1197 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1198 if (pcie_index_hi != 0) 1199 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1200 pcie_index_hi * 4; 1201 1202 /* write low 32 bits */ 1203 writel(reg_addr, pcie_index_offset); 1204 readl(pcie_index_offset); 1205 if (pcie_index_hi != 0) { 1206 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1207 readl(pcie_index_hi_offset); 1208 } 1209 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1210 readl(pcie_data_offset); 1211 /* write high 32 bits */ 1212 writel(reg_addr + 4, pcie_index_offset); 1213 readl(pcie_index_offset); 1214 if (pcie_index_hi != 0) { 1215 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1216 readl(pcie_index_hi_offset); 1217 } 1218 writel((u32)(reg_data >> 32), pcie_data_offset); 1219 readl(pcie_data_offset); 1220 1221 /* clear the high bits */ 1222 if (pcie_index_hi != 0) { 1223 writel(0, pcie_index_hi_offset); 1224 readl(pcie_index_hi_offset); 1225 } 1226 1227 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1228 } 1229 1230 /** 1231 * amdgpu_device_get_rev_id - query device rev_id 1232 * 1233 * @adev: amdgpu_device pointer 1234 * 1235 * Return device rev_id 1236 */ 1237 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev) 1238 { 1239 return adev->nbio.funcs->get_rev_id(adev); 1240 } 1241 1242 /** 1243 * amdgpu_invalid_rreg - dummy reg read function 1244 * 1245 * @adev: amdgpu_device pointer 1246 * @reg: offset of register 1247 * 1248 * Dummy register read function. Used for register blocks 1249 * that certain asics don't have (all asics). 1250 * Returns the value in the register. 1251 */ 1252 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 1253 { 1254 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 1255 BUG(); 1256 return 0; 1257 } 1258 1259 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg) 1260 { 1261 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 1262 BUG(); 1263 return 0; 1264 } 1265 1266 /** 1267 * amdgpu_invalid_wreg - dummy reg write function 1268 * 1269 * @adev: amdgpu_device pointer 1270 * @reg: offset of register 1271 * @v: value to write to the register 1272 * 1273 * Dummy register read function. Used for register blocks 1274 * that certain asics don't have (all asics). 1275 */ 1276 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 1277 { 1278 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 1279 reg, v); 1280 BUG(); 1281 } 1282 1283 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v) 1284 { 1285 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n", 1286 reg, v); 1287 BUG(); 1288 } 1289 1290 /** 1291 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 1292 * 1293 * @adev: amdgpu_device pointer 1294 * @reg: offset of register 1295 * 1296 * Dummy register read function. Used for register blocks 1297 * that certain asics don't have (all asics). 1298 * Returns the value in the register. 
1299 */ 1300 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 1301 { 1302 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 1303 BUG(); 1304 return 0; 1305 } 1306 1307 static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg) 1308 { 1309 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 1310 BUG(); 1311 return 0; 1312 } 1313 1314 /** 1315 * amdgpu_invalid_wreg64 - dummy reg write function 1316 * 1317 * @adev: amdgpu_device pointer 1318 * @reg: offset of register 1319 * @v: value to write to the register 1320 * 1321 * Dummy register read function. Used for register blocks 1322 * that certain asics don't have (all asics). 1323 */ 1324 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 1325 { 1326 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 1327 reg, v); 1328 BUG(); 1329 } 1330 1331 static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v) 1332 { 1333 DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n", 1334 reg, v); 1335 BUG(); 1336 } 1337 1338 /** 1339 * amdgpu_block_invalid_rreg - dummy reg read function 1340 * 1341 * @adev: amdgpu_device pointer 1342 * @block: offset of instance 1343 * @reg: offset of register 1344 * 1345 * Dummy register read function. Used for register blocks 1346 * that certain asics don't have (all asics). 1347 * Returns the value in the register. 1348 */ 1349 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 1350 uint32_t block, uint32_t reg) 1351 { 1352 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 1353 reg, block); 1354 BUG(); 1355 return 0; 1356 } 1357 1358 /** 1359 * amdgpu_block_invalid_wreg - dummy reg write function 1360 * 1361 * @adev: amdgpu_device pointer 1362 * @block: offset of instance 1363 * @reg: offset of register 1364 * @v: value to write to the register 1365 * 1366 * Dummy register read function. Used for register blocks 1367 * that certain asics don't have (all asics). 1368 */ 1369 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 1370 uint32_t block, 1371 uint32_t reg, uint32_t v) 1372 { 1373 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 1374 reg, block, v); 1375 BUG(); 1376 } 1377 1378 /** 1379 * amdgpu_device_asic_init - Wrapper for atom asic_init 1380 * 1381 * @adev: amdgpu_device pointer 1382 * 1383 * Does any asic specific work and then calls atom asic init. 1384 */ 1385 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 1386 { 1387 int ret; 1388 1389 amdgpu_asic_pre_asic_init(adev); 1390 1391 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 1392 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 1393 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) { 1394 amdgpu_psp_wait_for_bootloader(adev); 1395 ret = amdgpu_atomfirmware_asic_init(adev, true); 1396 return ret; 1397 } else { 1398 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 1399 } 1400 1401 return 0; 1402 } 1403 1404 /** 1405 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page 1406 * 1407 * @adev: amdgpu_device pointer 1408 * 1409 * Allocates a scratch page of VRAM for use by various things in the 1410 * driver. 
1411 */ 1412 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev) 1413 { 1414 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE, 1415 AMDGPU_GEM_DOMAIN_VRAM | 1416 AMDGPU_GEM_DOMAIN_GTT, 1417 &adev->mem_scratch.robj, 1418 &adev->mem_scratch.gpu_addr, 1419 (void **)&adev->mem_scratch.ptr); 1420 } 1421 1422 /** 1423 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page 1424 * 1425 * @adev: amdgpu_device pointer 1426 * 1427 * Frees the VRAM scratch page. 1428 */ 1429 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev) 1430 { 1431 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL); 1432 } 1433 1434 /** 1435 * amdgpu_device_program_register_sequence - program an array of registers. 1436 * 1437 * @adev: amdgpu_device pointer 1438 * @registers: pointer to the register array 1439 * @array_size: size of the register array 1440 * 1441 * Programs an array or registers with and or masks. 1442 * This is a helper for setting golden registers. 1443 */ 1444 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 1445 const u32 *registers, 1446 const u32 array_size) 1447 { 1448 u32 tmp, reg, and_mask, or_mask; 1449 int i; 1450 1451 if (array_size % 3) 1452 return; 1453 1454 for (i = 0; i < array_size; i += 3) { 1455 reg = registers[i + 0]; 1456 and_mask = registers[i + 1]; 1457 or_mask = registers[i + 2]; 1458 1459 if (and_mask == 0xffffffff) { 1460 tmp = or_mask; 1461 } else { 1462 tmp = RREG32(reg); 1463 tmp &= ~and_mask; 1464 if (adev->family >= AMDGPU_FAMILY_AI) 1465 tmp |= (or_mask & and_mask); 1466 else 1467 tmp |= or_mask; 1468 } 1469 WREG32(reg, tmp); 1470 } 1471 } 1472 1473 /** 1474 * amdgpu_device_pci_config_reset - reset the GPU 1475 * 1476 * @adev: amdgpu_device pointer 1477 * 1478 * Resets the GPU using the pci config reset sequence. 1479 * Only applicable to asics prior to vega10. 1480 */ 1481 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 1482 { 1483 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 1484 } 1485 1486 /** 1487 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 1488 * 1489 * @adev: amdgpu_device pointer 1490 * 1491 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 1492 */ 1493 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 1494 { 1495 return pci_reset_function(adev->pdev); 1496 } 1497 1498 /* 1499 * amdgpu_device_wb_*() 1500 * Writeback is the method by which the GPU updates special pages in memory 1501 * with the status of certain GPU events (fences, ring pointers,etc.). 1502 */ 1503 1504 /** 1505 * amdgpu_device_wb_fini - Disable Writeback and free memory 1506 * 1507 * @adev: amdgpu_device pointer 1508 * 1509 * Disables Writeback and frees the Writeback memory (all asics). 1510 * Used at driver shutdown. 1511 */ 1512 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1513 { 1514 if (adev->wb.wb_obj) { 1515 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1516 &adev->wb.gpu_addr, 1517 (void **)&adev->wb.wb); 1518 adev->wb.wb_obj = NULL; 1519 } 1520 } 1521 1522 /** 1523 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory 1524 * 1525 * @adev: amdgpu_device pointer 1526 * 1527 * Initializes writeback and allocates writeback memory (all asics). 1528 * Used at driver startup. 1529 * Returns 0 on success or an -error on failure. 
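 *
 * Slots are then handed out with amdgpu_device_wb_get() and returned with
 * amdgpu_device_wb_free(); a minimal, purely illustrative pairing:
 *
 *   u32 wb;
 *
 *   r = amdgpu_device_wb_get(adev, &wb);
 *   if (!r) {
 *           ... use adev->wb.wb[wb] on the CPU side and
 *           adev->wb.gpu_addr + wb * 4 as the GPU address ...
 *           amdgpu_device_wb_free(adev, wb);
 *   }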
1530 */ 1531 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1532 { 1533 int r; 1534 1535 if (adev->wb.wb_obj == NULL) { 1536 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1537 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1538 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1539 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1540 (void **)&adev->wb.wb); 1541 if (r) { 1542 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1543 return r; 1544 } 1545 1546 adev->wb.num_wb = AMDGPU_MAX_WB; 1547 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1548 1549 /* clear wb memory */ 1550 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1551 } 1552 1553 return 0; 1554 } 1555 1556 /** 1557 * amdgpu_device_wb_get - Allocate a wb entry 1558 * 1559 * @adev: amdgpu_device pointer 1560 * @wb: wb index 1561 * 1562 * Allocate a wb slot for use by the driver (all asics). 1563 * Returns 0 on success or -EINVAL on failure. 1564 */ 1565 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1566 { 1567 unsigned long flags, offset; 1568 1569 spin_lock_irqsave(&adev->wb.lock, flags); 1570 offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1571 if (offset < adev->wb.num_wb) { 1572 __set_bit(offset, adev->wb.used); 1573 spin_unlock_irqrestore(&adev->wb.lock, flags); 1574 *wb = offset << 3; /* convert to dw offset */ 1575 return 0; 1576 } else { 1577 spin_unlock_irqrestore(&adev->wb.lock, flags); 1578 return -EINVAL; 1579 } 1580 } 1581 1582 /** 1583 * amdgpu_device_wb_free - Free a wb entry 1584 * 1585 * @adev: amdgpu_device pointer 1586 * @wb: wb index 1587 * 1588 * Free a wb slot allocated for use by the driver (all asics) 1589 */ 1590 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1591 { 1592 unsigned long flags; 1593 1594 wb >>= 3; 1595 spin_lock_irqsave(&adev->wb.lock, flags); 1596 if (wb < adev->wb.num_wb) 1597 __clear_bit(wb, adev->wb.used); 1598 spin_unlock_irqrestore(&adev->wb.lock, flags); 1599 } 1600 1601 /** 1602 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1603 * 1604 * @adev: amdgpu_device pointer 1605 * 1606 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1607 * to fail, but if any of the BARs is not accessible after the size we abort 1608 * driver loading by returning -ENODEV. 
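 *
 * For reference (an assumption based on the PCI resizable BAR encoding, in
 * which a size value n corresponds to 2^(n + 20) bytes, not something stated
 * here): 8 GiB of VRAM would translate to a requested rbar_size of 13 from
 * pci_rebar_bytes_to_size().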
1609 */ 1610 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1611 { 1612 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1613 struct pci_bus *root; 1614 struct resource *res; 1615 unsigned int i; 1616 u16 cmd; 1617 int r; 1618 1619 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) 1620 return 0; 1621 1622 /* Bypass for VF */ 1623 if (amdgpu_sriov_vf(adev)) 1624 return 0; 1625 1626 /* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */ 1627 if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR)) 1628 DRM_WARN("System can't access extended configuration space, please check!!\n"); 1629 1630 /* skip if the bios has already enabled large BAR */ 1631 if (adev->gmc.real_vram_size && 1632 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1633 return 0; 1634 1635 /* Check if the root BUS has 64bit memory resources */ 1636 root = adev->pdev->bus; 1637 while (root->parent) 1638 root = root->parent; 1639 1640 pci_bus_for_each_resource(root, res, i) { 1641 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1642 res->start > 0x100000000ull) 1643 break; 1644 } 1645 1646 /* Trying to resize is pointless without a root hub window above 4GB */ 1647 if (!res) 1648 return 0; 1649 1650 /* Limit the BAR size to what is available */ 1651 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1652 rbar_size); 1653 1654 /* Disable memory decoding while we change the BAR addresses and size */ 1655 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1656 pci_write_config_word(adev->pdev, PCI_COMMAND, 1657 cmd & ~PCI_COMMAND_MEMORY); 1658 1659 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1660 amdgpu_doorbell_fini(adev); 1661 if (adev->asic_type >= CHIP_BONAIRE) 1662 pci_release_resource(adev->pdev, 2); 1663 1664 pci_release_resource(adev->pdev, 0); 1665 1666 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1667 if (r == -ENOSPC) 1668 DRM_INFO("Not enough PCI address space for a large BAR."); 1669 else if (r && r != -ENOTSUPP) 1670 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1671 1672 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1673 1674 /* When the doorbell or fb BAR isn't available we have no chance of 1675 * using the device. 1676 */ 1677 r = amdgpu_doorbell_init(adev); 1678 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1679 return -ENODEV; 1680 1681 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1682 1683 return 0; 1684 } 1685 1686 static bool amdgpu_device_read_bios(struct amdgpu_device *adev) 1687 { 1688 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU)) 1689 return false; 1690 1691 return true; 1692 } 1693 1694 /* 1695 * GPU helpers function. 1696 */ 1697 /** 1698 * amdgpu_device_need_post - check if the hw need post or not 1699 * 1700 * @adev: amdgpu_device pointer 1701 * 1702 * Check if the asic has been initialized (all asics) at driver startup 1703 * or post is needed if hw reset is performed. 1704 * Returns true if need or false if not. 
1705 */ 1706 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1707 { 1708 uint32_t reg; 1709 1710 if (amdgpu_sriov_vf(adev)) 1711 return false; 1712 1713 if (!amdgpu_device_read_bios(adev)) 1714 return false; 1715 1716 if (amdgpu_passthrough(adev)) { 1717 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1718 * some old smc fw still need driver do vPost otherwise gpu hang, while 1719 * those smc fw version above 22.15 doesn't have this flaw, so we force 1720 * vpost executed for smc version below 22.15 1721 */ 1722 if (adev->asic_type == CHIP_FIJI) { 1723 int err; 1724 uint32_t fw_ver; 1725 1726 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1727 /* force vPost if error occured */ 1728 if (err) 1729 return true; 1730 1731 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1732 release_firmware(adev->pm.fw); 1733 if (fw_ver < 0x00160e00) 1734 return true; 1735 } 1736 } 1737 1738 /* Don't post if we need to reset whole hive on init */ 1739 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 1740 return false; 1741 1742 if (adev->has_hw_reset) { 1743 adev->has_hw_reset = false; 1744 return true; 1745 } 1746 1747 /* bios scratch used on CIK+ */ 1748 if (adev->asic_type >= CHIP_BONAIRE) 1749 return amdgpu_atombios_scratch_need_asic_init(adev); 1750 1751 /* check MEM_SIZE for older asics */ 1752 reg = amdgpu_asic_get_config_memsize(adev); 1753 1754 if ((reg != 0) && (reg != 0xffffffff)) 1755 return false; 1756 1757 return true; 1758 } 1759 1760 /* 1761 * Check whether seamless boot is supported. 1762 * 1763 * So far we only support seamless boot on DCE 3.0 or later. 1764 * If users report that it works on older ASICS as well, we may 1765 * loosen this. 1766 */ 1767 bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev) 1768 { 1769 switch (amdgpu_seamless) { 1770 case -1: 1771 break; 1772 case 1: 1773 return true; 1774 case 0: 1775 return false; 1776 default: 1777 DRM_ERROR("Invalid value for amdgpu.seamless: %d\n", 1778 amdgpu_seamless); 1779 return false; 1780 } 1781 1782 if (!(adev->flags & AMD_IS_APU)) 1783 return false; 1784 1785 if (adev->mman.keep_stolen_vga_memory) 1786 return false; 1787 1788 return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0); 1789 } 1790 1791 /* 1792 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids 1793 * don't support dynamic speed switching. Until we have confirmation from Intel 1794 * that a specific host supports it, it's safer that we keep it disabled for all. 1795 * 1796 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/ 1797 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663 1798 */ 1799 static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev) 1800 { 1801 #if IS_ENABLED(CONFIG_X86) 1802 struct cpuinfo_x86 *c = &cpu_data(0); 1803 1804 /* eGPU change speeds based on USB4 fabric conditions */ 1805 if (dev_is_removable(adev->dev)) 1806 return true; 1807 1808 if (c->x86_vendor == X86_VENDOR_INTEL) 1809 return false; 1810 #endif 1811 return true; 1812 } 1813 1814 /** 1815 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1816 * 1817 * @adev: amdgpu_device pointer 1818 * 1819 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1820 * be set for this device. 1821 * 1822 * Returns true if it should be used or false if not. 
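 *
 * Note (illustrative summary of the checks below): the module parameter wins
 * first (amdgpu_aspm=0/1), then APUs and parts without PCIe DPM are excluded,
 * and finally the decision follows what the PCI core reports via
 * pcie_aspm_enabled().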
1823 */ 1824 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1825 { 1826 switch (amdgpu_aspm) { 1827 case -1: 1828 break; 1829 case 0: 1830 return false; 1831 case 1: 1832 return true; 1833 default: 1834 return false; 1835 } 1836 if (adev->flags & AMD_IS_APU) 1837 return false; 1838 if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK)) 1839 return false; 1840 return pcie_aspm_enabled(adev->pdev); 1841 } 1842 1843 /* if we get transitioned to only one device, take VGA back */ 1844 /** 1845 * amdgpu_device_vga_set_decode - enable/disable vga decode 1846 * 1847 * @pdev: PCI device pointer 1848 * @state: enable/disable vga decode 1849 * 1850 * Enable/disable vga decode (all asics). 1851 * Returns VGA resource flags. 1852 */ 1853 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1854 bool state) 1855 { 1856 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1857 1858 amdgpu_asic_set_vga_state(adev, state); 1859 if (state) 1860 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1861 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1862 else 1863 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1864 } 1865 1866 /** 1867 * amdgpu_device_check_block_size - validate the vm block size 1868 * 1869 * @adev: amdgpu_device pointer 1870 * 1871 * Validates the vm block size specified via module parameter. 1872 * The vm block size defines number of bits in page table versus page directory, 1873 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1874 * page table and the remaining bits are in the page directory. 1875 */ 1876 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1877 { 1878 /* defines number of bits in page table versus page directory, 1879 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1880 * page table and the remaining bits are in the page directory 1881 */ 1882 if (amdgpu_vm_block_size == -1) 1883 return; 1884 1885 if (amdgpu_vm_block_size < 9) { 1886 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1887 amdgpu_vm_block_size); 1888 amdgpu_vm_block_size = -1; 1889 } 1890 } 1891 1892 /** 1893 * amdgpu_device_check_vm_size - validate the vm size 1894 * 1895 * @adev: amdgpu_device pointer 1896 * 1897 * Validates the vm size in GB specified via module parameter. 1898 * The VM size is the size of the GPU virtual memory space in GB. 
1899 */ 1900 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1901 { 1902 /* no need to check the default value */ 1903 if (amdgpu_vm_size == -1) 1904 return; 1905 1906 if (amdgpu_vm_size < 1) { 1907 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1908 amdgpu_vm_size); 1909 amdgpu_vm_size = -1; 1910 } 1911 } 1912 1913 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1914 { 1915 struct sysinfo si; 1916 bool is_os_64 = (sizeof(void *) == 8); 1917 uint64_t total_memory; 1918 uint64_t dram_size_seven_GB = 0x1B8000000; 1919 uint64_t dram_size_three_GB = 0xB8000000; 1920 1921 if (amdgpu_smu_memory_pool_size == 0) 1922 return; 1923 1924 if (!is_os_64) { 1925 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1926 goto def_value; 1927 } 1928 si_meminfo(&si); 1929 total_memory = (uint64_t)si.totalram * si.mem_unit; 1930 1931 if ((amdgpu_smu_memory_pool_size == 1) || 1932 (amdgpu_smu_memory_pool_size == 2)) { 1933 if (total_memory < dram_size_three_GB) 1934 goto def_value1; 1935 } else if ((amdgpu_smu_memory_pool_size == 4) || 1936 (amdgpu_smu_memory_pool_size == 8)) { 1937 if (total_memory < dram_size_seven_GB) 1938 goto def_value1; 1939 } else { 1940 DRM_WARN("Smu memory pool size not supported\n"); 1941 goto def_value; 1942 } 1943 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1944 1945 return; 1946 1947 def_value1: 1948 DRM_WARN("No enough system memory\n"); 1949 def_value: 1950 adev->pm.smu_prv_buffer_size = 0; 1951 } 1952 1953 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 1954 { 1955 if (!(adev->flags & AMD_IS_APU) || 1956 adev->asic_type < CHIP_RAVEN) 1957 return 0; 1958 1959 switch (adev->asic_type) { 1960 case CHIP_RAVEN: 1961 if (adev->pdev->device == 0x15dd) 1962 adev->apu_flags |= AMD_APU_IS_RAVEN; 1963 if (adev->pdev->device == 0x15d8) 1964 adev->apu_flags |= AMD_APU_IS_PICASSO; 1965 break; 1966 case CHIP_RENOIR: 1967 if ((adev->pdev->device == 0x1636) || 1968 (adev->pdev->device == 0x164c)) 1969 adev->apu_flags |= AMD_APU_IS_RENOIR; 1970 else 1971 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 1972 break; 1973 case CHIP_VANGOGH: 1974 adev->apu_flags |= AMD_APU_IS_VANGOGH; 1975 break; 1976 case CHIP_YELLOW_CARP: 1977 break; 1978 case CHIP_CYAN_SKILLFISH: 1979 if ((adev->pdev->device == 0x13FE) || 1980 (adev->pdev->device == 0x143F)) 1981 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 1982 break; 1983 default: 1984 break; 1985 } 1986 1987 return 0; 1988 } 1989 1990 /** 1991 * amdgpu_device_check_arguments - validate module params 1992 * 1993 * @adev: amdgpu_device pointer 1994 * 1995 * Validates certain module parameters and updates 1996 * the associated values used by the driver (all asics). 
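 *
 * A couple of illustrative outcomes of the checks below (examples, not an
 * exhaustive list):
 *
 *	amdgpu.sched_jobs=6	- rounded up to 8 (must be a power of two)
 *	amdgpu.gart_size=16	- rejected and reset to -1 (minimum is 32M)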
1997 */ 1998 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 1999 { 2000 int i; 2001 2002 if (amdgpu_sched_jobs < 4) { 2003 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 2004 amdgpu_sched_jobs); 2005 amdgpu_sched_jobs = 4; 2006 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 2007 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 2008 amdgpu_sched_jobs); 2009 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 2010 } 2011 2012 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 2013 /* gart size must be greater or equal to 32M */ 2014 dev_warn(adev->dev, "gart size (%d) too small\n", 2015 amdgpu_gart_size); 2016 amdgpu_gart_size = -1; 2017 } 2018 2019 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 2020 /* gtt size must be greater or equal to 32M */ 2021 dev_warn(adev->dev, "gtt size (%d) too small\n", 2022 amdgpu_gtt_size); 2023 amdgpu_gtt_size = -1; 2024 } 2025 2026 /* valid range is between 4 and 9 inclusive */ 2027 if (amdgpu_vm_fragment_size != -1 && 2028 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 2029 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 2030 amdgpu_vm_fragment_size = -1; 2031 } 2032 2033 if (amdgpu_sched_hw_submission < 2) { 2034 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 2035 amdgpu_sched_hw_submission); 2036 amdgpu_sched_hw_submission = 2; 2037 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 2038 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 2039 amdgpu_sched_hw_submission); 2040 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 2041 } 2042 2043 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 2044 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 2045 amdgpu_reset_method = -1; 2046 } 2047 2048 amdgpu_device_check_smu_prv_buffer_size(adev); 2049 2050 amdgpu_device_check_vm_size(adev); 2051 2052 amdgpu_device_check_block_size(adev); 2053 2054 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 2055 2056 for (i = 0; i < MAX_XCP; i++) 2057 adev->enforce_isolation[i] = !!enforce_isolation; 2058 2059 return 0; 2060 } 2061 2062 /** 2063 * amdgpu_switcheroo_set_state - set switcheroo state 2064 * 2065 * @pdev: pci dev pointer 2066 * @state: vga_switcheroo state 2067 * 2068 * Callback for the switcheroo driver. Suspends or resumes 2069 * the asics before or after it is powered up using ACPI methods. 
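 *
 * For reference, the power-on path below boils down to (simplified sketch,
 * error handling omitted):
 *
 *	pci_set_power_state(pdev, PCI_D0);
 *	amdgpu_device_load_pci_state(pdev);
 *	pci_enable_device(pdev);
 *	amdgpu_device_resume(dev, true);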
2070 */ 2071 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 2072 enum vga_switcheroo_state state) 2073 { 2074 struct drm_device *dev = pci_get_drvdata(pdev); 2075 int r; 2076 2077 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 2078 return; 2079 2080 if (state == VGA_SWITCHEROO_ON) { 2081 pr_info("switched on\n"); 2082 /* don't suspend or resume card normally */ 2083 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2084 2085 pci_set_power_state(pdev, PCI_D0); 2086 amdgpu_device_load_pci_state(pdev); 2087 r = pci_enable_device(pdev); 2088 if (r) 2089 DRM_WARN("pci_enable_device failed (%d)\n", r); 2090 amdgpu_device_resume(dev, true); 2091 2092 dev->switch_power_state = DRM_SWITCH_POWER_ON; 2093 } else { 2094 pr_info("switched off\n"); 2095 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2096 amdgpu_device_prepare(dev); 2097 amdgpu_device_suspend(dev, true); 2098 amdgpu_device_cache_pci_state(pdev); 2099 /* Shut down the device */ 2100 pci_disable_device(pdev); 2101 pci_set_power_state(pdev, PCI_D3cold); 2102 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 2103 } 2104 } 2105 2106 /** 2107 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 2108 * 2109 * @pdev: pci dev pointer 2110 * 2111 * Callback for the switcheroo driver. Check of the switcheroo 2112 * state can be changed. 2113 * Returns true if the state can be changed, false if not. 2114 */ 2115 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 2116 { 2117 struct drm_device *dev = pci_get_drvdata(pdev); 2118 2119 /* 2120 * FIXME: open_count is protected by drm_global_mutex but that would lead to 2121 * locking inversion with the driver load path. And the access here is 2122 * completely racy anyway. So don't bother with locking for now. 2123 */ 2124 return atomic_read(&dev->open_count) == 0; 2125 } 2126 2127 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 2128 .set_gpu_state = amdgpu_switcheroo_set_state, 2129 .reprobe = NULL, 2130 .can_switch = amdgpu_switcheroo_can_switch, 2131 }; 2132 2133 /** 2134 * amdgpu_device_ip_set_clockgating_state - set the CG state 2135 * 2136 * @dev: amdgpu_device pointer 2137 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2138 * @state: clockgating state (gate or ungate) 2139 * 2140 * Sets the requested clockgating state for all instances of 2141 * the hardware IP specified. 2142 * Returns the error code from the last instance. 2143 */ 2144 int amdgpu_device_ip_set_clockgating_state(void *dev, 2145 enum amd_ip_block_type block_type, 2146 enum amd_clockgating_state state) 2147 { 2148 struct amdgpu_device *adev = dev; 2149 int i, r = 0; 2150 2151 for (i = 0; i < adev->num_ip_blocks; i++) { 2152 if (!adev->ip_blocks[i].status.valid) 2153 continue; 2154 if (adev->ip_blocks[i].version->type != block_type) 2155 continue; 2156 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 2157 continue; 2158 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 2159 (void *)adev, state); 2160 if (r) 2161 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 2162 adev->ip_blocks[i].version->funcs->name, r); 2163 } 2164 return r; 2165 } 2166 2167 /** 2168 * amdgpu_device_ip_set_powergating_state - set the PG state 2169 * 2170 * @dev: amdgpu_device pointer 2171 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2172 * @state: powergating state (gate or ungate) 2173 * 2174 * Sets the requested powergating state for all instances of 2175 * the hardware IP specified. 
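 *
 * For example, gating VCN power would look like this (illustrative call,
 * not taken from a specific caller):
 *
 *	amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCN,
 *					       AMD_PG_STATE_GATE);
 *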
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u64 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			if (adev->ip_blocks[i].version->funcs->wait_for_idle) {
				r = adev->ip_blocks[i].version->funcs->wait_for_idle(
					&adev->ip_blocks[i]);
				if (r)
					return r;
			}
			break;
		}
	}
	return 0;
}

/**
 * amdgpu_device_ip_is_valid - is the hardware IP enabled
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is enabled or not.
 * Returns true if the IP is enabled, false if not.
 */
bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev,
			       enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].status.valid;
	}
	return false;
}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
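 *
 * Typical lookup (illustrative, mirroring how this file uses it further
 * down):
 *
 *	ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *	if (ip_block && ip_block->status.valid)
 *		amdgpu_amdkfd_device_probe(adev);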
2286 */ 2287 struct amdgpu_ip_block * 2288 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 2289 enum amd_ip_block_type type) 2290 { 2291 int i; 2292 2293 for (i = 0; i < adev->num_ip_blocks; i++) 2294 if (adev->ip_blocks[i].version->type == type) 2295 return &adev->ip_blocks[i]; 2296 2297 return NULL; 2298 } 2299 2300 /** 2301 * amdgpu_device_ip_block_version_cmp 2302 * 2303 * @adev: amdgpu_device pointer 2304 * @type: enum amd_ip_block_type 2305 * @major: major version 2306 * @minor: minor version 2307 * 2308 * return 0 if equal or greater 2309 * return 1 if smaller or the ip_block doesn't exist 2310 */ 2311 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 2312 enum amd_ip_block_type type, 2313 u32 major, u32 minor) 2314 { 2315 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 2316 2317 if (ip_block && ((ip_block->version->major > major) || 2318 ((ip_block->version->major == major) && 2319 (ip_block->version->minor >= minor)))) 2320 return 0; 2321 2322 return 1; 2323 } 2324 2325 /** 2326 * amdgpu_device_ip_block_add 2327 * 2328 * @adev: amdgpu_device pointer 2329 * @ip_block_version: pointer to the IP to add 2330 * 2331 * Adds the IP block driver information to the collection of IPs 2332 * on the asic. 2333 */ 2334 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 2335 const struct amdgpu_ip_block_version *ip_block_version) 2336 { 2337 if (!ip_block_version) 2338 return -EINVAL; 2339 2340 switch (ip_block_version->type) { 2341 case AMD_IP_BLOCK_TYPE_VCN: 2342 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2343 return 0; 2344 break; 2345 case AMD_IP_BLOCK_TYPE_JPEG: 2346 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2347 return 0; 2348 break; 2349 default: 2350 break; 2351 } 2352 2353 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, 2354 ip_block_version->funcs->name); 2355 2356 adev->ip_blocks[adev->num_ip_blocks].adev = adev; 2357 2358 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2359 2360 return 0; 2361 } 2362 2363 /** 2364 * amdgpu_device_enable_virtual_display - enable virtual display feature 2365 * 2366 * @adev: amdgpu_device pointer 2367 * 2368 * Enabled the virtual display feature if the user has enabled it via 2369 * the module parameter virtual_display. This feature provides a virtual 2370 * display hardware on headless boards or in virtualized environments. 2371 * This function parses and validates the configuration string specified by 2372 * the user and configues the virtual display configuration (number of 2373 * virtual connectors, crtcs, etc.) specified. 
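 *
 * The string parsed below is a semicolon separated list of
 * "<pci address>,<number of crtcs>" entries, or "all".  Illustrative
 * settings (the PCI address is an example only):
 *
 *	amdgpu.virtual_display=0000:04:00.0,2
 *	amdgpu.virtual_display=all,1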
2374 */ 2375 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2376 { 2377 adev->enable_virtual_display = false; 2378 2379 if (amdgpu_virtual_display) { 2380 const char *pci_address_name = pci_name(adev->pdev); 2381 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2382 2383 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2384 pciaddstr_tmp = pciaddstr; 2385 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2386 pciaddname = strsep(&pciaddname_tmp, ","); 2387 if (!strcmp("all", pciaddname) 2388 || !strcmp(pci_address_name, pciaddname)) { 2389 long num_crtc; 2390 int res = -1; 2391 2392 adev->enable_virtual_display = true; 2393 2394 if (pciaddname_tmp) 2395 res = kstrtol(pciaddname_tmp, 10, 2396 &num_crtc); 2397 2398 if (!res) { 2399 if (num_crtc < 1) 2400 num_crtc = 1; 2401 if (num_crtc > 6) 2402 num_crtc = 6; 2403 adev->mode_info.num_crtc = num_crtc; 2404 } else { 2405 adev->mode_info.num_crtc = 1; 2406 } 2407 break; 2408 } 2409 } 2410 2411 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2412 amdgpu_virtual_display, pci_address_name, 2413 adev->enable_virtual_display, adev->mode_info.num_crtc); 2414 2415 kfree(pciaddstr); 2416 } 2417 } 2418 2419 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2420 { 2421 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2422 adev->mode_info.num_crtc = 1; 2423 adev->enable_virtual_display = true; 2424 DRM_INFO("virtual_display:%d, num_crtc:%d\n", 2425 adev->enable_virtual_display, adev->mode_info.num_crtc); 2426 } 2427 } 2428 2429 /** 2430 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2431 * 2432 * @adev: amdgpu_device pointer 2433 * 2434 * Parses the asic configuration parameters specified in the gpu info 2435 * firmware and makes them availale to the driver for use in configuring 2436 * the asic. 2437 * Returns 0 on success, -EINVAL on failure. 2438 */ 2439 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2440 { 2441 const char *chip_name; 2442 int err; 2443 const struct gpu_info_firmware_header_v1_0 *hdr; 2444 2445 adev->firmware.gpu_info_fw = NULL; 2446 2447 if (adev->mman.discovery_bin) 2448 return 0; 2449 2450 switch (adev->asic_type) { 2451 default: 2452 return 0; 2453 case CHIP_VEGA10: 2454 chip_name = "vega10"; 2455 break; 2456 case CHIP_VEGA12: 2457 chip_name = "vega12"; 2458 break; 2459 case CHIP_RAVEN: 2460 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2461 chip_name = "raven2"; 2462 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2463 chip_name = "picasso"; 2464 else 2465 chip_name = "raven"; 2466 break; 2467 case CHIP_ARCTURUS: 2468 chip_name = "arcturus"; 2469 break; 2470 case CHIP_NAVI12: 2471 chip_name = "navi12"; 2472 break; 2473 } 2474 2475 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, 2476 "amdgpu/%s_gpu_info.bin", chip_name); 2477 if (err) { 2478 dev_err(adev->dev, 2479 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n", 2480 chip_name); 2481 goto out; 2482 } 2483 2484 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2485 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2486 2487 switch (hdr->version_major) { 2488 case 1: 2489 { 2490 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2491 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2492 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2493 2494 /* 2495 * Should be droped when DAL no longer needs it. 
 */
		if (adev->asic_type == CHIP_NAVI12)
			goto parse_soc_bounding_box;

		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
		adev->gfx.config.max_texture_channel_caches =
			le32_to_cpu(gpu_info_fw->gc_num_tccs);
		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
		adev->gfx.config.double_offchip_lds_buf =
			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
		adev->gfx.cu_info.max_waves_per_simd =
			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
		adev->gfx.cu_info.max_scratch_slots_per_cu =
			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
		if (hdr->version_minor >= 1) {
			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
					le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->gfx.config.num_sc_per_sh =
				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
			adev->gfx.config.num_packer_per_sc =
				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
		}

parse_soc_bounding_box:
		/*
		 * SOC bounding box info is not integrated into the discovery
		 * table, so we always need to parse it from the gpu info
		 * firmware when needed.
		 */
		if (hdr->version_minor == 2) {
			const struct gpu_info_firmware_v1_2 *gpu_info_fw =
				(const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
					le32_to_cpu(hdr->header.ucode_array_offset_bytes));
			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
		}
		break;
	}
	default:
		dev_err(adev->dev,
			"Unsupported gpu_info table %d\n", hdr->header.ucode_version);
		err = -EINVAL;
		goto out;
	}
out:
	return err;
}

/**
 * amdgpu_device_ip_early_init - run early init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Early initialization pass for hardware IPs. The hardware IPs that make
 * up each asic are discovered and each IP's early_init callback is run. This
 * is the first stage in initializing the asic.
 * Returns 0 on success, negative error code on failure.
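 *
 * Individual blocks can also be masked off before their early_init runs;
 * an illustrative example (bit positions follow the order in which the
 * blocks were added):
 *
 *	amdgpu.ip_block_mask=0xffffffdf	- clears bit 5, so the IP block at
 *					  index 5 is marked invalid for this boot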
2560 */ 2561 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2562 { 2563 struct amdgpu_ip_block *ip_block; 2564 struct pci_dev *parent; 2565 int i, r; 2566 bool total; 2567 2568 amdgpu_device_enable_virtual_display(adev); 2569 2570 if (amdgpu_sriov_vf(adev)) { 2571 r = amdgpu_virt_request_full_gpu(adev, true); 2572 if (r) 2573 return r; 2574 } 2575 2576 switch (adev->asic_type) { 2577 #ifdef CONFIG_DRM_AMDGPU_SI 2578 case CHIP_VERDE: 2579 case CHIP_TAHITI: 2580 case CHIP_PITCAIRN: 2581 case CHIP_OLAND: 2582 case CHIP_HAINAN: 2583 adev->family = AMDGPU_FAMILY_SI; 2584 r = si_set_ip_blocks(adev); 2585 if (r) 2586 return r; 2587 break; 2588 #endif 2589 #ifdef CONFIG_DRM_AMDGPU_CIK 2590 case CHIP_BONAIRE: 2591 case CHIP_HAWAII: 2592 case CHIP_KAVERI: 2593 case CHIP_KABINI: 2594 case CHIP_MULLINS: 2595 if (adev->flags & AMD_IS_APU) 2596 adev->family = AMDGPU_FAMILY_KV; 2597 else 2598 adev->family = AMDGPU_FAMILY_CI; 2599 2600 r = cik_set_ip_blocks(adev); 2601 if (r) 2602 return r; 2603 break; 2604 #endif 2605 case CHIP_TOPAZ: 2606 case CHIP_TONGA: 2607 case CHIP_FIJI: 2608 case CHIP_POLARIS10: 2609 case CHIP_POLARIS11: 2610 case CHIP_POLARIS12: 2611 case CHIP_VEGAM: 2612 case CHIP_CARRIZO: 2613 case CHIP_STONEY: 2614 if (adev->flags & AMD_IS_APU) 2615 adev->family = AMDGPU_FAMILY_CZ; 2616 else 2617 adev->family = AMDGPU_FAMILY_VI; 2618 2619 r = vi_set_ip_blocks(adev); 2620 if (r) 2621 return r; 2622 break; 2623 default: 2624 r = amdgpu_discovery_set_ip_blocks(adev); 2625 if (r) 2626 return r; 2627 break; 2628 } 2629 2630 if (amdgpu_has_atpx() && 2631 (amdgpu_is_atpx_hybrid() || 2632 amdgpu_has_atpx_dgpu_power_cntl()) && 2633 ((adev->flags & AMD_IS_APU) == 0) && 2634 !dev_is_removable(&adev->pdev->dev)) 2635 adev->flags |= AMD_IS_PX; 2636 2637 if (!(adev->flags & AMD_IS_APU)) { 2638 parent = pcie_find_root_port(adev->pdev); 2639 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2640 } 2641 2642 2643 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2644 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2645 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2646 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2647 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2648 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2649 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2650 2651 total = true; 2652 for (i = 0; i < adev->num_ip_blocks; i++) { 2653 ip_block = &adev->ip_blocks[i]; 2654 2655 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2656 DRM_WARN("disabled ip block: %d <%s>\n", 2657 i, adev->ip_blocks[i].version->funcs->name); 2658 adev->ip_blocks[i].status.valid = false; 2659 } else if (ip_block->version->funcs->early_init) { 2660 r = ip_block->version->funcs->early_init(ip_block); 2661 if (r == -ENOENT) { 2662 adev->ip_blocks[i].status.valid = false; 2663 } else if (r) { 2664 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2665 adev->ip_blocks[i].version->funcs->name, r); 2666 total = false; 2667 } else { 2668 adev->ip_blocks[i].status.valid = true; 2669 } 2670 } else { 2671 adev->ip_blocks[i].status.valid = true; 2672 } 2673 /* get the vbios after the asic_funcs are set up */ 2674 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2675 r = amdgpu_device_parse_gpu_info_fw(adev); 2676 if (r) 2677 return r; 2678 2679 /* Read BIOS */ 2680 if (amdgpu_device_read_bios(adev)) { 2681 if (!amdgpu_get_bios(adev)) 2682 return -EINVAL; 2683 2684 r = amdgpu_atombios_init(adev); 2685 if (r) { 2686 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2687 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2688 return r; 2689 } 2690 } 2691 2692 /*get pf2vf msg info at it's earliest time*/ 2693 if (amdgpu_sriov_vf(adev)) 2694 amdgpu_virt_init_data_exchange(adev); 2695 2696 } 2697 } 2698 if (!total) 2699 return -ENODEV; 2700 2701 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 2702 if (ip_block->status.valid != false) 2703 amdgpu_amdkfd_device_probe(adev); 2704 2705 adev->cg_flags &= amdgpu_cg_mask; 2706 adev->pg_flags &= amdgpu_pg_mask; 2707 2708 return 0; 2709 } 2710 2711 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2712 { 2713 int i, r; 2714 2715 for (i = 0; i < adev->num_ip_blocks; i++) { 2716 if (!adev->ip_blocks[i].status.sw) 2717 continue; 2718 if (adev->ip_blocks[i].status.hw) 2719 continue; 2720 if (!amdgpu_ip_member_of_hwini( 2721 adev, adev->ip_blocks[i].version->type)) 2722 continue; 2723 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2724 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2725 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2726 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2727 if (r) { 2728 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2729 adev->ip_blocks[i].version->funcs->name, r); 2730 return r; 2731 } 2732 adev->ip_blocks[i].status.hw = true; 2733 } 2734 } 2735 2736 return 0; 2737 } 2738 2739 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2740 { 2741 int i, r; 2742 2743 for (i = 0; i < adev->num_ip_blocks; i++) { 2744 if (!adev->ip_blocks[i].status.sw) 2745 continue; 2746 if (adev->ip_blocks[i].status.hw) 2747 continue; 2748 if (!amdgpu_ip_member_of_hwini( 2749 adev, adev->ip_blocks[i].version->type)) 2750 continue; 2751 r = 
adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2752 if (r) { 2753 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2754 adev->ip_blocks[i].version->funcs->name, r); 2755 return r; 2756 } 2757 adev->ip_blocks[i].status.hw = true; 2758 } 2759 2760 return 0; 2761 } 2762 2763 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2764 { 2765 int r = 0; 2766 int i; 2767 uint32_t smu_version; 2768 2769 if (adev->asic_type >= CHIP_VEGA10) { 2770 for (i = 0; i < adev->num_ip_blocks; i++) { 2771 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2772 continue; 2773 2774 if (!amdgpu_ip_member_of_hwini(adev, 2775 AMD_IP_BLOCK_TYPE_PSP)) 2776 break; 2777 2778 if (!adev->ip_blocks[i].status.sw) 2779 continue; 2780 2781 /* no need to do the fw loading again if already done*/ 2782 if (adev->ip_blocks[i].status.hw == true) 2783 break; 2784 2785 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2786 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 2787 if (r) 2788 return r; 2789 } else { 2790 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2791 if (r) { 2792 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2793 adev->ip_blocks[i].version->funcs->name, r); 2794 return r; 2795 } 2796 adev->ip_blocks[i].status.hw = true; 2797 } 2798 break; 2799 } 2800 } 2801 2802 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2803 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2804 2805 return r; 2806 } 2807 2808 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2809 { 2810 long timeout; 2811 int r, i; 2812 2813 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2814 struct amdgpu_ring *ring = adev->rings[i]; 2815 2816 /* No need to setup the GPU scheduler for rings that don't need it */ 2817 if (!ring || ring->no_scheduler) 2818 continue; 2819 2820 switch (ring->funcs->type) { 2821 case AMDGPU_RING_TYPE_GFX: 2822 timeout = adev->gfx_timeout; 2823 break; 2824 case AMDGPU_RING_TYPE_COMPUTE: 2825 timeout = adev->compute_timeout; 2826 break; 2827 case AMDGPU_RING_TYPE_SDMA: 2828 timeout = adev->sdma_timeout; 2829 break; 2830 default: 2831 timeout = adev->video_timeout; 2832 break; 2833 } 2834 2835 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL, 2836 DRM_SCHED_PRIORITY_COUNT, 2837 ring->num_hw_submission, 0, 2838 timeout, adev->reset_domain->wq, 2839 ring->sched_score, ring->name, 2840 adev->dev); 2841 if (r) { 2842 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2843 ring->name); 2844 return r; 2845 } 2846 r = amdgpu_uvd_entity_init(adev, ring); 2847 if (r) { 2848 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n", 2849 ring->name); 2850 return r; 2851 } 2852 r = amdgpu_vce_entity_init(adev, ring); 2853 if (r) { 2854 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n", 2855 ring->name); 2856 return r; 2857 } 2858 } 2859 2860 amdgpu_xcp_update_partition_sched_list(adev); 2861 2862 return 0; 2863 } 2864 2865 2866 /** 2867 * amdgpu_device_ip_init - run init for hardware IPs 2868 * 2869 * @adev: amdgpu_device pointer 2870 * 2871 * Main initialization pass for hardware IPs. The list of all the hardware 2872 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2873 * are run. sw_init initializes the software state associated with each IP 2874 * and hw_init initializes the hardware associated with each IP. 2875 * Returns 0 on success, negative error code on failure. 
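 *
 * Roughly, the ordering implemented below is (simplified sketch, error
 * paths and SR-IOV details omitted):
 *
 *	sw_init on every valid block
 *	early hw_init of COMMON and GMC (plus scratch/wb setup)
 *	amdgpu_device_ip_hw_init_phase1()	- IH, PSP for SR-IOV
 *	amdgpu_device_fw_loading()
 *	amdgpu_device_ip_hw_init_phase2()	- all remaining blocks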
2876 */ 2877 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2878 { 2879 bool init_badpage; 2880 int i, r; 2881 2882 r = amdgpu_ras_init(adev); 2883 if (r) 2884 return r; 2885 2886 for (i = 0; i < adev->num_ip_blocks; i++) { 2887 if (!adev->ip_blocks[i].status.valid) 2888 continue; 2889 if (adev->ip_blocks[i].version->funcs->sw_init) { 2890 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]); 2891 if (r) { 2892 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2893 adev->ip_blocks[i].version->funcs->name, r); 2894 goto init_failed; 2895 } 2896 } 2897 adev->ip_blocks[i].status.sw = true; 2898 2899 if (!amdgpu_ip_member_of_hwini( 2900 adev, adev->ip_blocks[i].version->type)) 2901 continue; 2902 2903 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2904 /* need to do common hw init early so everything is set up for gmc */ 2905 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2906 if (r) { 2907 DRM_ERROR("hw_init %d failed %d\n", i, r); 2908 goto init_failed; 2909 } 2910 adev->ip_blocks[i].status.hw = true; 2911 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2912 /* need to do gmc hw init early so we can allocate gpu mem */ 2913 /* Try to reserve bad pages early */ 2914 if (amdgpu_sriov_vf(adev)) 2915 amdgpu_virt_exchange_data(adev); 2916 2917 r = amdgpu_device_mem_scratch_init(adev); 2918 if (r) { 2919 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2920 goto init_failed; 2921 } 2922 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2923 if (r) { 2924 DRM_ERROR("hw_init %d failed %d\n", i, r); 2925 goto init_failed; 2926 } 2927 r = amdgpu_device_wb_init(adev); 2928 if (r) { 2929 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2930 goto init_failed; 2931 } 2932 adev->ip_blocks[i].status.hw = true; 2933 2934 /* right after GMC hw init, we create CSA */ 2935 if (adev->gfx.mcbp) { 2936 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2937 AMDGPU_GEM_DOMAIN_VRAM | 2938 AMDGPU_GEM_DOMAIN_GTT, 2939 AMDGPU_CSA_SIZE); 2940 if (r) { 2941 DRM_ERROR("allocate CSA failed %d\n", r); 2942 goto init_failed; 2943 } 2944 } 2945 2946 r = amdgpu_seq64_init(adev); 2947 if (r) { 2948 DRM_ERROR("allocate seq64 failed %d\n", r); 2949 goto init_failed; 2950 } 2951 } 2952 } 2953 2954 if (amdgpu_sriov_vf(adev)) 2955 amdgpu_virt_init_data_exchange(adev); 2956 2957 r = amdgpu_ib_pool_init(adev); 2958 if (r) { 2959 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2960 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2961 goto init_failed; 2962 } 2963 2964 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2965 if (r) 2966 goto init_failed; 2967 2968 r = amdgpu_device_ip_hw_init_phase1(adev); 2969 if (r) 2970 goto init_failed; 2971 2972 r = amdgpu_device_fw_loading(adev); 2973 if (r) 2974 goto init_failed; 2975 2976 r = amdgpu_device_ip_hw_init_phase2(adev); 2977 if (r) 2978 goto init_failed; 2979 2980 /* 2981 * retired pages will be loaded from eeprom and reserved here, 2982 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2983 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2984 * for I2C communication which only true at this point. 2985 * 2986 * amdgpu_ras_recovery_init may fail, but the upper only cares the 2987 * failure from bad gpu situation and stop amdgpu init process 2988 * accordingly. 
For other failed cases, it will still release all 2989 * the resource and print error message, rather than returning one 2990 * negative value to upper level. 2991 * 2992 * Note: theoretically, this should be called before all vram allocations 2993 * to protect retired page from abusing 2994 */ 2995 init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 2996 r = amdgpu_ras_recovery_init(adev, init_badpage); 2997 if (r) 2998 goto init_failed; 2999 3000 /** 3001 * In case of XGMI grab extra reference for reset domain for this device 3002 */ 3003 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3004 if (amdgpu_xgmi_add_device(adev) == 0) { 3005 if (!amdgpu_sriov_vf(adev)) { 3006 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3007 3008 if (WARN_ON(!hive)) { 3009 r = -ENOENT; 3010 goto init_failed; 3011 } 3012 3013 if (!hive->reset_domain || 3014 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 3015 r = -ENOENT; 3016 amdgpu_put_xgmi_hive(hive); 3017 goto init_failed; 3018 } 3019 3020 /* Drop the early temporary reset domain we created for device */ 3021 amdgpu_reset_put_reset_domain(adev->reset_domain); 3022 adev->reset_domain = hive->reset_domain; 3023 amdgpu_put_xgmi_hive(hive); 3024 } 3025 } 3026 } 3027 3028 r = amdgpu_device_init_schedulers(adev); 3029 if (r) 3030 goto init_failed; 3031 3032 if (adev->mman.buffer_funcs_ring->sched.ready) 3033 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3034 3035 /* Don't init kfd if whole hive need to be reset during init */ 3036 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 3037 kgd2kfd_init_zone_device(adev); 3038 amdgpu_amdkfd_device_init(adev); 3039 } 3040 3041 amdgpu_fru_get_product_info(adev); 3042 3043 init_failed: 3044 3045 return r; 3046 } 3047 3048 /** 3049 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 3050 * 3051 * @adev: amdgpu_device pointer 3052 * 3053 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 3054 * this function before a GPU reset. If the value is retained after a 3055 * GPU reset, VRAM has not been lost. Some GPU resets may destry VRAM contents. 3056 */ 3057 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 3058 { 3059 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 3060 } 3061 3062 /** 3063 * amdgpu_device_check_vram_lost - check if vram is valid 3064 * 3065 * @adev: amdgpu_device pointer 3066 * 3067 * Checks the reset magic value written to the gart pointer in VRAM. 3068 * The driver calls this after a GPU reset to see if the contents of 3069 * VRAM is lost or now. 3070 * returns true if vram is lost, false if not. 3071 */ 3072 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 3073 { 3074 if (memcmp(adev->gart.ptr, adev->reset_magic, 3075 AMDGPU_RESET_MAGIC_NUM)) 3076 return true; 3077 3078 if (!amdgpu_in_reset(adev)) 3079 return false; 3080 3081 /* 3082 * For all ASICs with baco/mode1 reset, the VRAM is 3083 * always assumed to be lost. 3084 */ 3085 switch (amdgpu_asic_reset_method(adev)) { 3086 case AMD_RESET_METHOD_BACO: 3087 case AMD_RESET_METHOD_MODE1: 3088 return true; 3089 default: 3090 return false; 3091 } 3092 } 3093 3094 /** 3095 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 3096 * 3097 * @adev: amdgpu_device pointer 3098 * @state: clockgating state (gate or ungate) 3099 * 3100 * The list of all the hardware IPs that make up the asic is walked and the 3101 * set_clockgating_state callbacks are run. 
3102 * Late initialization pass enabling clockgating for hardware IPs. 3103 * Fini or suspend, pass disabling clockgating for hardware IPs. 3104 * Returns 0 on success, negative error code on failure. 3105 */ 3106 3107 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 3108 enum amd_clockgating_state state) 3109 { 3110 int i, j, r; 3111 3112 if (amdgpu_emu_mode == 1) 3113 return 0; 3114 3115 for (j = 0; j < adev->num_ip_blocks; j++) { 3116 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3117 if (!adev->ip_blocks[i].status.late_initialized) 3118 continue; 3119 /* skip CG for GFX, SDMA on S0ix */ 3120 if (adev->in_s0ix && 3121 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3122 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3123 continue; 3124 /* skip CG for VCE/UVD, it's handled specially */ 3125 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3126 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3127 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3128 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3129 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 3130 /* enable clockgating to save power */ 3131 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev, 3132 state); 3133 if (r) { 3134 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 3135 adev->ip_blocks[i].version->funcs->name, r); 3136 return r; 3137 } 3138 } 3139 } 3140 3141 return 0; 3142 } 3143 3144 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 3145 enum amd_powergating_state state) 3146 { 3147 int i, j, r; 3148 3149 if (amdgpu_emu_mode == 1) 3150 return 0; 3151 3152 for (j = 0; j < adev->num_ip_blocks; j++) { 3153 i = state == AMD_PG_STATE_GATE ? 
			    j : adev->num_ip_blocks - j - 1;
		if (!adev->ip_blocks[i].status.late_initialized)
			continue;
		/* skip PG for GFX, SDMA on S0ix */
		if (adev->in_s0ix &&
		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
			continue;
		/* skip PG for VCE/UVD, it's handled specially */
		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
		    adev->ip_blocks[i].version->funcs->set_powergating_state) {
			/* enable powergating to save power */
			r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
										      state);
			if (r) {
				DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
					  adev->ip_blocks[i].version->funcs->name, r);
				return r;
			}
		}
	}
	return 0;
}

static int amdgpu_device_enable_mgpu_fan_boost(void)
{
	struct amdgpu_gpu_instance *gpu_ins;
	struct amdgpu_device *adev;
	int i, ret = 0;

	mutex_lock(&mgpu_info.mutex);

	/*
	 * MGPU fan boost feature should be enabled
	 * only when there are two or more dGPUs in
	 * the system
	 */
	if (mgpu_info.num_dgpu < 2)
		goto out;

	for (i = 0; i < mgpu_info.num_dgpu; i++) {
		gpu_ins = &(mgpu_info.gpu_ins[i]);
		adev = gpu_ins->adev;
		if (!(adev->flags & AMD_IS_APU) &&
		    !gpu_ins->mgpu_fan_enabled) {
			ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
			if (ret)
				break;

			gpu_ins->mgpu_fan_enabled = 1;
		}
	}

out:
	mutex_unlock(&mgpu_info.mutex);

	return ret;
}

/**
 * amdgpu_device_ip_late_init - run late init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Late initialization pass for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the late_init callbacks are run.
 * late_init covers any special initialization that an IP requires
 * after all of them have been initialized or something that needs to happen
 * late in the init process.
 * Returns 0 on success, negative error code on failure.
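 *
 * As a simplified sketch, the body below amounts to:
 *
 *	late_init on every block that is already hw-initialized
 *	amdgpu_ras_late_init(adev);
 *	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
 *	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
 *	amdgpu_device_fill_reset_magic(adev);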
3226 */ 3227 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 3228 { 3229 struct amdgpu_gpu_instance *gpu_instance; 3230 int i = 0, r; 3231 3232 for (i = 0; i < adev->num_ip_blocks; i++) { 3233 if (!adev->ip_blocks[i].status.hw) 3234 continue; 3235 if (adev->ip_blocks[i].version->funcs->late_init) { 3236 r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]); 3237 if (r) { 3238 DRM_ERROR("late_init of IP block <%s> failed %d\n", 3239 adev->ip_blocks[i].version->funcs->name, r); 3240 return r; 3241 } 3242 } 3243 adev->ip_blocks[i].status.late_initialized = true; 3244 } 3245 3246 r = amdgpu_ras_late_init(adev); 3247 if (r) { 3248 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 3249 return r; 3250 } 3251 3252 if (!amdgpu_in_reset(adev)) 3253 amdgpu_ras_set_error_query_ready(adev, true); 3254 3255 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3256 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3257 3258 amdgpu_device_fill_reset_magic(adev); 3259 3260 r = amdgpu_device_enable_mgpu_fan_boost(); 3261 if (r) 3262 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 3263 3264 /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */ 3265 if (amdgpu_passthrough(adev) && 3266 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 3267 adev->asic_type == CHIP_ALDEBARAN)) 3268 amdgpu_dpm_handle_passthrough_sbr(adev, true); 3269 3270 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3271 mutex_lock(&mgpu_info.mutex); 3272 3273 /* 3274 * Reset device p-state to low as this was booted with high. 3275 * 3276 * This should be performed only after all devices from the same 3277 * hive get initialized. 3278 * 3279 * However, it's unknown how many device in the hive in advance. 3280 * As this is counted one by one during devices initializations. 3281 * 3282 * So, we wait for all XGMI interlinked devices initialized. 3283 * This may bring some delays as those devices may come from 3284 * different hives. But that should be OK. 
3285 */ 3286 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3287 for (i = 0; i < mgpu_info.num_gpu; i++) { 3288 gpu_instance = &(mgpu_info.gpu_ins[i]); 3289 if (gpu_instance->adev->flags & AMD_IS_APU) 3290 continue; 3291 3292 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3293 AMDGPU_XGMI_PSTATE_MIN); 3294 if (r) { 3295 DRM_ERROR("pstate setting failed (%d).\n", r); 3296 break; 3297 } 3298 } 3299 } 3300 3301 mutex_unlock(&mgpu_info.mutex); 3302 } 3303 3304 return 0; 3305 } 3306 3307 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block) 3308 { 3309 int r; 3310 3311 if (!ip_block->version->funcs->hw_fini) { 3312 DRM_ERROR("hw_fini of IP block <%s> not defined\n", 3313 ip_block->version->funcs->name); 3314 } else { 3315 r = ip_block->version->funcs->hw_fini(ip_block); 3316 /* XXX handle errors */ 3317 if (r) { 3318 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3319 ip_block->version->funcs->name, r); 3320 } 3321 } 3322 3323 ip_block->status.hw = false; 3324 } 3325 3326 /** 3327 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3328 * 3329 * @adev: amdgpu_device pointer 3330 * 3331 * For ASICs need to disable SMC first 3332 */ 3333 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3334 { 3335 int i; 3336 3337 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3338 return; 3339 3340 for (i = 0; i < adev->num_ip_blocks; i++) { 3341 if (!adev->ip_blocks[i].status.hw) 3342 continue; 3343 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3344 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3345 break; 3346 } 3347 } 3348 } 3349 3350 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3351 { 3352 int i, r; 3353 3354 for (i = 0; i < adev->num_ip_blocks; i++) { 3355 if (!adev->ip_blocks[i].version->funcs->early_fini) 3356 continue; 3357 3358 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]); 3359 if (r) { 3360 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 3361 adev->ip_blocks[i].version->funcs->name, r); 3362 } 3363 } 3364 3365 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3366 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3367 3368 amdgpu_amdkfd_suspend(adev, false); 3369 3370 /* Workaroud for ASICs need to disable SMC first */ 3371 amdgpu_device_smu_fini_early(adev); 3372 3373 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3374 if (!adev->ip_blocks[i].status.hw) 3375 continue; 3376 3377 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3378 } 3379 3380 if (amdgpu_sriov_vf(adev)) { 3381 if (amdgpu_virt_release_full_gpu(adev, false)) 3382 DRM_ERROR("failed to release exclusive mode on fini\n"); 3383 } 3384 3385 return 0; 3386 } 3387 3388 /** 3389 * amdgpu_device_ip_fini - run fini for hardware IPs 3390 * 3391 * @adev: amdgpu_device pointer 3392 * 3393 * Main teardown pass for hardware IPs. The list of all the hardware 3394 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3395 * are run. hw_fini tears down the hardware associated with each IP 3396 * and sw_fini tears down any software state associated with each IP. 3397 * Returns 0 on success, negative error code on failure. 
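 *
 * Teardown mirrors amdgpu_device_ip_init() in reverse; as a simplified
 * sketch of the body below:
 *
 *	amdgpu_amdkfd_device_fini_sw(adev);
 *	sw_fini on each block, walking the list backwards
 *	late_fini on each block, walking the list backwards
 *	amdgpu_ras_fini(adev);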
3398 */ 3399 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3400 { 3401 int i, r; 3402 3403 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3404 amdgpu_virt_release_ras_err_handler_data(adev); 3405 3406 if (adev->gmc.xgmi.num_physical_nodes > 1) 3407 amdgpu_xgmi_remove_device(adev); 3408 3409 amdgpu_amdkfd_device_fini_sw(adev); 3410 3411 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3412 if (!adev->ip_blocks[i].status.sw) 3413 continue; 3414 3415 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3416 amdgpu_ucode_free_bo(adev); 3417 amdgpu_free_static_csa(&adev->virt.csa_obj); 3418 amdgpu_device_wb_fini(adev); 3419 amdgpu_device_mem_scratch_fini(adev); 3420 amdgpu_ib_pool_fini(adev); 3421 amdgpu_seq64_fini(adev); 3422 } 3423 if (adev->ip_blocks[i].version->funcs->sw_fini) { 3424 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]); 3425 /* XXX handle errors */ 3426 if (r) { 3427 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 3428 adev->ip_blocks[i].version->funcs->name, r); 3429 } 3430 } 3431 adev->ip_blocks[i].status.sw = false; 3432 adev->ip_blocks[i].status.valid = false; 3433 } 3434 3435 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3436 if (!adev->ip_blocks[i].status.late_initialized) 3437 continue; 3438 if (adev->ip_blocks[i].version->funcs->late_fini) 3439 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]); 3440 adev->ip_blocks[i].status.late_initialized = false; 3441 } 3442 3443 amdgpu_ras_fini(adev); 3444 3445 return 0; 3446 } 3447 3448 /** 3449 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3450 * 3451 * @work: work_struct. 3452 */ 3453 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3454 { 3455 struct amdgpu_device *adev = 3456 container_of(work, struct amdgpu_device, delayed_init_work.work); 3457 int r; 3458 3459 r = amdgpu_ib_ring_tests(adev); 3460 if (r) 3461 DRM_ERROR("ib ring test failed (%d).\n", r); 3462 } 3463 3464 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3465 { 3466 struct amdgpu_device *adev = 3467 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3468 3469 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3470 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3471 3472 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true)) 3473 adev->gfx.gfx_off_state = true; 3474 } 3475 3476 /** 3477 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3478 * 3479 * @adev: amdgpu_device pointer 3480 * 3481 * Main suspend function for hardware IPs. The list of all the hardware 3482 * IPs that make up the asic is walked, clockgating is disabled and the 3483 * suspend callbacks are run. suspend puts the hardware and software state 3484 * in each IP into a state suitable for suspend. 3485 * Returns 0 on success, negative error code on failure. 3486 */ 3487 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3488 { 3489 int i, r; 3490 3491 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3492 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3493 3494 /* 3495 * Per PMFW team's suggestion, driver needs to handle gfxoff 3496 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3497 * scenario. Add the missing df cstate disablement here. 
3498 */ 3499 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3500 dev_warn(adev->dev, "Failed to disallow df cstate"); 3501 3502 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3503 if (!adev->ip_blocks[i].status.valid) 3504 continue; 3505 3506 /* displays are handled separately */ 3507 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3508 continue; 3509 3510 /* XXX handle errors */ 3511 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3512 if (r) 3513 return r; 3514 } 3515 3516 return 0; 3517 } 3518 3519 /** 3520 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3521 * 3522 * @adev: amdgpu_device pointer 3523 * 3524 * Main suspend function for hardware IPs. The list of all the hardware 3525 * IPs that make up the asic is walked, clockgating is disabled and the 3526 * suspend callbacks are run. suspend puts the hardware and software state 3527 * in each IP into a state suitable for suspend. 3528 * Returns 0 on success, negative error code on failure. 3529 */ 3530 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3531 { 3532 int i, r; 3533 3534 if (adev->in_s0ix) 3535 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3536 3537 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3538 if (!adev->ip_blocks[i].status.valid) 3539 continue; 3540 /* displays are handled in phase1 */ 3541 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3542 continue; 3543 /* PSP lost connection when err_event_athub occurs */ 3544 if (amdgpu_ras_intr_triggered() && 3545 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3546 adev->ip_blocks[i].status.hw = false; 3547 continue; 3548 } 3549 3550 /* skip unnecessary suspend if we do not initialize them yet */ 3551 if (!amdgpu_ip_member_of_hwini( 3552 adev, adev->ip_blocks[i].version->type)) 3553 continue; 3554 3555 /* skip suspend of gfx/mes and psp for S0ix 3556 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3557 * like at runtime. PSP is also part of the always on hardware 3558 * so no need to suspend it. 3559 */ 3560 if (adev->in_s0ix && 3561 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3562 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3563 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3564 continue; 3565 3566 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3567 if (adev->in_s0ix && 3568 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3569 IP_VERSION(5, 0, 0)) && 3570 (adev->ip_blocks[i].version->type == 3571 AMD_IP_BLOCK_TYPE_SDMA)) 3572 continue; 3573 3574 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3575 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3576 * from this location and RLC Autoload automatically also gets loaded 3577 * from here based on PMFW -> PSP message during re-init sequence. 3578 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3579 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3580 */ 3581 if (amdgpu_in_reset(adev) && 3582 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3583 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3584 continue; 3585 3586 /* XXX handle errors */ 3587 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3588 adev->ip_blocks[i].status.hw = false; 3589 3590 /* handle putting the SMC in the appropriate state */ 3591 if (!amdgpu_sriov_vf(adev)) { 3592 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3593 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3594 if (r) { 3595 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3596 adev->mp1_state, r); 3597 return r; 3598 } 3599 } 3600 } 3601 } 3602 3603 return 0; 3604 } 3605 3606 /** 3607 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3608 * 3609 * @adev: amdgpu_device pointer 3610 * 3611 * Main suspend function for hardware IPs. The list of all the hardware 3612 * IPs that make up the asic is walked, clockgating is disabled and the 3613 * suspend callbacks are run. suspend puts the hardware and software state 3614 * in each IP into a state suitable for suspend. 3615 * Returns 0 on success, negative error code on failure. 3616 */ 3617 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3618 { 3619 int r; 3620 3621 if (amdgpu_sriov_vf(adev)) { 3622 amdgpu_virt_fini_data_exchange(adev); 3623 amdgpu_virt_request_full_gpu(adev, false); 3624 } 3625 3626 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3627 3628 r = amdgpu_device_ip_suspend_phase1(adev); 3629 if (r) 3630 return r; 3631 r = amdgpu_device_ip_suspend_phase2(adev); 3632 3633 if (amdgpu_sriov_vf(adev)) 3634 amdgpu_virt_release_full_gpu(adev, false); 3635 3636 return r; 3637 } 3638 3639 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3640 { 3641 int i, r; 3642 3643 static enum amd_ip_block_type ip_order[] = { 3644 AMD_IP_BLOCK_TYPE_COMMON, 3645 AMD_IP_BLOCK_TYPE_GMC, 3646 AMD_IP_BLOCK_TYPE_PSP, 3647 AMD_IP_BLOCK_TYPE_IH, 3648 }; 3649 3650 for (i = 0; i < adev->num_ip_blocks; i++) { 3651 int j; 3652 struct amdgpu_ip_block *block; 3653 3654 block = &adev->ip_blocks[i]; 3655 block->status.hw = false; 3656 3657 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3658 3659 if (block->version->type != ip_order[j] || 3660 !block->status.valid) 3661 continue; 3662 3663 r = block->version->funcs->hw_init(&adev->ip_blocks[i]); 3664 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded"); 3665 if (r) 3666 return r; 3667 block->status.hw = true; 3668 } 3669 } 3670 3671 return 0; 3672 } 3673 3674 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3675 { 3676 int i, r; 3677 3678 static enum amd_ip_block_type ip_order[] = { 3679 AMD_IP_BLOCK_TYPE_SMC, 3680 AMD_IP_BLOCK_TYPE_DCE, 3681 AMD_IP_BLOCK_TYPE_GFX, 3682 AMD_IP_BLOCK_TYPE_SDMA, 3683 AMD_IP_BLOCK_TYPE_MES, 3684 AMD_IP_BLOCK_TYPE_UVD, 3685 AMD_IP_BLOCK_TYPE_VCE, 3686 AMD_IP_BLOCK_TYPE_VCN, 3687 AMD_IP_BLOCK_TYPE_JPEG 3688 }; 3689 3690 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3691 int j; 3692 struct amdgpu_ip_block *block; 3693 3694 for (j = 0; j < adev->num_ip_blocks; j++) { 3695 block = &adev->ip_blocks[j]; 3696 3697 if (block->version->type != ip_order[i] || 3698 !block->status.valid || 3699 block->status.hw) 3700 continue; 3701 3702 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) { 3703 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3704 if (r) 3705 return r; 3706 } else { 3707 r = block->version->funcs->hw_init(&adev->ip_blocks[i]); 3708 if (r) { 3709 
DRM_ERROR("hw_init of IP block <%s> failed %d\n", 3710 adev->ip_blocks[i].version->funcs->name, r); 3711 return r; 3712 } 3713 block->status.hw = true; 3714 } 3715 } 3716 } 3717 3718 return 0; 3719 } 3720 3721 /** 3722 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3723 * 3724 * @adev: amdgpu_device pointer 3725 * 3726 * First resume function for hardware IPs. The list of all the hardware 3727 * IPs that make up the asic is walked and the resume callbacks are run for 3728 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3729 * after a suspend and updates the software state as necessary. This 3730 * function is also used for restoring the GPU after a GPU reset. 3731 * Returns 0 on success, negative error code on failure. 3732 */ 3733 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3734 { 3735 int i, r; 3736 3737 for (i = 0; i < adev->num_ip_blocks; i++) { 3738 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3739 continue; 3740 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3741 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3742 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3743 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3744 3745 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3746 if (r) 3747 return r; 3748 } 3749 } 3750 3751 return 0; 3752 } 3753 3754 /** 3755 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3756 * 3757 * @adev: amdgpu_device pointer 3758 * 3759 * First resume function for hardware IPs. The list of all the hardware 3760 * IPs that make up the asic is walked and the resume callbacks are run for 3761 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3762 * functional state after a suspend and updates the software state as 3763 * necessary. This function is also used for restoring the GPU after a GPU 3764 * reset. 3765 * Returns 0 on success, negative error code on failure. 3766 */ 3767 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3768 { 3769 int i, r; 3770 3771 for (i = 0; i < adev->num_ip_blocks; i++) { 3772 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3773 continue; 3774 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3775 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3776 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3777 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3778 continue; 3779 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3780 if (r) 3781 return r; 3782 } 3783 3784 return 0; 3785 } 3786 3787 /** 3788 * amdgpu_device_ip_resume - run resume for hardware IPs 3789 * 3790 * @adev: amdgpu_device pointer 3791 * 3792 * Main resume function for hardware IPs. The hardware IPs 3793 * are split into two resume functions because they are 3794 * also used in recovering from a GPU reset and some additional 3795 * steps need to be take between them. In this case (S3/S4) they are 3796 * run sequentially. 3797 * Returns 0 on success, negative error code on failure. 
3798 */ 3799 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3800 { 3801 int r; 3802 3803 r = amdgpu_device_ip_resume_phase1(adev); 3804 if (r) 3805 return r; 3806 3807 r = amdgpu_device_fw_loading(adev); 3808 if (r) 3809 return r; 3810 3811 r = amdgpu_device_ip_resume_phase2(adev); 3812 3813 if (adev->mman.buffer_funcs_ring->sched.ready) 3814 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3815 3816 return r; 3817 } 3818 3819 /** 3820 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3821 * 3822 * @adev: amdgpu_device pointer 3823 * 3824 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3825 */ 3826 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3827 { 3828 if (amdgpu_sriov_vf(adev)) { 3829 if (adev->is_atom_fw) { 3830 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3831 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3832 } else { 3833 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3834 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3835 } 3836 3837 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3838 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3839 } 3840 } 3841 3842 /** 3843 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3844 * 3845 * @asic_type: AMD asic type 3846 * 3847 * Check if there is DC (new modesetting infrastructre) support for an asic. 3848 * returns true if DC has support, false if not. 3849 */ 3850 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3851 { 3852 switch (asic_type) { 3853 #ifdef CONFIG_DRM_AMDGPU_SI 3854 case CHIP_HAINAN: 3855 #endif 3856 case CHIP_TOPAZ: 3857 /* chips with no display hardware */ 3858 return false; 3859 #if defined(CONFIG_DRM_AMD_DC) 3860 case CHIP_TAHITI: 3861 case CHIP_PITCAIRN: 3862 case CHIP_VERDE: 3863 case CHIP_OLAND: 3864 /* 3865 * We have systems in the wild with these ASICs that require 3866 * LVDS and VGA support which is not supported with DC. 3867 * 3868 * Fallback to the non-DC driver here by default so as not to 3869 * cause regressions. 3870 */ 3871 #if defined(CONFIG_DRM_AMD_DC_SI) 3872 return amdgpu_dc > 0; 3873 #else 3874 return false; 3875 #endif 3876 case CHIP_BONAIRE: 3877 case CHIP_KAVERI: 3878 case CHIP_KABINI: 3879 case CHIP_MULLINS: 3880 /* 3881 * We have systems in the wild with these ASICs that require 3882 * VGA support which is not supported with DC. 3883 * 3884 * Fallback to the non-DC driver here by default so as not to 3885 * cause regressions. 
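 * DC can still be forced on these parts by explicitly requesting it
 * (e.g. amdgpu.dc=1 on the kernel command line), which is what the
 * amdgpu_dc > 0 check below implements.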
3886 */ 3887 return amdgpu_dc > 0; 3888 default: 3889 return amdgpu_dc != 0; 3890 #else 3891 default: 3892 if (amdgpu_dc > 0) 3893 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 3894 return false; 3895 #endif 3896 } 3897 } 3898 3899 /** 3900 * amdgpu_device_has_dc_support - check if dc is supported 3901 * 3902 * @adev: amdgpu_device pointer 3903 * 3904 * Returns true for supported, false for not supported 3905 */ 3906 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3907 { 3908 if (adev->enable_virtual_display || 3909 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3910 return false; 3911 3912 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3913 } 3914 3915 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3916 { 3917 struct amdgpu_device *adev = 3918 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3919 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3920 3921 /* It's a bug to not have a hive within this function */ 3922 if (WARN_ON(!hive)) 3923 return; 3924 3925 /* 3926 * Use task barrier to synchronize all xgmi reset works across the 3927 * hive. task_barrier_enter and task_barrier_exit will block 3928 * until all the threads running the xgmi reset works reach 3929 * those points. task_barrier_full will do both blocks. 3930 */ 3931 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3932 3933 task_barrier_enter(&hive->tb); 3934 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3935 3936 if (adev->asic_reset_res) 3937 goto fail; 3938 3939 task_barrier_exit(&hive->tb); 3940 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3941 3942 if (adev->asic_reset_res) 3943 goto fail; 3944 3945 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 3946 } else { 3947 3948 task_barrier_full(&hive->tb); 3949 adev->asic_reset_res = amdgpu_asic_reset(adev); 3950 } 3951 3952 fail: 3953 if (adev->asic_reset_res) 3954 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 3955 adev->asic_reset_res, adev_to_drm(adev)->unique); 3956 amdgpu_put_xgmi_hive(hive); 3957 } 3958 3959 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 3960 { 3961 char *input = amdgpu_lockup_timeout; 3962 char *timeout_setting = NULL; 3963 int index = 0; 3964 long timeout; 3965 int ret = 0; 3966 3967 /* 3968 * By default timeout for non compute jobs is 10000 3969 * and 60000 for compute jobs. 3970 * In SR-IOV or passthrough mode, timeout for compute 3971 * jobs are 60000 by default. 3972 */ 3973 adev->gfx_timeout = msecs_to_jiffies(10000); 3974 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 3975 if (amdgpu_sriov_vf(adev)) 3976 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
3977 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 3978 else 3979 adev->compute_timeout = msecs_to_jiffies(60000); 3980 3981 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3982 while ((timeout_setting = strsep(&input, ",")) && 3983 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 3984 ret = kstrtol(timeout_setting, 0, &timeout); 3985 if (ret) 3986 return ret; 3987 3988 if (timeout == 0) { 3989 index++; 3990 continue; 3991 } else if (timeout < 0) { 3992 timeout = MAX_SCHEDULE_TIMEOUT; 3993 dev_warn(adev->dev, "lockup timeout disabled"); 3994 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 3995 } else { 3996 timeout = msecs_to_jiffies(timeout); 3997 } 3998 3999 switch (index++) { 4000 case 0: 4001 adev->gfx_timeout = timeout; 4002 break; 4003 case 1: 4004 adev->compute_timeout = timeout; 4005 break; 4006 case 2: 4007 adev->sdma_timeout = timeout; 4008 break; 4009 case 3: 4010 adev->video_timeout = timeout; 4011 break; 4012 default: 4013 break; 4014 } 4015 } 4016 /* 4017 * There is only one value specified and 4018 * it should apply to all non-compute jobs. 4019 */ 4020 if (index == 1) { 4021 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4022 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 4023 adev->compute_timeout = adev->gfx_timeout; 4024 } 4025 } 4026 4027 return ret; 4028 } 4029 4030 /** 4031 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 4032 * 4033 * @adev: amdgpu_device pointer 4034 * 4035 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 4036 */ 4037 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 4038 { 4039 struct iommu_domain *domain; 4040 4041 domain = iommu_get_domain_for_dev(adev->dev); 4042 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 4043 adev->ram_is_direct_mapped = true; 4044 } 4045 4046 #if defined(CONFIG_HSA_AMD_P2P) 4047 /** 4048 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled. 4049 * 4050 * @adev: amdgpu_device pointer 4051 * 4052 * return if IOMMU remapping bar address 4053 */ 4054 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev) 4055 { 4056 struct iommu_domain *domain; 4057 4058 domain = iommu_get_domain_for_dev(adev->dev); 4059 if (domain && (domain->type == IOMMU_DOMAIN_DMA || 4060 domain->type == IOMMU_DOMAIN_DMA_FQ)) 4061 return true; 4062 4063 return false; 4064 } 4065 #endif 4066 4067 static const struct attribute *amdgpu_dev_attributes[] = { 4068 &dev_attr_pcie_replay_count.attr, 4069 NULL 4070 }; 4071 4072 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 4073 { 4074 if (amdgpu_mcbp == 1) 4075 adev->gfx.mcbp = true; 4076 else if (amdgpu_mcbp == 0) 4077 adev->gfx.mcbp = false; 4078 4079 if (amdgpu_sriov_vf(adev)) 4080 adev->gfx.mcbp = true; 4081 4082 if (adev->gfx.mcbp) 4083 DRM_INFO("MCBP is enabled\n"); 4084 } 4085 4086 /** 4087 * amdgpu_device_init - initialize the driver 4088 * 4089 * @adev: amdgpu_device pointer 4090 * @flags: driver flags 4091 * 4092 * Initializes the driver info and hw (all asics). 4093 * Returns 0 for success or an error on failure. 4094 * Called at driver startup. 
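 * Job timeouts and MCBP are derived from module parameters here before any
 * IP block is touched; for example, amdgpu.lockup_timeout=10000,60000,10000,10000
 * sets the gfx, compute, sdma and video timeouts (in ms) that
 * amdgpu_device_get_job_timeout_settings() parses (0 keeps the default,
 * a negative value disables that timeout).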
4095 */ 4096 int amdgpu_device_init(struct amdgpu_device *adev, 4097 uint32_t flags) 4098 { 4099 struct drm_device *ddev = adev_to_drm(adev); 4100 struct pci_dev *pdev = adev->pdev; 4101 int r, i; 4102 bool px = false; 4103 u32 max_MBps; 4104 int tmp; 4105 4106 adev->shutdown = false; 4107 adev->flags = flags; 4108 4109 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 4110 adev->asic_type = amdgpu_force_asic_type; 4111 else 4112 adev->asic_type = flags & AMD_ASIC_MASK; 4113 4114 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 4115 if (amdgpu_emu_mode == 1) 4116 adev->usec_timeout *= 10; 4117 adev->gmc.gart_size = 512 * 1024 * 1024; 4118 adev->accel_working = false; 4119 adev->num_rings = 0; 4120 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 4121 adev->mman.buffer_funcs = NULL; 4122 adev->mman.buffer_funcs_ring = NULL; 4123 adev->vm_manager.vm_pte_funcs = NULL; 4124 adev->vm_manager.vm_pte_num_scheds = 0; 4125 adev->gmc.gmc_funcs = NULL; 4126 adev->harvest_ip_mask = 0x0; 4127 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 4128 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 4129 4130 adev->smc_rreg = &amdgpu_invalid_rreg; 4131 adev->smc_wreg = &amdgpu_invalid_wreg; 4132 adev->pcie_rreg = &amdgpu_invalid_rreg; 4133 adev->pcie_wreg = &amdgpu_invalid_wreg; 4134 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 4135 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 4136 adev->pciep_rreg = &amdgpu_invalid_rreg; 4137 adev->pciep_wreg = &amdgpu_invalid_wreg; 4138 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 4139 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 4140 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 4141 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 4142 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 4143 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 4144 adev->didt_rreg = &amdgpu_invalid_rreg; 4145 adev->didt_wreg = &amdgpu_invalid_wreg; 4146 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 4147 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 4148 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 4149 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 4150 4151 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 4152 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 4153 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 4154 4155 /* mutex initialization are all done here so we 4156 * can recall function without having locking issues 4157 */ 4158 mutex_init(&adev->firmware.mutex); 4159 mutex_init(&adev->pm.mutex); 4160 mutex_init(&adev->gfx.gpu_clock_mutex); 4161 mutex_init(&adev->srbm_mutex); 4162 mutex_init(&adev->gfx.pipe_reserve_mutex); 4163 mutex_init(&adev->gfx.gfx_off_mutex); 4164 mutex_init(&adev->gfx.partition_mutex); 4165 mutex_init(&adev->grbm_idx_mutex); 4166 mutex_init(&adev->mn_lock); 4167 mutex_init(&adev->virt.vf_errors.lock); 4168 mutex_init(&adev->virt.rlcg_reg_lock); 4169 hash_init(adev->mn_hash); 4170 mutex_init(&adev->psp.mutex); 4171 mutex_init(&adev->notifier_lock); 4172 mutex_init(&adev->pm.stable_pstate_ctx_lock); 4173 mutex_init(&adev->benchmark_mutex); 4174 mutex_init(&adev->gfx.reset_sem_mutex); 4175 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */ 4176 mutex_init(&adev->enforce_isolation_mutex); 4177 mutex_init(&adev->gfx.kfd_sch_mutex); 4178 4179 amdgpu_device_init_apu_flags(adev); 4180 4181 r = amdgpu_device_check_arguments(adev); 4182 if (r) 4183 return r; 4184 4185 
spin_lock_init(&adev->mmio_idx_lock); 4186 spin_lock_init(&adev->smc_idx_lock); 4187 spin_lock_init(&adev->pcie_idx_lock); 4188 spin_lock_init(&adev->uvd_ctx_idx_lock); 4189 spin_lock_init(&adev->didt_idx_lock); 4190 spin_lock_init(&adev->gc_cac_idx_lock); 4191 spin_lock_init(&adev->se_cac_idx_lock); 4192 spin_lock_init(&adev->audio_endpt_idx_lock); 4193 spin_lock_init(&adev->mm_stats.lock); 4194 spin_lock_init(&adev->wb.lock); 4195 4196 INIT_LIST_HEAD(&adev->reset_list); 4197 4198 INIT_LIST_HEAD(&adev->ras_list); 4199 4200 INIT_LIST_HEAD(&adev->pm.od_kobj_list); 4201 4202 INIT_DELAYED_WORK(&adev->delayed_init_work, 4203 amdgpu_device_delayed_init_work_handler); 4204 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 4205 amdgpu_device_delay_enable_gfx_off); 4206 /* 4207 * Initialize the enforce_isolation work structures for each XCP 4208 * partition. This work handler is responsible for enforcing shader 4209 * isolation on AMD GPUs. It counts the number of emitted fences for 4210 * each GFX and compute ring. If there are any fences, it schedules 4211 * the `enforce_isolation_work` to be run after a delay. If there are 4212 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the 4213 * runqueue. 4214 */ 4215 for (i = 0; i < MAX_XCP; i++) { 4216 INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work, 4217 amdgpu_gfx_enforce_isolation_handler); 4218 adev->gfx.enforce_isolation[i].adev = adev; 4219 adev->gfx.enforce_isolation[i].xcp_id = i; 4220 } 4221 4222 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 4223 4224 adev->gfx.gfx_off_req_count = 1; 4225 adev->gfx.gfx_off_residency = 0; 4226 adev->gfx.gfx_off_entrycount = 0; 4227 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 4228 4229 atomic_set(&adev->throttling_logging_enabled, 1); 4230 /* 4231 * If throttling continues, logging will be performed every minute 4232 * to avoid log flooding. "-1" is subtracted since the thermal 4233 * throttling interrupt comes every second. Thus, the total logging 4234 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting 4235 * for throttling interrupt) = 60 seconds. 4236 */ 4237 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 4238 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 4239 4240 /* Registers mapping */ 4241 /* TODO: block userspace mapping of io register */ 4242 if (adev->asic_type >= CHIP_BONAIRE) { 4243 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 4244 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 4245 } else { 4246 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 4247 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 4248 } 4249 4250 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 4251 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 4252 4253 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 4254 if (!adev->rmmio) 4255 return -ENOMEM; 4256 4257 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 4258 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size); 4259 4260 /* 4261 * The reset domain needs to be present early, before any XGMI hive is 4262 * discovered and initialized, so that the reset sem and in-GPU-reset flag 4263 * can be used early on during init and before calling RREG32.
4264 */ 4265 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 4266 if (!adev->reset_domain) 4267 return -ENOMEM; 4268 4269 /* detect hw virtualization here */ 4270 amdgpu_detect_virtualization(adev); 4271 4272 amdgpu_device_get_pcie_info(adev); 4273 4274 r = amdgpu_device_get_job_timeout_settings(adev); 4275 if (r) { 4276 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 4277 return r; 4278 } 4279 4280 amdgpu_device_set_mcbp(adev); 4281 4282 /* 4283 * By default, use default mode where all blocks are expected to be 4284 * initialized. At present a 'swinit' of blocks is required to be 4285 * completed before the need for a different level is detected. 4286 */ 4287 amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT); 4288 /* early init functions */ 4289 r = amdgpu_device_ip_early_init(adev); 4290 if (r) 4291 return r; 4292 4293 /* Get rid of things like offb */ 4294 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver); 4295 if (r) 4296 return r; 4297 4298 /* Enable TMZ based on IP_VERSION */ 4299 amdgpu_gmc_tmz_set(adev); 4300 4301 if (amdgpu_sriov_vf(adev) && 4302 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0)) 4303 /* VF MMIO access (except mailbox range) from CPU 4304 * will be blocked during sriov runtime 4305 */ 4306 adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT; 4307 4308 amdgpu_gmc_noretry_set(adev); 4309 /* Need to get xgmi info early to decide the reset behavior */ 4310 if (adev->gmc.xgmi.supported) { 4311 r = adev->gfxhub.funcs->get_xgmi_info(adev); 4312 if (r) 4313 return r; 4314 } 4315 4316 /* enable PCIE atomic ops */ 4317 if (amdgpu_sriov_vf(adev)) { 4318 if (adev->virt.fw_reserve.p_pf2vf) 4319 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 4320 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 4321 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4322 /* APUs with gfx9 onwards don't rely on PCIe atomics; their 4323 * internal path natively supports atomics, so set have_atomics_support to true. 4324 */ 4325 } else if ((adev->flags & AMD_IS_APU) && 4326 (amdgpu_ip_version(adev, GC_HWIP, 0) > 4327 IP_VERSION(9, 0, 0))) { 4328 adev->have_atomics_support = true; 4329 } else { 4330 adev->have_atomics_support = 4331 !pci_enable_atomic_ops_to_root(adev->pdev, 4332 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 4333 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4334 } 4335 4336 if (!adev->have_atomics_support) 4337 dev_info(adev->dev, "PCIE atomic ops are not supported\n"); 4338 4339 /* doorbell bar mapping and doorbell index init */ 4340 amdgpu_doorbell_init(adev); 4341 4342 if (amdgpu_emu_mode == 1) { 4343 /* post the asic on emulation mode */ 4344 emu_soc_asic_init(adev); 4345 goto fence_driver_init; 4346 } 4347 4348 amdgpu_reset_init(adev); 4349 4350 /* detect if we are with an SRIOV vbios */ 4351 if (adev->bios) 4352 amdgpu_device_detect_sriov_bios(adev); 4353 4354 /* check if we need to reset the asic 4355 * E.g., driver was not cleanly unloaded previously, etc.
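 * Three cases are handled below: XGMI hives only drop to the minimal init
 * level here and defer to a hive-wide reset, MP1 13.0.10 parts without
 * display hardware use a PSP reset, and everything else performs the
 * default ASIC reset regardless of the reset_method module parameter.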
4356 */ 4357 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4358 if (adev->gmc.xgmi.num_physical_nodes) { 4359 dev_info(adev->dev, "Pending hive reset.\n"); 4360 amdgpu_set_init_level(adev, 4361 AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 4362 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && 4363 !amdgpu_device_has_display_hardware(adev)) { 4364 r = psp_gpu_reset(adev); 4365 } else { 4366 tmp = amdgpu_reset_method; 4367 /* It should do a default reset when loading or reloading the driver, 4368 * regardless of the module parameter reset_method. 4369 */ 4370 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4371 r = amdgpu_asic_reset(adev); 4372 amdgpu_reset_method = tmp; 4373 } 4374 4375 if (r) { 4376 dev_err(adev->dev, "asic reset on init failed\n"); 4377 goto failed; 4378 } 4379 } 4380 4381 /* Post card if necessary */ 4382 if (amdgpu_device_need_post(adev)) { 4383 if (!adev->bios) { 4384 dev_err(adev->dev, "no vBIOS found\n"); 4385 r = -EINVAL; 4386 goto failed; 4387 } 4388 DRM_INFO("GPU posting now...\n"); 4389 r = amdgpu_device_asic_init(adev); 4390 if (r) { 4391 dev_err(adev->dev, "gpu post error!\n"); 4392 goto failed; 4393 } 4394 } 4395 4396 if (adev->bios) { 4397 if (adev->is_atom_fw) { 4398 /* Initialize clocks */ 4399 r = amdgpu_atomfirmware_get_clock_info(adev); 4400 if (r) { 4401 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4402 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4403 goto failed; 4404 } 4405 } else { 4406 /* Initialize clocks */ 4407 r = amdgpu_atombios_get_clock_info(adev); 4408 if (r) { 4409 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4410 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4411 goto failed; 4412 } 4413 /* init i2c buses */ 4414 if (!amdgpu_device_has_dc_support(adev)) 4415 amdgpu_atombios_i2c_init(adev); 4416 } 4417 } 4418 4419 fence_driver_init: 4420 /* Fence driver */ 4421 r = amdgpu_fence_driver_sw_init(adev); 4422 if (r) { 4423 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4424 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4425 goto failed; 4426 } 4427 4428 /* init the mode config */ 4429 drm_mode_config_init(adev_to_drm(adev)); 4430 4431 r = amdgpu_device_ip_init(adev); 4432 if (r) { 4433 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4434 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4435 goto release_ras_con; 4436 } 4437 4438 amdgpu_fence_driver_hw_init(adev); 4439 4440 dev_info(adev->dev, 4441 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4442 adev->gfx.config.max_shader_engines, 4443 adev->gfx.config.max_sh_per_se, 4444 adev->gfx.config.max_cu_per_sh, 4445 adev->gfx.cu_info.number); 4446 4447 adev->accel_working = true; 4448 4449 amdgpu_vm_check_compute_bug(adev); 4450 4451 /* Initialize the buffer migration limit. */ 4452 if (amdgpu_moverate >= 0) 4453 max_MBps = amdgpu_moverate; 4454 else 4455 max_MBps = 8; /* Allow 8 MB/s. */ 4456 /* Get a log2 for easy divisions. */ 4457 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4458 4459 /* 4460 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4461 * Otherwise the mgpu fan boost feature will be skipped due to the 4462 * gpu instance is counted less. 4463 */ 4464 amdgpu_register_gpu_instance(adev); 4465 4466 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4467 * explicit gating rather than handling it automatically. 
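 * Late init, RAS resume and the delayed init work are skipped below while
 * the device is still at the minimal XGMI init level, i.e. while a
 * hive-wide reset is still pending.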
4468 */ 4469 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 4470 r = amdgpu_device_ip_late_init(adev); 4471 if (r) { 4472 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4473 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4474 goto release_ras_con; 4475 } 4476 /* must succeed. */ 4477 amdgpu_ras_resume(adev); 4478 queue_delayed_work(system_wq, &adev->delayed_init_work, 4479 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4480 } 4481 4482 if (amdgpu_sriov_vf(adev)) { 4483 amdgpu_virt_release_full_gpu(adev, true); 4484 flush_delayed_work(&adev->delayed_init_work); 4485 } 4486 4487 /* 4488 * Place those sysfs registering after `late_init`. As some of those 4489 * operations performed in `late_init` might affect the sysfs 4490 * interfaces creating. 4491 */ 4492 r = amdgpu_atombios_sysfs_init(adev); 4493 if (r) 4494 drm_err(&adev->ddev, 4495 "registering atombios sysfs failed (%d).\n", r); 4496 4497 r = amdgpu_pm_sysfs_init(adev); 4498 if (r) 4499 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 4500 4501 r = amdgpu_ucode_sysfs_init(adev); 4502 if (r) { 4503 adev->ucode_sysfs_en = false; 4504 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 4505 } else 4506 adev->ucode_sysfs_en = true; 4507 4508 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 4509 if (r) 4510 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4511 4512 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4513 if (r) 4514 dev_err(adev->dev, 4515 "Could not create amdgpu board attributes\n"); 4516 4517 amdgpu_fru_sysfs_init(adev); 4518 amdgpu_reg_state_sysfs_init(adev); 4519 amdgpu_xcp_cfg_sysfs_init(adev); 4520 4521 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4522 r = amdgpu_pmu_init(adev); 4523 if (r) 4524 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4525 4526 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4527 if (amdgpu_device_cache_pci_state(adev->pdev)) 4528 pci_restore_state(pdev); 4529 4530 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4531 /* this will fail for cards that aren't VGA class devices, just 4532 * ignore it 4533 */ 4534 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4535 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4536 4537 px = amdgpu_device_supports_px(ddev); 4538 4539 if (px || (!dev_is_removable(&adev->pdev->dev) && 4540 apple_gmux_detect(NULL, NULL))) 4541 vga_switcheroo_register_client(adev->pdev, 4542 &amdgpu_switcheroo_ops, px); 4543 4544 if (px) 4545 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4546 4547 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 4548 amdgpu_xgmi_reset_on_init(adev); 4549 4550 amdgpu_device_check_iommu_direct_map(adev); 4551 4552 return 0; 4553 4554 release_ras_con: 4555 if (amdgpu_sriov_vf(adev)) 4556 amdgpu_virt_release_full_gpu(adev, true); 4557 4558 /* failed in exclusive mode due to timeout */ 4559 if (amdgpu_sriov_vf(adev) && 4560 !amdgpu_sriov_runtime(adev) && 4561 amdgpu_virt_mmio_blocked(adev) && 4562 !amdgpu_virt_wait_reset(adev)) { 4563 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4564 /* Don't send request since VF is inactive. 
*/ 4565 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4566 adev->virt.ops = NULL; 4567 r = -EAGAIN; 4568 } 4569 amdgpu_release_ras_context(adev); 4570 4571 failed: 4572 amdgpu_vf_error_trans_all(adev); 4573 4574 return r; 4575 } 4576 4577 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4578 { 4579 4580 /* Clear all CPU mappings pointing to this device */ 4581 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4582 4583 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4584 amdgpu_doorbell_fini(adev); 4585 4586 iounmap(adev->rmmio); 4587 adev->rmmio = NULL; 4588 if (adev->mman.aper_base_kaddr) 4589 iounmap(adev->mman.aper_base_kaddr); 4590 adev->mman.aper_base_kaddr = NULL; 4591 4592 /* Memory manager related */ 4593 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4594 arch_phys_wc_del(adev->gmc.vram_mtrr); 4595 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4596 } 4597 } 4598 4599 /** 4600 * amdgpu_device_fini_hw - tear down the driver 4601 * 4602 * @adev: amdgpu_device pointer 4603 * 4604 * Tear down the driver info (all asics). 4605 * Called at driver shutdown. 4606 */ 4607 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4608 { 4609 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4610 flush_delayed_work(&adev->delayed_init_work); 4611 4612 if (adev->mman.initialized) 4613 drain_workqueue(adev->mman.bdev.wq); 4614 adev->shutdown = true; 4615 4616 /* make sure IB test finished before entering exclusive mode 4617 * to avoid preemption on IB test 4618 */ 4619 if (amdgpu_sriov_vf(adev)) { 4620 amdgpu_virt_request_full_gpu(adev, false); 4621 amdgpu_virt_fini_data_exchange(adev); 4622 } 4623 4624 /* disable all interrupts */ 4625 amdgpu_irq_disable_all(adev); 4626 if (adev->mode_info.mode_config_initialized) { 4627 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4628 drm_helper_force_disable_all(adev_to_drm(adev)); 4629 else 4630 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4631 } 4632 amdgpu_fence_driver_hw_fini(adev); 4633 4634 if (adev->pm.sysfs_initialized) 4635 amdgpu_pm_sysfs_fini(adev); 4636 if (adev->ucode_sysfs_en) 4637 amdgpu_ucode_sysfs_fini(adev); 4638 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4639 amdgpu_fru_sysfs_fini(adev); 4640 4641 amdgpu_reg_state_sysfs_fini(adev); 4642 amdgpu_xcp_cfg_sysfs_fini(adev); 4643 4644 /* disable ras feature must before hw fini */ 4645 amdgpu_ras_pre_fini(adev); 4646 4647 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4648 4649 amdgpu_device_ip_fini_early(adev); 4650 4651 amdgpu_irq_fini_hw(adev); 4652 4653 if (adev->mman.initialized) 4654 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4655 4656 amdgpu_gart_dummy_page_fini(adev); 4657 4658 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4659 amdgpu_device_unmap_mmio(adev); 4660 4661 } 4662 4663 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4664 { 4665 int idx; 4666 bool px; 4667 4668 amdgpu_fence_driver_sw_fini(adev); 4669 amdgpu_device_ip_fini(adev); 4670 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4671 adev->accel_working = false; 4672 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4673 4674 amdgpu_reset_fini(adev); 4675 4676 /* free i2c buses */ 4677 if (!amdgpu_device_has_dc_support(adev)) 4678 amdgpu_i2c_fini(adev); 4679 4680 if (amdgpu_emu_mode != 1) 4681 amdgpu_atombios_fini(adev); 4682 4683 kfree(adev->bios); 4684 adev->bios = NULL; 4685 4686 kfree(adev->fru_info); 4687 adev->fru_info = NULL; 4688 4689 px = 
amdgpu_device_supports_px(adev_to_drm(adev)); 4690 4691 if (px || (!dev_is_removable(&adev->pdev->dev) && 4692 apple_gmux_detect(NULL, NULL))) 4693 vga_switcheroo_unregister_client(adev->pdev); 4694 4695 if (px) 4696 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4697 4698 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4699 vga_client_unregister(adev->pdev); 4700 4701 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4702 4703 iounmap(adev->rmmio); 4704 adev->rmmio = NULL; 4705 amdgpu_doorbell_fini(adev); 4706 drm_dev_exit(idx); 4707 } 4708 4709 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4710 amdgpu_pmu_fini(adev); 4711 if (adev->mman.discovery_bin) 4712 amdgpu_discovery_fini(adev); 4713 4714 amdgpu_reset_put_reset_domain(adev->reset_domain); 4715 adev->reset_domain = NULL; 4716 4717 kfree(adev->pci_state); 4718 4719 } 4720 4721 /** 4722 * amdgpu_device_evict_resources - evict device resources 4723 * @adev: amdgpu device object 4724 * 4725 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4726 * of the vram memory type. Mainly used for evicting device resources 4727 * at suspend time. 4728 * 4729 */ 4730 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4731 { 4732 int ret; 4733 4734 /* No need to evict vram on APUs for suspend to ram or s2idle */ 4735 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) 4736 return 0; 4737 4738 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4739 if (ret) 4740 DRM_WARN("evicting device resources failed\n"); 4741 return ret; 4742 } 4743 4744 /* 4745 * Suspend & resume. 4746 */ 4747 /** 4748 * amdgpu_device_prepare - prepare for device suspend 4749 * 4750 * @dev: drm dev pointer 4751 * 4752 * Prepare to put the hw in the suspend state (all asics). 4753 * Returns 0 for success or an error on failure. 4754 * Called at driver suspend. 4755 */ 4756 int amdgpu_device_prepare(struct drm_device *dev) 4757 { 4758 struct amdgpu_device *adev = drm_to_adev(dev); 4759 int i, r; 4760 4761 amdgpu_choose_low_power_state(adev); 4762 4763 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4764 return 0; 4765 4766 /* Evict the majority of BOs before starting suspend sequence */ 4767 r = amdgpu_device_evict_resources(adev); 4768 if (r) 4769 goto unprepare; 4770 4771 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 4772 4773 for (i = 0; i < adev->num_ip_blocks; i++) { 4774 if (!adev->ip_blocks[i].status.valid) 4775 continue; 4776 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 4777 continue; 4778 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]); 4779 if (r) 4780 goto unprepare; 4781 } 4782 4783 return 0; 4784 4785 unprepare: 4786 adev->in_s0ix = adev->in_s3 = false; 4787 4788 return r; 4789 } 4790 4791 /** 4792 * amdgpu_device_suspend - initiate device suspend 4793 * 4794 * @dev: drm dev pointer 4795 * @fbcon : notify the fbdev of suspend 4796 * 4797 * Puts the hw in the suspend state (all asics). 4798 * Returns 0 for success or an error on failure. 4799 * Called at driver suspend. 
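 * amdgpu_device_ip_suspend_phase1() runs first, then KFD is stopped
 * (outside of S0ix) and VRAM is evicted before phase 2 suspends the
 * remaining IP blocks.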
4800 */ 4801 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) 4802 { 4803 struct amdgpu_device *adev = drm_to_adev(dev); 4804 int r = 0; 4805 4806 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4807 return 0; 4808 4809 adev->in_suspend = true; 4810 4811 if (amdgpu_sriov_vf(adev)) { 4812 amdgpu_virt_fini_data_exchange(adev); 4813 r = amdgpu_virt_request_full_gpu(adev, false); 4814 if (r) 4815 return r; 4816 } 4817 4818 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4819 DRM_WARN("smart shift update failed\n"); 4820 4821 if (fbcon) 4822 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); 4823 4824 cancel_delayed_work_sync(&adev->delayed_init_work); 4825 4826 amdgpu_ras_suspend(adev); 4827 4828 amdgpu_device_ip_suspend_phase1(adev); 4829 4830 if (!adev->in_s0ix) 4831 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4832 4833 r = amdgpu_device_evict_resources(adev); 4834 if (r) 4835 return r; 4836 4837 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4838 4839 amdgpu_fence_driver_hw_fini(adev); 4840 4841 amdgpu_device_ip_suspend_phase2(adev); 4842 4843 if (amdgpu_sriov_vf(adev)) 4844 amdgpu_virt_release_full_gpu(adev, false); 4845 4846 r = amdgpu_dpm_notify_rlc_state(adev, false); 4847 if (r) 4848 return r; 4849 4850 return 0; 4851 } 4852 4853 /** 4854 * amdgpu_device_resume - initiate device resume 4855 * 4856 * @dev: drm dev pointer 4857 * @fbcon : notify the fbdev of resume 4858 * 4859 * Bring the hw back to operating state (all asics). 4860 * Returns 0 for success or an error on failure. 4861 * Called at driver resume. 4862 */ 4863 int amdgpu_device_resume(struct drm_device *dev, bool fbcon) 4864 { 4865 struct amdgpu_device *adev = drm_to_adev(dev); 4866 int r = 0; 4867 4868 if (amdgpu_sriov_vf(adev)) { 4869 r = amdgpu_virt_request_full_gpu(adev, true); 4870 if (r) 4871 return r; 4872 } 4873 4874 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4875 return 0; 4876 4877 if (adev->in_s0ix) 4878 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4879 4880 /* post card */ 4881 if (amdgpu_device_need_post(adev)) { 4882 r = amdgpu_device_asic_init(adev); 4883 if (r) 4884 dev_err(adev->dev, "amdgpu asic init failed\n"); 4885 } 4886 4887 r = amdgpu_device_ip_resume(adev); 4888 4889 if (r) { 4890 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4891 goto exit; 4892 } 4893 amdgpu_fence_driver_hw_init(adev); 4894 4895 if (!adev->in_s0ix) { 4896 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4897 if (r) 4898 goto exit; 4899 } 4900 4901 r = amdgpu_device_ip_late_init(adev); 4902 if (r) 4903 goto exit; 4904 4905 queue_delayed_work(system_wq, &adev->delayed_init_work, 4906 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4907 exit: 4908 if (amdgpu_sriov_vf(adev)) { 4909 amdgpu_virt_init_data_exchange(adev); 4910 amdgpu_virt_release_full_gpu(adev, true); 4911 } 4912 4913 if (r) 4914 return r; 4915 4916 /* Make sure IB tests flushed */ 4917 flush_delayed_work(&adev->delayed_init_work); 4918 4919 if (fbcon) 4920 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); 4921 4922 amdgpu_ras_resume(adev); 4923 4924 if (adev->mode_info.num_crtc) { 4925 /* 4926 * Most of the connector probing functions try to acquire runtime pm 4927 * refs to ensure that the GPU is powered on when connector polling is 4928 * performed. Since we're calling this from a runtime PM callback, 4929 * trying to acquire rpm refs will cause us to deadlock. 
4930 * 4931 * Since we're guaranteed to be holding the rpm lock, it's safe to 4932 * temporarily disable the rpm helpers so this doesn't deadlock us. 4933 */ 4934 #ifdef CONFIG_PM 4935 dev->dev->power.disable_depth++; 4936 #endif 4937 if (!adev->dc_enabled) 4938 drm_helper_hpd_irq_event(dev); 4939 else 4940 drm_kms_helper_hotplug_event(dev); 4941 #ifdef CONFIG_PM 4942 dev->dev->power.disable_depth--; 4943 #endif 4944 } 4945 adev->in_suspend = false; 4946 4947 if (adev->enable_mes) 4948 amdgpu_mes_self_test(adev); 4949 4950 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 4951 DRM_WARN("smart shift update failed\n"); 4952 4953 return 0; 4954 } 4955 4956 /** 4957 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 4958 * 4959 * @adev: amdgpu_device pointer 4960 * 4961 * The list of all the hardware IPs that make up the asic is walked and 4962 * the check_soft_reset callbacks are run. check_soft_reset determines 4963 * if the asic is still hung or not. 4964 * Returns true if any of the IPs are still in a hung state, false if not. 4965 */ 4966 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 4967 { 4968 int i; 4969 bool asic_hang = false; 4970 4971 if (amdgpu_sriov_vf(adev)) 4972 return true; 4973 4974 if (amdgpu_asic_need_full_reset(adev)) 4975 return true; 4976 4977 for (i = 0; i < adev->num_ip_blocks; i++) { 4978 if (!adev->ip_blocks[i].status.valid) 4979 continue; 4980 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 4981 adev->ip_blocks[i].status.hang = 4982 adev->ip_blocks[i].version->funcs->check_soft_reset( 4983 &adev->ip_blocks[i]); 4984 if (adev->ip_blocks[i].status.hang) { 4985 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 4986 asic_hang = true; 4987 } 4988 } 4989 return asic_hang; 4990 } 4991 4992 /** 4993 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 4994 * 4995 * @adev: amdgpu_device pointer 4996 * 4997 * The list of all the hardware IPs that make up the asic is walked and the 4998 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 4999 * handles any IP specific hardware or software state changes that are 5000 * necessary for a soft reset to succeed. 5001 * Returns 0 on success, negative error code on failure. 5002 */ 5003 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 5004 { 5005 int i, r = 0; 5006 5007 for (i = 0; i < adev->num_ip_blocks; i++) { 5008 if (!adev->ip_blocks[i].status.valid) 5009 continue; 5010 if (adev->ip_blocks[i].status.hang && 5011 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 5012 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]); 5013 if (r) 5014 return r; 5015 } 5016 } 5017 5018 return 0; 5019 } 5020 5021 /** 5022 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 5023 * 5024 * @adev: amdgpu_device pointer 5025 * 5026 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 5027 * reset is necessary to recover. 5028 * Returns true if a full asic reset is required, false if not. 
5029 */ 5030 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 5031 { 5032 int i; 5033 5034 if (amdgpu_asic_need_full_reset(adev)) 5035 return true; 5036 5037 for (i = 0; i < adev->num_ip_blocks; i++) { 5038 if (!adev->ip_blocks[i].status.valid) 5039 continue; 5040 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 5041 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 5042 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 5043 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 5044 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 5045 if (adev->ip_blocks[i].status.hang) { 5046 dev_info(adev->dev, "Some block need full reset!\n"); 5047 return true; 5048 } 5049 } 5050 } 5051 return false; 5052 } 5053 5054 /** 5055 * amdgpu_device_ip_soft_reset - do a soft reset 5056 * 5057 * @adev: amdgpu_device pointer 5058 * 5059 * The list of all the hardware IPs that make up the asic is walked and the 5060 * soft_reset callbacks are run if the block is hung. soft_reset handles any 5061 * IP specific hardware or software state changes that are necessary to soft 5062 * reset the IP. 5063 * Returns 0 on success, negative error code on failure. 5064 */ 5065 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 5066 { 5067 int i, r = 0; 5068 5069 for (i = 0; i < adev->num_ip_blocks; i++) { 5070 if (!adev->ip_blocks[i].status.valid) 5071 continue; 5072 if (adev->ip_blocks[i].status.hang && 5073 adev->ip_blocks[i].version->funcs->soft_reset) { 5074 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]); 5075 if (r) 5076 return r; 5077 } 5078 } 5079 5080 return 0; 5081 } 5082 5083 /** 5084 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 5085 * 5086 * @adev: amdgpu_device pointer 5087 * 5088 * The list of all the hardware IPs that make up the asic is walked and the 5089 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 5090 * handles any IP specific hardware or software state changes that are 5091 * necessary after the IP has been soft reset. 5092 * Returns 0 on success, negative error code on failure. 
5093 */ 5094 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 5095 { 5096 int i, r = 0; 5097 5098 for (i = 0; i < adev->num_ip_blocks; i++) { 5099 if (!adev->ip_blocks[i].status.valid) 5100 continue; 5101 if (adev->ip_blocks[i].status.hang && 5102 adev->ip_blocks[i].version->funcs->post_soft_reset) 5103 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]); 5104 if (r) 5105 return r; 5106 } 5107 5108 return 0; 5109 } 5110 5111 /** 5112 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 5113 * 5114 * @adev: amdgpu_device pointer 5115 * @reset_context: amdgpu reset context pointer 5116 * 5117 * do VF FLR and reinitialize Asic 5118 * return 0 means succeeded otherwise failed 5119 */ 5120 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 5121 struct amdgpu_reset_context *reset_context) 5122 { 5123 int r; 5124 struct amdgpu_hive_info *hive = NULL; 5125 5126 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { 5127 if (!amdgpu_ras_get_fed_status(adev)) 5128 amdgpu_virt_ready_to_reset(adev); 5129 amdgpu_virt_wait_reset(adev); 5130 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5131 r = amdgpu_virt_request_full_gpu(adev, true); 5132 } else { 5133 r = amdgpu_virt_reset_gpu(adev); 5134 } 5135 if (r) 5136 return r; 5137 5138 amdgpu_ras_set_fed(adev, false); 5139 amdgpu_irq_gpu_reset_resume_helper(adev); 5140 5141 /* some sw clean up VF needs to do before recover */ 5142 amdgpu_virt_post_reset(adev); 5143 5144 /* Resume IP prior to SMC */ 5145 r = amdgpu_device_ip_reinit_early_sriov(adev); 5146 if (r) 5147 return r; 5148 5149 amdgpu_virt_init_data_exchange(adev); 5150 5151 r = amdgpu_device_fw_loading(adev); 5152 if (r) 5153 return r; 5154 5155 /* now we are okay to resume SMC/CP/SDMA */ 5156 r = amdgpu_device_ip_reinit_late_sriov(adev); 5157 if (r) 5158 return r; 5159 5160 hive = amdgpu_get_xgmi_hive(adev); 5161 /* Update PSP FW topology after reset */ 5162 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5163 r = amdgpu_xgmi_update_topology(hive, adev); 5164 if (hive) 5165 amdgpu_put_xgmi_hive(hive); 5166 if (r) 5167 return r; 5168 5169 r = amdgpu_ib_ring_tests(adev); 5170 if (r) 5171 return r; 5172 5173 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) 5174 amdgpu_inc_vram_lost(adev); 5175 5176 /* need to be called during full access so we can't do it later like 5177 * bare-metal does. 
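 * (The amdgpu_virt_release_full_gpu() call below is what ends that
 * full-access window.)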
5178 */ 5179 amdgpu_amdkfd_post_reset(adev); 5180 amdgpu_virt_release_full_gpu(adev, true); 5181 5182 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5183 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) || 5184 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 5185 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 5186 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5187 amdgpu_ras_resume(adev); 5188 return 0; 5189 } 5190 5191 /** 5192 * amdgpu_device_has_job_running - check if there is any job in mirror list 5193 * 5194 * @adev: amdgpu_device pointer 5195 * 5196 * check if there is any job in mirror list 5197 */ 5198 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5199 { 5200 int i; 5201 struct drm_sched_job *job; 5202 5203 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5204 struct amdgpu_ring *ring = adev->rings[i]; 5205 5206 if (!amdgpu_ring_sched_ready(ring)) 5207 continue; 5208 5209 spin_lock(&ring->sched.job_list_lock); 5210 job = list_first_entry_or_null(&ring->sched.pending_list, 5211 struct drm_sched_job, list); 5212 spin_unlock(&ring->sched.job_list_lock); 5213 if (job) 5214 return true; 5215 } 5216 return false; 5217 } 5218 5219 /** 5220 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5221 * 5222 * @adev: amdgpu_device pointer 5223 * 5224 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5225 * a hung GPU. 5226 */ 5227 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5228 { 5229 5230 if (amdgpu_gpu_recovery == 0) 5231 goto disabled; 5232 5233 /* Skip soft reset check in fatal error mode */ 5234 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5235 return true; 5236 5237 if (amdgpu_sriov_vf(adev)) 5238 return true; 5239 5240 if (amdgpu_gpu_recovery == -1) { 5241 switch (adev->asic_type) { 5242 #ifdef CONFIG_DRM_AMDGPU_SI 5243 case CHIP_VERDE: 5244 case CHIP_TAHITI: 5245 case CHIP_PITCAIRN: 5246 case CHIP_OLAND: 5247 case CHIP_HAINAN: 5248 #endif 5249 #ifdef CONFIG_DRM_AMDGPU_CIK 5250 case CHIP_KAVERI: 5251 case CHIP_KABINI: 5252 case CHIP_MULLINS: 5253 #endif 5254 case CHIP_CARRIZO: 5255 case CHIP_STONEY: 5256 case CHIP_CYAN_SKILLFISH: 5257 goto disabled; 5258 default: 5259 break; 5260 } 5261 } 5262 5263 return true; 5264 5265 disabled: 5266 dev_info(adev->dev, "GPU recovery disabled.\n"); 5267 return false; 5268 } 5269 5270 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5271 { 5272 u32 i; 5273 int ret = 0; 5274 5275 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5276 5277 dev_info(adev->dev, "GPU mode1 reset\n"); 5278 5279 /* Cache the state before bus master disable. The saved config space 5280 * values are used in other cases like restore after mode-2 reset. 
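 * After the reset the cached config space is restored, the PSP bootloader
 * is waited on, and the nbio memsize register is polled until it stops
 * reading 0xffffffff, i.e. until the ASIC is accessible again.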
5281 */ 5282 amdgpu_device_cache_pci_state(adev->pdev); 5283 5284 /* disable BM */ 5285 pci_clear_master(adev->pdev); 5286 5287 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5288 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5289 ret = amdgpu_dpm_mode1_reset(adev); 5290 } else { 5291 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5292 ret = psp_gpu_reset(adev); 5293 } 5294 5295 if (ret) 5296 goto mode1_reset_failed; 5297 5298 amdgpu_device_load_pci_state(adev->pdev); 5299 ret = amdgpu_psp_wait_for_bootloader(adev); 5300 if (ret) 5301 goto mode1_reset_failed; 5302 5303 /* wait for asic to come out of reset */ 5304 for (i = 0; i < adev->usec_timeout; i++) { 5305 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5306 5307 if (memsize != 0xffffffff) 5308 break; 5309 udelay(1); 5310 } 5311 5312 if (i >= adev->usec_timeout) { 5313 ret = -ETIMEDOUT; 5314 goto mode1_reset_failed; 5315 } 5316 5317 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5318 5319 return 0; 5320 5321 mode1_reset_failed: 5322 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5323 return ret; 5324 } 5325 5326 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5327 struct amdgpu_reset_context *reset_context) 5328 { 5329 int i, r = 0; 5330 struct amdgpu_job *job = NULL; 5331 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; 5332 bool need_full_reset = 5333 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5334 5335 if (reset_context->reset_req_dev == adev) 5336 job = reset_context->job; 5337 5338 if (amdgpu_sriov_vf(adev)) 5339 amdgpu_virt_pre_reset(adev); 5340 5341 amdgpu_fence_driver_isr_toggle(adev, true); 5342 5343 /* block all schedulers and reset given job's ring */ 5344 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5345 struct amdgpu_ring *ring = adev->rings[i]; 5346 5347 if (!amdgpu_ring_sched_ready(ring)) 5348 continue; 5349 5350 /* Clear job fence from fence drv to avoid force_completion 5351 * leave NULL and vm flush fence in fence drv 5352 */ 5353 amdgpu_fence_driver_clear_job_fences(ring); 5354 5355 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5356 amdgpu_fence_driver_force_completion(ring); 5357 } 5358 5359 amdgpu_fence_driver_isr_toggle(adev, false); 5360 5361 if (job && job->vm) 5362 drm_sched_increase_karma(&job->base); 5363 5364 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5365 /* If reset handler not implemented, continue; otherwise return */ 5366 if (r == -EOPNOTSUPP) 5367 r = 0; 5368 else 5369 return r; 5370 5371 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5372 if (!amdgpu_sriov_vf(adev)) { 5373 5374 if (!need_full_reset) 5375 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5376 5377 if (!need_full_reset && amdgpu_gpu_recovery && 5378 amdgpu_device_ip_check_soft_reset(adev)) { 5379 amdgpu_device_ip_pre_soft_reset(adev); 5380 r = amdgpu_device_ip_soft_reset(adev); 5381 amdgpu_device_ip_post_soft_reset(adev); 5382 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5383 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5384 need_full_reset = true; 5385 } 5386 } 5387 5388 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { 5389 dev_info(tmp_adev->dev, "Dumping IP State\n"); 5390 /* Trigger ip dump before we reset the asic */ 5391 for (i = 0; i < tmp_adev->num_ip_blocks; i++) 5392 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) 5393 tmp_adev->ip_blocks[i].version->funcs 5394 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]); 5395 
dev_info(tmp_adev->dev, "Dumping IP State Completed\n"); 5396 } 5397 5398 if (need_full_reset) 5399 r = amdgpu_device_ip_suspend(adev); 5400 if (need_full_reset) 5401 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5402 else 5403 clear_bit(AMDGPU_NEED_FULL_RESET, 5404 &reset_context->flags); 5405 } 5406 5407 return r; 5408 } 5409 5410 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) 5411 { 5412 struct list_head *device_list_handle; 5413 bool full_reset, vram_lost = false; 5414 struct amdgpu_device *tmp_adev; 5415 int r; 5416 5417 device_list_handle = reset_context->reset_device_list; 5418 5419 if (!device_list_handle) 5420 return -EINVAL; 5421 5422 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5423 5424 r = 0; 5425 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5426 /* After reset, it's default init level */ 5427 amdgpu_set_init_level(tmp_adev, AMDGPU_INIT_LEVEL_DEFAULT); 5428 if (full_reset) { 5429 /* post card */ 5430 amdgpu_ras_set_fed(tmp_adev, false); 5431 r = amdgpu_device_asic_init(tmp_adev); 5432 if (r) { 5433 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5434 } else { 5435 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5436 5437 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5438 if (r) 5439 goto out; 5440 5441 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5442 5443 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 5444 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job); 5445 5446 if (vram_lost) { 5447 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5448 amdgpu_inc_vram_lost(tmp_adev); 5449 } 5450 5451 r = amdgpu_device_fw_loading(tmp_adev); 5452 if (r) 5453 return r; 5454 5455 r = amdgpu_xcp_restore_partition_mode( 5456 tmp_adev->xcp_mgr); 5457 if (r) 5458 goto out; 5459 5460 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5461 if (r) 5462 goto out; 5463 5464 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 5465 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5466 5467 if (vram_lost) 5468 amdgpu_device_fill_reset_magic(tmp_adev); 5469 5470 /* 5471 * Add this ASIC as tracked as reset was already 5472 * complete successfully. 5473 */ 5474 amdgpu_register_gpu_instance(tmp_adev); 5475 5476 if (!reset_context->hive && 5477 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5478 amdgpu_xgmi_add_device(tmp_adev); 5479 5480 r = amdgpu_device_ip_late_init(tmp_adev); 5481 if (r) 5482 goto out; 5483 5484 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); 5485 5486 /* 5487 * The GPU enters bad state once faulty pages 5488 * by ECC has reached the threshold, and ras 5489 * recovery is scheduled next. So add one check 5490 * here to break recovery if it indeed exceeds 5491 * bad page threshold, and remind user to 5492 * retire this GPU or setting one bigger 5493 * bad_page_threshold value to fix this once 5494 * probing driver again. 5495 */ 5496 if (!amdgpu_ras_is_rma(tmp_adev)) { 5497 /* must succeed. 
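 * If the device has already been marked RMA (bad page threshold exceeded),
 * recovery is aborted with -EINVAL instead.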
*/ 5498 amdgpu_ras_resume(tmp_adev); 5499 } else { 5500 r = -EINVAL; 5501 goto out; 5502 } 5503 5504 /* Update PSP FW topology after reset */ 5505 if (reset_context->hive && 5506 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5507 r = amdgpu_xgmi_update_topology( 5508 reset_context->hive, tmp_adev); 5509 } 5510 } 5511 5512 out: 5513 if (!r) { 5514 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5515 r = amdgpu_ib_ring_tests(tmp_adev); 5516 if (r) { 5517 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5518 r = -EAGAIN; 5519 goto end; 5520 } 5521 } 5522 5523 if (r) 5524 tmp_adev->asic_reset_res = r; 5525 } 5526 5527 end: 5528 return r; 5529 } 5530 5531 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5532 struct amdgpu_reset_context *reset_context) 5533 { 5534 struct amdgpu_device *tmp_adev = NULL; 5535 bool need_full_reset, skip_hw_reset; 5536 int r = 0; 5537 5538 /* Try reset handler method first */ 5539 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5540 reset_list); 5541 5542 reset_context->reset_device_list = device_list_handle; 5543 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5544 /* If reset handler not implemented, continue; otherwise return */ 5545 if (r == -EOPNOTSUPP) 5546 r = 0; 5547 else 5548 return r; 5549 5550 /* Reset handler not implemented, use the default method */ 5551 need_full_reset = 5552 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5553 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5554 5555 /* 5556 * ASIC reset has to be done on all XGMI hive nodes ASAP 5557 * to allow proper links negotiation in FW (within 1 sec) 5558 */ 5559 if (!skip_hw_reset && need_full_reset) { 5560 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5561 /* For XGMI run all resets in parallel to speed up the process */ 5562 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5563 if (!queue_work(system_unbound_wq, 5564 &tmp_adev->xgmi_reset_work)) 5565 r = -EALREADY; 5566 } else 5567 r = amdgpu_asic_reset(tmp_adev); 5568 5569 if (r) { 5570 dev_err(tmp_adev->dev, 5571 "ASIC reset failed with error, %d for drm dev, %s", 5572 r, adev_to_drm(tmp_adev)->unique); 5573 goto out; 5574 } 5575 } 5576 5577 /* For XGMI wait for all resets to complete before proceed */ 5578 if (!r) { 5579 list_for_each_entry(tmp_adev, device_list_handle, 5580 reset_list) { 5581 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5582 flush_work(&tmp_adev->xgmi_reset_work); 5583 r = tmp_adev->asic_reset_res; 5584 if (r) 5585 break; 5586 } 5587 } 5588 } 5589 } 5590 5591 if (!r && amdgpu_ras_intr_triggered()) { 5592 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5593 amdgpu_ras_reset_error_count(tmp_adev, 5594 AMDGPU_RAS_BLOCK__MMHUB); 5595 } 5596 5597 amdgpu_ras_intr_cleared(); 5598 } 5599 5600 r = amdgpu_device_reinit_after_reset(reset_context); 5601 if (r == -EAGAIN) 5602 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5603 else 5604 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5605 5606 out: 5607 return r; 5608 } 5609 5610 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5611 { 5612 5613 switch (amdgpu_asic_reset_method(adev)) { 5614 case AMD_RESET_METHOD_MODE1: 5615 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5616 break; 5617 case AMD_RESET_METHOD_MODE2: 5618 adev->mp1_state = PP_MP1_STATE_RESET; 5619 break; 5620 default: 5621 adev->mp1_state = PP_MP1_STATE_NONE; 5622 break; 5623 } 5624 } 5625 5626 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5627 
{ 5628 amdgpu_vf_error_trans_all(adev); 5629 adev->mp1_state = PP_MP1_STATE_NONE; 5630 } 5631 5632 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5633 { 5634 struct pci_dev *p = NULL; 5635 5636 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5637 adev->pdev->bus->number, 1); 5638 if (p) { 5639 pm_runtime_enable(&(p->dev)); 5640 pm_runtime_resume(&(p->dev)); 5641 } 5642 5643 pci_dev_put(p); 5644 } 5645 5646 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5647 { 5648 enum amd_reset_method reset_method; 5649 struct pci_dev *p = NULL; 5650 u64 expires; 5651 5652 /* 5653 * For now, only BACO and mode1 reset are confirmed 5654 * to suffer the audio issue without proper suspended. 5655 */ 5656 reset_method = amdgpu_asic_reset_method(adev); 5657 if ((reset_method != AMD_RESET_METHOD_BACO) && 5658 (reset_method != AMD_RESET_METHOD_MODE1)) 5659 return -EINVAL; 5660 5661 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5662 adev->pdev->bus->number, 1); 5663 if (!p) 5664 return -ENODEV; 5665 5666 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5667 if (!expires) 5668 /* 5669 * If we cannot get the audio device autosuspend delay, 5670 * a fixed 4S interval will be used. Considering 3S is 5671 * the audio controller default autosuspend delay setting. 5672 * 4S used here is guaranteed to cover that. 5673 */ 5674 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5675 5676 while (!pm_runtime_status_suspended(&(p->dev))) { 5677 if (!pm_runtime_suspend(&(p->dev))) 5678 break; 5679 5680 if (expires < ktime_get_mono_fast_ns()) { 5681 dev_warn(adev->dev, "failed to suspend display audio\n"); 5682 pci_dev_put(p); 5683 /* TODO: abort the succeeding gpu reset? */ 5684 return -ETIMEDOUT; 5685 } 5686 } 5687 5688 pm_runtime_disable(&(p->dev)); 5689 5690 pci_dev_put(p); 5691 return 0; 5692 } 5693 5694 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5695 { 5696 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5697 5698 #if defined(CONFIG_DEBUG_FS) 5699 if (!amdgpu_sriov_vf(adev)) 5700 cancel_work(&adev->reset_work); 5701 #endif 5702 5703 if (adev->kfd.dev) 5704 cancel_work(&adev->kfd.reset_work); 5705 5706 if (amdgpu_sriov_vf(adev)) 5707 cancel_work(&adev->virt.flr_work); 5708 5709 if (con && adev->ras_enabled) 5710 cancel_work(&con->recovery_work); 5711 5712 } 5713 5714 static int amdgpu_device_health_check(struct list_head *device_list_handle) 5715 { 5716 struct amdgpu_device *tmp_adev; 5717 int ret = 0; 5718 u32 status; 5719 5720 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5721 pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status); 5722 if (PCI_POSSIBLE_ERROR(status)) { 5723 dev_err(tmp_adev->dev, "device lost from bus!"); 5724 ret = -ENODEV; 5725 } 5726 } 5727 5728 return ret; 5729 } 5730 5731 /** 5732 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5733 * 5734 * @adev: amdgpu_device pointer 5735 * @job: which job trigger hang 5736 * @reset_context: amdgpu reset context pointer 5737 * 5738 * Attempt to reset the GPU if it has hung (all asics). 5739 * Attempt to do soft-reset or full-reset and reinitialize Asic 5740 * Returns 0 for success or an error on failure. 
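 * The flow below builds a (hive-aware) device list, checks that every
 * device is still reachable on the bus, locks the reset domain, suspends
 * display audio, RAS and KFD, stops the schedulers, and then performs
 * either the per-device pre-reset/soft-reset path or a full ASIC reset.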
5741 */ 5742 5743 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5744 struct amdgpu_job *job, 5745 struct amdgpu_reset_context *reset_context) 5746 { 5747 struct list_head device_list, *device_list_handle = NULL; 5748 bool job_signaled = false; 5749 struct amdgpu_hive_info *hive = NULL; 5750 struct amdgpu_device *tmp_adev = NULL; 5751 int i, r = 0; 5752 bool need_emergency_restart = false; 5753 bool audio_suspended = false; 5754 int retry_limit = AMDGPU_MAX_RETRY_LIMIT; 5755 5756 /* 5757 * Special case: RAS triggered and full reset isn't supported 5758 */ 5759 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5760 5761 /* 5762 * Flush RAM to disk so that after reboot 5763 * the user can read log and see why the system rebooted. 5764 */ 5765 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 5766 amdgpu_ras_get_context(adev)->reboot) { 5767 DRM_WARN("Emergency reboot."); 5768 5769 ksys_sync_helper(); 5770 emergency_restart(); 5771 } 5772 5773 dev_info(adev->dev, "GPU %s begin!\n", 5774 need_emergency_restart ? "jobs stop":"reset"); 5775 5776 if (!amdgpu_sriov_vf(adev)) 5777 hive = amdgpu_get_xgmi_hive(adev); 5778 if (hive) 5779 mutex_lock(&hive->hive_lock); 5780 5781 reset_context->job = job; 5782 reset_context->hive = hive; 5783 /* 5784 * Build list of devices to reset. 5785 * In case we are in XGMI hive mode, resort the device list 5786 * to put adev in the 1st position. 5787 */ 5788 INIT_LIST_HEAD(&device_list); 5789 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) { 5790 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5791 list_add_tail(&tmp_adev->reset_list, &device_list); 5792 if (adev->shutdown) 5793 tmp_adev->shutdown = true; 5794 } 5795 if (!list_is_first(&adev->reset_list, &device_list)) 5796 list_rotate_to_front(&adev->reset_list, &device_list); 5797 device_list_handle = &device_list; 5798 } else { 5799 list_add_tail(&adev->reset_list, &device_list); 5800 device_list_handle = &device_list; 5801 } 5802 5803 if (!amdgpu_sriov_vf(adev)) { 5804 r = amdgpu_device_health_check(device_list_handle); 5805 if (r) 5806 goto end_reset; 5807 } 5808 5809 /* We need to lock reset domain only once both for XGMI and single device */ 5810 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5811 reset_list); 5812 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5813 5814 /* block all schedulers and reset given job's ring */ 5815 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5816 5817 amdgpu_device_set_mp1_state(tmp_adev); 5818 5819 /* 5820 * Try to put the audio codec into suspend state 5821 * before gpu reset started. 5822 * 5823 * Due to the power domain of the graphics device 5824 * is shared with AZ power domain. Without this, 5825 * we may change the audio hardware from behind 5826 * the audio driver's back. That will trigger 5827 * some audio codec errors. 
5828 */ 5829 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5830 audio_suspended = true; 5831 5832 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5833 5834 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5835 5836 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context); 5837 5838 /* 5839 * Mark these ASICs to be reseted as untracked first 5840 * And add them back after reset completed 5841 */ 5842 amdgpu_unregister_gpu_instance(tmp_adev); 5843 5844 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true); 5845 5846 /* disable ras on ALL IPs */ 5847 if (!need_emergency_restart && 5848 amdgpu_device_ip_need_full_reset(tmp_adev)) 5849 amdgpu_ras_suspend(tmp_adev); 5850 5851 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5852 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5853 5854 if (!amdgpu_ring_sched_ready(ring)) 5855 continue; 5856 5857 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5858 5859 if (need_emergency_restart) 5860 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5861 } 5862 atomic_inc(&tmp_adev->gpu_reset_counter); 5863 } 5864 5865 if (need_emergency_restart) 5866 goto skip_sched_resume; 5867 5868 /* 5869 * Must check guilty signal here since after this point all old 5870 * HW fences are force signaled. 5871 * 5872 * job->base holds a reference to parent fence 5873 */ 5874 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5875 job_signaled = true; 5876 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5877 goto skip_hw_reset; 5878 } 5879 5880 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 5881 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5882 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5883 /*TODO Should we stop ?*/ 5884 if (r) { 5885 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5886 r, adev_to_drm(tmp_adev)->unique); 5887 tmp_adev->asic_reset_res = r; 5888 } 5889 } 5890 5891 /* Actual ASIC resets if needed.*/ 5892 /* Host driver will handle XGMI hive reset for SRIOV */ 5893 if (amdgpu_sriov_vf(adev)) { 5894 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { 5895 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); 5896 amdgpu_ras_set_fed(adev, true); 5897 set_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5898 } 5899 5900 r = amdgpu_device_reset_sriov(adev, reset_context); 5901 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) { 5902 amdgpu_virt_release_full_gpu(adev, true); 5903 goto retry; 5904 } 5905 if (r) 5906 adev->asic_reset_res = r; 5907 } else { 5908 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5909 if (r && r == -EAGAIN) 5910 goto retry; 5911 } 5912 5913 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5914 /* 5915 * Drop any pending non scheduler resets queued before reset is done. 5916 * Any reset scheduled after this point would be valid. Scheduler resets 5917 * were already dropped during drm_sched_stop and no new ones can come 5918 * in before drm_sched_start. 
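 *
 * Ordering sketch (editor's illustration of the sequence used in this
 * function, not additional driver code):
 *
 *	drm_sched_stop(&ring->sched, ...)        - no new scheduler resets
 *	amdgpu_do_asic_reset() / SRIOV reset     - actual ASIC reset
 *	amdgpu_device_stop_pending_resets()      - drop stale queued reset work
 *	drm_sched_start(&ring->sched)            - resets may be queued again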
5919 */ 5920 amdgpu_device_stop_pending_resets(tmp_adev); 5921 } 5922 5923 skip_hw_reset: 5924 5925 /* Post ASIC reset for all devs .*/ 5926 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5927 5928 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5929 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5930 5931 if (!amdgpu_ring_sched_ready(ring)) 5932 continue; 5933 5934 drm_sched_start(&ring->sched); 5935 } 5936 5937 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 5938 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 5939 5940 if (tmp_adev->asic_reset_res) 5941 r = tmp_adev->asic_reset_res; 5942 5943 tmp_adev->asic_reset_res = 0; 5944 5945 if (r) { 5946 /* bad news, how to tell it to userspace ? 5947 * for ras error, we should report GPU bad status instead of 5948 * reset failure 5949 */ 5950 if (reset_context->src != AMDGPU_RESET_SRC_RAS || 5951 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) 5952 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", 5953 atomic_read(&tmp_adev->gpu_reset_counter)); 5954 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 5955 } else { 5956 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 5957 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 5958 DRM_WARN("smart shift update failed\n"); 5959 } 5960 } 5961 5962 skip_sched_resume: 5963 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5964 /* unlock kfd: SRIOV would do it separately */ 5965 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 5966 amdgpu_amdkfd_post_reset(tmp_adev); 5967 5968 /* kfd_post_reset will do nothing if kfd device is not initialized, 5969 * need to bring up kfd here if it's not be initialized before 5970 */ 5971 if (!adev->kfd.init_complete) 5972 amdgpu_amdkfd_device_init(adev); 5973 5974 if (audio_suspended) 5975 amdgpu_device_resume_display_audio(tmp_adev); 5976 5977 amdgpu_device_unset_mp1_state(tmp_adev); 5978 5979 amdgpu_ras_set_error_query_ready(tmp_adev, true); 5980 } 5981 5982 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5983 reset_list); 5984 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 5985 5986 end_reset: 5987 if (hive) { 5988 mutex_unlock(&hive->hive_lock); 5989 amdgpu_put_xgmi_hive(hive); 5990 } 5991 5992 if (r) 5993 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 5994 5995 atomic_set(&adev->reset_domain->reset_res, r); 5996 return r; 5997 } 5998 5999 /** 6000 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner 6001 * 6002 * @adev: amdgpu_device pointer 6003 * @speed: pointer to the speed of the link 6004 * @width: pointer to the width of the link 6005 * 6006 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6007 * first physical partner to an AMD dGPU. 6008 * This will exclude any virtual switches and links. 
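 *
 * Usage sketch (editor's illustration; the in-tree caller is
 * amdgpu_device_get_pcie_info() below):
 *
 *	enum pci_bus_speed speed;
 *	enum pcie_link_width width;
 *
 *	amdgpu_device_partner_bandwidth(adev, &speed, &width);
 *	if (speed == PCI_SPEED_UNKNOWN || width == PCIE_LNK_WIDTH_UNKNOWN)
 *		(fall back to the default gen/width masks)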
6009 */ 6010 static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev, 6011 enum pci_bus_speed *speed, 6012 enum pcie_link_width *width) 6013 { 6014 struct pci_dev *parent = adev->pdev; 6015 6016 if (!speed || !width) 6017 return; 6018 6019 *speed = PCI_SPEED_UNKNOWN; 6020 *width = PCIE_LNK_WIDTH_UNKNOWN; 6021 6022 if (amdgpu_device_pcie_dynamic_switching_supported(adev)) { 6023 while ((parent = pci_upstream_bridge(parent))) { 6024 /* skip upstream/downstream switches internal to dGPU*/ 6025 if (parent->vendor == PCI_VENDOR_ID_ATI) 6026 continue; 6027 *speed = pcie_get_speed_cap(parent); 6028 *width = pcie_get_width_cap(parent); 6029 break; 6030 } 6031 } else { 6032 /* use the current speeds rather than max if switching is not supported */ 6033 pcie_bandwidth_available(adev->pdev, NULL, speed, width); 6034 } 6035 } 6036 6037 /** 6038 * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot 6039 * 6040 * @adev: amdgpu_device pointer 6041 * 6042 * Fetchs and stores in the driver the PCIE capabilities (gen speed 6043 * and lanes) of the slot the device is in. Handles APUs and 6044 * virtualized environments where PCIE config space may not be available. 6045 */ 6046 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 6047 { 6048 struct pci_dev *pdev; 6049 enum pci_bus_speed speed_cap, platform_speed_cap; 6050 enum pcie_link_width platform_link_width; 6051 6052 if (amdgpu_pcie_gen_cap) 6053 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 6054 6055 if (amdgpu_pcie_lane_cap) 6056 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 6057 6058 /* covers APUs as well */ 6059 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 6060 if (adev->pm.pcie_gen_mask == 0) 6061 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 6062 if (adev->pm.pcie_mlw_mask == 0) 6063 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 6064 return; 6065 } 6066 6067 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 6068 return; 6069 6070 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap, 6071 &platform_link_width); 6072 6073 if (adev->pm.pcie_gen_mask == 0) { 6074 /* asic caps */ 6075 pdev = adev->pdev; 6076 speed_cap = pcie_get_speed_cap(pdev); 6077 if (speed_cap == PCI_SPEED_UNKNOWN) { 6078 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6079 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6080 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6081 } else { 6082 if (speed_cap == PCIE_SPEED_32_0GT) 6083 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6084 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6085 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6086 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6087 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 6088 else if (speed_cap == PCIE_SPEED_16_0GT) 6089 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6090 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6091 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6092 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 6093 else if (speed_cap == PCIE_SPEED_8_0GT) 6094 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6095 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6096 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6097 else if (speed_cap == PCIE_SPEED_5_0GT) 6098 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6099 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 6100 else 6101 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 6102 } 6103 /* platform caps */ 6104 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 6105 
adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6106 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6107 } else { 6108 if (platform_speed_cap == PCIE_SPEED_32_0GT) 6109 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6110 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6111 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6112 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6113 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 6114 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 6115 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6116 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6117 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6118 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 6119 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 6120 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6121 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6122 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 6123 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 6124 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6125 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6126 else 6127 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 6128 6129 } 6130 } 6131 if (adev->pm.pcie_mlw_mask == 0) { 6132 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6133 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 6134 } else { 6135 switch (platform_link_width) { 6136 case PCIE_LNK_X32: 6137 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 6138 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6139 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6140 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6141 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6142 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6143 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6144 break; 6145 case PCIE_LNK_X16: 6146 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6147 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6148 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6149 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6150 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6151 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6152 break; 6153 case PCIE_LNK_X12: 6154 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6155 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6156 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6157 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6158 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6159 break; 6160 case PCIE_LNK_X8: 6161 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6162 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6163 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6164 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6165 break; 6166 case PCIE_LNK_X4: 6167 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6168 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6169 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6170 break; 6171 case PCIE_LNK_X2: 6172 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6173 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6174 break; 6175 case PCIE_LNK_X1: 6176 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 6177 break; 6178 default: 6179 break; 6180 } 6181 } 6182 } 6183 } 6184 6185 /** 6186 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 6187 * 6188 * @adev: amdgpu_device pointer 6189 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 6190 * 6191 * Return true if @peer_adev can access (DMA) @adev through the PCIe 6192 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 6193 * @peer_adev. 
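 *
 * Worked example (editor's illustration): if @peer_adev has a 40-bit DMA
 * mask, dma_mask is (1ULL << 40) - 1 and address_mask below becomes
 * ~((1ULL << 40) - 1). The aperture of @adev is then only considered
 * addressable when neither aper_base nor aper_limit has any bit set in
 * address_mask, i.e. the whole visible BAR lies below the 1ULL << 40
 * boundary.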
6194 */ 6195 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 6196 struct amdgpu_device *peer_adev) 6197 { 6198 #ifdef CONFIG_HSA_AMD_P2P 6199 bool p2p_access = 6200 !adev->gmc.xgmi.connected_to_cpu && 6201 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 6202 6203 bool is_large_bar = adev->gmc.visible_vram_size && 6204 adev->gmc.real_vram_size == adev->gmc.visible_vram_size; 6205 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev); 6206 6207 if (!p2p_addressable) { 6208 uint64_t address_mask = peer_adev->dev->dma_mask ? 6209 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 6210 resource_size_t aper_limit = 6211 adev->gmc.aper_base + adev->gmc.aper_size - 1; 6212 6213 p2p_addressable = !(adev->gmc.aper_base & address_mask || 6214 aper_limit & address_mask); 6215 } 6216 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable; 6217 #else 6218 return false; 6219 #endif 6220 } 6221 6222 int amdgpu_device_baco_enter(struct drm_device *dev) 6223 { 6224 struct amdgpu_device *adev = drm_to_adev(dev); 6225 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6226 6227 if (!amdgpu_device_supports_baco(dev)) 6228 return -ENOTSUPP; 6229 6230 if (ras && adev->ras_enabled && 6231 adev->nbio.funcs->enable_doorbell_interrupt) 6232 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 6233 6234 return amdgpu_dpm_baco_enter(adev); 6235 } 6236 6237 int amdgpu_device_baco_exit(struct drm_device *dev) 6238 { 6239 struct amdgpu_device *adev = drm_to_adev(dev); 6240 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6241 int ret = 0; 6242 6243 if (!amdgpu_device_supports_baco(dev)) 6244 return -ENOTSUPP; 6245 6246 ret = amdgpu_dpm_baco_exit(adev); 6247 if (ret) 6248 return ret; 6249 6250 if (ras && adev->ras_enabled && 6251 adev->nbio.funcs->enable_doorbell_interrupt) 6252 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 6253 6254 if (amdgpu_passthrough(adev) && adev->nbio.funcs && 6255 adev->nbio.funcs->clear_doorbell_interrupt) 6256 adev->nbio.funcs->clear_doorbell_interrupt(adev); 6257 6258 return 0; 6259 } 6260 6261 /** 6262 * amdgpu_pci_error_detected - Called when a PCI error is detected. 6263 * @pdev: PCI device struct 6264 * @state: PCI channel state 6265 * 6266 * Description: Called when a PCI error is detected. 6267 * 6268 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
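 *
 * These callbacks are hooked into the PCI core roughly as follows
 * (editor's sketch of the registration, which lives in amdgpu_drv.c):
 *
 *	static struct pci_error_handlers amdgpu_pci_err_handler = {
 *		.error_detected = amdgpu_pci_error_detected,
 *		.mmio_enabled   = amdgpu_pci_mmio_enabled,
 *		.slot_reset     = amdgpu_pci_slot_reset,
 *		.resume         = amdgpu_pci_resume,
 *	};
 *
 * with .err_handler pointing at it from the driver's struct pci_driver.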
6269 */ 6270 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 6271 { 6272 struct drm_device *dev = pci_get_drvdata(pdev); 6273 struct amdgpu_device *adev = drm_to_adev(dev); 6274 int i; 6275 6276 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 6277 6278 if (adev->gmc.xgmi.num_physical_nodes > 1) { 6279 DRM_WARN("No support for XGMI hive yet..."); 6280 return PCI_ERS_RESULT_DISCONNECT; 6281 } 6282 6283 adev->pci_channel_state = state; 6284 6285 switch (state) { 6286 case pci_channel_io_normal: 6287 return PCI_ERS_RESULT_CAN_RECOVER; 6288 /* Fatal error, prepare for slot reset */ 6289 case pci_channel_io_frozen: 6290 /* 6291 * Locking adev->reset_domain->sem will prevent any external access 6292 * to GPU during PCI error recovery 6293 */ 6294 amdgpu_device_lock_reset_domain(adev->reset_domain); 6295 amdgpu_device_set_mp1_state(adev); 6296 6297 /* 6298 * Block any work scheduling as we do for regular GPU reset 6299 * for the duration of the recovery 6300 */ 6301 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6302 struct amdgpu_ring *ring = adev->rings[i]; 6303 6304 if (!amdgpu_ring_sched_ready(ring)) 6305 continue; 6306 6307 drm_sched_stop(&ring->sched, NULL); 6308 } 6309 atomic_inc(&adev->gpu_reset_counter); 6310 return PCI_ERS_RESULT_NEED_RESET; 6311 case pci_channel_io_perm_failure: 6312 /* Permanent error, prepare for device removal */ 6313 return PCI_ERS_RESULT_DISCONNECT; 6314 } 6315 6316 return PCI_ERS_RESULT_NEED_RESET; 6317 } 6318 6319 /** 6320 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 6321 * @pdev: pointer to PCI device 6322 */ 6323 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 6324 { 6325 6326 DRM_INFO("PCI error: mmio enabled callback!!\n"); 6327 6328 /* TODO - dump whatever for debugging purposes */ 6329 6330 /* This called only if amdgpu_pci_error_detected returns 6331 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 6332 * works, no need to reset slot. 6333 */ 6334 6335 return PCI_ERS_RESULT_RECOVERED; 6336 } 6337 6338 /** 6339 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 6340 * @pdev: PCI device struct 6341 * 6342 * Description: This routine is called by the pci error recovery 6343 * code after the PCI slot has been reset, just before we 6344 * should resume normal operations. 
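 *
 * For a frozen channel the PCI core drives the callbacks in this order
 * (editor's summary of the recovery flow implemented here):
 *
 *	amdgpu_pci_error_detected(pdev, pci_channel_io_frozen)
 *		returns PCI_ERS_RESULT_NEED_RESET
 *	amdgpu_pci_slot_reset(pdev)
 *		returns PCI_ERS_RESULT_RECOVERED on success
 *	amdgpu_pci_resume(pdev)
 *		restarts the schedulers and unlocks the reset domain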
6345 */ 6346 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 6347 { 6348 struct drm_device *dev = pci_get_drvdata(pdev); 6349 struct amdgpu_device *adev = drm_to_adev(dev); 6350 int r, i; 6351 struct amdgpu_reset_context reset_context; 6352 u32 memsize; 6353 struct list_head device_list; 6354 6355 /* PCI error slot reset should be skipped During RAS recovery */ 6356 if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 6357 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && 6358 amdgpu_ras_in_recovery(adev)) 6359 return PCI_ERS_RESULT_RECOVERED; 6360 6361 DRM_INFO("PCI error: slot reset callback!!\n"); 6362 6363 memset(&reset_context, 0, sizeof(reset_context)); 6364 6365 INIT_LIST_HEAD(&device_list); 6366 list_add_tail(&adev->reset_list, &device_list); 6367 6368 /* wait for asic to come out of reset */ 6369 msleep(500); 6370 6371 /* Restore PCI confspace */ 6372 amdgpu_device_load_pci_state(pdev); 6373 6374 /* confirm ASIC came out of reset */ 6375 for (i = 0; i < adev->usec_timeout; i++) { 6376 memsize = amdgpu_asic_get_config_memsize(adev); 6377 6378 if (memsize != 0xffffffff) 6379 break; 6380 udelay(1); 6381 } 6382 if (memsize == 0xffffffff) { 6383 r = -ETIME; 6384 goto out; 6385 } 6386 6387 reset_context.method = AMD_RESET_METHOD_NONE; 6388 reset_context.reset_req_dev = adev; 6389 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 6390 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 6391 6392 adev->no_hw_access = true; 6393 r = amdgpu_device_pre_asic_reset(adev, &reset_context); 6394 adev->no_hw_access = false; 6395 if (r) 6396 goto out; 6397 6398 r = amdgpu_do_asic_reset(&device_list, &reset_context); 6399 6400 out: 6401 if (!r) { 6402 if (amdgpu_device_cache_pci_state(adev->pdev)) 6403 pci_restore_state(adev->pdev); 6404 6405 DRM_INFO("PCIe error recovery succeeded\n"); 6406 } else { 6407 DRM_ERROR("PCIe error recovery failed, err:%d", r); 6408 amdgpu_device_unset_mp1_state(adev); 6409 amdgpu_device_unlock_reset_domain(adev->reset_domain); 6410 } 6411 6412 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 6413 } 6414 6415 /** 6416 * amdgpu_pci_resume() - resume normal ops after PCI reset 6417 * @pdev: pointer to PCI device 6418 * 6419 * Called when the error recovery driver tells us that its 6420 * OK to resume normal operation. 
6421 */ 6422 void amdgpu_pci_resume(struct pci_dev *pdev) 6423 { 6424 struct drm_device *dev = pci_get_drvdata(pdev); 6425 struct amdgpu_device *adev = drm_to_adev(dev); 6426 int i; 6427 6428 6429 DRM_INFO("PCI error: resume callback!!\n"); 6430 6431 /* Only continue execution for the case of pci_channel_io_frozen */ 6432 if (adev->pci_channel_state != pci_channel_io_frozen) 6433 return; 6434 6435 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6436 struct amdgpu_ring *ring = adev->rings[i]; 6437 6438 if (!amdgpu_ring_sched_ready(ring)) 6439 continue; 6440 6441 drm_sched_start(&ring->sched); 6442 } 6443 6444 amdgpu_device_unset_mp1_state(adev); 6445 amdgpu_device_unlock_reset_domain(adev->reset_domain); 6446 } 6447 6448 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 6449 { 6450 struct drm_device *dev = pci_get_drvdata(pdev); 6451 struct amdgpu_device *adev = drm_to_adev(dev); 6452 int r; 6453 6454 r = pci_save_state(pdev); 6455 if (!r) { 6456 kfree(adev->pci_state); 6457 6458 adev->pci_state = pci_store_saved_state(pdev); 6459 6460 if (!adev->pci_state) { 6461 DRM_ERROR("Failed to store PCI saved state"); 6462 return false; 6463 } 6464 } else { 6465 DRM_WARN("Failed to save PCI state, err:%d\n", r); 6466 return false; 6467 } 6468 6469 return true; 6470 } 6471 6472 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 6473 { 6474 struct drm_device *dev = pci_get_drvdata(pdev); 6475 struct amdgpu_device *adev = drm_to_adev(dev); 6476 int r; 6477 6478 if (!adev->pci_state) 6479 return false; 6480 6481 r = pci_load_saved_state(pdev, adev->pci_state); 6482 6483 if (!r) { 6484 pci_restore_state(pdev); 6485 } else { 6486 DRM_WARN("Failed to load PCI state, err:%d\n", r); 6487 return false; 6488 } 6489 6490 return true; 6491 } 6492 6493 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 6494 struct amdgpu_ring *ring) 6495 { 6496 #ifdef CONFIG_X86_64 6497 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 6498 return; 6499 #endif 6500 if (adev->gmc.xgmi.connected_to_cpu) 6501 return; 6502 6503 if (ring && ring->funcs->emit_hdp_flush) 6504 amdgpu_ring_emit_hdp_flush(ring); 6505 else 6506 amdgpu_asic_flush_hdp(adev, ring); 6507 } 6508 6509 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 6510 struct amdgpu_ring *ring) 6511 { 6512 #ifdef CONFIG_X86_64 6513 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 6514 return; 6515 #endif 6516 if (adev->gmc.xgmi.connected_to_cpu) 6517 return; 6518 6519 amdgpu_asic_invalidate_hdp(adev, ring); 6520 } 6521 6522 int amdgpu_in_reset(struct amdgpu_device *adev) 6523 { 6524 return atomic_read(&adev->reset_domain->in_gpu_reset); 6525 } 6526 6527 /** 6528 * amdgpu_device_halt() - bring hardware to some kind of halt state 6529 * 6530 * @adev: amdgpu_device pointer 6531 * 6532 * Bring hardware to some kind of halt state so that no one can touch it 6533 * any more. It will help to maintain error context when error occurred. 6534 * Compare to a simple hang, the system will keep stable at least for SSH 6535 * access. Then it should be trivial to inspect the hardware state and 6536 * see what's going on. Implemented as following: 6537 * 6538 * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc), 6539 * clears all CPU mappings to device, disallows remappings through page faults 6540 * 2. amdgpu_irq_disable_all() disables all interrupts 6541 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences 6542 * 4. set adev->no_hw_access to avoid potential crashes after setp 5 6543 * 5. 
amdgpu_device_unmap_mmio() clears all MMIO mappings 6544 * 6. pci_disable_device() and pci_wait_for_pending_transaction() 6545 * flush any in flight DMA operations 6546 */ 6547 void amdgpu_device_halt(struct amdgpu_device *adev) 6548 { 6549 struct pci_dev *pdev = adev->pdev; 6550 struct drm_device *ddev = adev_to_drm(adev); 6551 6552 amdgpu_xcp_dev_unplug(adev); 6553 drm_dev_unplug(ddev); 6554 6555 amdgpu_irq_disable_all(adev); 6556 6557 amdgpu_fence_driver_hw_fini(adev); 6558 6559 adev->no_hw_access = true; 6560 6561 amdgpu_device_unmap_mmio(adev); 6562 6563 pci_disable_device(pdev); 6564 pci_wait_for_pending_transaction(pdev); 6565 } 6566 6567 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev, 6568 u32 reg) 6569 { 6570 unsigned long flags, address, data; 6571 u32 r; 6572 6573 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 6574 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 6575 6576 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 6577 WREG32(address, reg * 4); 6578 (void)RREG32(address); 6579 r = RREG32(data); 6580 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 6581 return r; 6582 } 6583 6584 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev, 6585 u32 reg, u32 v) 6586 { 6587 unsigned long flags, address, data; 6588 6589 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 6590 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 6591 6592 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 6593 WREG32(address, reg * 4); 6594 (void)RREG32(address); 6595 WREG32(data, v); 6596 (void)RREG32(data); 6597 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 6598 } 6599 6600 /** 6601 * amdgpu_device_get_gang - return a reference to the current gang 6602 * @adev: amdgpu_device pointer 6603 * 6604 * Returns: A new reference to the current gang leader. 6605 */ 6606 struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev) 6607 { 6608 struct dma_fence *fence; 6609 6610 rcu_read_lock(); 6611 fence = dma_fence_get_rcu_safe(&adev->gang_submit); 6612 rcu_read_unlock(); 6613 return fence; 6614 } 6615 6616 /** 6617 * amdgpu_device_switch_gang - switch to a new gang 6618 * @adev: amdgpu_device pointer 6619 * @gang: the gang to switch to 6620 * 6621 * Try to switch to a new gang. 6622 * Returns: NULL if we switched to the new gang or a reference to the current 6623 * gang leader. 
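 *
 * Caller-side sketch (editor's illustration only, not an existing call
 * site; a caller that cannot switch yet has to deal with the previous
 * gang leader first):
 *
 *	struct dma_fence *old;
 *
 *	old = amdgpu_device_switch_gang(adev, gang);
 *	if (old) {
 *		dma_fence_wait(old, false);
 *		dma_fence_put(old);
 *	}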
6624 */
6625 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
6626                         struct dma_fence *gang)
6627 {
6628     struct dma_fence *old = NULL;
6629 
6630     do {
6631         dma_fence_put(old);
6632         old = amdgpu_device_get_gang(adev);
6633         if (old == gang)
6634             break;
6635 
6636         if (!dma_fence_is_signaled(old))
6637             return old;
6638 
6639     } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
6640              old, gang) != old);
6641 
6642     dma_fence_put(old);
6643     return NULL;
6644 }
6645 
6646 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
6647 {
6648     switch (adev->asic_type) {
6649 #ifdef CONFIG_DRM_AMDGPU_SI
6650     case CHIP_HAINAN:
6651 #endif
6652     case CHIP_TOPAZ:
6653         /* chips with no display hardware */
6654         return false;
6655 #ifdef CONFIG_DRM_AMDGPU_SI
6656     case CHIP_TAHITI:
6657     case CHIP_PITCAIRN:
6658     case CHIP_VERDE:
6659     case CHIP_OLAND:
6660 #endif
6661 #ifdef CONFIG_DRM_AMDGPU_CIK
6662     case CHIP_BONAIRE:
6663     case CHIP_HAWAII:
6664     case CHIP_KAVERI:
6665     case CHIP_KABINI:
6666     case CHIP_MULLINS:
6667 #endif
6668     case CHIP_TONGA:
6669     case CHIP_FIJI:
6670     case CHIP_POLARIS10:
6671     case CHIP_POLARIS11:
6672     case CHIP_POLARIS12:
6673     case CHIP_VEGAM:
6674     case CHIP_CARRIZO:
6675     case CHIP_STONEY:
6676         /* chips with display hardware */
6677         return true;
6678     default:
6679         /* IP discovery */
6680         if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
6681             (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
6682             return false;
6683         return true;
6684     }
6685 }
6686 
6687 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
6688                     uint32_t inst, uint32_t reg_addr, char reg_name[],
6689                     uint32_t expected_value, uint32_t mask)
6690 {
6691     uint32_t ret = 0;
6692     uint32_t old_ = 0;
6693     uint32_t tmp_ = RREG32(reg_addr);
6694     uint32_t loop = adev->usec_timeout;
6695 
6696     while ((tmp_ & (mask)) != (expected_value)) {
6697         if (old_ != tmp_) {
6698             loop = adev->usec_timeout;
6699             old_ = tmp_;
6700         } else
6701             udelay(1);
6702         tmp_ = RREG32(reg_addr);
6703         loop--;
6704         if (!loop) {
6705             DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
6706                  inst, reg_name, (uint32_t)expected_value,
6707                  (uint32_t)(tmp_ & (mask)));
6708             ret = -ETIMEDOUT;
6709             break;
6710         }
6711     }
6712     return ret;
6713 }
6714 
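/*
 * Editor's usage sketch for amdgpu_device_wait_on_rreg() above (not part
 * of the driver; reg_offset, expected and mask stand in for values a real
 * caller would supply):
 *
 *	r = amdgpu_device_wait_on_rreg(adev, 0, reg_offset, "MY_STATUS_REG",
 *	                               expected, mask);
 *	if (r)
 *		return r;
 *
 * The helper re-arms its timeout whenever the register value changes and
 * gives up with -ETIMEDOUT after roughly adev->usec_timeout polls without
 * a change.
 */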