1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 29 #include <linux/aperture.h> 30 #include <linux/power_supply.h> 31 #include <linux/kthread.h> 32 #include <linux/module.h> 33 #include <linux/console.h> 34 #include <linux/slab.h> 35 #include <linux/iommu.h> 36 #include <linux/pci.h> 37 #include <linux/pci-p2pdma.h> 38 #include <linux/apple-gmux.h> 39 40 #include <drm/drm_atomic_helper.h> 41 #include <drm/drm_client_event.h> 42 #include <drm/drm_crtc_helper.h> 43 #include <drm/drm_probe_helper.h> 44 #include <drm/amdgpu_drm.h> 45 #include <linux/device.h> 46 #include <linux/vgaarb.h> 47 #include <linux/vga_switcheroo.h> 48 #include <linux/efi.h> 49 #include "amdgpu.h" 50 #include "amdgpu_trace.h" 51 #include "amdgpu_i2c.h" 52 #include "atom.h" 53 #include "amdgpu_atombios.h" 54 #include "amdgpu_atomfirmware.h" 55 #include "amd_pcie.h" 56 #ifdef CONFIG_DRM_AMDGPU_SI 57 #include "si.h" 58 #endif 59 #ifdef CONFIG_DRM_AMDGPU_CIK 60 #include "cik.h" 61 #endif 62 #include "vi.h" 63 #include "soc15.h" 64 #include "nv.h" 65 #include "bif/bif_4_1_d.h" 66 #include <linux/firmware.h> 67 #include "amdgpu_vf_error.h" 68 69 #include "amdgpu_amdkfd.h" 70 #include "amdgpu_pm.h" 71 72 #include "amdgpu_xgmi.h" 73 #include "amdgpu_ras.h" 74 #include "amdgpu_pmu.h" 75 #include "amdgpu_fru_eeprom.h" 76 #include "amdgpu_reset.h" 77 #include "amdgpu_virt.h" 78 #include "amdgpu_dev_coredump.h" 79 80 #include <linux/suspend.h> 81 #include <drm/task_barrier.h> 82 #include <linux/pm_runtime.h> 83 84 #include <drm/drm_drv.h> 85 86 #if IS_ENABLED(CONFIG_X86) 87 #include <asm/intel-family.h> 88 #endif 89 90 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 91 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 92 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 93 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 94 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 95 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 96 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 97 98 #define AMDGPU_RESUME_MS 2000 99 #define AMDGPU_MAX_RETRY_LIMIT 2 100 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 101 #define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2) 102 #define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2) 103 #define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2) 104 105 static const 
struct drm_driver amdgpu_kms_driver; 106 107 const char *amdgpu_asic_name[] = { 108 "TAHITI", 109 "PITCAIRN", 110 "VERDE", 111 "OLAND", 112 "HAINAN", 113 "BONAIRE", 114 "KAVERI", 115 "KABINI", 116 "HAWAII", 117 "MULLINS", 118 "TOPAZ", 119 "TONGA", 120 "FIJI", 121 "CARRIZO", 122 "STONEY", 123 "POLARIS10", 124 "POLARIS11", 125 "POLARIS12", 126 "VEGAM", 127 "VEGA10", 128 "VEGA12", 129 "VEGA20", 130 "RAVEN", 131 "ARCTURUS", 132 "RENOIR", 133 "ALDEBARAN", 134 "NAVI10", 135 "CYAN_SKILLFISH", 136 "NAVI14", 137 "NAVI12", 138 "SIENNA_CICHLID", 139 "NAVY_FLOUNDER", 140 "VANGOGH", 141 "DIMGREY_CAVEFISH", 142 "BEIGE_GOBY", 143 "YELLOW_CARP", 144 "IP DISCOVERY", 145 "LAST", 146 }; 147 148 #define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0) 149 /* 150 * Default init level where all blocks are expected to be initialized. This is 151 * the level of initialization expected by default and also after a full reset 152 * of the device. 153 */ 154 struct amdgpu_init_level amdgpu_init_default = { 155 .level = AMDGPU_INIT_LEVEL_DEFAULT, 156 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL, 157 }; 158 159 struct amdgpu_init_level amdgpu_init_recovery = { 160 .level = AMDGPU_INIT_LEVEL_RESET_RECOVERY, 161 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL, 162 }; 163 164 /* 165 * Minimal blocks needed to be initialized before a XGMI hive can be reset. This 166 * is used for cases like reset on initialization where the entire hive needs to 167 * be reset before first use. 168 */ 169 struct amdgpu_init_level amdgpu_init_minimal_xgmi = { 170 .level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI, 171 .hwini_ip_block_mask = 172 BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) | 173 BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) | 174 BIT(AMD_IP_BLOCK_TYPE_PSP) 175 }; 176 177 static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev, 178 enum amd_ip_block_type block) 179 { 180 return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0; 181 } 182 183 void amdgpu_set_init_level(struct amdgpu_device *adev, 184 enum amdgpu_init_lvl_id lvl) 185 { 186 switch (lvl) { 187 case AMDGPU_INIT_LEVEL_MINIMAL_XGMI: 188 adev->init_lvl = &amdgpu_init_minimal_xgmi; 189 break; 190 case AMDGPU_INIT_LEVEL_RESET_RECOVERY: 191 adev->init_lvl = &amdgpu_init_recovery; 192 break; 193 case AMDGPU_INIT_LEVEL_DEFAULT: 194 fallthrough; 195 default: 196 adev->init_lvl = &amdgpu_init_default; 197 break; 198 } 199 } 200 201 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev); 202 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 203 void *data); 204 205 /** 206 * DOC: pcie_replay_count 207 * 208 * The amdgpu driver provides a sysfs API for reporting the total number 209 * of PCIe replays (NAKs). 210 * The file pcie_replay_count is used for this and returns the total 211 * number of replays as a sum of the NAKs generated and NAKs received. 
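 *
 * Reading the attribute yields a single decimal value; a quick illustrative
 * check from userspace (the PCI address below is an example, not a fixed
 * path) looks like:
 *
 *   $ cat /sys/bus/pci/devices/0000:03:00.0/pcie_replay_count
 *   0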
212 */ 213 214 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 215 struct device_attribute *attr, char *buf) 216 { 217 struct drm_device *ddev = dev_get_drvdata(dev); 218 struct amdgpu_device *adev = drm_to_adev(ddev); 219 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 220 221 return sysfs_emit(buf, "%llu\n", cnt); 222 } 223 224 static DEVICE_ATTR(pcie_replay_count, 0444, 225 amdgpu_device_get_pcie_replay_count, NULL); 226 227 static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj, 228 struct bin_attribute *attr, char *buf, 229 loff_t ppos, size_t count) 230 { 231 struct device *dev = kobj_to_dev(kobj); 232 struct drm_device *ddev = dev_get_drvdata(dev); 233 struct amdgpu_device *adev = drm_to_adev(ddev); 234 ssize_t bytes_read; 235 236 switch (ppos) { 237 case AMDGPU_SYS_REG_STATE_XGMI: 238 bytes_read = amdgpu_asic_get_reg_state( 239 adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count); 240 break; 241 case AMDGPU_SYS_REG_STATE_WAFL: 242 bytes_read = amdgpu_asic_get_reg_state( 243 adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count); 244 break; 245 case AMDGPU_SYS_REG_STATE_PCIE: 246 bytes_read = amdgpu_asic_get_reg_state( 247 adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count); 248 break; 249 case AMDGPU_SYS_REG_STATE_USR: 250 bytes_read = amdgpu_asic_get_reg_state( 251 adev, AMDGPU_REG_STATE_TYPE_USR, buf, count); 252 break; 253 case AMDGPU_SYS_REG_STATE_USR_1: 254 bytes_read = amdgpu_asic_get_reg_state( 255 adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count); 256 break; 257 default: 258 return -EINVAL; 259 } 260 261 return bytes_read; 262 } 263 264 BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL, 265 AMDGPU_SYS_REG_STATE_END); 266 267 int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev) 268 { 269 int ret; 270 271 if (!amdgpu_asic_get_reg_state_supported(adev)) 272 return 0; 273 274 ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 275 276 return ret; 277 } 278 279 void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev) 280 { 281 if (!amdgpu_asic_get_reg_state_supported(adev)) 282 return; 283 sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 284 } 285 286 int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block) 287 { 288 int r; 289 290 if (ip_block->version->funcs->suspend) { 291 r = ip_block->version->funcs->suspend(ip_block); 292 if (r) { 293 dev_err(ip_block->adev->dev, 294 "suspend of IP block <%s> failed %d\n", 295 ip_block->version->funcs->name, r); 296 return r; 297 } 298 } 299 300 ip_block->status.hw = false; 301 return 0; 302 } 303 304 int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block) 305 { 306 int r; 307 308 if (ip_block->version->funcs->resume) { 309 r = ip_block->version->funcs->resume(ip_block); 310 if (r) { 311 dev_err(ip_block->adev->dev, 312 "resume of IP block <%s> failed %d\n", 313 ip_block->version->funcs->name, r); 314 return r; 315 } 316 } 317 318 ip_block->status.hw = true; 319 return 0; 320 } 321 322 /** 323 * DOC: board_info 324 * 325 * The amdgpu driver provides a sysfs API for giving board related information. 
326 * It provides the form factor information in the format 327 * 328 * type : form factor 329 * 330 * Possible form factor values 331 * 332 * - "cem" - PCIE CEM card 333 * - "oam" - Open Compute Accelerator Module 334 * - "unknown" - Not known 335 * 336 */ 337 338 static ssize_t amdgpu_device_get_board_info(struct device *dev, 339 struct device_attribute *attr, 340 char *buf) 341 { 342 struct drm_device *ddev = dev_get_drvdata(dev); 343 struct amdgpu_device *adev = drm_to_adev(ddev); 344 enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM; 345 const char *pkg; 346 347 if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type) 348 pkg_type = adev->smuio.funcs->get_pkg_type(adev); 349 350 switch (pkg_type) { 351 case AMDGPU_PKG_TYPE_CEM: 352 pkg = "cem"; 353 break; 354 case AMDGPU_PKG_TYPE_OAM: 355 pkg = "oam"; 356 break; 357 default: 358 pkg = "unknown"; 359 break; 360 } 361 362 return sysfs_emit(buf, "%s : %s\n", "type", pkg); 363 } 364 365 static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL); 366 367 static struct attribute *amdgpu_board_attrs[] = { 368 &dev_attr_board_info.attr, 369 NULL, 370 }; 371 372 static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj, 373 struct attribute *attr, int n) 374 { 375 struct device *dev = kobj_to_dev(kobj); 376 struct drm_device *ddev = dev_get_drvdata(dev); 377 struct amdgpu_device *adev = drm_to_adev(ddev); 378 379 if (adev->flags & AMD_IS_APU) 380 return 0; 381 382 return attr->mode; 383 } 384 385 static const struct attribute_group amdgpu_board_attrs_group = { 386 .attrs = amdgpu_board_attrs, 387 .is_visible = amdgpu_board_attrs_is_visible 388 }; 389 390 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 391 392 393 /** 394 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 395 * 396 * @dev: drm_device pointer 397 * 398 * Returns true if the device is a dGPU with ATPX power control, 399 * otherwise return false. 400 */ 401 bool amdgpu_device_supports_px(struct drm_device *dev) 402 { 403 struct amdgpu_device *adev = drm_to_adev(dev); 404 405 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 406 return true; 407 return false; 408 } 409 410 /** 411 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 412 * 413 * @dev: drm_device pointer 414 * 415 * Returns true if the device is a dGPU with ACPI power control, 416 * otherwise return false. 417 */ 418 bool amdgpu_device_supports_boco(struct drm_device *dev) 419 { 420 struct amdgpu_device *adev = drm_to_adev(dev); 421 422 if (adev->has_pr3 || 423 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 424 return true; 425 return false; 426 } 427 428 /** 429 * amdgpu_device_supports_baco - Does the device support BACO 430 * 431 * @dev: drm_device pointer 432 * 433 * Return: 434 * 1 if the device supports BACO; 435 * 3 if the device supports MACO (only works if BACO is supported) 436 * otherwise return 0. 
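 *
 * The value is a mask of the BACO_SUPPORT and MACO_SUPPORT flag bits, so
 * callers usually test it as in this minimal sketch (mirroring
 * amdgpu_device_detect_runtime_pm_mode() below; the helpers named here are
 * hypothetical):
 *
 *   int bamaco = amdgpu_device_supports_baco(dev);
 *
 *   if (bamaco & MACO_SUPPORT)
 *           enable_bamaco();        // hypothetical helper
 *   else if (bamaco & BACO_SUPPORT)
 *           enable_baco();          // hypothetical helper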
 */
int amdgpu_device_supports_baco(struct drm_device *dev)
{
        struct amdgpu_device *adev = drm_to_adev(dev);

        return amdgpu_asic_supports_baco(adev);
}

void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
{
        struct drm_device *dev;
        int bamaco_support;

        dev = adev_to_drm(adev);

        adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
        bamaco_support = amdgpu_device_supports_baco(dev);

        switch (amdgpu_runtime_pm) {
        case 2:
                if (bamaco_support & MACO_SUPPORT) {
                        adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
                        dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
                } else if (bamaco_support == BACO_SUPPORT) {
                        adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
                        dev_info(adev->dev, "Requested mode BAMACO not available, falling back to BACO\n");
                }
                break;
        case 1:
                if (bamaco_support & BACO_SUPPORT) {
                        adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
                        dev_info(adev->dev, "Forcing BACO for runtime pm\n");
                }
                break;
        case -1:
        case -2:
                if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
                        adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
                        dev_info(adev->dev, "Using ATPX for runtime pm\n");
                } else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
                        adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
                        dev_info(adev->dev, "Using BOCO for runtime pm\n");
                } else {
                        if (!bamaco_support)
                                goto no_runtime_pm;

                        switch (adev->asic_type) {
                        case CHIP_VEGA20:
                        case CHIP_ARCTURUS:
                                /* BACO is not supported on vega20 and arcturus */
                                break;
                        case CHIP_VEGA10:
                                /* enable BACO as runpm mode if noretry=0 */
                                if (!adev->gmc.noretry)
                                        adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
                                break;
                        default:
                                /* enable BACO as runpm mode on CI+ */
                                adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
                                break;
                        }

                        if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
                                if (bamaco_support & MACO_SUPPORT) {
                                        adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
                                        dev_info(adev->dev, "Using BAMACO for runtime pm\n");
                                } else {
                                        dev_info(adev->dev, "Using BACO for runtime pm\n");
                                }
                        }
                }
                break;
        case 0:
                dev_info(adev->dev, "runtime pm is manually disabled\n");
                break;
        default:
                break;
        }

no_runtime_pm:
        if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
                dev_info(adev->dev, "Runtime PM not available\n");
}
/**
 * amdgpu_device_supports_smart_shift - Is the device dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
528 */ 529 bool amdgpu_device_supports_smart_shift(struct drm_device *dev) 530 { 531 return (amdgpu_device_supports_boco(dev) && 532 amdgpu_acpi_is_power_shift_control_supported()); 533 } 534 535 /* 536 * VRAM access helper functions 537 */ 538 539 /** 540 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA 541 * 542 * @adev: amdgpu_device pointer 543 * @pos: offset of the buffer in vram 544 * @buf: virtual address of the buffer in system memory 545 * @size: read/write size, sizeof(@buf) must > @size 546 * @write: true - write to vram, otherwise - read from vram 547 */ 548 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos, 549 void *buf, size_t size, bool write) 550 { 551 unsigned long flags; 552 uint32_t hi = ~0, tmp = 0; 553 uint32_t *data = buf; 554 uint64_t last; 555 int idx; 556 557 if (!drm_dev_enter(adev_to_drm(adev), &idx)) 558 return; 559 560 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4)); 561 562 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 563 for (last = pos + size; pos < last; pos += 4) { 564 tmp = pos >> 31; 565 566 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000); 567 if (tmp != hi) { 568 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp); 569 hi = tmp; 570 } 571 if (write) 572 WREG32_NO_KIQ(mmMM_DATA, *data++); 573 else 574 *data++ = RREG32_NO_KIQ(mmMM_DATA); 575 } 576 577 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 578 drm_dev_exit(idx); 579 } 580 581 /** 582 * amdgpu_device_aper_access - access vram by vram aperture 583 * 584 * @adev: amdgpu_device pointer 585 * @pos: offset of the buffer in vram 586 * @buf: virtual address of the buffer in system memory 587 * @size: read/write size, sizeof(@buf) must > @size 588 * @write: true - write to vram, otherwise - read from vram 589 * 590 * The return value means how many bytes have been transferred. 
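 *
 * Callers are expected to fall back to MM_INDEX/MM_DATA access for whatever
 * the aperture did not cover, as amdgpu_device_vram_access() below does; a
 * minimal sketch of that pattern:
 *
 *   count = amdgpu_device_aper_access(adev, pos, buf, size, write);
 *   if (count < size)
 *           amdgpu_device_mm_access(adev, pos + count, buf + count,
 *                                   size - count, write);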
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
                                 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
        void __iomem *addr;
        size_t count = 0;
        uint64_t last;

        if (!adev->mman.aper_base_kaddr)
                return 0;

        last = min(pos + size, adev->gmc.visible_vram_size);
        if (last > pos) {
                addr = adev->mman.aper_base_kaddr + pos;
                count = last - pos;

                if (write) {
                        memcpy_toio(addr, buf, count);
                        /* Make sure HDP write cache flush happens without any reordering
                         * after the system memory contents are sent over PCIe device
                         */
                        mb();
                        amdgpu_device_flush_hdp(adev, NULL);
                } else {
                        amdgpu_device_invalidate_hdp(adev, NULL);
                        /* Make sure HDP read cache is invalidated before issuing a read
                         * to the PCIe device
                         */
                        mb();
                        memcpy_fromio(buf, addr, count);
                }

        }

        return count;
#else
        return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must be > @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
                               void *buf, size_t size, bool write)
{
        size_t count;

        /* try using the VRAM aperture to access VRAM first */
        count = amdgpu_device_aper_access(adev, pos, buf, size, write);
        size -= count;
        if (size) {
                /* use MM_INDEX/MM_DATA to access the rest of VRAM */
                pos += count;
                buf += count;
                amdgpu_device_mm_access(adev, pos, buf, size, write);
        }
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
        if (adev->no_hw_access)
                return true;

#ifdef CONFIG_LOCKDEP
        /*
         * This is a bit complicated to understand, so worth a comment. What we assert
         * here is that the GPU reset is not running on another thread in parallel.
         *
         * For this we trylock the read side of the reset semaphore, if that succeeds
         * we know that the reset is not running in parallel.
         *
         * If the trylock fails we assert that we are either already holding the read
         * side of the lock or are the reset thread itself and hold the write side of
         * the lock.
         */
        if (in_task()) {
                if (down_read_trylock(&adev->reset_domain->sem))
                        up_read(&adev->reset_domain->sem);
                else
                        lockdep_assert_held(&adev->reset_domain->sem);
        }
#endif
        return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
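 *
 * Most code reaches this through the register access macros instead of
 * calling it directly; an illustrative pairing (register offset made up):
 *
 *   u32 val = RREG32(reg_offset);          // wraps amdgpu_device_rreg()
 *   WREG32(reg_offset, val | 0x1);         // wraps amdgpu_device_wreg()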
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
                            uint32_t reg, uint32_t acc_flags)
{
        uint32_t ret;

        if (amdgpu_device_skip_hw_access(adev))
                return 0;

        if ((reg * 4) < adev->rmmio_size) {
                if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
                    amdgpu_sriov_runtime(adev) &&
                    down_read_trylock(&adev->reset_domain->sem)) {
                        ret = amdgpu_kiq_rreg(adev, reg, 0);
                        up_read(&adev->reset_domain->sem);
                } else {
                        ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
                }
        } else {
                ret = adev->pcie_rreg(adev, reg * 4);
        }

        trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

        return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
        if (amdgpu_device_skip_hw_access(adev))
                return 0;

        if (offset < adev->rmmio_size)
                return (readb(adev->rmmio + offset));
        BUG();
}


/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
                                uint32_t reg, uint32_t acc_flags,
                                uint32_t xcc_id)
{
        uint32_t ret, rlcg_flag;

        if (amdgpu_device_skip_hw_access(adev))
                return 0;

        if ((reg * 4) < adev->rmmio_size) {
                if (amdgpu_sriov_vf(adev) &&
                    !amdgpu_sriov_runtime(adev) &&
                    adev->gfx.rlc.rlcg_reg_access_supported &&
                    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
                                                         GC_HWIP, false,
                                                         &rlcg_flag)) {
                        ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
                } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
                           amdgpu_sriov_runtime(adev) &&
                           down_read_trylock(&adev->reset_domain->sem)) {
                        ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
                        up_read(&adev->reset_domain->sem);
                } else {
                        ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
                }
        } else {
                ret = adev->pcie_rreg(adev, reg * 4);
        }

        return ret;
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
804 */ 805 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) 806 { 807 if (amdgpu_device_skip_hw_access(adev)) 808 return; 809 810 if (offset < adev->rmmio_size) 811 writeb(value, adev->rmmio + offset); 812 else 813 BUG(); 814 } 815 816 /** 817 * amdgpu_device_wreg - write to a memory mapped IO or indirect register 818 * 819 * @adev: amdgpu_device pointer 820 * @reg: dword aligned register offset 821 * @v: 32 bit value to write to the register 822 * @acc_flags: access flags which require special behavior 823 * 824 * Writes the value specified to the offset specified. 825 */ 826 void amdgpu_device_wreg(struct amdgpu_device *adev, 827 uint32_t reg, uint32_t v, 828 uint32_t acc_flags) 829 { 830 if (amdgpu_device_skip_hw_access(adev)) 831 return; 832 833 if ((reg * 4) < adev->rmmio_size) { 834 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 835 amdgpu_sriov_runtime(adev) && 836 down_read_trylock(&adev->reset_domain->sem)) { 837 amdgpu_kiq_wreg(adev, reg, v, 0); 838 up_read(&adev->reset_domain->sem); 839 } else { 840 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 841 } 842 } else { 843 adev->pcie_wreg(adev, reg * 4, v); 844 } 845 846 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 847 } 848 849 /** 850 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 851 * 852 * @adev: amdgpu_device pointer 853 * @reg: mmio/rlc register 854 * @v: value to write 855 * @xcc_id: xcc accelerated compute core id 856 * 857 * this function is invoked only for the debugfs register access 858 */ 859 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 860 uint32_t reg, uint32_t v, 861 uint32_t xcc_id) 862 { 863 if (amdgpu_device_skip_hw_access(adev)) 864 return; 865 866 if (amdgpu_sriov_fullaccess(adev) && 867 adev->gfx.rlc.funcs && 868 adev->gfx.rlc.funcs->is_rlcg_access_range) { 869 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 870 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id); 871 } else if ((reg * 4) >= adev->rmmio_size) { 872 adev->pcie_wreg(adev, reg * 4, v); 873 } else { 874 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 875 } 876 } 877 878 /** 879 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC 880 * 881 * @adev: amdgpu_device pointer 882 * @reg: dword aligned register offset 883 * @v: 32 bit value to write to the register 884 * @acc_flags: access flags which require special behavior 885 * @xcc_id: xcc accelerated compute core id 886 * 887 * Writes the value specified to the offset specified. 
888 */ 889 void amdgpu_device_xcc_wreg(struct amdgpu_device *adev, 890 uint32_t reg, uint32_t v, 891 uint32_t acc_flags, uint32_t xcc_id) 892 { 893 uint32_t rlcg_flag; 894 895 if (amdgpu_device_skip_hw_access(adev)) 896 return; 897 898 if ((reg * 4) < adev->rmmio_size) { 899 if (amdgpu_sriov_vf(adev) && 900 !amdgpu_sriov_runtime(adev) && 901 adev->gfx.rlc.rlcg_reg_access_supported && 902 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, 903 GC_HWIP, true, 904 &rlcg_flag)) { 905 amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id)); 906 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 907 amdgpu_sriov_runtime(adev) && 908 down_read_trylock(&adev->reset_domain->sem)) { 909 amdgpu_kiq_wreg(adev, reg, v, xcc_id); 910 up_read(&adev->reset_domain->sem); 911 } else { 912 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 913 } 914 } else { 915 adev->pcie_wreg(adev, reg * 4, v); 916 } 917 } 918 919 /** 920 * amdgpu_device_indirect_rreg - read an indirect register 921 * 922 * @adev: amdgpu_device pointer 923 * @reg_addr: indirect register address to read from 924 * 925 * Returns the value of indirect register @reg_addr 926 */ 927 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 928 u32 reg_addr) 929 { 930 unsigned long flags, pcie_index, pcie_data; 931 void __iomem *pcie_index_offset; 932 void __iomem *pcie_data_offset; 933 u32 r; 934 935 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 936 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 937 938 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 939 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 940 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 941 942 writel(reg_addr, pcie_index_offset); 943 readl(pcie_index_offset); 944 r = readl(pcie_data_offset); 945 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 946 947 return r; 948 } 949 950 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev, 951 u64 reg_addr) 952 { 953 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 954 u32 r; 955 void __iomem *pcie_index_offset; 956 void __iomem *pcie_index_hi_offset; 957 void __iomem *pcie_data_offset; 958 959 if (unlikely(!adev->nbio.funcs)) { 960 pcie_index = AMDGPU_PCIE_INDEX_FALLBACK; 961 pcie_data = AMDGPU_PCIE_DATA_FALLBACK; 962 } else { 963 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 964 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 965 } 966 967 if (reg_addr >> 32) { 968 if (unlikely(!adev->nbio.funcs)) 969 pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK; 970 else 971 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 972 } else { 973 pcie_index_hi = 0; 974 } 975 976 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 977 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 978 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 979 if (pcie_index_hi != 0) 980 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 981 pcie_index_hi * 4; 982 983 writel(reg_addr, pcie_index_offset); 984 readl(pcie_index_offset); 985 if (pcie_index_hi != 0) { 986 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 987 readl(pcie_index_hi_offset); 988 } 989 r = readl(pcie_data_offset); 990 991 /* clear the high bits */ 992 if (pcie_index_hi != 0) { 993 writel(0, pcie_index_hi_offset); 994 readl(pcie_index_hi_offset); 995 } 996 997 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 998 999 return r; 1000 } 1001 1002 /** 1003 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 1004 * 1005 * 
@adev: amdgpu_device pointer 1006 * @reg_addr: indirect register address to read from 1007 * 1008 * Returns the value of indirect register @reg_addr 1009 */ 1010 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 1011 u32 reg_addr) 1012 { 1013 unsigned long flags, pcie_index, pcie_data; 1014 void __iomem *pcie_index_offset; 1015 void __iomem *pcie_data_offset; 1016 u64 r; 1017 1018 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1019 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1020 1021 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1022 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1023 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1024 1025 /* read low 32 bits */ 1026 writel(reg_addr, pcie_index_offset); 1027 readl(pcie_index_offset); 1028 r = readl(pcie_data_offset); 1029 /* read high 32 bits */ 1030 writel(reg_addr + 4, pcie_index_offset); 1031 readl(pcie_index_offset); 1032 r |= ((u64)readl(pcie_data_offset) << 32); 1033 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1034 1035 return r; 1036 } 1037 1038 u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev, 1039 u64 reg_addr) 1040 { 1041 unsigned long flags, pcie_index, pcie_data; 1042 unsigned long pcie_index_hi = 0; 1043 void __iomem *pcie_index_offset; 1044 void __iomem *pcie_index_hi_offset; 1045 void __iomem *pcie_data_offset; 1046 u64 r; 1047 1048 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1049 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1050 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1051 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1052 1053 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1054 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1055 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1056 if (pcie_index_hi != 0) 1057 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1058 pcie_index_hi * 4; 1059 1060 /* read low 32 bits */ 1061 writel(reg_addr, pcie_index_offset); 1062 readl(pcie_index_offset); 1063 if (pcie_index_hi != 0) { 1064 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1065 readl(pcie_index_hi_offset); 1066 } 1067 r = readl(pcie_data_offset); 1068 /* read high 32 bits */ 1069 writel(reg_addr + 4, pcie_index_offset); 1070 readl(pcie_index_offset); 1071 if (pcie_index_hi != 0) { 1072 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1073 readl(pcie_index_hi_offset); 1074 } 1075 r |= ((u64)readl(pcie_data_offset) << 32); 1076 1077 /* clear the high bits */ 1078 if (pcie_index_hi != 0) { 1079 writel(0, pcie_index_hi_offset); 1080 readl(pcie_index_hi_offset); 1081 } 1082 1083 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1084 1085 return r; 1086 } 1087 1088 /** 1089 * amdgpu_device_indirect_wreg - write an indirect register address 1090 * 1091 * @adev: amdgpu_device pointer 1092 * @reg_addr: indirect register offset 1093 * @reg_data: indirect register data 1094 * 1095 */ 1096 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 1097 u32 reg_addr, u32 reg_data) 1098 { 1099 unsigned long flags, pcie_index, pcie_data; 1100 void __iomem *pcie_index_offset; 1101 void __iomem *pcie_data_offset; 1102 1103 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1104 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1105 1106 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1107 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1108 pcie_data_offset = (void __iomem 
*)adev->rmmio + pcie_data * 4; 1109 1110 writel(reg_addr, pcie_index_offset); 1111 readl(pcie_index_offset); 1112 writel(reg_data, pcie_data_offset); 1113 readl(pcie_data_offset); 1114 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1115 } 1116 1117 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev, 1118 u64 reg_addr, u32 reg_data) 1119 { 1120 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 1121 void __iomem *pcie_index_offset; 1122 void __iomem *pcie_index_hi_offset; 1123 void __iomem *pcie_data_offset; 1124 1125 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1126 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1127 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1128 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1129 else 1130 pcie_index_hi = 0; 1131 1132 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1133 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1134 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1135 if (pcie_index_hi != 0) 1136 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1137 pcie_index_hi * 4; 1138 1139 writel(reg_addr, pcie_index_offset); 1140 readl(pcie_index_offset); 1141 if (pcie_index_hi != 0) { 1142 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1143 readl(pcie_index_hi_offset); 1144 } 1145 writel(reg_data, pcie_data_offset); 1146 readl(pcie_data_offset); 1147 1148 /* clear the high bits */ 1149 if (pcie_index_hi != 0) { 1150 writel(0, pcie_index_hi_offset); 1151 readl(pcie_index_hi_offset); 1152 } 1153 1154 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1155 } 1156 1157 /** 1158 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 1159 * 1160 * @adev: amdgpu_device pointer 1161 * @reg_addr: indirect register offset 1162 * @reg_data: indirect register data 1163 * 1164 */ 1165 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 1166 u32 reg_addr, u64 reg_data) 1167 { 1168 unsigned long flags, pcie_index, pcie_data; 1169 void __iomem *pcie_index_offset; 1170 void __iomem *pcie_data_offset; 1171 1172 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1173 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1174 1175 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1176 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1177 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1178 1179 /* write low 32 bits */ 1180 writel(reg_addr, pcie_index_offset); 1181 readl(pcie_index_offset); 1182 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1183 readl(pcie_data_offset); 1184 /* write high 32 bits */ 1185 writel(reg_addr + 4, pcie_index_offset); 1186 readl(pcie_index_offset); 1187 writel((u32)(reg_data >> 32), pcie_data_offset); 1188 readl(pcie_data_offset); 1189 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1190 } 1191 1192 void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev, 1193 u64 reg_addr, u64 reg_data) 1194 { 1195 unsigned long flags, pcie_index, pcie_data; 1196 unsigned long pcie_index_hi = 0; 1197 void __iomem *pcie_index_offset; 1198 void __iomem *pcie_index_hi_offset; 1199 void __iomem *pcie_data_offset; 1200 1201 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1202 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1203 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1204 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1205 1206 
spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1207 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1208 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1209 if (pcie_index_hi != 0) 1210 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1211 pcie_index_hi * 4; 1212 1213 /* write low 32 bits */ 1214 writel(reg_addr, pcie_index_offset); 1215 readl(pcie_index_offset); 1216 if (pcie_index_hi != 0) { 1217 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1218 readl(pcie_index_hi_offset); 1219 } 1220 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1221 readl(pcie_data_offset); 1222 /* write high 32 bits */ 1223 writel(reg_addr + 4, pcie_index_offset); 1224 readl(pcie_index_offset); 1225 if (pcie_index_hi != 0) { 1226 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1227 readl(pcie_index_hi_offset); 1228 } 1229 writel((u32)(reg_data >> 32), pcie_data_offset); 1230 readl(pcie_data_offset); 1231 1232 /* clear the high bits */ 1233 if (pcie_index_hi != 0) { 1234 writel(0, pcie_index_hi_offset); 1235 readl(pcie_index_hi_offset); 1236 } 1237 1238 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1239 } 1240 1241 /** 1242 * amdgpu_device_get_rev_id - query device rev_id 1243 * 1244 * @adev: amdgpu_device pointer 1245 * 1246 * Return device rev_id 1247 */ 1248 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev) 1249 { 1250 return adev->nbio.funcs->get_rev_id(adev); 1251 } 1252 1253 /** 1254 * amdgpu_invalid_rreg - dummy reg read function 1255 * 1256 * @adev: amdgpu_device pointer 1257 * @reg: offset of register 1258 * 1259 * Dummy register read function. Used for register blocks 1260 * that certain asics don't have (all asics). 1261 * Returns the value in the register. 1262 */ 1263 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 1264 { 1265 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 1266 BUG(); 1267 return 0; 1268 } 1269 1270 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg) 1271 { 1272 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 1273 BUG(); 1274 return 0; 1275 } 1276 1277 /** 1278 * amdgpu_invalid_wreg - dummy reg write function 1279 * 1280 * @adev: amdgpu_device pointer 1281 * @reg: offset of register 1282 * @v: value to write to the register 1283 * 1284 * Dummy register read function. Used for register blocks 1285 * that certain asics don't have (all asics). 1286 */ 1287 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 1288 { 1289 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 1290 reg, v); 1291 BUG(); 1292 } 1293 1294 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v) 1295 { 1296 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n", 1297 reg, v); 1298 BUG(); 1299 } 1300 1301 /** 1302 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 1303 * 1304 * @adev: amdgpu_device pointer 1305 * @reg: offset of register 1306 * 1307 * Dummy register read function. Used for register blocks 1308 * that certain asics don't have (all asics). 1309 * Returns the value in the register. 
1310 */ 1311 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 1312 { 1313 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 1314 BUG(); 1315 return 0; 1316 } 1317 1318 static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg) 1319 { 1320 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 1321 BUG(); 1322 return 0; 1323 } 1324 1325 /** 1326 * amdgpu_invalid_wreg64 - dummy reg write function 1327 * 1328 * @adev: amdgpu_device pointer 1329 * @reg: offset of register 1330 * @v: value to write to the register 1331 * 1332 * Dummy register read function. Used for register blocks 1333 * that certain asics don't have (all asics). 1334 */ 1335 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 1336 { 1337 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 1338 reg, v); 1339 BUG(); 1340 } 1341 1342 static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v) 1343 { 1344 DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n", 1345 reg, v); 1346 BUG(); 1347 } 1348 1349 /** 1350 * amdgpu_block_invalid_rreg - dummy reg read function 1351 * 1352 * @adev: amdgpu_device pointer 1353 * @block: offset of instance 1354 * @reg: offset of register 1355 * 1356 * Dummy register read function. Used for register blocks 1357 * that certain asics don't have (all asics). 1358 * Returns the value in the register. 1359 */ 1360 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 1361 uint32_t block, uint32_t reg) 1362 { 1363 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 1364 reg, block); 1365 BUG(); 1366 return 0; 1367 } 1368 1369 /** 1370 * amdgpu_block_invalid_wreg - dummy reg write function 1371 * 1372 * @adev: amdgpu_device pointer 1373 * @block: offset of instance 1374 * @reg: offset of register 1375 * @v: value to write to the register 1376 * 1377 * Dummy register read function. Used for register blocks 1378 * that certain asics don't have (all asics). 1379 */ 1380 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 1381 uint32_t block, 1382 uint32_t reg, uint32_t v) 1383 { 1384 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 1385 reg, block, v); 1386 BUG(); 1387 } 1388 1389 /** 1390 * amdgpu_device_asic_init - Wrapper for atom asic_init 1391 * 1392 * @adev: amdgpu_device pointer 1393 * 1394 * Does any asic specific work and then calls atom asic init. 1395 */ 1396 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 1397 { 1398 int ret; 1399 1400 amdgpu_asic_pre_asic_init(adev); 1401 1402 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 1403 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 1404 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) || 1405 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) { 1406 amdgpu_psp_wait_for_bootloader(adev); 1407 ret = amdgpu_atomfirmware_asic_init(adev, true); 1408 return ret; 1409 } else { 1410 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 1411 } 1412 1413 return 0; 1414 } 1415 1416 /** 1417 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page 1418 * 1419 * @adev: amdgpu_device pointer 1420 * 1421 * Allocates a scratch page of VRAM for use by various things in the 1422 * driver. 
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
        return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
                                       AMDGPU_GEM_DOMAIN_VRAM |
                                       AMDGPU_GEM_DOMAIN_GTT,
                                       &adev->mem_scratch.robj,
                                       &adev->mem_scratch.gpu_addr,
                                       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
        amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
                                             const u32 *registers,
                                             const u32 array_size)
{
        u32 tmp, reg, and_mask, or_mask;
        int i;

        if (array_size % 3)
                return;

        for (i = 0; i < array_size; i += 3) {
                reg = registers[i + 0];
                and_mask = registers[i + 1];
                or_mask = registers[i + 2];

                if (and_mask == 0xffffffff) {
                        tmp = or_mask;
                } else {
                        tmp = RREG32(reg);
                        tmp &= ~and_mask;
                        if (adev->family >= AMDGPU_FAMILY_AI)
                                tmp |= (or_mask & and_mask);
                        else
                                tmp |= or_mask;
                }
                WREG32(reg, tmp);
        }
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
        pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
        return pci_reset_function(adev->pdev);
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
        if (adev->wb.wb_obj) {
                amdgpu_bo_free_kernel(&adev->wb.wb_obj,
                                      &adev->wb.gpu_addr,
                                      (void **)&adev->wb.wb);
                adev->wb.wb_obj = NULL;
        }
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
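 *
 * Each writeback slot is a 32-bit dword in this buffer and the index handed
 * out by amdgpu_device_wb_get() is a dword offset, so a slot's CPU and GPU
 * addresses pair up as in this sketch:
 *
 *   u32 wb;
 *
 *   if (!amdgpu_device_wb_get(adev, &wb)) {
 *           volatile u32 *cpu_addr = &adev->wb.wb[wb];
 *           u64 gpu_addr = adev->wb.gpu_addr + (wb * 4);
 *
 *           // ... let the GPU write status to gpu_addr, poll *cpu_addr ...
 *           amdgpu_device_wb_free(adev, wb);
 *   }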
1542 */ 1543 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1544 { 1545 int r; 1546 1547 if (adev->wb.wb_obj == NULL) { 1548 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1549 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1550 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1551 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1552 (void **)&adev->wb.wb); 1553 if (r) { 1554 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1555 return r; 1556 } 1557 1558 adev->wb.num_wb = AMDGPU_MAX_WB; 1559 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1560 1561 /* clear wb memory */ 1562 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1563 } 1564 1565 return 0; 1566 } 1567 1568 /** 1569 * amdgpu_device_wb_get - Allocate a wb entry 1570 * 1571 * @adev: amdgpu_device pointer 1572 * @wb: wb index 1573 * 1574 * Allocate a wb slot for use by the driver (all asics). 1575 * Returns 0 on success or -EINVAL on failure. 1576 */ 1577 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1578 { 1579 unsigned long flags, offset; 1580 1581 spin_lock_irqsave(&adev->wb.lock, flags); 1582 offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1583 if (offset < adev->wb.num_wb) { 1584 __set_bit(offset, adev->wb.used); 1585 spin_unlock_irqrestore(&adev->wb.lock, flags); 1586 *wb = offset << 3; /* convert to dw offset */ 1587 return 0; 1588 } else { 1589 spin_unlock_irqrestore(&adev->wb.lock, flags); 1590 return -EINVAL; 1591 } 1592 } 1593 1594 /** 1595 * amdgpu_device_wb_free - Free a wb entry 1596 * 1597 * @adev: amdgpu_device pointer 1598 * @wb: wb index 1599 * 1600 * Free a wb slot allocated for use by the driver (all asics) 1601 */ 1602 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1603 { 1604 unsigned long flags; 1605 1606 wb >>= 3; 1607 spin_lock_irqsave(&adev->wb.lock, flags); 1608 if (wb < adev->wb.num_wb) 1609 __clear_bit(wb, adev->wb.used); 1610 spin_unlock_irqrestore(&adev->wb.lock, flags); 1611 } 1612 1613 /** 1614 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1615 * 1616 * @adev: amdgpu_device pointer 1617 * 1618 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1619 * to fail, but if any of the BARs is not accessible after the size we abort 1620 * driver loading by returning -ENODEV. 
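 *
 * The requested size is the PCI resizable-BAR encoding produced by
 * pci_rebar_bytes_to_size() for the real VRAM size, further limited to the
 * largest size the capability actually advertises via
 * pci_rebar_get_possible_sizes(), so the final BAR may still be smaller than
 * VRAM.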
1621 */ 1622 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1623 { 1624 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1625 struct pci_bus *root; 1626 struct resource *res; 1627 unsigned int i; 1628 u16 cmd; 1629 int r; 1630 1631 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) 1632 return 0; 1633 1634 /* Bypass for VF */ 1635 if (amdgpu_sriov_vf(adev)) 1636 return 0; 1637 1638 /* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */ 1639 if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR)) 1640 DRM_WARN("System can't access extended configuration space, please check!!\n"); 1641 1642 /* skip if the bios has already enabled large BAR */ 1643 if (adev->gmc.real_vram_size && 1644 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1645 return 0; 1646 1647 /* Check if the root BUS has 64bit memory resources */ 1648 root = adev->pdev->bus; 1649 while (root->parent) 1650 root = root->parent; 1651 1652 pci_bus_for_each_resource(root, res, i) { 1653 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1654 res->start > 0x100000000ull) 1655 break; 1656 } 1657 1658 /* Trying to resize is pointless without a root hub window above 4GB */ 1659 if (!res) 1660 return 0; 1661 1662 /* Limit the BAR size to what is available */ 1663 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1664 rbar_size); 1665 1666 /* Disable memory decoding while we change the BAR addresses and size */ 1667 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1668 pci_write_config_word(adev->pdev, PCI_COMMAND, 1669 cmd & ~PCI_COMMAND_MEMORY); 1670 1671 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1672 amdgpu_doorbell_fini(adev); 1673 if (adev->asic_type >= CHIP_BONAIRE) 1674 pci_release_resource(adev->pdev, 2); 1675 1676 pci_release_resource(adev->pdev, 0); 1677 1678 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1679 if (r == -ENOSPC) 1680 DRM_INFO("Not enough PCI address space for a large BAR."); 1681 else if (r && r != -ENOTSUPP) 1682 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1683 1684 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1685 1686 /* When the doorbell or fb BAR isn't available we have no chance of 1687 * using the device. 1688 */ 1689 r = amdgpu_doorbell_init(adev); 1690 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1691 return -ENODEV; 1692 1693 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1694 1695 return 0; 1696 } 1697 1698 static bool amdgpu_device_read_bios(struct amdgpu_device *adev) 1699 { 1700 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU)) 1701 return false; 1702 1703 return true; 1704 } 1705 1706 /* 1707 * GPU helpers function. 1708 */ 1709 /** 1710 * amdgpu_device_need_post - check if the hw need post or not 1711 * 1712 * @adev: amdgpu_device pointer 1713 * 1714 * Check if the asic has been initialized (all asics) at driver startup 1715 * or post is needed if hw reset is performed. 1716 * Returns true if need or false if not. 
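 *
 * One concrete case handled below: a FIJI board in whole-GPU passthrough
 * always posts when its SMC firmware is older than version 22.15
 * (0x00160e00), since those firmwares can hang after a VM reboot without a
 * vPost.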
1717 */ 1718 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1719 { 1720 uint32_t reg; 1721 1722 if (amdgpu_sriov_vf(adev)) 1723 return false; 1724 1725 if (!amdgpu_device_read_bios(adev)) 1726 return false; 1727 1728 if (amdgpu_passthrough(adev)) { 1729 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1730 * some old smc fw still need driver do vPost otherwise gpu hang, while 1731 * those smc fw version above 22.15 doesn't have this flaw, so we force 1732 * vpost executed for smc version below 22.15 1733 */ 1734 if (adev->asic_type == CHIP_FIJI) { 1735 int err; 1736 uint32_t fw_ver; 1737 1738 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1739 /* force vPost if error occurred */ 1740 if (err) 1741 return true; 1742 1743 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1744 release_firmware(adev->pm.fw); 1745 if (fw_ver < 0x00160e00) 1746 return true; 1747 } 1748 } 1749 1750 /* Don't post if we need to reset whole hive on init */ 1751 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 1752 return false; 1753 1754 if (adev->has_hw_reset) { 1755 adev->has_hw_reset = false; 1756 return true; 1757 } 1758 1759 /* bios scratch used on CIK+ */ 1760 if (adev->asic_type >= CHIP_BONAIRE) 1761 return amdgpu_atombios_scratch_need_asic_init(adev); 1762 1763 /* check MEM_SIZE for older asics */ 1764 reg = amdgpu_asic_get_config_memsize(adev); 1765 1766 if ((reg != 0) && (reg != 0xffffffff)) 1767 return false; 1768 1769 return true; 1770 } 1771 1772 /* 1773 * Check whether seamless boot is supported. 1774 * 1775 * So far we only support seamless boot on DCE 3.0 or later. 1776 * If users report that it works on older ASICS as well, we may 1777 * loosen this. 1778 */ 1779 bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev) 1780 { 1781 switch (amdgpu_seamless) { 1782 case -1: 1783 break; 1784 case 1: 1785 return true; 1786 case 0: 1787 return false; 1788 default: 1789 DRM_ERROR("Invalid value for amdgpu.seamless: %d\n", 1790 amdgpu_seamless); 1791 return false; 1792 } 1793 1794 if (!(adev->flags & AMD_IS_APU)) 1795 return false; 1796 1797 if (adev->mman.keep_stolen_vga_memory) 1798 return false; 1799 1800 return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0); 1801 } 1802 1803 /* 1804 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids 1805 * don't support dynamic speed switching. Until we have confirmation from Intel 1806 * that a specific host supports it, it's safer that we keep it disabled for all. 1807 * 1808 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/ 1809 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663 1810 */ 1811 static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev) 1812 { 1813 #if IS_ENABLED(CONFIG_X86) 1814 struct cpuinfo_x86 *c = &cpu_data(0); 1815 1816 /* eGPU change speeds based on USB4 fabric conditions */ 1817 if (dev_is_removable(adev->dev)) 1818 return true; 1819 1820 if (c->x86_vendor == X86_VENDOR_INTEL) 1821 return false; 1822 #endif 1823 return true; 1824 } 1825 1826 /** 1827 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1828 * 1829 * @adev: amdgpu_device pointer 1830 * 1831 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1832 * be set for this device. 1833 * 1834 * Returns true if it should be used or false if not. 
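 *
 * The decision follows the amdgpu.aspm module parameter: with the default of
 * -1 the driver decides (APUs and parts without PCIe DPM opt out, otherwise
 * the bridge setting from pcie_aspm_enabled() is honoured), while an explicit
 * setting wins, e.g. on the kernel command line (illustrative):
 *
 *   amdgpu.aspm=0    - force ASPM off regardless of the bridge
 *   amdgpu.aspm=1    - force ASPM on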
1835 */ 1836 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1837 { 1838 switch (amdgpu_aspm) { 1839 case -1: 1840 break; 1841 case 0: 1842 return false; 1843 case 1: 1844 return true; 1845 default: 1846 return false; 1847 } 1848 if (adev->flags & AMD_IS_APU) 1849 return false; 1850 if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK)) 1851 return false; 1852 return pcie_aspm_enabled(adev->pdev); 1853 } 1854 1855 /* if we get transitioned to only one device, take VGA back */ 1856 /** 1857 * amdgpu_device_vga_set_decode - enable/disable vga decode 1858 * 1859 * @pdev: PCI device pointer 1860 * @state: enable/disable vga decode 1861 * 1862 * Enable/disable vga decode (all asics). 1863 * Returns VGA resource flags. 1864 */ 1865 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1866 bool state) 1867 { 1868 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1869 1870 amdgpu_asic_set_vga_state(adev, state); 1871 if (state) 1872 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1873 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1874 else 1875 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1876 } 1877 1878 /** 1879 * amdgpu_device_check_block_size - validate the vm block size 1880 * 1881 * @adev: amdgpu_device pointer 1882 * 1883 * Validates the vm block size specified via module parameter. 1884 * The vm block size defines number of bits in page table versus page directory, 1885 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1886 * page table and the remaining bits are in the page directory. 1887 */ 1888 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1889 { 1890 /* defines number of bits in page table versus page directory, 1891 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1892 * page table and the remaining bits are in the page directory 1893 */ 1894 if (amdgpu_vm_block_size == -1) 1895 return; 1896 1897 if (amdgpu_vm_block_size < 9) { 1898 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1899 amdgpu_vm_block_size); 1900 amdgpu_vm_block_size = -1; 1901 } 1902 } 1903 1904 /** 1905 * amdgpu_device_check_vm_size - validate the vm size 1906 * 1907 * @adev: amdgpu_device pointer 1908 * 1909 * Validates the vm size in GB specified via module parameter. 1910 * The VM size is the size of the GPU virtual memory space in GB. 
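 *
 * A value below 1 GB is not treated as fatal; it simply reverts to automatic
 * sizing, e.g. booting with the (illustrative) setting amdgpu.vm_size=0 logs
 * a warning and behaves as if amdgpu.vm_size=-1 had been given.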
1911 */ 1912 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1913 { 1914 /* no need to check the default value */ 1915 if (amdgpu_vm_size == -1) 1916 return; 1917 1918 if (amdgpu_vm_size < 1) { 1919 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1920 amdgpu_vm_size); 1921 amdgpu_vm_size = -1; 1922 } 1923 } 1924 1925 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1926 { 1927 struct sysinfo si; 1928 bool is_os_64 = (sizeof(void *) == 8); 1929 uint64_t total_memory; 1930 uint64_t dram_size_seven_GB = 0x1B8000000; 1931 uint64_t dram_size_three_GB = 0xB8000000; 1932 1933 if (amdgpu_smu_memory_pool_size == 0) 1934 return; 1935 1936 if (!is_os_64) { 1937 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1938 goto def_value; 1939 } 1940 si_meminfo(&si); 1941 total_memory = (uint64_t)si.totalram * si.mem_unit; 1942 1943 if ((amdgpu_smu_memory_pool_size == 1) || 1944 (amdgpu_smu_memory_pool_size == 2)) { 1945 if (total_memory < dram_size_three_GB) 1946 goto def_value1; 1947 } else if ((amdgpu_smu_memory_pool_size == 4) || 1948 (amdgpu_smu_memory_pool_size == 8)) { 1949 if (total_memory < dram_size_seven_GB) 1950 goto def_value1; 1951 } else { 1952 DRM_WARN("Smu memory pool size not supported\n"); 1953 goto def_value; 1954 } 1955 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1956 1957 return; 1958 1959 def_value1: 1960 DRM_WARN("No enough system memory\n"); 1961 def_value: 1962 adev->pm.smu_prv_buffer_size = 0; 1963 } 1964 1965 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 1966 { 1967 if (!(adev->flags & AMD_IS_APU) || 1968 adev->asic_type < CHIP_RAVEN) 1969 return 0; 1970 1971 switch (adev->asic_type) { 1972 case CHIP_RAVEN: 1973 if (adev->pdev->device == 0x15dd) 1974 adev->apu_flags |= AMD_APU_IS_RAVEN; 1975 if (adev->pdev->device == 0x15d8) 1976 adev->apu_flags |= AMD_APU_IS_PICASSO; 1977 break; 1978 case CHIP_RENOIR: 1979 if ((adev->pdev->device == 0x1636) || 1980 (adev->pdev->device == 0x164c)) 1981 adev->apu_flags |= AMD_APU_IS_RENOIR; 1982 else 1983 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 1984 break; 1985 case CHIP_VANGOGH: 1986 adev->apu_flags |= AMD_APU_IS_VANGOGH; 1987 break; 1988 case CHIP_YELLOW_CARP: 1989 break; 1990 case CHIP_CYAN_SKILLFISH: 1991 if ((adev->pdev->device == 0x13FE) || 1992 (adev->pdev->device == 0x143F)) 1993 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 1994 break; 1995 default: 1996 break; 1997 } 1998 1999 return 0; 2000 } 2001 2002 /** 2003 * amdgpu_device_check_arguments - validate module params 2004 * 2005 * @adev: amdgpu_device pointer 2006 * 2007 * Validates certain module parameters and updates 2008 * the associated values used by the driver (all asics). 
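 * For example, amdgpu.sched_jobs is raised to at least 4 and rounded up to a power of two, and gart/gtt sizes below 32M are reset to their defaults.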
2009 */ 2010 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 2011 { 2012 int i; 2013 2014 if (amdgpu_sched_jobs < 4) { 2015 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 2016 amdgpu_sched_jobs); 2017 amdgpu_sched_jobs = 4; 2018 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 2019 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 2020 amdgpu_sched_jobs); 2021 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 2022 } 2023 2024 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 2025 /* gart size must be greater or equal to 32M */ 2026 dev_warn(adev->dev, "gart size (%d) too small\n", 2027 amdgpu_gart_size); 2028 amdgpu_gart_size = -1; 2029 } 2030 2031 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 2032 /* gtt size must be greater or equal to 32M */ 2033 dev_warn(adev->dev, "gtt size (%d) too small\n", 2034 amdgpu_gtt_size); 2035 amdgpu_gtt_size = -1; 2036 } 2037 2038 /* valid range is between 4 and 9 inclusive */ 2039 if (amdgpu_vm_fragment_size != -1 && 2040 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 2041 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 2042 amdgpu_vm_fragment_size = -1; 2043 } 2044 2045 if (amdgpu_sched_hw_submission < 2) { 2046 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 2047 amdgpu_sched_hw_submission); 2048 amdgpu_sched_hw_submission = 2; 2049 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 2050 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 2051 amdgpu_sched_hw_submission); 2052 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 2053 } 2054 2055 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 2056 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 2057 amdgpu_reset_method = -1; 2058 } 2059 2060 amdgpu_device_check_smu_prv_buffer_size(adev); 2061 2062 amdgpu_device_check_vm_size(adev); 2063 2064 amdgpu_device_check_block_size(adev); 2065 2066 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 2067 2068 for (i = 0; i < MAX_XCP; i++) 2069 adev->enforce_isolation[i] = !!enforce_isolation; 2070 2071 return 0; 2072 } 2073 2074 /** 2075 * amdgpu_switcheroo_set_state - set switcheroo state 2076 * 2077 * @pdev: pci dev pointer 2078 * @state: vga_switcheroo state 2079 * 2080 * Callback for the switcheroo driver. Suspends or resumes 2081 * the asics before or after it is powered up using ACPI methods. 
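 * On PX platforms the VGA_SWITCHEROO_OFF transition is ignored here; only the power-on path is serviced.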
2082 */ 2083 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 2084 enum vga_switcheroo_state state) 2085 { 2086 struct drm_device *dev = pci_get_drvdata(pdev); 2087 int r; 2088 2089 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 2090 return; 2091 2092 if (state == VGA_SWITCHEROO_ON) { 2093 pr_info("switched on\n"); 2094 /* don't suspend or resume card normally */ 2095 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2096 2097 pci_set_power_state(pdev, PCI_D0); 2098 amdgpu_device_load_pci_state(pdev); 2099 r = pci_enable_device(pdev); 2100 if (r) 2101 DRM_WARN("pci_enable_device failed (%d)\n", r); 2102 amdgpu_device_resume(dev, true); 2103 2104 dev->switch_power_state = DRM_SWITCH_POWER_ON; 2105 } else { 2106 pr_info("switched off\n"); 2107 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2108 amdgpu_device_prepare(dev); 2109 amdgpu_device_suspend(dev, true); 2110 amdgpu_device_cache_pci_state(pdev); 2111 /* Shut down the device */ 2112 pci_disable_device(pdev); 2113 pci_set_power_state(pdev, PCI_D3cold); 2114 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 2115 } 2116 } 2117 2118 /** 2119 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 2120 * 2121 * @pdev: pci dev pointer 2122 * 2123 * Callback for the switcheroo driver. Check of the switcheroo 2124 * state can be changed. 2125 * Returns true if the state can be changed, false if not. 2126 */ 2127 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 2128 { 2129 struct drm_device *dev = pci_get_drvdata(pdev); 2130 2131 /* 2132 * FIXME: open_count is protected by drm_global_mutex but that would lead to 2133 * locking inversion with the driver load path. And the access here is 2134 * completely racy anyway. So don't bother with locking for now. 2135 */ 2136 return atomic_read(&dev->open_count) == 0; 2137 } 2138 2139 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 2140 .set_gpu_state = amdgpu_switcheroo_set_state, 2141 .reprobe = NULL, 2142 .can_switch = amdgpu_switcheroo_can_switch, 2143 }; 2144 2145 /** 2146 * amdgpu_device_ip_set_clockgating_state - set the CG state 2147 * 2148 * @dev: amdgpu_device pointer 2149 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2150 * @state: clockgating state (gate or ungate) 2151 * 2152 * Sets the requested clockgating state for all instances of 2153 * the hardware IP specified. 2154 * Returns the error code from the last instance. 2155 */ 2156 int amdgpu_device_ip_set_clockgating_state(void *dev, 2157 enum amd_ip_block_type block_type, 2158 enum amd_clockgating_state state) 2159 { 2160 struct amdgpu_device *adev = dev; 2161 int i, r = 0; 2162 2163 for (i = 0; i < adev->num_ip_blocks; i++) { 2164 if (!adev->ip_blocks[i].status.valid) 2165 continue; 2166 if (adev->ip_blocks[i].version->type != block_type) 2167 continue; 2168 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 2169 continue; 2170 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 2171 &adev->ip_blocks[i], state); 2172 if (r) 2173 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 2174 adev->ip_blocks[i].version->funcs->name, r); 2175 } 2176 return r; 2177 } 2178 2179 /** 2180 * amdgpu_device_ip_set_powergating_state - set the PG state 2181 * 2182 * @dev: amdgpu_device pointer 2183 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2184 * @state: powergating state (gate or ungate) 2185 * 2186 * Sets the requested powergating state for all instances of 2187 * the hardware IP specified. 
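 * A typical (hypothetical) call would be amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCN, AMD_PG_STATE_GATE).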
2188 * Returns the error code from the last instance. 2189 */ 2190 int amdgpu_device_ip_set_powergating_state(void *dev, 2191 enum amd_ip_block_type block_type, 2192 enum amd_powergating_state state) 2193 { 2194 struct amdgpu_device *adev = dev; 2195 int i, r = 0; 2196 2197 for (i = 0; i < adev->num_ip_blocks; i++) { 2198 if (!adev->ip_blocks[i].status.valid) 2199 continue; 2200 if (adev->ip_blocks[i].version->type != block_type) 2201 continue; 2202 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 2203 continue; 2204 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 2205 &adev->ip_blocks[i], state); 2206 if (r) 2207 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 2208 adev->ip_blocks[i].version->funcs->name, r); 2209 } 2210 return r; 2211 } 2212 2213 /** 2214 * amdgpu_device_ip_get_clockgating_state - get the CG state 2215 * 2216 * @adev: amdgpu_device pointer 2217 * @flags: clockgating feature flags 2218 * 2219 * Walks the list of IPs on the device and updates the clockgating 2220 * flags for each IP. 2221 * Updates @flags with the feature flags for each hardware IP where 2222 * clockgating is enabled. 2223 */ 2224 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 2225 u64 *flags) 2226 { 2227 int i; 2228 2229 for (i = 0; i < adev->num_ip_blocks; i++) { 2230 if (!adev->ip_blocks[i].status.valid) 2231 continue; 2232 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 2233 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 2234 } 2235 } 2236 2237 /** 2238 * amdgpu_device_ip_wait_for_idle - wait for idle 2239 * 2240 * @adev: amdgpu_device pointer 2241 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2242 * 2243 * Waits for the requested hardware IP to be idle. 2244 * Returns 0 for success or a negative error code on failure. 2245 */ 2246 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 2247 enum amd_ip_block_type block_type) 2248 { 2249 int i, r; 2250 2251 for (i = 0; i < adev->num_ip_blocks; i++) { 2252 if (!adev->ip_blocks[i].status.valid) 2253 continue; 2254 if (adev->ip_blocks[i].version->type == block_type) { 2255 if (adev->ip_blocks[i].version->funcs->wait_for_idle) { 2256 r = adev->ip_blocks[i].version->funcs->wait_for_idle( 2257 &adev->ip_blocks[i]); 2258 if (r) 2259 return r; 2260 } 2261 break; 2262 } 2263 } 2264 return 0; 2265 2266 } 2267 2268 /** 2269 * amdgpu_device_ip_is_valid - is the hardware IP enabled 2270 * 2271 * @adev: amdgpu_device pointer 2272 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2273 * 2274 * Check if the hardware IP is enabled or not. 2275 * Returns true if the IP is enabled, false if not. 2276 */ 2277 bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev, 2278 enum amd_ip_block_type block_type) 2279 { 2280 int i; 2281 2282 for (i = 0; i < adev->num_ip_blocks; i++) { 2283 if (adev->ip_blocks[i].version->type == block_type) 2284 return adev->ip_blocks[i].status.valid; 2285 } 2286 return false; 2287 2288 } 2289 2290 /** 2291 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 2292 * 2293 * @adev: amdgpu_device pointer 2294 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 2295 * 2296 * Returns a pointer to the hardware IP block structure 2297 * if it exists for the asic, otherwise NULL.
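 * Example (as used later in this file): ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);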
2298 */ 2299 struct amdgpu_ip_block * 2300 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 2301 enum amd_ip_block_type type) 2302 { 2303 int i; 2304 2305 for (i = 0; i < adev->num_ip_blocks; i++) 2306 if (adev->ip_blocks[i].version->type == type) 2307 return &adev->ip_blocks[i]; 2308 2309 return NULL; 2310 } 2311 2312 /** 2313 * amdgpu_device_ip_block_version_cmp 2314 * 2315 * @adev: amdgpu_device pointer 2316 * @type: enum amd_ip_block_type 2317 * @major: major version 2318 * @minor: minor version 2319 * 2320 * Return 0 if the IP block version is equal to or greater than @major.@minor, 2321 * 1 if it is smaller or the ip_block doesn't exist. 2322 */ 2323 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 2324 enum amd_ip_block_type type, 2325 u32 major, u32 minor) 2326 { 2327 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 2328 2329 if (ip_block && ((ip_block->version->major > major) || 2330 ((ip_block->version->major == major) && 2331 (ip_block->version->minor >= minor)))) 2332 return 0; 2333 2334 return 1; 2335 } 2336 2337 /** 2338 * amdgpu_device_ip_block_add 2339 * 2340 * @adev: amdgpu_device pointer 2341 * @ip_block_version: pointer to the IP to add 2342 * 2343 * Adds the IP block driver information to the collection of IPs 2344 * on the asic. 2345 */ 2346 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 2347 const struct amdgpu_ip_block_version *ip_block_version) 2348 { 2349 if (!ip_block_version) 2350 return -EINVAL; 2351 2352 switch (ip_block_version->type) { 2353 case AMD_IP_BLOCK_TYPE_VCN: 2354 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2355 return 0; 2356 break; 2357 case AMD_IP_BLOCK_TYPE_JPEG: 2358 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2359 return 0; 2360 break; 2361 default: 2362 break; 2363 } 2364 2365 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks, 2366 ip_block_version->funcs->name); 2367 2368 adev->ip_blocks[adev->num_ip_blocks].adev = adev; 2369 2370 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2371 2372 return 0; 2373 } 2374 2375 /** 2376 * amdgpu_device_enable_virtual_display - enable virtual display feature 2377 * 2378 * @adev: amdgpu_device pointer 2379 * 2380 * Enables the virtual display feature if the user has enabled it via 2381 * the module parameter virtual_display. This feature provides virtual 2382 * display hardware on headless boards or in virtualized environments. 2383 * This function parses and validates the configuration string specified by 2384 * the user and configures the virtual display configuration (number of 2385 * virtual connectors, crtcs, etc.) specified.
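 * Example with an illustrative PCI address: amdgpu.virtual_display=0000:03:00.0,2 enables two virtual CRTCs on that device, and amdgpu.virtual_display=all enables it on every device; the CRTC count is clamped to the 1-6 range below.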
2386 */ 2387 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2388 { 2389 adev->enable_virtual_display = false; 2390 2391 if (amdgpu_virtual_display) { 2392 const char *pci_address_name = pci_name(adev->pdev); 2393 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2394 2395 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2396 pciaddstr_tmp = pciaddstr; 2397 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2398 pciaddname = strsep(&pciaddname_tmp, ","); 2399 if (!strcmp("all", pciaddname) 2400 || !strcmp(pci_address_name, pciaddname)) { 2401 long num_crtc; 2402 int res = -1; 2403 2404 adev->enable_virtual_display = true; 2405 2406 if (pciaddname_tmp) 2407 res = kstrtol(pciaddname_tmp, 10, 2408 &num_crtc); 2409 2410 if (!res) { 2411 if (num_crtc < 1) 2412 num_crtc = 1; 2413 if (num_crtc > 6) 2414 num_crtc = 6; 2415 adev->mode_info.num_crtc = num_crtc; 2416 } else { 2417 adev->mode_info.num_crtc = 1; 2418 } 2419 break; 2420 } 2421 } 2422 2423 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2424 amdgpu_virtual_display, pci_address_name, 2425 adev->enable_virtual_display, adev->mode_info.num_crtc); 2426 2427 kfree(pciaddstr); 2428 } 2429 } 2430 2431 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2432 { 2433 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2434 adev->mode_info.num_crtc = 1; 2435 adev->enable_virtual_display = true; 2436 DRM_INFO("virtual_display:%d, num_crtc:%d\n", 2437 adev->enable_virtual_display, adev->mode_info.num_crtc); 2438 } 2439 } 2440 2441 /** 2442 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2443 * 2444 * @adev: amdgpu_device pointer 2445 * 2446 * Parses the asic configuration parameters specified in the gpu info 2447 * firmware and makes them available to the driver for use in configuring 2448 * the asic. 2449 * Returns 0 on success, -EINVAL on failure. 2450 */ 2451 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2452 { 2453 const char *chip_name; 2454 int err; 2455 const struct gpu_info_firmware_header_v1_0 *hdr; 2456 2457 adev->firmware.gpu_info_fw = NULL; 2458 2459 if (adev->mman.discovery_bin) 2460 return 0; 2461 2462 switch (adev->asic_type) { 2463 default: 2464 return 0; 2465 case CHIP_VEGA10: 2466 chip_name = "vega10"; 2467 break; 2468 case CHIP_VEGA12: 2469 chip_name = "vega12"; 2470 break; 2471 case CHIP_RAVEN: 2472 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2473 chip_name = "raven2"; 2474 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2475 chip_name = "picasso"; 2476 else 2477 chip_name = "raven"; 2478 break; 2479 case CHIP_ARCTURUS: 2480 chip_name = "arcturus"; 2481 break; 2482 case CHIP_NAVI12: 2483 chip_name = "navi12"; 2484 break; 2485 } 2486 2487 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, 2488 AMDGPU_UCODE_OPTIONAL, 2489 "amdgpu/%s_gpu_info.bin", chip_name); 2490 if (err) { 2491 dev_err(adev->dev, 2492 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n", 2493 chip_name); 2494 goto out; 2495 } 2496 2497 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2498 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2499 2500 switch (hdr->version_major) { 2501 case 1: 2502 { 2503 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2504 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2505 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2506 2507 /* 2508 * Should be dropped when DAL no longer needs it. 
2509 */ 2510 if (adev->asic_type == CHIP_NAVI12) 2511 goto parse_soc_bounding_box; 2512 2513 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2514 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2515 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2516 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2517 adev->gfx.config.max_texture_channel_caches = 2518 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2519 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2520 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2521 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2522 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2523 adev->gfx.config.double_offchip_lds_buf = 2524 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2525 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2526 adev->gfx.cu_info.max_waves_per_simd = 2527 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2528 adev->gfx.cu_info.max_scratch_slots_per_cu = 2529 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2530 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2531 if (hdr->version_minor >= 1) { 2532 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2533 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2534 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2535 adev->gfx.config.num_sc_per_sh = 2536 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2537 adev->gfx.config.num_packer_per_sc = 2538 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2539 } 2540 2541 parse_soc_bounding_box: 2542 /* 2543 * soc bounding box info is not integrated into the discovery table, 2544 * so we always need to parse it from the gpu info firmware if needed. 2545 */ 2546 if (hdr->version_minor == 2) { 2547 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2548 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2549 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2550 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2551 } 2552 break; 2553 } 2554 default: 2555 dev_err(adev->dev, 2556 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2557 err = -EINVAL; 2558 goto out; 2559 } 2560 out: 2561 return err; 2562 } 2563 2564 /** 2565 * amdgpu_device_ip_early_init - run early init for hardware IPs 2566 * 2567 * @adev: amdgpu_device pointer 2568 * 2569 * Early initialization pass for hardware IPs. The hardware IPs that make 2570 * up each asic are discovered and each IP's early_init callback is run. This 2571 * is the first stage in initializing the asic. 2572 * Returns 0 on success, negative error code on failure.
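 * This pass also selects the per-family IP block list, reads the vbios after the COMMON block's early_init has set up the asic functions, and drops any blocks masked out via amdgpu.ip_block_mask.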
2573 */ 2574 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2575 { 2576 struct amdgpu_ip_block *ip_block; 2577 struct pci_dev *parent; 2578 int i, r; 2579 bool total; 2580 2581 amdgpu_device_enable_virtual_display(adev); 2582 2583 if (amdgpu_sriov_vf(adev)) { 2584 r = amdgpu_virt_request_full_gpu(adev, true); 2585 if (r) 2586 return r; 2587 } 2588 2589 switch (adev->asic_type) { 2590 #ifdef CONFIG_DRM_AMDGPU_SI 2591 case CHIP_VERDE: 2592 case CHIP_TAHITI: 2593 case CHIP_PITCAIRN: 2594 case CHIP_OLAND: 2595 case CHIP_HAINAN: 2596 adev->family = AMDGPU_FAMILY_SI; 2597 r = si_set_ip_blocks(adev); 2598 if (r) 2599 return r; 2600 break; 2601 #endif 2602 #ifdef CONFIG_DRM_AMDGPU_CIK 2603 case CHIP_BONAIRE: 2604 case CHIP_HAWAII: 2605 case CHIP_KAVERI: 2606 case CHIP_KABINI: 2607 case CHIP_MULLINS: 2608 if (adev->flags & AMD_IS_APU) 2609 adev->family = AMDGPU_FAMILY_KV; 2610 else 2611 adev->family = AMDGPU_FAMILY_CI; 2612 2613 r = cik_set_ip_blocks(adev); 2614 if (r) 2615 return r; 2616 break; 2617 #endif 2618 case CHIP_TOPAZ: 2619 case CHIP_TONGA: 2620 case CHIP_FIJI: 2621 case CHIP_POLARIS10: 2622 case CHIP_POLARIS11: 2623 case CHIP_POLARIS12: 2624 case CHIP_VEGAM: 2625 case CHIP_CARRIZO: 2626 case CHIP_STONEY: 2627 if (adev->flags & AMD_IS_APU) 2628 adev->family = AMDGPU_FAMILY_CZ; 2629 else 2630 adev->family = AMDGPU_FAMILY_VI; 2631 2632 r = vi_set_ip_blocks(adev); 2633 if (r) 2634 return r; 2635 break; 2636 default: 2637 r = amdgpu_discovery_set_ip_blocks(adev); 2638 if (r) 2639 return r; 2640 break; 2641 } 2642 2643 if (amdgpu_has_atpx() && 2644 (amdgpu_is_atpx_hybrid() || 2645 amdgpu_has_atpx_dgpu_power_cntl()) && 2646 ((adev->flags & AMD_IS_APU) == 0) && 2647 !dev_is_removable(&adev->pdev->dev)) 2648 adev->flags |= AMD_IS_PX; 2649 2650 if (!(adev->flags & AMD_IS_APU)) { 2651 parent = pcie_find_root_port(adev->pdev); 2652 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2653 } 2654 2655 2656 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2657 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2658 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2659 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2660 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2661 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2662 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2663 2664 total = true; 2665 for (i = 0; i < adev->num_ip_blocks; i++) { 2666 ip_block = &adev->ip_blocks[i]; 2667 2668 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2669 DRM_WARN("disabled ip block: %d <%s>\n", 2670 i, adev->ip_blocks[i].version->funcs->name); 2671 adev->ip_blocks[i].status.valid = false; 2672 } else if (ip_block->version->funcs->early_init) { 2673 r = ip_block->version->funcs->early_init(ip_block); 2674 if (r == -ENOENT) { 2675 adev->ip_blocks[i].status.valid = false; 2676 } else if (r) { 2677 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2678 adev->ip_blocks[i].version->funcs->name, r); 2679 total = false; 2680 } else { 2681 adev->ip_blocks[i].status.valid = true; 2682 } 2683 } else { 2684 adev->ip_blocks[i].status.valid = true; 2685 } 2686 /* get the vbios after the asic_funcs are set up */ 2687 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2688 r = amdgpu_device_parse_gpu_info_fw(adev); 2689 if (r) 2690 return r; 2691 2692 /* Read BIOS */ 2693 if (amdgpu_device_read_bios(adev)) { 2694 if (!amdgpu_get_bios(adev)) 2695 return -EINVAL; 2696 2697 r = amdgpu_atombios_init(adev); 2698 if (r) { 2699 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2700 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2701 return r; 2702 } 2703 } 2704 2705 /*get pf2vf msg info at it's earliest time*/ 2706 if (amdgpu_sriov_vf(adev)) 2707 amdgpu_virt_init_data_exchange(adev); 2708 2709 } 2710 } 2711 if (!total) 2712 return -ENODEV; 2713 2714 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 2715 if (ip_block->status.valid != false) 2716 amdgpu_amdkfd_device_probe(adev); 2717 2718 adev->cg_flags &= amdgpu_cg_mask; 2719 adev->pg_flags &= amdgpu_pg_mask; 2720 2721 return 0; 2722 } 2723 2724 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2725 { 2726 int i, r; 2727 2728 for (i = 0; i < adev->num_ip_blocks; i++) { 2729 if (!adev->ip_blocks[i].status.sw) 2730 continue; 2731 if (adev->ip_blocks[i].status.hw) 2732 continue; 2733 if (!amdgpu_ip_member_of_hwini( 2734 adev, adev->ip_blocks[i].version->type)) 2735 continue; 2736 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2737 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2738 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2739 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2740 if (r) { 2741 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2742 adev->ip_blocks[i].version->funcs->name, r); 2743 return r; 2744 } 2745 adev->ip_blocks[i].status.hw = true; 2746 } 2747 } 2748 2749 return 0; 2750 } 2751 2752 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2753 { 2754 int i, r; 2755 2756 for (i = 0; i < adev->num_ip_blocks; i++) { 2757 if (!adev->ip_blocks[i].status.sw) 2758 continue; 2759 if (adev->ip_blocks[i].status.hw) 2760 continue; 2761 if (!amdgpu_ip_member_of_hwini( 2762 adev, adev->ip_blocks[i].version->type)) 2763 continue; 2764 r = 
adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2765 if (r) { 2766 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2767 adev->ip_blocks[i].version->funcs->name, r); 2768 return r; 2769 } 2770 adev->ip_blocks[i].status.hw = true; 2771 } 2772 2773 return 0; 2774 } 2775 2776 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2777 { 2778 int r = 0; 2779 int i; 2780 uint32_t smu_version; 2781 2782 if (adev->asic_type >= CHIP_VEGA10) { 2783 for (i = 0; i < adev->num_ip_blocks; i++) { 2784 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2785 continue; 2786 2787 if (!amdgpu_ip_member_of_hwini(adev, 2788 AMD_IP_BLOCK_TYPE_PSP)) 2789 break; 2790 2791 if (!adev->ip_blocks[i].status.sw) 2792 continue; 2793 2794 /* no need to do the fw loading again if already done*/ 2795 if (adev->ip_blocks[i].status.hw == true) 2796 break; 2797 2798 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2799 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 2800 if (r) 2801 return r; 2802 } else { 2803 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2804 if (r) { 2805 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2806 adev->ip_blocks[i].version->funcs->name, r); 2807 return r; 2808 } 2809 adev->ip_blocks[i].status.hw = true; 2810 } 2811 break; 2812 } 2813 } 2814 2815 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2816 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2817 2818 return r; 2819 } 2820 2821 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2822 { 2823 long timeout; 2824 int r, i; 2825 2826 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2827 struct amdgpu_ring *ring = adev->rings[i]; 2828 2829 /* No need to setup the GPU scheduler for rings that don't need it */ 2830 if (!ring || ring->no_scheduler) 2831 continue; 2832 2833 switch (ring->funcs->type) { 2834 case AMDGPU_RING_TYPE_GFX: 2835 timeout = adev->gfx_timeout; 2836 break; 2837 case AMDGPU_RING_TYPE_COMPUTE: 2838 timeout = adev->compute_timeout; 2839 break; 2840 case AMDGPU_RING_TYPE_SDMA: 2841 timeout = adev->sdma_timeout; 2842 break; 2843 default: 2844 timeout = adev->video_timeout; 2845 break; 2846 } 2847 2848 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL, 2849 DRM_SCHED_PRIORITY_COUNT, 2850 ring->num_hw_submission, 0, 2851 timeout, adev->reset_domain->wq, 2852 ring->sched_score, ring->name, 2853 adev->dev); 2854 if (r) { 2855 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2856 ring->name); 2857 return r; 2858 } 2859 r = amdgpu_uvd_entity_init(adev, ring); 2860 if (r) { 2861 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n", 2862 ring->name); 2863 return r; 2864 } 2865 r = amdgpu_vce_entity_init(adev, ring); 2866 if (r) { 2867 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n", 2868 ring->name); 2869 return r; 2870 } 2871 } 2872 2873 amdgpu_xcp_update_partition_sched_list(adev); 2874 2875 return 0; 2876 } 2877 2878 2879 /** 2880 * amdgpu_device_ip_init - run init for hardware IPs 2881 * 2882 * @adev: amdgpu_device pointer 2883 * 2884 * Main initialization pass for hardware IPs. The list of all the hardware 2885 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2886 * are run. sw_init initializes the software state associated with each IP 2887 * and hw_init initializes the hardware associated with each IP. 2888 * Returns 0 on success, negative error code on failure. 
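 * COMMON and GMC are brought up early in this pass so that VRAM-backed structures (memory scratch, writeback, CSA, seq64) can be allocated before the remaining blocks are started in two hw_init phases.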
2889 */ 2890 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2891 { 2892 bool init_badpage; 2893 int i, r; 2894 2895 r = amdgpu_ras_init(adev); 2896 if (r) 2897 return r; 2898 2899 for (i = 0; i < adev->num_ip_blocks; i++) { 2900 if (!adev->ip_blocks[i].status.valid) 2901 continue; 2902 if (adev->ip_blocks[i].version->funcs->sw_init) { 2903 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]); 2904 if (r) { 2905 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2906 adev->ip_blocks[i].version->funcs->name, r); 2907 goto init_failed; 2908 } 2909 } 2910 adev->ip_blocks[i].status.sw = true; 2911 2912 if (!amdgpu_ip_member_of_hwini( 2913 adev, adev->ip_blocks[i].version->type)) 2914 continue; 2915 2916 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2917 /* need to do common hw init early so everything is set up for gmc */ 2918 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2919 if (r) { 2920 DRM_ERROR("hw_init %d failed %d\n", i, r); 2921 goto init_failed; 2922 } 2923 adev->ip_blocks[i].status.hw = true; 2924 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2925 /* need to do gmc hw init early so we can allocate gpu mem */ 2926 /* Try to reserve bad pages early */ 2927 if (amdgpu_sriov_vf(adev)) 2928 amdgpu_virt_exchange_data(adev); 2929 2930 r = amdgpu_device_mem_scratch_init(adev); 2931 if (r) { 2932 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2933 goto init_failed; 2934 } 2935 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2936 if (r) { 2937 DRM_ERROR("hw_init %d failed %d\n", i, r); 2938 goto init_failed; 2939 } 2940 r = amdgpu_device_wb_init(adev); 2941 if (r) { 2942 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2943 goto init_failed; 2944 } 2945 adev->ip_blocks[i].status.hw = true; 2946 2947 /* right after GMC hw init, we create CSA */ 2948 if (adev->gfx.mcbp) { 2949 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2950 AMDGPU_GEM_DOMAIN_VRAM | 2951 AMDGPU_GEM_DOMAIN_GTT, 2952 AMDGPU_CSA_SIZE); 2953 if (r) { 2954 DRM_ERROR("allocate CSA failed %d\n", r); 2955 goto init_failed; 2956 } 2957 } 2958 2959 r = amdgpu_seq64_init(adev); 2960 if (r) { 2961 DRM_ERROR("allocate seq64 failed %d\n", r); 2962 goto init_failed; 2963 } 2964 } 2965 } 2966 2967 if (amdgpu_sriov_vf(adev)) 2968 amdgpu_virt_init_data_exchange(adev); 2969 2970 r = amdgpu_ib_pool_init(adev); 2971 if (r) { 2972 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2973 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2974 goto init_failed; 2975 } 2976 2977 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2978 if (r) 2979 goto init_failed; 2980 2981 r = amdgpu_device_ip_hw_init_phase1(adev); 2982 if (r) 2983 goto init_failed; 2984 2985 r = amdgpu_device_fw_loading(adev); 2986 if (r) 2987 goto init_failed; 2988 2989 r = amdgpu_device_ip_hw_init_phase2(adev); 2990 if (r) 2991 goto init_failed; 2992 2993 /* 2994 * retired pages will be loaded from eeprom and reserved here, 2995 * it should be called after amdgpu_device_ip_hw_init_phase2 since 2996 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 2997 * for I2C communication which only true at this point. 2998 * 2999 * amdgpu_ras_recovery_init may fail, but the upper only cares the 3000 * failure from bad gpu situation and stop amdgpu init process 3001 * accordingly. 
For other failure cases it will still release all 3002 * the resources and print an error message, rather than returning a 3003 * negative value to the upper level. 3004 * 3005 * Note: theoretically, this should be called before all vram allocations 3006 * to keep retired pages from being reused 3007 */ 3008 init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 3009 r = amdgpu_ras_recovery_init(adev, init_badpage); 3010 if (r) 3011 goto init_failed; 3012 3013 /** 3014 * In case of XGMI grab extra reference for reset domain for this device 3015 */ 3016 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3017 if (amdgpu_xgmi_add_device(adev) == 0) { 3018 if (!amdgpu_sriov_vf(adev)) { 3019 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3020 3021 if (WARN_ON(!hive)) { 3022 r = -ENOENT; 3023 goto init_failed; 3024 } 3025 3026 if (!hive->reset_domain || 3027 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 3028 r = -ENOENT; 3029 amdgpu_put_xgmi_hive(hive); 3030 goto init_failed; 3031 } 3032 3033 /* Drop the early temporary reset domain we created for device */ 3034 amdgpu_reset_put_reset_domain(adev->reset_domain); 3035 adev->reset_domain = hive->reset_domain; 3036 amdgpu_put_xgmi_hive(hive); 3037 } 3038 } 3039 } 3040 3041 r = amdgpu_device_init_schedulers(adev); 3042 if (r) 3043 goto init_failed; 3044 3045 if (adev->mman.buffer_funcs_ring->sched.ready) 3046 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3047 3048 /* Don't init kfd if the whole hive needs to be reset during init */ 3049 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 3050 kgd2kfd_init_zone_device(adev); 3051 amdgpu_amdkfd_device_init(adev); 3052 } 3053 3054 amdgpu_fru_get_product_info(adev); 3055 3056 init_failed: 3057 3058 return r; 3059 } 3060 3061 /** 3062 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 3063 * 3064 * @adev: amdgpu_device pointer 3065 * 3066 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 3067 * this function before a GPU reset. If the value is retained after a 3068 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 3069 */ 3070 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 3071 { 3072 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 3073 } 3074 3075 /** 3076 * amdgpu_device_check_vram_lost - check if vram is valid 3077 * 3078 * @adev: amdgpu_device pointer 3079 * 3080 * Checks the reset magic value written to the gart pointer in VRAM. 3081 * The driver calls this after a GPU reset to see if the contents of 3082 * VRAM are lost or not. 3083 * returns true if vram is lost, false if not. 3084 */ 3085 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 3086 { 3087 if (memcmp(adev->gart.ptr, adev->reset_magic, 3088 AMDGPU_RESET_MAGIC_NUM)) 3089 return true; 3090 3091 if (!amdgpu_in_reset(adev)) 3092 return false; 3093 3094 /* 3095 * For all ASICs with baco/mode1 reset, the VRAM is 3096 * always assumed to be lost. 3097 */ 3098 switch (amdgpu_asic_reset_method(adev)) { 3099 case AMD_RESET_METHOD_BACO: 3100 case AMD_RESET_METHOD_MODE1: 3101 return true; 3102 default: 3103 return false; 3104 } 3105 } 3106 3107 /** 3108 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 3109 * 3110 * @adev: amdgpu_device pointer 3111 * @state: clockgating state (gate or ungate) 3112 * 3113 * The list of all the hardware IPs that make up the asic is walked and the 3114 * set_clockgating_state callbacks are run.
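 * The walk direction depends on @state: gating walks the IP list front to back, ungating walks it in reverse.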
3115 * Late initialization pass enabling clockgating for hardware IPs. 3116 * Fini or suspend, pass disabling clockgating for hardware IPs. 3117 * Returns 0 on success, negative error code on failure. 3118 */ 3119 3120 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 3121 enum amd_clockgating_state state) 3122 { 3123 int i, j, r; 3124 3125 if (amdgpu_emu_mode == 1) 3126 return 0; 3127 3128 for (j = 0; j < adev->num_ip_blocks; j++) { 3129 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3130 if (!adev->ip_blocks[i].status.late_initialized) 3131 continue; 3132 /* skip CG for GFX, SDMA on S0ix */ 3133 if (adev->in_s0ix && 3134 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3135 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3136 continue; 3137 /* skip CG for VCE/UVD, it's handled specially */ 3138 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3139 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3140 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3141 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3142 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 3143 /* enable clockgating to save power */ 3144 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i], 3145 state); 3146 if (r) { 3147 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 3148 adev->ip_blocks[i].version->funcs->name, r); 3149 return r; 3150 } 3151 } 3152 } 3153 3154 return 0; 3155 } 3156 3157 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 3158 enum amd_powergating_state state) 3159 { 3160 int i, j, r; 3161 3162 if (amdgpu_emu_mode == 1) 3163 return 0; 3164 3165 for (j = 0; j < adev->num_ip_blocks; j++) { 3166 i = state == AMD_PG_STATE_GATE ? 
j : adev->num_ip_blocks - j - 1; 3167 if (!adev->ip_blocks[i].status.late_initialized) 3168 continue; 3169 /* skip PG for GFX, SDMA on S0ix */ 3170 if (adev->in_s0ix && 3171 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3172 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3173 continue; 3174 /* skip PG for VCE/UVD, it's handled specially */ 3175 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3176 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3177 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3178 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3179 adev->ip_blocks[i].version->funcs->set_powergating_state) { 3180 /* enable powergating to save power */ 3181 r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i], 3182 state); 3183 if (r) { 3184 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 3185 adev->ip_blocks[i].version->funcs->name, r); 3186 return r; 3187 } 3188 } 3189 } 3190 return 0; 3191 } 3192 3193 static int amdgpu_device_enable_mgpu_fan_boost(void) 3194 { 3195 struct amdgpu_gpu_instance *gpu_ins; 3196 struct amdgpu_device *adev; 3197 int i, ret = 0; 3198 3199 mutex_lock(&mgpu_info.mutex); 3200 3201 /* 3202 * MGPU fan boost feature should be enabled 3203 * only when there are two or more dGPUs in 3204 * the system 3205 */ 3206 if (mgpu_info.num_dgpu < 2) 3207 goto out; 3208 3209 for (i = 0; i < mgpu_info.num_dgpu; i++) { 3210 gpu_ins = &(mgpu_info.gpu_ins[i]); 3211 adev = gpu_ins->adev; 3212 if (!(adev->flags & AMD_IS_APU) && 3213 !gpu_ins->mgpu_fan_enabled) { 3214 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 3215 if (ret) 3216 break; 3217 3218 gpu_ins->mgpu_fan_enabled = 1; 3219 } 3220 } 3221 3222 out: 3223 mutex_unlock(&mgpu_info.mutex); 3224 3225 return ret; 3226 } 3227 3228 /** 3229 * amdgpu_device_ip_late_init - run late init for hardware IPs 3230 * 3231 * @adev: amdgpu_device pointer 3232 * 3233 * Late initialization pass for hardware IPs. The list of all the hardware 3234 * IPs that make up the asic is walked and the late_init callbacks are run. 3235 * late_init covers any special initialization that an IP requires 3236 * after all of them have been initialized or something that needs to happen 3237 * late in the init process. 3238 * Returns 0 on success, negative error code on failure.
3239 */ 3240 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 3241 { 3242 struct amdgpu_gpu_instance *gpu_instance; 3243 int i = 0, r; 3244 3245 for (i = 0; i < adev->num_ip_blocks; i++) { 3246 if (!adev->ip_blocks[i].status.hw) 3247 continue; 3248 if (adev->ip_blocks[i].version->funcs->late_init) { 3249 r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]); 3250 if (r) { 3251 DRM_ERROR("late_init of IP block <%s> failed %d\n", 3252 adev->ip_blocks[i].version->funcs->name, r); 3253 return r; 3254 } 3255 } 3256 adev->ip_blocks[i].status.late_initialized = true; 3257 } 3258 3259 r = amdgpu_ras_late_init(adev); 3260 if (r) { 3261 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 3262 return r; 3263 } 3264 3265 if (!amdgpu_reset_in_recovery(adev)) 3266 amdgpu_ras_set_error_query_ready(adev, true); 3267 3268 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3269 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3270 3271 amdgpu_device_fill_reset_magic(adev); 3272 3273 r = amdgpu_device_enable_mgpu_fan_boost(); 3274 if (r) 3275 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 3276 3277 /* For passthrough configuration on arcturus and aldebaran, enable special SBR handling */ 3278 if (amdgpu_passthrough(adev) && 3279 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 3280 adev->asic_type == CHIP_ALDEBARAN)) 3281 amdgpu_dpm_handle_passthrough_sbr(adev, true); 3282 3283 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3284 mutex_lock(&mgpu_info.mutex); 3285 3286 /* 3287 * Reset the device p-state to low, as it was booted at high. 3288 * 3289 * This should be performed only after all devices from the same 3290 * hive get initialized. 3291 * 3292 * However, it's not known in advance how many devices are in the 3293 * hive, as they are counted one by one during device initialization. 3294 * 3295 * So we wait for all XGMI interlinked devices to be initialized. 3296 * This may bring some delays as those devices may come from 3297 * different hives. But that should be OK.
3298 */ 3299 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3300 for (i = 0; i < mgpu_info.num_gpu; i++) { 3301 gpu_instance = &(mgpu_info.gpu_ins[i]); 3302 if (gpu_instance->adev->flags & AMD_IS_APU) 3303 continue; 3304 3305 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3306 AMDGPU_XGMI_PSTATE_MIN); 3307 if (r) { 3308 DRM_ERROR("pstate setting failed (%d).\n", r); 3309 break; 3310 } 3311 } 3312 } 3313 3314 mutex_unlock(&mgpu_info.mutex); 3315 } 3316 3317 return 0; 3318 } 3319 3320 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block) 3321 { 3322 int r; 3323 3324 if (!ip_block->version->funcs->hw_fini) { 3325 DRM_ERROR("hw_fini of IP block <%s> not defined\n", 3326 ip_block->version->funcs->name); 3327 } else { 3328 r = ip_block->version->funcs->hw_fini(ip_block); 3329 /* XXX handle errors */ 3330 if (r) { 3331 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3332 ip_block->version->funcs->name, r); 3333 } 3334 } 3335 3336 ip_block->status.hw = false; 3337 } 3338 3339 /** 3340 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3341 * 3342 * @adev: amdgpu_device pointer 3343 * 3344 * For ASICs need to disable SMC first 3345 */ 3346 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3347 { 3348 int i; 3349 3350 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3351 return; 3352 3353 for (i = 0; i < adev->num_ip_blocks; i++) { 3354 if (!adev->ip_blocks[i].status.hw) 3355 continue; 3356 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3357 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3358 break; 3359 } 3360 } 3361 } 3362 3363 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3364 { 3365 int i, r; 3366 3367 for (i = 0; i < adev->num_ip_blocks; i++) { 3368 if (!adev->ip_blocks[i].version->funcs->early_fini) 3369 continue; 3370 3371 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]); 3372 if (r) { 3373 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 3374 adev->ip_blocks[i].version->funcs->name, r); 3375 } 3376 } 3377 3378 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3379 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3380 3381 amdgpu_amdkfd_suspend(adev, false); 3382 3383 /* Workaround for ASICs need to disable SMC first */ 3384 amdgpu_device_smu_fini_early(adev); 3385 3386 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3387 if (!adev->ip_blocks[i].status.hw) 3388 continue; 3389 3390 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3391 } 3392 3393 if (amdgpu_sriov_vf(adev)) { 3394 if (amdgpu_virt_release_full_gpu(adev, false)) 3395 DRM_ERROR("failed to release exclusive mode on fini\n"); 3396 } 3397 3398 return 0; 3399 } 3400 3401 /** 3402 * amdgpu_device_ip_fini - run fini for hardware IPs 3403 * 3404 * @adev: amdgpu_device pointer 3405 * 3406 * Main teardown pass for hardware IPs. The list of all the hardware 3407 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3408 * are run. hw_fini tears down the hardware associated with each IP 3409 * and sw_fini tears down any software state associated with each IP. 3410 * Returns 0 on success, negative error code on failure. 
3411 */ 3412 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3413 { 3414 int i, r; 3415 3416 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3417 amdgpu_virt_release_ras_err_handler_data(adev); 3418 3419 if (adev->gmc.xgmi.num_physical_nodes > 1) 3420 amdgpu_xgmi_remove_device(adev); 3421 3422 amdgpu_amdkfd_device_fini_sw(adev); 3423 3424 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3425 if (!adev->ip_blocks[i].status.sw) 3426 continue; 3427 3428 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3429 amdgpu_ucode_free_bo(adev); 3430 amdgpu_free_static_csa(&adev->virt.csa_obj); 3431 amdgpu_device_wb_fini(adev); 3432 amdgpu_device_mem_scratch_fini(adev); 3433 amdgpu_ib_pool_fini(adev); 3434 amdgpu_seq64_fini(adev); 3435 } 3436 if (adev->ip_blocks[i].version->funcs->sw_fini) { 3437 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]); 3438 /* XXX handle errors */ 3439 if (r) { 3440 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 3441 adev->ip_blocks[i].version->funcs->name, r); 3442 } 3443 } 3444 adev->ip_blocks[i].status.sw = false; 3445 adev->ip_blocks[i].status.valid = false; 3446 } 3447 3448 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3449 if (!adev->ip_blocks[i].status.late_initialized) 3450 continue; 3451 if (adev->ip_blocks[i].version->funcs->late_fini) 3452 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]); 3453 adev->ip_blocks[i].status.late_initialized = false; 3454 } 3455 3456 amdgpu_ras_fini(adev); 3457 3458 return 0; 3459 } 3460 3461 /** 3462 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3463 * 3464 * @work: work_struct. 3465 */ 3466 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3467 { 3468 struct amdgpu_device *adev = 3469 container_of(work, struct amdgpu_device, delayed_init_work.work); 3470 int r; 3471 3472 r = amdgpu_ib_ring_tests(adev); 3473 if (r) 3474 DRM_ERROR("ib ring test failed (%d).\n", r); 3475 } 3476 3477 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3478 { 3479 struct amdgpu_device *adev = 3480 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3481 3482 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3483 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3484 3485 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true, 0)) 3486 adev->gfx.gfx_off_state = true; 3487 } 3488 3489 /** 3490 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3491 * 3492 * @adev: amdgpu_device pointer 3493 * 3494 * Main suspend function for hardware IPs. The list of all the hardware 3495 * IPs that make up the asic is walked, clockgating is disabled and the 3496 * suspend callbacks are run. suspend puts the hardware and software state 3497 * in each IP into a state suitable for suspend. 3498 * Returns 0 on success, negative error code on failure. 3499 */ 3500 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3501 { 3502 int i, r; 3503 3504 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3505 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3506 3507 /* 3508 * Per PMFW team's suggestion, driver needs to handle gfxoff 3509 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3510 * scenario. Add the missing df cstate disablement here. 
3511 */ 3512 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3513 dev_warn(adev->dev, "Failed to disallow df cstate"); 3514 3515 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3516 if (!adev->ip_blocks[i].status.valid) 3517 continue; 3518 3519 /* displays are handled separately */ 3520 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3521 continue; 3522 3523 /* XXX handle errors */ 3524 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3525 if (r) 3526 return r; 3527 } 3528 3529 return 0; 3530 } 3531 3532 /** 3533 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3534 * 3535 * @adev: amdgpu_device pointer 3536 * 3537 * Main suspend function for hardware IPs. The list of all the hardware 3538 * IPs that make up the asic is walked, clockgating is disabled and the 3539 * suspend callbacks are run. suspend puts the hardware and software state 3540 * in each IP into a state suitable for suspend. 3541 * Returns 0 on success, negative error code on failure. 3542 */ 3543 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3544 { 3545 int i, r; 3546 3547 if (adev->in_s0ix) 3548 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3549 3550 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3551 if (!adev->ip_blocks[i].status.valid) 3552 continue; 3553 /* displays are handled in phase1 */ 3554 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3555 continue; 3556 /* PSP lost connection when err_event_athub occurs */ 3557 if (amdgpu_ras_intr_triggered() && 3558 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3559 adev->ip_blocks[i].status.hw = false; 3560 continue; 3561 } 3562 3563 /* skip unnecessary suspend if we do not initialize them yet */ 3564 if (!amdgpu_ip_member_of_hwini( 3565 adev, adev->ip_blocks[i].version->type)) 3566 continue; 3567 3568 /* skip suspend of gfx/mes and psp for S0ix 3569 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3570 * like at runtime. PSP is also part of the always on hardware 3571 * so no need to suspend it. 3572 */ 3573 if (adev->in_s0ix && 3574 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3575 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3576 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3577 continue; 3578 3579 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3580 if (adev->in_s0ix && 3581 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3582 IP_VERSION(5, 0, 0)) && 3583 (adev->ip_blocks[i].version->type == 3584 AMD_IP_BLOCK_TYPE_SDMA)) 3585 continue; 3586 3587 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3588 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3589 * from this location and RLC Autoload automatically also gets loaded 3590 * from here based on PMFW -> PSP message during re-init sequence. 3591 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3592 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3593 */ 3594 if (amdgpu_in_reset(adev) && 3595 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3596 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3597 continue; 3598 3599 /* XXX handle errors */ 3600 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3601 adev->ip_blocks[i].status.hw = false; 3602 3603 /* handle putting the SMC in the appropriate state */ 3604 if (!amdgpu_sriov_vf(adev)) { 3605 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3606 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3607 if (r) { 3608 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3609 adev->mp1_state, r); 3610 return r; 3611 } 3612 } 3613 } 3614 } 3615 3616 return 0; 3617 } 3618 3619 /** 3620 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3621 * 3622 * @adev: amdgpu_device pointer 3623 * 3624 * Main suspend function for hardware IPs. The list of all the hardware 3625 * IPs that make up the asic is walked, clockgating is disabled and the 3626 * suspend callbacks are run. suspend puts the hardware and software state 3627 * in each IP into a state suitable for suspend. 3628 * Returns 0 on success, negative error code on failure. 3629 */ 3630 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3631 { 3632 int r; 3633 3634 if (amdgpu_sriov_vf(adev)) { 3635 amdgpu_virt_fini_data_exchange(adev); 3636 amdgpu_virt_request_full_gpu(adev, false); 3637 } 3638 3639 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3640 3641 r = amdgpu_device_ip_suspend_phase1(adev); 3642 if (r) 3643 return r; 3644 r = amdgpu_device_ip_suspend_phase2(adev); 3645 3646 if (amdgpu_sriov_vf(adev)) 3647 amdgpu_virt_release_full_gpu(adev, false); 3648 3649 return r; 3650 } 3651 3652 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3653 { 3654 int i, r; 3655 3656 static enum amd_ip_block_type ip_order[] = { 3657 AMD_IP_BLOCK_TYPE_COMMON, 3658 AMD_IP_BLOCK_TYPE_GMC, 3659 AMD_IP_BLOCK_TYPE_PSP, 3660 AMD_IP_BLOCK_TYPE_IH, 3661 }; 3662 3663 for (i = 0; i < adev->num_ip_blocks; i++) { 3664 int j; 3665 struct amdgpu_ip_block *block; 3666 3667 block = &adev->ip_blocks[i]; 3668 block->status.hw = false; 3669 3670 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3671 3672 if (block->version->type != ip_order[j] || 3673 !block->status.valid) 3674 continue; 3675 3676 r = block->version->funcs->hw_init(&adev->ip_blocks[i]); 3677 if (r) { 3678 dev_err(adev->dev, "RE-INIT-early: %s failed\n", 3679 block->version->funcs->name); 3680 return r; 3681 } 3682 block->status.hw = true; 3683 } 3684 } 3685 3686 return 0; 3687 } 3688 3689 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3690 { 3691 struct amdgpu_ip_block *block; 3692 int i, r = 0; 3693 3694 static enum amd_ip_block_type ip_order[] = { 3695 AMD_IP_BLOCK_TYPE_SMC, 3696 AMD_IP_BLOCK_TYPE_DCE, 3697 AMD_IP_BLOCK_TYPE_GFX, 3698 AMD_IP_BLOCK_TYPE_SDMA, 3699 AMD_IP_BLOCK_TYPE_MES, 3700 AMD_IP_BLOCK_TYPE_UVD, 3701 AMD_IP_BLOCK_TYPE_VCE, 3702 AMD_IP_BLOCK_TYPE_VCN, 3703 AMD_IP_BLOCK_TYPE_JPEG 3704 }; 3705 3706 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3707 block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]); 3708 3709 if (!block) 3710 continue; 3711 3712 if (block->status.valid && !block->status.hw) { 3713 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) { 3714 r = amdgpu_ip_block_resume(block); 3715 } else { 3716 r = block->version->funcs->hw_init(block); 3717 } 3718 3719 if (r) { 3720 dev_err(adev->dev, "RE-INIT-late: %s failed\n", 3721 block->version->funcs->name); 3722 break; 3723 } 3724 
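/* re-init succeeded for this block, mark its hardware as up again */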
block->status.hw = true; 3725 } 3726 } 3727 3728 return r; 3729 } 3730 3731 /** 3732 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3733 * 3734 * @adev: amdgpu_device pointer 3735 * 3736 * First resume function for hardware IPs. The list of all the hardware 3737 * IPs that make up the asic is walked and the resume callbacks are run for 3738 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3739 * after a suspend and updates the software state as necessary. This 3740 * function is also used for restoring the GPU after a GPU reset. 3741 * Returns 0 on success, negative error code on failure. 3742 */ 3743 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3744 { 3745 int i, r; 3746 3747 for (i = 0; i < adev->num_ip_blocks; i++) { 3748 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3749 continue; 3750 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3751 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3752 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3753 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3754 3755 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3756 if (r) 3757 return r; 3758 } 3759 } 3760 3761 return 0; 3762 } 3763 3764 /** 3765 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3766 * 3767 * @adev: amdgpu_device pointer 3768 * 3769 * Second resume function for hardware IPs. The list of all the hardware 3770 * IPs that make up the asic is walked and the resume callbacks are run for 3771 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3772 * functional state after a suspend and updates the software state as 3773 * necessary. This function is also used for restoring the GPU after a GPU 3774 * reset. 3775 * Returns 0 on success, negative error code on failure. 3776 */ 3777 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3778 { 3779 int i, r; 3780 3781 for (i = 0; i < adev->num_ip_blocks; i++) { 3782 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3783 continue; 3784 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3785 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3786 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3787 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE || 3788 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3789 continue; 3790 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3791 if (r) 3792 return r; 3793 } 3794 3795 return 0; 3796 } 3797 3798 /** 3799 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs 3800 * 3801 * @adev: amdgpu_device pointer 3802 * 3803 * Third resume function for hardware IPs. The list of all the hardware 3804 * IPs that make up the asic is walked and the resume callbacks are run for 3805 * all DCE. resume puts the hardware into a functional state after a suspend 3806 * and updates the software state as necessary. This function is also used 3807 * for restoring the GPU after a GPU reset. 3808 * 3809 * Returns 0 on success, negative error code on failure. 
 */
static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) {
			r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
			if (r)
				return r;
		}
	}

	return 0;
}

/**
 * amdgpu_device_ip_resume - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main resume function for hardware IPs. The hardware IPs are split into
 * several resume phases because they are also used when recovering from a
 * GPU reset and some additional steps need to be taken between them. In
 * this case (S3/S4) they are run sequentially.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
{
	int r;

	r = amdgpu_device_ip_resume_phase1(adev);
	if (r)
		return r;

	r = amdgpu_device_fw_loading(adev);
	if (r)
		return r;

	r = amdgpu_device_ip_resume_phase2(adev);

	if (adev->mman.buffer_funcs_ring->sched.ready)
		amdgpu_ttm_set_buffer_funcs_status(adev, true);

	if (r)
		return r;

	amdgpu_fence_driver_hw_init(adev);

	r = amdgpu_device_ip_resume_phase3(adev);

	return r;
}

/**
 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
 *
 * @adev: amdgpu_device pointer
 *
 * Query the VBIOS data tables to determine if the board supports SR-IOV.
 */
static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
{
	if (amdgpu_sriov_vf(adev)) {
		if (adev->is_atom_fw) {
			if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
		} else {
			if (amdgpu_atombios_has_gpu_virtualization_table(adev))
				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
		}

		if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
	}
}

/**
 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
 *
 * @asic_type: AMD asic type
 *
 * Check if there is DC (new modesetting infrastructure) support for an asic.
 * Returns true if DC has support, false if not.
 */
bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
{
	switch (asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#if defined(CONFIG_DRM_AMD_DC)
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
		/*
		 * We have systems in the wild with these ASICs that require
		 * LVDS and VGA support which is not supported with DC.
		 *
		 * Fallback to the non-DC driver here by default so as not to
		 * cause regressions.
		 */
#if defined(CONFIG_DRM_AMD_DC_SI)
		return amdgpu_dc > 0;
#else
		return false;
#endif
	case CHIP_BONAIRE:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
		/*
		 * We have systems in the wild with these ASICs that require
		 * VGA support which is not supported with DC.
3931 * 3932 * Fallback to the non-DC driver here by default so as not to 3933 * cause regressions. 3934 */ 3935 return amdgpu_dc > 0; 3936 default: 3937 return amdgpu_dc != 0; 3938 #else 3939 default: 3940 if (amdgpu_dc > 0) 3941 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 3942 return false; 3943 #endif 3944 } 3945 } 3946 3947 /** 3948 * amdgpu_device_has_dc_support - check if dc is supported 3949 * 3950 * @adev: amdgpu_device pointer 3951 * 3952 * Returns true for supported, false for not supported 3953 */ 3954 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3955 { 3956 if (adev->enable_virtual_display || 3957 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3958 return false; 3959 3960 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3961 } 3962 3963 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3964 { 3965 struct amdgpu_device *adev = 3966 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3967 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3968 3969 /* It's a bug to not have a hive within this function */ 3970 if (WARN_ON(!hive)) 3971 return; 3972 3973 /* 3974 * Use task barrier to synchronize all xgmi reset works across the 3975 * hive. task_barrier_enter and task_barrier_exit will block 3976 * until all the threads running the xgmi reset works reach 3977 * those points. task_barrier_full will do both blocks. 3978 */ 3979 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3980 3981 task_barrier_enter(&hive->tb); 3982 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3983 3984 if (adev->asic_reset_res) 3985 goto fail; 3986 3987 task_barrier_exit(&hive->tb); 3988 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3989 3990 if (adev->asic_reset_res) 3991 goto fail; 3992 3993 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 3994 } else { 3995 3996 task_barrier_full(&hive->tb); 3997 adev->asic_reset_res = amdgpu_asic_reset(adev); 3998 } 3999 4000 fail: 4001 if (adev->asic_reset_res) 4002 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 4003 adev->asic_reset_res, adev_to_drm(adev)->unique); 4004 amdgpu_put_xgmi_hive(hive); 4005 } 4006 4007 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 4008 { 4009 char *input = amdgpu_lockup_timeout; 4010 char *timeout_setting = NULL; 4011 int index = 0; 4012 long timeout; 4013 int ret = 0; 4014 4015 /* 4016 * By default timeout for non compute jobs is 10000 4017 * and 60000 for compute jobs. 4018 * In SR-IOV or passthrough mode, timeout for compute 4019 * jobs are 60000 by default. 4020 */ 4021 adev->gfx_timeout = msecs_to_jiffies(10000); 4022 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4023 if (amdgpu_sriov_vf(adev)) 4024 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
4025 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 4026 else 4027 adev->compute_timeout = msecs_to_jiffies(60000); 4028 4029 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4030 while ((timeout_setting = strsep(&input, ",")) && 4031 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4032 ret = kstrtol(timeout_setting, 0, &timeout); 4033 if (ret) 4034 return ret; 4035 4036 if (timeout == 0) { 4037 index++; 4038 continue; 4039 } else if (timeout < 0) { 4040 timeout = MAX_SCHEDULE_TIMEOUT; 4041 dev_warn(adev->dev, "lockup timeout disabled"); 4042 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 4043 } else { 4044 timeout = msecs_to_jiffies(timeout); 4045 } 4046 4047 switch (index++) { 4048 case 0: 4049 adev->gfx_timeout = timeout; 4050 break; 4051 case 1: 4052 adev->compute_timeout = timeout; 4053 break; 4054 case 2: 4055 adev->sdma_timeout = timeout; 4056 break; 4057 case 3: 4058 adev->video_timeout = timeout; 4059 break; 4060 default: 4061 break; 4062 } 4063 } 4064 /* 4065 * There is only one value specified and 4066 * it should apply to all non-compute jobs. 4067 */ 4068 if (index == 1) { 4069 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4070 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 4071 adev->compute_timeout = adev->gfx_timeout; 4072 } 4073 } 4074 4075 return ret; 4076 } 4077 4078 /** 4079 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 4080 * 4081 * @adev: amdgpu_device pointer 4082 * 4083 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 4084 */ 4085 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 4086 { 4087 struct iommu_domain *domain; 4088 4089 domain = iommu_get_domain_for_dev(adev->dev); 4090 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 4091 adev->ram_is_direct_mapped = true; 4092 } 4093 4094 #if defined(CONFIG_HSA_AMD_P2P) 4095 /** 4096 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled. 4097 * 4098 * @adev: amdgpu_device pointer 4099 * 4100 * return if IOMMU remapping bar address 4101 */ 4102 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev) 4103 { 4104 struct iommu_domain *domain; 4105 4106 domain = iommu_get_domain_for_dev(adev->dev); 4107 if (domain && (domain->type == IOMMU_DOMAIN_DMA || 4108 domain->type == IOMMU_DOMAIN_DMA_FQ)) 4109 return true; 4110 4111 return false; 4112 } 4113 #endif 4114 4115 static const struct attribute *amdgpu_dev_attributes[] = { 4116 &dev_attr_pcie_replay_count.attr, 4117 NULL 4118 }; 4119 4120 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 4121 { 4122 if (amdgpu_mcbp == 1) 4123 adev->gfx.mcbp = true; 4124 else if (amdgpu_mcbp == 0) 4125 adev->gfx.mcbp = false; 4126 4127 if (amdgpu_sriov_vf(adev)) 4128 adev->gfx.mcbp = true; 4129 4130 if (adev->gfx.mcbp) 4131 DRM_INFO("MCBP is enabled\n"); 4132 } 4133 4134 /** 4135 * amdgpu_device_init - initialize the driver 4136 * 4137 * @adev: amdgpu_device pointer 4138 * @flags: driver flags 4139 * 4140 * Initializes the driver info and hw (all asics). 4141 * Returns 0 for success or an error on failure. 4142 * Called at driver startup. 
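 *
 * A rough, illustrative sketch of how this is reached from the PCI probe
 * path (simplified, error handling omitted; the exact call chain lives in
 * amdgpu_drv.c and amdgpu_kms.c):
 *
 *   amdgpu_pci_probe()
 *     -> amdgpu_driver_load_kms()
 *          -> amdgpu_device_init(adev, flags)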
4143 */ 4144 int amdgpu_device_init(struct amdgpu_device *adev, 4145 uint32_t flags) 4146 { 4147 struct drm_device *ddev = adev_to_drm(adev); 4148 struct pci_dev *pdev = adev->pdev; 4149 int r, i; 4150 bool px = false; 4151 u32 max_MBps; 4152 int tmp; 4153 4154 adev->shutdown = false; 4155 adev->flags = flags; 4156 4157 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 4158 adev->asic_type = amdgpu_force_asic_type; 4159 else 4160 adev->asic_type = flags & AMD_ASIC_MASK; 4161 4162 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 4163 if (amdgpu_emu_mode == 1) 4164 adev->usec_timeout *= 10; 4165 adev->gmc.gart_size = 512 * 1024 * 1024; 4166 adev->accel_working = false; 4167 adev->num_rings = 0; 4168 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 4169 adev->mman.buffer_funcs = NULL; 4170 adev->mman.buffer_funcs_ring = NULL; 4171 adev->vm_manager.vm_pte_funcs = NULL; 4172 adev->vm_manager.vm_pte_num_scheds = 0; 4173 adev->gmc.gmc_funcs = NULL; 4174 adev->harvest_ip_mask = 0x0; 4175 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 4176 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 4177 4178 adev->smc_rreg = &amdgpu_invalid_rreg; 4179 adev->smc_wreg = &amdgpu_invalid_wreg; 4180 adev->pcie_rreg = &amdgpu_invalid_rreg; 4181 adev->pcie_wreg = &amdgpu_invalid_wreg; 4182 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 4183 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 4184 adev->pciep_rreg = &amdgpu_invalid_rreg; 4185 adev->pciep_wreg = &amdgpu_invalid_wreg; 4186 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 4187 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 4188 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 4189 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 4190 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 4191 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 4192 adev->didt_rreg = &amdgpu_invalid_rreg; 4193 adev->didt_wreg = &amdgpu_invalid_wreg; 4194 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 4195 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 4196 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 4197 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 4198 4199 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 4200 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 4201 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 4202 4203 /* mutex initialization are all done here so we 4204 * can recall function without having locking issues 4205 */ 4206 mutex_init(&adev->firmware.mutex); 4207 mutex_init(&adev->pm.mutex); 4208 mutex_init(&adev->gfx.gpu_clock_mutex); 4209 mutex_init(&adev->srbm_mutex); 4210 mutex_init(&adev->gfx.pipe_reserve_mutex); 4211 mutex_init(&adev->gfx.gfx_off_mutex); 4212 mutex_init(&adev->gfx.partition_mutex); 4213 mutex_init(&adev->grbm_idx_mutex); 4214 mutex_init(&adev->mn_lock); 4215 mutex_init(&adev->virt.vf_errors.lock); 4216 mutex_init(&adev->virt.rlcg_reg_lock); 4217 hash_init(adev->mn_hash); 4218 mutex_init(&adev->psp.mutex); 4219 mutex_init(&adev->notifier_lock); 4220 mutex_init(&adev->pm.stable_pstate_ctx_lock); 4221 mutex_init(&adev->benchmark_mutex); 4222 mutex_init(&adev->gfx.reset_sem_mutex); 4223 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */ 4224 mutex_init(&adev->enforce_isolation_mutex); 4225 mutex_init(&adev->gfx.kfd_sch_mutex); 4226 4227 amdgpu_device_init_apu_flags(adev); 4228 4229 r = amdgpu_device_check_arguments(adev); 4230 if (r) 4231 return r; 4232 4233 
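	/*
	 * Spinlocks protecting the indirect index/data register pairs (MMIO,
	 * SMC, PCIe, UVD context, DIDT, GC_CAC, SE_CAC, audio endpoint) and
	 * other shared bookkeeping (memory stats, writeback); they must be
	 * ready before any of the register access helpers can race on them.
	 */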
	spin_lock_init(&adev->mmio_idx_lock);
	spin_lock_init(&adev->smc_idx_lock);
	spin_lock_init(&adev->pcie_idx_lock);
	spin_lock_init(&adev->uvd_ctx_idx_lock);
	spin_lock_init(&adev->didt_idx_lock);
	spin_lock_init(&adev->gc_cac_idx_lock);
	spin_lock_init(&adev->se_cac_idx_lock);
	spin_lock_init(&adev->audio_endpt_idx_lock);
	spin_lock_init(&adev->mm_stats.lock);
	spin_lock_init(&adev->wb.lock);

	INIT_LIST_HEAD(&adev->reset_list);

	INIT_LIST_HEAD(&adev->ras_list);

	INIT_LIST_HEAD(&adev->pm.od_kobj_list);

	INIT_DELAYED_WORK(&adev->delayed_init_work,
			  amdgpu_device_delayed_init_work_handler);
	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
			  amdgpu_device_delay_enable_gfx_off);
	/*
	 * Initialize the enforce_isolation work structures for each XCP
	 * partition. This work handler is responsible for enforcing shader
	 * isolation on AMD GPUs. It counts the number of emitted fences for
	 * each GFX and compute ring. If there are any fences, it schedules
	 * the `enforce_isolation_work` to be run after a delay. If there are
	 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the
	 * runqueue.
	 */
	for (i = 0; i < MAX_XCP; i++) {
		INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work,
				  amdgpu_gfx_enforce_isolation_handler);
		adev->gfx.enforce_isolation[i].adev = adev;
		adev->gfx.enforce_isolation[i].xcp_id = i;
	}

	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);

	adev->gfx.gfx_off_req_count = 1;
	adev->gfx.gfx_off_residency = 0;
	adev->gfx.gfx_off_entrycount = 0;
	adev->pm.ac_power = power_supply_is_system_supplied() > 0;

	atomic_set(&adev->throttling_logging_enabled, 1);
	/*
	 * If throttling continues, logging will be performed every minute
	 * to avoid log flooding. "-1" is subtracted since the thermal
	 * throttling interrupt comes every second. Thus, the total logging
	 * interval is 59 seconds (ratelimited printk interval) + 1 second
	 * (waiting for the throttling interrupt) = 60 seconds.
	 */
	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
	ratelimit_state_init(&adev->virt.ras_telemetry_rs, 5 * HZ, 1);

	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
	ratelimit_set_flags(&adev->virt.ras_telemetry_rs, RATELIMIT_MSG_ON_RELEASE);

	/* Registers mapping */
	/* TODO: block userspace mapping of io register */
	if (adev->asic_type >= CHIP_BONAIRE) {
		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
	} else {
		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
	}

	for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
		atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);

	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
	if (!adev->rmmio)
		return -ENOMEM;

	DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
	DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);

	/*
	 * The reset domain needs to be present early, before any XGMI hive is
	 * discovered, and initialized so that the reset semaphore and the
	 * in-GPU-reset flag can be used early during init and before the
	 * first call to RREG32.
	 */
	adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
	if (!adev->reset_domain)
		return -ENOMEM;

	/* detect hw virtualization here */
	amdgpu_detect_virtualization(adev);

	amdgpu_device_get_pcie_info(adev);

	r = amdgpu_device_get_job_timeout_settings(adev);
	if (r) {
		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
		return r;
	}

	amdgpu_device_set_mcbp(adev);

	/*
	 * By default, use the default mode where all blocks are expected to
	 * be initialized. At present a 'swinit' of blocks is required to be
	 * completed before the need for a different level is detected.
	 */
	amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT);
	/* early init functions */
	r = amdgpu_device_ip_early_init(adev);
	if (r)
		return r;

	/* Get rid of things like offb */
	r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name);
	if (r)
		return r;

	/* Enable TMZ based on IP_VERSION */
	amdgpu_gmc_tmz_set(adev);

	if (amdgpu_sriov_vf(adev) &&
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
		/* VF MMIO access (except mailbox range) from CPU
		 * will be blocked during SR-IOV runtime
		 */
		adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;

	amdgpu_gmc_noretry_set(adev);
	/* Need to get xgmi info early to decide the reset behavior */
	if (adev->gmc.xgmi.supported) {
		r = adev->gfxhub.funcs->get_xgmi_info(adev);
		if (r)
			return r;
	}

	/* enable PCIe atomic ops */
	if (amdgpu_sriov_vf(adev)) {
		if (adev->virt.fw_reserve.p_pf2vf)
			adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
				adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
				(PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
	/* APUs with gfx9 and newer do not rely on PCIe atomics; an internal
	 * path natively supports atomics, so set have_atomics_support to true.
	 */
	} else if ((adev->flags & AMD_IS_APU) &&
		   (amdgpu_ip_version(adev, GC_HWIP, 0) >
		    IP_VERSION(9, 0, 0))) {
		adev->have_atomics_support = true;
	} else {
		adev->have_atomics_support =
			!pci_enable_atomic_ops_to_root(adev->pdev,
				PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
				PCI_EXP_DEVCAP2_ATOMIC_COMP64);
	}

	if (!adev->have_atomics_support)
		dev_info(adev->dev, "PCIe atomic ops are not supported\n");

	/* doorbell bar mapping and doorbell index init */
	amdgpu_doorbell_init(adev);

	if (amdgpu_emu_mode == 1) {
		/* post the asic on emulation mode */
		emu_soc_asic_init(adev);
		goto fence_driver_init;
	}

	amdgpu_reset_init(adev);

	/* detect if we are running with an SR-IOV vBIOS */
	if (adev->bios)
		amdgpu_device_detect_sriov_bios(adev);

	/* check if we need to reset the asic
	 * E.g., the driver was not cleanly unloaded previously, etc.
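	 * Three cases are handled below: devices in an XGMI hive defer to a
	 * hive-wide reset via the minimal init level, headless MP1 13.0.10
	 * parts use a PSP reset, and everything else takes the default ASIC
	 * reset path regardless of the reset_method module parameter.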
4407 */ 4408 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4409 if (adev->gmc.xgmi.num_physical_nodes) { 4410 dev_info(adev->dev, "Pending hive reset.\n"); 4411 amdgpu_set_init_level(adev, 4412 AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 4413 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && 4414 !amdgpu_device_has_display_hardware(adev)) { 4415 r = psp_gpu_reset(adev); 4416 } else { 4417 tmp = amdgpu_reset_method; 4418 /* It should do a default reset when loading or reloading the driver, 4419 * regardless of the module parameter reset_method. 4420 */ 4421 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4422 r = amdgpu_asic_reset(adev); 4423 amdgpu_reset_method = tmp; 4424 } 4425 4426 if (r) { 4427 dev_err(adev->dev, "asic reset on init failed\n"); 4428 goto failed; 4429 } 4430 } 4431 4432 /* Post card if necessary */ 4433 if (amdgpu_device_need_post(adev)) { 4434 if (!adev->bios) { 4435 dev_err(adev->dev, "no vBIOS found\n"); 4436 r = -EINVAL; 4437 goto failed; 4438 } 4439 DRM_INFO("GPU posting now...\n"); 4440 r = amdgpu_device_asic_init(adev); 4441 if (r) { 4442 dev_err(adev->dev, "gpu post error!\n"); 4443 goto failed; 4444 } 4445 } 4446 4447 if (adev->bios) { 4448 if (adev->is_atom_fw) { 4449 /* Initialize clocks */ 4450 r = amdgpu_atomfirmware_get_clock_info(adev); 4451 if (r) { 4452 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4453 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4454 goto failed; 4455 } 4456 } else { 4457 /* Initialize clocks */ 4458 r = amdgpu_atombios_get_clock_info(adev); 4459 if (r) { 4460 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4461 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4462 goto failed; 4463 } 4464 /* init i2c buses */ 4465 if (!amdgpu_device_has_dc_support(adev)) 4466 amdgpu_atombios_i2c_init(adev); 4467 } 4468 } 4469 4470 fence_driver_init: 4471 /* Fence driver */ 4472 r = amdgpu_fence_driver_sw_init(adev); 4473 if (r) { 4474 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4475 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4476 goto failed; 4477 } 4478 4479 /* init the mode config */ 4480 drm_mode_config_init(adev_to_drm(adev)); 4481 4482 r = amdgpu_device_ip_init(adev); 4483 if (r) { 4484 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4485 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4486 goto release_ras_con; 4487 } 4488 4489 amdgpu_fence_driver_hw_init(adev); 4490 4491 dev_info(adev->dev, 4492 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4493 adev->gfx.config.max_shader_engines, 4494 adev->gfx.config.max_sh_per_se, 4495 adev->gfx.config.max_cu_per_sh, 4496 adev->gfx.cu_info.number); 4497 4498 adev->accel_working = true; 4499 4500 amdgpu_vm_check_compute_bug(adev); 4501 4502 /* Initialize the buffer migration limit. */ 4503 if (amdgpu_moverate >= 0) 4504 max_MBps = amdgpu_moverate; 4505 else 4506 max_MBps = 8; /* Allow 8 MB/s. */ 4507 /* Get a log2 for easy divisions. */ 4508 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4509 4510 /* 4511 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4512 * Otherwise the mgpu fan boost feature will be skipped due to the 4513 * gpu instance is counted less. 4514 */ 4515 amdgpu_register_gpu_instance(adev); 4516 4517 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4518 * explicit gating rather than handling it automatically. 
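	 * For the minimal XGMI init level this step (and the RAS resume) is
	 * skipped here; such devices go through amdgpu_xgmi_reset_on_init()
	 * further below instead.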
4519 */ 4520 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 4521 r = amdgpu_device_ip_late_init(adev); 4522 if (r) { 4523 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4524 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4525 goto release_ras_con; 4526 } 4527 /* must succeed. */ 4528 amdgpu_ras_resume(adev); 4529 queue_delayed_work(system_wq, &adev->delayed_init_work, 4530 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4531 } 4532 4533 if (amdgpu_sriov_vf(adev)) { 4534 amdgpu_virt_release_full_gpu(adev, true); 4535 flush_delayed_work(&adev->delayed_init_work); 4536 } 4537 4538 /* 4539 * Place those sysfs registering after `late_init`. As some of those 4540 * operations performed in `late_init` might affect the sysfs 4541 * interfaces creating. 4542 */ 4543 r = amdgpu_atombios_sysfs_init(adev); 4544 if (r) 4545 drm_err(&adev->ddev, 4546 "registering atombios sysfs failed (%d).\n", r); 4547 4548 r = amdgpu_pm_sysfs_init(adev); 4549 if (r) 4550 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 4551 4552 r = amdgpu_ucode_sysfs_init(adev); 4553 if (r) { 4554 adev->ucode_sysfs_en = false; 4555 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 4556 } else 4557 adev->ucode_sysfs_en = true; 4558 4559 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 4560 if (r) 4561 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4562 4563 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4564 if (r) 4565 dev_err(adev->dev, 4566 "Could not create amdgpu board attributes\n"); 4567 4568 amdgpu_fru_sysfs_init(adev); 4569 amdgpu_reg_state_sysfs_init(adev); 4570 amdgpu_xcp_cfg_sysfs_init(adev); 4571 4572 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4573 r = amdgpu_pmu_init(adev); 4574 if (r) 4575 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4576 4577 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4578 if (amdgpu_device_cache_pci_state(adev->pdev)) 4579 pci_restore_state(pdev); 4580 4581 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4582 /* this will fail for cards that aren't VGA class devices, just 4583 * ignore it 4584 */ 4585 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4586 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4587 4588 px = amdgpu_device_supports_px(ddev); 4589 4590 if (px || (!dev_is_removable(&adev->pdev->dev) && 4591 apple_gmux_detect(NULL, NULL))) 4592 vga_switcheroo_register_client(adev->pdev, 4593 &amdgpu_switcheroo_ops, px); 4594 4595 if (px) 4596 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4597 4598 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 4599 amdgpu_xgmi_reset_on_init(adev); 4600 4601 amdgpu_device_check_iommu_direct_map(adev); 4602 4603 adev->pm_nb.notifier_call = amdgpu_device_pm_notifier; 4604 r = register_pm_notifier(&adev->pm_nb); 4605 if (r) 4606 goto failed; 4607 4608 return 0; 4609 4610 release_ras_con: 4611 if (amdgpu_sriov_vf(adev)) 4612 amdgpu_virt_release_full_gpu(adev, true); 4613 4614 /* failed in exclusive mode due to timeout */ 4615 if (amdgpu_sriov_vf(adev) && 4616 !amdgpu_sriov_runtime(adev) && 4617 amdgpu_virt_mmio_blocked(adev) && 4618 !amdgpu_virt_wait_reset(adev)) { 4619 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4620 /* Don't send request since VF is inactive. 
*/ 4621 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4622 adev->virt.ops = NULL; 4623 r = -EAGAIN; 4624 } 4625 amdgpu_release_ras_context(adev); 4626 4627 failed: 4628 amdgpu_vf_error_trans_all(adev); 4629 4630 return r; 4631 } 4632 4633 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4634 { 4635 4636 /* Clear all CPU mappings pointing to this device */ 4637 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4638 4639 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4640 amdgpu_doorbell_fini(adev); 4641 4642 iounmap(adev->rmmio); 4643 adev->rmmio = NULL; 4644 if (adev->mman.aper_base_kaddr) 4645 iounmap(adev->mman.aper_base_kaddr); 4646 adev->mman.aper_base_kaddr = NULL; 4647 4648 /* Memory manager related */ 4649 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4650 arch_phys_wc_del(adev->gmc.vram_mtrr); 4651 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4652 } 4653 } 4654 4655 /** 4656 * amdgpu_device_fini_hw - tear down the driver 4657 * 4658 * @adev: amdgpu_device pointer 4659 * 4660 * Tear down the driver info (all asics). 4661 * Called at driver shutdown. 4662 */ 4663 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4664 { 4665 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4666 flush_delayed_work(&adev->delayed_init_work); 4667 4668 if (adev->mman.initialized) 4669 drain_workqueue(adev->mman.bdev.wq); 4670 adev->shutdown = true; 4671 4672 unregister_pm_notifier(&adev->pm_nb); 4673 4674 /* make sure IB test finished before entering exclusive mode 4675 * to avoid preemption on IB test 4676 */ 4677 if (amdgpu_sriov_vf(adev)) { 4678 amdgpu_virt_request_full_gpu(adev, false); 4679 amdgpu_virt_fini_data_exchange(adev); 4680 } 4681 4682 /* disable all interrupts */ 4683 amdgpu_irq_disable_all(adev); 4684 if (adev->mode_info.mode_config_initialized) { 4685 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4686 drm_helper_force_disable_all(adev_to_drm(adev)); 4687 else 4688 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4689 } 4690 amdgpu_fence_driver_hw_fini(adev); 4691 4692 if (adev->pm.sysfs_initialized) 4693 amdgpu_pm_sysfs_fini(adev); 4694 if (adev->ucode_sysfs_en) 4695 amdgpu_ucode_sysfs_fini(adev); 4696 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4697 amdgpu_fru_sysfs_fini(adev); 4698 4699 amdgpu_reg_state_sysfs_fini(adev); 4700 amdgpu_xcp_cfg_sysfs_fini(adev); 4701 4702 /* disable ras feature must before hw fini */ 4703 amdgpu_ras_pre_fini(adev); 4704 4705 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4706 4707 amdgpu_device_ip_fini_early(adev); 4708 4709 amdgpu_irq_fini_hw(adev); 4710 4711 if (adev->mman.initialized) 4712 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4713 4714 amdgpu_gart_dummy_page_fini(adev); 4715 4716 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4717 amdgpu_device_unmap_mmio(adev); 4718 4719 } 4720 4721 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4722 { 4723 int idx; 4724 bool px; 4725 4726 amdgpu_device_ip_fini(adev); 4727 amdgpu_fence_driver_sw_fini(adev); 4728 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4729 adev->accel_working = false; 4730 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4731 4732 amdgpu_reset_fini(adev); 4733 4734 /* free i2c buses */ 4735 if (!amdgpu_device_has_dc_support(adev)) 4736 amdgpu_i2c_fini(adev); 4737 4738 if (amdgpu_emu_mode != 1) 4739 amdgpu_atombios_fini(adev); 4740 4741 kfree(adev->bios); 4742 adev->bios = NULL; 4743 4744 kfree(adev->fru_info); 4745 adev->fru_info = NULL; 4746 
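	/* Unwind the PX / vga_switcheroo and VGA client registration done in
	 * amdgpu_device_init(), then drop the remaining MMIO and doorbell
	 * mappings.
	 */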
4747 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4748 4749 if (px || (!dev_is_removable(&adev->pdev->dev) && 4750 apple_gmux_detect(NULL, NULL))) 4751 vga_switcheroo_unregister_client(adev->pdev); 4752 4753 if (px) 4754 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4755 4756 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4757 vga_client_unregister(adev->pdev); 4758 4759 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4760 4761 iounmap(adev->rmmio); 4762 adev->rmmio = NULL; 4763 amdgpu_doorbell_fini(adev); 4764 drm_dev_exit(idx); 4765 } 4766 4767 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4768 amdgpu_pmu_fini(adev); 4769 if (adev->mman.discovery_bin) 4770 amdgpu_discovery_fini(adev); 4771 4772 amdgpu_reset_put_reset_domain(adev->reset_domain); 4773 adev->reset_domain = NULL; 4774 4775 kfree(adev->pci_state); 4776 4777 } 4778 4779 /** 4780 * amdgpu_device_evict_resources - evict device resources 4781 * @adev: amdgpu device object 4782 * 4783 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4784 * of the vram memory type. Mainly used for evicting device resources 4785 * at suspend time. 4786 * 4787 */ 4788 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4789 { 4790 int ret; 4791 4792 /* No need to evict vram on APUs unless going to S4 */ 4793 if (!adev->in_s4 && (adev->flags & AMD_IS_APU)) 4794 return 0; 4795 4796 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4797 if (ret) 4798 DRM_WARN("evicting device resources failed\n"); 4799 return ret; 4800 } 4801 4802 /* 4803 * Suspend & resume. 4804 */ 4805 /** 4806 * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events 4807 * @nb: notifier block 4808 * @mode: suspend mode 4809 * @data: data 4810 * 4811 * This function is called when the system is about to suspend or hibernate. 4812 * It is used to evict resources from the device before the system goes to 4813 * sleep while there is still access to swap. 4814 */ 4815 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 4816 void *data) 4817 { 4818 struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb); 4819 int r; 4820 4821 switch (mode) { 4822 case PM_HIBERNATION_PREPARE: 4823 adev->in_s4 = true; 4824 fallthrough; 4825 case PM_SUSPEND_PREPARE: 4826 r = amdgpu_device_evict_resources(adev); 4827 /* 4828 * This is considered non-fatal at this time because 4829 * amdgpu_device_prepare() will also fatally evict resources. 4830 * See https://gitlab.freedesktop.org/drm/amd/-/issues/3781 4831 */ 4832 if (r) 4833 drm_warn(adev_to_drm(adev), "Failed to evict resources, freeze active processes if problems occur: %d\n", r); 4834 break; 4835 } 4836 4837 return NOTIFY_DONE; 4838 } 4839 4840 /** 4841 * amdgpu_device_prepare - prepare for device suspend 4842 * 4843 * @dev: drm dev pointer 4844 * 4845 * Prepare to put the hw in the suspend state (all asics). 4846 * Returns 0 for success or an error on failure. 4847 * Called at driver suspend. 
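 *
 * Illustrative ordering of the system suspend path built from the helpers
 * in this file (simplified; the PM callbacks that invoke them live in
 * amdgpu_drv.c):
 *
 *   amdgpu_device_prepare() - evict BOs, run prepare_suspend IP callbacks
 *   amdgpu_device_suspend() - phase1/phase2 IP suspend, fence HW fini
 *   amdgpu_device_resume()  - reverse the above on wake-up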
4848 */ 4849 int amdgpu_device_prepare(struct drm_device *dev) 4850 { 4851 struct amdgpu_device *adev = drm_to_adev(dev); 4852 int i, r; 4853 4854 amdgpu_choose_low_power_state(adev); 4855 4856 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4857 return 0; 4858 4859 /* Evict the majority of BOs before starting suspend sequence */ 4860 r = amdgpu_device_evict_resources(adev); 4861 if (r) 4862 goto unprepare; 4863 4864 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 4865 4866 for (i = 0; i < adev->num_ip_blocks; i++) { 4867 if (!adev->ip_blocks[i].status.valid) 4868 continue; 4869 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 4870 continue; 4871 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]); 4872 if (r) 4873 goto unprepare; 4874 } 4875 4876 return 0; 4877 4878 unprepare: 4879 adev->in_s0ix = adev->in_s3 = adev->in_s4 = false; 4880 4881 return r; 4882 } 4883 4884 /** 4885 * amdgpu_device_suspend - initiate device suspend 4886 * 4887 * @dev: drm dev pointer 4888 * @notify_clients: notify in-kernel DRM clients 4889 * 4890 * Puts the hw in the suspend state (all asics). 4891 * Returns 0 for success or an error on failure. 4892 * Called at driver suspend. 4893 */ 4894 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) 4895 { 4896 struct amdgpu_device *adev = drm_to_adev(dev); 4897 int r = 0; 4898 4899 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4900 return 0; 4901 4902 adev->in_suspend = true; 4903 4904 if (amdgpu_sriov_vf(adev)) { 4905 amdgpu_virt_fini_data_exchange(adev); 4906 r = amdgpu_virt_request_full_gpu(adev, false); 4907 if (r) 4908 return r; 4909 } 4910 4911 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4912 DRM_WARN("smart shift update failed\n"); 4913 4914 if (notify_clients) 4915 drm_client_dev_suspend(adev_to_drm(adev), false); 4916 4917 cancel_delayed_work_sync(&adev->delayed_init_work); 4918 4919 amdgpu_ras_suspend(adev); 4920 4921 amdgpu_device_ip_suspend_phase1(adev); 4922 4923 if (!adev->in_s0ix) 4924 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4925 4926 r = amdgpu_device_evict_resources(adev); 4927 if (r) 4928 return r; 4929 4930 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4931 4932 amdgpu_fence_driver_hw_fini(adev); 4933 4934 amdgpu_device_ip_suspend_phase2(adev); 4935 4936 if (amdgpu_sriov_vf(adev)) 4937 amdgpu_virt_release_full_gpu(adev, false); 4938 4939 r = amdgpu_dpm_notify_rlc_state(adev, false); 4940 if (r) 4941 return r; 4942 4943 return 0; 4944 } 4945 4946 /** 4947 * amdgpu_device_resume - initiate device resume 4948 * 4949 * @dev: drm dev pointer 4950 * @notify_clients: notify in-kernel DRM clients 4951 * 4952 * Bring the hw back to operating state (all asics). 4953 * Returns 0 for success or an error on failure. 4954 * Called at driver resume. 
4955 */ 4956 int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) 4957 { 4958 struct amdgpu_device *adev = drm_to_adev(dev); 4959 int r = 0; 4960 4961 if (amdgpu_sriov_vf(adev)) { 4962 r = amdgpu_virt_request_full_gpu(adev, true); 4963 if (r) 4964 return r; 4965 } 4966 4967 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4968 return 0; 4969 4970 if (adev->in_s0ix) 4971 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4972 4973 /* post card */ 4974 if (amdgpu_device_need_post(adev)) { 4975 r = amdgpu_device_asic_init(adev); 4976 if (r) 4977 dev_err(adev->dev, "amdgpu asic init failed\n"); 4978 } 4979 4980 r = amdgpu_device_ip_resume(adev); 4981 4982 if (r) { 4983 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4984 goto exit; 4985 } 4986 4987 if (!adev->in_s0ix) { 4988 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4989 if (r) 4990 goto exit; 4991 } 4992 4993 r = amdgpu_device_ip_late_init(adev); 4994 if (r) 4995 goto exit; 4996 4997 queue_delayed_work(system_wq, &adev->delayed_init_work, 4998 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4999 exit: 5000 if (amdgpu_sriov_vf(adev)) { 5001 amdgpu_virt_init_data_exchange(adev); 5002 amdgpu_virt_release_full_gpu(adev, true); 5003 } 5004 5005 if (r) 5006 return r; 5007 5008 /* Make sure IB tests flushed */ 5009 flush_delayed_work(&adev->delayed_init_work); 5010 5011 if (notify_clients) 5012 drm_client_dev_resume(adev_to_drm(adev), false); 5013 5014 amdgpu_ras_resume(adev); 5015 5016 if (adev->mode_info.num_crtc) { 5017 /* 5018 * Most of the connector probing functions try to acquire runtime pm 5019 * refs to ensure that the GPU is powered on when connector polling is 5020 * performed. Since we're calling this from a runtime PM callback, 5021 * trying to acquire rpm refs will cause us to deadlock. 5022 * 5023 * Since we're guaranteed to be holding the rpm lock, it's safe to 5024 * temporarily disable the rpm helpers so this doesn't deadlock us. 5025 */ 5026 #ifdef CONFIG_PM 5027 dev->dev->power.disable_depth++; 5028 #endif 5029 if (!adev->dc_enabled) 5030 drm_helper_hpd_irq_event(dev); 5031 else 5032 drm_kms_helper_hotplug_event(dev); 5033 #ifdef CONFIG_PM 5034 dev->dev->power.disable_depth--; 5035 #endif 5036 } 5037 adev->in_suspend = false; 5038 5039 if (adev->enable_mes) 5040 amdgpu_mes_self_test(adev); 5041 5042 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 5043 DRM_WARN("smart shift update failed\n"); 5044 5045 return 0; 5046 } 5047 5048 /** 5049 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 5050 * 5051 * @adev: amdgpu_device pointer 5052 * 5053 * The list of all the hardware IPs that make up the asic is walked and 5054 * the check_soft_reset callbacks are run. check_soft_reset determines 5055 * if the asic is still hung or not. 5056 * Returns true if any of the IPs are still in a hung state, false if not. 
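 *
 * Together with the helpers that follow, this forms the soft-reset sequence
 * used by amdgpu_device_pre_asic_reset() (illustrative sketch):
 *
 *   if (amdgpu_device_ip_check_soft_reset(adev)) {
 *           amdgpu_device_ip_pre_soft_reset(adev);
 *           amdgpu_device_ip_soft_reset(adev);
 *           amdgpu_device_ip_post_soft_reset(adev);
 *   }
 *   // still hung afterwards? -> fall back to a full ASIC reset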
5057 */ 5058 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 5059 { 5060 int i; 5061 bool asic_hang = false; 5062 5063 if (amdgpu_sriov_vf(adev)) 5064 return true; 5065 5066 if (amdgpu_asic_need_full_reset(adev)) 5067 return true; 5068 5069 for (i = 0; i < adev->num_ip_blocks; i++) { 5070 if (!adev->ip_blocks[i].status.valid) 5071 continue; 5072 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 5073 adev->ip_blocks[i].status.hang = 5074 adev->ip_blocks[i].version->funcs->check_soft_reset( 5075 &adev->ip_blocks[i]); 5076 if (adev->ip_blocks[i].status.hang) { 5077 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 5078 asic_hang = true; 5079 } 5080 } 5081 return asic_hang; 5082 } 5083 5084 /** 5085 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 5086 * 5087 * @adev: amdgpu_device pointer 5088 * 5089 * The list of all the hardware IPs that make up the asic is walked and the 5090 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 5091 * handles any IP specific hardware or software state changes that are 5092 * necessary for a soft reset to succeed. 5093 * Returns 0 on success, negative error code on failure. 5094 */ 5095 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 5096 { 5097 int i, r = 0; 5098 5099 for (i = 0; i < adev->num_ip_blocks; i++) { 5100 if (!adev->ip_blocks[i].status.valid) 5101 continue; 5102 if (adev->ip_blocks[i].status.hang && 5103 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 5104 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]); 5105 if (r) 5106 return r; 5107 } 5108 } 5109 5110 return 0; 5111 } 5112 5113 /** 5114 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 5115 * 5116 * @adev: amdgpu_device pointer 5117 * 5118 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 5119 * reset is necessary to recover. 5120 * Returns true if a full asic reset is required, false if not. 5121 */ 5122 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 5123 { 5124 int i; 5125 5126 if (amdgpu_asic_need_full_reset(adev)) 5127 return true; 5128 5129 for (i = 0; i < adev->num_ip_blocks; i++) { 5130 if (!adev->ip_blocks[i].status.valid) 5131 continue; 5132 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 5133 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 5134 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 5135 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 5136 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 5137 if (adev->ip_blocks[i].status.hang) { 5138 dev_info(adev->dev, "Some block need full reset!\n"); 5139 return true; 5140 } 5141 } 5142 } 5143 return false; 5144 } 5145 5146 /** 5147 * amdgpu_device_ip_soft_reset - do a soft reset 5148 * 5149 * @adev: amdgpu_device pointer 5150 * 5151 * The list of all the hardware IPs that make up the asic is walked and the 5152 * soft_reset callbacks are run if the block is hung. soft_reset handles any 5153 * IP specific hardware or software state changes that are necessary to soft 5154 * reset the IP. 5155 * Returns 0 on success, negative error code on failure. 
5156 */ 5157 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 5158 { 5159 int i, r = 0; 5160 5161 for (i = 0; i < adev->num_ip_blocks; i++) { 5162 if (!adev->ip_blocks[i].status.valid) 5163 continue; 5164 if (adev->ip_blocks[i].status.hang && 5165 adev->ip_blocks[i].version->funcs->soft_reset) { 5166 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]); 5167 if (r) 5168 return r; 5169 } 5170 } 5171 5172 return 0; 5173 } 5174 5175 /** 5176 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 5177 * 5178 * @adev: amdgpu_device pointer 5179 * 5180 * The list of all the hardware IPs that make up the asic is walked and the 5181 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 5182 * handles any IP specific hardware or software state changes that are 5183 * necessary after the IP has been soft reset. 5184 * Returns 0 on success, negative error code on failure. 5185 */ 5186 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 5187 { 5188 int i, r = 0; 5189 5190 for (i = 0; i < adev->num_ip_blocks; i++) { 5191 if (!adev->ip_blocks[i].status.valid) 5192 continue; 5193 if (adev->ip_blocks[i].status.hang && 5194 adev->ip_blocks[i].version->funcs->post_soft_reset) 5195 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]); 5196 if (r) 5197 return r; 5198 } 5199 5200 return 0; 5201 } 5202 5203 /** 5204 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 5205 * 5206 * @adev: amdgpu_device pointer 5207 * @reset_context: amdgpu reset context pointer 5208 * 5209 * do VF FLR and reinitialize Asic 5210 * return 0 means succeeded otherwise failed 5211 */ 5212 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 5213 struct amdgpu_reset_context *reset_context) 5214 { 5215 int r; 5216 struct amdgpu_hive_info *hive = NULL; 5217 5218 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { 5219 if (!amdgpu_ras_get_fed_status(adev)) 5220 amdgpu_virt_ready_to_reset(adev); 5221 amdgpu_virt_wait_reset(adev); 5222 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5223 r = amdgpu_virt_request_full_gpu(adev, true); 5224 } else { 5225 r = amdgpu_virt_reset_gpu(adev); 5226 } 5227 if (r) 5228 return r; 5229 5230 amdgpu_ras_clear_err_state(adev); 5231 amdgpu_irq_gpu_reset_resume_helper(adev); 5232 5233 /* some sw clean up VF needs to do before recover */ 5234 amdgpu_virt_post_reset(adev); 5235 5236 /* Resume IP prior to SMC */ 5237 r = amdgpu_device_ip_reinit_early_sriov(adev); 5238 if (r) 5239 return r; 5240 5241 amdgpu_virt_init_data_exchange(adev); 5242 5243 r = amdgpu_device_fw_loading(adev); 5244 if (r) 5245 return r; 5246 5247 /* now we are okay to resume SMC/CP/SDMA */ 5248 r = amdgpu_device_ip_reinit_late_sriov(adev); 5249 if (r) 5250 return r; 5251 5252 hive = amdgpu_get_xgmi_hive(adev); 5253 /* Update PSP FW topology after reset */ 5254 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5255 r = amdgpu_xgmi_update_topology(hive, adev); 5256 if (hive) 5257 amdgpu_put_xgmi_hive(hive); 5258 if (r) 5259 return r; 5260 5261 r = amdgpu_ib_ring_tests(adev); 5262 if (r) 5263 return r; 5264 5265 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) 5266 amdgpu_inc_vram_lost(adev); 5267 5268 /* need to be called during full access so we can't do it later like 5269 * bare-metal does. 
5270 */ 5271 amdgpu_amdkfd_post_reset(adev); 5272 amdgpu_virt_release_full_gpu(adev, true); 5273 5274 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5275 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) || 5276 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 5277 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 5278 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5279 amdgpu_ras_resume(adev); 5280 5281 amdgpu_virt_ras_telemetry_post_reset(adev); 5282 5283 return 0; 5284 } 5285 5286 /** 5287 * amdgpu_device_has_job_running - check if there is any unfinished job 5288 * 5289 * @adev: amdgpu_device pointer 5290 * 5291 * check if there is any job running on the device when guest driver receives 5292 * FLR notification from host driver. If there are still jobs running, then 5293 * the guest driver will not respond the FLR reset. Instead, let the job hit 5294 * the timeout and guest driver then issue the reset request. 5295 */ 5296 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5297 { 5298 int i; 5299 5300 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5301 struct amdgpu_ring *ring = adev->rings[i]; 5302 5303 if (!amdgpu_ring_sched_ready(ring)) 5304 continue; 5305 5306 if (amdgpu_fence_count_emitted(ring)) 5307 return true; 5308 } 5309 return false; 5310 } 5311 5312 /** 5313 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5314 * 5315 * @adev: amdgpu_device pointer 5316 * 5317 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5318 * a hung GPU. 5319 */ 5320 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5321 { 5322 5323 if (amdgpu_gpu_recovery == 0) 5324 goto disabled; 5325 5326 /* Skip soft reset check in fatal error mode */ 5327 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5328 return true; 5329 5330 if (amdgpu_sriov_vf(adev)) 5331 return true; 5332 5333 if (amdgpu_gpu_recovery == -1) { 5334 switch (adev->asic_type) { 5335 #ifdef CONFIG_DRM_AMDGPU_SI 5336 case CHIP_VERDE: 5337 case CHIP_TAHITI: 5338 case CHIP_PITCAIRN: 5339 case CHIP_OLAND: 5340 case CHIP_HAINAN: 5341 #endif 5342 #ifdef CONFIG_DRM_AMDGPU_CIK 5343 case CHIP_KAVERI: 5344 case CHIP_KABINI: 5345 case CHIP_MULLINS: 5346 #endif 5347 case CHIP_CARRIZO: 5348 case CHIP_STONEY: 5349 case CHIP_CYAN_SKILLFISH: 5350 goto disabled; 5351 default: 5352 break; 5353 } 5354 } 5355 5356 return true; 5357 5358 disabled: 5359 dev_info(adev->dev, "GPU recovery disabled.\n"); 5360 return false; 5361 } 5362 5363 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5364 { 5365 u32 i; 5366 int ret = 0; 5367 5368 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5369 5370 dev_info(adev->dev, "GPU mode1 reset\n"); 5371 5372 /* Cache the state before bus master disable. The saved config space 5373 * values are used in other cases like restore after mode-2 reset. 
5374 */ 5375 amdgpu_device_cache_pci_state(adev->pdev); 5376 5377 /* disable BM */ 5378 pci_clear_master(adev->pdev); 5379 5380 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5381 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5382 ret = amdgpu_dpm_mode1_reset(adev); 5383 } else { 5384 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5385 ret = psp_gpu_reset(adev); 5386 } 5387 5388 if (ret) 5389 goto mode1_reset_failed; 5390 5391 amdgpu_device_load_pci_state(adev->pdev); 5392 ret = amdgpu_psp_wait_for_bootloader(adev); 5393 if (ret) 5394 goto mode1_reset_failed; 5395 5396 /* wait for asic to come out of reset */ 5397 for (i = 0; i < adev->usec_timeout; i++) { 5398 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5399 5400 if (memsize != 0xffffffff) 5401 break; 5402 udelay(1); 5403 } 5404 5405 if (i >= adev->usec_timeout) { 5406 ret = -ETIMEDOUT; 5407 goto mode1_reset_failed; 5408 } 5409 5410 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5411 5412 return 0; 5413 5414 mode1_reset_failed: 5415 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5416 return ret; 5417 } 5418 5419 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5420 struct amdgpu_reset_context *reset_context) 5421 { 5422 int i, r = 0; 5423 struct amdgpu_job *job = NULL; 5424 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; 5425 bool need_full_reset = 5426 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5427 5428 if (reset_context->reset_req_dev == adev) 5429 job = reset_context->job; 5430 5431 if (amdgpu_sriov_vf(adev)) 5432 amdgpu_virt_pre_reset(adev); 5433 5434 amdgpu_fence_driver_isr_toggle(adev, true); 5435 5436 /* block all schedulers and reset given job's ring */ 5437 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5438 struct amdgpu_ring *ring = adev->rings[i]; 5439 5440 if (!amdgpu_ring_sched_ready(ring)) 5441 continue; 5442 5443 /* Clear job fence from fence drv to avoid force_completion 5444 * leave NULL and vm flush fence in fence drv 5445 */ 5446 amdgpu_fence_driver_clear_job_fences(ring); 5447 5448 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5449 amdgpu_fence_driver_force_completion(ring); 5450 } 5451 5452 amdgpu_fence_driver_isr_toggle(adev, false); 5453 5454 if (job && job->vm) 5455 drm_sched_increase_karma(&job->base); 5456 5457 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5458 /* If reset handler not implemented, continue; otherwise return */ 5459 if (r == -EOPNOTSUPP) 5460 r = 0; 5461 else 5462 return r; 5463 5464 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5465 if (!amdgpu_sriov_vf(adev)) { 5466 5467 if (!need_full_reset) 5468 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5469 5470 if (!need_full_reset && amdgpu_gpu_recovery && 5471 amdgpu_device_ip_check_soft_reset(adev)) { 5472 amdgpu_device_ip_pre_soft_reset(adev); 5473 r = amdgpu_device_ip_soft_reset(adev); 5474 amdgpu_device_ip_post_soft_reset(adev); 5475 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5476 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5477 need_full_reset = true; 5478 } 5479 } 5480 5481 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { 5482 dev_info(tmp_adev->dev, "Dumping IP State\n"); 5483 /* Trigger ip dump before we reset the asic */ 5484 for (i = 0; i < tmp_adev->num_ip_blocks; i++) 5485 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) 5486 tmp_adev->ip_blocks[i].version->funcs 5487 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]); 5488 
dev_info(tmp_adev->dev, "Dumping IP State Completed\n"); 5489 } 5490 5491 if (need_full_reset) 5492 r = amdgpu_device_ip_suspend(adev); 5493 if (need_full_reset) 5494 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5495 else 5496 clear_bit(AMDGPU_NEED_FULL_RESET, 5497 &reset_context->flags); 5498 } 5499 5500 return r; 5501 } 5502 5503 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) 5504 { 5505 struct list_head *device_list_handle; 5506 bool full_reset, vram_lost = false; 5507 struct amdgpu_device *tmp_adev; 5508 int r, init_level; 5509 5510 device_list_handle = reset_context->reset_device_list; 5511 5512 if (!device_list_handle) 5513 return -EINVAL; 5514 5515 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5516 5517 /** 5518 * If it's reset on init, it's default init level, otherwise keep level 5519 * as recovery level. 5520 */ 5521 if (reset_context->method == AMD_RESET_METHOD_ON_INIT) 5522 init_level = AMDGPU_INIT_LEVEL_DEFAULT; 5523 else 5524 init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY; 5525 5526 r = 0; 5527 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5528 amdgpu_set_init_level(tmp_adev, init_level); 5529 if (full_reset) { 5530 /* post card */ 5531 amdgpu_ras_clear_err_state(tmp_adev); 5532 r = amdgpu_device_asic_init(tmp_adev); 5533 if (r) { 5534 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5535 } else { 5536 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5537 5538 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5539 if (r) 5540 goto out; 5541 5542 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5543 5544 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 5545 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job); 5546 5547 if (vram_lost) { 5548 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5549 amdgpu_inc_vram_lost(tmp_adev); 5550 } 5551 5552 r = amdgpu_device_fw_loading(tmp_adev); 5553 if (r) 5554 return r; 5555 5556 r = amdgpu_xcp_restore_partition_mode( 5557 tmp_adev->xcp_mgr); 5558 if (r) 5559 goto out; 5560 5561 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5562 if (r) 5563 goto out; 5564 5565 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 5566 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5567 5568 r = amdgpu_device_ip_resume_phase3(tmp_adev); 5569 if (r) 5570 goto out; 5571 5572 if (vram_lost) 5573 amdgpu_device_fill_reset_magic(tmp_adev); 5574 5575 /* 5576 * Add this ASIC as tracked as reset was already 5577 * complete successfully. 5578 */ 5579 amdgpu_register_gpu_instance(tmp_adev); 5580 5581 if (!reset_context->hive && 5582 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5583 amdgpu_xgmi_add_device(tmp_adev); 5584 5585 r = amdgpu_device_ip_late_init(tmp_adev); 5586 if (r) 5587 goto out; 5588 5589 drm_client_dev_resume(adev_to_drm(tmp_adev), false); 5590 5591 /* 5592 * The GPU enters bad state once faulty pages 5593 * by ECC has reached the threshold, and ras 5594 * recovery is scheduled next. So add one check 5595 * here to break recovery if it indeed exceeds 5596 * bad page threshold, and remind user to 5597 * retire this GPU or setting one bigger 5598 * bad_page_threshold value to fix this once 5599 * probing driver again. 5600 */ 5601 if (!amdgpu_ras_is_rma(tmp_adev)) { 5602 /* must succeed. 
*/ 5603 amdgpu_ras_resume(tmp_adev); 5604 } else { 5605 r = -EINVAL; 5606 goto out; 5607 } 5608 5609 /* Update PSP FW topology after reset */ 5610 if (reset_context->hive && 5611 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5612 r = amdgpu_xgmi_update_topology( 5613 reset_context->hive, tmp_adev); 5614 } 5615 } 5616 5617 out: 5618 if (!r) { 5619 /* IP init is complete now, set level as default */ 5620 amdgpu_set_init_level(tmp_adev, 5621 AMDGPU_INIT_LEVEL_DEFAULT); 5622 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5623 r = amdgpu_ib_ring_tests(tmp_adev); 5624 if (r) { 5625 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5626 r = -EAGAIN; 5627 goto end; 5628 } 5629 } 5630 5631 if (r) 5632 tmp_adev->asic_reset_res = r; 5633 } 5634 5635 end: 5636 return r; 5637 } 5638 5639 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5640 struct amdgpu_reset_context *reset_context) 5641 { 5642 struct amdgpu_device *tmp_adev = NULL; 5643 bool need_full_reset, skip_hw_reset; 5644 int r = 0; 5645 5646 /* Try reset handler method first */ 5647 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5648 reset_list); 5649 5650 reset_context->reset_device_list = device_list_handle; 5651 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5652 /* If reset handler not implemented, continue; otherwise return */ 5653 if (r == -EOPNOTSUPP) 5654 r = 0; 5655 else 5656 return r; 5657 5658 /* Reset handler not implemented, use the default method */ 5659 need_full_reset = 5660 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5661 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5662 5663 /* 5664 * ASIC reset has to be done on all XGMI hive nodes ASAP 5665 * to allow proper links negotiation in FW (within 1 sec) 5666 */ 5667 if (!skip_hw_reset && need_full_reset) { 5668 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5669 /* For XGMI run all resets in parallel to speed up the process */ 5670 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5671 if (!queue_work(system_unbound_wq, 5672 &tmp_adev->xgmi_reset_work)) 5673 r = -EALREADY; 5674 } else 5675 r = amdgpu_asic_reset(tmp_adev); 5676 5677 if (r) { 5678 dev_err(tmp_adev->dev, 5679 "ASIC reset failed with error, %d for drm dev, %s", 5680 r, adev_to_drm(tmp_adev)->unique); 5681 goto out; 5682 } 5683 } 5684 5685 /* For XGMI wait for all resets to complete before proceed */ 5686 if (!r) { 5687 list_for_each_entry(tmp_adev, device_list_handle, 5688 reset_list) { 5689 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5690 flush_work(&tmp_adev->xgmi_reset_work); 5691 r = tmp_adev->asic_reset_res; 5692 if (r) 5693 break; 5694 } 5695 } 5696 } 5697 } 5698 5699 if (!r && amdgpu_ras_intr_triggered()) { 5700 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5701 amdgpu_ras_reset_error_count(tmp_adev, 5702 AMDGPU_RAS_BLOCK__MMHUB); 5703 } 5704 5705 amdgpu_ras_intr_cleared(); 5706 } 5707 5708 r = amdgpu_device_reinit_after_reset(reset_context); 5709 if (r == -EAGAIN) 5710 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5711 else 5712 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5713 5714 out: 5715 return r; 5716 } 5717 5718 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5719 { 5720 5721 switch (amdgpu_asic_reset_method(adev)) { 5722 case AMD_RESET_METHOD_MODE1: 5723 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5724 break; 5725 case AMD_RESET_METHOD_MODE2: 5726 adev->mp1_state = PP_MP1_STATE_RESET; 5727 break; 5728 default: 5729 adev->mp1_state = 
		PP_MP1_STATE_NONE;
		break;
	}
}

static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
{
	amdgpu_vf_error_trans_all(adev);
	adev->mp1_state = PP_MP1_STATE_NONE;
}

static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
{
	struct pci_dev *p = NULL;

	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
			adev->pdev->bus->number, 1);
	if (p) {
		pm_runtime_enable(&(p->dev));
		pm_runtime_resume(&(p->dev));
	}

	pci_dev_put(p);
}

static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
{
	enum amd_reset_method reset_method;
	struct pci_dev *p = NULL;
	u64 expires;

	/*
	 * For now, only BACO and mode1 reset are confirmed
	 * to suffer from audio issues if the audio device is
	 * not properly suspended first.
	 */
	reset_method = amdgpu_asic_reset_method(adev);
	if ((reset_method != AMD_RESET_METHOD_BACO) &&
	    (reset_method != AMD_RESET_METHOD_MODE1))
		return -EINVAL;

	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
			adev->pdev->bus->number, 1);
	if (!p)
		return -ENODEV;

	expires = pm_runtime_autosuspend_expiration(&(p->dev));
	if (!expires)
		/*
		 * If we cannot get the audio device autosuspend delay,
		 * a fixed 4s interval will be used. Since the audio
		 * controller's default autosuspend delay is 3s, the 4s
		 * used here is guaranteed to cover it.
		 */
		expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;

	while (!pm_runtime_status_suspended(&(p->dev))) {
		if (!pm_runtime_suspend(&(p->dev)))
			break;

		if (expires < ktime_get_mono_fast_ns()) {
			dev_warn(adev->dev, "failed to suspend display audio\n");
			pci_dev_put(p);
			/* TODO: abort the succeeding gpu reset? */
			return -ETIMEDOUT;
		}
	}

	pm_runtime_disable(&(p->dev));

	pci_dev_put(p);
	return 0;
}

static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

#if defined(CONFIG_DEBUG_FS)
	if (!amdgpu_sriov_vf(adev))
		cancel_work(&adev->reset_work);
#endif

	if (adev->kfd.dev)
		cancel_work(&adev->kfd.reset_work);

	if (amdgpu_sriov_vf(adev))
		cancel_work(&adev->virt.flr_work);

	if (con && adev->ras_enabled)
		cancel_work(&con->recovery_work);
}

static int amdgpu_device_health_check(struct list_head *device_list_handle)
{
	struct amdgpu_device *tmp_adev;
	int ret = 0;
	u32 status;

	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status);
		if (PCI_POSSIBLE_ERROR(status)) {
			dev_err(tmp_adev->dev, "device lost from bus!");
			ret = -ENODEV;
		}
	}

	return ret;
}

/**
 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
 *
 * @adev: amdgpu_device pointer
 * @job: the job that triggered the hang
 * @reset_context: amdgpu reset context pointer
 *
 * Attempt to reset the GPU if it has hung (all asics).
 * Attempt to do soft-reset or full-reset and reinitialize the ASIC.
 * Returns 0 for success or an error on failure.
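 *
 * Note: when the device is part of an XGMI hive, all devices in the hive
 * are reset together, and display audio is suspended up front so the audio
 * hardware is not changed behind the audio driver's back.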
5849 */ 5850 5851 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5852 struct amdgpu_job *job, 5853 struct amdgpu_reset_context *reset_context) 5854 { 5855 struct list_head device_list, *device_list_handle = NULL; 5856 bool job_signaled = false; 5857 struct amdgpu_hive_info *hive = NULL; 5858 struct amdgpu_device *tmp_adev = NULL; 5859 int i, r = 0; 5860 bool need_emergency_restart = false; 5861 bool audio_suspended = false; 5862 int retry_limit = AMDGPU_MAX_RETRY_LIMIT; 5863 5864 /* 5865 * If it reaches here because of hang/timeout and a RAS error is 5866 * detected at the same time, let RAS recovery take care of it. 5867 */ 5868 if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) && 5869 reset_context->src != AMDGPU_RESET_SRC_RAS) { 5870 dev_dbg(adev->dev, 5871 "Gpu recovery from source: %d yielding to RAS error recovery handling", 5872 reset_context->src); 5873 return 0; 5874 } 5875 /* 5876 * Special case: RAS triggered and full reset isn't supported 5877 */ 5878 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5879 5880 /* 5881 * Flush RAM to disk so that after reboot 5882 * the user can read log and see why the system rebooted. 5883 */ 5884 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 5885 amdgpu_ras_get_context(adev)->reboot) { 5886 DRM_WARN("Emergency reboot."); 5887 5888 ksys_sync_helper(); 5889 emergency_restart(); 5890 } 5891 5892 dev_info(adev->dev, "GPU %s begin!\n", 5893 need_emergency_restart ? "jobs stop":"reset"); 5894 5895 if (!amdgpu_sriov_vf(adev)) 5896 hive = amdgpu_get_xgmi_hive(adev); 5897 if (hive) 5898 mutex_lock(&hive->hive_lock); 5899 5900 reset_context->job = job; 5901 reset_context->hive = hive; 5902 /* 5903 * Build list of devices to reset. 5904 * In case we are in XGMI hive mode, resort the device list 5905 * to put adev in the 1st position. 5906 */ 5907 INIT_LIST_HEAD(&device_list); 5908 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) { 5909 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5910 list_add_tail(&tmp_adev->reset_list, &device_list); 5911 if (adev->shutdown) 5912 tmp_adev->shutdown = true; 5913 } 5914 if (!list_is_first(&adev->reset_list, &device_list)) 5915 list_rotate_to_front(&adev->reset_list, &device_list); 5916 device_list_handle = &device_list; 5917 } else { 5918 list_add_tail(&adev->reset_list, &device_list); 5919 device_list_handle = &device_list; 5920 } 5921 5922 if (!amdgpu_sriov_vf(adev)) { 5923 r = amdgpu_device_health_check(device_list_handle); 5924 if (r) 5925 goto end_reset; 5926 } 5927 5928 /* We need to lock reset domain only once both for XGMI and single device */ 5929 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5930 reset_list); 5931 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5932 5933 /* block all schedulers and reset given job's ring */ 5934 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5935 5936 amdgpu_device_set_mp1_state(tmp_adev); 5937 5938 /* 5939 * Try to put the audio codec into suspend state 5940 * before gpu reset started. 5941 * 5942 * Due to the power domain of the graphics device 5943 * is shared with AZ power domain. Without this, 5944 * we may change the audio hardware from behind 5945 * the audio driver's back. That will trigger 5946 * some audio codec errors. 
5947 */ 5948 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5949 audio_suspended = true; 5950 5951 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5952 5953 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5954 5955 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context); 5956 5957 /* 5958 * Mark these ASICs to be reset as untracked first 5959 * And add them back after reset completed 5960 */ 5961 amdgpu_unregister_gpu_instance(tmp_adev); 5962 5963 drm_client_dev_suspend(adev_to_drm(tmp_adev), false); 5964 5965 /* disable ras on ALL IPs */ 5966 if (!need_emergency_restart && 5967 amdgpu_device_ip_need_full_reset(tmp_adev)) 5968 amdgpu_ras_suspend(tmp_adev); 5969 5970 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5971 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5972 5973 if (!amdgpu_ring_sched_ready(ring)) 5974 continue; 5975 5976 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5977 5978 if (need_emergency_restart) 5979 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5980 } 5981 atomic_inc(&tmp_adev->gpu_reset_counter); 5982 } 5983 5984 if (need_emergency_restart) 5985 goto skip_sched_resume; 5986 5987 /* 5988 * Must check guilty signal here since after this point all old 5989 * HW fences are force signaled. 5990 * 5991 * job->base holds a reference to parent fence 5992 */ 5993 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5994 job_signaled = true; 5995 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5996 goto skip_hw_reset; 5997 } 5998 5999 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 6000 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6001 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 6002 /*TODO Should we stop ?*/ 6003 if (r) { 6004 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 6005 r, adev_to_drm(tmp_adev)->unique); 6006 tmp_adev->asic_reset_res = r; 6007 } 6008 } 6009 6010 /* Actual ASIC resets if needed.*/ 6011 /* Host driver will handle XGMI hive reset for SRIOV */ 6012 if (amdgpu_sriov_vf(adev)) { 6013 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { 6014 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); 6015 amdgpu_ras_set_fed(adev, true); 6016 set_bit(AMDGPU_HOST_FLR, &reset_context->flags); 6017 } 6018 6019 r = amdgpu_device_reset_sriov(adev, reset_context); 6020 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) { 6021 amdgpu_virt_release_full_gpu(adev, true); 6022 goto retry; 6023 } 6024 if (r) 6025 adev->asic_reset_res = r; 6026 } else { 6027 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 6028 if (r && r == -EAGAIN) 6029 goto retry; 6030 } 6031 6032 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6033 /* 6034 * Drop any pending non scheduler resets queued before reset is done. 6035 * Any reset scheduled after this point would be valid. Scheduler resets 6036 * were already dropped during drm_sched_stop and no new ones can come 6037 * in before drm_sched_start. 
6038 */ 6039 amdgpu_device_stop_pending_resets(tmp_adev); 6040 } 6041 6042 skip_hw_reset: 6043 6044 /* Post ASIC reset for all devs .*/ 6045 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6046 6047 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6048 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6049 6050 if (!amdgpu_ring_sched_ready(ring)) 6051 continue; 6052 6053 drm_sched_start(&ring->sched, 0); 6054 } 6055 6056 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 6057 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 6058 6059 if (tmp_adev->asic_reset_res) 6060 r = tmp_adev->asic_reset_res; 6061 6062 tmp_adev->asic_reset_res = 0; 6063 6064 if (r) { 6065 /* bad news, how to tell it to userspace ? 6066 * for ras error, we should report GPU bad status instead of 6067 * reset failure 6068 */ 6069 if (reset_context->src != AMDGPU_RESET_SRC_RAS || 6070 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) 6071 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", 6072 atomic_read(&tmp_adev->gpu_reset_counter)); 6073 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 6074 } else { 6075 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 6076 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 6077 DRM_WARN("smart shift update failed\n"); 6078 } 6079 } 6080 6081 skip_sched_resume: 6082 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6083 /* unlock kfd: SRIOV would do it separately */ 6084 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 6085 amdgpu_amdkfd_post_reset(tmp_adev); 6086 6087 /* kfd_post_reset will do nothing if kfd device is not initialized, 6088 * need to bring up kfd here if it's not be initialized before 6089 */ 6090 if (!adev->kfd.init_complete) 6091 amdgpu_amdkfd_device_init(adev); 6092 6093 if (audio_suspended) 6094 amdgpu_device_resume_display_audio(tmp_adev); 6095 6096 amdgpu_device_unset_mp1_state(tmp_adev); 6097 6098 amdgpu_ras_set_error_query_ready(tmp_adev, true); 6099 } 6100 6101 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 6102 reset_list); 6103 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 6104 6105 end_reset: 6106 if (hive) { 6107 mutex_unlock(&hive->hive_lock); 6108 amdgpu_put_xgmi_hive(hive); 6109 } 6110 6111 if (r) 6112 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 6113 6114 atomic_set(&adev->reset_domain->reset_res, r); 6115 return r; 6116 } 6117 6118 /** 6119 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner 6120 * 6121 * @adev: amdgpu_device pointer 6122 * @speed: pointer to the speed of the link 6123 * @width: pointer to the width of the link 6124 * 6125 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6126 * first physical partner to an AMD dGPU. 6127 * This will exclude any virtual switches and links. 
 */
static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
					    enum pci_bus_speed *speed,
					    enum pcie_link_width *width)
{
	struct pci_dev *parent = adev->pdev;

	if (!speed || !width)
		return;

	*speed = PCI_SPEED_UNKNOWN;
	*width = PCIE_LNK_WIDTH_UNKNOWN;

	if (amdgpu_device_pcie_dynamic_switching_supported(adev)) {
		while ((parent = pci_upstream_bridge(parent))) {
			/* skip upstream/downstream switches internal to dGPU */
			if (parent->vendor == PCI_VENDOR_ID_ATI)
				continue;
			*speed = pcie_get_speed_cap(parent);
			*width = pcie_get_width_cap(parent);
			break;
		}
	} else {
		/* use the current speeds rather than max if switching is not supported */
		pcie_bandwidth_available(adev->pdev, NULL, speed, width);
	}
}

/**
 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIE capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIE config space may not be available.
 */
static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
{
	struct pci_dev *pdev;
	enum pci_bus_speed speed_cap, platform_speed_cap;
	enum pcie_link_width platform_link_width;

	if (amdgpu_pcie_gen_cap)
		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;

	if (amdgpu_pcie_lane_cap)
		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;

	/* covers APUs as well */
	if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
		if (adev->pm.pcie_gen_mask == 0)
			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
		if (adev->pm.pcie_mlw_mask == 0)
			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
		return;
	}

	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
		return;

	amdgpu_device_partner_bandwidth(adev, &platform_speed_cap,
					&platform_link_width);

	if (adev->pm.pcie_gen_mask == 0) {
		/* asic caps */
		pdev = adev->pdev;
		speed_cap = pcie_get_speed_cap(pdev);
		if (speed_cap == PCI_SPEED_UNKNOWN) {
			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
						   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
		} else {
			if (speed_cap == PCIE_SPEED_32_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
			else if (speed_cap == PCIE_SPEED_16_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
			else if (speed_cap == PCIE_SPEED_8_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
			else if (speed_cap == PCIE_SPEED_5_0GT)
				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
							   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
			else
				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
		}
		/* platform caps */
		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6225 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6226 } else { 6227 if (platform_speed_cap == PCIE_SPEED_32_0GT) 6228 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6229 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6230 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6231 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6232 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 6233 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 6234 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6235 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6236 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6237 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 6238 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 6239 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6240 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6241 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 6242 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 6243 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6244 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6245 else 6246 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 6247 6248 } 6249 } 6250 if (adev->pm.pcie_mlw_mask == 0) { 6251 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6252 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 6253 } else { 6254 switch (platform_link_width) { 6255 case PCIE_LNK_X32: 6256 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 6257 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6258 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6259 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6260 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6261 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6262 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6263 break; 6264 case PCIE_LNK_X16: 6265 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6266 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6267 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6268 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6269 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6270 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6271 break; 6272 case PCIE_LNK_X12: 6273 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6274 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6275 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6276 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6277 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6278 break; 6279 case PCIE_LNK_X8: 6280 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6281 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6282 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6283 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6284 break; 6285 case PCIE_LNK_X4: 6286 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6287 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6288 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6289 break; 6290 case PCIE_LNK_X2: 6291 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6292 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6293 break; 6294 case PCIE_LNK_X1: 6295 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 6296 break; 6297 default: 6298 break; 6299 } 6300 } 6301 } 6302 } 6303 6304 /** 6305 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 6306 * 6307 * @adev: amdgpu_device pointer 6308 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 6309 * 6310 * Return true if @peer_adev can access (DMA) @adev through the PCIe 6311 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 6312 * @peer_adev. 
6313 */ 6314 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 6315 struct amdgpu_device *peer_adev) 6316 { 6317 #ifdef CONFIG_HSA_AMD_P2P 6318 bool p2p_access = 6319 !adev->gmc.xgmi.connected_to_cpu && 6320 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 6321 if (!p2p_access) 6322 dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n", 6323 pci_name(peer_adev->pdev)); 6324 6325 bool is_large_bar = adev->gmc.visible_vram_size && 6326 adev->gmc.real_vram_size == adev->gmc.visible_vram_size; 6327 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev); 6328 6329 if (!p2p_addressable) { 6330 uint64_t address_mask = peer_adev->dev->dma_mask ? 6331 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 6332 resource_size_t aper_limit = 6333 adev->gmc.aper_base + adev->gmc.aper_size - 1; 6334 6335 p2p_addressable = !(adev->gmc.aper_base & address_mask || 6336 aper_limit & address_mask); 6337 } 6338 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable; 6339 #else 6340 return false; 6341 #endif 6342 } 6343 6344 int amdgpu_device_baco_enter(struct drm_device *dev) 6345 { 6346 struct amdgpu_device *adev = drm_to_adev(dev); 6347 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6348 6349 if (!amdgpu_device_supports_baco(dev)) 6350 return -ENOTSUPP; 6351 6352 if (ras && adev->ras_enabled && 6353 adev->nbio.funcs->enable_doorbell_interrupt) 6354 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 6355 6356 return amdgpu_dpm_baco_enter(adev); 6357 } 6358 6359 int amdgpu_device_baco_exit(struct drm_device *dev) 6360 { 6361 struct amdgpu_device *adev = drm_to_adev(dev); 6362 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6363 int ret = 0; 6364 6365 if (!amdgpu_device_supports_baco(dev)) 6366 return -ENOTSUPP; 6367 6368 ret = amdgpu_dpm_baco_exit(adev); 6369 if (ret) 6370 return ret; 6371 6372 if (ras && adev->ras_enabled && 6373 adev->nbio.funcs->enable_doorbell_interrupt) 6374 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 6375 6376 if (amdgpu_passthrough(adev) && adev->nbio.funcs && 6377 adev->nbio.funcs->clear_doorbell_interrupt) 6378 adev->nbio.funcs->clear_doorbell_interrupt(adev); 6379 6380 return 0; 6381 } 6382 6383 /** 6384 * amdgpu_pci_error_detected - Called when a PCI error is detected. 6385 * @pdev: PCI device struct 6386 * @state: PCI channel state 6387 * 6388 * Description: Called when a PCI error is detected. 6389 * 6390 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
6391 */ 6392 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 6393 { 6394 struct drm_device *dev = pci_get_drvdata(pdev); 6395 struct amdgpu_device *adev = drm_to_adev(dev); 6396 int i; 6397 6398 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 6399 6400 if (adev->gmc.xgmi.num_physical_nodes > 1) { 6401 DRM_WARN("No support for XGMI hive yet..."); 6402 return PCI_ERS_RESULT_DISCONNECT; 6403 } 6404 6405 adev->pci_channel_state = state; 6406 6407 switch (state) { 6408 case pci_channel_io_normal: 6409 return PCI_ERS_RESULT_CAN_RECOVER; 6410 /* Fatal error, prepare for slot reset */ 6411 case pci_channel_io_frozen: 6412 /* 6413 * Locking adev->reset_domain->sem will prevent any external access 6414 * to GPU during PCI error recovery 6415 */ 6416 amdgpu_device_lock_reset_domain(adev->reset_domain); 6417 amdgpu_device_set_mp1_state(adev); 6418 6419 /* 6420 * Block any work scheduling as we do for regular GPU reset 6421 * for the duration of the recovery 6422 */ 6423 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6424 struct amdgpu_ring *ring = adev->rings[i]; 6425 6426 if (!amdgpu_ring_sched_ready(ring)) 6427 continue; 6428 6429 drm_sched_stop(&ring->sched, NULL); 6430 } 6431 atomic_inc(&adev->gpu_reset_counter); 6432 return PCI_ERS_RESULT_NEED_RESET; 6433 case pci_channel_io_perm_failure: 6434 /* Permanent error, prepare for device removal */ 6435 return PCI_ERS_RESULT_DISCONNECT; 6436 } 6437 6438 return PCI_ERS_RESULT_NEED_RESET; 6439 } 6440 6441 /** 6442 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 6443 * @pdev: pointer to PCI device 6444 */ 6445 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 6446 { 6447 6448 DRM_INFO("PCI error: mmio enabled callback!!\n"); 6449 6450 /* TODO - dump whatever for debugging purposes */ 6451 6452 /* This called only if amdgpu_pci_error_detected returns 6453 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 6454 * works, no need to reset slot. 6455 */ 6456 6457 return PCI_ERS_RESULT_RECOVERED; 6458 } 6459 6460 /** 6461 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 6462 * @pdev: PCI device struct 6463 * 6464 * Description: This routine is called by the pci error recovery 6465 * code after the PCI slot has been reset, just before we 6466 * should resume normal operations. 
6467 */ 6468 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 6469 { 6470 struct drm_device *dev = pci_get_drvdata(pdev); 6471 struct amdgpu_device *adev = drm_to_adev(dev); 6472 int r, i; 6473 struct amdgpu_reset_context reset_context; 6474 u32 memsize; 6475 struct list_head device_list; 6476 6477 /* PCI error slot reset should be skipped During RAS recovery */ 6478 if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 6479 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && 6480 amdgpu_ras_in_recovery(adev)) 6481 return PCI_ERS_RESULT_RECOVERED; 6482 6483 DRM_INFO("PCI error: slot reset callback!!\n"); 6484 6485 memset(&reset_context, 0, sizeof(reset_context)); 6486 6487 INIT_LIST_HEAD(&device_list); 6488 list_add_tail(&adev->reset_list, &device_list); 6489 6490 /* wait for asic to come out of reset */ 6491 msleep(500); 6492 6493 /* Restore PCI confspace */ 6494 amdgpu_device_load_pci_state(pdev); 6495 6496 /* confirm ASIC came out of reset */ 6497 for (i = 0; i < adev->usec_timeout; i++) { 6498 memsize = amdgpu_asic_get_config_memsize(adev); 6499 6500 if (memsize != 0xffffffff) 6501 break; 6502 udelay(1); 6503 } 6504 if (memsize == 0xffffffff) { 6505 r = -ETIME; 6506 goto out; 6507 } 6508 6509 reset_context.method = AMD_RESET_METHOD_NONE; 6510 reset_context.reset_req_dev = adev; 6511 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 6512 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 6513 6514 adev->no_hw_access = true; 6515 r = amdgpu_device_pre_asic_reset(adev, &reset_context); 6516 adev->no_hw_access = false; 6517 if (r) 6518 goto out; 6519 6520 r = amdgpu_do_asic_reset(&device_list, &reset_context); 6521 6522 out: 6523 if (!r) { 6524 if (amdgpu_device_cache_pci_state(adev->pdev)) 6525 pci_restore_state(adev->pdev); 6526 6527 DRM_INFO("PCIe error recovery succeeded\n"); 6528 } else { 6529 DRM_ERROR("PCIe error recovery failed, err:%d", r); 6530 amdgpu_device_unset_mp1_state(adev); 6531 amdgpu_device_unlock_reset_domain(adev->reset_domain); 6532 } 6533 6534 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 6535 } 6536 6537 /** 6538 * amdgpu_pci_resume() - resume normal ops after PCI reset 6539 * @pdev: pointer to PCI device 6540 * 6541 * Called when the error recovery driver tells us that its 6542 * OK to resume normal operation. 
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int i;

	DRM_INFO("PCI error: resume callback!!\n");

	/* Only continue execution for the case of pci_channel_io_frozen */
	if (adev->pci_channel_state != pci_channel_io_frozen)
		return;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!amdgpu_ring_sched_ready(ring))
			continue;

		drm_sched_start(&ring->sched, 0);
	}

	amdgpu_device_unset_mp1_state(adev);
	amdgpu_device_unlock_reset_domain(adev->reset_domain);
}

bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (amdgpu_sriov_vf(adev))
		return false;

	r = pci_save_state(pdev);
	if (!r) {
		kfree(adev->pci_state);

		adev->pci_state = pci_store_saved_state(pdev);

		if (!adev->pci_state) {
			DRM_ERROR("Failed to store PCI saved state");
			return false;
		}
	} else {
		DRM_WARN("Failed to save PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		DRM_WARN("Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}

int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}
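
/*
 * Illustrative sketch only (not part of the driver): a typical pattern is to
 * flush the HDP cache after the CPU has written GPU-visible memory through
 * the BAR, and to invalidate it before the CPU reads data the GPU produced.
 * The destination pointer, data and size below are made-up placeholders.
 *
 *	memcpy_toio(vram_cpu_ptr, data, size);     // hypothetical CPU write
 *	amdgpu_device_flush_hdp(adev, NULL);       // make the write visible to the GPU
 *
 *	amdgpu_device_invalidate_hdp(adev, NULL);  // before the CPU reads GPU-written data
 */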

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. This helps preserve the error context when an error occurs.
 * Compared to a simple hang, the system will stay stable at least for SSH
 * access. Then it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc),
 *    clears all CPU mappings to the device, disallows remappings through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	amdgpu_xcp_dev_unplug(adev);
	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				 u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				  u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
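
/*
 * Illustrative sketch only (not part of the driver): the indexed accessors
 * above are typically combined into a read-modify-write sequence. The helper
 * name, register offset and mask below are made-up placeholders; note that
 * each accessor takes the index/data lock on its own, so the sequence as a
 * whole is not atomic.
 *
 *	static void example_pcie_port_update(struct amdgpu_device *adev,
 *					     u32 reg, u32 mask, u32 value)
 *	{
 *		u32 tmp = amdgpu_device_pcie_port_rreg(adev, reg);
 *
 *		tmp = (tmp & ~mask) | (value & mask);
 *		amdgpu_device_pcie_port_wreg(adev, reg, tmp);
 *	}
 */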

/**
 * amdgpu_device_get_gang - return a reference to the current gang
 * @adev: amdgpu_device pointer
 *
 * Returns: A new reference to the current gang leader.
 */
struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev)
{
	struct dma_fence *fence;

	rcu_read_lock();
	fence = dma_fence_get_rcu_safe(&adev->gang_submit);
	rcu_read_unlock();
	return fence;
}

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	do {
		dma_fence_put(old);
		old = amdgpu_device_get_gang(adev);
		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old))
			return old;

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	dma_fence_put(old);
	return NULL;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
				    uint32_t inst, uint32_t reg_addr, char reg_name[],
				    uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				 inst, reg_name, (uint32_t)expected_value,
				 (uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}

ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
{
	ssize_t size = 0;

	if (!ring || !ring->adev)
		return size;

	if (amdgpu_device_should_recover_gpu(ring->adev))
		size |= AMDGPU_RESET_TYPE_FULL;

	if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
	    !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery)
		size |= AMDGPU_RESET_TYPE_SOFT_RESET;

	return size;
}

ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
{
	ssize_t size = 0;

	if (supported_reset == 0) {
		size += sysfs_emit_at(buf, size, "unsupported");
		size += sysfs_emit_at(buf, size, "\n");
		return size;
	}

	if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
		size += sysfs_emit_at(buf, size, "soft ");

	if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
		size += sysfs_emit_at(buf, size, "queue ");

	if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)
		size += sysfs_emit_at(buf, size, "pipe ");

	if (supported_reset & AMDGPU_RESET_TYPE_FULL)
		size += sysfs_emit_at(buf, size, "full ");

	size += sysfs_emit_at(buf, size, "\n");
	return size;
}
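
/*
 * Illustrative sketch only (not part of the driver): a hypothetical sysfs
 * "show" callback demonstrating how an IP block would typically report its
 * supported reset types through amdgpu_show_reset_mask(). The callback name
 * and the mask below are made-up placeholders.
 *
 *	static ssize_t example_reset_mask_show(struct device *dev,
 *					       struct device_attribute *attr,
 *					       char *buf)
 *	{
 *		uint32_t supported = AMDGPU_RESET_TYPE_SOFT_RESET |
 *				     AMDGPU_RESET_TYPE_FULL;
 *
 *		return amdgpu_show_reset_mask(buf, supported);
 *	}
 */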