1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 29 #include <linux/aperture.h> 30 #include <linux/power_supply.h> 31 #include <linux/kthread.h> 32 #include <linux/module.h> 33 #include <linux/console.h> 34 #include <linux/slab.h> 35 #include <linux/iommu.h> 36 #include <linux/pci.h> 37 #include <linux/pci-p2pdma.h> 38 #include <linux/apple-gmux.h> 39 40 #include <drm/drm_atomic_helper.h> 41 #include <drm/drm_client_event.h> 42 #include <drm/drm_crtc_helper.h> 43 #include <drm/drm_probe_helper.h> 44 #include <drm/amdgpu_drm.h> 45 #include <linux/device.h> 46 #include <linux/vgaarb.h> 47 #include <linux/vga_switcheroo.h> 48 #include <linux/efi.h> 49 #include "amdgpu.h" 50 #include "amdgpu_trace.h" 51 #include "amdgpu_i2c.h" 52 #include "atom.h" 53 #include "amdgpu_atombios.h" 54 #include "amdgpu_atomfirmware.h" 55 #include "amd_pcie.h" 56 #ifdef CONFIG_DRM_AMDGPU_SI 57 #include "si.h" 58 #endif 59 #ifdef CONFIG_DRM_AMDGPU_CIK 60 #include "cik.h" 61 #endif 62 #include "vi.h" 63 #include "soc15.h" 64 #include "nv.h" 65 #include "bif/bif_4_1_d.h" 66 #include <linux/firmware.h> 67 #include "amdgpu_vf_error.h" 68 69 #include "amdgpu_amdkfd.h" 70 #include "amdgpu_pm.h" 71 72 #include "amdgpu_xgmi.h" 73 #include "amdgpu_ras.h" 74 #include "amdgpu_pmu.h" 75 #include "amdgpu_fru_eeprom.h" 76 #include "amdgpu_reset.h" 77 #include "amdgpu_virt.h" 78 #include "amdgpu_dev_coredump.h" 79 80 #include <linux/suspend.h> 81 #include <drm/task_barrier.h> 82 #include <linux/pm_runtime.h> 83 84 #include <drm/drm_drv.h> 85 86 #if IS_ENABLED(CONFIG_X86) 87 #include <asm/intel-family.h> 88 #endif 89 90 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 91 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 92 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 93 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 94 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 95 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 96 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 97 98 #define AMDGPU_RESUME_MS 2000 99 #define AMDGPU_MAX_RETRY_LIMIT 2 100 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 101 #define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2) 102 #define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2) 103 #define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2) 104 105 #define 
AMDGPU_VBIOS_SKIP (1U << 0) 106 #define AMDGPU_VBIOS_OPTIONAL (1U << 1) 107 108 static const struct drm_driver amdgpu_kms_driver; 109 110 const char *amdgpu_asic_name[] = { 111 "TAHITI", 112 "PITCAIRN", 113 "VERDE", 114 "OLAND", 115 "HAINAN", 116 "BONAIRE", 117 "KAVERI", 118 "KABINI", 119 "HAWAII", 120 "MULLINS", 121 "TOPAZ", 122 "TONGA", 123 "FIJI", 124 "CARRIZO", 125 "STONEY", 126 "POLARIS10", 127 "POLARIS11", 128 "POLARIS12", 129 "VEGAM", 130 "VEGA10", 131 "VEGA12", 132 "VEGA20", 133 "RAVEN", 134 "ARCTURUS", 135 "RENOIR", 136 "ALDEBARAN", 137 "NAVI10", 138 "CYAN_SKILLFISH", 139 "NAVI14", 140 "NAVI12", 141 "SIENNA_CICHLID", 142 "NAVY_FLOUNDER", 143 "VANGOGH", 144 "DIMGREY_CAVEFISH", 145 "BEIGE_GOBY", 146 "YELLOW_CARP", 147 "IP DISCOVERY", 148 "LAST", 149 }; 150 151 #define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0) 152 /* 153 * Default init level where all blocks are expected to be initialized. This is 154 * the level of initialization expected by default and also after a full reset 155 * of the device. 156 */ 157 struct amdgpu_init_level amdgpu_init_default = { 158 .level = AMDGPU_INIT_LEVEL_DEFAULT, 159 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL, 160 }; 161 162 struct amdgpu_init_level amdgpu_init_recovery = { 163 .level = AMDGPU_INIT_LEVEL_RESET_RECOVERY, 164 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL, 165 }; 166 167 /* 168 * Minimal blocks needed to be initialized before a XGMI hive can be reset. This 169 * is used for cases like reset on initialization where the entire hive needs to 170 * be reset before first use. 171 */ 172 struct amdgpu_init_level amdgpu_init_minimal_xgmi = { 173 .level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI, 174 .hwini_ip_block_mask = 175 BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) | 176 BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) | 177 BIT(AMD_IP_BLOCK_TYPE_PSP) 178 }; 179 180 static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev, 181 enum amd_ip_block_type block) 182 { 183 return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0; 184 } 185 186 void amdgpu_set_init_level(struct amdgpu_device *adev, 187 enum amdgpu_init_lvl_id lvl) 188 { 189 switch (lvl) { 190 case AMDGPU_INIT_LEVEL_MINIMAL_XGMI: 191 adev->init_lvl = &amdgpu_init_minimal_xgmi; 192 break; 193 case AMDGPU_INIT_LEVEL_RESET_RECOVERY: 194 adev->init_lvl = &amdgpu_init_recovery; 195 break; 196 case AMDGPU_INIT_LEVEL_DEFAULT: 197 fallthrough; 198 default: 199 adev->init_lvl = &amdgpu_init_default; 200 break; 201 } 202 } 203 204 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev); 205 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 206 void *data); 207 208 /** 209 * DOC: pcie_replay_count 210 * 211 * The amdgpu driver provides a sysfs API for reporting the total number 212 * of PCIe replays (NAKs). 213 * The file pcie_replay_count is used for this and returns the total 214 * number of replays as a sum of the NAKs generated and NAKs received. 
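 *
 * As an illustrative example (the exact path depends on the card index on a
 * given system), the count can be read from the device's sysfs directory,
 * e.g. /sys/class/drm/card0/device/pcie_replay_count.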
215 */ 216 217 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 218 struct device_attribute *attr, char *buf) 219 { 220 struct drm_device *ddev = dev_get_drvdata(dev); 221 struct amdgpu_device *adev = drm_to_adev(ddev); 222 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 223 224 return sysfs_emit(buf, "%llu\n", cnt); 225 } 226 227 static DEVICE_ATTR(pcie_replay_count, 0444, 228 amdgpu_device_get_pcie_replay_count, NULL); 229 230 static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj, 231 struct bin_attribute *attr, char *buf, 232 loff_t ppos, size_t count) 233 { 234 struct device *dev = kobj_to_dev(kobj); 235 struct drm_device *ddev = dev_get_drvdata(dev); 236 struct amdgpu_device *adev = drm_to_adev(ddev); 237 ssize_t bytes_read; 238 239 switch (ppos) { 240 case AMDGPU_SYS_REG_STATE_XGMI: 241 bytes_read = amdgpu_asic_get_reg_state( 242 adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count); 243 break; 244 case AMDGPU_SYS_REG_STATE_WAFL: 245 bytes_read = amdgpu_asic_get_reg_state( 246 adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count); 247 break; 248 case AMDGPU_SYS_REG_STATE_PCIE: 249 bytes_read = amdgpu_asic_get_reg_state( 250 adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count); 251 break; 252 case AMDGPU_SYS_REG_STATE_USR: 253 bytes_read = amdgpu_asic_get_reg_state( 254 adev, AMDGPU_REG_STATE_TYPE_USR, buf, count); 255 break; 256 case AMDGPU_SYS_REG_STATE_USR_1: 257 bytes_read = amdgpu_asic_get_reg_state( 258 adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count); 259 break; 260 default: 261 return -EINVAL; 262 } 263 264 return bytes_read; 265 } 266 267 BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL, 268 AMDGPU_SYS_REG_STATE_END); 269 270 int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev) 271 { 272 int ret; 273 274 if (!amdgpu_asic_get_reg_state_supported(adev)) 275 return 0; 276 277 ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 278 279 return ret; 280 } 281 282 void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev) 283 { 284 if (!amdgpu_asic_get_reg_state_supported(adev)) 285 return; 286 sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 287 } 288 289 int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block) 290 { 291 int r; 292 293 if (ip_block->version->funcs->suspend) { 294 r = ip_block->version->funcs->suspend(ip_block); 295 if (r) { 296 dev_err(ip_block->adev->dev, 297 "suspend of IP block <%s> failed %d\n", 298 ip_block->version->funcs->name, r); 299 return r; 300 } 301 } 302 303 ip_block->status.hw = false; 304 return 0; 305 } 306 307 int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block) 308 { 309 int r; 310 311 if (ip_block->version->funcs->resume) { 312 r = ip_block->version->funcs->resume(ip_block); 313 if (r) { 314 dev_err(ip_block->adev->dev, 315 "resume of IP block <%s> failed %d\n", 316 ip_block->version->funcs->name, r); 317 return r; 318 } 319 } 320 321 ip_block->status.hw = true; 322 return 0; 323 } 324 325 /** 326 * DOC: board_info 327 * 328 * The amdgpu driver provides a sysfs API for giving board related information. 
329 * It provides the form factor information in the format 330 * 331 * type : form factor 332 * 333 * Possible form factor values 334 * 335 * - "cem" - PCIE CEM card 336 * - "oam" - Open Compute Accelerator Module 337 * - "unknown" - Not known 338 * 339 */ 340 341 static ssize_t amdgpu_device_get_board_info(struct device *dev, 342 struct device_attribute *attr, 343 char *buf) 344 { 345 struct drm_device *ddev = dev_get_drvdata(dev); 346 struct amdgpu_device *adev = drm_to_adev(ddev); 347 enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM; 348 const char *pkg; 349 350 if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type) 351 pkg_type = adev->smuio.funcs->get_pkg_type(adev); 352 353 switch (pkg_type) { 354 case AMDGPU_PKG_TYPE_CEM: 355 pkg = "cem"; 356 break; 357 case AMDGPU_PKG_TYPE_OAM: 358 pkg = "oam"; 359 break; 360 default: 361 pkg = "unknown"; 362 break; 363 } 364 365 return sysfs_emit(buf, "%s : %s\n", "type", pkg); 366 } 367 368 static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL); 369 370 static struct attribute *amdgpu_board_attrs[] = { 371 &dev_attr_board_info.attr, 372 NULL, 373 }; 374 375 static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj, 376 struct attribute *attr, int n) 377 { 378 struct device *dev = kobj_to_dev(kobj); 379 struct drm_device *ddev = dev_get_drvdata(dev); 380 struct amdgpu_device *adev = drm_to_adev(ddev); 381 382 if (adev->flags & AMD_IS_APU) 383 return 0; 384 385 return attr->mode; 386 } 387 388 static const struct attribute_group amdgpu_board_attrs_group = { 389 .attrs = amdgpu_board_attrs, 390 .is_visible = amdgpu_board_attrs_is_visible 391 }; 392 393 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 394 395 396 /** 397 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 398 * 399 * @dev: drm_device pointer 400 * 401 * Returns true if the device is a dGPU with ATPX power control, 402 * otherwise return false. 403 */ 404 bool amdgpu_device_supports_px(struct drm_device *dev) 405 { 406 struct amdgpu_device *adev = drm_to_adev(dev); 407 408 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 409 return true; 410 return false; 411 } 412 413 /** 414 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 415 * 416 * @dev: drm_device pointer 417 * 418 * Returns true if the device is a dGPU with ACPI power control, 419 * otherwise return false. 420 */ 421 bool amdgpu_device_supports_boco(struct drm_device *dev) 422 { 423 struct amdgpu_device *adev = drm_to_adev(dev); 424 425 if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE)) 426 return false; 427 428 if (adev->has_pr3 || 429 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 430 return true; 431 return false; 432 } 433 434 /** 435 * amdgpu_device_supports_baco - Does the device support BACO 436 * 437 * @dev: drm_device pointer 438 * 439 * Return: 440 * 1 if the device supports BACO; 441 * 3 if the device supports MACO (only works if BACO is supported) 442 * otherwise return 0. 
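 *
 * The value is treated as a bitmask by callers such as
 * amdgpu_device_detect_runtime_pm_mode() below; a hypothetical caller would
 * check it along the lines of:
 *
 *	bamaco_support = amdgpu_device_supports_baco(dev);
 *	if (bamaco_support & MACO_SUPPORT)
 *		...BAMACO can be used...
 *	else if (bamaco_support & BACO_SUPPORT)
 *		...plain BACO can be used...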
 */
int amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
{
	struct drm_device *dev;
	int bamaco_support;

	dev = adev_to_drm(adev);

	adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
	bamaco_support = amdgpu_device_supports_baco(dev);

	switch (amdgpu_runtime_pm) {
	case 2:
		if (bamaco_support & MACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
			dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
		} else if (bamaco_support == BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Requested mode BAMACO not available, falling back to BACO\n");
		}
		break;
	case 1:
		if (bamaco_support & BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Forcing BACO for runtime pm\n");
		}
		break;
	case -1:
	case -2:
		if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
			dev_info(adev->dev, "Using ATPX for runtime pm\n");
		} else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
			dev_info(adev->dev, "Using BOCO for runtime pm\n");
		} else {
			if (!bamaco_support)
				goto no_runtime_pm;

			switch (adev->asic_type) {
			case CHIP_VEGA20:
			case CHIP_ARCTURUS:
				/* BACO is not supported on vega20 and arcturus */
				break;
			case CHIP_VEGA10:
				/* enable BACO as runpm mode if noretry=0 */
				if (!adev->gmc.noretry)
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			default:
				/* enable BACO as runpm mode on CI+ */
				adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			}

			if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
				if (bamaco_support & MACO_SUPPORT) {
					adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
					dev_info(adev->dev, "Using BAMACO for runtime pm\n");
				} else {
					dev_info(adev->dev, "Using BACO for runtime pm\n");
				}
			}
		}
		break;
	case 0:
		dev_info(adev->dev, "runtime pm is manually disabled\n");
		break;
	default:
		break;
	}

no_runtime_pm:
	if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
		dev_info(adev->dev, "Runtime PM not available\n");
}
/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
534 */ 535 bool amdgpu_device_supports_smart_shift(struct drm_device *dev) 536 { 537 return (amdgpu_device_supports_boco(dev) && 538 amdgpu_acpi_is_power_shift_control_supported()); 539 } 540 541 /* 542 * VRAM access helper functions 543 */ 544 545 /** 546 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA 547 * 548 * @adev: amdgpu_device pointer 549 * @pos: offset of the buffer in vram 550 * @buf: virtual address of the buffer in system memory 551 * @size: read/write size, sizeof(@buf) must > @size 552 * @write: true - write to vram, otherwise - read from vram 553 */ 554 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos, 555 void *buf, size_t size, bool write) 556 { 557 unsigned long flags; 558 uint32_t hi = ~0, tmp = 0; 559 uint32_t *data = buf; 560 uint64_t last; 561 int idx; 562 563 if (!drm_dev_enter(adev_to_drm(adev), &idx)) 564 return; 565 566 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4)); 567 568 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 569 for (last = pos + size; pos < last; pos += 4) { 570 tmp = pos >> 31; 571 572 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000); 573 if (tmp != hi) { 574 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp); 575 hi = tmp; 576 } 577 if (write) 578 WREG32_NO_KIQ(mmMM_DATA, *data++); 579 else 580 *data++ = RREG32_NO_KIQ(mmMM_DATA); 581 } 582 583 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 584 drm_dev_exit(idx); 585 } 586 587 /** 588 * amdgpu_device_aper_access - access vram by vram aperture 589 * 590 * @adev: amdgpu_device pointer 591 * @pos: offset of the buffer in vram 592 * @buf: virtual address of the buffer in system memory 593 * @size: read/write size, sizeof(@buf) must > @size 594 * @write: true - write to vram, otherwise - read from vram 595 * 596 * The return value means how many bytes have been transferred. 
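 *
 * A short (or zero byte) transfer is not an error: callers such as
 * amdgpu_device_vram_access() below fall back to amdgpu_device_mm_access()
 * for any part of the request that lies outside the CPU visible VRAM
 * aperture.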
597 */ 598 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos, 599 void *buf, size_t size, bool write) 600 { 601 #ifdef CONFIG_64BIT 602 void __iomem *addr; 603 size_t count = 0; 604 uint64_t last; 605 606 if (!adev->mman.aper_base_kaddr) 607 return 0; 608 609 last = min(pos + size, adev->gmc.visible_vram_size); 610 if (last > pos) { 611 addr = adev->mman.aper_base_kaddr + pos; 612 count = last - pos; 613 614 if (write) { 615 memcpy_toio(addr, buf, count); 616 /* Make sure HDP write cache flush happens without any reordering 617 * after the system memory contents are sent over PCIe device 618 */ 619 mb(); 620 amdgpu_device_flush_hdp(adev, NULL); 621 } else { 622 amdgpu_device_invalidate_hdp(adev, NULL); 623 /* Make sure HDP read cache is invalidated before issuing a read 624 * to the PCIe device 625 */ 626 mb(); 627 memcpy_fromio(buf, addr, count); 628 } 629 630 } 631 632 return count; 633 #else 634 return 0; 635 #endif 636 } 637 638 /** 639 * amdgpu_device_vram_access - read/write a buffer in vram 640 * 641 * @adev: amdgpu_device pointer 642 * @pos: offset of the buffer in vram 643 * @buf: virtual address of the buffer in system memory 644 * @size: read/write size, sizeof(@buf) must > @size 645 * @write: true - write to vram, otherwise - read from vram 646 */ 647 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos, 648 void *buf, size_t size, bool write) 649 { 650 size_t count; 651 652 /* try to using vram apreature to access vram first */ 653 count = amdgpu_device_aper_access(adev, pos, buf, size, write); 654 size -= count; 655 if (size) { 656 /* using MM to access rest vram */ 657 pos += count; 658 buf += count; 659 amdgpu_device_mm_access(adev, pos, buf, size, write); 660 } 661 } 662 663 /* 664 * register access helper functions. 665 */ 666 667 /* Check if hw access should be skipped because of hotplug or device error */ 668 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev) 669 { 670 if (adev->no_hw_access) 671 return true; 672 673 #ifdef CONFIG_LOCKDEP 674 /* 675 * This is a bit complicated to understand, so worth a comment. What we assert 676 * here is that the GPU reset is not running on another thread in parallel. 677 * 678 * For this we trylock the read side of the reset semaphore, if that succeeds 679 * we know that the reset is not running in parallel. 680 * 681 * If the trylock fails we assert that we are either already holding the read 682 * side of the lock or are the reset thread itself and hold the write side of 683 * the lock. 684 */ 685 if (in_task()) { 686 if (down_read_trylock(&adev->reset_domain->sem)) 687 up_read(&adev->reset_domain->sem); 688 else 689 lockdep_assert_held(&adev->reset_domain->sem); 690 } 691 #endif 692 return false; 693 } 694 695 /** 696 * amdgpu_device_rreg - read a memory mapped IO or indirect register 697 * 698 * @adev: amdgpu_device pointer 699 * @reg: dword aligned register offset 700 * @acc_flags: access flags which require special behavior 701 * 702 * Returns the 32 bit value from the offset specified. 
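 *
 * Note: most code does not call this helper directly but reaches it through
 * register access macros such as RREG32() and RREG32_NO_KIQ(), which pass
 * the appropriate @acc_flags.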
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with byte access helper function
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}


/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
				uint32_t reg, uint32_t acc_flags,
				uint32_t xcc_id)
{
	uint32_t ret, rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	return ret;
}

/*
 * MMIO register write with byte access helper function
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
810 */ 811 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) 812 { 813 if (amdgpu_device_skip_hw_access(adev)) 814 return; 815 816 if (offset < adev->rmmio_size) 817 writeb(value, adev->rmmio + offset); 818 else 819 BUG(); 820 } 821 822 /** 823 * amdgpu_device_wreg - write to a memory mapped IO or indirect register 824 * 825 * @adev: amdgpu_device pointer 826 * @reg: dword aligned register offset 827 * @v: 32 bit value to write to the register 828 * @acc_flags: access flags which require special behavior 829 * 830 * Writes the value specified to the offset specified. 831 */ 832 void amdgpu_device_wreg(struct amdgpu_device *adev, 833 uint32_t reg, uint32_t v, 834 uint32_t acc_flags) 835 { 836 if (amdgpu_device_skip_hw_access(adev)) 837 return; 838 839 if ((reg * 4) < adev->rmmio_size) { 840 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 841 amdgpu_sriov_runtime(adev) && 842 down_read_trylock(&adev->reset_domain->sem)) { 843 amdgpu_kiq_wreg(adev, reg, v, 0); 844 up_read(&adev->reset_domain->sem); 845 } else { 846 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 847 } 848 } else { 849 adev->pcie_wreg(adev, reg * 4, v); 850 } 851 852 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 853 } 854 855 /** 856 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 857 * 858 * @adev: amdgpu_device pointer 859 * @reg: mmio/rlc register 860 * @v: value to write 861 * @xcc_id: xcc accelerated compute core id 862 * 863 * this function is invoked only for the debugfs register access 864 */ 865 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 866 uint32_t reg, uint32_t v, 867 uint32_t xcc_id) 868 { 869 if (amdgpu_device_skip_hw_access(adev)) 870 return; 871 872 if (amdgpu_sriov_fullaccess(adev) && 873 adev->gfx.rlc.funcs && 874 adev->gfx.rlc.funcs->is_rlcg_access_range) { 875 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 876 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id); 877 } else if ((reg * 4) >= adev->rmmio_size) { 878 adev->pcie_wreg(adev, reg * 4, v); 879 } else { 880 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 881 } 882 } 883 884 /** 885 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC 886 * 887 * @adev: amdgpu_device pointer 888 * @reg: dword aligned register offset 889 * @v: 32 bit value to write to the register 890 * @acc_flags: access flags which require special behavior 891 * @xcc_id: xcc accelerated compute core id 892 * 893 * Writes the value specified to the offset specified. 
894 */ 895 void amdgpu_device_xcc_wreg(struct amdgpu_device *adev, 896 uint32_t reg, uint32_t v, 897 uint32_t acc_flags, uint32_t xcc_id) 898 { 899 uint32_t rlcg_flag; 900 901 if (amdgpu_device_skip_hw_access(adev)) 902 return; 903 904 if ((reg * 4) < adev->rmmio_size) { 905 if (amdgpu_sriov_vf(adev) && 906 !amdgpu_sriov_runtime(adev) && 907 adev->gfx.rlc.rlcg_reg_access_supported && 908 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, 909 GC_HWIP, true, 910 &rlcg_flag)) { 911 amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id)); 912 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 913 amdgpu_sriov_runtime(adev) && 914 down_read_trylock(&adev->reset_domain->sem)) { 915 amdgpu_kiq_wreg(adev, reg, v, xcc_id); 916 up_read(&adev->reset_domain->sem); 917 } else { 918 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 919 } 920 } else { 921 adev->pcie_wreg(adev, reg * 4, v); 922 } 923 } 924 925 /** 926 * amdgpu_device_indirect_rreg - read an indirect register 927 * 928 * @adev: amdgpu_device pointer 929 * @reg_addr: indirect register address to read from 930 * 931 * Returns the value of indirect register @reg_addr 932 */ 933 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 934 u32 reg_addr) 935 { 936 unsigned long flags, pcie_index, pcie_data; 937 void __iomem *pcie_index_offset; 938 void __iomem *pcie_data_offset; 939 u32 r; 940 941 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 942 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 943 944 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 945 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 946 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 947 948 writel(reg_addr, pcie_index_offset); 949 readl(pcie_index_offset); 950 r = readl(pcie_data_offset); 951 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 952 953 return r; 954 } 955 956 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev, 957 u64 reg_addr) 958 { 959 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 960 u32 r; 961 void __iomem *pcie_index_offset; 962 void __iomem *pcie_index_hi_offset; 963 void __iomem *pcie_data_offset; 964 965 if (unlikely(!adev->nbio.funcs)) { 966 pcie_index = AMDGPU_PCIE_INDEX_FALLBACK; 967 pcie_data = AMDGPU_PCIE_DATA_FALLBACK; 968 } else { 969 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 970 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 971 } 972 973 if (reg_addr >> 32) { 974 if (unlikely(!adev->nbio.funcs)) 975 pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK; 976 else 977 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 978 } else { 979 pcie_index_hi = 0; 980 } 981 982 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 983 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 984 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 985 if (pcie_index_hi != 0) 986 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 987 pcie_index_hi * 4; 988 989 writel(reg_addr, pcie_index_offset); 990 readl(pcie_index_offset); 991 if (pcie_index_hi != 0) { 992 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 993 readl(pcie_index_hi_offset); 994 } 995 r = readl(pcie_data_offset); 996 997 /* clear the high bits */ 998 if (pcie_index_hi != 0) { 999 writel(0, pcie_index_hi_offset); 1000 readl(pcie_index_hi_offset); 1001 } 1002 1003 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1004 1005 return r; 1006 } 1007 1008 /** 1009 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 1010 * 
1011 * @adev: amdgpu_device pointer 1012 * @reg_addr: indirect register address to read from 1013 * 1014 * Returns the value of indirect register @reg_addr 1015 */ 1016 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 1017 u32 reg_addr) 1018 { 1019 unsigned long flags, pcie_index, pcie_data; 1020 void __iomem *pcie_index_offset; 1021 void __iomem *pcie_data_offset; 1022 u64 r; 1023 1024 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1025 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1026 1027 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1028 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1029 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1030 1031 /* read low 32 bits */ 1032 writel(reg_addr, pcie_index_offset); 1033 readl(pcie_index_offset); 1034 r = readl(pcie_data_offset); 1035 /* read high 32 bits */ 1036 writel(reg_addr + 4, pcie_index_offset); 1037 readl(pcie_index_offset); 1038 r |= ((u64)readl(pcie_data_offset) << 32); 1039 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1040 1041 return r; 1042 } 1043 1044 u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev, 1045 u64 reg_addr) 1046 { 1047 unsigned long flags, pcie_index, pcie_data; 1048 unsigned long pcie_index_hi = 0; 1049 void __iomem *pcie_index_offset; 1050 void __iomem *pcie_index_hi_offset; 1051 void __iomem *pcie_data_offset; 1052 u64 r; 1053 1054 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1055 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1056 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1057 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1058 1059 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1060 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1061 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1062 if (pcie_index_hi != 0) 1063 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1064 pcie_index_hi * 4; 1065 1066 /* read low 32 bits */ 1067 writel(reg_addr, pcie_index_offset); 1068 readl(pcie_index_offset); 1069 if (pcie_index_hi != 0) { 1070 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1071 readl(pcie_index_hi_offset); 1072 } 1073 r = readl(pcie_data_offset); 1074 /* read high 32 bits */ 1075 writel(reg_addr + 4, pcie_index_offset); 1076 readl(pcie_index_offset); 1077 if (pcie_index_hi != 0) { 1078 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1079 readl(pcie_index_hi_offset); 1080 } 1081 r |= ((u64)readl(pcie_data_offset) << 32); 1082 1083 /* clear the high bits */ 1084 if (pcie_index_hi != 0) { 1085 writel(0, pcie_index_hi_offset); 1086 readl(pcie_index_hi_offset); 1087 } 1088 1089 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1090 1091 return r; 1092 } 1093 1094 /** 1095 * amdgpu_device_indirect_wreg - write an indirect register address 1096 * 1097 * @adev: amdgpu_device pointer 1098 * @reg_addr: indirect register offset 1099 * @reg_data: indirect register data 1100 * 1101 */ 1102 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 1103 u32 reg_addr, u32 reg_data) 1104 { 1105 unsigned long flags, pcie_index, pcie_data; 1106 void __iomem *pcie_index_offset; 1107 void __iomem *pcie_data_offset; 1108 1109 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1110 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1111 1112 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1113 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1114 pcie_data_offset = (void 
__iomem *)adev->rmmio + pcie_data * 4; 1115 1116 writel(reg_addr, pcie_index_offset); 1117 readl(pcie_index_offset); 1118 writel(reg_data, pcie_data_offset); 1119 readl(pcie_data_offset); 1120 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1121 } 1122 1123 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev, 1124 u64 reg_addr, u32 reg_data) 1125 { 1126 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 1127 void __iomem *pcie_index_offset; 1128 void __iomem *pcie_index_hi_offset; 1129 void __iomem *pcie_data_offset; 1130 1131 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1132 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1133 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1134 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1135 else 1136 pcie_index_hi = 0; 1137 1138 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1139 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1140 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1141 if (pcie_index_hi != 0) 1142 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1143 pcie_index_hi * 4; 1144 1145 writel(reg_addr, pcie_index_offset); 1146 readl(pcie_index_offset); 1147 if (pcie_index_hi != 0) { 1148 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1149 readl(pcie_index_hi_offset); 1150 } 1151 writel(reg_data, pcie_data_offset); 1152 readl(pcie_data_offset); 1153 1154 /* clear the high bits */ 1155 if (pcie_index_hi != 0) { 1156 writel(0, pcie_index_hi_offset); 1157 readl(pcie_index_hi_offset); 1158 } 1159 1160 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1161 } 1162 1163 /** 1164 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 1165 * 1166 * @adev: amdgpu_device pointer 1167 * @reg_addr: indirect register offset 1168 * @reg_data: indirect register data 1169 * 1170 */ 1171 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 1172 u32 reg_addr, u64 reg_data) 1173 { 1174 unsigned long flags, pcie_index, pcie_data; 1175 void __iomem *pcie_index_offset; 1176 void __iomem *pcie_data_offset; 1177 1178 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1179 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1180 1181 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1182 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1183 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1184 1185 /* write low 32 bits */ 1186 writel(reg_addr, pcie_index_offset); 1187 readl(pcie_index_offset); 1188 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1189 readl(pcie_data_offset); 1190 /* write high 32 bits */ 1191 writel(reg_addr + 4, pcie_index_offset); 1192 readl(pcie_index_offset); 1193 writel((u32)(reg_data >> 32), pcie_data_offset); 1194 readl(pcie_data_offset); 1195 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1196 } 1197 1198 void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev, 1199 u64 reg_addr, u64 reg_data) 1200 { 1201 unsigned long flags, pcie_index, pcie_data; 1202 unsigned long pcie_index_hi = 0; 1203 void __iomem *pcie_index_offset; 1204 void __iomem *pcie_index_hi_offset; 1205 void __iomem *pcie_data_offset; 1206 1207 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1208 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1209 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1210 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1211 1212 
spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1213 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1214 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1215 if (pcie_index_hi != 0) 1216 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1217 pcie_index_hi * 4; 1218 1219 /* write low 32 bits */ 1220 writel(reg_addr, pcie_index_offset); 1221 readl(pcie_index_offset); 1222 if (pcie_index_hi != 0) { 1223 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1224 readl(pcie_index_hi_offset); 1225 } 1226 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1227 readl(pcie_data_offset); 1228 /* write high 32 bits */ 1229 writel(reg_addr + 4, pcie_index_offset); 1230 readl(pcie_index_offset); 1231 if (pcie_index_hi != 0) { 1232 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1233 readl(pcie_index_hi_offset); 1234 } 1235 writel((u32)(reg_data >> 32), pcie_data_offset); 1236 readl(pcie_data_offset); 1237 1238 /* clear the high bits */ 1239 if (pcie_index_hi != 0) { 1240 writel(0, pcie_index_hi_offset); 1241 readl(pcie_index_hi_offset); 1242 } 1243 1244 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1245 } 1246 1247 /** 1248 * amdgpu_device_get_rev_id - query device rev_id 1249 * 1250 * @adev: amdgpu_device pointer 1251 * 1252 * Return device rev_id 1253 */ 1254 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev) 1255 { 1256 return adev->nbio.funcs->get_rev_id(adev); 1257 } 1258 1259 /** 1260 * amdgpu_invalid_rreg - dummy reg read function 1261 * 1262 * @adev: amdgpu_device pointer 1263 * @reg: offset of register 1264 * 1265 * Dummy register read function. Used for register blocks 1266 * that certain asics don't have (all asics). 1267 * Returns the value in the register. 1268 */ 1269 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 1270 { 1271 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 1272 BUG(); 1273 return 0; 1274 } 1275 1276 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg) 1277 { 1278 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 1279 BUG(); 1280 return 0; 1281 } 1282 1283 /** 1284 * amdgpu_invalid_wreg - dummy reg write function 1285 * 1286 * @adev: amdgpu_device pointer 1287 * @reg: offset of register 1288 * @v: value to write to the register 1289 * 1290 * Dummy register read function. Used for register blocks 1291 * that certain asics don't have (all asics). 1292 */ 1293 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 1294 { 1295 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 1296 reg, v); 1297 BUG(); 1298 } 1299 1300 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v) 1301 { 1302 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n", 1303 reg, v); 1304 BUG(); 1305 } 1306 1307 /** 1308 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 1309 * 1310 * @adev: amdgpu_device pointer 1311 * @reg: offset of register 1312 * 1313 * Dummy register read function. Used for register blocks 1314 * that certain asics don't have (all asics). 1315 * Returns the value in the register. 
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

static uint32_t amdgpu_device_get_vbios_flags(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
		return AMDGPU_VBIOS_SKIP;

	if (hweight32(adev->aid_mask) && amdgpu_passthrough(adev))
		return AMDGPU_VBIOS_OPTIONAL;

	return 0;
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
1412 */ 1413 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 1414 { 1415 uint32_t flags; 1416 bool optional; 1417 int ret; 1418 1419 amdgpu_asic_pre_asic_init(adev); 1420 flags = amdgpu_device_get_vbios_flags(adev); 1421 optional = !!(flags & (AMDGPU_VBIOS_OPTIONAL | AMDGPU_VBIOS_SKIP)); 1422 1423 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 1424 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 1425 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) || 1426 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) { 1427 amdgpu_psp_wait_for_bootloader(adev); 1428 if (optional && !adev->bios) 1429 return 0; 1430 1431 ret = amdgpu_atomfirmware_asic_init(adev, true); 1432 return ret; 1433 } else { 1434 if (optional && !adev->bios) 1435 return 0; 1436 1437 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 1438 } 1439 1440 return 0; 1441 } 1442 1443 /** 1444 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page 1445 * 1446 * @adev: amdgpu_device pointer 1447 * 1448 * Allocates a scratch page of VRAM for use by various things in the 1449 * driver. 1450 */ 1451 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev) 1452 { 1453 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE, 1454 AMDGPU_GEM_DOMAIN_VRAM | 1455 AMDGPU_GEM_DOMAIN_GTT, 1456 &adev->mem_scratch.robj, 1457 &adev->mem_scratch.gpu_addr, 1458 (void **)&adev->mem_scratch.ptr); 1459 } 1460 1461 /** 1462 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page 1463 * 1464 * @adev: amdgpu_device pointer 1465 * 1466 * Frees the VRAM scratch page. 1467 */ 1468 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev) 1469 { 1470 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL); 1471 } 1472 1473 /** 1474 * amdgpu_device_program_register_sequence - program an array of registers. 1475 * 1476 * @adev: amdgpu_device pointer 1477 * @registers: pointer to the register array 1478 * @array_size: size of the register array 1479 * 1480 * Programs an array or registers with and or masks. 1481 * This is a helper for setting golden registers. 1482 */ 1483 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 1484 const u32 *registers, 1485 const u32 array_size) 1486 { 1487 u32 tmp, reg, and_mask, or_mask; 1488 int i; 1489 1490 if (array_size % 3) 1491 return; 1492 1493 for (i = 0; i < array_size; i += 3) { 1494 reg = registers[i + 0]; 1495 and_mask = registers[i + 1]; 1496 or_mask = registers[i + 2]; 1497 1498 if (and_mask == 0xffffffff) { 1499 tmp = or_mask; 1500 } else { 1501 tmp = RREG32(reg); 1502 tmp &= ~and_mask; 1503 if (adev->family >= AMDGPU_FAMILY_AI) 1504 tmp |= (or_mask & and_mask); 1505 else 1506 tmp |= or_mask; 1507 } 1508 WREG32(reg, tmp); 1509 } 1510 } 1511 1512 /** 1513 * amdgpu_device_pci_config_reset - reset the GPU 1514 * 1515 * @adev: amdgpu_device pointer 1516 * 1517 * Resets the GPU using the pci config reset sequence. 1518 * Only applicable to asics prior to vega10. 1519 */ 1520 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 1521 { 1522 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 1523 } 1524 1525 /** 1526 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 1527 * 1528 * @adev: amdgpu_device pointer 1529 * 1530 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 
1531 */ 1532 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 1533 { 1534 return pci_reset_function(adev->pdev); 1535 } 1536 1537 /* 1538 * amdgpu_device_wb_*() 1539 * Writeback is the method by which the GPU updates special pages in memory 1540 * with the status of certain GPU events (fences, ring pointers,etc.). 1541 */ 1542 1543 /** 1544 * amdgpu_device_wb_fini - Disable Writeback and free memory 1545 * 1546 * @adev: amdgpu_device pointer 1547 * 1548 * Disables Writeback and frees the Writeback memory (all asics). 1549 * Used at driver shutdown. 1550 */ 1551 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1552 { 1553 if (adev->wb.wb_obj) { 1554 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1555 &adev->wb.gpu_addr, 1556 (void **)&adev->wb.wb); 1557 adev->wb.wb_obj = NULL; 1558 } 1559 } 1560 1561 /** 1562 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory 1563 * 1564 * @adev: amdgpu_device pointer 1565 * 1566 * Initializes writeback and allocates writeback memory (all asics). 1567 * Used at driver startup. 1568 * Returns 0 on success or an -error on failure. 1569 */ 1570 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1571 { 1572 int r; 1573 1574 if (adev->wb.wb_obj == NULL) { 1575 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1576 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1577 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1578 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1579 (void **)&adev->wb.wb); 1580 if (r) { 1581 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1582 return r; 1583 } 1584 1585 adev->wb.num_wb = AMDGPU_MAX_WB; 1586 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1587 1588 /* clear wb memory */ 1589 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1590 } 1591 1592 return 0; 1593 } 1594 1595 /** 1596 * amdgpu_device_wb_get - Allocate a wb entry 1597 * 1598 * @adev: amdgpu_device pointer 1599 * @wb: wb index 1600 * 1601 * Allocate a wb slot for use by the driver (all asics). 1602 * Returns 0 on success or -EINVAL on failure. 1603 */ 1604 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1605 { 1606 unsigned long flags, offset; 1607 1608 spin_lock_irqsave(&adev->wb.lock, flags); 1609 offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1610 if (offset < adev->wb.num_wb) { 1611 __set_bit(offset, adev->wb.used); 1612 spin_unlock_irqrestore(&adev->wb.lock, flags); 1613 *wb = offset << 3; /* convert to dw offset */ 1614 return 0; 1615 } else { 1616 spin_unlock_irqrestore(&adev->wb.lock, flags); 1617 return -EINVAL; 1618 } 1619 } 1620 1621 /** 1622 * amdgpu_device_wb_free - Free a wb entry 1623 * 1624 * @adev: amdgpu_device pointer 1625 * @wb: wb index 1626 * 1627 * Free a wb slot allocated for use by the driver (all asics) 1628 */ 1629 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1630 { 1631 unsigned long flags; 1632 1633 wb >>= 3; 1634 spin_lock_irqsave(&adev->wb.lock, flags); 1635 if (wb < adev->wb.num_wb) 1636 __clear_bit(wb, adev->wb.used); 1637 spin_unlock_irqrestore(&adev->wb.lock, flags); 1638 } 1639 1640 /** 1641 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1642 * 1643 * @adev: amdgpu_device pointer 1644 * 1645 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1646 * to fail, but if any of the BARs is not accessible after the size we abort 1647 * driver loading by returning -ENODEV. 
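 *
 * Resizing is skipped entirely for SR-IOV VFs, when the BIOS has already
 * exposed a BAR covering all of VRAM, or when the root bus offers no 64-bit
 * window above 4GB in which to place the enlarged BAR.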
1648 */ 1649 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1650 { 1651 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1652 struct pci_bus *root; 1653 struct resource *res; 1654 unsigned int i; 1655 u16 cmd; 1656 int r; 1657 1658 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) 1659 return 0; 1660 1661 /* Bypass for VF */ 1662 if (amdgpu_sriov_vf(adev)) 1663 return 0; 1664 1665 /* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */ 1666 if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR)) 1667 DRM_WARN("System can't access extended configuration space, please check!!\n"); 1668 1669 /* skip if the bios has already enabled large BAR */ 1670 if (adev->gmc.real_vram_size && 1671 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1672 return 0; 1673 1674 /* Check if the root BUS has 64bit memory resources */ 1675 root = adev->pdev->bus; 1676 while (root->parent) 1677 root = root->parent; 1678 1679 pci_bus_for_each_resource(root, res, i) { 1680 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1681 res->start > 0x100000000ull) 1682 break; 1683 } 1684 1685 /* Trying to resize is pointless without a root hub window above 4GB */ 1686 if (!res) 1687 return 0; 1688 1689 /* Limit the BAR size to what is available */ 1690 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1691 rbar_size); 1692 1693 /* Disable memory decoding while we change the BAR addresses and size */ 1694 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1695 pci_write_config_word(adev->pdev, PCI_COMMAND, 1696 cmd & ~PCI_COMMAND_MEMORY); 1697 1698 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1699 amdgpu_doorbell_fini(adev); 1700 if (adev->asic_type >= CHIP_BONAIRE) 1701 pci_release_resource(adev->pdev, 2); 1702 1703 pci_release_resource(adev->pdev, 0); 1704 1705 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1706 if (r == -ENOSPC) 1707 DRM_INFO("Not enough PCI address space for a large BAR."); 1708 else if (r && r != -ENOTSUPP) 1709 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1710 1711 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1712 1713 /* When the doorbell or fb BAR isn't available we have no chance of 1714 * using the device. 1715 */ 1716 r = amdgpu_doorbell_init(adev); 1717 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1718 return -ENODEV; 1719 1720 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1721 1722 return 0; 1723 } 1724 1725 /* 1726 * GPU helpers function. 1727 */ 1728 /** 1729 * amdgpu_device_need_post - check if the hw need post or not 1730 * 1731 * @adev: amdgpu_device pointer 1732 * 1733 * Check if the asic has been initialized (all asics) at driver startup 1734 * or post is needed if hw reset is performed. 1735 * Returns true if need or false if not. 
1736 */ 1737 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1738 { 1739 uint32_t reg, flags; 1740 1741 if (amdgpu_sriov_vf(adev)) 1742 return false; 1743 1744 flags = amdgpu_device_get_vbios_flags(adev); 1745 if (flags & AMDGPU_VBIOS_SKIP) 1746 return false; 1747 if ((flags & AMDGPU_VBIOS_OPTIONAL) && !adev->bios) 1748 return false; 1749 1750 if (amdgpu_passthrough(adev)) { 1751 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1752 * some old smc fw still need driver do vPost otherwise gpu hang, while 1753 * those smc fw version above 22.15 doesn't have this flaw, so we force 1754 * vpost executed for smc version below 22.15 1755 */ 1756 if (adev->asic_type == CHIP_FIJI) { 1757 int err; 1758 uint32_t fw_ver; 1759 1760 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1761 /* force vPost if error occurred */ 1762 if (err) 1763 return true; 1764 1765 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1766 release_firmware(adev->pm.fw); 1767 if (fw_ver < 0x00160e00) 1768 return true; 1769 } 1770 } 1771 1772 /* Don't post if we need to reset whole hive on init */ 1773 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 1774 return false; 1775 1776 if (adev->has_hw_reset) { 1777 adev->has_hw_reset = false; 1778 return true; 1779 } 1780 1781 /* bios scratch used on CIK+ */ 1782 if (adev->asic_type >= CHIP_BONAIRE) 1783 return amdgpu_atombios_scratch_need_asic_init(adev); 1784 1785 /* check MEM_SIZE for older asics */ 1786 reg = amdgpu_asic_get_config_memsize(adev); 1787 1788 if ((reg != 0) && (reg != 0xffffffff)) 1789 return false; 1790 1791 return true; 1792 } 1793 1794 /* 1795 * Check whether seamless boot is supported. 1796 * 1797 * So far we only support seamless boot on DCE 3.0 or later. 1798 * If users report that it works on older ASICS as well, we may 1799 * loosen this. 1800 */ 1801 bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev) 1802 { 1803 switch (amdgpu_seamless) { 1804 case -1: 1805 break; 1806 case 1: 1807 return true; 1808 case 0: 1809 return false; 1810 default: 1811 DRM_ERROR("Invalid value for amdgpu.seamless: %d\n", 1812 amdgpu_seamless); 1813 return false; 1814 } 1815 1816 if (!(adev->flags & AMD_IS_APU)) 1817 return false; 1818 1819 if (adev->mman.keep_stolen_vga_memory) 1820 return false; 1821 1822 return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0); 1823 } 1824 1825 /* 1826 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids 1827 * don't support dynamic speed switching. Until we have confirmation from Intel 1828 * that a specific host supports it, it's safer that we keep it disabled for all. 
1829 * 1830 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/ 1831 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663 1832 */ 1833 static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev) 1834 { 1835 #if IS_ENABLED(CONFIG_X86) 1836 struct cpuinfo_x86 *c = &cpu_data(0); 1837 1838 /* eGPU change speeds based on USB4 fabric conditions */ 1839 if (dev_is_removable(adev->dev)) 1840 return true; 1841 1842 if (c->x86_vendor == X86_VENDOR_INTEL) 1843 return false; 1844 #endif 1845 return true; 1846 } 1847 1848 /** 1849 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1850 * 1851 * @adev: amdgpu_device pointer 1852 * 1853 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1854 * be set for this device. 1855 * 1856 * Returns true if it should be used or false if not. 1857 */ 1858 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1859 { 1860 switch (amdgpu_aspm) { 1861 case -1: 1862 break; 1863 case 0: 1864 return false; 1865 case 1: 1866 return true; 1867 default: 1868 return false; 1869 } 1870 if (adev->flags & AMD_IS_APU) 1871 return false; 1872 if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK)) 1873 return false; 1874 return pcie_aspm_enabled(adev->pdev); 1875 } 1876 1877 /* if we get transitioned to only one device, take VGA back */ 1878 /** 1879 * amdgpu_device_vga_set_decode - enable/disable vga decode 1880 * 1881 * @pdev: PCI device pointer 1882 * @state: enable/disable vga decode 1883 * 1884 * Enable/disable vga decode (all asics). 1885 * Returns VGA resource flags. 1886 */ 1887 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1888 bool state) 1889 { 1890 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1891 1892 amdgpu_asic_set_vga_state(adev, state); 1893 if (state) 1894 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1895 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1896 else 1897 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1898 } 1899 1900 /** 1901 * amdgpu_device_check_block_size - validate the vm block size 1902 * 1903 * @adev: amdgpu_device pointer 1904 * 1905 * Validates the vm block size specified via module parameter. 1906 * The vm block size defines number of bits in page table versus page directory, 1907 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1908 * page table and the remaining bits are in the page directory. 1909 */ 1910 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1911 { 1912 /* defines number of bits in page table versus page directory, 1913 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1914 * page table and the remaining bits are in the page directory 1915 */ 1916 if (amdgpu_vm_block_size == -1) 1917 return; 1918 1919 if (amdgpu_vm_block_size < 9) { 1920 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1921 amdgpu_vm_block_size); 1922 amdgpu_vm_block_size = -1; 1923 } 1924 } 1925 1926 /** 1927 * amdgpu_device_check_vm_size - validate the vm size 1928 * 1929 * @adev: amdgpu_device pointer 1930 * 1931 * Validates the vm size in GB specified via module parameter. 1932 * The VM size is the size of the GPU virtual memory space in GB. 
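 *
 * For example, booting with amdgpu.vm_size=256 requests a 256GB GPU VM
 * address space; values below the 1GB minimum are rejected below and the
 * driver falls back to the default.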
1933 */ 1934 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1935 { 1936 /* no need to check the default value */ 1937 if (amdgpu_vm_size == -1) 1938 return; 1939 1940 if (amdgpu_vm_size < 1) { 1941 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1942 amdgpu_vm_size); 1943 amdgpu_vm_size = -1; 1944 } 1945 } 1946 1947 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1948 { 1949 struct sysinfo si; 1950 bool is_os_64 = (sizeof(void *) == 8); 1951 uint64_t total_memory; 1952 uint64_t dram_size_seven_GB = 0x1B8000000; 1953 uint64_t dram_size_three_GB = 0xB8000000; 1954 1955 if (amdgpu_smu_memory_pool_size == 0) 1956 return; 1957 1958 if (!is_os_64) { 1959 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1960 goto def_value; 1961 } 1962 si_meminfo(&si); 1963 total_memory = (uint64_t)si.totalram * si.mem_unit; 1964 1965 if ((amdgpu_smu_memory_pool_size == 1) || 1966 (amdgpu_smu_memory_pool_size == 2)) { 1967 if (total_memory < dram_size_three_GB) 1968 goto def_value1; 1969 } else if ((amdgpu_smu_memory_pool_size == 4) || 1970 (amdgpu_smu_memory_pool_size == 8)) { 1971 if (total_memory < dram_size_seven_GB) 1972 goto def_value1; 1973 } else { 1974 DRM_WARN("Smu memory pool size not supported\n"); 1975 goto def_value; 1976 } 1977 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1978 1979 return; 1980 1981 def_value1: 1982 DRM_WARN("No enough system memory\n"); 1983 def_value: 1984 adev->pm.smu_prv_buffer_size = 0; 1985 } 1986 1987 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 1988 { 1989 if (!(adev->flags & AMD_IS_APU) || 1990 adev->asic_type < CHIP_RAVEN) 1991 return 0; 1992 1993 switch (adev->asic_type) { 1994 case CHIP_RAVEN: 1995 if (adev->pdev->device == 0x15dd) 1996 adev->apu_flags |= AMD_APU_IS_RAVEN; 1997 if (adev->pdev->device == 0x15d8) 1998 adev->apu_flags |= AMD_APU_IS_PICASSO; 1999 break; 2000 case CHIP_RENOIR: 2001 if ((adev->pdev->device == 0x1636) || 2002 (adev->pdev->device == 0x164c)) 2003 adev->apu_flags |= AMD_APU_IS_RENOIR; 2004 else 2005 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 2006 break; 2007 case CHIP_VANGOGH: 2008 adev->apu_flags |= AMD_APU_IS_VANGOGH; 2009 break; 2010 case CHIP_YELLOW_CARP: 2011 break; 2012 case CHIP_CYAN_SKILLFISH: 2013 if ((adev->pdev->device == 0x13FE) || 2014 (adev->pdev->device == 0x143F)) 2015 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 2016 break; 2017 default: 2018 break; 2019 } 2020 2021 return 0; 2022 } 2023 2024 /** 2025 * amdgpu_device_check_arguments - validate module params 2026 * 2027 * @adev: amdgpu_device pointer 2028 * 2029 * Validates certain module parameters and updates 2030 * the associated values used by the driver (all asics). 
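 * Out-of-range values are clamped or reset to their defaults with a warning rather than failing device init.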
2031 */ 2032 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 2033 { 2034 int i; 2035 2036 if (amdgpu_sched_jobs < 4) { 2037 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 2038 amdgpu_sched_jobs); 2039 amdgpu_sched_jobs = 4; 2040 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 2041 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 2042 amdgpu_sched_jobs); 2043 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 2044 } 2045 2046 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 2047 /* gart size must be greater or equal to 32M */ 2048 dev_warn(adev->dev, "gart size (%d) too small\n", 2049 amdgpu_gart_size); 2050 amdgpu_gart_size = -1; 2051 } 2052 2053 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 2054 /* gtt size must be greater or equal to 32M */ 2055 dev_warn(adev->dev, "gtt size (%d) too small\n", 2056 amdgpu_gtt_size); 2057 amdgpu_gtt_size = -1; 2058 } 2059 2060 /* valid range is between 4 and 9 inclusive */ 2061 if (amdgpu_vm_fragment_size != -1 && 2062 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 2063 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 2064 amdgpu_vm_fragment_size = -1; 2065 } 2066 2067 if (amdgpu_sched_hw_submission < 2) { 2068 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 2069 amdgpu_sched_hw_submission); 2070 amdgpu_sched_hw_submission = 2; 2071 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 2072 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 2073 amdgpu_sched_hw_submission); 2074 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 2075 } 2076 2077 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 2078 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 2079 amdgpu_reset_method = -1; 2080 } 2081 2082 amdgpu_device_check_smu_prv_buffer_size(adev); 2083 2084 amdgpu_device_check_vm_size(adev); 2085 2086 amdgpu_device_check_block_size(adev); 2087 2088 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 2089 2090 for (i = 0; i < MAX_XCP; i++) 2091 adev->enforce_isolation[i] = !!enforce_isolation; 2092 2093 return 0; 2094 } 2095 2096 /** 2097 * amdgpu_switcheroo_set_state - set switcheroo state 2098 * 2099 * @pdev: pci dev pointer 2100 * @state: vga_switcheroo state 2101 * 2102 * Callback for the switcheroo driver. Suspends or resumes 2103 * the asics before or after it is powered up using ACPI methods. 
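 * Switching on restores the cached PCI state, re-enables the PCI device and resumes the asic; switching off suspends the asic, caches the PCI state, disables the PCI device and puts it into D3cold.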
2104 */ 2105 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 2106 enum vga_switcheroo_state state) 2107 { 2108 struct drm_device *dev = pci_get_drvdata(pdev); 2109 int r; 2110 2111 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 2112 return; 2113 2114 if (state == VGA_SWITCHEROO_ON) { 2115 pr_info("switched on\n"); 2116 /* don't suspend or resume card normally */ 2117 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2118 2119 pci_set_power_state(pdev, PCI_D0); 2120 amdgpu_device_load_pci_state(pdev); 2121 r = pci_enable_device(pdev); 2122 if (r) 2123 DRM_WARN("pci_enable_device failed (%d)\n", r); 2124 amdgpu_device_resume(dev, true); 2125 2126 dev->switch_power_state = DRM_SWITCH_POWER_ON; 2127 } else { 2128 pr_info("switched off\n"); 2129 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2130 amdgpu_device_prepare(dev); 2131 amdgpu_device_suspend(dev, true); 2132 amdgpu_device_cache_pci_state(pdev); 2133 /* Shut down the device */ 2134 pci_disable_device(pdev); 2135 pci_set_power_state(pdev, PCI_D3cold); 2136 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 2137 } 2138 } 2139 2140 /** 2141 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 2142 * 2143 * @pdev: pci dev pointer 2144 * 2145 * Callback for the switcheroo driver. Check of the switcheroo 2146 * state can be changed. 2147 * Returns true if the state can be changed, false if not. 2148 */ 2149 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 2150 { 2151 struct drm_device *dev = pci_get_drvdata(pdev); 2152 2153 /* 2154 * FIXME: open_count is protected by drm_global_mutex but that would lead to 2155 * locking inversion with the driver load path. And the access here is 2156 * completely racy anyway. So don't bother with locking for now. 2157 */ 2158 return atomic_read(&dev->open_count) == 0; 2159 } 2160 2161 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 2162 .set_gpu_state = amdgpu_switcheroo_set_state, 2163 .reprobe = NULL, 2164 .can_switch = amdgpu_switcheroo_can_switch, 2165 }; 2166 2167 /** 2168 * amdgpu_device_ip_set_clockgating_state - set the CG state 2169 * 2170 * @dev: amdgpu_device pointer 2171 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2172 * @state: clockgating state (gate or ungate) 2173 * 2174 * Sets the requested clockgating state for all instances of 2175 * the hardware IP specified. 2176 * Returns the error code from the last instance. 2177 */ 2178 int amdgpu_device_ip_set_clockgating_state(void *dev, 2179 enum amd_ip_block_type block_type, 2180 enum amd_clockgating_state state) 2181 { 2182 struct amdgpu_device *adev = dev; 2183 int i, r = 0; 2184 2185 for (i = 0; i < adev->num_ip_blocks; i++) { 2186 if (!adev->ip_blocks[i].status.valid) 2187 continue; 2188 if (adev->ip_blocks[i].version->type != block_type) 2189 continue; 2190 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 2191 continue; 2192 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 2193 &adev->ip_blocks[i], state); 2194 if (r) 2195 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 2196 adev->ip_blocks[i].version->funcs->name, r); 2197 } 2198 return r; 2199 } 2200 2201 /** 2202 * amdgpu_device_ip_set_powergating_state - set the PG state 2203 * 2204 * @dev: amdgpu_device pointer 2205 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2206 * @state: powergating state (gate or ungate) 2207 * 2208 * Sets the requested powergating state for all instances of 2209 * the hardware IP specified. 
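 * IP blocks that are not valid or do not implement set_powergating_state are skipped.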
2210 * Returns the error code from the last instance. 2211 */ 2212 int amdgpu_device_ip_set_powergating_state(void *dev, 2213 enum amd_ip_block_type block_type, 2214 enum amd_powergating_state state) 2215 { 2216 struct amdgpu_device *adev = dev; 2217 int i, r = 0; 2218 2219 for (i = 0; i < adev->num_ip_blocks; i++) { 2220 if (!adev->ip_blocks[i].status.valid) 2221 continue; 2222 if (adev->ip_blocks[i].version->type != block_type) 2223 continue; 2224 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 2225 continue; 2226 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 2227 &adev->ip_blocks[i], state); 2228 if (r) 2229 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 2230 adev->ip_blocks[i].version->funcs->name, r); 2231 } 2232 return r; 2233 } 2234 2235 /** 2236 * amdgpu_device_ip_get_clockgating_state - get the CG state 2237 * 2238 * @adev: amdgpu_device pointer 2239 * @flags: clockgating feature flags 2240 * 2241 * Walks the list of IPs on the device and updates the clockgating 2242 * flags for each IP. 2243 * Updates @flags with the feature flags for each hardware IP where 2244 * clockgating is enabled. 2245 */ 2246 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 2247 u64 *flags) 2248 { 2249 int i; 2250 2251 for (i = 0; i < adev->num_ip_blocks; i++) { 2252 if (!adev->ip_blocks[i].status.valid) 2253 continue; 2254 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 2255 adev->ip_blocks[i].version->funcs->get_clockgating_state( 2256 &adev->ip_blocks[i], flags); 2257 } 2258 } 2259 2260 /** 2261 * amdgpu_device_ip_wait_for_idle - wait for idle 2262 * 2263 * @adev: amdgpu_device pointer 2264 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2265 * 2266 * Waits for the requested hardware IP to be idle. 2267 * Returns 0 for success or a negative error code on failure. 2268 */ 2269 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 2270 enum amd_ip_block_type block_type) 2271 { 2272 int i, r; 2273 2274 for (i = 0; i < adev->num_ip_blocks; i++) { 2275 if (!adev->ip_blocks[i].status.valid) 2276 continue; 2277 if (adev->ip_blocks[i].version->type == block_type) { 2278 if (adev->ip_blocks[i].version->funcs->wait_for_idle) { 2279 r = adev->ip_blocks[i].version->funcs->wait_for_idle( 2280 &adev->ip_blocks[i]); 2281 if (r) 2282 return r; 2283 } 2284 break; 2285 } 2286 } 2287 return 0; 2288 2289 } 2290 2291 /** 2292 * amdgpu_device_ip_is_valid - is the hardware IP enabled 2293 * 2294 * @adev: amdgpu_device pointer 2295 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2296 * 2297 * Checks whether the hardware IP is enabled or not. 2298 * Returns true if the IP is enabled, false if not. 2299 */ 2300 bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev, 2301 enum amd_ip_block_type block_type) 2302 { 2303 int i; 2304 2305 for (i = 0; i < adev->num_ip_blocks; i++) { 2306 if (adev->ip_blocks[i].version->type == block_type) 2307 return adev->ip_blocks[i].status.valid; 2308 } 2309 return false; 2310 2311 } 2312 2313 /** 2314 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 2315 * 2316 * @adev: amdgpu_device pointer 2317 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 2318 * 2319 * Returns a pointer to the hardware IP block structure 2320 * if it exists for the asic, otherwise NULL.
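 * Callers should NULL-check the result before dereferencing it; for example (illustrative), a lookup such as amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX) returns NULL when the asic carries no GFX block.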
2321 */ 2322 struct amdgpu_ip_block * 2323 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 2324 enum amd_ip_block_type type) 2325 { 2326 int i; 2327 2328 for (i = 0; i < adev->num_ip_blocks; i++) 2329 if (adev->ip_blocks[i].version->type == type) 2330 return &adev->ip_blocks[i]; 2331 2332 return NULL; 2333 } 2334 2335 /** 2336 * amdgpu_device_ip_block_version_cmp 2337 * 2338 * @adev: amdgpu_device pointer 2339 * @type: enum amd_ip_block_type 2340 * @major: major version 2341 * @minor: minor version 2342 * 2343 * return 0 if equal or greater 2344 * return 1 if smaller or the ip_block doesn't exist 2345 */ 2346 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 2347 enum amd_ip_block_type type, 2348 u32 major, u32 minor) 2349 { 2350 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 2351 2352 if (ip_block && ((ip_block->version->major > major) || 2353 ((ip_block->version->major == major) && 2354 (ip_block->version->minor >= minor)))) 2355 return 0; 2356 2357 return 1; 2358 } 2359 2360 /** 2361 * amdgpu_device_ip_block_add 2362 * 2363 * @adev: amdgpu_device pointer 2364 * @ip_block_version: pointer to the IP to add 2365 * 2366 * Adds the IP block driver information to the collection of IPs 2367 * on the asic. 2368 */ 2369 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 2370 const struct amdgpu_ip_block_version *ip_block_version) 2371 { 2372 if (!ip_block_version) 2373 return -EINVAL; 2374 2375 switch (ip_block_version->type) { 2376 case AMD_IP_BLOCK_TYPE_VCN: 2377 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2378 return 0; 2379 break; 2380 case AMD_IP_BLOCK_TYPE_JPEG: 2381 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2382 return 0; 2383 break; 2384 default: 2385 break; 2386 } 2387 2388 dev_info(adev->dev, "detected ip block number %d <%s>\n", 2389 adev->num_ip_blocks, ip_block_version->funcs->name); 2390 2391 adev->ip_blocks[adev->num_ip_blocks].adev = adev; 2392 2393 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2394 2395 return 0; 2396 } 2397 2398 /** 2399 * amdgpu_device_enable_virtual_display - enable virtual display feature 2400 * 2401 * @adev: amdgpu_device pointer 2402 * 2403 * Enabled the virtual display feature if the user has enabled it via 2404 * the module parameter virtual_display. This feature provides a virtual 2405 * display hardware on headless boards or in virtualized environments. 2406 * This function parses and validates the configuration string specified by 2407 * the user and configures the virtual display configuration (number of 2408 * virtual connectors, crtcs, etc.) specified. 
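 * The string is a semicolon separated list of <pci address>,<number of crtcs> entries, or the keyword all, e.g. (illustrative) amdgpu.virtual_display=0000:03:00.0,2; the crtc count is clamped to the range 1-6 and defaults to 1 when omitted or invalid.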
2409 */ 2410 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2411 { 2412 adev->enable_virtual_display = false; 2413 2414 if (amdgpu_virtual_display) { 2415 const char *pci_address_name = pci_name(adev->pdev); 2416 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2417 2418 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2419 pciaddstr_tmp = pciaddstr; 2420 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2421 pciaddname = strsep(&pciaddname_tmp, ","); 2422 if (!strcmp("all", pciaddname) 2423 || !strcmp(pci_address_name, pciaddname)) { 2424 long num_crtc; 2425 int res = -1; 2426 2427 adev->enable_virtual_display = true; 2428 2429 if (pciaddname_tmp) 2430 res = kstrtol(pciaddname_tmp, 10, 2431 &num_crtc); 2432 2433 if (!res) { 2434 if (num_crtc < 1) 2435 num_crtc = 1; 2436 if (num_crtc > 6) 2437 num_crtc = 6; 2438 adev->mode_info.num_crtc = num_crtc; 2439 } else { 2440 adev->mode_info.num_crtc = 1; 2441 } 2442 break; 2443 } 2444 } 2445 2446 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2447 amdgpu_virtual_display, pci_address_name, 2448 adev->enable_virtual_display, adev->mode_info.num_crtc); 2449 2450 kfree(pciaddstr); 2451 } 2452 } 2453 2454 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2455 { 2456 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2457 adev->mode_info.num_crtc = 1; 2458 adev->enable_virtual_display = true; 2459 DRM_INFO("virtual_display:%d, num_crtc:%d\n", 2460 adev->enable_virtual_display, adev->mode_info.num_crtc); 2461 } 2462 } 2463 2464 /** 2465 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2466 * 2467 * @adev: amdgpu_device pointer 2468 * 2469 * Parses the asic configuration parameters specified in the gpu info 2470 * firmware and makes them available to the driver for use in configuring 2471 * the asic. 2472 * Returns 0 on success, -EINVAL on failure. 2473 */ 2474 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2475 { 2476 const char *chip_name; 2477 int err; 2478 const struct gpu_info_firmware_header_v1_0 *hdr; 2479 2480 adev->firmware.gpu_info_fw = NULL; 2481 2482 if (adev->mman.discovery_bin) 2483 return 0; 2484 2485 switch (adev->asic_type) { 2486 default: 2487 return 0; 2488 case CHIP_VEGA10: 2489 chip_name = "vega10"; 2490 break; 2491 case CHIP_VEGA12: 2492 chip_name = "vega12"; 2493 break; 2494 case CHIP_RAVEN: 2495 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2496 chip_name = "raven2"; 2497 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2498 chip_name = "picasso"; 2499 else 2500 chip_name = "raven"; 2501 break; 2502 case CHIP_ARCTURUS: 2503 chip_name = "arcturus"; 2504 break; 2505 case CHIP_NAVI12: 2506 chip_name = "navi12"; 2507 break; 2508 } 2509 2510 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, 2511 AMDGPU_UCODE_OPTIONAL, 2512 "amdgpu/%s_gpu_info.bin", chip_name); 2513 if (err) { 2514 dev_err(adev->dev, 2515 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n", 2516 chip_name); 2517 goto out; 2518 } 2519 2520 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2521 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2522 2523 switch (hdr->version_major) { 2524 case 1: 2525 { 2526 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2527 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2528 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2529 2530 /* 2531 * Should be dropped when DAL no longer needs it. 
2532 */ 2533 if (adev->asic_type == CHIP_NAVI12) 2534 goto parse_soc_bounding_box; 2535 2536 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2537 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2538 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2539 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2540 adev->gfx.config.max_texture_channel_caches = 2541 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2542 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2543 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2544 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2545 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2546 adev->gfx.config.double_offchip_lds_buf = 2547 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2548 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2549 adev->gfx.cu_info.max_waves_per_simd = 2550 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2551 adev->gfx.cu_info.max_scratch_slots_per_cu = 2552 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2553 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2554 if (hdr->version_minor >= 1) { 2555 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2556 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2557 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2558 adev->gfx.config.num_sc_per_sh = 2559 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2560 adev->gfx.config.num_packer_per_sc = 2561 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2562 } 2563 2564 parse_soc_bounding_box: 2565 /* 2566 * soc bounding box info is not integrated into the discovery table, 2567 * so we always need to parse it from the gpu info firmware when needed. 2568 */ 2569 if (hdr->version_minor == 2) { 2570 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2571 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2572 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2573 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2574 } 2575 break; 2576 } 2577 default: 2578 dev_err(adev->dev, 2579 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2580 err = -EINVAL; 2581 goto out; 2582 } 2583 out: 2584 return err; 2585 } 2586 2587 /** 2588 * amdgpu_device_ip_early_init - run early init for hardware IPs 2589 * 2590 * @adev: amdgpu_device pointer 2591 * 2592 * Early initialization pass for hardware IPs. The hardware IPs that make 2593 * up each asic are discovered and each IP's early_init callback is run. This 2594 * is the first stage in initializing the asic. 2595 * Returns 0 on success, negative error code on failure.
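 * If any IP block fails early_init (other than returning -ENOENT, which simply marks the block invalid), -ENODEV is returned.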
2596 */ 2597 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2598 { 2599 struct amdgpu_ip_block *ip_block; 2600 struct pci_dev *parent; 2601 bool total, skip_bios; 2602 uint32_t bios_flags; 2603 int i, r; 2604 2605 amdgpu_device_enable_virtual_display(adev); 2606 2607 if (amdgpu_sriov_vf(adev)) { 2608 r = amdgpu_virt_request_full_gpu(adev, true); 2609 if (r) 2610 return r; 2611 } 2612 2613 switch (adev->asic_type) { 2614 #ifdef CONFIG_DRM_AMDGPU_SI 2615 case CHIP_VERDE: 2616 case CHIP_TAHITI: 2617 case CHIP_PITCAIRN: 2618 case CHIP_OLAND: 2619 case CHIP_HAINAN: 2620 adev->family = AMDGPU_FAMILY_SI; 2621 r = si_set_ip_blocks(adev); 2622 if (r) 2623 return r; 2624 break; 2625 #endif 2626 #ifdef CONFIG_DRM_AMDGPU_CIK 2627 case CHIP_BONAIRE: 2628 case CHIP_HAWAII: 2629 case CHIP_KAVERI: 2630 case CHIP_KABINI: 2631 case CHIP_MULLINS: 2632 if (adev->flags & AMD_IS_APU) 2633 adev->family = AMDGPU_FAMILY_KV; 2634 else 2635 adev->family = AMDGPU_FAMILY_CI; 2636 2637 r = cik_set_ip_blocks(adev); 2638 if (r) 2639 return r; 2640 break; 2641 #endif 2642 case CHIP_TOPAZ: 2643 case CHIP_TONGA: 2644 case CHIP_FIJI: 2645 case CHIP_POLARIS10: 2646 case CHIP_POLARIS11: 2647 case CHIP_POLARIS12: 2648 case CHIP_VEGAM: 2649 case CHIP_CARRIZO: 2650 case CHIP_STONEY: 2651 if (adev->flags & AMD_IS_APU) 2652 adev->family = AMDGPU_FAMILY_CZ; 2653 else 2654 adev->family = AMDGPU_FAMILY_VI; 2655 2656 r = vi_set_ip_blocks(adev); 2657 if (r) 2658 return r; 2659 break; 2660 default: 2661 r = amdgpu_discovery_set_ip_blocks(adev); 2662 if (r) 2663 return r; 2664 break; 2665 } 2666 2667 if (amdgpu_has_atpx() && 2668 (amdgpu_is_atpx_hybrid() || 2669 amdgpu_has_atpx_dgpu_power_cntl()) && 2670 ((adev->flags & AMD_IS_APU) == 0) && 2671 !dev_is_removable(&adev->pdev->dev)) 2672 adev->flags |= AMD_IS_PX; 2673 2674 if (!(adev->flags & AMD_IS_APU)) { 2675 parent = pcie_find_root_port(adev->pdev); 2676 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2677 } 2678 2679 2680 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2681 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2682 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2683 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2684 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2685 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2686 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2687 2688 total = true; 2689 for (i = 0; i < adev->num_ip_blocks; i++) { 2690 ip_block = &adev->ip_blocks[i]; 2691 2692 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2693 DRM_WARN("disabled ip block: %d <%s>\n", 2694 i, adev->ip_blocks[i].version->funcs->name); 2695 adev->ip_blocks[i].status.valid = false; 2696 } else if (ip_block->version->funcs->early_init) { 2697 r = ip_block->version->funcs->early_init(ip_block); 2698 if (r == -ENOENT) { 2699 adev->ip_blocks[i].status.valid = false; 2700 } else if (r) { 2701 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2702 adev->ip_blocks[i].version->funcs->name, r); 2703 total = false; 2704 } else { 2705 adev->ip_blocks[i].status.valid = true; 2706 } 2707 } else { 2708 adev->ip_blocks[i].status.valid = true; 2709 } 2710 /* get the vbios after the asic_funcs are set up */ 2711 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2712 r = amdgpu_device_parse_gpu_info_fw(adev); 2713 if (r) 2714 return r; 2715 2716 bios_flags = amdgpu_device_get_vbios_flags(adev); 2717 skip_bios = !!(bios_flags & AMDGPU_VBIOS_SKIP); 2718 /* Read BIOS */ 2719 if (!skip_bios) { 2720 bool optional = 2721 !!(bios_flags & AMDGPU_VBIOS_OPTIONAL); 2722 if (!amdgpu_get_bios(adev) && !optional) 2723 return -EINVAL; 2724 2725 if (optional && !adev->bios) 2726 dev_info( 2727 adev->dev, 2728 "VBIOS image optional, proceeding without VBIOS image"); 2729 2730 if (adev->bios) { 2731 r = amdgpu_atombios_init(adev); 2732 if (r) { 2733 dev_err(adev->dev, 2734 "amdgpu_atombios_init failed\n"); 2735 amdgpu_vf_error_put( 2736 adev, 2737 AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 2738 0, 0); 2739 return r; 2740 } 2741 } 2742 } 2743 2744 /*get pf2vf msg info at it's earliest time*/ 2745 if (amdgpu_sriov_vf(adev)) 2746 amdgpu_virt_init_data_exchange(adev); 2747 2748 } 2749 } 2750 if (!total) 2751 return -ENODEV; 2752 2753 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 2754 if (ip_block->status.valid != false) 2755 amdgpu_amdkfd_device_probe(adev); 2756 2757 adev->cg_flags &= amdgpu_cg_mask; 2758 adev->pg_flags &= amdgpu_pg_mask; 2759 2760 return 0; 2761 } 2762 2763 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2764 { 2765 int i, r; 2766 2767 for (i = 0; i < adev->num_ip_blocks; i++) { 2768 if (!adev->ip_blocks[i].status.sw) 2769 continue; 2770 if (adev->ip_blocks[i].status.hw) 2771 continue; 2772 if (!amdgpu_ip_member_of_hwini( 2773 adev, adev->ip_blocks[i].version->type)) 2774 continue; 2775 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2776 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2777 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2778 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2779 if (r) { 2780 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2781 adev->ip_blocks[i].version->funcs->name, r); 2782 return r; 2783 } 2784 adev->ip_blocks[i].status.hw = true; 2785 } 2786 } 2787 2788 return 0; 2789 } 2790 2791 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device 
*adev) 2792 { 2793 int i, r; 2794 2795 for (i = 0; i < adev->num_ip_blocks; i++) { 2796 if (!adev->ip_blocks[i].status.sw) 2797 continue; 2798 if (adev->ip_blocks[i].status.hw) 2799 continue; 2800 if (!amdgpu_ip_member_of_hwini( 2801 adev, adev->ip_blocks[i].version->type)) 2802 continue; 2803 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2804 if (r) { 2805 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2806 adev->ip_blocks[i].version->funcs->name, r); 2807 return r; 2808 } 2809 adev->ip_blocks[i].status.hw = true; 2810 } 2811 2812 return 0; 2813 } 2814 2815 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2816 { 2817 int r = 0; 2818 int i; 2819 uint32_t smu_version; 2820 2821 if (adev->asic_type >= CHIP_VEGA10) { 2822 for (i = 0; i < adev->num_ip_blocks; i++) { 2823 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2824 continue; 2825 2826 if (!amdgpu_ip_member_of_hwini(adev, 2827 AMD_IP_BLOCK_TYPE_PSP)) 2828 break; 2829 2830 if (!adev->ip_blocks[i].status.sw) 2831 continue; 2832 2833 /* no need to do the fw loading again if already done*/ 2834 if (adev->ip_blocks[i].status.hw == true) 2835 break; 2836 2837 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2838 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 2839 if (r) 2840 return r; 2841 } else { 2842 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2843 if (r) { 2844 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2845 adev->ip_blocks[i].version->funcs->name, r); 2846 return r; 2847 } 2848 adev->ip_blocks[i].status.hw = true; 2849 } 2850 break; 2851 } 2852 } 2853 2854 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2855 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2856 2857 return r; 2858 } 2859 2860 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2861 { 2862 struct drm_sched_init_args args = { 2863 .ops = &amdgpu_sched_ops, 2864 .num_rqs = DRM_SCHED_PRIORITY_COUNT, 2865 .timeout_wq = adev->reset_domain->wq, 2866 .dev = adev->dev, 2867 }; 2868 long timeout; 2869 int r, i; 2870 2871 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2872 struct amdgpu_ring *ring = adev->rings[i]; 2873 2874 /* No need to setup the GPU scheduler for rings that don't need it */ 2875 if (!ring || ring->no_scheduler) 2876 continue; 2877 2878 switch (ring->funcs->type) { 2879 case AMDGPU_RING_TYPE_GFX: 2880 timeout = adev->gfx_timeout; 2881 break; 2882 case AMDGPU_RING_TYPE_COMPUTE: 2883 timeout = adev->compute_timeout; 2884 break; 2885 case AMDGPU_RING_TYPE_SDMA: 2886 timeout = adev->sdma_timeout; 2887 break; 2888 default: 2889 timeout = adev->video_timeout; 2890 break; 2891 } 2892 2893 args.timeout = timeout; 2894 args.credit_limit = ring->num_hw_submission; 2895 args.score = ring->sched_score; 2896 args.name = ring->name; 2897 2898 r = drm_sched_init(&ring->sched, &args); 2899 if (r) { 2900 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2901 ring->name); 2902 return r; 2903 } 2904 r = amdgpu_uvd_entity_init(adev, ring); 2905 if (r) { 2906 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n", 2907 ring->name); 2908 return r; 2909 } 2910 r = amdgpu_vce_entity_init(adev, ring); 2911 if (r) { 2912 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n", 2913 ring->name); 2914 return r; 2915 } 2916 } 2917 2918 amdgpu_xcp_update_partition_sched_list(adev); 2919 2920 return 0; 2921 } 2922 2923 2924 /** 2925 * amdgpu_device_ip_init - run init for hardware IPs 2926 * 2927 * @adev: amdgpu_device pointer 2928 * 2929 * Main 
initialization pass for hardware IPs. The list of all the hardware 2930 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2931 * are run. sw_init initializes the software state associated with each IP 2932 * and hw_init initializes the hardware associated with each IP. 2933 * Returns 0 on success, negative error code on failure. 2934 */ 2935 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2936 { 2937 bool init_badpage; 2938 int i, r; 2939 2940 r = amdgpu_ras_init(adev); 2941 if (r) 2942 return r; 2943 2944 for (i = 0; i < adev->num_ip_blocks; i++) { 2945 if (!adev->ip_blocks[i].status.valid) 2946 continue; 2947 if (adev->ip_blocks[i].version->funcs->sw_init) { 2948 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]); 2949 if (r) { 2950 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2951 adev->ip_blocks[i].version->funcs->name, r); 2952 goto init_failed; 2953 } 2954 } 2955 adev->ip_blocks[i].status.sw = true; 2956 2957 if (!amdgpu_ip_member_of_hwini( 2958 adev, adev->ip_blocks[i].version->type)) 2959 continue; 2960 2961 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2962 /* need to do common hw init early so everything is set up for gmc */ 2963 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2964 if (r) { 2965 DRM_ERROR("hw_init %d failed %d\n", i, r); 2966 goto init_failed; 2967 } 2968 adev->ip_blocks[i].status.hw = true; 2969 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2970 /* need to do gmc hw init early so we can allocate gpu mem */ 2971 /* Try to reserve bad pages early */ 2972 if (amdgpu_sriov_vf(adev)) 2973 amdgpu_virt_exchange_data(adev); 2974 2975 r = amdgpu_device_mem_scratch_init(adev); 2976 if (r) { 2977 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2978 goto init_failed; 2979 } 2980 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2981 if (r) { 2982 DRM_ERROR("hw_init %d failed %d\n", i, r); 2983 goto init_failed; 2984 } 2985 r = amdgpu_device_wb_init(adev); 2986 if (r) { 2987 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2988 goto init_failed; 2989 } 2990 adev->ip_blocks[i].status.hw = true; 2991 2992 /* right after GMC hw init, we create CSA */ 2993 if (adev->gfx.mcbp) { 2994 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2995 AMDGPU_GEM_DOMAIN_VRAM | 2996 AMDGPU_GEM_DOMAIN_GTT, 2997 AMDGPU_CSA_SIZE); 2998 if (r) { 2999 DRM_ERROR("allocate CSA failed %d\n", r); 3000 goto init_failed; 3001 } 3002 } 3003 3004 r = amdgpu_seq64_init(adev); 3005 if (r) { 3006 DRM_ERROR("allocate seq64 failed %d\n", r); 3007 goto init_failed; 3008 } 3009 } 3010 } 3011 3012 if (amdgpu_sriov_vf(adev)) 3013 amdgpu_virt_init_data_exchange(adev); 3014 3015 r = amdgpu_ib_pool_init(adev); 3016 if (r) { 3017 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 3018 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 3019 goto init_failed; 3020 } 3021 3022 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 3023 if (r) 3024 goto init_failed; 3025 3026 r = amdgpu_device_ip_hw_init_phase1(adev); 3027 if (r) 3028 goto init_failed; 3029 3030 r = amdgpu_device_fw_loading(adev); 3031 if (r) 3032 goto init_failed; 3033 3034 r = amdgpu_device_ip_hw_init_phase2(adev); 3035 if (r) 3036 goto init_failed; 3037 3038 /* 3039 * retired pages will be loaded from eeprom and reserved here, 3040 * it should be called after amdgpu_device_ip_hw_init_phase2 since 3041 * for some ASICs the RAS EEPROM code relies on SMU 
fully functioning 3042 * for I2C communication, which is only true at this point. 3043 * 3044 * amdgpu_ras_recovery_init may fail, but the upper layers only care about the 3045 * failure caused by a bad gpu situation and stop the amdgpu init process 3046 * accordingly. For other failure cases, it will still release all 3047 * the resources and print an error message, rather than returning a 3048 * negative value to the upper level. 3049 * 3050 * Note: theoretically, this should be called before all vram allocations 3051 * to protect retired pages from being abused. 3052 */ 3053 init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 3054 r = amdgpu_ras_recovery_init(adev, init_badpage); 3055 if (r) 3056 goto init_failed; 3057 3058 /* 3059 * In case of XGMI, grab an extra reference on the reset domain for this device 3060 */ 3061 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3062 if (amdgpu_xgmi_add_device(adev) == 0) { 3063 if (!amdgpu_sriov_vf(adev)) { 3064 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3065 3066 if (WARN_ON(!hive)) { 3067 r = -ENOENT; 3068 goto init_failed; 3069 } 3070 3071 if (!hive->reset_domain || 3072 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 3073 r = -ENOENT; 3074 amdgpu_put_xgmi_hive(hive); 3075 goto init_failed; 3076 } 3077 3078 /* Drop the early temporary reset domain we created for device */ 3079 amdgpu_reset_put_reset_domain(adev->reset_domain); 3080 adev->reset_domain = hive->reset_domain; 3081 amdgpu_put_xgmi_hive(hive); 3082 } 3083 } 3084 } 3085 3086 r = amdgpu_device_init_schedulers(adev); 3087 if (r) 3088 goto init_failed; 3089 3090 if (adev->mman.buffer_funcs_ring->sched.ready) 3091 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3092 3093 /* Don't init kfd if the whole hive needs to be reset during init */ 3094 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 3095 kgd2kfd_init_zone_device(adev); 3096 amdgpu_amdkfd_device_init(adev); 3097 } 3098 3099 amdgpu_fru_get_product_info(adev); 3100 3101 r = amdgpu_cper_init(adev); 3102 3103 init_failed: 3104 3105 return r; 3106 } 3107 3108 /** 3109 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 3110 * 3111 * @adev: amdgpu_device pointer 3112 * 3113 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 3114 * this function before a GPU reset. If the value is retained after a 3115 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 3116 */ 3117 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 3118 { 3119 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 3120 } 3121 3122 /** 3123 * amdgpu_device_check_vram_lost - check if vram is valid 3124 * 3125 * @adev: amdgpu_device pointer 3126 * 3127 * Checks the reset magic value written to the gart pointer in VRAM. 3128 * The driver calls this after a GPU reset to see if the contents of 3129 * VRAM are lost or not. 3130 * Returns true if vram is lost, false if not. 3131 */ 3132 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 3133 { 3134 if (memcmp(adev->gart.ptr, adev->reset_magic, 3135 AMDGPU_RESET_MAGIC_NUM)) 3136 return true; 3137 3138 if (!amdgpu_in_reset(adev)) 3139 return false; 3140 3141 /* 3142 * For all ASICs with baco/mode1 reset, the VRAM is 3143 * always assumed to be lost.
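 * Other reset methods are assumed to preserve VRAM contents, hence the default of returning false below.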
3144 */ 3145 switch (amdgpu_asic_reset_method(adev)) { 3146 case AMD_RESET_METHOD_BACO: 3147 case AMD_RESET_METHOD_MODE1: 3148 return true; 3149 default: 3150 return false; 3151 } 3152 } 3153 3154 /** 3155 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 3156 * 3157 * @adev: amdgpu_device pointer 3158 * @state: clockgating state (gate or ungate) 3159 * 3160 * The list of all the hardware IPs that make up the asic is walked and the 3161 * set_clockgating_state callbacks are run. 3162 * Late initialization pass enabling clockgating for hardware IPs. 3163 * Fini or suspend, pass disabling clockgating for hardware IPs. 3164 * Returns 0 on success, negative error code on failure. 3165 */ 3166 3167 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 3168 enum amd_clockgating_state state) 3169 { 3170 int i, j, r; 3171 3172 if (amdgpu_emu_mode == 1) 3173 return 0; 3174 3175 for (j = 0; j < adev->num_ip_blocks; j++) { 3176 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3177 if (!adev->ip_blocks[i].status.late_initialized) 3178 continue; 3179 /* skip CG for GFX, SDMA on S0ix */ 3180 if (adev->in_s0ix && 3181 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3182 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3183 continue; 3184 /* skip CG for VCE/UVD, it's handled specially */ 3185 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3186 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3187 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3188 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3189 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 3190 /* enable clockgating to save power */ 3191 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i], 3192 state); 3193 if (r) { 3194 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 3195 adev->ip_blocks[i].version->funcs->name, r); 3196 return r; 3197 } 3198 } 3199 } 3200 3201 return 0; 3202 } 3203 3204 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 3205 enum amd_powergating_state state) 3206 { 3207 int i, j, r; 3208 3209 if (amdgpu_emu_mode == 1) 3210 return 0; 3211 3212 for (j = 0; j < adev->num_ip_blocks; j++) { 3213 i = state == AMD_PG_STATE_GATE ? 
j : adev->num_ip_blocks - j - 1; 3214 if (!adev->ip_blocks[i].status.late_initialized) 3215 continue; 3216 /* skip PG for GFX, SDMA on S0ix */ 3217 if (adev->in_s0ix && 3218 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3219 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3220 continue; 3221 /* skip PG for VCE/UVD, it's handled specially */ 3222 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3223 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3224 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3225 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3226 adev->ip_blocks[i].version->funcs->set_powergating_state) { 3227 /* enable powergating to save power */ 3228 r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i], 3229 state); 3230 if (r) { 3231 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 3232 adev->ip_blocks[i].version->funcs->name, r); 3233 return r; 3234 } 3235 } 3236 } 3237 return 0; 3238 } 3239 3240 static int amdgpu_device_enable_mgpu_fan_boost(void) 3241 { 3242 struct amdgpu_gpu_instance *gpu_ins; 3243 struct amdgpu_device *adev; 3244 int i, ret = 0; 3245 3246 mutex_lock(&mgpu_info.mutex); 3247 3248 /* 3249 * MGPU fan boost feature should be enabled 3250 * only when there are two or more dGPUs in 3251 * the system 3252 */ 3253 if (mgpu_info.num_dgpu < 2) 3254 goto out; 3255 3256 for (i = 0; i < mgpu_info.num_dgpu; i++) { 3257 gpu_ins = &(mgpu_info.gpu_ins[i]); 3258 adev = gpu_ins->adev; 3259 if (!(adev->flags & AMD_IS_APU) && 3260 !gpu_ins->mgpu_fan_enabled) { 3261 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 3262 if (ret) 3263 break; 3264 3265 gpu_ins->mgpu_fan_enabled = 1; 3266 } 3267 } 3268 3269 out: 3270 mutex_unlock(&mgpu_info.mutex); 3271 3272 return ret; 3273 } 3274 3275 /** 3276 * amdgpu_device_ip_late_init - run late init for hardware IPs 3277 * 3278 * @adev: amdgpu_device pointer 3279 * 3280 * Late initialization pass for hardware IPs. The list of all the hardware 3281 * IPs that make up the asic is walked and the late_init callbacks are run. 3282 * late_init covers any special initialization that an IP requires 3283 * after all of them have been initialized or something that needs to happen 3284 * late in the init process. 3285 * Returns 0 on success, negative error code on failure.
3286 */ 3287 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 3288 { 3289 struct amdgpu_gpu_instance *gpu_instance; 3290 int i = 0, r; 3291 3292 for (i = 0; i < adev->num_ip_blocks; i++) { 3293 if (!adev->ip_blocks[i].status.hw) 3294 continue; 3295 if (adev->ip_blocks[i].version->funcs->late_init) { 3296 r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]); 3297 if (r) { 3298 DRM_ERROR("late_init of IP block <%s> failed %d\n", 3299 adev->ip_blocks[i].version->funcs->name, r); 3300 return r; 3301 } 3302 } 3303 adev->ip_blocks[i].status.late_initialized = true; 3304 } 3305 3306 r = amdgpu_ras_late_init(adev); 3307 if (r) { 3308 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 3309 return r; 3310 } 3311 3312 if (!amdgpu_reset_in_recovery(adev)) 3313 amdgpu_ras_set_error_query_ready(adev, true); 3314 3315 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3316 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3317 3318 amdgpu_device_fill_reset_magic(adev); 3319 3320 r = amdgpu_device_enable_mgpu_fan_boost(); 3321 if (r) 3322 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 3323 3324 /* For passthrough configurations on arcturus and aldebaran, enable special handling of SBR */ 3325 if (amdgpu_passthrough(adev) && 3326 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 3327 adev->asic_type == CHIP_ALDEBARAN)) 3328 amdgpu_dpm_handle_passthrough_sbr(adev, true); 3329 3330 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3331 mutex_lock(&mgpu_info.mutex); 3332 3333 /* 3334 * Reset the device p-state to low, as it was booted with the p-state high. 3335 * 3336 * This should be performed only after all devices from the same 3337 * hive get initialized. 3338 * 3339 * However, the number of devices in the hive is not known in advance, 3340 * as it is counted one by one during device initialization. 3341 * 3342 * So, we wait for all XGMI interlinked devices to be initialized. 3343 * This may bring some delay, as those devices may come from 3344 * different hives. But that should be OK.
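 * The p-state is only lowered once the number of initialized dGPUs matches the number of physical nodes in the hive.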
3345 */ 3346 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3347 for (i = 0; i < mgpu_info.num_gpu; i++) { 3348 gpu_instance = &(mgpu_info.gpu_ins[i]); 3349 if (gpu_instance->adev->flags & AMD_IS_APU) 3350 continue; 3351 3352 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3353 AMDGPU_XGMI_PSTATE_MIN); 3354 if (r) { 3355 DRM_ERROR("pstate setting failed (%d).\n", r); 3356 break; 3357 } 3358 } 3359 } 3360 3361 mutex_unlock(&mgpu_info.mutex); 3362 } 3363 3364 return 0; 3365 } 3366 3367 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block) 3368 { 3369 int r; 3370 3371 if (!ip_block->version->funcs->hw_fini) { 3372 DRM_ERROR("hw_fini of IP block <%s> not defined\n", 3373 ip_block->version->funcs->name); 3374 } else { 3375 r = ip_block->version->funcs->hw_fini(ip_block); 3376 /* XXX handle errors */ 3377 if (r) { 3378 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3379 ip_block->version->funcs->name, r); 3380 } 3381 } 3382 3383 ip_block->status.hw = false; 3384 } 3385 3386 /** 3387 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3388 * 3389 * @adev: amdgpu_device pointer 3390 * 3391 * For ASICs need to disable SMC first 3392 */ 3393 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3394 { 3395 int i; 3396 3397 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3398 return; 3399 3400 for (i = 0; i < adev->num_ip_blocks; i++) { 3401 if (!adev->ip_blocks[i].status.hw) 3402 continue; 3403 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3404 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3405 break; 3406 } 3407 } 3408 } 3409 3410 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3411 { 3412 int i, r; 3413 3414 for (i = 0; i < adev->num_ip_blocks; i++) { 3415 if (!adev->ip_blocks[i].version->funcs->early_fini) 3416 continue; 3417 3418 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]); 3419 if (r) { 3420 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 3421 adev->ip_blocks[i].version->funcs->name, r); 3422 } 3423 } 3424 3425 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3426 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3427 3428 amdgpu_amdkfd_suspend(adev, false); 3429 3430 /* Workaround for ASICs need to disable SMC first */ 3431 amdgpu_device_smu_fini_early(adev); 3432 3433 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3434 if (!adev->ip_blocks[i].status.hw) 3435 continue; 3436 3437 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3438 } 3439 3440 if (amdgpu_sriov_vf(adev)) { 3441 if (amdgpu_virt_release_full_gpu(adev, false)) 3442 DRM_ERROR("failed to release exclusive mode on fini\n"); 3443 } 3444 3445 return 0; 3446 } 3447 3448 /** 3449 * amdgpu_device_ip_fini - run fini for hardware IPs 3450 * 3451 * @adev: amdgpu_device pointer 3452 * 3453 * Main teardown pass for hardware IPs. The list of all the hardware 3454 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3455 * are run. hw_fini tears down the hardware associated with each IP 3456 * and sw_fini tears down any software state associated with each IP. 3457 * Returns 0 on success, negative error code on failure. 
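 * This also tears down CPER, KFD and RAS state, and removes the device from its XGMI hive when applicable.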
3458 */ 3459 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3460 { 3461 int i, r; 3462 3463 amdgpu_cper_fini(adev); 3464 3465 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3466 amdgpu_virt_release_ras_err_handler_data(adev); 3467 3468 if (adev->gmc.xgmi.num_physical_nodes > 1) 3469 amdgpu_xgmi_remove_device(adev); 3470 3471 amdgpu_amdkfd_device_fini_sw(adev); 3472 3473 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3474 if (!adev->ip_blocks[i].status.sw) 3475 continue; 3476 3477 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3478 amdgpu_ucode_free_bo(adev); 3479 amdgpu_free_static_csa(&adev->virt.csa_obj); 3480 amdgpu_device_wb_fini(adev); 3481 amdgpu_device_mem_scratch_fini(adev); 3482 amdgpu_ib_pool_fini(adev); 3483 amdgpu_seq64_fini(adev); 3484 } 3485 if (adev->ip_blocks[i].version->funcs->sw_fini) { 3486 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]); 3487 /* XXX handle errors */ 3488 if (r) { 3489 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 3490 adev->ip_blocks[i].version->funcs->name, r); 3491 } 3492 } 3493 adev->ip_blocks[i].status.sw = false; 3494 adev->ip_blocks[i].status.valid = false; 3495 } 3496 3497 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3498 if (!adev->ip_blocks[i].status.late_initialized) 3499 continue; 3500 if (adev->ip_blocks[i].version->funcs->late_fini) 3501 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]); 3502 adev->ip_blocks[i].status.late_initialized = false; 3503 } 3504 3505 amdgpu_ras_fini(adev); 3506 3507 return 0; 3508 } 3509 3510 /** 3511 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3512 * 3513 * @work: work_struct. 3514 */ 3515 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3516 { 3517 struct amdgpu_device *adev = 3518 container_of(work, struct amdgpu_device, delayed_init_work.work); 3519 int r; 3520 3521 r = amdgpu_ib_ring_tests(adev); 3522 if (r) 3523 DRM_ERROR("ib ring test failed (%d).\n", r); 3524 } 3525 3526 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3527 { 3528 struct amdgpu_device *adev = 3529 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3530 3531 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3532 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3533 3534 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true, 0)) 3535 adev->gfx.gfx_off_state = true; 3536 } 3537 3538 /** 3539 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3540 * 3541 * @adev: amdgpu_device pointer 3542 * 3543 * Main suspend function for hardware IPs. The list of all the hardware 3544 * IPs that make up the asic is walked, clockgating is disabled and the 3545 * suspend callbacks are run. suspend puts the hardware and software state 3546 * in each IP into a state suitable for suspend. 3547 * Returns 0 on success, negative error code on failure. 3548 */ 3549 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3550 { 3551 int i, r; 3552 3553 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3554 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3555 3556 /* 3557 * Per PMFW team's suggestion, driver needs to handle gfxoff 3558 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3559 * scenario. Add the missing df cstate disablement here. 
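 * A failure to disallow the DF C-state is non-fatal and only produces a warning.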
3560 */ 3561 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3562 dev_warn(adev->dev, "Failed to disallow df cstate"); 3563 3564 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3565 if (!adev->ip_blocks[i].status.valid) 3566 continue; 3567 3568 /* displays are handled separately */ 3569 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3570 continue; 3571 3572 /* XXX handle errors */ 3573 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3574 if (r) 3575 return r; 3576 } 3577 3578 return 0; 3579 } 3580 3581 /** 3582 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3583 * 3584 * @adev: amdgpu_device pointer 3585 * 3586 * Main suspend function for hardware IPs. The list of all the hardware 3587 * IPs that make up the asic is walked, clockgating is disabled and the 3588 * suspend callbacks are run. suspend puts the hardware and software state 3589 * in each IP into a state suitable for suspend. 3590 * Returns 0 on success, negative error code on failure. 3591 */ 3592 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3593 { 3594 int i, r; 3595 3596 if (adev->in_s0ix) 3597 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3598 3599 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3600 if (!adev->ip_blocks[i].status.valid) 3601 continue; 3602 /* displays are handled in phase1 */ 3603 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3604 continue; 3605 /* PSP lost connection when err_event_athub occurs */ 3606 if (amdgpu_ras_intr_triggered() && 3607 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3608 adev->ip_blocks[i].status.hw = false; 3609 continue; 3610 } 3611 3612 /* skip unnecessary suspend if we have not initialized them yet */ 3613 if (!amdgpu_ip_member_of_hwini( 3614 adev, adev->ip_blocks[i].version->type)) 3615 continue; 3616 3617 /* skip suspend of gfx/mes and psp for S0ix: 3618 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3619 * like at runtime. PSP is also part of the always on hardware 3620 * so there is no need to suspend it. 3621 */ 3622 if (adev->in_s0ix && 3623 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3624 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3625 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3626 continue; 3627 3628 /* SDMA 5.x+ is part of the GFX power domain, so it's covered by GFXOFF */ 3629 if (adev->in_s0ix && 3630 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3631 IP_VERSION(5, 0, 0)) && 3632 (adev->ip_blocks[i].version->type == 3633 AMD_IP_BLOCK_TYPE_SDMA)) 3634 continue; 3635 3636 /* swPSP provides the IMU and RLC FW binaries to TOS once during cold-boot. 3637 * They reside in TMR and are expected to be reused by PSP-TOS to reload 3638 * from that location, and RLC autoload is also loaded from there based on 3639 * the PMFW -> PSP message during the re-init sequence. 3640 * Therefore, PSP suspend & resume should be skipped to avoid destroying 3641 * the TMR and reloading the FWs again for IMU enabled APU ASICs.
3642 */ 3643 if (amdgpu_in_reset(adev) && 3644 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3645 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3646 continue; 3647 3648 /* XXX handle errors */ 3649 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3650 adev->ip_blocks[i].status.hw = false; 3651 3652 /* handle putting the SMC in the appropriate state */ 3653 if (!amdgpu_sriov_vf(adev)) { 3654 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3655 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3656 if (r) { 3657 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3658 adev->mp1_state, r); 3659 return r; 3660 } 3661 } 3662 } 3663 } 3664 3665 return 0; 3666 } 3667 3668 /** 3669 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3670 * 3671 * @adev: amdgpu_device pointer 3672 * 3673 * Main suspend function for hardware IPs. The list of all the hardware 3674 * IPs that make up the asic is walked, clockgating is disabled and the 3675 * suspend callbacks are run. suspend puts the hardware and software state 3676 * in each IP into a state suitable for suspend. 3677 * Returns 0 on success, negative error code on failure. 3678 */ 3679 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3680 { 3681 int r; 3682 3683 if (amdgpu_sriov_vf(adev)) { 3684 amdgpu_virt_fini_data_exchange(adev); 3685 amdgpu_virt_request_full_gpu(adev, false); 3686 } 3687 3688 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3689 3690 r = amdgpu_device_ip_suspend_phase1(adev); 3691 if (r) 3692 return r; 3693 r = amdgpu_device_ip_suspend_phase2(adev); 3694 3695 if (amdgpu_sriov_vf(adev)) 3696 amdgpu_virt_release_full_gpu(adev, false); 3697 3698 return r; 3699 } 3700 3701 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3702 { 3703 int i, r; 3704 3705 static enum amd_ip_block_type ip_order[] = { 3706 AMD_IP_BLOCK_TYPE_COMMON, 3707 AMD_IP_BLOCK_TYPE_GMC, 3708 AMD_IP_BLOCK_TYPE_PSP, 3709 AMD_IP_BLOCK_TYPE_IH, 3710 }; 3711 3712 for (i = 0; i < adev->num_ip_blocks; i++) { 3713 int j; 3714 struct amdgpu_ip_block *block; 3715 3716 block = &adev->ip_blocks[i]; 3717 block->status.hw = false; 3718 3719 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3720 3721 if (block->version->type != ip_order[j] || 3722 !block->status.valid) 3723 continue; 3724 3725 r = block->version->funcs->hw_init(&adev->ip_blocks[i]); 3726 if (r) { 3727 dev_err(adev->dev, "RE-INIT-early: %s failed\n", 3728 block->version->funcs->name); 3729 return r; 3730 } 3731 block->status.hw = true; 3732 } 3733 } 3734 3735 return 0; 3736 } 3737 3738 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3739 { 3740 struct amdgpu_ip_block *block; 3741 int i, r = 0; 3742 3743 static enum amd_ip_block_type ip_order[] = { 3744 AMD_IP_BLOCK_TYPE_SMC, 3745 AMD_IP_BLOCK_TYPE_DCE, 3746 AMD_IP_BLOCK_TYPE_GFX, 3747 AMD_IP_BLOCK_TYPE_SDMA, 3748 AMD_IP_BLOCK_TYPE_MES, 3749 AMD_IP_BLOCK_TYPE_UVD, 3750 AMD_IP_BLOCK_TYPE_VCE, 3751 AMD_IP_BLOCK_TYPE_VCN, 3752 AMD_IP_BLOCK_TYPE_JPEG 3753 }; 3754 3755 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3756 block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]); 3757 3758 if (!block) 3759 continue; 3760 3761 if (block->status.valid && !block->status.hw) { 3762 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) { 3763 r = amdgpu_ip_block_resume(block); 3764 } else { 3765 r = block->version->funcs->hw_init(block); 3766 } 3767 3768 if (r) { 3769 dev_err(adev->dev, "RE-INIT-late: %s failed\n", 3770 block->version->funcs->name); 3771 break; 3772 } 3773 
block->status.hw = true; 3774 } 3775 } 3776 3777 return r; 3778 } 3779 3780 /** 3781 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3782 * 3783 * @adev: amdgpu_device pointer 3784 * 3785 * First resume function for hardware IPs. The list of all the hardware 3786 * IPs that make up the asic is walked and the resume callbacks are run for 3787 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3788 * after a suspend and updates the software state as necessary. This 3789 * function is also used for restoring the GPU after a GPU reset. 3790 * Returns 0 on success, negative error code on failure. 3791 */ 3792 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3793 { 3794 int i, r; 3795 3796 for (i = 0; i < adev->num_ip_blocks; i++) { 3797 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3798 continue; 3799 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3800 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3801 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3802 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3803 3804 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3805 if (r) 3806 return r; 3807 } 3808 } 3809 3810 return 0; 3811 } 3812 3813 /** 3814 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3815 * 3816 * @adev: amdgpu_device pointer 3817 * 3818 * Second resume function for hardware IPs. The list of all the hardware 3819 * IPs that make up the asic is walked and the resume callbacks are run for 3820 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3821 * functional state after a suspend and updates the software state as 3822 * necessary. This function is also used for restoring the GPU after a GPU 3823 * reset. 3824 * Returns 0 on success, negative error code on failure. 3825 */ 3826 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3827 { 3828 int i, r; 3829 3830 for (i = 0; i < adev->num_ip_blocks; i++) { 3831 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3832 continue; 3833 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3834 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3835 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3836 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE || 3837 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3838 continue; 3839 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3840 if (r) 3841 return r; 3842 } 3843 3844 return 0; 3845 } 3846 3847 /** 3848 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs 3849 * 3850 * @adev: amdgpu_device pointer 3851 * 3852 * Third resume function for hardware IPs. The list of all the hardware 3853 * IPs that make up the asic is walked and the resume callbacks are run for 3854 * all DCE. resume puts the hardware into a functional state after a suspend 3855 * and updates the software state as necessary. This function is also used 3856 * for restoring the GPU after a GPU reset. 3857 * 3858 * Returns 0 on success, negative error code on failure. 
 */
static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) {
			r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
			if (r)
				return r;
		}
	}

	return 0;
}

/**
 * amdgpu_device_ip_resume - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main resume function for hardware IPs. The hardware IPs
 * are split into multiple resume phases because they are
 * also used in recovering from a GPU reset and some additional
 * steps need to be taken between them. In this case (S3/S4) they are
 * run sequentially.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
{
	int r;

	r = amdgpu_device_ip_resume_phase1(adev);
	if (r)
		return r;

	r = amdgpu_device_fw_loading(adev);
	if (r)
		return r;

	r = amdgpu_device_ip_resume_phase2(adev);

	if (adev->mman.buffer_funcs_ring->sched.ready)
		amdgpu_ttm_set_buffer_funcs_status(adev, true);

	if (r)
		return r;

	amdgpu_fence_driver_hw_init(adev);

	r = amdgpu_device_ip_resume_phase3(adev);

	return r;
}

/**
 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
 *
 * @adev: amdgpu_device pointer
 *
 * Query the VBIOS data tables to determine if the board supports SR-IOV.
 */
static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
{
	if (amdgpu_sriov_vf(adev)) {
		if (adev->is_atom_fw) {
			if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
		} else {
			if (amdgpu_atombios_has_gpu_virtualization_table(adev))
				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
		}

		if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
	}
}

/**
 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
 *
 * @asic_type: AMD asic type
 *
 * Check if there is DC (new modesetting infrastructure) support for an asic.
 * Returns true if DC has support, false if not.
 */
bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
{
	switch (asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#if defined(CONFIG_DRM_AMD_DC)
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
		/*
		 * We have systems in the wild with these ASICs that require
		 * LVDS and VGA support which is not supported with DC.
		 *
		 * Fallback to the non-DC driver here by default so as not to
		 * cause regressions.
		 */
#if defined(CONFIG_DRM_AMD_DC_SI)
		return amdgpu_dc > 0;
#else
		return false;
#endif
	case CHIP_BONAIRE:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
		/*
		 * We have systems in the wild with these ASICs that require
		 * VGA support which is not supported with DC.
3980 * 3981 * Fallback to the non-DC driver here by default so as not to 3982 * cause regressions. 3983 */ 3984 return amdgpu_dc > 0; 3985 default: 3986 return amdgpu_dc != 0; 3987 #else 3988 default: 3989 if (amdgpu_dc > 0) 3990 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 3991 return false; 3992 #endif 3993 } 3994 } 3995 3996 /** 3997 * amdgpu_device_has_dc_support - check if dc is supported 3998 * 3999 * @adev: amdgpu_device pointer 4000 * 4001 * Returns true for supported, false for not supported 4002 */ 4003 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 4004 { 4005 if (adev->enable_virtual_display || 4006 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 4007 return false; 4008 4009 return amdgpu_device_asic_has_dc_support(adev->asic_type); 4010 } 4011 4012 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 4013 { 4014 struct amdgpu_device *adev = 4015 container_of(__work, struct amdgpu_device, xgmi_reset_work); 4016 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 4017 4018 /* It's a bug to not have a hive within this function */ 4019 if (WARN_ON(!hive)) 4020 return; 4021 4022 /* 4023 * Use task barrier to synchronize all xgmi reset works across the 4024 * hive. task_barrier_enter and task_barrier_exit will block 4025 * until all the threads running the xgmi reset works reach 4026 * those points. task_barrier_full will do both blocks. 4027 */ 4028 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 4029 4030 task_barrier_enter(&hive->tb); 4031 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 4032 4033 if (adev->asic_reset_res) 4034 goto fail; 4035 4036 task_barrier_exit(&hive->tb); 4037 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 4038 4039 if (adev->asic_reset_res) 4040 goto fail; 4041 4042 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 4043 } else { 4044 4045 task_barrier_full(&hive->tb); 4046 adev->asic_reset_res = amdgpu_asic_reset(adev); 4047 } 4048 4049 fail: 4050 if (adev->asic_reset_res) 4051 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 4052 adev->asic_reset_res, adev_to_drm(adev)->unique); 4053 amdgpu_put_xgmi_hive(hive); 4054 } 4055 4056 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 4057 { 4058 char *input = amdgpu_lockup_timeout; 4059 char *timeout_setting = NULL; 4060 int index = 0; 4061 long timeout; 4062 int ret = 0; 4063 4064 /* 4065 * By default timeout for non compute jobs is 10000 4066 * and 60000 for compute jobs. 4067 * In SR-IOV or passthrough mode, timeout for compute 4068 * jobs are 60000 by default. 4069 */ 4070 adev->gfx_timeout = msecs_to_jiffies(10000); 4071 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4072 if (amdgpu_sriov_vf(adev)) 4073 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
4074 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 4075 else 4076 adev->compute_timeout = msecs_to_jiffies(60000); 4077 4078 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4079 while ((timeout_setting = strsep(&input, ",")) && 4080 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4081 ret = kstrtol(timeout_setting, 0, &timeout); 4082 if (ret) 4083 return ret; 4084 4085 if (timeout == 0) { 4086 index++; 4087 continue; 4088 } else if (timeout < 0) { 4089 timeout = MAX_SCHEDULE_TIMEOUT; 4090 dev_warn(adev->dev, "lockup timeout disabled"); 4091 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 4092 } else { 4093 timeout = msecs_to_jiffies(timeout); 4094 } 4095 4096 switch (index++) { 4097 case 0: 4098 adev->gfx_timeout = timeout; 4099 break; 4100 case 1: 4101 adev->compute_timeout = timeout; 4102 break; 4103 case 2: 4104 adev->sdma_timeout = timeout; 4105 break; 4106 case 3: 4107 adev->video_timeout = timeout; 4108 break; 4109 default: 4110 break; 4111 } 4112 } 4113 /* 4114 * There is only one value specified and 4115 * it should apply to all non-compute jobs. 4116 */ 4117 if (index == 1) { 4118 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4119 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 4120 adev->compute_timeout = adev->gfx_timeout; 4121 } 4122 } 4123 4124 return ret; 4125 } 4126 4127 /** 4128 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 4129 * 4130 * @adev: amdgpu_device pointer 4131 * 4132 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 4133 */ 4134 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 4135 { 4136 struct iommu_domain *domain; 4137 4138 domain = iommu_get_domain_for_dev(adev->dev); 4139 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 4140 adev->ram_is_direct_mapped = true; 4141 } 4142 4143 #if defined(CONFIG_HSA_AMD_P2P) 4144 /** 4145 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled. 4146 * 4147 * @adev: amdgpu_device pointer 4148 * 4149 * return if IOMMU remapping bar address 4150 */ 4151 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev) 4152 { 4153 struct iommu_domain *domain; 4154 4155 domain = iommu_get_domain_for_dev(adev->dev); 4156 if (domain && (domain->type == IOMMU_DOMAIN_DMA || 4157 domain->type == IOMMU_DOMAIN_DMA_FQ)) 4158 return true; 4159 4160 return false; 4161 } 4162 #endif 4163 4164 static const struct attribute *amdgpu_dev_attributes[] = { 4165 &dev_attr_pcie_replay_count.attr, 4166 NULL 4167 }; 4168 4169 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 4170 { 4171 if (amdgpu_mcbp == 1) 4172 adev->gfx.mcbp = true; 4173 else if (amdgpu_mcbp == 0) 4174 adev->gfx.mcbp = false; 4175 4176 if (amdgpu_sriov_vf(adev)) 4177 adev->gfx.mcbp = true; 4178 4179 if (adev->gfx.mcbp) 4180 DRM_INFO("MCBP is enabled\n"); 4181 } 4182 4183 /** 4184 * amdgpu_device_init - initialize the driver 4185 * 4186 * @adev: amdgpu_device pointer 4187 * @flags: driver flags 4188 * 4189 * Initializes the driver info and hw (all asics). 4190 * Returns 0 for success or an error on failure. 4191 * Called at driver startup. 
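 *
 * As part of init, amdgpu_device_get_job_timeout_settings() above parses the
 * lockup_timeout module parameter. Illustrative sketch of the parsing rules
 * only (the option string below is an example, not taken from this file): up
 * to four comma-separated values are read in the order gfx, compute, sdma,
 * video, e.g.
 *
 *	amdgpu.lockup_timeout=10000,60000,10000,10000
 *
 * where 0 keeps the built-in default, a negative value disables that timeout,
 * and a single value is applied to all non-compute jobs.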
4192 */ 4193 int amdgpu_device_init(struct amdgpu_device *adev, 4194 uint32_t flags) 4195 { 4196 struct drm_device *ddev = adev_to_drm(adev); 4197 struct pci_dev *pdev = adev->pdev; 4198 int r, i; 4199 bool px = false; 4200 u32 max_MBps; 4201 int tmp; 4202 4203 adev->shutdown = false; 4204 adev->flags = flags; 4205 4206 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 4207 adev->asic_type = amdgpu_force_asic_type; 4208 else 4209 adev->asic_type = flags & AMD_ASIC_MASK; 4210 4211 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 4212 if (amdgpu_emu_mode == 1) 4213 adev->usec_timeout *= 10; 4214 adev->gmc.gart_size = 512 * 1024 * 1024; 4215 adev->accel_working = false; 4216 adev->num_rings = 0; 4217 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 4218 adev->mman.buffer_funcs = NULL; 4219 adev->mman.buffer_funcs_ring = NULL; 4220 adev->vm_manager.vm_pte_funcs = NULL; 4221 adev->vm_manager.vm_pte_num_scheds = 0; 4222 adev->gmc.gmc_funcs = NULL; 4223 adev->harvest_ip_mask = 0x0; 4224 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 4225 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 4226 4227 adev->smc_rreg = &amdgpu_invalid_rreg; 4228 adev->smc_wreg = &amdgpu_invalid_wreg; 4229 adev->pcie_rreg = &amdgpu_invalid_rreg; 4230 adev->pcie_wreg = &amdgpu_invalid_wreg; 4231 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 4232 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 4233 adev->pciep_rreg = &amdgpu_invalid_rreg; 4234 adev->pciep_wreg = &amdgpu_invalid_wreg; 4235 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 4236 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 4237 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 4238 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 4239 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 4240 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 4241 adev->didt_rreg = &amdgpu_invalid_rreg; 4242 adev->didt_wreg = &amdgpu_invalid_wreg; 4243 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 4244 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 4245 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 4246 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 4247 4248 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 4249 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 4250 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 4251 4252 /* mutex initialization are all done here so we 4253 * can recall function without having locking issues 4254 */ 4255 mutex_init(&adev->firmware.mutex); 4256 mutex_init(&adev->pm.mutex); 4257 mutex_init(&adev->gfx.gpu_clock_mutex); 4258 mutex_init(&adev->srbm_mutex); 4259 mutex_init(&adev->gfx.pipe_reserve_mutex); 4260 mutex_init(&adev->gfx.gfx_off_mutex); 4261 mutex_init(&adev->gfx.partition_mutex); 4262 mutex_init(&adev->grbm_idx_mutex); 4263 mutex_init(&adev->mn_lock); 4264 mutex_init(&adev->virt.vf_errors.lock); 4265 hash_init(adev->mn_hash); 4266 mutex_init(&adev->psp.mutex); 4267 mutex_init(&adev->notifier_lock); 4268 mutex_init(&adev->pm.stable_pstate_ctx_lock); 4269 mutex_init(&adev->benchmark_mutex); 4270 mutex_init(&adev->gfx.reset_sem_mutex); 4271 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */ 4272 mutex_init(&adev->enforce_isolation_mutex); 4273 mutex_init(&adev->gfx.kfd_sch_mutex); 4274 4275 amdgpu_device_init_apu_flags(adev); 4276 4277 r = amdgpu_device_check_arguments(adev); 4278 if (r) 4279 return r; 4280 4281 spin_lock_init(&adev->mmio_idx_lock); 4282 
spin_lock_init(&adev->smc_idx_lock); 4283 spin_lock_init(&adev->pcie_idx_lock); 4284 spin_lock_init(&adev->uvd_ctx_idx_lock); 4285 spin_lock_init(&adev->didt_idx_lock); 4286 spin_lock_init(&adev->gc_cac_idx_lock); 4287 spin_lock_init(&adev->se_cac_idx_lock); 4288 spin_lock_init(&adev->audio_endpt_idx_lock); 4289 spin_lock_init(&adev->mm_stats.lock); 4290 spin_lock_init(&adev->virt.rlcg_reg_lock); 4291 spin_lock_init(&adev->wb.lock); 4292 4293 INIT_LIST_HEAD(&adev->reset_list); 4294 4295 INIT_LIST_HEAD(&adev->ras_list); 4296 4297 INIT_LIST_HEAD(&adev->pm.od_kobj_list); 4298 4299 INIT_DELAYED_WORK(&adev->delayed_init_work, 4300 amdgpu_device_delayed_init_work_handler); 4301 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 4302 amdgpu_device_delay_enable_gfx_off); 4303 /* 4304 * Initialize the enforce_isolation work structures for each XCP 4305 * partition. This work handler is responsible for enforcing shader 4306 * isolation on AMD GPUs. It counts the number of emitted fences for 4307 * each GFX and compute ring. If there are any fences, it schedules 4308 * the `enforce_isolation_work` to be run after a delay. If there are 4309 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the 4310 * runqueue. 4311 */ 4312 for (i = 0; i < MAX_XCP; i++) { 4313 INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work, 4314 amdgpu_gfx_enforce_isolation_handler); 4315 adev->gfx.enforce_isolation[i].adev = adev; 4316 adev->gfx.enforce_isolation[i].xcp_id = i; 4317 } 4318 4319 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 4320 4321 adev->gfx.gfx_off_req_count = 1; 4322 adev->gfx.gfx_off_residency = 0; 4323 adev->gfx.gfx_off_entrycount = 0; 4324 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 4325 4326 atomic_set(&adev->throttling_logging_enabled, 1); 4327 /* 4328 * If throttling continues, logging will be performed every minute 4329 * to avoid log flooding. "-1" is subtracted since the thermal 4330 * throttling interrupt comes every second. Thus, the total logging 4331 * interval is 59 seconds(retelimited printk interval) + 1(waiting 4332 * for throttling interrupt) = 60 seconds. 4333 */ 4334 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 4335 ratelimit_state_init(&adev->virt.ras_telemetry_rs, 5 * HZ, 1); 4336 4337 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 4338 ratelimit_set_flags(&adev->virt.ras_telemetry_rs, RATELIMIT_MSG_ON_RELEASE); 4339 4340 /* Registers mapping */ 4341 /* TODO: block userspace mapping of io register */ 4342 if (adev->asic_type >= CHIP_BONAIRE) { 4343 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 4344 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 4345 } else { 4346 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 4347 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 4348 } 4349 4350 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 4351 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 4352 4353 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 4354 if (!adev->rmmio) 4355 return -ENOMEM; 4356 4357 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 4358 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size); 4359 4360 /* 4361 * Reset domain needs to be present early, before XGMI hive discovered 4362 * (if any) and initialized to use reset sem and in_gpu reset flag 4363 * early on during init and before calling to RREG32. 
 */
	adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
	if (!adev->reset_domain)
		return -ENOMEM;

	/* detect hw virtualization here */
	amdgpu_detect_virtualization(adev);

	amdgpu_device_get_pcie_info(adev);

	r = amdgpu_device_get_job_timeout_settings(adev);
	if (r) {
		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
		return r;
	}

	amdgpu_device_set_mcbp(adev);

	/*
	 * By default, use the default init level where all blocks are expected
	 * to be initialized. At present a 'swinit' of blocks is required to be
	 * completed before the need for a different level is detected.
	 */
	amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT);
	/* early init functions */
	r = amdgpu_device_ip_early_init(adev);
	if (r)
		return r;

	/* Get rid of things like offb */
	r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name);
	if (r)
		return r;

	/* Enable TMZ based on IP_VERSION */
	amdgpu_gmc_tmz_set(adev);

	if (amdgpu_sriov_vf(adev) &&
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
		/* VF MMIO access (except mailbox range) from CPU
		 * will be blocked during sriov runtime
		 */
		adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;

	amdgpu_gmc_noretry_set(adev);
	/* Need to get xgmi info early to decide the reset behavior */
	if (adev->gmc.xgmi.supported) {
		r = adev->gfxhub.funcs->get_xgmi_info(adev);
		if (r)
			return r;
	}

	/* enable PCIE atomic ops */
	if (amdgpu_sriov_vf(adev)) {
		if (adev->virt.fw_reserve.p_pf2vf)
			adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
				adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
				(PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
	/* APUs with gfx9 onwards don't rely on PCIe atomics; the internal path
	 * natively supports atomics, so set have_atomics_support to true.
	 */
	} else if ((adev->flags & AMD_IS_APU) &&
		   (amdgpu_ip_version(adev, GC_HWIP, 0) >
		    IP_VERSION(9, 0, 0))) {
		adev->have_atomics_support = true;
	} else {
		adev->have_atomics_support =
			!pci_enable_atomic_ops_to_root(adev->pdev,
					PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
					PCI_EXP_DEVCAP2_ATOMIC_COMP64);
	}

	if (!adev->have_atomics_support)
		dev_info(adev->dev, "PCIe atomic ops are not supported\n");

	/* doorbell bar mapping and doorbell index init */
	amdgpu_doorbell_init(adev);

	if (amdgpu_emu_mode == 1) {
		/* post the asic on emulation mode */
		emu_soc_asic_init(adev);
		goto fence_driver_init;
	}

	amdgpu_reset_init(adev);

	/* detect if we are with an SRIOV vbios */
	if (adev->bios)
		amdgpu_device_detect_sriov_bios(adev);

	/* check if we need to reset the asic
	 *  E.g., driver was not cleanly unloaded previously, etc.
4456 */ 4457 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4458 if (adev->gmc.xgmi.num_physical_nodes) { 4459 dev_info(adev->dev, "Pending hive reset.\n"); 4460 amdgpu_set_init_level(adev, 4461 AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 4462 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && 4463 !amdgpu_device_has_display_hardware(adev)) { 4464 r = psp_gpu_reset(adev); 4465 } else { 4466 tmp = amdgpu_reset_method; 4467 /* It should do a default reset when loading or reloading the driver, 4468 * regardless of the module parameter reset_method. 4469 */ 4470 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4471 r = amdgpu_asic_reset(adev); 4472 amdgpu_reset_method = tmp; 4473 } 4474 4475 if (r) { 4476 dev_err(adev->dev, "asic reset on init failed\n"); 4477 goto failed; 4478 } 4479 } 4480 4481 /* Post card if necessary */ 4482 if (amdgpu_device_need_post(adev)) { 4483 if (!adev->bios) { 4484 dev_err(adev->dev, "no vBIOS found\n"); 4485 r = -EINVAL; 4486 goto failed; 4487 } 4488 DRM_INFO("GPU posting now...\n"); 4489 r = amdgpu_device_asic_init(adev); 4490 if (r) { 4491 dev_err(adev->dev, "gpu post error!\n"); 4492 goto failed; 4493 } 4494 } 4495 4496 if (adev->bios) { 4497 if (adev->is_atom_fw) { 4498 /* Initialize clocks */ 4499 r = amdgpu_atomfirmware_get_clock_info(adev); 4500 if (r) { 4501 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4502 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4503 goto failed; 4504 } 4505 } else { 4506 /* Initialize clocks */ 4507 r = amdgpu_atombios_get_clock_info(adev); 4508 if (r) { 4509 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4510 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4511 goto failed; 4512 } 4513 /* init i2c buses */ 4514 amdgpu_i2c_init(adev); 4515 } 4516 } 4517 4518 fence_driver_init: 4519 /* Fence driver */ 4520 r = amdgpu_fence_driver_sw_init(adev); 4521 if (r) { 4522 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4523 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4524 goto failed; 4525 } 4526 4527 /* init the mode config */ 4528 drm_mode_config_init(adev_to_drm(adev)); 4529 4530 r = amdgpu_device_ip_init(adev); 4531 if (r) { 4532 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4533 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4534 goto release_ras_con; 4535 } 4536 4537 amdgpu_fence_driver_hw_init(adev); 4538 4539 dev_info(adev->dev, 4540 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4541 adev->gfx.config.max_shader_engines, 4542 adev->gfx.config.max_sh_per_se, 4543 adev->gfx.config.max_cu_per_sh, 4544 adev->gfx.cu_info.number); 4545 4546 adev->accel_working = true; 4547 4548 amdgpu_vm_check_compute_bug(adev); 4549 4550 /* Initialize the buffer migration limit. */ 4551 if (amdgpu_moverate >= 0) 4552 max_MBps = amdgpu_moverate; 4553 else 4554 max_MBps = 8; /* Allow 8 MB/s. */ 4555 /* Get a log2 for easy divisions. */ 4556 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4557 4558 /* 4559 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4560 * Otherwise the mgpu fan boost feature will be skipped due to the 4561 * gpu instance is counted less. 4562 */ 4563 amdgpu_register_gpu_instance(adev); 4564 4565 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4566 * explicit gating rather than handling it automatically. 
 */
	if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
		r = amdgpu_device_ip_late_init(adev);
		if (r) {
			dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
			goto release_ras_con;
		}
		/* must succeed. */
		amdgpu_ras_resume(adev);
		queue_delayed_work(system_wq, &adev->delayed_init_work,
				   msecs_to_jiffies(AMDGPU_RESUME_MS));
	}

	if (amdgpu_sriov_vf(adev)) {
		amdgpu_virt_release_full_gpu(adev, true);
		flush_delayed_work(&adev->delayed_init_work);
	}

	/*
	 * Register these sysfs interfaces after late_init, since some of the
	 * operations performed in late_init can affect how the sysfs
	 * interfaces are created.
	 */
	r = amdgpu_atombios_sysfs_init(adev);
	if (r)
		drm_err(&adev->ddev,
			"registering atombios sysfs failed (%d).\n", r);

	r = amdgpu_pm_sysfs_init(adev);
	if (r)
		DRM_ERROR("registering pm sysfs failed (%d).\n", r);

	r = amdgpu_ucode_sysfs_init(adev);
	if (r) {
		adev->ucode_sysfs_en = false;
		DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
	} else
		adev->ucode_sysfs_en = true;

	r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
	if (r)
		dev_err(adev->dev, "Could not create amdgpu device attr\n");

	r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group);
	if (r)
		dev_err(adev->dev,
			"Could not create amdgpu board attributes\n");

	amdgpu_fru_sysfs_init(adev);
	amdgpu_reg_state_sysfs_init(adev);
	amdgpu_xcp_cfg_sysfs_init(adev);

	if (IS_ENABLED(CONFIG_PERF_EVENTS)) {
		r = amdgpu_pmu_init(adev);
		if (r)
			dev_err(adev->dev, "amdgpu_pmu_init failed\n");
	}

	/* Keep the stored PCI config space at hand for restore after a sudden PCI error */
	if (amdgpu_device_cache_pci_state(adev->pdev))
		pci_restore_state(pdev);

	/* if we have more than one VGA card, disable the amdgpu VGA resources */
	/* this will fail for cards that aren't VGA class devices, just
	 * ignore it
	 */
	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
		vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);

	px = amdgpu_device_supports_px(ddev);

	if (px || (!dev_is_removable(&adev->pdev->dev) &&
		   apple_gmux_detect(NULL, NULL)))
		vga_switcheroo_register_client(adev->pdev,
					       &amdgpu_switcheroo_ops, px);

	if (px)
		vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);

	if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
		amdgpu_xgmi_reset_on_init(adev);

	amdgpu_device_check_iommu_direct_map(adev);

	adev->pm_nb.notifier_call = amdgpu_device_pm_notifier;
	r = register_pm_notifier(&adev->pm_nb);
	if (r)
		goto failed;

	return 0;

release_ras_con:
	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_release_full_gpu(adev, true);

	/* failed in exclusive mode due to timeout */
	if (amdgpu_sriov_vf(adev) &&
	    !amdgpu_sriov_runtime(adev) &&
	    amdgpu_virt_mmio_blocked(adev) &&
	    !amdgpu_virt_wait_reset(adev)) {
		dev_err(adev->dev, "VF exclusive mode timeout\n");
		/* Don't send request since VF is inactive.
*/ 4669 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4670 adev->virt.ops = NULL; 4671 r = -EAGAIN; 4672 } 4673 amdgpu_release_ras_context(adev); 4674 4675 failed: 4676 amdgpu_vf_error_trans_all(adev); 4677 4678 return r; 4679 } 4680 4681 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4682 { 4683 4684 /* Clear all CPU mappings pointing to this device */ 4685 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4686 4687 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4688 amdgpu_doorbell_fini(adev); 4689 4690 iounmap(adev->rmmio); 4691 adev->rmmio = NULL; 4692 if (adev->mman.aper_base_kaddr) 4693 iounmap(adev->mman.aper_base_kaddr); 4694 adev->mman.aper_base_kaddr = NULL; 4695 4696 /* Memory manager related */ 4697 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4698 arch_phys_wc_del(adev->gmc.vram_mtrr); 4699 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4700 } 4701 } 4702 4703 /** 4704 * amdgpu_device_fini_hw - tear down the driver 4705 * 4706 * @adev: amdgpu_device pointer 4707 * 4708 * Tear down the driver info (all asics). 4709 * Called at driver shutdown. 4710 */ 4711 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4712 { 4713 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4714 flush_delayed_work(&adev->delayed_init_work); 4715 4716 if (adev->mman.initialized) 4717 drain_workqueue(adev->mman.bdev.wq); 4718 adev->shutdown = true; 4719 4720 unregister_pm_notifier(&adev->pm_nb); 4721 4722 /* make sure IB test finished before entering exclusive mode 4723 * to avoid preemption on IB test 4724 */ 4725 if (amdgpu_sriov_vf(adev)) { 4726 amdgpu_virt_request_full_gpu(adev, false); 4727 amdgpu_virt_fini_data_exchange(adev); 4728 } 4729 4730 /* disable all interrupts */ 4731 amdgpu_irq_disable_all(adev); 4732 if (adev->mode_info.mode_config_initialized) { 4733 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4734 drm_helper_force_disable_all(adev_to_drm(adev)); 4735 else 4736 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4737 } 4738 amdgpu_fence_driver_hw_fini(adev); 4739 4740 if (adev->pm.sysfs_initialized) 4741 amdgpu_pm_sysfs_fini(adev); 4742 if (adev->ucode_sysfs_en) 4743 amdgpu_ucode_sysfs_fini(adev); 4744 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4745 amdgpu_fru_sysfs_fini(adev); 4746 4747 amdgpu_reg_state_sysfs_fini(adev); 4748 amdgpu_xcp_cfg_sysfs_fini(adev); 4749 4750 /* disable ras feature must before hw fini */ 4751 amdgpu_ras_pre_fini(adev); 4752 4753 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4754 4755 amdgpu_device_ip_fini_early(adev); 4756 4757 amdgpu_irq_fini_hw(adev); 4758 4759 if (adev->mman.initialized) 4760 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4761 4762 amdgpu_gart_dummy_page_fini(adev); 4763 4764 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4765 amdgpu_device_unmap_mmio(adev); 4766 4767 } 4768 4769 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4770 { 4771 int idx; 4772 bool px; 4773 4774 amdgpu_device_ip_fini(adev); 4775 amdgpu_fence_driver_sw_fini(adev); 4776 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4777 adev->accel_working = false; 4778 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4779 4780 amdgpu_reset_fini(adev); 4781 4782 /* free i2c buses */ 4783 amdgpu_i2c_fini(adev); 4784 4785 if (adev->bios) { 4786 if (amdgpu_emu_mode != 1) 4787 amdgpu_atombios_fini(adev); 4788 amdgpu_bios_release(adev); 4789 } 4790 4791 kfree(adev->fru_info); 4792 adev->fru_info = NULL; 4793 4794 px = 
amdgpu_device_supports_px(adev_to_drm(adev)); 4795 4796 if (px || (!dev_is_removable(&adev->pdev->dev) && 4797 apple_gmux_detect(NULL, NULL))) 4798 vga_switcheroo_unregister_client(adev->pdev); 4799 4800 if (px) 4801 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4802 4803 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4804 vga_client_unregister(adev->pdev); 4805 4806 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4807 4808 iounmap(adev->rmmio); 4809 adev->rmmio = NULL; 4810 amdgpu_doorbell_fini(adev); 4811 drm_dev_exit(idx); 4812 } 4813 4814 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4815 amdgpu_pmu_fini(adev); 4816 if (adev->mman.discovery_bin) 4817 amdgpu_discovery_fini(adev); 4818 4819 amdgpu_reset_put_reset_domain(adev->reset_domain); 4820 adev->reset_domain = NULL; 4821 4822 kfree(adev->pci_state); 4823 4824 } 4825 4826 /** 4827 * amdgpu_device_evict_resources - evict device resources 4828 * @adev: amdgpu device object 4829 * 4830 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4831 * of the vram memory type. Mainly used for evicting device resources 4832 * at suspend time. 4833 * 4834 */ 4835 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4836 { 4837 int ret; 4838 4839 /* No need to evict vram on APUs unless going to S4 */ 4840 if (!adev->in_s4 && (adev->flags & AMD_IS_APU)) 4841 return 0; 4842 4843 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4844 if (ret) 4845 DRM_WARN("evicting device resources failed\n"); 4846 return ret; 4847 } 4848 4849 /* 4850 * Suspend & resume. 4851 */ 4852 /** 4853 * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events 4854 * @nb: notifier block 4855 * @mode: suspend mode 4856 * @data: data 4857 * 4858 * This function is called when the system is about to suspend or hibernate. 4859 * It is used to evict resources from the device before the system goes to 4860 * sleep while there is still access to swap. 4861 */ 4862 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 4863 void *data) 4864 { 4865 struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb); 4866 int r; 4867 4868 switch (mode) { 4869 case PM_HIBERNATION_PREPARE: 4870 adev->in_s4 = true; 4871 fallthrough; 4872 case PM_SUSPEND_PREPARE: 4873 r = amdgpu_device_evict_resources(adev); 4874 /* 4875 * This is considered non-fatal at this time because 4876 * amdgpu_device_prepare() will also fatally evict resources. 4877 * See https://gitlab.freedesktop.org/drm/amd/-/issues/3781 4878 */ 4879 if (r) 4880 drm_warn(adev_to_drm(adev), "Failed to evict resources, freeze active processes if problems occur: %d\n", r); 4881 break; 4882 } 4883 4884 return NOTIFY_DONE; 4885 } 4886 4887 /** 4888 * amdgpu_device_prepare - prepare for device suspend 4889 * 4890 * @dev: drm dev pointer 4891 * 4892 * Prepare to put the hw in the suspend state (all asics). 4893 * Returns 0 for success or an error on failure. 4894 * Called at driver suspend. 
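 *
 * Illustrative ordering only, assuming standard dev_pm_ops semantics (the
 * callback wiring lives outside this file): the PM core invokes this prepare
 * hook before the suspend hook, so a sleep cycle roughly runs
 *
 *	amdgpu_device_prepare(dev);
 *	amdgpu_device_suspend(dev, true);
 *	...
 *	amdgpu_device_resume(dev, true);
 *
 * which is why the bulk of the BO eviction is already done here.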
4895 */ 4896 int amdgpu_device_prepare(struct drm_device *dev) 4897 { 4898 struct amdgpu_device *adev = drm_to_adev(dev); 4899 int i, r; 4900 4901 amdgpu_choose_low_power_state(adev); 4902 4903 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4904 return 0; 4905 4906 /* Evict the majority of BOs before starting suspend sequence */ 4907 r = amdgpu_device_evict_resources(adev); 4908 if (r) 4909 goto unprepare; 4910 4911 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 4912 4913 for (i = 0; i < adev->num_ip_blocks; i++) { 4914 if (!adev->ip_blocks[i].status.valid) 4915 continue; 4916 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 4917 continue; 4918 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]); 4919 if (r) 4920 goto unprepare; 4921 } 4922 4923 return 0; 4924 4925 unprepare: 4926 adev->in_s0ix = adev->in_s3 = adev->in_s4 = false; 4927 4928 return r; 4929 } 4930 4931 /** 4932 * amdgpu_device_suspend - initiate device suspend 4933 * 4934 * @dev: drm dev pointer 4935 * @notify_clients: notify in-kernel DRM clients 4936 * 4937 * Puts the hw in the suspend state (all asics). 4938 * Returns 0 for success or an error on failure. 4939 * Called at driver suspend. 4940 */ 4941 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) 4942 { 4943 struct amdgpu_device *adev = drm_to_adev(dev); 4944 int r = 0; 4945 4946 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4947 return 0; 4948 4949 adev->in_suspend = true; 4950 4951 if (amdgpu_sriov_vf(adev)) { 4952 amdgpu_virt_fini_data_exchange(adev); 4953 r = amdgpu_virt_request_full_gpu(adev, false); 4954 if (r) 4955 return r; 4956 } 4957 4958 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4959 DRM_WARN("smart shift update failed\n"); 4960 4961 if (notify_clients) 4962 drm_client_dev_suspend(adev_to_drm(adev), false); 4963 4964 cancel_delayed_work_sync(&adev->delayed_init_work); 4965 4966 amdgpu_ras_suspend(adev); 4967 4968 amdgpu_device_ip_suspend_phase1(adev); 4969 4970 if (!adev->in_s0ix) 4971 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4972 4973 r = amdgpu_device_evict_resources(adev); 4974 if (r) 4975 return r; 4976 4977 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4978 4979 amdgpu_fence_driver_hw_fini(adev); 4980 4981 amdgpu_device_ip_suspend_phase2(adev); 4982 4983 if (amdgpu_sriov_vf(adev)) 4984 amdgpu_virt_release_full_gpu(adev, false); 4985 4986 r = amdgpu_dpm_notify_rlc_state(adev, false); 4987 if (r) 4988 return r; 4989 4990 return 0; 4991 } 4992 4993 /** 4994 * amdgpu_device_resume - initiate device resume 4995 * 4996 * @dev: drm dev pointer 4997 * @notify_clients: notify in-kernel DRM clients 4998 * 4999 * Bring the hw back to operating state (all asics). 5000 * Returns 0 for success or an error on failure. 5001 * Called at driver resume. 
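 *
 * Rough call flow, summarizing the body below rather than adding behavior:
 *
 *	amdgpu_device_asic_init()	- post the card if needed
 *	amdgpu_device_ip_resume()	- phase1, firmware loading, phase2, phase3
 *	amdgpu_amdkfd_resume()		- skipped for S0ix
 *	amdgpu_device_ip_late_init()
 *	amdgpu_ras_resume()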
5002 */ 5003 int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) 5004 { 5005 struct amdgpu_device *adev = drm_to_adev(dev); 5006 int r = 0; 5007 5008 if (amdgpu_sriov_vf(adev)) { 5009 r = amdgpu_virt_request_full_gpu(adev, true); 5010 if (r) 5011 return r; 5012 } 5013 5014 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5015 return 0; 5016 5017 if (adev->in_s0ix) 5018 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 5019 5020 /* post card */ 5021 if (amdgpu_device_need_post(adev)) { 5022 r = amdgpu_device_asic_init(adev); 5023 if (r) 5024 dev_err(adev->dev, "amdgpu asic init failed\n"); 5025 } 5026 5027 r = amdgpu_device_ip_resume(adev); 5028 5029 if (r) { 5030 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 5031 goto exit; 5032 } 5033 5034 if (!adev->in_s0ix) { 5035 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 5036 if (r) 5037 goto exit; 5038 } 5039 5040 r = amdgpu_device_ip_late_init(adev); 5041 if (r) 5042 goto exit; 5043 5044 queue_delayed_work(system_wq, &adev->delayed_init_work, 5045 msecs_to_jiffies(AMDGPU_RESUME_MS)); 5046 exit: 5047 if (amdgpu_sriov_vf(adev)) { 5048 amdgpu_virt_init_data_exchange(adev); 5049 amdgpu_virt_release_full_gpu(adev, true); 5050 } 5051 5052 if (r) 5053 return r; 5054 5055 /* Make sure IB tests flushed */ 5056 flush_delayed_work(&adev->delayed_init_work); 5057 5058 if (notify_clients) 5059 drm_client_dev_resume(adev_to_drm(adev), false); 5060 5061 amdgpu_ras_resume(adev); 5062 5063 if (adev->mode_info.num_crtc) { 5064 /* 5065 * Most of the connector probing functions try to acquire runtime pm 5066 * refs to ensure that the GPU is powered on when connector polling is 5067 * performed. Since we're calling this from a runtime PM callback, 5068 * trying to acquire rpm refs will cause us to deadlock. 5069 * 5070 * Since we're guaranteed to be holding the rpm lock, it's safe to 5071 * temporarily disable the rpm helpers so this doesn't deadlock us. 5072 */ 5073 #ifdef CONFIG_PM 5074 dev->dev->power.disable_depth++; 5075 #endif 5076 if (!adev->dc_enabled) 5077 drm_helper_hpd_irq_event(dev); 5078 else 5079 drm_kms_helper_hotplug_event(dev); 5080 #ifdef CONFIG_PM 5081 dev->dev->power.disable_depth--; 5082 #endif 5083 } 5084 adev->in_suspend = false; 5085 5086 if (adev->enable_mes) 5087 amdgpu_mes_self_test(adev); 5088 5089 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 5090 DRM_WARN("smart shift update failed\n"); 5091 5092 return 0; 5093 } 5094 5095 /** 5096 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 5097 * 5098 * @adev: amdgpu_device pointer 5099 * 5100 * The list of all the hardware IPs that make up the asic is walked and 5101 * the check_soft_reset callbacks are run. check_soft_reset determines 5102 * if the asic is still hung or not. 5103 * Returns true if any of the IPs are still in a hung state, false if not. 
5104 */ 5105 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 5106 { 5107 int i; 5108 bool asic_hang = false; 5109 5110 if (amdgpu_sriov_vf(adev)) 5111 return true; 5112 5113 if (amdgpu_asic_need_full_reset(adev)) 5114 return true; 5115 5116 for (i = 0; i < adev->num_ip_blocks; i++) { 5117 if (!adev->ip_blocks[i].status.valid) 5118 continue; 5119 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 5120 adev->ip_blocks[i].status.hang = 5121 adev->ip_blocks[i].version->funcs->check_soft_reset( 5122 &adev->ip_blocks[i]); 5123 if (adev->ip_blocks[i].status.hang) { 5124 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 5125 asic_hang = true; 5126 } 5127 } 5128 return asic_hang; 5129 } 5130 5131 /** 5132 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 5133 * 5134 * @adev: amdgpu_device pointer 5135 * 5136 * The list of all the hardware IPs that make up the asic is walked and the 5137 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 5138 * handles any IP specific hardware or software state changes that are 5139 * necessary for a soft reset to succeed. 5140 * Returns 0 on success, negative error code on failure. 5141 */ 5142 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 5143 { 5144 int i, r = 0; 5145 5146 for (i = 0; i < adev->num_ip_blocks; i++) { 5147 if (!adev->ip_blocks[i].status.valid) 5148 continue; 5149 if (adev->ip_blocks[i].status.hang && 5150 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 5151 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]); 5152 if (r) 5153 return r; 5154 } 5155 } 5156 5157 return 0; 5158 } 5159 5160 /** 5161 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 5162 * 5163 * @adev: amdgpu_device pointer 5164 * 5165 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 5166 * reset is necessary to recover. 5167 * Returns true if a full asic reset is required, false if not. 5168 */ 5169 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 5170 { 5171 int i; 5172 5173 if (amdgpu_asic_need_full_reset(adev)) 5174 return true; 5175 5176 for (i = 0; i < adev->num_ip_blocks; i++) { 5177 if (!adev->ip_blocks[i].status.valid) 5178 continue; 5179 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 5180 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 5181 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 5182 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 5183 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 5184 if (adev->ip_blocks[i].status.hang) { 5185 dev_info(adev->dev, "Some block need full reset!\n"); 5186 return true; 5187 } 5188 } 5189 } 5190 return false; 5191 } 5192 5193 /** 5194 * amdgpu_device_ip_soft_reset - do a soft reset 5195 * 5196 * @adev: amdgpu_device pointer 5197 * 5198 * The list of all the hardware IPs that make up the asic is walked and the 5199 * soft_reset callbacks are run if the block is hung. soft_reset handles any 5200 * IP specific hardware or software state changes that are necessary to soft 5201 * reset the IP. 5202 * Returns 0 on success, negative error code on failure. 
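 *
 * For reference, the recovery path in amdgpu_device_pre_asic_reset() below
 * runs the soft reset helpers in this order when a full reset is not needed:
 *
 *	amdgpu_device_ip_pre_soft_reset(adev);
 *	amdgpu_device_ip_soft_reset(adev);
 *	amdgpu_device_ip_post_soft_reset(adev);
 *
 * and falls back to a full reset if amdgpu_device_ip_check_soft_reset() still
 * reports a hang afterwards.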
5203 */ 5204 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 5205 { 5206 int i, r = 0; 5207 5208 for (i = 0; i < adev->num_ip_blocks; i++) { 5209 if (!adev->ip_blocks[i].status.valid) 5210 continue; 5211 if (adev->ip_blocks[i].status.hang && 5212 adev->ip_blocks[i].version->funcs->soft_reset) { 5213 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]); 5214 if (r) 5215 return r; 5216 } 5217 } 5218 5219 return 0; 5220 } 5221 5222 /** 5223 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 5224 * 5225 * @adev: amdgpu_device pointer 5226 * 5227 * The list of all the hardware IPs that make up the asic is walked and the 5228 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 5229 * handles any IP specific hardware or software state changes that are 5230 * necessary after the IP has been soft reset. 5231 * Returns 0 on success, negative error code on failure. 5232 */ 5233 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 5234 { 5235 int i, r = 0; 5236 5237 for (i = 0; i < adev->num_ip_blocks; i++) { 5238 if (!adev->ip_blocks[i].status.valid) 5239 continue; 5240 if (adev->ip_blocks[i].status.hang && 5241 adev->ip_blocks[i].version->funcs->post_soft_reset) 5242 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]); 5243 if (r) 5244 return r; 5245 } 5246 5247 return 0; 5248 } 5249 5250 /** 5251 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 5252 * 5253 * @adev: amdgpu_device pointer 5254 * @reset_context: amdgpu reset context pointer 5255 * 5256 * do VF FLR and reinitialize Asic 5257 * return 0 means succeeded otherwise failed 5258 */ 5259 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 5260 struct amdgpu_reset_context *reset_context) 5261 { 5262 int r; 5263 struct amdgpu_hive_info *hive = NULL; 5264 5265 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { 5266 if (!amdgpu_ras_get_fed_status(adev)) 5267 amdgpu_virt_ready_to_reset(adev); 5268 amdgpu_virt_wait_reset(adev); 5269 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5270 r = amdgpu_virt_request_full_gpu(adev, true); 5271 } else { 5272 r = amdgpu_virt_reset_gpu(adev); 5273 } 5274 if (r) 5275 return r; 5276 5277 amdgpu_ras_clear_err_state(adev); 5278 amdgpu_irq_gpu_reset_resume_helper(adev); 5279 5280 /* some sw clean up VF needs to do before recover */ 5281 amdgpu_virt_post_reset(adev); 5282 5283 /* Resume IP prior to SMC */ 5284 r = amdgpu_device_ip_reinit_early_sriov(adev); 5285 if (r) 5286 return r; 5287 5288 amdgpu_virt_init_data_exchange(adev); 5289 5290 r = amdgpu_device_fw_loading(adev); 5291 if (r) 5292 return r; 5293 5294 /* now we are okay to resume SMC/CP/SDMA */ 5295 r = amdgpu_device_ip_reinit_late_sriov(adev); 5296 if (r) 5297 return r; 5298 5299 hive = amdgpu_get_xgmi_hive(adev); 5300 /* Update PSP FW topology after reset */ 5301 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5302 r = amdgpu_xgmi_update_topology(hive, adev); 5303 if (hive) 5304 amdgpu_put_xgmi_hive(hive); 5305 if (r) 5306 return r; 5307 5308 r = amdgpu_ib_ring_tests(adev); 5309 if (r) 5310 return r; 5311 5312 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) 5313 amdgpu_inc_vram_lost(adev); 5314 5315 /* need to be called during full access so we can't do it later like 5316 * bare-metal does. 
5317 */ 5318 amdgpu_amdkfd_post_reset(adev); 5319 amdgpu_virt_release_full_gpu(adev, true); 5320 5321 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5322 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) || 5323 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 5324 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 5325 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5326 amdgpu_ras_resume(adev); 5327 5328 amdgpu_virt_ras_telemetry_post_reset(adev); 5329 5330 return 0; 5331 } 5332 5333 /** 5334 * amdgpu_device_has_job_running - check if there is any unfinished job 5335 * 5336 * @adev: amdgpu_device pointer 5337 * 5338 * check if there is any job running on the device when guest driver receives 5339 * FLR notification from host driver. If there are still jobs running, then 5340 * the guest driver will not respond the FLR reset. Instead, let the job hit 5341 * the timeout and guest driver then issue the reset request. 5342 */ 5343 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5344 { 5345 int i; 5346 5347 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5348 struct amdgpu_ring *ring = adev->rings[i]; 5349 5350 if (!amdgpu_ring_sched_ready(ring)) 5351 continue; 5352 5353 if (amdgpu_fence_count_emitted(ring)) 5354 return true; 5355 } 5356 return false; 5357 } 5358 5359 /** 5360 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5361 * 5362 * @adev: amdgpu_device pointer 5363 * 5364 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5365 * a hung GPU. 5366 */ 5367 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5368 { 5369 5370 if (amdgpu_gpu_recovery == 0) 5371 goto disabled; 5372 5373 /* Skip soft reset check in fatal error mode */ 5374 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5375 return true; 5376 5377 if (amdgpu_sriov_vf(adev)) 5378 return true; 5379 5380 if (amdgpu_gpu_recovery == -1) { 5381 switch (adev->asic_type) { 5382 #ifdef CONFIG_DRM_AMDGPU_SI 5383 case CHIP_VERDE: 5384 case CHIP_TAHITI: 5385 case CHIP_PITCAIRN: 5386 case CHIP_OLAND: 5387 case CHIP_HAINAN: 5388 #endif 5389 #ifdef CONFIG_DRM_AMDGPU_CIK 5390 case CHIP_KAVERI: 5391 case CHIP_KABINI: 5392 case CHIP_MULLINS: 5393 #endif 5394 case CHIP_CARRIZO: 5395 case CHIP_STONEY: 5396 case CHIP_CYAN_SKILLFISH: 5397 goto disabled; 5398 default: 5399 break; 5400 } 5401 } 5402 5403 return true; 5404 5405 disabled: 5406 dev_info(adev->dev, "GPU recovery disabled.\n"); 5407 return false; 5408 } 5409 5410 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5411 { 5412 u32 i; 5413 int ret = 0; 5414 5415 if (adev->bios) 5416 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5417 5418 dev_info(adev->dev, "GPU mode1 reset\n"); 5419 5420 /* Cache the state before bus master disable. The saved config space 5421 * values are used in other cases like restore after mode-2 reset. 
5422 */ 5423 amdgpu_device_cache_pci_state(adev->pdev); 5424 5425 /* disable BM */ 5426 pci_clear_master(adev->pdev); 5427 5428 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5429 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5430 ret = amdgpu_dpm_mode1_reset(adev); 5431 } else { 5432 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5433 ret = psp_gpu_reset(adev); 5434 } 5435 5436 if (ret) 5437 goto mode1_reset_failed; 5438 5439 amdgpu_device_load_pci_state(adev->pdev); 5440 ret = amdgpu_psp_wait_for_bootloader(adev); 5441 if (ret) 5442 goto mode1_reset_failed; 5443 5444 /* wait for asic to come out of reset */ 5445 for (i = 0; i < adev->usec_timeout; i++) { 5446 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5447 5448 if (memsize != 0xffffffff) 5449 break; 5450 udelay(1); 5451 } 5452 5453 if (i >= adev->usec_timeout) { 5454 ret = -ETIMEDOUT; 5455 goto mode1_reset_failed; 5456 } 5457 5458 if (adev->bios) 5459 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5460 5461 return 0; 5462 5463 mode1_reset_failed: 5464 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5465 return ret; 5466 } 5467 5468 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5469 struct amdgpu_reset_context *reset_context) 5470 { 5471 int i, r = 0; 5472 struct amdgpu_job *job = NULL; 5473 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; 5474 bool need_full_reset = 5475 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5476 5477 if (reset_context->reset_req_dev == adev) 5478 job = reset_context->job; 5479 5480 if (amdgpu_sriov_vf(adev)) 5481 amdgpu_virt_pre_reset(adev); 5482 5483 amdgpu_fence_driver_isr_toggle(adev, true); 5484 5485 /* block all schedulers and reset given job's ring */ 5486 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5487 struct amdgpu_ring *ring = adev->rings[i]; 5488 5489 if (!amdgpu_ring_sched_ready(ring)) 5490 continue; 5491 5492 /* Clear job fence from fence drv to avoid force_completion 5493 * leave NULL and vm flush fence in fence drv 5494 */ 5495 amdgpu_fence_driver_clear_job_fences(ring); 5496 5497 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5498 amdgpu_fence_driver_force_completion(ring); 5499 } 5500 5501 amdgpu_fence_driver_isr_toggle(adev, false); 5502 5503 if (job && job->vm) 5504 drm_sched_increase_karma(&job->base); 5505 5506 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5507 /* If reset handler not implemented, continue; otherwise return */ 5508 if (r == -EOPNOTSUPP) 5509 r = 0; 5510 else 5511 return r; 5512 5513 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5514 if (!amdgpu_sriov_vf(adev)) { 5515 5516 if (!need_full_reset) 5517 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5518 5519 if (!need_full_reset && amdgpu_gpu_recovery && 5520 amdgpu_device_ip_check_soft_reset(adev)) { 5521 amdgpu_device_ip_pre_soft_reset(adev); 5522 r = amdgpu_device_ip_soft_reset(adev); 5523 amdgpu_device_ip_post_soft_reset(adev); 5524 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5525 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5526 need_full_reset = true; 5527 } 5528 } 5529 5530 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { 5531 dev_info(tmp_adev->dev, "Dumping IP State\n"); 5532 /* Trigger ip dump before we reset the asic */ 5533 for (i = 0; i < tmp_adev->num_ip_blocks; i++) 5534 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) 5535 tmp_adev->ip_blocks[i].version->funcs 5536 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]); 5537 
dev_info(tmp_adev->dev, "Dumping IP State Completed\n"); 5538 } 5539 5540 if (need_full_reset) 5541 r = amdgpu_device_ip_suspend(adev); 5542 if (need_full_reset) 5543 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5544 else 5545 clear_bit(AMDGPU_NEED_FULL_RESET, 5546 &reset_context->flags); 5547 } 5548 5549 return r; 5550 } 5551 5552 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) 5553 { 5554 struct list_head *device_list_handle; 5555 bool full_reset, vram_lost = false; 5556 struct amdgpu_device *tmp_adev; 5557 int r, init_level; 5558 5559 device_list_handle = reset_context->reset_device_list; 5560 5561 if (!device_list_handle) 5562 return -EINVAL; 5563 5564 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5565 5566 /** 5567 * If it's reset on init, it's default init level, otherwise keep level 5568 * as recovery level. 5569 */ 5570 if (reset_context->method == AMD_RESET_METHOD_ON_INIT) 5571 init_level = AMDGPU_INIT_LEVEL_DEFAULT; 5572 else 5573 init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY; 5574 5575 r = 0; 5576 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5577 amdgpu_set_init_level(tmp_adev, init_level); 5578 if (full_reset) { 5579 /* post card */ 5580 amdgpu_ras_clear_err_state(tmp_adev); 5581 r = amdgpu_device_asic_init(tmp_adev); 5582 if (r) { 5583 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5584 } else { 5585 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5586 5587 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5588 if (r) 5589 goto out; 5590 5591 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5592 5593 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 5594 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job); 5595 5596 if (vram_lost) { 5597 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5598 amdgpu_inc_vram_lost(tmp_adev); 5599 } 5600 5601 r = amdgpu_device_fw_loading(tmp_adev); 5602 if (r) 5603 return r; 5604 5605 r = amdgpu_xcp_restore_partition_mode( 5606 tmp_adev->xcp_mgr); 5607 if (r) 5608 goto out; 5609 5610 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5611 if (r) 5612 goto out; 5613 5614 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 5615 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5616 5617 r = amdgpu_device_ip_resume_phase3(tmp_adev); 5618 if (r) 5619 goto out; 5620 5621 if (vram_lost) 5622 amdgpu_device_fill_reset_magic(tmp_adev); 5623 5624 /* 5625 * Add this ASIC as tracked as reset was already 5626 * complete successfully. 5627 */ 5628 amdgpu_register_gpu_instance(tmp_adev); 5629 5630 if (!reset_context->hive && 5631 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5632 amdgpu_xgmi_add_device(tmp_adev); 5633 5634 r = amdgpu_device_ip_late_init(tmp_adev); 5635 if (r) 5636 goto out; 5637 5638 drm_client_dev_resume(adev_to_drm(tmp_adev), false); 5639 5640 /* 5641 * The GPU enters bad state once faulty pages 5642 * by ECC has reached the threshold, and ras 5643 * recovery is scheduled next. So add one check 5644 * here to break recovery if it indeed exceeds 5645 * bad page threshold, and remind user to 5646 * retire this GPU or setting one bigger 5647 * bad_page_threshold value to fix this once 5648 * probing driver again. 5649 */ 5650 if (!amdgpu_ras_is_rma(tmp_adev)) { 5651 /* must succeed. 
*/ 5652 amdgpu_ras_resume(tmp_adev); 5653 } else { 5654 r = -EINVAL; 5655 goto out; 5656 } 5657 5658 /* Update PSP FW topology after reset */ 5659 if (reset_context->hive && 5660 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5661 r = amdgpu_xgmi_update_topology( 5662 reset_context->hive, tmp_adev); 5663 } 5664 } 5665 5666 out: 5667 if (!r) { 5668 /* IP init is complete now, set level as default */ 5669 amdgpu_set_init_level(tmp_adev, 5670 AMDGPU_INIT_LEVEL_DEFAULT); 5671 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5672 r = amdgpu_ib_ring_tests(tmp_adev); 5673 if (r) { 5674 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5675 r = -EAGAIN; 5676 goto end; 5677 } 5678 } 5679 5680 if (r) 5681 tmp_adev->asic_reset_res = r; 5682 } 5683 5684 end: 5685 return r; 5686 } 5687 5688 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5689 struct amdgpu_reset_context *reset_context) 5690 { 5691 struct amdgpu_device *tmp_adev = NULL; 5692 bool need_full_reset, skip_hw_reset; 5693 int r = 0; 5694 5695 /* Try reset handler method first */ 5696 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5697 reset_list); 5698 5699 reset_context->reset_device_list = device_list_handle; 5700 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5701 /* If reset handler not implemented, continue; otherwise return */ 5702 if (r == -EOPNOTSUPP) 5703 r = 0; 5704 else 5705 return r; 5706 5707 /* Reset handler not implemented, use the default method */ 5708 need_full_reset = 5709 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5710 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5711 5712 /* 5713 * ASIC reset has to be done on all XGMI hive nodes ASAP 5714 * to allow proper links negotiation in FW (within 1 sec) 5715 */ 5716 if (!skip_hw_reset && need_full_reset) { 5717 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5718 /* For XGMI run all resets in parallel to speed up the process */ 5719 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5720 if (!queue_work(system_unbound_wq, 5721 &tmp_adev->xgmi_reset_work)) 5722 r = -EALREADY; 5723 } else 5724 r = amdgpu_asic_reset(tmp_adev); 5725 5726 if (r) { 5727 dev_err(tmp_adev->dev, 5728 "ASIC reset failed with error, %d for drm dev, %s", 5729 r, adev_to_drm(tmp_adev)->unique); 5730 goto out; 5731 } 5732 } 5733 5734 /* For XGMI wait for all resets to complete before proceed */ 5735 if (!r) { 5736 list_for_each_entry(tmp_adev, device_list_handle, 5737 reset_list) { 5738 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5739 flush_work(&tmp_adev->xgmi_reset_work); 5740 r = tmp_adev->asic_reset_res; 5741 if (r) 5742 break; 5743 } 5744 } 5745 } 5746 } 5747 5748 if (!r && amdgpu_ras_intr_triggered()) { 5749 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5750 amdgpu_ras_reset_error_count(tmp_adev, 5751 AMDGPU_RAS_BLOCK__MMHUB); 5752 } 5753 5754 amdgpu_ras_intr_cleared(); 5755 } 5756 5757 r = amdgpu_device_reinit_after_reset(reset_context); 5758 if (r == -EAGAIN) 5759 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5760 else 5761 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5762 5763 out: 5764 return r; 5765 } 5766 5767 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5768 { 5769 5770 switch (amdgpu_asic_reset_method(adev)) { 5771 case AMD_RESET_METHOD_MODE1: 5772 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5773 break; 5774 case AMD_RESET_METHOD_MODE2: 5775 adev->mp1_state = PP_MP1_STATE_RESET; 5776 break; 5777 default: 5778 adev->mp1_state = 
PP_MP1_STATE_NONE; 5779 break; 5780 } 5781 } 5782 5783 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5784 { 5785 amdgpu_vf_error_trans_all(adev); 5786 adev->mp1_state = PP_MP1_STATE_NONE; 5787 } 5788 5789 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5790 { 5791 struct pci_dev *p = NULL; 5792 5793 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5794 adev->pdev->bus->number, 1); 5795 if (p) { 5796 pm_runtime_enable(&(p->dev)); 5797 pm_runtime_resume(&(p->dev)); 5798 } 5799 5800 pci_dev_put(p); 5801 } 5802 5803 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5804 { 5805 enum amd_reset_method reset_method; 5806 struct pci_dev *p = NULL; 5807 u64 expires; 5808 5809 /* 5810 * For now, only BACO and mode1 reset are confirmed 5811 * to suffer the audio issue without proper suspended. 5812 */ 5813 reset_method = amdgpu_asic_reset_method(adev); 5814 if ((reset_method != AMD_RESET_METHOD_BACO) && 5815 (reset_method != AMD_RESET_METHOD_MODE1)) 5816 return -EINVAL; 5817 5818 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5819 adev->pdev->bus->number, 1); 5820 if (!p) 5821 return -ENODEV; 5822 5823 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5824 if (!expires) 5825 /* 5826 * If we cannot get the audio device autosuspend delay, 5827 * a fixed 4S interval will be used. Considering 3S is 5828 * the audio controller default autosuspend delay setting. 5829 * 4S used here is guaranteed to cover that. 5830 */ 5831 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5832 5833 while (!pm_runtime_status_suspended(&(p->dev))) { 5834 if (!pm_runtime_suspend(&(p->dev))) 5835 break; 5836 5837 if (expires < ktime_get_mono_fast_ns()) { 5838 dev_warn(adev->dev, "failed to suspend display audio\n"); 5839 pci_dev_put(p); 5840 /* TODO: abort the succeeding gpu reset? */ 5841 return -ETIMEDOUT; 5842 } 5843 } 5844 5845 pm_runtime_disable(&(p->dev)); 5846 5847 pci_dev_put(p); 5848 return 0; 5849 } 5850 5851 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5852 { 5853 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5854 5855 #if defined(CONFIG_DEBUG_FS) 5856 if (!amdgpu_sriov_vf(adev)) 5857 cancel_work(&adev->reset_work); 5858 #endif 5859 5860 if (adev->kfd.dev) 5861 cancel_work(&adev->kfd.reset_work); 5862 5863 if (amdgpu_sriov_vf(adev)) 5864 cancel_work(&adev->virt.flr_work); 5865 5866 if (con && adev->ras_enabled) 5867 cancel_work(&con->recovery_work); 5868 5869 } 5870 5871 static int amdgpu_device_health_check(struct list_head *device_list_handle) 5872 { 5873 struct amdgpu_device *tmp_adev; 5874 int ret = 0; 5875 u32 status; 5876 5877 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5878 pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status); 5879 if (PCI_POSSIBLE_ERROR(status)) { 5880 dev_err(tmp_adev->dev, "device lost from bus!"); 5881 ret = -ENODEV; 5882 } 5883 } 5884 5885 return ret; 5886 } 5887 5888 /** 5889 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5890 * 5891 * @adev: amdgpu_device pointer 5892 * @job: which job trigger hang 5893 * @reset_context: amdgpu reset context pointer 5894 * 5895 * Attempt to reset the GPU if it has hung (all asics). 5896 * Attempt to do soft-reset or full-reset and reinitialize Asic 5897 * Returns 0 for success or an error on failure. 
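 *
 * Illustrative caller sketch (editor's addition, not taken verbatim from the
 * driver): a recovery source such as a job-timeout handler would typically
 * zero an amdgpu_reset_context, record the source and the requesting device,
 * and then call this function. AMDGPU_RESET_SRC_JOB is assumed to exist
 * alongside the AMDGPU_RESET_SRC_RAS value used elsewhere in this file::
 *
 *	struct amdgpu_reset_context reset_context;
 *	int r;
 *
 *	memset(&reset_context, 0, sizeof(reset_context));
 *	reset_context.method = AMD_RESET_METHOD_NONE;
 *	reset_context.reset_req_dev = adev;
 *	reset_context.src = AMDGPU_RESET_SRC_JOB;
 *	clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
 *
 *	r = amdgpu_device_gpu_recover(adev, job, &reset_context);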
5898 */ 5899 5900 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5901 struct amdgpu_job *job, 5902 struct amdgpu_reset_context *reset_context) 5903 { 5904 struct list_head device_list, *device_list_handle = NULL; 5905 bool job_signaled = false; 5906 struct amdgpu_hive_info *hive = NULL; 5907 struct amdgpu_device *tmp_adev = NULL; 5908 int i, r = 0; 5909 bool need_emergency_restart = false; 5910 bool audio_suspended = false; 5911 int retry_limit = AMDGPU_MAX_RETRY_LIMIT; 5912 5913 /* 5914 * If it reaches here because of hang/timeout and a RAS error is 5915 * detected at the same time, let RAS recovery take care of it. 5916 */ 5917 if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) && 5918 !amdgpu_sriov_vf(adev) && 5919 reset_context->src != AMDGPU_RESET_SRC_RAS) { 5920 dev_dbg(adev->dev, 5921 "Gpu recovery from source: %d yielding to RAS error recovery handling", 5922 reset_context->src); 5923 return 0; 5924 } 5925 /* 5926 * Special case: RAS triggered and full reset isn't supported 5927 */ 5928 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5929 5930 /* 5931 * Flush RAM to disk so that after reboot 5932 * the user can read log and see why the system rebooted. 5933 */ 5934 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 5935 amdgpu_ras_get_context(adev)->reboot) { 5936 DRM_WARN("Emergency reboot."); 5937 5938 ksys_sync_helper(); 5939 emergency_restart(); 5940 } 5941 5942 dev_info(adev->dev, "GPU %s begin!\n", 5943 need_emergency_restart ? "jobs stop":"reset"); 5944 5945 if (!amdgpu_sriov_vf(adev)) 5946 hive = amdgpu_get_xgmi_hive(adev); 5947 if (hive) 5948 mutex_lock(&hive->hive_lock); 5949 5950 reset_context->job = job; 5951 reset_context->hive = hive; 5952 /* 5953 * Build list of devices to reset. 5954 * In case we are in XGMI hive mode, resort the device list 5955 * to put adev in the 1st position. 5956 */ 5957 INIT_LIST_HEAD(&device_list); 5958 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) { 5959 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5960 list_add_tail(&tmp_adev->reset_list, &device_list); 5961 if (adev->shutdown) 5962 tmp_adev->shutdown = true; 5963 } 5964 if (!list_is_first(&adev->reset_list, &device_list)) 5965 list_rotate_to_front(&adev->reset_list, &device_list); 5966 device_list_handle = &device_list; 5967 } else { 5968 list_add_tail(&adev->reset_list, &device_list); 5969 device_list_handle = &device_list; 5970 } 5971 5972 if (!amdgpu_sriov_vf(adev)) { 5973 r = amdgpu_device_health_check(device_list_handle); 5974 if (r) 5975 goto end_reset; 5976 } 5977 5978 /* We need to lock reset domain only once both for XGMI and single device */ 5979 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5980 reset_list); 5981 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5982 5983 /* block all schedulers and reset given job's ring */ 5984 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5985 5986 amdgpu_device_set_mp1_state(tmp_adev); 5987 5988 /* 5989 * Try to put the audio codec into suspend state 5990 * before gpu reset started. 5991 * 5992 * Due to the power domain of the graphics device 5993 * is shared with AZ power domain. Without this, 5994 * we may change the audio hardware from behind 5995 * the audio driver's back. That will trigger 5996 * some audio codec errors. 
5997 */ 5998 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5999 audio_suspended = true; 6000 6001 amdgpu_ras_set_error_query_ready(tmp_adev, false); 6002 6003 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 6004 6005 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context); 6006 6007 /* 6008 * Mark these ASICs to be reset as untracked first 6009 * And add them back after reset completed 6010 */ 6011 amdgpu_unregister_gpu_instance(tmp_adev); 6012 6013 drm_client_dev_suspend(adev_to_drm(tmp_adev), false); 6014 6015 /* disable ras on ALL IPs */ 6016 if (!need_emergency_restart && 6017 amdgpu_device_ip_need_full_reset(tmp_adev)) 6018 amdgpu_ras_suspend(tmp_adev); 6019 6020 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6021 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6022 6023 if (!amdgpu_ring_sched_ready(ring)) 6024 continue; 6025 6026 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 6027 6028 if (need_emergency_restart) 6029 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 6030 } 6031 atomic_inc(&tmp_adev->gpu_reset_counter); 6032 } 6033 6034 if (need_emergency_restart) 6035 goto skip_sched_resume; 6036 6037 /* 6038 * Must check guilty signal here since after this point all old 6039 * HW fences are force signaled. 6040 * 6041 * job->base holds a reference to parent fence 6042 */ 6043 if (job && dma_fence_is_signaled(&job->hw_fence)) { 6044 job_signaled = true; 6045 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 6046 goto skip_hw_reset; 6047 } 6048 6049 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 6050 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6051 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 6052 /*TODO Should we stop ?*/ 6053 if (r) { 6054 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 6055 r, adev_to_drm(tmp_adev)->unique); 6056 tmp_adev->asic_reset_res = r; 6057 } 6058 } 6059 6060 /* Actual ASIC resets if needed.*/ 6061 /* Host driver will handle XGMI hive reset for SRIOV */ 6062 if (amdgpu_sriov_vf(adev)) { 6063 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { 6064 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); 6065 amdgpu_ras_set_fed(adev, true); 6066 set_bit(AMDGPU_HOST_FLR, &reset_context->flags); 6067 } 6068 6069 r = amdgpu_device_reset_sriov(adev, reset_context); 6070 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) { 6071 amdgpu_virt_release_full_gpu(adev, true); 6072 goto retry; 6073 } 6074 if (r) 6075 adev->asic_reset_res = r; 6076 } else { 6077 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 6078 if (r && r == -EAGAIN) 6079 goto retry; 6080 } 6081 6082 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6083 /* 6084 * Drop any pending non scheduler resets queued before reset is done. 6085 * Any reset scheduled after this point would be valid. Scheduler resets 6086 * were already dropped during drm_sched_stop and no new ones can come 6087 * in before drm_sched_start. 
6088 */ 6089 amdgpu_device_stop_pending_resets(tmp_adev); 6090 } 6091 6092 skip_hw_reset: 6093 6094 /* Post ASIC reset for all devs .*/ 6095 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6096 6097 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6098 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6099 6100 if (!amdgpu_ring_sched_ready(ring)) 6101 continue; 6102 6103 drm_sched_start(&ring->sched, 0); 6104 } 6105 6106 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 6107 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 6108 6109 if (tmp_adev->asic_reset_res) 6110 r = tmp_adev->asic_reset_res; 6111 6112 tmp_adev->asic_reset_res = 0; 6113 6114 if (r) { 6115 /* bad news, how to tell it to userspace ? 6116 * for ras error, we should report GPU bad status instead of 6117 * reset failure 6118 */ 6119 if (reset_context->src != AMDGPU_RESET_SRC_RAS || 6120 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) 6121 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", 6122 atomic_read(&tmp_adev->gpu_reset_counter)); 6123 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 6124 } else { 6125 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 6126 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 6127 DRM_WARN("smart shift update failed\n"); 6128 } 6129 } 6130 6131 skip_sched_resume: 6132 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6133 /* unlock kfd: SRIOV would do it separately */ 6134 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 6135 amdgpu_amdkfd_post_reset(tmp_adev); 6136 6137 /* kfd_post_reset will do nothing if kfd device is not initialized, 6138 * need to bring up kfd here if it's not be initialized before 6139 */ 6140 if (!adev->kfd.init_complete) 6141 amdgpu_amdkfd_device_init(adev); 6142 6143 if (audio_suspended) 6144 amdgpu_device_resume_display_audio(tmp_adev); 6145 6146 amdgpu_device_unset_mp1_state(tmp_adev); 6147 6148 amdgpu_ras_set_error_query_ready(tmp_adev, true); 6149 } 6150 6151 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 6152 reset_list); 6153 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 6154 6155 end_reset: 6156 if (hive) { 6157 mutex_unlock(&hive->hive_lock); 6158 amdgpu_put_xgmi_hive(hive); 6159 } 6160 6161 if (r) 6162 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 6163 6164 atomic_set(&adev->reset_domain->reset_res, r); 6165 6166 if (!r) 6167 drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE); 6168 6169 return r; 6170 } 6171 6172 /** 6173 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner 6174 * 6175 * @adev: amdgpu_device pointer 6176 * @speed: pointer to the speed of the link 6177 * @width: pointer to the width of the link 6178 * 6179 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6180 * first physical partner to an AMD dGPU. 6181 * This will exclude any virtual switches and links. 
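 *
 * Minimal usage sketch (editor's addition): both output pointers must be
 * non-NULL, otherwise the function returns without writing anything::
 *
 *	enum pci_bus_speed speed;
 *	enum pcie_link_width width;
 *
 *	amdgpu_device_partner_bandwidth(adev, &speed, &width);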
6182 */ 6183 static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev, 6184 enum pci_bus_speed *speed, 6185 enum pcie_link_width *width) 6186 { 6187 struct pci_dev *parent = adev->pdev; 6188 6189 if (!speed || !width) 6190 return; 6191 6192 *speed = PCI_SPEED_UNKNOWN; 6193 *width = PCIE_LNK_WIDTH_UNKNOWN; 6194 6195 if (amdgpu_device_pcie_dynamic_switching_supported(adev)) { 6196 while ((parent = pci_upstream_bridge(parent))) { 6197 /* skip upstream/downstream switches internal to dGPU */ 6198 if (parent->vendor == PCI_VENDOR_ID_ATI) 6199 continue; 6200 *speed = pcie_get_speed_cap(parent); 6201 *width = pcie_get_width_cap(parent); 6202 break; 6203 } 6204 } else { 6205 /* use the current speeds rather than max if switching is not supported */ 6206 pcie_bandwidth_available(adev->pdev, NULL, speed, width); 6207 } 6208 } 6209 6210 /** 6211 * amdgpu_device_gpu_bandwidth - find the bandwidth of the GPU 6212 * 6213 * @adev: amdgpu_device pointer 6214 * @speed: pointer to the speed of the link 6215 * @width: pointer to the width of the link 6216 * 6217 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6218 * AMD dGPU which may be a virtual upstream bridge. 6219 */ 6220 static void amdgpu_device_gpu_bandwidth(struct amdgpu_device *adev, 6221 enum pci_bus_speed *speed, 6222 enum pcie_link_width *width) 6223 { 6224 struct pci_dev *parent = adev->pdev; 6225 6226 if (!speed || !width) 6227 return; 6228 6229 parent = pci_upstream_bridge(parent); 6230 if (parent && parent->vendor == PCI_VENDOR_ID_ATI) { 6231 /* use the upstream/downstream switches internal to dGPU */ 6232 *speed = pcie_get_speed_cap(parent); 6233 *width = pcie_get_width_cap(parent); 6234 while ((parent = pci_upstream_bridge(parent))) { 6235 if (parent->vendor == PCI_VENDOR_ID_ATI) { 6236 /* use the upstream/downstream switches internal to dGPU */ 6237 *speed = pcie_get_speed_cap(parent); 6238 *width = pcie_get_width_cap(parent); 6239 } 6240 } 6241 } else { 6242 /* use the device itself */ 6243 *speed = pcie_get_speed_cap(adev->pdev); 6244 *width = pcie_get_width_cap(adev->pdev); 6245 } 6246 } 6247 6248 /** 6249 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot 6250 * 6251 * @adev: amdgpu_device pointer 6252 * 6253 * Fetches and stores in the driver the PCIE capabilities (gen speed 6254 * and lanes) of the slot the device is in. Handles APUs and 6255 * virtualized environments where PCIE config space may not be available.
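 *
 * Editor's note (hedged): the results end up in adev->pm.pcie_gen_mask and
 * adev->pm.pcie_mlw_mask, which later consumers test against the CAIL_* caps
 * set below, for example (gen3_ok is a placeholder local)::
 *
 *	bool gen3_ok;
 *
 *	gen3_ok = adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3;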
6256 */ 6257 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 6258 { 6259 enum pci_bus_speed speed_cap, platform_speed_cap; 6260 enum pcie_link_width platform_link_width, link_width; 6261 6262 if (amdgpu_pcie_gen_cap) 6263 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 6264 6265 if (amdgpu_pcie_lane_cap) 6266 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 6267 6268 /* covers APUs as well */ 6269 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 6270 if (adev->pm.pcie_gen_mask == 0) 6271 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 6272 if (adev->pm.pcie_mlw_mask == 0) 6273 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 6274 return; 6275 } 6276 6277 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 6278 return; 6279 6280 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap, 6281 &platform_link_width); 6282 amdgpu_device_gpu_bandwidth(adev, &speed_cap, &link_width); 6283 6284 if (adev->pm.pcie_gen_mask == 0) { 6285 /* asic caps */ 6286 if (speed_cap == PCI_SPEED_UNKNOWN) { 6287 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6288 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6289 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6290 } else { 6291 if (speed_cap == PCIE_SPEED_32_0GT) 6292 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6293 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6294 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6295 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6296 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 6297 else if (speed_cap == PCIE_SPEED_16_0GT) 6298 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6299 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6300 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6301 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 6302 else if (speed_cap == PCIE_SPEED_8_0GT) 6303 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6304 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6305 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6306 else if (speed_cap == PCIE_SPEED_5_0GT) 6307 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6308 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 6309 else 6310 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 6311 } 6312 /* platform caps */ 6313 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 6314 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6315 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6316 } else { 6317 if (platform_speed_cap == PCIE_SPEED_32_0GT) 6318 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6319 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6320 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6321 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6322 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 6323 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 6324 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6325 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6326 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6327 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 6328 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 6329 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6330 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6331 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 6332 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 6333 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6334 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6335 else 6336 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 6337 6338 } 6339 } 6340 if (adev->pm.pcie_mlw_mask == 0) { 6341 /* asic caps */ 6342 if (link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6343 
adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_ASIC_PCIE_MLW_MASK; 6344 } else { 6345 switch (link_width) { 6346 case PCIE_LNK_X32: 6347 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X32 | 6348 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6349 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6350 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6351 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6352 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6353 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6354 break; 6355 case PCIE_LNK_X16: 6356 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6357 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6358 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6359 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6360 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6361 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6362 break; 6363 case PCIE_LNK_X12: 6364 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6365 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6366 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6367 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6368 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6369 break; 6370 case PCIE_LNK_X8: 6371 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6372 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6373 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6374 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6375 break; 6376 case PCIE_LNK_X4: 6377 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6378 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6379 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6380 break; 6381 case PCIE_LNK_X2: 6382 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6383 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6384 break; 6385 case PCIE_LNK_X1: 6386 adev->pm.pcie_mlw_mask |= CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1; 6387 break; 6388 default: 6389 break; 6390 } 6391 } 6392 /* platform caps */ 6393 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6394 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 6395 } else { 6396 switch (platform_link_width) { 6397 case PCIE_LNK_X32: 6398 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 6399 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6400 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6401 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6402 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6403 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6404 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6405 break; 6406 case PCIE_LNK_X16: 6407 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6408 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6409 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6410 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6411 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6412 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6413 break; 6414 case PCIE_LNK_X12: 6415 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6416 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6417 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6418 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6419 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6420 break; 6421 case PCIE_LNK_X8: 6422 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6423 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6424 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6425 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6426 break; 6427 case PCIE_LNK_X4: 6428 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6429 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6430 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6431 break; 6432 case PCIE_LNK_X2: 6433 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6434 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6435 break; 6436 case PCIE_LNK_X1: 6437 adev->pm.pcie_mlw_mask |= CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 6438 break; 6439 
default: 6440 break; 6441 } 6442 } 6443 } 6444 } 6445 6446 /** 6447 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 6448 * 6449 * @adev: amdgpu_device pointer 6450 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 6451 * 6452 * Return true if @peer_adev can access (DMA) @adev through the PCIe 6453 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 6454 * @peer_adev. 6455 */ 6456 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 6457 struct amdgpu_device *peer_adev) 6458 { 6459 #ifdef CONFIG_HSA_AMD_P2P 6460 bool p2p_access = 6461 !adev->gmc.xgmi.connected_to_cpu && 6462 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 6463 if (!p2p_access) 6464 dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n", 6465 pci_name(peer_adev->pdev)); 6466 6467 bool is_large_bar = adev->gmc.visible_vram_size && 6468 adev->gmc.real_vram_size == adev->gmc.visible_vram_size; 6469 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev); 6470 6471 if (!p2p_addressable) { 6472 uint64_t address_mask = peer_adev->dev->dma_mask ? 6473 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 6474 resource_size_t aper_limit = 6475 adev->gmc.aper_base + adev->gmc.aper_size - 1; 6476 6477 p2p_addressable = !(adev->gmc.aper_base & address_mask || 6478 aper_limit & address_mask); 6479 } 6480 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable; 6481 #else 6482 return false; 6483 #endif 6484 } 6485 6486 int amdgpu_device_baco_enter(struct drm_device *dev) 6487 { 6488 struct amdgpu_device *adev = drm_to_adev(dev); 6489 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6490 6491 if (!amdgpu_device_supports_baco(dev)) 6492 return -ENOTSUPP; 6493 6494 if (ras && adev->ras_enabled && 6495 adev->nbio.funcs->enable_doorbell_interrupt) 6496 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 6497 6498 return amdgpu_dpm_baco_enter(adev); 6499 } 6500 6501 int amdgpu_device_baco_exit(struct drm_device *dev) 6502 { 6503 struct amdgpu_device *adev = drm_to_adev(dev); 6504 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6505 int ret = 0; 6506 6507 if (!amdgpu_device_supports_baco(dev)) 6508 return -ENOTSUPP; 6509 6510 ret = amdgpu_dpm_baco_exit(adev); 6511 if (ret) 6512 return ret; 6513 6514 if (ras && adev->ras_enabled && 6515 adev->nbio.funcs->enable_doorbell_interrupt) 6516 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 6517 6518 if (amdgpu_passthrough(adev) && adev->nbio.funcs && 6519 adev->nbio.funcs->clear_doorbell_interrupt) 6520 adev->nbio.funcs->clear_doorbell_interrupt(adev); 6521 6522 return 0; 6523 } 6524 6525 /** 6526 * amdgpu_pci_error_detected - Called when a PCI error is detected. 6527 * @pdev: PCI device struct 6528 * @state: PCI channel state 6529 * 6530 * Description: Called when a PCI error is detected. 6531 * 6532 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
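 *
 * Editor's note (illustrative, hedged): this callback and its siblings are
 * meant to be plugged into a struct pci_error_handlers registered with the
 * PCI driver, roughly::
 *
 *	static const struct pci_error_handlers amdgpu_pci_err_handler = {
 *		.error_detected	= amdgpu_pci_error_detected,
 *		.mmio_enabled	= amdgpu_pci_mmio_enabled,
 *		.slot_reset	= amdgpu_pci_slot_reset,
 *		.resume		= amdgpu_pci_resume,
 *	};
 *
 * The actual instance lives in the driver registration code and may use a
 * different name.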
6533 */ 6534 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 6535 { 6536 struct drm_device *dev = pci_get_drvdata(pdev); 6537 struct amdgpu_device *adev = drm_to_adev(dev); 6538 int i; 6539 6540 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 6541 6542 if (adev->gmc.xgmi.num_physical_nodes > 1) { 6543 DRM_WARN("No support for XGMI hive yet..."); 6544 return PCI_ERS_RESULT_DISCONNECT; 6545 } 6546 6547 adev->pci_channel_state = state; 6548 6549 switch (state) { 6550 case pci_channel_io_normal: 6551 return PCI_ERS_RESULT_CAN_RECOVER; 6552 /* Fatal error, prepare for slot reset */ 6553 case pci_channel_io_frozen: 6554 /* 6555 * Locking adev->reset_domain->sem will prevent any external access 6556 * to GPU during PCI error recovery 6557 */ 6558 amdgpu_device_lock_reset_domain(adev->reset_domain); 6559 amdgpu_device_set_mp1_state(adev); 6560 6561 /* 6562 * Block any work scheduling as we do for regular GPU reset 6563 * for the duration of the recovery 6564 */ 6565 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6566 struct amdgpu_ring *ring = adev->rings[i]; 6567 6568 if (!amdgpu_ring_sched_ready(ring)) 6569 continue; 6570 6571 drm_sched_stop(&ring->sched, NULL); 6572 } 6573 atomic_inc(&adev->gpu_reset_counter); 6574 return PCI_ERS_RESULT_NEED_RESET; 6575 case pci_channel_io_perm_failure: 6576 /* Permanent error, prepare for device removal */ 6577 return PCI_ERS_RESULT_DISCONNECT; 6578 } 6579 6580 return PCI_ERS_RESULT_NEED_RESET; 6581 } 6582 6583 /** 6584 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 6585 * @pdev: pointer to PCI device 6586 */ 6587 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 6588 { 6589 6590 DRM_INFO("PCI error: mmio enabled callback!!\n"); 6591 6592 /* TODO - dump whatever for debugging purposes */ 6593 6594 /* This called only if amdgpu_pci_error_detected returns 6595 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 6596 * works, no need to reset slot. 6597 */ 6598 6599 return PCI_ERS_RESULT_RECOVERED; 6600 } 6601 6602 /** 6603 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 6604 * @pdev: PCI device struct 6605 * 6606 * Description: This routine is called by the pci error recovery 6607 * code after the PCI slot has been reset, just before we 6608 * should resume normal operations. 
6609 */ 6610 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 6611 { 6612 struct drm_device *dev = pci_get_drvdata(pdev); 6613 struct amdgpu_device *adev = drm_to_adev(dev); 6614 int r, i; 6615 struct amdgpu_reset_context reset_context; 6616 u32 memsize; 6617 struct list_head device_list; 6618 6619 /* PCI error slot reset should be skipped During RAS recovery */ 6620 if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 6621 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && 6622 amdgpu_ras_in_recovery(adev)) 6623 return PCI_ERS_RESULT_RECOVERED; 6624 6625 DRM_INFO("PCI error: slot reset callback!!\n"); 6626 6627 memset(&reset_context, 0, sizeof(reset_context)); 6628 6629 INIT_LIST_HEAD(&device_list); 6630 list_add_tail(&adev->reset_list, &device_list); 6631 6632 /* wait for asic to come out of reset */ 6633 msleep(500); 6634 6635 /* Restore PCI confspace */ 6636 amdgpu_device_load_pci_state(pdev); 6637 6638 /* confirm ASIC came out of reset */ 6639 for (i = 0; i < adev->usec_timeout; i++) { 6640 memsize = amdgpu_asic_get_config_memsize(adev); 6641 6642 if (memsize != 0xffffffff) 6643 break; 6644 udelay(1); 6645 } 6646 if (memsize == 0xffffffff) { 6647 r = -ETIME; 6648 goto out; 6649 } 6650 6651 reset_context.method = AMD_RESET_METHOD_NONE; 6652 reset_context.reset_req_dev = adev; 6653 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 6654 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 6655 6656 adev->no_hw_access = true; 6657 r = amdgpu_device_pre_asic_reset(adev, &reset_context); 6658 adev->no_hw_access = false; 6659 if (r) 6660 goto out; 6661 6662 r = amdgpu_do_asic_reset(&device_list, &reset_context); 6663 6664 out: 6665 if (!r) { 6666 if (amdgpu_device_cache_pci_state(adev->pdev)) 6667 pci_restore_state(adev->pdev); 6668 6669 DRM_INFO("PCIe error recovery succeeded\n"); 6670 } else { 6671 DRM_ERROR("PCIe error recovery failed, err:%d", r); 6672 amdgpu_device_unset_mp1_state(adev); 6673 amdgpu_device_unlock_reset_domain(adev->reset_domain); 6674 } 6675 6676 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 6677 } 6678 6679 /** 6680 * amdgpu_pci_resume() - resume normal ops after PCI reset 6681 * @pdev: pointer to PCI device 6682 * 6683 * Called when the error recovery driver tells us that its 6684 * OK to resume normal operation. 
6685 */ 6686 void amdgpu_pci_resume(struct pci_dev *pdev) 6687 { 6688 struct drm_device *dev = pci_get_drvdata(pdev); 6689 struct amdgpu_device *adev = drm_to_adev(dev); 6690 int i; 6691 6692 6693 DRM_INFO("PCI error: resume callback!!\n"); 6694 6695 /* Only continue execution for the case of pci_channel_io_frozen */ 6696 if (adev->pci_channel_state != pci_channel_io_frozen) 6697 return; 6698 6699 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6700 struct amdgpu_ring *ring = adev->rings[i]; 6701 6702 if (!amdgpu_ring_sched_ready(ring)) 6703 continue; 6704 6705 drm_sched_start(&ring->sched, 0); 6706 } 6707 6708 amdgpu_device_unset_mp1_state(adev); 6709 amdgpu_device_unlock_reset_domain(adev->reset_domain); 6710 } 6711 6712 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 6713 { 6714 struct drm_device *dev = pci_get_drvdata(pdev); 6715 struct amdgpu_device *adev = drm_to_adev(dev); 6716 int r; 6717 6718 if (amdgpu_sriov_vf(adev)) 6719 return false; 6720 6721 r = pci_save_state(pdev); 6722 if (!r) { 6723 kfree(adev->pci_state); 6724 6725 adev->pci_state = pci_store_saved_state(pdev); 6726 6727 if (!adev->pci_state) { 6728 DRM_ERROR("Failed to store PCI saved state"); 6729 return false; 6730 } 6731 } else { 6732 DRM_WARN("Failed to save PCI state, err:%d\n", r); 6733 return false; 6734 } 6735 6736 return true; 6737 } 6738 6739 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 6740 { 6741 struct drm_device *dev = pci_get_drvdata(pdev); 6742 struct amdgpu_device *adev = drm_to_adev(dev); 6743 int r; 6744 6745 if (!adev->pci_state) 6746 return false; 6747 6748 r = pci_load_saved_state(pdev, adev->pci_state); 6749 6750 if (!r) { 6751 pci_restore_state(pdev); 6752 } else { 6753 DRM_WARN("Failed to load PCI state, err:%d\n", r); 6754 return false; 6755 } 6756 6757 return true; 6758 } 6759 6760 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 6761 struct amdgpu_ring *ring) 6762 { 6763 #ifdef CONFIG_X86_64 6764 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 6765 return; 6766 #endif 6767 if (adev->gmc.xgmi.connected_to_cpu) 6768 return; 6769 6770 if (ring && ring->funcs->emit_hdp_flush) 6771 amdgpu_ring_emit_hdp_flush(ring); 6772 else 6773 amdgpu_asic_flush_hdp(adev, ring); 6774 } 6775 6776 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 6777 struct amdgpu_ring *ring) 6778 { 6779 #ifdef CONFIG_X86_64 6780 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 6781 return; 6782 #endif 6783 if (adev->gmc.xgmi.connected_to_cpu) 6784 return; 6785 6786 amdgpu_asic_invalidate_hdp(adev, ring); 6787 } 6788 6789 int amdgpu_in_reset(struct amdgpu_device *adev) 6790 { 6791 return atomic_read(&adev->reset_domain->in_gpu_reset); 6792 } 6793 6794 /** 6795 * amdgpu_device_halt() - bring hardware to some kind of halt state 6796 * 6797 * @adev: amdgpu_device pointer 6798 * 6799 * Bring hardware to some kind of halt state so that no one can touch it 6800 * any more. It will help to maintain error context when error occurred. 6801 * Compare to a simple hang, the system will keep stable at least for SSH 6802 * access. Then it should be trivial to inspect the hardware state and 6803 * see what's going on. Implemented as following: 6804 * 6805 * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc), 6806 * clears all CPU mappings to device, disallows remappings through page faults 6807 * 2. amdgpu_irq_disable_all() disables all interrupts 6808 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences 6809 * 4. 
set adev->no_hw_access to avoid potential crashes after step 5 6810 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings 6811 * 6. pci_disable_device() and pci_wait_for_pending_transaction() 6812 * flush any in-flight DMA operations 6813 */ 6814 void amdgpu_device_halt(struct amdgpu_device *adev) 6815 { 6816 struct pci_dev *pdev = adev->pdev; 6817 struct drm_device *ddev = adev_to_drm(adev); 6818 6819 amdgpu_xcp_dev_unplug(adev); 6820 drm_dev_unplug(ddev); 6821 6822 amdgpu_irq_disable_all(adev); 6823 6824 amdgpu_fence_driver_hw_fini(adev); 6825 6826 adev->no_hw_access = true; 6827 6828 amdgpu_device_unmap_mmio(adev); 6829 6830 pci_disable_device(pdev); 6831 pci_wait_for_pending_transaction(pdev); 6832 } 6833 6834 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev, 6835 u32 reg) 6836 { 6837 unsigned long flags, address, data; 6838 u32 r; 6839 6840 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 6841 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 6842 6843 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 6844 WREG32(address, reg * 4); 6845 (void)RREG32(address); 6846 r = RREG32(data); 6847 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 6848 return r; 6849 } 6850 6851 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev, 6852 u32 reg, u32 v) 6853 { 6854 unsigned long flags, address, data; 6855 6856 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 6857 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 6858 6859 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 6860 WREG32(address, reg * 4); 6861 (void)RREG32(address); 6862 WREG32(data, v); 6863 (void)RREG32(data); 6864 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 6865 } 6866 6867 /** 6868 * amdgpu_device_get_gang - return a reference to the current gang 6869 * @adev: amdgpu_device pointer 6870 * 6871 * Returns: A new reference to the current gang leader. 6872 */ 6873 struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev) 6874 { 6875 struct dma_fence *fence; 6876 6877 rcu_read_lock(); 6878 fence = dma_fence_get_rcu_safe(&adev->gang_submit); 6879 rcu_read_unlock(); 6880 return fence; 6881 } 6882 6883 /** 6884 * amdgpu_device_switch_gang - switch to a new gang 6885 * @adev: amdgpu_device pointer 6886 * @gang: the gang to switch to 6887 * 6888 * Try to switch to a new gang. 6889 * Returns: NULL if we switched to the new gang or a reference to the current 6890 * gang leader.
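 *
 * Usage sketch (editor's addition, hedged): a submitter would typically wait
 * for the returned, still-running gang leader before trying again::
 *
 *	fence = amdgpu_device_switch_gang(adev, gang);
 *	if (fence) {
 *		dma_fence_wait(fence, false);
 *		dma_fence_put(fence);
 *	}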
6891 */ 6892 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev, 6893 struct dma_fence *gang) 6894 { 6895 struct dma_fence *old = NULL; 6896 6897 do { 6898 dma_fence_put(old); 6899 old = amdgpu_device_get_gang(adev); 6900 if (old == gang) 6901 break; 6902 6903 if (!dma_fence_is_signaled(old)) 6904 return old; 6905 6906 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit, 6907 old, gang) != old); 6908 6909 dma_fence_put(old); 6910 return NULL; 6911 } 6912 6913 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev) 6914 { 6915 switch (adev->asic_type) { 6916 #ifdef CONFIG_DRM_AMDGPU_SI 6917 case CHIP_HAINAN: 6918 #endif 6919 case CHIP_TOPAZ: 6920 /* chips with no display hardware */ 6921 return false; 6922 #ifdef CONFIG_DRM_AMDGPU_SI 6923 case CHIP_TAHITI: 6924 case CHIP_PITCAIRN: 6925 case CHIP_VERDE: 6926 case CHIP_OLAND: 6927 #endif 6928 #ifdef CONFIG_DRM_AMDGPU_CIK 6929 case CHIP_BONAIRE: 6930 case CHIP_HAWAII: 6931 case CHIP_KAVERI: 6932 case CHIP_KABINI: 6933 case CHIP_MULLINS: 6934 #endif 6935 case CHIP_TONGA: 6936 case CHIP_FIJI: 6937 case CHIP_POLARIS10: 6938 case CHIP_POLARIS11: 6939 case CHIP_POLARIS12: 6940 case CHIP_VEGAM: 6941 case CHIP_CARRIZO: 6942 case CHIP_STONEY: 6943 /* chips with display hardware */ 6944 return true; 6945 default: 6946 /* IP discovery */ 6947 if (!amdgpu_ip_version(adev, DCE_HWIP, 0) || 6948 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 6949 return false; 6950 return true; 6951 } 6952 } 6953 6954 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev, 6955 uint32_t inst, uint32_t reg_addr, char reg_name[], 6956 uint32_t expected_value, uint32_t mask) 6957 { 6958 uint32_t ret = 0; 6959 uint32_t old_ = 0; 6960 uint32_t tmp_ = RREG32(reg_addr); 6961 uint32_t loop = adev->usec_timeout; 6962 6963 while ((tmp_ & (mask)) != (expected_value)) { 6964 if (old_ != tmp_) { 6965 loop = adev->usec_timeout; 6966 old_ = tmp_; 6967 } else 6968 udelay(1); 6969 tmp_ = RREG32(reg_addr); 6970 loop--; 6971 if (!loop) { 6972 DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n", 6973 inst, reg_name, (uint32_t)expected_value, 6974 (uint32_t)(tmp_ & (mask))); 6975 ret = -ETIMEDOUT; 6976 break; 6977 } 6978 } 6979 return ret; 6980 } 6981 6982 ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring) 6983 { 6984 ssize_t size = 0; 6985 6986 if (!ring || !ring->adev) 6987 return size; 6988 6989 if (amdgpu_device_should_recover_gpu(ring->adev)) 6990 size |= AMDGPU_RESET_TYPE_FULL; 6991 6992 if (unlikely(!ring->adev->debug_disable_soft_recovery) && 6993 !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery) 6994 size |= AMDGPU_RESET_TYPE_SOFT_RESET; 6995 6996 return size; 6997 } 6998 6999 ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset) 7000 { 7001 ssize_t size = 0; 7002 7003 if (supported_reset == 0) { 7004 size += sysfs_emit_at(buf, size, "unsupported"); 7005 size += sysfs_emit_at(buf, size, "\n"); 7006 return size; 7007 7008 } 7009 7010 if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET) 7011 size += sysfs_emit_at(buf, size, "soft "); 7012 7013 if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE) 7014 size += sysfs_emit_at(buf, size, "queue "); 7015 7016 if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE) 7017 size += sysfs_emit_at(buf, size, "pipe "); 7018 7019 if (supported_reset & AMDGPU_RESET_TYPE_FULL) 7020 size += sysfs_emit_at(buf, size, "full "); 7021 7022 size += sysfs_emit_at(buf, size, "\n"); 7023 return size; 7024 } 7025
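/*
 * Editor's addition, illustrative only: amdgpu_get_soft_full_reset_mask() and
 * amdgpu_show_reset_mask() are typically paired in a sysfs "show" callback so
 * userspace can query which reset types a ring supports. The callback name and
 * the choice of ring below are placeholders, not part of this file.
 */
#if 0
static ssize_t example_reset_mask_show(struct device *dev,
				       struct device_attribute *attr,
				       char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint32_t supported = amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);

	return amdgpu_show_reset_mask(buf, supported);
}
#endif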