1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 29 #include <linux/aperture.h> 30 #include <linux/power_supply.h> 31 #include <linux/kthread.h> 32 #include <linux/module.h> 33 #include <linux/console.h> 34 #include <linux/slab.h> 35 #include <linux/iommu.h> 36 #include <linux/pci.h> 37 #include <linux/pci-p2pdma.h> 38 #include <linux/apple-gmux.h> 39 40 #include <drm/drm_atomic_helper.h> 41 #include <drm/drm_client_event.h> 42 #include <drm/drm_crtc_helper.h> 43 #include <drm/drm_probe_helper.h> 44 #include <drm/amdgpu_drm.h> 45 #include <linux/device.h> 46 #include <linux/vgaarb.h> 47 #include <linux/vga_switcheroo.h> 48 #include <linux/efi.h> 49 #include "amdgpu.h" 50 #include "amdgpu_trace.h" 51 #include "amdgpu_i2c.h" 52 #include "atom.h" 53 #include "amdgpu_atombios.h" 54 #include "amdgpu_atomfirmware.h" 55 #include "amd_pcie.h" 56 #ifdef CONFIG_DRM_AMDGPU_SI 57 #include "si.h" 58 #endif 59 #ifdef CONFIG_DRM_AMDGPU_CIK 60 #include "cik.h" 61 #endif 62 #include "vi.h" 63 #include "soc15.h" 64 #include "nv.h" 65 #include "bif/bif_4_1_d.h" 66 #include <linux/firmware.h> 67 #include "amdgpu_vf_error.h" 68 69 #include "amdgpu_amdkfd.h" 70 #include "amdgpu_pm.h" 71 72 #include "amdgpu_xgmi.h" 73 #include "amdgpu_ras.h" 74 #include "amdgpu_pmu.h" 75 #include "amdgpu_fru_eeprom.h" 76 #include "amdgpu_reset.h" 77 #include "amdgpu_virt.h" 78 #include "amdgpu_dev_coredump.h" 79 80 #include <linux/suspend.h> 81 #include <drm/task_barrier.h> 82 #include <linux/pm_runtime.h> 83 84 #include <drm/drm_drv.h> 85 86 #if IS_ENABLED(CONFIG_X86) 87 #include <asm/intel-family.h> 88 #endif 89 90 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 91 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 92 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 93 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 94 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 95 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 96 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 97 98 #define AMDGPU_RESUME_MS 2000 99 #define AMDGPU_MAX_RETRY_LIMIT 2 100 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 101 #define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2) 102 #define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2) 103 #define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2) 104 105 #define 
AMDGPU_VBIOS_SKIP (1U << 0) 106 #define AMDGPU_VBIOS_OPTIONAL (1U << 1) 107 108 static const struct drm_driver amdgpu_kms_driver; 109 110 const char *amdgpu_asic_name[] = { 111 "TAHITI", 112 "PITCAIRN", 113 "VERDE", 114 "OLAND", 115 "HAINAN", 116 "BONAIRE", 117 "KAVERI", 118 "KABINI", 119 "HAWAII", 120 "MULLINS", 121 "TOPAZ", 122 "TONGA", 123 "FIJI", 124 "CARRIZO", 125 "STONEY", 126 "POLARIS10", 127 "POLARIS11", 128 "POLARIS12", 129 "VEGAM", 130 "VEGA10", 131 "VEGA12", 132 "VEGA20", 133 "RAVEN", 134 "ARCTURUS", 135 "RENOIR", 136 "ALDEBARAN", 137 "NAVI10", 138 "CYAN_SKILLFISH", 139 "NAVI14", 140 "NAVI12", 141 "SIENNA_CICHLID", 142 "NAVY_FLOUNDER", 143 "VANGOGH", 144 "DIMGREY_CAVEFISH", 145 "BEIGE_GOBY", 146 "YELLOW_CARP", 147 "IP DISCOVERY", 148 "LAST", 149 }; 150 151 #define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0) 152 /* 153 * Default init level where all blocks are expected to be initialized. This is 154 * the level of initialization expected by default and also after a full reset 155 * of the device. 156 */ 157 struct amdgpu_init_level amdgpu_init_default = { 158 .level = AMDGPU_INIT_LEVEL_DEFAULT, 159 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL, 160 }; 161 162 struct amdgpu_init_level amdgpu_init_recovery = { 163 .level = AMDGPU_INIT_LEVEL_RESET_RECOVERY, 164 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL, 165 }; 166 167 /* 168 * Minimal blocks needed to be initialized before a XGMI hive can be reset. This 169 * is used for cases like reset on initialization where the entire hive needs to 170 * be reset before first use. 171 */ 172 struct amdgpu_init_level amdgpu_init_minimal_xgmi = { 173 .level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI, 174 .hwini_ip_block_mask = 175 BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) | 176 BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) | 177 BIT(AMD_IP_BLOCK_TYPE_PSP) 178 }; 179 180 static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev, 181 enum amd_ip_block_type block) 182 { 183 return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0; 184 } 185 186 void amdgpu_set_init_level(struct amdgpu_device *adev, 187 enum amdgpu_init_lvl_id lvl) 188 { 189 switch (lvl) { 190 case AMDGPU_INIT_LEVEL_MINIMAL_XGMI: 191 adev->init_lvl = &amdgpu_init_minimal_xgmi; 192 break; 193 case AMDGPU_INIT_LEVEL_RESET_RECOVERY: 194 adev->init_lvl = &amdgpu_init_recovery; 195 break; 196 case AMDGPU_INIT_LEVEL_DEFAULT: 197 fallthrough; 198 default: 199 adev->init_lvl = &amdgpu_init_default; 200 break; 201 } 202 } 203 204 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev); 205 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 206 void *data); 207 208 /** 209 * DOC: pcie_replay_count 210 * 211 * The amdgpu driver provides a sysfs API for reporting the total number 212 * of PCIe replays (NAKs). 213 * The file pcie_replay_count is used for this and returns the total 214 * number of replays as a sum of the NAKs generated and NAKs received. 
215 */ 216 217 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 218 struct device_attribute *attr, char *buf) 219 { 220 struct drm_device *ddev = dev_get_drvdata(dev); 221 struct amdgpu_device *adev = drm_to_adev(ddev); 222 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 223 224 return sysfs_emit(buf, "%llu\n", cnt); 225 } 226 227 static DEVICE_ATTR(pcie_replay_count, 0444, 228 amdgpu_device_get_pcie_replay_count, NULL); 229 230 static int amdgpu_device_attr_sysfs_init(struct amdgpu_device *adev) 231 { 232 int ret = 0; 233 234 if (!amdgpu_sriov_vf(adev)) 235 ret = sysfs_create_file(&adev->dev->kobj, 236 &dev_attr_pcie_replay_count.attr); 237 238 return ret; 239 } 240 241 static void amdgpu_device_attr_sysfs_fini(struct amdgpu_device *adev) 242 { 243 if (!amdgpu_sriov_vf(adev)) 244 sysfs_remove_file(&adev->dev->kobj, 245 &dev_attr_pcie_replay_count.attr); 246 } 247 248 static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj, 249 struct bin_attribute *attr, char *buf, 250 loff_t ppos, size_t count) 251 { 252 struct device *dev = kobj_to_dev(kobj); 253 struct drm_device *ddev = dev_get_drvdata(dev); 254 struct amdgpu_device *adev = drm_to_adev(ddev); 255 ssize_t bytes_read; 256 257 switch (ppos) { 258 case AMDGPU_SYS_REG_STATE_XGMI: 259 bytes_read = amdgpu_asic_get_reg_state( 260 adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count); 261 break; 262 case AMDGPU_SYS_REG_STATE_WAFL: 263 bytes_read = amdgpu_asic_get_reg_state( 264 adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count); 265 break; 266 case AMDGPU_SYS_REG_STATE_PCIE: 267 bytes_read = amdgpu_asic_get_reg_state( 268 adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count); 269 break; 270 case AMDGPU_SYS_REG_STATE_USR: 271 bytes_read = amdgpu_asic_get_reg_state( 272 adev, AMDGPU_REG_STATE_TYPE_USR, buf, count); 273 break; 274 case AMDGPU_SYS_REG_STATE_USR_1: 275 bytes_read = amdgpu_asic_get_reg_state( 276 adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count); 277 break; 278 default: 279 return -EINVAL; 280 } 281 282 return bytes_read; 283 } 284 285 BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL, 286 AMDGPU_SYS_REG_STATE_END); 287 288 int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev) 289 { 290 int ret; 291 292 if (!amdgpu_asic_get_reg_state_supported(adev)) 293 return 0; 294 295 ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 296 297 return ret; 298 } 299 300 void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev) 301 { 302 if (!amdgpu_asic_get_reg_state_supported(adev)) 303 return; 304 sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 305 } 306 307 int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block) 308 { 309 int r; 310 311 if (ip_block->version->funcs->suspend) { 312 r = ip_block->version->funcs->suspend(ip_block); 313 if (r) { 314 dev_err(ip_block->adev->dev, 315 "suspend of IP block <%s> failed %d\n", 316 ip_block->version->funcs->name, r); 317 return r; 318 } 319 } 320 321 ip_block->status.hw = false; 322 return 0; 323 } 324 325 int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block) 326 { 327 int r; 328 329 if (ip_block->version->funcs->resume) { 330 r = ip_block->version->funcs->resume(ip_block); 331 if (r) { 332 dev_err(ip_block->adev->dev, 333 "resume of IP block <%s> failed %d\n", 334 ip_block->version->funcs->name, r); 335 return r; 336 } 337 } 338 339 ip_block->status.hw = true; 340 return 0; 341 } 342 343 /** 344 * DOC: board_info 345 * 346 * The amdgpu driver provides a sysfs API for giving board related information. 
347 * It provides the form factor information in the format 348 * 349 * type : form factor 350 * 351 * Possible form factor values 352 * 353 * - "cem" - PCIE CEM card 354 * - "oam" - Open Compute Accelerator Module 355 * - "unknown" - Not known 356 * 357 */ 358 359 static ssize_t amdgpu_device_get_board_info(struct device *dev, 360 struct device_attribute *attr, 361 char *buf) 362 { 363 struct drm_device *ddev = dev_get_drvdata(dev); 364 struct amdgpu_device *adev = drm_to_adev(ddev); 365 enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM; 366 const char *pkg; 367 368 if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type) 369 pkg_type = adev->smuio.funcs->get_pkg_type(adev); 370 371 switch (pkg_type) { 372 case AMDGPU_PKG_TYPE_CEM: 373 pkg = "cem"; 374 break; 375 case AMDGPU_PKG_TYPE_OAM: 376 pkg = "oam"; 377 break; 378 default: 379 pkg = "unknown"; 380 break; 381 } 382 383 return sysfs_emit(buf, "%s : %s\n", "type", pkg); 384 } 385 386 static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL); 387 388 static struct attribute *amdgpu_board_attrs[] = { 389 &dev_attr_board_info.attr, 390 NULL, 391 }; 392 393 static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj, 394 struct attribute *attr, int n) 395 { 396 struct device *dev = kobj_to_dev(kobj); 397 struct drm_device *ddev = dev_get_drvdata(dev); 398 struct amdgpu_device *adev = drm_to_adev(ddev); 399 400 if (adev->flags & AMD_IS_APU) 401 return 0; 402 403 return attr->mode; 404 } 405 406 static const struct attribute_group amdgpu_board_attrs_group = { 407 .attrs = amdgpu_board_attrs, 408 .is_visible = amdgpu_board_attrs_is_visible 409 }; 410 411 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 412 413 414 /** 415 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 416 * 417 * @dev: drm_device pointer 418 * 419 * Returns true if the device is a dGPU with ATPX power control, 420 * otherwise return false. 421 */ 422 bool amdgpu_device_supports_px(struct drm_device *dev) 423 { 424 struct amdgpu_device *adev = drm_to_adev(dev); 425 426 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 427 return true; 428 return false; 429 } 430 431 /** 432 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 433 * 434 * @dev: drm_device pointer 435 * 436 * Returns true if the device is a dGPU with ACPI power control, 437 * otherwise return false. 438 */ 439 bool amdgpu_device_supports_boco(struct drm_device *dev) 440 { 441 struct amdgpu_device *adev = drm_to_adev(dev); 442 443 if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE)) 444 return false; 445 446 if (adev->has_pr3 || 447 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 448 return true; 449 return false; 450 } 451 452 /** 453 * amdgpu_device_supports_baco - Does the device support BACO 454 * 455 * @dev: drm_device pointer 456 * 457 * Return: 458 * 1 if the device supports BACO; 459 * 3 if the device supports MACO (only works if BACO is supported) 460 * otherwise return 0. 
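 *
 * The value is effectively a bitmask; a minimal sketch (illustrative only) of
 * how a caller might interpret it, using the BACO_SUPPORT/MACO_SUPPORT bits
 * that amdgpu_device_detect_runtime_pm_mode() below also checks:
 *
 *   int s = amdgpu_device_supports_baco(dev);
 *
 *   if (s & MACO_SUPPORT)
 *           ...BAMACO is possible...
 *   else if (s & BACO_SUPPORT)
 *           ...plain BACO only...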
 */
int amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
{
	struct drm_device *dev;
	int bamaco_support;

	dev = adev_to_drm(adev);

	adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
	bamaco_support = amdgpu_device_supports_baco(dev);

	switch (amdgpu_runtime_pm) {
	case 2:
		if (bamaco_support & MACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
			dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
		} else if (bamaco_support == BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Requested mode BAMACO not available, falling back to BACO\n");
		}
		break;
	case 1:
		if (bamaco_support & BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Forcing BACO for runtime pm\n");
		}
		break;
	case -1:
	case -2:
		if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
			dev_info(adev->dev, "Using ATPX for runtime pm\n");
		} else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
			dev_info(adev->dev, "Using BOCO for runtime pm\n");
		} else {
			if (!bamaco_support)
				goto no_runtime_pm;

			switch (adev->asic_type) {
			case CHIP_VEGA20:
			case CHIP_ARCTURUS:
				/* BACO is not supported on vega20 and arcturus */
				break;
			case CHIP_VEGA10:
				/* enable BACO as runpm mode if noretry=0 */
				if (!adev->gmc.noretry)
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			default:
				/* enable BACO as runpm mode on CI+ */
				adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			}

			if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
				if (bamaco_support & MACO_SUPPORT) {
					adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
					dev_info(adev->dev, "Using BAMACO for runtime pm\n");
				} else {
					dev_info(adev->dev, "Using BACO for runtime pm\n");
				}
			}
		}
		break;
	case 0:
		dev_info(adev->dev, "runtime pm is manually disabled\n");
		break;
	default:
		break;
	}

no_runtime_pm:
	if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
		dev_info(adev->dev, "Runtime PM not available\n");
}

/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * Smart Shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
552 */ 553 bool amdgpu_device_supports_smart_shift(struct drm_device *dev) 554 { 555 return (amdgpu_device_supports_boco(dev) && 556 amdgpu_acpi_is_power_shift_control_supported()); 557 } 558 559 /* 560 * VRAM access helper functions 561 */ 562 563 /** 564 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA 565 * 566 * @adev: amdgpu_device pointer 567 * @pos: offset of the buffer in vram 568 * @buf: virtual address of the buffer in system memory 569 * @size: read/write size, sizeof(@buf) must > @size 570 * @write: true - write to vram, otherwise - read from vram 571 */ 572 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos, 573 void *buf, size_t size, bool write) 574 { 575 unsigned long flags; 576 uint32_t hi = ~0, tmp = 0; 577 uint32_t *data = buf; 578 uint64_t last; 579 int idx; 580 581 if (!drm_dev_enter(adev_to_drm(adev), &idx)) 582 return; 583 584 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4)); 585 586 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 587 for (last = pos + size; pos < last; pos += 4) { 588 tmp = pos >> 31; 589 590 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000); 591 if (tmp != hi) { 592 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp); 593 hi = tmp; 594 } 595 if (write) 596 WREG32_NO_KIQ(mmMM_DATA, *data++); 597 else 598 *data++ = RREG32_NO_KIQ(mmMM_DATA); 599 } 600 601 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 602 drm_dev_exit(idx); 603 } 604 605 /** 606 * amdgpu_device_aper_access - access vram by vram aperture 607 * 608 * @adev: amdgpu_device pointer 609 * @pos: offset of the buffer in vram 610 * @buf: virtual address of the buffer in system memory 611 * @size: read/write size, sizeof(@buf) must > @size 612 * @write: true - write to vram, otherwise - read from vram 613 * 614 * The return value means how many bytes have been transferred. 
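 *
 * A short usage sketch (illustrative only): the caller is expected to fall
 * back to amdgpu_device_mm_access() for whatever the aperture could not
 * cover, which is exactly what amdgpu_device_vram_access() below does:
 *
 *   size_t done = amdgpu_device_aper_access(adev, pos, buf, size, false);
 *
 *   if (done < size)
 *           amdgpu_device_mm_access(adev, pos + done, buf + done,
 *                                   size - done, false);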
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must be greater than @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the VRAM aperture to access VRAM first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM access to reach the rest of VRAM */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore; if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
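 *
 * Illustrative sketch (not part of the driver): most callers go through the
 * RREG32()/RREG32_NO_KIQ() style wrappers rather than calling this directly,
 * which amounts to something like
 *
 *   u32 val  = amdgpu_device_rreg(adev, reg, 0);
 *   u32 val2 = amdgpu_device_rreg(adev, reg, AMDGPU_REGS_NO_KIQ);
 *
 * where AMDGPU_REGS_NO_KIQ skips the KIQ path used under SR-IOV runtime.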
721 */ 722 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, 723 uint32_t reg, uint32_t acc_flags) 724 { 725 uint32_t ret; 726 727 if (amdgpu_device_skip_hw_access(adev)) 728 return 0; 729 730 if ((reg * 4) < adev->rmmio_size) { 731 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 732 amdgpu_sriov_runtime(adev) && 733 down_read_trylock(&adev->reset_domain->sem)) { 734 ret = amdgpu_kiq_rreg(adev, reg, 0); 735 up_read(&adev->reset_domain->sem); 736 } else { 737 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); 738 } 739 } else { 740 ret = adev->pcie_rreg(adev, reg * 4); 741 } 742 743 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret); 744 745 return ret; 746 } 747 748 /* 749 * MMIO register read with bytes helper functions 750 * @offset:bytes offset from MMIO start 751 */ 752 753 /** 754 * amdgpu_mm_rreg8 - read a memory mapped IO register 755 * 756 * @adev: amdgpu_device pointer 757 * @offset: byte aligned register offset 758 * 759 * Returns the 8 bit value from the offset specified. 760 */ 761 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) 762 { 763 if (amdgpu_device_skip_hw_access(adev)) 764 return 0; 765 766 if (offset < adev->rmmio_size) 767 return (readb(adev->rmmio + offset)); 768 BUG(); 769 } 770 771 772 /** 773 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC 774 * 775 * @adev: amdgpu_device pointer 776 * @reg: dword aligned register offset 777 * @acc_flags: access flags which require special behavior 778 * @xcc_id: xcc accelerated compute core id 779 * 780 * Returns the 32 bit value from the offset specified. 781 */ 782 uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev, 783 uint32_t reg, uint32_t acc_flags, 784 uint32_t xcc_id) 785 { 786 uint32_t ret, rlcg_flag; 787 788 if (amdgpu_device_skip_hw_access(adev)) 789 return 0; 790 791 if ((reg * 4) < adev->rmmio_size) { 792 if (amdgpu_sriov_vf(adev) && 793 !amdgpu_sriov_runtime(adev) && 794 adev->gfx.rlc.rlcg_reg_access_supported && 795 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, 796 GC_HWIP, false, 797 &rlcg_flag)) { 798 ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id)); 799 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 800 amdgpu_sriov_runtime(adev) && 801 down_read_trylock(&adev->reset_domain->sem)) { 802 ret = amdgpu_kiq_rreg(adev, reg, xcc_id); 803 up_read(&adev->reset_domain->sem); 804 } else { 805 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); 806 } 807 } else { 808 ret = adev->pcie_rreg(adev, reg * 4); 809 } 810 811 return ret; 812 } 813 814 /* 815 * MMIO register write with bytes helper functions 816 * @offset:bytes offset from MMIO start 817 * @value: the value want to be written to the register 818 */ 819 820 /** 821 * amdgpu_mm_wreg8 - read a memory mapped IO register 822 * 823 * @adev: amdgpu_device pointer 824 * @offset: byte aligned register offset 825 * @value: 8 bit value to write 826 * 827 * Writes the value specified to the offset specified. 
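 *
 * A minimal sketch of a byte-wide read-modify-write using the helpers above
 * (illustrative only; the offset is hypothetical):
 *
 *   uint8_t b = amdgpu_mm_rreg8(adev, offset);
 *
 *   amdgpu_mm_wreg8(adev, offset, b | 0x1);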
828 */ 829 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) 830 { 831 if (amdgpu_device_skip_hw_access(adev)) 832 return; 833 834 if (offset < adev->rmmio_size) 835 writeb(value, adev->rmmio + offset); 836 else 837 BUG(); 838 } 839 840 /** 841 * amdgpu_device_wreg - write to a memory mapped IO or indirect register 842 * 843 * @adev: amdgpu_device pointer 844 * @reg: dword aligned register offset 845 * @v: 32 bit value to write to the register 846 * @acc_flags: access flags which require special behavior 847 * 848 * Writes the value specified to the offset specified. 849 */ 850 void amdgpu_device_wreg(struct amdgpu_device *adev, 851 uint32_t reg, uint32_t v, 852 uint32_t acc_flags) 853 { 854 if (amdgpu_device_skip_hw_access(adev)) 855 return; 856 857 if ((reg * 4) < adev->rmmio_size) { 858 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 859 amdgpu_sriov_runtime(adev) && 860 down_read_trylock(&adev->reset_domain->sem)) { 861 amdgpu_kiq_wreg(adev, reg, v, 0); 862 up_read(&adev->reset_domain->sem); 863 } else { 864 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 865 } 866 } else { 867 adev->pcie_wreg(adev, reg * 4, v); 868 } 869 870 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 871 } 872 873 /** 874 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 875 * 876 * @adev: amdgpu_device pointer 877 * @reg: mmio/rlc register 878 * @v: value to write 879 * @xcc_id: xcc accelerated compute core id 880 * 881 * this function is invoked only for the debugfs register access 882 */ 883 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 884 uint32_t reg, uint32_t v, 885 uint32_t xcc_id) 886 { 887 if (amdgpu_device_skip_hw_access(adev)) 888 return; 889 890 if (amdgpu_sriov_fullaccess(adev) && 891 adev->gfx.rlc.funcs && 892 adev->gfx.rlc.funcs->is_rlcg_access_range) { 893 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 894 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id); 895 } else if ((reg * 4) >= adev->rmmio_size) { 896 adev->pcie_wreg(adev, reg * 4, v); 897 } else { 898 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 899 } 900 } 901 902 /** 903 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC 904 * 905 * @adev: amdgpu_device pointer 906 * @reg: dword aligned register offset 907 * @v: 32 bit value to write to the register 908 * @acc_flags: access flags which require special behavior 909 * @xcc_id: xcc accelerated compute core id 910 * 911 * Writes the value specified to the offset specified. 
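 *
 * Sketch of a per-XCC write (illustrative only; xcc_id selects the
 * accelerated compute core instance, 0 for the first one):
 *
 *   amdgpu_device_xcc_wreg(adev, reg, v, 0, xcc_id);
 *
 * With acc_flags of 0 the access may be routed through RLCG or KIQ under
 * SR-IOV, exactly as the function body below decides.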
912 */ 913 void amdgpu_device_xcc_wreg(struct amdgpu_device *adev, 914 uint32_t reg, uint32_t v, 915 uint32_t acc_flags, uint32_t xcc_id) 916 { 917 uint32_t rlcg_flag; 918 919 if (amdgpu_device_skip_hw_access(adev)) 920 return; 921 922 if ((reg * 4) < adev->rmmio_size) { 923 if (amdgpu_sriov_vf(adev) && 924 !amdgpu_sriov_runtime(adev) && 925 adev->gfx.rlc.rlcg_reg_access_supported && 926 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, 927 GC_HWIP, true, 928 &rlcg_flag)) { 929 amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id)); 930 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 931 amdgpu_sriov_runtime(adev) && 932 down_read_trylock(&adev->reset_domain->sem)) { 933 amdgpu_kiq_wreg(adev, reg, v, xcc_id); 934 up_read(&adev->reset_domain->sem); 935 } else { 936 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 937 } 938 } else { 939 adev->pcie_wreg(adev, reg * 4, v); 940 } 941 } 942 943 /** 944 * amdgpu_device_indirect_rreg - read an indirect register 945 * 946 * @adev: amdgpu_device pointer 947 * @reg_addr: indirect register address to read from 948 * 949 * Returns the value of indirect register @reg_addr 950 */ 951 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 952 u32 reg_addr) 953 { 954 unsigned long flags, pcie_index, pcie_data; 955 void __iomem *pcie_index_offset; 956 void __iomem *pcie_data_offset; 957 u32 r; 958 959 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 960 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 961 962 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 963 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 964 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 965 966 writel(reg_addr, pcie_index_offset); 967 readl(pcie_index_offset); 968 r = readl(pcie_data_offset); 969 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 970 971 return r; 972 } 973 974 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev, 975 u64 reg_addr) 976 { 977 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 978 u32 r; 979 void __iomem *pcie_index_offset; 980 void __iomem *pcie_index_hi_offset; 981 void __iomem *pcie_data_offset; 982 983 if (unlikely(!adev->nbio.funcs)) { 984 pcie_index = AMDGPU_PCIE_INDEX_FALLBACK; 985 pcie_data = AMDGPU_PCIE_DATA_FALLBACK; 986 } else { 987 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 988 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 989 } 990 991 if (reg_addr >> 32) { 992 if (unlikely(!adev->nbio.funcs)) 993 pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK; 994 else 995 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 996 } else { 997 pcie_index_hi = 0; 998 } 999 1000 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1001 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1002 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1003 if (pcie_index_hi != 0) 1004 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1005 pcie_index_hi * 4; 1006 1007 writel(reg_addr, pcie_index_offset); 1008 readl(pcie_index_offset); 1009 if (pcie_index_hi != 0) { 1010 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1011 readl(pcie_index_hi_offset); 1012 } 1013 r = readl(pcie_data_offset); 1014 1015 /* clear the high bits */ 1016 if (pcie_index_hi != 0) { 1017 writel(0, pcie_index_hi_offset); 1018 readl(pcie_index_hi_offset); 1019 } 1020 1021 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1022 1023 return r; 1024 } 1025 1026 /** 1027 * amdgpu_device_indirect_rreg64 - read a 64bits indirect 
register 1028 * 1029 * @adev: amdgpu_device pointer 1030 * @reg_addr: indirect register address to read from 1031 * 1032 * Returns the value of indirect register @reg_addr 1033 */ 1034 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 1035 u32 reg_addr) 1036 { 1037 unsigned long flags, pcie_index, pcie_data; 1038 void __iomem *pcie_index_offset; 1039 void __iomem *pcie_data_offset; 1040 u64 r; 1041 1042 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1043 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1044 1045 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1046 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1047 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1048 1049 /* read low 32 bits */ 1050 writel(reg_addr, pcie_index_offset); 1051 readl(pcie_index_offset); 1052 r = readl(pcie_data_offset); 1053 /* read high 32 bits */ 1054 writel(reg_addr + 4, pcie_index_offset); 1055 readl(pcie_index_offset); 1056 r |= ((u64)readl(pcie_data_offset) << 32); 1057 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1058 1059 return r; 1060 } 1061 1062 u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev, 1063 u64 reg_addr) 1064 { 1065 unsigned long flags, pcie_index, pcie_data; 1066 unsigned long pcie_index_hi = 0; 1067 void __iomem *pcie_index_offset; 1068 void __iomem *pcie_index_hi_offset; 1069 void __iomem *pcie_data_offset; 1070 u64 r; 1071 1072 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1073 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1074 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1075 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1076 1077 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1078 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1079 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1080 if (pcie_index_hi != 0) 1081 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1082 pcie_index_hi * 4; 1083 1084 /* read low 32 bits */ 1085 writel(reg_addr, pcie_index_offset); 1086 readl(pcie_index_offset); 1087 if (pcie_index_hi != 0) { 1088 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1089 readl(pcie_index_hi_offset); 1090 } 1091 r = readl(pcie_data_offset); 1092 /* read high 32 bits */ 1093 writel(reg_addr + 4, pcie_index_offset); 1094 readl(pcie_index_offset); 1095 if (pcie_index_hi != 0) { 1096 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1097 readl(pcie_index_hi_offset); 1098 } 1099 r |= ((u64)readl(pcie_data_offset) << 32); 1100 1101 /* clear the high bits */ 1102 if (pcie_index_hi != 0) { 1103 writel(0, pcie_index_hi_offset); 1104 readl(pcie_index_hi_offset); 1105 } 1106 1107 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1108 1109 return r; 1110 } 1111 1112 /** 1113 * amdgpu_device_indirect_wreg - write an indirect register address 1114 * 1115 * @adev: amdgpu_device pointer 1116 * @reg_addr: indirect register offset 1117 * @reg_data: indirect register data 1118 * 1119 */ 1120 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 1121 u32 reg_addr, u32 reg_data) 1122 { 1123 unsigned long flags, pcie_index, pcie_data; 1124 void __iomem *pcie_index_offset; 1125 void __iomem *pcie_data_offset; 1126 1127 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1128 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1129 1130 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1131 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1132 
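	/*
	 * The readl() after each writel() to the index register flushes the
	 * posted write, so the index is guaranteed to be programmed before
	 * the data register is accessed; the same pattern is used by all of
	 * the indirect accessors in this file.
	 */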
pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1133 1134 writel(reg_addr, pcie_index_offset); 1135 readl(pcie_index_offset); 1136 writel(reg_data, pcie_data_offset); 1137 readl(pcie_data_offset); 1138 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1139 } 1140 1141 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev, 1142 u64 reg_addr, u32 reg_data) 1143 { 1144 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 1145 void __iomem *pcie_index_offset; 1146 void __iomem *pcie_index_hi_offset; 1147 void __iomem *pcie_data_offset; 1148 1149 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1150 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1151 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1152 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1153 else 1154 pcie_index_hi = 0; 1155 1156 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1157 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1158 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1159 if (pcie_index_hi != 0) 1160 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1161 pcie_index_hi * 4; 1162 1163 writel(reg_addr, pcie_index_offset); 1164 readl(pcie_index_offset); 1165 if (pcie_index_hi != 0) { 1166 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1167 readl(pcie_index_hi_offset); 1168 } 1169 writel(reg_data, pcie_data_offset); 1170 readl(pcie_data_offset); 1171 1172 /* clear the high bits */ 1173 if (pcie_index_hi != 0) { 1174 writel(0, pcie_index_hi_offset); 1175 readl(pcie_index_hi_offset); 1176 } 1177 1178 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1179 } 1180 1181 /** 1182 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 1183 * 1184 * @adev: amdgpu_device pointer 1185 * @reg_addr: indirect register offset 1186 * @reg_data: indirect register data 1187 * 1188 */ 1189 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 1190 u32 reg_addr, u64 reg_data) 1191 { 1192 unsigned long flags, pcie_index, pcie_data; 1193 void __iomem *pcie_index_offset; 1194 void __iomem *pcie_data_offset; 1195 1196 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1197 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1198 1199 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1200 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1201 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1202 1203 /* write low 32 bits */ 1204 writel(reg_addr, pcie_index_offset); 1205 readl(pcie_index_offset); 1206 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1207 readl(pcie_data_offset); 1208 /* write high 32 bits */ 1209 writel(reg_addr + 4, pcie_index_offset); 1210 readl(pcie_index_offset); 1211 writel((u32)(reg_data >> 32), pcie_data_offset); 1212 readl(pcie_data_offset); 1213 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1214 } 1215 1216 void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev, 1217 u64 reg_addr, u64 reg_data) 1218 { 1219 unsigned long flags, pcie_index, pcie_data; 1220 unsigned long pcie_index_hi = 0; 1221 void __iomem *pcie_index_offset; 1222 void __iomem *pcie_index_hi_offset; 1223 void __iomem *pcie_data_offset; 1224 1225 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1226 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1227 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1228 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1229 
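	/*
	 * Only the low 8 bits of the upper half of reg_addr are programmed
	 * into the HI index register below, and the HI index is cleared
	 * again before the lock is released.
	 */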
1230 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1231 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1232 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1233 if (pcie_index_hi != 0) 1234 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1235 pcie_index_hi * 4; 1236 1237 /* write low 32 bits */ 1238 writel(reg_addr, pcie_index_offset); 1239 readl(pcie_index_offset); 1240 if (pcie_index_hi != 0) { 1241 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1242 readl(pcie_index_hi_offset); 1243 } 1244 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1245 readl(pcie_data_offset); 1246 /* write high 32 bits */ 1247 writel(reg_addr + 4, pcie_index_offset); 1248 readl(pcie_index_offset); 1249 if (pcie_index_hi != 0) { 1250 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1251 readl(pcie_index_hi_offset); 1252 } 1253 writel((u32)(reg_data >> 32), pcie_data_offset); 1254 readl(pcie_data_offset); 1255 1256 /* clear the high bits */ 1257 if (pcie_index_hi != 0) { 1258 writel(0, pcie_index_hi_offset); 1259 readl(pcie_index_hi_offset); 1260 } 1261 1262 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1263 } 1264 1265 /** 1266 * amdgpu_device_get_rev_id - query device rev_id 1267 * 1268 * @adev: amdgpu_device pointer 1269 * 1270 * Return device rev_id 1271 */ 1272 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev) 1273 { 1274 return adev->nbio.funcs->get_rev_id(adev); 1275 } 1276 1277 /** 1278 * amdgpu_invalid_rreg - dummy reg read function 1279 * 1280 * @adev: amdgpu_device pointer 1281 * @reg: offset of register 1282 * 1283 * Dummy register read function. Used for register blocks 1284 * that certain asics don't have (all asics). 1285 * Returns the value in the register. 1286 */ 1287 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 1288 { 1289 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 1290 BUG(); 1291 return 0; 1292 } 1293 1294 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg) 1295 { 1296 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 1297 BUG(); 1298 return 0; 1299 } 1300 1301 /** 1302 * amdgpu_invalid_wreg - dummy reg write function 1303 * 1304 * @adev: amdgpu_device pointer 1305 * @reg: offset of register 1306 * @v: value to write to the register 1307 * 1308 * Dummy register read function. Used for register blocks 1309 * that certain asics don't have (all asics). 1310 */ 1311 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 1312 { 1313 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 1314 reg, v); 1315 BUG(); 1316 } 1317 1318 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v) 1319 { 1320 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n", 1321 reg, v); 1322 BUG(); 1323 } 1324 1325 /** 1326 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 1327 * 1328 * @adev: amdgpu_device pointer 1329 * @reg: offset of register 1330 * 1331 * Dummy register read function. Used for register blocks 1332 * that certain asics don't have (all asics). 1333 * Returns the value in the register. 
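 *
 * Hitting any of these amdgpu_invalid_*() callbacks means an ASIC left a
 * register-access hook unset and something still tried to use it, which is
 * why they log the offending offset and then BUG().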
1334 */ 1335 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 1336 { 1337 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 1338 BUG(); 1339 return 0; 1340 } 1341 1342 static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg) 1343 { 1344 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 1345 BUG(); 1346 return 0; 1347 } 1348 1349 /** 1350 * amdgpu_invalid_wreg64 - dummy reg write function 1351 * 1352 * @adev: amdgpu_device pointer 1353 * @reg: offset of register 1354 * @v: value to write to the register 1355 * 1356 * Dummy register read function. Used for register blocks 1357 * that certain asics don't have (all asics). 1358 */ 1359 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 1360 { 1361 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 1362 reg, v); 1363 BUG(); 1364 } 1365 1366 static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v) 1367 { 1368 DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n", 1369 reg, v); 1370 BUG(); 1371 } 1372 1373 /** 1374 * amdgpu_block_invalid_rreg - dummy reg read function 1375 * 1376 * @adev: amdgpu_device pointer 1377 * @block: offset of instance 1378 * @reg: offset of register 1379 * 1380 * Dummy register read function. Used for register blocks 1381 * that certain asics don't have (all asics). 1382 * Returns the value in the register. 1383 */ 1384 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 1385 uint32_t block, uint32_t reg) 1386 { 1387 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 1388 reg, block); 1389 BUG(); 1390 return 0; 1391 } 1392 1393 /** 1394 * amdgpu_block_invalid_wreg - dummy reg write function 1395 * 1396 * @adev: amdgpu_device pointer 1397 * @block: offset of instance 1398 * @reg: offset of register 1399 * @v: value to write to the register 1400 * 1401 * Dummy register read function. Used for register blocks 1402 * that certain asics don't have (all asics). 1403 */ 1404 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 1405 uint32_t block, 1406 uint32_t reg, uint32_t v) 1407 { 1408 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 1409 reg, block, v); 1410 BUG(); 1411 } 1412 1413 static uint32_t amdgpu_device_get_vbios_flags(struct amdgpu_device *adev) 1414 { 1415 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU)) 1416 return AMDGPU_VBIOS_SKIP; 1417 1418 if (hweight32(adev->aid_mask) && amdgpu_passthrough(adev)) 1419 return AMDGPU_VBIOS_OPTIONAL; 1420 1421 return 0; 1422 } 1423 1424 /** 1425 * amdgpu_device_asic_init - Wrapper for atom asic_init 1426 * 1427 * @adev: amdgpu_device pointer 1428 * 1429 * Does any asic specific work and then calls atom asic init. 
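 *
 * Rough flow (a sketch of what the body below does): query the VBIOS flags
 * and, when the VBIOS is optional or skipped and none was found, return 0
 * without running ATOM asic_init, roughly
 *
 *   flags = amdgpu_device_get_vbios_flags(adev);
 *   if ((flags & (AMDGPU_VBIOS_OPTIONAL | AMDGPU_VBIOS_SKIP)) && !adev->bios)
 *           return 0;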
1430 */ 1431 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 1432 { 1433 uint32_t flags; 1434 bool optional; 1435 int ret; 1436 1437 amdgpu_asic_pre_asic_init(adev); 1438 flags = amdgpu_device_get_vbios_flags(adev); 1439 optional = !!(flags & (AMDGPU_VBIOS_OPTIONAL | AMDGPU_VBIOS_SKIP)); 1440 1441 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 1442 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 1443 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) || 1444 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) { 1445 amdgpu_psp_wait_for_bootloader(adev); 1446 if (optional && !adev->bios) 1447 return 0; 1448 1449 ret = amdgpu_atomfirmware_asic_init(adev, true); 1450 return ret; 1451 } else { 1452 if (optional && !adev->bios) 1453 return 0; 1454 1455 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 1456 } 1457 1458 return 0; 1459 } 1460 1461 /** 1462 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page 1463 * 1464 * @adev: amdgpu_device pointer 1465 * 1466 * Allocates a scratch page of VRAM for use by various things in the 1467 * driver. 1468 */ 1469 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev) 1470 { 1471 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE, 1472 AMDGPU_GEM_DOMAIN_VRAM | 1473 AMDGPU_GEM_DOMAIN_GTT, 1474 &adev->mem_scratch.robj, 1475 &adev->mem_scratch.gpu_addr, 1476 (void **)&adev->mem_scratch.ptr); 1477 } 1478 1479 /** 1480 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page 1481 * 1482 * @adev: amdgpu_device pointer 1483 * 1484 * Frees the VRAM scratch page. 1485 */ 1486 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev) 1487 { 1488 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL); 1489 } 1490 1491 /** 1492 * amdgpu_device_program_register_sequence - program an array of registers. 1493 * 1494 * @adev: amdgpu_device pointer 1495 * @registers: pointer to the register array 1496 * @array_size: size of the register array 1497 * 1498 * Programs an array or registers with and or masks. 1499 * This is a helper for setting golden registers. 1500 */ 1501 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 1502 const u32 *registers, 1503 const u32 array_size) 1504 { 1505 u32 tmp, reg, and_mask, or_mask; 1506 int i; 1507 1508 if (array_size % 3) 1509 return; 1510 1511 for (i = 0; i < array_size; i += 3) { 1512 reg = registers[i + 0]; 1513 and_mask = registers[i + 1]; 1514 or_mask = registers[i + 2]; 1515 1516 if (and_mask == 0xffffffff) { 1517 tmp = or_mask; 1518 } else { 1519 tmp = RREG32(reg); 1520 tmp &= ~and_mask; 1521 if (adev->family >= AMDGPU_FAMILY_AI) 1522 tmp |= (or_mask & and_mask); 1523 else 1524 tmp |= or_mask; 1525 } 1526 WREG32(reg, tmp); 1527 } 1528 } 1529 1530 /** 1531 * amdgpu_device_pci_config_reset - reset the GPU 1532 * 1533 * @adev: amdgpu_device pointer 1534 * 1535 * Resets the GPU using the pci config reset sequence. 1536 * Only applicable to asics prior to vega10. 1537 */ 1538 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 1539 { 1540 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 1541 } 1542 1543 /** 1544 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 1545 * 1546 * @adev: amdgpu_device pointer 1547 * 1548 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 
1549 */ 1550 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 1551 { 1552 return pci_reset_function(adev->pdev); 1553 } 1554 1555 /* 1556 * amdgpu_device_wb_*() 1557 * Writeback is the method by which the GPU updates special pages in memory 1558 * with the status of certain GPU events (fences, ring pointers,etc.). 1559 */ 1560 1561 /** 1562 * amdgpu_device_wb_fini - Disable Writeback and free memory 1563 * 1564 * @adev: amdgpu_device pointer 1565 * 1566 * Disables Writeback and frees the Writeback memory (all asics). 1567 * Used at driver shutdown. 1568 */ 1569 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1570 { 1571 if (adev->wb.wb_obj) { 1572 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1573 &adev->wb.gpu_addr, 1574 (void **)&adev->wb.wb); 1575 adev->wb.wb_obj = NULL; 1576 } 1577 } 1578 1579 /** 1580 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory 1581 * 1582 * @adev: amdgpu_device pointer 1583 * 1584 * Initializes writeback and allocates writeback memory (all asics). 1585 * Used at driver startup. 1586 * Returns 0 on success or an -error on failure. 1587 */ 1588 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1589 { 1590 int r; 1591 1592 if (adev->wb.wb_obj == NULL) { 1593 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1594 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1595 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1596 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1597 (void **)&adev->wb.wb); 1598 if (r) { 1599 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1600 return r; 1601 } 1602 1603 adev->wb.num_wb = AMDGPU_MAX_WB; 1604 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1605 1606 /* clear wb memory */ 1607 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1608 } 1609 1610 return 0; 1611 } 1612 1613 /** 1614 * amdgpu_device_wb_get - Allocate a wb entry 1615 * 1616 * @adev: amdgpu_device pointer 1617 * @wb: wb index 1618 * 1619 * Allocate a wb slot for use by the driver (all asics). 1620 * Returns 0 on success or -EINVAL on failure. 1621 */ 1622 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1623 { 1624 unsigned long flags, offset; 1625 1626 spin_lock_irqsave(&adev->wb.lock, flags); 1627 offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1628 if (offset < adev->wb.num_wb) { 1629 __set_bit(offset, adev->wb.used); 1630 spin_unlock_irqrestore(&adev->wb.lock, flags); 1631 *wb = offset << 3; /* convert to dw offset */ 1632 return 0; 1633 } else { 1634 spin_unlock_irqrestore(&adev->wb.lock, flags); 1635 return -EINVAL; 1636 } 1637 } 1638 1639 /** 1640 * amdgpu_device_wb_free - Free a wb entry 1641 * 1642 * @adev: amdgpu_device pointer 1643 * @wb: wb index 1644 * 1645 * Free a wb slot allocated for use by the driver (all asics) 1646 */ 1647 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1648 { 1649 unsigned long flags; 1650 1651 wb >>= 3; 1652 spin_lock_irqsave(&adev->wb.lock, flags); 1653 if (wb < adev->wb.num_wb) 1654 __clear_bit(wb, adev->wb.used); 1655 spin_unlock_irqrestore(&adev->wb.lock, flags); 1656 } 1657 1658 /** 1659 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1660 * 1661 * @adev: amdgpu_device pointer 1662 * 1663 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1664 * to fail, but if any of the BARs is not accessible after the size we abort 1665 * driver loading by returning -ENODEV. 
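 *
 * A sketch of how GMC init code typically uses this (illustrative only):
 *
 *   r = amdgpu_device_resize_fb_bar(adev);
 *   if (r)
 *           return r;
 *
 * i.e. a failure here is treated as fatal for device initialization.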
1666 */ 1667 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1668 { 1669 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1670 struct pci_bus *root; 1671 struct resource *res; 1672 unsigned int i; 1673 u16 cmd; 1674 int r; 1675 1676 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) 1677 return 0; 1678 1679 /* Bypass for VF */ 1680 if (amdgpu_sriov_vf(adev)) 1681 return 0; 1682 1683 if (!amdgpu_rebar) 1684 return 0; 1685 1686 /* resizing on Dell G5 SE platforms causes problems with runtime pm */ 1687 if ((amdgpu_runtime_pm != 0) && 1688 adev->pdev->vendor == PCI_VENDOR_ID_ATI && 1689 adev->pdev->device == 0x731f && 1690 adev->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL) 1691 return 0; 1692 1693 /* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */ 1694 if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR)) 1695 DRM_WARN("System can't access extended configuration space, please check!!\n"); 1696 1697 /* skip if the bios has already enabled large BAR */ 1698 if (adev->gmc.real_vram_size && 1699 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1700 return 0; 1701 1702 /* Check if the root BUS has 64bit memory resources */ 1703 root = adev->pdev->bus; 1704 while (root->parent) 1705 root = root->parent; 1706 1707 pci_bus_for_each_resource(root, res, i) { 1708 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1709 res->start > 0x100000000ull) 1710 break; 1711 } 1712 1713 /* Trying to resize is pointless without a root hub window above 4GB */ 1714 if (!res) 1715 return 0; 1716 1717 /* Limit the BAR size to what is available */ 1718 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1719 rbar_size); 1720 1721 /* Disable memory decoding while we change the BAR addresses and size */ 1722 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1723 pci_write_config_word(adev->pdev, PCI_COMMAND, 1724 cmd & ~PCI_COMMAND_MEMORY); 1725 1726 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1727 amdgpu_doorbell_fini(adev); 1728 if (adev->asic_type >= CHIP_BONAIRE) 1729 pci_release_resource(adev->pdev, 2); 1730 1731 pci_release_resource(adev->pdev, 0); 1732 1733 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1734 if (r == -ENOSPC) 1735 DRM_INFO("Not enough PCI address space for a large BAR."); 1736 else if (r && r != -ENOTSUPP) 1737 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1738 1739 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1740 1741 /* When the doorbell or fb BAR isn't available we have no chance of 1742 * using the device. 1743 */ 1744 r = amdgpu_doorbell_init(adev); 1745 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1746 return -ENODEV; 1747 1748 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1749 1750 return 0; 1751 } 1752 1753 /* 1754 * GPU helpers function. 1755 */ 1756 /** 1757 * amdgpu_device_need_post - check if the hw need post or not 1758 * 1759 * @adev: amdgpu_device pointer 1760 * 1761 * Check if the asic has been initialized (all asics) at driver startup 1762 * or post is needed if hw reset is performed. 1763 * Returns true if need or false if not. 
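 *
 * Illustrative sketch of the usual pairing with amdgpu_device_asic_init()
 * (not the literal init-path code):
 *
 *   if (amdgpu_device_need_post(adev)) {
 *           r = amdgpu_device_asic_init(adev);
 *           if (r)
 *                   ...fail device init...
 *   }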
1764 */ 1765 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1766 { 1767 uint32_t reg, flags; 1768 1769 if (amdgpu_sriov_vf(adev)) 1770 return false; 1771 1772 flags = amdgpu_device_get_vbios_flags(adev); 1773 if (flags & AMDGPU_VBIOS_SKIP) 1774 return false; 1775 if ((flags & AMDGPU_VBIOS_OPTIONAL) && !adev->bios) 1776 return false; 1777 1778 if (amdgpu_passthrough(adev)) { 1779 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1780 * some old smc fw still need driver do vPost otherwise gpu hang, while 1781 * those smc fw version above 22.15 doesn't have this flaw, so we force 1782 * vpost executed for smc version below 22.15 1783 */ 1784 if (adev->asic_type == CHIP_FIJI) { 1785 int err; 1786 uint32_t fw_ver; 1787 1788 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1789 /* force vPost if error occurred */ 1790 if (err) 1791 return true; 1792 1793 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1794 release_firmware(adev->pm.fw); 1795 if (fw_ver < 0x00160e00) 1796 return true; 1797 } 1798 } 1799 1800 /* Don't post if we need to reset whole hive on init */ 1801 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 1802 return false; 1803 1804 if (adev->has_hw_reset) { 1805 adev->has_hw_reset = false; 1806 return true; 1807 } 1808 1809 /* bios scratch used on CIK+ */ 1810 if (adev->asic_type >= CHIP_BONAIRE) 1811 return amdgpu_atombios_scratch_need_asic_init(adev); 1812 1813 /* check MEM_SIZE for older asics */ 1814 reg = amdgpu_asic_get_config_memsize(adev); 1815 1816 if ((reg != 0) && (reg != 0xffffffff)) 1817 return false; 1818 1819 return true; 1820 } 1821 1822 /* 1823 * Check whether seamless boot is supported. 1824 * 1825 * So far we only support seamless boot on DCE 3.0 or later. 1826 * If users report that it works on older ASICS as well, we may 1827 * loosen this. 1828 */ 1829 bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev) 1830 { 1831 switch (amdgpu_seamless) { 1832 case -1: 1833 break; 1834 case 1: 1835 return true; 1836 case 0: 1837 return false; 1838 default: 1839 DRM_ERROR("Invalid value for amdgpu.seamless: %d\n", 1840 amdgpu_seamless); 1841 return false; 1842 } 1843 1844 if (!(adev->flags & AMD_IS_APU)) 1845 return false; 1846 1847 if (adev->mman.keep_stolen_vga_memory) 1848 return false; 1849 1850 return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0); 1851 } 1852 1853 /* 1854 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids 1855 * don't support dynamic speed switching. Until we have confirmation from Intel 1856 * that a specific host supports it, it's safer that we keep it disabled for all. 
1857 * 1858 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/ 1859 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663 1860 */ 1861 static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev) 1862 { 1863 #if IS_ENABLED(CONFIG_X86) 1864 struct cpuinfo_x86 *c = &cpu_data(0); 1865 1866 /* eGPU change speeds based on USB4 fabric conditions */ 1867 if (dev_is_removable(adev->dev)) 1868 return true; 1869 1870 if (c->x86_vendor == X86_VENDOR_INTEL) 1871 return false; 1872 #endif 1873 return true; 1874 } 1875 1876 /** 1877 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1878 * 1879 * @adev: amdgpu_device pointer 1880 * 1881 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1882 * be set for this device. 1883 * 1884 * Returns true if it should be used or false if not. 1885 */ 1886 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1887 { 1888 switch (amdgpu_aspm) { 1889 case -1: 1890 break; 1891 case 0: 1892 return false; 1893 case 1: 1894 return true; 1895 default: 1896 return false; 1897 } 1898 if (adev->flags & AMD_IS_APU) 1899 return false; 1900 if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK)) 1901 return false; 1902 return pcie_aspm_enabled(adev->pdev); 1903 } 1904 1905 /* if we get transitioned to only one device, take VGA back */ 1906 /** 1907 * amdgpu_device_vga_set_decode - enable/disable vga decode 1908 * 1909 * @pdev: PCI device pointer 1910 * @state: enable/disable vga decode 1911 * 1912 * Enable/disable vga decode (all asics). 1913 * Returns VGA resource flags. 1914 */ 1915 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1916 bool state) 1917 { 1918 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1919 1920 amdgpu_asic_set_vga_state(adev, state); 1921 if (state) 1922 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1923 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1924 else 1925 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1926 } 1927 1928 /** 1929 * amdgpu_device_check_block_size - validate the vm block size 1930 * 1931 * @adev: amdgpu_device pointer 1932 * 1933 * Validates the vm block size specified via module parameter. 1934 * The vm block size defines number of bits in page table versus page directory, 1935 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1936 * page table and the remaining bits are in the page directory. 1937 */ 1938 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1939 { 1940 /* defines number of bits in page table versus page directory, 1941 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1942 * page table and the remaining bits are in the page directory 1943 */ 1944 if (amdgpu_vm_block_size == -1) 1945 return; 1946 1947 if (amdgpu_vm_block_size < 9) { 1948 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1949 amdgpu_vm_block_size); 1950 amdgpu_vm_block_size = -1; 1951 } 1952 } 1953 1954 /** 1955 * amdgpu_device_check_vm_size - validate the vm size 1956 * 1957 * @adev: amdgpu_device pointer 1958 * 1959 * Validates the vm size in GB specified via module parameter. 1960 * The VM size is the size of the GPU virtual memory space in GB. 
1961 */ 1962 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1963 { 1964 /* no need to check the default value */ 1965 if (amdgpu_vm_size == -1) 1966 return; 1967 1968 if (amdgpu_vm_size < 1) { 1969 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1970 amdgpu_vm_size); 1971 amdgpu_vm_size = -1; 1972 } 1973 } 1974 1975 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1976 { 1977 struct sysinfo si; 1978 bool is_os_64 = (sizeof(void *) == 8); 1979 uint64_t total_memory; 1980 uint64_t dram_size_seven_GB = 0x1B8000000; 1981 uint64_t dram_size_three_GB = 0xB8000000; 1982 1983 if (amdgpu_smu_memory_pool_size == 0) 1984 return; 1985 1986 if (!is_os_64) { 1987 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1988 goto def_value; 1989 } 1990 si_meminfo(&si); 1991 total_memory = (uint64_t)si.totalram * si.mem_unit; 1992 1993 if ((amdgpu_smu_memory_pool_size == 1) || 1994 (amdgpu_smu_memory_pool_size == 2)) { 1995 if (total_memory < dram_size_three_GB) 1996 goto def_value1; 1997 } else if ((amdgpu_smu_memory_pool_size == 4) || 1998 (amdgpu_smu_memory_pool_size == 8)) { 1999 if (total_memory < dram_size_seven_GB) 2000 goto def_value1; 2001 } else { 2002 DRM_WARN("Smu memory pool size not supported\n"); 2003 goto def_value; 2004 } 2005 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 2006 2007 return; 2008 2009 def_value1: 2010 DRM_WARN("No enough system memory\n"); 2011 def_value: 2012 adev->pm.smu_prv_buffer_size = 0; 2013 } 2014 2015 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 2016 { 2017 if (!(adev->flags & AMD_IS_APU) || 2018 adev->asic_type < CHIP_RAVEN) 2019 return 0; 2020 2021 switch (adev->asic_type) { 2022 case CHIP_RAVEN: 2023 if (adev->pdev->device == 0x15dd) 2024 adev->apu_flags |= AMD_APU_IS_RAVEN; 2025 if (adev->pdev->device == 0x15d8) 2026 adev->apu_flags |= AMD_APU_IS_PICASSO; 2027 break; 2028 case CHIP_RENOIR: 2029 if ((adev->pdev->device == 0x1636) || 2030 (adev->pdev->device == 0x164c)) 2031 adev->apu_flags |= AMD_APU_IS_RENOIR; 2032 else 2033 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 2034 break; 2035 case CHIP_VANGOGH: 2036 adev->apu_flags |= AMD_APU_IS_VANGOGH; 2037 break; 2038 case CHIP_YELLOW_CARP: 2039 break; 2040 case CHIP_CYAN_SKILLFISH: 2041 if ((adev->pdev->device == 0x13FE) || 2042 (adev->pdev->device == 0x143F)) 2043 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 2044 break; 2045 default: 2046 break; 2047 } 2048 2049 return 0; 2050 } 2051 2052 /** 2053 * amdgpu_device_check_arguments - validate module params 2054 * 2055 * @adev: amdgpu_device pointer 2056 * 2057 * Validates certain module parameters and updates 2058 * the associated values used by the driver (all asics). 
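 *
 * For example, an amdgpu_sched_jobs value below 4 is raised to 4 and a
 * non-power-of-two value is rounded up to the next power of two; similar
 * clamping is applied below to the gart/gtt sizes, the VM fragment size,
 * the hw submission count and the reset method.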
2059 */ 2060 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 2061 { 2062 int i; 2063 2064 if (amdgpu_sched_jobs < 4) { 2065 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 2066 amdgpu_sched_jobs); 2067 amdgpu_sched_jobs = 4; 2068 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 2069 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 2070 amdgpu_sched_jobs); 2071 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 2072 } 2073 2074 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 2075 /* gart size must be greater or equal to 32M */ 2076 dev_warn(adev->dev, "gart size (%d) too small\n", 2077 amdgpu_gart_size); 2078 amdgpu_gart_size = -1; 2079 } 2080 2081 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 2082 /* gtt size must be greater or equal to 32M */ 2083 dev_warn(adev->dev, "gtt size (%d) too small\n", 2084 amdgpu_gtt_size); 2085 amdgpu_gtt_size = -1; 2086 } 2087 2088 /* valid range is between 4 and 9 inclusive */ 2089 if (amdgpu_vm_fragment_size != -1 && 2090 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 2091 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 2092 amdgpu_vm_fragment_size = -1; 2093 } 2094 2095 if (amdgpu_sched_hw_submission < 2) { 2096 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 2097 amdgpu_sched_hw_submission); 2098 amdgpu_sched_hw_submission = 2; 2099 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 2100 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 2101 amdgpu_sched_hw_submission); 2102 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 2103 } 2104 2105 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 2106 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 2107 amdgpu_reset_method = -1; 2108 } 2109 2110 amdgpu_device_check_smu_prv_buffer_size(adev); 2111 2112 amdgpu_device_check_vm_size(adev); 2113 2114 amdgpu_device_check_block_size(adev); 2115 2116 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 2117 2118 for (i = 0; i < MAX_XCP; i++) 2119 adev->enforce_isolation[i] = !!enforce_isolation; 2120 2121 return 0; 2122 } 2123 2124 /** 2125 * amdgpu_switcheroo_set_state - set switcheroo state 2126 * 2127 * @pdev: pci dev pointer 2128 * @state: vga_switcheroo state 2129 * 2130 * Callback for the switcheroo driver. Suspends or resumes 2131 * the asics before or after it is powered up using ACPI methods. 
2132 */ 2133 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 2134 enum vga_switcheroo_state state) 2135 { 2136 struct drm_device *dev = pci_get_drvdata(pdev); 2137 int r; 2138 2139 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 2140 return; 2141 2142 if (state == VGA_SWITCHEROO_ON) { 2143 pr_info("switched on\n"); 2144 /* don't suspend or resume card normally */ 2145 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2146 2147 pci_set_power_state(pdev, PCI_D0); 2148 amdgpu_device_load_pci_state(pdev); 2149 r = pci_enable_device(pdev); 2150 if (r) 2151 DRM_WARN("pci_enable_device failed (%d)\n", r); 2152 amdgpu_device_resume(dev, true); 2153 2154 dev->switch_power_state = DRM_SWITCH_POWER_ON; 2155 } else { 2156 pr_info("switched off\n"); 2157 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2158 amdgpu_device_prepare(dev); 2159 amdgpu_device_suspend(dev, true); 2160 amdgpu_device_cache_pci_state(pdev); 2161 /* Shut down the device */ 2162 pci_disable_device(pdev); 2163 pci_set_power_state(pdev, PCI_D3cold); 2164 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 2165 } 2166 } 2167 2168 /** 2169 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 2170 * 2171 * @pdev: pci dev pointer 2172 * 2173 * Callback for the switcheroo driver. Check of the switcheroo 2174 * state can be changed. 2175 * Returns true if the state can be changed, false if not. 2176 */ 2177 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 2178 { 2179 struct drm_device *dev = pci_get_drvdata(pdev); 2180 2181 /* 2182 * FIXME: open_count is protected by drm_global_mutex but that would lead to 2183 * locking inversion with the driver load path. And the access here is 2184 * completely racy anyway. So don't bother with locking for now. 2185 */ 2186 return atomic_read(&dev->open_count) == 0; 2187 } 2188 2189 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 2190 .set_gpu_state = amdgpu_switcheroo_set_state, 2191 .reprobe = NULL, 2192 .can_switch = amdgpu_switcheroo_can_switch, 2193 }; 2194 2195 /** 2196 * amdgpu_device_ip_set_clockgating_state - set the CG state 2197 * 2198 * @dev: amdgpu_device pointer 2199 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2200 * @state: clockgating state (gate or ungate) 2201 * 2202 * Sets the requested clockgating state for all instances of 2203 * the hardware IP specified. 2204 * Returns the error code from the last instance. 2205 */ 2206 int amdgpu_device_ip_set_clockgating_state(void *dev, 2207 enum amd_ip_block_type block_type, 2208 enum amd_clockgating_state state) 2209 { 2210 struct amdgpu_device *adev = dev; 2211 int i, r = 0; 2212 2213 for (i = 0; i < adev->num_ip_blocks; i++) { 2214 if (!adev->ip_blocks[i].status.valid) 2215 continue; 2216 if (adev->ip_blocks[i].version->type != block_type) 2217 continue; 2218 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 2219 continue; 2220 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 2221 &adev->ip_blocks[i], state); 2222 if (r) 2223 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 2224 adev->ip_blocks[i].version->funcs->name, r); 2225 } 2226 return r; 2227 } 2228 2229 /** 2230 * amdgpu_device_ip_set_powergating_state - set the PG state 2231 * 2232 * @dev: amdgpu_device pointer 2233 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2234 * @state: powergating state (gate or ungate) 2235 * 2236 * Sets the requested powergating state for all instances of 2237 * the hardware IP specified. 
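 *
 * Minimal illustrative call (block type and state chosen arbitrarily):
 *
 *   r = amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCN,
 *                                              AMD_PG_STATE_GATE);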
2238 * Returns the error code from the last instance. 2239 */ 2240 int amdgpu_device_ip_set_powergating_state(void *dev, 2241 enum amd_ip_block_type block_type, 2242 enum amd_powergating_state state) 2243 { 2244 struct amdgpu_device *adev = dev; 2245 int i, r = 0; 2246 2247 for (i = 0; i < adev->num_ip_blocks; i++) { 2248 if (!adev->ip_blocks[i].status.valid) 2249 continue; 2250 if (adev->ip_blocks[i].version->type != block_type) 2251 continue; 2252 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 2253 continue; 2254 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 2255 &adev->ip_blocks[i], state); 2256 if (r) 2257 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 2258 adev->ip_blocks[i].version->funcs->name, r); 2259 } 2260 return r; 2261 } 2262 2263 /** 2264 * amdgpu_device_ip_get_clockgating_state - get the CG state 2265 * 2266 * @adev: amdgpu_device pointer 2267 * @flags: clockgating feature flags 2268 * 2269 * Walks the list of IPs on the device and updates the clockgating 2270 * flags for each IP. 2271 * Updates @flags with the feature flags for each hardware IP where 2272 * clockgating is enabled. 2273 */ 2274 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 2275 u64 *flags) 2276 { 2277 int i; 2278 2279 for (i = 0; i < adev->num_ip_blocks; i++) { 2280 if (!adev->ip_blocks[i].status.valid) 2281 continue; 2282 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 2283 adev->ip_blocks[i].version->funcs->get_clockgating_state( 2284 &adev->ip_blocks[i], flags); 2285 } 2286 } 2287 2288 /** 2289 * amdgpu_device_ip_wait_for_idle - wait for idle 2290 * 2291 * @adev: amdgpu_device pointer 2292 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2293 * 2294 * Waits for the requested hardware IP to be idle. 2295 * Returns 0 for success or a negative error code on failure. 2296 */ 2297 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 2298 enum amd_ip_block_type block_type) 2299 { 2300 int i, r; 2301 2302 for (i = 0; i < adev->num_ip_blocks; i++) { 2303 if (!adev->ip_blocks[i].status.valid) 2304 continue; 2305 if (adev->ip_blocks[i].version->type == block_type) { 2306 if (adev->ip_blocks[i].version->funcs->wait_for_idle) { 2307 r = adev->ip_blocks[i].version->funcs->wait_for_idle( 2308 &adev->ip_blocks[i]); 2309 if (r) 2310 return r; 2311 } 2312 break; 2313 } 2314 } 2315 return 0; 2316 2317 } 2318 2319 /** 2320 * amdgpu_device_ip_is_valid - is the hardware IP enabled 2321 * 2322 * @adev: amdgpu_device pointer 2323 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2324 * 2325 * Check if the hardware IP is enabled or not. 2326 * Returns true if the IP is enabled, false if not. 2327 */ 2328 bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev, 2329 enum amd_ip_block_type block_type) 2330 { 2331 int i; 2332 2333 for (i = 0; i < adev->num_ip_blocks; i++) { 2334 if (adev->ip_blocks[i].version->type == block_type) 2335 return adev->ip_blocks[i].status.valid; 2336 } 2337 return false; 2338 2339 } 2340 2341 /** 2342 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 2343 * 2344 * @adev: amdgpu_device pointer 2345 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 2346 * 2347 * Returns a pointer to the hardware IP block structure 2348 * if it exists for the asic, otherwise NULL.
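 *
 * Minimal illustrative lookup (mirrors the GFX query done in
 * amdgpu_device_ip_early_init()):
 *
 *   struct amdgpu_ip_block *ip_block =
 *           amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *   if (ip_block && ip_block->status.valid)
 *           amdgpu_amdkfd_device_probe(adev);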
2349 */ 2350 struct amdgpu_ip_block * 2351 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 2352 enum amd_ip_block_type type) 2353 { 2354 int i; 2355 2356 for (i = 0; i < adev->num_ip_blocks; i++) 2357 if (adev->ip_blocks[i].version->type == type) 2358 return &adev->ip_blocks[i]; 2359 2360 return NULL; 2361 } 2362 2363 /** 2364 * amdgpu_device_ip_block_version_cmp 2365 * 2366 * @adev: amdgpu_device pointer 2367 * @type: enum amd_ip_block_type 2368 * @major: major version 2369 * @minor: minor version 2370 * 2371 * return 0 if equal or greater 2372 * return 1 if smaller or the ip_block doesn't exist 2373 */ 2374 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 2375 enum amd_ip_block_type type, 2376 u32 major, u32 minor) 2377 { 2378 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 2379 2380 if (ip_block && ((ip_block->version->major > major) || 2381 ((ip_block->version->major == major) && 2382 (ip_block->version->minor >= minor)))) 2383 return 0; 2384 2385 return 1; 2386 } 2387 2388 /** 2389 * amdgpu_device_ip_block_add 2390 * 2391 * @adev: amdgpu_device pointer 2392 * @ip_block_version: pointer to the IP to add 2393 * 2394 * Adds the IP block driver information to the collection of IPs 2395 * on the asic. 2396 */ 2397 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 2398 const struct amdgpu_ip_block_version *ip_block_version) 2399 { 2400 if (!ip_block_version) 2401 return -EINVAL; 2402 2403 switch (ip_block_version->type) { 2404 case AMD_IP_BLOCK_TYPE_VCN: 2405 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2406 return 0; 2407 break; 2408 case AMD_IP_BLOCK_TYPE_JPEG: 2409 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2410 return 0; 2411 break; 2412 default: 2413 break; 2414 } 2415 2416 dev_info(adev->dev, "detected ip block number %d <%s>\n", 2417 adev->num_ip_blocks, ip_block_version->funcs->name); 2418 2419 adev->ip_blocks[adev->num_ip_blocks].adev = adev; 2420 2421 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2422 2423 return 0; 2424 } 2425 2426 /** 2427 * amdgpu_device_enable_virtual_display - enable virtual display feature 2428 * 2429 * @adev: amdgpu_device pointer 2430 * 2431 * Enabled the virtual display feature if the user has enabled it via 2432 * the module parameter virtual_display. This feature provides a virtual 2433 * display hardware on headless boards or in virtualized environments. 2434 * This function parses and validates the configuration string specified by 2435 * the user and configures the virtual display configuration (number of 2436 * virtual connectors, crtcs, etc.) specified. 
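 *
 * Illustrative parameter values (the PCI address is an example only):
 *
 *   virtual_display=0000:01:00.0,2   - two virtual crtcs on that device
 *   virtual_display=all,1            - one virtual crtc on every device
 *
 * Multiple entries may be separated by ';'; the crtc count is clamped to
 * the 1-6 range below.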
2437 */ 2438 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2439 { 2440 adev->enable_virtual_display = false; 2441 2442 if (amdgpu_virtual_display) { 2443 const char *pci_address_name = pci_name(adev->pdev); 2444 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2445 2446 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2447 pciaddstr_tmp = pciaddstr; 2448 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2449 pciaddname = strsep(&pciaddname_tmp, ","); 2450 if (!strcmp("all", pciaddname) 2451 || !strcmp(pci_address_name, pciaddname)) { 2452 long num_crtc; 2453 int res = -1; 2454 2455 adev->enable_virtual_display = true; 2456 2457 if (pciaddname_tmp) 2458 res = kstrtol(pciaddname_tmp, 10, 2459 &num_crtc); 2460 2461 if (!res) { 2462 if (num_crtc < 1) 2463 num_crtc = 1; 2464 if (num_crtc > 6) 2465 num_crtc = 6; 2466 adev->mode_info.num_crtc = num_crtc; 2467 } else { 2468 adev->mode_info.num_crtc = 1; 2469 } 2470 break; 2471 } 2472 } 2473 2474 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2475 amdgpu_virtual_display, pci_address_name, 2476 adev->enable_virtual_display, adev->mode_info.num_crtc); 2477 2478 kfree(pciaddstr); 2479 } 2480 } 2481 2482 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2483 { 2484 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2485 adev->mode_info.num_crtc = 1; 2486 adev->enable_virtual_display = true; 2487 DRM_INFO("virtual_display:%d, num_crtc:%d\n", 2488 adev->enable_virtual_display, adev->mode_info.num_crtc); 2489 } 2490 } 2491 2492 /** 2493 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2494 * 2495 * @adev: amdgpu_device pointer 2496 * 2497 * Parses the asic configuration parameters specified in the gpu info 2498 * firmware and makes them available to the driver for use in configuring 2499 * the asic. 2500 * Returns 0 on success, -EINVAL on failure. 2501 */ 2502 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2503 { 2504 const char *chip_name; 2505 int err; 2506 const struct gpu_info_firmware_header_v1_0 *hdr; 2507 2508 adev->firmware.gpu_info_fw = NULL; 2509 2510 if (adev->mman.discovery_bin) 2511 return 0; 2512 2513 switch (adev->asic_type) { 2514 default: 2515 return 0; 2516 case CHIP_VEGA10: 2517 chip_name = "vega10"; 2518 break; 2519 case CHIP_VEGA12: 2520 chip_name = "vega12"; 2521 break; 2522 case CHIP_RAVEN: 2523 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2524 chip_name = "raven2"; 2525 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2526 chip_name = "picasso"; 2527 else 2528 chip_name = "raven"; 2529 break; 2530 case CHIP_ARCTURUS: 2531 chip_name = "arcturus"; 2532 break; 2533 case CHIP_NAVI12: 2534 chip_name = "navi12"; 2535 break; 2536 } 2537 2538 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, 2539 AMDGPU_UCODE_OPTIONAL, 2540 "amdgpu/%s_gpu_info.bin", chip_name); 2541 if (err) { 2542 dev_err(adev->dev, 2543 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n", 2544 chip_name); 2545 goto out; 2546 } 2547 2548 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2549 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2550 2551 switch (hdr->version_major) { 2552 case 1: 2553 { 2554 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2555 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2556 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2557 2558 /* 2559 * Should be dropped when DAL no longer needs it. 
2560 */ 2561 if (adev->asic_type == CHIP_NAVI12) 2562 goto parse_soc_bounding_box; 2563 2564 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2565 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2566 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2567 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2568 adev->gfx.config.max_texture_channel_caches = 2569 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2570 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2571 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2572 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2573 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2574 adev->gfx.config.double_offchip_lds_buf = 2575 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2576 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2577 adev->gfx.cu_info.max_waves_per_simd = 2578 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2579 adev->gfx.cu_info.max_scratch_slots_per_cu = 2580 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2581 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2582 if (hdr->version_minor >= 1) { 2583 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2584 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2585 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2586 adev->gfx.config.num_sc_per_sh = 2587 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2588 adev->gfx.config.num_packer_per_sc = 2589 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2590 } 2591 2592 parse_soc_bounding_box: 2593 /* 2594 * soc bounding box info is not integrated in the discovery table, so 2595 * we always need to parse it from the gpu info firmware if needed. 2596 */ 2597 if (hdr->version_minor == 2) { 2598 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2599 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2600 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2601 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2602 } 2603 break; 2604 } 2605 default: 2606 dev_err(adev->dev, 2607 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2608 err = -EINVAL; 2609 goto out; 2610 } 2611 out: 2612 return err; 2613 } 2614 2615 /** 2616 * amdgpu_device_ip_early_init - run early init for hardware IPs 2617 * 2618 * @adev: amdgpu_device pointer 2619 * 2620 * Early initialization pass for hardware IPs. The hardware IPs that make 2621 * up each asic are discovered and each IP's early_init callback is run. This 2622 * is the first stage in initializing the asic. 2623 * Returns 0 on success, negative error code on failure.
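 *
 * Note: this pass is also where the per-ASIC IP block list is selected
 * (either via the asic-specific *_set_ip_blocks() helpers or IP discovery)
 * and, once the common block's asic funcs are available, where the vbios
 * is read and atombios is initialized.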
2624 */ 2625 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2626 { 2627 struct amdgpu_ip_block *ip_block; 2628 struct pci_dev *parent; 2629 bool total, skip_bios; 2630 uint32_t bios_flags; 2631 int i, r; 2632 2633 amdgpu_device_enable_virtual_display(adev); 2634 2635 if (amdgpu_sriov_vf(adev)) { 2636 r = amdgpu_virt_request_full_gpu(adev, true); 2637 if (r) 2638 return r; 2639 } 2640 2641 switch (adev->asic_type) { 2642 #ifdef CONFIG_DRM_AMDGPU_SI 2643 case CHIP_VERDE: 2644 case CHIP_TAHITI: 2645 case CHIP_PITCAIRN: 2646 case CHIP_OLAND: 2647 case CHIP_HAINAN: 2648 adev->family = AMDGPU_FAMILY_SI; 2649 r = si_set_ip_blocks(adev); 2650 if (r) 2651 return r; 2652 break; 2653 #endif 2654 #ifdef CONFIG_DRM_AMDGPU_CIK 2655 case CHIP_BONAIRE: 2656 case CHIP_HAWAII: 2657 case CHIP_KAVERI: 2658 case CHIP_KABINI: 2659 case CHIP_MULLINS: 2660 if (adev->flags & AMD_IS_APU) 2661 adev->family = AMDGPU_FAMILY_KV; 2662 else 2663 adev->family = AMDGPU_FAMILY_CI; 2664 2665 r = cik_set_ip_blocks(adev); 2666 if (r) 2667 return r; 2668 break; 2669 #endif 2670 case CHIP_TOPAZ: 2671 case CHIP_TONGA: 2672 case CHIP_FIJI: 2673 case CHIP_POLARIS10: 2674 case CHIP_POLARIS11: 2675 case CHIP_POLARIS12: 2676 case CHIP_VEGAM: 2677 case CHIP_CARRIZO: 2678 case CHIP_STONEY: 2679 if (adev->flags & AMD_IS_APU) 2680 adev->family = AMDGPU_FAMILY_CZ; 2681 else 2682 adev->family = AMDGPU_FAMILY_VI; 2683 2684 r = vi_set_ip_blocks(adev); 2685 if (r) 2686 return r; 2687 break; 2688 default: 2689 r = amdgpu_discovery_set_ip_blocks(adev); 2690 if (r) 2691 return r; 2692 break; 2693 } 2694 2695 if (amdgpu_has_atpx() && 2696 (amdgpu_is_atpx_hybrid() || 2697 amdgpu_has_atpx_dgpu_power_cntl()) && 2698 ((adev->flags & AMD_IS_APU) == 0) && 2699 !dev_is_removable(&adev->pdev->dev)) 2700 adev->flags |= AMD_IS_PX; 2701 2702 if (!(adev->flags & AMD_IS_APU)) { 2703 parent = pcie_find_root_port(adev->pdev); 2704 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2705 } 2706 2707 2708 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2709 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2710 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2711 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2712 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2713 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2714 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2715 2716 total = true; 2717 for (i = 0; i < adev->num_ip_blocks; i++) { 2718 ip_block = &adev->ip_blocks[i]; 2719 2720 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2721 DRM_WARN("disabled ip block: %d <%s>\n", 2722 i, adev->ip_blocks[i].version->funcs->name); 2723 adev->ip_blocks[i].status.valid = false; 2724 } else if (ip_block->version->funcs->early_init) { 2725 r = ip_block->version->funcs->early_init(ip_block); 2726 if (r == -ENOENT) { 2727 adev->ip_blocks[i].status.valid = false; 2728 } else if (r) { 2729 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2730 adev->ip_blocks[i].version->funcs->name, r); 2731 total = false; 2732 } else { 2733 adev->ip_blocks[i].status.valid = true; 2734 } 2735 } else { 2736 adev->ip_blocks[i].status.valid = true; 2737 } 2738 /* get the vbios after the asic_funcs are set up */ 2739 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2740 r = amdgpu_device_parse_gpu_info_fw(adev); 2741 if (r) 2742 return r; 2743 2744 bios_flags = amdgpu_device_get_vbios_flags(adev); 2745 skip_bios = !!(bios_flags & AMDGPU_VBIOS_SKIP); 2746 /* Read BIOS */ 2747 if (!skip_bios) { 2748 bool optional = 2749 !!(bios_flags & AMDGPU_VBIOS_OPTIONAL); 2750 if (!amdgpu_get_bios(adev) && !optional) 2751 return -EINVAL; 2752 2753 if (optional && !adev->bios) 2754 dev_info( 2755 adev->dev, 2756 "VBIOS image optional, proceeding without VBIOS image"); 2757 2758 if (adev->bios) { 2759 r = amdgpu_atombios_init(adev); 2760 if (r) { 2761 dev_err(adev->dev, 2762 "amdgpu_atombios_init failed\n"); 2763 amdgpu_vf_error_put( 2764 adev, 2765 AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 2766 0, 0); 2767 return r; 2768 } 2769 } 2770 } 2771 2772 /*get pf2vf msg info at it's earliest time*/ 2773 if (amdgpu_sriov_vf(adev)) 2774 amdgpu_virt_init_data_exchange(adev); 2775 2776 } 2777 } 2778 if (!total) 2779 return -ENODEV; 2780 2781 if (adev->gmc.xgmi.supported) 2782 amdgpu_xgmi_early_init(adev); 2783 2784 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 2785 if (ip_block->status.valid != false) 2786 amdgpu_amdkfd_device_probe(adev); 2787 2788 adev->cg_flags &= amdgpu_cg_mask; 2789 adev->pg_flags &= amdgpu_pg_mask; 2790 2791 return 0; 2792 } 2793 2794 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2795 { 2796 int i, r; 2797 2798 for (i = 0; i < adev->num_ip_blocks; i++) { 2799 if (!adev->ip_blocks[i].status.sw) 2800 continue; 2801 if (adev->ip_blocks[i].status.hw) 2802 continue; 2803 if (!amdgpu_ip_member_of_hwini( 2804 adev, adev->ip_blocks[i].version->type)) 2805 continue; 2806 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2807 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2808 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2809 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2810 if (r) { 2811 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2812 adev->ip_blocks[i].version->funcs->name, r); 2813 return r; 2814 } 2815 adev->ip_blocks[i].status.hw = true; 2816 } 2817 } 2818 2819 return 0; 2820 } 
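/*
 * Note on ordering: phase 1 above only brings up COMMON, IH and (on SR-IOV)
 * PSP so that firmware loading can run next; phase 2 below then calls
 * hw_init for every remaining block included in the current hardware init
 * mask. Both phases skip blocks whose sw_init has not run or whose hw init
 * is already done.
 */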
2821 2822 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2823 { 2824 int i, r; 2825 2826 for (i = 0; i < adev->num_ip_blocks; i++) { 2827 if (!adev->ip_blocks[i].status.sw) 2828 continue; 2829 if (adev->ip_blocks[i].status.hw) 2830 continue; 2831 if (!amdgpu_ip_member_of_hwini( 2832 adev, adev->ip_blocks[i].version->type)) 2833 continue; 2834 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2835 if (r) { 2836 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2837 adev->ip_blocks[i].version->funcs->name, r); 2838 return r; 2839 } 2840 adev->ip_blocks[i].status.hw = true; 2841 } 2842 2843 return 0; 2844 } 2845 2846 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2847 { 2848 int r = 0; 2849 int i; 2850 uint32_t smu_version; 2851 2852 if (adev->asic_type >= CHIP_VEGA10) { 2853 for (i = 0; i < adev->num_ip_blocks; i++) { 2854 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2855 continue; 2856 2857 if (!amdgpu_ip_member_of_hwini(adev, 2858 AMD_IP_BLOCK_TYPE_PSP)) 2859 break; 2860 2861 if (!adev->ip_blocks[i].status.sw) 2862 continue; 2863 2864 /* no need to do the fw loading again if already done*/ 2865 if (adev->ip_blocks[i].status.hw == true) 2866 break; 2867 2868 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2869 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 2870 if (r) 2871 return r; 2872 } else { 2873 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2874 if (r) { 2875 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2876 adev->ip_blocks[i].version->funcs->name, r); 2877 return r; 2878 } 2879 adev->ip_blocks[i].status.hw = true; 2880 } 2881 break; 2882 } 2883 } 2884 2885 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2886 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2887 2888 return r; 2889 } 2890 2891 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2892 { 2893 struct drm_sched_init_args args = { 2894 .ops = &amdgpu_sched_ops, 2895 .num_rqs = DRM_SCHED_PRIORITY_COUNT, 2896 .timeout_wq = adev->reset_domain->wq, 2897 .dev = adev->dev, 2898 }; 2899 long timeout; 2900 int r, i; 2901 2902 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2903 struct amdgpu_ring *ring = adev->rings[i]; 2904 2905 /* No need to setup the GPU scheduler for rings that don't need it */ 2906 if (!ring || ring->no_scheduler) 2907 continue; 2908 2909 switch (ring->funcs->type) { 2910 case AMDGPU_RING_TYPE_GFX: 2911 timeout = adev->gfx_timeout; 2912 break; 2913 case AMDGPU_RING_TYPE_COMPUTE: 2914 timeout = adev->compute_timeout; 2915 break; 2916 case AMDGPU_RING_TYPE_SDMA: 2917 timeout = adev->sdma_timeout; 2918 break; 2919 default: 2920 timeout = adev->video_timeout; 2921 break; 2922 } 2923 2924 args.timeout = timeout; 2925 args.credit_limit = ring->num_hw_submission; 2926 args.score = ring->sched_score; 2927 args.name = ring->name; 2928 2929 r = drm_sched_init(&ring->sched, &args); 2930 if (r) { 2931 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2932 ring->name); 2933 return r; 2934 } 2935 r = amdgpu_uvd_entity_init(adev, ring); 2936 if (r) { 2937 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n", 2938 ring->name); 2939 return r; 2940 } 2941 r = amdgpu_vce_entity_init(adev, ring); 2942 if (r) { 2943 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n", 2944 ring->name); 2945 return r; 2946 } 2947 } 2948 2949 amdgpu_xcp_update_partition_sched_list(adev); 2950 2951 return 0; 2952 } 2953 2954 2955 /** 2956 * amdgpu_device_ip_init - run init for hardware 
IPs 2957 * 2958 * @adev: amdgpu_device pointer 2959 * 2960 * Main initialization pass for hardware IPs. The list of all the hardware 2961 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2962 * are run. sw_init initializes the software state associated with each IP 2963 * and hw_init initializes the hardware associated with each IP. 2964 * Returns 0 on success, negative error code on failure. 2965 */ 2966 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2967 { 2968 bool init_badpage; 2969 int i, r; 2970 2971 r = amdgpu_ras_init(adev); 2972 if (r) 2973 return r; 2974 2975 for (i = 0; i < adev->num_ip_blocks; i++) { 2976 if (!adev->ip_blocks[i].status.valid) 2977 continue; 2978 if (adev->ip_blocks[i].version->funcs->sw_init) { 2979 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]); 2980 if (r) { 2981 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2982 adev->ip_blocks[i].version->funcs->name, r); 2983 goto init_failed; 2984 } 2985 } 2986 adev->ip_blocks[i].status.sw = true; 2987 2988 if (!amdgpu_ip_member_of_hwini( 2989 adev, adev->ip_blocks[i].version->type)) 2990 continue; 2991 2992 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2993 /* need to do common hw init early so everything is set up for gmc */ 2994 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2995 if (r) { 2996 DRM_ERROR("hw_init %d failed %d\n", i, r); 2997 goto init_failed; 2998 } 2999 adev->ip_blocks[i].status.hw = true; 3000 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3001 /* need to do gmc hw init early so we can allocate gpu mem */ 3002 /* Try to reserve bad pages early */ 3003 if (amdgpu_sriov_vf(adev)) 3004 amdgpu_virt_exchange_data(adev); 3005 3006 r = amdgpu_device_mem_scratch_init(adev); 3007 if (r) { 3008 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 3009 goto init_failed; 3010 } 3011 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3012 if (r) { 3013 DRM_ERROR("hw_init %d failed %d\n", i, r); 3014 goto init_failed; 3015 } 3016 r = amdgpu_device_wb_init(adev); 3017 if (r) { 3018 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 3019 goto init_failed; 3020 } 3021 adev->ip_blocks[i].status.hw = true; 3022 3023 /* right after GMC hw init, we create CSA */ 3024 if (adev->gfx.mcbp) { 3025 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 3026 AMDGPU_GEM_DOMAIN_VRAM | 3027 AMDGPU_GEM_DOMAIN_GTT, 3028 AMDGPU_CSA_SIZE); 3029 if (r) { 3030 DRM_ERROR("allocate CSA failed %d\n", r); 3031 goto init_failed; 3032 } 3033 } 3034 3035 r = amdgpu_seq64_init(adev); 3036 if (r) { 3037 DRM_ERROR("allocate seq64 failed %d\n", r); 3038 goto init_failed; 3039 } 3040 } 3041 } 3042 3043 if (amdgpu_sriov_vf(adev)) 3044 amdgpu_virt_init_data_exchange(adev); 3045 3046 r = amdgpu_ib_pool_init(adev); 3047 if (r) { 3048 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 3049 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 3050 goto init_failed; 3051 } 3052 3053 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 3054 if (r) 3055 goto init_failed; 3056 3057 r = amdgpu_device_ip_hw_init_phase1(adev); 3058 if (r) 3059 goto init_failed; 3060 3061 r = amdgpu_device_fw_loading(adev); 3062 if (r) 3063 goto init_failed; 3064 3065 r = amdgpu_device_ip_hw_init_phase2(adev); 3066 if (r) 3067 goto init_failed; 3068 3069 /* 3070 * retired pages will be loaded from eeprom and reserved here, 3071 * it should be called after 
amdgpu_device_ip_hw_init_phase2 since 3072 * for some ASICs the RAS EEPROM code relies on the SMU being fully 3073 * functional for I2C communication, which is only true at this point. 3074 * 3075 * amdgpu_ras_recovery_init may fail, but the caller only cares about 3076 * failures caused by a bad gpu situation, which stop the amdgpu init 3077 * process accordingly. For other failed cases, it still releases all 3078 * the resources and prints an error message, rather than returning a 3079 * negative value to the upper level. 3080 * 3081 * Note: theoretically, this should be called before all vram allocations 3082 * to protect retired pages from being reused. 3083 */ 3084 init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 3085 r = amdgpu_ras_recovery_init(adev, init_badpage); 3086 if (r) 3087 goto init_failed; 3088 3089 /* 3090 * In case of XGMI, grab an extra reference on the reset domain for this device 3091 */ 3092 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3093 if (amdgpu_xgmi_add_device(adev) == 0) { 3094 if (!amdgpu_sriov_vf(adev)) { 3095 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3096 3097 if (WARN_ON(!hive)) { 3098 r = -ENOENT; 3099 goto init_failed; 3100 } 3101 3102 if (!hive->reset_domain || 3103 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 3104 r = -ENOENT; 3105 amdgpu_put_xgmi_hive(hive); 3106 goto init_failed; 3107 } 3108 3109 /* Drop the early temporary reset domain we created for device */ 3110 amdgpu_reset_put_reset_domain(adev->reset_domain); 3111 adev->reset_domain = hive->reset_domain; 3112 amdgpu_put_xgmi_hive(hive); 3113 } 3114 } 3115 } 3116 3117 r = amdgpu_device_init_schedulers(adev); 3118 if (r) 3119 goto init_failed; 3120 3121 if (adev->mman.buffer_funcs_ring->sched.ready) 3122 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3123 3124 /* Don't init kfd if the whole hive needs to be reset during init */ 3125 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 3126 kgd2kfd_init_zone_device(adev); 3127 amdgpu_amdkfd_device_init(adev); 3128 } 3129 3130 amdgpu_fru_get_product_info(adev); 3131 3132 if (!amdgpu_sriov_vf(adev) || amdgpu_sriov_ras_cper_en(adev)) 3133 r = amdgpu_cper_init(adev); 3134 3135 init_failed: 3136 3137 return r; 3138 } 3139 3140 /** 3141 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 3142 * 3143 * @adev: amdgpu_device pointer 3144 * 3145 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 3146 * this function before a GPU reset. If the value is retained after a 3147 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 3148 */ 3149 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 3150 { 3151 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 3152 } 3153 3154 /** 3155 * amdgpu_device_check_vram_lost - check if vram is valid 3156 * 3157 * @adev: amdgpu_device pointer 3158 * 3159 * Checks the reset magic value written to the gart pointer in VRAM. 3160 * The driver calls this after a GPU reset to see if the contents of 3161 * VRAM are lost or not. 3162 * Returns true if vram is lost, false if not. 3163 */ 3164 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 3165 { 3166 if (memcmp(adev->gart.ptr, adev->reset_magic, 3167 AMDGPU_RESET_MAGIC_NUM)) 3168 return true; 3169 3170 if (!amdgpu_in_reset(adev)) 3171 return false; 3172 3173 /* 3174 * For all ASICs with baco/mode1 reset, the VRAM is 3175 * always assumed to be lost.
3176 */ 3177 switch (amdgpu_asic_reset_method(adev)) { 3178 case AMD_RESET_METHOD_LINK: 3179 case AMD_RESET_METHOD_BACO: 3180 case AMD_RESET_METHOD_MODE1: 3181 return true; 3182 default: 3183 return false; 3184 } 3185 } 3186 3187 /** 3188 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 3189 * 3190 * @adev: amdgpu_device pointer 3191 * @state: clockgating state (gate or ungate) 3192 * 3193 * The list of all the hardware IPs that make up the asic is walked and the 3194 * set_clockgating_state callbacks are run. 3195 * Late initialization pass enabling clockgating for hardware IPs. 3196 * Fini or suspend, pass disabling clockgating for hardware IPs. 3197 * Returns 0 on success, negative error code on failure. 3198 */ 3199 3200 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 3201 enum amd_clockgating_state state) 3202 { 3203 int i, j, r; 3204 3205 if (amdgpu_emu_mode == 1) 3206 return 0; 3207 3208 for (j = 0; j < adev->num_ip_blocks; j++) { 3209 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3210 if (!adev->ip_blocks[i].status.late_initialized) 3211 continue; 3212 /* skip CG for GFX, SDMA on S0ix */ 3213 if (adev->in_s0ix && 3214 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3215 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3216 continue; 3217 /* skip CG for VCE/UVD, it's handled specially */ 3218 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3219 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3220 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3221 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3222 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 3223 /* enable clockgating to save power */ 3224 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i], 3225 state); 3226 if (r) { 3227 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 3228 adev->ip_blocks[i].version->funcs->name, r); 3229 return r; 3230 } 3231 } 3232 } 3233 3234 return 0; 3235 } 3236 3237 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 3238 enum amd_powergating_state state) 3239 { 3240 int i, j, r; 3241 3242 if (amdgpu_emu_mode == 1) 3243 return 0; 3244 3245 for (j = 0; j < adev->num_ip_blocks; j++) { 3246 i = state == AMD_PG_STATE_GATE ? 
j : adev->num_ip_blocks - j - 1; 3247 if (!adev->ip_blocks[i].status.late_initialized) 3248 continue; 3249 /* skip PG for GFX, SDMA on S0ix */ 3250 if (adev->in_s0ix && 3251 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3252 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3253 continue; 3254 /* skip PG for VCE/UVD, it's handled specially */ 3255 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3256 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3257 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3258 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3259 adev->ip_blocks[i].version->funcs->set_powergating_state) { 3260 /* enable powergating to save power */ 3261 r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i], 3262 state); 3263 if (r) { 3264 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 3265 adev->ip_blocks[i].version->funcs->name, r); 3266 return r; 3267 } 3268 } 3269 } 3270 return 0; 3271 } 3272 3273 static int amdgpu_device_enable_mgpu_fan_boost(void) 3274 { 3275 struct amdgpu_gpu_instance *gpu_ins; 3276 struct amdgpu_device *adev; 3277 int i, ret = 0; 3278 3279 mutex_lock(&mgpu_info.mutex); 3280 3281 /* 3282 * MGPU fan boost feature should be enabled 3283 * only when there are two or more dGPUs in 3284 * the system 3285 */ 3286 if (mgpu_info.num_dgpu < 2) 3287 goto out; 3288 3289 for (i = 0; i < mgpu_info.num_dgpu; i++) { 3290 gpu_ins = &(mgpu_info.gpu_ins[i]); 3291 adev = gpu_ins->adev; 3292 if (!(adev->flags & AMD_IS_APU) && 3293 !gpu_ins->mgpu_fan_enabled) { 3294 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 3295 if (ret) 3296 break; 3297 3298 gpu_ins->mgpu_fan_enabled = 1; 3299 } 3300 } 3301 3302 out: 3303 mutex_unlock(&mgpu_info.mutex); 3304 3305 return ret; 3306 } 3307 3308 /** 3309 * amdgpu_device_ip_late_init - run late init for hardware IPs 3310 * 3311 * @adev: amdgpu_device pointer 3312 * 3313 * Late initialization pass for hardware IPs. The list of all the hardware 3314 * IPs that make up the asic is walked and the late_init callbacks are run. 3315 * late_init covers any special initialization that an IP requires 3316 * after all of them have been initialized or something that needs to happen 3317 * late in the init process. 3318 * Returns 0 on success, negative error code on failure.
3319 */ 3320 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 3321 { 3322 struct amdgpu_gpu_instance *gpu_instance; 3323 int i = 0, r; 3324 3325 for (i = 0; i < adev->num_ip_blocks; i++) { 3326 if (!adev->ip_blocks[i].status.hw) 3327 continue; 3328 if (adev->ip_blocks[i].version->funcs->late_init) { 3329 r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]); 3330 if (r) { 3331 DRM_ERROR("late_init of IP block <%s> failed %d\n", 3332 adev->ip_blocks[i].version->funcs->name, r); 3333 return r; 3334 } 3335 } 3336 adev->ip_blocks[i].status.late_initialized = true; 3337 } 3338 3339 r = amdgpu_ras_late_init(adev); 3340 if (r) { 3341 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 3342 return r; 3343 } 3344 3345 if (!amdgpu_reset_in_recovery(adev)) 3346 amdgpu_ras_set_error_query_ready(adev, true); 3347 3348 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3349 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3350 3351 amdgpu_device_fill_reset_magic(adev); 3352 3353 r = amdgpu_device_enable_mgpu_fan_boost(); 3354 if (r) 3355 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 3356 3357 /* For passthrough configuration on arcturus and aldebaran, enable special handling for SBR */ 3358 if (amdgpu_passthrough(adev) && 3359 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 3360 adev->asic_type == CHIP_ALDEBARAN)) 3361 amdgpu_dpm_handle_passthrough_sbr(adev, true); 3362 3363 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3364 mutex_lock(&mgpu_info.mutex); 3365 3366 /* 3367 * Reset the device p-state to low, as it was booted with high. 3368 * 3369 * This should be performed only after all devices from the same 3370 * hive get initialized. 3371 * 3372 * However, the number of devices in the hive is not known in 3373 * advance; they are counted one by one as each device initializes. 3374 * 3375 * So we wait until all XGMI interlinked devices are initialized. 3376 * This may introduce some delay as those devices may come from 3377 * different hives, but that should be OK.
3378 */ 3379 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3380 for (i = 0; i < mgpu_info.num_gpu; i++) { 3381 gpu_instance = &(mgpu_info.gpu_ins[i]); 3382 if (gpu_instance->adev->flags & AMD_IS_APU) 3383 continue; 3384 3385 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3386 AMDGPU_XGMI_PSTATE_MIN); 3387 if (r) { 3388 DRM_ERROR("pstate setting failed (%d).\n", r); 3389 break; 3390 } 3391 } 3392 } 3393 3394 mutex_unlock(&mgpu_info.mutex); 3395 } 3396 3397 return 0; 3398 } 3399 3400 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block) 3401 { 3402 int r; 3403 3404 if (!ip_block->version->funcs->hw_fini) { 3405 DRM_ERROR("hw_fini of IP block <%s> not defined\n", 3406 ip_block->version->funcs->name); 3407 } else { 3408 r = ip_block->version->funcs->hw_fini(ip_block); 3409 /* XXX handle errors */ 3410 if (r) { 3411 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3412 ip_block->version->funcs->name, r); 3413 } 3414 } 3415 3416 ip_block->status.hw = false; 3417 } 3418 3419 /** 3420 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3421 * 3422 * @adev: amdgpu_device pointer 3423 * 3424 * For ASICs need to disable SMC first 3425 */ 3426 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3427 { 3428 int i; 3429 3430 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3431 return; 3432 3433 for (i = 0; i < adev->num_ip_blocks; i++) { 3434 if (!adev->ip_blocks[i].status.hw) 3435 continue; 3436 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3437 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3438 break; 3439 } 3440 } 3441 } 3442 3443 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3444 { 3445 int i, r; 3446 3447 for (i = 0; i < adev->num_ip_blocks; i++) { 3448 if (!adev->ip_blocks[i].version->funcs->early_fini) 3449 continue; 3450 3451 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]); 3452 if (r) { 3453 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 3454 adev->ip_blocks[i].version->funcs->name, r); 3455 } 3456 } 3457 3458 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3459 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3460 3461 amdgpu_amdkfd_suspend(adev, false); 3462 3463 /* Workaround for ASICs need to disable SMC first */ 3464 amdgpu_device_smu_fini_early(adev); 3465 3466 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3467 if (!adev->ip_blocks[i].status.hw) 3468 continue; 3469 3470 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3471 } 3472 3473 if (amdgpu_sriov_vf(adev)) { 3474 if (amdgpu_virt_release_full_gpu(adev, false)) 3475 DRM_ERROR("failed to release exclusive mode on fini\n"); 3476 } 3477 3478 return 0; 3479 } 3480 3481 /** 3482 * amdgpu_device_ip_fini - run fini for hardware IPs 3483 * 3484 * @adev: amdgpu_device pointer 3485 * 3486 * Main teardown pass for hardware IPs. The list of all the hardware 3487 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3488 * are run. hw_fini tears down the hardware associated with each IP 3489 * and sw_fini tears down any software state associated with each IP. 3490 * Returns 0 on success, negative error code on failure. 
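 *
 * Teardown runs in reverse IP order; the buffers created around GMC init
 * (ucode bo, static CSA, writeback, mem scratch, IB pool, seq64) are freed
 * when the GMC block is reached, and late_fini runs in a final reverse pass.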
3491 */ 3492 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3493 { 3494 int i, r; 3495 3496 amdgpu_cper_fini(adev); 3497 3498 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3499 amdgpu_virt_release_ras_err_handler_data(adev); 3500 3501 if (adev->gmc.xgmi.num_physical_nodes > 1) 3502 amdgpu_xgmi_remove_device(adev); 3503 3504 amdgpu_amdkfd_device_fini_sw(adev); 3505 3506 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3507 if (!adev->ip_blocks[i].status.sw) 3508 continue; 3509 3510 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3511 amdgpu_ucode_free_bo(adev); 3512 amdgpu_free_static_csa(&adev->virt.csa_obj); 3513 amdgpu_device_wb_fini(adev); 3514 amdgpu_device_mem_scratch_fini(adev); 3515 amdgpu_ib_pool_fini(adev); 3516 amdgpu_seq64_fini(adev); 3517 } 3518 if (adev->ip_blocks[i].version->funcs->sw_fini) { 3519 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]); 3520 /* XXX handle errors */ 3521 if (r) { 3522 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 3523 adev->ip_blocks[i].version->funcs->name, r); 3524 } 3525 } 3526 adev->ip_blocks[i].status.sw = false; 3527 adev->ip_blocks[i].status.valid = false; 3528 } 3529 3530 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3531 if (!adev->ip_blocks[i].status.late_initialized) 3532 continue; 3533 if (adev->ip_blocks[i].version->funcs->late_fini) 3534 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]); 3535 adev->ip_blocks[i].status.late_initialized = false; 3536 } 3537 3538 amdgpu_ras_fini(adev); 3539 3540 return 0; 3541 } 3542 3543 /** 3544 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3545 * 3546 * @work: work_struct. 3547 */ 3548 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3549 { 3550 struct amdgpu_device *adev = 3551 container_of(work, struct amdgpu_device, delayed_init_work.work); 3552 int r; 3553 3554 r = amdgpu_ib_ring_tests(adev); 3555 if (r) 3556 DRM_ERROR("ib ring test failed (%d).\n", r); 3557 } 3558 3559 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3560 { 3561 struct amdgpu_device *adev = 3562 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3563 3564 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3565 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3566 3567 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true, 0)) 3568 adev->gfx.gfx_off_state = true; 3569 } 3570 3571 /** 3572 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3573 * 3574 * @adev: amdgpu_device pointer 3575 * 3576 * Main suspend function for hardware IPs. The list of all the hardware 3577 * IPs that make up the asic is walked, clockgating is disabled and the 3578 * suspend callbacks are run. suspend puts the hardware and software state 3579 * in each IP into a state suitable for suspend. 3580 * Returns 0 on success, negative error code on failure. 3581 */ 3582 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3583 { 3584 int i, r; 3585 3586 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3587 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3588 3589 /* 3590 * Per PMFW team's suggestion, driver needs to handle gfxoff 3591 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3592 * scenario. Add the missing df cstate disablement here. 
3593 */ 3594 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3595 dev_warn(adev->dev, "Failed to disallow df cstate"); 3596 3597 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3598 if (!adev->ip_blocks[i].status.valid) 3599 continue; 3600 3601 /* displays are handled separately */ 3602 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3603 continue; 3604 3605 /* XXX handle errors */ 3606 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3607 if (r) 3608 return r; 3609 } 3610 3611 return 0; 3612 } 3613 3614 /** 3615 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3616 * 3617 * @adev: amdgpu_device pointer 3618 * 3619 * Main suspend function for hardware IPs. The list of all the hardware 3620 * IPs that make up the asic is walked, clockgating is disabled and the 3621 * suspend callbacks are run. suspend puts the hardware and software state 3622 * in each IP into a state suitable for suspend. 3623 * Returns 0 on success, negative error code on failure. 3624 */ 3625 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3626 { 3627 int i, r; 3628 3629 if (adev->in_s0ix) 3630 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3631 3632 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3633 if (!adev->ip_blocks[i].status.valid) 3634 continue; 3635 /* displays are handled in phase1 */ 3636 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3637 continue; 3638 /* PSP lost connection when err_event_athub occurs */ 3639 if (amdgpu_ras_intr_triggered() && 3640 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3641 adev->ip_blocks[i].status.hw = false; 3642 continue; 3643 } 3644 3645 /* skip unnecessary suspend if we do not initialize them yet */ 3646 if (!amdgpu_ip_member_of_hwini( 3647 adev, adev->ip_blocks[i].version->type)) 3648 continue; 3649 3650 /* skip suspend of gfx/mes and psp for S0ix 3651 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3652 * like at runtime. PSP is also part of the always on hardware 3653 * so no need to suspend it. 3654 */ 3655 if (adev->in_s0ix && 3656 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3657 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3658 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3659 continue; 3660 3661 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3662 if (adev->in_s0ix && 3663 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3664 IP_VERSION(5, 0, 0)) && 3665 (adev->ip_blocks[i].version->type == 3666 AMD_IP_BLOCK_TYPE_SDMA)) 3667 continue; 3668 3669 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3670 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3671 * from this location and RLC Autoload automatically also gets loaded 3672 * from here based on PMFW -> PSP message during re-init sequence. 3673 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3674 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3675 */ 3676 if (amdgpu_in_reset(adev) && 3677 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3678 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3679 continue; 3680 3681 /* XXX handle errors */ 3682 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3683 adev->ip_blocks[i].status.hw = false; 3684 3685 /* handle putting the SMC in the appropriate state */ 3686 if (!amdgpu_sriov_vf(adev)) { 3687 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3688 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3689 if (r) { 3690 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3691 adev->mp1_state, r); 3692 return r; 3693 } 3694 } 3695 } 3696 } 3697 3698 return 0; 3699 } 3700 3701 /** 3702 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3703 * 3704 * @adev: amdgpu_device pointer 3705 * 3706 * Main suspend function for hardware IPs. The list of all the hardware 3707 * IPs that make up the asic is walked, clockgating is disabled and the 3708 * suspend callbacks are run. suspend puts the hardware and software state 3709 * in each IP into a state suitable for suspend. 3710 * Returns 0 on success, negative error code on failure. 3711 */ 3712 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3713 { 3714 int r; 3715 3716 if (amdgpu_sriov_vf(adev)) { 3717 amdgpu_virt_fini_data_exchange(adev); 3718 amdgpu_virt_request_full_gpu(adev, false); 3719 } 3720 3721 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3722 3723 r = amdgpu_device_ip_suspend_phase1(adev); 3724 if (r) 3725 return r; 3726 r = amdgpu_device_ip_suspend_phase2(adev); 3727 3728 if (amdgpu_sriov_vf(adev)) 3729 amdgpu_virt_release_full_gpu(adev, false); 3730 3731 return r; 3732 } 3733 3734 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3735 { 3736 int i, r; 3737 3738 static enum amd_ip_block_type ip_order[] = { 3739 AMD_IP_BLOCK_TYPE_COMMON, 3740 AMD_IP_BLOCK_TYPE_GMC, 3741 AMD_IP_BLOCK_TYPE_PSP, 3742 AMD_IP_BLOCK_TYPE_IH, 3743 }; 3744 3745 for (i = 0; i < adev->num_ip_blocks; i++) { 3746 int j; 3747 struct amdgpu_ip_block *block; 3748 3749 block = &adev->ip_blocks[i]; 3750 block->status.hw = false; 3751 3752 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3753 3754 if (block->version->type != ip_order[j] || 3755 !block->status.valid) 3756 continue; 3757 3758 r = block->version->funcs->hw_init(&adev->ip_blocks[i]); 3759 if (r) { 3760 dev_err(adev->dev, "RE-INIT-early: %s failed\n", 3761 block->version->funcs->name); 3762 return r; 3763 } 3764 block->status.hw = true; 3765 } 3766 } 3767 3768 return 0; 3769 } 3770 3771 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3772 { 3773 struct amdgpu_ip_block *block; 3774 int i, r = 0; 3775 3776 static enum amd_ip_block_type ip_order[] = { 3777 AMD_IP_BLOCK_TYPE_SMC, 3778 AMD_IP_BLOCK_TYPE_DCE, 3779 AMD_IP_BLOCK_TYPE_GFX, 3780 AMD_IP_BLOCK_TYPE_SDMA, 3781 AMD_IP_BLOCK_TYPE_MES, 3782 AMD_IP_BLOCK_TYPE_UVD, 3783 AMD_IP_BLOCK_TYPE_VCE, 3784 AMD_IP_BLOCK_TYPE_VCN, 3785 AMD_IP_BLOCK_TYPE_JPEG 3786 }; 3787 3788 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3789 block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]); 3790 3791 if (!block) 3792 continue; 3793 3794 if (block->status.valid && !block->status.hw) { 3795 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) { 3796 r = amdgpu_ip_block_resume(block); 3797 } else { 3798 r = block->version->funcs->hw_init(block); 3799 } 3800 3801 if (r) { 3802 dev_err(adev->dev, "RE-INIT-late: %s failed\n", 3803 block->version->funcs->name); 3804 break; 3805 } 3806 
block->status.hw = true; 3807 } 3808 } 3809 3810 return r; 3811 } 3812 3813 /** 3814 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3815 * 3816 * @adev: amdgpu_device pointer 3817 * 3818 * First resume function for hardware IPs. The list of all the hardware 3819 * IPs that make up the asic is walked and the resume callbacks are run for 3820 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3821 * after a suspend and updates the software state as necessary. This 3822 * function is also used for restoring the GPU after a GPU reset. 3823 * Returns 0 on success, negative error code on failure. 3824 */ 3825 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3826 { 3827 int i, r; 3828 3829 for (i = 0; i < adev->num_ip_blocks; i++) { 3830 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3831 continue; 3832 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3833 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3834 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3835 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3836 3837 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3838 if (r) 3839 return r; 3840 } 3841 } 3842 3843 return 0; 3844 } 3845 3846 /** 3847 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3848 * 3849 * @adev: amdgpu_device pointer 3850 * 3851 * Second resume function for hardware IPs. The list of all the hardware 3852 * IPs that make up the asic is walked and the resume callbacks are run for 3853 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3854 * functional state after a suspend and updates the software state as 3855 * necessary. This function is also used for restoring the GPU after a GPU 3856 * reset. 3857 * Returns 0 on success, negative error code on failure. 3858 */ 3859 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3860 { 3861 int i, r; 3862 3863 for (i = 0; i < adev->num_ip_blocks; i++) { 3864 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3865 continue; 3866 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3867 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3868 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3869 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE || 3870 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3871 continue; 3872 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3873 if (r) 3874 return r; 3875 } 3876 3877 return 0; 3878 } 3879 3880 /** 3881 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs 3882 * 3883 * @adev: amdgpu_device pointer 3884 * 3885 * Third resume function for hardware IPs. The list of all the hardware 3886 * IPs that make up the asic is walked and the resume callbacks are run for 3887 * all DCE. resume puts the hardware into a functional state after a suspend 3888 * and updates the software state as necessary. This function is also used 3889 * for restoring the GPU after a GPU reset. 3890 * 3891 * Returns 0 on success, negative error code on failure. 
3892 */ 3893 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev) 3894 { 3895 int i, r; 3896 3897 for (i = 0; i < adev->num_ip_blocks; i++) { 3898 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3899 continue; 3900 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) { 3901 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3902 if (r) 3903 return r; 3904 } 3905 } 3906 3907 return 0; 3908 } 3909 3910 /** 3911 * amdgpu_device_ip_resume - run resume for hardware IPs 3912 * 3913 * @adev: amdgpu_device pointer 3914 * 3915 * Main resume function for hardware IPs. The hardware IPs 3916 * are split into multiple resume functions because they are 3917 * also used in recovering from a GPU reset and some additional 3918 * steps need to be taken between them. In this case (S3/S4) they are 3919 * run sequentially. 3920 * Returns 0 on success, negative error code on failure. 3921 */ 3922 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3923 { 3924 int r; 3925 3926 r = amdgpu_device_ip_resume_phase1(adev); 3927 if (r) 3928 return r; 3929 3930 r = amdgpu_device_fw_loading(adev); 3931 if (r) 3932 return r; 3933 3934 r = amdgpu_device_ip_resume_phase2(adev); 3935 3936 if (adev->mman.buffer_funcs_ring->sched.ready) 3937 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3938 3939 if (r) 3940 return r; 3941 3942 amdgpu_fence_driver_hw_init(adev); 3943 3944 r = amdgpu_device_ip_resume_phase3(adev); 3945 3946 return r; 3947 } 3948 3949 /** 3950 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3951 * 3952 * @adev: amdgpu_device pointer 3953 * 3954 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3955 */ 3956 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3957 { 3958 if (amdgpu_sriov_vf(adev)) { 3959 if (adev->is_atom_fw) { 3960 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3961 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3962 } else { 3963 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3964 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3965 } 3966 3967 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3968 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3969 } 3970 } 3971 3972 /** 3973 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3974 * 3975 * @asic_type: AMD asic type 3976 * 3977 * Check if there is DC (new modesetting infrastructure) support for an asic. 3978 * Returns true if DC has support, false if not. 3979 */ 3980 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3981 { 3982 switch (asic_type) { 3983 #ifdef CONFIG_DRM_AMDGPU_SI 3984 case CHIP_HAINAN: 3985 #endif 3986 case CHIP_TOPAZ: 3987 /* chips with no display hardware */ 3988 return false; 3989 #if defined(CONFIG_DRM_AMD_DC) 3990 case CHIP_TAHITI: 3991 case CHIP_PITCAIRN: 3992 case CHIP_VERDE: 3993 case CHIP_OLAND: 3994 /* 3995 * We have systems in the wild with these ASICs that require 3996 * LVDS and VGA support which is not supported with DC. 3997 * 3998 * Fallback to the non-DC driver here by default so as not to 3999 * cause regressions. 4000 */ 4001 #if defined(CONFIG_DRM_AMD_DC_SI) 4002 return amdgpu_dc > 0; 4003 #else 4004 return false; 4005 #endif 4006 case CHIP_BONAIRE: 4007 case CHIP_KAVERI: 4008 case CHIP_KABINI: 4009 case CHIP_MULLINS: 4010 /* 4011 * We have systems in the wild with these ASICs that require 4012 * VGA support which is not supported with DC.
4013 * 4014 * Fallback to the non-DC driver here by default so as not to 4015 * cause regressions. 4016 */ 4017 return amdgpu_dc > 0; 4018 default: 4019 return amdgpu_dc != 0; 4020 #else 4021 default: 4022 if (amdgpu_dc > 0) 4023 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 4024 return false; 4025 #endif 4026 } 4027 } 4028 4029 /** 4030 * amdgpu_device_has_dc_support - check if dc is supported 4031 * 4032 * @adev: amdgpu_device pointer 4033 * 4034 * Returns true for supported, false for not supported 4035 */ 4036 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 4037 { 4038 if (adev->enable_virtual_display || 4039 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 4040 return false; 4041 4042 return amdgpu_device_asic_has_dc_support(adev->asic_type); 4043 } 4044 4045 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 4046 { 4047 struct amdgpu_device *adev = 4048 container_of(__work, struct amdgpu_device, xgmi_reset_work); 4049 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 4050 4051 /* It's a bug to not have a hive within this function */ 4052 if (WARN_ON(!hive)) 4053 return; 4054 4055 /* 4056 * Use task barrier to synchronize all xgmi reset works across the 4057 * hive. task_barrier_enter and task_barrier_exit will block 4058 * until all the threads running the xgmi reset works reach 4059 * those points. task_barrier_full will do both blocks. 4060 */ 4061 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 4062 4063 task_barrier_enter(&hive->tb); 4064 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 4065 4066 if (adev->asic_reset_res) 4067 goto fail; 4068 4069 task_barrier_exit(&hive->tb); 4070 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 4071 4072 if (adev->asic_reset_res) 4073 goto fail; 4074 4075 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 4076 } else { 4077 4078 task_barrier_full(&hive->tb); 4079 adev->asic_reset_res = amdgpu_asic_reset(adev); 4080 } 4081 4082 fail: 4083 if (adev->asic_reset_res) 4084 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 4085 adev->asic_reset_res, adev_to_drm(adev)->unique); 4086 amdgpu_put_xgmi_hive(hive); 4087 } 4088 4089 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 4090 { 4091 char *input = amdgpu_lockup_timeout; 4092 char *timeout_setting = NULL; 4093 int index = 0; 4094 long timeout; 4095 int ret = 0; 4096 4097 /* 4098 * By default timeout for non compute jobs is 10000 4099 * and 60000 for compute jobs. 4100 * In SR-IOV or passthrough mode, timeout for compute 4101 * jobs are 60000 by default. 4102 */ 4103 adev->gfx_timeout = msecs_to_jiffies(10000); 4104 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4105 if (amdgpu_sriov_vf(adev)) 4106 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
4107 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 4108 else 4109 adev->compute_timeout = msecs_to_jiffies(60000); 4110 4111 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4112 while ((timeout_setting = strsep(&input, ",")) && 4113 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4114 ret = kstrtol(timeout_setting, 0, &timeout); 4115 if (ret) 4116 return ret; 4117 4118 if (timeout == 0) { 4119 index++; 4120 continue; 4121 } else if (timeout < 0) { 4122 timeout = MAX_SCHEDULE_TIMEOUT; 4123 dev_warn(adev->dev, "lockup timeout disabled"); 4124 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 4125 } else { 4126 timeout = msecs_to_jiffies(timeout); 4127 } 4128 4129 switch (index++) { 4130 case 0: 4131 adev->gfx_timeout = timeout; 4132 break; 4133 case 1: 4134 adev->compute_timeout = timeout; 4135 break; 4136 case 2: 4137 adev->sdma_timeout = timeout; 4138 break; 4139 case 3: 4140 adev->video_timeout = timeout; 4141 break; 4142 default: 4143 break; 4144 } 4145 } 4146 /* 4147 * There is only one value specified and 4148 * it should apply to all non-compute jobs. 4149 */ 4150 if (index == 1) { 4151 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4152 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 4153 adev->compute_timeout = adev->gfx_timeout; 4154 } 4155 } 4156 4157 return ret; 4158 } 4159 4160 /** 4161 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 4162 * 4163 * @adev: amdgpu_device pointer 4164 * 4165 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 4166 */ 4167 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 4168 { 4169 struct iommu_domain *domain; 4170 4171 domain = iommu_get_domain_for_dev(adev->dev); 4172 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 4173 adev->ram_is_direct_mapped = true; 4174 } 4175 4176 #if defined(CONFIG_HSA_AMD_P2P) 4177 /** 4178 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled. 4179 * 4180 * @adev: amdgpu_device pointer 4181 * 4182 * return if IOMMU remapping bar address 4183 */ 4184 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev) 4185 { 4186 struct iommu_domain *domain; 4187 4188 domain = iommu_get_domain_for_dev(adev->dev); 4189 if (domain && (domain->type == IOMMU_DOMAIN_DMA || 4190 domain->type == IOMMU_DOMAIN_DMA_FQ)) 4191 return true; 4192 4193 return false; 4194 } 4195 #endif 4196 4197 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 4198 { 4199 if (amdgpu_mcbp == 1) 4200 adev->gfx.mcbp = true; 4201 else if (amdgpu_mcbp == 0) 4202 adev->gfx.mcbp = false; 4203 4204 if (amdgpu_sriov_vf(adev)) 4205 adev->gfx.mcbp = true; 4206 4207 if (adev->gfx.mcbp) 4208 DRM_INFO("MCBP is enabled\n"); 4209 } 4210 4211 /** 4212 * amdgpu_device_init - initialize the driver 4213 * 4214 * @adev: amdgpu_device pointer 4215 * @flags: driver flags 4216 * 4217 * Initializes the driver info and hw (all asics). 4218 * Returns 0 for success or an error on failure. 4219 * Called at driver startup. 
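 * The setup below covers everything from mutex/work initialization and MMIO
 * mapping through IP block init, fence driver init and sysfs registration; on
 * failure the error paths at the end of the function drop SR-IOV full-GPU
 * access and the RAS context as needed before returning.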
4220 */ 4221 int amdgpu_device_init(struct amdgpu_device *adev, 4222 uint32_t flags) 4223 { 4224 struct drm_device *ddev = adev_to_drm(adev); 4225 struct pci_dev *pdev = adev->pdev; 4226 int r, i; 4227 bool px = false; 4228 u32 max_MBps; 4229 int tmp; 4230 4231 adev->shutdown = false; 4232 adev->flags = flags; 4233 4234 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 4235 adev->asic_type = amdgpu_force_asic_type; 4236 else 4237 adev->asic_type = flags & AMD_ASIC_MASK; 4238 4239 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 4240 if (amdgpu_emu_mode == 1) 4241 adev->usec_timeout *= 10; 4242 adev->gmc.gart_size = 512 * 1024 * 1024; 4243 adev->accel_working = false; 4244 adev->num_rings = 0; 4245 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 4246 adev->mman.buffer_funcs = NULL; 4247 adev->mman.buffer_funcs_ring = NULL; 4248 adev->vm_manager.vm_pte_funcs = NULL; 4249 adev->vm_manager.vm_pte_num_scheds = 0; 4250 adev->gmc.gmc_funcs = NULL; 4251 adev->harvest_ip_mask = 0x0; 4252 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 4253 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 4254 4255 adev->smc_rreg = &amdgpu_invalid_rreg; 4256 adev->smc_wreg = &amdgpu_invalid_wreg; 4257 adev->pcie_rreg = &amdgpu_invalid_rreg; 4258 adev->pcie_wreg = &amdgpu_invalid_wreg; 4259 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 4260 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 4261 adev->pciep_rreg = &amdgpu_invalid_rreg; 4262 adev->pciep_wreg = &amdgpu_invalid_wreg; 4263 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 4264 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 4265 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 4266 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 4267 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 4268 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 4269 adev->didt_rreg = &amdgpu_invalid_rreg; 4270 adev->didt_wreg = &amdgpu_invalid_wreg; 4271 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 4272 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 4273 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 4274 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 4275 4276 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 4277 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 4278 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 4279 4280 /* mutex initialization are all done here so we 4281 * can recall function without having locking issues 4282 */ 4283 mutex_init(&adev->firmware.mutex); 4284 mutex_init(&adev->pm.mutex); 4285 mutex_init(&adev->gfx.gpu_clock_mutex); 4286 mutex_init(&adev->srbm_mutex); 4287 mutex_init(&adev->gfx.pipe_reserve_mutex); 4288 mutex_init(&adev->gfx.gfx_off_mutex); 4289 mutex_init(&adev->gfx.partition_mutex); 4290 mutex_init(&adev->grbm_idx_mutex); 4291 mutex_init(&adev->mn_lock); 4292 mutex_init(&adev->virt.vf_errors.lock); 4293 hash_init(adev->mn_hash); 4294 mutex_init(&adev->psp.mutex); 4295 mutex_init(&adev->notifier_lock); 4296 mutex_init(&adev->pm.stable_pstate_ctx_lock); 4297 mutex_init(&adev->benchmark_mutex); 4298 mutex_init(&adev->gfx.reset_sem_mutex); 4299 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */ 4300 mutex_init(&adev->enforce_isolation_mutex); 4301 for (i = 0; i < MAX_XCP; ++i) { 4302 adev->isolation[i].spearhead = dma_fence_get_stub(); 4303 amdgpu_sync_create(&adev->isolation[i].active); 4304 amdgpu_sync_create(&adev->isolation[i].prev); 4305 } 4306 mutex_init(&adev->gfx.kfd_sch_mutex); 
4307 mutex_init(&adev->gfx.workload_profile_mutex); 4308 mutex_init(&adev->vcn.workload_profile_mutex); 4309 4310 amdgpu_device_init_apu_flags(adev); 4311 4312 r = amdgpu_device_check_arguments(adev); 4313 if (r) 4314 return r; 4315 4316 spin_lock_init(&adev->mmio_idx_lock); 4317 spin_lock_init(&adev->smc_idx_lock); 4318 spin_lock_init(&adev->pcie_idx_lock); 4319 spin_lock_init(&adev->uvd_ctx_idx_lock); 4320 spin_lock_init(&adev->didt_idx_lock); 4321 spin_lock_init(&adev->gc_cac_idx_lock); 4322 spin_lock_init(&adev->se_cac_idx_lock); 4323 spin_lock_init(&adev->audio_endpt_idx_lock); 4324 spin_lock_init(&adev->mm_stats.lock); 4325 spin_lock_init(&adev->virt.rlcg_reg_lock); 4326 spin_lock_init(&adev->wb.lock); 4327 4328 INIT_LIST_HEAD(&adev->reset_list); 4329 4330 INIT_LIST_HEAD(&adev->ras_list); 4331 4332 INIT_LIST_HEAD(&adev->pm.od_kobj_list); 4333 4334 INIT_DELAYED_WORK(&adev->delayed_init_work, 4335 amdgpu_device_delayed_init_work_handler); 4336 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 4337 amdgpu_device_delay_enable_gfx_off); 4338 /* 4339 * Initialize the enforce_isolation work structures for each XCP 4340 * partition. This work handler is responsible for enforcing shader 4341 * isolation on AMD GPUs. It counts the number of emitted fences for 4342 * each GFX and compute ring. If there are any fences, it schedules 4343 * the `enforce_isolation_work` to be run after a delay. If there are 4344 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the 4345 * runqueue. 4346 */ 4347 for (i = 0; i < MAX_XCP; i++) { 4348 INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work, 4349 amdgpu_gfx_enforce_isolation_handler); 4350 adev->gfx.enforce_isolation[i].adev = adev; 4351 adev->gfx.enforce_isolation[i].xcp_id = i; 4352 } 4353 4354 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 4355 4356 adev->gfx.gfx_off_req_count = 1; 4357 adev->gfx.gfx_off_residency = 0; 4358 adev->gfx.gfx_off_entrycount = 0; 4359 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 4360 4361 atomic_set(&adev->throttling_logging_enabled, 1); 4362 /* 4363 * If throttling continues, logging will be performed every minute 4364 * to avoid log flooding. "-1" is subtracted since the thermal 4365 * throttling interrupt comes every second. Thus, the total logging 4366 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting 4367 * for throttling interrupt) = 60 seconds.
4368 */ 4369 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 4370 4371 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 4372 4373 /* Registers mapping */ 4374 /* TODO: block userspace mapping of io register */ 4375 if (adev->asic_type >= CHIP_BONAIRE) { 4376 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 4377 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 4378 } else { 4379 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 4380 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 4381 } 4382 4383 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 4384 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 4385 4386 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 4387 if (!adev->rmmio) 4388 return -ENOMEM; 4389 4390 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 4391 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size); 4392 4393 /* 4394 * The reset domain needs to be present early, before the XGMI hive is 4395 * discovered (if any) and initialized, so the reset sem and in_gpu reset 4396 * flag can be used early on during init and before calling RREG32. 4397 */ 4398 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 4399 if (!adev->reset_domain) 4400 return -ENOMEM; 4401 4402 /* detect hw virtualization here */ 4403 amdgpu_virt_init(adev); 4404 4405 amdgpu_device_get_pcie_info(adev); 4406 4407 r = amdgpu_device_get_job_timeout_settings(adev); 4408 if (r) { 4409 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 4410 return r; 4411 } 4412 4413 amdgpu_device_set_mcbp(adev); 4414 4415 /* 4416 * By default, use the init level where all blocks are expected to be 4417 * initialized. At present a 'swinit' of blocks is required to be 4418 * completed before the need for a different level is detected. 4419 */ 4420 amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT); 4421 /* early init functions */ 4422 r = amdgpu_device_ip_early_init(adev); 4423 if (r) 4424 return r; 4425 4426 /* 4427 * No need to remove conflicting FBs for non-display class devices. 4428 * This prevents the sysfb from being freed accidentally. 4429 */ 4430 if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA || 4431 (pdev->class >> 8) == PCI_CLASS_DISPLAY_OTHER) { 4432 /* Get rid of things like offb */ 4433 r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name); 4434 if (r) 4435 return r; 4436 } 4437 4438 /* Enable TMZ based on IP_VERSION */ 4439 amdgpu_gmc_tmz_set(adev); 4440 4441 if (amdgpu_sriov_vf(adev) && 4442 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0)) 4443 /* VF MMIO access (except mailbox range) from CPU 4444 * will be blocked during sriov runtime 4445 */ 4446 adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT; 4447 4448 amdgpu_gmc_noretry_set(adev); 4449 /* Need to get xgmi info early to decide the reset behavior */ 4450 if (adev->gmc.xgmi.supported) { 4451 r = adev->gfxhub.funcs->get_xgmi_info(adev); 4452 if (r) 4453 return r; 4454 } 4455 4456 /* enable PCIE atomic ops */ 4457 if (amdgpu_sriov_vf(adev)) { 4458 if (adev->virt.fw_reserve.p_pf2vf) 4459 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 4460 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 4461 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4462 /* APUs with gfx9 onwards don't rely on PCIe atomics; an internal 4463 * path natively supports atomics, so set have_atomics_support to true.
4464 */ 4465 } else if ((adev->flags & AMD_IS_APU) && 4466 (amdgpu_ip_version(adev, GC_HWIP, 0) > 4467 IP_VERSION(9, 0, 0))) { 4468 adev->have_atomics_support = true; 4469 } else { 4470 adev->have_atomics_support = 4471 !pci_enable_atomic_ops_to_root(adev->pdev, 4472 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 4473 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4474 } 4475 4476 if (!adev->have_atomics_support) 4477 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 4478 4479 /* doorbell bar mapping and doorbell index init*/ 4480 amdgpu_doorbell_init(adev); 4481 4482 if (amdgpu_emu_mode == 1) { 4483 /* post the asic on emulation mode */ 4484 emu_soc_asic_init(adev); 4485 goto fence_driver_init; 4486 } 4487 4488 amdgpu_reset_init(adev); 4489 4490 /* detect if we are with an SRIOV vbios */ 4491 if (adev->bios) 4492 amdgpu_device_detect_sriov_bios(adev); 4493 4494 /* check if we need to reset the asic 4495 * E.g., driver was not cleanly unloaded previously, etc. 4496 */ 4497 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4498 if (adev->gmc.xgmi.num_physical_nodes) { 4499 dev_info(adev->dev, "Pending hive reset.\n"); 4500 amdgpu_set_init_level(adev, 4501 AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 4502 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && 4503 !amdgpu_device_has_display_hardware(adev)) { 4504 r = psp_gpu_reset(adev); 4505 } else { 4506 tmp = amdgpu_reset_method; 4507 /* It should do a default reset when loading or reloading the driver, 4508 * regardless of the module parameter reset_method. 4509 */ 4510 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4511 r = amdgpu_asic_reset(adev); 4512 amdgpu_reset_method = tmp; 4513 } 4514 4515 if (r) { 4516 dev_err(adev->dev, "asic reset on init failed\n"); 4517 goto failed; 4518 } 4519 } 4520 4521 /* Post card if necessary */ 4522 if (amdgpu_device_need_post(adev)) { 4523 if (!adev->bios) { 4524 dev_err(adev->dev, "no vBIOS found\n"); 4525 r = -EINVAL; 4526 goto failed; 4527 } 4528 DRM_INFO("GPU posting now...\n"); 4529 r = amdgpu_device_asic_init(adev); 4530 if (r) { 4531 dev_err(adev->dev, "gpu post error!\n"); 4532 goto failed; 4533 } 4534 } 4535 4536 if (adev->bios) { 4537 if (adev->is_atom_fw) { 4538 /* Initialize clocks */ 4539 r = amdgpu_atomfirmware_get_clock_info(adev); 4540 if (r) { 4541 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4542 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4543 goto failed; 4544 } 4545 } else { 4546 /* Initialize clocks */ 4547 r = amdgpu_atombios_get_clock_info(adev); 4548 if (r) { 4549 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4550 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4551 goto failed; 4552 } 4553 /* init i2c buses */ 4554 amdgpu_i2c_init(adev); 4555 } 4556 } 4557 4558 fence_driver_init: 4559 /* Fence driver */ 4560 r = amdgpu_fence_driver_sw_init(adev); 4561 if (r) { 4562 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4563 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4564 goto failed; 4565 } 4566 4567 /* init the mode config */ 4568 drm_mode_config_init(adev_to_drm(adev)); 4569 4570 r = amdgpu_device_ip_init(adev); 4571 if (r) { 4572 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4573 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4574 goto release_ras_con; 4575 } 4576 4577 amdgpu_fence_driver_hw_init(adev); 4578 4579 dev_info(adev->dev, 4580 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4581 
adev->gfx.config.max_shader_engines, 4582 adev->gfx.config.max_sh_per_se, 4583 adev->gfx.config.max_cu_per_sh, 4584 adev->gfx.cu_info.number); 4585 4586 adev->accel_working = true; 4587 4588 amdgpu_vm_check_compute_bug(adev); 4589 4590 /* Initialize the buffer migration limit. */ 4591 if (amdgpu_moverate >= 0) 4592 max_MBps = amdgpu_moverate; 4593 else 4594 max_MBps = 8; /* Allow 8 MB/s. */ 4595 /* Get a log2 for easy divisions. */ 4596 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4597 4598 /* 4599 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4600 * Otherwise the mgpu fan boost feature will be skipped due to the 4601 * gpu instance is counted less. 4602 */ 4603 amdgpu_register_gpu_instance(adev); 4604 4605 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4606 * explicit gating rather than handling it automatically. 4607 */ 4608 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 4609 r = amdgpu_device_ip_late_init(adev); 4610 if (r) { 4611 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4612 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4613 goto release_ras_con; 4614 } 4615 /* must succeed. */ 4616 amdgpu_ras_resume(adev); 4617 queue_delayed_work(system_wq, &adev->delayed_init_work, 4618 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4619 } 4620 4621 if (amdgpu_sriov_vf(adev)) { 4622 amdgpu_virt_release_full_gpu(adev, true); 4623 flush_delayed_work(&adev->delayed_init_work); 4624 } 4625 4626 /* 4627 * Place those sysfs registering after `late_init`. As some of those 4628 * operations performed in `late_init` might affect the sysfs 4629 * interfaces creating. 4630 */ 4631 r = amdgpu_atombios_sysfs_init(adev); 4632 if (r) 4633 drm_err(&adev->ddev, 4634 "registering atombios sysfs failed (%d).\n", r); 4635 4636 r = amdgpu_pm_sysfs_init(adev); 4637 if (r) 4638 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 4639 4640 r = amdgpu_ucode_sysfs_init(adev); 4641 if (r) { 4642 adev->ucode_sysfs_en = false; 4643 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 4644 } else 4645 adev->ucode_sysfs_en = true; 4646 4647 r = amdgpu_device_attr_sysfs_init(adev); 4648 if (r) 4649 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4650 4651 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4652 if (r) 4653 dev_err(adev->dev, 4654 "Could not create amdgpu board attributes\n"); 4655 4656 amdgpu_fru_sysfs_init(adev); 4657 amdgpu_reg_state_sysfs_init(adev); 4658 amdgpu_xcp_cfg_sysfs_init(adev); 4659 4660 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4661 r = amdgpu_pmu_init(adev); 4662 if (r) 4663 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4664 4665 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4666 if (amdgpu_device_cache_pci_state(adev->pdev)) 4667 pci_restore_state(pdev); 4668 4669 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4670 /* this will fail for cards that aren't VGA class devices, just 4671 * ignore it 4672 */ 4673 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4674 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4675 4676 px = amdgpu_device_supports_px(ddev); 4677 4678 if (px || (!dev_is_removable(&adev->pdev->dev) && 4679 apple_gmux_detect(NULL, NULL))) 4680 vga_switcheroo_register_client(adev->pdev, 4681 &amdgpu_switcheroo_ops, px); 4682 4683 if (px) 4684 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4685 4686 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 4687 
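		/* only a minimal IP init was done for a pending hive reset; reset the
		 * whole XGMI hive now that this device has reached this point
		 */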
amdgpu_xgmi_reset_on_init(adev); 4688 4689 amdgpu_device_check_iommu_direct_map(adev); 4690 4691 adev->pm_nb.notifier_call = amdgpu_device_pm_notifier; 4692 r = register_pm_notifier(&adev->pm_nb); 4693 if (r) 4694 goto failed; 4695 4696 return 0; 4697 4698 release_ras_con: 4699 if (amdgpu_sriov_vf(adev)) 4700 amdgpu_virt_release_full_gpu(adev, true); 4701 4702 /* failed in exclusive mode due to timeout */ 4703 if (amdgpu_sriov_vf(adev) && 4704 !amdgpu_sriov_runtime(adev) && 4705 amdgpu_virt_mmio_blocked(adev) && 4706 !amdgpu_virt_wait_reset(adev)) { 4707 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4708 /* Don't send request since VF is inactive. */ 4709 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4710 adev->virt.ops = NULL; 4711 r = -EAGAIN; 4712 } 4713 amdgpu_release_ras_context(adev); 4714 4715 failed: 4716 amdgpu_vf_error_trans_all(adev); 4717 4718 return r; 4719 } 4720 4721 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4722 { 4723 4724 /* Clear all CPU mappings pointing to this device */ 4725 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4726 4727 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4728 amdgpu_doorbell_fini(adev); 4729 4730 iounmap(adev->rmmio); 4731 adev->rmmio = NULL; 4732 if (adev->mman.aper_base_kaddr) 4733 iounmap(adev->mman.aper_base_kaddr); 4734 adev->mman.aper_base_kaddr = NULL; 4735 4736 /* Memory manager related */ 4737 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4738 arch_phys_wc_del(adev->gmc.vram_mtrr); 4739 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4740 } 4741 } 4742 4743 /** 4744 * amdgpu_device_fini_hw - tear down the driver 4745 * 4746 * @adev: amdgpu_device pointer 4747 * 4748 * Tear down the driver info (all asics). 4749 * Called at driver shutdown. 
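 * This stage quiesces the hardware (interrupts, displays, fences); the
 * remaining software state is torn down later in amdgpu_device_fini_sw().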
4750 */ 4751 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4752 { 4753 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4754 flush_delayed_work(&adev->delayed_init_work); 4755 4756 if (adev->mman.initialized) 4757 drain_workqueue(adev->mman.bdev.wq); 4758 adev->shutdown = true; 4759 4760 unregister_pm_notifier(&adev->pm_nb); 4761 4762 /* make sure IB test finished before entering exclusive mode 4763 * to avoid preemption on IB test 4764 */ 4765 if (amdgpu_sriov_vf(adev)) { 4766 amdgpu_virt_request_full_gpu(adev, false); 4767 amdgpu_virt_fini_data_exchange(adev); 4768 } 4769 4770 /* disable all interrupts */ 4771 amdgpu_irq_disable_all(adev); 4772 if (adev->mode_info.mode_config_initialized) { 4773 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4774 drm_helper_force_disable_all(adev_to_drm(adev)); 4775 else 4776 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4777 } 4778 amdgpu_fence_driver_hw_fini(adev); 4779 4780 if (adev->pm.sysfs_initialized) 4781 amdgpu_pm_sysfs_fini(adev); 4782 if (adev->ucode_sysfs_en) 4783 amdgpu_ucode_sysfs_fini(adev); 4784 amdgpu_device_attr_sysfs_fini(adev); 4785 amdgpu_fru_sysfs_fini(adev); 4786 4787 amdgpu_reg_state_sysfs_fini(adev); 4788 amdgpu_xcp_cfg_sysfs_fini(adev); 4789 4790 /* disable ras feature must before hw fini */ 4791 amdgpu_ras_pre_fini(adev); 4792 4793 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4794 4795 amdgpu_device_ip_fini_early(adev); 4796 4797 amdgpu_irq_fini_hw(adev); 4798 4799 if (adev->mman.initialized) 4800 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4801 4802 amdgpu_gart_dummy_page_fini(adev); 4803 4804 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4805 amdgpu_device_unmap_mmio(adev); 4806 4807 } 4808 4809 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4810 { 4811 int i, idx; 4812 bool px; 4813 4814 amdgpu_device_ip_fini(adev); 4815 amdgpu_fence_driver_sw_fini(adev); 4816 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4817 adev->accel_working = false; 4818 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4819 for (i = 0; i < MAX_XCP; ++i) { 4820 dma_fence_put(adev->isolation[i].spearhead); 4821 amdgpu_sync_free(&adev->isolation[i].active); 4822 amdgpu_sync_free(&adev->isolation[i].prev); 4823 } 4824 4825 amdgpu_reset_fini(adev); 4826 4827 /* free i2c buses */ 4828 amdgpu_i2c_fini(adev); 4829 4830 if (adev->bios) { 4831 if (amdgpu_emu_mode != 1) 4832 amdgpu_atombios_fini(adev); 4833 amdgpu_bios_release(adev); 4834 } 4835 4836 kfree(adev->fru_info); 4837 adev->fru_info = NULL; 4838 4839 kfree(adev->xcp_mgr); 4840 adev->xcp_mgr = NULL; 4841 4842 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4843 4844 if (px || (!dev_is_removable(&adev->pdev->dev) && 4845 apple_gmux_detect(NULL, NULL))) 4846 vga_switcheroo_unregister_client(adev->pdev); 4847 4848 if (px) 4849 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4850 4851 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4852 vga_client_unregister(adev->pdev); 4853 4854 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4855 4856 iounmap(adev->rmmio); 4857 adev->rmmio = NULL; 4858 amdgpu_doorbell_fini(adev); 4859 drm_dev_exit(idx); 4860 } 4861 4862 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4863 amdgpu_pmu_fini(adev); 4864 if (adev->mman.discovery_bin) 4865 amdgpu_discovery_fini(adev); 4866 4867 amdgpu_reset_put_reset_domain(adev->reset_domain); 4868 adev->reset_domain = NULL; 4869 4870 kfree(adev->pci_state); 4871 4872 } 4873 4874 /** 4875 * amdgpu_device_evict_resources - evict device resources 4876 * @adev: amdgpu device object 4877 * 4878 * 
Evicts all ttm device resources(vram BOs, gart table) from the lru list 4879 * of the vram memory type. Mainly used for evicting device resources 4880 * at suspend time. 4881 * 4882 */ 4883 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4884 { 4885 int ret; 4886 4887 /* No need to evict vram on APUs unless going to S4 */ 4888 if (!adev->in_s4 && (adev->flags & AMD_IS_APU)) 4889 return 0; 4890 4891 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4892 if (ret) 4893 DRM_WARN("evicting device resources failed\n"); 4894 return ret; 4895 } 4896 4897 /* 4898 * Suspend & resume. 4899 */ 4900 /** 4901 * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events 4902 * @nb: notifier block 4903 * @mode: suspend mode 4904 * @data: data 4905 * 4906 * This function is called when the system is about to suspend or hibernate. 4907 * It is used to evict resources from the device before the system goes to 4908 * sleep while there is still access to swap. 4909 */ 4910 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 4911 void *data) 4912 { 4913 struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb); 4914 int r; 4915 4916 switch (mode) { 4917 case PM_HIBERNATION_PREPARE: 4918 adev->in_s4 = true; 4919 fallthrough; 4920 case PM_SUSPEND_PREPARE: 4921 r = amdgpu_device_evict_resources(adev); 4922 /* 4923 * This is considered non-fatal at this time because 4924 * amdgpu_device_prepare() will also fatally evict resources. 4925 * See https://gitlab.freedesktop.org/drm/amd/-/issues/3781 4926 */ 4927 if (r) 4928 drm_warn(adev_to_drm(adev), "Failed to evict resources, freeze active processes if problems occur: %d\n", r); 4929 break; 4930 } 4931 4932 return NOTIFY_DONE; 4933 } 4934 4935 /** 4936 * amdgpu_device_prepare - prepare for device suspend 4937 * 4938 * @dev: drm dev pointer 4939 * 4940 * Prepare to put the hw in the suspend state (all asics). 4941 * Returns 0 for success or an error on failure. 4942 * Called at driver suspend. 4943 */ 4944 int amdgpu_device_prepare(struct drm_device *dev) 4945 { 4946 struct amdgpu_device *adev = drm_to_adev(dev); 4947 int i, r; 4948 4949 amdgpu_choose_low_power_state(adev); 4950 4951 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4952 return 0; 4953 4954 /* Evict the majority of BOs before starting suspend sequence */ 4955 r = amdgpu_device_evict_resources(adev); 4956 if (r) 4957 goto unprepare; 4958 4959 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 4960 4961 for (i = 0; i < adev->num_ip_blocks; i++) { 4962 if (!adev->ip_blocks[i].status.valid) 4963 continue; 4964 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 4965 continue; 4966 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]); 4967 if (r) 4968 goto unprepare; 4969 } 4970 4971 return 0; 4972 4973 unprepare: 4974 adev->in_s0ix = adev->in_s3 = adev->in_s4 = false; 4975 4976 return r; 4977 } 4978 4979 /** 4980 * amdgpu_device_suspend - initiate device suspend 4981 * 4982 * @dev: drm dev pointer 4983 * @notify_clients: notify in-kernel DRM clients 4984 * 4985 * Puts the hw in the suspend state (all asics). 4986 * Returns 0 for success or an error on failure. 4987 * Called at driver suspend. 
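 * Display IPs are suspended first (phase 1); KFD is then suspended (outside
 * of S0ix) and resources are evicted before the remaining IPs are suspended
 * (phase 2).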
4988 */ 4989 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) 4990 { 4991 struct amdgpu_device *adev = drm_to_adev(dev); 4992 int r = 0; 4993 4994 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4995 return 0; 4996 4997 adev->in_suspend = true; 4998 4999 if (amdgpu_sriov_vf(adev)) { 5000 amdgpu_virt_fini_data_exchange(adev); 5001 r = amdgpu_virt_request_full_gpu(adev, false); 5002 if (r) 5003 return r; 5004 } 5005 5006 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 5007 DRM_WARN("smart shift update failed\n"); 5008 5009 if (notify_clients) 5010 drm_client_dev_suspend(adev_to_drm(adev), false); 5011 5012 cancel_delayed_work_sync(&adev->delayed_init_work); 5013 5014 amdgpu_ras_suspend(adev); 5015 5016 amdgpu_device_ip_suspend_phase1(adev); 5017 5018 if (!adev->in_s0ix) 5019 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 5020 5021 r = amdgpu_device_evict_resources(adev); 5022 if (r) 5023 return r; 5024 5025 amdgpu_ttm_set_buffer_funcs_status(adev, false); 5026 5027 amdgpu_fence_driver_hw_fini(adev); 5028 5029 amdgpu_device_ip_suspend_phase2(adev); 5030 5031 if (amdgpu_sriov_vf(adev)) 5032 amdgpu_virt_release_full_gpu(adev, false); 5033 5034 r = amdgpu_dpm_notify_rlc_state(adev, false); 5035 if (r) 5036 return r; 5037 5038 return 0; 5039 } 5040 5041 /** 5042 * amdgpu_device_resume - initiate device resume 5043 * 5044 * @dev: drm dev pointer 5045 * @notify_clients: notify in-kernel DRM clients 5046 * 5047 * Bring the hw back to operating state (all asics). 5048 * Returns 0 for success or an error on failure. 5049 * Called at driver resume. 5050 */ 5051 int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) 5052 { 5053 struct amdgpu_device *adev = drm_to_adev(dev); 5054 int r = 0; 5055 5056 if (amdgpu_sriov_vf(adev)) { 5057 r = amdgpu_virt_request_full_gpu(adev, true); 5058 if (r) 5059 return r; 5060 } 5061 5062 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5063 return 0; 5064 5065 if (adev->in_s0ix) 5066 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 5067 5068 /* post card */ 5069 if (amdgpu_device_need_post(adev)) { 5070 r = amdgpu_device_asic_init(adev); 5071 if (r) 5072 dev_err(adev->dev, "amdgpu asic init failed\n"); 5073 } 5074 5075 r = amdgpu_device_ip_resume(adev); 5076 5077 if (r) { 5078 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 5079 goto exit; 5080 } 5081 5082 if (!adev->in_s0ix) { 5083 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 5084 if (r) 5085 goto exit; 5086 } 5087 5088 r = amdgpu_device_ip_late_init(adev); 5089 if (r) 5090 goto exit; 5091 5092 queue_delayed_work(system_wq, &adev->delayed_init_work, 5093 msecs_to_jiffies(AMDGPU_RESUME_MS)); 5094 exit: 5095 if (amdgpu_sriov_vf(adev)) { 5096 amdgpu_virt_init_data_exchange(adev); 5097 amdgpu_virt_release_full_gpu(adev, true); 5098 } 5099 5100 if (r) 5101 return r; 5102 5103 /* Make sure IB tests flushed */ 5104 flush_delayed_work(&adev->delayed_init_work); 5105 5106 if (notify_clients) 5107 drm_client_dev_resume(adev_to_drm(adev), false); 5108 5109 amdgpu_ras_resume(adev); 5110 5111 if (adev->mode_info.num_crtc) { 5112 /* 5113 * Most of the connector probing functions try to acquire runtime pm 5114 * refs to ensure that the GPU is powered on when connector polling is 5115 * performed. Since we're calling this from a runtime PM callback, 5116 * trying to acquire rpm refs will cause us to deadlock. 
5117 * 5118 * Since we're guaranteed to be holding the rpm lock, it's safe to 5119 * temporarily disable the rpm helpers so this doesn't deadlock us. 5120 */ 5121 #ifdef CONFIG_PM 5122 dev->dev->power.disable_depth++; 5123 #endif 5124 if (!adev->dc_enabled) 5125 drm_helper_hpd_irq_event(dev); 5126 else 5127 drm_kms_helper_hotplug_event(dev); 5128 #ifdef CONFIG_PM 5129 dev->dev->power.disable_depth--; 5130 #endif 5131 } 5132 adev->in_suspend = false; 5133 5134 if (adev->enable_mes) 5135 amdgpu_mes_self_test(adev); 5136 5137 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 5138 DRM_WARN("smart shift update failed\n"); 5139 5140 return 0; 5141 } 5142 5143 /** 5144 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 5145 * 5146 * @adev: amdgpu_device pointer 5147 * 5148 * The list of all the hardware IPs that make up the asic is walked and 5149 * the check_soft_reset callbacks are run. check_soft_reset determines 5150 * if the asic is still hung or not. 5151 * Returns true if any of the IPs are still in a hung state, false if not. 5152 */ 5153 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 5154 { 5155 int i; 5156 bool asic_hang = false; 5157 5158 if (amdgpu_sriov_vf(adev)) 5159 return true; 5160 5161 if (amdgpu_asic_need_full_reset(adev)) 5162 return true; 5163 5164 for (i = 0; i < adev->num_ip_blocks; i++) { 5165 if (!adev->ip_blocks[i].status.valid) 5166 continue; 5167 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 5168 adev->ip_blocks[i].status.hang = 5169 adev->ip_blocks[i].version->funcs->check_soft_reset( 5170 &adev->ip_blocks[i]); 5171 if (adev->ip_blocks[i].status.hang) { 5172 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 5173 asic_hang = true; 5174 } 5175 } 5176 return asic_hang; 5177 } 5178 5179 /** 5180 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 5181 * 5182 * @adev: amdgpu_device pointer 5183 * 5184 * The list of all the hardware IPs that make up the asic is walked and the 5185 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 5186 * handles any IP specific hardware or software state changes that are 5187 * necessary for a soft reset to succeed. 5188 * Returns 0 on success, negative error code on failure. 5189 */ 5190 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 5191 { 5192 int i, r = 0; 5193 5194 for (i = 0; i < adev->num_ip_blocks; i++) { 5195 if (!adev->ip_blocks[i].status.valid) 5196 continue; 5197 if (adev->ip_blocks[i].status.hang && 5198 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 5199 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]); 5200 if (r) 5201 return r; 5202 } 5203 } 5204 5205 return 0; 5206 } 5207 5208 /** 5209 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 5210 * 5211 * @adev: amdgpu_device pointer 5212 * 5213 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 5214 * reset is necessary to recover. 5215 * Returns true if a full asic reset is required, false if not. 
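 * GMC, SMC, ACP, DCE and PSP are treated as such blocks below.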
5216 */ 5217 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 5218 { 5219 int i; 5220 5221 if (amdgpu_asic_need_full_reset(adev)) 5222 return true; 5223 5224 for (i = 0; i < adev->num_ip_blocks; i++) { 5225 if (!adev->ip_blocks[i].status.valid) 5226 continue; 5227 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 5228 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 5229 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 5230 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 5231 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 5232 if (adev->ip_blocks[i].status.hang) { 5233 dev_info(adev->dev, "Some block need full reset!\n"); 5234 return true; 5235 } 5236 } 5237 } 5238 return false; 5239 } 5240 5241 /** 5242 * amdgpu_device_ip_soft_reset - do a soft reset 5243 * 5244 * @adev: amdgpu_device pointer 5245 * 5246 * The list of all the hardware IPs that make up the asic is walked and the 5247 * soft_reset callbacks are run if the block is hung. soft_reset handles any 5248 * IP specific hardware or software state changes that are necessary to soft 5249 * reset the IP. 5250 * Returns 0 on success, negative error code on failure. 5251 */ 5252 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 5253 { 5254 int i, r = 0; 5255 5256 for (i = 0; i < adev->num_ip_blocks; i++) { 5257 if (!adev->ip_blocks[i].status.valid) 5258 continue; 5259 if (adev->ip_blocks[i].status.hang && 5260 adev->ip_blocks[i].version->funcs->soft_reset) { 5261 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]); 5262 if (r) 5263 return r; 5264 } 5265 } 5266 5267 return 0; 5268 } 5269 5270 /** 5271 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 5272 * 5273 * @adev: amdgpu_device pointer 5274 * 5275 * The list of all the hardware IPs that make up the asic is walked and the 5276 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 5277 * handles any IP specific hardware or software state changes that are 5278 * necessary after the IP has been soft reset. 5279 * Returns 0 on success, negative error code on failure. 
5280 */ 5281 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 5282 { 5283 int i, r = 0; 5284 5285 for (i = 0; i < adev->num_ip_blocks; i++) { 5286 if (!adev->ip_blocks[i].status.valid) 5287 continue; 5288 if (adev->ip_blocks[i].status.hang && 5289 adev->ip_blocks[i].version->funcs->post_soft_reset) 5290 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]); 5291 if (r) 5292 return r; 5293 } 5294 5295 return 0; 5296 } 5297 5298 /** 5299 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 5300 * 5301 * @adev: amdgpu_device pointer 5302 * @reset_context: amdgpu reset context pointer 5303 * 5304 * do VF FLR and reinitialize Asic 5305 * return 0 means succeeded otherwise failed 5306 */ 5307 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 5308 struct amdgpu_reset_context *reset_context) 5309 { 5310 int r; 5311 struct amdgpu_hive_info *hive = NULL; 5312 5313 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { 5314 if (!amdgpu_ras_get_fed_status(adev)) 5315 amdgpu_virt_ready_to_reset(adev); 5316 amdgpu_virt_wait_reset(adev); 5317 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5318 r = amdgpu_virt_request_full_gpu(adev, true); 5319 } else { 5320 r = amdgpu_virt_reset_gpu(adev); 5321 } 5322 if (r) 5323 return r; 5324 5325 amdgpu_ras_clear_err_state(adev); 5326 amdgpu_irq_gpu_reset_resume_helper(adev); 5327 5328 /* some sw clean up VF needs to do before recover */ 5329 amdgpu_virt_post_reset(adev); 5330 5331 /* Resume IP prior to SMC */ 5332 r = amdgpu_device_ip_reinit_early_sriov(adev); 5333 if (r) 5334 return r; 5335 5336 amdgpu_virt_init_data_exchange(adev); 5337 5338 r = amdgpu_device_fw_loading(adev); 5339 if (r) 5340 return r; 5341 5342 /* now we are okay to resume SMC/CP/SDMA */ 5343 r = amdgpu_device_ip_reinit_late_sriov(adev); 5344 if (r) 5345 return r; 5346 5347 hive = amdgpu_get_xgmi_hive(adev); 5348 /* Update PSP FW topology after reset */ 5349 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5350 r = amdgpu_xgmi_update_topology(hive, adev); 5351 if (hive) 5352 amdgpu_put_xgmi_hive(hive); 5353 if (r) 5354 return r; 5355 5356 r = amdgpu_ib_ring_tests(adev); 5357 if (r) 5358 return r; 5359 5360 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) 5361 amdgpu_inc_vram_lost(adev); 5362 5363 /* need to be called during full access so we can't do it later like 5364 * bare-metal does. 5365 */ 5366 amdgpu_amdkfd_post_reset(adev); 5367 amdgpu_virt_release_full_gpu(adev, true); 5368 5369 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5370 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) || 5371 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 5372 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 5373 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) || 5374 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5375 amdgpu_ras_resume(adev); 5376 5377 amdgpu_virt_ras_telemetry_post_reset(adev); 5378 5379 return 0; 5380 } 5381 5382 /** 5383 * amdgpu_device_has_job_running - check if there is any unfinished job 5384 * 5385 * @adev: amdgpu_device pointer 5386 * 5387 * check if there is any job running on the device when guest driver receives 5388 * FLR notification from host driver. If there are still jobs running, then 5389 * the guest driver will not respond the FLR reset. Instead, let the job hit 5390 * the timeout and guest driver then issue the reset request. 
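 * Returns true if any scheduler-ready ring still has emitted fences pending.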
5391 */ 5392 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5393 { 5394 int i; 5395 5396 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5397 struct amdgpu_ring *ring = adev->rings[i]; 5398 5399 if (!amdgpu_ring_sched_ready(ring)) 5400 continue; 5401 5402 if (amdgpu_fence_count_emitted(ring)) 5403 return true; 5404 } 5405 return false; 5406 } 5407 5408 /** 5409 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5410 * 5411 * @adev: amdgpu_device pointer 5412 * 5413 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5414 * a hung GPU. 5415 */ 5416 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5417 { 5418 5419 if (amdgpu_gpu_recovery == 0) 5420 goto disabled; 5421 5422 /* Skip soft reset check in fatal error mode */ 5423 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5424 return true; 5425 5426 if (amdgpu_sriov_vf(adev)) 5427 return true; 5428 5429 if (amdgpu_gpu_recovery == -1) { 5430 switch (adev->asic_type) { 5431 #ifdef CONFIG_DRM_AMDGPU_SI 5432 case CHIP_VERDE: 5433 case CHIP_TAHITI: 5434 case CHIP_PITCAIRN: 5435 case CHIP_OLAND: 5436 case CHIP_HAINAN: 5437 #endif 5438 #ifdef CONFIG_DRM_AMDGPU_CIK 5439 case CHIP_KAVERI: 5440 case CHIP_KABINI: 5441 case CHIP_MULLINS: 5442 #endif 5443 case CHIP_CARRIZO: 5444 case CHIP_STONEY: 5445 case CHIP_CYAN_SKILLFISH: 5446 goto disabled; 5447 default: 5448 break; 5449 } 5450 } 5451 5452 return true; 5453 5454 disabled: 5455 dev_info(adev->dev, "GPU recovery disabled.\n"); 5456 return false; 5457 } 5458 5459 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5460 { 5461 u32 i; 5462 int ret = 0; 5463 5464 if (adev->bios) 5465 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5466 5467 dev_info(adev->dev, "GPU mode1 reset\n"); 5468 5469 /* Cache the state before bus master disable. The saved config space 5470 * values are used in other cases like restore after mode-2 reset. 
5471 */ 5472 amdgpu_device_cache_pci_state(adev->pdev); 5473 5474 /* disable BM */ 5475 pci_clear_master(adev->pdev); 5476 5477 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5478 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5479 ret = amdgpu_dpm_mode1_reset(adev); 5480 } else { 5481 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5482 ret = psp_gpu_reset(adev); 5483 } 5484 5485 if (ret) 5486 goto mode1_reset_failed; 5487 5488 amdgpu_device_load_pci_state(adev->pdev); 5489 ret = amdgpu_psp_wait_for_bootloader(adev); 5490 if (ret) 5491 goto mode1_reset_failed; 5492 5493 /* wait for asic to come out of reset */ 5494 for (i = 0; i < adev->usec_timeout; i++) { 5495 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5496 5497 if (memsize != 0xffffffff) 5498 break; 5499 udelay(1); 5500 } 5501 5502 if (i >= adev->usec_timeout) { 5503 ret = -ETIMEDOUT; 5504 goto mode1_reset_failed; 5505 } 5506 5507 if (adev->bios) 5508 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5509 5510 return 0; 5511 5512 mode1_reset_failed: 5513 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5514 return ret; 5515 } 5516 5517 int amdgpu_device_link_reset(struct amdgpu_device *adev) 5518 { 5519 int ret = 0; 5520 5521 dev_info(adev->dev, "GPU link reset\n"); 5522 5523 if (!adev->pcie_reset_ctx.occurs_dpc) 5524 ret = amdgpu_dpm_link_reset(adev); 5525 5526 if (ret) 5527 goto link_reset_failed; 5528 5529 ret = amdgpu_psp_wait_for_bootloader(adev); 5530 if (ret) 5531 goto link_reset_failed; 5532 5533 return 0; 5534 5535 link_reset_failed: 5536 dev_err(adev->dev, "GPU link reset failed\n"); 5537 return ret; 5538 } 5539 5540 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5541 struct amdgpu_reset_context *reset_context) 5542 { 5543 int i, r = 0; 5544 struct amdgpu_job *job = NULL; 5545 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; 5546 bool need_full_reset = 5547 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5548 5549 if (reset_context->reset_req_dev == adev) 5550 job = reset_context->job; 5551 5552 if (amdgpu_sriov_vf(adev)) 5553 amdgpu_virt_pre_reset(adev); 5554 5555 amdgpu_fence_driver_isr_toggle(adev, true); 5556 5557 /* block all schedulers and reset given job's ring */ 5558 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5559 struct amdgpu_ring *ring = adev->rings[i]; 5560 5561 if (!amdgpu_ring_sched_ready(ring)) 5562 continue; 5563 5564 /* Clear job fence from fence drv to avoid force_completion 5565 * leave NULL and vm flush fence in fence drv 5566 */ 5567 amdgpu_fence_driver_clear_job_fences(ring); 5568 5569 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5570 amdgpu_fence_driver_force_completion(ring); 5571 } 5572 5573 amdgpu_fence_driver_isr_toggle(adev, false); 5574 5575 if (job && job->vm) 5576 drm_sched_increase_karma(&job->base); 5577 5578 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5579 /* If reset handler not implemented, continue; otherwise return */ 5580 if (r == -EOPNOTSUPP) 5581 r = 0; 5582 else 5583 return r; 5584 5585 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5586 if (!amdgpu_sriov_vf(adev)) { 5587 5588 if (!need_full_reset) 5589 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5590 5591 if (!need_full_reset && amdgpu_gpu_recovery && 5592 amdgpu_device_ip_check_soft_reset(adev)) { 5593 amdgpu_device_ip_pre_soft_reset(adev); 5594 r = amdgpu_device_ip_soft_reset(adev); 5595 amdgpu_device_ip_post_soft_reset(adev); 5596 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5597 
dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5598 need_full_reset = true; 5599 } 5600 } 5601 5602 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { 5603 dev_info(tmp_adev->dev, "Dumping IP State\n"); 5604 /* Trigger ip dump before we reset the asic */ 5605 for (i = 0; i < tmp_adev->num_ip_blocks; i++) 5606 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) 5607 tmp_adev->ip_blocks[i].version->funcs 5608 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]); 5609 dev_info(tmp_adev->dev, "Dumping IP State Completed\n"); 5610 } 5611 5612 if (need_full_reset) 5613 r = amdgpu_device_ip_suspend(adev); 5614 if (need_full_reset) 5615 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5616 else 5617 clear_bit(AMDGPU_NEED_FULL_RESET, 5618 &reset_context->flags); 5619 } 5620 5621 return r; 5622 } 5623 5624 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) 5625 { 5626 struct list_head *device_list_handle; 5627 bool full_reset, vram_lost = false; 5628 struct amdgpu_device *tmp_adev; 5629 int r, init_level; 5630 5631 device_list_handle = reset_context->reset_device_list; 5632 5633 if (!device_list_handle) 5634 return -EINVAL; 5635 5636 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5637 5638 /** 5639 * If it's reset on init, it's default init level, otherwise keep level 5640 * as recovery level. 5641 */ 5642 if (reset_context->method == AMD_RESET_METHOD_ON_INIT) 5643 init_level = AMDGPU_INIT_LEVEL_DEFAULT; 5644 else 5645 init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY; 5646 5647 r = 0; 5648 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5649 amdgpu_set_init_level(tmp_adev, init_level); 5650 if (full_reset) { 5651 /* post card */ 5652 amdgpu_ras_clear_err_state(tmp_adev); 5653 r = amdgpu_device_asic_init(tmp_adev); 5654 if (r) { 5655 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5656 } else { 5657 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5658 5659 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5660 if (r) 5661 goto out; 5662 5663 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5664 5665 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 5666 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job); 5667 5668 if (vram_lost) { 5669 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5670 amdgpu_inc_vram_lost(tmp_adev); 5671 } 5672 5673 r = amdgpu_device_fw_loading(tmp_adev); 5674 if (r) 5675 return r; 5676 5677 r = amdgpu_xcp_restore_partition_mode( 5678 tmp_adev->xcp_mgr); 5679 if (r) 5680 goto out; 5681 5682 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5683 if (r) 5684 goto out; 5685 5686 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 5687 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5688 5689 r = amdgpu_device_ip_resume_phase3(tmp_adev); 5690 if (r) 5691 goto out; 5692 5693 if (vram_lost) 5694 amdgpu_device_fill_reset_magic(tmp_adev); 5695 5696 /* 5697 * Add this ASIC as tracked as reset was already 5698 * complete successfully. 5699 */ 5700 amdgpu_register_gpu_instance(tmp_adev); 5701 5702 if (!reset_context->hive && 5703 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5704 amdgpu_xgmi_add_device(tmp_adev); 5705 5706 r = amdgpu_device_ip_late_init(tmp_adev); 5707 if (r) 5708 goto out; 5709 5710 drm_client_dev_resume(adev_to_drm(tmp_adev), false); 5711 5712 /* 5713 * The GPU enters bad state once faulty pages 5714 * by ECC has reached the threshold, and ras 5715 * recovery is scheduled next. 
So add one check 5716 * here to break recovery if it indeed exceeds 5717 * bad page threshold, and remind user to 5718 * retire this GPU or setting one bigger 5719 * bad_page_threshold value to fix this once 5720 * probing driver again. 5721 */ 5722 if (!amdgpu_ras_is_rma(tmp_adev)) { 5723 /* must succeed. */ 5724 amdgpu_ras_resume(tmp_adev); 5725 } else { 5726 r = -EINVAL; 5727 goto out; 5728 } 5729 5730 /* Update PSP FW topology after reset */ 5731 if (reset_context->hive && 5732 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5733 r = amdgpu_xgmi_update_topology( 5734 reset_context->hive, tmp_adev); 5735 } 5736 } 5737 5738 out: 5739 if (!r) { 5740 /* IP init is complete now, set level as default */ 5741 amdgpu_set_init_level(tmp_adev, 5742 AMDGPU_INIT_LEVEL_DEFAULT); 5743 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5744 r = amdgpu_ib_ring_tests(tmp_adev); 5745 if (r) { 5746 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5747 r = -EAGAIN; 5748 goto end; 5749 } 5750 } 5751 5752 if (r) 5753 tmp_adev->asic_reset_res = r; 5754 } 5755 5756 end: 5757 return r; 5758 } 5759 5760 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5761 struct amdgpu_reset_context *reset_context) 5762 { 5763 struct amdgpu_device *tmp_adev = NULL; 5764 bool need_full_reset, skip_hw_reset; 5765 int r = 0; 5766 5767 /* Try reset handler method first */ 5768 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5769 reset_list); 5770 5771 reset_context->reset_device_list = device_list_handle; 5772 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5773 /* If reset handler not implemented, continue; otherwise return */ 5774 if (r == -EOPNOTSUPP) 5775 r = 0; 5776 else 5777 return r; 5778 5779 /* Reset handler not implemented, use the default method */ 5780 need_full_reset = 5781 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5782 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5783 5784 /* 5785 * ASIC reset has to be done on all XGMI hive nodes ASAP 5786 * to allow proper links negotiation in FW (within 1 sec) 5787 */ 5788 if (!skip_hw_reset && need_full_reset) { 5789 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5790 /* For XGMI run all resets in parallel to speed up the process */ 5791 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5792 if (!queue_work(system_unbound_wq, 5793 &tmp_adev->xgmi_reset_work)) 5794 r = -EALREADY; 5795 } else 5796 r = amdgpu_asic_reset(tmp_adev); 5797 5798 if (r) { 5799 dev_err(tmp_adev->dev, 5800 "ASIC reset failed with error, %d for drm dev, %s", 5801 r, adev_to_drm(tmp_adev)->unique); 5802 goto out; 5803 } 5804 } 5805 5806 /* For XGMI wait for all resets to complete before proceed */ 5807 if (!r) { 5808 list_for_each_entry(tmp_adev, device_list_handle, 5809 reset_list) { 5810 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5811 flush_work(&tmp_adev->xgmi_reset_work); 5812 r = tmp_adev->asic_reset_res; 5813 if (r) 5814 break; 5815 } 5816 } 5817 } 5818 } 5819 5820 if (!r && amdgpu_ras_intr_triggered()) { 5821 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5822 amdgpu_ras_reset_error_count(tmp_adev, 5823 AMDGPU_RAS_BLOCK__MMHUB); 5824 } 5825 5826 amdgpu_ras_intr_cleared(); 5827 } 5828 5829 r = amdgpu_device_reinit_after_reset(reset_context); 5830 if (r == -EAGAIN) 5831 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5832 else 5833 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5834 5835 out: 5836 return r; 5837 } 5838 5839 static void 
amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
{
	switch (amdgpu_asic_reset_method(adev)) {
	case AMD_RESET_METHOD_MODE1:
	case AMD_RESET_METHOD_LINK:
		adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
		break;
	case AMD_RESET_METHOD_MODE2:
		adev->mp1_state = PP_MP1_STATE_RESET;
		break;
	default:
		adev->mp1_state = PP_MP1_STATE_NONE;
		break;
	}
}

static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
{
	amdgpu_vf_error_trans_all(adev);
	adev->mp1_state = PP_MP1_STATE_NONE;
}

static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
{
	struct pci_dev *p = NULL;

	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
			adev->pdev->bus->number, 1);
	if (p) {
		pm_runtime_enable(&(p->dev));
		pm_runtime_resume(&(p->dev));
	}

	pci_dev_put(p);
}

static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
{
	enum amd_reset_method reset_method;
	struct pci_dev *p = NULL;
	u64 expires;

	/*
	 * For now, only BACO and mode1 reset are confirmed
	 * to suffer from the audio issue when the codec is not
	 * properly suspended.
	 */
	reset_method = amdgpu_asic_reset_method(adev);
	if ((reset_method != AMD_RESET_METHOD_BACO) &&
	    (reset_method != AMD_RESET_METHOD_MODE1))
		return -EINVAL;

	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
			adev->pdev->bus->number, 1);
	if (!p)
		return -ENODEV;

	expires = pm_runtime_autosuspend_expiration(&(p->dev));
	if (!expires)
		/*
		 * If we cannot get the audio device autosuspend delay,
		 * fall back to a fixed 4s timeout. The audio controller's
		 * default autosuspend delay is 3s, so 4s is guaranteed
		 * to cover it.
		 */
		expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;

	while (!pm_runtime_status_suspended(&(p->dev))) {
		if (!pm_runtime_suspend(&(p->dev)))
			break;

		if (expires < ktime_get_mono_fast_ns()) {
			dev_warn(adev->dev, "failed to suspend display audio\n");
			pci_dev_put(p);
			/* TODO: abort the succeeding gpu reset?
*/ 5914 return -ETIMEDOUT; 5915 } 5916 } 5917 5918 pm_runtime_disable(&(p->dev)); 5919 5920 pci_dev_put(p); 5921 return 0; 5922 } 5923 5924 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5925 { 5926 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5927 5928 #if defined(CONFIG_DEBUG_FS) 5929 if (!amdgpu_sriov_vf(adev)) 5930 cancel_work(&adev->reset_work); 5931 #endif 5932 5933 if (adev->kfd.dev) 5934 cancel_work(&adev->kfd.reset_work); 5935 5936 if (amdgpu_sriov_vf(adev)) 5937 cancel_work(&adev->virt.flr_work); 5938 5939 if (con && adev->ras_enabled) 5940 cancel_work(&con->recovery_work); 5941 5942 } 5943 5944 static int amdgpu_device_health_check(struct list_head *device_list_handle) 5945 { 5946 struct amdgpu_device *tmp_adev; 5947 int ret = 0; 5948 u32 status; 5949 5950 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5951 pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status); 5952 if (PCI_POSSIBLE_ERROR(status)) { 5953 dev_err(tmp_adev->dev, "device lost from bus!"); 5954 ret = -ENODEV; 5955 } 5956 } 5957 5958 return ret; 5959 } 5960 5961 static int amdgpu_device_halt_activities(struct amdgpu_device *adev, 5962 struct amdgpu_job *job, 5963 struct amdgpu_reset_context *reset_context, 5964 struct list_head *device_list, 5965 struct amdgpu_hive_info *hive, 5966 bool need_emergency_restart) 5967 { 5968 struct list_head *device_list_handle = NULL; 5969 struct amdgpu_device *tmp_adev = NULL; 5970 int i, r = 0; 5971 5972 /* 5973 * Build list of devices to reset. 5974 * In case we are in XGMI hive mode, resort the device list 5975 * to put adev in the 1st position. 5976 */ 5977 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) { 5978 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5979 list_add_tail(&tmp_adev->reset_list, device_list); 5980 if (adev->shutdown) 5981 tmp_adev->shutdown = true; 5982 if (adev->pcie_reset_ctx.occurs_dpc) 5983 tmp_adev->pcie_reset_ctx.in_link_reset = true; 5984 } 5985 if (!list_is_first(&adev->reset_list, device_list)) 5986 list_rotate_to_front(&adev->reset_list, device_list); 5987 device_list_handle = device_list; 5988 } else { 5989 list_add_tail(&adev->reset_list, device_list); 5990 device_list_handle = device_list; 5991 } 5992 5993 if (!amdgpu_sriov_vf(adev) && (!adev->pcie_reset_ctx.occurs_dpc)) { 5994 r = amdgpu_device_health_check(device_list_handle); 5995 if (r) 5996 return r; 5997 } 5998 5999 /* We need to lock reset domain only once both for XGMI and single device */ 6000 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 6001 reset_list); 6002 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 6003 6004 /* block all schedulers and reset given job's ring */ 6005 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6006 6007 amdgpu_device_set_mp1_state(tmp_adev); 6008 6009 /* 6010 * Try to put the audio codec into suspend state 6011 * before gpu reset started. 6012 * 6013 * Due to the power domain of the graphics device 6014 * is shared with AZ power domain. Without this, 6015 * we may change the audio hardware from behind 6016 * the audio driver's back. That will trigger 6017 * some audio codec errors. 
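		 *
		 * The suspend is best effort; when it succeeds, audio_suspended
		 * is recorded in pcie_reset_ctx so the codec is resumed again
		 * once recovery completes.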
6018 */ 6019 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 6020 tmp_adev->pcie_reset_ctx.audio_suspended = true; 6021 6022 amdgpu_ras_set_error_query_ready(tmp_adev, false); 6023 6024 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 6025 6026 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context); 6027 6028 /* 6029 * Mark these ASICs to be reset as untracked first 6030 * And add them back after reset completed 6031 */ 6032 amdgpu_unregister_gpu_instance(tmp_adev); 6033 6034 drm_client_dev_suspend(adev_to_drm(tmp_adev), false); 6035 6036 /* disable ras on ALL IPs */ 6037 if (!need_emergency_restart && 6038 (!adev->pcie_reset_ctx.occurs_dpc) && 6039 amdgpu_device_ip_need_full_reset(tmp_adev)) 6040 amdgpu_ras_suspend(tmp_adev); 6041 6042 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6043 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6044 6045 if (!amdgpu_ring_sched_ready(ring)) 6046 continue; 6047 6048 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 6049 6050 if (need_emergency_restart) 6051 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 6052 } 6053 atomic_inc(&tmp_adev->gpu_reset_counter); 6054 } 6055 6056 return r; 6057 } 6058 6059 static int amdgpu_device_asic_reset(struct amdgpu_device *adev, 6060 struct list_head *device_list, 6061 struct amdgpu_reset_context *reset_context) 6062 { 6063 struct amdgpu_device *tmp_adev = NULL; 6064 int retry_limit = AMDGPU_MAX_RETRY_LIMIT; 6065 int r = 0; 6066 6067 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 6068 list_for_each_entry(tmp_adev, device_list, reset_list) { 6069 if (adev->pcie_reset_ctx.occurs_dpc) 6070 tmp_adev->no_hw_access = true; 6071 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 6072 if (adev->pcie_reset_ctx.occurs_dpc) 6073 tmp_adev->no_hw_access = false; 6074 /*TODO Should we stop ?*/ 6075 if (r) { 6076 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 6077 r, adev_to_drm(tmp_adev)->unique); 6078 tmp_adev->asic_reset_res = r; 6079 } 6080 } 6081 6082 /* Actual ASIC resets if needed.*/ 6083 /* Host driver will handle XGMI hive reset for SRIOV */ 6084 if (amdgpu_sriov_vf(adev)) { 6085 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { 6086 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); 6087 amdgpu_ras_set_fed(adev, true); 6088 set_bit(AMDGPU_HOST_FLR, &reset_context->flags); 6089 } 6090 6091 r = amdgpu_device_reset_sriov(adev, reset_context); 6092 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) { 6093 amdgpu_virt_release_full_gpu(adev, true); 6094 goto retry; 6095 } 6096 if (r) 6097 adev->asic_reset_res = r; 6098 } else { 6099 r = amdgpu_do_asic_reset(device_list, reset_context); 6100 if (r && r == -EAGAIN) 6101 goto retry; 6102 } 6103 6104 list_for_each_entry(tmp_adev, device_list, reset_list) { 6105 /* 6106 * Drop any pending non scheduler resets queued before reset is done. 6107 * Any reset scheduled after this point would be valid. Scheduler resets 6108 * were already dropped during drm_sched_stop and no new ones can come 6109 * in before drm_sched_start. 
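		 *
		 * This cancels the debugfs reset work, the KFD and SRIOV FLR
		 * work items and any queued RAS recovery work for this device.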
6110 */ 6111 amdgpu_device_stop_pending_resets(tmp_adev); 6112 } 6113 6114 return r; 6115 } 6116 6117 static int amdgpu_device_sched_resume(struct list_head *device_list, 6118 struct amdgpu_reset_context *reset_context, 6119 bool job_signaled) 6120 { 6121 struct amdgpu_device *tmp_adev = NULL; 6122 int i, r = 0; 6123 6124 /* Post ASIC reset for all devs .*/ 6125 list_for_each_entry(tmp_adev, device_list, reset_list) { 6126 6127 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6128 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6129 6130 if (!amdgpu_ring_sched_ready(ring)) 6131 continue; 6132 6133 drm_sched_start(&ring->sched, 0); 6134 } 6135 6136 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 6137 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 6138 6139 if (tmp_adev->asic_reset_res) 6140 r = tmp_adev->asic_reset_res; 6141 6142 tmp_adev->asic_reset_res = 0; 6143 6144 if (r) { 6145 /* bad news, how to tell it to userspace ? 6146 * for ras error, we should report GPU bad status instead of 6147 * reset failure 6148 */ 6149 if (reset_context->src != AMDGPU_RESET_SRC_RAS || 6150 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) 6151 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", 6152 atomic_read(&tmp_adev->gpu_reset_counter)); 6153 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 6154 } else { 6155 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 6156 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 6157 DRM_WARN("smart shift update failed\n"); 6158 } 6159 } 6160 6161 return r; 6162 } 6163 6164 static void amdgpu_device_gpu_resume(struct amdgpu_device *adev, 6165 struct list_head *device_list, 6166 bool need_emergency_restart) 6167 { 6168 struct amdgpu_device *tmp_adev = NULL; 6169 6170 list_for_each_entry(tmp_adev, device_list, reset_list) { 6171 /* unlock kfd: SRIOV would do it separately */ 6172 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 6173 amdgpu_amdkfd_post_reset(tmp_adev); 6174 6175 /* kfd_post_reset will do nothing if kfd device is not initialized, 6176 * need to bring up kfd here if it's not be initialized before 6177 */ 6178 if (!adev->kfd.init_complete) 6179 amdgpu_amdkfd_device_init(adev); 6180 6181 if (tmp_adev->pcie_reset_ctx.audio_suspended) 6182 amdgpu_device_resume_display_audio(tmp_adev); 6183 6184 amdgpu_device_unset_mp1_state(tmp_adev); 6185 6186 amdgpu_ras_set_error_query_ready(tmp_adev, true); 6187 6188 } 6189 6190 tmp_adev = list_first_entry(device_list, struct amdgpu_device, 6191 reset_list); 6192 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 6193 6194 } 6195 6196 6197 /** 6198 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 6199 * 6200 * @adev: amdgpu_device pointer 6201 * @job: which job trigger hang 6202 * @reset_context: amdgpu reset context pointer 6203 * 6204 * Attempt to reset the GPU if it has hung (all asics). 6205 * Attempt to do soft-reset or full-reset and reinitialize Asic 6206 * Returns 0 for success or an error on failure. 
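 *
 * Recovery proceeds in stages: halt scheduler/KFD/audio activity on every
 * device in the (optional XGMI) hive, perform the ASIC reset, restart the
 * schedulers and finally resume KFD, display audio and MP1 state.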
6207 */ 6208 6209 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 6210 struct amdgpu_job *job, 6211 struct amdgpu_reset_context *reset_context) 6212 { 6213 struct list_head device_list; 6214 bool job_signaled = false; 6215 struct amdgpu_hive_info *hive = NULL; 6216 int r = 0; 6217 bool need_emergency_restart = false; 6218 6219 /* 6220 * If it reaches here because of hang/timeout and a RAS error is 6221 * detected at the same time, let RAS recovery take care of it. 6222 */ 6223 if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) && 6224 !amdgpu_sriov_vf(adev) && 6225 reset_context->src != AMDGPU_RESET_SRC_RAS) { 6226 dev_dbg(adev->dev, 6227 "Gpu recovery from source: %d yielding to RAS error recovery handling", 6228 reset_context->src); 6229 return 0; 6230 } 6231 6232 /* 6233 * Special case: RAS triggered and full reset isn't supported 6234 */ 6235 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 6236 6237 /* 6238 * Flush RAM to disk so that after reboot 6239 * the user can read log and see why the system rebooted. 6240 */ 6241 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 6242 amdgpu_ras_get_context(adev)->reboot) { 6243 DRM_WARN("Emergency reboot."); 6244 6245 ksys_sync_helper(); 6246 emergency_restart(); 6247 } 6248 6249 dev_info(adev->dev, "GPU %s begin!\n", 6250 need_emergency_restart ? "jobs stop":"reset"); 6251 6252 if (!amdgpu_sriov_vf(adev)) 6253 hive = amdgpu_get_xgmi_hive(adev); 6254 if (hive) 6255 mutex_lock(&hive->hive_lock); 6256 6257 reset_context->job = job; 6258 reset_context->hive = hive; 6259 INIT_LIST_HEAD(&device_list); 6260 6261 r = amdgpu_device_halt_activities(adev, job, reset_context, &device_list, 6262 hive, need_emergency_restart); 6263 if (r) 6264 goto end_reset; 6265 6266 if (need_emergency_restart) 6267 goto skip_sched_resume; 6268 /* 6269 * Must check guilty signal here since after this point all old 6270 * HW fences are force signaled. 6271 * 6272 * job->base holds a reference to parent fence 6273 */ 6274 if (job && dma_fence_is_signaled(&job->hw_fence)) { 6275 job_signaled = true; 6276 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 6277 goto skip_hw_reset; 6278 } 6279 6280 r = amdgpu_device_asic_reset(adev, &device_list, reset_context); 6281 if (r) 6282 goto end_reset; 6283 skip_hw_reset: 6284 r = amdgpu_device_sched_resume(&device_list, reset_context, job_signaled); 6285 if (r) 6286 goto end_reset; 6287 skip_sched_resume: 6288 amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart); 6289 end_reset: 6290 if (hive) { 6291 mutex_unlock(&hive->hive_lock); 6292 amdgpu_put_xgmi_hive(hive); 6293 } 6294 6295 if (r) 6296 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 6297 6298 atomic_set(&adev->reset_domain->reset_res, r); 6299 6300 if (!r) 6301 drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE); 6302 6303 return r; 6304 } 6305 6306 /** 6307 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner 6308 * 6309 * @adev: amdgpu_device pointer 6310 * @speed: pointer to the speed of the link 6311 * @width: pointer to the width of the link 6312 * 6313 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6314 * first physical partner to an AMD dGPU. 6315 * This will exclude any virtual switches and links. 
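 * When PCIe dynamic switching is supported, the capabilities of the first
 * non-AMD upstream bridge are reported; otherwise the currently negotiated
 * link parameters are used instead.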
 */
static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
					    enum pci_bus_speed *speed,
					    enum pcie_link_width *width)
{
	struct pci_dev *parent = adev->pdev;

	if (!speed || !width)
		return;

	*speed = PCI_SPEED_UNKNOWN;
	*width = PCIE_LNK_WIDTH_UNKNOWN;

	if (amdgpu_device_pcie_dynamic_switching_supported(adev)) {
		while ((parent = pci_upstream_bridge(parent))) {
			/* skip upstream/downstream switches internal to dGPU */
			if (parent->vendor == PCI_VENDOR_ID_ATI)
				continue;
			*speed = pcie_get_speed_cap(parent);
			*width = pcie_get_width_cap(parent);
			break;
		}
	} else {
		/* use the current speeds rather than max if switching is not supported */
		pcie_bandwidth_available(adev->pdev, NULL, speed, width);
	}
}

/**
 * amdgpu_device_gpu_bandwidth - find the bandwidth of the GPU
 *
 * @adev: amdgpu_device pointer
 * @speed: pointer to the speed of the link
 * @width: pointer to the width of the link
 *
 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
 * AMD dGPU which may be a virtual upstream bridge.
 */
static void amdgpu_device_gpu_bandwidth(struct amdgpu_device *adev,
					enum pci_bus_speed *speed,
					enum pcie_link_width *width)
{
	struct pci_dev *parent = adev->pdev;

	if (!speed || !width)
		return;

	parent = pci_upstream_bridge(parent);
	if (parent && parent->vendor == PCI_VENDOR_ID_ATI) {
		/* use the upstream/downstream switches internal to dGPU */
		*speed = pcie_get_speed_cap(parent);
		*width = pcie_get_width_cap(parent);
		while ((parent = pci_upstream_bridge(parent))) {
			if (parent->vendor == PCI_VENDOR_ID_ATI) {
				/* use the upstream/downstream switches internal to dGPU */
				*speed = pcie_get_speed_cap(parent);
				*width = pcie_get_width_cap(parent);
			}
		}
	} else {
		/* use the device itself */
		*speed = pcie_get_speed_cap(adev->pdev);
		*width = pcie_get_width_cap(adev->pdev);
	}
}

/**
 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIe capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIe config space may not be available.
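 * The amdgpu_pcie_gen_cap and amdgpu_pcie_lane_cap module parameters, when
 * set, override the probed values.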
6390 */ 6391 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 6392 { 6393 enum pci_bus_speed speed_cap, platform_speed_cap; 6394 enum pcie_link_width platform_link_width, link_width; 6395 6396 if (amdgpu_pcie_gen_cap) 6397 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 6398 6399 if (amdgpu_pcie_lane_cap) 6400 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 6401 6402 /* covers APUs as well */ 6403 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 6404 if (adev->pm.pcie_gen_mask == 0) 6405 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 6406 if (adev->pm.pcie_mlw_mask == 0) 6407 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 6408 return; 6409 } 6410 6411 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 6412 return; 6413 6414 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap, 6415 &platform_link_width); 6416 amdgpu_device_gpu_bandwidth(adev, &speed_cap, &link_width); 6417 6418 if (adev->pm.pcie_gen_mask == 0) { 6419 /* asic caps */ 6420 if (speed_cap == PCI_SPEED_UNKNOWN) { 6421 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6422 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6423 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6424 } else { 6425 if (speed_cap == PCIE_SPEED_32_0GT) 6426 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6427 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6428 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6429 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6430 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 6431 else if (speed_cap == PCIE_SPEED_16_0GT) 6432 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6433 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6434 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6435 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 6436 else if (speed_cap == PCIE_SPEED_8_0GT) 6437 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6438 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6439 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6440 else if (speed_cap == PCIE_SPEED_5_0GT) 6441 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6442 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 6443 else 6444 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 6445 } 6446 /* platform caps */ 6447 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 6448 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6449 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6450 } else { 6451 if (platform_speed_cap == PCIE_SPEED_32_0GT) 6452 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6453 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6454 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6455 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6456 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 6457 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 6458 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6459 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6460 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6461 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 6462 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 6463 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6464 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6465 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 6466 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 6467 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6468 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6469 else 6470 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 6471 6472 } 6473 } 6474 if (adev->pm.pcie_mlw_mask == 0) { 6475 /* asic caps */ 6476 if (link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6477 
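			/* width unknown, assume the default set of ASIC lane widths */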
adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_ASIC_PCIE_MLW_MASK; 6478 } else { 6479 switch (link_width) { 6480 case PCIE_LNK_X32: 6481 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X32 | 6482 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6483 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6484 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6485 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6486 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6487 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6488 break; 6489 case PCIE_LNK_X16: 6490 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6491 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6492 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6493 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6494 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6495 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6496 break; 6497 case PCIE_LNK_X12: 6498 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6499 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6500 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6501 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6502 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6503 break; 6504 case PCIE_LNK_X8: 6505 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6506 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6507 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6508 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6509 break; 6510 case PCIE_LNK_X4: 6511 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6512 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6513 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6514 break; 6515 case PCIE_LNK_X2: 6516 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6517 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6518 break; 6519 case PCIE_LNK_X1: 6520 adev->pm.pcie_mlw_mask |= CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1; 6521 break; 6522 default: 6523 break; 6524 } 6525 } 6526 /* platform caps */ 6527 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6528 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 6529 } else { 6530 switch (platform_link_width) { 6531 case PCIE_LNK_X32: 6532 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 6533 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6534 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6535 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6536 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6537 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6538 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6539 break; 6540 case PCIE_LNK_X16: 6541 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6542 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6543 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6544 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6545 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6546 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6547 break; 6548 case PCIE_LNK_X12: 6549 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6550 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6551 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6552 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6553 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6554 break; 6555 case PCIE_LNK_X8: 6556 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6557 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6558 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6559 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6560 break; 6561 case PCIE_LNK_X4: 6562 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6563 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6564 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6565 break; 6566 case PCIE_LNK_X2: 6567 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6568 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6569 break; 6570 case PCIE_LNK_X1: 6571 adev->pm.pcie_mlw_mask |= CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 6572 break; 6573 
default: 6574 break; 6575 } 6576 } 6577 } 6578 } 6579 6580 /** 6581 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 6582 * 6583 * @adev: amdgpu_device pointer 6584 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 6585 * 6586 * Return true if @peer_adev can access (DMA) @adev through the PCIe 6587 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 6588 * @peer_adev. 6589 */ 6590 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 6591 struct amdgpu_device *peer_adev) 6592 { 6593 #ifdef CONFIG_HSA_AMD_P2P 6594 bool p2p_access = 6595 !adev->gmc.xgmi.connected_to_cpu && 6596 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 6597 if (!p2p_access) 6598 dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n", 6599 pci_name(peer_adev->pdev)); 6600 6601 bool is_large_bar = adev->gmc.visible_vram_size && 6602 adev->gmc.real_vram_size == adev->gmc.visible_vram_size; 6603 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev); 6604 6605 if (!p2p_addressable) { 6606 uint64_t address_mask = peer_adev->dev->dma_mask ? 6607 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 6608 resource_size_t aper_limit = 6609 adev->gmc.aper_base + adev->gmc.aper_size - 1; 6610 6611 p2p_addressable = !(adev->gmc.aper_base & address_mask || 6612 aper_limit & address_mask); 6613 } 6614 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable; 6615 #else 6616 return false; 6617 #endif 6618 } 6619 6620 int amdgpu_device_baco_enter(struct drm_device *dev) 6621 { 6622 struct amdgpu_device *adev = drm_to_adev(dev); 6623 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6624 6625 if (!amdgpu_device_supports_baco(dev)) 6626 return -ENOTSUPP; 6627 6628 if (ras && adev->ras_enabled && 6629 adev->nbio.funcs->enable_doorbell_interrupt) 6630 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 6631 6632 return amdgpu_dpm_baco_enter(adev); 6633 } 6634 6635 int amdgpu_device_baco_exit(struct drm_device *dev) 6636 { 6637 struct amdgpu_device *adev = drm_to_adev(dev); 6638 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6639 int ret = 0; 6640 6641 if (!amdgpu_device_supports_baco(dev)) 6642 return -ENOTSUPP; 6643 6644 ret = amdgpu_dpm_baco_exit(adev); 6645 if (ret) 6646 return ret; 6647 6648 if (ras && adev->ras_enabled && 6649 adev->nbio.funcs->enable_doorbell_interrupt) 6650 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 6651 6652 if (amdgpu_passthrough(adev) && adev->nbio.funcs && 6653 adev->nbio.funcs->clear_doorbell_interrupt) 6654 adev->nbio.funcs->clear_doorbell_interrupt(adev); 6655 6656 return 0; 6657 } 6658 6659 /** 6660 * amdgpu_pci_error_detected - Called when a PCI error is detected. 6661 * @pdev: PCI device struct 6662 * @state: PCI channel state 6663 * 6664 * Description: Called when a PCI error is detected. 6665 * 6666 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
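 * pci_channel_io_normal is reported as PCI_ERS_RESULT_CAN_RECOVER,
 * pci_channel_io_frozen halts all activity on the device (and its XGMI hive)
 * before asking for a slot reset, and pci_channel_io_perm_failure asks for
 * the device to be disconnected.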
 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
	struct amdgpu_reset_context reset_context;
	struct list_head device_list;
	int r = 0;

	dev_info(adev->dev, "PCI error: detected callback!!\n");

	if (!amdgpu_dpm_is_link_reset_supported(adev)) {
		dev_warn(adev->dev, "No support for XGMI hive yet...\n");
		return PCI_ERS_RESULT_DISCONNECT;
	}

	adev->pci_channel_state = state;

	switch (state) {
	case pci_channel_io_normal:
		dev_info(adev->dev, "pci_channel_io_normal: state(%d)!!\n", state);
		return PCI_ERS_RESULT_CAN_RECOVER;
	case pci_channel_io_frozen:
		/* Fatal error, prepare for slot reset */
		dev_info(adev->dev, "pci_channel_io_frozen: state(%d)!!\n", state);

		if (hive)
			mutex_lock(&hive->hive_lock);
		adev->pcie_reset_ctx.occurs_dpc = true;
		memset(&reset_context, 0, sizeof(reset_context));
		INIT_LIST_HEAD(&device_list);

		r = amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list,
						  hive, false);
		if (hive) {
			mutex_unlock(&hive->hive_lock);
			amdgpu_put_xgmi_hive(hive);
		}
		if (r)
			return PCI_ERS_RESULT_DISCONNECT;
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		/* Permanent error, prepare for device removal */
		dev_info(adev->dev, "pci_channel_io_perm_failure: state(%d)!!\n", state);
		return PCI_ERS_RESULT_DISCONNECT;
	}

	return PCI_ERS_RESULT_NEED_RESET;
}

/**
 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
 * @pdev: pointer to PCI device
 */
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);

	dev_info(adev->dev, "PCI error: mmio enabled callback!!\n");

	/* TODO - dump whatever for debugging purposes */

	/* This is called only if amdgpu_pci_error_detected returns
	 * PCI_ERS_RESULT_CAN_RECOVER. Reads and writes to the device
	 * still work, so there is no need to reset the slot.
	 */

	return PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
 * @pdev: PCI device struct
 *
 * Description: This routine is called by the pci error recovery
 * code after the PCI slot has been reset, just before we
 * should resume normal operations.
6746 */ 6747 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 6748 { 6749 struct drm_device *dev = pci_get_drvdata(pdev); 6750 struct amdgpu_device *adev = drm_to_adev(dev); 6751 struct amdgpu_reset_context reset_context; 6752 struct amdgpu_device *tmp_adev = NULL; 6753 struct amdgpu_hive_info *hive = NULL; 6754 struct list_head device_list; 6755 int r = 0, i; 6756 u32 memsize; 6757 6758 /* PCI error slot reset should be skipped During RAS recovery */ 6759 if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 6760 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && 6761 amdgpu_ras_in_recovery(adev)) 6762 return PCI_ERS_RESULT_RECOVERED; 6763 6764 dev_info(adev->dev, "PCI error: slot reset callback!!\n"); 6765 6766 memset(&reset_context, 0, sizeof(reset_context)); 6767 6768 /* wait for asic to come out of reset */ 6769 msleep(700); 6770 6771 /* Restore PCI confspace */ 6772 amdgpu_device_load_pci_state(pdev); 6773 6774 /* confirm ASIC came out of reset */ 6775 for (i = 0; i < adev->usec_timeout; i++) { 6776 memsize = amdgpu_asic_get_config_memsize(adev); 6777 6778 if (memsize != 0xffffffff) 6779 break; 6780 udelay(1); 6781 } 6782 if (memsize == 0xffffffff) { 6783 r = -ETIME; 6784 goto out; 6785 } 6786 6787 reset_context.method = AMD_RESET_METHOD_NONE; 6788 reset_context.reset_req_dev = adev; 6789 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 6790 set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags); 6791 INIT_LIST_HEAD(&device_list); 6792 6793 hive = amdgpu_get_xgmi_hive(adev); 6794 if (hive) { 6795 mutex_lock(&hive->hive_lock); 6796 reset_context.hive = hive; 6797 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 6798 tmp_adev->pcie_reset_ctx.in_link_reset = true; 6799 list_add_tail(&tmp_adev->reset_list, &device_list); 6800 } 6801 } else { 6802 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 6803 list_add_tail(&adev->reset_list, &device_list); 6804 } 6805 6806 r = amdgpu_device_asic_reset(adev, &device_list, &reset_context); 6807 out: 6808 if (!r) { 6809 if (amdgpu_device_cache_pci_state(adev->pdev)) 6810 pci_restore_state(adev->pdev); 6811 dev_info(adev->dev, "PCIe error recovery succeeded\n"); 6812 } else { 6813 dev_err(adev->dev, "PCIe error recovery failed, err:%d\n", r); 6814 if (tmp_adev) { 6815 list_for_each_entry(tmp_adev, &device_list, reset_list) 6816 amdgpu_device_unset_mp1_state(tmp_adev); 6817 amdgpu_device_unlock_reset_domain(adev->reset_domain); 6818 } 6819 } 6820 6821 if (hive) { 6822 mutex_unlock(&hive->hive_lock); 6823 amdgpu_put_xgmi_hive(hive); 6824 } 6825 6826 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 6827 } 6828 6829 /** 6830 * amdgpu_pci_resume() - resume normal ops after PCI reset 6831 * @pdev: pointer to PCI device 6832 * 6833 * Called when the error recovery driver tells us that its 6834 * OK to resume normal operation. 
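 * Restarts the schedulers and resumes KFD, display audio and MP1 state, but
 * only if the preceding error was pci_channel_io_frozen.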
6835 */ 6836 void amdgpu_pci_resume(struct pci_dev *pdev) 6837 { 6838 struct drm_device *dev = pci_get_drvdata(pdev); 6839 struct amdgpu_device *adev = drm_to_adev(dev); 6840 struct list_head device_list; 6841 struct amdgpu_hive_info *hive = NULL; 6842 struct amdgpu_device *tmp_adev = NULL; 6843 6844 dev_info(adev->dev, "PCI error: resume callback!!\n"); 6845 6846 /* Only continue execution for the case of pci_channel_io_frozen */ 6847 if (adev->pci_channel_state != pci_channel_io_frozen) 6848 return; 6849 6850 INIT_LIST_HEAD(&device_list); 6851 6852 hive = amdgpu_get_xgmi_hive(adev); 6853 if (hive) { 6854 mutex_lock(&hive->hive_lock); 6855 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 6856 tmp_adev->pcie_reset_ctx.in_link_reset = false; 6857 list_add_tail(&tmp_adev->reset_list, &device_list); 6858 } 6859 } else 6860 list_add_tail(&adev->reset_list, &device_list); 6861 6862 amdgpu_device_sched_resume(&device_list, NULL, NULL); 6863 amdgpu_device_gpu_resume(adev, &device_list, false); 6864 adev->pcie_reset_ctx.occurs_dpc = false; 6865 6866 if (hive) { 6867 mutex_unlock(&hive->hive_lock); 6868 amdgpu_put_xgmi_hive(hive); 6869 } 6870 } 6871 6872 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 6873 { 6874 struct drm_device *dev = pci_get_drvdata(pdev); 6875 struct amdgpu_device *adev = drm_to_adev(dev); 6876 int r; 6877 6878 if (amdgpu_sriov_vf(adev)) 6879 return false; 6880 6881 r = pci_save_state(pdev); 6882 if (!r) { 6883 kfree(adev->pci_state); 6884 6885 adev->pci_state = pci_store_saved_state(pdev); 6886 6887 if (!adev->pci_state) { 6888 DRM_ERROR("Failed to store PCI saved state"); 6889 return false; 6890 } 6891 } else { 6892 DRM_WARN("Failed to save PCI state, err:%d\n", r); 6893 return false; 6894 } 6895 6896 return true; 6897 } 6898 6899 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 6900 { 6901 struct drm_device *dev = pci_get_drvdata(pdev); 6902 struct amdgpu_device *adev = drm_to_adev(dev); 6903 int r; 6904 6905 if (!adev->pci_state) 6906 return false; 6907 6908 r = pci_load_saved_state(pdev, adev->pci_state); 6909 6910 if (!r) { 6911 pci_restore_state(pdev); 6912 } else { 6913 DRM_WARN("Failed to load PCI state, err:%d\n", r); 6914 return false; 6915 } 6916 6917 return true; 6918 } 6919 6920 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 6921 struct amdgpu_ring *ring) 6922 { 6923 #ifdef CONFIG_X86_64 6924 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 6925 return; 6926 #endif 6927 if (adev->gmc.xgmi.connected_to_cpu) 6928 return; 6929 6930 if (ring && ring->funcs->emit_hdp_flush) 6931 amdgpu_ring_emit_hdp_flush(ring); 6932 else 6933 amdgpu_asic_flush_hdp(adev, ring); 6934 } 6935 6936 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 6937 struct amdgpu_ring *ring) 6938 { 6939 #ifdef CONFIG_X86_64 6940 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 6941 return; 6942 #endif 6943 if (adev->gmc.xgmi.connected_to_cpu) 6944 return; 6945 6946 amdgpu_asic_invalidate_hdp(adev, ring); 6947 } 6948 6949 int amdgpu_in_reset(struct amdgpu_device *adev) 6950 { 6951 return atomic_read(&adev->reset_domain->in_gpu_reset); 6952 } 6953 6954 /** 6955 * amdgpu_device_halt() - bring hardware to some kind of halt state 6956 * 6957 * @adev: amdgpu_device pointer 6958 * 6959 * Bring hardware to some kind of halt state so that no one can touch it 6960 * any more. It will help to maintain error context when error occurred. 6961 * Compare to a simple hang, the system will keep stable at least for SSH 6962 * access. 
Then it should be trivial to inspect the hardware state and 6963 * see what's going on. Implemented as following: 6964 * 6965 * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc), 6966 * clears all CPU mappings to device, disallows remappings through page faults 6967 * 2. amdgpu_irq_disable_all() disables all interrupts 6968 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences 6969 * 4. set adev->no_hw_access to avoid potential crashes after setp 5 6970 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings 6971 * 6. pci_disable_device() and pci_wait_for_pending_transaction() 6972 * flush any in flight DMA operations 6973 */ 6974 void amdgpu_device_halt(struct amdgpu_device *adev) 6975 { 6976 struct pci_dev *pdev = adev->pdev; 6977 struct drm_device *ddev = adev_to_drm(adev); 6978 6979 amdgpu_xcp_dev_unplug(adev); 6980 drm_dev_unplug(ddev); 6981 6982 amdgpu_irq_disable_all(adev); 6983 6984 amdgpu_fence_driver_hw_fini(adev); 6985 6986 adev->no_hw_access = true; 6987 6988 amdgpu_device_unmap_mmio(adev); 6989 6990 pci_disable_device(pdev); 6991 pci_wait_for_pending_transaction(pdev); 6992 } 6993 6994 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev, 6995 u32 reg) 6996 { 6997 unsigned long flags, address, data; 6998 u32 r; 6999 7000 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 7001 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 7002 7003 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 7004 WREG32(address, reg * 4); 7005 (void)RREG32(address); 7006 r = RREG32(data); 7007 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 7008 return r; 7009 } 7010 7011 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev, 7012 u32 reg, u32 v) 7013 { 7014 unsigned long flags, address, data; 7015 7016 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 7017 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 7018 7019 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 7020 WREG32(address, reg * 4); 7021 (void)RREG32(address); 7022 WREG32(data, v); 7023 (void)RREG32(data); 7024 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 7025 } 7026 7027 /** 7028 * amdgpu_device_get_gang - return a reference to the current gang 7029 * @adev: amdgpu_device pointer 7030 * 7031 * Returns: A new reference to the current gang leader. 7032 */ 7033 struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev) 7034 { 7035 struct dma_fence *fence; 7036 7037 rcu_read_lock(); 7038 fence = dma_fence_get_rcu_safe(&adev->gang_submit); 7039 rcu_read_unlock(); 7040 return fence; 7041 } 7042 7043 /** 7044 * amdgpu_device_switch_gang - switch to a new gang 7045 * @adev: amdgpu_device pointer 7046 * @gang: the gang to switch to 7047 * 7048 * Try to switch to a new gang. 7049 * Returns: NULL if we switched to the new gang or a reference to the current 7050 * gang leader. 7051 */ 7052 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev, 7053 struct dma_fence *gang) 7054 { 7055 struct dma_fence *old = NULL; 7056 7057 dma_fence_get(gang); 7058 do { 7059 dma_fence_put(old); 7060 old = amdgpu_device_get_gang(adev); 7061 if (old == gang) 7062 break; 7063 7064 if (!dma_fence_is_signaled(old)) { 7065 dma_fence_put(gang); 7066 return old; 7067 } 7068 7069 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit, 7070 old, gang) != old); 7071 7072 /* 7073 * Drop it once for the exchanged reference in adev and once for the 7074 * thread local reference acquired in amdgpu_device_get_gang(). 
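	 * If the cmpxchg above observed a concurrent update, the loop simply
	 * retried with the newly installed gang leader.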
7075 */ 7076 dma_fence_put(old); 7077 dma_fence_put(old); 7078 return NULL; 7079 } 7080 7081 /** 7082 * amdgpu_device_enforce_isolation - enforce HW isolation 7083 * @adev: the amdgpu device pointer 7084 * @ring: the HW ring the job is supposed to run on 7085 * @job: the job which is about to be pushed to the HW ring 7086 * 7087 * Makes sure that only one client at a time can use the GFX block. 7088 * Returns: The dependency to wait on before the job can be pushed to the HW. 7089 * The function is called multiple times until NULL is returned. 7090 */ 7091 struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev, 7092 struct amdgpu_ring *ring, 7093 struct amdgpu_job *job) 7094 { 7095 struct amdgpu_isolation *isolation = &adev->isolation[ring->xcp_id]; 7096 struct drm_sched_fence *f = job->base.s_fence; 7097 struct dma_fence *dep; 7098 void *owner; 7099 int r; 7100 7101 /* 7102 * For now enforce isolation only for the GFX block since we only need 7103 * the cleaner shader on those rings. 7104 */ 7105 if (ring->funcs->type != AMDGPU_RING_TYPE_GFX && 7106 ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE) 7107 return NULL; 7108 7109 /* 7110 * All submissions where enforce isolation is false are handled as if 7111 * they come from a single client. Use ~0l as the owner to distinct it 7112 * from kernel submissions where the owner is NULL. 7113 */ 7114 owner = job->enforce_isolation ? f->owner : (void *)~0l; 7115 7116 mutex_lock(&adev->enforce_isolation_mutex); 7117 7118 /* 7119 * The "spearhead" submission is the first one which changes the 7120 * ownership to its client. We always need to wait for it to be 7121 * pushed to the HW before proceeding with anything. 7122 */ 7123 if (&f->scheduled != isolation->spearhead && 7124 !dma_fence_is_signaled(isolation->spearhead)) { 7125 dep = isolation->spearhead; 7126 goto out_grab_ref; 7127 } 7128 7129 if (isolation->owner != owner) { 7130 7131 /* 7132 * Wait for any gang to be assembled before switching to a 7133 * different owner or otherwise we could deadlock the 7134 * submissions. 7135 */ 7136 if (!job->gang_submit) { 7137 dep = amdgpu_device_get_gang(adev); 7138 if (!dma_fence_is_signaled(dep)) 7139 goto out_return_dep; 7140 dma_fence_put(dep); 7141 } 7142 7143 dma_fence_put(isolation->spearhead); 7144 isolation->spearhead = dma_fence_get(&f->scheduled); 7145 amdgpu_sync_move(&isolation->active, &isolation->prev); 7146 trace_amdgpu_isolation(isolation->owner, owner); 7147 isolation->owner = owner; 7148 } 7149 7150 /* 7151 * Specifying the ring here helps to pipeline submissions even when 7152 * isolation is enabled. If that is not desired for testing NULL can be 7153 * used instead of the ring to enforce a CPU round trip while switching 7154 * between clients. 
7155 */ 7156 dep = amdgpu_sync_peek_fence(&isolation->prev, ring); 7157 r = amdgpu_sync_fence(&isolation->active, &f->finished, GFP_NOWAIT); 7158 if (r) 7159 DRM_WARN("OOM tracking isolation\n"); 7160 7161 out_grab_ref: 7162 dma_fence_get(dep); 7163 out_return_dep: 7164 mutex_unlock(&adev->enforce_isolation_mutex); 7165 return dep; 7166 } 7167 7168 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev) 7169 { 7170 switch (adev->asic_type) { 7171 #ifdef CONFIG_DRM_AMDGPU_SI 7172 case CHIP_HAINAN: 7173 #endif 7174 case CHIP_TOPAZ: 7175 /* chips with no display hardware */ 7176 return false; 7177 #ifdef CONFIG_DRM_AMDGPU_SI 7178 case CHIP_TAHITI: 7179 case CHIP_PITCAIRN: 7180 case CHIP_VERDE: 7181 case CHIP_OLAND: 7182 #endif 7183 #ifdef CONFIG_DRM_AMDGPU_CIK 7184 case CHIP_BONAIRE: 7185 case CHIP_HAWAII: 7186 case CHIP_KAVERI: 7187 case CHIP_KABINI: 7188 case CHIP_MULLINS: 7189 #endif 7190 case CHIP_TONGA: 7191 case CHIP_FIJI: 7192 case CHIP_POLARIS10: 7193 case CHIP_POLARIS11: 7194 case CHIP_POLARIS12: 7195 case CHIP_VEGAM: 7196 case CHIP_CARRIZO: 7197 case CHIP_STONEY: 7198 /* chips with display hardware */ 7199 return true; 7200 default: 7201 /* IP discovery */ 7202 if (!amdgpu_ip_version(adev, DCE_HWIP, 0) || 7203 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 7204 return false; 7205 return true; 7206 } 7207 } 7208 7209 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev, 7210 uint32_t inst, uint32_t reg_addr, char reg_name[], 7211 uint32_t expected_value, uint32_t mask) 7212 { 7213 uint32_t ret = 0; 7214 uint32_t old_ = 0; 7215 uint32_t tmp_ = RREG32(reg_addr); 7216 uint32_t loop = adev->usec_timeout; 7217 7218 while ((tmp_ & (mask)) != (expected_value)) { 7219 if (old_ != tmp_) { 7220 loop = adev->usec_timeout; 7221 old_ = tmp_; 7222 } else 7223 udelay(1); 7224 tmp_ = RREG32(reg_addr); 7225 loop--; 7226 if (!loop) { 7227 DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08xn", 7228 inst, reg_name, (uint32_t)expected_value, 7229 (uint32_t)(tmp_ & (mask))); 7230 ret = -ETIMEDOUT; 7231 break; 7232 } 7233 } 7234 return ret; 7235 } 7236 7237 ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring) 7238 { 7239 ssize_t size = 0; 7240 7241 if (!ring || !ring->adev) 7242 return size; 7243 7244 if (amdgpu_device_should_recover_gpu(ring->adev)) 7245 size |= AMDGPU_RESET_TYPE_FULL; 7246 7247 if (unlikely(!ring->adev->debug_disable_soft_recovery) && 7248 !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery) 7249 size |= AMDGPU_RESET_TYPE_SOFT_RESET; 7250 7251 return size; 7252 } 7253 7254 ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset) 7255 { 7256 ssize_t size = 0; 7257 7258 if (supported_reset == 0) { 7259 size += sysfs_emit_at(buf, size, "unsupported"); 7260 size += sysfs_emit_at(buf, size, "\n"); 7261 return size; 7262 7263 } 7264 7265 if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET) 7266 size += sysfs_emit_at(buf, size, "soft "); 7267 7268 if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE) 7269 size += sysfs_emit_at(buf, size, "queue "); 7270 7271 if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE) 7272 size += sysfs_emit_at(buf, size, "pipe "); 7273 7274 if (supported_reset & AMDGPU_RESET_TYPE_FULL) 7275 size += sysfs_emit_at(buf, size, "full "); 7276 7277 size += sysfs_emit_at(buf, size, "\n"); 7278 return size; 7279 } 7280
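
/*
 * Illustrative sketch (not part of the driver): amdgpu_get_soft_full_reset_mask()
 * and amdgpu_show_reset_mask() are intended to be paired in a sysfs "show"
 * callback reporting the reset types a ring supports, roughly as below. The
 * attribute name and the way the ring is picked are assumptions for the
 * example only.
 *
 *	static ssize_t reset_mask_show(struct device *dev,
 *				       struct device_attribute *attr, char *buf)
 *	{
 *		struct drm_device *ddev = dev_get_drvdata(dev);
 *		struct amdgpu_device *adev = drm_to_adev(ddev);
 *		struct amdgpu_ring *ring = &adev->gfx.gfx_ring[0];
 *		uint32_t mask;
 *
 *		mask = amdgpu_get_soft_full_reset_mask(ring);
 *		return amdgpu_show_reset_mask(buf, mask);
 *	}
 */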