/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */

#include <linux/aperture.h>
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/pci-p2pdma.h>
#include <linux/apple-gmux.h>

#include <drm/drm_atomic_helper.h>
#include <drm/drm_client_event.h>
#include <drm/drm_crtc_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/device.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"
#include "amdgpu_virt.h"
#include "amdgpu_dev_coredump.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

#if IS_ENABLED(CONFIG_X86)
#include <asm/intel-family.h>
#endif

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
#define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
#define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
#define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)

#define AMDGPU_VBIOS_SKIP (1U << 0)
#define AMDGPU_VBIOS_OPTIONAL (1U << 1)

static const struct drm_driver amdgpu_kms_driver;

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"ALDEBARAN",
	"NAVI10",
	"CYAN_SKILLFISH",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"VANGOGH",
	"DIMGREY_CAVEFISH",
	"BEIGE_GOBY",
	"YELLOW_CARP",
	"IP DISCOVERY",
	"LAST",
};

#define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0)
/*
 * Default init level where all blocks are expected to be initialized. This is
 * the level of initialization expected by default and also after a full reset
 * of the device.
 */
struct amdgpu_init_level amdgpu_init_default = {
	.level = AMDGPU_INIT_LEVEL_DEFAULT,
	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

struct amdgpu_init_level amdgpu_init_recovery = {
	.level = AMDGPU_INIT_LEVEL_RESET_RECOVERY,
	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

/*
 * Minimal blocks needed to be initialized before a XGMI hive can be reset. This
 * is used for cases like reset on initialization where the entire hive needs to
 * be reset before first use.
 */
struct amdgpu_init_level amdgpu_init_minimal_xgmi = {
	.level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI,
	.hwini_ip_block_mask =
		BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) |
		BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) |
		BIT(AMD_IP_BLOCK_TYPE_PSP)
};

static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev,
					     enum amd_ip_block_type block)
{
	return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0;
}

void amdgpu_set_init_level(struct amdgpu_device *adev,
			   enum amdgpu_init_lvl_id lvl)
{
	switch (lvl) {
	case AMDGPU_INIT_LEVEL_MINIMAL_XGMI:
		adev->init_lvl = &amdgpu_init_minimal_xgmi;
		break;
	case AMDGPU_INIT_LEVEL_RESET_RECOVERY:
		adev->init_lvl = &amdgpu_init_recovery;
		break;
	case AMDGPU_INIT_LEVEL_DEFAULT:
		fallthrough;
	default:
		adev->init_lvl = &amdgpu_init_default;
		break;
	}
}

static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
				     void *data);

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
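 *
 * As a usage sketch (sysfs path layout assumed, card index may differ):
 *
 *   cat /sys/class/drm/card0/device/pcie_replay_count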
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return sysfs_emit(buf, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, 0444,
		   amdgpu_device_get_pcie_replay_count, NULL);

static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
					  struct bin_attribute *attr, char *buf,
					  loff_t ppos, size_t count)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	ssize_t bytes_read;

	switch (ppos) {
	case AMDGPU_SYS_REG_STATE_XGMI:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_WAFL:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_PCIE:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR_1:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count);
		break;
	default:
		return -EINVAL;
	}

	return bytes_read;
}

BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL,
	 AMDGPU_SYS_REG_STATE_END);

int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev)
{
	int ret;

	if (!amdgpu_asic_get_reg_state_supported(adev))
		return 0;

	ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state);

	return ret;
}

void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
{
	if (!amdgpu_asic_get_reg_state_supported(adev))
		return;
	sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
}

int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block)
{
	int r;

	if (ip_block->version->funcs->suspend) {
		r = ip_block->version->funcs->suspend(ip_block);
		if (r) {
			dev_err(ip_block->adev->dev,
				"suspend of IP block <%s> failed %d\n",
				ip_block->version->funcs->name, r);
			return r;
		}
	}

	ip_block->status.hw = false;
	return 0;
}

int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block)
{
	int r;

	if (ip_block->version->funcs->resume) {
		r = ip_block->version->funcs->resume(ip_block);
		if (r) {
			dev_err(ip_block->adev->dev,
				"resume of IP block <%s> failed %d\n",
				ip_block->version->funcs->name, r);
			return r;
		}
	}

	ip_block->status.hw = true;
	return 0;
}

/**
 * DOC: board_info
 *
 * The amdgpu driver provides a sysfs API for giving board related information.
 * It provides the form factor information in the format
 *
 *   type : form factor
 *
 * Possible form factor values
 *
 * - "cem"     - PCIE CEM card
 * - "oam"     - Open Compute Accelerator Module
 * - "unknown" - Not known
 *
 */

static ssize_t amdgpu_device_get_board_info(struct device *dev,
					    struct device_attribute *attr,
					    char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
	const char *pkg;

	if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
		pkg_type = adev->smuio.funcs->get_pkg_type(adev);

	switch (pkg_type) {
	case AMDGPU_PKG_TYPE_CEM:
		pkg = "cem";
		break;
	case AMDGPU_PKG_TYPE_OAM:
		pkg = "oam";
		break;
	default:
		pkg = "unknown";
		break;
	}

	return sysfs_emit(buf, "%s : %s\n", "type", pkg);
}

static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);

static struct attribute *amdgpu_board_attrs[] = {
	&dev_attr_board_info.attr,
	NULL,
};

static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
					     struct attribute *attr, int n)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	if (adev->flags & AMD_IS_APU)
		return 0;

	return attr->mode;
}

static const struct attribute_group amdgpu_board_attrs_group = {
	.attrs = amdgpu_board_attrs,
	.is_visible = amdgpu_board_attrs_is_visible
};

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);


/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE))
		return false;

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Return:
 * 1 if the device supports BACO;
 * 3 if the device supports MACO (only works if BACO is supported)
 * otherwise return 0.
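 *
 * The value is effectively a bit mask; callers such as
 * amdgpu_device_detect_runtime_pm_mode() below test it with expressions
 * like (bamaco_support & BACO_SUPPORT) and (bamaco_support & MACO_SUPPORT).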
 */
int amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
{
	struct drm_device *dev;
	int bamaco_support;

	dev = adev_to_drm(adev);

	adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
	bamaco_support = amdgpu_device_supports_baco(dev);

	switch (amdgpu_runtime_pm) {
	case 2:
		if (bamaco_support & MACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
			dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
		} else if (bamaco_support == BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Requested mode BAMACO not available, falling back to BACO\n");
		}
		break;
	case 1:
		if (bamaco_support & BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Forcing BACO for runtime pm\n");
		}
		break;
	case -1:
	case -2:
		if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
			dev_info(adev->dev, "Using ATPX for runtime pm\n");
		} else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
			dev_info(adev->dev, "Using BOCO for runtime pm\n");
		} else {
			if (!bamaco_support)
				goto no_runtime_pm;

			switch (adev->asic_type) {
			case CHIP_VEGA20:
			case CHIP_ARCTURUS:
				/* BACO is not supported on vega20 and arcturus */
				break;
			case CHIP_VEGA10:
				/* enable BACO as runpm mode if noretry=0 */
				if (!adev->gmc.noretry)
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			default:
				/* enable BACO as runpm mode on CI+ */
				adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			}

			if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
				if (bamaco_support & MACO_SUPPORT) {
					adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
					dev_info(adev->dev, "Using BAMACO for runtime pm\n");
				} else {
					dev_info(adev->dev, "Using BACO for runtime pm\n");
				}
			}
		}
		break;
	case 0:
		dev_info(adev->dev, "runtime pm is manually disabled\n");
		break;
	default:
		break;
	}

no_runtime_pm:
	if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
		dev_info(adev->dev, "Runtime PM not available\n");
}

/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * Smart Shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value means how many bytes have been transferred.
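 * For example, a request that extends past adev->gmc.visible_vram_size is
 * satisfied only up to that boundary; amdgpu_device_vram_access() below falls
 * back to MM_INDEX/MM_DATA access for the remaining bytes.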
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the VRAM aperture to access VRAM first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of VRAM */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}


/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
				uint32_t reg, uint32_t acc_flags,
				uint32_t xcc_id)
{
	uint32_t ret, rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	return ret;
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/**
 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 * @xcc_id: xcc accelerated compute core id
 *
 * this function is invoked only for the debugfs register access
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v,
			     uint32_t xcc_id)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_xcc_wreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t v,
			    uint32_t acc_flags, uint32_t xcc_id)
{
	uint32_t rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, true,
							 &rlcg_flag)) {
			amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u32 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
				    u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	if (unlikely(!adev->nbio.funcs)) {
		pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
		pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
	} else {
		pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
		pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	}

	if (reg_addr >> 32) {
		if (unlikely(!adev->nbio.funcs))
			pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
		else
			pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	} else {
		pcie_index_hi = 0;
	}

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
				      u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r |= ((u64)readl(pcie_data_offset) << 32);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
				     u64 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
				       u64 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_rev_id - query device rev_id
 *
 * @adev: amdgpu_device pointer
 *
 * Return device rev_id
 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
{
	return adev->nbio.funcs->get_rev_id(adev);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

static uint32_t amdgpu_device_get_vbios_flags(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
		return AMDGPU_VBIOS_SKIP;

	if (hweight32(adev->aid_mask) && amdgpu_passthrough(adev))
		return AMDGPU_VBIOS_OPTIONAL;

	return 0;
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	uint32_t flags;
	bool optional;
	int ret;

	amdgpu_asic_pre_asic_init(adev);
	flags = amdgpu_device_get_vbios_flags(adev);
	optional = !!(flags & (AMDGPU_VBIOS_OPTIONAL | AMDGPU_VBIOS_SKIP));

	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
		amdgpu_psp_wait_for_bootloader(adev);
		if (optional && !adev->bios)
			return 0;

		ret = amdgpu_atomfirmware_asic_init(adev, true);
		return ret;
	} else {
		if (optional && !adev->bios)
			return 0;

		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
	}

	return 0;
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long flags, offset;

	spin_lock_irqsave(&adev->wb.lock, flags);
	offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	unsigned long flags;

	wb >>= 3;
	spin_lock_irqsave(&adev->wb.lock, flags);
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
	spin_unlock_irqrestore(&adev->wb.lock, flags);
}

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned int i;
	u16 cmd;
	int r;

	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
		return 0;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* resizing on Dell G5 SE platforms causes problems with runtime pm */
	if ((amdgpu_runtime_pm != 0) &&
	    adev->pdev->vendor == PCI_VENDOR_ID_ATI &&
	    adev->pdev->device == 0x731f &&
	    adev->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL)
		return 0;

	/* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
	if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
		DRM_WARN("System can't access extended configuration space, please check!!\n");

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

/*
 * GPU helper functions.
 */
/**
 * amdgpu_device_need_post - check if the hw needs post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup,
 * or if post is needed because a hw reset was performed.
 * Returns true if post is needed or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg, flags;

	if (amdgpu_sriov_vf(adev))
		return false;

	flags = amdgpu_device_get_vbios_flags(adev);
	if (flags & AMDGPU_VBIOS_SKIP)
		return false;
	if ((flags & AMDGPU_VBIOS_OPTIONAL) && !adev->bios)
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: in the whole-GPU pass-through virtualization case, after a VM
		 * reboot some old smc fw still needs the driver to do vPost, otherwise the
		 * gpu hangs. smc fw versions 22.15 and newer don't have this flaw, so we
		 * force vPost for smc versions below 22.15.
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			release_firmware(adev->pm.fw);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/*
 * Check whether seamless boot is supported.
 *
 * So far we only support seamless boot on DCE 3.0 or later.
 * If users report that it works on older ASICS as well, we may
 * loosen this.
 */
bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
{
	switch (amdgpu_seamless) {
	case -1:
		break;
	case 1:
		return true;
	case 0:
		return false;
	default:
		DRM_ERROR("Invalid value for amdgpu.seamless: %d\n",
			  amdgpu_seamless);
		return false;
	}

	if (!(adev->flags & AMD_IS_APU))
		return false;

	if (adev->mman.keep_stolen_vga_memory)
		return false;

	return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
}

/*
 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
 * don't support dynamic speed switching. Until we have confirmation from Intel
 * that a specific host supports it, it's safer that we keep it disabled for all.
 *
 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
 */
static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	/* eGPU change speeds based on USB4 fabric conditions */
	if (dev_is_removable(adev->dev))
		return true;

	if (c->x86_vendor == X86_VENDOR_INTEL)
		return false;
#endif
	return true;
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
	switch (amdgpu_aspm) {
	case -1:
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	if (adev->flags & AMD_IS_APU)
		return false;
	if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK))
		return false;
	return pcie_aspm_enabled(adev->pdev);
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
		bool state)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));

	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines number of bits in page table versus page directory,
 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
 * page table and the remaining bits are in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory
	 */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}

/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
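 *
 * As an illustration (parameter value assumed), booting with
 * amdgpu.vm_size=256 would request a 256 GB per-VM address space; anything
 * below 1 GB is rejected by the check below and reset to the default (-1).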
1940 */ 1941 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1942 { 1943 /* no need to check the default value */ 1944 if (amdgpu_vm_size == -1) 1945 return; 1946 1947 if (amdgpu_vm_size < 1) { 1948 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1949 amdgpu_vm_size); 1950 amdgpu_vm_size = -1; 1951 } 1952 } 1953 1954 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1955 { 1956 struct sysinfo si; 1957 bool is_os_64 = (sizeof(void *) == 8); 1958 uint64_t total_memory; 1959 uint64_t dram_size_seven_GB = 0x1B8000000; 1960 uint64_t dram_size_three_GB = 0xB8000000; 1961 1962 if (amdgpu_smu_memory_pool_size == 0) 1963 return; 1964 1965 if (!is_os_64) { 1966 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1967 goto def_value; 1968 } 1969 si_meminfo(&si); 1970 total_memory = (uint64_t)si.totalram * si.mem_unit; 1971 1972 if ((amdgpu_smu_memory_pool_size == 1) || 1973 (amdgpu_smu_memory_pool_size == 2)) { 1974 if (total_memory < dram_size_three_GB) 1975 goto def_value1; 1976 } else if ((amdgpu_smu_memory_pool_size == 4) || 1977 (amdgpu_smu_memory_pool_size == 8)) { 1978 if (total_memory < dram_size_seven_GB) 1979 goto def_value1; 1980 } else { 1981 DRM_WARN("Smu memory pool size not supported\n"); 1982 goto def_value; 1983 } 1984 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1985 1986 return; 1987 1988 def_value1: 1989 DRM_WARN("No enough system memory\n"); 1990 def_value: 1991 adev->pm.smu_prv_buffer_size = 0; 1992 } 1993 1994 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 1995 { 1996 if (!(adev->flags & AMD_IS_APU) || 1997 adev->asic_type < CHIP_RAVEN) 1998 return 0; 1999 2000 switch (adev->asic_type) { 2001 case CHIP_RAVEN: 2002 if (adev->pdev->device == 0x15dd) 2003 adev->apu_flags |= AMD_APU_IS_RAVEN; 2004 if (adev->pdev->device == 0x15d8) 2005 adev->apu_flags |= AMD_APU_IS_PICASSO; 2006 break; 2007 case CHIP_RENOIR: 2008 if ((adev->pdev->device == 0x1636) || 2009 (adev->pdev->device == 0x164c)) 2010 adev->apu_flags |= AMD_APU_IS_RENOIR; 2011 else 2012 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 2013 break; 2014 case CHIP_VANGOGH: 2015 adev->apu_flags |= AMD_APU_IS_VANGOGH; 2016 break; 2017 case CHIP_YELLOW_CARP: 2018 break; 2019 case CHIP_CYAN_SKILLFISH: 2020 if ((adev->pdev->device == 0x13FE) || 2021 (adev->pdev->device == 0x143F)) 2022 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 2023 break; 2024 default: 2025 break; 2026 } 2027 2028 return 0; 2029 } 2030 2031 /** 2032 * amdgpu_device_check_arguments - validate module params 2033 * 2034 * @adev: amdgpu_device pointer 2035 * 2036 * Validates certain module parameters and updates 2037 * the associated values used by the driver (all asics). 
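 *
 * Illustrative example (assuming the usual amdgpu.sched_jobs parameter
 * name): amdgpu.sched_jobs=6 is not a power of two and is rounded up to 8
 * by the checks below, while amdgpu.sched_jobs=2 is bumped to the minimum
 * of 4.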
2038 */ 2039 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 2040 { 2041 int i; 2042 2043 if (amdgpu_sched_jobs < 4) { 2044 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 2045 amdgpu_sched_jobs); 2046 amdgpu_sched_jobs = 4; 2047 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 2048 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 2049 amdgpu_sched_jobs); 2050 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 2051 } 2052 2053 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 2054 /* gart size must be greater or equal to 32M */ 2055 dev_warn(adev->dev, "gart size (%d) too small\n", 2056 amdgpu_gart_size); 2057 amdgpu_gart_size = -1; 2058 } 2059 2060 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 2061 /* gtt size must be greater or equal to 32M */ 2062 dev_warn(adev->dev, "gtt size (%d) too small\n", 2063 amdgpu_gtt_size); 2064 amdgpu_gtt_size = -1; 2065 } 2066 2067 /* valid range is between 4 and 9 inclusive */ 2068 if (amdgpu_vm_fragment_size != -1 && 2069 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 2070 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 2071 amdgpu_vm_fragment_size = -1; 2072 } 2073 2074 if (amdgpu_sched_hw_submission < 2) { 2075 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 2076 amdgpu_sched_hw_submission); 2077 amdgpu_sched_hw_submission = 2; 2078 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 2079 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 2080 amdgpu_sched_hw_submission); 2081 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 2082 } 2083 2084 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 2085 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 2086 amdgpu_reset_method = -1; 2087 } 2088 2089 amdgpu_device_check_smu_prv_buffer_size(adev); 2090 2091 amdgpu_device_check_vm_size(adev); 2092 2093 amdgpu_device_check_block_size(adev); 2094 2095 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 2096 2097 for (i = 0; i < MAX_XCP; i++) 2098 adev->enforce_isolation[i] = !!enforce_isolation; 2099 2100 return 0; 2101 } 2102 2103 /** 2104 * amdgpu_switcheroo_set_state - set switcheroo state 2105 * 2106 * @pdev: pci dev pointer 2107 * @state: vga_switcheroo state 2108 * 2109 * Callback for the switcheroo driver. Suspends or resumes 2110 * the asics before or after it is powered up using ACPI methods. 
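 *
 * In short (see the body below): switching ON puts the PCI device back in
 * D0, restores its saved PCI state and resumes the driver, while switching
 * OFF suspends the driver, caches the PCI state and drops the device to
 * D3cold.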
2111 */ 2112 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 2113 enum vga_switcheroo_state state) 2114 { 2115 struct drm_device *dev = pci_get_drvdata(pdev); 2116 int r; 2117 2118 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 2119 return; 2120 2121 if (state == VGA_SWITCHEROO_ON) { 2122 pr_info("switched on\n"); 2123 /* don't suspend or resume card normally */ 2124 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2125 2126 pci_set_power_state(pdev, PCI_D0); 2127 amdgpu_device_load_pci_state(pdev); 2128 r = pci_enable_device(pdev); 2129 if (r) 2130 DRM_WARN("pci_enable_device failed (%d)\n", r); 2131 amdgpu_device_resume(dev, true); 2132 2133 dev->switch_power_state = DRM_SWITCH_POWER_ON; 2134 } else { 2135 pr_info("switched off\n"); 2136 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2137 amdgpu_device_prepare(dev); 2138 amdgpu_device_suspend(dev, true); 2139 amdgpu_device_cache_pci_state(pdev); 2140 /* Shut down the device */ 2141 pci_disable_device(pdev); 2142 pci_set_power_state(pdev, PCI_D3cold); 2143 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 2144 } 2145 } 2146 2147 /** 2148 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 2149 * 2150 * @pdev: pci dev pointer 2151 * 2152 * Callback for the switcheroo driver. Check of the switcheroo 2153 * state can be changed. 2154 * Returns true if the state can be changed, false if not. 2155 */ 2156 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 2157 { 2158 struct drm_device *dev = pci_get_drvdata(pdev); 2159 2160 /* 2161 * FIXME: open_count is protected by drm_global_mutex but that would lead to 2162 * locking inversion with the driver load path. And the access here is 2163 * completely racy anyway. So don't bother with locking for now. 2164 */ 2165 return atomic_read(&dev->open_count) == 0; 2166 } 2167 2168 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 2169 .set_gpu_state = amdgpu_switcheroo_set_state, 2170 .reprobe = NULL, 2171 .can_switch = amdgpu_switcheroo_can_switch, 2172 }; 2173 2174 /** 2175 * amdgpu_device_ip_set_clockgating_state - set the CG state 2176 * 2177 * @dev: amdgpu_device pointer 2178 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2179 * @state: clockgating state (gate or ungate) 2180 * 2181 * Sets the requested clockgating state for all instances of 2182 * the hardware IP specified. 2183 * Returns the error code from the last instance. 2184 */ 2185 int amdgpu_device_ip_set_clockgating_state(void *dev, 2186 enum amd_ip_block_type block_type, 2187 enum amd_clockgating_state state) 2188 { 2189 struct amdgpu_device *adev = dev; 2190 int i, r = 0; 2191 2192 for (i = 0; i < adev->num_ip_blocks; i++) { 2193 if (!adev->ip_blocks[i].status.valid) 2194 continue; 2195 if (adev->ip_blocks[i].version->type != block_type) 2196 continue; 2197 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 2198 continue; 2199 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 2200 &adev->ip_blocks[i], state); 2201 if (r) 2202 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 2203 adev->ip_blocks[i].version->funcs->name, r); 2204 } 2205 return r; 2206 } 2207 2208 /** 2209 * amdgpu_device_ip_set_powergating_state - set the PG state 2210 * 2211 * @dev: amdgpu_device pointer 2212 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2213 * @state: powergating state (gate or ungate) 2214 * 2215 * Sets the requested powergating state for all instances of 2216 * the hardware IP specified. 
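 * An illustrative call that gates every VCN instance (sketch only, not
 * taken from a specific caller):
 *
 *   r = amdgpu_device_ip_set_powergating_state(adev,
 *                                              AMD_IP_BLOCK_TYPE_VCN,
 *                                              AMD_PG_STATE_GATE);
 *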
2217 * Returns the error code from the last instance. 2218 */ 2219 int amdgpu_device_ip_set_powergating_state(void *dev, 2220 enum amd_ip_block_type block_type, 2221 enum amd_powergating_state state) 2222 { 2223 struct amdgpu_device *adev = dev; 2224 int i, r = 0; 2225 2226 for (i = 0; i < adev->num_ip_blocks; i++) { 2227 if (!adev->ip_blocks[i].status.valid) 2228 continue; 2229 if (adev->ip_blocks[i].version->type != block_type) 2230 continue; 2231 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 2232 continue; 2233 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 2234 &adev->ip_blocks[i], state); 2235 if (r) 2236 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 2237 adev->ip_blocks[i].version->funcs->name, r); 2238 } 2239 return r; 2240 } 2241 2242 /** 2243 * amdgpu_device_ip_get_clockgating_state - get the CG state 2244 * 2245 * @adev: amdgpu_device pointer 2246 * @flags: clockgating feature flags 2247 * 2248 * Walks the list of IPs on the device and updates the clockgating 2249 * flags for each IP. 2250 * Updates @flags with the feature flags for each hardware IP where 2251 * clockgating is enabled. 2252 */ 2253 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 2254 u64 *flags) 2255 { 2256 int i; 2257 2258 for (i = 0; i < adev->num_ip_blocks; i++) { 2259 if (!adev->ip_blocks[i].status.valid) 2260 continue; 2261 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 2262 adev->ip_blocks[i].version->funcs->get_clockgating_state( 2263 &adev->ip_blocks[i], flags); 2264 } 2265 } 2266 2267 /** 2268 * amdgpu_device_ip_wait_for_idle - wait for idle 2269 * 2270 * @adev: amdgpu_device pointer 2271 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2272 * 2273 * Waits for the request hardware IP to be idle. 2274 * Returns 0 for success or a negative error code on failure. 2275 */ 2276 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 2277 enum amd_ip_block_type block_type) 2278 { 2279 int i, r; 2280 2281 for (i = 0; i < adev->num_ip_blocks; i++) { 2282 if (!adev->ip_blocks[i].status.valid) 2283 continue; 2284 if (adev->ip_blocks[i].version->type == block_type) { 2285 if (adev->ip_blocks[i].version->funcs->wait_for_idle) { 2286 r = adev->ip_blocks[i].version->funcs->wait_for_idle( 2287 &adev->ip_blocks[i]); 2288 if (r) 2289 return r; 2290 } 2291 break; 2292 } 2293 } 2294 return 0; 2295 2296 } 2297 2298 /** 2299 * amdgpu_device_ip_is_valid - is the hardware IP enabled 2300 * 2301 * @adev: amdgpu_device pointer 2302 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2303 * 2304 * Check if the hardware IP is enable or not. 2305 * Returns true if it the IP is enable, false if not. 2306 */ 2307 bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev, 2308 enum amd_ip_block_type block_type) 2309 { 2310 int i; 2311 2312 for (i = 0; i < adev->num_ip_blocks; i++) { 2313 if (adev->ip_blocks[i].version->type == block_type) 2314 return adev->ip_blocks[i].status.valid; 2315 } 2316 return false; 2317 2318 } 2319 2320 /** 2321 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 2322 * 2323 * @adev: amdgpu_device pointer 2324 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 2325 * 2326 * Returns a pointer to the hardware IP block structure 2327 * if it exists for the asic, otherwise NULL. 
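 *
 * Typical lookup pattern (sketch, mirroring the GFX probe in
 * amdgpu_device_ip_early_init() below):
 *
 *   struct amdgpu_ip_block *ip_block =
 *           amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *
 *   if (ip_block && ip_block->status.valid)
 *           amdgpu_amdkfd_device_probe(adev);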
2328 */ 2329 struct amdgpu_ip_block * 2330 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 2331 enum amd_ip_block_type type) 2332 { 2333 int i; 2334 2335 for (i = 0; i < adev->num_ip_blocks; i++) 2336 if (adev->ip_blocks[i].version->type == type) 2337 return &adev->ip_blocks[i]; 2338 2339 return NULL; 2340 } 2341 2342 /** 2343 * amdgpu_device_ip_block_version_cmp 2344 * 2345 * @adev: amdgpu_device pointer 2346 * @type: enum amd_ip_block_type 2347 * @major: major version 2348 * @minor: minor version 2349 * 2350 * return 0 if equal or greater 2351 * return 1 if smaller or the ip_block doesn't exist 2352 */ 2353 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 2354 enum amd_ip_block_type type, 2355 u32 major, u32 minor) 2356 { 2357 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 2358 2359 if (ip_block && ((ip_block->version->major > major) || 2360 ((ip_block->version->major == major) && 2361 (ip_block->version->minor >= minor)))) 2362 return 0; 2363 2364 return 1; 2365 } 2366 2367 /** 2368 * amdgpu_device_ip_block_add 2369 * 2370 * @adev: amdgpu_device pointer 2371 * @ip_block_version: pointer to the IP to add 2372 * 2373 * Adds the IP block driver information to the collection of IPs 2374 * on the asic. 2375 */ 2376 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 2377 const struct amdgpu_ip_block_version *ip_block_version) 2378 { 2379 if (!ip_block_version) 2380 return -EINVAL; 2381 2382 switch (ip_block_version->type) { 2383 case AMD_IP_BLOCK_TYPE_VCN: 2384 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2385 return 0; 2386 break; 2387 case AMD_IP_BLOCK_TYPE_JPEG: 2388 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2389 return 0; 2390 break; 2391 default: 2392 break; 2393 } 2394 2395 dev_info(adev->dev, "detected ip block number %d <%s>\n", 2396 adev->num_ip_blocks, ip_block_version->funcs->name); 2397 2398 adev->ip_blocks[adev->num_ip_blocks].adev = adev; 2399 2400 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2401 2402 return 0; 2403 } 2404 2405 /** 2406 * amdgpu_device_enable_virtual_display - enable virtual display feature 2407 * 2408 * @adev: amdgpu_device pointer 2409 * 2410 * Enabled the virtual display feature if the user has enabled it via 2411 * the module parameter virtual_display. This feature provides a virtual 2412 * display hardware on headless boards or in virtualized environments. 2413 * This function parses and validates the configuration string specified by 2414 * the user and configures the virtual display configuration (number of 2415 * virtual connectors, crtcs, etc.) specified. 
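 *
 * The string is a ';'-separated list of entries, each a PCI address (or
 * "all") optionally followed by ",<num_crtc>". For example (illustrative
 * address), amdgpu.virtual_display=0000:01:00.0,2 enables two virtual CRTCs
 * on that device only; the CRTC count is clamped to the 1..6 range and
 * defaults to 1 when omitted or malformed.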
2416 */ 2417 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2418 { 2419 adev->enable_virtual_display = false; 2420 2421 if (amdgpu_virtual_display) { 2422 const char *pci_address_name = pci_name(adev->pdev); 2423 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2424 2425 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2426 pciaddstr_tmp = pciaddstr; 2427 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2428 pciaddname = strsep(&pciaddname_tmp, ","); 2429 if (!strcmp("all", pciaddname) 2430 || !strcmp(pci_address_name, pciaddname)) { 2431 long num_crtc; 2432 int res = -1; 2433 2434 adev->enable_virtual_display = true; 2435 2436 if (pciaddname_tmp) 2437 res = kstrtol(pciaddname_tmp, 10, 2438 &num_crtc); 2439 2440 if (!res) { 2441 if (num_crtc < 1) 2442 num_crtc = 1; 2443 if (num_crtc > 6) 2444 num_crtc = 6; 2445 adev->mode_info.num_crtc = num_crtc; 2446 } else { 2447 adev->mode_info.num_crtc = 1; 2448 } 2449 break; 2450 } 2451 } 2452 2453 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2454 amdgpu_virtual_display, pci_address_name, 2455 adev->enable_virtual_display, adev->mode_info.num_crtc); 2456 2457 kfree(pciaddstr); 2458 } 2459 } 2460 2461 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2462 { 2463 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2464 adev->mode_info.num_crtc = 1; 2465 adev->enable_virtual_display = true; 2466 DRM_INFO("virtual_display:%d, num_crtc:%d\n", 2467 adev->enable_virtual_display, adev->mode_info.num_crtc); 2468 } 2469 } 2470 2471 /** 2472 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2473 * 2474 * @adev: amdgpu_device pointer 2475 * 2476 * Parses the asic configuration parameters specified in the gpu info 2477 * firmware and makes them available to the driver for use in configuring 2478 * the asic. 2479 * Returns 0 on success, -EINVAL on failure. 2480 */ 2481 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2482 { 2483 const char *chip_name; 2484 int err; 2485 const struct gpu_info_firmware_header_v1_0 *hdr; 2486 2487 adev->firmware.gpu_info_fw = NULL; 2488 2489 if (adev->mman.discovery_bin) 2490 return 0; 2491 2492 switch (adev->asic_type) { 2493 default: 2494 return 0; 2495 case CHIP_VEGA10: 2496 chip_name = "vega10"; 2497 break; 2498 case CHIP_VEGA12: 2499 chip_name = "vega12"; 2500 break; 2501 case CHIP_RAVEN: 2502 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2503 chip_name = "raven2"; 2504 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2505 chip_name = "picasso"; 2506 else 2507 chip_name = "raven"; 2508 break; 2509 case CHIP_ARCTURUS: 2510 chip_name = "arcturus"; 2511 break; 2512 case CHIP_NAVI12: 2513 chip_name = "navi12"; 2514 break; 2515 } 2516 2517 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, 2518 AMDGPU_UCODE_OPTIONAL, 2519 "amdgpu/%s_gpu_info.bin", chip_name); 2520 if (err) { 2521 dev_err(adev->dev, 2522 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n", 2523 chip_name); 2524 goto out; 2525 } 2526 2527 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2528 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2529 2530 switch (hdr->version_major) { 2531 case 1: 2532 { 2533 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2534 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2535 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2536 2537 /* 2538 * Should be dropped when DAL no longer needs it. 
2539 */
2540 if (adev->asic_type == CHIP_NAVI12)
2541 goto parse_soc_bounding_box;
2542 
2543 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
2544 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
2545 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
2546 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
2547 adev->gfx.config.max_texture_channel_caches =
2548 le32_to_cpu(gpu_info_fw->gc_num_tccs);
2549 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
2550 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
2551 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
2552 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
2553 adev->gfx.config.double_offchip_lds_buf =
2554 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
2555 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
2556 adev->gfx.cu_info.max_waves_per_simd =
2557 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
2558 adev->gfx.cu_info.max_scratch_slots_per_cu =
2559 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
2560 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
2561 if (hdr->version_minor >= 1) {
2562 const struct gpu_info_firmware_v1_1 *gpu_info_fw =
2563 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
2564 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2565 adev->gfx.config.num_sc_per_sh =
2566 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
2567 adev->gfx.config.num_packer_per_sc =
2568 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
2569 }
2570 
2571 parse_soc_bounding_box:
2572 /*
2573 * soc bounding box info is not integrated into the discovery table,
2574 * so we always need to parse it from gpu info firmware if needed.
2575 */
2576 if (hdr->version_minor == 2) {
2577 const struct gpu_info_firmware_v1_2 *gpu_info_fw =
2578 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
2579 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2580 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
2581 }
2582 break;
2583 }
2584 default:
2585 dev_err(adev->dev,
2586 "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2587 err = -EINVAL;
2588 goto out;
2589 }
2590 out:
2591 return err;
2592 }
2593 
2594 /**
2595 * amdgpu_device_ip_early_init - run early init for hardware IPs
2596 *
2597 * @adev: amdgpu_device pointer
2598 *
2599 * Early initialization pass for hardware IPs. The hardware IPs that make
2600 * up each asic are discovered and each IP's early_init callback is run. This
2601 * is the first stage in initializing the asic.
2602 * Returns 0 on success, negative error code on failure.
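 *
 * Note (summary of the code below): IP blocks masked off via the
 * amdgpu_ip_block_mask module parameter are marked invalid here and skipped
 * by every later init/fini phase, and the VBIOS is fetched and ATOM BIOS
 * support set up as soon as the COMMON block has registered the asic
 * functions.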
2603 */ 2604 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2605 { 2606 struct amdgpu_ip_block *ip_block; 2607 struct pci_dev *parent; 2608 bool total, skip_bios; 2609 uint32_t bios_flags; 2610 int i, r; 2611 2612 amdgpu_device_enable_virtual_display(adev); 2613 2614 if (amdgpu_sriov_vf(adev)) { 2615 r = amdgpu_virt_request_full_gpu(adev, true); 2616 if (r) 2617 return r; 2618 } 2619 2620 switch (adev->asic_type) { 2621 #ifdef CONFIG_DRM_AMDGPU_SI 2622 case CHIP_VERDE: 2623 case CHIP_TAHITI: 2624 case CHIP_PITCAIRN: 2625 case CHIP_OLAND: 2626 case CHIP_HAINAN: 2627 adev->family = AMDGPU_FAMILY_SI; 2628 r = si_set_ip_blocks(adev); 2629 if (r) 2630 return r; 2631 break; 2632 #endif 2633 #ifdef CONFIG_DRM_AMDGPU_CIK 2634 case CHIP_BONAIRE: 2635 case CHIP_HAWAII: 2636 case CHIP_KAVERI: 2637 case CHIP_KABINI: 2638 case CHIP_MULLINS: 2639 if (adev->flags & AMD_IS_APU) 2640 adev->family = AMDGPU_FAMILY_KV; 2641 else 2642 adev->family = AMDGPU_FAMILY_CI; 2643 2644 r = cik_set_ip_blocks(adev); 2645 if (r) 2646 return r; 2647 break; 2648 #endif 2649 case CHIP_TOPAZ: 2650 case CHIP_TONGA: 2651 case CHIP_FIJI: 2652 case CHIP_POLARIS10: 2653 case CHIP_POLARIS11: 2654 case CHIP_POLARIS12: 2655 case CHIP_VEGAM: 2656 case CHIP_CARRIZO: 2657 case CHIP_STONEY: 2658 if (adev->flags & AMD_IS_APU) 2659 adev->family = AMDGPU_FAMILY_CZ; 2660 else 2661 adev->family = AMDGPU_FAMILY_VI; 2662 2663 r = vi_set_ip_blocks(adev); 2664 if (r) 2665 return r; 2666 break; 2667 default: 2668 r = amdgpu_discovery_set_ip_blocks(adev); 2669 if (r) 2670 return r; 2671 break; 2672 } 2673 2674 if (amdgpu_has_atpx() && 2675 (amdgpu_is_atpx_hybrid() || 2676 amdgpu_has_atpx_dgpu_power_cntl()) && 2677 ((adev->flags & AMD_IS_APU) == 0) && 2678 !dev_is_removable(&adev->pdev->dev)) 2679 adev->flags |= AMD_IS_PX; 2680 2681 if (!(adev->flags & AMD_IS_APU)) { 2682 parent = pcie_find_root_port(adev->pdev); 2683 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2684 } 2685 2686 2687 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2688 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2689 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2690 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2691 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2692 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2693 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2694 2695 total = true; 2696 for (i = 0; i < adev->num_ip_blocks; i++) { 2697 ip_block = &adev->ip_blocks[i]; 2698 2699 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2700 DRM_WARN("disabled ip block: %d <%s>\n", 2701 i, adev->ip_blocks[i].version->funcs->name); 2702 adev->ip_blocks[i].status.valid = false; 2703 } else if (ip_block->version->funcs->early_init) { 2704 r = ip_block->version->funcs->early_init(ip_block); 2705 if (r == -ENOENT) { 2706 adev->ip_blocks[i].status.valid = false; 2707 } else if (r) { 2708 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2709 adev->ip_blocks[i].version->funcs->name, r); 2710 total = false; 2711 } else { 2712 adev->ip_blocks[i].status.valid = true; 2713 } 2714 } else { 2715 adev->ip_blocks[i].status.valid = true; 2716 } 2717 /* get the vbios after the asic_funcs are set up */ 2718 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2719 r = amdgpu_device_parse_gpu_info_fw(adev); 2720 if (r) 2721 return r; 2722 2723 bios_flags = amdgpu_device_get_vbios_flags(adev); 2724 skip_bios = !!(bios_flags & AMDGPU_VBIOS_SKIP); 2725 /* Read BIOS */ 2726 if (!skip_bios) { 2727 bool optional = 2728 !!(bios_flags & AMDGPU_VBIOS_OPTIONAL); 2729 if (!amdgpu_get_bios(adev) && !optional) 2730 return -EINVAL; 2731 2732 if (optional && !adev->bios) 2733 dev_info( 2734 adev->dev, 2735 "VBIOS image optional, proceeding without VBIOS image"); 2736 2737 if (adev->bios) { 2738 r = amdgpu_atombios_init(adev); 2739 if (r) { 2740 dev_err(adev->dev, 2741 "amdgpu_atombios_init failed\n"); 2742 amdgpu_vf_error_put( 2743 adev, 2744 AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 2745 0, 0); 2746 return r; 2747 } 2748 } 2749 } 2750 2751 /*get pf2vf msg info at it's earliest time*/ 2752 if (amdgpu_sriov_vf(adev)) 2753 amdgpu_virt_init_data_exchange(adev); 2754 2755 } 2756 } 2757 if (!total) 2758 return -ENODEV; 2759 2760 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 2761 if (ip_block->status.valid != false) 2762 amdgpu_amdkfd_device_probe(adev); 2763 2764 adev->cg_flags &= amdgpu_cg_mask; 2765 adev->pg_flags &= amdgpu_pg_mask; 2766 2767 return 0; 2768 } 2769 2770 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2771 { 2772 int i, r; 2773 2774 for (i = 0; i < adev->num_ip_blocks; i++) { 2775 if (!adev->ip_blocks[i].status.sw) 2776 continue; 2777 if (adev->ip_blocks[i].status.hw) 2778 continue; 2779 if (!amdgpu_ip_member_of_hwini( 2780 adev, adev->ip_blocks[i].version->type)) 2781 continue; 2782 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2783 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2784 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2785 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2786 if (r) { 2787 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2788 adev->ip_blocks[i].version->funcs->name, r); 2789 return r; 2790 } 2791 adev->ip_blocks[i].status.hw = true; 2792 } 2793 } 2794 2795 return 0; 2796 } 2797 2798 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device 
*adev) 2799 { 2800 int i, r; 2801 2802 for (i = 0; i < adev->num_ip_blocks; i++) { 2803 if (!adev->ip_blocks[i].status.sw) 2804 continue; 2805 if (adev->ip_blocks[i].status.hw) 2806 continue; 2807 if (!amdgpu_ip_member_of_hwini( 2808 adev, adev->ip_blocks[i].version->type)) 2809 continue; 2810 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2811 if (r) { 2812 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2813 adev->ip_blocks[i].version->funcs->name, r); 2814 return r; 2815 } 2816 adev->ip_blocks[i].status.hw = true; 2817 } 2818 2819 return 0; 2820 } 2821 2822 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2823 { 2824 int r = 0; 2825 int i; 2826 uint32_t smu_version; 2827 2828 if (adev->asic_type >= CHIP_VEGA10) { 2829 for (i = 0; i < adev->num_ip_blocks; i++) { 2830 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2831 continue; 2832 2833 if (!amdgpu_ip_member_of_hwini(adev, 2834 AMD_IP_BLOCK_TYPE_PSP)) 2835 break; 2836 2837 if (!adev->ip_blocks[i].status.sw) 2838 continue; 2839 2840 /* no need to do the fw loading again if already done*/ 2841 if (adev->ip_blocks[i].status.hw == true) 2842 break; 2843 2844 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2845 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 2846 if (r) 2847 return r; 2848 } else { 2849 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2850 if (r) { 2851 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2852 adev->ip_blocks[i].version->funcs->name, r); 2853 return r; 2854 } 2855 adev->ip_blocks[i].status.hw = true; 2856 } 2857 break; 2858 } 2859 } 2860 2861 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2862 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2863 2864 return r; 2865 } 2866 2867 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2868 { 2869 struct drm_sched_init_args args = { 2870 .ops = &amdgpu_sched_ops, 2871 .num_rqs = DRM_SCHED_PRIORITY_COUNT, 2872 .timeout_wq = adev->reset_domain->wq, 2873 .dev = adev->dev, 2874 }; 2875 long timeout; 2876 int r, i; 2877 2878 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2879 struct amdgpu_ring *ring = adev->rings[i]; 2880 2881 /* No need to setup the GPU scheduler for rings that don't need it */ 2882 if (!ring || ring->no_scheduler) 2883 continue; 2884 2885 switch (ring->funcs->type) { 2886 case AMDGPU_RING_TYPE_GFX: 2887 timeout = adev->gfx_timeout; 2888 break; 2889 case AMDGPU_RING_TYPE_COMPUTE: 2890 timeout = adev->compute_timeout; 2891 break; 2892 case AMDGPU_RING_TYPE_SDMA: 2893 timeout = adev->sdma_timeout; 2894 break; 2895 default: 2896 timeout = adev->video_timeout; 2897 break; 2898 } 2899 2900 args.timeout = timeout; 2901 args.credit_limit = ring->num_hw_submission; 2902 args.score = ring->sched_score; 2903 args.name = ring->name; 2904 2905 r = drm_sched_init(&ring->sched, &args); 2906 if (r) { 2907 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2908 ring->name); 2909 return r; 2910 } 2911 r = amdgpu_uvd_entity_init(adev, ring); 2912 if (r) { 2913 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n", 2914 ring->name); 2915 return r; 2916 } 2917 r = amdgpu_vce_entity_init(adev, ring); 2918 if (r) { 2919 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n", 2920 ring->name); 2921 return r; 2922 } 2923 } 2924 2925 amdgpu_xcp_update_partition_sched_list(adev); 2926 2927 return 0; 2928 } 2929 2930 2931 /** 2932 * amdgpu_device_ip_init - run init for hardware IPs 2933 * 2934 * @adev: amdgpu_device pointer 2935 * 2936 * Main 
initialization pass for hardware IPs. The list of all the hardware 2937 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2938 * are run. sw_init initializes the software state associated with each IP 2939 * and hw_init initializes the hardware associated with each IP. 2940 * Returns 0 on success, negative error code on failure. 2941 */ 2942 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2943 { 2944 bool init_badpage; 2945 int i, r; 2946 2947 r = amdgpu_ras_init(adev); 2948 if (r) 2949 return r; 2950 2951 for (i = 0; i < adev->num_ip_blocks; i++) { 2952 if (!adev->ip_blocks[i].status.valid) 2953 continue; 2954 if (adev->ip_blocks[i].version->funcs->sw_init) { 2955 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]); 2956 if (r) { 2957 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2958 adev->ip_blocks[i].version->funcs->name, r); 2959 goto init_failed; 2960 } 2961 } 2962 adev->ip_blocks[i].status.sw = true; 2963 2964 if (!amdgpu_ip_member_of_hwini( 2965 adev, adev->ip_blocks[i].version->type)) 2966 continue; 2967 2968 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2969 /* need to do common hw init early so everything is set up for gmc */ 2970 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2971 if (r) { 2972 DRM_ERROR("hw_init %d failed %d\n", i, r); 2973 goto init_failed; 2974 } 2975 adev->ip_blocks[i].status.hw = true; 2976 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2977 /* need to do gmc hw init early so we can allocate gpu mem */ 2978 /* Try to reserve bad pages early */ 2979 if (amdgpu_sriov_vf(adev)) 2980 amdgpu_virt_exchange_data(adev); 2981 2982 r = amdgpu_device_mem_scratch_init(adev); 2983 if (r) { 2984 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2985 goto init_failed; 2986 } 2987 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2988 if (r) { 2989 DRM_ERROR("hw_init %d failed %d\n", i, r); 2990 goto init_failed; 2991 } 2992 r = amdgpu_device_wb_init(adev); 2993 if (r) { 2994 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2995 goto init_failed; 2996 } 2997 adev->ip_blocks[i].status.hw = true; 2998 2999 /* right after GMC hw init, we create CSA */ 3000 if (adev->gfx.mcbp) { 3001 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 3002 AMDGPU_GEM_DOMAIN_VRAM | 3003 AMDGPU_GEM_DOMAIN_GTT, 3004 AMDGPU_CSA_SIZE); 3005 if (r) { 3006 DRM_ERROR("allocate CSA failed %d\n", r); 3007 goto init_failed; 3008 } 3009 } 3010 3011 r = amdgpu_seq64_init(adev); 3012 if (r) { 3013 DRM_ERROR("allocate seq64 failed %d\n", r); 3014 goto init_failed; 3015 } 3016 } 3017 } 3018 3019 if (amdgpu_sriov_vf(adev)) 3020 amdgpu_virt_init_data_exchange(adev); 3021 3022 r = amdgpu_ib_pool_init(adev); 3023 if (r) { 3024 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 3025 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 3026 goto init_failed; 3027 } 3028 3029 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 3030 if (r) 3031 goto init_failed; 3032 3033 r = amdgpu_device_ip_hw_init_phase1(adev); 3034 if (r) 3035 goto init_failed; 3036 3037 r = amdgpu_device_fw_loading(adev); 3038 if (r) 3039 goto init_failed; 3040 3041 r = amdgpu_device_ip_hw_init_phase2(adev); 3042 if (r) 3043 goto init_failed; 3044 3045 /* 3046 * retired pages will be loaded from eeprom and reserved here, 3047 * it should be called after amdgpu_device_ip_hw_init_phase2 since 3048 * for some ASICs the RAS EEPROM code relies on SMU 
fully functioning 3049 * for I2C communication which only true at this point. 3050 * 3051 * amdgpu_ras_recovery_init may fail, but the upper only cares the 3052 * failure from bad gpu situation and stop amdgpu init process 3053 * accordingly. For other failed cases, it will still release all 3054 * the resource and print error message, rather than returning one 3055 * negative value to upper level. 3056 * 3057 * Note: theoretically, this should be called before all vram allocations 3058 * to protect retired page from abusing 3059 */ 3060 init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 3061 r = amdgpu_ras_recovery_init(adev, init_badpage); 3062 if (r) 3063 goto init_failed; 3064 3065 /** 3066 * In case of XGMI grab extra reference for reset domain for this device 3067 */ 3068 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3069 if (amdgpu_xgmi_add_device(adev) == 0) { 3070 if (!amdgpu_sriov_vf(adev)) { 3071 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3072 3073 if (WARN_ON(!hive)) { 3074 r = -ENOENT; 3075 goto init_failed; 3076 } 3077 3078 if (!hive->reset_domain || 3079 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 3080 r = -ENOENT; 3081 amdgpu_put_xgmi_hive(hive); 3082 goto init_failed; 3083 } 3084 3085 /* Drop the early temporary reset domain we created for device */ 3086 amdgpu_reset_put_reset_domain(adev->reset_domain); 3087 adev->reset_domain = hive->reset_domain; 3088 amdgpu_put_xgmi_hive(hive); 3089 } 3090 } 3091 } 3092 3093 r = amdgpu_device_init_schedulers(adev); 3094 if (r) 3095 goto init_failed; 3096 3097 if (adev->mman.buffer_funcs_ring->sched.ready) 3098 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3099 3100 /* Don't init kfd if whole hive need to be reset during init */ 3101 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 3102 kgd2kfd_init_zone_device(adev); 3103 amdgpu_amdkfd_device_init(adev); 3104 } 3105 3106 amdgpu_fru_get_product_info(adev); 3107 3108 if (!amdgpu_sriov_vf(adev) || amdgpu_sriov_ras_cper_en(adev)) 3109 r = amdgpu_cper_init(adev); 3110 3111 init_failed: 3112 3113 return r; 3114 } 3115 3116 /** 3117 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 3118 * 3119 * @adev: amdgpu_device pointer 3120 * 3121 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 3122 * this function before a GPU reset. If the value is retained after a 3123 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 3124 */ 3125 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 3126 { 3127 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 3128 } 3129 3130 /** 3131 * amdgpu_device_check_vram_lost - check if vram is valid 3132 * 3133 * @adev: amdgpu_device pointer 3134 * 3135 * Checks the reset magic value written to the gart pointer in VRAM. 3136 * The driver calls this after a GPU reset to see if the contents of 3137 * VRAM is lost or now. 3138 * returns true if vram is lost, false if not. 3139 */ 3140 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 3141 { 3142 if (memcmp(adev->gart.ptr, adev->reset_magic, 3143 AMDGPU_RESET_MAGIC_NUM)) 3144 return true; 3145 3146 if (!amdgpu_in_reset(adev)) 3147 return false; 3148 3149 /* 3150 * For all ASICs with baco/mode1 reset, the VRAM is 3151 * always assumed to be lost. 
3152 */ 3153 switch (amdgpu_asic_reset_method(adev)) { 3154 case AMD_RESET_METHOD_BACO: 3155 case AMD_RESET_METHOD_MODE1: 3156 return true; 3157 default: 3158 return false; 3159 } 3160 } 3161 3162 /** 3163 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 3164 * 3165 * @adev: amdgpu_device pointer 3166 * @state: clockgating state (gate or ungate) 3167 * 3168 * The list of all the hardware IPs that make up the asic is walked and the 3169 * set_clockgating_state callbacks are run. 3170 * Late initialization pass enabling clockgating for hardware IPs. 3171 * Fini or suspend, pass disabling clockgating for hardware IPs. 3172 * Returns 0 on success, negative error code on failure. 3173 */ 3174 3175 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 3176 enum amd_clockgating_state state) 3177 { 3178 int i, j, r; 3179 3180 if (amdgpu_emu_mode == 1) 3181 return 0; 3182 3183 for (j = 0; j < adev->num_ip_blocks; j++) { 3184 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3185 if (!adev->ip_blocks[i].status.late_initialized) 3186 continue; 3187 /* skip CG for GFX, SDMA on S0ix */ 3188 if (adev->in_s0ix && 3189 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3190 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3191 continue; 3192 /* skip CG for VCE/UVD, it's handled specially */ 3193 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3194 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3195 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3196 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3197 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 3198 /* enable clockgating to save power */ 3199 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i], 3200 state); 3201 if (r) { 3202 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 3203 adev->ip_blocks[i].version->funcs->name, r); 3204 return r; 3205 } 3206 } 3207 } 3208 3209 return 0; 3210 } 3211 3212 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 3213 enum amd_powergating_state state) 3214 { 3215 int i, j, r; 3216 3217 if (amdgpu_emu_mode == 1) 3218 return 0; 3219 3220 for (j = 0; j < adev->num_ip_blocks; j++) { 3221 i = state == AMD_PG_STATE_GATE ? 
j : adev->num_ip_blocks - j - 1;
3222 if (!adev->ip_blocks[i].status.late_initialized)
3223 continue;
3224 /* skip PG for GFX, SDMA on S0ix */
3225 if (adev->in_s0ix &&
3226 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3227 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3228 continue;
3229 /* skip PG for VCE/UVD, it's handled specially */
3230 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
3231 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
3232 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
3233 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
3234 adev->ip_blocks[i].version->funcs->set_powergating_state) {
3235 /* enable powergating to save power */
3236 r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i],
3237 state);
3238 if (r) {
3239 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
3240 adev->ip_blocks[i].version->funcs->name, r);
3241 return r;
3242 }
3243 }
3244 }
3245 return 0;
3246 }
3247 
3248 static int amdgpu_device_enable_mgpu_fan_boost(void)
3249 {
3250 struct amdgpu_gpu_instance *gpu_ins;
3251 struct amdgpu_device *adev;
3252 int i, ret = 0;
3253 
3254 mutex_lock(&mgpu_info.mutex);
3255 
3256 /*
3257 * MGPU fan boost feature should be enabled
3258 * only when there are two or more dGPUs in
3259 * the system
3260 */
3261 if (mgpu_info.num_dgpu < 2)
3262 goto out;
3263 
3264 for (i = 0; i < mgpu_info.num_dgpu; i++) {
3265 gpu_ins = &(mgpu_info.gpu_ins[i]);
3266 adev = gpu_ins->adev;
3267 if (!(adev->flags & AMD_IS_APU) &&
3268 !gpu_ins->mgpu_fan_enabled) {
3269 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
3270 if (ret)
3271 break;
3272 
3273 gpu_ins->mgpu_fan_enabled = 1;
3274 }
3275 }
3276 
3277 out:
3278 mutex_unlock(&mgpu_info.mutex);
3279 
3280 return ret;
3281 }
3282 
3283 /**
3284 * amdgpu_device_ip_late_init - run late init for hardware IPs
3285 *
3286 * @adev: amdgpu_device pointer
3287 *
3288 * Late initialization pass for hardware IPs. The list of all the hardware
3289 * IPs that make up the asic is walked and the late_init callbacks are run.
3290 * late_init covers any special initialization that an IP requires
3291 * after all of them have been initialized or something that needs to happen
3292 * late in the init process.
3293 * Returns 0 on success, negative error code on failure.
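 *
 * Beyond the per-IP late_init callbacks, this is also where clockgating and
 * powergating are enabled, the reset magic used for VRAM-loss detection is
 * written, and the multi-GPU fan boost is requested (see the body below).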
3294 */ 3295 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 3296 { 3297 struct amdgpu_gpu_instance *gpu_instance; 3298 int i = 0, r; 3299 3300 for (i = 0; i < adev->num_ip_blocks; i++) { 3301 if (!adev->ip_blocks[i].status.hw) 3302 continue; 3303 if (adev->ip_blocks[i].version->funcs->late_init) { 3304 r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]); 3305 if (r) { 3306 DRM_ERROR("late_init of IP block <%s> failed %d\n", 3307 adev->ip_blocks[i].version->funcs->name, r); 3308 return r; 3309 } 3310 } 3311 adev->ip_blocks[i].status.late_initialized = true; 3312 } 3313 3314 r = amdgpu_ras_late_init(adev); 3315 if (r) { 3316 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 3317 return r; 3318 } 3319 3320 if (!amdgpu_reset_in_recovery(adev)) 3321 amdgpu_ras_set_error_query_ready(adev, true); 3322 3323 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3324 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3325 3326 amdgpu_device_fill_reset_magic(adev); 3327 3328 r = amdgpu_device_enable_mgpu_fan_boost(); 3329 if (r) 3330 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 3331 3332 /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */ 3333 if (amdgpu_passthrough(adev) && 3334 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 3335 adev->asic_type == CHIP_ALDEBARAN)) 3336 amdgpu_dpm_handle_passthrough_sbr(adev, true); 3337 3338 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3339 mutex_lock(&mgpu_info.mutex); 3340 3341 /* 3342 * Reset device p-state to low as this was booted with high. 3343 * 3344 * This should be performed only after all devices from the same 3345 * hive get initialized. 3346 * 3347 * However, it's unknown how many device in the hive in advance. 3348 * As this is counted one by one during devices initializations. 3349 * 3350 * So, we wait for all XGMI interlinked devices initialized. 3351 * This may bring some delays as those devices may come from 3352 * different hives. But that should be OK. 
3353 */ 3354 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3355 for (i = 0; i < mgpu_info.num_gpu; i++) { 3356 gpu_instance = &(mgpu_info.gpu_ins[i]); 3357 if (gpu_instance->adev->flags & AMD_IS_APU) 3358 continue; 3359 3360 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3361 AMDGPU_XGMI_PSTATE_MIN); 3362 if (r) { 3363 DRM_ERROR("pstate setting failed (%d).\n", r); 3364 break; 3365 } 3366 } 3367 } 3368 3369 mutex_unlock(&mgpu_info.mutex); 3370 } 3371 3372 return 0; 3373 } 3374 3375 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block) 3376 { 3377 int r; 3378 3379 if (!ip_block->version->funcs->hw_fini) { 3380 DRM_ERROR("hw_fini of IP block <%s> not defined\n", 3381 ip_block->version->funcs->name); 3382 } else { 3383 r = ip_block->version->funcs->hw_fini(ip_block); 3384 /* XXX handle errors */ 3385 if (r) { 3386 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3387 ip_block->version->funcs->name, r); 3388 } 3389 } 3390 3391 ip_block->status.hw = false; 3392 } 3393 3394 /** 3395 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3396 * 3397 * @adev: amdgpu_device pointer 3398 * 3399 * For ASICs need to disable SMC first 3400 */ 3401 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3402 { 3403 int i; 3404 3405 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3406 return; 3407 3408 for (i = 0; i < adev->num_ip_blocks; i++) { 3409 if (!adev->ip_blocks[i].status.hw) 3410 continue; 3411 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3412 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3413 break; 3414 } 3415 } 3416 } 3417 3418 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3419 { 3420 int i, r; 3421 3422 for (i = 0; i < adev->num_ip_blocks; i++) { 3423 if (!adev->ip_blocks[i].version->funcs->early_fini) 3424 continue; 3425 3426 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]); 3427 if (r) { 3428 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 3429 adev->ip_blocks[i].version->funcs->name, r); 3430 } 3431 } 3432 3433 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3434 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3435 3436 amdgpu_amdkfd_suspend(adev, false); 3437 3438 /* Workaround for ASICs need to disable SMC first */ 3439 amdgpu_device_smu_fini_early(adev); 3440 3441 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3442 if (!adev->ip_blocks[i].status.hw) 3443 continue; 3444 3445 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3446 } 3447 3448 if (amdgpu_sriov_vf(adev)) { 3449 if (amdgpu_virt_release_full_gpu(adev, false)) 3450 DRM_ERROR("failed to release exclusive mode on fini\n"); 3451 } 3452 3453 return 0; 3454 } 3455 3456 /** 3457 * amdgpu_device_ip_fini - run fini for hardware IPs 3458 * 3459 * @adev: amdgpu_device pointer 3460 * 3461 * Main teardown pass for hardware IPs. The list of all the hardware 3462 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3463 * are run. hw_fini tears down the hardware associated with each IP 3464 * and sw_fini tears down any software state associated with each IP. 3465 * Returns 0 on success, negative error code on failure. 
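 *
 * Note that the GMC-owned helpers (ucode BO, static CSA, writeback, scratch
 * memory, IB pool and seq64) are released alongside the GMC block's sw_fini,
 * mirroring the order in which amdgpu_device_ip_init() set them up.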
3466 */ 3467 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3468 { 3469 int i, r; 3470 3471 amdgpu_cper_fini(adev); 3472 3473 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3474 amdgpu_virt_release_ras_err_handler_data(adev); 3475 3476 if (adev->gmc.xgmi.num_physical_nodes > 1) 3477 amdgpu_xgmi_remove_device(adev); 3478 3479 amdgpu_amdkfd_device_fini_sw(adev); 3480 3481 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3482 if (!adev->ip_blocks[i].status.sw) 3483 continue; 3484 3485 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3486 amdgpu_ucode_free_bo(adev); 3487 amdgpu_free_static_csa(&adev->virt.csa_obj); 3488 amdgpu_device_wb_fini(adev); 3489 amdgpu_device_mem_scratch_fini(adev); 3490 amdgpu_ib_pool_fini(adev); 3491 amdgpu_seq64_fini(adev); 3492 } 3493 if (adev->ip_blocks[i].version->funcs->sw_fini) { 3494 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]); 3495 /* XXX handle errors */ 3496 if (r) { 3497 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 3498 adev->ip_blocks[i].version->funcs->name, r); 3499 } 3500 } 3501 adev->ip_blocks[i].status.sw = false; 3502 adev->ip_blocks[i].status.valid = false; 3503 } 3504 3505 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3506 if (!adev->ip_blocks[i].status.late_initialized) 3507 continue; 3508 if (adev->ip_blocks[i].version->funcs->late_fini) 3509 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]); 3510 adev->ip_blocks[i].status.late_initialized = false; 3511 } 3512 3513 amdgpu_ras_fini(adev); 3514 3515 return 0; 3516 } 3517 3518 /** 3519 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3520 * 3521 * @work: work_struct. 3522 */ 3523 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3524 { 3525 struct amdgpu_device *adev = 3526 container_of(work, struct amdgpu_device, delayed_init_work.work); 3527 int r; 3528 3529 r = amdgpu_ib_ring_tests(adev); 3530 if (r) 3531 DRM_ERROR("ib ring test failed (%d).\n", r); 3532 } 3533 3534 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3535 { 3536 struct amdgpu_device *adev = 3537 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3538 3539 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3540 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3541 3542 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true, 0)) 3543 adev->gfx.gfx_off_state = true; 3544 } 3545 3546 /** 3547 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3548 * 3549 * @adev: amdgpu_device pointer 3550 * 3551 * Main suspend function for hardware IPs. The list of all the hardware 3552 * IPs that make up the asic is walked, clockgating is disabled and the 3553 * suspend callbacks are run. suspend puts the hardware and software state 3554 * in each IP into a state suitable for suspend. 3555 * Returns 0 on success, negative error code on failure. 3556 */ 3557 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3558 { 3559 int i, r; 3560 3561 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3562 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3563 3564 /* 3565 * Per PMFW team's suggestion, driver needs to handle gfxoff 3566 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3567 * scenario. Add the missing df cstate disablement here. 
3568 */ 3569 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3570 dev_warn(adev->dev, "Failed to disallow df cstate"); 3571 3572 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3573 if (!adev->ip_blocks[i].status.valid) 3574 continue; 3575 3576 /* displays are handled separately */ 3577 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3578 continue; 3579 3580 /* XXX handle errors */ 3581 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3582 if (r) 3583 return r; 3584 } 3585 3586 return 0; 3587 } 3588 3589 /** 3590 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3591 * 3592 * @adev: amdgpu_device pointer 3593 * 3594 * Main suspend function for hardware IPs. The list of all the hardware 3595 * IPs that make up the asic is walked, clockgating is disabled and the 3596 * suspend callbacks are run. suspend puts the hardware and software state 3597 * in each IP into a state suitable for suspend. 3598 * Returns 0 on success, negative error code on failure. 3599 */ 3600 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3601 { 3602 int i, r; 3603 3604 if (adev->in_s0ix) 3605 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3606 3607 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3608 if (!adev->ip_blocks[i].status.valid) 3609 continue; 3610 /* displays are handled in phase1 */ 3611 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3612 continue; 3613 /* PSP lost connection when err_event_athub occurs */ 3614 if (amdgpu_ras_intr_triggered() && 3615 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3616 adev->ip_blocks[i].status.hw = false; 3617 continue; 3618 } 3619 3620 /* skip unnecessary suspend if we do not initialize them yet */ 3621 if (!amdgpu_ip_member_of_hwini( 3622 adev, adev->ip_blocks[i].version->type)) 3623 continue; 3624 3625 /* skip suspend of gfx/mes and psp for S0ix 3626 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3627 * like at runtime. PSP is also part of the always on hardware 3628 * so no need to suspend it. 3629 */ 3630 if (adev->in_s0ix && 3631 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3632 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3633 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3634 continue; 3635 3636 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3637 if (adev->in_s0ix && 3638 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3639 IP_VERSION(5, 0, 0)) && 3640 (adev->ip_blocks[i].version->type == 3641 AMD_IP_BLOCK_TYPE_SDMA)) 3642 continue; 3643 3644 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3645 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3646 * from this location and RLC Autoload automatically also gets loaded 3647 * from here based on PMFW -> PSP message during re-init sequence. 3648 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3649 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3650 */ 3651 if (amdgpu_in_reset(adev) && 3652 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3653 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3654 continue; 3655 3656 /* XXX handle errors */ 3657 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3658 adev->ip_blocks[i].status.hw = false; 3659 3660 /* handle putting the SMC in the appropriate state */ 3661 if (!amdgpu_sriov_vf(adev)) { 3662 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3663 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3664 if (r) { 3665 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3666 adev->mp1_state, r); 3667 return r; 3668 } 3669 } 3670 } 3671 } 3672 3673 return 0; 3674 } 3675 3676 /** 3677 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3678 * 3679 * @adev: amdgpu_device pointer 3680 * 3681 * Main suspend function for hardware IPs. The list of all the hardware 3682 * IPs that make up the asic is walked, clockgating is disabled and the 3683 * suspend callbacks are run. suspend puts the hardware and software state 3684 * in each IP into a state suitable for suspend. 3685 * Returns 0 on success, negative error code on failure. 3686 */ 3687 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3688 { 3689 int r; 3690 3691 if (amdgpu_sriov_vf(adev)) { 3692 amdgpu_virt_fini_data_exchange(adev); 3693 amdgpu_virt_request_full_gpu(adev, false); 3694 } 3695 3696 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3697 3698 r = amdgpu_device_ip_suspend_phase1(adev); 3699 if (r) 3700 return r; 3701 r = amdgpu_device_ip_suspend_phase2(adev); 3702 3703 if (amdgpu_sriov_vf(adev)) 3704 amdgpu_virt_release_full_gpu(adev, false); 3705 3706 return r; 3707 } 3708 3709 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3710 { 3711 int i, r; 3712 3713 static enum amd_ip_block_type ip_order[] = { 3714 AMD_IP_BLOCK_TYPE_COMMON, 3715 AMD_IP_BLOCK_TYPE_GMC, 3716 AMD_IP_BLOCK_TYPE_PSP, 3717 AMD_IP_BLOCK_TYPE_IH, 3718 }; 3719 3720 for (i = 0; i < adev->num_ip_blocks; i++) { 3721 int j; 3722 struct amdgpu_ip_block *block; 3723 3724 block = &adev->ip_blocks[i]; 3725 block->status.hw = false; 3726 3727 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3728 3729 if (block->version->type != ip_order[j] || 3730 !block->status.valid) 3731 continue; 3732 3733 r = block->version->funcs->hw_init(&adev->ip_blocks[i]); 3734 if (r) { 3735 dev_err(adev->dev, "RE-INIT-early: %s failed\n", 3736 block->version->funcs->name); 3737 return r; 3738 } 3739 block->status.hw = true; 3740 } 3741 } 3742 3743 return 0; 3744 } 3745 3746 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3747 { 3748 struct amdgpu_ip_block *block; 3749 int i, r = 0; 3750 3751 static enum amd_ip_block_type ip_order[] = { 3752 AMD_IP_BLOCK_TYPE_SMC, 3753 AMD_IP_BLOCK_TYPE_DCE, 3754 AMD_IP_BLOCK_TYPE_GFX, 3755 AMD_IP_BLOCK_TYPE_SDMA, 3756 AMD_IP_BLOCK_TYPE_MES, 3757 AMD_IP_BLOCK_TYPE_UVD, 3758 AMD_IP_BLOCK_TYPE_VCE, 3759 AMD_IP_BLOCK_TYPE_VCN, 3760 AMD_IP_BLOCK_TYPE_JPEG 3761 }; 3762 3763 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3764 block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]); 3765 3766 if (!block) 3767 continue; 3768 3769 if (block->status.valid && !block->status.hw) { 3770 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) { 3771 r = amdgpu_ip_block_resume(block); 3772 } else { 3773 r = block->version->funcs->hw_init(block); 3774 } 3775 3776 if (r) { 3777 dev_err(adev->dev, "RE-INIT-late: %s failed\n", 3778 block->version->funcs->name); 3779 break; 3780 } 3781 
block->status.hw = true; 3782 } 3783 } 3784 3785 return r; 3786 } 3787 3788 /** 3789 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3790 * 3791 * @adev: amdgpu_device pointer 3792 * 3793 * First resume function for hardware IPs. The list of all the hardware 3794 * IPs that make up the asic is walked and the resume callbacks are run for 3795 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3796 * after a suspend and updates the software state as necessary. This 3797 * function is also used for restoring the GPU after a GPU reset. 3798 * Returns 0 on success, negative error code on failure. 3799 */ 3800 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3801 { 3802 int i, r; 3803 3804 for (i = 0; i < adev->num_ip_blocks; i++) { 3805 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3806 continue; 3807 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3808 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3809 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3810 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3811 3812 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3813 if (r) 3814 return r; 3815 } 3816 } 3817 3818 return 0; 3819 } 3820 3821 /** 3822 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3823 * 3824 * @adev: amdgpu_device pointer 3825 * 3826 * Second resume function for hardware IPs. The list of all the hardware 3827 * IPs that make up the asic is walked and the resume callbacks are run for 3828 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3829 * functional state after a suspend and updates the software state as 3830 * necessary. This function is also used for restoring the GPU after a GPU 3831 * reset. 3832 * Returns 0 on success, negative error code on failure. 3833 */ 3834 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3835 { 3836 int i, r; 3837 3838 for (i = 0; i < adev->num_ip_blocks; i++) { 3839 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3840 continue; 3841 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3842 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3843 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3844 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE || 3845 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3846 continue; 3847 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3848 if (r) 3849 return r; 3850 } 3851 3852 return 0; 3853 } 3854 3855 /** 3856 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs 3857 * 3858 * @adev: amdgpu_device pointer 3859 * 3860 * Third resume function for hardware IPs. The list of all the hardware 3861 * IPs that make up the asic is walked and the resume callbacks are run for 3862 * all DCE. resume puts the hardware into a functional state after a suspend 3863 * and updates the software state as necessary. This function is also used 3864 * for restoring the GPU after a GPU reset. 3865 * 3866 * Returns 0 on success, negative error code on failure. 
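* Display (DCE) is deliberately brought back in this last phase, after the fence driver and the remaining IP blocks have been resumed in the earlier phases.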
3867 */ 3868 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev) 3869 { 3870 int i, r; 3871 3872 for (i = 0; i < adev->num_ip_blocks; i++) { 3873 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3874 continue; 3875 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) { 3876 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3877 if (r) 3878 return r; 3879 } 3880 } 3881 3882 return 0; 3883 } 3884 3885 /** 3886 * amdgpu_device_ip_resume - run resume for hardware IPs 3887 * 3888 * @adev: amdgpu_device pointer 3889 * 3890 * Main resume function for hardware IPs. The hardware IPs 3891 * are split into multiple resume functions because they are 3892 * also used in recovering from a GPU reset and some additional 3893 * steps need to be taken between them. In this case (S3/S4) they are 3894 * run sequentially. 3895 * Returns 0 on success, negative error code on failure. 3896 */ 3897 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3898 { 3899 int r; 3900 3901 r = amdgpu_device_ip_resume_phase1(adev); 3902 if (r) 3903 return r; 3904 3905 r = amdgpu_device_fw_loading(adev); 3906 if (r) 3907 return r; 3908 3909 r = amdgpu_device_ip_resume_phase2(adev); 3910 3911 if (adev->mman.buffer_funcs_ring->sched.ready) 3912 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3913 3914 if (r) 3915 return r; 3916 3917 amdgpu_fence_driver_hw_init(adev); 3918 3919 r = amdgpu_device_ip_resume_phase3(adev); 3920 3921 return r; 3922 } 3923 3924 /** 3925 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3926 * 3927 * @adev: amdgpu_device pointer 3928 * 3929 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3930 */ 3931 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3932 { 3933 if (amdgpu_sriov_vf(adev)) { 3934 if (adev->is_atom_fw) { 3935 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3936 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3937 } else { 3938 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3939 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3940 } 3941 3942 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3943 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3944 } 3945 } 3946 3947 /** 3948 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3949 * 3950 * @asic_type: AMD asic type 3951 * 3952 * Check if there is DC (new modesetting infrastructure) support for an asic. 3953 * Returns true if DC has support, false if not. 3954 */ 3955 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3956 { 3957 switch (asic_type) { 3958 #ifdef CONFIG_DRM_AMDGPU_SI 3959 case CHIP_HAINAN: 3960 #endif 3961 case CHIP_TOPAZ: 3962 /* chips with no display hardware */ 3963 return false; 3964 #if defined(CONFIG_DRM_AMD_DC) 3965 case CHIP_TAHITI: 3966 case CHIP_PITCAIRN: 3967 case CHIP_VERDE: 3968 case CHIP_OLAND: 3969 /* 3970 * We have systems in the wild with these ASICs that require 3971 * LVDS and VGA support which is not supported with DC. 3972 * 3973 * Fallback to the non-DC driver here by default so as not to 3974 * cause regressions. 3975 */ 3976 #if defined(CONFIG_DRM_AMD_DC_SI) 3977 return amdgpu_dc > 0; 3978 #else 3979 return false; 3980 #endif 3981 case CHIP_BONAIRE: 3982 case CHIP_KAVERI: 3983 case CHIP_KABINI: 3984 case CHIP_MULLINS: 3985 /* 3986 * We have systems in the wild with these ASICs that require 3987 * VGA support which is not supported with DC.
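* (DC can still be opted into explicitly with the amdgpu.dc=1 module parameter.)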
3988 * 3989 * Fallback to the non-DC driver here by default so as not to 3990 * cause regressions. 3991 */ 3992 return amdgpu_dc > 0; 3993 default: 3994 return amdgpu_dc != 0; 3995 #else 3996 default: 3997 if (amdgpu_dc > 0) 3998 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 3999 return false; 4000 #endif 4001 } 4002 } 4003 4004 /** 4005 * amdgpu_device_has_dc_support - check if dc is supported 4006 * 4007 * @adev: amdgpu_device pointer 4008 * 4009 * Returns true for supported, false for not supported 4010 */ 4011 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 4012 { 4013 if (adev->enable_virtual_display || 4014 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 4015 return false; 4016 4017 return amdgpu_device_asic_has_dc_support(adev->asic_type); 4018 } 4019 4020 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 4021 { 4022 struct amdgpu_device *adev = 4023 container_of(__work, struct amdgpu_device, xgmi_reset_work); 4024 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 4025 4026 /* It's a bug to not have a hive within this function */ 4027 if (WARN_ON(!hive)) 4028 return; 4029 4030 /* 4031 * Use task barrier to synchronize all xgmi reset works across the 4032 * hive. task_barrier_enter and task_barrier_exit will block 4033 * until all the threads running the xgmi reset works reach 4034 * those points. task_barrier_full will do both blocks. 4035 */ 4036 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 4037 4038 task_barrier_enter(&hive->tb); 4039 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 4040 4041 if (adev->asic_reset_res) 4042 goto fail; 4043 4044 task_barrier_exit(&hive->tb); 4045 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 4046 4047 if (adev->asic_reset_res) 4048 goto fail; 4049 4050 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 4051 } else { 4052 4053 task_barrier_full(&hive->tb); 4054 adev->asic_reset_res = amdgpu_asic_reset(adev); 4055 } 4056 4057 fail: 4058 if (adev->asic_reset_res) 4059 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 4060 adev->asic_reset_res, adev_to_drm(adev)->unique); 4061 amdgpu_put_xgmi_hive(hive); 4062 } 4063 4064 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 4065 { 4066 char *input = amdgpu_lockup_timeout; 4067 char *timeout_setting = NULL; 4068 int index = 0; 4069 long timeout; 4070 int ret = 0; 4071 4072 /* 4073 * By default timeout for non compute jobs is 10000 4074 * and 60000 for compute jobs. 4075 * In SR-IOV or passthrough mode, timeout for compute 4076 * jobs are 60000 by default. 4077 */ 4078 adev->gfx_timeout = msecs_to_jiffies(10000); 4079 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4080 if (amdgpu_sriov_vf(adev)) 4081 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
4082 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 4083 else 4084 adev->compute_timeout = msecs_to_jiffies(60000); 4085 4086 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4087 while ((timeout_setting = strsep(&input, ",")) && 4088 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4089 ret = kstrtol(timeout_setting, 0, &timeout); 4090 if (ret) 4091 return ret; 4092 4093 if (timeout == 0) { 4094 index++; 4095 continue; 4096 } else if (timeout < 0) { 4097 timeout = MAX_SCHEDULE_TIMEOUT; 4098 dev_warn(adev->dev, "lockup timeout disabled"); 4099 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 4100 } else { 4101 timeout = msecs_to_jiffies(timeout); 4102 } 4103 4104 switch (index++) { 4105 case 0: 4106 adev->gfx_timeout = timeout; 4107 break; 4108 case 1: 4109 adev->compute_timeout = timeout; 4110 break; 4111 case 2: 4112 adev->sdma_timeout = timeout; 4113 break; 4114 case 3: 4115 adev->video_timeout = timeout; 4116 break; 4117 default: 4118 break; 4119 } 4120 } 4121 /* 4122 * There is only one value specified and 4123 * it should apply to all non-compute jobs. 4124 */ 4125 if (index == 1) { 4126 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4127 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 4128 adev->compute_timeout = adev->gfx_timeout; 4129 } 4130 } 4131 4132 return ret; 4133 } 4134 4135 /** 4136 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 4137 * 4138 * @adev: amdgpu_device pointer 4139 * 4140 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 4141 */ 4142 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 4143 { 4144 struct iommu_domain *domain; 4145 4146 domain = iommu_get_domain_for_dev(adev->dev); 4147 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 4148 adev->ram_is_direct_mapped = true; 4149 } 4150 4151 #if defined(CONFIG_HSA_AMD_P2P) 4152 /** 4153 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled. 4154 * 4155 * @adev: amdgpu_device pointer 4156 * 4157 * return if IOMMU remapping bar address 4158 */ 4159 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev) 4160 { 4161 struct iommu_domain *domain; 4162 4163 domain = iommu_get_domain_for_dev(adev->dev); 4164 if (domain && (domain->type == IOMMU_DOMAIN_DMA || 4165 domain->type == IOMMU_DOMAIN_DMA_FQ)) 4166 return true; 4167 4168 return false; 4169 } 4170 #endif 4171 4172 static const struct attribute *amdgpu_dev_attributes[] = { 4173 &dev_attr_pcie_replay_count.attr, 4174 NULL 4175 }; 4176 4177 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 4178 { 4179 if (amdgpu_mcbp == 1) 4180 adev->gfx.mcbp = true; 4181 else if (amdgpu_mcbp == 0) 4182 adev->gfx.mcbp = false; 4183 4184 if (amdgpu_sriov_vf(adev)) 4185 adev->gfx.mcbp = true; 4186 4187 if (adev->gfx.mcbp) 4188 DRM_INFO("MCBP is enabled\n"); 4189 } 4190 4191 /** 4192 * amdgpu_device_init - initialize the driver 4193 * 4194 * @adev: amdgpu_device pointer 4195 * @flags: driver flags 4196 * 4197 * Initializes the driver info and hw (all asics). 4198 * Returns 0 for success or an error on failure. 4199 * Called at driver startup. 
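* The sequence below covers MMIO mapping, IP discovery and early/sw/hw init of the IP blocks, optional ASIC reset and vBIOS posting, fence and interrupt setup, and registration of the sysfs, VGA and power management interfaces.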
4200 */ 4201 int amdgpu_device_init(struct amdgpu_device *adev, 4202 uint32_t flags) 4203 { 4204 struct drm_device *ddev = adev_to_drm(adev); 4205 struct pci_dev *pdev = adev->pdev; 4206 int r, i; 4207 bool px = false; 4208 u32 max_MBps; 4209 int tmp; 4210 4211 adev->shutdown = false; 4212 adev->flags = flags; 4213 4214 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 4215 adev->asic_type = amdgpu_force_asic_type; 4216 else 4217 adev->asic_type = flags & AMD_ASIC_MASK; 4218 4219 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 4220 if (amdgpu_emu_mode == 1) 4221 adev->usec_timeout *= 10; 4222 adev->gmc.gart_size = 512 * 1024 * 1024; 4223 adev->accel_working = false; 4224 adev->num_rings = 0; 4225 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 4226 adev->mman.buffer_funcs = NULL; 4227 adev->mman.buffer_funcs_ring = NULL; 4228 adev->vm_manager.vm_pte_funcs = NULL; 4229 adev->vm_manager.vm_pte_num_scheds = 0; 4230 adev->gmc.gmc_funcs = NULL; 4231 adev->harvest_ip_mask = 0x0; 4232 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 4233 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 4234 4235 adev->smc_rreg = &amdgpu_invalid_rreg; 4236 adev->smc_wreg = &amdgpu_invalid_wreg; 4237 adev->pcie_rreg = &amdgpu_invalid_rreg; 4238 adev->pcie_wreg = &amdgpu_invalid_wreg; 4239 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 4240 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 4241 adev->pciep_rreg = &amdgpu_invalid_rreg; 4242 adev->pciep_wreg = &amdgpu_invalid_wreg; 4243 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 4244 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 4245 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 4246 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 4247 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 4248 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 4249 adev->didt_rreg = &amdgpu_invalid_rreg; 4250 adev->didt_wreg = &amdgpu_invalid_wreg; 4251 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 4252 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 4253 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 4254 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 4255 4256 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 4257 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 4258 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 4259 4260 /* mutex initialization are all done here so we 4261 * can recall function without having locking issues 4262 */ 4263 mutex_init(&adev->firmware.mutex); 4264 mutex_init(&adev->pm.mutex); 4265 mutex_init(&adev->gfx.gpu_clock_mutex); 4266 mutex_init(&adev->srbm_mutex); 4267 mutex_init(&adev->gfx.pipe_reserve_mutex); 4268 mutex_init(&adev->gfx.gfx_off_mutex); 4269 mutex_init(&adev->gfx.partition_mutex); 4270 mutex_init(&adev->grbm_idx_mutex); 4271 mutex_init(&adev->mn_lock); 4272 mutex_init(&adev->virt.vf_errors.lock); 4273 hash_init(adev->mn_hash); 4274 mutex_init(&adev->psp.mutex); 4275 mutex_init(&adev->notifier_lock); 4276 mutex_init(&adev->pm.stable_pstate_ctx_lock); 4277 mutex_init(&adev->benchmark_mutex); 4278 mutex_init(&adev->gfx.reset_sem_mutex); 4279 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */ 4280 mutex_init(&adev->enforce_isolation_mutex); 4281 mutex_init(&adev->gfx.kfd_sch_mutex); 4282 4283 amdgpu_device_init_apu_flags(adev); 4284 4285 r = amdgpu_device_check_arguments(adev); 4286 if (r) 4287 return r; 4288 4289 spin_lock_init(&adev->mmio_idx_lock); 4290 
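/* Spinlocks guarding the indirect (index/data pair) register interfaces and related bookkeeping */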
spin_lock_init(&adev->smc_idx_lock); 4291 spin_lock_init(&adev->pcie_idx_lock); 4292 spin_lock_init(&adev->uvd_ctx_idx_lock); 4293 spin_lock_init(&adev->didt_idx_lock); 4294 spin_lock_init(&adev->gc_cac_idx_lock); 4295 spin_lock_init(&adev->se_cac_idx_lock); 4296 spin_lock_init(&adev->audio_endpt_idx_lock); 4297 spin_lock_init(&adev->mm_stats.lock); 4298 spin_lock_init(&adev->virt.rlcg_reg_lock); 4299 spin_lock_init(&adev->wb.lock); 4300 4301 INIT_LIST_HEAD(&adev->reset_list); 4302 4303 INIT_LIST_HEAD(&adev->ras_list); 4304 4305 INIT_LIST_HEAD(&adev->pm.od_kobj_list); 4306 4307 INIT_DELAYED_WORK(&adev->delayed_init_work, 4308 amdgpu_device_delayed_init_work_handler); 4309 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 4310 amdgpu_device_delay_enable_gfx_off); 4311 /* 4312 * Initialize the enforce_isolation work structures for each XCP 4313 * partition. This work handler is responsible for enforcing shader 4314 * isolation on AMD GPUs. It counts the number of emitted fences for 4315 * each GFX and compute ring. If there are any fences, it schedules 4316 * the `enforce_isolation_work` to be run after a delay. If there are 4317 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the 4318 * runqueue. 4319 */ 4320 for (i = 0; i < MAX_XCP; i++) { 4321 INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work, 4322 amdgpu_gfx_enforce_isolation_handler); 4323 adev->gfx.enforce_isolation[i].adev = adev; 4324 adev->gfx.enforce_isolation[i].xcp_id = i; 4325 } 4326 4327 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 4328 4329 adev->gfx.gfx_off_req_count = 1; 4330 adev->gfx.gfx_off_residency = 0; 4331 adev->gfx.gfx_off_entrycount = 0; 4332 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 4333 4334 atomic_set(&adev->throttling_logging_enabled, 1); 4335 /* 4336 * If throttling continues, logging will be performed every minute 4337 * to avoid log flooding. "-1" is subtracted since the thermal 4338 * throttling interrupt comes every second. Thus, the total logging 4339 * interval is 59 seconds (ratelimited printk interval) + 1 second (waiting 4340 * for the throttling interrupt) = 60 seconds. 4341 */ 4342 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 4343 4344 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 4345 4346 /* Register mapping */ 4347 /* TODO: block userspace mapping of io registers */ 4348 if (adev->asic_type >= CHIP_BONAIRE) { 4349 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 4350 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 4351 } else { 4352 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 4353 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 4354 } 4355 4356 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 4357 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 4358 4359 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 4360 if (!adev->rmmio) 4361 return -ENOMEM; 4362 4363 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 4364 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size); 4365 4366 /* 4367 * The reset domain needs to be present early, before any XGMI hive is 4368 * discovered and initialized, so that the reset sem and in_gpu reset flag 4369 * can be used early during init and before the first call to RREG32.
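* For XGMI configurations this single-device domain is expected to be exchanged for a hive-wide reset domain once the hive has been set up.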
4370 */ 4371 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 4372 if (!adev->reset_domain) 4373 return -ENOMEM; 4374 4375 /* detect hw virtualization here */ 4376 amdgpu_virt_init(adev); 4377 4378 amdgpu_device_get_pcie_info(adev); 4379 4380 r = amdgpu_device_get_job_timeout_settings(adev); 4381 if (r) { 4382 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 4383 return r; 4384 } 4385 4386 amdgpu_device_set_mcbp(adev); 4387 4388 /* 4389 * By default, use the default init level where all blocks are expected to 4390 * be initialized. At present, software init ('swinit') of the blocks must 4391 * complete before the need for a different level can be detected. 4392 */ 4393 amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT); 4394 /* early init functions */ 4395 r = amdgpu_device_ip_early_init(adev); 4396 if (r) 4397 return r; 4398 4399 /* Get rid of things like offb */ 4400 r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name); 4401 if (r) 4402 return r; 4403 4404 /* Enable TMZ based on IP_VERSION */ 4405 amdgpu_gmc_tmz_set(adev); 4406 4407 if (amdgpu_sriov_vf(adev) && 4408 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0)) 4409 /* VF MMIO access (except the mailbox range) from the CPU 4410 * will be blocked during SR-IOV runtime 4411 */ 4412 adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT; 4413 4414 amdgpu_gmc_noretry_set(adev); 4415 /* Need to get xgmi info early to decide the reset behavior */ 4416 if (adev->gmc.xgmi.supported) { 4417 r = adev->gfxhub.funcs->get_xgmi_info(adev); 4418 if (r) 4419 return r; 4420 } 4421 4422 /* enable PCIE atomic ops */ 4423 if (amdgpu_sriov_vf(adev)) { 4424 if (adev->virt.fw_reserve.p_pf2vf) 4425 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 4426 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 4427 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4428 /* APUs with gfx9 onwards don't rely on PCIe atomics; their internal 4429 * path natively supports atomics, so set have_atomics_support to true. 4430 */ 4431 } else if ((adev->flags & AMD_IS_APU) && 4432 (amdgpu_ip_version(adev, GC_HWIP, 0) > 4433 IP_VERSION(9, 0, 0))) { 4434 adev->have_atomics_support = true; 4435 } else { 4436 adev->have_atomics_support = 4437 !pci_enable_atomic_ops_to_root(adev->pdev, 4438 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 4439 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4440 } 4441 4442 if (!adev->have_atomics_support) 4443 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 4444 4445 /* doorbell bar mapping and doorbell index init */ 4446 amdgpu_doorbell_init(adev); 4447 4448 if (amdgpu_emu_mode == 1) { 4449 /* post the asic on emulation mode */ 4450 emu_soc_asic_init(adev); 4451 goto fence_driver_init; 4452 } 4453 4454 amdgpu_reset_init(adev); 4455 4456 /* detect if we are running with an SR-IOV vbios */ 4457 if (adev->bios) 4458 amdgpu_device_detect_sriov_bios(adev); 4459 4460 /* check if we need to reset the asic 4461 * E.g., the driver was not cleanly unloaded previously, etc.
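* Note that SR-IOV VFs skip this path; any reset at this point is left to the host.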
4462 */ 4463 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4464 if (adev->gmc.xgmi.num_physical_nodes) { 4465 dev_info(adev->dev, "Pending hive reset.\n"); 4466 amdgpu_set_init_level(adev, 4467 AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 4468 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && 4469 !amdgpu_device_has_display_hardware(adev)) { 4470 r = psp_gpu_reset(adev); 4471 } else { 4472 tmp = amdgpu_reset_method; 4473 /* It should do a default reset when loading or reloading the driver, 4474 * regardless of the module parameter reset_method. 4475 */ 4476 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4477 r = amdgpu_asic_reset(adev); 4478 amdgpu_reset_method = tmp; 4479 } 4480 4481 if (r) { 4482 dev_err(adev->dev, "asic reset on init failed\n"); 4483 goto failed; 4484 } 4485 } 4486 4487 /* Post card if necessary */ 4488 if (amdgpu_device_need_post(adev)) { 4489 if (!adev->bios) { 4490 dev_err(adev->dev, "no vBIOS found\n"); 4491 r = -EINVAL; 4492 goto failed; 4493 } 4494 DRM_INFO("GPU posting now...\n"); 4495 r = amdgpu_device_asic_init(adev); 4496 if (r) { 4497 dev_err(adev->dev, "gpu post error!\n"); 4498 goto failed; 4499 } 4500 } 4501 4502 if (adev->bios) { 4503 if (adev->is_atom_fw) { 4504 /* Initialize clocks */ 4505 r = amdgpu_atomfirmware_get_clock_info(adev); 4506 if (r) { 4507 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4508 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4509 goto failed; 4510 } 4511 } else { 4512 /* Initialize clocks */ 4513 r = amdgpu_atombios_get_clock_info(adev); 4514 if (r) { 4515 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4516 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4517 goto failed; 4518 } 4519 /* init i2c buses */ 4520 amdgpu_i2c_init(adev); 4521 } 4522 } 4523 4524 fence_driver_init: 4525 /* Fence driver */ 4526 r = amdgpu_fence_driver_sw_init(adev); 4527 if (r) { 4528 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4529 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4530 goto failed; 4531 } 4532 4533 /* init the mode config */ 4534 drm_mode_config_init(adev_to_drm(adev)); 4535 4536 r = amdgpu_device_ip_init(adev); 4537 if (r) { 4538 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4539 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4540 goto release_ras_con; 4541 } 4542 4543 amdgpu_fence_driver_hw_init(adev); 4544 4545 dev_info(adev->dev, 4546 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4547 adev->gfx.config.max_shader_engines, 4548 adev->gfx.config.max_sh_per_se, 4549 adev->gfx.config.max_cu_per_sh, 4550 adev->gfx.cu_info.number); 4551 4552 adev->accel_working = true; 4553 4554 amdgpu_vm_check_compute_bug(adev); 4555 4556 /* Initialize the buffer migration limit. */ 4557 if (amdgpu_moverate >= 0) 4558 max_MBps = amdgpu_moverate; 4559 else 4560 max_MBps = 8; /* Allow 8 MB/s. */ 4561 /* Get a log2 for easy divisions. */ 4562 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4563 4564 /* 4565 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4566 * Otherwise the mgpu fan boost feature will be skipped due to the 4567 * gpu instance is counted less. 4568 */ 4569 amdgpu_register_gpu_instance(adev); 4570 4571 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4572 * explicit gating rather than handling it automatically. 
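* At the minimal XGMI init level this late init is deferred until the hive has been reset and reinitialized.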
4573 */ 4574 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 4575 r = amdgpu_device_ip_late_init(adev); 4576 if (r) { 4577 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4578 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4579 goto release_ras_con; 4580 } 4581 /* must succeed. */ 4582 amdgpu_ras_resume(adev); 4583 queue_delayed_work(system_wq, &adev->delayed_init_work, 4584 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4585 } 4586 4587 if (amdgpu_sriov_vf(adev)) { 4588 amdgpu_virt_release_full_gpu(adev, true); 4589 flush_delayed_work(&adev->delayed_init_work); 4590 } 4591 4592 /* 4593 * Place those sysfs registering after `late_init`. As some of those 4594 * operations performed in `late_init` might affect the sysfs 4595 * interfaces creating. 4596 */ 4597 r = amdgpu_atombios_sysfs_init(adev); 4598 if (r) 4599 drm_err(&adev->ddev, 4600 "registering atombios sysfs failed (%d).\n", r); 4601 4602 r = amdgpu_pm_sysfs_init(adev); 4603 if (r) 4604 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 4605 4606 r = amdgpu_ucode_sysfs_init(adev); 4607 if (r) { 4608 adev->ucode_sysfs_en = false; 4609 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 4610 } else 4611 adev->ucode_sysfs_en = true; 4612 4613 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 4614 if (r) 4615 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4616 4617 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4618 if (r) 4619 dev_err(adev->dev, 4620 "Could not create amdgpu board attributes\n"); 4621 4622 amdgpu_fru_sysfs_init(adev); 4623 amdgpu_reg_state_sysfs_init(adev); 4624 amdgpu_xcp_cfg_sysfs_init(adev); 4625 4626 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4627 r = amdgpu_pmu_init(adev); 4628 if (r) 4629 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4630 4631 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4632 if (amdgpu_device_cache_pci_state(adev->pdev)) 4633 pci_restore_state(pdev); 4634 4635 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4636 /* this will fail for cards that aren't VGA class devices, just 4637 * ignore it 4638 */ 4639 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4640 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4641 4642 px = amdgpu_device_supports_px(ddev); 4643 4644 if (px || (!dev_is_removable(&adev->pdev->dev) && 4645 apple_gmux_detect(NULL, NULL))) 4646 vga_switcheroo_register_client(adev->pdev, 4647 &amdgpu_switcheroo_ops, px); 4648 4649 if (px) 4650 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4651 4652 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 4653 amdgpu_xgmi_reset_on_init(adev); 4654 4655 amdgpu_device_check_iommu_direct_map(adev); 4656 4657 adev->pm_nb.notifier_call = amdgpu_device_pm_notifier; 4658 r = register_pm_notifier(&adev->pm_nb); 4659 if (r) 4660 goto failed; 4661 4662 return 0; 4663 4664 release_ras_con: 4665 if (amdgpu_sriov_vf(adev)) 4666 amdgpu_virt_release_full_gpu(adev, true); 4667 4668 /* failed in exclusive mode due to timeout */ 4669 if (amdgpu_sriov_vf(adev) && 4670 !amdgpu_sriov_runtime(adev) && 4671 amdgpu_virt_mmio_blocked(adev) && 4672 !amdgpu_virt_wait_reset(adev)) { 4673 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4674 /* Don't send request since VF is inactive. 
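* Clear the runtime capability and drop the virt ops so the teardown path below makes no further host requests; returning -EAGAIN lets the probe be retried.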
*/ 4675 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4676 adev->virt.ops = NULL; 4677 r = -EAGAIN; 4678 } 4679 amdgpu_release_ras_context(adev); 4680 4681 failed: 4682 amdgpu_vf_error_trans_all(adev); 4683 4684 return r; 4685 } 4686 4687 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4688 { 4689 4690 /* Clear all CPU mappings pointing to this device */ 4691 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4692 4693 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4694 amdgpu_doorbell_fini(adev); 4695 4696 iounmap(adev->rmmio); 4697 adev->rmmio = NULL; 4698 if (adev->mman.aper_base_kaddr) 4699 iounmap(adev->mman.aper_base_kaddr); 4700 adev->mman.aper_base_kaddr = NULL; 4701 4702 /* Memory manager related */ 4703 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4704 arch_phys_wc_del(adev->gmc.vram_mtrr); 4705 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4706 } 4707 } 4708 4709 /** 4710 * amdgpu_device_fini_hw - tear down the driver 4711 * 4712 * @adev: amdgpu_device pointer 4713 * 4714 * Tear down the driver info (all asics). 4715 * Called at driver shutdown. 4716 */ 4717 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4718 { 4719 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4720 flush_delayed_work(&adev->delayed_init_work); 4721 4722 if (adev->mman.initialized) 4723 drain_workqueue(adev->mman.bdev.wq); 4724 adev->shutdown = true; 4725 4726 unregister_pm_notifier(&adev->pm_nb); 4727 4728 /* make sure IB test finished before entering exclusive mode 4729 * to avoid preemption on IB test 4730 */ 4731 if (amdgpu_sriov_vf(adev)) { 4732 amdgpu_virt_request_full_gpu(adev, false); 4733 amdgpu_virt_fini_data_exchange(adev); 4734 } 4735 4736 /* disable all interrupts */ 4737 amdgpu_irq_disable_all(adev); 4738 if (adev->mode_info.mode_config_initialized) { 4739 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4740 drm_helper_force_disable_all(adev_to_drm(adev)); 4741 else 4742 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4743 } 4744 amdgpu_fence_driver_hw_fini(adev); 4745 4746 if (adev->pm.sysfs_initialized) 4747 amdgpu_pm_sysfs_fini(adev); 4748 if (adev->ucode_sysfs_en) 4749 amdgpu_ucode_sysfs_fini(adev); 4750 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4751 amdgpu_fru_sysfs_fini(adev); 4752 4753 amdgpu_reg_state_sysfs_fini(adev); 4754 amdgpu_xcp_cfg_sysfs_fini(adev); 4755 4756 /* disable ras feature must before hw fini */ 4757 amdgpu_ras_pre_fini(adev); 4758 4759 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4760 4761 amdgpu_device_ip_fini_early(adev); 4762 4763 amdgpu_irq_fini_hw(adev); 4764 4765 if (adev->mman.initialized) 4766 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4767 4768 amdgpu_gart_dummy_page_fini(adev); 4769 4770 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4771 amdgpu_device_unmap_mmio(adev); 4772 4773 } 4774 4775 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4776 { 4777 int idx; 4778 bool px; 4779 4780 amdgpu_device_ip_fini(adev); 4781 amdgpu_fence_driver_sw_fini(adev); 4782 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4783 adev->accel_working = false; 4784 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4785 4786 amdgpu_reset_fini(adev); 4787 4788 /* free i2c buses */ 4789 amdgpu_i2c_fini(adev); 4790 4791 if (adev->bios) { 4792 if (amdgpu_emu_mode != 1) 4793 amdgpu_atombios_fini(adev); 4794 amdgpu_bios_release(adev); 4795 } 4796 4797 kfree(adev->fru_info); 4798 adev->fru_info = NULL; 4799 4800 px = 
amdgpu_device_supports_px(adev_to_drm(adev)); 4801 4802 if (px || (!dev_is_removable(&adev->pdev->dev) && 4803 apple_gmux_detect(NULL, NULL))) 4804 vga_switcheroo_unregister_client(adev->pdev); 4805 4806 if (px) 4807 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4808 4809 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4810 vga_client_unregister(adev->pdev); 4811 4812 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4813 4814 iounmap(adev->rmmio); 4815 adev->rmmio = NULL; 4816 amdgpu_doorbell_fini(adev); 4817 drm_dev_exit(idx); 4818 } 4819 4820 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4821 amdgpu_pmu_fini(adev); 4822 if (adev->mman.discovery_bin) 4823 amdgpu_discovery_fini(adev); 4824 4825 amdgpu_reset_put_reset_domain(adev->reset_domain); 4826 adev->reset_domain = NULL; 4827 4828 kfree(adev->pci_state); 4829 4830 } 4831 4832 /** 4833 * amdgpu_device_evict_resources - evict device resources 4834 * @adev: amdgpu device object 4835 * 4836 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4837 * of the vram memory type. Mainly used for evicting device resources 4838 * at suspend time. 4839 * 4840 */ 4841 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4842 { 4843 int ret; 4844 4845 /* No need to evict vram on APUs unless going to S4 */ 4846 if (!adev->in_s4 && (adev->flags & AMD_IS_APU)) 4847 return 0; 4848 4849 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4850 if (ret) 4851 DRM_WARN("evicting device resources failed\n"); 4852 return ret; 4853 } 4854 4855 /* 4856 * Suspend & resume. 4857 */ 4858 /** 4859 * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events 4860 * @nb: notifier block 4861 * @mode: suspend mode 4862 * @data: data 4863 * 4864 * This function is called when the system is about to suspend or hibernate. 4865 * It is used to evict resources from the device before the system goes to 4866 * sleep while there is still access to swap. 4867 */ 4868 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 4869 void *data) 4870 { 4871 struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb); 4872 int r; 4873 4874 switch (mode) { 4875 case PM_HIBERNATION_PREPARE: 4876 adev->in_s4 = true; 4877 fallthrough; 4878 case PM_SUSPEND_PREPARE: 4879 r = amdgpu_device_evict_resources(adev); 4880 /* 4881 * This is considered non-fatal at this time because 4882 * amdgpu_device_prepare() will also fatally evict resources. 4883 * See https://gitlab.freedesktop.org/drm/amd/-/issues/3781 4884 */ 4885 if (r) 4886 drm_warn(adev_to_drm(adev), "Failed to evict resources, freeze active processes if problems occur: %d\n", r); 4887 break; 4888 } 4889 4890 return NOTIFY_DONE; 4891 } 4892 4893 /** 4894 * amdgpu_device_prepare - prepare for device suspend 4895 * 4896 * @dev: drm dev pointer 4897 * 4898 * Prepare to put the hw in the suspend state (all asics). 4899 * Returns 0 for success or an error on failure. 4900 * Called at driver suspend. 
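* This evicts the bulk of the VRAM buffer objects and runs each IP block's prepare_suspend callback; on failure the s0ix/s3/s4 state flags are cleared again.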
4901 */ 4902 int amdgpu_device_prepare(struct drm_device *dev) 4903 { 4904 struct amdgpu_device *adev = drm_to_adev(dev); 4905 int i, r; 4906 4907 amdgpu_choose_low_power_state(adev); 4908 4909 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4910 return 0; 4911 4912 /* Evict the majority of BOs before starting suspend sequence */ 4913 r = amdgpu_device_evict_resources(adev); 4914 if (r) 4915 goto unprepare; 4916 4917 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 4918 4919 for (i = 0; i < adev->num_ip_blocks; i++) { 4920 if (!adev->ip_blocks[i].status.valid) 4921 continue; 4922 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 4923 continue; 4924 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]); 4925 if (r) 4926 goto unprepare; 4927 } 4928 4929 return 0; 4930 4931 unprepare: 4932 adev->in_s0ix = adev->in_s3 = adev->in_s4 = false; 4933 4934 return r; 4935 } 4936 4937 /** 4938 * amdgpu_device_suspend - initiate device suspend 4939 * 4940 * @dev: drm dev pointer 4941 * @notify_clients: notify in-kernel DRM clients 4942 * 4943 * Puts the hw in the suspend state (all asics). 4944 * Returns 0 for success or an error on failure. 4945 * Called at driver suspend. 4946 */ 4947 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) 4948 { 4949 struct amdgpu_device *adev = drm_to_adev(dev); 4950 int r = 0; 4951 4952 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4953 return 0; 4954 4955 adev->in_suspend = true; 4956 4957 if (amdgpu_sriov_vf(adev)) { 4958 amdgpu_virt_fini_data_exchange(adev); 4959 r = amdgpu_virt_request_full_gpu(adev, false); 4960 if (r) 4961 return r; 4962 } 4963 4964 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4965 DRM_WARN("smart shift update failed\n"); 4966 4967 if (notify_clients) 4968 drm_client_dev_suspend(adev_to_drm(adev), false); 4969 4970 cancel_delayed_work_sync(&adev->delayed_init_work); 4971 4972 amdgpu_ras_suspend(adev); 4973 4974 amdgpu_device_ip_suspend_phase1(adev); 4975 4976 if (!adev->in_s0ix) 4977 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4978 4979 r = amdgpu_device_evict_resources(adev); 4980 if (r) 4981 return r; 4982 4983 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4984 4985 amdgpu_fence_driver_hw_fini(adev); 4986 4987 amdgpu_device_ip_suspend_phase2(adev); 4988 4989 if (amdgpu_sriov_vf(adev)) 4990 amdgpu_virt_release_full_gpu(adev, false); 4991 4992 r = amdgpu_dpm_notify_rlc_state(adev, false); 4993 if (r) 4994 return r; 4995 4996 return 0; 4997 } 4998 4999 /** 5000 * amdgpu_device_resume - initiate device resume 5001 * 5002 * @dev: drm dev pointer 5003 * @notify_clients: notify in-kernel DRM clients 5004 * 5005 * Bring the hw back to operating state (all asics). 5006 * Returns 0 for success or an error on failure. 5007 * Called at driver resume. 
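* For SR-IOV, full GPU access is requested from the host for the duration of the resume sequence.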
5008 */ 5009 int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) 5010 { 5011 struct amdgpu_device *adev = drm_to_adev(dev); 5012 int r = 0; 5013 5014 if (amdgpu_sriov_vf(adev)) { 5015 r = amdgpu_virt_request_full_gpu(adev, true); 5016 if (r) 5017 return r; 5018 } 5019 5020 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5021 return 0; 5022 5023 if (adev->in_s0ix) 5024 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 5025 5026 /* post card */ 5027 if (amdgpu_device_need_post(adev)) { 5028 r = amdgpu_device_asic_init(adev); 5029 if (r) 5030 dev_err(adev->dev, "amdgpu asic init failed\n"); 5031 } 5032 5033 r = amdgpu_device_ip_resume(adev); 5034 5035 if (r) { 5036 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 5037 goto exit; 5038 } 5039 5040 if (!adev->in_s0ix) { 5041 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 5042 if (r) 5043 goto exit; 5044 } 5045 5046 r = amdgpu_device_ip_late_init(adev); 5047 if (r) 5048 goto exit; 5049 5050 queue_delayed_work(system_wq, &adev->delayed_init_work, 5051 msecs_to_jiffies(AMDGPU_RESUME_MS)); 5052 exit: 5053 if (amdgpu_sriov_vf(adev)) { 5054 amdgpu_virt_init_data_exchange(adev); 5055 amdgpu_virt_release_full_gpu(adev, true); 5056 } 5057 5058 if (r) 5059 return r; 5060 5061 /* Make sure IB tests flushed */ 5062 flush_delayed_work(&adev->delayed_init_work); 5063 5064 if (notify_clients) 5065 drm_client_dev_resume(adev_to_drm(adev), false); 5066 5067 amdgpu_ras_resume(adev); 5068 5069 if (adev->mode_info.num_crtc) { 5070 /* 5071 * Most of the connector probing functions try to acquire runtime pm 5072 * refs to ensure that the GPU is powered on when connector polling is 5073 * performed. Since we're calling this from a runtime PM callback, 5074 * trying to acquire rpm refs will cause us to deadlock. 5075 * 5076 * Since we're guaranteed to be holding the rpm lock, it's safe to 5077 * temporarily disable the rpm helpers so this doesn't deadlock us. 5078 */ 5079 #ifdef CONFIG_PM 5080 dev->dev->power.disable_depth++; 5081 #endif 5082 if (!adev->dc_enabled) 5083 drm_helper_hpd_irq_event(dev); 5084 else 5085 drm_kms_helper_hotplug_event(dev); 5086 #ifdef CONFIG_PM 5087 dev->dev->power.disable_depth--; 5088 #endif 5089 } 5090 adev->in_suspend = false; 5091 5092 if (adev->enable_mes) 5093 amdgpu_mes_self_test(adev); 5094 5095 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 5096 DRM_WARN("smart shift update failed\n"); 5097 5098 return 0; 5099 } 5100 5101 /** 5102 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 5103 * 5104 * @adev: amdgpu_device pointer 5105 * 5106 * The list of all the hardware IPs that make up the asic is walked and 5107 * the check_soft_reset callbacks are run. check_soft_reset determines 5108 * if the asic is still hung or not. 5109 * Returns true if any of the IPs are still in a hung state, false if not. 
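* SR-IOV VFs and ASICs that report needing a full reset are treated as hung unconditionally.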
5110 */ 5111 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 5112 { 5113 int i; 5114 bool asic_hang = false; 5115 5116 if (amdgpu_sriov_vf(adev)) 5117 return true; 5118 5119 if (amdgpu_asic_need_full_reset(adev)) 5120 return true; 5121 5122 for (i = 0; i < adev->num_ip_blocks; i++) { 5123 if (!adev->ip_blocks[i].status.valid) 5124 continue; 5125 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 5126 adev->ip_blocks[i].status.hang = 5127 adev->ip_blocks[i].version->funcs->check_soft_reset( 5128 &adev->ip_blocks[i]); 5129 if (adev->ip_blocks[i].status.hang) { 5130 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 5131 asic_hang = true; 5132 } 5133 } 5134 return asic_hang; 5135 } 5136 5137 /** 5138 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 5139 * 5140 * @adev: amdgpu_device pointer 5141 * 5142 * The list of all the hardware IPs that make up the asic is walked and the 5143 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 5144 * handles any IP specific hardware or software state changes that are 5145 * necessary for a soft reset to succeed. 5146 * Returns 0 on success, negative error code on failure. 5147 */ 5148 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 5149 { 5150 int i, r = 0; 5151 5152 for (i = 0; i < adev->num_ip_blocks; i++) { 5153 if (!adev->ip_blocks[i].status.valid) 5154 continue; 5155 if (adev->ip_blocks[i].status.hang && 5156 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 5157 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]); 5158 if (r) 5159 return r; 5160 } 5161 } 5162 5163 return 0; 5164 } 5165 5166 /** 5167 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 5168 * 5169 * @adev: amdgpu_device pointer 5170 * 5171 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 5172 * reset is necessary to recover. 5173 * Returns true if a full asic reset is required, false if not. 5174 */ 5175 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 5176 { 5177 int i; 5178 5179 if (amdgpu_asic_need_full_reset(adev)) 5180 return true; 5181 5182 for (i = 0; i < adev->num_ip_blocks; i++) { 5183 if (!adev->ip_blocks[i].status.valid) 5184 continue; 5185 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 5186 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 5187 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 5188 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 5189 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 5190 if (adev->ip_blocks[i].status.hang) { 5191 dev_info(adev->dev, "Some block need full reset!\n"); 5192 return true; 5193 } 5194 } 5195 } 5196 return false; 5197 } 5198 5199 /** 5200 * amdgpu_device_ip_soft_reset - do a soft reset 5201 * 5202 * @adev: amdgpu_device pointer 5203 * 5204 * The list of all the hardware IPs that make up the asic is walked and the 5205 * soft_reset callbacks are run if the block is hung. soft_reset handles any 5206 * IP specific hardware or software state changes that are necessary to soft 5207 * reset the IP. 5208 * Returns 0 on success, negative error code on failure. 
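* Only blocks that were flagged as hung by check_soft_reset and that implement the callback are reset.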
5209 */ 5210 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 5211 { 5212 int i, r = 0; 5213 5214 for (i = 0; i < adev->num_ip_blocks; i++) { 5215 if (!adev->ip_blocks[i].status.valid) 5216 continue; 5217 if (adev->ip_blocks[i].status.hang && 5218 adev->ip_blocks[i].version->funcs->soft_reset) { 5219 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]); 5220 if (r) 5221 return r; 5222 } 5223 } 5224 5225 return 0; 5226 } 5227 5228 /** 5229 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 5230 * 5231 * @adev: amdgpu_device pointer 5232 * 5233 * The list of all the hardware IPs that make up the asic is walked and the 5234 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 5235 * handles any IP specific hardware or software state changes that are 5236 * necessary after the IP has been soft reset. 5237 * Returns 0 on success, negative error code on failure. 5238 */ 5239 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 5240 { 5241 int i, r = 0; 5242 5243 for (i = 0; i < adev->num_ip_blocks; i++) { 5244 if (!adev->ip_blocks[i].status.valid) 5245 continue; 5246 if (adev->ip_blocks[i].status.hang && 5247 adev->ip_blocks[i].version->funcs->post_soft_reset) 5248 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]); 5249 if (r) 5250 return r; 5251 } 5252 5253 return 0; 5254 } 5255 5256 /** 5257 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 5258 * 5259 * @adev: amdgpu_device pointer 5260 * @reset_context: amdgpu reset context pointer 5261 * 5262 * do VF FLR and reinitialize Asic 5263 * return 0 means succeeded otherwise failed 5264 */ 5265 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 5266 struct amdgpu_reset_context *reset_context) 5267 { 5268 int r; 5269 struct amdgpu_hive_info *hive = NULL; 5270 5271 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { 5272 if (!amdgpu_ras_get_fed_status(adev)) 5273 amdgpu_virt_ready_to_reset(adev); 5274 amdgpu_virt_wait_reset(adev); 5275 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5276 r = amdgpu_virt_request_full_gpu(adev, true); 5277 } else { 5278 r = amdgpu_virt_reset_gpu(adev); 5279 } 5280 if (r) 5281 return r; 5282 5283 amdgpu_ras_clear_err_state(adev); 5284 amdgpu_irq_gpu_reset_resume_helper(adev); 5285 5286 /* some sw clean up VF needs to do before recover */ 5287 amdgpu_virt_post_reset(adev); 5288 5289 /* Resume IP prior to SMC */ 5290 r = amdgpu_device_ip_reinit_early_sriov(adev); 5291 if (r) 5292 return r; 5293 5294 amdgpu_virt_init_data_exchange(adev); 5295 5296 r = amdgpu_device_fw_loading(adev); 5297 if (r) 5298 return r; 5299 5300 /* now we are okay to resume SMC/CP/SDMA */ 5301 r = amdgpu_device_ip_reinit_late_sriov(adev); 5302 if (r) 5303 return r; 5304 5305 hive = amdgpu_get_xgmi_hive(adev); 5306 /* Update PSP FW topology after reset */ 5307 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5308 r = amdgpu_xgmi_update_topology(hive, adev); 5309 if (hive) 5310 amdgpu_put_xgmi_hive(hive); 5311 if (r) 5312 return r; 5313 5314 r = amdgpu_ib_ring_tests(adev); 5315 if (r) 5316 return r; 5317 5318 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) 5319 amdgpu_inc_vram_lost(adev); 5320 5321 /* need to be called during full access so we can't do it later like 5322 * bare-metal does. 
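* i.e. amdgpu_amdkfd_post_reset() has to run before full GPU access is released below.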
5323 */ 5324 amdgpu_amdkfd_post_reset(adev); 5325 amdgpu_virt_release_full_gpu(adev, true); 5326 5327 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5328 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) || 5329 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 5330 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 5331 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5332 amdgpu_ras_resume(adev); 5333 5334 amdgpu_virt_ras_telemetry_post_reset(adev); 5335 5336 return 0; 5337 } 5338 5339 /** 5340 * amdgpu_device_has_job_running - check if there is any unfinished job 5341 * 5342 * @adev: amdgpu_device pointer 5343 * 5344 * check if there is any job running on the device when guest driver receives 5345 * FLR notification from host driver. If there are still jobs running, then 5346 * the guest driver will not respond the FLR reset. Instead, let the job hit 5347 * the timeout and guest driver then issue the reset request. 5348 */ 5349 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5350 { 5351 int i; 5352 5353 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5354 struct amdgpu_ring *ring = adev->rings[i]; 5355 5356 if (!amdgpu_ring_sched_ready(ring)) 5357 continue; 5358 5359 if (amdgpu_fence_count_emitted(ring)) 5360 return true; 5361 } 5362 return false; 5363 } 5364 5365 /** 5366 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5367 * 5368 * @adev: amdgpu_device pointer 5369 * 5370 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5371 * a hung GPU. 5372 */ 5373 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5374 { 5375 5376 if (amdgpu_gpu_recovery == 0) 5377 goto disabled; 5378 5379 /* Skip soft reset check in fatal error mode */ 5380 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5381 return true; 5382 5383 if (amdgpu_sriov_vf(adev)) 5384 return true; 5385 5386 if (amdgpu_gpu_recovery == -1) { 5387 switch (adev->asic_type) { 5388 #ifdef CONFIG_DRM_AMDGPU_SI 5389 case CHIP_VERDE: 5390 case CHIP_TAHITI: 5391 case CHIP_PITCAIRN: 5392 case CHIP_OLAND: 5393 case CHIP_HAINAN: 5394 #endif 5395 #ifdef CONFIG_DRM_AMDGPU_CIK 5396 case CHIP_KAVERI: 5397 case CHIP_KABINI: 5398 case CHIP_MULLINS: 5399 #endif 5400 case CHIP_CARRIZO: 5401 case CHIP_STONEY: 5402 case CHIP_CYAN_SKILLFISH: 5403 goto disabled; 5404 default: 5405 break; 5406 } 5407 } 5408 5409 return true; 5410 5411 disabled: 5412 dev_info(adev->dev, "GPU recovery disabled.\n"); 5413 return false; 5414 } 5415 5416 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5417 { 5418 u32 i; 5419 int ret = 0; 5420 5421 if (adev->bios) 5422 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5423 5424 dev_info(adev->dev, "GPU mode1 reset\n"); 5425 5426 /* Cache the state before bus master disable. The saved config space 5427 * values are used in other cases like restore after mode-2 reset. 
5428 */ 5429 amdgpu_device_cache_pci_state(adev->pdev); 5430 5431 /* disable BM */ 5432 pci_clear_master(adev->pdev); 5433 5434 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5435 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5436 ret = amdgpu_dpm_mode1_reset(adev); 5437 } else { 5438 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5439 ret = psp_gpu_reset(adev); 5440 } 5441 5442 if (ret) 5443 goto mode1_reset_failed; 5444 5445 amdgpu_device_load_pci_state(adev->pdev); 5446 ret = amdgpu_psp_wait_for_bootloader(adev); 5447 if (ret) 5448 goto mode1_reset_failed; 5449 5450 /* wait for asic to come out of reset */ 5451 for (i = 0; i < adev->usec_timeout; i++) { 5452 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5453 5454 if (memsize != 0xffffffff) 5455 break; 5456 udelay(1); 5457 } 5458 5459 if (i >= adev->usec_timeout) { 5460 ret = -ETIMEDOUT; 5461 goto mode1_reset_failed; 5462 } 5463 5464 if (adev->bios) 5465 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5466 5467 return 0; 5468 5469 mode1_reset_failed: 5470 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5471 return ret; 5472 } 5473 5474 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5475 struct amdgpu_reset_context *reset_context) 5476 { 5477 int i, r = 0; 5478 struct amdgpu_job *job = NULL; 5479 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; 5480 bool need_full_reset = 5481 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5482 5483 if (reset_context->reset_req_dev == adev) 5484 job = reset_context->job; 5485 5486 if (amdgpu_sriov_vf(adev)) 5487 amdgpu_virt_pre_reset(adev); 5488 5489 amdgpu_fence_driver_isr_toggle(adev, true); 5490 5491 /* block all schedulers and reset given job's ring */ 5492 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5493 struct amdgpu_ring *ring = adev->rings[i]; 5494 5495 if (!amdgpu_ring_sched_ready(ring)) 5496 continue; 5497 5498 /* Clear job fence from fence drv to avoid force_completion 5499 * leave NULL and vm flush fence in fence drv 5500 */ 5501 amdgpu_fence_driver_clear_job_fences(ring); 5502 5503 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5504 amdgpu_fence_driver_force_completion(ring); 5505 } 5506 5507 amdgpu_fence_driver_isr_toggle(adev, false); 5508 5509 if (job && job->vm) 5510 drm_sched_increase_karma(&job->base); 5511 5512 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5513 /* If reset handler not implemented, continue; otherwise return */ 5514 if (r == -EOPNOTSUPP) 5515 r = 0; 5516 else 5517 return r; 5518 5519 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5520 if (!amdgpu_sriov_vf(adev)) { 5521 5522 if (!need_full_reset) 5523 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5524 5525 if (!need_full_reset && amdgpu_gpu_recovery && 5526 amdgpu_device_ip_check_soft_reset(adev)) { 5527 amdgpu_device_ip_pre_soft_reset(adev); 5528 r = amdgpu_device_ip_soft_reset(adev); 5529 amdgpu_device_ip_post_soft_reset(adev); 5530 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5531 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5532 need_full_reset = true; 5533 } 5534 } 5535 5536 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { 5537 dev_info(tmp_adev->dev, "Dumping IP State\n"); 5538 /* Trigger ip dump before we reset the asic */ 5539 for (i = 0; i < tmp_adev->num_ip_blocks; i++) 5540 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) 5541 tmp_adev->ip_blocks[i].version->funcs 5542 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]); 5543 
dev_info(tmp_adev->dev, "Dumping IP State Completed\n"); 5544 } 5545 5546 if (need_full_reset) 5547 r = amdgpu_device_ip_suspend(adev); 5548 if (need_full_reset) 5549 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5550 else 5551 clear_bit(AMDGPU_NEED_FULL_RESET, 5552 &reset_context->flags); 5553 } 5554 5555 return r; 5556 } 5557 5558 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) 5559 { 5560 struct list_head *device_list_handle; 5561 bool full_reset, vram_lost = false; 5562 struct amdgpu_device *tmp_adev; 5563 int r, init_level; 5564 5565 device_list_handle = reset_context->reset_device_list; 5566 5567 if (!device_list_handle) 5568 return -EINVAL; 5569 5570 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5571 5572 /** 5573 * If it's reset on init, it's default init level, otherwise keep level 5574 * as recovery level. 5575 */ 5576 if (reset_context->method == AMD_RESET_METHOD_ON_INIT) 5577 init_level = AMDGPU_INIT_LEVEL_DEFAULT; 5578 else 5579 init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY; 5580 5581 r = 0; 5582 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5583 amdgpu_set_init_level(tmp_adev, init_level); 5584 if (full_reset) { 5585 /* post card */ 5586 amdgpu_ras_clear_err_state(tmp_adev); 5587 r = amdgpu_device_asic_init(tmp_adev); 5588 if (r) { 5589 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5590 } else { 5591 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5592 5593 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5594 if (r) 5595 goto out; 5596 5597 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5598 5599 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 5600 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job); 5601 5602 if (vram_lost) { 5603 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5604 amdgpu_inc_vram_lost(tmp_adev); 5605 } 5606 5607 r = amdgpu_device_fw_loading(tmp_adev); 5608 if (r) 5609 return r; 5610 5611 r = amdgpu_xcp_restore_partition_mode( 5612 tmp_adev->xcp_mgr); 5613 if (r) 5614 goto out; 5615 5616 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5617 if (r) 5618 goto out; 5619 5620 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 5621 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5622 5623 r = amdgpu_device_ip_resume_phase3(tmp_adev); 5624 if (r) 5625 goto out; 5626 5627 if (vram_lost) 5628 amdgpu_device_fill_reset_magic(tmp_adev); 5629 5630 /* 5631 * Add this ASIC as tracked as reset was already 5632 * complete successfully. 5633 */ 5634 amdgpu_register_gpu_instance(tmp_adev); 5635 5636 if (!reset_context->hive && 5637 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5638 amdgpu_xgmi_add_device(tmp_adev); 5639 5640 r = amdgpu_device_ip_late_init(tmp_adev); 5641 if (r) 5642 goto out; 5643 5644 drm_client_dev_resume(adev_to_drm(tmp_adev), false); 5645 5646 /* 5647 * The GPU enters bad state once faulty pages 5648 * by ECC has reached the threshold, and ras 5649 * recovery is scheduled next. So add one check 5650 * here to break recovery if it indeed exceeds 5651 * bad page threshold, and remind user to 5652 * retire this GPU or setting one bigger 5653 * bad_page_threshold value to fix this once 5654 * probing driver again. 5655 */ 5656 if (!amdgpu_ras_is_rma(tmp_adev)) { 5657 /* must succeed. 
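* (if RAS has marked the device as RMA, recovery is aborted via the else branch below instead)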
*/ 5658 amdgpu_ras_resume(tmp_adev); 5659 } else { 5660 r = -EINVAL; 5661 goto out; 5662 } 5663 5664 /* Update PSP FW topology after reset */ 5665 if (reset_context->hive && 5666 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5667 r = amdgpu_xgmi_update_topology( 5668 reset_context->hive, tmp_adev); 5669 } 5670 } 5671 5672 out: 5673 if (!r) { 5674 /* IP init is complete now, set level as default */ 5675 amdgpu_set_init_level(tmp_adev, 5676 AMDGPU_INIT_LEVEL_DEFAULT); 5677 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5678 r = amdgpu_ib_ring_tests(tmp_adev); 5679 if (r) { 5680 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5681 r = -EAGAIN; 5682 goto end; 5683 } 5684 } 5685 5686 if (r) 5687 tmp_adev->asic_reset_res = r; 5688 } 5689 5690 end: 5691 return r; 5692 } 5693 5694 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5695 struct amdgpu_reset_context *reset_context) 5696 { 5697 struct amdgpu_device *tmp_adev = NULL; 5698 bool need_full_reset, skip_hw_reset; 5699 int r = 0; 5700 5701 /* Try reset handler method first */ 5702 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5703 reset_list); 5704 5705 reset_context->reset_device_list = device_list_handle; 5706 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5707 /* If reset handler not implemented, continue; otherwise return */ 5708 if (r == -EOPNOTSUPP) 5709 r = 0; 5710 else 5711 return r; 5712 5713 /* Reset handler not implemented, use the default method */ 5714 need_full_reset = 5715 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5716 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5717 5718 /* 5719 * ASIC reset has to be done on all XGMI hive nodes ASAP 5720 * to allow proper links negotiation in FW (within 1 sec) 5721 */ 5722 if (!skip_hw_reset && need_full_reset) { 5723 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5724 /* For XGMI run all resets in parallel to speed up the process */ 5725 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5726 if (!queue_work(system_unbound_wq, 5727 &tmp_adev->xgmi_reset_work)) 5728 r = -EALREADY; 5729 } else 5730 r = amdgpu_asic_reset(tmp_adev); 5731 5732 if (r) { 5733 dev_err(tmp_adev->dev, 5734 "ASIC reset failed with error, %d for drm dev, %s", 5735 r, adev_to_drm(tmp_adev)->unique); 5736 goto out; 5737 } 5738 } 5739 5740 /* For XGMI wait for all resets to complete before proceed */ 5741 if (!r) { 5742 list_for_each_entry(tmp_adev, device_list_handle, 5743 reset_list) { 5744 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5745 flush_work(&tmp_adev->xgmi_reset_work); 5746 r = tmp_adev->asic_reset_res; 5747 if (r) 5748 break; 5749 } 5750 } 5751 } 5752 } 5753 5754 if (!r && amdgpu_ras_intr_triggered()) { 5755 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5756 amdgpu_ras_reset_error_count(tmp_adev, 5757 AMDGPU_RAS_BLOCK__MMHUB); 5758 } 5759 5760 amdgpu_ras_intr_cleared(); 5761 } 5762 5763 r = amdgpu_device_reinit_after_reset(reset_context); 5764 if (r == -EAGAIN) 5765 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5766 else 5767 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5768 5769 out: 5770 return r; 5771 } 5772 5773 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5774 { 5775 5776 switch (amdgpu_asic_reset_method(adev)) { 5777 case AMD_RESET_METHOD_MODE1: 5778 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5779 break; 5780 case AMD_RESET_METHOD_MODE2: 5781 adev->mp1_state = PP_MP1_STATE_RESET; 5782 break; 5783 default: 5784 adev->mp1_state = 
PP_MP1_STATE_NONE; 5785 break; 5786 } 5787 } 5788 5789 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5790 { 5791 amdgpu_vf_error_trans_all(adev); 5792 adev->mp1_state = PP_MP1_STATE_NONE; 5793 } 5794 5795 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5796 { 5797 struct pci_dev *p = NULL; 5798 5799 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5800 adev->pdev->bus->number, 1); 5801 if (p) { 5802 pm_runtime_enable(&(p->dev)); 5803 pm_runtime_resume(&(p->dev)); 5804 } 5805 5806 pci_dev_put(p); 5807 } 5808 5809 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5810 { 5811 enum amd_reset_method reset_method; 5812 struct pci_dev *p = NULL; 5813 u64 expires; 5814 5815 /* 5816 * For now, only BACO and mode1 reset are confirmed 5817 * to suffer the audio issue without proper suspended. 5818 */ 5819 reset_method = amdgpu_asic_reset_method(adev); 5820 if ((reset_method != AMD_RESET_METHOD_BACO) && 5821 (reset_method != AMD_RESET_METHOD_MODE1)) 5822 return -EINVAL; 5823 5824 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5825 adev->pdev->bus->number, 1); 5826 if (!p) 5827 return -ENODEV; 5828 5829 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5830 if (!expires) 5831 /* 5832 * If we cannot get the audio device autosuspend delay, 5833 * a fixed 4S interval will be used. Considering 3S is 5834 * the audio controller default autosuspend delay setting. 5835 * 4S used here is guaranteed to cover that. 5836 */ 5837 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5838 5839 while (!pm_runtime_status_suspended(&(p->dev))) { 5840 if (!pm_runtime_suspend(&(p->dev))) 5841 break; 5842 5843 if (expires < ktime_get_mono_fast_ns()) { 5844 dev_warn(adev->dev, "failed to suspend display audio\n"); 5845 pci_dev_put(p); 5846 /* TODO: abort the succeeding gpu reset? */ 5847 return -ETIMEDOUT; 5848 } 5849 } 5850 5851 pm_runtime_disable(&(p->dev)); 5852 5853 pci_dev_put(p); 5854 return 0; 5855 } 5856 5857 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5858 { 5859 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5860 5861 #if defined(CONFIG_DEBUG_FS) 5862 if (!amdgpu_sriov_vf(adev)) 5863 cancel_work(&adev->reset_work); 5864 #endif 5865 5866 if (adev->kfd.dev) 5867 cancel_work(&adev->kfd.reset_work); 5868 5869 if (amdgpu_sriov_vf(adev)) 5870 cancel_work(&adev->virt.flr_work); 5871 5872 if (con && adev->ras_enabled) 5873 cancel_work(&con->recovery_work); 5874 5875 } 5876 5877 static int amdgpu_device_health_check(struct list_head *device_list_handle) 5878 { 5879 struct amdgpu_device *tmp_adev; 5880 int ret = 0; 5881 u32 status; 5882 5883 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5884 pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status); 5885 if (PCI_POSSIBLE_ERROR(status)) { 5886 dev_err(tmp_adev->dev, "device lost from bus!"); 5887 ret = -ENODEV; 5888 } 5889 } 5890 5891 return ret; 5892 } 5893 5894 /** 5895 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5896 * 5897 * @adev: amdgpu_device pointer 5898 * @job: which job trigger hang 5899 * @reset_context: amdgpu reset context pointer 5900 * 5901 * Attempt to reset the GPU if it has hung (all asics). 5902 * Attempt to do soft-reset or full-reset and reinitialize Asic 5903 * Returns 0 for success or an error on failure. 
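 * * The high level flow is: suspend the display audio controller, block * the schedulers of every affected device, run the per device pre reset * handlers, perform the ASIC reset (across the whole XGMI hive when * needed), re-initialize the IP blocks and finally restart the * schedulers and resume the audio device.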
5904 */ 5905 5906 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5907 struct amdgpu_job *job, 5908 struct amdgpu_reset_context *reset_context) 5909 { 5910 struct list_head device_list, *device_list_handle = NULL; 5911 bool job_signaled = false; 5912 struct amdgpu_hive_info *hive = NULL; 5913 struct amdgpu_device *tmp_adev = NULL; 5914 int i, r = 0; 5915 bool need_emergency_restart = false; 5916 bool audio_suspended = false; 5917 int retry_limit = AMDGPU_MAX_RETRY_LIMIT; 5918 5919 /* 5920 * If it reaches here because of hang/timeout and a RAS error is 5921 * detected at the same time, let RAS recovery take care of it. 5922 */ 5923 if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) && 5924 !amdgpu_sriov_vf(adev) && 5925 reset_context->src != AMDGPU_RESET_SRC_RAS) { 5926 dev_dbg(adev->dev, 5927 "Gpu recovery from source: %d yielding to RAS error recovery handling", 5928 reset_context->src); 5929 return 0; 5930 } 5931 /* 5932 * Special case: RAS triggered and full reset isn't supported 5933 */ 5934 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5935 5936 /* 5937 * Flush RAM to disk so that after reboot 5938 * the user can read log and see why the system rebooted. 5939 */ 5940 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 5941 amdgpu_ras_get_context(adev)->reboot) { 5942 DRM_WARN("Emergency reboot."); 5943 5944 ksys_sync_helper(); 5945 emergency_restart(); 5946 } 5947 5948 dev_info(adev->dev, "GPU %s begin!\n", 5949 need_emergency_restart ? "jobs stop":"reset"); 5950 5951 if (!amdgpu_sriov_vf(adev)) 5952 hive = amdgpu_get_xgmi_hive(adev); 5953 if (hive) 5954 mutex_lock(&hive->hive_lock); 5955 5956 reset_context->job = job; 5957 reset_context->hive = hive; 5958 /* 5959 * Build list of devices to reset. 5960 * In case we are in XGMI hive mode, resort the device list 5961 * to put adev in the 1st position. 5962 */ 5963 INIT_LIST_HEAD(&device_list); 5964 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) { 5965 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5966 list_add_tail(&tmp_adev->reset_list, &device_list); 5967 if (adev->shutdown) 5968 tmp_adev->shutdown = true; 5969 } 5970 if (!list_is_first(&adev->reset_list, &device_list)) 5971 list_rotate_to_front(&adev->reset_list, &device_list); 5972 device_list_handle = &device_list; 5973 } else { 5974 list_add_tail(&adev->reset_list, &device_list); 5975 device_list_handle = &device_list; 5976 } 5977 5978 if (!amdgpu_sriov_vf(adev)) { 5979 r = amdgpu_device_health_check(device_list_handle); 5980 if (r) 5981 goto end_reset; 5982 } 5983 5984 /* We need to lock reset domain only once both for XGMI and single device */ 5985 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5986 reset_list); 5987 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5988 5989 /* block all schedulers and reset given job's ring */ 5990 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5991 5992 amdgpu_device_set_mp1_state(tmp_adev); 5993 5994 /* 5995 * Try to put the audio codec into suspend state 5996 * before gpu reset started. 5997 * 5998 * Due to the power domain of the graphics device 5999 * is shared with AZ power domain. Without this, 6000 * we may change the audio hardware from behind 6001 * the audio driver's back. That will trigger 6002 * some audio codec errors. 
6003 */ 6004 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 6005 audio_suspended = true; 6006 6007 amdgpu_ras_set_error_query_ready(tmp_adev, false); 6008 6009 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 6010 6011 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context); 6012 6013 /* 6014 * Mark these ASICs to be reset as untracked first 6015 * And add them back after reset completed 6016 */ 6017 amdgpu_unregister_gpu_instance(tmp_adev); 6018 6019 drm_client_dev_suspend(adev_to_drm(tmp_adev), false); 6020 6021 /* disable ras on ALL IPs */ 6022 if (!need_emergency_restart && 6023 amdgpu_device_ip_need_full_reset(tmp_adev)) 6024 amdgpu_ras_suspend(tmp_adev); 6025 6026 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6027 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6028 6029 if (!amdgpu_ring_sched_ready(ring)) 6030 continue; 6031 6032 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 6033 6034 if (need_emergency_restart) 6035 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 6036 } 6037 atomic_inc(&tmp_adev->gpu_reset_counter); 6038 } 6039 6040 if (need_emergency_restart) 6041 goto skip_sched_resume; 6042 6043 /* 6044 * Must check guilty signal here since after this point all old 6045 * HW fences are force signaled. 6046 * 6047 * job->base holds a reference to parent fence 6048 */ 6049 if (job && dma_fence_is_signaled(&job->hw_fence)) { 6050 job_signaled = true; 6051 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 6052 goto skip_hw_reset; 6053 } 6054 6055 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 6056 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6057 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 6058 /*TODO Should we stop ?*/ 6059 if (r) { 6060 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 6061 r, adev_to_drm(tmp_adev)->unique); 6062 tmp_adev->asic_reset_res = r; 6063 } 6064 } 6065 6066 /* Actual ASIC resets if needed.*/ 6067 /* Host driver will handle XGMI hive reset for SRIOV */ 6068 if (amdgpu_sriov_vf(adev)) { 6069 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { 6070 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); 6071 amdgpu_ras_set_fed(adev, true); 6072 set_bit(AMDGPU_HOST_FLR, &reset_context->flags); 6073 } 6074 6075 r = amdgpu_device_reset_sriov(adev, reset_context); 6076 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) { 6077 amdgpu_virt_release_full_gpu(adev, true); 6078 goto retry; 6079 } 6080 if (r) 6081 adev->asic_reset_res = r; 6082 } else { 6083 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 6084 if (r && r == -EAGAIN) 6085 goto retry; 6086 } 6087 6088 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6089 /* 6090 * Drop any pending non scheduler resets queued before reset is done. 6091 * Any reset scheduled after this point would be valid. Scheduler resets 6092 * were already dropped during drm_sched_stop and no new ones can come 6093 * in before drm_sched_start. 
6094 */ 6095 amdgpu_device_stop_pending_resets(tmp_adev); 6096 } 6097 6098 skip_hw_reset: 6099 6100 /* Post ASIC reset for all devs .*/ 6101 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6102 6103 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6104 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6105 6106 if (!amdgpu_ring_sched_ready(ring)) 6107 continue; 6108 6109 drm_sched_start(&ring->sched, 0); 6110 } 6111 6112 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 6113 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 6114 6115 if (tmp_adev->asic_reset_res) 6116 r = tmp_adev->asic_reset_res; 6117 6118 tmp_adev->asic_reset_res = 0; 6119 6120 if (r) { 6121 /* bad news, how to tell it to userspace ? 6122 * for ras error, we should report GPU bad status instead of 6123 * reset failure 6124 */ 6125 if (reset_context->src != AMDGPU_RESET_SRC_RAS || 6126 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) 6127 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", 6128 atomic_read(&tmp_adev->gpu_reset_counter)); 6129 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 6130 } else { 6131 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 6132 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 6133 DRM_WARN("smart shift update failed\n"); 6134 } 6135 } 6136 6137 skip_sched_resume: 6138 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6139 /* unlock kfd: SRIOV would do it separately */ 6140 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 6141 amdgpu_amdkfd_post_reset(tmp_adev); 6142 6143 /* kfd_post_reset will do nothing if kfd device is not initialized, 6144 * need to bring up kfd here if it's not be initialized before 6145 */ 6146 if (!adev->kfd.init_complete) 6147 amdgpu_amdkfd_device_init(adev); 6148 6149 if (audio_suspended) 6150 amdgpu_device_resume_display_audio(tmp_adev); 6151 6152 amdgpu_device_unset_mp1_state(tmp_adev); 6153 6154 amdgpu_ras_set_error_query_ready(tmp_adev, true); 6155 } 6156 6157 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 6158 reset_list); 6159 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 6160 6161 end_reset: 6162 if (hive) { 6163 mutex_unlock(&hive->hive_lock); 6164 amdgpu_put_xgmi_hive(hive); 6165 } 6166 6167 if (r) 6168 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 6169 6170 atomic_set(&adev->reset_domain->reset_res, r); 6171 6172 if (!r) 6173 drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE); 6174 6175 return r; 6176 } 6177 6178 /** 6179 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner 6180 * 6181 * @adev: amdgpu_device pointer 6182 * @speed: pointer to the speed of the link 6183 * @width: pointer to the width of the link 6184 * 6185 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6186 * first physical partner to an AMD dGPU. 6187 * This will exclude any virtual switches and links. 
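 * When PCIe dynamic switching is supported, the maximum capabilities of * the first non-AMD upstream bridge are reported; otherwise the currently * available bandwidth of the device link is used.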
6188 */ 6189 static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev, 6190 enum pci_bus_speed *speed, 6191 enum pcie_link_width *width) 6192 { 6193 struct pci_dev *parent = adev->pdev; 6194 6195 if (!speed || !width) 6196 return; 6197 6198 *speed = PCI_SPEED_UNKNOWN; 6199 *width = PCIE_LNK_WIDTH_UNKNOWN; 6200 6201 if (amdgpu_device_pcie_dynamic_switching_supported(adev)) { 6202 while ((parent = pci_upstream_bridge(parent))) { 6203 /* skip upstream/downstream switches internal to dGPU */ 6204 if (parent->vendor == PCI_VENDOR_ID_ATI) 6205 continue; 6206 *speed = pcie_get_speed_cap(parent); 6207 *width = pcie_get_width_cap(parent); 6208 break; 6209 } 6210 } else { 6211 /* use the current speeds rather than max if switching is not supported */ 6212 pcie_bandwidth_available(adev->pdev, NULL, speed, width); 6213 } 6214 } 6215 6216 /** 6217 * amdgpu_device_gpu_bandwidth - find the bandwidth of the GPU 6218 * 6219 * @adev: amdgpu_device pointer 6220 * @speed: pointer to the speed of the link 6221 * @width: pointer to the width of the link 6222 * 6223 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6224 * AMD dGPU which may be a virtual upstream bridge. 6225 */ 6226 static void amdgpu_device_gpu_bandwidth(struct amdgpu_device *adev, 6227 enum pci_bus_speed *speed, 6228 enum pcie_link_width *width) 6229 { 6230 struct pci_dev *parent = adev->pdev; 6231 6232 if (!speed || !width) 6233 return; 6234 6235 parent = pci_upstream_bridge(parent); 6236 if (parent && parent->vendor == PCI_VENDOR_ID_ATI) { 6237 /* use the upstream/downstream switches internal to dGPU */ 6238 *speed = pcie_get_speed_cap(parent); 6239 *width = pcie_get_width_cap(parent); 6240 while ((parent = pci_upstream_bridge(parent))) { 6241 if (parent->vendor == PCI_VENDOR_ID_ATI) { 6242 /* use the upstream/downstream switches internal to dGPU */ 6243 *speed = pcie_get_speed_cap(parent); 6244 *width = pcie_get_width_cap(parent); 6245 } 6246 } 6247 } else { 6248 /* use the device itself */ 6249 *speed = pcie_get_speed_cap(adev->pdev); 6250 *width = pcie_get_width_cap(adev->pdev); 6251 } 6252 } 6253 6254 /** 6255 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot 6256 * 6257 * @adev: amdgpu_device pointer 6258 * 6259 * Fetches and stores in the driver the PCIE capabilities (gen speed 6260 * and lanes) of the slot the device is in. Handles APUs and 6261 * virtualized environments where PCIE config space may not be available.
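 * The amdgpu_pcie_gen_cap and amdgpu_pcie_lane_cap module parameters, * when set, override the detected masks. Devices on a root bus (APUs) * that are not passed through simply use the driver defaults.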
6262 */ 6263 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 6264 { 6265 enum pci_bus_speed speed_cap, platform_speed_cap; 6266 enum pcie_link_width platform_link_width, link_width; 6267 6268 if (amdgpu_pcie_gen_cap) 6269 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 6270 6271 if (amdgpu_pcie_lane_cap) 6272 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 6273 6274 /* covers APUs as well */ 6275 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 6276 if (adev->pm.pcie_gen_mask == 0) 6277 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 6278 if (adev->pm.pcie_mlw_mask == 0) 6279 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 6280 return; 6281 } 6282 6283 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 6284 return; 6285 6286 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap, 6287 &platform_link_width); 6288 amdgpu_device_gpu_bandwidth(adev, &speed_cap, &link_width); 6289 6290 if (adev->pm.pcie_gen_mask == 0) { 6291 /* asic caps */ 6292 if (speed_cap == PCI_SPEED_UNKNOWN) { 6293 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6294 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6295 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6296 } else { 6297 if (speed_cap == PCIE_SPEED_32_0GT) 6298 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6299 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6300 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6301 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6302 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 6303 else if (speed_cap == PCIE_SPEED_16_0GT) 6304 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6305 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6306 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6307 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 6308 else if (speed_cap == PCIE_SPEED_8_0GT) 6309 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6310 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6311 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6312 else if (speed_cap == PCIE_SPEED_5_0GT) 6313 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6314 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 6315 else 6316 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 6317 } 6318 /* platform caps */ 6319 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 6320 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6321 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6322 } else { 6323 if (platform_speed_cap == PCIE_SPEED_32_0GT) 6324 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6325 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6326 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6327 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6328 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 6329 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 6330 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6331 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6332 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6333 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 6334 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 6335 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6336 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6337 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 6338 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 6339 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6340 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6341 else 6342 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 6343 6344 } 6345 } 6346 if (adev->pm.pcie_mlw_mask == 0) { 6347 /* asic caps */ 6348 if (link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6349 
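/* link width unknown, fall back to the default ASIC link width mask */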
adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_ASIC_PCIE_MLW_MASK; 6350 } else { 6351 switch (link_width) { 6352 case PCIE_LNK_X32: 6353 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X32 | 6354 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6355 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6356 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6357 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6358 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6359 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6360 break; 6361 case PCIE_LNK_X16: 6362 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6363 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6364 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6365 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6366 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6367 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6368 break; 6369 case PCIE_LNK_X12: 6370 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6371 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6372 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6373 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6374 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6375 break; 6376 case PCIE_LNK_X8: 6377 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6378 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6379 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6380 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6381 break; 6382 case PCIE_LNK_X4: 6383 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6384 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6385 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6386 break; 6387 case PCIE_LNK_X2: 6388 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6389 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6390 break; 6391 case PCIE_LNK_X1: 6392 adev->pm.pcie_mlw_mask |= CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1; 6393 break; 6394 default: 6395 break; 6396 } 6397 } 6398 /* platform caps */ 6399 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6400 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 6401 } else { 6402 switch (platform_link_width) { 6403 case PCIE_LNK_X32: 6404 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 6405 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6406 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6407 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6408 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6409 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6410 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6411 break; 6412 case PCIE_LNK_X16: 6413 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6414 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6415 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6416 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6417 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6418 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6419 break; 6420 case PCIE_LNK_X12: 6421 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6422 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6423 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6424 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6425 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6426 break; 6427 case PCIE_LNK_X8: 6428 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6429 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6430 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6431 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6432 break; 6433 case PCIE_LNK_X4: 6434 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6435 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6436 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6437 break; 6438 case PCIE_LNK_X2: 6439 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6440 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6441 break; 6442 case PCIE_LNK_X1: 6443 adev->pm.pcie_mlw_mask |= CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 6444 break; 6445 
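/* any other reported width leaves the platform mask unchanged */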
default: 6446 break; 6447 } 6448 } 6449 } 6450 } 6451 6452 /** 6453 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 6454 * 6455 * @adev: amdgpu_device pointer 6456 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 6457 * 6458 * Return true if @peer_adev can access (DMA) @adev through the PCIe 6459 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 6460 * @peer_adev. 6461 */ 6462 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 6463 struct amdgpu_device *peer_adev) 6464 { 6465 #ifdef CONFIG_HSA_AMD_P2P 6466 bool p2p_access = 6467 !adev->gmc.xgmi.connected_to_cpu && 6468 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 6469 if (!p2p_access) 6470 dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n", 6471 pci_name(peer_adev->pdev)); 6472 6473 bool is_large_bar = adev->gmc.visible_vram_size && 6474 adev->gmc.real_vram_size == adev->gmc.visible_vram_size; 6475 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev); 6476 6477 if (!p2p_addressable) { 6478 uint64_t address_mask = peer_adev->dev->dma_mask ? 6479 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 6480 resource_size_t aper_limit = 6481 adev->gmc.aper_base + adev->gmc.aper_size - 1; 6482 6483 p2p_addressable = !(adev->gmc.aper_base & address_mask || 6484 aper_limit & address_mask); 6485 } 6486 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable; 6487 #else 6488 return false; 6489 #endif 6490 } 6491 6492 int amdgpu_device_baco_enter(struct drm_device *dev) 6493 { 6494 struct amdgpu_device *adev = drm_to_adev(dev); 6495 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6496 6497 if (!amdgpu_device_supports_baco(dev)) 6498 return -ENOTSUPP; 6499 6500 if (ras && adev->ras_enabled && 6501 adev->nbio.funcs->enable_doorbell_interrupt) 6502 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 6503 6504 return amdgpu_dpm_baco_enter(adev); 6505 } 6506 6507 int amdgpu_device_baco_exit(struct drm_device *dev) 6508 { 6509 struct amdgpu_device *adev = drm_to_adev(dev); 6510 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6511 int ret = 0; 6512 6513 if (!amdgpu_device_supports_baco(dev)) 6514 return -ENOTSUPP; 6515 6516 ret = amdgpu_dpm_baco_exit(adev); 6517 if (ret) 6518 return ret; 6519 6520 if (ras && adev->ras_enabled && 6521 adev->nbio.funcs->enable_doorbell_interrupt) 6522 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 6523 6524 if (amdgpu_passthrough(adev) && adev->nbio.funcs && 6525 adev->nbio.funcs->clear_doorbell_interrupt) 6526 adev->nbio.funcs->clear_doorbell_interrupt(adev); 6527 6528 return 0; 6529 } 6530 6531 /** 6532 * amdgpu_pci_error_detected - Called when a PCI error is detected. 6533 * @pdev: PCI device struct 6534 * @state: PCI channel state 6535 * 6536 * Description: Called when a PCI error is detected. 6537 * 6538 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
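 * A pci_channel_io_normal error is treated as recoverable, a frozen * channel stops the schedulers and requests a slot reset, and a * permanent failure requests that the device be disconnected.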
6539 */ 6540 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 6541 { 6542 struct drm_device *dev = pci_get_drvdata(pdev); 6543 struct amdgpu_device *adev = drm_to_adev(dev); 6544 int i; 6545 6546 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 6547 6548 if (adev->gmc.xgmi.num_physical_nodes > 1) { 6549 DRM_WARN("No support for XGMI hive yet..."); 6550 return PCI_ERS_RESULT_DISCONNECT; 6551 } 6552 6553 adev->pci_channel_state = state; 6554 6555 switch (state) { 6556 case pci_channel_io_normal: 6557 return PCI_ERS_RESULT_CAN_RECOVER; 6558 /* Fatal error, prepare for slot reset */ 6559 case pci_channel_io_frozen: 6560 /* 6561 * Locking adev->reset_domain->sem will prevent any external access 6562 * to GPU during PCI error recovery 6563 */ 6564 amdgpu_device_lock_reset_domain(adev->reset_domain); 6565 amdgpu_device_set_mp1_state(adev); 6566 6567 /* 6568 * Block any work scheduling as we do for regular GPU reset 6569 * for the duration of the recovery 6570 */ 6571 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6572 struct amdgpu_ring *ring = adev->rings[i]; 6573 6574 if (!amdgpu_ring_sched_ready(ring)) 6575 continue; 6576 6577 drm_sched_stop(&ring->sched, NULL); 6578 } 6579 atomic_inc(&adev->gpu_reset_counter); 6580 return PCI_ERS_RESULT_NEED_RESET; 6581 case pci_channel_io_perm_failure: 6582 /* Permanent error, prepare for device removal */ 6583 return PCI_ERS_RESULT_DISCONNECT; 6584 } 6585 6586 return PCI_ERS_RESULT_NEED_RESET; 6587 } 6588 6589 /** 6590 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 6591 * @pdev: pointer to PCI device 6592 */ 6593 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 6594 { 6595 6596 DRM_INFO("PCI error: mmio enabled callback!!\n"); 6597 6598 /* TODO - dump whatever for debugging purposes */ 6599 6600 /* This called only if amdgpu_pci_error_detected returns 6601 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 6602 * works, no need to reset slot. 6603 */ 6604 6605 return PCI_ERS_RESULT_RECOVERED; 6606 } 6607 6608 /** 6609 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 6610 * @pdev: PCI device struct 6611 * 6612 * Description: This routine is called by the pci error recovery 6613 * code after the PCI slot has been reset, just before we 6614 * should resume normal operations. 
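 * The saved PCI config space is restored, the ASIC is polled until its * memory size register reads back sanely, and the device is then * re-initialized through amdgpu_do_asic_reset() with AMDGPU_SKIP_HW_RESET * set, since the slot reset itself has already been performed.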
6615 */ 6616 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 6617 { 6618 struct drm_device *dev = pci_get_drvdata(pdev); 6619 struct amdgpu_device *adev = drm_to_adev(dev); 6620 int r, i; 6621 struct amdgpu_reset_context reset_context; 6622 u32 memsize; 6623 struct list_head device_list; 6624 6625 /* PCI error slot reset should be skipped During RAS recovery */ 6626 if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 6627 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && 6628 amdgpu_ras_in_recovery(adev)) 6629 return PCI_ERS_RESULT_RECOVERED; 6630 6631 DRM_INFO("PCI error: slot reset callback!!\n"); 6632 6633 memset(&reset_context, 0, sizeof(reset_context)); 6634 6635 INIT_LIST_HEAD(&device_list); 6636 list_add_tail(&adev->reset_list, &device_list); 6637 6638 /* wait for asic to come out of reset */ 6639 msleep(500); 6640 6641 /* Restore PCI confspace */ 6642 amdgpu_device_load_pci_state(pdev); 6643 6644 /* confirm ASIC came out of reset */ 6645 for (i = 0; i < adev->usec_timeout; i++) { 6646 memsize = amdgpu_asic_get_config_memsize(adev); 6647 6648 if (memsize != 0xffffffff) 6649 break; 6650 udelay(1); 6651 } 6652 if (memsize == 0xffffffff) { 6653 r = -ETIME; 6654 goto out; 6655 } 6656 6657 reset_context.method = AMD_RESET_METHOD_NONE; 6658 reset_context.reset_req_dev = adev; 6659 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 6660 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 6661 6662 adev->no_hw_access = true; 6663 r = amdgpu_device_pre_asic_reset(adev, &reset_context); 6664 adev->no_hw_access = false; 6665 if (r) 6666 goto out; 6667 6668 r = amdgpu_do_asic_reset(&device_list, &reset_context); 6669 6670 out: 6671 if (!r) { 6672 if (amdgpu_device_cache_pci_state(adev->pdev)) 6673 pci_restore_state(adev->pdev); 6674 6675 DRM_INFO("PCIe error recovery succeeded\n"); 6676 } else { 6677 DRM_ERROR("PCIe error recovery failed, err:%d", r); 6678 amdgpu_device_unset_mp1_state(adev); 6679 amdgpu_device_unlock_reset_domain(adev->reset_domain); 6680 } 6681 6682 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 6683 } 6684 6685 /** 6686 * amdgpu_pci_resume() - resume normal ops after PCI reset 6687 * @pdev: pointer to PCI device 6688 * 6689 * Called when the error recovery driver tells us that its 6690 * OK to resume normal operation. 
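 * Restarts the GPU schedulers and releases the reset domain lock, * mirroring what amdgpu_pci_error_detected() did for the * pci_channel_io_frozen case; for other channel states there is * nothing to undo.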
6691 */ 6692 void amdgpu_pci_resume(struct pci_dev *pdev) 6693 { 6694 struct drm_device *dev = pci_get_drvdata(pdev); 6695 struct amdgpu_device *adev = drm_to_adev(dev); 6696 int i; 6697 6698 6699 DRM_INFO("PCI error: resume callback!!\n"); 6700 6701 /* Only continue execution for the case of pci_channel_io_frozen */ 6702 if (adev->pci_channel_state != pci_channel_io_frozen) 6703 return; 6704 6705 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6706 struct amdgpu_ring *ring = adev->rings[i]; 6707 6708 if (!amdgpu_ring_sched_ready(ring)) 6709 continue; 6710 6711 drm_sched_start(&ring->sched, 0); 6712 } 6713 6714 amdgpu_device_unset_mp1_state(adev); 6715 amdgpu_device_unlock_reset_domain(adev->reset_domain); 6716 } 6717 6718 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 6719 { 6720 struct drm_device *dev = pci_get_drvdata(pdev); 6721 struct amdgpu_device *adev = drm_to_adev(dev); 6722 int r; 6723 6724 if (amdgpu_sriov_vf(adev)) 6725 return false; 6726 6727 r = pci_save_state(pdev); 6728 if (!r) { 6729 kfree(adev->pci_state); 6730 6731 adev->pci_state = pci_store_saved_state(pdev); 6732 6733 if (!adev->pci_state) { 6734 DRM_ERROR("Failed to store PCI saved state"); 6735 return false; 6736 } 6737 } else { 6738 DRM_WARN("Failed to save PCI state, err:%d\n", r); 6739 return false; 6740 } 6741 6742 return true; 6743 } 6744 6745 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 6746 { 6747 struct drm_device *dev = pci_get_drvdata(pdev); 6748 struct amdgpu_device *adev = drm_to_adev(dev); 6749 int r; 6750 6751 if (!adev->pci_state) 6752 return false; 6753 6754 r = pci_load_saved_state(pdev, adev->pci_state); 6755 6756 if (!r) { 6757 pci_restore_state(pdev); 6758 } else { 6759 DRM_WARN("Failed to load PCI state, err:%d\n", r); 6760 return false; 6761 } 6762 6763 return true; 6764 } 6765 6766 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 6767 struct amdgpu_ring *ring) 6768 { 6769 #ifdef CONFIG_X86_64 6770 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 6771 return; 6772 #endif 6773 if (adev->gmc.xgmi.connected_to_cpu) 6774 return; 6775 6776 if (ring && ring->funcs->emit_hdp_flush) 6777 amdgpu_ring_emit_hdp_flush(ring); 6778 else 6779 amdgpu_asic_flush_hdp(adev, ring); 6780 } 6781 6782 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 6783 struct amdgpu_ring *ring) 6784 { 6785 #ifdef CONFIG_X86_64 6786 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 6787 return; 6788 #endif 6789 if (adev->gmc.xgmi.connected_to_cpu) 6790 return; 6791 6792 amdgpu_asic_invalidate_hdp(adev, ring); 6793 } 6794 6795 int amdgpu_in_reset(struct amdgpu_device *adev) 6796 { 6797 return atomic_read(&adev->reset_domain->in_gpu_reset); 6798 } 6799 6800 /** 6801 * amdgpu_device_halt() - bring hardware to some kind of halt state 6802 * 6803 * @adev: amdgpu_device pointer 6804 * 6805 * Bring hardware to some kind of halt state so that no one can touch it 6806 * any more. It will help to maintain error context when error occurred. 6807 * Compare to a simple hang, the system will keep stable at least for SSH 6808 * access. Then it should be trivial to inspect the hardware state and 6809 * see what's going on. Implemented as following: 6810 * 6811 * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc), 6812 * clears all CPU mappings to device, disallows remappings through page faults 6813 * 2. amdgpu_irq_disable_all() disables all interrupts 6814 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences 6815 * 4. 
set adev->no_hw_access to avoid potential crashes after setp 5 6816 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings 6817 * 6. pci_disable_device() and pci_wait_for_pending_transaction() 6818 * flush any in flight DMA operations 6819 */ 6820 void amdgpu_device_halt(struct amdgpu_device *adev) 6821 { 6822 struct pci_dev *pdev = adev->pdev; 6823 struct drm_device *ddev = adev_to_drm(adev); 6824 6825 amdgpu_xcp_dev_unplug(adev); 6826 drm_dev_unplug(ddev); 6827 6828 amdgpu_irq_disable_all(adev); 6829 6830 amdgpu_fence_driver_hw_fini(adev); 6831 6832 adev->no_hw_access = true; 6833 6834 amdgpu_device_unmap_mmio(adev); 6835 6836 pci_disable_device(pdev); 6837 pci_wait_for_pending_transaction(pdev); 6838 } 6839 6840 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev, 6841 u32 reg) 6842 { 6843 unsigned long flags, address, data; 6844 u32 r; 6845 6846 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 6847 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 6848 6849 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 6850 WREG32(address, reg * 4); 6851 (void)RREG32(address); 6852 r = RREG32(data); 6853 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 6854 return r; 6855 } 6856 6857 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev, 6858 u32 reg, u32 v) 6859 { 6860 unsigned long flags, address, data; 6861 6862 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 6863 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 6864 6865 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 6866 WREG32(address, reg * 4); 6867 (void)RREG32(address); 6868 WREG32(data, v); 6869 (void)RREG32(data); 6870 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 6871 } 6872 6873 /** 6874 * amdgpu_device_get_gang - return a reference to the current gang 6875 * @adev: amdgpu_device pointer 6876 * 6877 * Returns: A new reference to the current gang leader. 6878 */ 6879 struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev) 6880 { 6881 struct dma_fence *fence; 6882 6883 rcu_read_lock(); 6884 fence = dma_fence_get_rcu_safe(&adev->gang_submit); 6885 rcu_read_unlock(); 6886 return fence; 6887 } 6888 6889 /** 6890 * amdgpu_device_switch_gang - switch to a new gang 6891 * @adev: amdgpu_device pointer 6892 * @gang: the gang to switch to 6893 * 6894 * Try to switch to a new gang. 6895 * Returns: NULL if we switched to the new gang or a reference to the current 6896 * gang leader. 
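 * The switch is only attempted once the fence of the current gang leader * has signaled; until then a reference to the still running gang leader * is returned instead.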
6897 */ 6898 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev, 6899 struct dma_fence *gang) 6900 { 6901 struct dma_fence *old = NULL; 6902 6903 do { 6904 dma_fence_put(old); 6905 old = amdgpu_device_get_gang(adev); 6906 if (old == gang) 6907 break; 6908 6909 if (!dma_fence_is_signaled(old)) 6910 return old; 6911 6912 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit, 6913 old, gang) != old); 6914 6915 dma_fence_put(old); 6916 return NULL; 6917 } 6918 6919 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev) 6920 { 6921 switch (adev->asic_type) { 6922 #ifdef CONFIG_DRM_AMDGPU_SI 6923 case CHIP_HAINAN: 6924 #endif 6925 case CHIP_TOPAZ: 6926 /* chips with no display hardware */ 6927 return false; 6928 #ifdef CONFIG_DRM_AMDGPU_SI 6929 case CHIP_TAHITI: 6930 case CHIP_PITCAIRN: 6931 case CHIP_VERDE: 6932 case CHIP_OLAND: 6933 #endif 6934 #ifdef CONFIG_DRM_AMDGPU_CIK 6935 case CHIP_BONAIRE: 6936 case CHIP_HAWAII: 6937 case CHIP_KAVERI: 6938 case CHIP_KABINI: 6939 case CHIP_MULLINS: 6940 #endif 6941 case CHIP_TONGA: 6942 case CHIP_FIJI: 6943 case CHIP_POLARIS10: 6944 case CHIP_POLARIS11: 6945 case CHIP_POLARIS12: 6946 case CHIP_VEGAM: 6947 case CHIP_CARRIZO: 6948 case CHIP_STONEY: 6949 /* chips with display hardware */ 6950 return true; 6951 default: 6952 /* IP discovery */ 6953 if (!amdgpu_ip_version(adev, DCE_HWIP, 0) || 6954 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 6955 return false; 6956 return true; 6957 } 6958 } 6959 6960 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev, 6961 uint32_t inst, uint32_t reg_addr, char reg_name[], 6962 uint32_t expected_value, uint32_t mask) 6963 { 6964 uint32_t ret = 0; 6965 uint32_t old_ = 0; 6966 uint32_t tmp_ = RREG32(reg_addr); 6967 uint32_t loop = adev->usec_timeout; 6968 6969 while ((tmp_ & (mask)) != (expected_value)) { 6970 if (old_ != tmp_) { 6971 loop = adev->usec_timeout; 6972 old_ = tmp_; 6973 } else 6974 udelay(1); 6975 tmp_ = RREG32(reg_addr); 6976 loop--; 6977 if (!loop) { 6978 DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n", 6979 inst, reg_name, (uint32_t)expected_value, 6980 (uint32_t)(tmp_ & (mask))); 6981 ret = -ETIMEDOUT; 6982 break; 6983 } 6984 } 6985 return ret; 6986 } 6987 6988 ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring) 6989 { 6990 ssize_t size = 0; 6991 6992 if (!ring || !ring->adev) 6993 return size; 6994 6995 if (amdgpu_device_should_recover_gpu(ring->adev)) 6996 size |= AMDGPU_RESET_TYPE_FULL; 6997 6998 if (unlikely(!ring->adev->debug_disable_soft_recovery) && 6999 !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery) 7000 size |= AMDGPU_RESET_TYPE_SOFT_RESET; 7001 7002 return size; 7003 } 7004 7005 ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset) 7006 { 7007 ssize_t size = 0; 7008 7009 if (supported_reset == 0) { 7010 size += sysfs_emit_at(buf, size, "unsupported"); 7011 size += sysfs_emit_at(buf, size, "\n"); 7012 return size; 7013 7014 } 7015 7016 if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET) 7017 size += sysfs_emit_at(buf, size, "soft "); 7018 7019 if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE) 7020 size += sysfs_emit_at(buf, size, "queue "); 7021 7022 if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE) 7023 size += sysfs_emit_at(buf, size, "pipe "); 7024 7025 if (supported_reset & AMDGPU_RESET_TYPE_FULL) 7026 size += sysfs_emit_at(buf, size, "full "); 7027 7028 size += sysfs_emit_at(buf, size, "\n"); 7029 return size; 7030 } 7031