1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 29 #include <linux/aperture.h> 30 #include <linux/power_supply.h> 31 #include <linux/kthread.h> 32 #include <linux/module.h> 33 #include <linux/console.h> 34 #include <linux/slab.h> 35 #include <linux/iommu.h> 36 #include <linux/pci.h> 37 #include <linux/pci-p2pdma.h> 38 #include <linux/apple-gmux.h> 39 40 #include <drm/drm_atomic_helper.h> 41 #include <drm/drm_client_event.h> 42 #include <drm/drm_crtc_helper.h> 43 #include <drm/drm_probe_helper.h> 44 #include <drm/amdgpu_drm.h> 45 #include <linux/device.h> 46 #include <linux/vgaarb.h> 47 #include <linux/vga_switcheroo.h> 48 #include <linux/efi.h> 49 #include "amdgpu.h" 50 #include "amdgpu_trace.h" 51 #include "amdgpu_i2c.h" 52 #include "atom.h" 53 #include "amdgpu_atombios.h" 54 #include "amdgpu_atomfirmware.h" 55 #include "amd_pcie.h" 56 #ifdef CONFIG_DRM_AMDGPU_SI 57 #include "si.h" 58 #endif 59 #ifdef CONFIG_DRM_AMDGPU_CIK 60 #include "cik.h" 61 #endif 62 #include "vi.h" 63 #include "soc15.h" 64 #include "nv.h" 65 #include "bif/bif_4_1_d.h" 66 #include <linux/firmware.h> 67 #include "amdgpu_vf_error.h" 68 69 #include "amdgpu_amdkfd.h" 70 #include "amdgpu_pm.h" 71 72 #include "amdgpu_xgmi.h" 73 #include "amdgpu_ras.h" 74 #include "amdgpu_pmu.h" 75 #include "amdgpu_fru_eeprom.h" 76 #include "amdgpu_reset.h" 77 #include "amdgpu_virt.h" 78 #include "amdgpu_dev_coredump.h" 79 80 #include <linux/suspend.h> 81 #include <drm/task_barrier.h> 82 #include <linux/pm_runtime.h> 83 84 #include <drm/drm_drv.h> 85 86 #if IS_ENABLED(CONFIG_X86) 87 #include <asm/intel-family.h> 88 #endif 89 90 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 91 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 92 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 93 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 94 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 95 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 96 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 97 98 #define AMDGPU_RESUME_MS 2000 99 #define AMDGPU_MAX_RETRY_LIMIT 2 100 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 101 #define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2) 102 #define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2) 103 #define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2) 104 105 static const 
struct drm_driver amdgpu_kms_driver; 106 107 const char *amdgpu_asic_name[] = { 108 "TAHITI", 109 "PITCAIRN", 110 "VERDE", 111 "OLAND", 112 "HAINAN", 113 "BONAIRE", 114 "KAVERI", 115 "KABINI", 116 "HAWAII", 117 "MULLINS", 118 "TOPAZ", 119 "TONGA", 120 "FIJI", 121 "CARRIZO", 122 "STONEY", 123 "POLARIS10", 124 "POLARIS11", 125 "POLARIS12", 126 "VEGAM", 127 "VEGA10", 128 "VEGA12", 129 "VEGA20", 130 "RAVEN", 131 "ARCTURUS", 132 "RENOIR", 133 "ALDEBARAN", 134 "NAVI10", 135 "CYAN_SKILLFISH", 136 "NAVI14", 137 "NAVI12", 138 "SIENNA_CICHLID", 139 "NAVY_FLOUNDER", 140 "VANGOGH", 141 "DIMGREY_CAVEFISH", 142 "BEIGE_GOBY", 143 "YELLOW_CARP", 144 "IP DISCOVERY", 145 "LAST", 146 }; 147 148 #define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0) 149 /* 150 * Default init level where all blocks are expected to be initialized. This is 151 * the level of initialization expected by default and also after a full reset 152 * of the device. 153 */ 154 struct amdgpu_init_level amdgpu_init_default = { 155 .level = AMDGPU_INIT_LEVEL_DEFAULT, 156 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL, 157 }; 158 159 struct amdgpu_init_level amdgpu_init_recovery = { 160 .level = AMDGPU_INIT_LEVEL_RESET_RECOVERY, 161 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL, 162 }; 163 164 /* 165 * Minimal blocks needed to be initialized before a XGMI hive can be reset. This 166 * is used for cases like reset on initialization where the entire hive needs to 167 * be reset before first use. 168 */ 169 struct amdgpu_init_level amdgpu_init_minimal_xgmi = { 170 .level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI, 171 .hwini_ip_block_mask = 172 BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) | 173 BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) | 174 BIT(AMD_IP_BLOCK_TYPE_PSP) 175 }; 176 177 static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev, 178 enum amd_ip_block_type block) 179 { 180 return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0; 181 } 182 183 void amdgpu_set_init_level(struct amdgpu_device *adev, 184 enum amdgpu_init_lvl_id lvl) 185 { 186 switch (lvl) { 187 case AMDGPU_INIT_LEVEL_MINIMAL_XGMI: 188 adev->init_lvl = &amdgpu_init_minimal_xgmi; 189 break; 190 case AMDGPU_INIT_LEVEL_RESET_RECOVERY: 191 adev->init_lvl = &amdgpu_init_recovery; 192 break; 193 case AMDGPU_INIT_LEVEL_DEFAULT: 194 fallthrough; 195 default: 196 adev->init_lvl = &amdgpu_init_default; 197 break; 198 } 199 } 200 201 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev); 202 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 203 void *data); 204 205 /** 206 * DOC: pcie_replay_count 207 * 208 * The amdgpu driver provides a sysfs API for reporting the total number 209 * of PCIe replays (NAKs). 210 * The file pcie_replay_count is used for this and returns the total 211 * number of replays as a sum of the NAKs generated and NAKs received. 
212 */ 213 214 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 215 struct device_attribute *attr, char *buf) 216 { 217 struct drm_device *ddev = dev_get_drvdata(dev); 218 struct amdgpu_device *adev = drm_to_adev(ddev); 219 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 220 221 return sysfs_emit(buf, "%llu\n", cnt); 222 } 223 224 static DEVICE_ATTR(pcie_replay_count, 0444, 225 amdgpu_device_get_pcie_replay_count, NULL); 226 227 static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj, 228 struct bin_attribute *attr, char *buf, 229 loff_t ppos, size_t count) 230 { 231 struct device *dev = kobj_to_dev(kobj); 232 struct drm_device *ddev = dev_get_drvdata(dev); 233 struct amdgpu_device *adev = drm_to_adev(ddev); 234 ssize_t bytes_read; 235 236 switch (ppos) { 237 case AMDGPU_SYS_REG_STATE_XGMI: 238 bytes_read = amdgpu_asic_get_reg_state( 239 adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count); 240 break; 241 case AMDGPU_SYS_REG_STATE_WAFL: 242 bytes_read = amdgpu_asic_get_reg_state( 243 adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count); 244 break; 245 case AMDGPU_SYS_REG_STATE_PCIE: 246 bytes_read = amdgpu_asic_get_reg_state( 247 adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count); 248 break; 249 case AMDGPU_SYS_REG_STATE_USR: 250 bytes_read = amdgpu_asic_get_reg_state( 251 adev, AMDGPU_REG_STATE_TYPE_USR, buf, count); 252 break; 253 case AMDGPU_SYS_REG_STATE_USR_1: 254 bytes_read = amdgpu_asic_get_reg_state( 255 adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count); 256 break; 257 default: 258 return -EINVAL; 259 } 260 261 return bytes_read; 262 } 263 264 BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL, 265 AMDGPU_SYS_REG_STATE_END); 266 267 int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev) 268 { 269 int ret; 270 271 if (!amdgpu_asic_get_reg_state_supported(adev)) 272 return 0; 273 274 ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 275 276 return ret; 277 } 278 279 void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev) 280 { 281 if (!amdgpu_asic_get_reg_state_supported(adev)) 282 return; 283 sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 284 } 285 286 int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block) 287 { 288 int r; 289 290 if (ip_block->version->funcs->suspend) { 291 r = ip_block->version->funcs->suspend(ip_block); 292 if (r) { 293 dev_err(ip_block->adev->dev, 294 "suspend of IP block <%s> failed %d\n", 295 ip_block->version->funcs->name, r); 296 return r; 297 } 298 } 299 300 ip_block->status.hw = false; 301 return 0; 302 } 303 304 int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block) 305 { 306 int r; 307 308 if (ip_block->version->funcs->resume) { 309 r = ip_block->version->funcs->resume(ip_block); 310 if (r) { 311 dev_err(ip_block->adev->dev, 312 "resume of IP block <%s> failed %d\n", 313 ip_block->version->funcs->name, r); 314 return r; 315 } 316 } 317 318 ip_block->status.hw = true; 319 return 0; 320 } 321 322 /** 323 * DOC: board_info 324 * 325 * The amdgpu driver provides a sysfs API for giving board related information. 
326 * It provides the form factor information in the format 327 * 328 * type : form factor 329 * 330 * Possible form factor values 331 * 332 * - "cem" - PCIE CEM card 333 * - "oam" - Open Compute Accelerator Module 334 * - "unknown" - Not known 335 * 336 */ 337 338 static ssize_t amdgpu_device_get_board_info(struct device *dev, 339 struct device_attribute *attr, 340 char *buf) 341 { 342 struct drm_device *ddev = dev_get_drvdata(dev); 343 struct amdgpu_device *adev = drm_to_adev(ddev); 344 enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM; 345 const char *pkg; 346 347 if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type) 348 pkg_type = adev->smuio.funcs->get_pkg_type(adev); 349 350 switch (pkg_type) { 351 case AMDGPU_PKG_TYPE_CEM: 352 pkg = "cem"; 353 break; 354 case AMDGPU_PKG_TYPE_OAM: 355 pkg = "oam"; 356 break; 357 default: 358 pkg = "unknown"; 359 break; 360 } 361 362 return sysfs_emit(buf, "%s : %s\n", "type", pkg); 363 } 364 365 static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL); 366 367 static struct attribute *amdgpu_board_attrs[] = { 368 &dev_attr_board_info.attr, 369 NULL, 370 }; 371 372 static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj, 373 struct attribute *attr, int n) 374 { 375 struct device *dev = kobj_to_dev(kobj); 376 struct drm_device *ddev = dev_get_drvdata(dev); 377 struct amdgpu_device *adev = drm_to_adev(ddev); 378 379 if (adev->flags & AMD_IS_APU) 380 return 0; 381 382 return attr->mode; 383 } 384 385 static const struct attribute_group amdgpu_board_attrs_group = { 386 .attrs = amdgpu_board_attrs, 387 .is_visible = amdgpu_board_attrs_is_visible 388 }; 389 390 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 391 392 393 /** 394 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 395 * 396 * @dev: drm_device pointer 397 * 398 * Returns true if the device is a dGPU with ATPX power control, 399 * otherwise return false. 400 */ 401 bool amdgpu_device_supports_px(struct drm_device *dev) 402 { 403 struct amdgpu_device *adev = drm_to_adev(dev); 404 405 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 406 return true; 407 return false; 408 } 409 410 /** 411 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 412 * 413 * @dev: drm_device pointer 414 * 415 * Returns true if the device is a dGPU with ACPI power control, 416 * otherwise return false. 417 */ 418 bool amdgpu_device_supports_boco(struct drm_device *dev) 419 { 420 struct amdgpu_device *adev = drm_to_adev(dev); 421 422 if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE)) 423 return false; 424 425 if (adev->has_pr3 || 426 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 427 return true; 428 return false; 429 } 430 431 /** 432 * amdgpu_device_supports_baco - Does the device support BACO 433 * 434 * @dev: drm_device pointer 435 * 436 * Return: 437 * 1 if the device supports BACO; 438 * 3 if the device supports MACO (only works if BACO is supported) 439 * otherwise return 0. 
 */
int amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
{
	struct drm_device *dev;
	int bamaco_support;

	dev = adev_to_drm(adev);

	adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
	bamaco_support = amdgpu_device_supports_baco(dev);

	switch (amdgpu_runtime_pm) {
	case 2:
		if (bamaco_support & MACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
			dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
		} else if (bamaco_support == BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Requested mode BAMACO not available, falling back to BACO\n");
		}
		break;
	case 1:
		if (bamaco_support & BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Forcing BACO for runtime pm\n");
		}
		break;
	case -1:
	case -2:
		if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
			dev_info(adev->dev, "Using ATPX for runtime pm\n");
		} else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
			dev_info(adev->dev, "Using BOCO for runtime pm\n");
		} else {
			if (!bamaco_support)
				goto no_runtime_pm;

			switch (adev->asic_type) {
			case CHIP_VEGA20:
			case CHIP_ARCTURUS:
				/* BACO is not supported on vega20 and arcturus */
				break;
			case CHIP_VEGA10:
				/* enable BACO as runpm mode if noretry=0 */
				if (!adev->gmc.noretry)
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			default:
				/* enable BACO as runpm mode on CI+ */
				adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			}

			if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
				if (bamaco_support & MACO_SUPPORT) {
					adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
					dev_info(adev->dev, "Using BAMACO for runtime pm\n");
				} else {
					dev_info(adev->dev, "Using BACO for runtime pm\n");
				}
			}
		}
		break;
	case 0:
		dev_info(adev->dev, "runtime pm is manually disabled\n");
		break;
	default:
		break;
	}

no_runtime_pm:
	if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
		dev_info(adev->dev, "Runtime PM not available\n");
}

/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value means how many bytes have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure the HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over the PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure the HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the VRAM aperture to access VRAM first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of VRAM */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
				uint32_t reg, uint32_t acc_flags,
				uint32_t xcc_id)
{
	uint32_t ret, rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	return ret;
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
807 */ 808 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) 809 { 810 if (amdgpu_device_skip_hw_access(adev)) 811 return; 812 813 if (offset < adev->rmmio_size) 814 writeb(value, adev->rmmio + offset); 815 else 816 BUG(); 817 } 818 819 /** 820 * amdgpu_device_wreg - write to a memory mapped IO or indirect register 821 * 822 * @adev: amdgpu_device pointer 823 * @reg: dword aligned register offset 824 * @v: 32 bit value to write to the register 825 * @acc_flags: access flags which require special behavior 826 * 827 * Writes the value specified to the offset specified. 828 */ 829 void amdgpu_device_wreg(struct amdgpu_device *adev, 830 uint32_t reg, uint32_t v, 831 uint32_t acc_flags) 832 { 833 if (amdgpu_device_skip_hw_access(adev)) 834 return; 835 836 if ((reg * 4) < adev->rmmio_size) { 837 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 838 amdgpu_sriov_runtime(adev) && 839 down_read_trylock(&adev->reset_domain->sem)) { 840 amdgpu_kiq_wreg(adev, reg, v, 0); 841 up_read(&adev->reset_domain->sem); 842 } else { 843 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 844 } 845 } else { 846 adev->pcie_wreg(adev, reg * 4, v); 847 } 848 849 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 850 } 851 852 /** 853 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 854 * 855 * @adev: amdgpu_device pointer 856 * @reg: mmio/rlc register 857 * @v: value to write 858 * @xcc_id: xcc accelerated compute core id 859 * 860 * this function is invoked only for the debugfs register access 861 */ 862 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 863 uint32_t reg, uint32_t v, 864 uint32_t xcc_id) 865 { 866 if (amdgpu_device_skip_hw_access(adev)) 867 return; 868 869 if (amdgpu_sriov_fullaccess(adev) && 870 adev->gfx.rlc.funcs && 871 adev->gfx.rlc.funcs->is_rlcg_access_range) { 872 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 873 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id); 874 } else if ((reg * 4) >= adev->rmmio_size) { 875 adev->pcie_wreg(adev, reg * 4, v); 876 } else { 877 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 878 } 879 } 880 881 /** 882 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC 883 * 884 * @adev: amdgpu_device pointer 885 * @reg: dword aligned register offset 886 * @v: 32 bit value to write to the register 887 * @acc_flags: access flags which require special behavior 888 * @xcc_id: xcc accelerated compute core id 889 * 890 * Writes the value specified to the offset specified. 
891 */ 892 void amdgpu_device_xcc_wreg(struct amdgpu_device *adev, 893 uint32_t reg, uint32_t v, 894 uint32_t acc_flags, uint32_t xcc_id) 895 { 896 uint32_t rlcg_flag; 897 898 if (amdgpu_device_skip_hw_access(adev)) 899 return; 900 901 if ((reg * 4) < adev->rmmio_size) { 902 if (amdgpu_sriov_vf(adev) && 903 !amdgpu_sriov_runtime(adev) && 904 adev->gfx.rlc.rlcg_reg_access_supported && 905 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, 906 GC_HWIP, true, 907 &rlcg_flag)) { 908 amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id)); 909 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 910 amdgpu_sriov_runtime(adev) && 911 down_read_trylock(&adev->reset_domain->sem)) { 912 amdgpu_kiq_wreg(adev, reg, v, xcc_id); 913 up_read(&adev->reset_domain->sem); 914 } else { 915 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 916 } 917 } else { 918 adev->pcie_wreg(adev, reg * 4, v); 919 } 920 } 921 922 /** 923 * amdgpu_device_indirect_rreg - read an indirect register 924 * 925 * @adev: amdgpu_device pointer 926 * @reg_addr: indirect register address to read from 927 * 928 * Returns the value of indirect register @reg_addr 929 */ 930 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 931 u32 reg_addr) 932 { 933 unsigned long flags, pcie_index, pcie_data; 934 void __iomem *pcie_index_offset; 935 void __iomem *pcie_data_offset; 936 u32 r; 937 938 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 939 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 940 941 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 942 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 943 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 944 945 writel(reg_addr, pcie_index_offset); 946 readl(pcie_index_offset); 947 r = readl(pcie_data_offset); 948 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 949 950 return r; 951 } 952 953 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev, 954 u64 reg_addr) 955 { 956 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 957 u32 r; 958 void __iomem *pcie_index_offset; 959 void __iomem *pcie_index_hi_offset; 960 void __iomem *pcie_data_offset; 961 962 if (unlikely(!adev->nbio.funcs)) { 963 pcie_index = AMDGPU_PCIE_INDEX_FALLBACK; 964 pcie_data = AMDGPU_PCIE_DATA_FALLBACK; 965 } else { 966 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 967 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 968 } 969 970 if (reg_addr >> 32) { 971 if (unlikely(!adev->nbio.funcs)) 972 pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK; 973 else 974 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 975 } else { 976 pcie_index_hi = 0; 977 } 978 979 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 980 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 981 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 982 if (pcie_index_hi != 0) 983 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 984 pcie_index_hi * 4; 985 986 writel(reg_addr, pcie_index_offset); 987 readl(pcie_index_offset); 988 if (pcie_index_hi != 0) { 989 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 990 readl(pcie_index_hi_offset); 991 } 992 r = readl(pcie_data_offset); 993 994 /* clear the high bits */ 995 if (pcie_index_hi != 0) { 996 writel(0, pcie_index_hi_offset); 997 readl(pcie_index_hi_offset); 998 } 999 1000 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1001 1002 return r; 1003 } 1004 1005 /** 1006 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 1007 * 1008 * 
@adev: amdgpu_device pointer 1009 * @reg_addr: indirect register address to read from 1010 * 1011 * Returns the value of indirect register @reg_addr 1012 */ 1013 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 1014 u32 reg_addr) 1015 { 1016 unsigned long flags, pcie_index, pcie_data; 1017 void __iomem *pcie_index_offset; 1018 void __iomem *pcie_data_offset; 1019 u64 r; 1020 1021 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1022 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1023 1024 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1025 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1026 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1027 1028 /* read low 32 bits */ 1029 writel(reg_addr, pcie_index_offset); 1030 readl(pcie_index_offset); 1031 r = readl(pcie_data_offset); 1032 /* read high 32 bits */ 1033 writel(reg_addr + 4, pcie_index_offset); 1034 readl(pcie_index_offset); 1035 r |= ((u64)readl(pcie_data_offset) << 32); 1036 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1037 1038 return r; 1039 } 1040 1041 u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev, 1042 u64 reg_addr) 1043 { 1044 unsigned long flags, pcie_index, pcie_data; 1045 unsigned long pcie_index_hi = 0; 1046 void __iomem *pcie_index_offset; 1047 void __iomem *pcie_index_hi_offset; 1048 void __iomem *pcie_data_offset; 1049 u64 r; 1050 1051 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1052 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1053 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1054 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1055 1056 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1057 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1058 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1059 if (pcie_index_hi != 0) 1060 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1061 pcie_index_hi * 4; 1062 1063 /* read low 32 bits */ 1064 writel(reg_addr, pcie_index_offset); 1065 readl(pcie_index_offset); 1066 if (pcie_index_hi != 0) { 1067 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1068 readl(pcie_index_hi_offset); 1069 } 1070 r = readl(pcie_data_offset); 1071 /* read high 32 bits */ 1072 writel(reg_addr + 4, pcie_index_offset); 1073 readl(pcie_index_offset); 1074 if (pcie_index_hi != 0) { 1075 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1076 readl(pcie_index_hi_offset); 1077 } 1078 r |= ((u64)readl(pcie_data_offset) << 32); 1079 1080 /* clear the high bits */ 1081 if (pcie_index_hi != 0) { 1082 writel(0, pcie_index_hi_offset); 1083 readl(pcie_index_hi_offset); 1084 } 1085 1086 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1087 1088 return r; 1089 } 1090 1091 /** 1092 * amdgpu_device_indirect_wreg - write an indirect register address 1093 * 1094 * @adev: amdgpu_device pointer 1095 * @reg_addr: indirect register offset 1096 * @reg_data: indirect register data 1097 * 1098 */ 1099 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 1100 u32 reg_addr, u32 reg_data) 1101 { 1102 unsigned long flags, pcie_index, pcie_data; 1103 void __iomem *pcie_index_offset; 1104 void __iomem *pcie_data_offset; 1105 1106 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1107 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1108 1109 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1110 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1111 pcie_data_offset = (void __iomem 
*)adev->rmmio + pcie_data * 4; 1112 1113 writel(reg_addr, pcie_index_offset); 1114 readl(pcie_index_offset); 1115 writel(reg_data, pcie_data_offset); 1116 readl(pcie_data_offset); 1117 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1118 } 1119 1120 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev, 1121 u64 reg_addr, u32 reg_data) 1122 { 1123 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 1124 void __iomem *pcie_index_offset; 1125 void __iomem *pcie_index_hi_offset; 1126 void __iomem *pcie_data_offset; 1127 1128 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1129 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1130 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1131 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1132 else 1133 pcie_index_hi = 0; 1134 1135 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1136 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1137 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1138 if (pcie_index_hi != 0) 1139 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1140 pcie_index_hi * 4; 1141 1142 writel(reg_addr, pcie_index_offset); 1143 readl(pcie_index_offset); 1144 if (pcie_index_hi != 0) { 1145 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1146 readl(pcie_index_hi_offset); 1147 } 1148 writel(reg_data, pcie_data_offset); 1149 readl(pcie_data_offset); 1150 1151 /* clear the high bits */ 1152 if (pcie_index_hi != 0) { 1153 writel(0, pcie_index_hi_offset); 1154 readl(pcie_index_hi_offset); 1155 } 1156 1157 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1158 } 1159 1160 /** 1161 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 1162 * 1163 * @adev: amdgpu_device pointer 1164 * @reg_addr: indirect register offset 1165 * @reg_data: indirect register data 1166 * 1167 */ 1168 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 1169 u32 reg_addr, u64 reg_data) 1170 { 1171 unsigned long flags, pcie_index, pcie_data; 1172 void __iomem *pcie_index_offset; 1173 void __iomem *pcie_data_offset; 1174 1175 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1176 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1177 1178 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1179 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1180 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1181 1182 /* write low 32 bits */ 1183 writel(reg_addr, pcie_index_offset); 1184 readl(pcie_index_offset); 1185 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1186 readl(pcie_data_offset); 1187 /* write high 32 bits */ 1188 writel(reg_addr + 4, pcie_index_offset); 1189 readl(pcie_index_offset); 1190 writel((u32)(reg_data >> 32), pcie_data_offset); 1191 readl(pcie_data_offset); 1192 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1193 } 1194 1195 void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev, 1196 u64 reg_addr, u64 reg_data) 1197 { 1198 unsigned long flags, pcie_index, pcie_data; 1199 unsigned long pcie_index_hi = 0; 1200 void __iomem *pcie_index_offset; 1201 void __iomem *pcie_index_hi_offset; 1202 void __iomem *pcie_data_offset; 1203 1204 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1205 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1206 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1207 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1208 1209 
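	/* The indirect access below goes through the shared PCIE index/data
	 * register pair, so hold pcie_idx_lock to serialize with the other
	 * indirect register helpers.
	 */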
spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1210 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1211 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1212 if (pcie_index_hi != 0) 1213 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1214 pcie_index_hi * 4; 1215 1216 /* write low 32 bits */ 1217 writel(reg_addr, pcie_index_offset); 1218 readl(pcie_index_offset); 1219 if (pcie_index_hi != 0) { 1220 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1221 readl(pcie_index_hi_offset); 1222 } 1223 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1224 readl(pcie_data_offset); 1225 /* write high 32 bits */ 1226 writel(reg_addr + 4, pcie_index_offset); 1227 readl(pcie_index_offset); 1228 if (pcie_index_hi != 0) { 1229 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1230 readl(pcie_index_hi_offset); 1231 } 1232 writel((u32)(reg_data >> 32), pcie_data_offset); 1233 readl(pcie_data_offset); 1234 1235 /* clear the high bits */ 1236 if (pcie_index_hi != 0) { 1237 writel(0, pcie_index_hi_offset); 1238 readl(pcie_index_hi_offset); 1239 } 1240 1241 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1242 } 1243 1244 /** 1245 * amdgpu_device_get_rev_id - query device rev_id 1246 * 1247 * @adev: amdgpu_device pointer 1248 * 1249 * Return device rev_id 1250 */ 1251 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev) 1252 { 1253 return adev->nbio.funcs->get_rev_id(adev); 1254 } 1255 1256 /** 1257 * amdgpu_invalid_rreg - dummy reg read function 1258 * 1259 * @adev: amdgpu_device pointer 1260 * @reg: offset of register 1261 * 1262 * Dummy register read function. Used for register blocks 1263 * that certain asics don't have (all asics). 1264 * Returns the value in the register. 1265 */ 1266 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 1267 { 1268 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 1269 BUG(); 1270 return 0; 1271 } 1272 1273 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg) 1274 { 1275 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 1276 BUG(); 1277 return 0; 1278 } 1279 1280 /** 1281 * amdgpu_invalid_wreg - dummy reg write function 1282 * 1283 * @adev: amdgpu_device pointer 1284 * @reg: offset of register 1285 * @v: value to write to the register 1286 * 1287 * Dummy register read function. Used for register blocks 1288 * that certain asics don't have (all asics). 1289 */ 1290 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 1291 { 1292 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 1293 reg, v); 1294 BUG(); 1295 } 1296 1297 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v) 1298 { 1299 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n", 1300 reg, v); 1301 BUG(); 1302 } 1303 1304 /** 1305 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 1306 * 1307 * @adev: amdgpu_device pointer 1308 * @reg: offset of register 1309 * 1310 * Dummy register read function. Used for register blocks 1311 * that certain asics don't have (all asics). 1312 * Returns the value in the register. 
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	int ret;

	amdgpu_asic_pre_asic_init(adev);

	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
		amdgpu_psp_wait_for_bootloader(adev);
		ret = amdgpu_atomfirmware_asic_init(adev, true);
		return ret;
	} else {
		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
	}

	return 0;
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND/OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
1545 */ 1546 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1547 { 1548 int r; 1549 1550 if (adev->wb.wb_obj == NULL) { 1551 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1552 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1553 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1554 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1555 (void **)&adev->wb.wb); 1556 if (r) { 1557 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1558 return r; 1559 } 1560 1561 adev->wb.num_wb = AMDGPU_MAX_WB; 1562 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1563 1564 /* clear wb memory */ 1565 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1566 } 1567 1568 return 0; 1569 } 1570 1571 /** 1572 * amdgpu_device_wb_get - Allocate a wb entry 1573 * 1574 * @adev: amdgpu_device pointer 1575 * @wb: wb index 1576 * 1577 * Allocate a wb slot for use by the driver (all asics). 1578 * Returns 0 on success or -EINVAL on failure. 1579 */ 1580 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1581 { 1582 unsigned long flags, offset; 1583 1584 spin_lock_irqsave(&adev->wb.lock, flags); 1585 offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1586 if (offset < adev->wb.num_wb) { 1587 __set_bit(offset, adev->wb.used); 1588 spin_unlock_irqrestore(&adev->wb.lock, flags); 1589 *wb = offset << 3; /* convert to dw offset */ 1590 return 0; 1591 } else { 1592 spin_unlock_irqrestore(&adev->wb.lock, flags); 1593 return -EINVAL; 1594 } 1595 } 1596 1597 /** 1598 * amdgpu_device_wb_free - Free a wb entry 1599 * 1600 * @adev: amdgpu_device pointer 1601 * @wb: wb index 1602 * 1603 * Free a wb slot allocated for use by the driver (all asics) 1604 */ 1605 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1606 { 1607 unsigned long flags; 1608 1609 wb >>= 3; 1610 spin_lock_irqsave(&adev->wb.lock, flags); 1611 if (wb < adev->wb.num_wb) 1612 __clear_bit(wb, adev->wb.used); 1613 spin_unlock_irqrestore(&adev->wb.lock, flags); 1614 } 1615 1616 /** 1617 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1618 * 1619 * @adev: amdgpu_device pointer 1620 * 1621 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1622 * to fail, but if any of the BARs is not accessible after the size we abort 1623 * driver loading by returning -ENODEV. 
1624 */ 1625 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1626 { 1627 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1628 struct pci_bus *root; 1629 struct resource *res; 1630 unsigned int i; 1631 u16 cmd; 1632 int r; 1633 1634 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) 1635 return 0; 1636 1637 /* Bypass for VF */ 1638 if (amdgpu_sriov_vf(adev)) 1639 return 0; 1640 1641 /* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */ 1642 if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR)) 1643 DRM_WARN("System can't access extended configuration space, please check!!\n"); 1644 1645 /* skip if the bios has already enabled large BAR */ 1646 if (adev->gmc.real_vram_size && 1647 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1648 return 0; 1649 1650 /* Check if the root BUS has 64bit memory resources */ 1651 root = adev->pdev->bus; 1652 while (root->parent) 1653 root = root->parent; 1654 1655 pci_bus_for_each_resource(root, res, i) { 1656 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1657 res->start > 0x100000000ull) 1658 break; 1659 } 1660 1661 /* Trying to resize is pointless without a root hub window above 4GB */ 1662 if (!res) 1663 return 0; 1664 1665 /* Limit the BAR size to what is available */ 1666 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1667 rbar_size); 1668 1669 /* Disable memory decoding while we change the BAR addresses and size */ 1670 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1671 pci_write_config_word(adev->pdev, PCI_COMMAND, 1672 cmd & ~PCI_COMMAND_MEMORY); 1673 1674 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1675 amdgpu_doorbell_fini(adev); 1676 if (adev->asic_type >= CHIP_BONAIRE) 1677 pci_release_resource(adev->pdev, 2); 1678 1679 pci_release_resource(adev->pdev, 0); 1680 1681 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1682 if (r == -ENOSPC) 1683 DRM_INFO("Not enough PCI address space for a large BAR."); 1684 else if (r && r != -ENOTSUPP) 1685 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1686 1687 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1688 1689 /* When the doorbell or fb BAR isn't available we have no chance of 1690 * using the device. 1691 */ 1692 r = amdgpu_doorbell_init(adev); 1693 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1694 return -ENODEV; 1695 1696 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1697 1698 return 0; 1699 } 1700 1701 static bool amdgpu_device_read_bios(struct amdgpu_device *adev) 1702 { 1703 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU)) 1704 return false; 1705 1706 return true; 1707 } 1708 1709 /* 1710 * GPU helpers function. 1711 */ 1712 /** 1713 * amdgpu_device_need_post - check if the hw need post or not 1714 * 1715 * @adev: amdgpu_device pointer 1716 * 1717 * Check if the asic has been initialized (all asics) at driver startup 1718 * or post is needed if hw reset is performed. 1719 * Returns true if need or false if not. 
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (!amdgpu_device_read_bios(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* For FIJI: in the whole-GPU pass-through virtualization case, after a VM
		 * reboot some old SMC firmware still needs the driver to do a vPost,
		 * otherwise the GPU hangs. SMC firmware versions above 22.15 don't have
		 * this flaw, so force vPost for SMC versions below 22.15.
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			release_firmware(adev->pm.fw);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/*
 * Check whether seamless boot is supported.
 *
 * So far we only support seamless boot on DCE 3.0 or later.
 * If users report that it works on older ASICs as well, we may
 * loosen this.
 */
bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
{
	switch (amdgpu_seamless) {
	case -1:
		break;
	case 1:
		return true;
	case 0:
		return false;
	default:
		DRM_ERROR("Invalid value for amdgpu.seamless: %d\n",
			  amdgpu_seamless);
		return false;
	}

	if (!(adev->flags & AMD_IS_APU))
		return false;

	if (adev->mman.keep_stolen_vga_memory)
		return false;

	return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
}

/*
 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
 * don't support dynamic speed switching. Until we have confirmation from Intel
 * that a specific host supports it, it's safer that we keep it disabled for all.
 *
 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
 */
static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	/* eGPUs change speeds based on USB4 fabric conditions */
	if (dev_is_removable(adev->dev))
		return true;

	if (c->x86_vendor == X86_VENDOR_INTEL)
		return false;
#endif
	return true;
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
1838 */ 1839 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1840 { 1841 switch (amdgpu_aspm) { 1842 case -1: 1843 break; 1844 case 0: 1845 return false; 1846 case 1: 1847 return true; 1848 default: 1849 return false; 1850 } 1851 if (adev->flags & AMD_IS_APU) 1852 return false; 1853 if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK)) 1854 return false; 1855 return pcie_aspm_enabled(adev->pdev); 1856 } 1857 1858 /* if we get transitioned to only one device, take VGA back */ 1859 /** 1860 * amdgpu_device_vga_set_decode - enable/disable vga decode 1861 * 1862 * @pdev: PCI device pointer 1863 * @state: enable/disable vga decode 1864 * 1865 * Enable/disable vga decode (all asics). 1866 * Returns VGA resource flags. 1867 */ 1868 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1869 bool state) 1870 { 1871 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1872 1873 amdgpu_asic_set_vga_state(adev, state); 1874 if (state) 1875 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1876 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1877 else 1878 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1879 } 1880 1881 /** 1882 * amdgpu_device_check_block_size - validate the vm block size 1883 * 1884 * @adev: amdgpu_device pointer 1885 * 1886 * Validates the vm block size specified via module parameter. 1887 * The vm block size defines number of bits in page table versus page directory, 1888 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1889 * page table and the remaining bits are in the page directory. 1890 */ 1891 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1892 { 1893 /* defines number of bits in page table versus page directory, 1894 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1895 * page table and the remaining bits are in the page directory 1896 */ 1897 if (amdgpu_vm_block_size == -1) 1898 return; 1899 1900 if (amdgpu_vm_block_size < 9) { 1901 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1902 amdgpu_vm_block_size); 1903 amdgpu_vm_block_size = -1; 1904 } 1905 } 1906 1907 /** 1908 * amdgpu_device_check_vm_size - validate the vm size 1909 * 1910 * @adev: amdgpu_device pointer 1911 * 1912 * Validates the vm size in GB specified via module parameter. 1913 * The VM size is the size of the GPU virtual memory space in GB. 
1914 */ 1915 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1916 { 1917 /* no need to check the default value */ 1918 if (amdgpu_vm_size == -1) 1919 return; 1920 1921 if (amdgpu_vm_size < 1) { 1922 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1923 amdgpu_vm_size); 1924 amdgpu_vm_size = -1; 1925 } 1926 } 1927 1928 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1929 { 1930 struct sysinfo si; 1931 bool is_os_64 = (sizeof(void *) == 8); 1932 uint64_t total_memory; 1933 uint64_t dram_size_seven_GB = 0x1B8000000; 1934 uint64_t dram_size_three_GB = 0xB8000000; 1935 1936 if (amdgpu_smu_memory_pool_size == 0) 1937 return; 1938 1939 if (!is_os_64) { 1940 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1941 goto def_value; 1942 } 1943 si_meminfo(&si); 1944 total_memory = (uint64_t)si.totalram * si.mem_unit; 1945 1946 if ((amdgpu_smu_memory_pool_size == 1) || 1947 (amdgpu_smu_memory_pool_size == 2)) { 1948 if (total_memory < dram_size_three_GB) 1949 goto def_value1; 1950 } else if ((amdgpu_smu_memory_pool_size == 4) || 1951 (amdgpu_smu_memory_pool_size == 8)) { 1952 if (total_memory < dram_size_seven_GB) 1953 goto def_value1; 1954 } else { 1955 DRM_WARN("Smu memory pool size not supported\n"); 1956 goto def_value; 1957 } 1958 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 1959 1960 return; 1961 1962 def_value1: 1963 DRM_WARN("No enough system memory\n"); 1964 def_value: 1965 adev->pm.smu_prv_buffer_size = 0; 1966 } 1967 1968 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 1969 { 1970 if (!(adev->flags & AMD_IS_APU) || 1971 adev->asic_type < CHIP_RAVEN) 1972 return 0; 1973 1974 switch (adev->asic_type) { 1975 case CHIP_RAVEN: 1976 if (adev->pdev->device == 0x15dd) 1977 adev->apu_flags |= AMD_APU_IS_RAVEN; 1978 if (adev->pdev->device == 0x15d8) 1979 adev->apu_flags |= AMD_APU_IS_PICASSO; 1980 break; 1981 case CHIP_RENOIR: 1982 if ((adev->pdev->device == 0x1636) || 1983 (adev->pdev->device == 0x164c)) 1984 adev->apu_flags |= AMD_APU_IS_RENOIR; 1985 else 1986 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 1987 break; 1988 case CHIP_VANGOGH: 1989 adev->apu_flags |= AMD_APU_IS_VANGOGH; 1990 break; 1991 case CHIP_YELLOW_CARP: 1992 break; 1993 case CHIP_CYAN_SKILLFISH: 1994 if ((adev->pdev->device == 0x13FE) || 1995 (adev->pdev->device == 0x143F)) 1996 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 1997 break; 1998 default: 1999 break; 2000 } 2001 2002 return 0; 2003 } 2004 2005 /** 2006 * amdgpu_device_check_arguments - validate module params 2007 * 2008 * @adev: amdgpu_device pointer 2009 * 2010 * Validates certain module parameters and updates 2011 * the associated values used by the driver (all asics). 
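 *
 * For example, amdgpu.sched_jobs=3 is raised to the minimum of 4 and
 * amdgpu.sched_jobs=48 is rounded up to the next power of two (64), while an
 * out-of-range amdgpu.reset_method reverts to -1 (auto selection) with a
 * warning.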
2012 */ 2013 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 2014 { 2015 int i; 2016 2017 if (amdgpu_sched_jobs < 4) { 2018 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 2019 amdgpu_sched_jobs); 2020 amdgpu_sched_jobs = 4; 2021 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 2022 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 2023 amdgpu_sched_jobs); 2024 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 2025 } 2026 2027 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 2028 /* gart size must be greater or equal to 32M */ 2029 dev_warn(adev->dev, "gart size (%d) too small\n", 2030 amdgpu_gart_size); 2031 amdgpu_gart_size = -1; 2032 } 2033 2034 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 2035 /* gtt size must be greater or equal to 32M */ 2036 dev_warn(adev->dev, "gtt size (%d) too small\n", 2037 amdgpu_gtt_size); 2038 amdgpu_gtt_size = -1; 2039 } 2040 2041 /* valid range is between 4 and 9 inclusive */ 2042 if (amdgpu_vm_fragment_size != -1 && 2043 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 2044 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 2045 amdgpu_vm_fragment_size = -1; 2046 } 2047 2048 if (amdgpu_sched_hw_submission < 2) { 2049 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 2050 amdgpu_sched_hw_submission); 2051 amdgpu_sched_hw_submission = 2; 2052 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 2053 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 2054 amdgpu_sched_hw_submission); 2055 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 2056 } 2057 2058 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 2059 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 2060 amdgpu_reset_method = -1; 2061 } 2062 2063 amdgpu_device_check_smu_prv_buffer_size(adev); 2064 2065 amdgpu_device_check_vm_size(adev); 2066 2067 amdgpu_device_check_block_size(adev); 2068 2069 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 2070 2071 for (i = 0; i < MAX_XCP; i++) 2072 adev->enforce_isolation[i] = !!enforce_isolation; 2073 2074 return 0; 2075 } 2076 2077 /** 2078 * amdgpu_switcheroo_set_state - set switcheroo state 2079 * 2080 * @pdev: pci dev pointer 2081 * @state: vga_switcheroo state 2082 * 2083 * Callback for the switcheroo driver. Suspends or resumes 2084 * the asics before or after it is powered up using ACPI methods. 
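 *
 * This is not called directly by the driver; it is plugged into the
 * amdgpu_switcheroo_ops table below, which device init hands to the
 * vga_switcheroo layer roughly as follows (sketch, arguments simplified):
 *
 *   vga_switcheroo_register_client(adev->pdev, &amdgpu_switcheroo_ops, px);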
2085 */ 2086 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 2087 enum vga_switcheroo_state state) 2088 { 2089 struct drm_device *dev = pci_get_drvdata(pdev); 2090 int r; 2091 2092 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 2093 return; 2094 2095 if (state == VGA_SWITCHEROO_ON) { 2096 pr_info("switched on\n"); 2097 /* don't suspend or resume card normally */ 2098 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2099 2100 pci_set_power_state(pdev, PCI_D0); 2101 amdgpu_device_load_pci_state(pdev); 2102 r = pci_enable_device(pdev); 2103 if (r) 2104 DRM_WARN("pci_enable_device failed (%d)\n", r); 2105 amdgpu_device_resume(dev, true); 2106 2107 dev->switch_power_state = DRM_SWITCH_POWER_ON; 2108 } else { 2109 pr_info("switched off\n"); 2110 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2111 amdgpu_device_prepare(dev); 2112 amdgpu_device_suspend(dev, true); 2113 amdgpu_device_cache_pci_state(pdev); 2114 /* Shut down the device */ 2115 pci_disable_device(pdev); 2116 pci_set_power_state(pdev, PCI_D3cold); 2117 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 2118 } 2119 } 2120 2121 /** 2122 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 2123 * 2124 * @pdev: pci dev pointer 2125 * 2126 * Callback for the switcheroo driver. Check of the switcheroo 2127 * state can be changed. 2128 * Returns true if the state can be changed, false if not. 2129 */ 2130 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 2131 { 2132 struct drm_device *dev = pci_get_drvdata(pdev); 2133 2134 /* 2135 * FIXME: open_count is protected by drm_global_mutex but that would lead to 2136 * locking inversion with the driver load path. And the access here is 2137 * completely racy anyway. So don't bother with locking for now. 2138 */ 2139 return atomic_read(&dev->open_count) == 0; 2140 } 2141 2142 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 2143 .set_gpu_state = amdgpu_switcheroo_set_state, 2144 .reprobe = NULL, 2145 .can_switch = amdgpu_switcheroo_can_switch, 2146 }; 2147 2148 /** 2149 * amdgpu_device_ip_set_clockgating_state - set the CG state 2150 * 2151 * @dev: amdgpu_device pointer 2152 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2153 * @state: clockgating state (gate or ungate) 2154 * 2155 * Sets the requested clockgating state for all instances of 2156 * the hardware IP specified. 2157 * Returns the error code from the last instance. 2158 */ 2159 int amdgpu_device_ip_set_clockgating_state(void *dev, 2160 enum amd_ip_block_type block_type, 2161 enum amd_clockgating_state state) 2162 { 2163 struct amdgpu_device *adev = dev; 2164 int i, r = 0; 2165 2166 for (i = 0; i < adev->num_ip_blocks; i++) { 2167 if (!adev->ip_blocks[i].status.valid) 2168 continue; 2169 if (adev->ip_blocks[i].version->type != block_type) 2170 continue; 2171 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 2172 continue; 2173 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 2174 &adev->ip_blocks[i], state); 2175 if (r) 2176 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 2177 adev->ip_blocks[i].version->funcs->name, r); 2178 } 2179 return r; 2180 } 2181 2182 /** 2183 * amdgpu_device_ip_set_powergating_state - set the PG state 2184 * 2185 * @dev: amdgpu_device pointer 2186 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2187 * @state: powergating state (gate or ungate) 2188 * 2189 * Sets the requested powergating state for all instances of 2190 * the hardware IP specified. 
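 *
 * A caller might gate an idle engine like this (illustrative sketch):
 *
 *   amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_UVD,
 *                                          AMD_PG_STATE_GATE);
 *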
2191 * Returns the error code from the last instance. 2192 */ 2193 int amdgpu_device_ip_set_powergating_state(void *dev, 2194 enum amd_ip_block_type block_type, 2195 enum amd_powergating_state state) 2196 { 2197 struct amdgpu_device *adev = dev; 2198 int i, r = 0; 2199 2200 for (i = 0; i < adev->num_ip_blocks; i++) { 2201 if (!adev->ip_blocks[i].status.valid) 2202 continue; 2203 if (adev->ip_blocks[i].version->type != block_type) 2204 continue; 2205 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 2206 continue; 2207 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 2208 &adev->ip_blocks[i], state); 2209 if (r) 2210 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 2211 adev->ip_blocks[i].version->funcs->name, r); 2212 } 2213 return r; 2214 } 2215 2216 /** 2217 * amdgpu_device_ip_get_clockgating_state - get the CG state 2218 * 2219 * @adev: amdgpu_device pointer 2220 * @flags: clockgating feature flags 2221 * 2222 * Walks the list of IPs on the device and updates the clockgating 2223 * flags for each IP. 2224 * Updates @flags with the feature flags for each hardware IP where 2225 * clockgating is enabled. 2226 */ 2227 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 2228 u64 *flags) 2229 { 2230 int i; 2231 2232 for (i = 0; i < adev->num_ip_blocks; i++) { 2233 if (!adev->ip_blocks[i].status.valid) 2234 continue; 2235 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 2236 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags); 2237 } 2238 } 2239 2240 /** 2241 * amdgpu_device_ip_wait_for_idle - wait for idle 2242 * 2243 * @adev: amdgpu_device pointer 2244 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2245 * 2246 * Waits for the request hardware IP to be idle. 2247 * Returns 0 for success or a negative error code on failure. 2248 */ 2249 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 2250 enum amd_ip_block_type block_type) 2251 { 2252 int i, r; 2253 2254 for (i = 0; i < adev->num_ip_blocks; i++) { 2255 if (!adev->ip_blocks[i].status.valid) 2256 continue; 2257 if (adev->ip_blocks[i].version->type == block_type) { 2258 if (adev->ip_blocks[i].version->funcs->wait_for_idle) { 2259 r = adev->ip_blocks[i].version->funcs->wait_for_idle( 2260 &adev->ip_blocks[i]); 2261 if (r) 2262 return r; 2263 } 2264 break; 2265 } 2266 } 2267 return 0; 2268 2269 } 2270 2271 /** 2272 * amdgpu_device_ip_is_valid - is the hardware IP enabled 2273 * 2274 * @adev: amdgpu_device pointer 2275 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2276 * 2277 * Check if the hardware IP is enable or not. 2278 * Returns true if it the IP is enable, false if not. 2279 */ 2280 bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev, 2281 enum amd_ip_block_type block_type) 2282 { 2283 int i; 2284 2285 for (i = 0; i < adev->num_ip_blocks; i++) { 2286 if (adev->ip_blocks[i].version->type == block_type) 2287 return adev->ip_blocks[i].status.valid; 2288 } 2289 return false; 2290 2291 } 2292 2293 /** 2294 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 2295 * 2296 * @adev: amdgpu_device pointer 2297 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 2298 * 2299 * Returns a pointer to the hardware IP block structure 2300 * if it exists for the asic, otherwise NULL. 
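 *
 * Example (illustrative):
 *
 *   struct amdgpu_ip_block *gfx =
 *           amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *
 *   if (gfx && gfx->status.valid)
 *           (the GFX block exists and is enabled on this ASIC)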
2301 */ 2302 struct amdgpu_ip_block * 2303 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 2304 enum amd_ip_block_type type) 2305 { 2306 int i; 2307 2308 for (i = 0; i < adev->num_ip_blocks; i++) 2309 if (adev->ip_blocks[i].version->type == type) 2310 return &adev->ip_blocks[i]; 2311 2312 return NULL; 2313 } 2314 2315 /** 2316 * amdgpu_device_ip_block_version_cmp 2317 * 2318 * @adev: amdgpu_device pointer 2319 * @type: enum amd_ip_block_type 2320 * @major: major version 2321 * @minor: minor version 2322 * 2323 * return 0 if equal or greater 2324 * return 1 if smaller or the ip_block doesn't exist 2325 */ 2326 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 2327 enum amd_ip_block_type type, 2328 u32 major, u32 minor) 2329 { 2330 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 2331 2332 if (ip_block && ((ip_block->version->major > major) || 2333 ((ip_block->version->major == major) && 2334 (ip_block->version->minor >= minor)))) 2335 return 0; 2336 2337 return 1; 2338 } 2339 2340 /** 2341 * amdgpu_device_ip_block_add 2342 * 2343 * @adev: amdgpu_device pointer 2344 * @ip_block_version: pointer to the IP to add 2345 * 2346 * Adds the IP block driver information to the collection of IPs 2347 * on the asic. 2348 */ 2349 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 2350 const struct amdgpu_ip_block_version *ip_block_version) 2351 { 2352 if (!ip_block_version) 2353 return -EINVAL; 2354 2355 switch (ip_block_version->type) { 2356 case AMD_IP_BLOCK_TYPE_VCN: 2357 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2358 return 0; 2359 break; 2360 case AMD_IP_BLOCK_TYPE_JPEG: 2361 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2362 return 0; 2363 break; 2364 default: 2365 break; 2366 } 2367 2368 dev_info(adev->dev, "detected ip block number %d <%s>\n", 2369 adev->num_ip_blocks, ip_block_version->funcs->name); 2370 2371 adev->ip_blocks[adev->num_ip_blocks].adev = adev; 2372 2373 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2374 2375 return 0; 2376 } 2377 2378 /** 2379 * amdgpu_device_enable_virtual_display - enable virtual display feature 2380 * 2381 * @adev: amdgpu_device pointer 2382 * 2383 * Enabled the virtual display feature if the user has enabled it via 2384 * the module parameter virtual_display. This feature provides a virtual 2385 * display hardware on headless boards or in virtualized environments. 2386 * This function parses and validates the configuration string specified by 2387 * the user and configures the virtual display configuration (number of 2388 * virtual connectors, crtcs, etc.) specified. 
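 *
 * The string is a semicolon separated list of <PCI address>,<crtc count>
 * entries, or "all" to match every device. For example (illustrative):
 *
 *   amdgpu.virtual_display=0000:04:00.0,2
 *
 * enables two virtual crtcs on that device; the crtc count is clamped to the
 * range 1-6 and defaults to 1 when omitted.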
2389 */ 2390 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2391 { 2392 adev->enable_virtual_display = false; 2393 2394 if (amdgpu_virtual_display) { 2395 const char *pci_address_name = pci_name(adev->pdev); 2396 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2397 2398 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2399 pciaddstr_tmp = pciaddstr; 2400 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2401 pciaddname = strsep(&pciaddname_tmp, ","); 2402 if (!strcmp("all", pciaddname) 2403 || !strcmp(pci_address_name, pciaddname)) { 2404 long num_crtc; 2405 int res = -1; 2406 2407 adev->enable_virtual_display = true; 2408 2409 if (pciaddname_tmp) 2410 res = kstrtol(pciaddname_tmp, 10, 2411 &num_crtc); 2412 2413 if (!res) { 2414 if (num_crtc < 1) 2415 num_crtc = 1; 2416 if (num_crtc > 6) 2417 num_crtc = 6; 2418 adev->mode_info.num_crtc = num_crtc; 2419 } else { 2420 adev->mode_info.num_crtc = 1; 2421 } 2422 break; 2423 } 2424 } 2425 2426 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2427 amdgpu_virtual_display, pci_address_name, 2428 adev->enable_virtual_display, adev->mode_info.num_crtc); 2429 2430 kfree(pciaddstr); 2431 } 2432 } 2433 2434 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2435 { 2436 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2437 adev->mode_info.num_crtc = 1; 2438 adev->enable_virtual_display = true; 2439 DRM_INFO("virtual_display:%d, num_crtc:%d\n", 2440 adev->enable_virtual_display, adev->mode_info.num_crtc); 2441 } 2442 } 2443 2444 /** 2445 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2446 * 2447 * @adev: amdgpu_device pointer 2448 * 2449 * Parses the asic configuration parameters specified in the gpu info 2450 * firmware and makes them available to the driver for use in configuring 2451 * the asic. 2452 * Returns 0 on success, -EINVAL on failure. 2453 */ 2454 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2455 { 2456 const char *chip_name; 2457 int err; 2458 const struct gpu_info_firmware_header_v1_0 *hdr; 2459 2460 adev->firmware.gpu_info_fw = NULL; 2461 2462 if (adev->mman.discovery_bin) 2463 return 0; 2464 2465 switch (adev->asic_type) { 2466 default: 2467 return 0; 2468 case CHIP_VEGA10: 2469 chip_name = "vega10"; 2470 break; 2471 case CHIP_VEGA12: 2472 chip_name = "vega12"; 2473 break; 2474 case CHIP_RAVEN: 2475 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2476 chip_name = "raven2"; 2477 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2478 chip_name = "picasso"; 2479 else 2480 chip_name = "raven"; 2481 break; 2482 case CHIP_ARCTURUS: 2483 chip_name = "arcturus"; 2484 break; 2485 case CHIP_NAVI12: 2486 chip_name = "navi12"; 2487 break; 2488 } 2489 2490 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, 2491 AMDGPU_UCODE_OPTIONAL, 2492 "amdgpu/%s_gpu_info.bin", chip_name); 2493 if (err) { 2494 dev_err(adev->dev, 2495 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n", 2496 chip_name); 2497 goto out; 2498 } 2499 2500 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2501 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2502 2503 switch (hdr->version_major) { 2504 case 1: 2505 { 2506 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2507 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2508 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2509 2510 /* 2511 * Should be dropped when DAL no longer needs it. 
2512 */ 2513 if (adev->asic_type == CHIP_NAVI12) 2514 goto parse_soc_bounding_box; 2515 2516 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2517 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2518 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2519 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2520 adev->gfx.config.max_texture_channel_caches = 2521 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2522 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2523 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2524 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2525 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2526 adev->gfx.config.double_offchip_lds_buf = 2527 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2528 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2529 adev->gfx.cu_info.max_waves_per_simd = 2530 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2531 adev->gfx.cu_info.max_scratch_slots_per_cu = 2532 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2533 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2534 if (hdr->version_minor >= 1) { 2535 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2536 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2537 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2538 adev->gfx.config.num_sc_per_sh = 2539 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2540 adev->gfx.config.num_packer_per_sc = 2541 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2542 } 2543 2544 parse_soc_bounding_box: 2545 /* 2546 * soc bounding box info is not integrated in disocovery table, 2547 * we always need to parse it from gpu info firmware if needed. 2548 */ 2549 if (hdr->version_minor == 2) { 2550 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2551 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2552 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2553 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2554 } 2555 break; 2556 } 2557 default: 2558 dev_err(adev->dev, 2559 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2560 err = -EINVAL; 2561 goto out; 2562 } 2563 out: 2564 return err; 2565 } 2566 2567 /** 2568 * amdgpu_device_ip_early_init - run early init for hardware IPs 2569 * 2570 * @adev: amdgpu_device pointer 2571 * 2572 * Early initialization pass for hardware IPs. The hardware IPs that make 2573 * up each asic are discovered each IP's early_init callback is run. This 2574 * is the first stage in initializing the asic. 2575 * Returns 0 on success, negative error code on failure. 
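 *
 * This is only the first of several passes; the overall bring-up sequence
 * implemented in this file is roughly:
 *
 *   amdgpu_device_ip_early_init()  - pick IP blocks, read vbios/gpu_info
 *   amdgpu_device_ip_init()        - sw_init + hw_init for every block
 *   amdgpu_device_ip_late_init()   - late_init, then clock/power gating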
2576 */ 2577 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2578 { 2579 struct amdgpu_ip_block *ip_block; 2580 struct pci_dev *parent; 2581 int i, r; 2582 bool total; 2583 2584 amdgpu_device_enable_virtual_display(adev); 2585 2586 if (amdgpu_sriov_vf(adev)) { 2587 r = amdgpu_virt_request_full_gpu(adev, true); 2588 if (r) 2589 return r; 2590 } 2591 2592 switch (adev->asic_type) { 2593 #ifdef CONFIG_DRM_AMDGPU_SI 2594 case CHIP_VERDE: 2595 case CHIP_TAHITI: 2596 case CHIP_PITCAIRN: 2597 case CHIP_OLAND: 2598 case CHIP_HAINAN: 2599 adev->family = AMDGPU_FAMILY_SI; 2600 r = si_set_ip_blocks(adev); 2601 if (r) 2602 return r; 2603 break; 2604 #endif 2605 #ifdef CONFIG_DRM_AMDGPU_CIK 2606 case CHIP_BONAIRE: 2607 case CHIP_HAWAII: 2608 case CHIP_KAVERI: 2609 case CHIP_KABINI: 2610 case CHIP_MULLINS: 2611 if (adev->flags & AMD_IS_APU) 2612 adev->family = AMDGPU_FAMILY_KV; 2613 else 2614 adev->family = AMDGPU_FAMILY_CI; 2615 2616 r = cik_set_ip_blocks(adev); 2617 if (r) 2618 return r; 2619 break; 2620 #endif 2621 case CHIP_TOPAZ: 2622 case CHIP_TONGA: 2623 case CHIP_FIJI: 2624 case CHIP_POLARIS10: 2625 case CHIP_POLARIS11: 2626 case CHIP_POLARIS12: 2627 case CHIP_VEGAM: 2628 case CHIP_CARRIZO: 2629 case CHIP_STONEY: 2630 if (adev->flags & AMD_IS_APU) 2631 adev->family = AMDGPU_FAMILY_CZ; 2632 else 2633 adev->family = AMDGPU_FAMILY_VI; 2634 2635 r = vi_set_ip_blocks(adev); 2636 if (r) 2637 return r; 2638 break; 2639 default: 2640 r = amdgpu_discovery_set_ip_blocks(adev); 2641 if (r) 2642 return r; 2643 break; 2644 } 2645 2646 if (amdgpu_has_atpx() && 2647 (amdgpu_is_atpx_hybrid() || 2648 amdgpu_has_atpx_dgpu_power_cntl()) && 2649 ((adev->flags & AMD_IS_APU) == 0) && 2650 !dev_is_removable(&adev->pdev->dev)) 2651 adev->flags |= AMD_IS_PX; 2652 2653 if (!(adev->flags & AMD_IS_APU)) { 2654 parent = pcie_find_root_port(adev->pdev); 2655 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2656 } 2657 2658 2659 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2660 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2661 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2662 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2663 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2664 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2665 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2666 2667 total = true; 2668 for (i = 0; i < adev->num_ip_blocks; i++) { 2669 ip_block = &adev->ip_blocks[i]; 2670 2671 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2672 DRM_WARN("disabled ip block: %d <%s>\n", 2673 i, adev->ip_blocks[i].version->funcs->name); 2674 adev->ip_blocks[i].status.valid = false; 2675 } else if (ip_block->version->funcs->early_init) { 2676 r = ip_block->version->funcs->early_init(ip_block); 2677 if (r == -ENOENT) { 2678 adev->ip_blocks[i].status.valid = false; 2679 } else if (r) { 2680 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2681 adev->ip_blocks[i].version->funcs->name, r); 2682 total = false; 2683 } else { 2684 adev->ip_blocks[i].status.valid = true; 2685 } 2686 } else { 2687 adev->ip_blocks[i].status.valid = true; 2688 } 2689 /* get the vbios after the asic_funcs are set up */ 2690 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2691 r = amdgpu_device_parse_gpu_info_fw(adev); 2692 if (r) 2693 return r; 2694 2695 /* Read BIOS */ 2696 if (amdgpu_device_read_bios(adev)) { 2697 if (!amdgpu_get_bios(adev)) 2698 return -EINVAL; 2699 2700 r = amdgpu_atombios_init(adev); 2701 if (r) { 2702 dev_err(adev->dev, "amdgpu_atombios_init failed\n"); 2703 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); 2704 return r; 2705 } 2706 } 2707 2708 /*get pf2vf msg info at it's earliest time*/ 2709 if (amdgpu_sriov_vf(adev)) 2710 amdgpu_virt_init_data_exchange(adev); 2711 2712 } 2713 } 2714 if (!total) 2715 return -ENODEV; 2716 2717 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 2718 if (ip_block->status.valid != false) 2719 amdgpu_amdkfd_device_probe(adev); 2720 2721 adev->cg_flags &= amdgpu_cg_mask; 2722 adev->pg_flags &= amdgpu_pg_mask; 2723 2724 return 0; 2725 } 2726 2727 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2728 { 2729 int i, r; 2730 2731 for (i = 0; i < adev->num_ip_blocks; i++) { 2732 if (!adev->ip_blocks[i].status.sw) 2733 continue; 2734 if (adev->ip_blocks[i].status.hw) 2735 continue; 2736 if (!amdgpu_ip_member_of_hwini( 2737 adev, adev->ip_blocks[i].version->type)) 2738 continue; 2739 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2740 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2741 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2742 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2743 if (r) { 2744 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2745 adev->ip_blocks[i].version->funcs->name, r); 2746 return r; 2747 } 2748 adev->ip_blocks[i].status.hw = true; 2749 } 2750 } 2751 2752 return 0; 2753 } 2754 2755 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2756 { 2757 int i, r; 2758 2759 for (i = 0; i < adev->num_ip_blocks; i++) { 2760 if (!adev->ip_blocks[i].status.sw) 2761 continue; 2762 if (adev->ip_blocks[i].status.hw) 2763 continue; 2764 if (!amdgpu_ip_member_of_hwini( 2765 adev, adev->ip_blocks[i].version->type)) 2766 continue; 2767 r = 
adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2768 if (r) { 2769 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2770 adev->ip_blocks[i].version->funcs->name, r); 2771 return r; 2772 } 2773 adev->ip_blocks[i].status.hw = true; 2774 } 2775 2776 return 0; 2777 } 2778 2779 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2780 { 2781 int r = 0; 2782 int i; 2783 uint32_t smu_version; 2784 2785 if (adev->asic_type >= CHIP_VEGA10) { 2786 for (i = 0; i < adev->num_ip_blocks; i++) { 2787 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2788 continue; 2789 2790 if (!amdgpu_ip_member_of_hwini(adev, 2791 AMD_IP_BLOCK_TYPE_PSP)) 2792 break; 2793 2794 if (!adev->ip_blocks[i].status.sw) 2795 continue; 2796 2797 /* no need to do the fw loading again if already done*/ 2798 if (adev->ip_blocks[i].status.hw == true) 2799 break; 2800 2801 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2802 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 2803 if (r) 2804 return r; 2805 } else { 2806 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2807 if (r) { 2808 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2809 adev->ip_blocks[i].version->funcs->name, r); 2810 return r; 2811 } 2812 adev->ip_blocks[i].status.hw = true; 2813 } 2814 break; 2815 } 2816 } 2817 2818 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2819 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2820 2821 return r; 2822 } 2823 2824 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2825 { 2826 struct drm_sched_init_args args = { 2827 .ops = &amdgpu_sched_ops, 2828 .num_rqs = DRM_SCHED_PRIORITY_COUNT, 2829 .timeout_wq = adev->reset_domain->wq, 2830 .dev = adev->dev, 2831 }; 2832 long timeout; 2833 int r, i; 2834 2835 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2836 struct amdgpu_ring *ring = adev->rings[i]; 2837 2838 /* No need to setup the GPU scheduler for rings that don't need it */ 2839 if (!ring || ring->no_scheduler) 2840 continue; 2841 2842 switch (ring->funcs->type) { 2843 case AMDGPU_RING_TYPE_GFX: 2844 timeout = adev->gfx_timeout; 2845 break; 2846 case AMDGPU_RING_TYPE_COMPUTE: 2847 timeout = adev->compute_timeout; 2848 break; 2849 case AMDGPU_RING_TYPE_SDMA: 2850 timeout = adev->sdma_timeout; 2851 break; 2852 default: 2853 timeout = adev->video_timeout; 2854 break; 2855 } 2856 2857 args.timeout = timeout; 2858 args.credit_limit = ring->num_hw_submission; 2859 args.score = ring->sched_score; 2860 args.name = ring->name; 2861 2862 r = drm_sched_init(&ring->sched, &args); 2863 if (r) { 2864 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2865 ring->name); 2866 return r; 2867 } 2868 r = amdgpu_uvd_entity_init(adev, ring); 2869 if (r) { 2870 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n", 2871 ring->name); 2872 return r; 2873 } 2874 r = amdgpu_vce_entity_init(adev, ring); 2875 if (r) { 2876 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n", 2877 ring->name); 2878 return r; 2879 } 2880 } 2881 2882 amdgpu_xcp_update_partition_sched_list(adev); 2883 2884 return 0; 2885 } 2886 2887 2888 /** 2889 * amdgpu_device_ip_init - run init for hardware IPs 2890 * 2891 * @adev: amdgpu_device pointer 2892 * 2893 * Main initialization pass for hardware IPs. The list of all the hardware 2894 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2895 * are run. sw_init initializes the software state associated with each IP 2896 * and hw_init initializes the hardware associated with each IP. 
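 *
 * Hardware init is itself staged: COMMON and GMC are brought up inline so GPU
 * memory can be allocated, amdgpu_device_ip_hw_init_phase1() then handles IH
 * (and PSP when running SR-IOV), firmware is loaded through
 * amdgpu_device_fw_loading(), and the remaining blocks are started by
 * amdgpu_device_ip_hw_init_phase2().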
2897 * Returns 0 on success, negative error code on failure. 2898 */ 2899 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2900 { 2901 bool init_badpage; 2902 int i, r; 2903 2904 r = amdgpu_ras_init(adev); 2905 if (r) 2906 return r; 2907 2908 for (i = 0; i < adev->num_ip_blocks; i++) { 2909 if (!adev->ip_blocks[i].status.valid) 2910 continue; 2911 if (adev->ip_blocks[i].version->funcs->sw_init) { 2912 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]); 2913 if (r) { 2914 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2915 adev->ip_blocks[i].version->funcs->name, r); 2916 goto init_failed; 2917 } 2918 } 2919 adev->ip_blocks[i].status.sw = true; 2920 2921 if (!amdgpu_ip_member_of_hwini( 2922 adev, adev->ip_blocks[i].version->type)) 2923 continue; 2924 2925 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2926 /* need to do common hw init early so everything is set up for gmc */ 2927 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2928 if (r) { 2929 DRM_ERROR("hw_init %d failed %d\n", i, r); 2930 goto init_failed; 2931 } 2932 adev->ip_blocks[i].status.hw = true; 2933 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2934 /* need to do gmc hw init early so we can allocate gpu mem */ 2935 /* Try to reserve bad pages early */ 2936 if (amdgpu_sriov_vf(adev)) 2937 amdgpu_virt_exchange_data(adev); 2938 2939 r = amdgpu_device_mem_scratch_init(adev); 2940 if (r) { 2941 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 2942 goto init_failed; 2943 } 2944 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2945 if (r) { 2946 DRM_ERROR("hw_init %d failed %d\n", i, r); 2947 goto init_failed; 2948 } 2949 r = amdgpu_device_wb_init(adev); 2950 if (r) { 2951 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 2952 goto init_failed; 2953 } 2954 adev->ip_blocks[i].status.hw = true; 2955 2956 /* right after GMC hw init, we create CSA */ 2957 if (adev->gfx.mcbp) { 2958 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 2959 AMDGPU_GEM_DOMAIN_VRAM | 2960 AMDGPU_GEM_DOMAIN_GTT, 2961 AMDGPU_CSA_SIZE); 2962 if (r) { 2963 DRM_ERROR("allocate CSA failed %d\n", r); 2964 goto init_failed; 2965 } 2966 } 2967 2968 r = amdgpu_seq64_init(adev); 2969 if (r) { 2970 DRM_ERROR("allocate seq64 failed %d\n", r); 2971 goto init_failed; 2972 } 2973 } 2974 } 2975 2976 if (amdgpu_sriov_vf(adev)) 2977 amdgpu_virt_init_data_exchange(adev); 2978 2979 r = amdgpu_ib_pool_init(adev); 2980 if (r) { 2981 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 2982 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 2983 goto init_failed; 2984 } 2985 2986 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 2987 if (r) 2988 goto init_failed; 2989 2990 r = amdgpu_device_ip_hw_init_phase1(adev); 2991 if (r) 2992 goto init_failed; 2993 2994 r = amdgpu_device_fw_loading(adev); 2995 if (r) 2996 goto init_failed; 2997 2998 r = amdgpu_device_ip_hw_init_phase2(adev); 2999 if (r) 3000 goto init_failed; 3001 3002 /* 3003 * retired pages will be loaded from eeprom and reserved here, 3004 * it should be called after amdgpu_device_ip_hw_init_phase2 since 3005 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 3006 * for I2C communication which only true at this point. 3007 * 3008 * amdgpu_ras_recovery_init may fail, but the upper only cares the 3009 * failure from bad gpu situation and stop amdgpu init process 3010 * accordingly. 
For other failed cases, it will still release all 3011 * the resource and print error message, rather than returning one 3012 * negative value to upper level. 3013 * 3014 * Note: theoretically, this should be called before all vram allocations 3015 * to protect retired page from abusing 3016 */ 3017 init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 3018 r = amdgpu_ras_recovery_init(adev, init_badpage); 3019 if (r) 3020 goto init_failed; 3021 3022 /** 3023 * In case of XGMI grab extra reference for reset domain for this device 3024 */ 3025 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3026 if (amdgpu_xgmi_add_device(adev) == 0) { 3027 if (!amdgpu_sriov_vf(adev)) { 3028 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3029 3030 if (WARN_ON(!hive)) { 3031 r = -ENOENT; 3032 goto init_failed; 3033 } 3034 3035 if (!hive->reset_domain || 3036 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 3037 r = -ENOENT; 3038 amdgpu_put_xgmi_hive(hive); 3039 goto init_failed; 3040 } 3041 3042 /* Drop the early temporary reset domain we created for device */ 3043 amdgpu_reset_put_reset_domain(adev->reset_domain); 3044 adev->reset_domain = hive->reset_domain; 3045 amdgpu_put_xgmi_hive(hive); 3046 } 3047 } 3048 } 3049 3050 r = amdgpu_device_init_schedulers(adev); 3051 if (r) 3052 goto init_failed; 3053 3054 if (adev->mman.buffer_funcs_ring->sched.ready) 3055 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3056 3057 /* Don't init kfd if whole hive need to be reset during init */ 3058 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 3059 kgd2kfd_init_zone_device(adev); 3060 amdgpu_amdkfd_device_init(adev); 3061 } 3062 3063 amdgpu_fru_get_product_info(adev); 3064 3065 init_failed: 3066 3067 return r; 3068 } 3069 3070 /** 3071 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 3072 * 3073 * @adev: amdgpu_device pointer 3074 * 3075 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 3076 * this function before a GPU reset. If the value is retained after a 3077 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 3078 */ 3079 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 3080 { 3081 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 3082 } 3083 3084 /** 3085 * amdgpu_device_check_vram_lost - check if vram is valid 3086 * 3087 * @adev: amdgpu_device pointer 3088 * 3089 * Checks the reset magic value written to the gart pointer in VRAM. 3090 * The driver calls this after a GPU reset to see if the contents of 3091 * VRAM is lost or now. 3092 * returns true if vram is lost, false if not. 3093 */ 3094 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 3095 { 3096 if (memcmp(adev->gart.ptr, adev->reset_magic, 3097 AMDGPU_RESET_MAGIC_NUM)) 3098 return true; 3099 3100 if (!amdgpu_in_reset(adev)) 3101 return false; 3102 3103 /* 3104 * For all ASICs with baco/mode1 reset, the VRAM is 3105 * always assumed to be lost. 3106 */ 3107 switch (amdgpu_asic_reset_method(adev)) { 3108 case AMD_RESET_METHOD_BACO: 3109 case AMD_RESET_METHOD_MODE1: 3110 return true; 3111 default: 3112 return false; 3113 } 3114 } 3115 3116 /** 3117 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 3118 * 3119 * @adev: amdgpu_device pointer 3120 * @state: clockgating state (gate or ungate) 3121 * 3122 * The list of all the hardware IPs that make up the asic is walked and the 3123 * set_clockgating_state callbacks are run. 
3124 * Late initialization pass enabling clockgating for hardware IPs. 3125 * Fini or suspend, pass disabling clockgating for hardware IPs. 3126 * Returns 0 on success, negative error code on failure. 3127 */ 3128 3129 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 3130 enum amd_clockgating_state state) 3131 { 3132 int i, j, r; 3133 3134 if (amdgpu_emu_mode == 1) 3135 return 0; 3136 3137 for (j = 0; j < adev->num_ip_blocks; j++) { 3138 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3139 if (!adev->ip_blocks[i].status.late_initialized) 3140 continue; 3141 /* skip CG for GFX, SDMA on S0ix */ 3142 if (adev->in_s0ix && 3143 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3144 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3145 continue; 3146 /* skip CG for VCE/UVD, it's handled specially */ 3147 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3148 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3149 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3150 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3151 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 3152 /* enable clockgating to save power */ 3153 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i], 3154 state); 3155 if (r) { 3156 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 3157 adev->ip_blocks[i].version->funcs->name, r); 3158 return r; 3159 } 3160 } 3161 } 3162 3163 return 0; 3164 } 3165 3166 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 3167 enum amd_powergating_state state) 3168 { 3169 int i, j, r; 3170 3171 if (amdgpu_emu_mode == 1) 3172 return 0; 3173 3174 for (j = 0; j < adev->num_ip_blocks; j++) { 3175 i = state == AMD_PG_STATE_GATE ? 
j : adev->num_ip_blocks - j - 1; 3176 if (!adev->ip_blocks[i].status.late_initialized) 3177 continue; 3178 /* skip PG for GFX, SDMA on S0ix */ 3179 if (adev->in_s0ix && 3180 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3181 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3182 continue; 3183 /* skip CG for VCE/UVD, it's handled specially */ 3184 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3185 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3186 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3187 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3188 adev->ip_blocks[i].version->funcs->set_powergating_state) { 3189 /* enable powergating to save power */ 3190 r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i], 3191 state); 3192 if (r) { 3193 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 3194 adev->ip_blocks[i].version->funcs->name, r); 3195 return r; 3196 } 3197 } 3198 } 3199 return 0; 3200 } 3201 3202 static int amdgpu_device_enable_mgpu_fan_boost(void) 3203 { 3204 struct amdgpu_gpu_instance *gpu_ins; 3205 struct amdgpu_device *adev; 3206 int i, ret = 0; 3207 3208 mutex_lock(&mgpu_info.mutex); 3209 3210 /* 3211 * MGPU fan boost feature should be enabled 3212 * only when there are two or more dGPUs in 3213 * the system 3214 */ 3215 if (mgpu_info.num_dgpu < 2) 3216 goto out; 3217 3218 for (i = 0; i < mgpu_info.num_dgpu; i++) { 3219 gpu_ins = &(mgpu_info.gpu_ins[i]); 3220 adev = gpu_ins->adev; 3221 if (!(adev->flags & AMD_IS_APU) && 3222 !gpu_ins->mgpu_fan_enabled) { 3223 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 3224 if (ret) 3225 break; 3226 3227 gpu_ins->mgpu_fan_enabled = 1; 3228 } 3229 } 3230 3231 out: 3232 mutex_unlock(&mgpu_info.mutex); 3233 3234 return ret; 3235 } 3236 3237 /** 3238 * amdgpu_device_ip_late_init - run late init for hardware IPs 3239 * 3240 * @adev: amdgpu_device pointer 3241 * 3242 * Late initialization pass for hardware IPs. The list of all the hardware 3243 * IPs that make up the asic is walked and the late_init callbacks are run. 3244 * late_init covers any special initialization that an IP requires 3245 * after all of the have been initialized or something that needs to happen 3246 * late in the init process. 3247 * Returns 0 on success, negative error code on failure. 
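 *
 * Besides the per-IP late_init callbacks, this is also where RAS late init
 * runs, clockgating and powergating are enabled through
 * amdgpu_device_set_cg_state() and amdgpu_device_set_pg_state(), and the
 * reset magic used for VRAM-lost detection is written.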
3248 */ 3249 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 3250 { 3251 struct amdgpu_gpu_instance *gpu_instance; 3252 int i = 0, r; 3253 3254 for (i = 0; i < adev->num_ip_blocks; i++) { 3255 if (!adev->ip_blocks[i].status.hw) 3256 continue; 3257 if (adev->ip_blocks[i].version->funcs->late_init) { 3258 r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]); 3259 if (r) { 3260 DRM_ERROR("late_init of IP block <%s> failed %d\n", 3261 adev->ip_blocks[i].version->funcs->name, r); 3262 return r; 3263 } 3264 } 3265 adev->ip_blocks[i].status.late_initialized = true; 3266 } 3267 3268 r = amdgpu_ras_late_init(adev); 3269 if (r) { 3270 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 3271 return r; 3272 } 3273 3274 if (!amdgpu_reset_in_recovery(adev)) 3275 amdgpu_ras_set_error_query_ready(adev, true); 3276 3277 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3278 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3279 3280 amdgpu_device_fill_reset_magic(adev); 3281 3282 r = amdgpu_device_enable_mgpu_fan_boost(); 3283 if (r) 3284 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 3285 3286 /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */ 3287 if (amdgpu_passthrough(adev) && 3288 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 3289 adev->asic_type == CHIP_ALDEBARAN)) 3290 amdgpu_dpm_handle_passthrough_sbr(adev, true); 3291 3292 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3293 mutex_lock(&mgpu_info.mutex); 3294 3295 /* 3296 * Reset device p-state to low as this was booted with high. 3297 * 3298 * This should be performed only after all devices from the same 3299 * hive get initialized. 3300 * 3301 * However, it's unknown how many device in the hive in advance. 3302 * As this is counted one by one during devices initializations. 3303 * 3304 * So, we wait for all XGMI interlinked devices initialized. 3305 * This may bring some delays as those devices may come from 3306 * different hives. But that should be OK. 
3307 */ 3308 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3309 for (i = 0; i < mgpu_info.num_gpu; i++) { 3310 gpu_instance = &(mgpu_info.gpu_ins[i]); 3311 if (gpu_instance->adev->flags & AMD_IS_APU) 3312 continue; 3313 3314 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3315 AMDGPU_XGMI_PSTATE_MIN); 3316 if (r) { 3317 DRM_ERROR("pstate setting failed (%d).\n", r); 3318 break; 3319 } 3320 } 3321 } 3322 3323 mutex_unlock(&mgpu_info.mutex); 3324 } 3325 3326 return 0; 3327 } 3328 3329 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block) 3330 { 3331 int r; 3332 3333 if (!ip_block->version->funcs->hw_fini) { 3334 DRM_ERROR("hw_fini of IP block <%s> not defined\n", 3335 ip_block->version->funcs->name); 3336 } else { 3337 r = ip_block->version->funcs->hw_fini(ip_block); 3338 /* XXX handle errors */ 3339 if (r) { 3340 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3341 ip_block->version->funcs->name, r); 3342 } 3343 } 3344 3345 ip_block->status.hw = false; 3346 } 3347 3348 /** 3349 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3350 * 3351 * @adev: amdgpu_device pointer 3352 * 3353 * For ASICs need to disable SMC first 3354 */ 3355 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3356 { 3357 int i; 3358 3359 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3360 return; 3361 3362 for (i = 0; i < adev->num_ip_blocks; i++) { 3363 if (!adev->ip_blocks[i].status.hw) 3364 continue; 3365 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3366 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3367 break; 3368 } 3369 } 3370 } 3371 3372 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3373 { 3374 int i, r; 3375 3376 for (i = 0; i < adev->num_ip_blocks; i++) { 3377 if (!adev->ip_blocks[i].version->funcs->early_fini) 3378 continue; 3379 3380 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]); 3381 if (r) { 3382 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 3383 adev->ip_blocks[i].version->funcs->name, r); 3384 } 3385 } 3386 3387 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3388 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3389 3390 amdgpu_amdkfd_suspend(adev, false); 3391 3392 /* Workaround for ASICs need to disable SMC first */ 3393 amdgpu_device_smu_fini_early(adev); 3394 3395 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3396 if (!adev->ip_blocks[i].status.hw) 3397 continue; 3398 3399 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3400 } 3401 3402 if (amdgpu_sriov_vf(adev)) { 3403 if (amdgpu_virt_release_full_gpu(adev, false)) 3404 DRM_ERROR("failed to release exclusive mode on fini\n"); 3405 } 3406 3407 return 0; 3408 } 3409 3410 /** 3411 * amdgpu_device_ip_fini - run fini for hardware IPs 3412 * 3413 * @adev: amdgpu_device pointer 3414 * 3415 * Main teardown pass for hardware IPs. The list of all the hardware 3416 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3417 * are run. hw_fini tears down the hardware associated with each IP 3418 * and sw_fini tears down any software state associated with each IP. 3419 * Returns 0 on success, negative error code on failure. 
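 *
 * By the time this runs, hw_fini has normally already been done by
 * amdgpu_device_ip_fini_early(); this pass walks the blocks in reverse order
 * and releases the GMC-backed helpers (ucode BO, static CSA, writeback, IB
 * pool, seq64) right before the GMC block's own sw_fini.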
3420 */ 3421 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3422 { 3423 int i, r; 3424 3425 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3426 amdgpu_virt_release_ras_err_handler_data(adev); 3427 3428 if (adev->gmc.xgmi.num_physical_nodes > 1) 3429 amdgpu_xgmi_remove_device(adev); 3430 3431 amdgpu_amdkfd_device_fini_sw(adev); 3432 3433 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3434 if (!adev->ip_blocks[i].status.sw) 3435 continue; 3436 3437 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3438 amdgpu_ucode_free_bo(adev); 3439 amdgpu_free_static_csa(&adev->virt.csa_obj); 3440 amdgpu_device_wb_fini(adev); 3441 amdgpu_device_mem_scratch_fini(adev); 3442 amdgpu_ib_pool_fini(adev); 3443 amdgpu_seq64_fini(adev); 3444 } 3445 if (adev->ip_blocks[i].version->funcs->sw_fini) { 3446 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]); 3447 /* XXX handle errors */ 3448 if (r) { 3449 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 3450 adev->ip_blocks[i].version->funcs->name, r); 3451 } 3452 } 3453 adev->ip_blocks[i].status.sw = false; 3454 adev->ip_blocks[i].status.valid = false; 3455 } 3456 3457 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3458 if (!adev->ip_blocks[i].status.late_initialized) 3459 continue; 3460 if (adev->ip_blocks[i].version->funcs->late_fini) 3461 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]); 3462 adev->ip_blocks[i].status.late_initialized = false; 3463 } 3464 3465 amdgpu_ras_fini(adev); 3466 3467 return 0; 3468 } 3469 3470 /** 3471 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3472 * 3473 * @work: work_struct. 3474 */ 3475 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3476 { 3477 struct amdgpu_device *adev = 3478 container_of(work, struct amdgpu_device, delayed_init_work.work); 3479 int r; 3480 3481 r = amdgpu_ib_ring_tests(adev); 3482 if (r) 3483 DRM_ERROR("ib ring test failed (%d).\n", r); 3484 } 3485 3486 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3487 { 3488 struct amdgpu_device *adev = 3489 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3490 3491 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3492 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3493 3494 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true, 0)) 3495 adev->gfx.gfx_off_state = true; 3496 } 3497 3498 /** 3499 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3500 * 3501 * @adev: amdgpu_device pointer 3502 * 3503 * Main suspend function for hardware IPs. The list of all the hardware 3504 * IPs that make up the asic is walked, clockgating is disabled and the 3505 * suspend callbacks are run. suspend puts the hardware and software state 3506 * in each IP into a state suitable for suspend. 3507 * Returns 0 on success, negative error code on failure. 3508 */ 3509 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3510 { 3511 int i, r; 3512 3513 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3514 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3515 3516 /* 3517 * Per PMFW team's suggestion, driver needs to handle gfxoff 3518 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3519 * scenario. Add the missing df cstate disablement here. 
3520 */ 3521 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3522 dev_warn(adev->dev, "Failed to disallow df cstate"); 3523 3524 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3525 if (!adev->ip_blocks[i].status.valid) 3526 continue; 3527 3528 /* displays are handled separately */ 3529 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3530 continue; 3531 3532 /* XXX handle errors */ 3533 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3534 if (r) 3535 return r; 3536 } 3537 3538 return 0; 3539 } 3540 3541 /** 3542 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3543 * 3544 * @adev: amdgpu_device pointer 3545 * 3546 * Main suspend function for hardware IPs. The list of all the hardware 3547 * IPs that make up the asic is walked, clockgating is disabled and the 3548 * suspend callbacks are run. suspend puts the hardware and software state 3549 * in each IP into a state suitable for suspend. 3550 * Returns 0 on success, negative error code on failure. 3551 */ 3552 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3553 { 3554 int i, r; 3555 3556 if (adev->in_s0ix) 3557 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3558 3559 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3560 if (!adev->ip_blocks[i].status.valid) 3561 continue; 3562 /* displays are handled in phase1 */ 3563 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3564 continue; 3565 /* PSP lost connection when err_event_athub occurs */ 3566 if (amdgpu_ras_intr_triggered() && 3567 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3568 adev->ip_blocks[i].status.hw = false; 3569 continue; 3570 } 3571 3572 /* skip unnecessary suspend if we do not initialize them yet */ 3573 if (!amdgpu_ip_member_of_hwini( 3574 adev, adev->ip_blocks[i].version->type)) 3575 continue; 3576 3577 /* skip suspend of gfx/mes and psp for S0ix 3578 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3579 * like at runtime. PSP is also part of the always on hardware 3580 * so no need to suspend it. 3581 */ 3582 if (adev->in_s0ix && 3583 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3584 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3585 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3586 continue; 3587 3588 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3589 if (adev->in_s0ix && 3590 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3591 IP_VERSION(5, 0, 0)) && 3592 (adev->ip_blocks[i].version->type == 3593 AMD_IP_BLOCK_TYPE_SDMA)) 3594 continue; 3595 3596 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3597 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3598 * from this location and RLC Autoload automatically also gets loaded 3599 * from here based on PMFW -> PSP message during re-init sequence. 3600 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3601 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3602 */ 3603 if (amdgpu_in_reset(adev) && 3604 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3605 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3606 continue; 3607 3608 /* XXX handle errors */ 3609 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3610 adev->ip_blocks[i].status.hw = false; 3611 3612 /* handle putting the SMC in the appropriate state */ 3613 if (!amdgpu_sriov_vf(adev)) { 3614 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3615 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3616 if (r) { 3617 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3618 adev->mp1_state, r); 3619 return r; 3620 } 3621 } 3622 } 3623 } 3624 3625 return 0; 3626 } 3627 3628 /** 3629 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3630 * 3631 * @adev: amdgpu_device pointer 3632 * 3633 * Main suspend function for hardware IPs. The list of all the hardware 3634 * IPs that make up the asic is walked, clockgating is disabled and the 3635 * suspend callbacks are run. suspend puts the hardware and software state 3636 * in each IP into a state suitable for suspend. 3637 * Returns 0 on success, negative error code on failure. 3638 */ 3639 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3640 { 3641 int r; 3642 3643 if (amdgpu_sriov_vf(adev)) { 3644 amdgpu_virt_fini_data_exchange(adev); 3645 amdgpu_virt_request_full_gpu(adev, false); 3646 } 3647 3648 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3649 3650 r = amdgpu_device_ip_suspend_phase1(adev); 3651 if (r) 3652 return r; 3653 r = amdgpu_device_ip_suspend_phase2(adev); 3654 3655 if (amdgpu_sriov_vf(adev)) 3656 amdgpu_virt_release_full_gpu(adev, false); 3657 3658 return r; 3659 } 3660 3661 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3662 { 3663 int i, r; 3664 3665 static enum amd_ip_block_type ip_order[] = { 3666 AMD_IP_BLOCK_TYPE_COMMON, 3667 AMD_IP_BLOCK_TYPE_GMC, 3668 AMD_IP_BLOCK_TYPE_PSP, 3669 AMD_IP_BLOCK_TYPE_IH, 3670 }; 3671 3672 for (i = 0; i < adev->num_ip_blocks; i++) { 3673 int j; 3674 struct amdgpu_ip_block *block; 3675 3676 block = &adev->ip_blocks[i]; 3677 block->status.hw = false; 3678 3679 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3680 3681 if (block->version->type != ip_order[j] || 3682 !block->status.valid) 3683 continue; 3684 3685 r = block->version->funcs->hw_init(&adev->ip_blocks[i]); 3686 if (r) { 3687 dev_err(adev->dev, "RE-INIT-early: %s failed\n", 3688 block->version->funcs->name); 3689 return r; 3690 } 3691 block->status.hw = true; 3692 } 3693 } 3694 3695 return 0; 3696 } 3697 3698 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3699 { 3700 struct amdgpu_ip_block *block; 3701 int i, r = 0; 3702 3703 static enum amd_ip_block_type ip_order[] = { 3704 AMD_IP_BLOCK_TYPE_SMC, 3705 AMD_IP_BLOCK_TYPE_DCE, 3706 AMD_IP_BLOCK_TYPE_GFX, 3707 AMD_IP_BLOCK_TYPE_SDMA, 3708 AMD_IP_BLOCK_TYPE_MES, 3709 AMD_IP_BLOCK_TYPE_UVD, 3710 AMD_IP_BLOCK_TYPE_VCE, 3711 AMD_IP_BLOCK_TYPE_VCN, 3712 AMD_IP_BLOCK_TYPE_JPEG 3713 }; 3714 3715 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3716 block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]); 3717 3718 if (!block) 3719 continue; 3720 3721 if (block->status.valid && !block->status.hw) { 3722 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) { 3723 r = amdgpu_ip_block_resume(block); 3724 } else { 3725 r = block->version->funcs->hw_init(block); 3726 } 3727 3728 if (r) { 3729 dev_err(adev->dev, "RE-INIT-late: %s failed\n", 3730 block->version->funcs->name); 3731 break; 3732 } 3733 
block->status.hw = true; 3734 } 3735 } 3736 3737 return r; 3738 } 3739 3740 /** 3741 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3742 * 3743 * @adev: amdgpu_device pointer 3744 * 3745 * First resume function for hardware IPs. The list of all the hardware 3746 * IPs that make up the asic is walked and the resume callbacks are run for 3747 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3748 * after a suspend and updates the software state as necessary. This 3749 * function is also used for restoring the GPU after a GPU reset. 3750 * Returns 0 on success, negative error code on failure. 3751 */ 3752 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3753 { 3754 int i, r; 3755 3756 for (i = 0; i < adev->num_ip_blocks; i++) { 3757 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3758 continue; 3759 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3760 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3761 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3762 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3763 3764 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3765 if (r) 3766 return r; 3767 } 3768 } 3769 3770 return 0; 3771 } 3772 3773 /** 3774 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3775 * 3776 * @adev: amdgpu_device pointer 3777 * 3778 * Second resume function for hardware IPs. The list of all the hardware 3779 * IPs that make up the asic is walked and the resume callbacks are run for 3780 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3781 * functional state after a suspend and updates the software state as 3782 * necessary. This function is also used for restoring the GPU after a GPU 3783 * reset. 3784 * Returns 0 on success, negative error code on failure. 3785 */ 3786 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3787 { 3788 int i, r; 3789 3790 for (i = 0; i < adev->num_ip_blocks; i++) { 3791 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3792 continue; 3793 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3794 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3795 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3796 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE || 3797 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3798 continue; 3799 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3800 if (r) 3801 return r; 3802 } 3803 3804 return 0; 3805 } 3806 3807 /** 3808 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs 3809 * 3810 * @adev: amdgpu_device pointer 3811 * 3812 * Third resume function for hardware IPs. The list of all the hardware 3813 * IPs that make up the asic is walked and the resume callbacks are run for 3814 * all DCE. resume puts the hardware into a functional state after a suspend 3815 * and updates the software state as necessary. This function is also used 3816 * for restoring the GPU after a GPU reset. 3817 * 3818 * Returns 0 on success, negative error code on failure. 
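 *
 * The full ordering used by amdgpu_device_ip_resume() is roughly (sketch):
 *
 *   amdgpu_device_ip_resume_phase1()  - COMMON, GMC, IH (PSP for SR-IOV)
 *   amdgpu_device_fw_loading()        - reload PSP/SMU firmware
 *   amdgpu_device_ip_resume_phase2()  - everything except COMMON/GMC/IH/DCE/PSP
 *   amdgpu_fence_driver_hw_init()
 *   amdgpu_device_ip_resume_phase3()  - DCE (displays) last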
3819 */ 3820 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev) 3821 { 3822 int i, r; 3823 3824 for (i = 0; i < adev->num_ip_blocks; i++) { 3825 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3826 continue; 3827 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) { 3828 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3829 if (r) 3830 return r; 3831 } 3832 } 3833 3834 return 0; 3835 } 3836 3837 /** 3838 * amdgpu_device_ip_resume - run resume for hardware IPs 3839 * 3840 * @adev: amdgpu_device pointer 3841 * 3842 * Main resume function for hardware IPs. The hardware IPs 3843 * are split into multiple resume functions because they are 3844 * also used in recovering from a GPU reset and some additional 3845 * steps need to be taken between them. In this case (S3/S4) they are 3846 * run sequentially. 3847 * Returns 0 on success, negative error code on failure. 3848 */ 3849 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3850 { 3851 int r; 3852 3853 r = amdgpu_device_ip_resume_phase1(adev); 3854 if (r) 3855 return r; 3856 3857 r = amdgpu_device_fw_loading(adev); 3858 if (r) 3859 return r; 3860 3861 r = amdgpu_device_ip_resume_phase2(adev); 3862 3863 if (adev->mman.buffer_funcs_ring->sched.ready) 3864 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3865 3866 if (r) 3867 return r; 3868 3869 amdgpu_fence_driver_hw_init(adev); 3870 3871 r = amdgpu_device_ip_resume_phase3(adev); 3872 3873 return r; 3874 } 3875 3876 /** 3877 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3878 * 3879 * @adev: amdgpu_device pointer 3880 * 3881 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3882 */ 3883 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3884 { 3885 if (amdgpu_sriov_vf(adev)) { 3886 if (adev->is_atom_fw) { 3887 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3888 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3889 } else { 3890 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3891 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3892 } 3893 3894 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3895 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3896 } 3897 } 3898 3899 /** 3900 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3901 * 3902 * @asic_type: AMD asic type 3903 * 3904 * Check if there is DC (new modesetting infrastructure) support for an asic. 3905 * Returns true if DC has support, false if not. 3906 */ 3907 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3908 { 3909 switch (asic_type) { 3910 #ifdef CONFIG_DRM_AMDGPU_SI 3911 case CHIP_HAINAN: 3912 #endif 3913 case CHIP_TOPAZ: 3914 /* chips with no display hardware */ 3915 return false; 3916 #if defined(CONFIG_DRM_AMD_DC) 3917 case CHIP_TAHITI: 3918 case CHIP_PITCAIRN: 3919 case CHIP_VERDE: 3920 case CHIP_OLAND: 3921 /* 3922 * We have systems in the wild with these ASICs that require 3923 * LVDS and VGA support which is not supported with DC. 3924 * 3925 * Fallback to the non-DC driver here by default so as not to 3926 * cause regressions. 3927 */ 3928 #if defined(CONFIG_DRM_AMD_DC_SI) 3929 return amdgpu_dc > 0; 3930 #else 3931 return false; 3932 #endif 3933 case CHIP_BONAIRE: 3934 case CHIP_KAVERI: 3935 case CHIP_KABINI: 3936 case CHIP_MULLINS: 3937 /* 3938 * We have systems in the wild with these ASICs that require 3939 * VGA support which is not supported with DC.
3940 * 3941 * Fallback to the non-DC driver here by default so as not to 3942 * cause regressions. 3943 */ 3944 return amdgpu_dc > 0; 3945 default: 3946 return amdgpu_dc != 0; 3947 #else 3948 default: 3949 if (amdgpu_dc > 0) 3950 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 3951 return false; 3952 #endif 3953 } 3954 } 3955 3956 /** 3957 * amdgpu_device_has_dc_support - check if dc is supported 3958 * 3959 * @adev: amdgpu_device pointer 3960 * 3961 * Returns true for supported, false for not supported 3962 */ 3963 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3964 { 3965 if (adev->enable_virtual_display || 3966 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3967 return false; 3968 3969 return amdgpu_device_asic_has_dc_support(adev->asic_type); 3970 } 3971 3972 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3973 { 3974 struct amdgpu_device *adev = 3975 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3976 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3977 3978 /* It's a bug to not have a hive within this function */ 3979 if (WARN_ON(!hive)) 3980 return; 3981 3982 /* 3983 * Use task barrier to synchronize all xgmi reset works across the 3984 * hive. task_barrier_enter and task_barrier_exit will block 3985 * until all the threads running the xgmi reset works reach 3986 * those points. task_barrier_full will do both blocks. 3987 */ 3988 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 3989 3990 task_barrier_enter(&hive->tb); 3991 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 3992 3993 if (adev->asic_reset_res) 3994 goto fail; 3995 3996 task_barrier_exit(&hive->tb); 3997 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 3998 3999 if (adev->asic_reset_res) 4000 goto fail; 4001 4002 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 4003 } else { 4004 4005 task_barrier_full(&hive->tb); 4006 adev->asic_reset_res = amdgpu_asic_reset(adev); 4007 } 4008 4009 fail: 4010 if (adev->asic_reset_res) 4011 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 4012 adev->asic_reset_res, adev_to_drm(adev)->unique); 4013 amdgpu_put_xgmi_hive(hive); 4014 } 4015 4016 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 4017 { 4018 char *input = amdgpu_lockup_timeout; 4019 char *timeout_setting = NULL; 4020 int index = 0; 4021 long timeout; 4022 int ret = 0; 4023 4024 /* 4025 * By default timeout for non compute jobs is 10000 4026 * and 60000 for compute jobs. 4027 * In SR-IOV or passthrough mode, timeout for compute 4028 * jobs are 60000 by default. 4029 */ 4030 adev->gfx_timeout = msecs_to_jiffies(10000); 4031 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4032 if (amdgpu_sriov_vf(adev)) 4033 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
4034 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 4035 else 4036 adev->compute_timeout = msecs_to_jiffies(60000); 4037 4038 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4039 while ((timeout_setting = strsep(&input, ",")) && 4040 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4041 ret = kstrtol(timeout_setting, 0, &timeout); 4042 if (ret) 4043 return ret; 4044 4045 if (timeout == 0) { 4046 index++; 4047 continue; 4048 } else if (timeout < 0) { 4049 timeout = MAX_SCHEDULE_TIMEOUT; 4050 dev_warn(adev->dev, "lockup timeout disabled"); 4051 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 4052 } else { 4053 timeout = msecs_to_jiffies(timeout); 4054 } 4055 4056 switch (index++) { 4057 case 0: 4058 adev->gfx_timeout = timeout; 4059 break; 4060 case 1: 4061 adev->compute_timeout = timeout; 4062 break; 4063 case 2: 4064 adev->sdma_timeout = timeout; 4065 break; 4066 case 3: 4067 adev->video_timeout = timeout; 4068 break; 4069 default: 4070 break; 4071 } 4072 } 4073 /* 4074 * There is only one value specified and 4075 * it should apply to all non-compute jobs. 4076 */ 4077 if (index == 1) { 4078 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4079 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 4080 adev->compute_timeout = adev->gfx_timeout; 4081 } 4082 } 4083 4084 return ret; 4085 } 4086 4087 /** 4088 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 4089 * 4090 * @adev: amdgpu_device pointer 4091 * 4092 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 4093 */ 4094 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 4095 { 4096 struct iommu_domain *domain; 4097 4098 domain = iommu_get_domain_for_dev(adev->dev); 4099 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 4100 adev->ram_is_direct_mapped = true; 4101 } 4102 4103 #if defined(CONFIG_HSA_AMD_P2P) 4104 /** 4105 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled. 4106 * 4107 * @adev: amdgpu_device pointer 4108 * 4109 * return if IOMMU remapping bar address 4110 */ 4111 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev) 4112 { 4113 struct iommu_domain *domain; 4114 4115 domain = iommu_get_domain_for_dev(adev->dev); 4116 if (domain && (domain->type == IOMMU_DOMAIN_DMA || 4117 domain->type == IOMMU_DOMAIN_DMA_FQ)) 4118 return true; 4119 4120 return false; 4121 } 4122 #endif 4123 4124 static const struct attribute *amdgpu_dev_attributes[] = { 4125 &dev_attr_pcie_replay_count.attr, 4126 NULL 4127 }; 4128 4129 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 4130 { 4131 if (amdgpu_mcbp == 1) 4132 adev->gfx.mcbp = true; 4133 else if (amdgpu_mcbp == 0) 4134 adev->gfx.mcbp = false; 4135 4136 if (amdgpu_sriov_vf(adev)) 4137 adev->gfx.mcbp = true; 4138 4139 if (adev->gfx.mcbp) 4140 DRM_INFO("MCBP is enabled\n"); 4141 } 4142 4143 /** 4144 * amdgpu_device_init - initialize the driver 4145 * 4146 * @adev: amdgpu_device pointer 4147 * @flags: driver flags 4148 * 4149 * Initializes the driver info and hw (all asics). 4150 * Returns 0 for success or an error on failure. 4151 * Called at driver startup. 
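 * The teardown counterparts are amdgpu_device_fini_hw() and amdgpu_device_fini_sw().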
4152 */ 4153 int amdgpu_device_init(struct amdgpu_device *adev, 4154 uint32_t flags) 4155 { 4156 struct drm_device *ddev = adev_to_drm(adev); 4157 struct pci_dev *pdev = adev->pdev; 4158 int r, i; 4159 bool px = false; 4160 u32 max_MBps; 4161 int tmp; 4162 4163 adev->shutdown = false; 4164 adev->flags = flags; 4165 4166 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 4167 adev->asic_type = amdgpu_force_asic_type; 4168 else 4169 adev->asic_type = flags & AMD_ASIC_MASK; 4170 4171 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 4172 if (amdgpu_emu_mode == 1) 4173 adev->usec_timeout *= 10; 4174 adev->gmc.gart_size = 512 * 1024 * 1024; 4175 adev->accel_working = false; 4176 adev->num_rings = 0; 4177 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 4178 adev->mman.buffer_funcs = NULL; 4179 adev->mman.buffer_funcs_ring = NULL; 4180 adev->vm_manager.vm_pte_funcs = NULL; 4181 adev->vm_manager.vm_pte_num_scheds = 0; 4182 adev->gmc.gmc_funcs = NULL; 4183 adev->harvest_ip_mask = 0x0; 4184 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 4185 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 4186 4187 adev->smc_rreg = &amdgpu_invalid_rreg; 4188 adev->smc_wreg = &amdgpu_invalid_wreg; 4189 adev->pcie_rreg = &amdgpu_invalid_rreg; 4190 adev->pcie_wreg = &amdgpu_invalid_wreg; 4191 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 4192 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 4193 adev->pciep_rreg = &amdgpu_invalid_rreg; 4194 adev->pciep_wreg = &amdgpu_invalid_wreg; 4195 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 4196 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 4197 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 4198 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 4199 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 4200 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 4201 adev->didt_rreg = &amdgpu_invalid_rreg; 4202 adev->didt_wreg = &amdgpu_invalid_wreg; 4203 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 4204 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 4205 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 4206 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 4207 4208 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 4209 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 4210 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 4211 4212 /* mutex initialization are all done here so we 4213 * can recall function without having locking issues 4214 */ 4215 mutex_init(&adev->firmware.mutex); 4216 mutex_init(&adev->pm.mutex); 4217 mutex_init(&adev->gfx.gpu_clock_mutex); 4218 mutex_init(&adev->srbm_mutex); 4219 mutex_init(&adev->gfx.pipe_reserve_mutex); 4220 mutex_init(&adev->gfx.gfx_off_mutex); 4221 mutex_init(&adev->gfx.partition_mutex); 4222 mutex_init(&adev->grbm_idx_mutex); 4223 mutex_init(&adev->mn_lock); 4224 mutex_init(&adev->virt.vf_errors.lock); 4225 mutex_init(&adev->virt.rlcg_reg_lock); 4226 hash_init(adev->mn_hash); 4227 mutex_init(&adev->psp.mutex); 4228 mutex_init(&adev->notifier_lock); 4229 mutex_init(&adev->pm.stable_pstate_ctx_lock); 4230 mutex_init(&adev->benchmark_mutex); 4231 mutex_init(&adev->gfx.reset_sem_mutex); 4232 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */ 4233 mutex_init(&adev->enforce_isolation_mutex); 4234 mutex_init(&adev->gfx.kfd_sch_mutex); 4235 4236 amdgpu_device_init_apu_flags(adev); 4237 4238 r = amdgpu_device_check_arguments(adev); 4239 if (r) 4240 return r; 4241 4242 
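	/* Spinlocks protecting the indirect register index/data pairs and
	 * other shared bookkeeping (mm stats, writeback slot allocation, etc.).
	 */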
spin_lock_init(&adev->mmio_idx_lock); 4243 spin_lock_init(&adev->smc_idx_lock); 4244 spin_lock_init(&adev->pcie_idx_lock); 4245 spin_lock_init(&adev->uvd_ctx_idx_lock); 4246 spin_lock_init(&adev->didt_idx_lock); 4247 spin_lock_init(&adev->gc_cac_idx_lock); 4248 spin_lock_init(&adev->se_cac_idx_lock); 4249 spin_lock_init(&adev->audio_endpt_idx_lock); 4250 spin_lock_init(&adev->mm_stats.lock); 4251 spin_lock_init(&adev->wb.lock); 4252 4253 INIT_LIST_HEAD(&adev->reset_list); 4254 4255 INIT_LIST_HEAD(&adev->ras_list); 4256 4257 INIT_LIST_HEAD(&adev->pm.od_kobj_list); 4258 4259 INIT_DELAYED_WORK(&adev->delayed_init_work, 4260 amdgpu_device_delayed_init_work_handler); 4261 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 4262 amdgpu_device_delay_enable_gfx_off); 4263 /* 4264 * Initialize the enforce_isolation work structures for each XCP 4265 * partition. This work handler is responsible for enforcing shader 4266 * isolation on AMD GPUs. It counts the number of emitted fences for 4267 * each GFX and compute ring. If there are any fences, it schedules 4268 * the `enforce_isolation_work` to be run after a delay. If there are 4269 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the 4270 * runqueue. 4271 */ 4272 for (i = 0; i < MAX_XCP; i++) { 4273 INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work, 4274 amdgpu_gfx_enforce_isolation_handler); 4275 adev->gfx.enforce_isolation[i].adev = adev; 4276 adev->gfx.enforce_isolation[i].xcp_id = i; 4277 } 4278 4279 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 4280 4281 adev->gfx.gfx_off_req_count = 1; 4282 adev->gfx.gfx_off_residency = 0; 4283 adev->gfx.gfx_off_entrycount = 0; 4284 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 4285 4286 atomic_set(&adev->throttling_logging_enabled, 1); 4287 /* 4288 * If throttling continues, logging will be performed every minute 4289 * to avoid log flooding. "-1" is subtracted since the thermal 4290 * throttling interrupt comes every second. Thus, the total logging 4291 * interval is 59 seconds(retelimited printk interval) + 1(waiting 4292 * for throttling interrupt) = 60 seconds. 4293 */ 4294 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 4295 ratelimit_state_init(&adev->virt.ras_telemetry_rs, 5 * HZ, 1); 4296 4297 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 4298 ratelimit_set_flags(&adev->virt.ras_telemetry_rs, RATELIMIT_MSG_ON_RELEASE); 4299 4300 /* Registers mapping */ 4301 /* TODO: block userspace mapping of io register */ 4302 if (adev->asic_type >= CHIP_BONAIRE) { 4303 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 4304 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 4305 } else { 4306 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 4307 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 4308 } 4309 4310 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 4311 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 4312 4313 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 4314 if (!adev->rmmio) 4315 return -ENOMEM; 4316 4317 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 4318 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size); 4319 4320 /* 4321 * Reset domain needs to be present early, before XGMI hive discovered 4322 * (if any) and initialized to use reset sem and in_gpu reset flag 4323 * early on during init and before calling to RREG32. 
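 * If the device later joins an XGMI hive, this per-device reset domain is replaced by the hive-wide one.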
4324 */ 4325 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 4326 if (!adev->reset_domain) 4327 return -ENOMEM; 4328 4329 /* detect hw virtualization here */ 4330 amdgpu_detect_virtualization(adev); 4331 4332 amdgpu_device_get_pcie_info(adev); 4333 4334 r = amdgpu_device_get_job_timeout_settings(adev); 4335 if (r) { 4336 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 4337 return r; 4338 } 4339 4340 amdgpu_device_set_mcbp(adev); 4341 4342 /* 4343 * By default, use default mode where all blocks are expected to be 4344 * initialized. At present a 'swinit' of blocks is required to be 4345 * completed before the need for a different level is detected. 4346 */ 4347 amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT); 4348 /* early init functions */ 4349 r = amdgpu_device_ip_early_init(adev); 4350 if (r) 4351 return r; 4352 4353 /* Get rid of things like offb */ 4354 r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name); 4355 if (r) 4356 return r; 4357 4358 /* Enable TMZ based on IP_VERSION */ 4359 amdgpu_gmc_tmz_set(adev); 4360 4361 if (amdgpu_sriov_vf(adev) && 4362 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0)) 4363 /* VF MMIO access (except mailbox range) from CPU 4364 * will be blocked during sriov runtime 4365 */ 4366 adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT; 4367 4368 amdgpu_gmc_noretry_set(adev); 4369 /* Need to get xgmi info early to decide the reset behavior*/ 4370 if (adev->gmc.xgmi.supported) { 4371 r = adev->gfxhub.funcs->get_xgmi_info(adev); 4372 if (r) 4373 return r; 4374 } 4375 4376 /* enable PCIE atomic ops */ 4377 if (amdgpu_sriov_vf(adev)) { 4378 if (adev->virt.fw_reserve.p_pf2vf) 4379 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 4380 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 4381 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4382 /* APUs w/ gfx9 onwards doesn't reply on PCIe atomics, rather it is a 4383 * internal path natively support atomics, set have_atomics_support to true. 4384 */ 4385 } else if ((adev->flags & AMD_IS_APU) && 4386 (amdgpu_ip_version(adev, GC_HWIP, 0) > 4387 IP_VERSION(9, 0, 0))) { 4388 adev->have_atomics_support = true; 4389 } else { 4390 adev->have_atomics_support = 4391 !pci_enable_atomic_ops_to_root(adev->pdev, 4392 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 4393 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4394 } 4395 4396 if (!adev->have_atomics_support) 4397 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 4398 4399 /* doorbell bar mapping and doorbell index init*/ 4400 amdgpu_doorbell_init(adev); 4401 4402 if (amdgpu_emu_mode == 1) { 4403 /* post the asic on emulation mode */ 4404 emu_soc_asic_init(adev); 4405 goto fence_driver_init; 4406 } 4407 4408 amdgpu_reset_init(adev); 4409 4410 /* detect if we are with an SRIOV vbios */ 4411 if (adev->bios) 4412 amdgpu_device_detect_sriov_bios(adev); 4413 4414 /* check if we need to reset the asic 4415 * E.g., driver was not cleanly unloaded previously, etc. 
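 * For devices in an XGMI hive the reset is not performed here; the device is brought up at AMDGPU_INIT_LEVEL_MINIMAL_XGMI and a hive-wide reset is issued later during init.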
4416 */ 4417 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4418 if (adev->gmc.xgmi.num_physical_nodes) { 4419 dev_info(adev->dev, "Pending hive reset.\n"); 4420 amdgpu_set_init_level(adev, 4421 AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 4422 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && 4423 !amdgpu_device_has_display_hardware(adev)) { 4424 r = psp_gpu_reset(adev); 4425 } else { 4426 tmp = amdgpu_reset_method; 4427 /* It should do a default reset when loading or reloading the driver, 4428 * regardless of the module parameter reset_method. 4429 */ 4430 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4431 r = amdgpu_asic_reset(adev); 4432 amdgpu_reset_method = tmp; 4433 } 4434 4435 if (r) { 4436 dev_err(adev->dev, "asic reset on init failed\n"); 4437 goto failed; 4438 } 4439 } 4440 4441 /* Post card if necessary */ 4442 if (amdgpu_device_need_post(adev)) { 4443 if (!adev->bios) { 4444 dev_err(adev->dev, "no vBIOS found\n"); 4445 r = -EINVAL; 4446 goto failed; 4447 } 4448 DRM_INFO("GPU posting now...\n"); 4449 r = amdgpu_device_asic_init(adev); 4450 if (r) { 4451 dev_err(adev->dev, "gpu post error!\n"); 4452 goto failed; 4453 } 4454 } 4455 4456 if (adev->bios) { 4457 if (adev->is_atom_fw) { 4458 /* Initialize clocks */ 4459 r = amdgpu_atomfirmware_get_clock_info(adev); 4460 if (r) { 4461 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4462 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4463 goto failed; 4464 } 4465 } else { 4466 /* Initialize clocks */ 4467 r = amdgpu_atombios_get_clock_info(adev); 4468 if (r) { 4469 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4470 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4471 goto failed; 4472 } 4473 /* init i2c buses */ 4474 if (!amdgpu_device_has_dc_support(adev)) 4475 amdgpu_atombios_i2c_init(adev); 4476 } 4477 } 4478 4479 fence_driver_init: 4480 /* Fence driver */ 4481 r = amdgpu_fence_driver_sw_init(adev); 4482 if (r) { 4483 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4484 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4485 goto failed; 4486 } 4487 4488 /* init the mode config */ 4489 drm_mode_config_init(adev_to_drm(adev)); 4490 4491 r = amdgpu_device_ip_init(adev); 4492 if (r) { 4493 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4494 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4495 goto release_ras_con; 4496 } 4497 4498 amdgpu_fence_driver_hw_init(adev); 4499 4500 dev_info(adev->dev, 4501 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4502 adev->gfx.config.max_shader_engines, 4503 adev->gfx.config.max_sh_per_se, 4504 adev->gfx.config.max_cu_per_sh, 4505 adev->gfx.cu_info.number); 4506 4507 adev->accel_working = true; 4508 4509 amdgpu_vm_check_compute_bug(adev); 4510 4511 /* Initialize the buffer migration limit. */ 4512 if (amdgpu_moverate >= 0) 4513 max_MBps = amdgpu_moverate; 4514 else 4515 max_MBps = 8; /* Allow 8 MB/s. */ 4516 /* Get a log2 for easy divisions. */ 4517 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4518 4519 /* 4520 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4521 * Otherwise the mgpu fan boost feature will be skipped due to the 4522 * gpu instance is counted less. 4523 */ 4524 amdgpu_register_gpu_instance(adev); 4525 4526 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4527 * explicit gating rather than handling it automatically. 
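 * This is skipped below while the device sits at AMDGPU_INIT_LEVEL_MINIMAL_XGMI waiting for the pending hive reset.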
4528 */ 4529 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 4530 r = amdgpu_device_ip_late_init(adev); 4531 if (r) { 4532 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4533 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4534 goto release_ras_con; 4535 } 4536 /* must succeed. */ 4537 amdgpu_ras_resume(adev); 4538 queue_delayed_work(system_wq, &adev->delayed_init_work, 4539 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4540 } 4541 4542 if (amdgpu_sriov_vf(adev)) { 4543 amdgpu_virt_release_full_gpu(adev, true); 4544 flush_delayed_work(&adev->delayed_init_work); 4545 } 4546 4547 /* 4548 * Place those sysfs registering after `late_init`. As some of those 4549 * operations performed in `late_init` might affect the sysfs 4550 * interfaces creating. 4551 */ 4552 r = amdgpu_atombios_sysfs_init(adev); 4553 if (r) 4554 drm_err(&adev->ddev, 4555 "registering atombios sysfs failed (%d).\n", r); 4556 4557 r = amdgpu_pm_sysfs_init(adev); 4558 if (r) 4559 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 4560 4561 r = amdgpu_ucode_sysfs_init(adev); 4562 if (r) { 4563 adev->ucode_sysfs_en = false; 4564 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 4565 } else 4566 adev->ucode_sysfs_en = true; 4567 4568 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); 4569 if (r) 4570 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4571 4572 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4573 if (r) 4574 dev_err(adev->dev, 4575 "Could not create amdgpu board attributes\n"); 4576 4577 amdgpu_fru_sysfs_init(adev); 4578 amdgpu_reg_state_sysfs_init(adev); 4579 amdgpu_xcp_cfg_sysfs_init(adev); 4580 4581 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4582 r = amdgpu_pmu_init(adev); 4583 if (r) 4584 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4585 4586 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4587 if (amdgpu_device_cache_pci_state(adev->pdev)) 4588 pci_restore_state(pdev); 4589 4590 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4591 /* this will fail for cards that aren't VGA class devices, just 4592 * ignore it 4593 */ 4594 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4595 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4596 4597 px = amdgpu_device_supports_px(ddev); 4598 4599 if (px || (!dev_is_removable(&adev->pdev->dev) && 4600 apple_gmux_detect(NULL, NULL))) 4601 vga_switcheroo_register_client(adev->pdev, 4602 &amdgpu_switcheroo_ops, px); 4603 4604 if (px) 4605 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4606 4607 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 4608 amdgpu_xgmi_reset_on_init(adev); 4609 4610 amdgpu_device_check_iommu_direct_map(adev); 4611 4612 adev->pm_nb.notifier_call = amdgpu_device_pm_notifier; 4613 r = register_pm_notifier(&adev->pm_nb); 4614 if (r) 4615 goto failed; 4616 4617 return 0; 4618 4619 release_ras_con: 4620 if (amdgpu_sriov_vf(adev)) 4621 amdgpu_virt_release_full_gpu(adev, true); 4622 4623 /* failed in exclusive mode due to timeout */ 4624 if (amdgpu_sriov_vf(adev) && 4625 !amdgpu_sriov_runtime(adev) && 4626 amdgpu_virt_mmio_blocked(adev) && 4627 !amdgpu_virt_wait_reset(adev)) { 4628 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4629 /* Don't send request since VF is inactive. 
*/ 4630 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4631 adev->virt.ops = NULL; 4632 r = -EAGAIN; 4633 } 4634 amdgpu_release_ras_context(adev); 4635 4636 failed: 4637 amdgpu_vf_error_trans_all(adev); 4638 4639 return r; 4640 } 4641 4642 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4643 { 4644 4645 /* Clear all CPU mappings pointing to this device */ 4646 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4647 4648 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4649 amdgpu_doorbell_fini(adev); 4650 4651 iounmap(adev->rmmio); 4652 adev->rmmio = NULL; 4653 if (adev->mman.aper_base_kaddr) 4654 iounmap(adev->mman.aper_base_kaddr); 4655 adev->mman.aper_base_kaddr = NULL; 4656 4657 /* Memory manager related */ 4658 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4659 arch_phys_wc_del(adev->gmc.vram_mtrr); 4660 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4661 } 4662 } 4663 4664 /** 4665 * amdgpu_device_fini_hw - tear down the driver 4666 * 4667 * @adev: amdgpu_device pointer 4668 * 4669 * Tear down the driver info (all asics). 4670 * Called at driver shutdown. 4671 */ 4672 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4673 { 4674 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4675 flush_delayed_work(&adev->delayed_init_work); 4676 4677 if (adev->mman.initialized) 4678 drain_workqueue(adev->mman.bdev.wq); 4679 adev->shutdown = true; 4680 4681 unregister_pm_notifier(&adev->pm_nb); 4682 4683 /* make sure IB test finished before entering exclusive mode 4684 * to avoid preemption on IB test 4685 */ 4686 if (amdgpu_sriov_vf(adev)) { 4687 amdgpu_virt_request_full_gpu(adev, false); 4688 amdgpu_virt_fini_data_exchange(adev); 4689 } 4690 4691 /* disable all interrupts */ 4692 amdgpu_irq_disable_all(adev); 4693 if (adev->mode_info.mode_config_initialized) { 4694 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4695 drm_helper_force_disable_all(adev_to_drm(adev)); 4696 else 4697 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4698 } 4699 amdgpu_fence_driver_hw_fini(adev); 4700 4701 if (adev->pm.sysfs_initialized) 4702 amdgpu_pm_sysfs_fini(adev); 4703 if (adev->ucode_sysfs_en) 4704 amdgpu_ucode_sysfs_fini(adev); 4705 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); 4706 amdgpu_fru_sysfs_fini(adev); 4707 4708 amdgpu_reg_state_sysfs_fini(adev); 4709 amdgpu_xcp_cfg_sysfs_fini(adev); 4710 4711 /* disable ras feature must before hw fini */ 4712 amdgpu_ras_pre_fini(adev); 4713 4714 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4715 4716 amdgpu_device_ip_fini_early(adev); 4717 4718 amdgpu_irq_fini_hw(adev); 4719 4720 if (adev->mman.initialized) 4721 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4722 4723 amdgpu_gart_dummy_page_fini(adev); 4724 4725 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4726 amdgpu_device_unmap_mmio(adev); 4727 4728 } 4729 4730 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4731 { 4732 int idx; 4733 bool px; 4734 4735 amdgpu_device_ip_fini(adev); 4736 amdgpu_fence_driver_sw_fini(adev); 4737 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4738 adev->accel_working = false; 4739 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4740 4741 amdgpu_reset_fini(adev); 4742 4743 /* free i2c buses */ 4744 if (!amdgpu_device_has_dc_support(adev)) 4745 amdgpu_i2c_fini(adev); 4746 4747 if (amdgpu_emu_mode != 1) 4748 amdgpu_atombios_fini(adev); 4749 4750 kfree(adev->bios); 4751 adev->bios = NULL; 4752 4753 kfree(adev->fru_info); 4754 adev->fru_info = NULL; 4755 
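	/* Unregister from vga_switcheroo and the VGA arbiter; this mirrors the
	 * registration done near the end of amdgpu_device_init().
	 */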
4756 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4757 4758 if (px || (!dev_is_removable(&adev->pdev->dev) && 4759 apple_gmux_detect(NULL, NULL))) 4760 vga_switcheroo_unregister_client(adev->pdev); 4761 4762 if (px) 4763 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4764 4765 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4766 vga_client_unregister(adev->pdev); 4767 4768 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4769 4770 iounmap(adev->rmmio); 4771 adev->rmmio = NULL; 4772 amdgpu_doorbell_fini(adev); 4773 drm_dev_exit(idx); 4774 } 4775 4776 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4777 amdgpu_pmu_fini(adev); 4778 if (adev->mman.discovery_bin) 4779 amdgpu_discovery_fini(adev); 4780 4781 amdgpu_reset_put_reset_domain(adev->reset_domain); 4782 adev->reset_domain = NULL; 4783 4784 kfree(adev->pci_state); 4785 4786 } 4787 4788 /** 4789 * amdgpu_device_evict_resources - evict device resources 4790 * @adev: amdgpu device object 4791 * 4792 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4793 * of the vram memory type. Mainly used for evicting device resources 4794 * at suspend time. 4795 * 4796 */ 4797 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4798 { 4799 int ret; 4800 4801 /* No need to evict vram on APUs unless going to S4 */ 4802 if (!adev->in_s4 && (adev->flags & AMD_IS_APU)) 4803 return 0; 4804 4805 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4806 if (ret) 4807 DRM_WARN("evicting device resources failed\n"); 4808 return ret; 4809 } 4810 4811 /* 4812 * Suspend & resume. 4813 */ 4814 /** 4815 * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events 4816 * @nb: notifier block 4817 * @mode: suspend mode 4818 * @data: data 4819 * 4820 * This function is called when the system is about to suspend or hibernate. 4821 * It is used to evict resources from the device before the system goes to 4822 * sleep while there is still access to swap. 4823 */ 4824 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 4825 void *data) 4826 { 4827 struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb); 4828 int r; 4829 4830 switch (mode) { 4831 case PM_HIBERNATION_PREPARE: 4832 adev->in_s4 = true; 4833 fallthrough; 4834 case PM_SUSPEND_PREPARE: 4835 r = amdgpu_device_evict_resources(adev); 4836 /* 4837 * This is considered non-fatal at this time because 4838 * amdgpu_device_prepare() will also fatally evict resources. 4839 * See https://gitlab.freedesktop.org/drm/amd/-/issues/3781 4840 */ 4841 if (r) 4842 drm_warn(adev_to_drm(adev), "Failed to evict resources, freeze active processes if problems occur: %d\n", r); 4843 break; 4844 } 4845 4846 return NOTIFY_DONE; 4847 } 4848 4849 /** 4850 * amdgpu_device_prepare - prepare for device suspend 4851 * 4852 * @dev: drm dev pointer 4853 * 4854 * Prepare to put the hw in the suspend state (all asics). 4855 * Returns 0 for success or an error on failure. 4856 * Called at driver suspend. 
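 * This runs each IP block's prepare_suspend callback and evicts the bulk of VRAM buffer objects ahead of the actual suspend sequence.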
4857 */ 4858 int amdgpu_device_prepare(struct drm_device *dev) 4859 { 4860 struct amdgpu_device *adev = drm_to_adev(dev); 4861 int i, r; 4862 4863 amdgpu_choose_low_power_state(adev); 4864 4865 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4866 return 0; 4867 4868 /* Evict the majority of BOs before starting suspend sequence */ 4869 r = amdgpu_device_evict_resources(adev); 4870 if (r) 4871 goto unprepare; 4872 4873 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 4874 4875 for (i = 0; i < adev->num_ip_blocks; i++) { 4876 if (!adev->ip_blocks[i].status.valid) 4877 continue; 4878 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 4879 continue; 4880 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]); 4881 if (r) 4882 goto unprepare; 4883 } 4884 4885 return 0; 4886 4887 unprepare: 4888 adev->in_s0ix = adev->in_s3 = adev->in_s4 = false; 4889 4890 return r; 4891 } 4892 4893 /** 4894 * amdgpu_device_suspend - initiate device suspend 4895 * 4896 * @dev: drm dev pointer 4897 * @notify_clients: notify in-kernel DRM clients 4898 * 4899 * Puts the hw in the suspend state (all asics). 4900 * Returns 0 for success or an error on failure. 4901 * Called at driver suspend. 4902 */ 4903 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) 4904 { 4905 struct amdgpu_device *adev = drm_to_adev(dev); 4906 int r = 0; 4907 4908 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4909 return 0; 4910 4911 adev->in_suspend = true; 4912 4913 if (amdgpu_sriov_vf(adev)) { 4914 amdgpu_virt_fini_data_exchange(adev); 4915 r = amdgpu_virt_request_full_gpu(adev, false); 4916 if (r) 4917 return r; 4918 } 4919 4920 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 4921 DRM_WARN("smart shift update failed\n"); 4922 4923 if (notify_clients) 4924 drm_client_dev_suspend(adev_to_drm(adev), false); 4925 4926 cancel_delayed_work_sync(&adev->delayed_init_work); 4927 4928 amdgpu_ras_suspend(adev); 4929 4930 amdgpu_device_ip_suspend_phase1(adev); 4931 4932 if (!adev->in_s0ix) 4933 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 4934 4935 r = amdgpu_device_evict_resources(adev); 4936 if (r) 4937 return r; 4938 4939 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4940 4941 amdgpu_fence_driver_hw_fini(adev); 4942 4943 amdgpu_device_ip_suspend_phase2(adev); 4944 4945 if (amdgpu_sriov_vf(adev)) 4946 amdgpu_virt_release_full_gpu(adev, false); 4947 4948 r = amdgpu_dpm_notify_rlc_state(adev, false); 4949 if (r) 4950 return r; 4951 4952 return 0; 4953 } 4954 4955 /** 4956 * amdgpu_device_resume - initiate device resume 4957 * 4958 * @dev: drm dev pointer 4959 * @notify_clients: notify in-kernel DRM clients 4960 * 4961 * Bring the hw back to operating state (all asics). 4962 * Returns 0 for success or an error on failure. 4963 * Called at driver resume. 
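 * For SR-IOV VFs, full GPU access is requested from the host for the duration of the resume.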
4964 */ 4965 int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) 4966 { 4967 struct amdgpu_device *adev = drm_to_adev(dev); 4968 int r = 0; 4969 4970 if (amdgpu_sriov_vf(adev)) { 4971 r = amdgpu_virt_request_full_gpu(adev, true); 4972 if (r) 4973 return r; 4974 } 4975 4976 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4977 return 0; 4978 4979 if (adev->in_s0ix) 4980 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 4981 4982 /* post card */ 4983 if (amdgpu_device_need_post(adev)) { 4984 r = amdgpu_device_asic_init(adev); 4985 if (r) 4986 dev_err(adev->dev, "amdgpu asic init failed\n"); 4987 } 4988 4989 r = amdgpu_device_ip_resume(adev); 4990 4991 if (r) { 4992 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 4993 goto exit; 4994 } 4995 4996 if (!adev->in_s0ix) { 4997 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 4998 if (r) 4999 goto exit; 5000 } 5001 5002 r = amdgpu_device_ip_late_init(adev); 5003 if (r) 5004 goto exit; 5005 5006 queue_delayed_work(system_wq, &adev->delayed_init_work, 5007 msecs_to_jiffies(AMDGPU_RESUME_MS)); 5008 exit: 5009 if (amdgpu_sriov_vf(adev)) { 5010 amdgpu_virt_init_data_exchange(adev); 5011 amdgpu_virt_release_full_gpu(adev, true); 5012 } 5013 5014 if (r) 5015 return r; 5016 5017 /* Make sure IB tests flushed */ 5018 flush_delayed_work(&adev->delayed_init_work); 5019 5020 if (notify_clients) 5021 drm_client_dev_resume(adev_to_drm(adev), false); 5022 5023 amdgpu_ras_resume(adev); 5024 5025 if (adev->mode_info.num_crtc) { 5026 /* 5027 * Most of the connector probing functions try to acquire runtime pm 5028 * refs to ensure that the GPU is powered on when connector polling is 5029 * performed. Since we're calling this from a runtime PM callback, 5030 * trying to acquire rpm refs will cause us to deadlock. 5031 * 5032 * Since we're guaranteed to be holding the rpm lock, it's safe to 5033 * temporarily disable the rpm helpers so this doesn't deadlock us. 5034 */ 5035 #ifdef CONFIG_PM 5036 dev->dev->power.disable_depth++; 5037 #endif 5038 if (!adev->dc_enabled) 5039 drm_helper_hpd_irq_event(dev); 5040 else 5041 drm_kms_helper_hotplug_event(dev); 5042 #ifdef CONFIG_PM 5043 dev->dev->power.disable_depth--; 5044 #endif 5045 } 5046 adev->in_suspend = false; 5047 5048 if (adev->enable_mes) 5049 amdgpu_mes_self_test(adev); 5050 5051 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 5052 DRM_WARN("smart shift update failed\n"); 5053 5054 return 0; 5055 } 5056 5057 /** 5058 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 5059 * 5060 * @adev: amdgpu_device pointer 5061 * 5062 * The list of all the hardware IPs that make up the asic is walked and 5063 * the check_soft_reset callbacks are run. check_soft_reset determines 5064 * if the asic is still hung or not. 5065 * Returns true if any of the IPs are still in a hung state, false if not. 
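 * Note that SR-IOV VFs and ASICs that require a full reset always report a hang here, so recovery falls through to a full reset.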
5066 */ 5067 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 5068 { 5069 int i; 5070 bool asic_hang = false; 5071 5072 if (amdgpu_sriov_vf(adev)) 5073 return true; 5074 5075 if (amdgpu_asic_need_full_reset(adev)) 5076 return true; 5077 5078 for (i = 0; i < adev->num_ip_blocks; i++) { 5079 if (!adev->ip_blocks[i].status.valid) 5080 continue; 5081 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 5082 adev->ip_blocks[i].status.hang = 5083 adev->ip_blocks[i].version->funcs->check_soft_reset( 5084 &adev->ip_blocks[i]); 5085 if (adev->ip_blocks[i].status.hang) { 5086 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 5087 asic_hang = true; 5088 } 5089 } 5090 return asic_hang; 5091 } 5092 5093 /** 5094 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 5095 * 5096 * @adev: amdgpu_device pointer 5097 * 5098 * The list of all the hardware IPs that make up the asic is walked and the 5099 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 5100 * handles any IP specific hardware or software state changes that are 5101 * necessary for a soft reset to succeed. 5102 * Returns 0 on success, negative error code on failure. 5103 */ 5104 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 5105 { 5106 int i, r = 0; 5107 5108 for (i = 0; i < adev->num_ip_blocks; i++) { 5109 if (!adev->ip_blocks[i].status.valid) 5110 continue; 5111 if (adev->ip_blocks[i].status.hang && 5112 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 5113 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]); 5114 if (r) 5115 return r; 5116 } 5117 } 5118 5119 return 0; 5120 } 5121 5122 /** 5123 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 5124 * 5125 * @adev: amdgpu_device pointer 5126 * 5127 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 5128 * reset is necessary to recover. 5129 * Returns true if a full asic reset is required, false if not. 5130 */ 5131 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 5132 { 5133 int i; 5134 5135 if (amdgpu_asic_need_full_reset(adev)) 5136 return true; 5137 5138 for (i = 0; i < adev->num_ip_blocks; i++) { 5139 if (!adev->ip_blocks[i].status.valid) 5140 continue; 5141 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 5142 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 5143 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 5144 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 5145 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 5146 if (adev->ip_blocks[i].status.hang) { 5147 dev_info(adev->dev, "Some block need full reset!\n"); 5148 return true; 5149 } 5150 } 5151 } 5152 return false; 5153 } 5154 5155 /** 5156 * amdgpu_device_ip_soft_reset - do a soft reset 5157 * 5158 * @adev: amdgpu_device pointer 5159 * 5160 * The list of all the hardware IPs that make up the asic is walked and the 5161 * soft_reset callbacks are run if the block is hung. soft_reset handles any 5162 * IP specific hardware or software state changes that are necessary to soft 5163 * reset the IP. 5164 * Returns 0 on success, negative error code on failure. 
5165 */ 5166 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 5167 { 5168 int i, r = 0; 5169 5170 for (i = 0; i < adev->num_ip_blocks; i++) { 5171 if (!adev->ip_blocks[i].status.valid) 5172 continue; 5173 if (adev->ip_blocks[i].status.hang && 5174 adev->ip_blocks[i].version->funcs->soft_reset) { 5175 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]); 5176 if (r) 5177 return r; 5178 } 5179 } 5180 5181 return 0; 5182 } 5183 5184 /** 5185 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 5186 * 5187 * @adev: amdgpu_device pointer 5188 * 5189 * The list of all the hardware IPs that make up the asic is walked and the 5190 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 5191 * handles any IP specific hardware or software state changes that are 5192 * necessary after the IP has been soft reset. 5193 * Returns 0 on success, negative error code on failure. 5194 */ 5195 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 5196 { 5197 int i, r = 0; 5198 5199 for (i = 0; i < adev->num_ip_blocks; i++) { 5200 if (!adev->ip_blocks[i].status.valid) 5201 continue; 5202 if (adev->ip_blocks[i].status.hang && 5203 adev->ip_blocks[i].version->funcs->post_soft_reset) 5204 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]); 5205 if (r) 5206 return r; 5207 } 5208 5209 return 0; 5210 } 5211 5212 /** 5213 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 5214 * 5215 * @adev: amdgpu_device pointer 5216 * @reset_context: amdgpu reset context pointer 5217 * 5218 * do VF FLR and reinitialize Asic 5219 * return 0 means succeeded otherwise failed 5220 */ 5221 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 5222 struct amdgpu_reset_context *reset_context) 5223 { 5224 int r; 5225 struct amdgpu_hive_info *hive = NULL; 5226 5227 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { 5228 if (!amdgpu_ras_get_fed_status(adev)) 5229 amdgpu_virt_ready_to_reset(adev); 5230 amdgpu_virt_wait_reset(adev); 5231 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5232 r = amdgpu_virt_request_full_gpu(adev, true); 5233 } else { 5234 r = amdgpu_virt_reset_gpu(adev); 5235 } 5236 if (r) 5237 return r; 5238 5239 amdgpu_ras_clear_err_state(adev); 5240 amdgpu_irq_gpu_reset_resume_helper(adev); 5241 5242 /* some sw clean up VF needs to do before recover */ 5243 amdgpu_virt_post_reset(adev); 5244 5245 /* Resume IP prior to SMC */ 5246 r = amdgpu_device_ip_reinit_early_sriov(adev); 5247 if (r) 5248 return r; 5249 5250 amdgpu_virt_init_data_exchange(adev); 5251 5252 r = amdgpu_device_fw_loading(adev); 5253 if (r) 5254 return r; 5255 5256 /* now we are okay to resume SMC/CP/SDMA */ 5257 r = amdgpu_device_ip_reinit_late_sriov(adev); 5258 if (r) 5259 return r; 5260 5261 hive = amdgpu_get_xgmi_hive(adev); 5262 /* Update PSP FW topology after reset */ 5263 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5264 r = amdgpu_xgmi_update_topology(hive, adev); 5265 if (hive) 5266 amdgpu_put_xgmi_hive(hive); 5267 if (r) 5268 return r; 5269 5270 r = amdgpu_ib_ring_tests(adev); 5271 if (r) 5272 return r; 5273 5274 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) 5275 amdgpu_inc_vram_lost(adev); 5276 5277 /* need to be called during full access so we can't do it later like 5278 * bare-metal does. 
5279 */ 5280 amdgpu_amdkfd_post_reset(adev); 5281 amdgpu_virt_release_full_gpu(adev, true); 5282 5283 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5284 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) || 5285 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 5286 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 5287 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5288 amdgpu_ras_resume(adev); 5289 5290 amdgpu_virt_ras_telemetry_post_reset(adev); 5291 5292 return 0; 5293 } 5294 5295 /** 5296 * amdgpu_device_has_job_running - check if there is any unfinished job 5297 * 5298 * @adev: amdgpu_device pointer 5299 * 5300 * check if there is any job running on the device when guest driver receives 5301 * FLR notification from host driver. If there are still jobs running, then 5302 * the guest driver will not respond the FLR reset. Instead, let the job hit 5303 * the timeout and guest driver then issue the reset request. 5304 */ 5305 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5306 { 5307 int i; 5308 5309 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5310 struct amdgpu_ring *ring = adev->rings[i]; 5311 5312 if (!amdgpu_ring_sched_ready(ring)) 5313 continue; 5314 5315 if (amdgpu_fence_count_emitted(ring)) 5316 return true; 5317 } 5318 return false; 5319 } 5320 5321 /** 5322 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5323 * 5324 * @adev: amdgpu_device pointer 5325 * 5326 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5327 * a hung GPU. 5328 */ 5329 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5330 { 5331 5332 if (amdgpu_gpu_recovery == 0) 5333 goto disabled; 5334 5335 /* Skip soft reset check in fatal error mode */ 5336 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5337 return true; 5338 5339 if (amdgpu_sriov_vf(adev)) 5340 return true; 5341 5342 if (amdgpu_gpu_recovery == -1) { 5343 switch (adev->asic_type) { 5344 #ifdef CONFIG_DRM_AMDGPU_SI 5345 case CHIP_VERDE: 5346 case CHIP_TAHITI: 5347 case CHIP_PITCAIRN: 5348 case CHIP_OLAND: 5349 case CHIP_HAINAN: 5350 #endif 5351 #ifdef CONFIG_DRM_AMDGPU_CIK 5352 case CHIP_KAVERI: 5353 case CHIP_KABINI: 5354 case CHIP_MULLINS: 5355 #endif 5356 case CHIP_CARRIZO: 5357 case CHIP_STONEY: 5358 case CHIP_CYAN_SKILLFISH: 5359 goto disabled; 5360 default: 5361 break; 5362 } 5363 } 5364 5365 return true; 5366 5367 disabled: 5368 dev_info(adev->dev, "GPU recovery disabled.\n"); 5369 return false; 5370 } 5371 5372 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5373 { 5374 u32 i; 5375 int ret = 0; 5376 5377 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5378 5379 dev_info(adev->dev, "GPU mode1 reset\n"); 5380 5381 /* Cache the state before bus master disable. The saved config space 5382 * values are used in other cases like restore after mode-2 reset. 
5383 */ 5384 amdgpu_device_cache_pci_state(adev->pdev); 5385 5386 /* disable BM */ 5387 pci_clear_master(adev->pdev); 5388 5389 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5390 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5391 ret = amdgpu_dpm_mode1_reset(adev); 5392 } else { 5393 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5394 ret = psp_gpu_reset(adev); 5395 } 5396 5397 if (ret) 5398 goto mode1_reset_failed; 5399 5400 amdgpu_device_load_pci_state(adev->pdev); 5401 ret = amdgpu_psp_wait_for_bootloader(adev); 5402 if (ret) 5403 goto mode1_reset_failed; 5404 5405 /* wait for asic to come out of reset */ 5406 for (i = 0; i < adev->usec_timeout; i++) { 5407 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5408 5409 if (memsize != 0xffffffff) 5410 break; 5411 udelay(1); 5412 } 5413 5414 if (i >= adev->usec_timeout) { 5415 ret = -ETIMEDOUT; 5416 goto mode1_reset_failed; 5417 } 5418 5419 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5420 5421 return 0; 5422 5423 mode1_reset_failed: 5424 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5425 return ret; 5426 } 5427 5428 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5429 struct amdgpu_reset_context *reset_context) 5430 { 5431 int i, r = 0; 5432 struct amdgpu_job *job = NULL; 5433 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; 5434 bool need_full_reset = 5435 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5436 5437 if (reset_context->reset_req_dev == adev) 5438 job = reset_context->job; 5439 5440 if (amdgpu_sriov_vf(adev)) 5441 amdgpu_virt_pre_reset(adev); 5442 5443 amdgpu_fence_driver_isr_toggle(adev, true); 5444 5445 /* block all schedulers and reset given job's ring */ 5446 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5447 struct amdgpu_ring *ring = adev->rings[i]; 5448 5449 if (!amdgpu_ring_sched_ready(ring)) 5450 continue; 5451 5452 /* Clear job fence from fence drv to avoid force_completion 5453 * leave NULL and vm flush fence in fence drv 5454 */ 5455 amdgpu_fence_driver_clear_job_fences(ring); 5456 5457 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5458 amdgpu_fence_driver_force_completion(ring); 5459 } 5460 5461 amdgpu_fence_driver_isr_toggle(adev, false); 5462 5463 if (job && job->vm) 5464 drm_sched_increase_karma(&job->base); 5465 5466 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5467 /* If reset handler not implemented, continue; otherwise return */ 5468 if (r == -EOPNOTSUPP) 5469 r = 0; 5470 else 5471 return r; 5472 5473 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5474 if (!amdgpu_sriov_vf(adev)) { 5475 5476 if (!need_full_reset) 5477 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5478 5479 if (!need_full_reset && amdgpu_gpu_recovery && 5480 amdgpu_device_ip_check_soft_reset(adev)) { 5481 amdgpu_device_ip_pre_soft_reset(adev); 5482 r = amdgpu_device_ip_soft_reset(adev); 5483 amdgpu_device_ip_post_soft_reset(adev); 5484 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5485 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5486 need_full_reset = true; 5487 } 5488 } 5489 5490 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { 5491 dev_info(tmp_adev->dev, "Dumping IP State\n"); 5492 /* Trigger ip dump before we reset the asic */ 5493 for (i = 0; i < tmp_adev->num_ip_blocks; i++) 5494 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) 5495 tmp_adev->ip_blocks[i].version->funcs 5496 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]); 5497 
dev_info(tmp_adev->dev, "Dumping IP State Completed\n"); 5498 } 5499 5500 if (need_full_reset) 5501 r = amdgpu_device_ip_suspend(adev); 5502 if (need_full_reset) 5503 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5504 else 5505 clear_bit(AMDGPU_NEED_FULL_RESET, 5506 &reset_context->flags); 5507 } 5508 5509 return r; 5510 } 5511 5512 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) 5513 { 5514 struct list_head *device_list_handle; 5515 bool full_reset, vram_lost = false; 5516 struct amdgpu_device *tmp_adev; 5517 int r, init_level; 5518 5519 device_list_handle = reset_context->reset_device_list; 5520 5521 if (!device_list_handle) 5522 return -EINVAL; 5523 5524 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5525 5526 /** 5527 * If it's reset on init, it's default init level, otherwise keep level 5528 * as recovery level. 5529 */ 5530 if (reset_context->method == AMD_RESET_METHOD_ON_INIT) 5531 init_level = AMDGPU_INIT_LEVEL_DEFAULT; 5532 else 5533 init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY; 5534 5535 r = 0; 5536 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5537 amdgpu_set_init_level(tmp_adev, init_level); 5538 if (full_reset) { 5539 /* post card */ 5540 amdgpu_ras_clear_err_state(tmp_adev); 5541 r = amdgpu_device_asic_init(tmp_adev); 5542 if (r) { 5543 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5544 } else { 5545 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5546 5547 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5548 if (r) 5549 goto out; 5550 5551 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5552 5553 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 5554 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job); 5555 5556 if (vram_lost) { 5557 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5558 amdgpu_inc_vram_lost(tmp_adev); 5559 } 5560 5561 r = amdgpu_device_fw_loading(tmp_adev); 5562 if (r) 5563 return r; 5564 5565 r = amdgpu_xcp_restore_partition_mode( 5566 tmp_adev->xcp_mgr); 5567 if (r) 5568 goto out; 5569 5570 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5571 if (r) 5572 goto out; 5573 5574 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 5575 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5576 5577 r = amdgpu_device_ip_resume_phase3(tmp_adev); 5578 if (r) 5579 goto out; 5580 5581 if (vram_lost) 5582 amdgpu_device_fill_reset_magic(tmp_adev); 5583 5584 /* 5585 * Add this ASIC as tracked as reset was already 5586 * complete successfully. 5587 */ 5588 amdgpu_register_gpu_instance(tmp_adev); 5589 5590 if (!reset_context->hive && 5591 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5592 amdgpu_xgmi_add_device(tmp_adev); 5593 5594 r = amdgpu_device_ip_late_init(tmp_adev); 5595 if (r) 5596 goto out; 5597 5598 drm_client_dev_resume(adev_to_drm(tmp_adev), false); 5599 5600 /* 5601 * The GPU enters bad state once faulty pages 5602 * by ECC has reached the threshold, and ras 5603 * recovery is scheduled next. So add one check 5604 * here to break recovery if it indeed exceeds 5605 * bad page threshold, and remind user to 5606 * retire this GPU or setting one bigger 5607 * bad_page_threshold value to fix this once 5608 * probing driver again. 5609 */ 5610 if (!amdgpu_ras_is_rma(tmp_adev)) { 5611 /* must succeed. 
*/ 5612 amdgpu_ras_resume(tmp_adev); 5613 } else { 5614 r = -EINVAL; 5615 goto out; 5616 } 5617 5618 /* Update PSP FW topology after reset */ 5619 if (reset_context->hive && 5620 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5621 r = amdgpu_xgmi_update_topology( 5622 reset_context->hive, tmp_adev); 5623 } 5624 } 5625 5626 out: 5627 if (!r) { 5628 /* IP init is complete now, set level as default */ 5629 amdgpu_set_init_level(tmp_adev, 5630 AMDGPU_INIT_LEVEL_DEFAULT); 5631 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5632 r = amdgpu_ib_ring_tests(tmp_adev); 5633 if (r) { 5634 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5635 r = -EAGAIN; 5636 goto end; 5637 } 5638 } 5639 5640 if (r) 5641 tmp_adev->asic_reset_res = r; 5642 } 5643 5644 end: 5645 return r; 5646 } 5647 5648 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5649 struct amdgpu_reset_context *reset_context) 5650 { 5651 struct amdgpu_device *tmp_adev = NULL; 5652 bool need_full_reset, skip_hw_reset; 5653 int r = 0; 5654 5655 /* Try reset handler method first */ 5656 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5657 reset_list); 5658 5659 reset_context->reset_device_list = device_list_handle; 5660 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5661 /* If reset handler not implemented, continue; otherwise return */ 5662 if (r == -EOPNOTSUPP) 5663 r = 0; 5664 else 5665 return r; 5666 5667 /* Reset handler not implemented, use the default method */ 5668 need_full_reset = 5669 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5670 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5671 5672 /* 5673 * ASIC reset has to be done on all XGMI hive nodes ASAP 5674 * to allow proper links negotiation in FW (within 1 sec) 5675 */ 5676 if (!skip_hw_reset && need_full_reset) { 5677 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5678 /* For XGMI run all resets in parallel to speed up the process */ 5679 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5680 if (!queue_work(system_unbound_wq, 5681 &tmp_adev->xgmi_reset_work)) 5682 r = -EALREADY; 5683 } else 5684 r = amdgpu_asic_reset(tmp_adev); 5685 5686 if (r) { 5687 dev_err(tmp_adev->dev, 5688 "ASIC reset failed with error, %d for drm dev, %s", 5689 r, adev_to_drm(tmp_adev)->unique); 5690 goto out; 5691 } 5692 } 5693 5694 /* For XGMI wait for all resets to complete before proceed */ 5695 if (!r) { 5696 list_for_each_entry(tmp_adev, device_list_handle, 5697 reset_list) { 5698 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5699 flush_work(&tmp_adev->xgmi_reset_work); 5700 r = tmp_adev->asic_reset_res; 5701 if (r) 5702 break; 5703 } 5704 } 5705 } 5706 } 5707 5708 if (!r && amdgpu_ras_intr_triggered()) { 5709 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5710 amdgpu_ras_reset_error_count(tmp_adev, 5711 AMDGPU_RAS_BLOCK__MMHUB); 5712 } 5713 5714 amdgpu_ras_intr_cleared(); 5715 } 5716 5717 r = amdgpu_device_reinit_after_reset(reset_context); 5718 if (r == -EAGAIN) 5719 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5720 else 5721 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5722 5723 out: 5724 return r; 5725 } 5726 5727 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5728 { 5729 5730 switch (amdgpu_asic_reset_method(adev)) { 5731 case AMD_RESET_METHOD_MODE1: 5732 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5733 break; 5734 case AMD_RESET_METHOD_MODE2: 5735 adev->mp1_state = PP_MP1_STATE_RESET; 5736 break; 5737 default: 5738 adev->mp1_state = 
PP_MP1_STATE_NONE; 5739 break; 5740 } 5741 } 5742 5743 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5744 { 5745 amdgpu_vf_error_trans_all(adev); 5746 adev->mp1_state = PP_MP1_STATE_NONE; 5747 } 5748 5749 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5750 { 5751 struct pci_dev *p = NULL; 5752 5753 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5754 adev->pdev->bus->number, 1); 5755 if (p) { 5756 pm_runtime_enable(&(p->dev)); 5757 pm_runtime_resume(&(p->dev)); 5758 } 5759 5760 pci_dev_put(p); 5761 } 5762 5763 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5764 { 5765 enum amd_reset_method reset_method; 5766 struct pci_dev *p = NULL; 5767 u64 expires; 5768 5769 /* 5770 * For now, only BACO and mode1 reset are confirmed 5771 * to suffer the audio issue without proper suspended. 5772 */ 5773 reset_method = amdgpu_asic_reset_method(adev); 5774 if ((reset_method != AMD_RESET_METHOD_BACO) && 5775 (reset_method != AMD_RESET_METHOD_MODE1)) 5776 return -EINVAL; 5777 5778 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5779 adev->pdev->bus->number, 1); 5780 if (!p) 5781 return -ENODEV; 5782 5783 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5784 if (!expires) 5785 /* 5786 * If we cannot get the audio device autosuspend delay, 5787 * a fixed 4S interval will be used. Considering 3S is 5788 * the audio controller default autosuspend delay setting. 5789 * 4S used here is guaranteed to cover that. 5790 */ 5791 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5792 5793 while (!pm_runtime_status_suspended(&(p->dev))) { 5794 if (!pm_runtime_suspend(&(p->dev))) 5795 break; 5796 5797 if (expires < ktime_get_mono_fast_ns()) { 5798 dev_warn(adev->dev, "failed to suspend display audio\n"); 5799 pci_dev_put(p); 5800 /* TODO: abort the succeeding gpu reset? */ 5801 return -ETIMEDOUT; 5802 } 5803 } 5804 5805 pm_runtime_disable(&(p->dev)); 5806 5807 pci_dev_put(p); 5808 return 0; 5809 } 5810 5811 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5812 { 5813 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5814 5815 #if defined(CONFIG_DEBUG_FS) 5816 if (!amdgpu_sriov_vf(adev)) 5817 cancel_work(&adev->reset_work); 5818 #endif 5819 5820 if (adev->kfd.dev) 5821 cancel_work(&adev->kfd.reset_work); 5822 5823 if (amdgpu_sriov_vf(adev)) 5824 cancel_work(&adev->virt.flr_work); 5825 5826 if (con && adev->ras_enabled) 5827 cancel_work(&con->recovery_work); 5828 5829 } 5830 5831 static int amdgpu_device_health_check(struct list_head *device_list_handle) 5832 { 5833 struct amdgpu_device *tmp_adev; 5834 int ret = 0; 5835 u32 status; 5836 5837 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5838 pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status); 5839 if (PCI_POSSIBLE_ERROR(status)) { 5840 dev_err(tmp_adev->dev, "device lost from bus!"); 5841 ret = -ENODEV; 5842 } 5843 } 5844 5845 return ret; 5846 } 5847 5848 /** 5849 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5850 * 5851 * @adev: amdgpu_device pointer 5852 * @job: which job trigger hang 5853 * @reset_context: amdgpu reset context pointer 5854 * 5855 * Attempt to reset the GPU if it has hung (all asics). 5856 * Attempt to do soft-reset or full-reset and reinitialize Asic 5857 * Returns 0 for success or an error on failure. 
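 *
 * A minimal caller sketch (illustrative only; the reset source and flags
 * depend on the caller, here assumed to be a job timeout handler, and
 * AMDGPU_RESET_SRC_JOB is an assumed source value):
 *
 *	struct amdgpu_reset_context reset_context;
 *	int r;
 *
 *	memset(&reset_context, 0, sizeof(reset_context));
 *	reset_context.method = AMD_RESET_METHOD_NONE;
 *	reset_context.reset_req_dev = adev;
 *	reset_context.src = AMDGPU_RESET_SRC_JOB;
 *	clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
 *
 *	r = amdgpu_device_gpu_recover(adev, job, &reset_context);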
5858 */ 5859 5860 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5861 struct amdgpu_job *job, 5862 struct amdgpu_reset_context *reset_context) 5863 { 5864 struct list_head device_list, *device_list_handle = NULL; 5865 bool job_signaled = false; 5866 struct amdgpu_hive_info *hive = NULL; 5867 struct amdgpu_device *tmp_adev = NULL; 5868 int i, r = 0; 5869 bool need_emergency_restart = false; 5870 bool audio_suspended = false; 5871 int retry_limit = AMDGPU_MAX_RETRY_LIMIT; 5872 5873 /* 5874 * If it reaches here because of hang/timeout and a RAS error is 5875 * detected at the same time, let RAS recovery take care of it. 5876 */ 5877 if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) && 5878 !amdgpu_sriov_vf(adev) && 5879 reset_context->src != AMDGPU_RESET_SRC_RAS) { 5880 dev_dbg(adev->dev, 5881 "Gpu recovery from source: %d yielding to RAS error recovery handling", 5882 reset_context->src); 5883 return 0; 5884 } 5885 /* 5886 * Special case: RAS triggered and full reset isn't supported 5887 */ 5888 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5889 5890 /* 5891 * Flush RAM to disk so that after reboot 5892 * the user can read log and see why the system rebooted. 5893 */ 5894 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 5895 amdgpu_ras_get_context(adev)->reboot) { 5896 DRM_WARN("Emergency reboot."); 5897 5898 ksys_sync_helper(); 5899 emergency_restart(); 5900 } 5901 5902 dev_info(adev->dev, "GPU %s begin!\n", 5903 need_emergency_restart ? "jobs stop":"reset"); 5904 5905 if (!amdgpu_sriov_vf(adev)) 5906 hive = amdgpu_get_xgmi_hive(adev); 5907 if (hive) 5908 mutex_lock(&hive->hive_lock); 5909 5910 reset_context->job = job; 5911 reset_context->hive = hive; 5912 /* 5913 * Build list of devices to reset. 5914 * In case we are in XGMI hive mode, resort the device list 5915 * to put adev in the 1st position. 5916 */ 5917 INIT_LIST_HEAD(&device_list); 5918 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) { 5919 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5920 list_add_tail(&tmp_adev->reset_list, &device_list); 5921 if (adev->shutdown) 5922 tmp_adev->shutdown = true; 5923 } 5924 if (!list_is_first(&adev->reset_list, &device_list)) 5925 list_rotate_to_front(&adev->reset_list, &device_list); 5926 device_list_handle = &device_list; 5927 } else { 5928 list_add_tail(&adev->reset_list, &device_list); 5929 device_list_handle = &device_list; 5930 } 5931 5932 if (!amdgpu_sriov_vf(adev)) { 5933 r = amdgpu_device_health_check(device_list_handle); 5934 if (r) 5935 goto end_reset; 5936 } 5937 5938 /* We need to lock reset domain only once both for XGMI and single device */ 5939 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5940 reset_list); 5941 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5942 5943 /* block all schedulers and reset given job's ring */ 5944 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5945 5946 amdgpu_device_set_mp1_state(tmp_adev); 5947 5948 /* 5949 * Try to put the audio codec into suspend state 5950 * before gpu reset started. 5951 * 5952 * Due to the power domain of the graphics device 5953 * is shared with AZ power domain. Without this, 5954 * we may change the audio hardware from behind 5955 * the audio driver's back. That will trigger 5956 * some audio codec errors. 
5957 */ 5958 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5959 audio_suspended = true; 5960 5961 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5962 5963 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5964 5965 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context); 5966 5967 /* 5968 * Mark these ASICs to be reset as untracked first 5969 * And add them back after reset completed 5970 */ 5971 amdgpu_unregister_gpu_instance(tmp_adev); 5972 5973 drm_client_dev_suspend(adev_to_drm(tmp_adev), false); 5974 5975 /* disable ras on ALL IPs */ 5976 if (!need_emergency_restart && 5977 amdgpu_device_ip_need_full_reset(tmp_adev)) 5978 amdgpu_ras_suspend(tmp_adev); 5979 5980 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5981 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5982 5983 if (!amdgpu_ring_sched_ready(ring)) 5984 continue; 5985 5986 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5987 5988 if (need_emergency_restart) 5989 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5990 } 5991 atomic_inc(&tmp_adev->gpu_reset_counter); 5992 } 5993 5994 if (need_emergency_restart) 5995 goto skip_sched_resume; 5996 5997 /* 5998 * Must check guilty signal here since after this point all old 5999 * HW fences are force signaled. 6000 * 6001 * job->base holds a reference to parent fence 6002 */ 6003 if (job && dma_fence_is_signaled(&job->hw_fence)) { 6004 job_signaled = true; 6005 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 6006 goto skip_hw_reset; 6007 } 6008 6009 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 6010 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6011 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 6012 /*TODO Should we stop ?*/ 6013 if (r) { 6014 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 6015 r, adev_to_drm(tmp_adev)->unique); 6016 tmp_adev->asic_reset_res = r; 6017 } 6018 } 6019 6020 /* Actual ASIC resets if needed.*/ 6021 /* Host driver will handle XGMI hive reset for SRIOV */ 6022 if (amdgpu_sriov_vf(adev)) { 6023 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { 6024 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); 6025 amdgpu_ras_set_fed(adev, true); 6026 set_bit(AMDGPU_HOST_FLR, &reset_context->flags); 6027 } 6028 6029 r = amdgpu_device_reset_sriov(adev, reset_context); 6030 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) { 6031 amdgpu_virt_release_full_gpu(adev, true); 6032 goto retry; 6033 } 6034 if (r) 6035 adev->asic_reset_res = r; 6036 } else { 6037 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 6038 if (r && r == -EAGAIN) 6039 goto retry; 6040 } 6041 6042 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6043 /* 6044 * Drop any pending non scheduler resets queued before reset is done. 6045 * Any reset scheduled after this point would be valid. Scheduler resets 6046 * were already dropped during drm_sched_stop and no new ones can come 6047 * in before drm_sched_start. 
6048 */ 6049 amdgpu_device_stop_pending_resets(tmp_adev); 6050 } 6051 6052 skip_hw_reset: 6053 6054 /* Post ASIC reset for all devs .*/ 6055 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6056 6057 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6058 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6059 6060 if (!amdgpu_ring_sched_ready(ring)) 6061 continue; 6062 6063 drm_sched_start(&ring->sched, 0); 6064 } 6065 6066 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 6067 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 6068 6069 if (tmp_adev->asic_reset_res) 6070 r = tmp_adev->asic_reset_res; 6071 6072 tmp_adev->asic_reset_res = 0; 6073 6074 if (r) { 6075 /* bad news, how to tell it to userspace ? 6076 * for ras error, we should report GPU bad status instead of 6077 * reset failure 6078 */ 6079 if (reset_context->src != AMDGPU_RESET_SRC_RAS || 6080 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) 6081 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", 6082 atomic_read(&tmp_adev->gpu_reset_counter)); 6083 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 6084 } else { 6085 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 6086 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 6087 DRM_WARN("smart shift update failed\n"); 6088 } 6089 } 6090 6091 skip_sched_resume: 6092 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6093 /* unlock kfd: SRIOV would do it separately */ 6094 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 6095 amdgpu_amdkfd_post_reset(tmp_adev); 6096 6097 /* kfd_post_reset will do nothing if kfd device is not initialized, 6098 * need to bring up kfd here if it's not be initialized before 6099 */ 6100 if (!adev->kfd.init_complete) 6101 amdgpu_amdkfd_device_init(adev); 6102 6103 if (audio_suspended) 6104 amdgpu_device_resume_display_audio(tmp_adev); 6105 6106 amdgpu_device_unset_mp1_state(tmp_adev); 6107 6108 amdgpu_ras_set_error_query_ready(tmp_adev, true); 6109 } 6110 6111 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 6112 reset_list); 6113 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 6114 6115 end_reset: 6116 if (hive) { 6117 mutex_unlock(&hive->hive_lock); 6118 amdgpu_put_xgmi_hive(hive); 6119 } 6120 6121 if (r) 6122 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 6123 6124 atomic_set(&adev->reset_domain->reset_res, r); 6125 6126 if (!r) 6127 drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE); 6128 6129 return r; 6130 } 6131 6132 /** 6133 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner 6134 * 6135 * @adev: amdgpu_device pointer 6136 * @speed: pointer to the speed of the link 6137 * @width: pointer to the width of the link 6138 * 6139 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6140 * first physical partner to an AMD dGPU. 6141 * This will exclude any virtual switches and links. 
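 *
 * Illustrative use (a sketch only; both out-parameters must be supplied or
 * the call is a no-op):
 *
 *	enum pci_bus_speed speed;
 *	enum pcie_link_width width;
 *
 *	amdgpu_device_partner_bandwidth(adev, &speed, &width);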
 */
static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
					    enum pci_bus_speed *speed,
					    enum pcie_link_width *width)
{
	struct pci_dev *parent = adev->pdev;

	if (!speed || !width)
		return;

	*speed = PCI_SPEED_UNKNOWN;
	*width = PCIE_LNK_WIDTH_UNKNOWN;

	if (amdgpu_device_pcie_dynamic_switching_supported(adev)) {
		while ((parent = pci_upstream_bridge(parent))) {
			/* skip upstream/downstream switches internal to dGPU */
			if (parent->vendor == PCI_VENDOR_ID_ATI)
				continue;
			*speed = pcie_get_speed_cap(parent);
			*width = pcie_get_width_cap(parent);
			break;
		}
	} else {
		/* use the current speeds rather than max if switching is not supported */
		pcie_bandwidth_available(adev->pdev, NULL, speed, width);
	}
}

/**
 * amdgpu_device_gpu_bandwidth - find the bandwidth of the GPU
 *
 * @adev: amdgpu_device pointer
 * @speed: pointer to the speed of the link
 * @width: pointer to the width of the link
 *
 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
 * AMD dGPU which may be a virtual upstream bridge.
 */
static void amdgpu_device_gpu_bandwidth(struct amdgpu_device *adev,
					enum pci_bus_speed *speed,
					enum pcie_link_width *width)
{
	struct pci_dev *parent = adev->pdev;

	if (!speed || !width)
		return;

	parent = pci_upstream_bridge(parent);
	if (parent && parent->vendor == PCI_VENDOR_ID_ATI) {
		/* use the upstream/downstream switches internal to dGPU */
		*speed = pcie_get_speed_cap(parent);
		*width = pcie_get_width_cap(parent);
		while ((parent = pci_upstream_bridge(parent))) {
			if (parent->vendor == PCI_VENDOR_ID_ATI) {
				/* use the upstream/downstream switches internal to dGPU */
				*speed = pcie_get_speed_cap(parent);
				*width = pcie_get_width_cap(parent);
			}
		}
	} else {
		/* use the device itself */
		*speed = pcie_get_speed_cap(adev->pdev);
		*width = pcie_get_width_cap(adev->pdev);
	}
}

/**
 * amdgpu_device_get_pcie_info - fetch PCIE info about the PCIE slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIE capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIE config space may not be available.
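 *
 * The results are stored in adev->pm.pcie_gen_mask and adev->pm.pcie_mlw_mask.
 * A consumer might then test them, e.g. (sketch only; "gen" is a hypothetical
 * local used for illustration):
 *
 *	if (adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3)
 *		gen = 3;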
6216 */ 6217 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 6218 { 6219 enum pci_bus_speed speed_cap, platform_speed_cap; 6220 enum pcie_link_width platform_link_width, link_width; 6221 6222 if (amdgpu_pcie_gen_cap) 6223 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 6224 6225 if (amdgpu_pcie_lane_cap) 6226 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 6227 6228 /* covers APUs as well */ 6229 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 6230 if (adev->pm.pcie_gen_mask == 0) 6231 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 6232 if (adev->pm.pcie_mlw_mask == 0) 6233 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 6234 return; 6235 } 6236 6237 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 6238 return; 6239 6240 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap, 6241 &platform_link_width); 6242 amdgpu_device_gpu_bandwidth(adev, &speed_cap, &link_width); 6243 6244 if (adev->pm.pcie_gen_mask == 0) { 6245 /* asic caps */ 6246 if (speed_cap == PCI_SPEED_UNKNOWN) { 6247 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6248 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6249 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6250 } else { 6251 if (speed_cap == PCIE_SPEED_32_0GT) 6252 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6253 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6254 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6255 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6256 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 6257 else if (speed_cap == PCIE_SPEED_16_0GT) 6258 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6259 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6260 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6261 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 6262 else if (speed_cap == PCIE_SPEED_8_0GT) 6263 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6264 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6265 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6266 else if (speed_cap == PCIE_SPEED_5_0GT) 6267 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6268 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 6269 else 6270 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 6271 } 6272 /* platform caps */ 6273 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 6274 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6275 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6276 } else { 6277 if (platform_speed_cap == PCIE_SPEED_32_0GT) 6278 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6279 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6280 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6281 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6282 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 6283 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 6284 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6285 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6286 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6287 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 6288 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 6289 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6290 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6291 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 6292 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 6293 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6294 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6295 else 6296 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 6297 6298 } 6299 } 6300 if (adev->pm.pcie_mlw_mask == 0) { 6301 /* asic caps */ 6302 if (link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6303 
adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_ASIC_PCIE_MLW_MASK; 6304 } else { 6305 switch (link_width) { 6306 case PCIE_LNK_X32: 6307 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X32 | 6308 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6309 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6310 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6311 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6312 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6313 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6314 break; 6315 case PCIE_LNK_X16: 6316 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6317 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6318 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6319 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6320 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6321 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6322 break; 6323 case PCIE_LNK_X12: 6324 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6325 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6326 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6327 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6328 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6329 break; 6330 case PCIE_LNK_X8: 6331 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6332 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6333 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6334 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6335 break; 6336 case PCIE_LNK_X4: 6337 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6338 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6339 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6340 break; 6341 case PCIE_LNK_X2: 6342 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6343 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6344 break; 6345 case PCIE_LNK_X1: 6346 adev->pm.pcie_mlw_mask |= CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1; 6347 break; 6348 default: 6349 break; 6350 } 6351 } 6352 /* platform caps */ 6353 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6354 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 6355 } else { 6356 switch (platform_link_width) { 6357 case PCIE_LNK_X32: 6358 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 6359 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6360 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6361 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6362 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6363 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6364 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6365 break; 6366 case PCIE_LNK_X16: 6367 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6368 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6369 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6370 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6371 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6372 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6373 break; 6374 case PCIE_LNK_X12: 6375 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6376 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6377 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6378 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6379 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6380 break; 6381 case PCIE_LNK_X8: 6382 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6383 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6384 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6385 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6386 break; 6387 case PCIE_LNK_X4: 6388 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6389 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6390 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6391 break; 6392 case PCIE_LNK_X2: 6393 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6394 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6395 break; 6396 case PCIE_LNK_X1: 6397 adev->pm.pcie_mlw_mask |= CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 6398 break; 6399 
default: 6400 break; 6401 } 6402 } 6403 } 6404 } 6405 6406 /** 6407 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 6408 * 6409 * @adev: amdgpu_device pointer 6410 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 6411 * 6412 * Return true if @peer_adev can access (DMA) @adev through the PCIe 6413 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 6414 * @peer_adev. 6415 */ 6416 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 6417 struct amdgpu_device *peer_adev) 6418 { 6419 #ifdef CONFIG_HSA_AMD_P2P 6420 bool p2p_access = 6421 !adev->gmc.xgmi.connected_to_cpu && 6422 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 6423 if (!p2p_access) 6424 dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n", 6425 pci_name(peer_adev->pdev)); 6426 6427 bool is_large_bar = adev->gmc.visible_vram_size && 6428 adev->gmc.real_vram_size == adev->gmc.visible_vram_size; 6429 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev); 6430 6431 if (!p2p_addressable) { 6432 uint64_t address_mask = peer_adev->dev->dma_mask ? 6433 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 6434 resource_size_t aper_limit = 6435 adev->gmc.aper_base + adev->gmc.aper_size - 1; 6436 6437 p2p_addressable = !(adev->gmc.aper_base & address_mask || 6438 aper_limit & address_mask); 6439 } 6440 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable; 6441 #else 6442 return false; 6443 #endif 6444 } 6445 6446 int amdgpu_device_baco_enter(struct drm_device *dev) 6447 { 6448 struct amdgpu_device *adev = drm_to_adev(dev); 6449 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6450 6451 if (!amdgpu_device_supports_baco(dev)) 6452 return -ENOTSUPP; 6453 6454 if (ras && adev->ras_enabled && 6455 adev->nbio.funcs->enable_doorbell_interrupt) 6456 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 6457 6458 return amdgpu_dpm_baco_enter(adev); 6459 } 6460 6461 int amdgpu_device_baco_exit(struct drm_device *dev) 6462 { 6463 struct amdgpu_device *adev = drm_to_adev(dev); 6464 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6465 int ret = 0; 6466 6467 if (!amdgpu_device_supports_baco(dev)) 6468 return -ENOTSUPP; 6469 6470 ret = amdgpu_dpm_baco_exit(adev); 6471 if (ret) 6472 return ret; 6473 6474 if (ras && adev->ras_enabled && 6475 adev->nbio.funcs->enable_doorbell_interrupt) 6476 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 6477 6478 if (amdgpu_passthrough(adev) && adev->nbio.funcs && 6479 adev->nbio.funcs->clear_doorbell_interrupt) 6480 adev->nbio.funcs->clear_doorbell_interrupt(adev); 6481 6482 return 0; 6483 } 6484 6485 /** 6486 * amdgpu_pci_error_detected - Called when a PCI error is detected. 6487 * @pdev: PCI device struct 6488 * @state: PCI channel state 6489 * 6490 * Description: Called when a PCI error is detected. 6491 * 6492 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
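 *
 * This callback is wired up together with the other PCI error handlers in
 * the driver's struct pci_error_handlers, roughly as below (sketch; the
 * actual table lives in amdgpu_drv.c):
 *
 *	static const struct pci_error_handlers amdgpu_pci_err_handler = {
 *		.error_detected	= amdgpu_pci_error_detected,
 *		.mmio_enabled	= amdgpu_pci_mmio_enabled,
 *		.slot_reset	= amdgpu_pci_slot_reset,
 *		.resume		= amdgpu_pci_resume,
 *	};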
6493 */ 6494 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 6495 { 6496 struct drm_device *dev = pci_get_drvdata(pdev); 6497 struct amdgpu_device *adev = drm_to_adev(dev); 6498 int i; 6499 6500 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 6501 6502 if (adev->gmc.xgmi.num_physical_nodes > 1) { 6503 DRM_WARN("No support for XGMI hive yet..."); 6504 return PCI_ERS_RESULT_DISCONNECT; 6505 } 6506 6507 adev->pci_channel_state = state; 6508 6509 switch (state) { 6510 case pci_channel_io_normal: 6511 return PCI_ERS_RESULT_CAN_RECOVER; 6512 /* Fatal error, prepare for slot reset */ 6513 case pci_channel_io_frozen: 6514 /* 6515 * Locking adev->reset_domain->sem will prevent any external access 6516 * to GPU during PCI error recovery 6517 */ 6518 amdgpu_device_lock_reset_domain(adev->reset_domain); 6519 amdgpu_device_set_mp1_state(adev); 6520 6521 /* 6522 * Block any work scheduling as we do for regular GPU reset 6523 * for the duration of the recovery 6524 */ 6525 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6526 struct amdgpu_ring *ring = adev->rings[i]; 6527 6528 if (!amdgpu_ring_sched_ready(ring)) 6529 continue; 6530 6531 drm_sched_stop(&ring->sched, NULL); 6532 } 6533 atomic_inc(&adev->gpu_reset_counter); 6534 return PCI_ERS_RESULT_NEED_RESET; 6535 case pci_channel_io_perm_failure: 6536 /* Permanent error, prepare for device removal */ 6537 return PCI_ERS_RESULT_DISCONNECT; 6538 } 6539 6540 return PCI_ERS_RESULT_NEED_RESET; 6541 } 6542 6543 /** 6544 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 6545 * @pdev: pointer to PCI device 6546 */ 6547 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 6548 { 6549 6550 DRM_INFO("PCI error: mmio enabled callback!!\n"); 6551 6552 /* TODO - dump whatever for debugging purposes */ 6553 6554 /* This called only if amdgpu_pci_error_detected returns 6555 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 6556 * works, no need to reset slot. 6557 */ 6558 6559 return PCI_ERS_RESULT_RECOVERED; 6560 } 6561 6562 /** 6563 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 6564 * @pdev: PCI device struct 6565 * 6566 * Description: This routine is called by the pci error recovery 6567 * code after the PCI slot has been reset, just before we 6568 * should resume normal operations. 
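 *
 * In the PCI error recovery flow the PCI core invokes the handlers roughly
 * in this order (see Documentation/PCI/pci-error-recovery.rst):
 *
 *	error_detected() -> [mmio_enabled()] -> slot_reset() -> resume()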
 */
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r, i;
	struct amdgpu_reset_context reset_context;
	u32 memsize;
	struct list_head device_list;

	/* PCI error slot reset should be skipped during RAS recovery */
	if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
	    amdgpu_ras_in_recovery(adev))
		return PCI_ERS_RESULT_RECOVERED;

	DRM_INFO("PCI error: slot reset callback!!\n");

	memset(&reset_context, 0, sizeof(reset_context));

	INIT_LIST_HEAD(&device_list);
	list_add_tail(&adev->reset_list, &device_list);

	/* wait for asic to come out of reset */
	msleep(500);

	/* Restore PCI config space */
	amdgpu_device_load_pci_state(pdev);

	/* confirm ASIC came out of reset */
	for (i = 0; i < adev->usec_timeout; i++) {
		memsize = amdgpu_asic_get_config_memsize(adev);

		if (memsize != 0xffffffff)
			break;
		udelay(1);
	}
	if (memsize == 0xffffffff) {
		r = -ETIME;
		goto out;
	}

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);

	adev->no_hw_access = true;
	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
	adev->no_hw_access = false;
	if (r)
		goto out;

	r = amdgpu_do_asic_reset(&device_list, &reset_context);

out:
	if (!r) {
		if (amdgpu_device_cache_pci_state(adev->pdev))
			pci_restore_state(adev->pdev);

		DRM_INFO("PCIe error recovery succeeded\n");
	} else {
		DRM_ERROR("PCIe error recovery failed, err:%d\n", r);
		amdgpu_device_unset_mp1_state(adev);
		amdgpu_device_unlock_reset_domain(adev->reset_domain);
	}

	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
}

/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it's
 * OK to resume normal operation.
6645 */ 6646 void amdgpu_pci_resume(struct pci_dev *pdev) 6647 { 6648 struct drm_device *dev = pci_get_drvdata(pdev); 6649 struct amdgpu_device *adev = drm_to_adev(dev); 6650 int i; 6651 6652 6653 DRM_INFO("PCI error: resume callback!!\n"); 6654 6655 /* Only continue execution for the case of pci_channel_io_frozen */ 6656 if (adev->pci_channel_state != pci_channel_io_frozen) 6657 return; 6658 6659 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6660 struct amdgpu_ring *ring = adev->rings[i]; 6661 6662 if (!amdgpu_ring_sched_ready(ring)) 6663 continue; 6664 6665 drm_sched_start(&ring->sched, 0); 6666 } 6667 6668 amdgpu_device_unset_mp1_state(adev); 6669 amdgpu_device_unlock_reset_domain(adev->reset_domain); 6670 } 6671 6672 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 6673 { 6674 struct drm_device *dev = pci_get_drvdata(pdev); 6675 struct amdgpu_device *adev = drm_to_adev(dev); 6676 int r; 6677 6678 if (amdgpu_sriov_vf(adev)) 6679 return false; 6680 6681 r = pci_save_state(pdev); 6682 if (!r) { 6683 kfree(adev->pci_state); 6684 6685 adev->pci_state = pci_store_saved_state(pdev); 6686 6687 if (!adev->pci_state) { 6688 DRM_ERROR("Failed to store PCI saved state"); 6689 return false; 6690 } 6691 } else { 6692 DRM_WARN("Failed to save PCI state, err:%d\n", r); 6693 return false; 6694 } 6695 6696 return true; 6697 } 6698 6699 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 6700 { 6701 struct drm_device *dev = pci_get_drvdata(pdev); 6702 struct amdgpu_device *adev = drm_to_adev(dev); 6703 int r; 6704 6705 if (!adev->pci_state) 6706 return false; 6707 6708 r = pci_load_saved_state(pdev, adev->pci_state); 6709 6710 if (!r) { 6711 pci_restore_state(pdev); 6712 } else { 6713 DRM_WARN("Failed to load PCI state, err:%d\n", r); 6714 return false; 6715 } 6716 6717 return true; 6718 } 6719 6720 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 6721 struct amdgpu_ring *ring) 6722 { 6723 #ifdef CONFIG_X86_64 6724 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 6725 return; 6726 #endif 6727 if (adev->gmc.xgmi.connected_to_cpu) 6728 return; 6729 6730 if (ring && ring->funcs->emit_hdp_flush) 6731 amdgpu_ring_emit_hdp_flush(ring); 6732 else 6733 amdgpu_asic_flush_hdp(adev, ring); 6734 } 6735 6736 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 6737 struct amdgpu_ring *ring) 6738 { 6739 #ifdef CONFIG_X86_64 6740 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 6741 return; 6742 #endif 6743 if (adev->gmc.xgmi.connected_to_cpu) 6744 return; 6745 6746 amdgpu_asic_invalidate_hdp(adev, ring); 6747 } 6748 6749 int amdgpu_in_reset(struct amdgpu_device *adev) 6750 { 6751 return atomic_read(&adev->reset_domain->in_gpu_reset); 6752 } 6753 6754 /** 6755 * amdgpu_device_halt() - bring hardware to some kind of halt state 6756 * 6757 * @adev: amdgpu_device pointer 6758 * 6759 * Bring hardware to some kind of halt state so that no one can touch it 6760 * any more. It will help to maintain error context when error occurred. 6761 * Compare to a simple hang, the system will keep stable at least for SSH 6762 * access. Then it should be trivial to inspect the hardware state and 6763 * see what's going on. Implemented as following: 6764 * 6765 * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc), 6766 * clears all CPU mappings to device, disallows remappings through page faults 6767 * 2. amdgpu_irq_disable_all() disables all interrupts 6768 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences 6769 * 4. 
set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 * flush any in flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	amdgpu_xcp_dev_unplug(adev);
	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				 u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				  u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_gang - return a reference to the current gang
 * @adev: amdgpu_device pointer
 *
 * Returns: A new reference to the current gang leader.
 */
struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev)
{
	struct dma_fence *fence;

	rcu_read_lock();
	fence = dma_fence_get_rcu_safe(&adev->gang_submit);
	rcu_read_unlock();
	return fence;
}

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
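 *
 * A typical caller retries until the switch succeeds, waiting for the
 * previous leader to signal first (sketch only; error handling omitted):
 *
 *	struct dma_fence *old;
 *
 *	while ((old = amdgpu_device_switch_gang(adev, gang))) {
 *		dma_fence_wait(old, false);
 *		dma_fence_put(old);
 *	}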
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	do {
		dma_fence_put(old);
		old = amdgpu_device_get_gang(adev);
		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old))
			return old;

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	dma_fence_put(old);
	return NULL;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
			uint32_t inst, uint32_t reg_addr, char reg_name[],
			uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				 inst, reg_name, (uint32_t)expected_value,
				 (uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}

ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
{
	ssize_t size = 0;

	if (!ring || !ring->adev)
		return size;

	if (amdgpu_device_should_recover_gpu(ring->adev))
		size |= AMDGPU_RESET_TYPE_FULL;

	if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
	    !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery)
		size |= AMDGPU_RESET_TYPE_SOFT_RESET;

	return size;
}

ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
{
	ssize_t size = 0;

	if (supported_reset == 0) {
		size += sysfs_emit_at(buf, size, "unsupported");
		size += sysfs_emit_at(buf, size, "\n");
		return size;
	}

	if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
		size += sysfs_emit_at(buf, size, "soft ");

	if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
		size += sysfs_emit_at(buf, size, "queue ");

	if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)
		size += sysfs_emit_at(buf, size, "pipe ");

	if (supported_reset & AMDGPU_RESET_TYPE_FULL)
		size += sysfs_emit_at(buf, size, "full ");

	size += sysfs_emit_at(buf, size, "\n");
	return size;
}
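/*
 * Example (illustrative sketch, not part of the driver): a sysfs "show"
 * callback for a per-IP reset mask would typically just forward the
 * supported-reset bitmask to amdgpu_show_reset_mask(). The field name
 * gfx_supported_reset is an assumption made for the sake of the example.
 *
 *	static ssize_t gfx_reset_mask_show(struct device *dev,
 *					   struct device_attribute *attr,
 *					   char *buf)
 *	{
 *		struct drm_device *ddev = dev_get_drvdata(dev);
 *		struct amdgpu_device *adev = drm_to_adev(ddev);
 *
 *		return amdgpu_show_reset_mask(buf, adev->gfx.gfx_supported_reset);
 *	}
 */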