1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 29 #include <linux/aperture.h> 30 #include <linux/power_supply.h> 31 #include <linux/kthread.h> 32 #include <linux/module.h> 33 #include <linux/console.h> 34 #include <linux/slab.h> 35 #include <linux/iommu.h> 36 #include <linux/pci.h> 37 #include <linux/pci-p2pdma.h> 38 #include <linux/apple-gmux.h> 39 40 #include <drm/drm_atomic_helper.h> 41 #include <drm/drm_client_event.h> 42 #include <drm/drm_crtc_helper.h> 43 #include <drm/drm_probe_helper.h> 44 #include <drm/amdgpu_drm.h> 45 #include <linux/device.h> 46 #include <linux/vgaarb.h> 47 #include <linux/vga_switcheroo.h> 48 #include <linux/efi.h> 49 #include "amdgpu.h" 50 #include "amdgpu_trace.h" 51 #include "amdgpu_i2c.h" 52 #include "atom.h" 53 #include "amdgpu_atombios.h" 54 #include "amdgpu_atomfirmware.h" 55 #include "amd_pcie.h" 56 #ifdef CONFIG_DRM_AMDGPU_SI 57 #include "si.h" 58 #endif 59 #ifdef CONFIG_DRM_AMDGPU_CIK 60 #include "cik.h" 61 #endif 62 #include "vi.h" 63 #include "soc15.h" 64 #include "nv.h" 65 #include "bif/bif_4_1_d.h" 66 #include <linux/firmware.h> 67 #include "amdgpu_vf_error.h" 68 69 #include "amdgpu_amdkfd.h" 70 #include "amdgpu_pm.h" 71 72 #include "amdgpu_xgmi.h" 73 #include "amdgpu_ras.h" 74 #include "amdgpu_pmu.h" 75 #include "amdgpu_fru_eeprom.h" 76 #include "amdgpu_reset.h" 77 #include "amdgpu_virt.h" 78 #include "amdgpu_dev_coredump.h" 79 80 #include <linux/suspend.h> 81 #include <drm/task_barrier.h> 82 #include <linux/pm_runtime.h> 83 84 #include <drm/drm_drv.h> 85 86 #if IS_ENABLED(CONFIG_X86) 87 #include <asm/intel-family.h> 88 #endif 89 90 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 91 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 92 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 93 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 94 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 95 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 96 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 97 98 #define AMDGPU_RESUME_MS 2000 99 #define AMDGPU_MAX_RETRY_LIMIT 2 100 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 101 #define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2) 102 #define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2) 103 #define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2) 104 105 #define 
AMDGPU_VBIOS_SKIP (1U << 0) 106 #define AMDGPU_VBIOS_OPTIONAL (1U << 1) 107 108 static const struct drm_driver amdgpu_kms_driver; 109 110 const char *amdgpu_asic_name[] = { 111 "TAHITI", 112 "PITCAIRN", 113 "VERDE", 114 "OLAND", 115 "HAINAN", 116 "BONAIRE", 117 "KAVERI", 118 "KABINI", 119 "HAWAII", 120 "MULLINS", 121 "TOPAZ", 122 "TONGA", 123 "FIJI", 124 "CARRIZO", 125 "STONEY", 126 "POLARIS10", 127 "POLARIS11", 128 "POLARIS12", 129 "VEGAM", 130 "VEGA10", 131 "VEGA12", 132 "VEGA20", 133 "RAVEN", 134 "ARCTURUS", 135 "RENOIR", 136 "ALDEBARAN", 137 "NAVI10", 138 "CYAN_SKILLFISH", 139 "NAVI14", 140 "NAVI12", 141 "SIENNA_CICHLID", 142 "NAVY_FLOUNDER", 143 "VANGOGH", 144 "DIMGREY_CAVEFISH", 145 "BEIGE_GOBY", 146 "YELLOW_CARP", 147 "IP DISCOVERY", 148 "LAST", 149 }; 150 151 #define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0) 152 /* 153 * Default init level where all blocks are expected to be initialized. This is 154 * the level of initialization expected by default and also after a full reset 155 * of the device. 156 */ 157 struct amdgpu_init_level amdgpu_init_default = { 158 .level = AMDGPU_INIT_LEVEL_DEFAULT, 159 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL, 160 }; 161 162 struct amdgpu_init_level amdgpu_init_recovery = { 163 .level = AMDGPU_INIT_LEVEL_RESET_RECOVERY, 164 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL, 165 }; 166 167 /* 168 * Minimal blocks needed to be initialized before a XGMI hive can be reset. This 169 * is used for cases like reset on initialization where the entire hive needs to 170 * be reset before first use. 171 */ 172 struct amdgpu_init_level amdgpu_init_minimal_xgmi = { 173 .level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI, 174 .hwini_ip_block_mask = 175 BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) | 176 BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) | 177 BIT(AMD_IP_BLOCK_TYPE_PSP) 178 }; 179 180 static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev, 181 enum amd_ip_block_type block) 182 { 183 return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0; 184 } 185 186 void amdgpu_set_init_level(struct amdgpu_device *adev, 187 enum amdgpu_init_lvl_id lvl) 188 { 189 switch (lvl) { 190 case AMDGPU_INIT_LEVEL_MINIMAL_XGMI: 191 adev->init_lvl = &amdgpu_init_minimal_xgmi; 192 break; 193 case AMDGPU_INIT_LEVEL_RESET_RECOVERY: 194 adev->init_lvl = &amdgpu_init_recovery; 195 break; 196 case AMDGPU_INIT_LEVEL_DEFAULT: 197 fallthrough; 198 default: 199 adev->init_lvl = &amdgpu_init_default; 200 break; 201 } 202 } 203 204 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev); 205 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 206 void *data); 207 208 /** 209 * DOC: pcie_replay_count 210 * 211 * The amdgpu driver provides a sysfs API for reporting the total number 212 * of PCIe replays (NAKs). 213 * The file pcie_replay_count is used for this and returns the total 214 * number of replays as a sum of the NAKs generated and NAKs received. 
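 *
 * For example, userspace can typically read the count through the DRM
 * device's sysfs directory (the exact path depends on the card index):
 *
 *   /sys/class/drm/card0/device/pcie_replay_count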
215 */ 216 217 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 218 struct device_attribute *attr, char *buf) 219 { 220 struct drm_device *ddev = dev_get_drvdata(dev); 221 struct amdgpu_device *adev = drm_to_adev(ddev); 222 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 223 224 return sysfs_emit(buf, "%llu\n", cnt); 225 } 226 227 static DEVICE_ATTR(pcie_replay_count, 0444, 228 amdgpu_device_get_pcie_replay_count, NULL); 229 230 static int amdgpu_device_attr_sysfs_init(struct amdgpu_device *adev) 231 { 232 int ret = 0; 233 234 if (!amdgpu_sriov_vf(adev)) 235 ret = sysfs_create_file(&adev->dev->kobj, 236 &dev_attr_pcie_replay_count.attr); 237 238 return ret; 239 } 240 241 static void amdgpu_device_attr_sysfs_fini(struct amdgpu_device *adev) 242 { 243 if (!amdgpu_sriov_vf(adev)) 244 sysfs_remove_file(&adev->dev->kobj, 245 &dev_attr_pcie_replay_count.attr); 246 } 247 248 static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj, 249 struct bin_attribute *attr, char *buf, 250 loff_t ppos, size_t count) 251 { 252 struct device *dev = kobj_to_dev(kobj); 253 struct drm_device *ddev = dev_get_drvdata(dev); 254 struct amdgpu_device *adev = drm_to_adev(ddev); 255 ssize_t bytes_read; 256 257 switch (ppos) { 258 case AMDGPU_SYS_REG_STATE_XGMI: 259 bytes_read = amdgpu_asic_get_reg_state( 260 adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count); 261 break; 262 case AMDGPU_SYS_REG_STATE_WAFL: 263 bytes_read = amdgpu_asic_get_reg_state( 264 adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count); 265 break; 266 case AMDGPU_SYS_REG_STATE_PCIE: 267 bytes_read = amdgpu_asic_get_reg_state( 268 adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count); 269 break; 270 case AMDGPU_SYS_REG_STATE_USR: 271 bytes_read = amdgpu_asic_get_reg_state( 272 adev, AMDGPU_REG_STATE_TYPE_USR, buf, count); 273 break; 274 case AMDGPU_SYS_REG_STATE_USR_1: 275 bytes_read = amdgpu_asic_get_reg_state( 276 adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count); 277 break; 278 default: 279 return -EINVAL; 280 } 281 282 return bytes_read; 283 } 284 285 BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL, 286 AMDGPU_SYS_REG_STATE_END); 287 288 int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev) 289 { 290 int ret; 291 292 if (!amdgpu_asic_get_reg_state_supported(adev)) 293 return 0; 294 295 ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 296 297 return ret; 298 } 299 300 void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev) 301 { 302 if (!amdgpu_asic_get_reg_state_supported(adev)) 303 return; 304 sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 305 } 306 307 int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block) 308 { 309 int r; 310 311 if (ip_block->version->funcs->suspend) { 312 r = ip_block->version->funcs->suspend(ip_block); 313 if (r) { 314 dev_err(ip_block->adev->dev, 315 "suspend of IP block <%s> failed %d\n", 316 ip_block->version->funcs->name, r); 317 return r; 318 } 319 } 320 321 ip_block->status.hw = false; 322 return 0; 323 } 324 325 int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block) 326 { 327 int r; 328 329 if (ip_block->version->funcs->resume) { 330 r = ip_block->version->funcs->resume(ip_block); 331 if (r) { 332 dev_err(ip_block->adev->dev, 333 "resume of IP block <%s> failed %d\n", 334 ip_block->version->funcs->name, r); 335 return r; 336 } 337 } 338 339 ip_block->status.hw = true; 340 return 0; 341 } 342 343 /** 344 * DOC: board_info 345 * 346 * The amdgpu driver provides a sysfs API for giving board related information. 
347 * It provides the form factor information in the format 348 * 349 * type : form factor 350 * 351 * Possible form factor values 352 * 353 * - "cem" - PCIE CEM card 354 * - "oam" - Open Compute Accelerator Module 355 * - "unknown" - Not known 356 * 357 */ 358 359 static ssize_t amdgpu_device_get_board_info(struct device *dev, 360 struct device_attribute *attr, 361 char *buf) 362 { 363 struct drm_device *ddev = dev_get_drvdata(dev); 364 struct amdgpu_device *adev = drm_to_adev(ddev); 365 enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM; 366 const char *pkg; 367 368 if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type) 369 pkg_type = adev->smuio.funcs->get_pkg_type(adev); 370 371 switch (pkg_type) { 372 case AMDGPU_PKG_TYPE_CEM: 373 pkg = "cem"; 374 break; 375 case AMDGPU_PKG_TYPE_OAM: 376 pkg = "oam"; 377 break; 378 default: 379 pkg = "unknown"; 380 break; 381 } 382 383 return sysfs_emit(buf, "%s : %s\n", "type", pkg); 384 } 385 386 static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL); 387 388 static struct attribute *amdgpu_board_attrs[] = { 389 &dev_attr_board_info.attr, 390 NULL, 391 }; 392 393 static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj, 394 struct attribute *attr, int n) 395 { 396 struct device *dev = kobj_to_dev(kobj); 397 struct drm_device *ddev = dev_get_drvdata(dev); 398 struct amdgpu_device *adev = drm_to_adev(ddev); 399 400 if (adev->flags & AMD_IS_APU) 401 return 0; 402 403 return attr->mode; 404 } 405 406 static const struct attribute_group amdgpu_board_attrs_group = { 407 .attrs = amdgpu_board_attrs, 408 .is_visible = amdgpu_board_attrs_is_visible 409 }; 410 411 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 412 413 414 /** 415 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 416 * 417 * @dev: drm_device pointer 418 * 419 * Returns true if the device is a dGPU with ATPX power control, 420 * otherwise return false. 421 */ 422 bool amdgpu_device_supports_px(struct drm_device *dev) 423 { 424 struct amdgpu_device *adev = drm_to_adev(dev); 425 426 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 427 return true; 428 return false; 429 } 430 431 /** 432 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 433 * 434 * @dev: drm_device pointer 435 * 436 * Returns true if the device is a dGPU with ACPI power control, 437 * otherwise return false. 438 */ 439 bool amdgpu_device_supports_boco(struct drm_device *dev) 440 { 441 struct amdgpu_device *adev = drm_to_adev(dev); 442 443 if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE)) 444 return false; 445 446 if (adev->has_pr3 || 447 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 448 return true; 449 return false; 450 } 451 452 /** 453 * amdgpu_device_supports_baco - Does the device support BACO 454 * 455 * @dev: drm_device pointer 456 * 457 * Return: 458 * 1 if the device supports BACO; 459 * 3 if the device supports MACO (only works if BACO is supported) 460 * otherwise return 0. 
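 *
 * Callers normally test the returned bits rather than compare exact values;
 * an illustrative sketch mirroring amdgpu_device_detect_runtime_pm_mode():
 *
 *   bamaco_support = amdgpu_device_supports_baco(dev);
 *   if (bamaco_support & MACO_SUPPORT)
 *           adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
 *   else if (bamaco_support & BACO_SUPPORT)
 *           adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;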
 */
int amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
{
	struct drm_device *dev;
	int bamaco_support;

	dev = adev_to_drm(adev);

	adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
	bamaco_support = amdgpu_device_supports_baco(dev);

	switch (amdgpu_runtime_pm) {
	case 2:
		if (bamaco_support & MACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
			dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
		} else if (bamaco_support == BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Requested mode BAMACO not available, falling back to BACO\n");
		}
		break;
	case 1:
		if (bamaco_support & BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Forcing BACO for runtime pm\n");
		}
		break;
	case -1:
	case -2:
		if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
			dev_info(adev->dev, "Using ATPX for runtime pm\n");
		} else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
			dev_info(adev->dev, "Using BOCO for runtime pm\n");
		} else {
			if (!bamaco_support)
				goto no_runtime_pm;

			switch (adev->asic_type) {
			case CHIP_VEGA20:
			case CHIP_ARCTURUS:
				/* BACO is not supported on vega20 and arcturus */
				break;
			case CHIP_VEGA10:
				/* enable BACO as runpm mode if noretry=0 */
				if (!adev->gmc.noretry)
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			default:
				/* enable BACO as runpm mode on CI+ */
				adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			}

			if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
				if (bamaco_support & MACO_SUPPORT) {
					adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
					dev_info(adev->dev, "Using BAMACO for runtime pm\n");
				} else {
					dev_info(adev->dev, "Using BACO for runtime pm\n");
				}
			}
		}
		break;
	case 0:
		dev_info(adev->dev, "runtime pm is manually disabled\n");
		break;
	default:
		break;
	}

no_runtime_pm:
	if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
		dev_info(adev->dev, "Runtime PM not available\n");
}
/**
 * amdgpu_device_supports_smart_shift - Is the device dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
552 */ 553 bool amdgpu_device_supports_smart_shift(struct drm_device *dev) 554 { 555 return (amdgpu_device_supports_boco(dev) && 556 amdgpu_acpi_is_power_shift_control_supported()); 557 } 558 559 /* 560 * VRAM access helper functions 561 */ 562 563 /** 564 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA 565 * 566 * @adev: amdgpu_device pointer 567 * @pos: offset of the buffer in vram 568 * @buf: virtual address of the buffer in system memory 569 * @size: read/write size, sizeof(@buf) must > @size 570 * @write: true - write to vram, otherwise - read from vram 571 */ 572 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos, 573 void *buf, size_t size, bool write) 574 { 575 unsigned long flags; 576 uint32_t hi = ~0, tmp = 0; 577 uint32_t *data = buf; 578 uint64_t last; 579 int idx; 580 581 if (!drm_dev_enter(adev_to_drm(adev), &idx)) 582 return; 583 584 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4)); 585 586 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 587 for (last = pos + size; pos < last; pos += 4) { 588 tmp = pos >> 31; 589 590 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000); 591 if (tmp != hi) { 592 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp); 593 hi = tmp; 594 } 595 if (write) 596 WREG32_NO_KIQ(mmMM_DATA, *data++); 597 else 598 *data++ = RREG32_NO_KIQ(mmMM_DATA); 599 } 600 601 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 602 drm_dev_exit(idx); 603 } 604 605 /** 606 * amdgpu_device_aper_access - access vram by vram aperture 607 * 608 * @adev: amdgpu_device pointer 609 * @pos: offset of the buffer in vram 610 * @buf: virtual address of the buffer in system memory 611 * @size: read/write size, sizeof(@buf) must > @size 612 * @write: true - write to vram, otherwise - read from vram 613 * 614 * The return value means how many bytes have been transferred. 
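 *
 * A short count typically means the remainder lies outside the CPU-visible
 * aperture (or the aperture is unavailable); callers are expected to fall
 * back to MM access for the rest, as amdgpu_device_vram_access() does:
 *
 *   count = amdgpu_device_aper_access(adev, pos, buf, size, write);
 *   if (count < size)
 *           amdgpu_device_mm_access(adev, pos + count, buf + count,
 *                                   size - count, write);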
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must > @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the VRAM aperture to access VRAM first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM to access the rest of VRAM */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
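 *
 * Most driver code does not call this directly but goes through the
 * RREG32()/WREG32() macro family that wraps these helpers; an illustrative
 * read-modify-write (SOME_FIELD_MASK is a placeholder, not a real define):
 *
 *   tmp = RREG32(reg);
 *   tmp |= SOME_FIELD_MASK;
 *   WREG32(reg, tmp);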
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}


/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
				uint32_t reg, uint32_t acc_flags,
				uint32_t xcc_id)
{
	uint32_t ret, rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	return ret;
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
828 */ 829 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) 830 { 831 if (amdgpu_device_skip_hw_access(adev)) 832 return; 833 834 if (offset < adev->rmmio_size) 835 writeb(value, adev->rmmio + offset); 836 else 837 BUG(); 838 } 839 840 /** 841 * amdgpu_device_wreg - write to a memory mapped IO or indirect register 842 * 843 * @adev: amdgpu_device pointer 844 * @reg: dword aligned register offset 845 * @v: 32 bit value to write to the register 846 * @acc_flags: access flags which require special behavior 847 * 848 * Writes the value specified to the offset specified. 849 */ 850 void amdgpu_device_wreg(struct amdgpu_device *adev, 851 uint32_t reg, uint32_t v, 852 uint32_t acc_flags) 853 { 854 if (amdgpu_device_skip_hw_access(adev)) 855 return; 856 857 if ((reg * 4) < adev->rmmio_size) { 858 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 859 amdgpu_sriov_runtime(adev) && 860 down_read_trylock(&adev->reset_domain->sem)) { 861 amdgpu_kiq_wreg(adev, reg, v, 0); 862 up_read(&adev->reset_domain->sem); 863 } else { 864 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 865 } 866 } else { 867 adev->pcie_wreg(adev, reg * 4, v); 868 } 869 870 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 871 } 872 873 /** 874 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 875 * 876 * @adev: amdgpu_device pointer 877 * @reg: mmio/rlc register 878 * @v: value to write 879 * @xcc_id: xcc accelerated compute core id 880 * 881 * this function is invoked only for the debugfs register access 882 */ 883 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 884 uint32_t reg, uint32_t v, 885 uint32_t xcc_id) 886 { 887 if (amdgpu_device_skip_hw_access(adev)) 888 return; 889 890 if (amdgpu_sriov_fullaccess(adev) && 891 adev->gfx.rlc.funcs && 892 adev->gfx.rlc.funcs->is_rlcg_access_range) { 893 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 894 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id); 895 } else if ((reg * 4) >= adev->rmmio_size) { 896 adev->pcie_wreg(adev, reg * 4, v); 897 } else { 898 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 899 } 900 } 901 902 /** 903 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC 904 * 905 * @adev: amdgpu_device pointer 906 * @reg: dword aligned register offset 907 * @v: 32 bit value to write to the register 908 * @acc_flags: access flags which require special behavior 909 * @xcc_id: xcc accelerated compute core id 910 * 911 * Writes the value specified to the offset specified. 
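 *
 * Compared to amdgpu_device_wreg(), this variant targets a specific XCC
 * instance and, under SR-IOV, may route the access through the RLCG or KIQ
 * paths instead of plain MMIO. Illustrative call:
 *
 *   amdgpu_device_xcc_wreg(adev, reg, val, 0, xcc_id);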
912 */ 913 void amdgpu_device_xcc_wreg(struct amdgpu_device *adev, 914 uint32_t reg, uint32_t v, 915 uint32_t acc_flags, uint32_t xcc_id) 916 { 917 uint32_t rlcg_flag; 918 919 if (amdgpu_device_skip_hw_access(adev)) 920 return; 921 922 if ((reg * 4) < adev->rmmio_size) { 923 if (amdgpu_sriov_vf(adev) && 924 !amdgpu_sriov_runtime(adev) && 925 adev->gfx.rlc.rlcg_reg_access_supported && 926 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, 927 GC_HWIP, true, 928 &rlcg_flag)) { 929 amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id)); 930 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 931 amdgpu_sriov_runtime(adev) && 932 down_read_trylock(&adev->reset_domain->sem)) { 933 amdgpu_kiq_wreg(adev, reg, v, xcc_id); 934 up_read(&adev->reset_domain->sem); 935 } else { 936 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 937 } 938 } else { 939 adev->pcie_wreg(adev, reg * 4, v); 940 } 941 } 942 943 /** 944 * amdgpu_device_indirect_rreg - read an indirect register 945 * 946 * @adev: amdgpu_device pointer 947 * @reg_addr: indirect register address to read from 948 * 949 * Returns the value of indirect register @reg_addr 950 */ 951 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 952 u32 reg_addr) 953 { 954 unsigned long flags, pcie_index, pcie_data; 955 void __iomem *pcie_index_offset; 956 void __iomem *pcie_data_offset; 957 u32 r; 958 959 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 960 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 961 962 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 963 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 964 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 965 966 writel(reg_addr, pcie_index_offset); 967 readl(pcie_index_offset); 968 r = readl(pcie_data_offset); 969 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 970 971 return r; 972 } 973 974 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev, 975 u64 reg_addr) 976 { 977 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 978 u32 r; 979 void __iomem *pcie_index_offset; 980 void __iomem *pcie_index_hi_offset; 981 void __iomem *pcie_data_offset; 982 983 if (unlikely(!adev->nbio.funcs)) { 984 pcie_index = AMDGPU_PCIE_INDEX_FALLBACK; 985 pcie_data = AMDGPU_PCIE_DATA_FALLBACK; 986 } else { 987 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 988 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 989 } 990 991 if (reg_addr >> 32) { 992 if (unlikely(!adev->nbio.funcs)) 993 pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK; 994 else 995 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 996 } else { 997 pcie_index_hi = 0; 998 } 999 1000 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1001 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1002 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1003 if (pcie_index_hi != 0) 1004 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1005 pcie_index_hi * 4; 1006 1007 writel(reg_addr, pcie_index_offset); 1008 readl(pcie_index_offset); 1009 if (pcie_index_hi != 0) { 1010 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1011 readl(pcie_index_hi_offset); 1012 } 1013 r = readl(pcie_data_offset); 1014 1015 /* clear the high bits */ 1016 if (pcie_index_hi != 0) { 1017 writel(0, pcie_index_hi_offset); 1018 readl(pcie_index_hi_offset); 1019 } 1020 1021 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1022 1023 return r; 1024 } 1025 1026 /** 1027 * amdgpu_device_indirect_rreg64 - read a 64bits indirect 
register 1028 * 1029 * @adev: amdgpu_device pointer 1030 * @reg_addr: indirect register address to read from 1031 * 1032 * Returns the value of indirect register @reg_addr 1033 */ 1034 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 1035 u32 reg_addr) 1036 { 1037 unsigned long flags, pcie_index, pcie_data; 1038 void __iomem *pcie_index_offset; 1039 void __iomem *pcie_data_offset; 1040 u64 r; 1041 1042 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1043 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1044 1045 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1046 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1047 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1048 1049 /* read low 32 bits */ 1050 writel(reg_addr, pcie_index_offset); 1051 readl(pcie_index_offset); 1052 r = readl(pcie_data_offset); 1053 /* read high 32 bits */ 1054 writel(reg_addr + 4, pcie_index_offset); 1055 readl(pcie_index_offset); 1056 r |= ((u64)readl(pcie_data_offset) << 32); 1057 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1058 1059 return r; 1060 } 1061 1062 u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev, 1063 u64 reg_addr) 1064 { 1065 unsigned long flags, pcie_index, pcie_data; 1066 unsigned long pcie_index_hi = 0; 1067 void __iomem *pcie_index_offset; 1068 void __iomem *pcie_index_hi_offset; 1069 void __iomem *pcie_data_offset; 1070 u64 r; 1071 1072 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1073 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1074 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1075 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1076 1077 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1078 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1079 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1080 if (pcie_index_hi != 0) 1081 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1082 pcie_index_hi * 4; 1083 1084 /* read low 32 bits */ 1085 writel(reg_addr, pcie_index_offset); 1086 readl(pcie_index_offset); 1087 if (pcie_index_hi != 0) { 1088 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1089 readl(pcie_index_hi_offset); 1090 } 1091 r = readl(pcie_data_offset); 1092 /* read high 32 bits */ 1093 writel(reg_addr + 4, pcie_index_offset); 1094 readl(pcie_index_offset); 1095 if (pcie_index_hi != 0) { 1096 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1097 readl(pcie_index_hi_offset); 1098 } 1099 r |= ((u64)readl(pcie_data_offset) << 32); 1100 1101 /* clear the high bits */ 1102 if (pcie_index_hi != 0) { 1103 writel(0, pcie_index_hi_offset); 1104 readl(pcie_index_hi_offset); 1105 } 1106 1107 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1108 1109 return r; 1110 } 1111 1112 /** 1113 * amdgpu_device_indirect_wreg - write an indirect register address 1114 * 1115 * @adev: amdgpu_device pointer 1116 * @reg_addr: indirect register offset 1117 * @reg_data: indirect register data 1118 * 1119 */ 1120 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 1121 u32 reg_addr, u32 reg_data) 1122 { 1123 unsigned long flags, pcie_index, pcie_data; 1124 void __iomem *pcie_index_offset; 1125 void __iomem *pcie_data_offset; 1126 1127 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1128 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1129 1130 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1131 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1132 
pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1133 1134 writel(reg_addr, pcie_index_offset); 1135 readl(pcie_index_offset); 1136 writel(reg_data, pcie_data_offset); 1137 readl(pcie_data_offset); 1138 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1139 } 1140 1141 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev, 1142 u64 reg_addr, u32 reg_data) 1143 { 1144 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 1145 void __iomem *pcie_index_offset; 1146 void __iomem *pcie_index_hi_offset; 1147 void __iomem *pcie_data_offset; 1148 1149 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1150 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1151 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1152 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1153 else 1154 pcie_index_hi = 0; 1155 1156 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1157 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1158 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1159 if (pcie_index_hi != 0) 1160 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1161 pcie_index_hi * 4; 1162 1163 writel(reg_addr, pcie_index_offset); 1164 readl(pcie_index_offset); 1165 if (pcie_index_hi != 0) { 1166 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1167 readl(pcie_index_hi_offset); 1168 } 1169 writel(reg_data, pcie_data_offset); 1170 readl(pcie_data_offset); 1171 1172 /* clear the high bits */ 1173 if (pcie_index_hi != 0) { 1174 writel(0, pcie_index_hi_offset); 1175 readl(pcie_index_hi_offset); 1176 } 1177 1178 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1179 } 1180 1181 /** 1182 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 1183 * 1184 * @adev: amdgpu_device pointer 1185 * @reg_addr: indirect register offset 1186 * @reg_data: indirect register data 1187 * 1188 */ 1189 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 1190 u32 reg_addr, u64 reg_data) 1191 { 1192 unsigned long flags, pcie_index, pcie_data; 1193 void __iomem *pcie_index_offset; 1194 void __iomem *pcie_data_offset; 1195 1196 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1197 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1198 1199 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1200 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1201 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1202 1203 /* write low 32 bits */ 1204 writel(reg_addr, pcie_index_offset); 1205 readl(pcie_index_offset); 1206 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1207 readl(pcie_data_offset); 1208 /* write high 32 bits */ 1209 writel(reg_addr + 4, pcie_index_offset); 1210 readl(pcie_index_offset); 1211 writel((u32)(reg_data >> 32), pcie_data_offset); 1212 readl(pcie_data_offset); 1213 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1214 } 1215 1216 void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev, 1217 u64 reg_addr, u64 reg_data) 1218 { 1219 unsigned long flags, pcie_index, pcie_data; 1220 unsigned long pcie_index_hi = 0; 1221 void __iomem *pcie_index_offset; 1222 void __iomem *pcie_index_hi_offset; 1223 void __iomem *pcie_data_offset; 1224 1225 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1226 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1227 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1228 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1229 
1230 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1231 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1232 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1233 if (pcie_index_hi != 0) 1234 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1235 pcie_index_hi * 4; 1236 1237 /* write low 32 bits */ 1238 writel(reg_addr, pcie_index_offset); 1239 readl(pcie_index_offset); 1240 if (pcie_index_hi != 0) { 1241 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1242 readl(pcie_index_hi_offset); 1243 } 1244 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1245 readl(pcie_data_offset); 1246 /* write high 32 bits */ 1247 writel(reg_addr + 4, pcie_index_offset); 1248 readl(pcie_index_offset); 1249 if (pcie_index_hi != 0) { 1250 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1251 readl(pcie_index_hi_offset); 1252 } 1253 writel((u32)(reg_data >> 32), pcie_data_offset); 1254 readl(pcie_data_offset); 1255 1256 /* clear the high bits */ 1257 if (pcie_index_hi != 0) { 1258 writel(0, pcie_index_hi_offset); 1259 readl(pcie_index_hi_offset); 1260 } 1261 1262 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1263 } 1264 1265 /** 1266 * amdgpu_device_get_rev_id - query device rev_id 1267 * 1268 * @adev: amdgpu_device pointer 1269 * 1270 * Return device rev_id 1271 */ 1272 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev) 1273 { 1274 return adev->nbio.funcs->get_rev_id(adev); 1275 } 1276 1277 /** 1278 * amdgpu_invalid_rreg - dummy reg read function 1279 * 1280 * @adev: amdgpu_device pointer 1281 * @reg: offset of register 1282 * 1283 * Dummy register read function. Used for register blocks 1284 * that certain asics don't have (all asics). 1285 * Returns the value in the register. 1286 */ 1287 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 1288 { 1289 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 1290 BUG(); 1291 return 0; 1292 } 1293 1294 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg) 1295 { 1296 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 1297 BUG(); 1298 return 0; 1299 } 1300 1301 /** 1302 * amdgpu_invalid_wreg - dummy reg write function 1303 * 1304 * @adev: amdgpu_device pointer 1305 * @reg: offset of register 1306 * @v: value to write to the register 1307 * 1308 * Dummy register read function. Used for register blocks 1309 * that certain asics don't have (all asics). 1310 */ 1311 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 1312 { 1313 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 1314 reg, v); 1315 BUG(); 1316 } 1317 1318 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v) 1319 { 1320 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n", 1321 reg, v); 1322 BUG(); 1323 } 1324 1325 /** 1326 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 1327 * 1328 * @adev: amdgpu_device pointer 1329 * @reg: offset of register 1330 * 1331 * Dummy register read function. Used for register blocks 1332 * that certain asics don't have (all asics). 1333 * Returns the value in the register. 
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

static uint32_t amdgpu_device_get_vbios_flags(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
		return AMDGPU_VBIOS_SKIP;

	if (hweight32(adev->aid_mask) && amdgpu_passthrough(adev))
		return AMDGPU_VBIOS_OPTIONAL;

	return 0;
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	uint32_t flags;
	bool optional;
	int ret;

	amdgpu_asic_pre_asic_init(adev);
	flags = amdgpu_device_get_vbios_flags(adev);
	optional = !!(flags & (AMDGPU_VBIOS_OPTIONAL | AMDGPU_VBIOS_SKIP));

	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
		amdgpu_psp_wait_for_bootloader(adev);
		if (optional && !adev->bios)
			return 0;

		ret = amdgpu_atomfirmware_asic_init(adev, true);
		return ret;
	} else {
		if (optional && !adev->bios)
			return 0;

		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
	}

	return 0;
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND/OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
1549 */ 1550 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 1551 { 1552 return pci_reset_function(adev->pdev); 1553 } 1554 1555 /* 1556 * amdgpu_device_wb_*() 1557 * Writeback is the method by which the GPU updates special pages in memory 1558 * with the status of certain GPU events (fences, ring pointers,etc.). 1559 */ 1560 1561 /** 1562 * amdgpu_device_wb_fini - Disable Writeback and free memory 1563 * 1564 * @adev: amdgpu_device pointer 1565 * 1566 * Disables Writeback and frees the Writeback memory (all asics). 1567 * Used at driver shutdown. 1568 */ 1569 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1570 { 1571 if (adev->wb.wb_obj) { 1572 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1573 &adev->wb.gpu_addr, 1574 (void **)&adev->wb.wb); 1575 adev->wb.wb_obj = NULL; 1576 } 1577 } 1578 1579 /** 1580 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory 1581 * 1582 * @adev: amdgpu_device pointer 1583 * 1584 * Initializes writeback and allocates writeback memory (all asics). 1585 * Used at driver startup. 1586 * Returns 0 on success or an -error on failure. 1587 */ 1588 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1589 { 1590 int r; 1591 1592 if (adev->wb.wb_obj == NULL) { 1593 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1594 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1595 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1596 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1597 (void **)&adev->wb.wb); 1598 if (r) { 1599 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1600 return r; 1601 } 1602 1603 adev->wb.num_wb = AMDGPU_MAX_WB; 1604 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1605 1606 /* clear wb memory */ 1607 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1608 } 1609 1610 return 0; 1611 } 1612 1613 /** 1614 * amdgpu_device_wb_get - Allocate a wb entry 1615 * 1616 * @adev: amdgpu_device pointer 1617 * @wb: wb index 1618 * 1619 * Allocate a wb slot for use by the driver (all asics). 1620 * Returns 0 on success or -EINVAL on failure. 1621 */ 1622 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1623 { 1624 unsigned long flags, offset; 1625 1626 spin_lock_irqsave(&adev->wb.lock, flags); 1627 offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1628 if (offset < adev->wb.num_wb) { 1629 __set_bit(offset, adev->wb.used); 1630 spin_unlock_irqrestore(&adev->wb.lock, flags); 1631 *wb = offset << 3; /* convert to dw offset */ 1632 return 0; 1633 } else { 1634 spin_unlock_irqrestore(&adev->wb.lock, flags); 1635 return -EINVAL; 1636 } 1637 } 1638 1639 /** 1640 * amdgpu_device_wb_free - Free a wb entry 1641 * 1642 * @adev: amdgpu_device pointer 1643 * @wb: wb index 1644 * 1645 * Free a wb slot allocated for use by the driver (all asics) 1646 */ 1647 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1648 { 1649 unsigned long flags; 1650 1651 wb >>= 3; 1652 spin_lock_irqsave(&adev->wb.lock, flags); 1653 if (wb < adev->wb.num_wb) 1654 __clear_bit(wb, adev->wb.used); 1655 spin_unlock_irqrestore(&adev->wb.lock, flags); 1656 } 1657 1658 /** 1659 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1660 * 1661 * @adev: amdgpu_device pointer 1662 * 1663 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1664 * to fail, but if any of the BARs is not accessible after the size we abort 1665 * driver loading by returning -ENODEV. 
1666 */ 1667 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1668 { 1669 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1670 struct pci_bus *root; 1671 struct resource *res; 1672 unsigned int i; 1673 u16 cmd; 1674 int r; 1675 1676 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) 1677 return 0; 1678 1679 /* Bypass for VF */ 1680 if (amdgpu_sriov_vf(adev)) 1681 return 0; 1682 1683 if (!amdgpu_rebar) 1684 return 0; 1685 1686 /* resizing on Dell G5 SE platforms causes problems with runtime pm */ 1687 if ((amdgpu_runtime_pm != 0) && 1688 adev->pdev->vendor == PCI_VENDOR_ID_ATI && 1689 adev->pdev->device == 0x731f && 1690 adev->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL) 1691 return 0; 1692 1693 /* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */ 1694 if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR)) 1695 DRM_WARN("System can't access extended configuration space, please check!!\n"); 1696 1697 /* skip if the bios has already enabled large BAR */ 1698 if (adev->gmc.real_vram_size && 1699 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1700 return 0; 1701 1702 /* Check if the root BUS has 64bit memory resources */ 1703 root = adev->pdev->bus; 1704 while (root->parent) 1705 root = root->parent; 1706 1707 pci_bus_for_each_resource(root, res, i) { 1708 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1709 res->start > 0x100000000ull) 1710 break; 1711 } 1712 1713 /* Trying to resize is pointless without a root hub window above 4GB */ 1714 if (!res) 1715 return 0; 1716 1717 /* Limit the BAR size to what is available */ 1718 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1719 rbar_size); 1720 1721 /* Disable memory decoding while we change the BAR addresses and size */ 1722 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1723 pci_write_config_word(adev->pdev, PCI_COMMAND, 1724 cmd & ~PCI_COMMAND_MEMORY); 1725 1726 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1727 amdgpu_doorbell_fini(adev); 1728 if (adev->asic_type >= CHIP_BONAIRE) 1729 pci_release_resource(adev->pdev, 2); 1730 1731 pci_release_resource(adev->pdev, 0); 1732 1733 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1734 if (r == -ENOSPC) 1735 DRM_INFO("Not enough PCI address space for a large BAR."); 1736 else if (r && r != -ENOTSUPP) 1737 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1738 1739 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1740 1741 /* When the doorbell or fb BAR isn't available we have no chance of 1742 * using the device. 1743 */ 1744 r = amdgpu_doorbell_init(adev); 1745 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1746 return -ENODEV; 1747 1748 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1749 1750 return 0; 1751 } 1752 1753 /* 1754 * GPU helpers function. 1755 */ 1756 /** 1757 * amdgpu_device_need_post - check if the hw need post or not 1758 * 1759 * @adev: amdgpu_device pointer 1760 * 1761 * Check if the asic has been initialized (all asics) at driver startup 1762 * or post is needed if hw reset is performed. 1763 * Returns true if need or false if not. 
1764 */ 1765 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1766 { 1767 uint32_t reg, flags; 1768 1769 if (amdgpu_sriov_vf(adev)) 1770 return false; 1771 1772 flags = amdgpu_device_get_vbios_flags(adev); 1773 if (flags & AMDGPU_VBIOS_SKIP) 1774 return false; 1775 if ((flags & AMDGPU_VBIOS_OPTIONAL) && !adev->bios) 1776 return false; 1777 1778 if (amdgpu_passthrough(adev)) { 1779 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1780 * some old smc fw still need driver do vPost otherwise gpu hang, while 1781 * those smc fw version above 22.15 doesn't have this flaw, so we force 1782 * vpost executed for smc version below 22.15 1783 */ 1784 if (adev->asic_type == CHIP_FIJI) { 1785 int err; 1786 uint32_t fw_ver; 1787 1788 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1789 /* force vPost if error occurred */ 1790 if (err) 1791 return true; 1792 1793 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1794 release_firmware(adev->pm.fw); 1795 if (fw_ver < 0x00160e00) 1796 return true; 1797 } 1798 } 1799 1800 /* Don't post if we need to reset whole hive on init */ 1801 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 1802 return false; 1803 1804 if (adev->has_hw_reset) { 1805 adev->has_hw_reset = false; 1806 return true; 1807 } 1808 1809 /* bios scratch used on CIK+ */ 1810 if (adev->asic_type >= CHIP_BONAIRE) 1811 return amdgpu_atombios_scratch_need_asic_init(adev); 1812 1813 /* check MEM_SIZE for older asics */ 1814 reg = amdgpu_asic_get_config_memsize(adev); 1815 1816 if ((reg != 0) && (reg != 0xffffffff)) 1817 return false; 1818 1819 return true; 1820 } 1821 1822 /* 1823 * Check whether seamless boot is supported. 1824 * 1825 * So far we only support seamless boot on DCE 3.0 or later. 1826 * If users report that it works on older ASICS as well, we may 1827 * loosen this. 1828 */ 1829 bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev) 1830 { 1831 switch (amdgpu_seamless) { 1832 case -1: 1833 break; 1834 case 1: 1835 return true; 1836 case 0: 1837 return false; 1838 default: 1839 DRM_ERROR("Invalid value for amdgpu.seamless: %d\n", 1840 amdgpu_seamless); 1841 return false; 1842 } 1843 1844 if (!(adev->flags & AMD_IS_APU)) 1845 return false; 1846 1847 if (adev->mman.keep_stolen_vga_memory) 1848 return false; 1849 1850 return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0); 1851 } 1852 1853 /* 1854 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids 1855 * don't support dynamic speed switching. Until we have confirmation from Intel 1856 * that a specific host supports it, it's safer that we keep it disabled for all. 
1857 * 1858 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/ 1859 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663 1860 */ 1861 static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev) 1862 { 1863 #if IS_ENABLED(CONFIG_X86) 1864 struct cpuinfo_x86 *c = &cpu_data(0); 1865 1866 /* eGPU change speeds based on USB4 fabric conditions */ 1867 if (dev_is_removable(adev->dev)) 1868 return true; 1869 1870 if (c->x86_vendor == X86_VENDOR_INTEL) 1871 return false; 1872 #endif 1873 return true; 1874 } 1875 1876 /** 1877 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1878 * 1879 * @adev: amdgpu_device pointer 1880 * 1881 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1882 * be set for this device. 1883 * 1884 * Returns true if it should be used or false if not. 1885 */ 1886 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1887 { 1888 switch (amdgpu_aspm) { 1889 case -1: 1890 break; 1891 case 0: 1892 return false; 1893 case 1: 1894 return true; 1895 default: 1896 return false; 1897 } 1898 if (adev->flags & AMD_IS_APU) 1899 return false; 1900 if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK)) 1901 return false; 1902 return pcie_aspm_enabled(adev->pdev); 1903 } 1904 1905 /* if we get transitioned to only one device, take VGA back */ 1906 /** 1907 * amdgpu_device_vga_set_decode - enable/disable vga decode 1908 * 1909 * @pdev: PCI device pointer 1910 * @state: enable/disable vga decode 1911 * 1912 * Enable/disable vga decode (all asics). 1913 * Returns VGA resource flags. 1914 */ 1915 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1916 bool state) 1917 { 1918 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1919 1920 amdgpu_asic_set_vga_state(adev, state); 1921 if (state) 1922 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1923 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1924 else 1925 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1926 } 1927 1928 /** 1929 * amdgpu_device_check_block_size - validate the vm block size 1930 * 1931 * @adev: amdgpu_device pointer 1932 * 1933 * Validates the vm block size specified via module parameter. 1934 * The vm block size defines number of bits in page table versus page directory, 1935 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1936 * page table and the remaining bits are in the page directory. 1937 */ 1938 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1939 { 1940 /* defines number of bits in page table versus page directory, 1941 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1942 * page table and the remaining bits are in the page directory 1943 */ 1944 if (amdgpu_vm_block_size == -1) 1945 return; 1946 1947 if (amdgpu_vm_block_size < 9) { 1948 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1949 amdgpu_vm_block_size); 1950 amdgpu_vm_block_size = -1; 1951 } 1952 } 1953 1954 /** 1955 * amdgpu_device_check_vm_size - validate the vm size 1956 * 1957 * @adev: amdgpu_device pointer 1958 * 1959 * Validates the vm size in GB specified via module parameter. 1960 * The VM size is the size of the GPU virtual memory space in GB. 
1961 */ 1962 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1963 { 1964 /* no need to check the default value */ 1965 if (amdgpu_vm_size == -1) 1966 return; 1967 1968 if (amdgpu_vm_size < 1) { 1969 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1970 amdgpu_vm_size); 1971 amdgpu_vm_size = -1; 1972 } 1973 } 1974 1975 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1976 { 1977 struct sysinfo si; 1978 bool is_os_64 = (sizeof(void *) == 8); 1979 uint64_t total_memory; 1980 uint64_t dram_size_seven_GB = 0x1B8000000; 1981 uint64_t dram_size_three_GB = 0xB8000000; 1982 1983 if (amdgpu_smu_memory_pool_size == 0) 1984 return; 1985 1986 if (!is_os_64) { 1987 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1988 goto def_value; 1989 } 1990 si_meminfo(&si); 1991 total_memory = (uint64_t)si.totalram * si.mem_unit; 1992 1993 if ((amdgpu_smu_memory_pool_size == 1) || 1994 (amdgpu_smu_memory_pool_size == 2)) { 1995 if (total_memory < dram_size_three_GB) 1996 goto def_value1; 1997 } else if ((amdgpu_smu_memory_pool_size == 4) || 1998 (amdgpu_smu_memory_pool_size == 8)) { 1999 if (total_memory < dram_size_seven_GB) 2000 goto def_value1; 2001 } else { 2002 DRM_WARN("Smu memory pool size not supported\n"); 2003 goto def_value; 2004 } 2005 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 2006 2007 return; 2008 2009 def_value1: 2010 DRM_WARN("No enough system memory\n"); 2011 def_value: 2012 adev->pm.smu_prv_buffer_size = 0; 2013 } 2014 2015 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 2016 { 2017 if (!(adev->flags & AMD_IS_APU) || 2018 adev->asic_type < CHIP_RAVEN) 2019 return 0; 2020 2021 switch (adev->asic_type) { 2022 case CHIP_RAVEN: 2023 if (adev->pdev->device == 0x15dd) 2024 adev->apu_flags |= AMD_APU_IS_RAVEN; 2025 if (adev->pdev->device == 0x15d8) 2026 adev->apu_flags |= AMD_APU_IS_PICASSO; 2027 break; 2028 case CHIP_RENOIR: 2029 if ((adev->pdev->device == 0x1636) || 2030 (adev->pdev->device == 0x164c)) 2031 adev->apu_flags |= AMD_APU_IS_RENOIR; 2032 else 2033 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 2034 break; 2035 case CHIP_VANGOGH: 2036 adev->apu_flags |= AMD_APU_IS_VANGOGH; 2037 break; 2038 case CHIP_YELLOW_CARP: 2039 break; 2040 case CHIP_CYAN_SKILLFISH: 2041 if ((adev->pdev->device == 0x13FE) || 2042 (adev->pdev->device == 0x143F)) 2043 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 2044 break; 2045 default: 2046 break; 2047 } 2048 2049 return 0; 2050 } 2051 2052 /** 2053 * amdgpu_device_check_arguments - validate module params 2054 * 2055 * @adev: amdgpu_device pointer 2056 * 2057 * Validates certain module parameters and updates 2058 * the associated values used by the driver (all asics). 
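 *
 * Illustrative example: a boot parameter such as
 *
 *	amdgpu.sched_jobs=30
 *
 * is not a power of two and gets rounded up to 32 by the checks below,
 * while any value under the minimum of 4 is raised to 4.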
2059 */ 2060 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 2061 { 2062 int i; 2063 2064 if (amdgpu_sched_jobs < 4) { 2065 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 2066 amdgpu_sched_jobs); 2067 amdgpu_sched_jobs = 4; 2068 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 2069 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 2070 amdgpu_sched_jobs); 2071 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 2072 } 2073 2074 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 2075 /* gart size must be greater or equal to 32M */ 2076 dev_warn(adev->dev, "gart size (%d) too small\n", 2077 amdgpu_gart_size); 2078 amdgpu_gart_size = -1; 2079 } 2080 2081 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 2082 /* gtt size must be greater or equal to 32M */ 2083 dev_warn(adev->dev, "gtt size (%d) too small\n", 2084 amdgpu_gtt_size); 2085 amdgpu_gtt_size = -1; 2086 } 2087 2088 /* valid range is between 4 and 9 inclusive */ 2089 if (amdgpu_vm_fragment_size != -1 && 2090 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 2091 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 2092 amdgpu_vm_fragment_size = -1; 2093 } 2094 2095 if (amdgpu_sched_hw_submission < 2) { 2096 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 2097 amdgpu_sched_hw_submission); 2098 amdgpu_sched_hw_submission = 2; 2099 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 2100 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 2101 amdgpu_sched_hw_submission); 2102 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 2103 } 2104 2105 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 2106 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 2107 amdgpu_reset_method = -1; 2108 } 2109 2110 amdgpu_device_check_smu_prv_buffer_size(adev); 2111 2112 amdgpu_device_check_vm_size(adev); 2113 2114 amdgpu_device_check_block_size(adev); 2115 2116 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 2117 2118 for (i = 0; i < MAX_XCP; i++) 2119 adev->enforce_isolation[i] = !!enforce_isolation; 2120 2121 return 0; 2122 } 2123 2124 /** 2125 * amdgpu_switcheroo_set_state - set switcheroo state 2126 * 2127 * @pdev: pci dev pointer 2128 * @state: vga_switcheroo state 2129 * 2130 * Callback for the switcheroo driver. Suspends or resumes 2131 * the asics before or after it is powered up using ACPI methods. 
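 *
 * Illustrative trigger (assuming the standard vga_switcheroo debugfs
 * interface): writing OFF or ON to
 *
 *	/sys/kernel/debug/vgaswitcheroo/switch
 *
 * eventually lands here with @state set to VGA_SWITCHEROO_OFF or
 * VGA_SWITCHEROO_ON respectively.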
2132 */ 2133 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 2134 enum vga_switcheroo_state state) 2135 { 2136 struct drm_device *dev = pci_get_drvdata(pdev); 2137 int r; 2138 2139 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 2140 return; 2141 2142 if (state == VGA_SWITCHEROO_ON) { 2143 pr_info("switched on\n"); 2144 /* don't suspend or resume card normally */ 2145 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2146 2147 pci_set_power_state(pdev, PCI_D0); 2148 amdgpu_device_load_pci_state(pdev); 2149 r = pci_enable_device(pdev); 2150 if (r) 2151 DRM_WARN("pci_enable_device failed (%d)\n", r); 2152 amdgpu_device_resume(dev, true); 2153 2154 dev->switch_power_state = DRM_SWITCH_POWER_ON; 2155 } else { 2156 pr_info("switched off\n"); 2157 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2158 amdgpu_device_prepare(dev); 2159 amdgpu_device_suspend(dev, true); 2160 amdgpu_device_cache_pci_state(pdev); 2161 /* Shut down the device */ 2162 pci_disable_device(pdev); 2163 pci_set_power_state(pdev, PCI_D3cold); 2164 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 2165 } 2166 } 2167 2168 /** 2169 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 2170 * 2171 * @pdev: pci dev pointer 2172 * 2173 * Callback for the switcheroo driver. Check of the switcheroo 2174 * state can be changed. 2175 * Returns true if the state can be changed, false if not. 2176 */ 2177 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 2178 { 2179 struct drm_device *dev = pci_get_drvdata(pdev); 2180 2181 /* 2182 * FIXME: open_count is protected by drm_global_mutex but that would lead to 2183 * locking inversion with the driver load path. And the access here is 2184 * completely racy anyway. So don't bother with locking for now. 2185 */ 2186 return atomic_read(&dev->open_count) == 0; 2187 } 2188 2189 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 2190 .set_gpu_state = amdgpu_switcheroo_set_state, 2191 .reprobe = NULL, 2192 .can_switch = amdgpu_switcheroo_can_switch, 2193 }; 2194 2195 /** 2196 * amdgpu_device_ip_set_clockgating_state - set the CG state 2197 * 2198 * @dev: amdgpu_device pointer 2199 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2200 * @state: clockgating state (gate or ungate) 2201 * 2202 * Sets the requested clockgating state for all instances of 2203 * the hardware IP specified. 2204 * Returns the error code from the last instance. 2205 */ 2206 int amdgpu_device_ip_set_clockgating_state(void *dev, 2207 enum amd_ip_block_type block_type, 2208 enum amd_clockgating_state state) 2209 { 2210 struct amdgpu_device *adev = dev; 2211 int i, r = 0; 2212 2213 for (i = 0; i < adev->num_ip_blocks; i++) { 2214 if (!adev->ip_blocks[i].status.valid) 2215 continue; 2216 if (adev->ip_blocks[i].version->type != block_type) 2217 continue; 2218 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 2219 continue; 2220 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 2221 &adev->ip_blocks[i], state); 2222 if (r) 2223 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 2224 adev->ip_blocks[i].version->funcs->name, r); 2225 } 2226 return r; 2227 } 2228 2229 /** 2230 * amdgpu_device_ip_set_powergating_state - set the PG state 2231 * 2232 * @dev: amdgpu_device pointer 2233 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2234 * @state: powergating state (gate or ungate) 2235 * 2236 * Sets the requested powergating state for all instances of 2237 * the hardware IP specified. 
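 *
 * Illustrative call (sketch only): another IP block gating VCN power
 * would do something like
 *
 *	amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCN,
 *					       AMD_PG_STATE_GATE);
 *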
2238 * Returns the error code from the last instance. 2239 */ 2240 int amdgpu_device_ip_set_powergating_state(void *dev, 2241 enum amd_ip_block_type block_type, 2242 enum amd_powergating_state state) 2243 { 2244 struct amdgpu_device *adev = dev; 2245 int i, r = 0; 2246 2247 for (i = 0; i < adev->num_ip_blocks; i++) { 2248 if (!adev->ip_blocks[i].status.valid) 2249 continue; 2250 if (adev->ip_blocks[i].version->type != block_type) 2251 continue; 2252 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 2253 continue; 2254 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 2255 &adev->ip_blocks[i], state); 2256 if (r) 2257 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 2258 adev->ip_blocks[i].version->funcs->name, r); 2259 } 2260 return r; 2261 } 2262 2263 /** 2264 * amdgpu_device_ip_get_clockgating_state - get the CG state 2265 * 2266 * @adev: amdgpu_device pointer 2267 * @flags: clockgating feature flags 2268 * 2269 * Walks the list of IPs on the device and updates the clockgating 2270 * flags for each IP. 2271 * Updates @flags with the feature flags for each hardware IP where 2272 * clockgating is enabled. 2273 */ 2274 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 2275 u64 *flags) 2276 { 2277 int i; 2278 2279 for (i = 0; i < adev->num_ip_blocks; i++) { 2280 if (!adev->ip_blocks[i].status.valid) 2281 continue; 2282 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 2283 adev->ip_blocks[i].version->funcs->get_clockgating_state( 2284 &adev->ip_blocks[i], flags); 2285 } 2286 } 2287 2288 /** 2289 * amdgpu_device_ip_wait_for_idle - wait for idle 2290 * 2291 * @adev: amdgpu_device pointer 2292 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2293 * 2294 * Waits for the requested hardware IP to be idle. 2295 * Returns 0 for success or a negative error code on failure. 2296 */ 2297 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 2298 enum amd_ip_block_type block_type) 2299 { 2300 int i, r; 2301 2302 for (i = 0; i < adev->num_ip_blocks; i++) { 2303 if (!adev->ip_blocks[i].status.valid) 2304 continue; 2305 if (adev->ip_blocks[i].version->type == block_type) { 2306 if (adev->ip_blocks[i].version->funcs->wait_for_idle) { 2307 r = adev->ip_blocks[i].version->funcs->wait_for_idle( 2308 &adev->ip_blocks[i]); 2309 if (r) 2310 return r; 2311 } 2312 break; 2313 } 2314 } 2315 return 0; 2316 2317 } 2318 2319 /** 2320 * amdgpu_device_ip_is_valid - is the hardware IP enabled 2321 * 2322 * @adev: amdgpu_device pointer 2323 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2324 * 2325 * Check if the hardware IP is enabled or not. 2326 * Returns true if the IP is enabled, false if not. 2327 */ 2328 bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev, 2329 enum amd_ip_block_type block_type) 2330 { 2331 int i; 2332 2333 for (i = 0; i < adev->num_ip_blocks; i++) { 2334 if (adev->ip_blocks[i].version->type == block_type) 2335 return adev->ip_blocks[i].status.valid; 2336 } 2337 return false; 2338 2339 } 2340 2341 /** 2342 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 2343 * 2344 * @adev: amdgpu_device pointer 2345 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 2346 * 2347 * Returns a pointer to the hardware IP block structure 2348 * if it exists for the asic, otherwise NULL.
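 *
 * Illustrative usage, mirroring what the early init path below does:
 *
 *	struct amdgpu_ip_block *ip_block =
 *		amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *
 *	if (ip_block && ip_block->status.valid)
 *		amdgpu_amdkfd_device_probe(adev);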
2349 */ 2350 struct amdgpu_ip_block * 2351 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 2352 enum amd_ip_block_type type) 2353 { 2354 int i; 2355 2356 for (i = 0; i < adev->num_ip_blocks; i++) 2357 if (adev->ip_blocks[i].version->type == type) 2358 return &adev->ip_blocks[i]; 2359 2360 return NULL; 2361 } 2362 2363 /** 2364 * amdgpu_device_ip_block_version_cmp 2365 * 2366 * @adev: amdgpu_device pointer 2367 * @type: enum amd_ip_block_type 2368 * @major: major version 2369 * @minor: minor version 2370 * 2371 * return 0 if equal or greater 2372 * return 1 if smaller or the ip_block doesn't exist 2373 */ 2374 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 2375 enum amd_ip_block_type type, 2376 u32 major, u32 minor) 2377 { 2378 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 2379 2380 if (ip_block && ((ip_block->version->major > major) || 2381 ((ip_block->version->major == major) && 2382 (ip_block->version->minor >= minor)))) 2383 return 0; 2384 2385 return 1; 2386 } 2387 2388 /** 2389 * amdgpu_device_ip_block_add 2390 * 2391 * @adev: amdgpu_device pointer 2392 * @ip_block_version: pointer to the IP to add 2393 * 2394 * Adds the IP block driver information to the collection of IPs 2395 * on the asic. 2396 */ 2397 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 2398 const struct amdgpu_ip_block_version *ip_block_version) 2399 { 2400 if (!ip_block_version) 2401 return -EINVAL; 2402 2403 switch (ip_block_version->type) { 2404 case AMD_IP_BLOCK_TYPE_VCN: 2405 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2406 return 0; 2407 break; 2408 case AMD_IP_BLOCK_TYPE_JPEG: 2409 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2410 return 0; 2411 break; 2412 default: 2413 break; 2414 } 2415 2416 dev_info(adev->dev, "detected ip block number %d <%s>\n", 2417 adev->num_ip_blocks, ip_block_version->funcs->name); 2418 2419 adev->ip_blocks[adev->num_ip_blocks].adev = adev; 2420 2421 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2422 2423 return 0; 2424 } 2425 2426 /** 2427 * amdgpu_device_enable_virtual_display - enable virtual display feature 2428 * 2429 * @adev: amdgpu_device pointer 2430 * 2431 * Enabled the virtual display feature if the user has enabled it via 2432 * the module parameter virtual_display. This feature provides a virtual 2433 * display hardware on headless boards or in virtualized environments. 2434 * This function parses and validates the configuration string specified by 2435 * the user and configures the virtual display configuration (number of 2436 * virtual connectors, crtcs, etc.) specified. 
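 *
 * Illustrative example (PCI address picked arbitrarily): a module option
 * such as
 *
 *	amdgpu.virtual_display=0000:04:00.0,2
 *
 * enables two virtual CRTCs on that device, while "all,1" enables one
 * virtual CRTC on every amdgpu device; the CRTC count is clamped to the
 * 1..6 range by the parser below.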
2437 */ 2438 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2439 { 2440 adev->enable_virtual_display = false; 2441 2442 if (amdgpu_virtual_display) { 2443 const char *pci_address_name = pci_name(adev->pdev); 2444 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2445 2446 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2447 pciaddstr_tmp = pciaddstr; 2448 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2449 pciaddname = strsep(&pciaddname_tmp, ","); 2450 if (!strcmp("all", pciaddname) 2451 || !strcmp(pci_address_name, pciaddname)) { 2452 long num_crtc; 2453 int res = -1; 2454 2455 adev->enable_virtual_display = true; 2456 2457 if (pciaddname_tmp) 2458 res = kstrtol(pciaddname_tmp, 10, 2459 &num_crtc); 2460 2461 if (!res) { 2462 if (num_crtc < 1) 2463 num_crtc = 1; 2464 if (num_crtc > 6) 2465 num_crtc = 6; 2466 adev->mode_info.num_crtc = num_crtc; 2467 } else { 2468 adev->mode_info.num_crtc = 1; 2469 } 2470 break; 2471 } 2472 } 2473 2474 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2475 amdgpu_virtual_display, pci_address_name, 2476 adev->enable_virtual_display, adev->mode_info.num_crtc); 2477 2478 kfree(pciaddstr); 2479 } 2480 } 2481 2482 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2483 { 2484 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2485 adev->mode_info.num_crtc = 1; 2486 adev->enable_virtual_display = true; 2487 DRM_INFO("virtual_display:%d, num_crtc:%d\n", 2488 adev->enable_virtual_display, adev->mode_info.num_crtc); 2489 } 2490 } 2491 2492 /** 2493 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2494 * 2495 * @adev: amdgpu_device pointer 2496 * 2497 * Parses the asic configuration parameters specified in the gpu info 2498 * firmware and makes them available to the driver for use in configuring 2499 * the asic. 2500 * Returns 0 on success, -EINVAL on failure. 2501 */ 2502 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2503 { 2504 const char *chip_name; 2505 int err; 2506 const struct gpu_info_firmware_header_v1_0 *hdr; 2507 2508 adev->firmware.gpu_info_fw = NULL; 2509 2510 if (adev->mman.discovery_bin) 2511 return 0; 2512 2513 switch (adev->asic_type) { 2514 default: 2515 return 0; 2516 case CHIP_VEGA10: 2517 chip_name = "vega10"; 2518 break; 2519 case CHIP_VEGA12: 2520 chip_name = "vega12"; 2521 break; 2522 case CHIP_RAVEN: 2523 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2524 chip_name = "raven2"; 2525 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2526 chip_name = "picasso"; 2527 else 2528 chip_name = "raven"; 2529 break; 2530 case CHIP_ARCTURUS: 2531 chip_name = "arcturus"; 2532 break; 2533 case CHIP_NAVI12: 2534 chip_name = "navi12"; 2535 break; 2536 } 2537 2538 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, 2539 AMDGPU_UCODE_OPTIONAL, 2540 "amdgpu/%s_gpu_info.bin", chip_name); 2541 if (err) { 2542 dev_err(adev->dev, 2543 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n", 2544 chip_name); 2545 goto out; 2546 } 2547 2548 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2549 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2550 2551 switch (hdr->version_major) { 2552 case 1: 2553 { 2554 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2555 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2556 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2557 2558 /* 2559 * Should be dropped when DAL no longer needs it. 
2560 */ 2561 if (adev->asic_type == CHIP_NAVI12) 2562 goto parse_soc_bounding_box; 2563 2564 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2565 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2566 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2567 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2568 adev->gfx.config.max_texture_channel_caches = 2569 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2570 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2571 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2572 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2573 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2574 adev->gfx.config.double_offchip_lds_buf = 2575 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2576 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2577 adev->gfx.cu_info.max_waves_per_simd = 2578 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2579 adev->gfx.cu_info.max_scratch_slots_per_cu = 2580 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2581 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2582 if (hdr->version_minor >= 1) { 2583 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2584 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2585 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2586 adev->gfx.config.num_sc_per_sh = 2587 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2588 adev->gfx.config.num_packer_per_sc = 2589 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2590 } 2591 2592 parse_soc_bounding_box: 2593 /* 2594 * soc bounding box info is not integrated in the discovery table, 2595 * we always need to parse it from gpu info firmware if needed. 2596 */ 2597 if (hdr->version_minor == 2) { 2598 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2599 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2600 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2601 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2602 } 2603 break; 2604 } 2605 default: 2606 dev_err(adev->dev, 2607 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2608 err = -EINVAL; 2609 goto out; 2610 } 2611 out: 2612 return err; 2613 } 2614 2615 /** 2616 * amdgpu_device_ip_early_init - run early init for hardware IPs 2617 * 2618 * @adev: amdgpu_device pointer 2619 * 2620 * Early initialization pass for hardware IPs. The hardware IPs that make 2621 * up each asic are discovered and each IP's early_init callback is run. This 2622 * is the first stage in initializing the asic. 2623 * Returns 0 on success, negative error code on failure.
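 *
 * Illustrative debugging aid: individual IP blocks can be skipped at this
 * stage with the ip_block_mask module parameter, e.g.
 *
 *	amdgpu.ip_block_mask=0xfffffffd
 *
 * clears bit 1 and marks the second discovered block invalid (see the
 * amdgpu_ip_block_mask check in this function).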
2624 */ 2625 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2626 { 2627 struct amdgpu_ip_block *ip_block; 2628 struct pci_dev *parent; 2629 bool total, skip_bios; 2630 uint32_t bios_flags; 2631 int i, r; 2632 2633 amdgpu_device_enable_virtual_display(adev); 2634 2635 if (amdgpu_sriov_vf(adev)) { 2636 r = amdgpu_virt_request_full_gpu(adev, true); 2637 if (r) 2638 return r; 2639 } 2640 2641 switch (adev->asic_type) { 2642 #ifdef CONFIG_DRM_AMDGPU_SI 2643 case CHIP_VERDE: 2644 case CHIP_TAHITI: 2645 case CHIP_PITCAIRN: 2646 case CHIP_OLAND: 2647 case CHIP_HAINAN: 2648 adev->family = AMDGPU_FAMILY_SI; 2649 r = si_set_ip_blocks(adev); 2650 if (r) 2651 return r; 2652 break; 2653 #endif 2654 #ifdef CONFIG_DRM_AMDGPU_CIK 2655 case CHIP_BONAIRE: 2656 case CHIP_HAWAII: 2657 case CHIP_KAVERI: 2658 case CHIP_KABINI: 2659 case CHIP_MULLINS: 2660 if (adev->flags & AMD_IS_APU) 2661 adev->family = AMDGPU_FAMILY_KV; 2662 else 2663 adev->family = AMDGPU_FAMILY_CI; 2664 2665 r = cik_set_ip_blocks(adev); 2666 if (r) 2667 return r; 2668 break; 2669 #endif 2670 case CHIP_TOPAZ: 2671 case CHIP_TONGA: 2672 case CHIP_FIJI: 2673 case CHIP_POLARIS10: 2674 case CHIP_POLARIS11: 2675 case CHIP_POLARIS12: 2676 case CHIP_VEGAM: 2677 case CHIP_CARRIZO: 2678 case CHIP_STONEY: 2679 if (adev->flags & AMD_IS_APU) 2680 adev->family = AMDGPU_FAMILY_CZ; 2681 else 2682 adev->family = AMDGPU_FAMILY_VI; 2683 2684 r = vi_set_ip_blocks(adev); 2685 if (r) 2686 return r; 2687 break; 2688 default: 2689 r = amdgpu_discovery_set_ip_blocks(adev); 2690 if (r) 2691 return r; 2692 break; 2693 } 2694 2695 /* Check for IP version 9.4.3 with A0 hardware */ 2696 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) && 2697 !amdgpu_device_get_rev_id(adev)) { 2698 dev_err(adev->dev, "Unsupported A0 hardware\n"); 2699 return -ENODEV; /* device unsupported - no device error */ 2700 } 2701 2702 if (amdgpu_has_atpx() && 2703 (amdgpu_is_atpx_hybrid() || 2704 amdgpu_has_atpx_dgpu_power_cntl()) && 2705 ((adev->flags & AMD_IS_APU) == 0) && 2706 !dev_is_removable(&adev->pdev->dev)) 2707 adev->flags |= AMD_IS_PX; 2708 2709 if (!(adev->flags & AMD_IS_APU)) { 2710 parent = pcie_find_root_port(adev->pdev); 2711 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2712 } 2713 2714 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2715 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2716 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2717 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2718 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2719 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2720 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2721 2722 total = true; 2723 for (i = 0; i < adev->num_ip_blocks; i++) { 2724 ip_block = &adev->ip_blocks[i]; 2725 2726 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2727 DRM_WARN("disabled ip block: %d <%s>\n", 2728 i, adev->ip_blocks[i].version->funcs->name); 2729 adev->ip_blocks[i].status.valid = false; 2730 } else if (ip_block->version->funcs->early_init) { 2731 r = ip_block->version->funcs->early_init(ip_block); 2732 if (r == -ENOENT) { 2733 adev->ip_blocks[i].status.valid = false; 2734 } else if (r) { 2735 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2736 adev->ip_blocks[i].version->funcs->name, r); 2737 total = false; 2738 } else { 2739 adev->ip_blocks[i].status.valid = true; 2740 } 2741 } else { 2742 adev->ip_blocks[i].status.valid = true; 2743 } 2744 /* get the vbios after the asic_funcs are set up */ 2745 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2746 r = amdgpu_device_parse_gpu_info_fw(adev); 2747 if (r) 2748 return r; 2749 2750 bios_flags = amdgpu_device_get_vbios_flags(adev); 2751 skip_bios = !!(bios_flags & AMDGPU_VBIOS_SKIP); 2752 /* Read BIOS */ 2753 if (!skip_bios) { 2754 bool optional = 2755 !!(bios_flags & AMDGPU_VBIOS_OPTIONAL); 2756 if (!amdgpu_get_bios(adev) && !optional) 2757 return -EINVAL; 2758 2759 if (optional && !adev->bios) 2760 dev_info( 2761 adev->dev, 2762 "VBIOS image optional, proceeding without VBIOS image"); 2763 2764 if (adev->bios) { 2765 r = amdgpu_atombios_init(adev); 2766 if (r) { 2767 dev_err(adev->dev, 2768 "amdgpu_atombios_init failed\n"); 2769 amdgpu_vf_error_put( 2770 adev, 2771 AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 2772 0, 0); 2773 return r; 2774 } 2775 } 2776 } 2777 2778 /*get pf2vf msg info at it's earliest time*/ 2779 if (amdgpu_sriov_vf(adev)) 2780 amdgpu_virt_init_data_exchange(adev); 2781 2782 } 2783 } 2784 if (!total) 2785 return -ENODEV; 2786 2787 if (adev->gmc.xgmi.supported) 2788 amdgpu_xgmi_early_init(adev); 2789 2790 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 2791 if (ip_block->status.valid != false) 2792 amdgpu_amdkfd_device_probe(adev); 2793 2794 adev->cg_flags &= amdgpu_cg_mask; 2795 adev->pg_flags &= amdgpu_pg_mask; 2796 2797 return 0; 2798 } 2799 2800 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2801 { 2802 int i, r; 2803 2804 for (i = 0; i < adev->num_ip_blocks; i++) { 2805 if (!adev->ip_blocks[i].status.sw) 2806 continue; 2807 if (adev->ip_blocks[i].status.hw) 2808 continue; 2809 if (!amdgpu_ip_member_of_hwini( 2810 adev, adev->ip_blocks[i].version->type)) 2811 continue; 2812 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2813 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2814 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2815 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2816 if (r) { 2817 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2818 adev->ip_blocks[i].version->funcs->name, r); 2819 return r; 2820 } 2821 adev->ip_blocks[i].status.hw = true; 2822 } 2823 } 2824 2825 return 0; 2826 } 2827 
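/*
 * Overview (informational comment only): hardware init is split into two
 * phases so that the blocks everything else depends on come up first.
 * Phase 1 above handles COMMON and IH (plus PSP when running as an SR-IOV
 * VF), firmware loading happens in between, and phase 2 below brings up
 * the remaining blocks in the current hwini mask; see amdgpu_device_ip_init().
 */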
2828 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2829 { 2830 int i, r; 2831 2832 for (i = 0; i < adev->num_ip_blocks; i++) { 2833 if (!adev->ip_blocks[i].status.sw) 2834 continue; 2835 if (adev->ip_blocks[i].status.hw) 2836 continue; 2837 if (!amdgpu_ip_member_of_hwini( 2838 adev, adev->ip_blocks[i].version->type)) 2839 continue; 2840 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2841 if (r) { 2842 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2843 adev->ip_blocks[i].version->funcs->name, r); 2844 return r; 2845 } 2846 adev->ip_blocks[i].status.hw = true; 2847 } 2848 2849 return 0; 2850 } 2851 2852 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2853 { 2854 int r = 0; 2855 int i; 2856 uint32_t smu_version; 2857 2858 if (adev->asic_type >= CHIP_VEGA10) { 2859 for (i = 0; i < adev->num_ip_blocks; i++) { 2860 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2861 continue; 2862 2863 if (!amdgpu_ip_member_of_hwini(adev, 2864 AMD_IP_BLOCK_TYPE_PSP)) 2865 break; 2866 2867 if (!adev->ip_blocks[i].status.sw) 2868 continue; 2869 2870 /* no need to do the fw loading again if already done*/ 2871 if (adev->ip_blocks[i].status.hw == true) 2872 break; 2873 2874 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2875 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 2876 if (r) 2877 return r; 2878 } else { 2879 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2880 if (r) { 2881 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2882 adev->ip_blocks[i].version->funcs->name, r); 2883 return r; 2884 } 2885 adev->ip_blocks[i].status.hw = true; 2886 } 2887 break; 2888 } 2889 } 2890 2891 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2892 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2893 2894 return r; 2895 } 2896 2897 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2898 { 2899 struct drm_sched_init_args args = { 2900 .ops = &amdgpu_sched_ops, 2901 .num_rqs = DRM_SCHED_PRIORITY_COUNT, 2902 .timeout_wq = adev->reset_domain->wq, 2903 .dev = adev->dev, 2904 }; 2905 long timeout; 2906 int r, i; 2907 2908 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2909 struct amdgpu_ring *ring = adev->rings[i]; 2910 2911 /* No need to setup the GPU scheduler for rings that don't need it */ 2912 if (!ring || ring->no_scheduler) 2913 continue; 2914 2915 switch (ring->funcs->type) { 2916 case AMDGPU_RING_TYPE_GFX: 2917 timeout = adev->gfx_timeout; 2918 break; 2919 case AMDGPU_RING_TYPE_COMPUTE: 2920 timeout = adev->compute_timeout; 2921 break; 2922 case AMDGPU_RING_TYPE_SDMA: 2923 timeout = adev->sdma_timeout; 2924 break; 2925 default: 2926 timeout = adev->video_timeout; 2927 break; 2928 } 2929 2930 args.timeout = timeout; 2931 args.credit_limit = ring->num_hw_submission; 2932 args.score = ring->sched_score; 2933 args.name = ring->name; 2934 2935 r = drm_sched_init(&ring->sched, &args); 2936 if (r) { 2937 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2938 ring->name); 2939 return r; 2940 } 2941 r = amdgpu_uvd_entity_init(adev, ring); 2942 if (r) { 2943 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n", 2944 ring->name); 2945 return r; 2946 } 2947 r = amdgpu_vce_entity_init(adev, ring); 2948 if (r) { 2949 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n", 2950 ring->name); 2951 return r; 2952 } 2953 } 2954 2955 amdgpu_xcp_update_partition_sched_list(adev); 2956 2957 return 0; 2958 } 2959 2960 2961 /** 2962 * amdgpu_device_ip_init - run init for hardware IPs 
2963 * 2964 * @adev: amdgpu_device pointer 2965 * 2966 * Main initialization pass for hardware IPs. The list of all the hardware 2967 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2968 * are run. sw_init initializes the software state associated with each IP 2969 * and hw_init initializes the hardware associated with each IP. 2970 * Returns 0 on success, negative error code on failure. 2971 */ 2972 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2973 { 2974 bool init_badpage; 2975 int i, r; 2976 2977 r = amdgpu_ras_init(adev); 2978 if (r) 2979 return r; 2980 2981 for (i = 0; i < adev->num_ip_blocks; i++) { 2982 if (!adev->ip_blocks[i].status.valid) 2983 continue; 2984 if (adev->ip_blocks[i].version->funcs->sw_init) { 2985 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]); 2986 if (r) { 2987 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2988 adev->ip_blocks[i].version->funcs->name, r); 2989 goto init_failed; 2990 } 2991 } 2992 adev->ip_blocks[i].status.sw = true; 2993 2994 if (!amdgpu_ip_member_of_hwini( 2995 adev, adev->ip_blocks[i].version->type)) 2996 continue; 2997 2998 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2999 /* need to do common hw init early so everything is set up for gmc */ 3000 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3001 if (r) { 3002 DRM_ERROR("hw_init %d failed %d\n", i, r); 3003 goto init_failed; 3004 } 3005 adev->ip_blocks[i].status.hw = true; 3006 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3007 /* need to do gmc hw init early so we can allocate gpu mem */ 3008 /* Try to reserve bad pages early */ 3009 if (amdgpu_sriov_vf(adev)) 3010 amdgpu_virt_exchange_data(adev); 3011 3012 r = amdgpu_device_mem_scratch_init(adev); 3013 if (r) { 3014 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 3015 goto init_failed; 3016 } 3017 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3018 if (r) { 3019 DRM_ERROR("hw_init %d failed %d\n", i, r); 3020 goto init_failed; 3021 } 3022 r = amdgpu_device_wb_init(adev); 3023 if (r) { 3024 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 3025 goto init_failed; 3026 } 3027 adev->ip_blocks[i].status.hw = true; 3028 3029 /* right after GMC hw init, we create CSA */ 3030 if (adev->gfx.mcbp) { 3031 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 3032 AMDGPU_GEM_DOMAIN_VRAM | 3033 AMDGPU_GEM_DOMAIN_GTT, 3034 AMDGPU_CSA_SIZE); 3035 if (r) { 3036 DRM_ERROR("allocate CSA failed %d\n", r); 3037 goto init_failed; 3038 } 3039 } 3040 3041 r = amdgpu_seq64_init(adev); 3042 if (r) { 3043 DRM_ERROR("allocate seq64 failed %d\n", r); 3044 goto init_failed; 3045 } 3046 } 3047 } 3048 3049 if (amdgpu_sriov_vf(adev)) 3050 amdgpu_virt_init_data_exchange(adev); 3051 3052 r = amdgpu_ib_pool_init(adev); 3053 if (r) { 3054 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 3055 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 3056 goto init_failed; 3057 } 3058 3059 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 3060 if (r) 3061 goto init_failed; 3062 3063 r = amdgpu_device_ip_hw_init_phase1(adev); 3064 if (r) 3065 goto init_failed; 3066 3067 r = amdgpu_device_fw_loading(adev); 3068 if (r) 3069 goto init_failed; 3070 3071 r = amdgpu_device_ip_hw_init_phase2(adev); 3072 if (r) 3073 goto init_failed; 3074 3075 /* 3076 * retired pages will be loaded from eeprom and reserved here, 3077 * it should be called after amdgpu_device_ip_hw_init_phase2 
since 3078 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 3079 * for I2C communication, which is only true at this point. 3080 * 3081 * amdgpu_ras_recovery_init may fail, but the upper level only cares about 3082 * failures caused by a bad GPU state and stops the amdgpu init process 3083 * accordingly. For other failed cases, it will still release all 3084 * the resources and print an error message, rather than returning a 3085 * negative value to the upper level. 3086 * 3087 * Note: theoretically, this should be called before all VRAM allocations 3088 * to protect retired pages from being abused 3089 */ 3090 init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 3091 r = amdgpu_ras_recovery_init(adev, init_badpage); 3092 if (r) 3093 goto init_failed; 3094 3095 /** 3096 * In case of XGMI grab extra reference for reset domain for this device 3097 */ 3098 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3099 if (amdgpu_xgmi_add_device(adev) == 0) { 3100 if (!amdgpu_sriov_vf(adev)) { 3101 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3102 3103 if (WARN_ON(!hive)) { 3104 r = -ENOENT; 3105 goto init_failed; 3106 } 3107 3108 if (!hive->reset_domain || 3109 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 3110 r = -ENOENT; 3111 amdgpu_put_xgmi_hive(hive); 3112 goto init_failed; 3113 } 3114 3115 /* Drop the early temporary reset domain we created for device */ 3116 amdgpu_reset_put_reset_domain(adev->reset_domain); 3117 adev->reset_domain = hive->reset_domain; 3118 amdgpu_put_xgmi_hive(hive); 3119 } 3120 } 3121 } 3122 3123 r = amdgpu_device_init_schedulers(adev); 3124 if (r) 3125 goto init_failed; 3126 3127 if (adev->mman.buffer_funcs_ring->sched.ready) 3128 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3129 3130 /* Don't init kfd if whole hive need to be reset during init */ 3131 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 3132 kgd2kfd_init_zone_device(adev); 3133 amdgpu_amdkfd_device_init(adev); 3134 } 3135 3136 amdgpu_fru_get_product_info(adev); 3137 3138 if (!amdgpu_sriov_vf(adev) || amdgpu_sriov_ras_cper_en(adev)) 3139 r = amdgpu_cper_init(adev); 3140 3141 init_failed: 3142 3143 return r; 3144 } 3145 3146 /** 3147 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 3148 * 3149 * @adev: amdgpu_device pointer 3150 * 3151 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 3152 * this function before a GPU reset. If the value is retained after a 3153 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 3154 */ 3155 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 3156 { 3157 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 3158 } 3159 3160 /** 3161 * amdgpu_device_check_vram_lost - check if vram is valid 3162 * 3163 * @adev: amdgpu_device pointer 3164 * 3165 * Checks the reset magic value written to the gart pointer in VRAM. 3166 * The driver calls this after a GPU reset to see if the contents of 3167 * VRAM is lost or not. 3168 * returns true if vram is lost, false if not. 3169 */ 3170 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 3171 { 3172 if (memcmp(adev->gart.ptr, adev->reset_magic, 3173 AMDGPU_RESET_MAGIC_NUM)) 3174 return true; 3175 3176 if (!amdgpu_in_reset(adev)) 3177 return false; 3178 3179 /* 3180 * For all ASICs with baco/mode1 reset, the VRAM is 3181 * always assumed to be lost.
3182 */ 3183 switch (amdgpu_asic_reset_method(adev)) { 3184 case AMD_RESET_METHOD_LINK: 3185 case AMD_RESET_METHOD_BACO: 3186 case AMD_RESET_METHOD_MODE1: 3187 return true; 3188 default: 3189 return false; 3190 } 3191 } 3192 3193 /** 3194 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 3195 * 3196 * @adev: amdgpu_device pointer 3197 * @state: clockgating state (gate or ungate) 3198 * 3199 * The list of all the hardware IPs that make up the asic is walked and the 3200 * set_clockgating_state callbacks are run. 3201 * Late initialization pass enabling clockgating for hardware IPs. 3202 * Fini or suspend, pass disabling clockgating for hardware IPs. 3203 * Returns 0 on success, negative error code on failure. 3204 */ 3205 3206 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 3207 enum amd_clockgating_state state) 3208 { 3209 int i, j, r; 3210 3211 if (amdgpu_emu_mode == 1) 3212 return 0; 3213 3214 for (j = 0; j < adev->num_ip_blocks; j++) { 3215 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3216 if (!adev->ip_blocks[i].status.late_initialized) 3217 continue; 3218 /* skip CG for GFX, SDMA on S0ix */ 3219 if (adev->in_s0ix && 3220 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3221 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3222 continue; 3223 /* skip CG for VCE/UVD, it's handled specially */ 3224 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3225 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3226 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3227 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3228 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 3229 /* enable clockgating to save power */ 3230 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i], 3231 state); 3232 if (r) { 3233 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 3234 adev->ip_blocks[i].version->funcs->name, r); 3235 return r; 3236 } 3237 } 3238 } 3239 3240 return 0; 3241 } 3242 3243 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 3244 enum amd_powergating_state state) 3245 { 3246 int i, j, r; 3247 3248 if (amdgpu_emu_mode == 1) 3249 return 0; 3250 3251 for (j = 0; j < adev->num_ip_blocks; j++) { 3252 i = state == AMD_PG_STATE_GATE ? 
j : adev->num_ip_blocks - j - 1; 3253 if (!adev->ip_blocks[i].status.late_initialized) 3254 continue; 3255 /* skip PG for GFX, SDMA on S0ix */ 3256 if (adev->in_s0ix && 3257 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3258 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3259 continue; 3260 /* skip PG for VCE/UVD, it's handled specially */ 3261 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3262 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3263 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3264 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3265 adev->ip_blocks[i].version->funcs->set_powergating_state) { 3266 /* enable powergating to save power */ 3267 r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i], 3268 state); 3269 if (r) { 3270 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 3271 adev->ip_blocks[i].version->funcs->name, r); 3272 return r; 3273 } 3274 } 3275 } 3276 return 0; 3277 } 3278 3279 static int amdgpu_device_enable_mgpu_fan_boost(void) 3280 { 3281 struct amdgpu_gpu_instance *gpu_ins; 3282 struct amdgpu_device *adev; 3283 int i, ret = 0; 3284 3285 mutex_lock(&mgpu_info.mutex); 3286 3287 /* 3288 * MGPU fan boost feature should be enabled 3289 * only when there are two or more dGPUs in 3290 * the system 3291 */ 3292 if (mgpu_info.num_dgpu < 2) 3293 goto out; 3294 3295 for (i = 0; i < mgpu_info.num_dgpu; i++) { 3296 gpu_ins = &(mgpu_info.gpu_ins[i]); 3297 adev = gpu_ins->adev; 3298 if (!(adev->flags & AMD_IS_APU) && 3299 !gpu_ins->mgpu_fan_enabled) { 3300 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 3301 if (ret) 3302 break; 3303 3304 gpu_ins->mgpu_fan_enabled = 1; 3305 } 3306 } 3307 3308 out: 3309 mutex_unlock(&mgpu_info.mutex); 3310 3311 return ret; 3312 } 3313 3314 /** 3315 * amdgpu_device_ip_late_init - run late init for hardware IPs 3316 * 3317 * @adev: amdgpu_device pointer 3318 * 3319 * Late initialization pass for hardware IPs. The list of all the hardware 3320 * IPs that make up the asic is walked and the late_init callbacks are run. 3321 * late_init covers any special initialization that an IP requires 3322 * after all of them have been initialized or something that needs to happen 3323 * late in the init process. 3324 * Returns 0 on success, negative error code on failure.
3325 */ 3326 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 3327 { 3328 struct amdgpu_gpu_instance *gpu_instance; 3329 int i = 0, r; 3330 3331 for (i = 0; i < adev->num_ip_blocks; i++) { 3332 if (!adev->ip_blocks[i].status.hw) 3333 continue; 3334 if (adev->ip_blocks[i].version->funcs->late_init) { 3335 r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]); 3336 if (r) { 3337 DRM_ERROR("late_init of IP block <%s> failed %d\n", 3338 adev->ip_blocks[i].version->funcs->name, r); 3339 return r; 3340 } 3341 } 3342 adev->ip_blocks[i].status.late_initialized = true; 3343 } 3344 3345 r = amdgpu_ras_late_init(adev); 3346 if (r) { 3347 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 3348 return r; 3349 } 3350 3351 if (!amdgpu_reset_in_recovery(adev)) 3352 amdgpu_ras_set_error_query_ready(adev, true); 3353 3354 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3355 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3356 3357 amdgpu_device_fill_reset_magic(adev); 3358 3359 r = amdgpu_device_enable_mgpu_fan_boost(); 3360 if (r) 3361 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 3362 3363 /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */ 3364 if (amdgpu_passthrough(adev) && 3365 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 3366 adev->asic_type == CHIP_ALDEBARAN)) 3367 amdgpu_dpm_handle_passthrough_sbr(adev, true); 3368 3369 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3370 mutex_lock(&mgpu_info.mutex); 3371 3372 /* 3373 * Reset device p-state to low as this was booted with high. 3374 * 3375 * This should be performed only after all devices from the same 3376 * hive get initialized. 3377 * 3378 * However, it's unknown how many device in the hive in advance. 3379 * As this is counted one by one during devices initializations. 3380 * 3381 * So, we wait for all XGMI interlinked devices initialized. 3382 * This may bring some delays as those devices may come from 3383 * different hives. But that should be OK. 
3384 */ 3385 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3386 for (i = 0; i < mgpu_info.num_gpu; i++) { 3387 gpu_instance = &(mgpu_info.gpu_ins[i]); 3388 if (gpu_instance->adev->flags & AMD_IS_APU) 3389 continue; 3390 3391 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3392 AMDGPU_XGMI_PSTATE_MIN); 3393 if (r) { 3394 DRM_ERROR("pstate setting failed (%d).\n", r); 3395 break; 3396 } 3397 } 3398 } 3399 3400 mutex_unlock(&mgpu_info.mutex); 3401 } 3402 3403 return 0; 3404 } 3405 3406 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block) 3407 { 3408 int r; 3409 3410 if (!ip_block->version->funcs->hw_fini) { 3411 DRM_ERROR("hw_fini of IP block <%s> not defined\n", 3412 ip_block->version->funcs->name); 3413 } else { 3414 r = ip_block->version->funcs->hw_fini(ip_block); 3415 /* XXX handle errors */ 3416 if (r) { 3417 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3418 ip_block->version->funcs->name, r); 3419 } 3420 } 3421 3422 ip_block->status.hw = false; 3423 } 3424 3425 /** 3426 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3427 * 3428 * @adev: amdgpu_device pointer 3429 * 3430 * For ASICs need to disable SMC first 3431 */ 3432 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3433 { 3434 int i; 3435 3436 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3437 return; 3438 3439 for (i = 0; i < adev->num_ip_blocks; i++) { 3440 if (!adev->ip_blocks[i].status.hw) 3441 continue; 3442 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3443 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3444 break; 3445 } 3446 } 3447 } 3448 3449 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3450 { 3451 int i, r; 3452 3453 for (i = 0; i < adev->num_ip_blocks; i++) { 3454 if (!adev->ip_blocks[i].version->funcs->early_fini) 3455 continue; 3456 3457 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]); 3458 if (r) { 3459 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 3460 adev->ip_blocks[i].version->funcs->name, r); 3461 } 3462 } 3463 3464 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3465 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3466 3467 amdgpu_amdkfd_suspend(adev, false); 3468 3469 /* Workaround for ASICs need to disable SMC first */ 3470 amdgpu_device_smu_fini_early(adev); 3471 3472 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3473 if (!adev->ip_blocks[i].status.hw) 3474 continue; 3475 3476 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3477 } 3478 3479 if (amdgpu_sriov_vf(adev)) { 3480 if (amdgpu_virt_release_full_gpu(adev, false)) 3481 DRM_ERROR("failed to release exclusive mode on fini\n"); 3482 } 3483 3484 return 0; 3485 } 3486 3487 /** 3488 * amdgpu_device_ip_fini - run fini for hardware IPs 3489 * 3490 * @adev: amdgpu_device pointer 3491 * 3492 * Main teardown pass for hardware IPs. The list of all the hardware 3493 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3494 * are run. hw_fini tears down the hardware associated with each IP 3495 * and sw_fini tears down any software state associated with each IP. 3496 * Returns 0 on success, negative error code on failure. 
3497 */ 3498 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3499 { 3500 int i, r; 3501 3502 amdgpu_cper_fini(adev); 3503 3504 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3505 amdgpu_virt_release_ras_err_handler_data(adev); 3506 3507 if (adev->gmc.xgmi.num_physical_nodes > 1) 3508 amdgpu_xgmi_remove_device(adev); 3509 3510 amdgpu_amdkfd_device_fini_sw(adev); 3511 3512 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3513 if (!adev->ip_blocks[i].status.sw) 3514 continue; 3515 3516 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3517 amdgpu_ucode_free_bo(adev); 3518 amdgpu_free_static_csa(&adev->virt.csa_obj); 3519 amdgpu_device_wb_fini(adev); 3520 amdgpu_device_mem_scratch_fini(adev); 3521 amdgpu_ib_pool_fini(adev); 3522 amdgpu_seq64_fini(adev); 3523 } 3524 if (adev->ip_blocks[i].version->funcs->sw_fini) { 3525 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]); 3526 /* XXX handle errors */ 3527 if (r) { 3528 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 3529 adev->ip_blocks[i].version->funcs->name, r); 3530 } 3531 } 3532 adev->ip_blocks[i].status.sw = false; 3533 adev->ip_blocks[i].status.valid = false; 3534 } 3535 3536 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3537 if (!adev->ip_blocks[i].status.late_initialized) 3538 continue; 3539 if (adev->ip_blocks[i].version->funcs->late_fini) 3540 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]); 3541 adev->ip_blocks[i].status.late_initialized = false; 3542 } 3543 3544 amdgpu_ras_fini(adev); 3545 3546 return 0; 3547 } 3548 3549 /** 3550 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3551 * 3552 * @work: work_struct. 3553 */ 3554 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3555 { 3556 struct amdgpu_device *adev = 3557 container_of(work, struct amdgpu_device, delayed_init_work.work); 3558 int r; 3559 3560 r = amdgpu_ib_ring_tests(adev); 3561 if (r) 3562 DRM_ERROR("ib ring test failed (%d).\n", r); 3563 } 3564 3565 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3566 { 3567 struct amdgpu_device *adev = 3568 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3569 3570 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3571 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3572 3573 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true, 0)) 3574 adev->gfx.gfx_off_state = true; 3575 } 3576 3577 /** 3578 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3579 * 3580 * @adev: amdgpu_device pointer 3581 * 3582 * Main suspend function for hardware IPs. The list of all the hardware 3583 * IPs that make up the asic is walked, clockgating is disabled and the 3584 * suspend callbacks are run. suspend puts the hardware and software state 3585 * in each IP into a state suitable for suspend. 3586 * Returns 0 on success, negative error code on failure. 3587 */ 3588 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3589 { 3590 int i, r; 3591 3592 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3593 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3594 3595 /* 3596 * Per PMFW team's suggestion, driver needs to handle gfxoff 3597 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3598 * scenario. Add the missing df cstate disablement here. 
3599 */ 3600 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3601 dev_warn(adev->dev, "Failed to disallow df cstate"); 3602 3603 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3604 if (!adev->ip_blocks[i].status.valid) 3605 continue; 3606 3607 /* displays are handled separately */ 3608 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3609 continue; 3610 3611 /* XXX handle errors */ 3612 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3613 if (r) 3614 return r; 3615 } 3616 3617 return 0; 3618 } 3619 3620 /** 3621 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3622 * 3623 * @adev: amdgpu_device pointer 3624 * 3625 * Main suspend function for hardware IPs. The list of all the hardware 3626 * IPs that make up the asic is walked, clockgating is disabled and the 3627 * suspend callbacks are run. suspend puts the hardware and software state 3628 * in each IP into a state suitable for suspend. 3629 * Returns 0 on success, negative error code on failure. 3630 */ 3631 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3632 { 3633 int i, r; 3634 3635 if (adev->in_s0ix) 3636 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3637 3638 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3639 if (!adev->ip_blocks[i].status.valid) 3640 continue; 3641 /* displays are handled in phase1 */ 3642 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3643 continue; 3644 /* PSP lost connection when err_event_athub occurs */ 3645 if (amdgpu_ras_intr_triggered() && 3646 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3647 adev->ip_blocks[i].status.hw = false; 3648 continue; 3649 } 3650 3651 /* skip unnecessary suspend if we do not initialize them yet */ 3652 if (!amdgpu_ip_member_of_hwini( 3653 adev, adev->ip_blocks[i].version->type)) 3654 continue; 3655 3656 /* Since we skip suspend for S0i3, we need to cancel the delayed 3657 * idle work here as the suspend callback never gets called. 3658 */ 3659 if (adev->in_s0ix && 3660 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX && 3661 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 0, 0)) 3662 cancel_delayed_work_sync(&adev->gfx.idle_work); 3663 /* skip suspend of gfx/mes and psp for S0ix 3664 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3665 * like at runtime. PSP is also part of the always on hardware 3666 * so no need to suspend it. 3667 */ 3668 if (adev->in_s0ix && 3669 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3670 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3671 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3672 continue; 3673 3674 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3675 if (adev->in_s0ix && 3676 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3677 IP_VERSION(5, 0, 0)) && 3678 (adev->ip_blocks[i].version->type == 3679 AMD_IP_BLOCK_TYPE_SDMA)) 3680 continue; 3681 3682 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3683 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3684 * from this location and RLC Autoload automatically also gets loaded 3685 * from here based on PMFW -> PSP message during re-init sequence. 3686 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3687 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3688 */ 3689 if (amdgpu_in_reset(adev) && 3690 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3691 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3692 continue; 3693 3694 /* XXX handle errors */ 3695 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3696 adev->ip_blocks[i].status.hw = false; 3697 3698 /* handle putting the SMC in the appropriate state */ 3699 if (!amdgpu_sriov_vf(adev)) { 3700 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3701 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3702 if (r) { 3703 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3704 adev->mp1_state, r); 3705 return r; 3706 } 3707 } 3708 } 3709 } 3710 3711 return 0; 3712 } 3713 3714 /** 3715 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3716 * 3717 * @adev: amdgpu_device pointer 3718 * 3719 * Main suspend function for hardware IPs. The list of all the hardware 3720 * IPs that make up the asic is walked, clockgating is disabled and the 3721 * suspend callbacks are run. suspend puts the hardware and software state 3722 * in each IP into a state suitable for suspend. 3723 * Returns 0 on success, negative error code on failure. 3724 */ 3725 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3726 { 3727 int r; 3728 3729 if (amdgpu_sriov_vf(adev)) { 3730 amdgpu_virt_fini_data_exchange(adev); 3731 amdgpu_virt_request_full_gpu(adev, false); 3732 } 3733 3734 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3735 3736 r = amdgpu_device_ip_suspend_phase1(adev); 3737 if (r) 3738 return r; 3739 r = amdgpu_device_ip_suspend_phase2(adev); 3740 3741 if (amdgpu_sriov_vf(adev)) 3742 amdgpu_virt_release_full_gpu(adev, false); 3743 3744 return r; 3745 } 3746 3747 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3748 { 3749 int i, r; 3750 3751 static enum amd_ip_block_type ip_order[] = { 3752 AMD_IP_BLOCK_TYPE_COMMON, 3753 AMD_IP_BLOCK_TYPE_GMC, 3754 AMD_IP_BLOCK_TYPE_PSP, 3755 AMD_IP_BLOCK_TYPE_IH, 3756 }; 3757 3758 for (i = 0; i < adev->num_ip_blocks; i++) { 3759 int j; 3760 struct amdgpu_ip_block *block; 3761 3762 block = &adev->ip_blocks[i]; 3763 block->status.hw = false; 3764 3765 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3766 3767 if (block->version->type != ip_order[j] || 3768 !block->status.valid) 3769 continue; 3770 3771 r = block->version->funcs->hw_init(&adev->ip_blocks[i]); 3772 if (r) { 3773 dev_err(adev->dev, "RE-INIT-early: %s failed\n", 3774 block->version->funcs->name); 3775 return r; 3776 } 3777 block->status.hw = true; 3778 } 3779 } 3780 3781 return 0; 3782 } 3783 3784 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3785 { 3786 struct amdgpu_ip_block *block; 3787 int i, r = 0; 3788 3789 static enum amd_ip_block_type ip_order[] = { 3790 AMD_IP_BLOCK_TYPE_SMC, 3791 AMD_IP_BLOCK_TYPE_DCE, 3792 AMD_IP_BLOCK_TYPE_GFX, 3793 AMD_IP_BLOCK_TYPE_SDMA, 3794 AMD_IP_BLOCK_TYPE_MES, 3795 AMD_IP_BLOCK_TYPE_UVD, 3796 AMD_IP_BLOCK_TYPE_VCE, 3797 AMD_IP_BLOCK_TYPE_VCN, 3798 AMD_IP_BLOCK_TYPE_JPEG 3799 }; 3800 3801 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3802 block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]); 3803 3804 if (!block) 3805 continue; 3806 3807 if (block->status.valid && !block->status.hw) { 3808 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) { 3809 r = amdgpu_ip_block_resume(block); 3810 } else { 3811 r = block->version->funcs->hw_init(block); 3812 } 3813 3814 if (r) { 3815 dev_err(adev->dev, "RE-INIT-late: %s failed\n", 3816 block->version->funcs->name); 3817 break; 3818 } 3819 
block->status.hw = true; 3820 } 3821 } 3822 3823 return r; 3824 } 3825 3826 /** 3827 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3828 * 3829 * @adev: amdgpu_device pointer 3830 * 3831 * First resume function for hardware IPs. The list of all the hardware 3832 * IPs that make up the asic is walked and the resume callbacks are run for 3833 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3834 * after a suspend and updates the software state as necessary. This 3835 * function is also used for restoring the GPU after a GPU reset. 3836 * Returns 0 on success, negative error code on failure. 3837 */ 3838 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3839 { 3840 int i, r; 3841 3842 for (i = 0; i < adev->num_ip_blocks; i++) { 3843 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3844 continue; 3845 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3846 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3847 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3848 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3849 3850 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3851 if (r) 3852 return r; 3853 } 3854 } 3855 3856 return 0; 3857 } 3858 3859 /** 3860 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3861 * 3862 * @adev: amdgpu_device pointer 3863 * 3864 * Second resume function for hardware IPs. The list of all the hardware 3865 * IPs that make up the asic is walked and the resume callbacks are run for 3866 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3867 * functional state after a suspend and updates the software state as 3868 * necessary. This function is also used for restoring the GPU after a GPU 3869 * reset. 3870 * Returns 0 on success, negative error code on failure. 3871 */ 3872 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3873 { 3874 int i, r; 3875 3876 for (i = 0; i < adev->num_ip_blocks; i++) { 3877 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3878 continue; 3879 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3880 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3881 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3882 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE || 3883 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3884 continue; 3885 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3886 if (r) 3887 return r; 3888 } 3889 3890 return 0; 3891 } 3892 3893 /** 3894 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs 3895 * 3896 * @adev: amdgpu_device pointer 3897 * 3898 * Third resume function for hardware IPs. The list of all the hardware 3899 * IPs that make up the asic is walked and the resume callbacks are run for 3900 * all DCE. resume puts the hardware into a functional state after a suspend 3901 * and updates the software state as necessary. This function is also used 3902 * for restoring the GPU after a GPU reset. 3903 * 3904 * Returns 0 on success, negative error code on failure. 
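 *
 * DCE is intentionally left for this last phase: by the time it runs,
 * firmware loading, the TTM buffer functions and the fence driver have
 * already been restored in amdgpu_device_ip_resume(), so the display
 * hardware comes back against a functional GPU.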
3905 */ 3906 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev) 3907 { 3908 int i, r; 3909 3910 for (i = 0; i < adev->num_ip_blocks; i++) { 3911 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3912 continue; 3913 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) { 3914 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3915 if (r) 3916 return r; 3917 } 3918 } 3919 3920 return 0; 3921 } 3922 3923 /** 3924 * amdgpu_device_ip_resume - run resume for hardware IPs 3925 * 3926 * @adev: amdgpu_device pointer 3927 * 3928 * Main resume function for hardware IPs. The hardware IPs 3929 * are split into two resume functions because they are 3930 * also used in recovering from a GPU reset and some additional 3931 * steps need to be take between them. In this case (S3/S4) they are 3932 * run sequentially. 3933 * Returns 0 on success, negative error code on failure. 3934 */ 3935 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3936 { 3937 int r; 3938 3939 r = amdgpu_device_ip_resume_phase1(adev); 3940 if (r) 3941 return r; 3942 3943 r = amdgpu_device_fw_loading(adev); 3944 if (r) 3945 return r; 3946 3947 r = amdgpu_device_ip_resume_phase2(adev); 3948 3949 if (adev->mman.buffer_funcs_ring->sched.ready) 3950 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3951 3952 if (r) 3953 return r; 3954 3955 amdgpu_fence_driver_hw_init(adev); 3956 3957 r = amdgpu_device_ip_resume_phase3(adev); 3958 3959 return r; 3960 } 3961 3962 /** 3963 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3964 * 3965 * @adev: amdgpu_device pointer 3966 * 3967 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3968 */ 3969 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3970 { 3971 if (amdgpu_sriov_vf(adev)) { 3972 if (adev->is_atom_fw) { 3973 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3974 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3975 } else { 3976 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3977 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3978 } 3979 3980 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3981 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3982 } 3983 } 3984 3985 /** 3986 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3987 * 3988 * @asic_type: AMD asic type 3989 * 3990 * Check if there is DC (new modesetting infrastructre) support for an asic. 3991 * returns true if DC has support, false if not. 3992 */ 3993 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3994 { 3995 switch (asic_type) { 3996 #ifdef CONFIG_DRM_AMDGPU_SI 3997 case CHIP_HAINAN: 3998 #endif 3999 case CHIP_TOPAZ: 4000 /* chips with no display hardware */ 4001 return false; 4002 #if defined(CONFIG_DRM_AMD_DC) 4003 case CHIP_TAHITI: 4004 case CHIP_PITCAIRN: 4005 case CHIP_VERDE: 4006 case CHIP_OLAND: 4007 /* 4008 * We have systems in the wild with these ASICs that require 4009 * LVDS and VGA support which is not supported with DC. 4010 * 4011 * Fallback to the non-DC driver here by default so as not to 4012 * cause regressions. 4013 */ 4014 #if defined(CONFIG_DRM_AMD_DC_SI) 4015 return amdgpu_dc > 0; 4016 #else 4017 return false; 4018 #endif 4019 case CHIP_BONAIRE: 4020 case CHIP_KAVERI: 4021 case CHIP_KABINI: 4022 case CHIP_MULLINS: 4023 /* 4024 * We have systems in the wild with these ASICs that require 4025 * VGA support which is not supported with DC. 
4026 * 4027 * Fallback to the non-DC driver here by default so as not to 4028 * cause regressions. 4029 */ 4030 return amdgpu_dc > 0; 4031 default: 4032 return amdgpu_dc != 0; 4033 #else 4034 default: 4035 if (amdgpu_dc > 0) 4036 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 4037 return false; 4038 #endif 4039 } 4040 } 4041 4042 /** 4043 * amdgpu_device_has_dc_support - check if dc is supported 4044 * 4045 * @adev: amdgpu_device pointer 4046 * 4047 * Returns true for supported, false for not supported 4048 */ 4049 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 4050 { 4051 if (adev->enable_virtual_display || 4052 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 4053 return false; 4054 4055 return amdgpu_device_asic_has_dc_support(adev->asic_type); 4056 } 4057 4058 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 4059 { 4060 struct amdgpu_device *adev = 4061 container_of(__work, struct amdgpu_device, xgmi_reset_work); 4062 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 4063 4064 /* It's a bug to not have a hive within this function */ 4065 if (WARN_ON(!hive)) 4066 return; 4067 4068 /* 4069 * Use task barrier to synchronize all xgmi reset works across the 4070 * hive. task_barrier_enter and task_barrier_exit will block 4071 * until all the threads running the xgmi reset works reach 4072 * those points. task_barrier_full will do both blocks. 4073 */ 4074 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 4075 4076 task_barrier_enter(&hive->tb); 4077 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 4078 4079 if (adev->asic_reset_res) 4080 goto fail; 4081 4082 task_barrier_exit(&hive->tb); 4083 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 4084 4085 if (adev->asic_reset_res) 4086 goto fail; 4087 4088 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 4089 } else { 4090 4091 task_barrier_full(&hive->tb); 4092 adev->asic_reset_res = amdgpu_asic_reset(adev); 4093 } 4094 4095 fail: 4096 if (adev->asic_reset_res) 4097 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 4098 adev->asic_reset_res, adev_to_drm(adev)->unique); 4099 amdgpu_put_xgmi_hive(hive); 4100 } 4101 4102 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 4103 { 4104 char *input = amdgpu_lockup_timeout; 4105 char *timeout_setting = NULL; 4106 int index = 0; 4107 long timeout; 4108 int ret = 0; 4109 4110 /* 4111 * By default timeout for non compute jobs is 10000 4112 * and 60000 for compute jobs. 4113 * In SR-IOV or passthrough mode, timeout for compute 4114 * jobs are 60000 by default. 4115 */ 4116 adev->gfx_timeout = msecs_to_jiffies(10000); 4117 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4118 if (amdgpu_sriov_vf(adev)) 4119 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
4120 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 4121 else 4122 adev->compute_timeout = msecs_to_jiffies(60000); 4123 4124 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4125 while ((timeout_setting = strsep(&input, ",")) && 4126 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4127 ret = kstrtol(timeout_setting, 0, &timeout); 4128 if (ret) 4129 return ret; 4130 4131 if (timeout == 0) { 4132 index++; 4133 continue; 4134 } else if (timeout < 0) { 4135 timeout = MAX_SCHEDULE_TIMEOUT; 4136 dev_warn(adev->dev, "lockup timeout disabled"); 4137 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 4138 } else { 4139 timeout = msecs_to_jiffies(timeout); 4140 } 4141 4142 switch (index++) { 4143 case 0: 4144 adev->gfx_timeout = timeout; 4145 break; 4146 case 1: 4147 adev->compute_timeout = timeout; 4148 break; 4149 case 2: 4150 adev->sdma_timeout = timeout; 4151 break; 4152 case 3: 4153 adev->video_timeout = timeout; 4154 break; 4155 default: 4156 break; 4157 } 4158 } 4159 /* 4160 * There is only one value specified and 4161 * it should apply to all non-compute jobs. 4162 */ 4163 if (index == 1) { 4164 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4165 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 4166 adev->compute_timeout = adev->gfx_timeout; 4167 } 4168 } 4169 4170 return ret; 4171 } 4172 4173 /** 4174 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 4175 * 4176 * @adev: amdgpu_device pointer 4177 * 4178 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 4179 */ 4180 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 4181 { 4182 struct iommu_domain *domain; 4183 4184 domain = iommu_get_domain_for_dev(adev->dev); 4185 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 4186 adev->ram_is_direct_mapped = true; 4187 } 4188 4189 #if defined(CONFIG_HSA_AMD_P2P) 4190 /** 4191 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled. 4192 * 4193 * @adev: amdgpu_device pointer 4194 * 4195 * return if IOMMU remapping bar address 4196 */ 4197 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev) 4198 { 4199 struct iommu_domain *domain; 4200 4201 domain = iommu_get_domain_for_dev(adev->dev); 4202 if (domain && (domain->type == IOMMU_DOMAIN_DMA || 4203 domain->type == IOMMU_DOMAIN_DMA_FQ)) 4204 return true; 4205 4206 return false; 4207 } 4208 #endif 4209 4210 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 4211 { 4212 if (amdgpu_mcbp == 1) 4213 adev->gfx.mcbp = true; 4214 else if (amdgpu_mcbp == 0) 4215 adev->gfx.mcbp = false; 4216 4217 if (amdgpu_sriov_vf(adev)) 4218 adev->gfx.mcbp = true; 4219 4220 if (adev->gfx.mcbp) 4221 DRM_INFO("MCBP is enabled\n"); 4222 } 4223 4224 /** 4225 * amdgpu_device_init - initialize the driver 4226 * 4227 * @adev: amdgpu_device pointer 4228 * @flags: driver flags 4229 * 4230 * Initializes the driver info and hw (all asics). 4231 * Returns 0 for success or an error on failure. 4232 * Called at driver startup. 
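 *
 * Rough probe-time usage (illustration only; the real call site lives in
 * the PCI probe path and typically derives @flags from the PCI ID table):
 *
 *   r = amdgpu_device_init(adev, flags);
 *   if (r)
 *           return r;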
4233 */ 4234 int amdgpu_device_init(struct amdgpu_device *adev, 4235 uint32_t flags) 4236 { 4237 struct drm_device *ddev = adev_to_drm(adev); 4238 struct pci_dev *pdev = adev->pdev; 4239 int r, i; 4240 bool px = false; 4241 u32 max_MBps; 4242 int tmp; 4243 4244 adev->shutdown = false; 4245 adev->flags = flags; 4246 4247 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 4248 adev->asic_type = amdgpu_force_asic_type; 4249 else 4250 adev->asic_type = flags & AMD_ASIC_MASK; 4251 4252 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 4253 if (amdgpu_emu_mode == 1) 4254 adev->usec_timeout *= 10; 4255 adev->gmc.gart_size = 512 * 1024 * 1024; 4256 adev->accel_working = false; 4257 adev->num_rings = 0; 4258 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 4259 adev->mman.buffer_funcs = NULL; 4260 adev->mman.buffer_funcs_ring = NULL; 4261 adev->vm_manager.vm_pte_funcs = NULL; 4262 adev->vm_manager.vm_pte_num_scheds = 0; 4263 adev->gmc.gmc_funcs = NULL; 4264 adev->harvest_ip_mask = 0x0; 4265 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 4266 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 4267 4268 adev->smc_rreg = &amdgpu_invalid_rreg; 4269 adev->smc_wreg = &amdgpu_invalid_wreg; 4270 adev->pcie_rreg = &amdgpu_invalid_rreg; 4271 adev->pcie_wreg = &amdgpu_invalid_wreg; 4272 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 4273 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 4274 adev->pciep_rreg = &amdgpu_invalid_rreg; 4275 adev->pciep_wreg = &amdgpu_invalid_wreg; 4276 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 4277 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 4278 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 4279 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 4280 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 4281 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 4282 adev->didt_rreg = &amdgpu_invalid_rreg; 4283 adev->didt_wreg = &amdgpu_invalid_wreg; 4284 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 4285 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 4286 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 4287 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 4288 4289 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 4290 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 4291 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 4292 4293 /* mutex initialization are all done here so we 4294 * can recall function without having locking issues 4295 */ 4296 mutex_init(&adev->firmware.mutex); 4297 mutex_init(&adev->pm.mutex); 4298 mutex_init(&adev->gfx.gpu_clock_mutex); 4299 mutex_init(&adev->srbm_mutex); 4300 mutex_init(&adev->gfx.pipe_reserve_mutex); 4301 mutex_init(&adev->gfx.gfx_off_mutex); 4302 mutex_init(&adev->gfx.partition_mutex); 4303 mutex_init(&adev->grbm_idx_mutex); 4304 mutex_init(&adev->mn_lock); 4305 mutex_init(&adev->virt.vf_errors.lock); 4306 hash_init(adev->mn_hash); 4307 mutex_init(&adev->psp.mutex); 4308 mutex_init(&adev->notifier_lock); 4309 mutex_init(&adev->pm.stable_pstate_ctx_lock); 4310 mutex_init(&adev->benchmark_mutex); 4311 mutex_init(&adev->gfx.reset_sem_mutex); 4312 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */ 4313 mutex_init(&adev->enforce_isolation_mutex); 4314 for (i = 0; i < MAX_XCP; ++i) { 4315 adev->isolation[i].spearhead = dma_fence_get_stub(); 4316 amdgpu_sync_create(&adev->isolation[i].active); 4317 amdgpu_sync_create(&adev->isolation[i].prev); 4318 } 4319 mutex_init(&adev->gfx.kfd_sch_mutex); 
4320 mutex_init(&adev->gfx.workload_profile_mutex); 4321 mutex_init(&adev->vcn.workload_profile_mutex); 4322 4323 amdgpu_device_init_apu_flags(adev); 4324 4325 r = amdgpu_device_check_arguments(adev); 4326 if (r) 4327 return r; 4328 4329 spin_lock_init(&adev->mmio_idx_lock); 4330 spin_lock_init(&adev->smc_idx_lock); 4331 spin_lock_init(&adev->pcie_idx_lock); 4332 spin_lock_init(&adev->uvd_ctx_idx_lock); 4333 spin_lock_init(&adev->didt_idx_lock); 4334 spin_lock_init(&adev->gc_cac_idx_lock); 4335 spin_lock_init(&adev->se_cac_idx_lock); 4336 spin_lock_init(&adev->audio_endpt_idx_lock); 4337 spin_lock_init(&adev->mm_stats.lock); 4338 spin_lock_init(&adev->virt.rlcg_reg_lock); 4339 spin_lock_init(&adev->wb.lock); 4340 4341 INIT_LIST_HEAD(&adev->reset_list); 4342 4343 INIT_LIST_HEAD(&adev->ras_list); 4344 4345 INIT_LIST_HEAD(&adev->pm.od_kobj_list); 4346 4347 INIT_DELAYED_WORK(&adev->delayed_init_work, 4348 amdgpu_device_delayed_init_work_handler); 4349 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 4350 amdgpu_device_delay_enable_gfx_off); 4351 /* 4352 * Initialize the enforce_isolation work structures for each XCP 4353 * partition. This work handler is responsible for enforcing shader 4354 * isolation on AMD GPUs. It counts the number of emitted fences for 4355 * each GFX and compute ring. If there are any fences, it schedules 4356 * the `enforce_isolation_work` to be run after a delay. If there are 4357 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the 4358 * runqueue. 4359 */ 4360 for (i = 0; i < MAX_XCP; i++) { 4361 INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work, 4362 amdgpu_gfx_enforce_isolation_handler); 4363 adev->gfx.enforce_isolation[i].adev = adev; 4364 adev->gfx.enforce_isolation[i].xcp_id = i; 4365 } 4366 4367 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 4368 4369 adev->gfx.gfx_off_req_count = 1; 4370 adev->gfx.gfx_off_residency = 0; 4371 adev->gfx.gfx_off_entrycount = 0; 4372 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 4373 4374 atomic_set(&adev->throttling_logging_enabled, 1); 4375 /* 4376 * If throttling continues, logging will be performed every minute 4377 * to avoid log flooding. "-1" is subtracted since the thermal 4378 * throttling interrupt comes every second. Thus, the total logging 4379 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting 4380 * for throttling interrupt) = 60 seconds.
4381 */ 4382 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 4383 4384 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 4385 4386 /* Registers mapping */ 4387 /* TODO: block userspace mapping of io register */ 4388 if (adev->asic_type >= CHIP_BONAIRE) { 4389 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 4390 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 4391 } else { 4392 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 4393 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 4394 } 4395 4396 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 4397 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 4398 4399 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 4400 if (!adev->rmmio) 4401 return -ENOMEM; 4402 4403 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); 4404 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size); 4405 4406 /* 4407 * The reset domain needs to be present early, before any XGMI hive is 4408 * discovered and initialized, so that the reset sem and in_gpu_reset flag 4409 * can be used early during init and before calling RREG32. 4410 */ 4411 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 4412 if (!adev->reset_domain) 4413 return -ENOMEM; 4414 4415 /* detect hw virtualization here */ 4416 amdgpu_virt_init(adev); 4417 4418 amdgpu_device_get_pcie_info(adev); 4419 4420 r = amdgpu_device_get_job_timeout_settings(adev); 4421 if (r) { 4422 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 4423 return r; 4424 } 4425 4426 amdgpu_device_set_mcbp(adev); 4427 4428 /* 4429 * By default, use the default mode where all blocks are expected to be 4430 * initialized. At present, the 'swinit' of the blocks must complete 4431 * before the need for a different level can be detected. 4432 */ 4433 amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT); 4434 /* early init functions */ 4435 r = amdgpu_device_ip_early_init(adev); 4436 if (r) 4437 return r; 4438 4439 /* 4440 * No need to remove conflicting FBs for non-display class devices. 4441 * This prevents the sysfb from being freed accidentally. 4442 */ 4443 if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA || 4444 (pdev->class >> 8) == PCI_CLASS_DISPLAY_OTHER) { 4445 /* Get rid of things like offb */ 4446 r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name); 4447 if (r) 4448 return r; 4449 } 4450 4451 /* Enable TMZ based on IP_VERSION */ 4452 amdgpu_gmc_tmz_set(adev); 4453 4454 if (amdgpu_sriov_vf(adev) && 4455 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0)) 4456 /* VF MMIO access (except mailbox range) from CPU 4457 * will be blocked during SRIOV runtime 4458 */ 4459 adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT; 4460 4461 amdgpu_gmc_noretry_set(adev); 4462 /* Need to get xgmi info early to decide the reset behavior */ 4463 if (adev->gmc.xgmi.supported) { 4464 r = adev->gfxhub.funcs->get_xgmi_info(adev); 4465 if (r) 4466 return r; 4467 } 4468 4469 /* enable PCIE atomic ops */ 4470 if (amdgpu_sriov_vf(adev)) { 4471 if (adev->virt.fw_reserve.p_pf2vf) 4472 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 4473 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 4474 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4475 /* APUs with gfx9 onwards don't rely on PCIe atomics; an internal path 4476 * natively supports atomics, so set have_atomics_support to true.
4477 */ 4478 } else if ((adev->flags & AMD_IS_APU) && 4479 (amdgpu_ip_version(adev, GC_HWIP, 0) > 4480 IP_VERSION(9, 0, 0))) { 4481 adev->have_atomics_support = true; 4482 } else { 4483 adev->have_atomics_support = 4484 !pci_enable_atomic_ops_to_root(adev->pdev, 4485 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 4486 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4487 } 4488 4489 if (!adev->have_atomics_support) 4490 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 4491 4492 /* doorbell bar mapping and doorbell index init*/ 4493 amdgpu_doorbell_init(adev); 4494 4495 if (amdgpu_emu_mode == 1) { 4496 /* post the asic on emulation mode */ 4497 emu_soc_asic_init(adev); 4498 goto fence_driver_init; 4499 } 4500 4501 amdgpu_reset_init(adev); 4502 4503 /* detect if we are with an SRIOV vbios */ 4504 if (adev->bios) 4505 amdgpu_device_detect_sriov_bios(adev); 4506 4507 /* check if we need to reset the asic 4508 * E.g., driver was not cleanly unloaded previously, etc. 4509 */ 4510 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4511 if (adev->gmc.xgmi.num_physical_nodes) { 4512 dev_info(adev->dev, "Pending hive reset.\n"); 4513 amdgpu_set_init_level(adev, 4514 AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 4515 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && 4516 !amdgpu_device_has_display_hardware(adev)) { 4517 r = psp_gpu_reset(adev); 4518 } else { 4519 tmp = amdgpu_reset_method; 4520 /* It should do a default reset when loading or reloading the driver, 4521 * regardless of the module parameter reset_method. 4522 */ 4523 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4524 r = amdgpu_asic_reset(adev); 4525 amdgpu_reset_method = tmp; 4526 } 4527 4528 if (r) { 4529 dev_err(adev->dev, "asic reset on init failed\n"); 4530 goto failed; 4531 } 4532 } 4533 4534 /* Post card if necessary */ 4535 if (amdgpu_device_need_post(adev)) { 4536 if (!adev->bios) { 4537 dev_err(adev->dev, "no vBIOS found\n"); 4538 r = -EINVAL; 4539 goto failed; 4540 } 4541 DRM_INFO("GPU posting now...\n"); 4542 r = amdgpu_device_asic_init(adev); 4543 if (r) { 4544 dev_err(adev->dev, "gpu post error!\n"); 4545 goto failed; 4546 } 4547 } 4548 4549 if (adev->bios) { 4550 if (adev->is_atom_fw) { 4551 /* Initialize clocks */ 4552 r = amdgpu_atomfirmware_get_clock_info(adev); 4553 if (r) { 4554 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4555 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4556 goto failed; 4557 } 4558 } else { 4559 /* Initialize clocks */ 4560 r = amdgpu_atombios_get_clock_info(adev); 4561 if (r) { 4562 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4563 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4564 goto failed; 4565 } 4566 /* init i2c buses */ 4567 amdgpu_i2c_init(adev); 4568 } 4569 } 4570 4571 fence_driver_init: 4572 /* Fence driver */ 4573 r = amdgpu_fence_driver_sw_init(adev); 4574 if (r) { 4575 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4576 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4577 goto failed; 4578 } 4579 4580 /* init the mode config */ 4581 drm_mode_config_init(adev_to_drm(adev)); 4582 4583 r = amdgpu_device_ip_init(adev); 4584 if (r) { 4585 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4586 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4587 goto release_ras_con; 4588 } 4589 4590 amdgpu_fence_driver_hw_init(adev); 4591 4592 dev_info(adev->dev, 4593 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4594 
adev->gfx.config.max_shader_engines, 4595 adev->gfx.config.max_sh_per_se, 4596 adev->gfx.config.max_cu_per_sh, 4597 adev->gfx.cu_info.number); 4598 4599 adev->accel_working = true; 4600 4601 amdgpu_vm_check_compute_bug(adev); 4602 4603 /* Initialize the buffer migration limit. */ 4604 if (amdgpu_moverate >= 0) 4605 max_MBps = amdgpu_moverate; 4606 else 4607 max_MBps = 8; /* Allow 8 MB/s. */ 4608 /* Get a log2 for easy divisions. */ 4609 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4610 4611 /* 4612 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4613 * Otherwise the mgpu fan boost feature will be skipped due to the 4614 * gpu instance is counted less. 4615 */ 4616 amdgpu_register_gpu_instance(adev); 4617 4618 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4619 * explicit gating rather than handling it automatically. 4620 */ 4621 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 4622 r = amdgpu_device_ip_late_init(adev); 4623 if (r) { 4624 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4625 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4626 goto release_ras_con; 4627 } 4628 /* must succeed. */ 4629 amdgpu_ras_resume(adev); 4630 queue_delayed_work(system_wq, &adev->delayed_init_work, 4631 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4632 } 4633 4634 if (amdgpu_sriov_vf(adev)) { 4635 amdgpu_virt_release_full_gpu(adev, true); 4636 flush_delayed_work(&adev->delayed_init_work); 4637 } 4638 4639 /* 4640 * Place those sysfs registering after `late_init`. As some of those 4641 * operations performed in `late_init` might affect the sysfs 4642 * interfaces creating. 4643 */ 4644 r = amdgpu_atombios_sysfs_init(adev); 4645 if (r) 4646 drm_err(&adev->ddev, 4647 "registering atombios sysfs failed (%d).\n", r); 4648 4649 r = amdgpu_pm_sysfs_init(adev); 4650 if (r) 4651 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 4652 4653 r = amdgpu_ucode_sysfs_init(adev); 4654 if (r) { 4655 adev->ucode_sysfs_en = false; 4656 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 4657 } else 4658 adev->ucode_sysfs_en = true; 4659 4660 r = amdgpu_device_attr_sysfs_init(adev); 4661 if (r) 4662 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4663 4664 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4665 if (r) 4666 dev_err(adev->dev, 4667 "Could not create amdgpu board attributes\n"); 4668 4669 amdgpu_fru_sysfs_init(adev); 4670 amdgpu_reg_state_sysfs_init(adev); 4671 amdgpu_xcp_cfg_sysfs_init(adev); 4672 4673 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4674 r = amdgpu_pmu_init(adev); 4675 if (r) 4676 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4677 4678 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4679 if (amdgpu_device_cache_pci_state(adev->pdev)) 4680 pci_restore_state(pdev); 4681 4682 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4683 /* this will fail for cards that aren't VGA class devices, just 4684 * ignore it 4685 */ 4686 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4687 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4688 4689 px = amdgpu_device_supports_px(ddev); 4690 4691 if (px || (!dev_is_removable(&adev->pdev->dev) && 4692 apple_gmux_detect(NULL, NULL))) 4693 vga_switcheroo_register_client(adev->pdev, 4694 &amdgpu_switcheroo_ops, px); 4695 4696 if (px) 4697 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4698 4699 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 4700 
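/* carry out the hive reset that was flagged as pending during early init */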
amdgpu_xgmi_reset_on_init(adev); 4701 4702 amdgpu_device_check_iommu_direct_map(adev); 4703 4704 adev->pm_nb.notifier_call = amdgpu_device_pm_notifier; 4705 r = register_pm_notifier(&adev->pm_nb); 4706 if (r) 4707 goto failed; 4708 4709 return 0; 4710 4711 release_ras_con: 4712 if (amdgpu_sriov_vf(adev)) 4713 amdgpu_virt_release_full_gpu(adev, true); 4714 4715 /* failed in exclusive mode due to timeout */ 4716 if (amdgpu_sriov_vf(adev) && 4717 !amdgpu_sriov_runtime(adev) && 4718 amdgpu_virt_mmio_blocked(adev) && 4719 !amdgpu_virt_wait_reset(adev)) { 4720 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4721 /* Don't send request since VF is inactive. */ 4722 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4723 adev->virt.ops = NULL; 4724 r = -EAGAIN; 4725 } 4726 amdgpu_release_ras_context(adev); 4727 4728 failed: 4729 amdgpu_vf_error_trans_all(adev); 4730 4731 return r; 4732 } 4733 4734 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4735 { 4736 4737 /* Clear all CPU mappings pointing to this device */ 4738 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4739 4740 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4741 amdgpu_doorbell_fini(adev); 4742 4743 iounmap(adev->rmmio); 4744 adev->rmmio = NULL; 4745 if (adev->mman.aper_base_kaddr) 4746 iounmap(adev->mman.aper_base_kaddr); 4747 adev->mman.aper_base_kaddr = NULL; 4748 4749 /* Memory manager related */ 4750 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4751 arch_phys_wc_del(adev->gmc.vram_mtrr); 4752 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4753 } 4754 } 4755 4756 /** 4757 * amdgpu_device_fini_hw - tear down the driver 4758 * 4759 * @adev: amdgpu_device pointer 4760 * 4761 * Tear down the driver info (all asics). 4762 * Called at driver shutdown. 
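 *
 * Only the hardware-facing state is torn down here; the remaining software
 * state is released later by amdgpu_device_fini_sw().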
4763 */ 4764 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4765 { 4766 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4767 flush_delayed_work(&adev->delayed_init_work); 4768 4769 if (adev->mman.initialized) 4770 drain_workqueue(adev->mman.bdev.wq); 4771 adev->shutdown = true; 4772 4773 unregister_pm_notifier(&adev->pm_nb); 4774 4775 /* make sure IB test finished before entering exclusive mode 4776 * to avoid preemption on IB test 4777 */ 4778 if (amdgpu_sriov_vf(adev)) { 4779 amdgpu_virt_request_full_gpu(adev, false); 4780 amdgpu_virt_fini_data_exchange(adev); 4781 } 4782 4783 /* disable all interrupts */ 4784 amdgpu_irq_disable_all(adev); 4785 if (adev->mode_info.mode_config_initialized) { 4786 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4787 drm_helper_force_disable_all(adev_to_drm(adev)); 4788 else 4789 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4790 } 4791 amdgpu_fence_driver_hw_fini(adev); 4792 4793 if (adev->pm.sysfs_initialized) 4794 amdgpu_pm_sysfs_fini(adev); 4795 if (adev->ucode_sysfs_en) 4796 amdgpu_ucode_sysfs_fini(adev); 4797 amdgpu_device_attr_sysfs_fini(adev); 4798 amdgpu_fru_sysfs_fini(adev); 4799 4800 amdgpu_reg_state_sysfs_fini(adev); 4801 amdgpu_xcp_cfg_sysfs_fini(adev); 4802 4803 /* disable ras feature must before hw fini */ 4804 amdgpu_ras_pre_fini(adev); 4805 4806 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4807 4808 amdgpu_device_ip_fini_early(adev); 4809 4810 amdgpu_irq_fini_hw(adev); 4811 4812 if (adev->mman.initialized) 4813 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4814 4815 amdgpu_gart_dummy_page_fini(adev); 4816 4817 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4818 amdgpu_device_unmap_mmio(adev); 4819 4820 } 4821 4822 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4823 { 4824 int i, idx; 4825 bool px; 4826 4827 amdgpu_device_ip_fini(adev); 4828 amdgpu_fence_driver_sw_fini(adev); 4829 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4830 adev->accel_working = false; 4831 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4832 for (i = 0; i < MAX_XCP; ++i) { 4833 dma_fence_put(adev->isolation[i].spearhead); 4834 amdgpu_sync_free(&adev->isolation[i].active); 4835 amdgpu_sync_free(&adev->isolation[i].prev); 4836 } 4837 4838 amdgpu_reset_fini(adev); 4839 4840 /* free i2c buses */ 4841 amdgpu_i2c_fini(adev); 4842 4843 if (adev->bios) { 4844 if (amdgpu_emu_mode != 1) 4845 amdgpu_atombios_fini(adev); 4846 amdgpu_bios_release(adev); 4847 } 4848 4849 kfree(adev->fru_info); 4850 adev->fru_info = NULL; 4851 4852 kfree(adev->xcp_mgr); 4853 adev->xcp_mgr = NULL; 4854 4855 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4856 4857 if (px || (!dev_is_removable(&adev->pdev->dev) && 4858 apple_gmux_detect(NULL, NULL))) 4859 vga_switcheroo_unregister_client(adev->pdev); 4860 4861 if (px) 4862 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4863 4864 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4865 vga_client_unregister(adev->pdev); 4866 4867 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4868 4869 iounmap(adev->rmmio); 4870 adev->rmmio = NULL; 4871 amdgpu_doorbell_fini(adev); 4872 drm_dev_exit(idx); 4873 } 4874 4875 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4876 amdgpu_pmu_fini(adev); 4877 if (adev->mman.discovery_bin) 4878 amdgpu_discovery_fini(adev); 4879 4880 amdgpu_reset_put_reset_domain(adev->reset_domain); 4881 adev->reset_domain = NULL; 4882 4883 kfree(adev->pci_state); 4884 4885 } 4886 4887 /** 4888 * amdgpu_device_evict_resources - evict device resources 4889 * @adev: amdgpu device object 4890 * 4891 * 
Evicts all ttm device resources(vram BOs, gart table) from the lru list 4892 * of the vram memory type. Mainly used for evicting device resources 4893 * at suspend time. 4894 * 4895 */ 4896 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4897 { 4898 int ret; 4899 4900 /* No need to evict vram on APUs unless going to S4 */ 4901 if (!adev->in_s4 && (adev->flags & AMD_IS_APU)) 4902 return 0; 4903 4904 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4905 if (ret) 4906 DRM_WARN("evicting device resources failed\n"); 4907 return ret; 4908 } 4909 4910 /* 4911 * Suspend & resume. 4912 */ 4913 /** 4914 * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events 4915 * @nb: notifier block 4916 * @mode: suspend mode 4917 * @data: data 4918 * 4919 * This function is called when the system is about to suspend or hibernate. 4920 * It is used to evict resources from the device before the system goes to 4921 * sleep while there is still access to swap. 4922 */ 4923 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 4924 void *data) 4925 { 4926 struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb); 4927 int r; 4928 4929 switch (mode) { 4930 case PM_HIBERNATION_PREPARE: 4931 adev->in_s4 = true; 4932 fallthrough; 4933 case PM_SUSPEND_PREPARE: 4934 r = amdgpu_device_evict_resources(adev); 4935 /* 4936 * This is considered non-fatal at this time because 4937 * amdgpu_device_prepare() will also fatally evict resources. 4938 * See https://gitlab.freedesktop.org/drm/amd/-/issues/3781 4939 */ 4940 if (r) 4941 drm_warn(adev_to_drm(adev), "Failed to evict resources, freeze active processes if problems occur: %d\n", r); 4942 break; 4943 } 4944 4945 return NOTIFY_DONE; 4946 } 4947 4948 /** 4949 * amdgpu_device_prepare - prepare for device suspend 4950 * 4951 * @dev: drm dev pointer 4952 * 4953 * Prepare to put the hw in the suspend state (all asics). 4954 * Returns 0 for success or an error on failure. 4955 * Called at driver suspend. 4956 */ 4957 int amdgpu_device_prepare(struct drm_device *dev) 4958 { 4959 struct amdgpu_device *adev = drm_to_adev(dev); 4960 int i, r; 4961 4962 amdgpu_choose_low_power_state(adev); 4963 4964 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4965 return 0; 4966 4967 /* Evict the majority of BOs before starting suspend sequence */ 4968 r = amdgpu_device_evict_resources(adev); 4969 if (r) 4970 goto unprepare; 4971 4972 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 4973 4974 for (i = 0; i < adev->num_ip_blocks; i++) { 4975 if (!adev->ip_blocks[i].status.valid) 4976 continue; 4977 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 4978 continue; 4979 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]); 4980 if (r) 4981 goto unprepare; 4982 } 4983 4984 return 0; 4985 4986 unprepare: 4987 adev->in_s0ix = adev->in_s3 = adev->in_s4 = false; 4988 4989 return r; 4990 } 4991 4992 /** 4993 * amdgpu_device_suspend - initiate device suspend 4994 * 4995 * @dev: drm dev pointer 4996 * @notify_clients: notify in-kernel DRM clients 4997 * 4998 * Puts the hw in the suspend state (all asics). 4999 * Returns 0 for success or an error on failure. 5000 * Called at driver suspend. 
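 *
 * A minimal caller sketch for the system sleep path (illustration only;
 * the real PM callbacks add more handling around it):
 *
 *   r = amdgpu_device_prepare(dev);
 *   if (!r)
 *           r = amdgpu_device_suspend(dev, true);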
5001 */ 5002 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) 5003 { 5004 struct amdgpu_device *adev = drm_to_adev(dev); 5005 int r = 0; 5006 5007 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5008 return 0; 5009 5010 adev->in_suspend = true; 5011 5012 if (amdgpu_sriov_vf(adev)) { 5013 amdgpu_virt_fini_data_exchange(adev); 5014 r = amdgpu_virt_request_full_gpu(adev, false); 5015 if (r) 5016 return r; 5017 } 5018 5019 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 5020 DRM_WARN("smart shift update failed\n"); 5021 5022 if (notify_clients) 5023 drm_client_dev_suspend(adev_to_drm(adev), false); 5024 5025 cancel_delayed_work_sync(&adev->delayed_init_work); 5026 5027 amdgpu_ras_suspend(adev); 5028 5029 amdgpu_device_ip_suspend_phase1(adev); 5030 5031 if (!adev->in_s0ix) 5032 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 5033 5034 r = amdgpu_device_evict_resources(adev); 5035 if (r) 5036 return r; 5037 5038 amdgpu_ttm_set_buffer_funcs_status(adev, false); 5039 5040 amdgpu_fence_driver_hw_fini(adev); 5041 5042 amdgpu_device_ip_suspend_phase2(adev); 5043 5044 if (amdgpu_sriov_vf(adev)) 5045 amdgpu_virt_release_full_gpu(adev, false); 5046 5047 r = amdgpu_dpm_notify_rlc_state(adev, false); 5048 if (r) 5049 return r; 5050 5051 return 0; 5052 } 5053 5054 /** 5055 * amdgpu_device_resume - initiate device resume 5056 * 5057 * @dev: drm dev pointer 5058 * @notify_clients: notify in-kernel DRM clients 5059 * 5060 * Bring the hw back to operating state (all asics). 5061 * Returns 0 for success or an error on failure. 5062 * Called at driver resume. 5063 */ 5064 int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) 5065 { 5066 struct amdgpu_device *adev = drm_to_adev(dev); 5067 int r = 0; 5068 5069 if (amdgpu_sriov_vf(adev)) { 5070 r = amdgpu_virt_request_full_gpu(adev, true); 5071 if (r) 5072 return r; 5073 } 5074 5075 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5076 return 0; 5077 5078 if (adev->in_s0ix) 5079 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 5080 5081 /* post card */ 5082 if (amdgpu_device_need_post(adev)) { 5083 r = amdgpu_device_asic_init(adev); 5084 if (r) 5085 dev_err(adev->dev, "amdgpu asic init failed\n"); 5086 } 5087 5088 r = amdgpu_device_ip_resume(adev); 5089 5090 if (r) { 5091 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 5092 goto exit; 5093 } 5094 5095 if (!adev->in_s0ix) { 5096 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 5097 if (r) 5098 goto exit; 5099 } 5100 5101 r = amdgpu_device_ip_late_init(adev); 5102 if (r) 5103 goto exit; 5104 5105 queue_delayed_work(system_wq, &adev->delayed_init_work, 5106 msecs_to_jiffies(AMDGPU_RESUME_MS)); 5107 exit: 5108 if (amdgpu_sriov_vf(adev)) { 5109 amdgpu_virt_init_data_exchange(adev); 5110 amdgpu_virt_release_full_gpu(adev, true); 5111 } 5112 5113 if (r) 5114 return r; 5115 5116 /* Make sure IB tests flushed */ 5117 flush_delayed_work(&adev->delayed_init_work); 5118 5119 if (notify_clients) 5120 drm_client_dev_resume(adev_to_drm(adev), false); 5121 5122 amdgpu_ras_resume(adev); 5123 5124 if (adev->mode_info.num_crtc) { 5125 /* 5126 * Most of the connector probing functions try to acquire runtime pm 5127 * refs to ensure that the GPU is powered on when connector polling is 5128 * performed. Since we're calling this from a runtime PM callback, 5129 * trying to acquire rpm refs will cause us to deadlock. 
5130 * 5131 * Since we're guaranteed to be holding the rpm lock, it's safe to 5132 * temporarily disable the rpm helpers so this doesn't deadlock us. 5133 */ 5134 #ifdef CONFIG_PM 5135 dev->dev->power.disable_depth++; 5136 #endif 5137 if (!adev->dc_enabled) 5138 drm_helper_hpd_irq_event(dev); 5139 else 5140 drm_kms_helper_hotplug_event(dev); 5141 #ifdef CONFIG_PM 5142 dev->dev->power.disable_depth--; 5143 #endif 5144 } 5145 adev->in_suspend = false; 5146 5147 if (adev->enable_mes) 5148 amdgpu_mes_self_test(adev); 5149 5150 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 5151 DRM_WARN("smart shift update failed\n"); 5152 5153 return 0; 5154 } 5155 5156 /** 5157 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 5158 * 5159 * @adev: amdgpu_device pointer 5160 * 5161 * The list of all the hardware IPs that make up the asic is walked and 5162 * the check_soft_reset callbacks are run. check_soft_reset determines 5163 * if the asic is still hung or not. 5164 * Returns true if any of the IPs are still in a hung state, false if not. 5165 */ 5166 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 5167 { 5168 int i; 5169 bool asic_hang = false; 5170 5171 if (amdgpu_sriov_vf(adev)) 5172 return true; 5173 5174 if (amdgpu_asic_need_full_reset(adev)) 5175 return true; 5176 5177 for (i = 0; i < adev->num_ip_blocks; i++) { 5178 if (!adev->ip_blocks[i].status.valid) 5179 continue; 5180 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 5181 adev->ip_blocks[i].status.hang = 5182 adev->ip_blocks[i].version->funcs->check_soft_reset( 5183 &adev->ip_blocks[i]); 5184 if (adev->ip_blocks[i].status.hang) { 5185 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 5186 asic_hang = true; 5187 } 5188 } 5189 return asic_hang; 5190 } 5191 5192 /** 5193 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 5194 * 5195 * @adev: amdgpu_device pointer 5196 * 5197 * The list of all the hardware IPs that make up the asic is walked and the 5198 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 5199 * handles any IP specific hardware or software state changes that are 5200 * necessary for a soft reset to succeed. 5201 * Returns 0 on success, negative error code on failure. 5202 */ 5203 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 5204 { 5205 int i, r = 0; 5206 5207 for (i = 0; i < adev->num_ip_blocks; i++) { 5208 if (!adev->ip_blocks[i].status.valid) 5209 continue; 5210 if (adev->ip_blocks[i].status.hang && 5211 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 5212 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]); 5213 if (r) 5214 return r; 5215 } 5216 } 5217 5218 return 0; 5219 } 5220 5221 /** 5222 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 5223 * 5224 * @adev: amdgpu_device pointer 5225 * 5226 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 5227 * reset is necessary to recover. 5228 * Returns true if a full asic reset is required, false if not. 
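 *
 * As the checks below show, a hang in GMC, SMC, ACP, DCE or PSP is treated
 * as not recoverable by soft reset and therefore forces the full reset path.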
5229 */ 5230 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 5231 { 5232 int i; 5233 5234 if (amdgpu_asic_need_full_reset(adev)) 5235 return true; 5236 5237 for (i = 0; i < adev->num_ip_blocks; i++) { 5238 if (!adev->ip_blocks[i].status.valid) 5239 continue; 5240 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 5241 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 5242 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 5243 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 5244 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 5245 if (adev->ip_blocks[i].status.hang) { 5246 dev_info(adev->dev, "Some block need full reset!\n"); 5247 return true; 5248 } 5249 } 5250 } 5251 return false; 5252 } 5253 5254 /** 5255 * amdgpu_device_ip_soft_reset - do a soft reset 5256 * 5257 * @adev: amdgpu_device pointer 5258 * 5259 * The list of all the hardware IPs that make up the asic is walked and the 5260 * soft_reset callbacks are run if the block is hung. soft_reset handles any 5261 * IP specific hardware or software state changes that are necessary to soft 5262 * reset the IP. 5263 * Returns 0 on success, negative error code on failure. 5264 */ 5265 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 5266 { 5267 int i, r = 0; 5268 5269 for (i = 0; i < adev->num_ip_blocks; i++) { 5270 if (!adev->ip_blocks[i].status.valid) 5271 continue; 5272 if (adev->ip_blocks[i].status.hang && 5273 adev->ip_blocks[i].version->funcs->soft_reset) { 5274 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]); 5275 if (r) 5276 return r; 5277 } 5278 } 5279 5280 return 0; 5281 } 5282 5283 /** 5284 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 5285 * 5286 * @adev: amdgpu_device pointer 5287 * 5288 * The list of all the hardware IPs that make up the asic is walked and the 5289 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 5290 * handles any IP specific hardware or software state changes that are 5291 * necessary after the IP has been soft reset. 5292 * Returns 0 on success, negative error code on failure. 
5293 */ 5294 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 5295 { 5296 int i, r = 0; 5297 5298 for (i = 0; i < adev->num_ip_blocks; i++) { 5299 if (!adev->ip_blocks[i].status.valid) 5300 continue; 5301 if (adev->ip_blocks[i].status.hang && 5302 adev->ip_blocks[i].version->funcs->post_soft_reset) 5303 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]); 5304 if (r) 5305 return r; 5306 } 5307 5308 return 0; 5309 } 5310 5311 /** 5312 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 5313 * 5314 * @adev: amdgpu_device pointer 5315 * @reset_context: amdgpu reset context pointer 5316 * 5317 * do VF FLR and reinitialize Asic 5318 * return 0 means succeeded otherwise failed 5319 */ 5320 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 5321 struct amdgpu_reset_context *reset_context) 5322 { 5323 int r; 5324 struct amdgpu_hive_info *hive = NULL; 5325 5326 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { 5327 if (!amdgpu_ras_get_fed_status(adev)) 5328 amdgpu_virt_ready_to_reset(adev); 5329 amdgpu_virt_wait_reset(adev); 5330 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5331 r = amdgpu_virt_request_full_gpu(adev, true); 5332 } else { 5333 r = amdgpu_virt_reset_gpu(adev); 5334 } 5335 if (r) 5336 return r; 5337 5338 amdgpu_ras_clear_err_state(adev); 5339 amdgpu_irq_gpu_reset_resume_helper(adev); 5340 5341 /* some sw clean up VF needs to do before recover */ 5342 amdgpu_virt_post_reset(adev); 5343 5344 /* Resume IP prior to SMC */ 5345 r = amdgpu_device_ip_reinit_early_sriov(adev); 5346 if (r) 5347 return r; 5348 5349 amdgpu_virt_init_data_exchange(adev); 5350 5351 r = amdgpu_device_fw_loading(adev); 5352 if (r) 5353 return r; 5354 5355 /* now we are okay to resume SMC/CP/SDMA */ 5356 r = amdgpu_device_ip_reinit_late_sriov(adev); 5357 if (r) 5358 return r; 5359 5360 hive = amdgpu_get_xgmi_hive(adev); 5361 /* Update PSP FW topology after reset */ 5362 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5363 r = amdgpu_xgmi_update_topology(hive, adev); 5364 if (hive) 5365 amdgpu_put_xgmi_hive(hive); 5366 if (r) 5367 return r; 5368 5369 r = amdgpu_ib_ring_tests(adev); 5370 if (r) 5371 return r; 5372 5373 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) 5374 amdgpu_inc_vram_lost(adev); 5375 5376 /* need to be called during full access so we can't do it later like 5377 * bare-metal does. 5378 */ 5379 amdgpu_amdkfd_post_reset(adev); 5380 amdgpu_virt_release_full_gpu(adev, true); 5381 5382 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5383 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) || 5384 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 5385 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 5386 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) || 5387 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5388 amdgpu_ras_resume(adev); 5389 5390 amdgpu_virt_ras_telemetry_post_reset(adev); 5391 5392 return 0; 5393 } 5394 5395 /** 5396 * amdgpu_device_has_job_running - check if there is any unfinished job 5397 * 5398 * @adev: amdgpu_device pointer 5399 * 5400 * check if there is any job running on the device when guest driver receives 5401 * FLR notification from host driver. If there are still jobs running, then 5402 * the guest driver will not respond the FLR reset. Instead, let the job hit 5403 * the timeout and guest driver then issue the reset request. 
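 *
 * Return: true if any ready ring still has emitted fences outstanding,
 * false otherwise.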
5404 */ 5405 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5406 { 5407 int i; 5408 5409 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5410 struct amdgpu_ring *ring = adev->rings[i]; 5411 5412 if (!amdgpu_ring_sched_ready(ring)) 5413 continue; 5414 5415 if (amdgpu_fence_count_emitted(ring)) 5416 return true; 5417 } 5418 return false; 5419 } 5420 5421 /** 5422 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5423 * 5424 * @adev: amdgpu_device pointer 5425 * 5426 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5427 * a hung GPU. 5428 */ 5429 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5430 { 5431 5432 if (amdgpu_gpu_recovery == 0) 5433 goto disabled; 5434 5435 /* Skip soft reset check in fatal error mode */ 5436 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5437 return true; 5438 5439 if (amdgpu_sriov_vf(adev)) 5440 return true; 5441 5442 if (amdgpu_gpu_recovery == -1) { 5443 switch (adev->asic_type) { 5444 #ifdef CONFIG_DRM_AMDGPU_SI 5445 case CHIP_VERDE: 5446 case CHIP_TAHITI: 5447 case CHIP_PITCAIRN: 5448 case CHIP_OLAND: 5449 case CHIP_HAINAN: 5450 #endif 5451 #ifdef CONFIG_DRM_AMDGPU_CIK 5452 case CHIP_KAVERI: 5453 case CHIP_KABINI: 5454 case CHIP_MULLINS: 5455 #endif 5456 case CHIP_CARRIZO: 5457 case CHIP_STONEY: 5458 case CHIP_CYAN_SKILLFISH: 5459 goto disabled; 5460 default: 5461 break; 5462 } 5463 } 5464 5465 return true; 5466 5467 disabled: 5468 dev_info(adev->dev, "GPU recovery disabled.\n"); 5469 return false; 5470 } 5471 5472 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5473 { 5474 u32 i; 5475 int ret = 0; 5476 5477 if (adev->bios) 5478 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5479 5480 dev_info(adev->dev, "GPU mode1 reset\n"); 5481 5482 /* Cache the state before bus master disable. The saved config space 5483 * values are used in other cases like restore after mode-2 reset. 
5484 */ 5485 amdgpu_device_cache_pci_state(adev->pdev); 5486 5487 /* disable BM */ 5488 pci_clear_master(adev->pdev); 5489 5490 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5491 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5492 ret = amdgpu_dpm_mode1_reset(adev); 5493 } else { 5494 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5495 ret = psp_gpu_reset(adev); 5496 } 5497 5498 if (ret) 5499 goto mode1_reset_failed; 5500 5501 amdgpu_device_load_pci_state(adev->pdev); 5502 ret = amdgpu_psp_wait_for_bootloader(adev); 5503 if (ret) 5504 goto mode1_reset_failed; 5505 5506 /* wait for asic to come out of reset */ 5507 for (i = 0; i < adev->usec_timeout; i++) { 5508 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5509 5510 if (memsize != 0xffffffff) 5511 break; 5512 udelay(1); 5513 } 5514 5515 if (i >= adev->usec_timeout) { 5516 ret = -ETIMEDOUT; 5517 goto mode1_reset_failed; 5518 } 5519 5520 if (adev->bios) 5521 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5522 5523 return 0; 5524 5525 mode1_reset_failed: 5526 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5527 return ret; 5528 } 5529 5530 int amdgpu_device_link_reset(struct amdgpu_device *adev) 5531 { 5532 int ret = 0; 5533 5534 dev_info(adev->dev, "GPU link reset\n"); 5535 5536 if (!adev->pcie_reset_ctx.occurs_dpc) 5537 ret = amdgpu_dpm_link_reset(adev); 5538 5539 if (ret) 5540 goto link_reset_failed; 5541 5542 ret = amdgpu_psp_wait_for_bootloader(adev); 5543 if (ret) 5544 goto link_reset_failed; 5545 5546 return 0; 5547 5548 link_reset_failed: 5549 dev_err(adev->dev, "GPU link reset failed\n"); 5550 return ret; 5551 } 5552 5553 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5554 struct amdgpu_reset_context *reset_context) 5555 { 5556 int i, r = 0; 5557 struct amdgpu_job *job = NULL; 5558 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; 5559 bool need_full_reset = 5560 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5561 5562 if (reset_context->reset_req_dev == adev) 5563 job = reset_context->job; 5564 5565 if (amdgpu_sriov_vf(adev)) 5566 amdgpu_virt_pre_reset(adev); 5567 5568 amdgpu_fence_driver_isr_toggle(adev, true); 5569 5570 /* block all schedulers and reset given job's ring */ 5571 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5572 struct amdgpu_ring *ring = adev->rings[i]; 5573 5574 if (!amdgpu_ring_sched_ready(ring)) 5575 continue; 5576 5577 /* Clear job fence from fence drv to avoid force_completion 5578 * leave NULL and vm flush fence in fence drv 5579 */ 5580 amdgpu_fence_driver_clear_job_fences(ring); 5581 5582 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5583 amdgpu_fence_driver_force_completion(ring); 5584 } 5585 5586 amdgpu_fence_driver_isr_toggle(adev, false); 5587 5588 if (job && job->vm) 5589 drm_sched_increase_karma(&job->base); 5590 5591 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5592 /* If reset handler not implemented, continue; otherwise return */ 5593 if (r == -EOPNOTSUPP) 5594 r = 0; 5595 else 5596 return r; 5597 5598 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5599 if (!amdgpu_sriov_vf(adev)) { 5600 5601 if (!need_full_reset) 5602 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5603 5604 if (!need_full_reset && amdgpu_gpu_recovery && 5605 amdgpu_device_ip_check_soft_reset(adev)) { 5606 amdgpu_device_ip_pre_soft_reset(adev); 5607 r = amdgpu_device_ip_soft_reset(adev); 5608 amdgpu_device_ip_post_soft_reset(adev); 5609 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5610 
dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5611 need_full_reset = true; 5612 } 5613 } 5614 5615 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { 5616 dev_info(tmp_adev->dev, "Dumping IP State\n"); 5617 /* Trigger ip dump before we reset the asic */ 5618 for (i = 0; i < tmp_adev->num_ip_blocks; i++) 5619 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) 5620 tmp_adev->ip_blocks[i].version->funcs 5621 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]); 5622 dev_info(tmp_adev->dev, "Dumping IP State Completed\n"); 5623 } 5624 5625 if (need_full_reset) 5626 r = amdgpu_device_ip_suspend(adev); 5627 if (need_full_reset) 5628 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5629 else 5630 clear_bit(AMDGPU_NEED_FULL_RESET, 5631 &reset_context->flags); 5632 } 5633 5634 return r; 5635 } 5636 5637 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) 5638 { 5639 struct list_head *device_list_handle; 5640 bool full_reset, vram_lost = false; 5641 struct amdgpu_device *tmp_adev; 5642 int r, init_level; 5643 5644 device_list_handle = reset_context->reset_device_list; 5645 5646 if (!device_list_handle) 5647 return -EINVAL; 5648 5649 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5650 5651 /** 5652 * If it's reset on init, it's default init level, otherwise keep level 5653 * as recovery level. 5654 */ 5655 if (reset_context->method == AMD_RESET_METHOD_ON_INIT) 5656 init_level = AMDGPU_INIT_LEVEL_DEFAULT; 5657 else 5658 init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY; 5659 5660 r = 0; 5661 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5662 amdgpu_set_init_level(tmp_adev, init_level); 5663 if (full_reset) { 5664 /* post card */ 5665 amdgpu_ras_clear_err_state(tmp_adev); 5666 r = amdgpu_device_asic_init(tmp_adev); 5667 if (r) { 5668 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5669 } else { 5670 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5671 5672 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5673 if (r) 5674 goto out; 5675 5676 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5677 5678 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 5679 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job); 5680 5681 if (vram_lost) { 5682 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5683 amdgpu_inc_vram_lost(tmp_adev); 5684 } 5685 5686 r = amdgpu_device_fw_loading(tmp_adev); 5687 if (r) 5688 return r; 5689 5690 r = amdgpu_xcp_restore_partition_mode( 5691 tmp_adev->xcp_mgr); 5692 if (r) 5693 goto out; 5694 5695 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5696 if (r) 5697 goto out; 5698 5699 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 5700 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5701 5702 r = amdgpu_device_ip_resume_phase3(tmp_adev); 5703 if (r) 5704 goto out; 5705 5706 if (vram_lost) 5707 amdgpu_device_fill_reset_magic(tmp_adev); 5708 5709 /* 5710 * Add this ASIC as tracked as reset was already 5711 * complete successfully. 5712 */ 5713 amdgpu_register_gpu_instance(tmp_adev); 5714 5715 if (!reset_context->hive && 5716 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5717 amdgpu_xgmi_add_device(tmp_adev); 5718 5719 r = amdgpu_device_ip_late_init(tmp_adev); 5720 if (r) 5721 goto out; 5722 5723 drm_client_dev_resume(adev_to_drm(tmp_adev), false); 5724 5725 /* 5726 * The GPU enters bad state once faulty pages 5727 * by ECC has reached the threshold, and ras 5728 * recovery is scheduled next. 
So add one check 5729 * here to break recovery if it indeed exceeds 5730 * bad page threshold, and remind user to 5731 * retire this GPU or setting one bigger 5732 * bad_page_threshold value to fix this once 5733 * probing driver again. 5734 */ 5735 if (!amdgpu_ras_is_rma(tmp_adev)) { 5736 /* must succeed. */ 5737 amdgpu_ras_resume(tmp_adev); 5738 } else { 5739 r = -EINVAL; 5740 goto out; 5741 } 5742 5743 /* Update PSP FW topology after reset */ 5744 if (reset_context->hive && 5745 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5746 r = amdgpu_xgmi_update_topology( 5747 reset_context->hive, tmp_adev); 5748 } 5749 } 5750 5751 out: 5752 if (!r) { 5753 /* IP init is complete now, set level as default */ 5754 amdgpu_set_init_level(tmp_adev, 5755 AMDGPU_INIT_LEVEL_DEFAULT); 5756 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5757 r = amdgpu_ib_ring_tests(tmp_adev); 5758 if (r) { 5759 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5760 r = -EAGAIN; 5761 goto end; 5762 } 5763 } 5764 5765 if (r) 5766 tmp_adev->asic_reset_res = r; 5767 } 5768 5769 end: 5770 return r; 5771 } 5772 5773 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5774 struct amdgpu_reset_context *reset_context) 5775 { 5776 struct amdgpu_device *tmp_adev = NULL; 5777 bool need_full_reset, skip_hw_reset; 5778 int r = 0; 5779 5780 /* Try reset handler method first */ 5781 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5782 reset_list); 5783 5784 reset_context->reset_device_list = device_list_handle; 5785 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5786 /* If reset handler not implemented, continue; otherwise return */ 5787 if (r == -EOPNOTSUPP) 5788 r = 0; 5789 else 5790 return r; 5791 5792 /* Reset handler not implemented, use the default method */ 5793 need_full_reset = 5794 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5795 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5796 5797 /* 5798 * ASIC reset has to be done on all XGMI hive nodes ASAP 5799 * to allow proper links negotiation in FW (within 1 sec) 5800 */ 5801 if (!skip_hw_reset && need_full_reset) { 5802 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5803 /* For XGMI run all resets in parallel to speed up the process */ 5804 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5805 if (!queue_work(system_unbound_wq, 5806 &tmp_adev->xgmi_reset_work)) 5807 r = -EALREADY; 5808 } else 5809 r = amdgpu_asic_reset(tmp_adev); 5810 5811 if (r) { 5812 dev_err(tmp_adev->dev, 5813 "ASIC reset failed with error, %d for drm dev, %s", 5814 r, adev_to_drm(tmp_adev)->unique); 5815 goto out; 5816 } 5817 } 5818 5819 /* For XGMI wait for all resets to complete before proceed */ 5820 if (!r) { 5821 list_for_each_entry(tmp_adev, device_list_handle, 5822 reset_list) { 5823 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5824 flush_work(&tmp_adev->xgmi_reset_work); 5825 r = tmp_adev->asic_reset_res; 5826 if (r) 5827 break; 5828 } 5829 } 5830 } 5831 } 5832 5833 if (!r && amdgpu_ras_intr_triggered()) { 5834 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5835 amdgpu_ras_reset_error_count(tmp_adev, 5836 AMDGPU_RAS_BLOCK__MMHUB); 5837 } 5838 5839 amdgpu_ras_intr_cleared(); 5840 } 5841 5842 r = amdgpu_device_reinit_after_reset(reset_context); 5843 if (r == -EAGAIN) 5844 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5845 else 5846 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5847 5848 out: 5849 return r; 5850 } 5851 5852 static void 
amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5853 { 5854 5855 switch (amdgpu_asic_reset_method(adev)) { 5856 case AMD_RESET_METHOD_MODE1: 5857 case AMD_RESET_METHOD_LINK: 5858 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5859 break; 5860 case AMD_RESET_METHOD_MODE2: 5861 adev->mp1_state = PP_MP1_STATE_RESET; 5862 break; 5863 default: 5864 adev->mp1_state = PP_MP1_STATE_NONE; 5865 break; 5866 } 5867 } 5868 5869 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5870 { 5871 amdgpu_vf_error_trans_all(adev); 5872 adev->mp1_state = PP_MP1_STATE_NONE; 5873 } 5874 5875 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5876 { 5877 struct pci_dev *p = NULL; 5878 5879 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5880 adev->pdev->bus->number, 1); 5881 if (p) { 5882 pm_runtime_enable(&(p->dev)); 5883 pm_runtime_resume(&(p->dev)); 5884 } 5885 5886 pci_dev_put(p); 5887 } 5888 5889 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5890 { 5891 enum amd_reset_method reset_method; 5892 struct pci_dev *p = NULL; 5893 u64 expires; 5894 5895 /* 5896 * For now, only BACO and mode1 reset are confirmed 5897 * to suffer the audio issue without proper suspended. 5898 */ 5899 reset_method = amdgpu_asic_reset_method(adev); 5900 if ((reset_method != AMD_RESET_METHOD_BACO) && 5901 (reset_method != AMD_RESET_METHOD_MODE1)) 5902 return -EINVAL; 5903 5904 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5905 adev->pdev->bus->number, 1); 5906 if (!p) 5907 return -ENODEV; 5908 5909 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5910 if (!expires) 5911 /* 5912 * If we cannot get the audio device autosuspend delay, 5913 * a fixed 4S interval will be used. Considering 3S is 5914 * the audio controller default autosuspend delay setting. 5915 * 4S used here is guaranteed to cover that. 5916 */ 5917 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5918 5919 while (!pm_runtime_status_suspended(&(p->dev))) { 5920 if (!pm_runtime_suspend(&(p->dev))) 5921 break; 5922 5923 if (expires < ktime_get_mono_fast_ns()) { 5924 dev_warn(adev->dev, "failed to suspend display audio\n"); 5925 pci_dev_put(p); 5926 /* TODO: abort the succeeding gpu reset? 
*/ 5927 return -ETIMEDOUT; 5928 } 5929 } 5930 5931 pm_runtime_disable(&(p->dev)); 5932 5933 pci_dev_put(p); 5934 return 0; 5935 } 5936 5937 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5938 { 5939 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5940 5941 #if defined(CONFIG_DEBUG_FS) 5942 if (!amdgpu_sriov_vf(adev)) 5943 cancel_work(&adev->reset_work); 5944 #endif 5945 5946 if (adev->kfd.dev) 5947 cancel_work(&adev->kfd.reset_work); 5948 5949 if (amdgpu_sriov_vf(adev)) 5950 cancel_work(&adev->virt.flr_work); 5951 5952 if (con && adev->ras_enabled) 5953 cancel_work(&con->recovery_work); 5954 5955 } 5956 5957 static int amdgpu_device_health_check(struct list_head *device_list_handle) 5958 { 5959 struct amdgpu_device *tmp_adev; 5960 int ret = 0; 5961 u32 status; 5962 5963 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5964 pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status); 5965 if (PCI_POSSIBLE_ERROR(status)) { 5966 dev_err(tmp_adev->dev, "device lost from bus!"); 5967 ret = -ENODEV; 5968 } 5969 } 5970 5971 return ret; 5972 } 5973 5974 static int amdgpu_device_halt_activities(struct amdgpu_device *adev, 5975 struct amdgpu_job *job, 5976 struct amdgpu_reset_context *reset_context, 5977 struct list_head *device_list, 5978 struct amdgpu_hive_info *hive, 5979 bool need_emergency_restart) 5980 { 5981 struct list_head *device_list_handle = NULL; 5982 struct amdgpu_device *tmp_adev = NULL; 5983 int i, r = 0; 5984 5985 /* 5986 * Build list of devices to reset. 5987 * In case we are in XGMI hive mode, resort the device list 5988 * to put adev in the 1st position. 5989 */ 5990 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) { 5991 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5992 list_add_tail(&tmp_adev->reset_list, device_list); 5993 if (adev->shutdown) 5994 tmp_adev->shutdown = true; 5995 if (adev->pcie_reset_ctx.occurs_dpc) 5996 tmp_adev->pcie_reset_ctx.in_link_reset = true; 5997 } 5998 if (!list_is_first(&adev->reset_list, device_list)) 5999 list_rotate_to_front(&adev->reset_list, device_list); 6000 device_list_handle = device_list; 6001 } else { 6002 list_add_tail(&adev->reset_list, device_list); 6003 device_list_handle = device_list; 6004 } 6005 6006 if (!amdgpu_sriov_vf(adev) && (!adev->pcie_reset_ctx.occurs_dpc)) { 6007 r = amdgpu_device_health_check(device_list_handle); 6008 if (r) 6009 return r; 6010 } 6011 6012 /* We need to lock reset domain only once both for XGMI and single device */ 6013 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 6014 reset_list); 6015 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 6016 6017 /* block all schedulers and reset given job's ring */ 6018 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6019 6020 amdgpu_device_set_mp1_state(tmp_adev); 6021 6022 /* 6023 * Try to put the audio codec into suspend state 6024 * before gpu reset started. 6025 * 6026 * Due to the power domain of the graphics device 6027 * is shared with AZ power domain. Without this, 6028 * we may change the audio hardware from behind 6029 * the audio driver's back. That will trigger 6030 * some audio codec errors. 
6031 */ 6032 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 6033 tmp_adev->pcie_reset_ctx.audio_suspended = true; 6034 6035 amdgpu_ras_set_error_query_ready(tmp_adev, false); 6036 6037 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 6038 6039 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context); 6040 6041 /* 6042 * Mark these ASICs to be reset as untracked first 6043 * And add them back after reset completed 6044 */ 6045 amdgpu_unregister_gpu_instance(tmp_adev); 6046 6047 drm_client_dev_suspend(adev_to_drm(tmp_adev), false); 6048 6049 /* disable ras on ALL IPs */ 6050 if (!need_emergency_restart && 6051 (!adev->pcie_reset_ctx.occurs_dpc) && 6052 amdgpu_device_ip_need_full_reset(tmp_adev)) 6053 amdgpu_ras_suspend(tmp_adev); 6054 6055 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6056 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6057 6058 if (!amdgpu_ring_sched_ready(ring)) 6059 continue; 6060 6061 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 6062 6063 if (need_emergency_restart) 6064 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 6065 } 6066 atomic_inc(&tmp_adev->gpu_reset_counter); 6067 } 6068 6069 return r; 6070 } 6071 6072 static int amdgpu_device_asic_reset(struct amdgpu_device *adev, 6073 struct list_head *device_list, 6074 struct amdgpu_reset_context *reset_context) 6075 { 6076 struct amdgpu_device *tmp_adev = NULL; 6077 int retry_limit = AMDGPU_MAX_RETRY_LIMIT; 6078 int r = 0; 6079 6080 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 6081 list_for_each_entry(tmp_adev, device_list, reset_list) { 6082 if (adev->pcie_reset_ctx.occurs_dpc) 6083 tmp_adev->no_hw_access = true; 6084 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 6085 if (adev->pcie_reset_ctx.occurs_dpc) 6086 tmp_adev->no_hw_access = false; 6087 /*TODO Should we stop ?*/ 6088 if (r) { 6089 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 6090 r, adev_to_drm(tmp_adev)->unique); 6091 tmp_adev->asic_reset_res = r; 6092 } 6093 } 6094 6095 /* Actual ASIC resets if needed.*/ 6096 /* Host driver will handle XGMI hive reset for SRIOV */ 6097 if (amdgpu_sriov_vf(adev)) { 6098 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { 6099 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); 6100 amdgpu_ras_set_fed(adev, true); 6101 set_bit(AMDGPU_HOST_FLR, &reset_context->flags); 6102 } 6103 6104 r = amdgpu_device_reset_sriov(adev, reset_context); 6105 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) { 6106 amdgpu_virt_release_full_gpu(adev, true); 6107 goto retry; 6108 } 6109 if (r) 6110 adev->asic_reset_res = r; 6111 } else { 6112 r = amdgpu_do_asic_reset(device_list, reset_context); 6113 if (r && r == -EAGAIN) 6114 goto retry; 6115 } 6116 6117 list_for_each_entry(tmp_adev, device_list, reset_list) { 6118 /* 6119 * Drop any pending non scheduler resets queued before reset is done. 6120 * Any reset scheduled after this point would be valid. Scheduler resets 6121 * were already dropped during drm_sched_stop and no new ones can come 6122 * in before drm_sched_start. 
		 */
		amdgpu_device_stop_pending_resets(tmp_adev);
	}

	return r;
}

static int amdgpu_device_sched_resume(struct list_head *device_list,
                                      struct amdgpu_reset_context *reset_context,
                                      bool job_signaled)
{
        struct amdgpu_device *tmp_adev = NULL;
        int i, r = 0;

        /* Post ASIC reset for all devs. */
        list_for_each_entry(tmp_adev, device_list, reset_list) {

                for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
                        struct amdgpu_ring *ring = tmp_adev->rings[i];

                        if (!amdgpu_ring_sched_ready(ring))
                                continue;

                        drm_sched_start(&ring->sched, 0);
                }

                if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
                        drm_helper_resume_force_mode(adev_to_drm(tmp_adev));

                if (tmp_adev->asic_reset_res)
                        r = tmp_adev->asic_reset_res;

                tmp_adev->asic_reset_res = 0;

                if (r) {
                        /* Bad news, how do we tell it to userspace?
                         * For a RAS error we should report the GPU bad status
                         * instead of a reset failure.
                         */
                        if (reset_context->src != AMDGPU_RESET_SRC_RAS ||
                            !amdgpu_ras_eeprom_check_err_threshold(tmp_adev))
                                dev_info(tmp_adev->dev, "GPU reset(%d) failed\n",
                                         atomic_read(&tmp_adev->gpu_reset_counter));
                        amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
                } else {
                        dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
                        if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
                                DRM_WARN("smart shift update failed\n");
                }
        }

        return r;
}

static void amdgpu_device_gpu_resume(struct amdgpu_device *adev,
                                     struct list_head *device_list,
                                     bool need_emergency_restart)
{
        struct amdgpu_device *tmp_adev = NULL;

        list_for_each_entry(tmp_adev, device_list, reset_list) {
                /* unlock kfd: SRIOV would do it separately */
                if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
                        amdgpu_amdkfd_post_reset(tmp_adev);

                /* kfd_post_reset will do nothing if the kfd device is not
                 * initialized, so bring up kfd here if it was not initialized
                 * before.
                 */
                if (!adev->kfd.init_complete)
                        amdgpu_amdkfd_device_init(adev);

                if (tmp_adev->pcie_reset_ctx.audio_suspended)
                        amdgpu_device_resume_display_audio(tmp_adev);

                amdgpu_device_unset_mp1_state(tmp_adev);

                amdgpu_ras_set_error_query_ready(tmp_adev, true);
        }

        tmp_adev = list_first_entry(device_list, struct amdgpu_device,
                                    reset_list);
        amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
}


/**
 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
 *
 * @adev: amdgpu_device pointer
 * @job: the job which triggered the hang
 * @reset_context: amdgpu reset context pointer
 *
 * Attempt to reset the GPU if it has hung (all asics).
 * Attempt to do soft-reset or full-reset and reinitialize the ASIC.
 * Returns 0 for success or an error on failure.
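 *
 * Illustrative sketch (not taken verbatim from the driver): a hang handler
 * such as a job timeout callback would typically fill a reset context before
 * calling this function. The AMDGPU_RESET_SRC_JOB source id below is an
 * assumption for the example; the other fields follow the usage elsewhere in
 * this file.
 *
 *   struct amdgpu_reset_context reset_context;
 *   int r;
 *
 *   memset(&reset_context, 0, sizeof(reset_context));
 *   reset_context.method = AMD_RESET_METHOD_NONE;
 *   reset_context.reset_req_dev = adev;
 *   reset_context.src = AMDGPU_RESET_SRC_JOB;
 *   clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
 *
 *   r = amdgpu_device_gpu_recover(adev, job, &reset_context);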
6220 */ 6221 6222 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 6223 struct amdgpu_job *job, 6224 struct amdgpu_reset_context *reset_context) 6225 { 6226 struct list_head device_list; 6227 bool job_signaled = false; 6228 struct amdgpu_hive_info *hive = NULL; 6229 int r = 0; 6230 bool need_emergency_restart = false; 6231 6232 /* 6233 * If it reaches here because of hang/timeout and a RAS error is 6234 * detected at the same time, let RAS recovery take care of it. 6235 */ 6236 if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) && 6237 !amdgpu_sriov_vf(adev) && 6238 reset_context->src != AMDGPU_RESET_SRC_RAS) { 6239 dev_dbg(adev->dev, 6240 "Gpu recovery from source: %d yielding to RAS error recovery handling", 6241 reset_context->src); 6242 return 0; 6243 } 6244 6245 /* 6246 * Special case: RAS triggered and full reset isn't supported 6247 */ 6248 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 6249 6250 /* 6251 * Flush RAM to disk so that after reboot 6252 * the user can read log and see why the system rebooted. 6253 */ 6254 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 6255 amdgpu_ras_get_context(adev)->reboot) { 6256 DRM_WARN("Emergency reboot."); 6257 6258 ksys_sync_helper(); 6259 emergency_restart(); 6260 } 6261 6262 dev_info(adev->dev, "GPU %s begin!\n", 6263 need_emergency_restart ? "jobs stop":"reset"); 6264 6265 if (!amdgpu_sriov_vf(adev)) 6266 hive = amdgpu_get_xgmi_hive(adev); 6267 if (hive) 6268 mutex_lock(&hive->hive_lock); 6269 6270 reset_context->job = job; 6271 reset_context->hive = hive; 6272 INIT_LIST_HEAD(&device_list); 6273 6274 r = amdgpu_device_halt_activities(adev, job, reset_context, &device_list, 6275 hive, need_emergency_restart); 6276 if (r) 6277 goto end_reset; 6278 6279 if (need_emergency_restart) 6280 goto skip_sched_resume; 6281 /* 6282 * Must check guilty signal here since after this point all old 6283 * HW fences are force signaled. 6284 * 6285 * job->base holds a reference to parent fence 6286 */ 6287 if (job && dma_fence_is_signaled(&job->hw_fence)) { 6288 job_signaled = true; 6289 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 6290 goto skip_hw_reset; 6291 } 6292 6293 r = amdgpu_device_asic_reset(adev, &device_list, reset_context); 6294 if (r) 6295 goto end_reset; 6296 skip_hw_reset: 6297 r = amdgpu_device_sched_resume(&device_list, reset_context, job_signaled); 6298 if (r) 6299 goto end_reset; 6300 skip_sched_resume: 6301 amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart); 6302 end_reset: 6303 if (hive) { 6304 mutex_unlock(&hive->hive_lock); 6305 amdgpu_put_xgmi_hive(hive); 6306 } 6307 6308 if (r) 6309 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 6310 6311 atomic_set(&adev->reset_domain->reset_res, r); 6312 6313 if (!r) 6314 drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE); 6315 6316 return r; 6317 } 6318 6319 /** 6320 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner 6321 * 6322 * @adev: amdgpu_device pointer 6323 * @speed: pointer to the speed of the link 6324 * @width: pointer to the width of the link 6325 * 6326 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6327 * first physical partner to an AMD dGPU. 6328 * This will exclude any virtual switches and links. 
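 *
 * For example, with a topology of Root Port -> dGPU-internal switch (vendor
 * PCI_VENDOR_ID_ATI) -> GPU endpoint and dynamic switching supported, the
 * walk below skips the ATI-owned bridges and reports the capabilities of the
 * Root Port, i.e. the first physical partner outside the GPU package;
 * otherwise the currently negotiated bandwidth is used instead.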
 */
static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
                                            enum pci_bus_speed *speed,
                                            enum pcie_link_width *width)
{
        struct pci_dev *parent = adev->pdev;

        if (!speed || !width)
                return;

        *speed = PCI_SPEED_UNKNOWN;
        *width = PCIE_LNK_WIDTH_UNKNOWN;

        if (amdgpu_device_pcie_dynamic_switching_supported(adev)) {
                while ((parent = pci_upstream_bridge(parent))) {
                        /* skip upstream/downstream switches internal to dGPU */
                        if (parent->vendor == PCI_VENDOR_ID_ATI)
                                continue;
                        *speed = pcie_get_speed_cap(parent);
                        *width = pcie_get_width_cap(parent);
                        break;
                }
        } else {
                /* use the current speeds rather than max if switching is not supported */
                pcie_bandwidth_available(adev->pdev, NULL, speed, width);
        }
}

/**
 * amdgpu_device_gpu_bandwidth - find the bandwidth of the GPU
 *
 * @adev: amdgpu_device pointer
 * @speed: pointer to the speed of the link
 * @width: pointer to the width of the link
 *
 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
 * AMD dGPU which may be a virtual upstream bridge.
 */
static void amdgpu_device_gpu_bandwidth(struct amdgpu_device *adev,
                                        enum pci_bus_speed *speed,
                                        enum pcie_link_width *width)
{
        struct pci_dev *parent = adev->pdev;

        if (!speed || !width)
                return;

        parent = pci_upstream_bridge(parent);
        if (parent && parent->vendor == PCI_VENDOR_ID_ATI) {
                /* use the upstream/downstream switches internal to dGPU */
                *speed = pcie_get_speed_cap(parent);
                *width = pcie_get_width_cap(parent);
                while ((parent = pci_upstream_bridge(parent))) {
                        if (parent->vendor == PCI_VENDOR_ID_ATI) {
                                /* use the upstream/downstream switches internal to dGPU */
                                *speed = pcie_get_speed_cap(parent);
                                *width = pcie_get_width_cap(parent);
                        }
                }
        } else {
                /* use the device itself */
                *speed = pcie_get_speed_cap(adev->pdev);
                *width = pcie_get_width_cap(adev->pdev);
        }
}

/**
 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIe capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIe config space may not be available.
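 *
 * Worked example of the mask construction below: for a Gen4-capable dGPU in
 * a Gen3 x8 slot, speed_cap resolves to PCIE_SPEED_16_0GT and
 * platform_speed_cap to PCIE_SPEED_8_0GT, so pcie_gen_mask advertises ASIC
 * support for gen 1-4 but platform support only for gen 1-3; the width masks
 * are derived the same way from link_width and platform_link_width. The
 * amdgpu_pcie_gen_cap and amdgpu_pcie_lane_cap module parameters, when set,
 * take precedence over the probed values.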
6403 */ 6404 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 6405 { 6406 enum pci_bus_speed speed_cap, platform_speed_cap; 6407 enum pcie_link_width platform_link_width, link_width; 6408 6409 if (amdgpu_pcie_gen_cap) 6410 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 6411 6412 if (amdgpu_pcie_lane_cap) 6413 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 6414 6415 /* covers APUs as well */ 6416 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 6417 if (adev->pm.pcie_gen_mask == 0) 6418 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 6419 if (adev->pm.pcie_mlw_mask == 0) 6420 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 6421 return; 6422 } 6423 6424 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 6425 return; 6426 6427 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap, 6428 &platform_link_width); 6429 amdgpu_device_gpu_bandwidth(adev, &speed_cap, &link_width); 6430 6431 if (adev->pm.pcie_gen_mask == 0) { 6432 /* asic caps */ 6433 if (speed_cap == PCI_SPEED_UNKNOWN) { 6434 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6435 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6436 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6437 } else { 6438 if (speed_cap == PCIE_SPEED_32_0GT) 6439 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6440 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6441 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6442 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6443 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 6444 else if (speed_cap == PCIE_SPEED_16_0GT) 6445 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6446 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6447 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6448 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 6449 else if (speed_cap == PCIE_SPEED_8_0GT) 6450 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6451 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6452 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6453 else if (speed_cap == PCIE_SPEED_5_0GT) 6454 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6455 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 6456 else 6457 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 6458 } 6459 /* platform caps */ 6460 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 6461 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6462 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6463 } else { 6464 if (platform_speed_cap == PCIE_SPEED_32_0GT) 6465 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6466 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6467 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6468 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6469 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 6470 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 6471 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6472 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6473 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6474 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 6475 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 6476 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6477 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6478 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 6479 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 6480 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6481 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6482 else 6483 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 6484 6485 } 6486 } 6487 if (adev->pm.pcie_mlw_mask == 0) { 6488 /* asic caps */ 6489 if (link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6490 
adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_ASIC_PCIE_MLW_MASK; 6491 } else { 6492 switch (link_width) { 6493 case PCIE_LNK_X32: 6494 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X32 | 6495 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6496 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6497 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6498 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6499 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6500 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6501 break; 6502 case PCIE_LNK_X16: 6503 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6504 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6505 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6506 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6507 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6508 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6509 break; 6510 case PCIE_LNK_X12: 6511 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6512 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6513 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6514 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6515 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6516 break; 6517 case PCIE_LNK_X8: 6518 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6519 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6520 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6521 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6522 break; 6523 case PCIE_LNK_X4: 6524 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6525 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6526 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6527 break; 6528 case PCIE_LNK_X2: 6529 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6530 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6531 break; 6532 case PCIE_LNK_X1: 6533 adev->pm.pcie_mlw_mask |= CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1; 6534 break; 6535 default: 6536 break; 6537 } 6538 } 6539 /* platform caps */ 6540 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6541 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 6542 } else { 6543 switch (platform_link_width) { 6544 case PCIE_LNK_X32: 6545 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 6546 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6547 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6548 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6549 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6550 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6551 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6552 break; 6553 case PCIE_LNK_X16: 6554 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6555 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6556 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6557 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6558 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6559 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6560 break; 6561 case PCIE_LNK_X12: 6562 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6563 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6564 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6565 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6566 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6567 break; 6568 case PCIE_LNK_X8: 6569 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6570 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6571 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6572 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6573 break; 6574 case PCIE_LNK_X4: 6575 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6576 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6577 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6578 break; 6579 case PCIE_LNK_X2: 6580 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6581 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6582 break; 6583 case PCIE_LNK_X1: 6584 adev->pm.pcie_mlw_mask |= CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 6585 break; 6586 
default: 6587 break; 6588 } 6589 } 6590 } 6591 } 6592 6593 /** 6594 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 6595 * 6596 * @adev: amdgpu_device pointer 6597 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 6598 * 6599 * Return true if @peer_adev can access (DMA) @adev through the PCIe 6600 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 6601 * @peer_adev. 6602 */ 6603 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 6604 struct amdgpu_device *peer_adev) 6605 { 6606 #ifdef CONFIG_HSA_AMD_P2P 6607 bool p2p_access = 6608 !adev->gmc.xgmi.connected_to_cpu && 6609 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 6610 if (!p2p_access) 6611 dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n", 6612 pci_name(peer_adev->pdev)); 6613 6614 bool is_large_bar = adev->gmc.visible_vram_size && 6615 adev->gmc.real_vram_size == adev->gmc.visible_vram_size; 6616 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev); 6617 6618 if (!p2p_addressable) { 6619 uint64_t address_mask = peer_adev->dev->dma_mask ? 6620 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 6621 resource_size_t aper_limit = 6622 adev->gmc.aper_base + adev->gmc.aper_size - 1; 6623 6624 p2p_addressable = !(adev->gmc.aper_base & address_mask || 6625 aper_limit & address_mask); 6626 } 6627 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable; 6628 #else 6629 return false; 6630 #endif 6631 } 6632 6633 int amdgpu_device_baco_enter(struct drm_device *dev) 6634 { 6635 struct amdgpu_device *adev = drm_to_adev(dev); 6636 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6637 6638 if (!amdgpu_device_supports_baco(dev)) 6639 return -ENOTSUPP; 6640 6641 if (ras && adev->ras_enabled && 6642 adev->nbio.funcs->enable_doorbell_interrupt) 6643 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 6644 6645 return amdgpu_dpm_baco_enter(adev); 6646 } 6647 6648 int amdgpu_device_baco_exit(struct drm_device *dev) 6649 { 6650 struct amdgpu_device *adev = drm_to_adev(dev); 6651 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6652 int ret = 0; 6653 6654 if (!amdgpu_device_supports_baco(dev)) 6655 return -ENOTSUPP; 6656 6657 ret = amdgpu_dpm_baco_exit(adev); 6658 if (ret) 6659 return ret; 6660 6661 if (ras && adev->ras_enabled && 6662 adev->nbio.funcs->enable_doorbell_interrupt) 6663 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 6664 6665 if (amdgpu_passthrough(adev) && adev->nbio.funcs && 6666 adev->nbio.funcs->clear_doorbell_interrupt) 6667 adev->nbio.funcs->clear_doorbell_interrupt(adev); 6668 6669 return 0; 6670 } 6671 6672 /** 6673 * amdgpu_pci_error_detected - Called when a PCI error is detected. 6674 * @pdev: PCI device struct 6675 * @state: PCI channel state 6676 * 6677 * Description: Called when a PCI error is detected. 6678 * 6679 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
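 *
 * A minimal sketch of how this and the callbacks below are hooked up: they
 * are registered with the PCI core through a struct pci_error_handlers that
 * the driver's struct pci_driver points to via .err_handler (in amdgpu_drv.c
 * in this driver; the table name below is illustrative):
 *
 *   static const struct pci_error_handlers amdgpu_pci_err_handler = {
 *           .error_detected = amdgpu_pci_error_detected,
 *           .mmio_enabled   = amdgpu_pci_mmio_enabled,
 *           .slot_reset     = amdgpu_pci_slot_reset,
 *           .resume         = amdgpu_pci_resume,
 *   };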
6680 */ 6681 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 6682 { 6683 struct drm_device *dev = pci_get_drvdata(pdev); 6684 struct amdgpu_device *adev = drm_to_adev(dev); 6685 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 6686 struct amdgpu_reset_context reset_context; 6687 struct list_head device_list; 6688 int r = 0; 6689 6690 dev_info(adev->dev, "PCI error: detected callback!!\n"); 6691 6692 if (!amdgpu_dpm_is_link_reset_supported(adev)) { 6693 dev_warn(adev->dev, "No support for XGMI hive yet...\n"); 6694 return PCI_ERS_RESULT_DISCONNECT; 6695 } 6696 6697 adev->pci_channel_state = state; 6698 6699 switch (state) { 6700 case pci_channel_io_normal: 6701 dev_info(adev->dev, "pci_channel_io_normal: state(%d)!!\n", state); 6702 return PCI_ERS_RESULT_CAN_RECOVER; 6703 case pci_channel_io_frozen: 6704 /* Fatal error, prepare for slot reset */ 6705 dev_info(adev->dev, "pci_channel_io_frozen: state(%d)!!\n", state); 6706 6707 if (hive) 6708 mutex_lock(&hive->hive_lock); 6709 adev->pcie_reset_ctx.occurs_dpc = true; 6710 memset(&reset_context, 0, sizeof(reset_context)); 6711 INIT_LIST_HEAD(&device_list); 6712 6713 r = amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list, 6714 hive, false); 6715 if (hive) { 6716 mutex_unlock(&hive->hive_lock); 6717 amdgpu_put_xgmi_hive(hive); 6718 } 6719 if (r) 6720 return PCI_ERS_RESULT_DISCONNECT; 6721 return PCI_ERS_RESULT_NEED_RESET; 6722 case pci_channel_io_perm_failure: 6723 /* Permanent error, prepare for device removal */ 6724 dev_info(adev->dev, "pci_channel_io_perm_failure: state(%d)!!\n", state); 6725 return PCI_ERS_RESULT_DISCONNECT; 6726 } 6727 6728 return PCI_ERS_RESULT_NEED_RESET; 6729 } 6730 6731 /** 6732 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 6733 * @pdev: pointer to PCI device 6734 */ 6735 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 6736 { 6737 struct drm_device *dev = pci_get_drvdata(pdev); 6738 struct amdgpu_device *adev = drm_to_adev(dev); 6739 6740 dev_info(adev->dev, "PCI error: mmio enabled callback!!\n"); 6741 6742 /* TODO - dump whatever for debugging purposes */ 6743 6744 /* This called only if amdgpu_pci_error_detected returns 6745 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 6746 * works, no need to reset slot. 6747 */ 6748 6749 return PCI_ERS_RESULT_RECOVERED; 6750 } 6751 6752 /** 6753 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 6754 * @pdev: PCI device struct 6755 * 6756 * Description: This routine is called by the pci error recovery 6757 * code after the PCI slot has been reset, just before we 6758 * should resume normal operations. 
6759 */ 6760 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 6761 { 6762 struct drm_device *dev = pci_get_drvdata(pdev); 6763 struct amdgpu_device *adev = drm_to_adev(dev); 6764 struct amdgpu_reset_context reset_context; 6765 struct amdgpu_device *tmp_adev = NULL; 6766 struct amdgpu_hive_info *hive = NULL; 6767 struct list_head device_list; 6768 int r = 0, i; 6769 u32 memsize; 6770 6771 /* PCI error slot reset should be skipped During RAS recovery */ 6772 if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 6773 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && 6774 amdgpu_ras_in_recovery(adev)) 6775 return PCI_ERS_RESULT_RECOVERED; 6776 6777 dev_info(adev->dev, "PCI error: slot reset callback!!\n"); 6778 6779 memset(&reset_context, 0, sizeof(reset_context)); 6780 6781 /* wait for asic to come out of reset */ 6782 msleep(700); 6783 6784 /* Restore PCI confspace */ 6785 amdgpu_device_load_pci_state(pdev); 6786 6787 /* confirm ASIC came out of reset */ 6788 for (i = 0; i < adev->usec_timeout; i++) { 6789 memsize = amdgpu_asic_get_config_memsize(adev); 6790 6791 if (memsize != 0xffffffff) 6792 break; 6793 udelay(1); 6794 } 6795 if (memsize == 0xffffffff) { 6796 r = -ETIME; 6797 goto out; 6798 } 6799 6800 reset_context.method = AMD_RESET_METHOD_NONE; 6801 reset_context.reset_req_dev = adev; 6802 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 6803 set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags); 6804 INIT_LIST_HEAD(&device_list); 6805 6806 hive = amdgpu_get_xgmi_hive(adev); 6807 if (hive) { 6808 mutex_lock(&hive->hive_lock); 6809 reset_context.hive = hive; 6810 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 6811 tmp_adev->pcie_reset_ctx.in_link_reset = true; 6812 list_add_tail(&tmp_adev->reset_list, &device_list); 6813 } 6814 } else { 6815 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 6816 list_add_tail(&adev->reset_list, &device_list); 6817 } 6818 6819 r = amdgpu_device_asic_reset(adev, &device_list, &reset_context); 6820 out: 6821 if (!r) { 6822 if (amdgpu_device_cache_pci_state(adev->pdev)) 6823 pci_restore_state(adev->pdev); 6824 dev_info(adev->dev, "PCIe error recovery succeeded\n"); 6825 } else { 6826 dev_err(adev->dev, "PCIe error recovery failed, err:%d\n", r); 6827 if (tmp_adev) { 6828 list_for_each_entry(tmp_adev, &device_list, reset_list) 6829 amdgpu_device_unset_mp1_state(tmp_adev); 6830 amdgpu_device_unlock_reset_domain(adev->reset_domain); 6831 } 6832 } 6833 6834 if (hive) { 6835 mutex_unlock(&hive->hive_lock); 6836 amdgpu_put_xgmi_hive(hive); 6837 } 6838 6839 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 6840 } 6841 6842 /** 6843 * amdgpu_pci_resume() - resume normal ops after PCI reset 6844 * @pdev: pointer to PCI device 6845 * 6846 * Called when the error recovery driver tells us that its 6847 * OK to resume normal operation. 
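 *
 * In the AER/DPC recovery sequence this callback runs last, after
 * error_detected() reported pci_channel_io_frozen and slot_reset()
 * re-initialized the ASIC, which is why the function below returns early for
 * any other cached channel state.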
6848 */ 6849 void amdgpu_pci_resume(struct pci_dev *pdev) 6850 { 6851 struct drm_device *dev = pci_get_drvdata(pdev); 6852 struct amdgpu_device *adev = drm_to_adev(dev); 6853 struct list_head device_list; 6854 struct amdgpu_hive_info *hive = NULL; 6855 struct amdgpu_device *tmp_adev = NULL; 6856 6857 dev_info(adev->dev, "PCI error: resume callback!!\n"); 6858 6859 /* Only continue execution for the case of pci_channel_io_frozen */ 6860 if (adev->pci_channel_state != pci_channel_io_frozen) 6861 return; 6862 6863 INIT_LIST_HEAD(&device_list); 6864 6865 hive = amdgpu_get_xgmi_hive(adev); 6866 if (hive) { 6867 mutex_lock(&hive->hive_lock); 6868 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 6869 tmp_adev->pcie_reset_ctx.in_link_reset = false; 6870 list_add_tail(&tmp_adev->reset_list, &device_list); 6871 } 6872 } else 6873 list_add_tail(&adev->reset_list, &device_list); 6874 6875 amdgpu_device_sched_resume(&device_list, NULL, NULL); 6876 amdgpu_device_gpu_resume(adev, &device_list, false); 6877 adev->pcie_reset_ctx.occurs_dpc = false; 6878 6879 if (hive) { 6880 mutex_unlock(&hive->hive_lock); 6881 amdgpu_put_xgmi_hive(hive); 6882 } 6883 } 6884 6885 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 6886 { 6887 struct drm_device *dev = pci_get_drvdata(pdev); 6888 struct amdgpu_device *adev = drm_to_adev(dev); 6889 int r; 6890 6891 if (amdgpu_sriov_vf(adev)) 6892 return false; 6893 6894 r = pci_save_state(pdev); 6895 if (!r) { 6896 kfree(adev->pci_state); 6897 6898 adev->pci_state = pci_store_saved_state(pdev); 6899 6900 if (!adev->pci_state) { 6901 DRM_ERROR("Failed to store PCI saved state"); 6902 return false; 6903 } 6904 } else { 6905 DRM_WARN("Failed to save PCI state, err:%d\n", r); 6906 return false; 6907 } 6908 6909 return true; 6910 } 6911 6912 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 6913 { 6914 struct drm_device *dev = pci_get_drvdata(pdev); 6915 struct amdgpu_device *adev = drm_to_adev(dev); 6916 int r; 6917 6918 if (!adev->pci_state) 6919 return false; 6920 6921 r = pci_load_saved_state(pdev, adev->pci_state); 6922 6923 if (!r) { 6924 pci_restore_state(pdev); 6925 } else { 6926 DRM_WARN("Failed to load PCI state, err:%d\n", r); 6927 return false; 6928 } 6929 6930 return true; 6931 } 6932 6933 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 6934 struct amdgpu_ring *ring) 6935 { 6936 #ifdef CONFIG_X86_64 6937 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 6938 return; 6939 #endif 6940 if (adev->gmc.xgmi.connected_to_cpu) 6941 return; 6942 6943 if (ring && ring->funcs->emit_hdp_flush) 6944 amdgpu_ring_emit_hdp_flush(ring); 6945 else 6946 amdgpu_asic_flush_hdp(adev, ring); 6947 } 6948 6949 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 6950 struct amdgpu_ring *ring) 6951 { 6952 #ifdef CONFIG_X86_64 6953 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 6954 return; 6955 #endif 6956 if (adev->gmc.xgmi.connected_to_cpu) 6957 return; 6958 6959 amdgpu_asic_invalidate_hdp(adev, ring); 6960 } 6961 6962 int amdgpu_in_reset(struct amdgpu_device *adev) 6963 { 6964 return atomic_read(&adev->reset_domain->in_gpu_reset); 6965 } 6966 6967 /** 6968 * amdgpu_device_halt() - bring hardware to some kind of halt state 6969 * 6970 * @adev: amdgpu_device pointer 6971 * 6972 * Bring hardware to some kind of halt state so that no one can touch it 6973 * any more. It will help to maintain error context when error occurred. 6974 * Compare to a simple hang, the system will keep stable at least for SSH 6975 * access. 
Then it should be trivial to inspect the hardware state and 6976 * see what's going on. Implemented as following: 6977 * 6978 * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc), 6979 * clears all CPU mappings to device, disallows remappings through page faults 6980 * 2. amdgpu_irq_disable_all() disables all interrupts 6981 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences 6982 * 4. set adev->no_hw_access to avoid potential crashes after setp 5 6983 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings 6984 * 6. pci_disable_device() and pci_wait_for_pending_transaction() 6985 * flush any in flight DMA operations 6986 */ 6987 void amdgpu_device_halt(struct amdgpu_device *adev) 6988 { 6989 struct pci_dev *pdev = adev->pdev; 6990 struct drm_device *ddev = adev_to_drm(adev); 6991 6992 amdgpu_xcp_dev_unplug(adev); 6993 drm_dev_unplug(ddev); 6994 6995 amdgpu_irq_disable_all(adev); 6996 6997 amdgpu_fence_driver_hw_fini(adev); 6998 6999 adev->no_hw_access = true; 7000 7001 amdgpu_device_unmap_mmio(adev); 7002 7003 pci_disable_device(pdev); 7004 pci_wait_for_pending_transaction(pdev); 7005 } 7006 7007 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev, 7008 u32 reg) 7009 { 7010 unsigned long flags, address, data; 7011 u32 r; 7012 7013 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 7014 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 7015 7016 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 7017 WREG32(address, reg * 4); 7018 (void)RREG32(address); 7019 r = RREG32(data); 7020 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 7021 return r; 7022 } 7023 7024 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev, 7025 u32 reg, u32 v) 7026 { 7027 unsigned long flags, address, data; 7028 7029 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 7030 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 7031 7032 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 7033 WREG32(address, reg * 4); 7034 (void)RREG32(address); 7035 WREG32(data, v); 7036 (void)RREG32(data); 7037 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 7038 } 7039 7040 /** 7041 * amdgpu_device_get_gang - return a reference to the current gang 7042 * @adev: amdgpu_device pointer 7043 * 7044 * Returns: A new reference to the current gang leader. 7045 */ 7046 struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev) 7047 { 7048 struct dma_fence *fence; 7049 7050 rcu_read_lock(); 7051 fence = dma_fence_get_rcu_safe(&adev->gang_submit); 7052 rcu_read_unlock(); 7053 return fence; 7054 } 7055 7056 /** 7057 * amdgpu_device_switch_gang - switch to a new gang 7058 * @adev: amdgpu_device pointer 7059 * @gang: the gang to switch to 7060 * 7061 * Try to switch to a new gang. 7062 * Returns: NULL if we switched to the new gang or a reference to the current 7063 * gang leader. 7064 */ 7065 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev, 7066 struct dma_fence *gang) 7067 { 7068 struct dma_fence *old = NULL; 7069 7070 dma_fence_get(gang); 7071 do { 7072 dma_fence_put(old); 7073 old = amdgpu_device_get_gang(adev); 7074 if (old == gang) 7075 break; 7076 7077 if (!dma_fence_is_signaled(old)) { 7078 dma_fence_put(gang); 7079 return old; 7080 } 7081 7082 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit, 7083 old, gang) != old); 7084 7085 /* 7086 * Drop it once for the exchanged reference in adev and once for the 7087 * thread local reference acquired in amdgpu_device_get_gang(). 
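 *
 * Note that the reference taken on @gang at the top of the function is not
 * released here: it was either consumed by the cmpxchg() installing @gang in
 * adev->gang_submit, or (when @old == @gang) it is one of the two references
 * dropped below.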
7088 */ 7089 dma_fence_put(old); 7090 dma_fence_put(old); 7091 return NULL; 7092 } 7093 7094 /** 7095 * amdgpu_device_enforce_isolation - enforce HW isolation 7096 * @adev: the amdgpu device pointer 7097 * @ring: the HW ring the job is supposed to run on 7098 * @job: the job which is about to be pushed to the HW ring 7099 * 7100 * Makes sure that only one client at a time can use the GFX block. 7101 * Returns: The dependency to wait on before the job can be pushed to the HW. 7102 * The function is called multiple times until NULL is returned. 7103 */ 7104 struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev, 7105 struct amdgpu_ring *ring, 7106 struct amdgpu_job *job) 7107 { 7108 struct amdgpu_isolation *isolation = &adev->isolation[ring->xcp_id]; 7109 struct drm_sched_fence *f = job->base.s_fence; 7110 struct dma_fence *dep; 7111 void *owner; 7112 int r; 7113 7114 /* 7115 * For now enforce isolation only for the GFX block since we only need 7116 * the cleaner shader on those rings. 7117 */ 7118 if (ring->funcs->type != AMDGPU_RING_TYPE_GFX && 7119 ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE) 7120 return NULL; 7121 7122 /* 7123 * All submissions where enforce isolation is false are handled as if 7124 * they come from a single client. Use ~0l as the owner to distinct it 7125 * from kernel submissions where the owner is NULL. 7126 */ 7127 owner = job->enforce_isolation ? f->owner : (void *)~0l; 7128 7129 mutex_lock(&adev->enforce_isolation_mutex); 7130 7131 /* 7132 * The "spearhead" submission is the first one which changes the 7133 * ownership to its client. We always need to wait for it to be 7134 * pushed to the HW before proceeding with anything. 7135 */ 7136 if (&f->scheduled != isolation->spearhead && 7137 !dma_fence_is_signaled(isolation->spearhead)) { 7138 dep = isolation->spearhead; 7139 goto out_grab_ref; 7140 } 7141 7142 if (isolation->owner != owner) { 7143 7144 /* 7145 * Wait for any gang to be assembled before switching to a 7146 * different owner or otherwise we could deadlock the 7147 * submissions. 7148 */ 7149 if (!job->gang_submit) { 7150 dep = amdgpu_device_get_gang(adev); 7151 if (!dma_fence_is_signaled(dep)) 7152 goto out_return_dep; 7153 dma_fence_put(dep); 7154 } 7155 7156 dma_fence_put(isolation->spearhead); 7157 isolation->spearhead = dma_fence_get(&f->scheduled); 7158 amdgpu_sync_move(&isolation->active, &isolation->prev); 7159 trace_amdgpu_isolation(isolation->owner, owner); 7160 isolation->owner = owner; 7161 } 7162 7163 /* 7164 * Specifying the ring here helps to pipeline submissions even when 7165 * isolation is enabled. If that is not desired for testing NULL can be 7166 * used instead of the ring to enforce a CPU round trip while switching 7167 * between clients. 
	 */
        dep = amdgpu_sync_peek_fence(&isolation->prev, ring);
        r = amdgpu_sync_fence(&isolation->active, &f->finished, GFP_NOWAIT);
        if (r)
                DRM_WARN("OOM tracking isolation\n");

out_grab_ref:
        dma_fence_get(dep);
out_return_dep:
        mutex_unlock(&adev->enforce_isolation_mutex);
        return dep;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
        switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
        case CHIP_HAINAN:
#endif
        case CHIP_TOPAZ:
                /* chips with no display hardware */
                return false;
#ifdef CONFIG_DRM_AMDGPU_SI
        case CHIP_TAHITI:
        case CHIP_PITCAIRN:
        case CHIP_VERDE:
        case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
        case CHIP_BONAIRE:
        case CHIP_HAWAII:
        case CHIP_KAVERI:
        case CHIP_KABINI:
        case CHIP_MULLINS:
#endif
        case CHIP_TONGA:
        case CHIP_FIJI:
        case CHIP_POLARIS10:
        case CHIP_POLARIS11:
        case CHIP_POLARIS12:
        case CHIP_VEGAM:
        case CHIP_CARRIZO:
        case CHIP_STONEY:
                /* chips with display hardware */
                return true;
        default:
                /* IP discovery */
                if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
                    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
                        return false;
                return true;
        }
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
                        uint32_t inst, uint32_t reg_addr, char reg_name[],
                        uint32_t expected_value, uint32_t mask)
{
        uint32_t ret = 0;
        uint32_t old_ = 0;
        uint32_t tmp_ = RREG32(reg_addr);
        uint32_t loop = adev->usec_timeout;

        while ((tmp_ & (mask)) != (expected_value)) {
                if (old_ != tmp_) {
                        loop = adev->usec_timeout;
                        old_ = tmp_;
                } else
                        udelay(1);
                tmp_ = RREG32(reg_addr);
                loop--;
                if (!loop) {
                        DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
                                 inst, reg_name, (uint32_t)expected_value,
                                 (uint32_t)(tmp_ & (mask)));
                        ret = -ETIMEDOUT;
                        break;
                }
        }
        return ret;
}

ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
{
        ssize_t size = 0;

        if (!ring || !ring->adev)
                return size;

        if (amdgpu_device_should_recover_gpu(ring->adev))
                size |= AMDGPU_RESET_TYPE_FULL;

        if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
            !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery)
                size |= AMDGPU_RESET_TYPE_SOFT_RESET;

        return size;
}

ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
{
        ssize_t size = 0;

        if (supported_reset == 0) {
                size += sysfs_emit_at(buf, size, "unsupported");
                size += sysfs_emit_at(buf, size, "\n");
                return size;
        }

        if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
                size += sysfs_emit_at(buf, size, "soft ");

        if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
                size += sysfs_emit_at(buf, size, "queue ");

        if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)
                size += sysfs_emit_at(buf, size, "pipe ");

        if (supported_reset & AMDGPU_RESET_TYPE_FULL)
                size += sysfs_emit_at(buf, size, "full ");

        size += sysfs_emit_at(buf, size, "\n");
        return size;
}
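
/*
 * Illustrative sketch only (not part of the driver): the two helpers above
 * are meant to back per-IP "*_reset_mask" sysfs attributes. The attribute
 * callback and the gfx_supported_reset field below are assumptions for the
 * example; a show() callback would typically just format a supported-reset
 * bitmask with amdgpu_show_reset_mask():
 *
 *   static ssize_t example_reset_mask_show(struct device *dev,
 *                                          struct device_attribute *attr,
 *                                          char *buf)
 *   {
 *           struct drm_device *ddev = dev_get_drvdata(dev);
 *           struct amdgpu_device *adev = drm_to_adev(ddev);
 *
 *           return amdgpu_show_reset_mask(buf, adev->gfx.gfx_supported_reset);
 *   }
 *
 * with the bitmask itself seeded earlier via something like
 * adev->gfx.gfx_supported_reset |= amdgpu_get_soft_full_reset_mask(ring).
 */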