1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 29 #include <linux/aperture.h> 30 #include <linux/power_supply.h> 31 #include <linux/kthread.h> 32 #include <linux/module.h> 33 #include <linux/console.h> 34 #include <linux/slab.h> 35 #include <linux/iommu.h> 36 #include <linux/pci.h> 37 #include <linux/pci-p2pdma.h> 38 #include <linux/apple-gmux.h> 39 40 #include <drm/drm_atomic_helper.h> 41 #include <drm/drm_client_event.h> 42 #include <drm/drm_crtc_helper.h> 43 #include <drm/drm_probe_helper.h> 44 #include <drm/amdgpu_drm.h> 45 #include <linux/device.h> 46 #include <linux/vgaarb.h> 47 #include <linux/vga_switcheroo.h> 48 #include <linux/efi.h> 49 #include "amdgpu.h" 50 #include "amdgpu_trace.h" 51 #include "amdgpu_i2c.h" 52 #include "atom.h" 53 #include "amdgpu_atombios.h" 54 #include "amdgpu_atomfirmware.h" 55 #include "amd_pcie.h" 56 #ifdef CONFIG_DRM_AMDGPU_SI 57 #include "si.h" 58 #endif 59 #ifdef CONFIG_DRM_AMDGPU_CIK 60 #include "cik.h" 61 #endif 62 #include "vi.h" 63 #include "soc15.h" 64 #include "nv.h" 65 #include "bif/bif_4_1_d.h" 66 #include <linux/firmware.h> 67 #include "amdgpu_vf_error.h" 68 69 #include "amdgpu_amdkfd.h" 70 #include "amdgpu_pm.h" 71 72 #include "amdgpu_xgmi.h" 73 #include "amdgpu_ras.h" 74 #include "amdgpu_pmu.h" 75 #include "amdgpu_fru_eeprom.h" 76 #include "amdgpu_reset.h" 77 #include "amdgpu_virt.h" 78 #include "amdgpu_dev_coredump.h" 79 80 #include <linux/suspend.h> 81 #include <drm/task_barrier.h> 82 #include <linux/pm_runtime.h> 83 84 #include <drm/drm_drv.h> 85 86 #if IS_ENABLED(CONFIG_X86) 87 #include <asm/intel-family.h> 88 #endif 89 90 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 91 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 92 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 93 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 94 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 95 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 96 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 97 98 #define AMDGPU_RESUME_MS 2000 99 #define AMDGPU_MAX_RETRY_LIMIT 2 100 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 101 #define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2) 102 #define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2) 103 #define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2) 104 105 #define 
AMDGPU_VBIOS_SKIP (1U << 0) 106 #define AMDGPU_VBIOS_OPTIONAL (1U << 1) 107 108 static const struct drm_driver amdgpu_kms_driver; 109 110 const char *amdgpu_asic_name[] = { 111 "TAHITI", 112 "PITCAIRN", 113 "VERDE", 114 "OLAND", 115 "HAINAN", 116 "BONAIRE", 117 "KAVERI", 118 "KABINI", 119 "HAWAII", 120 "MULLINS", 121 "TOPAZ", 122 "TONGA", 123 "FIJI", 124 "CARRIZO", 125 "STONEY", 126 "POLARIS10", 127 "POLARIS11", 128 "POLARIS12", 129 "VEGAM", 130 "VEGA10", 131 "VEGA12", 132 "VEGA20", 133 "RAVEN", 134 "ARCTURUS", 135 "RENOIR", 136 "ALDEBARAN", 137 "NAVI10", 138 "CYAN_SKILLFISH", 139 "NAVI14", 140 "NAVI12", 141 "SIENNA_CICHLID", 142 "NAVY_FLOUNDER", 143 "VANGOGH", 144 "DIMGREY_CAVEFISH", 145 "BEIGE_GOBY", 146 "YELLOW_CARP", 147 "IP DISCOVERY", 148 "LAST", 149 }; 150 151 #define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0) 152 /* 153 * Default init level where all blocks are expected to be initialized. This is 154 * the level of initialization expected by default and also after a full reset 155 * of the device. 156 */ 157 struct amdgpu_init_level amdgpu_init_default = { 158 .level = AMDGPU_INIT_LEVEL_DEFAULT, 159 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL, 160 }; 161 162 struct amdgpu_init_level amdgpu_init_recovery = { 163 .level = AMDGPU_INIT_LEVEL_RESET_RECOVERY, 164 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL, 165 }; 166 167 /* 168 * Minimal blocks needed to be initialized before a XGMI hive can be reset. This 169 * is used for cases like reset on initialization where the entire hive needs to 170 * be reset before first use. 171 */ 172 struct amdgpu_init_level amdgpu_init_minimal_xgmi = { 173 .level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI, 174 .hwini_ip_block_mask = 175 BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) | 176 BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) | 177 BIT(AMD_IP_BLOCK_TYPE_PSP) 178 }; 179 180 static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev, 181 enum amd_ip_block_type block) 182 { 183 return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0; 184 } 185 186 void amdgpu_set_init_level(struct amdgpu_device *adev, 187 enum amdgpu_init_lvl_id lvl) 188 { 189 switch (lvl) { 190 case AMDGPU_INIT_LEVEL_MINIMAL_XGMI: 191 adev->init_lvl = &amdgpu_init_minimal_xgmi; 192 break; 193 case AMDGPU_INIT_LEVEL_RESET_RECOVERY: 194 adev->init_lvl = &amdgpu_init_recovery; 195 break; 196 case AMDGPU_INIT_LEVEL_DEFAULT: 197 fallthrough; 198 default: 199 adev->init_lvl = &amdgpu_init_default; 200 break; 201 } 202 } 203 204 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev); 205 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 206 void *data); 207 208 /** 209 * DOC: pcie_replay_count 210 * 211 * The amdgpu driver provides a sysfs API for reporting the total number 212 * of PCIe replays (NAKs). 213 * The file pcie_replay_count is used for this and returns the total 214 * number of replays as a sum of the NAKs generated and NAKs received. 
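 *
 * The attribute is created on the device's sysfs node (adev->dev), so it is
 * normally visible as the read-only file pcie_replay_count under the PCI
 * device's sysfs directory, for example
 * /sys/bus/pci/devices/<domain:bus:dev.fn>/pcie_replay_count (exact path
 * assumed from the attribute being attached to adev->dev; it may differ per
 * system).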
215 */ 216 217 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 218 struct device_attribute *attr, char *buf) 219 { 220 struct drm_device *ddev = dev_get_drvdata(dev); 221 struct amdgpu_device *adev = drm_to_adev(ddev); 222 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 223 224 return sysfs_emit(buf, "%llu\n", cnt); 225 } 226 227 static DEVICE_ATTR(pcie_replay_count, 0444, 228 amdgpu_device_get_pcie_replay_count, NULL); 229 230 static int amdgpu_device_attr_sysfs_init(struct amdgpu_device *adev) 231 { 232 int ret = 0; 233 234 if (!amdgpu_sriov_vf(adev)) 235 ret = sysfs_create_file(&adev->dev->kobj, 236 &dev_attr_pcie_replay_count.attr); 237 238 return ret; 239 } 240 241 static void amdgpu_device_attr_sysfs_fini(struct amdgpu_device *adev) 242 { 243 if (!amdgpu_sriov_vf(adev)) 244 sysfs_remove_file(&adev->dev->kobj, 245 &dev_attr_pcie_replay_count.attr); 246 } 247 248 static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj, 249 struct bin_attribute *attr, char *buf, 250 loff_t ppos, size_t count) 251 { 252 struct device *dev = kobj_to_dev(kobj); 253 struct drm_device *ddev = dev_get_drvdata(dev); 254 struct amdgpu_device *adev = drm_to_adev(ddev); 255 ssize_t bytes_read; 256 257 switch (ppos) { 258 case AMDGPU_SYS_REG_STATE_XGMI: 259 bytes_read = amdgpu_asic_get_reg_state( 260 adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count); 261 break; 262 case AMDGPU_SYS_REG_STATE_WAFL: 263 bytes_read = amdgpu_asic_get_reg_state( 264 adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count); 265 break; 266 case AMDGPU_SYS_REG_STATE_PCIE: 267 bytes_read = amdgpu_asic_get_reg_state( 268 adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count); 269 break; 270 case AMDGPU_SYS_REG_STATE_USR: 271 bytes_read = amdgpu_asic_get_reg_state( 272 adev, AMDGPU_REG_STATE_TYPE_USR, buf, count); 273 break; 274 case AMDGPU_SYS_REG_STATE_USR_1: 275 bytes_read = amdgpu_asic_get_reg_state( 276 adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count); 277 break; 278 default: 279 return -EINVAL; 280 } 281 282 return bytes_read; 283 } 284 285 BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL, 286 AMDGPU_SYS_REG_STATE_END); 287 288 int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev) 289 { 290 int ret; 291 292 if (!amdgpu_asic_get_reg_state_supported(adev)) 293 return 0; 294 295 ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 296 297 return ret; 298 } 299 300 void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev) 301 { 302 if (!amdgpu_asic_get_reg_state_supported(adev)) 303 return; 304 sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 305 } 306 307 int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block) 308 { 309 int r; 310 311 if (ip_block->version->funcs->suspend) { 312 r = ip_block->version->funcs->suspend(ip_block); 313 if (r) { 314 dev_err(ip_block->adev->dev, 315 "suspend of IP block <%s> failed %d\n", 316 ip_block->version->funcs->name, r); 317 return r; 318 } 319 } 320 321 ip_block->status.hw = false; 322 return 0; 323 } 324 325 int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block) 326 { 327 int r; 328 329 if (ip_block->version->funcs->resume) { 330 r = ip_block->version->funcs->resume(ip_block); 331 if (r) { 332 dev_err(ip_block->adev->dev, 333 "resume of IP block <%s> failed %d\n", 334 ip_block->version->funcs->name, r); 335 return r; 336 } 337 } 338 339 ip_block->status.hw = true; 340 return 0; 341 } 342 343 /** 344 * DOC: board_info 345 * 346 * The amdgpu driver provides a sysfs API for giving board related information. 
347 * It provides the form factor information in the format 348 * 349 * type : form factor 350 * 351 * Possible form factor values 352 * 353 * - "cem" - PCIE CEM card 354 * - "oam" - Open Compute Accelerator Module 355 * - "unknown" - Not known 356 * 357 */ 358 359 static ssize_t amdgpu_device_get_board_info(struct device *dev, 360 struct device_attribute *attr, 361 char *buf) 362 { 363 struct drm_device *ddev = dev_get_drvdata(dev); 364 struct amdgpu_device *adev = drm_to_adev(ddev); 365 enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM; 366 const char *pkg; 367 368 if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type) 369 pkg_type = adev->smuio.funcs->get_pkg_type(adev); 370 371 switch (pkg_type) { 372 case AMDGPU_PKG_TYPE_CEM: 373 pkg = "cem"; 374 break; 375 case AMDGPU_PKG_TYPE_OAM: 376 pkg = "oam"; 377 break; 378 default: 379 pkg = "unknown"; 380 break; 381 } 382 383 return sysfs_emit(buf, "%s : %s\n", "type", pkg); 384 } 385 386 static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL); 387 388 static struct attribute *amdgpu_board_attrs[] = { 389 &dev_attr_board_info.attr, 390 NULL, 391 }; 392 393 static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj, 394 struct attribute *attr, int n) 395 { 396 struct device *dev = kobj_to_dev(kobj); 397 struct drm_device *ddev = dev_get_drvdata(dev); 398 struct amdgpu_device *adev = drm_to_adev(ddev); 399 400 if (adev->flags & AMD_IS_APU) 401 return 0; 402 403 return attr->mode; 404 } 405 406 static const struct attribute_group amdgpu_board_attrs_group = { 407 .attrs = amdgpu_board_attrs, 408 .is_visible = amdgpu_board_attrs_is_visible 409 }; 410 411 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 412 413 414 /** 415 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 416 * 417 * @dev: drm_device pointer 418 * 419 * Returns true if the device is a dGPU with ATPX power control, 420 * otherwise return false. 421 */ 422 bool amdgpu_device_supports_px(struct drm_device *dev) 423 { 424 struct amdgpu_device *adev = drm_to_adev(dev); 425 426 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 427 return true; 428 return false; 429 } 430 431 /** 432 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 433 * 434 * @dev: drm_device pointer 435 * 436 * Returns true if the device is a dGPU with ACPI power control, 437 * otherwise return false. 438 */ 439 bool amdgpu_device_supports_boco(struct drm_device *dev) 440 { 441 struct amdgpu_device *adev = drm_to_adev(dev); 442 443 if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE)) 444 return false; 445 446 if (adev->has_pr3 || 447 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 448 return true; 449 return false; 450 } 451 452 /** 453 * amdgpu_device_supports_baco - Does the device support BACO 454 * 455 * @dev: drm_device pointer 456 * 457 * Return: 458 * 1 if the device supports BACO; 459 * 3 if the device supports MACO (only works if BACO is supported) 460 * otherwise return 0. 
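 *
 * The value is effectively a bitmask; a caller sketch (consistent with how
 * amdgpu_device_detect_runtime_pm_mode() below consumes it):
 *
 *   int bamaco_support = amdgpu_device_supports_baco(dev);
 *
 *   if (bamaco_support & MACO_SUPPORT)
 *           ... both BACO and MACO are usable ...
 *   else if (bamaco_support & BACO_SUPPORT)
 *           ... only BACO is usable ...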
 */
int amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
{
	struct drm_device *dev;
	int bamaco_support;

	dev = adev_to_drm(adev);

	adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
	bamaco_support = amdgpu_device_supports_baco(dev);

	switch (amdgpu_runtime_pm) {
	case 2:
		if (bamaco_support & MACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
			dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
		} else if (bamaco_support == BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Requested mode BAMACO not available, falling back to BACO\n");
		}
		break;
	case 1:
		if (bamaco_support & BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Forcing BACO for runtime pm\n");
		}
		break;
	case -1:
	case -2:
		if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
			dev_info(adev->dev, "Using ATPX for runtime pm\n");
		} else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
			dev_info(adev->dev, "Using BOCO for runtime pm\n");
		} else {
			if (!bamaco_support)
				goto no_runtime_pm;

			switch (adev->asic_type) {
			case CHIP_VEGA20:
			case CHIP_ARCTURUS:
				/* BACO is not supported on vega20 and arcturus */
				break;
			case CHIP_VEGA10:
				/* enable BACO as runpm mode if noretry=0 */
				if (!adev->gmc.noretry)
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			default:
				/* enable BACO as runpm mode on CI+ */
				adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			}

			if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
				if (bamaco_support & MACO_SUPPORT) {
					adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
					dev_info(adev->dev, "Using BAMACO for runtime pm\n");
				} else {
					dev_info(adev->dev, "Using BACO for runtime pm\n");
				}
			}
		}
		break;
	case 0:
		dev_info(adev->dev, "runtime pm is manually disabled\n");
		break;
	default:
		break;
	}

no_runtime_pm:
	if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
		dev_info(adev->dev, "Runtime PM not available\n");
}
/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * Smart Shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
552 */ 553 bool amdgpu_device_supports_smart_shift(struct drm_device *dev) 554 { 555 return (amdgpu_device_supports_boco(dev) && 556 amdgpu_acpi_is_power_shift_control_supported()); 557 } 558 559 /* 560 * VRAM access helper functions 561 */ 562 563 /** 564 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA 565 * 566 * @adev: amdgpu_device pointer 567 * @pos: offset of the buffer in vram 568 * @buf: virtual address of the buffer in system memory 569 * @size: read/write size, sizeof(@buf) must > @size 570 * @write: true - write to vram, otherwise - read from vram 571 */ 572 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos, 573 void *buf, size_t size, bool write) 574 { 575 unsigned long flags; 576 uint32_t hi = ~0, tmp = 0; 577 uint32_t *data = buf; 578 uint64_t last; 579 int idx; 580 581 if (!drm_dev_enter(adev_to_drm(adev), &idx)) 582 return; 583 584 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4)); 585 586 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 587 for (last = pos + size; pos < last; pos += 4) { 588 tmp = pos >> 31; 589 590 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000); 591 if (tmp != hi) { 592 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp); 593 hi = tmp; 594 } 595 if (write) 596 WREG32_NO_KIQ(mmMM_DATA, *data++); 597 else 598 *data++ = RREG32_NO_KIQ(mmMM_DATA); 599 } 600 601 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 602 drm_dev_exit(idx); 603 } 604 605 /** 606 * amdgpu_device_aper_access - access vram by vram aperture 607 * 608 * @adev: amdgpu_device pointer 609 * @pos: offset of the buffer in vram 610 * @buf: virtual address of the buffer in system memory 611 * @size: read/write size, sizeof(@buf) must > @size 612 * @write: true - write to vram, otherwise - read from vram 613 * 614 * The return value means how many bytes have been transferred. 
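 *
 * A short transfer only means the range fell (partly) outside CPU-visible
 * VRAM; a caller can cover the remainder with MM access, roughly as
 * amdgpu_device_vram_access() below does (sketch only):
 *
 *   size_t done = amdgpu_device_aper_access(adev, pos, buf, size, write);
 *
 *   if (done < size)
 *           amdgpu_device_mm_access(adev, pos + done, buf + done,
 *                                   size - done, write);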
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try to use the VRAM aperture to access VRAM first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM access for the rest of the VRAM */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
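 *
 * Most callers go through the register macros in amdgpu.h (RREG32() and
 * friends are assumed to wrap this helper); direct use looks like the
 * sketch below, where AMDGPU_REGS_NO_KIQ forces a plain MMIO read even
 * when running as an SR-IOV VF:
 *
 *   uint32_t val = amdgpu_device_rreg(adev, mmMM_INDEX, AMDGPU_REGS_NO_KIQ);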
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}


/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
				uint32_t reg, uint32_t acc_flags,
				uint32_t xcc_id)
{
	uint32_t ret, rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	return ret;
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to write to the register
 */

/**
 * amdgpu_mm_wreg8 - write to a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
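 *
 * Together with amdgpu_mm_rreg8() this allows byte-wide read-modify-write
 * sequences, e.g. (the offset and bit below are placeholders):
 *
 *   uint8_t tmp = amdgpu_mm_rreg8(adev, offset);
 *
 *   amdgpu_mm_wreg8(adev, offset, tmp | 0x1);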
828 */ 829 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) 830 { 831 if (amdgpu_device_skip_hw_access(adev)) 832 return; 833 834 if (offset < adev->rmmio_size) 835 writeb(value, adev->rmmio + offset); 836 else 837 BUG(); 838 } 839 840 /** 841 * amdgpu_device_wreg - write to a memory mapped IO or indirect register 842 * 843 * @adev: amdgpu_device pointer 844 * @reg: dword aligned register offset 845 * @v: 32 bit value to write to the register 846 * @acc_flags: access flags which require special behavior 847 * 848 * Writes the value specified to the offset specified. 849 */ 850 void amdgpu_device_wreg(struct amdgpu_device *adev, 851 uint32_t reg, uint32_t v, 852 uint32_t acc_flags) 853 { 854 if (amdgpu_device_skip_hw_access(adev)) 855 return; 856 857 if ((reg * 4) < adev->rmmio_size) { 858 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 859 amdgpu_sriov_runtime(adev) && 860 down_read_trylock(&adev->reset_domain->sem)) { 861 amdgpu_kiq_wreg(adev, reg, v, 0); 862 up_read(&adev->reset_domain->sem); 863 } else { 864 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 865 } 866 } else { 867 adev->pcie_wreg(adev, reg * 4, v); 868 } 869 870 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 871 } 872 873 /** 874 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 875 * 876 * @adev: amdgpu_device pointer 877 * @reg: mmio/rlc register 878 * @v: value to write 879 * @xcc_id: xcc accelerated compute core id 880 * 881 * this function is invoked only for the debugfs register access 882 */ 883 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 884 uint32_t reg, uint32_t v, 885 uint32_t xcc_id) 886 { 887 if (amdgpu_device_skip_hw_access(adev)) 888 return; 889 890 if (amdgpu_sriov_fullaccess(adev) && 891 adev->gfx.rlc.funcs && 892 adev->gfx.rlc.funcs->is_rlcg_access_range) { 893 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 894 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id); 895 } else if ((reg * 4) >= adev->rmmio_size) { 896 adev->pcie_wreg(adev, reg * 4, v); 897 } else { 898 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 899 } 900 } 901 902 /** 903 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC 904 * 905 * @adev: amdgpu_device pointer 906 * @reg: dword aligned register offset 907 * @v: 32 bit value to write to the register 908 * @acc_flags: access flags which require special behavior 909 * @xcc_id: xcc accelerated compute core id 910 * 911 * Writes the value specified to the offset specified. 
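 *
 * A usage sketch for a per-partition GC register write (reg, v and xcc_id
 * below are placeholders):
 *
 *   amdgpu_device_xcc_wreg(adev, reg, v, 0, xcc_id);
 *
 * With acc_flags of 0, the function body below picks the RLCG path on an
 * SR-IOV VF when supported, the KIQ path when the VF is in runtime mode,
 * and a plain writel() otherwise.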
912 */ 913 void amdgpu_device_xcc_wreg(struct amdgpu_device *adev, 914 uint32_t reg, uint32_t v, 915 uint32_t acc_flags, uint32_t xcc_id) 916 { 917 uint32_t rlcg_flag; 918 919 if (amdgpu_device_skip_hw_access(adev)) 920 return; 921 922 if ((reg * 4) < adev->rmmio_size) { 923 if (amdgpu_sriov_vf(adev) && 924 !amdgpu_sriov_runtime(adev) && 925 adev->gfx.rlc.rlcg_reg_access_supported && 926 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, 927 GC_HWIP, true, 928 &rlcg_flag)) { 929 amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id)); 930 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 931 amdgpu_sriov_runtime(adev) && 932 down_read_trylock(&adev->reset_domain->sem)) { 933 amdgpu_kiq_wreg(adev, reg, v, xcc_id); 934 up_read(&adev->reset_domain->sem); 935 } else { 936 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 937 } 938 } else { 939 adev->pcie_wreg(adev, reg * 4, v); 940 } 941 } 942 943 /** 944 * amdgpu_device_indirect_rreg - read an indirect register 945 * 946 * @adev: amdgpu_device pointer 947 * @reg_addr: indirect register address to read from 948 * 949 * Returns the value of indirect register @reg_addr 950 */ 951 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 952 u32 reg_addr) 953 { 954 unsigned long flags, pcie_index, pcie_data; 955 void __iomem *pcie_index_offset; 956 void __iomem *pcie_data_offset; 957 u32 r; 958 959 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 960 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 961 962 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 963 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 964 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 965 966 writel(reg_addr, pcie_index_offset); 967 readl(pcie_index_offset); 968 r = readl(pcie_data_offset); 969 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 970 971 return r; 972 } 973 974 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev, 975 u64 reg_addr) 976 { 977 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 978 u32 r; 979 void __iomem *pcie_index_offset; 980 void __iomem *pcie_index_hi_offset; 981 void __iomem *pcie_data_offset; 982 983 if (unlikely(!adev->nbio.funcs)) { 984 pcie_index = AMDGPU_PCIE_INDEX_FALLBACK; 985 pcie_data = AMDGPU_PCIE_DATA_FALLBACK; 986 } else { 987 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 988 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 989 } 990 991 if (reg_addr >> 32) { 992 if (unlikely(!adev->nbio.funcs)) 993 pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK; 994 else 995 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 996 } else { 997 pcie_index_hi = 0; 998 } 999 1000 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1001 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1002 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1003 if (pcie_index_hi != 0) 1004 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1005 pcie_index_hi * 4; 1006 1007 writel(reg_addr, pcie_index_offset); 1008 readl(pcie_index_offset); 1009 if (pcie_index_hi != 0) { 1010 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1011 readl(pcie_index_hi_offset); 1012 } 1013 r = readl(pcie_data_offset); 1014 1015 /* clear the high bits */ 1016 if (pcie_index_hi != 0) { 1017 writel(0, pcie_index_hi_offset); 1018 readl(pcie_index_hi_offset); 1019 } 1020 1021 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1022 1023 return r; 1024 } 1025 1026 /** 1027 * amdgpu_device_indirect_rreg64 - read a 64bits indirect 
register 1028 * 1029 * @adev: amdgpu_device pointer 1030 * @reg_addr: indirect register address to read from 1031 * 1032 * Returns the value of indirect register @reg_addr 1033 */ 1034 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 1035 u32 reg_addr) 1036 { 1037 unsigned long flags, pcie_index, pcie_data; 1038 void __iomem *pcie_index_offset; 1039 void __iomem *pcie_data_offset; 1040 u64 r; 1041 1042 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1043 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1044 1045 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1046 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1047 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1048 1049 /* read low 32 bits */ 1050 writel(reg_addr, pcie_index_offset); 1051 readl(pcie_index_offset); 1052 r = readl(pcie_data_offset); 1053 /* read high 32 bits */ 1054 writel(reg_addr + 4, pcie_index_offset); 1055 readl(pcie_index_offset); 1056 r |= ((u64)readl(pcie_data_offset) << 32); 1057 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1058 1059 return r; 1060 } 1061 1062 u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev, 1063 u64 reg_addr) 1064 { 1065 unsigned long flags, pcie_index, pcie_data; 1066 unsigned long pcie_index_hi = 0; 1067 void __iomem *pcie_index_offset; 1068 void __iomem *pcie_index_hi_offset; 1069 void __iomem *pcie_data_offset; 1070 u64 r; 1071 1072 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1073 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1074 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1075 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1076 1077 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1078 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1079 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1080 if (pcie_index_hi != 0) 1081 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1082 pcie_index_hi * 4; 1083 1084 /* read low 32 bits */ 1085 writel(reg_addr, pcie_index_offset); 1086 readl(pcie_index_offset); 1087 if (pcie_index_hi != 0) { 1088 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1089 readl(pcie_index_hi_offset); 1090 } 1091 r = readl(pcie_data_offset); 1092 /* read high 32 bits */ 1093 writel(reg_addr + 4, pcie_index_offset); 1094 readl(pcie_index_offset); 1095 if (pcie_index_hi != 0) { 1096 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1097 readl(pcie_index_hi_offset); 1098 } 1099 r |= ((u64)readl(pcie_data_offset) << 32); 1100 1101 /* clear the high bits */ 1102 if (pcie_index_hi != 0) { 1103 writel(0, pcie_index_hi_offset); 1104 readl(pcie_index_hi_offset); 1105 } 1106 1107 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1108 1109 return r; 1110 } 1111 1112 /** 1113 * amdgpu_device_indirect_wreg - write an indirect register address 1114 * 1115 * @adev: amdgpu_device pointer 1116 * @reg_addr: indirect register offset 1117 * @reg_data: indirect register data 1118 * 1119 */ 1120 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 1121 u32 reg_addr, u32 reg_data) 1122 { 1123 unsigned long flags, pcie_index, pcie_data; 1124 void __iomem *pcie_index_offset; 1125 void __iomem *pcie_data_offset; 1126 1127 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1128 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1129 1130 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1131 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1132 
pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1133 1134 writel(reg_addr, pcie_index_offset); 1135 readl(pcie_index_offset); 1136 writel(reg_data, pcie_data_offset); 1137 readl(pcie_data_offset); 1138 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1139 } 1140 1141 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev, 1142 u64 reg_addr, u32 reg_data) 1143 { 1144 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 1145 void __iomem *pcie_index_offset; 1146 void __iomem *pcie_index_hi_offset; 1147 void __iomem *pcie_data_offset; 1148 1149 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1150 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1151 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1152 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1153 else 1154 pcie_index_hi = 0; 1155 1156 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1157 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1158 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1159 if (pcie_index_hi != 0) 1160 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1161 pcie_index_hi * 4; 1162 1163 writel(reg_addr, pcie_index_offset); 1164 readl(pcie_index_offset); 1165 if (pcie_index_hi != 0) { 1166 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1167 readl(pcie_index_hi_offset); 1168 } 1169 writel(reg_data, pcie_data_offset); 1170 readl(pcie_data_offset); 1171 1172 /* clear the high bits */ 1173 if (pcie_index_hi != 0) { 1174 writel(0, pcie_index_hi_offset); 1175 readl(pcie_index_hi_offset); 1176 } 1177 1178 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1179 } 1180 1181 /** 1182 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 1183 * 1184 * @adev: amdgpu_device pointer 1185 * @reg_addr: indirect register offset 1186 * @reg_data: indirect register data 1187 * 1188 */ 1189 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 1190 u32 reg_addr, u64 reg_data) 1191 { 1192 unsigned long flags, pcie_index, pcie_data; 1193 void __iomem *pcie_index_offset; 1194 void __iomem *pcie_data_offset; 1195 1196 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1197 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1198 1199 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1200 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1201 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1202 1203 /* write low 32 bits */ 1204 writel(reg_addr, pcie_index_offset); 1205 readl(pcie_index_offset); 1206 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1207 readl(pcie_data_offset); 1208 /* write high 32 bits */ 1209 writel(reg_addr + 4, pcie_index_offset); 1210 readl(pcie_index_offset); 1211 writel((u32)(reg_data >> 32), pcie_data_offset); 1212 readl(pcie_data_offset); 1213 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1214 } 1215 1216 void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev, 1217 u64 reg_addr, u64 reg_data) 1218 { 1219 unsigned long flags, pcie_index, pcie_data; 1220 unsigned long pcie_index_hi = 0; 1221 void __iomem *pcie_index_offset; 1222 void __iomem *pcie_index_hi_offset; 1223 void __iomem *pcie_data_offset; 1224 1225 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1226 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1227 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1228 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1229 
1230 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1231 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1232 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1233 if (pcie_index_hi != 0) 1234 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1235 pcie_index_hi * 4; 1236 1237 /* write low 32 bits */ 1238 writel(reg_addr, pcie_index_offset); 1239 readl(pcie_index_offset); 1240 if (pcie_index_hi != 0) { 1241 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1242 readl(pcie_index_hi_offset); 1243 } 1244 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1245 readl(pcie_data_offset); 1246 /* write high 32 bits */ 1247 writel(reg_addr + 4, pcie_index_offset); 1248 readl(pcie_index_offset); 1249 if (pcie_index_hi != 0) { 1250 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1251 readl(pcie_index_hi_offset); 1252 } 1253 writel((u32)(reg_data >> 32), pcie_data_offset); 1254 readl(pcie_data_offset); 1255 1256 /* clear the high bits */ 1257 if (pcie_index_hi != 0) { 1258 writel(0, pcie_index_hi_offset); 1259 readl(pcie_index_hi_offset); 1260 } 1261 1262 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1263 } 1264 1265 /** 1266 * amdgpu_device_get_rev_id - query device rev_id 1267 * 1268 * @adev: amdgpu_device pointer 1269 * 1270 * Return device rev_id 1271 */ 1272 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev) 1273 { 1274 return adev->nbio.funcs->get_rev_id(adev); 1275 } 1276 1277 /** 1278 * amdgpu_invalid_rreg - dummy reg read function 1279 * 1280 * @adev: amdgpu_device pointer 1281 * @reg: offset of register 1282 * 1283 * Dummy register read function. Used for register blocks 1284 * that certain asics don't have (all asics). 1285 * Returns the value in the register. 1286 */ 1287 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 1288 { 1289 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 1290 BUG(); 1291 return 0; 1292 } 1293 1294 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg) 1295 { 1296 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 1297 BUG(); 1298 return 0; 1299 } 1300 1301 /** 1302 * amdgpu_invalid_wreg - dummy reg write function 1303 * 1304 * @adev: amdgpu_device pointer 1305 * @reg: offset of register 1306 * @v: value to write to the register 1307 * 1308 * Dummy register read function. Used for register blocks 1309 * that certain asics don't have (all asics). 1310 */ 1311 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 1312 { 1313 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 1314 reg, v); 1315 BUG(); 1316 } 1317 1318 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v) 1319 { 1320 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n", 1321 reg, v); 1322 BUG(); 1323 } 1324 1325 /** 1326 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 1327 * 1328 * @adev: amdgpu_device pointer 1329 * @reg: offset of register 1330 * 1331 * Dummy register read function. Used for register blocks 1332 * that certain asics don't have (all asics). 1333 * Returns the value in the register. 
1334 */ 1335 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 1336 { 1337 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg); 1338 BUG(); 1339 return 0; 1340 } 1341 1342 static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg) 1343 { 1344 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 1345 BUG(); 1346 return 0; 1347 } 1348 1349 /** 1350 * amdgpu_invalid_wreg64 - dummy reg write function 1351 * 1352 * @adev: amdgpu_device pointer 1353 * @reg: offset of register 1354 * @v: value to write to the register 1355 * 1356 * Dummy register read function. Used for register blocks 1357 * that certain asics don't have (all asics). 1358 */ 1359 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 1360 { 1361 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 1362 reg, v); 1363 BUG(); 1364 } 1365 1366 static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v) 1367 { 1368 DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n", 1369 reg, v); 1370 BUG(); 1371 } 1372 1373 /** 1374 * amdgpu_block_invalid_rreg - dummy reg read function 1375 * 1376 * @adev: amdgpu_device pointer 1377 * @block: offset of instance 1378 * @reg: offset of register 1379 * 1380 * Dummy register read function. Used for register blocks 1381 * that certain asics don't have (all asics). 1382 * Returns the value in the register. 1383 */ 1384 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 1385 uint32_t block, uint32_t reg) 1386 { 1387 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n", 1388 reg, block); 1389 BUG(); 1390 return 0; 1391 } 1392 1393 /** 1394 * amdgpu_block_invalid_wreg - dummy reg write function 1395 * 1396 * @adev: amdgpu_device pointer 1397 * @block: offset of instance 1398 * @reg: offset of register 1399 * @v: value to write to the register 1400 * 1401 * Dummy register read function. Used for register blocks 1402 * that certain asics don't have (all asics). 1403 */ 1404 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 1405 uint32_t block, 1406 uint32_t reg, uint32_t v) 1407 { 1408 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 1409 reg, block, v); 1410 BUG(); 1411 } 1412 1413 static uint32_t amdgpu_device_get_vbios_flags(struct amdgpu_device *adev) 1414 { 1415 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU)) 1416 return AMDGPU_VBIOS_SKIP; 1417 1418 if (hweight32(adev->aid_mask) && amdgpu_passthrough(adev)) 1419 return AMDGPU_VBIOS_OPTIONAL; 1420 1421 return 0; 1422 } 1423 1424 /** 1425 * amdgpu_device_asic_init - Wrapper for atom asic_init 1426 * 1427 * @adev: amdgpu_device pointer 1428 * 1429 * Does any asic specific work and then calls atom asic init. 
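 *
 * Posting is skipped when the VBIOS is optional for this configuration and
 * no image was found, roughly (mirrors the checks in the body below):
 *
 *   flags = amdgpu_device_get_vbios_flags(adev);
 *   optional = !!(flags & (AMDGPU_VBIOS_OPTIONAL | AMDGPU_VBIOS_SKIP));
 *   if (optional && !adev->bios)
 *           return 0;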
1430 */ 1431 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 1432 { 1433 uint32_t flags; 1434 bool optional; 1435 int ret; 1436 1437 amdgpu_asic_pre_asic_init(adev); 1438 flags = amdgpu_device_get_vbios_flags(adev); 1439 optional = !!(flags & (AMDGPU_VBIOS_OPTIONAL | AMDGPU_VBIOS_SKIP)); 1440 1441 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 1442 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 1443 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) || 1444 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) { 1445 amdgpu_psp_wait_for_bootloader(adev); 1446 if (optional && !adev->bios) 1447 return 0; 1448 1449 ret = amdgpu_atomfirmware_asic_init(adev, true); 1450 return ret; 1451 } else { 1452 if (optional && !adev->bios) 1453 return 0; 1454 1455 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 1456 } 1457 1458 return 0; 1459 } 1460 1461 /** 1462 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page 1463 * 1464 * @adev: amdgpu_device pointer 1465 * 1466 * Allocates a scratch page of VRAM for use by various things in the 1467 * driver. 1468 */ 1469 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev) 1470 { 1471 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE, 1472 AMDGPU_GEM_DOMAIN_VRAM | 1473 AMDGPU_GEM_DOMAIN_GTT, 1474 &adev->mem_scratch.robj, 1475 &adev->mem_scratch.gpu_addr, 1476 (void **)&adev->mem_scratch.ptr); 1477 } 1478 1479 /** 1480 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page 1481 * 1482 * @adev: amdgpu_device pointer 1483 * 1484 * Frees the VRAM scratch page. 1485 */ 1486 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev) 1487 { 1488 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL); 1489 } 1490 1491 /** 1492 * amdgpu_device_program_register_sequence - program an array of registers. 1493 * 1494 * @adev: amdgpu_device pointer 1495 * @registers: pointer to the register array 1496 * @array_size: size of the register array 1497 * 1498 * Programs an array or registers with and or masks. 1499 * This is a helper for setting golden registers. 1500 */ 1501 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 1502 const u32 *registers, 1503 const u32 array_size) 1504 { 1505 u32 tmp, reg, and_mask, or_mask; 1506 int i; 1507 1508 if (array_size % 3) 1509 return; 1510 1511 for (i = 0; i < array_size; i += 3) { 1512 reg = registers[i + 0]; 1513 and_mask = registers[i + 1]; 1514 or_mask = registers[i + 2]; 1515 1516 if (and_mask == 0xffffffff) { 1517 tmp = or_mask; 1518 } else { 1519 tmp = RREG32(reg); 1520 tmp &= ~and_mask; 1521 if (adev->family >= AMDGPU_FAMILY_AI) 1522 tmp |= (or_mask & and_mask); 1523 else 1524 tmp |= or_mask; 1525 } 1526 WREG32(reg, tmp); 1527 } 1528 } 1529 1530 /** 1531 * amdgpu_device_pci_config_reset - reset the GPU 1532 * 1533 * @adev: amdgpu_device pointer 1534 * 1535 * Resets the GPU using the pci config reset sequence. 1536 * Only applicable to asics prior to vega10. 1537 */ 1538 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 1539 { 1540 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 1541 } 1542 1543 /** 1544 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 1545 * 1546 * @adev: amdgpu_device pointer 1547 * 1548 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 
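 *
 * A hedged usage sketch (error handling only; the actual call sites are not
 * part of this excerpt):
 *
 *   r = amdgpu_device_pci_reset(adev);
 *   if (r)
 *           dev_err(adev->dev, "pci reset failed (%d)\n", r);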
1549 */ 1550 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 1551 { 1552 return pci_reset_function(adev->pdev); 1553 } 1554 1555 /* 1556 * amdgpu_device_wb_*() 1557 * Writeback is the method by which the GPU updates special pages in memory 1558 * with the status of certain GPU events (fences, ring pointers,etc.). 1559 */ 1560 1561 /** 1562 * amdgpu_device_wb_fini - Disable Writeback and free memory 1563 * 1564 * @adev: amdgpu_device pointer 1565 * 1566 * Disables Writeback and frees the Writeback memory (all asics). 1567 * Used at driver shutdown. 1568 */ 1569 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1570 { 1571 if (adev->wb.wb_obj) { 1572 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1573 &adev->wb.gpu_addr, 1574 (void **)&adev->wb.wb); 1575 adev->wb.wb_obj = NULL; 1576 } 1577 } 1578 1579 /** 1580 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory 1581 * 1582 * @adev: amdgpu_device pointer 1583 * 1584 * Initializes writeback and allocates writeback memory (all asics). 1585 * Used at driver startup. 1586 * Returns 0 on success or an -error on failure. 1587 */ 1588 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1589 { 1590 int r; 1591 1592 if (adev->wb.wb_obj == NULL) { 1593 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1594 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1595 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1596 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1597 (void **)&adev->wb.wb); 1598 if (r) { 1599 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1600 return r; 1601 } 1602 1603 adev->wb.num_wb = AMDGPU_MAX_WB; 1604 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1605 1606 /* clear wb memory */ 1607 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1608 } 1609 1610 return 0; 1611 } 1612 1613 /** 1614 * amdgpu_device_wb_get - Allocate a wb entry 1615 * 1616 * @adev: amdgpu_device pointer 1617 * @wb: wb index 1618 * 1619 * Allocate a wb slot for use by the driver (all asics). 1620 * Returns 0 on success or -EINVAL on failure. 1621 */ 1622 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1623 { 1624 unsigned long flags, offset; 1625 1626 spin_lock_irqsave(&adev->wb.lock, flags); 1627 offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1628 if (offset < adev->wb.num_wb) { 1629 __set_bit(offset, adev->wb.used); 1630 spin_unlock_irqrestore(&adev->wb.lock, flags); 1631 *wb = offset << 3; /* convert to dw offset */ 1632 return 0; 1633 } else { 1634 spin_unlock_irqrestore(&adev->wb.lock, flags); 1635 return -EINVAL; 1636 } 1637 } 1638 1639 /** 1640 * amdgpu_device_wb_free - Free a wb entry 1641 * 1642 * @adev: amdgpu_device pointer 1643 * @wb: wb index 1644 * 1645 * Free a wb slot allocated for use by the driver (all asics) 1646 */ 1647 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1648 { 1649 unsigned long flags; 1650 1651 wb >>= 3; 1652 spin_lock_irqsave(&adev->wb.lock, flags); 1653 if (wb < adev->wb.num_wb) 1654 __clear_bit(wb, adev->wb.used); 1655 spin_unlock_irqrestore(&adev->wb.lock, flags); 1656 } 1657 1658 /** 1659 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1660 * 1661 * @adev: amdgpu_device pointer 1662 * 1663 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1664 * to fail, but if any of the BARs is not accessible after the size we abort 1665 * driver loading by returning -ENODEV. 
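 *
 * As a worked example of the size computation below: with 8 GB of VRAM,
 * pci_rebar_bytes_to_size() yields 13 (resizable-BAR sizes are encoded as
 * 1 MB << n, and 1 MB << 13 = 8 GB), which is then clamped to what
 * pci_rebar_get_possible_sizes() reports for BAR0.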
1666 */ 1667 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1668 { 1669 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1670 struct pci_bus *root; 1671 struct resource *res; 1672 unsigned int i; 1673 u16 cmd; 1674 int r; 1675 1676 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) 1677 return 0; 1678 1679 /* Bypass for VF */ 1680 if (amdgpu_sriov_vf(adev)) 1681 return 0; 1682 1683 /* resizing on Dell G5 SE platforms causes problems with runtime pm */ 1684 if ((amdgpu_runtime_pm != 0) && 1685 adev->pdev->vendor == PCI_VENDOR_ID_ATI && 1686 adev->pdev->device == 0x731f && 1687 adev->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL) 1688 return 0; 1689 1690 /* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */ 1691 if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR)) 1692 DRM_WARN("System can't access extended configuration space, please check!!\n"); 1693 1694 /* skip if the bios has already enabled large BAR */ 1695 if (adev->gmc.real_vram_size && 1696 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1697 return 0; 1698 1699 /* Check if the root BUS has 64bit memory resources */ 1700 root = adev->pdev->bus; 1701 while (root->parent) 1702 root = root->parent; 1703 1704 pci_bus_for_each_resource(root, res, i) { 1705 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1706 res->start > 0x100000000ull) 1707 break; 1708 } 1709 1710 /* Trying to resize is pointless without a root hub window above 4GB */ 1711 if (!res) 1712 return 0; 1713 1714 /* Limit the BAR size to what is available */ 1715 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1716 rbar_size); 1717 1718 /* Disable memory decoding while we change the BAR addresses and size */ 1719 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1720 pci_write_config_word(adev->pdev, PCI_COMMAND, 1721 cmd & ~PCI_COMMAND_MEMORY); 1722 1723 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1724 amdgpu_doorbell_fini(adev); 1725 if (adev->asic_type >= CHIP_BONAIRE) 1726 pci_release_resource(adev->pdev, 2); 1727 1728 pci_release_resource(adev->pdev, 0); 1729 1730 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1731 if (r == -ENOSPC) 1732 DRM_INFO("Not enough PCI address space for a large BAR."); 1733 else if (r && r != -ENOTSUPP) 1734 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1735 1736 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1737 1738 /* When the doorbell or fb BAR isn't available we have no chance of 1739 * using the device. 1740 */ 1741 r = amdgpu_doorbell_init(adev); 1742 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1743 return -ENODEV; 1744 1745 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1746 1747 return 0; 1748 } 1749 1750 /* 1751 * GPU helpers function. 1752 */ 1753 /** 1754 * amdgpu_device_need_post - check if the hw need post or not 1755 * 1756 * @adev: amdgpu_device pointer 1757 * 1758 * Check if the asic has been initialized (all asics) at driver startup 1759 * or post is needed if hw reset is performed. 1760 * Returns true if need or false if not. 
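 *
 * A hedged sketch of the expected call pattern during init/resume (the
 * actual call sites are outside this excerpt):
 *
 *   if (amdgpu_device_need_post(adev))
 *           r = amdgpu_device_asic_init(adev);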
1761 */ 1762 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1763 { 1764 uint32_t reg, flags; 1765 1766 if (amdgpu_sriov_vf(adev)) 1767 return false; 1768 1769 flags = amdgpu_device_get_vbios_flags(adev); 1770 if (flags & AMDGPU_VBIOS_SKIP) 1771 return false; 1772 if ((flags & AMDGPU_VBIOS_OPTIONAL) && !adev->bios) 1773 return false; 1774 1775 if (amdgpu_passthrough(adev)) { 1776 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1777 * some old smc fw still need driver do vPost otherwise gpu hang, while 1778 * those smc fw version above 22.15 doesn't have this flaw, so we force 1779 * vpost executed for smc version below 22.15 1780 */ 1781 if (adev->asic_type == CHIP_FIJI) { 1782 int err; 1783 uint32_t fw_ver; 1784 1785 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1786 /* force vPost if error occurred */ 1787 if (err) 1788 return true; 1789 1790 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1791 release_firmware(adev->pm.fw); 1792 if (fw_ver < 0x00160e00) 1793 return true; 1794 } 1795 } 1796 1797 /* Don't post if we need to reset whole hive on init */ 1798 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 1799 return false; 1800 1801 if (adev->has_hw_reset) { 1802 adev->has_hw_reset = false; 1803 return true; 1804 } 1805 1806 /* bios scratch used on CIK+ */ 1807 if (adev->asic_type >= CHIP_BONAIRE) 1808 return amdgpu_atombios_scratch_need_asic_init(adev); 1809 1810 /* check MEM_SIZE for older asics */ 1811 reg = amdgpu_asic_get_config_memsize(adev); 1812 1813 if ((reg != 0) && (reg != 0xffffffff)) 1814 return false; 1815 1816 return true; 1817 } 1818 1819 /* 1820 * Check whether seamless boot is supported. 1821 * 1822 * So far we only support seamless boot on DCE 3.0 or later. 1823 * If users report that it works on older ASICS as well, we may 1824 * loosen this. 1825 */ 1826 bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev) 1827 { 1828 switch (amdgpu_seamless) { 1829 case -1: 1830 break; 1831 case 1: 1832 return true; 1833 case 0: 1834 return false; 1835 default: 1836 DRM_ERROR("Invalid value for amdgpu.seamless: %d\n", 1837 amdgpu_seamless); 1838 return false; 1839 } 1840 1841 if (!(adev->flags & AMD_IS_APU)) 1842 return false; 1843 1844 if (adev->mman.keep_stolen_vga_memory) 1845 return false; 1846 1847 return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0); 1848 } 1849 1850 /* 1851 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids 1852 * don't support dynamic speed switching. Until we have confirmation from Intel 1853 * that a specific host supports it, it's safer that we keep it disabled for all. 
1854 * 1855 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/ 1856 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663 1857 */ 1858 static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev) 1859 { 1860 #if IS_ENABLED(CONFIG_X86) 1861 struct cpuinfo_x86 *c = &cpu_data(0); 1862 1863 /* eGPU change speeds based on USB4 fabric conditions */ 1864 if (dev_is_removable(adev->dev)) 1865 return true; 1866 1867 if (c->x86_vendor == X86_VENDOR_INTEL) 1868 return false; 1869 #endif 1870 return true; 1871 } 1872 1873 /** 1874 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1875 * 1876 * @adev: amdgpu_device pointer 1877 * 1878 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1879 * be set for this device. 1880 * 1881 * Returns true if it should be used or false if not. 1882 */ 1883 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1884 { 1885 switch (amdgpu_aspm) { 1886 case -1: 1887 break; 1888 case 0: 1889 return false; 1890 case 1: 1891 return true; 1892 default: 1893 return false; 1894 } 1895 if (adev->flags & AMD_IS_APU) 1896 return false; 1897 if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK)) 1898 return false; 1899 return pcie_aspm_enabled(adev->pdev); 1900 } 1901 1902 /* if we get transitioned to only one device, take VGA back */ 1903 /** 1904 * amdgpu_device_vga_set_decode - enable/disable vga decode 1905 * 1906 * @pdev: PCI device pointer 1907 * @state: enable/disable vga decode 1908 * 1909 * Enable/disable vga decode (all asics). 1910 * Returns VGA resource flags. 1911 */ 1912 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1913 bool state) 1914 { 1915 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1916 1917 amdgpu_asic_set_vga_state(adev, state); 1918 if (state) 1919 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1920 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1921 else 1922 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1923 } 1924 1925 /** 1926 * amdgpu_device_check_block_size - validate the vm block size 1927 * 1928 * @adev: amdgpu_device pointer 1929 * 1930 * Validates the vm block size specified via module parameter. 1931 * The vm block size defines number of bits in page table versus page directory, 1932 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1933 * page table and the remaining bits are in the page directory. 1934 */ 1935 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1936 { 1937 /* defines number of bits in page table versus page directory, 1938 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1939 * page table and the remaining bits are in the page directory 1940 */ 1941 if (amdgpu_vm_block_size == -1) 1942 return; 1943 1944 if (amdgpu_vm_block_size < 9) { 1945 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1946 amdgpu_vm_block_size); 1947 amdgpu_vm_block_size = -1; 1948 } 1949 } 1950 1951 /** 1952 * amdgpu_device_check_vm_size - validate the vm size 1953 * 1954 * @adev: amdgpu_device pointer 1955 * 1956 * Validates the vm size in GB specified via module parameter. 1957 * The VM size is the size of the GPU virtual memory space in GB. 
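 *
 * For example, booting with amdgpu.vm_size=256 requests a 256 GB GPU VA
 * space (the parameter name is assumed from the amdgpu_vm_size variable
 * checked below); values below 1 are rejected and reset to the default.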
1958 */ 1959 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1960 { 1961 /* no need to check the default value */ 1962 if (amdgpu_vm_size == -1) 1963 return; 1964 1965 if (amdgpu_vm_size < 1) { 1966 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1967 amdgpu_vm_size); 1968 amdgpu_vm_size = -1; 1969 } 1970 } 1971 1972 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1973 { 1974 struct sysinfo si; 1975 bool is_os_64 = (sizeof(void *) == 8); 1976 uint64_t total_memory; 1977 uint64_t dram_size_seven_GB = 0x1B8000000; 1978 uint64_t dram_size_three_GB = 0xB8000000; 1979 1980 if (amdgpu_smu_memory_pool_size == 0) 1981 return; 1982 1983 if (!is_os_64) { 1984 DRM_WARN("Not 64-bit OS, feature not supported\n"); 1985 goto def_value; 1986 } 1987 si_meminfo(&si); 1988 total_memory = (uint64_t)si.totalram * si.mem_unit; 1989 1990 if ((amdgpu_smu_memory_pool_size == 1) || 1991 (amdgpu_smu_memory_pool_size == 2)) { 1992 if (total_memory < dram_size_three_GB) 1993 goto def_value1; 1994 } else if ((amdgpu_smu_memory_pool_size == 4) || 1995 (amdgpu_smu_memory_pool_size == 8)) { 1996 if (total_memory < dram_size_seven_GB) 1997 goto def_value1; 1998 } else { 1999 DRM_WARN("Smu memory pool size not supported\n"); 2000 goto def_value; 2001 } 2002 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 2003 2004 return; 2005 2006 def_value1: 2007 DRM_WARN("No enough system memory\n"); 2008 def_value: 2009 adev->pm.smu_prv_buffer_size = 0; 2010 } 2011 2012 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 2013 { 2014 if (!(adev->flags & AMD_IS_APU) || 2015 adev->asic_type < CHIP_RAVEN) 2016 return 0; 2017 2018 switch (adev->asic_type) { 2019 case CHIP_RAVEN: 2020 if (adev->pdev->device == 0x15dd) 2021 adev->apu_flags |= AMD_APU_IS_RAVEN; 2022 if (adev->pdev->device == 0x15d8) 2023 adev->apu_flags |= AMD_APU_IS_PICASSO; 2024 break; 2025 case CHIP_RENOIR: 2026 if ((adev->pdev->device == 0x1636) || 2027 (adev->pdev->device == 0x164c)) 2028 adev->apu_flags |= AMD_APU_IS_RENOIR; 2029 else 2030 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 2031 break; 2032 case CHIP_VANGOGH: 2033 adev->apu_flags |= AMD_APU_IS_VANGOGH; 2034 break; 2035 case CHIP_YELLOW_CARP: 2036 break; 2037 case CHIP_CYAN_SKILLFISH: 2038 if ((adev->pdev->device == 0x13FE) || 2039 (adev->pdev->device == 0x143F)) 2040 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 2041 break; 2042 default: 2043 break; 2044 } 2045 2046 return 0; 2047 } 2048 2049 /** 2050 * amdgpu_device_check_arguments - validate module params 2051 * 2052 * @adev: amdgpu_device pointer 2053 * 2054 * Validates certain module parameters and updates 2055 * the associated values used by the driver (all asics). 
2056 */ 2057 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 2058 { 2059 int i; 2060 2061 if (amdgpu_sched_jobs < 4) { 2062 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 2063 amdgpu_sched_jobs); 2064 amdgpu_sched_jobs = 4; 2065 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 2066 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 2067 amdgpu_sched_jobs); 2068 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 2069 } 2070 2071 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 2072 /* gart size must be greater or equal to 32M */ 2073 dev_warn(adev->dev, "gart size (%d) too small\n", 2074 amdgpu_gart_size); 2075 amdgpu_gart_size = -1; 2076 } 2077 2078 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 2079 /* gtt size must be greater or equal to 32M */ 2080 dev_warn(adev->dev, "gtt size (%d) too small\n", 2081 amdgpu_gtt_size); 2082 amdgpu_gtt_size = -1; 2083 } 2084 2085 /* valid range is between 4 and 9 inclusive */ 2086 if (amdgpu_vm_fragment_size != -1 && 2087 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 2088 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 2089 amdgpu_vm_fragment_size = -1; 2090 } 2091 2092 if (amdgpu_sched_hw_submission < 2) { 2093 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 2094 amdgpu_sched_hw_submission); 2095 amdgpu_sched_hw_submission = 2; 2096 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 2097 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 2098 amdgpu_sched_hw_submission); 2099 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 2100 } 2101 2102 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 2103 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 2104 amdgpu_reset_method = -1; 2105 } 2106 2107 amdgpu_device_check_smu_prv_buffer_size(adev); 2108 2109 amdgpu_device_check_vm_size(adev); 2110 2111 amdgpu_device_check_block_size(adev); 2112 2113 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 2114 2115 for (i = 0; i < MAX_XCP; i++) 2116 adev->enforce_isolation[i] = !!enforce_isolation; 2117 2118 return 0; 2119 } 2120 2121 /** 2122 * amdgpu_switcheroo_set_state - set switcheroo state 2123 * 2124 * @pdev: pci dev pointer 2125 * @state: vga_switcheroo state 2126 * 2127 * Callback for the switcheroo driver. Suspends or resumes 2128 * the asics before or after it is powered up using ACPI methods. 
2129 */ 2130 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 2131 enum vga_switcheroo_state state) 2132 { 2133 struct drm_device *dev = pci_get_drvdata(pdev); 2134 int r; 2135 2136 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 2137 return; 2138 2139 if (state == VGA_SWITCHEROO_ON) { 2140 pr_info("switched on\n"); 2141 /* don't suspend or resume card normally */ 2142 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2143 2144 pci_set_power_state(pdev, PCI_D0); 2145 amdgpu_device_load_pci_state(pdev); 2146 r = pci_enable_device(pdev); 2147 if (r) 2148 DRM_WARN("pci_enable_device failed (%d)\n", r); 2149 amdgpu_device_resume(dev, true); 2150 2151 dev->switch_power_state = DRM_SWITCH_POWER_ON; 2152 } else { 2153 pr_info("switched off\n"); 2154 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2155 amdgpu_device_prepare(dev); 2156 amdgpu_device_suspend(dev, true); 2157 amdgpu_device_cache_pci_state(pdev); 2158 /* Shut down the device */ 2159 pci_disable_device(pdev); 2160 pci_set_power_state(pdev, PCI_D3cold); 2161 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 2162 } 2163 } 2164 2165 /** 2166 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 2167 * 2168 * @pdev: pci dev pointer 2169 * 2170 * Callback for the switcheroo driver. Check of the switcheroo 2171 * state can be changed. 2172 * Returns true if the state can be changed, false if not. 2173 */ 2174 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 2175 { 2176 struct drm_device *dev = pci_get_drvdata(pdev); 2177 2178 /* 2179 * FIXME: open_count is protected by drm_global_mutex but that would lead to 2180 * locking inversion with the driver load path. And the access here is 2181 * completely racy anyway. So don't bother with locking for now. 2182 */ 2183 return atomic_read(&dev->open_count) == 0; 2184 } 2185 2186 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 2187 .set_gpu_state = amdgpu_switcheroo_set_state, 2188 .reprobe = NULL, 2189 .can_switch = amdgpu_switcheroo_can_switch, 2190 }; 2191 2192 /** 2193 * amdgpu_device_ip_set_clockgating_state - set the CG state 2194 * 2195 * @dev: amdgpu_device pointer 2196 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2197 * @state: clockgating state (gate or ungate) 2198 * 2199 * Sets the requested clockgating state for all instances of 2200 * the hardware IP specified. 2201 * Returns the error code from the last instance. 2202 */ 2203 int amdgpu_device_ip_set_clockgating_state(void *dev, 2204 enum amd_ip_block_type block_type, 2205 enum amd_clockgating_state state) 2206 { 2207 struct amdgpu_device *adev = dev; 2208 int i, r = 0; 2209 2210 for (i = 0; i < adev->num_ip_blocks; i++) { 2211 if (!adev->ip_blocks[i].status.valid) 2212 continue; 2213 if (adev->ip_blocks[i].version->type != block_type) 2214 continue; 2215 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 2216 continue; 2217 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 2218 &adev->ip_blocks[i], state); 2219 if (r) 2220 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 2221 adev->ip_blocks[i].version->funcs->name, r); 2222 } 2223 return r; 2224 } 2225 2226 /** 2227 * amdgpu_device_ip_set_powergating_state - set the PG state 2228 * 2229 * @dev: amdgpu_device pointer 2230 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2231 * @state: powergating state (gate or ungate) 2232 * 2233 * Sets the requested powergating state for all instances of 2234 * the hardware IP specified. 
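 *
 * A minimal usage sketch (illustrative only, not taken from a specific
 * caller): gating power for all VCN instances could look like
 *
 *   amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCN,
 *                                          AMD_PG_STATE_GATE);
 *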
2235 * Returns the error code from the last instance. 2236 */ 2237 int amdgpu_device_ip_set_powergating_state(void *dev, 2238 enum amd_ip_block_type block_type, 2239 enum amd_powergating_state state) 2240 { 2241 struct amdgpu_device *adev = dev; 2242 int i, r = 0; 2243 2244 for (i = 0; i < adev->num_ip_blocks; i++) { 2245 if (!adev->ip_blocks[i].status.valid) 2246 continue; 2247 if (adev->ip_blocks[i].version->type != block_type) 2248 continue; 2249 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 2250 continue; 2251 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 2252 &adev->ip_blocks[i], state); 2253 if (r) 2254 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 2255 adev->ip_blocks[i].version->funcs->name, r); 2256 } 2257 return r; 2258 } 2259 2260 /** 2261 * amdgpu_device_ip_get_clockgating_state - get the CG state 2262 * 2263 * @adev: amdgpu_device pointer 2264 * @flags: clockgating feature flags 2265 * 2266 * Walks the list of IPs on the device and updates the clockgating 2267 * flags for each IP. 2268 * Updates @flags with the feature flags for each hardware IP where 2269 * clockgating is enabled. 2270 */ 2271 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 2272 u64 *flags) 2273 { 2274 int i; 2275 2276 for (i = 0; i < adev->num_ip_blocks; i++) { 2277 if (!adev->ip_blocks[i].status.valid) 2278 continue; 2279 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 2280 adev->ip_blocks[i].version->funcs->get_clockgating_state( 2281 &adev->ip_blocks[i], flags); 2282 } 2283 } 2284 2285 /** 2286 * amdgpu_device_ip_wait_for_idle - wait for idle 2287 * 2288 * @adev: amdgpu_device pointer 2289 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2290 * 2291 * Waits for the requested hardware IP to be idle. 2292 * Returns 0 for success or a negative error code on failure. 2293 */ 2294 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 2295 enum amd_ip_block_type block_type) 2296 { 2297 int i, r; 2298 2299 for (i = 0; i < adev->num_ip_blocks; i++) { 2300 if (!adev->ip_blocks[i].status.valid) 2301 continue; 2302 if (adev->ip_blocks[i].version->type == block_type) { 2303 if (adev->ip_blocks[i].version->funcs->wait_for_idle) { 2304 r = adev->ip_blocks[i].version->funcs->wait_for_idle( 2305 &adev->ip_blocks[i]); 2306 if (r) 2307 return r; 2308 } 2309 break; 2310 } 2311 } 2312 return 0; 2313 2314 } 2315 2316 /** 2317 * amdgpu_device_ip_is_valid - is the hardware IP enabled 2318 * 2319 * @adev: amdgpu_device pointer 2320 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2321 * 2322 * Check if the hardware IP is enabled or not. 2323 * Returns true if the IP is enabled, false if not. 2324 */ 2325 bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev, 2326 enum amd_ip_block_type block_type) 2327 { 2328 int i; 2329 2330 for (i = 0; i < adev->num_ip_blocks; i++) { 2331 if (adev->ip_blocks[i].version->type == block_type) 2332 return adev->ip_blocks[i].status.valid; 2333 } 2334 return false; 2335 2336 } 2337 2338 /** 2339 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 2340 * 2341 * @adev: amdgpu_device pointer 2342 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 2343 * 2344 * Returns a pointer to the hardware IP block structure 2345 * if it exists for the asic, otherwise NULL.
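 *
 * A short lookup sketch (illustrative only), mirroring how the driver itself
 * probes for the GFX block later in early init:
 *
 *   struct amdgpu_ip_block *ip_block =
 *           amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *   if (ip_block && ip_block->status.valid)
 *           ...use the block...
 *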
2346 */ 2347 struct amdgpu_ip_block * 2348 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 2349 enum amd_ip_block_type type) 2350 { 2351 int i; 2352 2353 for (i = 0; i < adev->num_ip_blocks; i++) 2354 if (adev->ip_blocks[i].version->type == type) 2355 return &adev->ip_blocks[i]; 2356 2357 return NULL; 2358 } 2359 2360 /** 2361 * amdgpu_device_ip_block_version_cmp 2362 * 2363 * @adev: amdgpu_device pointer 2364 * @type: enum amd_ip_block_type 2365 * @major: major version 2366 * @minor: minor version 2367 * 2368 * return 0 if equal or greater 2369 * return 1 if smaller or the ip_block doesn't exist 2370 */ 2371 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 2372 enum amd_ip_block_type type, 2373 u32 major, u32 minor) 2374 { 2375 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 2376 2377 if (ip_block && ((ip_block->version->major > major) || 2378 ((ip_block->version->major == major) && 2379 (ip_block->version->minor >= minor)))) 2380 return 0; 2381 2382 return 1; 2383 } 2384 2385 /** 2386 * amdgpu_device_ip_block_add 2387 * 2388 * @adev: amdgpu_device pointer 2389 * @ip_block_version: pointer to the IP to add 2390 * 2391 * Adds the IP block driver information to the collection of IPs 2392 * on the asic. 2393 */ 2394 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 2395 const struct amdgpu_ip_block_version *ip_block_version) 2396 { 2397 if (!ip_block_version) 2398 return -EINVAL; 2399 2400 switch (ip_block_version->type) { 2401 case AMD_IP_BLOCK_TYPE_VCN: 2402 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2403 return 0; 2404 break; 2405 case AMD_IP_BLOCK_TYPE_JPEG: 2406 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2407 return 0; 2408 break; 2409 default: 2410 break; 2411 } 2412 2413 dev_info(adev->dev, "detected ip block number %d <%s>\n", 2414 adev->num_ip_blocks, ip_block_version->funcs->name); 2415 2416 adev->ip_blocks[adev->num_ip_blocks].adev = adev; 2417 2418 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2419 2420 return 0; 2421 } 2422 2423 /** 2424 * amdgpu_device_enable_virtual_display - enable virtual display feature 2425 * 2426 * @adev: amdgpu_device pointer 2427 * 2428 * Enabled the virtual display feature if the user has enabled it via 2429 * the module parameter virtual_display. This feature provides a virtual 2430 * display hardware on headless boards or in virtualized environments. 2431 * This function parses and validates the configuration string specified by 2432 * the user and configures the virtual display configuration (number of 2433 * virtual connectors, crtcs, etc.) specified. 
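 *
 * As parsed below, the expected format is a semicolon-separated list of
 * "<PCI address>,<num_crtc>" entries, with "all" matching every device.
 * For example (illustrative only):
 *
 *   amdgpu.virtual_display=0000:03:00.0,2
 *
 * enables two virtual CRTCs on the device at 0000:03:00.0; num_crtc is
 * clamped to the range 1-6.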
2434 */ 2435 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2436 { 2437 adev->enable_virtual_display = false; 2438 2439 if (amdgpu_virtual_display) { 2440 const char *pci_address_name = pci_name(adev->pdev); 2441 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2442 2443 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2444 pciaddstr_tmp = pciaddstr; 2445 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2446 pciaddname = strsep(&pciaddname_tmp, ","); 2447 if (!strcmp("all", pciaddname) 2448 || !strcmp(pci_address_name, pciaddname)) { 2449 long num_crtc; 2450 int res = -1; 2451 2452 adev->enable_virtual_display = true; 2453 2454 if (pciaddname_tmp) 2455 res = kstrtol(pciaddname_tmp, 10, 2456 &num_crtc); 2457 2458 if (!res) { 2459 if (num_crtc < 1) 2460 num_crtc = 1; 2461 if (num_crtc > 6) 2462 num_crtc = 6; 2463 adev->mode_info.num_crtc = num_crtc; 2464 } else { 2465 adev->mode_info.num_crtc = 1; 2466 } 2467 break; 2468 } 2469 } 2470 2471 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2472 amdgpu_virtual_display, pci_address_name, 2473 adev->enable_virtual_display, adev->mode_info.num_crtc); 2474 2475 kfree(pciaddstr); 2476 } 2477 } 2478 2479 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2480 { 2481 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2482 adev->mode_info.num_crtc = 1; 2483 adev->enable_virtual_display = true; 2484 DRM_INFO("virtual_display:%d, num_crtc:%d\n", 2485 adev->enable_virtual_display, adev->mode_info.num_crtc); 2486 } 2487 } 2488 2489 /** 2490 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2491 * 2492 * @adev: amdgpu_device pointer 2493 * 2494 * Parses the asic configuration parameters specified in the gpu info 2495 * firmware and makes them available to the driver for use in configuring 2496 * the asic. 2497 * Returns 0 on success, -EINVAL on failure. 2498 */ 2499 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2500 { 2501 const char *chip_name; 2502 int err; 2503 const struct gpu_info_firmware_header_v1_0 *hdr; 2504 2505 adev->firmware.gpu_info_fw = NULL; 2506 2507 if (adev->mman.discovery_bin) 2508 return 0; 2509 2510 switch (adev->asic_type) { 2511 default: 2512 return 0; 2513 case CHIP_VEGA10: 2514 chip_name = "vega10"; 2515 break; 2516 case CHIP_VEGA12: 2517 chip_name = "vega12"; 2518 break; 2519 case CHIP_RAVEN: 2520 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2521 chip_name = "raven2"; 2522 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2523 chip_name = "picasso"; 2524 else 2525 chip_name = "raven"; 2526 break; 2527 case CHIP_ARCTURUS: 2528 chip_name = "arcturus"; 2529 break; 2530 case CHIP_NAVI12: 2531 chip_name = "navi12"; 2532 break; 2533 } 2534 2535 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, 2536 AMDGPU_UCODE_OPTIONAL, 2537 "amdgpu/%s_gpu_info.bin", chip_name); 2538 if (err) { 2539 dev_err(adev->dev, 2540 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n", 2541 chip_name); 2542 goto out; 2543 } 2544 2545 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2546 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2547 2548 switch (hdr->version_major) { 2549 case 1: 2550 { 2551 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2552 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2553 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2554 2555 /* 2556 * Should be dropped when DAL no longer needs it. 
2557 */ 2558 if (adev->asic_type == CHIP_NAVI12) 2559 goto parse_soc_bounding_box; 2560 2561 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2562 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2563 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2564 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2565 adev->gfx.config.max_texture_channel_caches = 2566 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2567 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2568 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2569 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2570 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2571 adev->gfx.config.double_offchip_lds_buf = 2572 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2573 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2574 adev->gfx.cu_info.max_waves_per_simd = 2575 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2576 adev->gfx.cu_info.max_scratch_slots_per_cu = 2577 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2578 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2579 if (hdr->version_minor >= 1) { 2580 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2581 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2582 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2583 adev->gfx.config.num_sc_per_sh = 2584 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2585 adev->gfx.config.num_packer_per_sc = 2586 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2587 } 2588 2589 parse_soc_bounding_box: 2590 /* 2591 * soc bounding box info is not integrated in the discovery table, so 2592 * we always need to parse it from the gpu info firmware if needed. 2593 */ 2594 if (hdr->version_minor == 2) { 2595 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2596 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2597 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2598 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2599 } 2600 break; 2601 } 2602 default: 2603 dev_err(adev->dev, 2604 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2605 err = -EINVAL; 2606 goto out; 2607 } 2608 out: 2609 return err; 2610 } 2611 2612 /** 2613 * amdgpu_device_ip_early_init - run early init for hardware IPs 2614 * 2615 * @adev: amdgpu_device pointer 2616 * 2617 * Early initialization pass for hardware IPs. The hardware IPs that make 2618 * up each asic are discovered and each IP's early_init callback is run. This 2619 * is the first stage in initializing the asic. 2620 * Returns 0 on success, negative error code on failure.
2621 */ 2622 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2623 { 2624 struct amdgpu_ip_block *ip_block; 2625 struct pci_dev *parent; 2626 bool total, skip_bios; 2627 uint32_t bios_flags; 2628 int i, r; 2629 2630 amdgpu_device_enable_virtual_display(adev); 2631 2632 if (amdgpu_sriov_vf(adev)) { 2633 r = amdgpu_virt_request_full_gpu(adev, true); 2634 if (r) 2635 return r; 2636 } 2637 2638 switch (adev->asic_type) { 2639 #ifdef CONFIG_DRM_AMDGPU_SI 2640 case CHIP_VERDE: 2641 case CHIP_TAHITI: 2642 case CHIP_PITCAIRN: 2643 case CHIP_OLAND: 2644 case CHIP_HAINAN: 2645 adev->family = AMDGPU_FAMILY_SI; 2646 r = si_set_ip_blocks(adev); 2647 if (r) 2648 return r; 2649 break; 2650 #endif 2651 #ifdef CONFIG_DRM_AMDGPU_CIK 2652 case CHIP_BONAIRE: 2653 case CHIP_HAWAII: 2654 case CHIP_KAVERI: 2655 case CHIP_KABINI: 2656 case CHIP_MULLINS: 2657 if (adev->flags & AMD_IS_APU) 2658 adev->family = AMDGPU_FAMILY_KV; 2659 else 2660 adev->family = AMDGPU_FAMILY_CI; 2661 2662 r = cik_set_ip_blocks(adev); 2663 if (r) 2664 return r; 2665 break; 2666 #endif 2667 case CHIP_TOPAZ: 2668 case CHIP_TONGA: 2669 case CHIP_FIJI: 2670 case CHIP_POLARIS10: 2671 case CHIP_POLARIS11: 2672 case CHIP_POLARIS12: 2673 case CHIP_VEGAM: 2674 case CHIP_CARRIZO: 2675 case CHIP_STONEY: 2676 if (adev->flags & AMD_IS_APU) 2677 adev->family = AMDGPU_FAMILY_CZ; 2678 else 2679 adev->family = AMDGPU_FAMILY_VI; 2680 2681 r = vi_set_ip_blocks(adev); 2682 if (r) 2683 return r; 2684 break; 2685 default: 2686 r = amdgpu_discovery_set_ip_blocks(adev); 2687 if (r) 2688 return r; 2689 break; 2690 } 2691 2692 if (amdgpu_has_atpx() && 2693 (amdgpu_is_atpx_hybrid() || 2694 amdgpu_has_atpx_dgpu_power_cntl()) && 2695 ((adev->flags & AMD_IS_APU) == 0) && 2696 !dev_is_removable(&adev->pdev->dev)) 2697 adev->flags |= AMD_IS_PX; 2698 2699 if (!(adev->flags & AMD_IS_APU)) { 2700 parent = pcie_find_root_port(adev->pdev); 2701 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2702 } 2703 2704 2705 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2706 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2707 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2708 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2709 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2710 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2711 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2712 2713 total = true; 2714 for (i = 0; i < adev->num_ip_blocks; i++) { 2715 ip_block = &adev->ip_blocks[i]; 2716 2717 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2718 DRM_WARN("disabled ip block: %d <%s>\n", 2719 i, adev->ip_blocks[i].version->funcs->name); 2720 adev->ip_blocks[i].status.valid = false; 2721 } else if (ip_block->version->funcs->early_init) { 2722 r = ip_block->version->funcs->early_init(ip_block); 2723 if (r == -ENOENT) { 2724 adev->ip_blocks[i].status.valid = false; 2725 } else if (r) { 2726 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2727 adev->ip_blocks[i].version->funcs->name, r); 2728 total = false; 2729 } else { 2730 adev->ip_blocks[i].status.valid = true; 2731 } 2732 } else { 2733 adev->ip_blocks[i].status.valid = true; 2734 } 2735 /* get the vbios after the asic_funcs are set up */ 2736 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2737 r = amdgpu_device_parse_gpu_info_fw(adev); 2738 if (r) 2739 return r; 2740 2741 bios_flags = amdgpu_device_get_vbios_flags(adev); 2742 skip_bios = !!(bios_flags & AMDGPU_VBIOS_SKIP); 2743 /* Read BIOS */ 2744 if (!skip_bios) { 2745 bool optional = 2746 !!(bios_flags & AMDGPU_VBIOS_OPTIONAL); 2747 if (!amdgpu_get_bios(adev) && !optional) 2748 return -EINVAL; 2749 2750 if (optional && !adev->bios) 2751 dev_info( 2752 adev->dev, 2753 "VBIOS image optional, proceeding without VBIOS image"); 2754 2755 if (adev->bios) { 2756 r = amdgpu_atombios_init(adev); 2757 if (r) { 2758 dev_err(adev->dev, 2759 "amdgpu_atombios_init failed\n"); 2760 amdgpu_vf_error_put( 2761 adev, 2762 AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 2763 0, 0); 2764 return r; 2765 } 2766 } 2767 } 2768 2769 /*get pf2vf msg info at it's earliest time*/ 2770 if (amdgpu_sriov_vf(adev)) 2771 amdgpu_virt_init_data_exchange(adev); 2772 2773 } 2774 } 2775 if (!total) 2776 return -ENODEV; 2777 2778 if (adev->gmc.xgmi.supported) 2779 amdgpu_xgmi_early_init(adev); 2780 2781 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 2782 if (ip_block->status.valid != false) 2783 amdgpu_amdkfd_device_probe(adev); 2784 2785 adev->cg_flags &= amdgpu_cg_mask; 2786 adev->pg_flags &= amdgpu_pg_mask; 2787 2788 return 0; 2789 } 2790 2791 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2792 { 2793 int i, r; 2794 2795 for (i = 0; i < adev->num_ip_blocks; i++) { 2796 if (!adev->ip_blocks[i].status.sw) 2797 continue; 2798 if (adev->ip_blocks[i].status.hw) 2799 continue; 2800 if (!amdgpu_ip_member_of_hwini( 2801 adev, adev->ip_blocks[i].version->type)) 2802 continue; 2803 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2804 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2805 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2806 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2807 if (r) { 2808 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2809 adev->ip_blocks[i].version->funcs->name, r); 2810 return r; 2811 } 2812 adev->ip_blocks[i].status.hw = true; 2813 } 2814 } 2815 2816 return 0; 2817 } 
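/*
 * Note on ordering (a summary of the flow in amdgpu_device_ip_init() below,
 * kept here as an illustrative sketch rather than a normative contract):
 *
 *   amdgpu_device_ip_hw_init_phase1(adev);  // COMMON, IH (and PSP on SR-IOV)
 *   amdgpu_device_fw_loading(adev);         // PSP / SMU firmware
 *   amdgpu_device_ip_hw_init_phase2(adev);  // everything else
 *
 * Phase 1 brings up the blocks that firmware loading depends on; phase 2
 * initializes the remaining blocks once the firmware is in place.
 */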
2818 2819 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2820 { 2821 int i, r; 2822 2823 for (i = 0; i < adev->num_ip_blocks; i++) { 2824 if (!adev->ip_blocks[i].status.sw) 2825 continue; 2826 if (adev->ip_blocks[i].status.hw) 2827 continue; 2828 if (!amdgpu_ip_member_of_hwini( 2829 adev, adev->ip_blocks[i].version->type)) 2830 continue; 2831 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2832 if (r) { 2833 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2834 adev->ip_blocks[i].version->funcs->name, r); 2835 return r; 2836 } 2837 adev->ip_blocks[i].status.hw = true; 2838 } 2839 2840 return 0; 2841 } 2842 2843 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2844 { 2845 int r = 0; 2846 int i; 2847 uint32_t smu_version; 2848 2849 if (adev->asic_type >= CHIP_VEGA10) { 2850 for (i = 0; i < adev->num_ip_blocks; i++) { 2851 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2852 continue; 2853 2854 if (!amdgpu_ip_member_of_hwini(adev, 2855 AMD_IP_BLOCK_TYPE_PSP)) 2856 break; 2857 2858 if (!adev->ip_blocks[i].status.sw) 2859 continue; 2860 2861 /* no need to do the fw loading again if already done*/ 2862 if (adev->ip_blocks[i].status.hw == true) 2863 break; 2864 2865 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2866 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 2867 if (r) 2868 return r; 2869 } else { 2870 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2871 if (r) { 2872 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2873 adev->ip_blocks[i].version->funcs->name, r); 2874 return r; 2875 } 2876 adev->ip_blocks[i].status.hw = true; 2877 } 2878 break; 2879 } 2880 } 2881 2882 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2883 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2884 2885 return r; 2886 } 2887 2888 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2889 { 2890 struct drm_sched_init_args args = { 2891 .ops = &amdgpu_sched_ops, 2892 .num_rqs = DRM_SCHED_PRIORITY_COUNT, 2893 .timeout_wq = adev->reset_domain->wq, 2894 .dev = adev->dev, 2895 }; 2896 long timeout; 2897 int r, i; 2898 2899 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2900 struct amdgpu_ring *ring = adev->rings[i]; 2901 2902 /* No need to setup the GPU scheduler for rings that don't need it */ 2903 if (!ring || ring->no_scheduler) 2904 continue; 2905 2906 switch (ring->funcs->type) { 2907 case AMDGPU_RING_TYPE_GFX: 2908 timeout = adev->gfx_timeout; 2909 break; 2910 case AMDGPU_RING_TYPE_COMPUTE: 2911 timeout = adev->compute_timeout; 2912 break; 2913 case AMDGPU_RING_TYPE_SDMA: 2914 timeout = adev->sdma_timeout; 2915 break; 2916 default: 2917 timeout = adev->video_timeout; 2918 break; 2919 } 2920 2921 args.timeout = timeout; 2922 args.credit_limit = ring->num_hw_submission; 2923 args.score = ring->sched_score; 2924 args.name = ring->name; 2925 2926 r = drm_sched_init(&ring->sched, &args); 2927 if (r) { 2928 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2929 ring->name); 2930 return r; 2931 } 2932 r = amdgpu_uvd_entity_init(adev, ring); 2933 if (r) { 2934 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n", 2935 ring->name); 2936 return r; 2937 } 2938 r = amdgpu_vce_entity_init(adev, ring); 2939 if (r) { 2940 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n", 2941 ring->name); 2942 return r; 2943 } 2944 } 2945 2946 amdgpu_xcp_update_partition_sched_list(adev); 2947 2948 return 0; 2949 } 2950 2951 2952 /** 2953 * amdgpu_device_ip_init - run init for hardware 
IPs 2954 * 2955 * @adev: amdgpu_device pointer 2956 * 2957 * Main initialization pass for hardware IPs. The list of all the hardware 2958 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 2959 * are run. sw_init initializes the software state associated with each IP 2960 * and hw_init initializes the hardware associated with each IP. 2961 * Returns 0 on success, negative error code on failure. 2962 */ 2963 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 2964 { 2965 bool init_badpage; 2966 int i, r; 2967 2968 r = amdgpu_ras_init(adev); 2969 if (r) 2970 return r; 2971 2972 for (i = 0; i < adev->num_ip_blocks; i++) { 2973 if (!adev->ip_blocks[i].status.valid) 2974 continue; 2975 if (adev->ip_blocks[i].version->funcs->sw_init) { 2976 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]); 2977 if (r) { 2978 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 2979 adev->ip_blocks[i].version->funcs->name, r); 2980 goto init_failed; 2981 } 2982 } 2983 adev->ip_blocks[i].status.sw = true; 2984 2985 if (!amdgpu_ip_member_of_hwini( 2986 adev, adev->ip_blocks[i].version->type)) 2987 continue; 2988 2989 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2990 /* need to do common hw init early so everything is set up for gmc */ 2991 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2992 if (r) { 2993 DRM_ERROR("hw_init %d failed %d\n", i, r); 2994 goto init_failed; 2995 } 2996 adev->ip_blocks[i].status.hw = true; 2997 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 2998 /* need to do gmc hw init early so we can allocate gpu mem */ 2999 /* Try to reserve bad pages early */ 3000 if (amdgpu_sriov_vf(adev)) 3001 amdgpu_virt_exchange_data(adev); 3002 3003 r = amdgpu_device_mem_scratch_init(adev); 3004 if (r) { 3005 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 3006 goto init_failed; 3007 } 3008 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3009 if (r) { 3010 DRM_ERROR("hw_init %d failed %d\n", i, r); 3011 goto init_failed; 3012 } 3013 r = amdgpu_device_wb_init(adev); 3014 if (r) { 3015 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 3016 goto init_failed; 3017 } 3018 adev->ip_blocks[i].status.hw = true; 3019 3020 /* right after GMC hw init, we create CSA */ 3021 if (adev->gfx.mcbp) { 3022 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 3023 AMDGPU_GEM_DOMAIN_VRAM | 3024 AMDGPU_GEM_DOMAIN_GTT, 3025 AMDGPU_CSA_SIZE); 3026 if (r) { 3027 DRM_ERROR("allocate CSA failed %d\n", r); 3028 goto init_failed; 3029 } 3030 } 3031 3032 r = amdgpu_seq64_init(adev); 3033 if (r) { 3034 DRM_ERROR("allocate seq64 failed %d\n", r); 3035 goto init_failed; 3036 } 3037 } 3038 } 3039 3040 if (amdgpu_sriov_vf(adev)) 3041 amdgpu_virt_init_data_exchange(adev); 3042 3043 r = amdgpu_ib_pool_init(adev); 3044 if (r) { 3045 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 3046 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 3047 goto init_failed; 3048 } 3049 3050 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 3051 if (r) 3052 goto init_failed; 3053 3054 r = amdgpu_device_ip_hw_init_phase1(adev); 3055 if (r) 3056 goto init_failed; 3057 3058 r = amdgpu_device_fw_loading(adev); 3059 if (r) 3060 goto init_failed; 3061 3062 r = amdgpu_device_ip_hw_init_phase2(adev); 3063 if (r) 3064 goto init_failed; 3065 3066 /* 3067 * retired pages will be loaded from eeprom and reserved here, 3068 * it should be called after 
amdgpu_device_ip_hw_init_phase2 since 3069 * for some ASICs the RAS EEPROM code relies on the SMU being fully functional 3070 * for I2C communication, which is only true at this point. 3071 * 3072 * amdgpu_ras_recovery_init may fail, but the caller only cares about 3073 * failures caused by a bad gpu situation and stops the amdgpu init process 3074 * accordingly. For other failure cases, it will still release all 3075 * the resources and print an error message, rather than returning a 3076 * negative value to the upper level. 3077 * 3078 * Note: theoretically, this should be called before all vram allocations 3079 * to protect retired pages from being abused 3080 */ 3081 init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 3082 r = amdgpu_ras_recovery_init(adev, init_badpage); 3083 if (r) 3084 goto init_failed; 3085 3086 /* 3087 * In case of XGMI, grab an extra reference for the reset domain of this device 3088 */ 3089 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3090 if (amdgpu_xgmi_add_device(adev) == 0) { 3091 if (!amdgpu_sriov_vf(adev)) { 3092 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3093 3094 if (WARN_ON(!hive)) { 3095 r = -ENOENT; 3096 goto init_failed; 3097 } 3098 3099 if (!hive->reset_domain || 3100 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 3101 r = -ENOENT; 3102 amdgpu_put_xgmi_hive(hive); 3103 goto init_failed; 3104 } 3105 3106 /* Drop the early temporary reset domain we created for device */ 3107 amdgpu_reset_put_reset_domain(adev->reset_domain); 3108 adev->reset_domain = hive->reset_domain; 3109 amdgpu_put_xgmi_hive(hive); 3110 } 3111 } 3112 } 3113 3114 r = amdgpu_device_init_schedulers(adev); 3115 if (r) 3116 goto init_failed; 3117 3118 if (adev->mman.buffer_funcs_ring->sched.ready) 3119 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3120 3121 /* Don't init kfd if the whole hive needs to be reset during init */ 3122 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 3123 kgd2kfd_init_zone_device(adev); 3124 amdgpu_amdkfd_device_init(adev); 3125 } 3126 3127 amdgpu_fru_get_product_info(adev); 3128 3129 if (!amdgpu_sriov_vf(adev) || amdgpu_sriov_ras_cper_en(adev)) 3130 r = amdgpu_cper_init(adev); 3131 3132 init_failed: 3133 3134 return r; 3135 } 3136 3137 /** 3138 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 3139 * 3140 * @adev: amdgpu_device pointer 3141 * 3142 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 3143 * this function before a GPU reset. If the value is retained after a 3144 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 3145 */ 3146 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 3147 { 3148 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 3149 } 3150 3151 /** 3152 * amdgpu_device_check_vram_lost - check if vram is valid 3153 * 3154 * @adev: amdgpu_device pointer 3155 * 3156 * Checks the reset magic value written to the gart pointer in VRAM. 3157 * The driver calls this after a GPU reset to see if the contents of 3158 * VRAM are lost or not. 3159 * Returns true if vram is lost, false if not. 3160 */ 3161 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 3162 { 3163 if (memcmp(adev->gart.ptr, adev->reset_magic, 3164 AMDGPU_RESET_MAGIC_NUM)) 3165 return true; 3166 3167 if (!amdgpu_in_reset(adev)) 3168 return false; 3169 3170 /* 3171 * For all ASICs with baco/mode1 reset, the VRAM is 3172 * always assumed to be lost.
3173 */ 3174 switch (amdgpu_asic_reset_method(adev)) { 3175 case AMD_RESET_METHOD_BACO: 3176 case AMD_RESET_METHOD_MODE1: 3177 return true; 3178 default: 3179 return false; 3180 } 3181 } 3182 3183 /** 3184 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 3185 * 3186 * @adev: amdgpu_device pointer 3187 * @state: clockgating state (gate or ungate) 3188 * 3189 * The list of all the hardware IPs that make up the asic is walked and the 3190 * set_clockgating_state callbacks are run. 3191 * Late initialization pass enabling clockgating for hardware IPs. 3192 * Fini or suspend, pass disabling clockgating for hardware IPs. 3193 * Returns 0 on success, negative error code on failure. 3194 */ 3195 3196 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 3197 enum amd_clockgating_state state) 3198 { 3199 int i, j, r; 3200 3201 if (amdgpu_emu_mode == 1) 3202 return 0; 3203 3204 for (j = 0; j < adev->num_ip_blocks; j++) { 3205 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3206 if (!adev->ip_blocks[i].status.late_initialized) 3207 continue; 3208 /* skip CG for GFX, SDMA on S0ix */ 3209 if (adev->in_s0ix && 3210 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3211 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3212 continue; 3213 /* skip CG for VCE/UVD, it's handled specially */ 3214 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3215 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3216 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3217 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3218 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 3219 /* enable clockgating to save power */ 3220 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i], 3221 state); 3222 if (r) { 3223 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 3224 adev->ip_blocks[i].version->funcs->name, r); 3225 return r; 3226 } 3227 } 3228 } 3229 3230 return 0; 3231 } 3232 3233 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 3234 enum amd_powergating_state state) 3235 { 3236 int i, j, r; 3237 3238 if (amdgpu_emu_mode == 1) 3239 return 0; 3240 3241 for (j = 0; j < adev->num_ip_blocks; j++) { 3242 i = state == AMD_PG_STATE_GATE ? 
j : adev->num_ip_blocks - j - 1; 3243 if (!adev->ip_blocks[i].status.late_initialized) 3244 continue; 3245 /* skip PG for GFX, SDMA on S0ix */ 3246 if (adev->in_s0ix && 3247 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3248 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3249 continue; 3250 /* skip PG for VCE/UVD, it's handled specially */ 3251 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3252 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3253 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3254 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3255 adev->ip_blocks[i].version->funcs->set_powergating_state) { 3256 /* enable powergating to save power */ 3257 r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i], 3258 state); 3259 if (r) { 3260 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 3261 adev->ip_blocks[i].version->funcs->name, r); 3262 return r; 3263 } 3264 } 3265 } 3266 return 0; 3267 } 3268 3269 static int amdgpu_device_enable_mgpu_fan_boost(void) 3270 { 3271 struct amdgpu_gpu_instance *gpu_ins; 3272 struct amdgpu_device *adev; 3273 int i, ret = 0; 3274 3275 mutex_lock(&mgpu_info.mutex); 3276 3277 /* 3278 * MGPU fan boost feature should be enabled 3279 * only when there are two or more dGPUs in 3280 * the system 3281 */ 3282 if (mgpu_info.num_dgpu < 2) 3283 goto out; 3284 3285 for (i = 0; i < mgpu_info.num_dgpu; i++) { 3286 gpu_ins = &(mgpu_info.gpu_ins[i]); 3287 adev = gpu_ins->adev; 3288 if (!(adev->flags & AMD_IS_APU) && 3289 !gpu_ins->mgpu_fan_enabled) { 3290 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 3291 if (ret) 3292 break; 3293 3294 gpu_ins->mgpu_fan_enabled = 1; 3295 } 3296 } 3297 3298 out: 3299 mutex_unlock(&mgpu_info.mutex); 3300 3301 return ret; 3302 } 3303 3304 /** 3305 * amdgpu_device_ip_late_init - run late init for hardware IPs 3306 * 3307 * @adev: amdgpu_device pointer 3308 * 3309 * Late initialization pass for hardware IPs. The list of all the hardware 3310 * IPs that make up the asic is walked and the late_init callbacks are run. 3311 * late_init covers any special initialization that an IP requires 3312 * after all of the IPs have been initialized or something that needs to happen 3313 * late in the init process. 3314 * Returns 0 on success, negative error code on failure.
3315 */ 3316 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 3317 { 3318 struct amdgpu_gpu_instance *gpu_instance; 3319 int i = 0, r; 3320 3321 for (i = 0; i < adev->num_ip_blocks; i++) { 3322 if (!adev->ip_blocks[i].status.hw) 3323 continue; 3324 if (adev->ip_blocks[i].version->funcs->late_init) { 3325 r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]); 3326 if (r) { 3327 DRM_ERROR("late_init of IP block <%s> failed %d\n", 3328 adev->ip_blocks[i].version->funcs->name, r); 3329 return r; 3330 } 3331 } 3332 adev->ip_blocks[i].status.late_initialized = true; 3333 } 3334 3335 r = amdgpu_ras_late_init(adev); 3336 if (r) { 3337 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 3338 return r; 3339 } 3340 3341 if (!amdgpu_reset_in_recovery(adev)) 3342 amdgpu_ras_set_error_query_ready(adev, true); 3343 3344 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3345 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3346 3347 amdgpu_device_fill_reset_magic(adev); 3348 3349 r = amdgpu_device_enable_mgpu_fan_boost(); 3350 if (r) 3351 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 3352 3353 /* For passthrough configuration on arcturus and aldebaran, enable special handling for SBR */ 3354 if (amdgpu_passthrough(adev) && 3355 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 3356 adev->asic_type == CHIP_ALDEBARAN)) 3357 amdgpu_dpm_handle_passthrough_sbr(adev, true); 3358 3359 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3360 mutex_lock(&mgpu_info.mutex); 3361 3362 /* 3363 * Reset the device p-state to low, as it was booted with high. 3364 * 3365 * This should be performed only after all devices from the same 3366 * hive get initialized. 3367 * 3368 * However, the number of devices in the hive is not known in advance, 3369 * as it is counted one by one during device initialization. 3370 * 3371 * So we wait until all XGMI interlinked devices are initialized. 3372 * This may bring some delays as those devices may come from 3373 * different hives. But that should be OK.
3374 */ 3375 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3376 for (i = 0; i < mgpu_info.num_gpu; i++) { 3377 gpu_instance = &(mgpu_info.gpu_ins[i]); 3378 if (gpu_instance->adev->flags & AMD_IS_APU) 3379 continue; 3380 3381 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3382 AMDGPU_XGMI_PSTATE_MIN); 3383 if (r) { 3384 DRM_ERROR("pstate setting failed (%d).\n", r); 3385 break; 3386 } 3387 } 3388 } 3389 3390 mutex_unlock(&mgpu_info.mutex); 3391 } 3392 3393 return 0; 3394 } 3395 3396 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block) 3397 { 3398 int r; 3399 3400 if (!ip_block->version->funcs->hw_fini) { 3401 DRM_ERROR("hw_fini of IP block <%s> not defined\n", 3402 ip_block->version->funcs->name); 3403 } else { 3404 r = ip_block->version->funcs->hw_fini(ip_block); 3405 /* XXX handle errors */ 3406 if (r) { 3407 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3408 ip_block->version->funcs->name, r); 3409 } 3410 } 3411 3412 ip_block->status.hw = false; 3413 } 3414 3415 /** 3416 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3417 * 3418 * @adev: amdgpu_device pointer 3419 * 3420 * For ASICs need to disable SMC first 3421 */ 3422 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3423 { 3424 int i; 3425 3426 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3427 return; 3428 3429 for (i = 0; i < adev->num_ip_blocks; i++) { 3430 if (!adev->ip_blocks[i].status.hw) 3431 continue; 3432 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3433 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3434 break; 3435 } 3436 } 3437 } 3438 3439 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3440 { 3441 int i, r; 3442 3443 for (i = 0; i < adev->num_ip_blocks; i++) { 3444 if (!adev->ip_blocks[i].version->funcs->early_fini) 3445 continue; 3446 3447 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]); 3448 if (r) { 3449 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 3450 adev->ip_blocks[i].version->funcs->name, r); 3451 } 3452 } 3453 3454 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3455 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3456 3457 amdgpu_amdkfd_suspend(adev, false); 3458 3459 /* Workaround for ASICs need to disable SMC first */ 3460 amdgpu_device_smu_fini_early(adev); 3461 3462 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3463 if (!adev->ip_blocks[i].status.hw) 3464 continue; 3465 3466 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3467 } 3468 3469 if (amdgpu_sriov_vf(adev)) { 3470 if (amdgpu_virt_release_full_gpu(adev, false)) 3471 DRM_ERROR("failed to release exclusive mode on fini\n"); 3472 } 3473 3474 return 0; 3475 } 3476 3477 /** 3478 * amdgpu_device_ip_fini - run fini for hardware IPs 3479 * 3480 * @adev: amdgpu_device pointer 3481 * 3482 * Main teardown pass for hardware IPs. The list of all the hardware 3483 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3484 * are run. hw_fini tears down the hardware associated with each IP 3485 * and sw_fini tears down any software state associated with each IP. 3486 * Returns 0 on success, negative error code on failure. 
3487 */ 3488 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3489 { 3490 int i, r; 3491 3492 amdgpu_cper_fini(adev); 3493 3494 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3495 amdgpu_virt_release_ras_err_handler_data(adev); 3496 3497 if (adev->gmc.xgmi.num_physical_nodes > 1) 3498 amdgpu_xgmi_remove_device(adev); 3499 3500 amdgpu_amdkfd_device_fini_sw(adev); 3501 3502 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3503 if (!adev->ip_blocks[i].status.sw) 3504 continue; 3505 3506 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3507 amdgpu_ucode_free_bo(adev); 3508 amdgpu_free_static_csa(&adev->virt.csa_obj); 3509 amdgpu_device_wb_fini(adev); 3510 amdgpu_device_mem_scratch_fini(adev); 3511 amdgpu_ib_pool_fini(adev); 3512 amdgpu_seq64_fini(adev); 3513 } 3514 if (adev->ip_blocks[i].version->funcs->sw_fini) { 3515 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]); 3516 /* XXX handle errors */ 3517 if (r) { 3518 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 3519 adev->ip_blocks[i].version->funcs->name, r); 3520 } 3521 } 3522 adev->ip_blocks[i].status.sw = false; 3523 adev->ip_blocks[i].status.valid = false; 3524 } 3525 3526 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3527 if (!adev->ip_blocks[i].status.late_initialized) 3528 continue; 3529 if (adev->ip_blocks[i].version->funcs->late_fini) 3530 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]); 3531 adev->ip_blocks[i].status.late_initialized = false; 3532 } 3533 3534 amdgpu_ras_fini(adev); 3535 3536 return 0; 3537 } 3538 3539 /** 3540 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3541 * 3542 * @work: work_struct. 3543 */ 3544 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3545 { 3546 struct amdgpu_device *adev = 3547 container_of(work, struct amdgpu_device, delayed_init_work.work); 3548 int r; 3549 3550 r = amdgpu_ib_ring_tests(adev); 3551 if (r) 3552 DRM_ERROR("ib ring test failed (%d).\n", r); 3553 } 3554 3555 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3556 { 3557 struct amdgpu_device *adev = 3558 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3559 3560 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3561 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3562 3563 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true, 0)) 3564 adev->gfx.gfx_off_state = true; 3565 } 3566 3567 /** 3568 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3569 * 3570 * @adev: amdgpu_device pointer 3571 * 3572 * Main suspend function for hardware IPs. The list of all the hardware 3573 * IPs that make up the asic is walked, clockgating is disabled and the 3574 * suspend callbacks are run. suspend puts the hardware and software state 3575 * in each IP into a state suitable for suspend. 3576 * Returns 0 on success, negative error code on failure. 3577 */ 3578 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3579 { 3580 int i, r; 3581 3582 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3583 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3584 3585 /* 3586 * Per PMFW team's suggestion, driver needs to handle gfxoff 3587 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3588 * scenario. Add the missing df cstate disablement here. 
3589 */ 3590 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3591 dev_warn(adev->dev, "Failed to disallow df cstate"); 3592 3593 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3594 if (!adev->ip_blocks[i].status.valid) 3595 continue; 3596 3597 /* displays are handled separately */ 3598 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3599 continue; 3600 3601 /* XXX handle errors */ 3602 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3603 if (r) 3604 return r; 3605 } 3606 3607 return 0; 3608 } 3609 3610 /** 3611 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3612 * 3613 * @adev: amdgpu_device pointer 3614 * 3615 * Main suspend function for hardware IPs. The list of all the hardware 3616 * IPs that make up the asic is walked, clockgating is disabled and the 3617 * suspend callbacks are run. suspend puts the hardware and software state 3618 * in each IP into a state suitable for suspend. 3619 * Returns 0 on success, negative error code on failure. 3620 */ 3621 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3622 { 3623 int i, r; 3624 3625 if (adev->in_s0ix) 3626 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3627 3628 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3629 if (!adev->ip_blocks[i].status.valid) 3630 continue; 3631 /* displays are handled in phase1 */ 3632 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3633 continue; 3634 /* PSP lost connection when err_event_athub occurs */ 3635 if (amdgpu_ras_intr_triggered() && 3636 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3637 adev->ip_blocks[i].status.hw = false; 3638 continue; 3639 } 3640 3641 /* skip unnecessary suspend if we do not initialize them yet */ 3642 if (!amdgpu_ip_member_of_hwini( 3643 adev, adev->ip_blocks[i].version->type)) 3644 continue; 3645 3646 /* skip suspend of gfx/mes and psp for S0ix 3647 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3648 * like at runtime. PSP is also part of the always on hardware 3649 * so no need to suspend it. 3650 */ 3651 if (adev->in_s0ix && 3652 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3653 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3654 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3655 continue; 3656 3657 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3658 if (adev->in_s0ix && 3659 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3660 IP_VERSION(5, 0, 0)) && 3661 (adev->ip_blocks[i].version->type == 3662 AMD_IP_BLOCK_TYPE_SDMA)) 3663 continue; 3664 3665 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3666 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3667 * from this location and RLC Autoload automatically also gets loaded 3668 * from here based on PMFW -> PSP message during re-init sequence. 3669 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3670 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3671 */ 3672 if (amdgpu_in_reset(adev) && 3673 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3674 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3675 continue; 3676 3677 /* XXX handle errors */ 3678 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3679 adev->ip_blocks[i].status.hw = false; 3680 3681 /* handle putting the SMC in the appropriate state */ 3682 if (!amdgpu_sriov_vf(adev)) { 3683 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3684 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3685 if (r) { 3686 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3687 adev->mp1_state, r); 3688 return r; 3689 } 3690 } 3691 } 3692 } 3693 3694 return 0; 3695 } 3696 3697 /** 3698 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3699 * 3700 * @adev: amdgpu_device pointer 3701 * 3702 * Main suspend function for hardware IPs. The list of all the hardware 3703 * IPs that make up the asic is walked, clockgating is disabled and the 3704 * suspend callbacks are run. suspend puts the hardware and software state 3705 * in each IP into a state suitable for suspend. 3706 * Returns 0 on success, negative error code on failure. 3707 */ 3708 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3709 { 3710 int r; 3711 3712 if (amdgpu_sriov_vf(adev)) { 3713 amdgpu_virt_fini_data_exchange(adev); 3714 amdgpu_virt_request_full_gpu(adev, false); 3715 } 3716 3717 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3718 3719 r = amdgpu_device_ip_suspend_phase1(adev); 3720 if (r) 3721 return r; 3722 r = amdgpu_device_ip_suspend_phase2(adev); 3723 3724 if (amdgpu_sriov_vf(adev)) 3725 amdgpu_virt_release_full_gpu(adev, false); 3726 3727 return r; 3728 } 3729 3730 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3731 { 3732 int i, r; 3733 3734 static enum amd_ip_block_type ip_order[] = { 3735 AMD_IP_BLOCK_TYPE_COMMON, 3736 AMD_IP_BLOCK_TYPE_GMC, 3737 AMD_IP_BLOCK_TYPE_PSP, 3738 AMD_IP_BLOCK_TYPE_IH, 3739 }; 3740 3741 for (i = 0; i < adev->num_ip_blocks; i++) { 3742 int j; 3743 struct amdgpu_ip_block *block; 3744 3745 block = &adev->ip_blocks[i]; 3746 block->status.hw = false; 3747 3748 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3749 3750 if (block->version->type != ip_order[j] || 3751 !block->status.valid) 3752 continue; 3753 3754 r = block->version->funcs->hw_init(&adev->ip_blocks[i]); 3755 if (r) { 3756 dev_err(adev->dev, "RE-INIT-early: %s failed\n", 3757 block->version->funcs->name); 3758 return r; 3759 } 3760 block->status.hw = true; 3761 } 3762 } 3763 3764 return 0; 3765 } 3766 3767 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3768 { 3769 struct amdgpu_ip_block *block; 3770 int i, r = 0; 3771 3772 static enum amd_ip_block_type ip_order[] = { 3773 AMD_IP_BLOCK_TYPE_SMC, 3774 AMD_IP_BLOCK_TYPE_DCE, 3775 AMD_IP_BLOCK_TYPE_GFX, 3776 AMD_IP_BLOCK_TYPE_SDMA, 3777 AMD_IP_BLOCK_TYPE_MES, 3778 AMD_IP_BLOCK_TYPE_UVD, 3779 AMD_IP_BLOCK_TYPE_VCE, 3780 AMD_IP_BLOCK_TYPE_VCN, 3781 AMD_IP_BLOCK_TYPE_JPEG 3782 }; 3783 3784 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3785 block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]); 3786 3787 if (!block) 3788 continue; 3789 3790 if (block->status.valid && !block->status.hw) { 3791 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) { 3792 r = amdgpu_ip_block_resume(block); 3793 } else { 3794 r = block->version->funcs->hw_init(block); 3795 } 3796 3797 if (r) { 3798 dev_err(adev->dev, "RE-INIT-late: %s failed\n", 3799 block->version->funcs->name); 3800 break; 3801 } 3802 
block->status.hw = true; 3803 } 3804 } 3805 3806 return r; 3807 } 3808 3809 /** 3810 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3811 * 3812 * @adev: amdgpu_device pointer 3813 * 3814 * First resume function for hardware IPs. The list of all the hardware 3815 * IPs that make up the asic is walked and the resume callbacks are run for 3816 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3817 * after a suspend and updates the software state as necessary. This 3818 * function is also used for restoring the GPU after a GPU reset. 3819 * Returns 0 on success, negative error code on failure. 3820 */ 3821 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3822 { 3823 int i, r; 3824 3825 for (i = 0; i < adev->num_ip_blocks; i++) { 3826 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3827 continue; 3828 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3829 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3830 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3831 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3832 3833 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3834 if (r) 3835 return r; 3836 } 3837 } 3838 3839 return 0; 3840 } 3841 3842 /** 3843 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3844 * 3845 * @adev: amdgpu_device pointer 3846 * 3847 * Second resume function for hardware IPs. The list of all the hardware 3848 * IPs that make up the asic is walked and the resume callbacks are run for 3849 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3850 * functional state after a suspend and updates the software state as 3851 * necessary. This function is also used for restoring the GPU after a GPU 3852 * reset. 3853 * Returns 0 on success, negative error code on failure. 3854 */ 3855 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3856 { 3857 int i, r; 3858 3859 for (i = 0; i < adev->num_ip_blocks; i++) { 3860 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3861 continue; 3862 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3863 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3864 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3865 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE || 3866 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3867 continue; 3868 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3869 if (r) 3870 return r; 3871 } 3872 3873 return 0; 3874 } 3875 3876 /** 3877 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs 3878 * 3879 * @adev: amdgpu_device pointer 3880 * 3881 * Third resume function for hardware IPs. The list of all the hardware 3882 * IPs that make up the asic is walked and the resume callbacks are run for 3883 * all DCE. resume puts the hardware into a functional state after a suspend 3884 * and updates the software state as necessary. This function is also used 3885 * for restoring the GPU after a GPU reset. 3886 * 3887 * Returns 0 on success, negative error code on failure. 
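 *
 * Keeping DCE in its own, final phase means displays are brought back only
 * after the buffer functions and the fence driver have been re-enabled in
 * amdgpu_device_ip_resume() below; presumably this avoids the display code
 * touching rings that are not yet operational.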
3888 */ 3889 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev) 3890 { 3891 int i, r; 3892 3893 for (i = 0; i < adev->num_ip_blocks; i++) { 3894 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3895 continue; 3896 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) { 3897 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3898 if (r) 3899 return r; 3900 } 3901 } 3902 3903 return 0; 3904 } 3905 3906 /** 3907 * amdgpu_device_ip_resume - run resume for hardware IPs 3908 * 3909 * @adev: amdgpu_device pointer 3910 * 3911 * Main resume function for hardware IPs. The hardware IPs 3912 * are split into two resume functions because they are 3913 * also used in recovering from a GPU reset and some additional 3914 * steps need to be take between them. In this case (S3/S4) they are 3915 * run sequentially. 3916 * Returns 0 on success, negative error code on failure. 3917 */ 3918 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3919 { 3920 int r; 3921 3922 r = amdgpu_device_ip_resume_phase1(adev); 3923 if (r) 3924 return r; 3925 3926 r = amdgpu_device_fw_loading(adev); 3927 if (r) 3928 return r; 3929 3930 r = amdgpu_device_ip_resume_phase2(adev); 3931 3932 if (adev->mman.buffer_funcs_ring->sched.ready) 3933 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3934 3935 if (r) 3936 return r; 3937 3938 amdgpu_fence_driver_hw_init(adev); 3939 3940 r = amdgpu_device_ip_resume_phase3(adev); 3941 3942 return r; 3943 } 3944 3945 /** 3946 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3947 * 3948 * @adev: amdgpu_device pointer 3949 * 3950 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3951 */ 3952 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3953 { 3954 if (amdgpu_sriov_vf(adev)) { 3955 if (adev->is_atom_fw) { 3956 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3957 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3958 } else { 3959 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3960 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3961 } 3962 3963 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3964 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3965 } 3966 } 3967 3968 /** 3969 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3970 * 3971 * @asic_type: AMD asic type 3972 * 3973 * Check if there is DC (new modesetting infrastructre) support for an asic. 3974 * returns true if DC has support, false if not. 3975 */ 3976 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 3977 { 3978 switch (asic_type) { 3979 #ifdef CONFIG_DRM_AMDGPU_SI 3980 case CHIP_HAINAN: 3981 #endif 3982 case CHIP_TOPAZ: 3983 /* chips with no display hardware */ 3984 return false; 3985 #if defined(CONFIG_DRM_AMD_DC) 3986 case CHIP_TAHITI: 3987 case CHIP_PITCAIRN: 3988 case CHIP_VERDE: 3989 case CHIP_OLAND: 3990 /* 3991 * We have systems in the wild with these ASICs that require 3992 * LVDS and VGA support which is not supported with DC. 3993 * 3994 * Fallback to the non-DC driver here by default so as not to 3995 * cause regressions. 3996 */ 3997 #if defined(CONFIG_DRM_AMD_DC_SI) 3998 return amdgpu_dc > 0; 3999 #else 4000 return false; 4001 #endif 4002 case CHIP_BONAIRE: 4003 case CHIP_KAVERI: 4004 case CHIP_KABINI: 4005 case CHIP_MULLINS: 4006 /* 4007 * We have systems in the wild with these ASICs that require 4008 * VGA support which is not supported with DC. 
4009 * 4010 * Fallback to the non-DC driver here by default so as not to 4011 * cause regressions. 4012 */ 4013 return amdgpu_dc > 0; 4014 default: 4015 return amdgpu_dc != 0; 4016 #else 4017 default: 4018 if (amdgpu_dc > 0) 4019 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 4020 return false; 4021 #endif 4022 } 4023 } 4024 4025 /** 4026 * amdgpu_device_has_dc_support - check if dc is supported 4027 * 4028 * @adev: amdgpu_device pointer 4029 * 4030 * Returns true for supported, false for not supported 4031 */ 4032 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 4033 { 4034 if (adev->enable_virtual_display || 4035 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 4036 return false; 4037 4038 return amdgpu_device_asic_has_dc_support(adev->asic_type); 4039 } 4040 4041 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 4042 { 4043 struct amdgpu_device *adev = 4044 container_of(__work, struct amdgpu_device, xgmi_reset_work); 4045 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 4046 4047 /* It's a bug to not have a hive within this function */ 4048 if (WARN_ON(!hive)) 4049 return; 4050 4051 /* 4052 * Use task barrier to synchronize all xgmi reset works across the 4053 * hive. task_barrier_enter and task_barrier_exit will block 4054 * until all the threads running the xgmi reset works reach 4055 * those points. task_barrier_full will do both blocks. 4056 */ 4057 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 4058 4059 task_barrier_enter(&hive->tb); 4060 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 4061 4062 if (adev->asic_reset_res) 4063 goto fail; 4064 4065 task_barrier_exit(&hive->tb); 4066 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 4067 4068 if (adev->asic_reset_res) 4069 goto fail; 4070 4071 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 4072 } else { 4073 4074 task_barrier_full(&hive->tb); 4075 adev->asic_reset_res = amdgpu_asic_reset(adev); 4076 } 4077 4078 fail: 4079 if (adev->asic_reset_res) 4080 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 4081 adev->asic_reset_res, adev_to_drm(adev)->unique); 4082 amdgpu_put_xgmi_hive(hive); 4083 } 4084 4085 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 4086 { 4087 char *input = amdgpu_lockup_timeout; 4088 char *timeout_setting = NULL; 4089 int index = 0; 4090 long timeout; 4091 int ret = 0; 4092 4093 /* 4094 * By default timeout for non compute jobs is 10000 4095 * and 60000 for compute jobs. 4096 * In SR-IOV or passthrough mode, timeout for compute 4097 * jobs are 60000 by default. 4098 */ 4099 adev->gfx_timeout = msecs_to_jiffies(10000); 4100 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4101 if (amdgpu_sriov_vf(adev)) 4102 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
			msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
	else
		adev->compute_timeout = msecs_to_jiffies(60000);

	if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
		while ((timeout_setting = strsep(&input, ",")) &&
		       strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
			ret = kstrtol(timeout_setting, 0, &timeout);
			if (ret)
				return ret;

			if (timeout == 0) {
				index++;
				continue;
			} else if (timeout < 0) {
				timeout = MAX_SCHEDULE_TIMEOUT;
				dev_warn(adev->dev, "lockup timeout disabled");
				add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
			} else {
				timeout = msecs_to_jiffies(timeout);
			}

			switch (index++) {
			case 0:
				adev->gfx_timeout = timeout;
				break;
			case 1:
				adev->compute_timeout = timeout;
				break;
			case 2:
				adev->sdma_timeout = timeout;
				break;
			case 3:
				adev->video_timeout = timeout;
				break;
			default:
				break;
			}
		}
		/*
		 * If only one value was specified, it applies to all
		 * non-compute jobs.
		 */
		if (index == 1) {
			adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
			if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
				adev->compute_timeout = adev->gfx_timeout;
		}
	}

	return ret;
}

/**
 * amdgpu_device_check_iommu_direct_map - check if RAM is direct mapped to the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * RAM is direct mapped to the GPU if the IOMMU is not enabled or is in
 * pass-through mode.
 */
static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
{
	struct iommu_domain *domain;

	domain = iommu_get_domain_for_dev(adev->dev);
	if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
		adev->ram_is_direct_mapped = true;
}

#if defined(CONFIG_HSA_AMD_P2P)
/**
 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled.
 *
 * @adev: amdgpu_device pointer
 *
 * Returns true if the IOMMU is remapping DMA (BAR) addresses, false otherwise.
 */
static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev)
{
	struct iommu_domain *domain;

	domain = iommu_get_domain_for_dev(adev->dev);
	if (domain && (domain->type == IOMMU_DOMAIN_DMA ||
		       domain->type == IOMMU_DOMAIN_DMA_FQ))
		return true;

	return false;
}
#endif

static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
{
	if (amdgpu_mcbp == 1)
		adev->gfx.mcbp = true;
	else if (amdgpu_mcbp == 0)
		adev->gfx.mcbp = false;

	if (amdgpu_sriov_vf(adev))
		adev->gfx.mcbp = true;

	if (adev->gfx.mcbp)
		DRM_INFO("MCBP is enabled\n");
}

/**
 * amdgpu_device_init - initialize the driver
 *
 * @adev: amdgpu_device pointer
 * @flags: driver flags
 *
 * Initializes the driver info and hw (all asics).
 * Returns 0 for success or an error on failure.
 * Called at driver startup.
4216 */ 4217 int amdgpu_device_init(struct amdgpu_device *adev, 4218 uint32_t flags) 4219 { 4220 struct drm_device *ddev = adev_to_drm(adev); 4221 struct pci_dev *pdev = adev->pdev; 4222 int r, i; 4223 bool px = false; 4224 u32 max_MBps; 4225 int tmp; 4226 4227 adev->shutdown = false; 4228 adev->flags = flags; 4229 4230 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 4231 adev->asic_type = amdgpu_force_asic_type; 4232 else 4233 adev->asic_type = flags & AMD_ASIC_MASK; 4234 4235 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 4236 if (amdgpu_emu_mode == 1) 4237 adev->usec_timeout *= 10; 4238 adev->gmc.gart_size = 512 * 1024 * 1024; 4239 adev->accel_working = false; 4240 adev->num_rings = 0; 4241 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 4242 adev->mman.buffer_funcs = NULL; 4243 adev->mman.buffer_funcs_ring = NULL; 4244 adev->vm_manager.vm_pte_funcs = NULL; 4245 adev->vm_manager.vm_pte_num_scheds = 0; 4246 adev->gmc.gmc_funcs = NULL; 4247 adev->harvest_ip_mask = 0x0; 4248 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 4249 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 4250 4251 adev->smc_rreg = &amdgpu_invalid_rreg; 4252 adev->smc_wreg = &amdgpu_invalid_wreg; 4253 adev->pcie_rreg = &amdgpu_invalid_rreg; 4254 adev->pcie_wreg = &amdgpu_invalid_wreg; 4255 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 4256 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 4257 adev->pciep_rreg = &amdgpu_invalid_rreg; 4258 adev->pciep_wreg = &amdgpu_invalid_wreg; 4259 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 4260 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 4261 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 4262 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 4263 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 4264 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 4265 adev->didt_rreg = &amdgpu_invalid_rreg; 4266 adev->didt_wreg = &amdgpu_invalid_wreg; 4267 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 4268 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 4269 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 4270 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 4271 4272 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 4273 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 4274 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 4275 4276 /* mutex initialization are all done here so we 4277 * can recall function without having locking issues 4278 */ 4279 mutex_init(&adev->firmware.mutex); 4280 mutex_init(&adev->pm.mutex); 4281 mutex_init(&adev->gfx.gpu_clock_mutex); 4282 mutex_init(&adev->srbm_mutex); 4283 mutex_init(&adev->gfx.pipe_reserve_mutex); 4284 mutex_init(&adev->gfx.gfx_off_mutex); 4285 mutex_init(&adev->gfx.partition_mutex); 4286 mutex_init(&adev->grbm_idx_mutex); 4287 mutex_init(&adev->mn_lock); 4288 mutex_init(&adev->virt.vf_errors.lock); 4289 hash_init(adev->mn_hash); 4290 mutex_init(&adev->psp.mutex); 4291 mutex_init(&adev->notifier_lock); 4292 mutex_init(&adev->pm.stable_pstate_ctx_lock); 4293 mutex_init(&adev->benchmark_mutex); 4294 mutex_init(&adev->gfx.reset_sem_mutex); 4295 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */ 4296 mutex_init(&adev->enforce_isolation_mutex); 4297 for (i = 0; i < MAX_XCP; ++i) { 4298 adev->isolation[i].spearhead = dma_fence_get_stub(); 4299 amdgpu_sync_create(&adev->isolation[i].active); 4300 amdgpu_sync_create(&adev->isolation[i].prev); 4301 } 4302 mutex_init(&adev->gfx.kfd_sch_mutex); 
	mutex_init(&adev->gfx.workload_profile_mutex);
	mutex_init(&adev->vcn.workload_profile_mutex);

	amdgpu_device_init_apu_flags(adev);

	r = amdgpu_device_check_arguments(adev);
	if (r)
		return r;

	spin_lock_init(&adev->mmio_idx_lock);
	spin_lock_init(&adev->smc_idx_lock);
	spin_lock_init(&adev->pcie_idx_lock);
	spin_lock_init(&adev->uvd_ctx_idx_lock);
	spin_lock_init(&adev->didt_idx_lock);
	spin_lock_init(&adev->gc_cac_idx_lock);
	spin_lock_init(&adev->se_cac_idx_lock);
	spin_lock_init(&adev->audio_endpt_idx_lock);
	spin_lock_init(&adev->mm_stats.lock);
	spin_lock_init(&adev->virt.rlcg_reg_lock);
	spin_lock_init(&adev->wb.lock);

	INIT_LIST_HEAD(&adev->reset_list);

	INIT_LIST_HEAD(&adev->ras_list);

	INIT_LIST_HEAD(&adev->pm.od_kobj_list);

	INIT_DELAYED_WORK(&adev->delayed_init_work,
			  amdgpu_device_delayed_init_work_handler);
	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
			  amdgpu_device_delay_enable_gfx_off);
	/*
	 * Initialize the enforce_isolation work structures for each XCP
	 * partition. This work handler is responsible for enforcing shader
	 * isolation on AMD GPUs. It counts the number of emitted fences for
	 * each GFX and compute ring. If there are any fences, it schedules
	 * the `enforce_isolation_work` to be run after a delay. If there are
	 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the
	 * runqueue.
	 */
	for (i = 0; i < MAX_XCP; i++) {
		INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work,
				  amdgpu_gfx_enforce_isolation_handler);
		adev->gfx.enforce_isolation[i].adev = adev;
		adev->gfx.enforce_isolation[i].xcp_id = i;
	}

	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);

	adev->gfx.gfx_off_req_count = 1;
	adev->gfx.gfx_off_residency = 0;
	adev->gfx.gfx_off_entrycount = 0;
	adev->pm.ac_power = power_supply_is_system_supplied() > 0;

	atomic_set(&adev->throttling_logging_enabled, 1);
	/*
	 * If throttling continues, logging will be performed every minute
	 * to avoid log flooding. "-1" is subtracted since the thermal
	 * throttling interrupt comes every second. Thus, the total logging
	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
	 * for the throttling interrupt) = 60 seconds.
 */
	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);

	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);

	/* Registers mapping */
	/* TODO: block userspace mapping of io register */
	if (adev->asic_type >= CHIP_BONAIRE) {
		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
	} else {
		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
	}

	for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
		atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);

	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
	if (!adev->rmmio)
		return -ENOMEM;

	DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
	DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);

	/*
	 * The reset domain needs to be present early, before the XGMI hive is
	 * discovered (if any) and initialized, so that the reset semaphore and
	 * in_gpu_reset flag can be used early during init and before the first
	 * RREG32 call.
	 */
	adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
	if (!adev->reset_domain)
		return -ENOMEM;

	/* detect hw virtualization here */
	amdgpu_virt_init(adev);

	amdgpu_device_get_pcie_info(adev);

	r = amdgpu_device_get_job_timeout_settings(adev);
	if (r) {
		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
		return r;
	}

	amdgpu_device_set_mcbp(adev);

	/*
	 * By default, use the default init level where all blocks are expected
	 * to be initialized. At present a 'swinit' of blocks is required to be
	 * completed before the need for a different level is detected.
	 */
	amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT);
	/* early init functions */
	r = amdgpu_device_ip_early_init(adev);
	if (r)
		return r;

	/*
	 * No need to remove conflicting FBs for non-display class devices.
	 * This prevents the sysfb from being freed accidentally.
	 */
	if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA ||
	    (pdev->class >> 8) == PCI_CLASS_DISPLAY_OTHER) {
		/* Get rid of things like offb */
		r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name);
		if (r)
			return r;
	}

	/* Enable TMZ based on IP_VERSION */
	amdgpu_gmc_tmz_set(adev);

	if (amdgpu_sriov_vf(adev) &&
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
		/* VF MMIO access (except the mailbox range) from the CPU
		 * will be blocked during SR-IOV runtime
		 */
		adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;

	amdgpu_gmc_noretry_set(adev);
	/* Need to get xgmi info early to decide the reset behavior */
	if (adev->gmc.xgmi.supported) {
		r = adev->gfxhub.funcs->get_xgmi_info(adev);
		if (r)
			return r;
	}

	/* enable PCIE atomic ops */
	if (amdgpu_sriov_vf(adev)) {
		if (adev->virt.fw_reserve.p_pf2vf)
			adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
				adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
				(PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
		/* APUs with gfx9 and newer do not rely on PCIe atomics; their
		 * internal path natively supports atomics, so set
		 * have_atomics_support to true.
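		 * For bare-metal devices the capability is instead probed via
		 * pci_enable_atomic_ops_to_root() below.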
4460 */ 4461 } else if ((adev->flags & AMD_IS_APU) && 4462 (amdgpu_ip_version(adev, GC_HWIP, 0) > 4463 IP_VERSION(9, 0, 0))) { 4464 adev->have_atomics_support = true; 4465 } else { 4466 adev->have_atomics_support = 4467 !pci_enable_atomic_ops_to_root(adev->pdev, 4468 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 4469 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4470 } 4471 4472 if (!adev->have_atomics_support) 4473 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 4474 4475 /* doorbell bar mapping and doorbell index init*/ 4476 amdgpu_doorbell_init(adev); 4477 4478 if (amdgpu_emu_mode == 1) { 4479 /* post the asic on emulation mode */ 4480 emu_soc_asic_init(adev); 4481 goto fence_driver_init; 4482 } 4483 4484 amdgpu_reset_init(adev); 4485 4486 /* detect if we are with an SRIOV vbios */ 4487 if (adev->bios) 4488 amdgpu_device_detect_sriov_bios(adev); 4489 4490 /* check if we need to reset the asic 4491 * E.g., driver was not cleanly unloaded previously, etc. 4492 */ 4493 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4494 if (adev->gmc.xgmi.num_physical_nodes) { 4495 dev_info(adev->dev, "Pending hive reset.\n"); 4496 amdgpu_set_init_level(adev, 4497 AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 4498 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && 4499 !amdgpu_device_has_display_hardware(adev)) { 4500 r = psp_gpu_reset(adev); 4501 } else { 4502 tmp = amdgpu_reset_method; 4503 /* It should do a default reset when loading or reloading the driver, 4504 * regardless of the module parameter reset_method. 4505 */ 4506 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4507 r = amdgpu_asic_reset(adev); 4508 amdgpu_reset_method = tmp; 4509 } 4510 4511 if (r) { 4512 dev_err(adev->dev, "asic reset on init failed\n"); 4513 goto failed; 4514 } 4515 } 4516 4517 /* Post card if necessary */ 4518 if (amdgpu_device_need_post(adev)) { 4519 if (!adev->bios) { 4520 dev_err(adev->dev, "no vBIOS found\n"); 4521 r = -EINVAL; 4522 goto failed; 4523 } 4524 DRM_INFO("GPU posting now...\n"); 4525 r = amdgpu_device_asic_init(adev); 4526 if (r) { 4527 dev_err(adev->dev, "gpu post error!\n"); 4528 goto failed; 4529 } 4530 } 4531 4532 if (adev->bios) { 4533 if (adev->is_atom_fw) { 4534 /* Initialize clocks */ 4535 r = amdgpu_atomfirmware_get_clock_info(adev); 4536 if (r) { 4537 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4538 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4539 goto failed; 4540 } 4541 } else { 4542 /* Initialize clocks */ 4543 r = amdgpu_atombios_get_clock_info(adev); 4544 if (r) { 4545 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4546 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4547 goto failed; 4548 } 4549 /* init i2c buses */ 4550 amdgpu_i2c_init(adev); 4551 } 4552 } 4553 4554 fence_driver_init: 4555 /* Fence driver */ 4556 r = amdgpu_fence_driver_sw_init(adev); 4557 if (r) { 4558 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4559 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4560 goto failed; 4561 } 4562 4563 /* init the mode config */ 4564 drm_mode_config_init(adev_to_drm(adev)); 4565 4566 r = amdgpu_device_ip_init(adev); 4567 if (r) { 4568 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4569 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4570 goto release_ras_con; 4571 } 4572 4573 amdgpu_fence_driver_hw_init(adev); 4574 4575 dev_info(adev->dev, 4576 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4577 
adev->gfx.config.max_shader_engines, 4578 adev->gfx.config.max_sh_per_se, 4579 adev->gfx.config.max_cu_per_sh, 4580 adev->gfx.cu_info.number); 4581 4582 adev->accel_working = true; 4583 4584 amdgpu_vm_check_compute_bug(adev); 4585 4586 /* Initialize the buffer migration limit. */ 4587 if (amdgpu_moverate >= 0) 4588 max_MBps = amdgpu_moverate; 4589 else 4590 max_MBps = 8; /* Allow 8 MB/s. */ 4591 /* Get a log2 for easy divisions. */ 4592 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4593 4594 /* 4595 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4596 * Otherwise the mgpu fan boost feature will be skipped due to the 4597 * gpu instance is counted less. 4598 */ 4599 amdgpu_register_gpu_instance(adev); 4600 4601 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4602 * explicit gating rather than handling it automatically. 4603 */ 4604 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 4605 r = amdgpu_device_ip_late_init(adev); 4606 if (r) { 4607 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4608 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4609 goto release_ras_con; 4610 } 4611 /* must succeed. */ 4612 amdgpu_ras_resume(adev); 4613 queue_delayed_work(system_wq, &adev->delayed_init_work, 4614 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4615 } 4616 4617 if (amdgpu_sriov_vf(adev)) { 4618 amdgpu_virt_release_full_gpu(adev, true); 4619 flush_delayed_work(&adev->delayed_init_work); 4620 } 4621 4622 /* 4623 * Place those sysfs registering after `late_init`. As some of those 4624 * operations performed in `late_init` might affect the sysfs 4625 * interfaces creating. 4626 */ 4627 r = amdgpu_atombios_sysfs_init(adev); 4628 if (r) 4629 drm_err(&adev->ddev, 4630 "registering atombios sysfs failed (%d).\n", r); 4631 4632 r = amdgpu_pm_sysfs_init(adev); 4633 if (r) 4634 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 4635 4636 r = amdgpu_ucode_sysfs_init(adev); 4637 if (r) { 4638 adev->ucode_sysfs_en = false; 4639 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 4640 } else 4641 adev->ucode_sysfs_en = true; 4642 4643 r = amdgpu_device_attr_sysfs_init(adev); 4644 if (r) 4645 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4646 4647 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4648 if (r) 4649 dev_err(adev->dev, 4650 "Could not create amdgpu board attributes\n"); 4651 4652 amdgpu_fru_sysfs_init(adev); 4653 amdgpu_reg_state_sysfs_init(adev); 4654 amdgpu_xcp_cfg_sysfs_init(adev); 4655 4656 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4657 r = amdgpu_pmu_init(adev); 4658 if (r) 4659 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4660 4661 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4662 if (amdgpu_device_cache_pci_state(adev->pdev)) 4663 pci_restore_state(pdev); 4664 4665 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4666 /* this will fail for cards that aren't VGA class devices, just 4667 * ignore it 4668 */ 4669 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4670 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4671 4672 px = amdgpu_device_supports_px(ddev); 4673 4674 if (px || (!dev_is_removable(&adev->pdev->dev) && 4675 apple_gmux_detect(NULL, NULL))) 4676 vga_switcheroo_register_client(adev->pdev, 4677 &amdgpu_switcheroo_ops, px); 4678 4679 if (px) 4680 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4681 4682 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 4683 
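		/* A hive reset was deferred earlier ("Pending hive reset"
		 * above); with only the minimal set of XGMI blocks brought up,
		 * reset the whole hive now.
		 */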
amdgpu_xgmi_reset_on_init(adev); 4684 4685 amdgpu_device_check_iommu_direct_map(adev); 4686 4687 adev->pm_nb.notifier_call = amdgpu_device_pm_notifier; 4688 r = register_pm_notifier(&adev->pm_nb); 4689 if (r) 4690 goto failed; 4691 4692 return 0; 4693 4694 release_ras_con: 4695 if (amdgpu_sriov_vf(adev)) 4696 amdgpu_virt_release_full_gpu(adev, true); 4697 4698 /* failed in exclusive mode due to timeout */ 4699 if (amdgpu_sriov_vf(adev) && 4700 !amdgpu_sriov_runtime(adev) && 4701 amdgpu_virt_mmio_blocked(adev) && 4702 !amdgpu_virt_wait_reset(adev)) { 4703 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4704 /* Don't send request since VF is inactive. */ 4705 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4706 adev->virt.ops = NULL; 4707 r = -EAGAIN; 4708 } 4709 amdgpu_release_ras_context(adev); 4710 4711 failed: 4712 amdgpu_vf_error_trans_all(adev); 4713 4714 return r; 4715 } 4716 4717 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4718 { 4719 4720 /* Clear all CPU mappings pointing to this device */ 4721 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4722 4723 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4724 amdgpu_doorbell_fini(adev); 4725 4726 iounmap(adev->rmmio); 4727 adev->rmmio = NULL; 4728 if (adev->mman.aper_base_kaddr) 4729 iounmap(adev->mman.aper_base_kaddr); 4730 adev->mman.aper_base_kaddr = NULL; 4731 4732 /* Memory manager related */ 4733 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4734 arch_phys_wc_del(adev->gmc.vram_mtrr); 4735 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4736 } 4737 } 4738 4739 /** 4740 * amdgpu_device_fini_hw - tear down the driver 4741 * 4742 * @adev: amdgpu_device pointer 4743 * 4744 * Tear down the driver info (all asics). 4745 * Called at driver shutdown. 
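 *
 * Hardware is quiesced here (interrupts, displays, fences); the remaining
 * software state is freed afterwards in amdgpu_device_fini_sw().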
4746 */ 4747 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4748 { 4749 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4750 flush_delayed_work(&adev->delayed_init_work); 4751 4752 if (adev->mman.initialized) 4753 drain_workqueue(adev->mman.bdev.wq); 4754 adev->shutdown = true; 4755 4756 unregister_pm_notifier(&adev->pm_nb); 4757 4758 /* make sure IB test finished before entering exclusive mode 4759 * to avoid preemption on IB test 4760 */ 4761 if (amdgpu_sriov_vf(adev)) { 4762 amdgpu_virt_request_full_gpu(adev, false); 4763 amdgpu_virt_fini_data_exchange(adev); 4764 } 4765 4766 /* disable all interrupts */ 4767 amdgpu_irq_disable_all(adev); 4768 if (adev->mode_info.mode_config_initialized) { 4769 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4770 drm_helper_force_disable_all(adev_to_drm(adev)); 4771 else 4772 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4773 } 4774 amdgpu_fence_driver_hw_fini(adev); 4775 4776 if (adev->pm.sysfs_initialized) 4777 amdgpu_pm_sysfs_fini(adev); 4778 if (adev->ucode_sysfs_en) 4779 amdgpu_ucode_sysfs_fini(adev); 4780 amdgpu_device_attr_sysfs_fini(adev); 4781 amdgpu_fru_sysfs_fini(adev); 4782 4783 amdgpu_reg_state_sysfs_fini(adev); 4784 amdgpu_xcp_cfg_sysfs_fini(adev); 4785 4786 /* disable ras feature must before hw fini */ 4787 amdgpu_ras_pre_fini(adev); 4788 4789 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4790 4791 amdgpu_device_ip_fini_early(adev); 4792 4793 amdgpu_irq_fini_hw(adev); 4794 4795 if (adev->mman.initialized) 4796 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4797 4798 amdgpu_gart_dummy_page_fini(adev); 4799 4800 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4801 amdgpu_device_unmap_mmio(adev); 4802 4803 } 4804 4805 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4806 { 4807 int i, idx; 4808 bool px; 4809 4810 amdgpu_device_ip_fini(adev); 4811 amdgpu_fence_driver_sw_fini(adev); 4812 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4813 adev->accel_working = false; 4814 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4815 for (i = 0; i < MAX_XCP; ++i) { 4816 dma_fence_put(adev->isolation[i].spearhead); 4817 amdgpu_sync_free(&adev->isolation[i].active); 4818 amdgpu_sync_free(&adev->isolation[i].prev); 4819 } 4820 4821 amdgpu_reset_fini(adev); 4822 4823 /* free i2c buses */ 4824 amdgpu_i2c_fini(adev); 4825 4826 if (adev->bios) { 4827 if (amdgpu_emu_mode != 1) 4828 amdgpu_atombios_fini(adev); 4829 amdgpu_bios_release(adev); 4830 } 4831 4832 kfree(adev->fru_info); 4833 adev->fru_info = NULL; 4834 4835 kfree(adev->xcp_mgr); 4836 adev->xcp_mgr = NULL; 4837 4838 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4839 4840 if (px || (!dev_is_removable(&adev->pdev->dev) && 4841 apple_gmux_detect(NULL, NULL))) 4842 vga_switcheroo_unregister_client(adev->pdev); 4843 4844 if (px) 4845 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4846 4847 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4848 vga_client_unregister(adev->pdev); 4849 4850 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4851 4852 iounmap(adev->rmmio); 4853 adev->rmmio = NULL; 4854 amdgpu_doorbell_fini(adev); 4855 drm_dev_exit(idx); 4856 } 4857 4858 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4859 amdgpu_pmu_fini(adev); 4860 if (adev->mman.discovery_bin) 4861 amdgpu_discovery_fini(adev); 4862 4863 amdgpu_reset_put_reset_domain(adev->reset_domain); 4864 adev->reset_domain = NULL; 4865 4866 kfree(adev->pci_state); 4867 4868 } 4869 4870 /** 4871 * amdgpu_device_evict_resources - evict device resources 4872 * @adev: amdgpu device object 4873 * 4874 * 
Evicts all ttm device resources(vram BOs, gart table) from the lru list 4875 * of the vram memory type. Mainly used for evicting device resources 4876 * at suspend time. 4877 * 4878 */ 4879 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4880 { 4881 int ret; 4882 4883 /* No need to evict vram on APUs unless going to S4 */ 4884 if (!adev->in_s4 && (adev->flags & AMD_IS_APU)) 4885 return 0; 4886 4887 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4888 if (ret) 4889 DRM_WARN("evicting device resources failed\n"); 4890 return ret; 4891 } 4892 4893 /* 4894 * Suspend & resume. 4895 */ 4896 /** 4897 * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events 4898 * @nb: notifier block 4899 * @mode: suspend mode 4900 * @data: data 4901 * 4902 * This function is called when the system is about to suspend or hibernate. 4903 * It is used to evict resources from the device before the system goes to 4904 * sleep while there is still access to swap. 4905 */ 4906 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 4907 void *data) 4908 { 4909 struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb); 4910 int r; 4911 4912 switch (mode) { 4913 case PM_HIBERNATION_PREPARE: 4914 adev->in_s4 = true; 4915 fallthrough; 4916 case PM_SUSPEND_PREPARE: 4917 r = amdgpu_device_evict_resources(adev); 4918 /* 4919 * This is considered non-fatal at this time because 4920 * amdgpu_device_prepare() will also fatally evict resources. 4921 * See https://gitlab.freedesktop.org/drm/amd/-/issues/3781 4922 */ 4923 if (r) 4924 drm_warn(adev_to_drm(adev), "Failed to evict resources, freeze active processes if problems occur: %d\n", r); 4925 break; 4926 } 4927 4928 return NOTIFY_DONE; 4929 } 4930 4931 /** 4932 * amdgpu_device_prepare - prepare for device suspend 4933 * 4934 * @dev: drm dev pointer 4935 * 4936 * Prepare to put the hw in the suspend state (all asics). 4937 * Returns 0 for success or an error on failure. 4938 * Called at driver suspend. 4939 */ 4940 int amdgpu_device_prepare(struct drm_device *dev) 4941 { 4942 struct amdgpu_device *adev = drm_to_adev(dev); 4943 int i, r; 4944 4945 amdgpu_choose_low_power_state(adev); 4946 4947 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4948 return 0; 4949 4950 /* Evict the majority of BOs before starting suspend sequence */ 4951 r = amdgpu_device_evict_resources(adev); 4952 if (r) 4953 goto unprepare; 4954 4955 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 4956 4957 for (i = 0; i < adev->num_ip_blocks; i++) { 4958 if (!adev->ip_blocks[i].status.valid) 4959 continue; 4960 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 4961 continue; 4962 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]); 4963 if (r) 4964 goto unprepare; 4965 } 4966 4967 return 0; 4968 4969 unprepare: 4970 adev->in_s0ix = adev->in_s3 = adev->in_s4 = false; 4971 4972 return r; 4973 } 4974 4975 /** 4976 * amdgpu_device_suspend - initiate device suspend 4977 * 4978 * @dev: drm dev pointer 4979 * @notify_clients: notify in-kernel DRM clients 4980 * 4981 * Puts the hw in the suspend state (all asics). 4982 * Returns 0 for success or an error on failure. 4983 * Called at driver suspend. 
4984 */ 4985 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) 4986 { 4987 struct amdgpu_device *adev = drm_to_adev(dev); 4988 int r = 0; 4989 4990 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4991 return 0; 4992 4993 adev->in_suspend = true; 4994 4995 if (amdgpu_sriov_vf(adev)) { 4996 amdgpu_virt_fini_data_exchange(adev); 4997 r = amdgpu_virt_request_full_gpu(adev, false); 4998 if (r) 4999 return r; 5000 } 5001 5002 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 5003 DRM_WARN("smart shift update failed\n"); 5004 5005 if (notify_clients) 5006 drm_client_dev_suspend(adev_to_drm(adev), false); 5007 5008 cancel_delayed_work_sync(&adev->delayed_init_work); 5009 5010 amdgpu_ras_suspend(adev); 5011 5012 amdgpu_device_ip_suspend_phase1(adev); 5013 5014 if (!adev->in_s0ix) 5015 amdgpu_amdkfd_suspend(adev, adev->in_runpm); 5016 5017 r = amdgpu_device_evict_resources(adev); 5018 if (r) 5019 return r; 5020 5021 amdgpu_ttm_set_buffer_funcs_status(adev, false); 5022 5023 amdgpu_fence_driver_hw_fini(adev); 5024 5025 amdgpu_device_ip_suspend_phase2(adev); 5026 5027 if (amdgpu_sriov_vf(adev)) 5028 amdgpu_virt_release_full_gpu(adev, false); 5029 5030 r = amdgpu_dpm_notify_rlc_state(adev, false); 5031 if (r) 5032 return r; 5033 5034 return 0; 5035 } 5036 5037 /** 5038 * amdgpu_device_resume - initiate device resume 5039 * 5040 * @dev: drm dev pointer 5041 * @notify_clients: notify in-kernel DRM clients 5042 * 5043 * Bring the hw back to operating state (all asics). 5044 * Returns 0 for success or an error on failure. 5045 * Called at driver resume. 5046 */ 5047 int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) 5048 { 5049 struct amdgpu_device *adev = drm_to_adev(dev); 5050 int r = 0; 5051 5052 if (amdgpu_sriov_vf(adev)) { 5053 r = amdgpu_virt_request_full_gpu(adev, true); 5054 if (r) 5055 return r; 5056 } 5057 5058 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5059 return 0; 5060 5061 if (adev->in_s0ix) 5062 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 5063 5064 /* post card */ 5065 if (amdgpu_device_need_post(adev)) { 5066 r = amdgpu_device_asic_init(adev); 5067 if (r) 5068 dev_err(adev->dev, "amdgpu asic init failed\n"); 5069 } 5070 5071 r = amdgpu_device_ip_resume(adev); 5072 5073 if (r) { 5074 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 5075 goto exit; 5076 } 5077 5078 if (!adev->in_s0ix) { 5079 r = amdgpu_amdkfd_resume(adev, adev->in_runpm); 5080 if (r) 5081 goto exit; 5082 } 5083 5084 r = amdgpu_device_ip_late_init(adev); 5085 if (r) 5086 goto exit; 5087 5088 queue_delayed_work(system_wq, &adev->delayed_init_work, 5089 msecs_to_jiffies(AMDGPU_RESUME_MS)); 5090 exit: 5091 if (amdgpu_sriov_vf(adev)) { 5092 amdgpu_virt_init_data_exchange(adev); 5093 amdgpu_virt_release_full_gpu(adev, true); 5094 } 5095 5096 if (r) 5097 return r; 5098 5099 /* Make sure IB tests flushed */ 5100 flush_delayed_work(&adev->delayed_init_work); 5101 5102 if (notify_clients) 5103 drm_client_dev_resume(adev_to_drm(adev), false); 5104 5105 amdgpu_ras_resume(adev); 5106 5107 if (adev->mode_info.num_crtc) { 5108 /* 5109 * Most of the connector probing functions try to acquire runtime pm 5110 * refs to ensure that the GPU is powered on when connector polling is 5111 * performed. Since we're calling this from a runtime PM callback, 5112 * trying to acquire rpm refs will cause us to deadlock. 
5113 * 5114 * Since we're guaranteed to be holding the rpm lock, it's safe to 5115 * temporarily disable the rpm helpers so this doesn't deadlock us. 5116 */ 5117 #ifdef CONFIG_PM 5118 dev->dev->power.disable_depth++; 5119 #endif 5120 if (!adev->dc_enabled) 5121 drm_helper_hpd_irq_event(dev); 5122 else 5123 drm_kms_helper_hotplug_event(dev); 5124 #ifdef CONFIG_PM 5125 dev->dev->power.disable_depth--; 5126 #endif 5127 } 5128 adev->in_suspend = false; 5129 5130 if (adev->enable_mes) 5131 amdgpu_mes_self_test(adev); 5132 5133 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 5134 DRM_WARN("smart shift update failed\n"); 5135 5136 return 0; 5137 } 5138 5139 /** 5140 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 5141 * 5142 * @adev: amdgpu_device pointer 5143 * 5144 * The list of all the hardware IPs that make up the asic is walked and 5145 * the check_soft_reset callbacks are run. check_soft_reset determines 5146 * if the asic is still hung or not. 5147 * Returns true if any of the IPs are still in a hung state, false if not. 5148 */ 5149 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 5150 { 5151 int i; 5152 bool asic_hang = false; 5153 5154 if (amdgpu_sriov_vf(adev)) 5155 return true; 5156 5157 if (amdgpu_asic_need_full_reset(adev)) 5158 return true; 5159 5160 for (i = 0; i < adev->num_ip_blocks; i++) { 5161 if (!adev->ip_blocks[i].status.valid) 5162 continue; 5163 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 5164 adev->ip_blocks[i].status.hang = 5165 adev->ip_blocks[i].version->funcs->check_soft_reset( 5166 &adev->ip_blocks[i]); 5167 if (adev->ip_blocks[i].status.hang) { 5168 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 5169 asic_hang = true; 5170 } 5171 } 5172 return asic_hang; 5173 } 5174 5175 /** 5176 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 5177 * 5178 * @adev: amdgpu_device pointer 5179 * 5180 * The list of all the hardware IPs that make up the asic is walked and the 5181 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 5182 * handles any IP specific hardware or software state changes that are 5183 * necessary for a soft reset to succeed. 5184 * Returns 0 on success, negative error code on failure. 5185 */ 5186 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 5187 { 5188 int i, r = 0; 5189 5190 for (i = 0; i < adev->num_ip_blocks; i++) { 5191 if (!adev->ip_blocks[i].status.valid) 5192 continue; 5193 if (adev->ip_blocks[i].status.hang && 5194 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 5195 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]); 5196 if (r) 5197 return r; 5198 } 5199 } 5200 5201 return 0; 5202 } 5203 5204 /** 5205 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 5206 * 5207 * @adev: amdgpu_device pointer 5208 * 5209 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 5210 * reset is necessary to recover. 5211 * Returns true if a full asic reset is required, false if not. 
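 *
 * A hang in GMC, SMC, ACP, DCE or PSP is treated as requiring a full reset,
 * since those blocks cannot be recovered by a soft reset alone.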
5212 */ 5213 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 5214 { 5215 int i; 5216 5217 if (amdgpu_asic_need_full_reset(adev)) 5218 return true; 5219 5220 for (i = 0; i < adev->num_ip_blocks; i++) { 5221 if (!adev->ip_blocks[i].status.valid) 5222 continue; 5223 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 5224 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 5225 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 5226 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 5227 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 5228 if (adev->ip_blocks[i].status.hang) { 5229 dev_info(adev->dev, "Some block need full reset!\n"); 5230 return true; 5231 } 5232 } 5233 } 5234 return false; 5235 } 5236 5237 /** 5238 * amdgpu_device_ip_soft_reset - do a soft reset 5239 * 5240 * @adev: amdgpu_device pointer 5241 * 5242 * The list of all the hardware IPs that make up the asic is walked and the 5243 * soft_reset callbacks are run if the block is hung. soft_reset handles any 5244 * IP specific hardware or software state changes that are necessary to soft 5245 * reset the IP. 5246 * Returns 0 on success, negative error code on failure. 5247 */ 5248 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 5249 { 5250 int i, r = 0; 5251 5252 for (i = 0; i < adev->num_ip_blocks; i++) { 5253 if (!adev->ip_blocks[i].status.valid) 5254 continue; 5255 if (adev->ip_blocks[i].status.hang && 5256 adev->ip_blocks[i].version->funcs->soft_reset) { 5257 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]); 5258 if (r) 5259 return r; 5260 } 5261 } 5262 5263 return 0; 5264 } 5265 5266 /** 5267 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 5268 * 5269 * @adev: amdgpu_device pointer 5270 * 5271 * The list of all the hardware IPs that make up the asic is walked and the 5272 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 5273 * handles any IP specific hardware or software state changes that are 5274 * necessary after the IP has been soft reset. 5275 * Returns 0 on success, negative error code on failure. 
5276 */ 5277 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 5278 { 5279 int i, r = 0; 5280 5281 for (i = 0; i < adev->num_ip_blocks; i++) { 5282 if (!adev->ip_blocks[i].status.valid) 5283 continue; 5284 if (adev->ip_blocks[i].status.hang && 5285 adev->ip_blocks[i].version->funcs->post_soft_reset) 5286 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]); 5287 if (r) 5288 return r; 5289 } 5290 5291 return 0; 5292 } 5293 5294 /** 5295 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 5296 * 5297 * @adev: amdgpu_device pointer 5298 * @reset_context: amdgpu reset context pointer 5299 * 5300 * do VF FLR and reinitialize Asic 5301 * return 0 means succeeded otherwise failed 5302 */ 5303 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 5304 struct amdgpu_reset_context *reset_context) 5305 { 5306 int r; 5307 struct amdgpu_hive_info *hive = NULL; 5308 5309 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { 5310 if (!amdgpu_ras_get_fed_status(adev)) 5311 amdgpu_virt_ready_to_reset(adev); 5312 amdgpu_virt_wait_reset(adev); 5313 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5314 r = amdgpu_virt_request_full_gpu(adev, true); 5315 } else { 5316 r = amdgpu_virt_reset_gpu(adev); 5317 } 5318 if (r) 5319 return r; 5320 5321 amdgpu_ras_clear_err_state(adev); 5322 amdgpu_irq_gpu_reset_resume_helper(adev); 5323 5324 /* some sw clean up VF needs to do before recover */ 5325 amdgpu_virt_post_reset(adev); 5326 5327 /* Resume IP prior to SMC */ 5328 r = amdgpu_device_ip_reinit_early_sriov(adev); 5329 if (r) 5330 return r; 5331 5332 amdgpu_virt_init_data_exchange(adev); 5333 5334 r = amdgpu_device_fw_loading(adev); 5335 if (r) 5336 return r; 5337 5338 /* now we are okay to resume SMC/CP/SDMA */ 5339 r = amdgpu_device_ip_reinit_late_sriov(adev); 5340 if (r) 5341 return r; 5342 5343 hive = amdgpu_get_xgmi_hive(adev); 5344 /* Update PSP FW topology after reset */ 5345 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5346 r = amdgpu_xgmi_update_topology(hive, adev); 5347 if (hive) 5348 amdgpu_put_xgmi_hive(hive); 5349 if (r) 5350 return r; 5351 5352 r = amdgpu_ib_ring_tests(adev); 5353 if (r) 5354 return r; 5355 5356 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) 5357 amdgpu_inc_vram_lost(adev); 5358 5359 /* need to be called during full access so we can't do it later like 5360 * bare-metal does. 5361 */ 5362 amdgpu_amdkfd_post_reset(adev); 5363 amdgpu_virt_release_full_gpu(adev, true); 5364 5365 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5366 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) || 5367 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 5368 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 5369 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) || 5370 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5371 amdgpu_ras_resume(adev); 5372 5373 amdgpu_virt_ras_telemetry_post_reset(adev); 5374 5375 return 0; 5376 } 5377 5378 /** 5379 * amdgpu_device_has_job_running - check if there is any unfinished job 5380 * 5381 * @adev: amdgpu_device pointer 5382 * 5383 * check if there is any job running on the device when guest driver receives 5384 * FLR notification from host driver. If there are still jobs running, then 5385 * the guest driver will not respond the FLR reset. Instead, let the job hit 5386 * the timeout and guest driver then issue the reset request. 
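 *
 * A job is considered running here if any ring still has emitted fences
 * that have not signaled yet.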
5387 */ 5388 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5389 { 5390 int i; 5391 5392 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5393 struct amdgpu_ring *ring = adev->rings[i]; 5394 5395 if (!amdgpu_ring_sched_ready(ring)) 5396 continue; 5397 5398 if (amdgpu_fence_count_emitted(ring)) 5399 return true; 5400 } 5401 return false; 5402 } 5403 5404 /** 5405 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5406 * 5407 * @adev: amdgpu_device pointer 5408 * 5409 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5410 * a hung GPU. 5411 */ 5412 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5413 { 5414 5415 if (amdgpu_gpu_recovery == 0) 5416 goto disabled; 5417 5418 /* Skip soft reset check in fatal error mode */ 5419 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5420 return true; 5421 5422 if (amdgpu_sriov_vf(adev)) 5423 return true; 5424 5425 if (amdgpu_gpu_recovery == -1) { 5426 switch (adev->asic_type) { 5427 #ifdef CONFIG_DRM_AMDGPU_SI 5428 case CHIP_VERDE: 5429 case CHIP_TAHITI: 5430 case CHIP_PITCAIRN: 5431 case CHIP_OLAND: 5432 case CHIP_HAINAN: 5433 #endif 5434 #ifdef CONFIG_DRM_AMDGPU_CIK 5435 case CHIP_KAVERI: 5436 case CHIP_KABINI: 5437 case CHIP_MULLINS: 5438 #endif 5439 case CHIP_CARRIZO: 5440 case CHIP_STONEY: 5441 case CHIP_CYAN_SKILLFISH: 5442 goto disabled; 5443 default: 5444 break; 5445 } 5446 } 5447 5448 return true; 5449 5450 disabled: 5451 dev_info(adev->dev, "GPU recovery disabled.\n"); 5452 return false; 5453 } 5454 5455 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5456 { 5457 u32 i; 5458 int ret = 0; 5459 5460 if (adev->bios) 5461 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5462 5463 dev_info(adev->dev, "GPU mode1 reset\n"); 5464 5465 /* Cache the state before bus master disable. The saved config space 5466 * values are used in other cases like restore after mode-2 reset. 
5467 */ 5468 amdgpu_device_cache_pci_state(adev->pdev); 5469 5470 /* disable BM */ 5471 pci_clear_master(adev->pdev); 5472 5473 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5474 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5475 ret = amdgpu_dpm_mode1_reset(adev); 5476 } else { 5477 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5478 ret = psp_gpu_reset(adev); 5479 } 5480 5481 if (ret) 5482 goto mode1_reset_failed; 5483 5484 amdgpu_device_load_pci_state(adev->pdev); 5485 ret = amdgpu_psp_wait_for_bootloader(adev); 5486 if (ret) 5487 goto mode1_reset_failed; 5488 5489 /* wait for asic to come out of reset */ 5490 for (i = 0; i < adev->usec_timeout; i++) { 5491 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5492 5493 if (memsize != 0xffffffff) 5494 break; 5495 udelay(1); 5496 } 5497 5498 if (i >= adev->usec_timeout) { 5499 ret = -ETIMEDOUT; 5500 goto mode1_reset_failed; 5501 } 5502 5503 if (adev->bios) 5504 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5505 5506 return 0; 5507 5508 mode1_reset_failed: 5509 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5510 return ret; 5511 } 5512 5513 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5514 struct amdgpu_reset_context *reset_context) 5515 { 5516 int i, r = 0; 5517 struct amdgpu_job *job = NULL; 5518 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; 5519 bool need_full_reset = 5520 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5521 5522 if (reset_context->reset_req_dev == adev) 5523 job = reset_context->job; 5524 5525 if (amdgpu_sriov_vf(adev)) 5526 amdgpu_virt_pre_reset(adev); 5527 5528 amdgpu_fence_driver_isr_toggle(adev, true); 5529 5530 /* block all schedulers and reset given job's ring */ 5531 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5532 struct amdgpu_ring *ring = adev->rings[i]; 5533 5534 if (!amdgpu_ring_sched_ready(ring)) 5535 continue; 5536 5537 /* Clear job fence from fence drv to avoid force_completion 5538 * leave NULL and vm flush fence in fence drv 5539 */ 5540 amdgpu_fence_driver_clear_job_fences(ring); 5541 5542 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5543 amdgpu_fence_driver_force_completion(ring); 5544 } 5545 5546 amdgpu_fence_driver_isr_toggle(adev, false); 5547 5548 if (job && job->vm) 5549 drm_sched_increase_karma(&job->base); 5550 5551 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5552 /* If reset handler not implemented, continue; otherwise return */ 5553 if (r == -EOPNOTSUPP) 5554 r = 0; 5555 else 5556 return r; 5557 5558 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5559 if (!amdgpu_sriov_vf(adev)) { 5560 5561 if (!need_full_reset) 5562 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5563 5564 if (!need_full_reset && amdgpu_gpu_recovery && 5565 amdgpu_device_ip_check_soft_reset(adev)) { 5566 amdgpu_device_ip_pre_soft_reset(adev); 5567 r = amdgpu_device_ip_soft_reset(adev); 5568 amdgpu_device_ip_post_soft_reset(adev); 5569 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5570 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5571 need_full_reset = true; 5572 } 5573 } 5574 5575 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { 5576 dev_info(tmp_adev->dev, "Dumping IP State\n"); 5577 /* Trigger ip dump before we reset the asic */ 5578 for (i = 0; i < tmp_adev->num_ip_blocks; i++) 5579 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) 5580 tmp_adev->ip_blocks[i].version->funcs 5581 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]); 5582 
dev_info(tmp_adev->dev, "Dumping IP State Completed\n"); 5583 } 5584 5585 if (need_full_reset) 5586 r = amdgpu_device_ip_suspend(adev); 5587 if (need_full_reset) 5588 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5589 else 5590 clear_bit(AMDGPU_NEED_FULL_RESET, 5591 &reset_context->flags); 5592 } 5593 5594 return r; 5595 } 5596 5597 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) 5598 { 5599 struct list_head *device_list_handle; 5600 bool full_reset, vram_lost = false; 5601 struct amdgpu_device *tmp_adev; 5602 int r, init_level; 5603 5604 device_list_handle = reset_context->reset_device_list; 5605 5606 if (!device_list_handle) 5607 return -EINVAL; 5608 5609 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5610 5611 /** 5612 * If it's reset on init, it's default init level, otherwise keep level 5613 * as recovery level. 5614 */ 5615 if (reset_context->method == AMD_RESET_METHOD_ON_INIT) 5616 init_level = AMDGPU_INIT_LEVEL_DEFAULT; 5617 else 5618 init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY; 5619 5620 r = 0; 5621 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5622 amdgpu_set_init_level(tmp_adev, init_level); 5623 if (full_reset) { 5624 /* post card */ 5625 amdgpu_ras_clear_err_state(tmp_adev); 5626 r = amdgpu_device_asic_init(tmp_adev); 5627 if (r) { 5628 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5629 } else { 5630 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5631 5632 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5633 if (r) 5634 goto out; 5635 5636 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5637 5638 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 5639 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job); 5640 5641 if (vram_lost) { 5642 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5643 amdgpu_inc_vram_lost(tmp_adev); 5644 } 5645 5646 r = amdgpu_device_fw_loading(tmp_adev); 5647 if (r) 5648 return r; 5649 5650 r = amdgpu_xcp_restore_partition_mode( 5651 tmp_adev->xcp_mgr); 5652 if (r) 5653 goto out; 5654 5655 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5656 if (r) 5657 goto out; 5658 5659 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 5660 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5661 5662 r = amdgpu_device_ip_resume_phase3(tmp_adev); 5663 if (r) 5664 goto out; 5665 5666 if (vram_lost) 5667 amdgpu_device_fill_reset_magic(tmp_adev); 5668 5669 /* 5670 * Add this ASIC as tracked as reset was already 5671 * complete successfully. 5672 */ 5673 amdgpu_register_gpu_instance(tmp_adev); 5674 5675 if (!reset_context->hive && 5676 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5677 amdgpu_xgmi_add_device(tmp_adev); 5678 5679 r = amdgpu_device_ip_late_init(tmp_adev); 5680 if (r) 5681 goto out; 5682 5683 drm_client_dev_resume(adev_to_drm(tmp_adev), false); 5684 5685 /* 5686 * The GPU enters bad state once faulty pages 5687 * by ECC has reached the threshold, and ras 5688 * recovery is scheduled next. So add one check 5689 * here to break recovery if it indeed exceeds 5690 * bad page threshold, and remind user to 5691 * retire this GPU or setting one bigger 5692 * bad_page_threshold value to fix this once 5693 * probing driver again. 5694 */ 5695 if (!amdgpu_ras_is_rma(tmp_adev)) { 5696 /* must succeed. 
*/ 5697 amdgpu_ras_resume(tmp_adev); 5698 } else { 5699 r = -EINVAL; 5700 goto out; 5701 } 5702 5703 /* Update PSP FW topology after reset */ 5704 if (reset_context->hive && 5705 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5706 r = amdgpu_xgmi_update_topology( 5707 reset_context->hive, tmp_adev); 5708 } 5709 } 5710 5711 out: 5712 if (!r) { 5713 /* IP init is complete now, set level as default */ 5714 amdgpu_set_init_level(tmp_adev, 5715 AMDGPU_INIT_LEVEL_DEFAULT); 5716 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5717 r = amdgpu_ib_ring_tests(tmp_adev); 5718 if (r) { 5719 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5720 r = -EAGAIN; 5721 goto end; 5722 } 5723 } 5724 5725 if (r) 5726 tmp_adev->asic_reset_res = r; 5727 } 5728 5729 end: 5730 return r; 5731 } 5732 5733 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5734 struct amdgpu_reset_context *reset_context) 5735 { 5736 struct amdgpu_device *tmp_adev = NULL; 5737 bool need_full_reset, skip_hw_reset; 5738 int r = 0; 5739 5740 /* Try reset handler method first */ 5741 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5742 reset_list); 5743 5744 reset_context->reset_device_list = device_list_handle; 5745 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5746 /* If reset handler not implemented, continue; otherwise return */ 5747 if (r == -EOPNOTSUPP) 5748 r = 0; 5749 else 5750 return r; 5751 5752 /* Reset handler not implemented, use the default method */ 5753 need_full_reset = 5754 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5755 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5756 5757 /* 5758 * ASIC reset has to be done on all XGMI hive nodes ASAP 5759 * to allow proper links negotiation in FW (within 1 sec) 5760 */ 5761 if (!skip_hw_reset && need_full_reset) { 5762 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5763 /* For XGMI run all resets in parallel to speed up the process */ 5764 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5765 if (!queue_work(system_unbound_wq, 5766 &tmp_adev->xgmi_reset_work)) 5767 r = -EALREADY; 5768 } else 5769 r = amdgpu_asic_reset(tmp_adev); 5770 5771 if (r) { 5772 dev_err(tmp_adev->dev, 5773 "ASIC reset failed with error, %d for drm dev, %s", 5774 r, adev_to_drm(tmp_adev)->unique); 5775 goto out; 5776 } 5777 } 5778 5779 /* For XGMI wait for all resets to complete before proceed */ 5780 if (!r) { 5781 list_for_each_entry(tmp_adev, device_list_handle, 5782 reset_list) { 5783 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5784 flush_work(&tmp_adev->xgmi_reset_work); 5785 r = tmp_adev->asic_reset_res; 5786 if (r) 5787 break; 5788 } 5789 } 5790 } 5791 } 5792 5793 if (!r && amdgpu_ras_intr_triggered()) { 5794 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5795 amdgpu_ras_reset_error_count(tmp_adev, 5796 AMDGPU_RAS_BLOCK__MMHUB); 5797 } 5798 5799 amdgpu_ras_intr_cleared(); 5800 } 5801 5802 r = amdgpu_device_reinit_after_reset(reset_context); 5803 if (r == -EAGAIN) 5804 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5805 else 5806 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5807 5808 out: 5809 return r; 5810 } 5811 5812 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5813 { 5814 5815 switch (amdgpu_asic_reset_method(adev)) { 5816 case AMD_RESET_METHOD_MODE1: 5817 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5818 break; 5819 case AMD_RESET_METHOD_MODE2: 5820 adev->mp1_state = PP_MP1_STATE_RESET; 5821 break; 5822 default: 5823 adev->mp1_state = 
PP_MP1_STATE_NONE; 5824 break; 5825 } 5826 } 5827 5828 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5829 { 5830 amdgpu_vf_error_trans_all(adev); 5831 adev->mp1_state = PP_MP1_STATE_NONE; 5832 } 5833 5834 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5835 { 5836 struct pci_dev *p = NULL; 5837 5838 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5839 adev->pdev->bus->number, 1); 5840 if (p) { 5841 pm_runtime_enable(&(p->dev)); 5842 pm_runtime_resume(&(p->dev)); 5843 } 5844 5845 pci_dev_put(p); 5846 } 5847 5848 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5849 { 5850 enum amd_reset_method reset_method; 5851 struct pci_dev *p = NULL; 5852 u64 expires; 5853 5854 /* 5855 * For now, only BACO and mode1 reset are confirmed 5856 * to suffer the audio issue without proper suspended. 5857 */ 5858 reset_method = amdgpu_asic_reset_method(adev); 5859 if ((reset_method != AMD_RESET_METHOD_BACO) && 5860 (reset_method != AMD_RESET_METHOD_MODE1)) 5861 return -EINVAL; 5862 5863 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5864 adev->pdev->bus->number, 1); 5865 if (!p) 5866 return -ENODEV; 5867 5868 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5869 if (!expires) 5870 /* 5871 * If we cannot get the audio device autosuspend delay, 5872 * a fixed 4S interval will be used. Considering 3S is 5873 * the audio controller default autosuspend delay setting. 5874 * 4S used here is guaranteed to cover that. 5875 */ 5876 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5877 5878 while (!pm_runtime_status_suspended(&(p->dev))) { 5879 if (!pm_runtime_suspend(&(p->dev))) 5880 break; 5881 5882 if (expires < ktime_get_mono_fast_ns()) { 5883 dev_warn(adev->dev, "failed to suspend display audio\n"); 5884 pci_dev_put(p); 5885 /* TODO: abort the succeeding gpu reset? */ 5886 return -ETIMEDOUT; 5887 } 5888 } 5889 5890 pm_runtime_disable(&(p->dev)); 5891 5892 pci_dev_put(p); 5893 return 0; 5894 } 5895 5896 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5897 { 5898 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5899 5900 #if defined(CONFIG_DEBUG_FS) 5901 if (!amdgpu_sriov_vf(adev)) 5902 cancel_work(&adev->reset_work); 5903 #endif 5904 5905 if (adev->kfd.dev) 5906 cancel_work(&adev->kfd.reset_work); 5907 5908 if (amdgpu_sriov_vf(adev)) 5909 cancel_work(&adev->virt.flr_work); 5910 5911 if (con && adev->ras_enabled) 5912 cancel_work(&con->recovery_work); 5913 5914 } 5915 5916 static int amdgpu_device_health_check(struct list_head *device_list_handle) 5917 { 5918 struct amdgpu_device *tmp_adev; 5919 int ret = 0; 5920 u32 status; 5921 5922 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5923 pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status); 5924 if (PCI_POSSIBLE_ERROR(status)) { 5925 dev_err(tmp_adev->dev, "device lost from bus!"); 5926 ret = -ENODEV; 5927 } 5928 } 5929 5930 return ret; 5931 } 5932 5933 /** 5934 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5935 * 5936 * @adev: amdgpu_device pointer 5937 * @job: which job trigger hang 5938 * @reset_context: amdgpu reset context pointer 5939 * 5940 * Attempt to reset the GPU if it has hung (all asics). 5941 * Attempt to do soft-reset or full-reset and reinitialize Asic 5942 * Returns 0 for success or an error on failure. 
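 *
 * Callers are expected to fill in an amdgpu_reset_context first.  A minimal,
 * illustrative sketch only (the exact flags, source and method depend on the
 * caller and are not prescribed here):
 *
 *   struct amdgpu_reset_context reset_context;
 *
 *   memset(&reset_context, 0, sizeof(reset_context));
 *   reset_context.method = AMD_RESET_METHOD_NONE;
 *   reset_context.reset_req_dev = adev;
 *   clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
 *
 *   r = amdgpu_device_gpu_recover(adev, job, &reset_context);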
5943 */ 5944 5945 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5946 struct amdgpu_job *job, 5947 struct amdgpu_reset_context *reset_context) 5948 { 5949 struct list_head device_list, *device_list_handle = NULL; 5950 bool job_signaled = false; 5951 struct amdgpu_hive_info *hive = NULL; 5952 struct amdgpu_device *tmp_adev = NULL; 5953 int i, r = 0; 5954 bool need_emergency_restart = false; 5955 bool audio_suspended = false; 5956 int retry_limit = AMDGPU_MAX_RETRY_LIMIT; 5957 5958 /* 5959 * If it reaches here because of hang/timeout and a RAS error is 5960 * detected at the same time, let RAS recovery take care of it. 5961 */ 5962 if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) && 5963 !amdgpu_sriov_vf(adev) && 5964 reset_context->src != AMDGPU_RESET_SRC_RAS) { 5965 dev_dbg(adev->dev, 5966 "Gpu recovery from source: %d yielding to RAS error recovery handling", 5967 reset_context->src); 5968 return 0; 5969 } 5970 /* 5971 * Special case: RAS triggered and full reset isn't supported 5972 */ 5973 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5974 5975 /* 5976 * Flush RAM to disk so that after reboot 5977 * the user can read log and see why the system rebooted. 5978 */ 5979 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 5980 amdgpu_ras_get_context(adev)->reboot) { 5981 DRM_WARN("Emergency reboot."); 5982 5983 ksys_sync_helper(); 5984 emergency_restart(); 5985 } 5986 5987 dev_info(adev->dev, "GPU %s begin!\n", 5988 need_emergency_restart ? "jobs stop":"reset"); 5989 5990 if (!amdgpu_sriov_vf(adev)) 5991 hive = amdgpu_get_xgmi_hive(adev); 5992 if (hive) 5993 mutex_lock(&hive->hive_lock); 5994 5995 reset_context->job = job; 5996 reset_context->hive = hive; 5997 /* 5998 * Build list of devices to reset. 5999 * In case we are in XGMI hive mode, resort the device list 6000 * to put adev in the 1st position. 6001 */ 6002 INIT_LIST_HEAD(&device_list); 6003 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) { 6004 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 6005 list_add_tail(&tmp_adev->reset_list, &device_list); 6006 if (adev->shutdown) 6007 tmp_adev->shutdown = true; 6008 } 6009 if (!list_is_first(&adev->reset_list, &device_list)) 6010 list_rotate_to_front(&adev->reset_list, &device_list); 6011 device_list_handle = &device_list; 6012 } else { 6013 list_add_tail(&adev->reset_list, &device_list); 6014 device_list_handle = &device_list; 6015 } 6016 6017 if (!amdgpu_sriov_vf(adev)) { 6018 r = amdgpu_device_health_check(device_list_handle); 6019 if (r) 6020 goto end_reset; 6021 } 6022 6023 /* We need to lock reset domain only once both for XGMI and single device */ 6024 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 6025 reset_list); 6026 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 6027 6028 /* block all schedulers and reset given job's ring */ 6029 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6030 6031 amdgpu_device_set_mp1_state(tmp_adev); 6032 6033 /* 6034 * Try to put the audio codec into suspend state 6035 * before gpu reset started. 6036 * 6037 * Due to the power domain of the graphics device 6038 * is shared with AZ power domain. Without this, 6039 * we may change the audio hardware from behind 6040 * the audio driver's back. That will trigger 6041 * some audio codec errors. 
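		 * The suspend below is best effort: if it fails we simply
		 * proceed with the reset and skip the matching resume
		 * afterwards.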
6042 */ 6043 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 6044 audio_suspended = true; 6045 6046 amdgpu_ras_set_error_query_ready(tmp_adev, false); 6047 6048 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 6049 6050 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context); 6051 6052 /* 6053 * Mark these ASICs to be reset as untracked first 6054 * And add them back after reset completed 6055 */ 6056 amdgpu_unregister_gpu_instance(tmp_adev); 6057 6058 drm_client_dev_suspend(adev_to_drm(tmp_adev), false); 6059 6060 /* disable ras on ALL IPs */ 6061 if (!need_emergency_restart && 6062 amdgpu_device_ip_need_full_reset(tmp_adev)) 6063 amdgpu_ras_suspend(tmp_adev); 6064 6065 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6066 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6067 6068 if (!amdgpu_ring_sched_ready(ring)) 6069 continue; 6070 6071 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 6072 6073 if (need_emergency_restart) 6074 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 6075 } 6076 atomic_inc(&tmp_adev->gpu_reset_counter); 6077 } 6078 6079 if (need_emergency_restart) 6080 goto skip_sched_resume; 6081 6082 /* 6083 * Must check guilty signal here since after this point all old 6084 * HW fences are force signaled. 6085 * 6086 * job->base holds a reference to parent fence 6087 */ 6088 if (job && dma_fence_is_signaled(&job->hw_fence)) { 6089 job_signaled = true; 6090 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 6091 goto skip_hw_reset; 6092 } 6093 6094 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 6095 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6096 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 6097 /*TODO Should we stop ?*/ 6098 if (r) { 6099 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 6100 r, adev_to_drm(tmp_adev)->unique); 6101 tmp_adev->asic_reset_res = r; 6102 } 6103 } 6104 6105 /* Actual ASIC resets if needed.*/ 6106 /* Host driver will handle XGMI hive reset for SRIOV */ 6107 if (amdgpu_sriov_vf(adev)) { 6108 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { 6109 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); 6110 amdgpu_ras_set_fed(adev, true); 6111 set_bit(AMDGPU_HOST_FLR, &reset_context->flags); 6112 } 6113 6114 r = amdgpu_device_reset_sriov(adev, reset_context); 6115 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) { 6116 amdgpu_virt_release_full_gpu(adev, true); 6117 goto retry; 6118 } 6119 if (r) 6120 adev->asic_reset_res = r; 6121 } else { 6122 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 6123 if (r && r == -EAGAIN) 6124 goto retry; 6125 } 6126 6127 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6128 /* 6129 * Drop any pending non scheduler resets queued before reset is done. 6130 * Any reset scheduled after this point would be valid. Scheduler resets 6131 * were already dropped during drm_sched_stop and no new ones can come 6132 * in before drm_sched_start. 
6133 */ 6134 amdgpu_device_stop_pending_resets(tmp_adev); 6135 } 6136 6137 skip_hw_reset: 6138 6139 /* Post ASIC reset for all devs .*/ 6140 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6141 6142 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6143 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6144 6145 if (!amdgpu_ring_sched_ready(ring)) 6146 continue; 6147 6148 drm_sched_start(&ring->sched, 0); 6149 } 6150 6151 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 6152 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 6153 6154 if (tmp_adev->asic_reset_res) 6155 r = tmp_adev->asic_reset_res; 6156 6157 tmp_adev->asic_reset_res = 0; 6158 6159 if (r) { 6160 /* bad news, how to tell it to userspace ? 6161 * for ras error, we should report GPU bad status instead of 6162 * reset failure 6163 */ 6164 if (reset_context->src != AMDGPU_RESET_SRC_RAS || 6165 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) 6166 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", 6167 atomic_read(&tmp_adev->gpu_reset_counter)); 6168 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 6169 } else { 6170 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 6171 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 6172 DRM_WARN("smart shift update failed\n"); 6173 } 6174 } 6175 6176 skip_sched_resume: 6177 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6178 /* unlock kfd: SRIOV would do it separately */ 6179 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 6180 amdgpu_amdkfd_post_reset(tmp_adev); 6181 6182 /* kfd_post_reset will do nothing if kfd device is not initialized, 6183 * need to bring up kfd here if it's not be initialized before 6184 */ 6185 if (!adev->kfd.init_complete) 6186 amdgpu_amdkfd_device_init(adev); 6187 6188 if (audio_suspended) 6189 amdgpu_device_resume_display_audio(tmp_adev); 6190 6191 amdgpu_device_unset_mp1_state(tmp_adev); 6192 6193 amdgpu_ras_set_error_query_ready(tmp_adev, true); 6194 } 6195 6196 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 6197 reset_list); 6198 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 6199 6200 end_reset: 6201 if (hive) { 6202 mutex_unlock(&hive->hive_lock); 6203 amdgpu_put_xgmi_hive(hive); 6204 } 6205 6206 if (r) 6207 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 6208 6209 atomic_set(&adev->reset_domain->reset_res, r); 6210 6211 if (!r) 6212 drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE); 6213 6214 return r; 6215 } 6216 6217 /** 6218 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner 6219 * 6220 * @adev: amdgpu_device pointer 6221 * @speed: pointer to the speed of the link 6222 * @width: pointer to the width of the link 6223 * 6224 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6225 * first physical partner to an AMD dGPU. 6226 * This will exclude any virtual switches and links. 
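 * The result is used as the "platform" capability when building
 * adev->pm.pcie_gen_mask and adev->pm.pcie_mlw_mask in
 * amdgpu_device_get_pcie_info().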
 */
static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
					    enum pci_bus_speed *speed,
					    enum pcie_link_width *width)
{
	struct pci_dev *parent = adev->pdev;

	if (!speed || !width)
		return;

	*speed = PCI_SPEED_UNKNOWN;
	*width = PCIE_LNK_WIDTH_UNKNOWN;

	if (amdgpu_device_pcie_dynamic_switching_supported(adev)) {
		while ((parent = pci_upstream_bridge(parent))) {
			/* skip upstream/downstream switches internal to dGPU */
			if (parent->vendor == PCI_VENDOR_ID_ATI)
				continue;
			*speed = pcie_get_speed_cap(parent);
			*width = pcie_get_width_cap(parent);
			break;
		}
	} else {
		/* use the current speeds rather than max if switching is not supported */
		pcie_bandwidth_available(adev->pdev, NULL, speed, width);
	}
}

/**
 * amdgpu_device_gpu_bandwidth - find the bandwidth of the GPU
 *
 * @adev: amdgpu_device pointer
 * @speed: pointer to the speed of the link
 * @width: pointer to the width of the link
 *
 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
 * AMD dGPU which may be a virtual upstream bridge.
 */
static void amdgpu_device_gpu_bandwidth(struct amdgpu_device *adev,
					enum pci_bus_speed *speed,
					enum pcie_link_width *width)
{
	struct pci_dev *parent = adev->pdev;

	if (!speed || !width)
		return;

	parent = pci_upstream_bridge(parent);
	if (parent && parent->vendor == PCI_VENDOR_ID_ATI) {
		/* use the upstream/downstream switches internal to dGPU */
		*speed = pcie_get_speed_cap(parent);
		*width = pcie_get_width_cap(parent);
		while ((parent = pci_upstream_bridge(parent))) {
			if (parent->vendor == PCI_VENDOR_ID_ATI) {
				/* use the upstream/downstream switches internal to dGPU */
				*speed = pcie_get_speed_cap(parent);
				*width = pcie_get_width_cap(parent);
			}
		}
	} else {
		/* use the device itself */
		*speed = pcie_get_speed_cap(adev->pdev);
		*width = pcie_get_width_cap(adev->pdev);
	}
}

/**
 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIE capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIE config space may not be available.
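 *
 * Note that the detection below only runs when the masks were not already
 * forced via the amdgpu_pcie_gen_cap / amdgpu_pcie_lane_cap module
 * parameters (see the checks at the top of the function).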
6301 */ 6302 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 6303 { 6304 enum pci_bus_speed speed_cap, platform_speed_cap; 6305 enum pcie_link_width platform_link_width, link_width; 6306 6307 if (amdgpu_pcie_gen_cap) 6308 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 6309 6310 if (amdgpu_pcie_lane_cap) 6311 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 6312 6313 /* covers APUs as well */ 6314 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 6315 if (adev->pm.pcie_gen_mask == 0) 6316 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 6317 if (adev->pm.pcie_mlw_mask == 0) 6318 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 6319 return; 6320 } 6321 6322 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 6323 return; 6324 6325 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap, 6326 &platform_link_width); 6327 amdgpu_device_gpu_bandwidth(adev, &speed_cap, &link_width); 6328 6329 if (adev->pm.pcie_gen_mask == 0) { 6330 /* asic caps */ 6331 if (speed_cap == PCI_SPEED_UNKNOWN) { 6332 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6333 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6334 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6335 } else { 6336 if (speed_cap == PCIE_SPEED_32_0GT) 6337 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6338 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6339 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6340 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6341 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 6342 else if (speed_cap == PCIE_SPEED_16_0GT) 6343 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6344 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6345 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6346 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 6347 else if (speed_cap == PCIE_SPEED_8_0GT) 6348 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6349 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6350 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6351 else if (speed_cap == PCIE_SPEED_5_0GT) 6352 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6353 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 6354 else 6355 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 6356 } 6357 /* platform caps */ 6358 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 6359 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6360 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6361 } else { 6362 if (platform_speed_cap == PCIE_SPEED_32_0GT) 6363 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6364 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6365 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6366 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6367 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 6368 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 6369 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6370 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6371 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6372 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 6373 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 6374 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6375 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6376 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 6377 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 6378 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6379 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6380 else 6381 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 6382 6383 } 6384 } 6385 if (adev->pm.pcie_mlw_mask == 0) { 6386 /* asic caps */ 6387 if (link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6388 
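			/* asic link width unknown, fall back to the default asic lane mask */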
adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_ASIC_PCIE_MLW_MASK; 6389 } else { 6390 switch (link_width) { 6391 case PCIE_LNK_X32: 6392 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X32 | 6393 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6394 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6395 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6396 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6397 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6398 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6399 break; 6400 case PCIE_LNK_X16: 6401 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6402 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6403 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6404 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6405 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6406 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6407 break; 6408 case PCIE_LNK_X12: 6409 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6410 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6411 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6412 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6413 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6414 break; 6415 case PCIE_LNK_X8: 6416 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6417 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6418 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6419 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6420 break; 6421 case PCIE_LNK_X4: 6422 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6423 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6424 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6425 break; 6426 case PCIE_LNK_X2: 6427 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6428 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6429 break; 6430 case PCIE_LNK_X1: 6431 adev->pm.pcie_mlw_mask |= CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1; 6432 break; 6433 default: 6434 break; 6435 } 6436 } 6437 /* platform caps */ 6438 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6439 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 6440 } else { 6441 switch (platform_link_width) { 6442 case PCIE_LNK_X32: 6443 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 6444 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6445 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6446 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6447 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6448 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6449 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6450 break; 6451 case PCIE_LNK_X16: 6452 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6453 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6454 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6455 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6456 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6457 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6458 break; 6459 case PCIE_LNK_X12: 6460 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6461 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6462 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6463 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6464 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6465 break; 6466 case PCIE_LNK_X8: 6467 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6468 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6469 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6470 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6471 break; 6472 case PCIE_LNK_X4: 6473 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6474 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6475 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6476 break; 6477 case PCIE_LNK_X2: 6478 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6479 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6480 break; 6481 case PCIE_LNK_X1: 6482 adev->pm.pcie_mlw_mask |= CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 6483 break; 6484 
default: 6485 break; 6486 } 6487 } 6488 } 6489 } 6490 6491 /** 6492 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 6493 * 6494 * @adev: amdgpu_device pointer 6495 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 6496 * 6497 * Return true if @peer_adev can access (DMA) @adev through the PCIe 6498 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 6499 * @peer_adev. 6500 */ 6501 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 6502 struct amdgpu_device *peer_adev) 6503 { 6504 #ifdef CONFIG_HSA_AMD_P2P 6505 bool p2p_access = 6506 !adev->gmc.xgmi.connected_to_cpu && 6507 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 6508 if (!p2p_access) 6509 dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n", 6510 pci_name(peer_adev->pdev)); 6511 6512 bool is_large_bar = adev->gmc.visible_vram_size && 6513 adev->gmc.real_vram_size == adev->gmc.visible_vram_size; 6514 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev); 6515 6516 if (!p2p_addressable) { 6517 uint64_t address_mask = peer_adev->dev->dma_mask ? 6518 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 6519 resource_size_t aper_limit = 6520 adev->gmc.aper_base + adev->gmc.aper_size - 1; 6521 6522 p2p_addressable = !(adev->gmc.aper_base & address_mask || 6523 aper_limit & address_mask); 6524 } 6525 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable; 6526 #else 6527 return false; 6528 #endif 6529 } 6530 6531 int amdgpu_device_baco_enter(struct drm_device *dev) 6532 { 6533 struct amdgpu_device *adev = drm_to_adev(dev); 6534 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6535 6536 if (!amdgpu_device_supports_baco(dev)) 6537 return -ENOTSUPP; 6538 6539 if (ras && adev->ras_enabled && 6540 adev->nbio.funcs->enable_doorbell_interrupt) 6541 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 6542 6543 return amdgpu_dpm_baco_enter(adev); 6544 } 6545 6546 int amdgpu_device_baco_exit(struct drm_device *dev) 6547 { 6548 struct amdgpu_device *adev = drm_to_adev(dev); 6549 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6550 int ret = 0; 6551 6552 if (!amdgpu_device_supports_baco(dev)) 6553 return -ENOTSUPP; 6554 6555 ret = amdgpu_dpm_baco_exit(adev); 6556 if (ret) 6557 return ret; 6558 6559 if (ras && adev->ras_enabled && 6560 adev->nbio.funcs->enable_doorbell_interrupt) 6561 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 6562 6563 if (amdgpu_passthrough(adev) && adev->nbio.funcs && 6564 adev->nbio.funcs->clear_doorbell_interrupt) 6565 adev->nbio.funcs->clear_doorbell_interrupt(adev); 6566 6567 return 0; 6568 } 6569 6570 /** 6571 * amdgpu_pci_error_detected - Called when a PCI error is detected. 6572 * @pdev: PCI device struct 6573 * @state: PCI channel state 6574 * 6575 * Description: Called when a PCI error is detected. 6576 * 6577 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
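 * PCI_ERS_RESULT_CAN_RECOVER is returned for pci_channel_io_normal, in which
 * case amdgpu_pci_mmio_enabled() is called next instead of a slot reset.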
6578 */ 6579 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 6580 { 6581 struct drm_device *dev = pci_get_drvdata(pdev); 6582 struct amdgpu_device *adev = drm_to_adev(dev); 6583 int i; 6584 6585 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); 6586 6587 if (adev->gmc.xgmi.num_physical_nodes > 1) { 6588 DRM_WARN("No support for XGMI hive yet..."); 6589 return PCI_ERS_RESULT_DISCONNECT; 6590 } 6591 6592 adev->pci_channel_state = state; 6593 6594 switch (state) { 6595 case pci_channel_io_normal: 6596 return PCI_ERS_RESULT_CAN_RECOVER; 6597 /* Fatal error, prepare for slot reset */ 6598 case pci_channel_io_frozen: 6599 /* 6600 * Locking adev->reset_domain->sem will prevent any external access 6601 * to GPU during PCI error recovery 6602 */ 6603 amdgpu_device_lock_reset_domain(adev->reset_domain); 6604 amdgpu_device_set_mp1_state(adev); 6605 6606 /* 6607 * Block any work scheduling as we do for regular GPU reset 6608 * for the duration of the recovery 6609 */ 6610 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6611 struct amdgpu_ring *ring = adev->rings[i]; 6612 6613 if (!amdgpu_ring_sched_ready(ring)) 6614 continue; 6615 6616 drm_sched_stop(&ring->sched, NULL); 6617 } 6618 atomic_inc(&adev->gpu_reset_counter); 6619 return PCI_ERS_RESULT_NEED_RESET; 6620 case pci_channel_io_perm_failure: 6621 /* Permanent error, prepare for device removal */ 6622 return PCI_ERS_RESULT_DISCONNECT; 6623 } 6624 6625 return PCI_ERS_RESULT_NEED_RESET; 6626 } 6627 6628 /** 6629 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 6630 * @pdev: pointer to PCI device 6631 */ 6632 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 6633 { 6634 6635 DRM_INFO("PCI error: mmio enabled callback!!\n"); 6636 6637 /* TODO - dump whatever for debugging purposes */ 6638 6639 /* This called only if amdgpu_pci_error_detected returns 6640 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 6641 * works, no need to reset slot. 6642 */ 6643 6644 return PCI_ERS_RESULT_RECOVERED; 6645 } 6646 6647 /** 6648 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 6649 * @pdev: PCI device struct 6650 * 6651 * Description: This routine is called by the pci error recovery 6652 * code after the PCI slot has been reset, just before we 6653 * should resume normal operations. 
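 *
 * The slot reset itself already reset the hardware, so this routine only
 * restores PCI config space and then re-initializes the ASIC through
 * amdgpu_do_asic_reset() with AMDGPU_SKIP_HW_RESET set.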
6654 */ 6655 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 6656 { 6657 struct drm_device *dev = pci_get_drvdata(pdev); 6658 struct amdgpu_device *adev = drm_to_adev(dev); 6659 int r, i; 6660 struct amdgpu_reset_context reset_context; 6661 u32 memsize; 6662 struct list_head device_list; 6663 6664 /* PCI error slot reset should be skipped During RAS recovery */ 6665 if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 6666 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && 6667 amdgpu_ras_in_recovery(adev)) 6668 return PCI_ERS_RESULT_RECOVERED; 6669 6670 DRM_INFO("PCI error: slot reset callback!!\n"); 6671 6672 memset(&reset_context, 0, sizeof(reset_context)); 6673 6674 INIT_LIST_HEAD(&device_list); 6675 list_add_tail(&adev->reset_list, &device_list); 6676 6677 /* wait for asic to come out of reset */ 6678 msleep(500); 6679 6680 /* Restore PCI confspace */ 6681 amdgpu_device_load_pci_state(pdev); 6682 6683 /* confirm ASIC came out of reset */ 6684 for (i = 0; i < adev->usec_timeout; i++) { 6685 memsize = amdgpu_asic_get_config_memsize(adev); 6686 6687 if (memsize != 0xffffffff) 6688 break; 6689 udelay(1); 6690 } 6691 if (memsize == 0xffffffff) { 6692 r = -ETIME; 6693 goto out; 6694 } 6695 6696 reset_context.method = AMD_RESET_METHOD_NONE; 6697 reset_context.reset_req_dev = adev; 6698 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 6699 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 6700 6701 adev->no_hw_access = true; 6702 r = amdgpu_device_pre_asic_reset(adev, &reset_context); 6703 adev->no_hw_access = false; 6704 if (r) 6705 goto out; 6706 6707 r = amdgpu_do_asic_reset(&device_list, &reset_context); 6708 6709 out: 6710 if (!r) { 6711 if (amdgpu_device_cache_pci_state(adev->pdev)) 6712 pci_restore_state(adev->pdev); 6713 6714 DRM_INFO("PCIe error recovery succeeded\n"); 6715 } else { 6716 DRM_ERROR("PCIe error recovery failed, err:%d", r); 6717 amdgpu_device_unset_mp1_state(adev); 6718 amdgpu_device_unlock_reset_domain(adev->reset_domain); 6719 } 6720 6721 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 6722 } 6723 6724 /** 6725 * amdgpu_pci_resume() - resume normal ops after PCI reset 6726 * @pdev: pointer to PCI device 6727 * 6728 * Called when the error recovery driver tells us that its 6729 * OK to resume normal operation. 
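 * Only acts when the channel state recorded in amdgpu_pci_error_detected()
 * was pci_channel_io_frozen; it then restarts the schedulers that were
 * stopped there and releases the reset domain.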
6730 */ 6731 void amdgpu_pci_resume(struct pci_dev *pdev) 6732 { 6733 struct drm_device *dev = pci_get_drvdata(pdev); 6734 struct amdgpu_device *adev = drm_to_adev(dev); 6735 int i; 6736 6737 6738 DRM_INFO("PCI error: resume callback!!\n"); 6739 6740 /* Only continue execution for the case of pci_channel_io_frozen */ 6741 if (adev->pci_channel_state != pci_channel_io_frozen) 6742 return; 6743 6744 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6745 struct amdgpu_ring *ring = adev->rings[i]; 6746 6747 if (!amdgpu_ring_sched_ready(ring)) 6748 continue; 6749 6750 drm_sched_start(&ring->sched, 0); 6751 } 6752 6753 amdgpu_device_unset_mp1_state(adev); 6754 amdgpu_device_unlock_reset_domain(adev->reset_domain); 6755 } 6756 6757 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 6758 { 6759 struct drm_device *dev = pci_get_drvdata(pdev); 6760 struct amdgpu_device *adev = drm_to_adev(dev); 6761 int r; 6762 6763 if (amdgpu_sriov_vf(adev)) 6764 return false; 6765 6766 r = pci_save_state(pdev); 6767 if (!r) { 6768 kfree(adev->pci_state); 6769 6770 adev->pci_state = pci_store_saved_state(pdev); 6771 6772 if (!adev->pci_state) { 6773 DRM_ERROR("Failed to store PCI saved state"); 6774 return false; 6775 } 6776 } else { 6777 DRM_WARN("Failed to save PCI state, err:%d\n", r); 6778 return false; 6779 } 6780 6781 return true; 6782 } 6783 6784 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 6785 { 6786 struct drm_device *dev = pci_get_drvdata(pdev); 6787 struct amdgpu_device *adev = drm_to_adev(dev); 6788 int r; 6789 6790 if (!adev->pci_state) 6791 return false; 6792 6793 r = pci_load_saved_state(pdev, adev->pci_state); 6794 6795 if (!r) { 6796 pci_restore_state(pdev); 6797 } else { 6798 DRM_WARN("Failed to load PCI state, err:%d\n", r); 6799 return false; 6800 } 6801 6802 return true; 6803 } 6804 6805 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 6806 struct amdgpu_ring *ring) 6807 { 6808 #ifdef CONFIG_X86_64 6809 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 6810 return; 6811 #endif 6812 if (adev->gmc.xgmi.connected_to_cpu) 6813 return; 6814 6815 if (ring && ring->funcs->emit_hdp_flush) 6816 amdgpu_ring_emit_hdp_flush(ring); 6817 else 6818 amdgpu_asic_flush_hdp(adev, ring); 6819 } 6820 6821 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 6822 struct amdgpu_ring *ring) 6823 { 6824 #ifdef CONFIG_X86_64 6825 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 6826 return; 6827 #endif 6828 if (adev->gmc.xgmi.connected_to_cpu) 6829 return; 6830 6831 amdgpu_asic_invalidate_hdp(adev, ring); 6832 } 6833 6834 int amdgpu_in_reset(struct amdgpu_device *adev) 6835 { 6836 return atomic_read(&adev->reset_domain->in_gpu_reset); 6837 } 6838 6839 /** 6840 * amdgpu_device_halt() - bring hardware to some kind of halt state 6841 * 6842 * @adev: amdgpu_device pointer 6843 * 6844 * Bring hardware to some kind of halt state so that no one can touch it 6845 * any more. It will help to maintain error context when error occurred. 6846 * Compare to a simple hang, the system will keep stable at least for SSH 6847 * access. Then it should be trivial to inspect the hardware state and 6848 * see what's going on. Implemented as following: 6849 * 6850 * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc), 6851 * clears all CPU mappings to device, disallows remappings through page faults 6852 * 2. amdgpu_irq_disable_all() disables all interrupts 6853 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences 6854 * 4. 
set adev->no_hw_access to avoid potential crashes after setp 5 6855 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings 6856 * 6. pci_disable_device() and pci_wait_for_pending_transaction() 6857 * flush any in flight DMA operations 6858 */ 6859 void amdgpu_device_halt(struct amdgpu_device *adev) 6860 { 6861 struct pci_dev *pdev = adev->pdev; 6862 struct drm_device *ddev = adev_to_drm(adev); 6863 6864 amdgpu_xcp_dev_unplug(adev); 6865 drm_dev_unplug(ddev); 6866 6867 amdgpu_irq_disable_all(adev); 6868 6869 amdgpu_fence_driver_hw_fini(adev); 6870 6871 adev->no_hw_access = true; 6872 6873 amdgpu_device_unmap_mmio(adev); 6874 6875 pci_disable_device(pdev); 6876 pci_wait_for_pending_transaction(pdev); 6877 } 6878 6879 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev, 6880 u32 reg) 6881 { 6882 unsigned long flags, address, data; 6883 u32 r; 6884 6885 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 6886 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 6887 6888 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 6889 WREG32(address, reg * 4); 6890 (void)RREG32(address); 6891 r = RREG32(data); 6892 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 6893 return r; 6894 } 6895 6896 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev, 6897 u32 reg, u32 v) 6898 { 6899 unsigned long flags, address, data; 6900 6901 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 6902 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 6903 6904 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 6905 WREG32(address, reg * 4); 6906 (void)RREG32(address); 6907 WREG32(data, v); 6908 (void)RREG32(data); 6909 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 6910 } 6911 6912 /** 6913 * amdgpu_device_get_gang - return a reference to the current gang 6914 * @adev: amdgpu_device pointer 6915 * 6916 * Returns: A new reference to the current gang leader. 6917 */ 6918 struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev) 6919 { 6920 struct dma_fence *fence; 6921 6922 rcu_read_lock(); 6923 fence = dma_fence_get_rcu_safe(&adev->gang_submit); 6924 rcu_read_unlock(); 6925 return fence; 6926 } 6927 6928 /** 6929 * amdgpu_device_switch_gang - switch to a new gang 6930 * @adev: amdgpu_device pointer 6931 * @gang: the gang to switch to 6932 * 6933 * Try to switch to a new gang. 6934 * Returns: NULL if we switched to the new gang or a reference to the current 6935 * gang leader. 6936 */ 6937 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev, 6938 struct dma_fence *gang) 6939 { 6940 struct dma_fence *old = NULL; 6941 6942 dma_fence_get(gang); 6943 do { 6944 dma_fence_put(old); 6945 old = amdgpu_device_get_gang(adev); 6946 if (old == gang) 6947 break; 6948 6949 if (!dma_fence_is_signaled(old)) { 6950 dma_fence_put(gang); 6951 return old; 6952 } 6953 6954 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit, 6955 old, gang) != old); 6956 6957 /* 6958 * Drop it once for the exchanged reference in adev and once for the 6959 * thread local reference acquired in amdgpu_device_get_gang(). 6960 */ 6961 dma_fence_put(old); 6962 dma_fence_put(old); 6963 return NULL; 6964 } 6965 6966 /** 6967 * amdgpu_device_enforce_isolation - enforce HW isolation 6968 * @adev: the amdgpu device pointer 6969 * @ring: the HW ring the job is supposed to run on 6970 * @job: the job which is about to be pushed to the HW ring 6971 * 6972 * Makes sure that only one client at a time can use the GFX block. 
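 * This is done by remembering the current owner and a "spearhead" fence per
 * partition and by making submissions from a different owner wait for the
 * previous owner's work (tracked in the prev sync object) to complete first.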
 * Returns: The dependency to wait on before the job can be pushed to the HW.
 * The function is called multiple times until NULL is returned.
 */
struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev,
						  struct amdgpu_ring *ring,
						  struct amdgpu_job *job)
{
	struct amdgpu_isolation *isolation = &adev->isolation[ring->xcp_id];
	struct drm_sched_fence *f = job->base.s_fence;
	struct dma_fence *dep;
	void *owner;
	int r;

	/*
	 * For now enforce isolation only for the GFX block since we only need
	 * the cleaner shader on those rings.
	 */
	if (ring->funcs->type != AMDGPU_RING_TYPE_GFX &&
	    ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE)
		return NULL;

	/*
	 * All submissions where enforce isolation is false are handled as if
	 * they come from a single client. Use ~0l as the owner to distinguish
	 * it from kernel submissions where the owner is NULL.
	 */
	owner = job->enforce_isolation ? f->owner : (void *)~0l;

	mutex_lock(&adev->enforce_isolation_mutex);

	/*
	 * The "spearhead" submission is the first one which changes the
	 * ownership to its client. We always need to wait for it to be
	 * pushed to the HW before proceeding with anything.
	 */
	if (&f->scheduled != isolation->spearhead &&
	    !dma_fence_is_signaled(isolation->spearhead)) {
		dep = isolation->spearhead;
		goto out_grab_ref;
	}

	if (isolation->owner != owner) {
		/*
		 * Wait for any gang to be assembled before switching to a
		 * different owner or otherwise we could deadlock the
		 * submissions.
		 */
		if (!job->gang_submit) {
			dep = amdgpu_device_get_gang(adev);
			if (!dma_fence_is_signaled(dep))
				goto out_return_dep;
			dma_fence_put(dep);
		}

		dma_fence_put(isolation->spearhead);
		isolation->spearhead = dma_fence_get(&f->scheduled);
		amdgpu_sync_move(&isolation->active, &isolation->prev);
		trace_amdgpu_isolation(isolation->owner, owner);
		isolation->owner = owner;
	}

	/*
	 * Specifying the ring here helps to pipeline submissions even when
	 * isolation is enabled. If that is not desired for testing, NULL can
	 * be used instead of the ring to enforce a CPU round trip while
	 * switching between clients.
	 */
	dep = amdgpu_sync_peek_fence(&isolation->prev, ring);
	r = amdgpu_sync_fence(&isolation->active, &f->finished, GFP_NOWAIT);
	if (r)
		DRM_WARN("OOM tracking isolation\n");

out_grab_ref:
	dma_fence_get(dep);

out_return_dep:
	mutex_unlock(&adev->enforce_isolation_mutex);
	return dep;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
				    uint32_t inst, uint32_t reg_addr, char reg_name[],
				    uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				  inst, reg_name, (uint32_t)expected_value,
				  (uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}

ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
{
	ssize_t size = 0;

	if (!ring || !ring->adev)
		return size;

	if (amdgpu_device_should_recover_gpu(ring->adev))
		size |= AMDGPU_RESET_TYPE_FULL;

	if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
	    !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery)
		size |= AMDGPU_RESET_TYPE_SOFT_RESET;

	return size;
}

ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
{
	ssize_t size = 0;

	if (supported_reset == 0) {
		size += sysfs_emit_at(buf, size, "unsupported");
		size += sysfs_emit_at(buf, size, "\n");
		return size;
	}

	if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
		size += sysfs_emit_at(buf, size, "soft ");

	if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
		size += sysfs_emit_at(buf, size, "queue ");

	if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)
		size += sysfs_emit_at(buf, size, "pipe ");

	if (supported_reset & AMDGPU_RESET_TYPE_FULL)
		size += sysfs_emit_at(buf, size, "full ");

	size += sysfs_emit_at(buf, size, "\n");
	return size;
}
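
/*
 * Example use of the reset mask helpers above.  This is an illustrative
 * sketch only: the real sysfs attributes live in the individual IP block
 * files, and the gfx_supported_reset field name is assumed here rather than
 * defined in this file.
 *
 *   static ssize_t gfx_reset_mask_show(struct device *dev,
 *                                      struct device_attribute *attr,
 *                                      char *buf)
 *   {
 *           struct drm_device *ddev = dev_get_drvdata(dev);
 *           struct amdgpu_device *adev = drm_to_adev(ddev);
 *
 *           if (!adev)
 *                   return -ENODEV;
 *
 *           return amdgpu_show_reset_mask(buf, adev->gfx.gfx_supported_reset);
 *   }
 */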