1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 29 #include <linux/aperture.h> 30 #include <linux/power_supply.h> 31 #include <linux/kthread.h> 32 #include <linux/module.h> 33 #include <linux/console.h> 34 #include <linux/slab.h> 35 #include <linux/iommu.h> 36 #include <linux/pci.h> 37 #include <linux/pci-p2pdma.h> 38 #include <linux/apple-gmux.h> 39 40 #include <drm/drm_atomic_helper.h> 41 #include <drm/drm_client_event.h> 42 #include <drm/drm_crtc_helper.h> 43 #include <drm/drm_probe_helper.h> 44 #include <drm/amdgpu_drm.h> 45 #include <linux/device.h> 46 #include <linux/vgaarb.h> 47 #include <linux/vga_switcheroo.h> 48 #include <linux/efi.h> 49 #include "amdgpu.h" 50 #include "amdgpu_trace.h" 51 #include "amdgpu_i2c.h" 52 #include "atom.h" 53 #include "amdgpu_atombios.h" 54 #include "amdgpu_atomfirmware.h" 55 #include "amd_pcie.h" 56 #ifdef CONFIG_DRM_AMDGPU_SI 57 #include "si.h" 58 #endif 59 #ifdef CONFIG_DRM_AMDGPU_CIK 60 #include "cik.h" 61 #endif 62 #include "vi.h" 63 #include "soc15.h" 64 #include "nv.h" 65 #include "bif/bif_4_1_d.h" 66 #include <linux/firmware.h> 67 #include "amdgpu_vf_error.h" 68 69 #include "amdgpu_amdkfd.h" 70 #include "amdgpu_pm.h" 71 72 #include "amdgpu_xgmi.h" 73 #include "amdgpu_ras.h" 74 #include "amdgpu_ras_mgr.h" 75 #include "amdgpu_pmu.h" 76 #include "amdgpu_fru_eeprom.h" 77 #include "amdgpu_reset.h" 78 #include "amdgpu_virt.h" 79 #include "amdgpu_dev_coredump.h" 80 81 #include <linux/suspend.h> 82 #include <drm/task_barrier.h> 83 #include <linux/pm_runtime.h> 84 85 #include <drm/drm_drv.h> 86 87 #if IS_ENABLED(CONFIG_X86) 88 #include <asm/intel-family.h> 89 #include <asm/cpu_device_id.h> 90 #endif 91 92 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 93 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 94 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 95 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 96 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 97 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 98 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 99 MODULE_FIRMWARE("amdgpu/cyan_skillfish_gpu_info.bin"); 100 101 #define AMDGPU_RESUME_MS 2000 102 #define AMDGPU_MAX_RETRY_LIMIT 2 103 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 104 #define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2) 
105 #define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2) 106 #define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2) 107 108 #define AMDGPU_VBIOS_SKIP (1U << 0) 109 #define AMDGPU_VBIOS_OPTIONAL (1U << 1) 110 111 static const struct drm_driver amdgpu_kms_driver; 112 113 const char *amdgpu_asic_name[] = { 114 "TAHITI", 115 "PITCAIRN", 116 "VERDE", 117 "OLAND", 118 "HAINAN", 119 "BONAIRE", 120 "KAVERI", 121 "KABINI", 122 "HAWAII", 123 "MULLINS", 124 "TOPAZ", 125 "TONGA", 126 "FIJI", 127 "CARRIZO", 128 "STONEY", 129 "POLARIS10", 130 "POLARIS11", 131 "POLARIS12", 132 "VEGAM", 133 "VEGA10", 134 "VEGA12", 135 "VEGA20", 136 "RAVEN", 137 "ARCTURUS", 138 "RENOIR", 139 "ALDEBARAN", 140 "NAVI10", 141 "CYAN_SKILLFISH", 142 "NAVI14", 143 "NAVI12", 144 "SIENNA_CICHLID", 145 "NAVY_FLOUNDER", 146 "VANGOGH", 147 "DIMGREY_CAVEFISH", 148 "BEIGE_GOBY", 149 "YELLOW_CARP", 150 "IP DISCOVERY", 151 "LAST", 152 }; 153 154 #define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0) 155 /* 156 * Default init level where all blocks are expected to be initialized. This is 157 * the level of initialization expected by default and also after a full reset 158 * of the device. 159 */ 160 struct amdgpu_init_level amdgpu_init_default = { 161 .level = AMDGPU_INIT_LEVEL_DEFAULT, 162 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL, 163 }; 164 165 struct amdgpu_init_level amdgpu_init_recovery = { 166 .level = AMDGPU_INIT_LEVEL_RESET_RECOVERY, 167 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL, 168 }; 169 170 /* 171 * Minimal blocks needed to be initialized before a XGMI hive can be reset. This 172 * is used for cases like reset on initialization where the entire hive needs to 173 * be reset before first use. 174 */ 175 struct amdgpu_init_level amdgpu_init_minimal_xgmi = { 176 .level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI, 177 .hwini_ip_block_mask = 178 BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) | 179 BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) | 180 BIT(AMD_IP_BLOCK_TYPE_PSP) 181 }; 182 183 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev); 184 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev); 185 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev); 186 187 static void amdgpu_device_load_switch_state(struct amdgpu_device *adev); 188 189 static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev, 190 enum amd_ip_block_type block) 191 { 192 return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0; 193 } 194 195 void amdgpu_set_init_level(struct amdgpu_device *adev, 196 enum amdgpu_init_lvl_id lvl) 197 { 198 switch (lvl) { 199 case AMDGPU_INIT_LEVEL_MINIMAL_XGMI: 200 adev->init_lvl = &amdgpu_init_minimal_xgmi; 201 break; 202 case AMDGPU_INIT_LEVEL_RESET_RECOVERY: 203 adev->init_lvl = &amdgpu_init_recovery; 204 break; 205 case AMDGPU_INIT_LEVEL_DEFAULT: 206 fallthrough; 207 default: 208 adev->init_lvl = &amdgpu_init_default; 209 break; 210 } 211 } 212 213 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev); 214 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 215 void *data); 216 217 /** 218 * DOC: pcie_replay_count 219 * 220 * The amdgpu driver provides a sysfs API for reporting the total number 221 * of PCIe replays (NAKs). 222 * The file pcie_replay_count is used for this and returns the total 223 * number of replays as a sum of the NAKs generated and NAKs received. 
224 */ 225 226 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 227 struct device_attribute *attr, char *buf) 228 { 229 struct drm_device *ddev = dev_get_drvdata(dev); 230 struct amdgpu_device *adev = drm_to_adev(ddev); 231 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 232 233 return sysfs_emit(buf, "%llu\n", cnt); 234 } 235 236 static DEVICE_ATTR(pcie_replay_count, 0444, 237 amdgpu_device_get_pcie_replay_count, NULL); 238 239 static int amdgpu_device_attr_sysfs_init(struct amdgpu_device *adev) 240 { 241 int ret = 0; 242 243 if (amdgpu_nbio_is_replay_cnt_supported(adev)) 244 ret = sysfs_create_file(&adev->dev->kobj, 245 &dev_attr_pcie_replay_count.attr); 246 247 return ret; 248 } 249 250 static void amdgpu_device_attr_sysfs_fini(struct amdgpu_device *adev) 251 { 252 if (amdgpu_nbio_is_replay_cnt_supported(adev)) 253 sysfs_remove_file(&adev->dev->kobj, 254 &dev_attr_pcie_replay_count.attr); 255 } 256 257 static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj, 258 const struct bin_attribute *attr, char *buf, 259 loff_t ppos, size_t count) 260 { 261 struct device *dev = kobj_to_dev(kobj); 262 struct drm_device *ddev = dev_get_drvdata(dev); 263 struct amdgpu_device *adev = drm_to_adev(ddev); 264 ssize_t bytes_read; 265 266 switch (ppos) { 267 case AMDGPU_SYS_REG_STATE_XGMI: 268 bytes_read = amdgpu_asic_get_reg_state( 269 adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count); 270 break; 271 case AMDGPU_SYS_REG_STATE_WAFL: 272 bytes_read = amdgpu_asic_get_reg_state( 273 adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count); 274 break; 275 case AMDGPU_SYS_REG_STATE_PCIE: 276 bytes_read = amdgpu_asic_get_reg_state( 277 adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count); 278 break; 279 case AMDGPU_SYS_REG_STATE_USR: 280 bytes_read = amdgpu_asic_get_reg_state( 281 adev, AMDGPU_REG_STATE_TYPE_USR, buf, count); 282 break; 283 case AMDGPU_SYS_REG_STATE_USR_1: 284 bytes_read = amdgpu_asic_get_reg_state( 285 adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count); 286 break; 287 default: 288 return -EINVAL; 289 } 290 291 return bytes_read; 292 } 293 294 static const BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL, 295 AMDGPU_SYS_REG_STATE_END); 296 297 int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev) 298 { 299 int ret; 300 301 if (!amdgpu_asic_get_reg_state_supported(adev)) 302 return 0; 303 304 ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 305 306 return ret; 307 } 308 309 void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev) 310 { 311 if (!amdgpu_asic_get_reg_state_supported(adev)) 312 return; 313 sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 314 } 315 316 int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block) 317 { 318 int r; 319 320 if (ip_block->version->funcs->suspend) { 321 r = ip_block->version->funcs->suspend(ip_block); 322 if (r) { 323 dev_err(ip_block->adev->dev, 324 "suspend of IP block <%s> failed %d\n", 325 ip_block->version->funcs->name, r); 326 return r; 327 } 328 } 329 330 ip_block->status.hw = false; 331 return 0; 332 } 333 334 int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block) 335 { 336 int r; 337 338 if (ip_block->version->funcs->resume) { 339 r = ip_block->version->funcs->resume(ip_block); 340 if (r) { 341 dev_err(ip_block->adev->dev, 342 "resume of IP block <%s> failed %d\n", 343 ip_block->version->funcs->name, r); 344 return r; 345 } 346 } 347 348 ip_block->status.hw = true; 349 return 0; 350 } 351 352 /** 353 * DOC: board_info 354 * 355 * The amdgpu driver provides a sysfs 
API for giving board related information. 356 * It provides the form factor information in the format 357 * 358 * type : form factor 359 * 360 * Possible form factor values 361 * 362 * - "cem" - PCIE CEM card 363 * - "oam" - Open Compute Accelerator Module 364 * - "unknown" - Not known 365 * 366 */ 367 368 static ssize_t amdgpu_device_get_board_info(struct device *dev, 369 struct device_attribute *attr, 370 char *buf) 371 { 372 struct drm_device *ddev = dev_get_drvdata(dev); 373 struct amdgpu_device *adev = drm_to_adev(ddev); 374 enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM; 375 const char *pkg; 376 377 if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type) 378 pkg_type = adev->smuio.funcs->get_pkg_type(adev); 379 380 switch (pkg_type) { 381 case AMDGPU_PKG_TYPE_CEM: 382 pkg = "cem"; 383 break; 384 case AMDGPU_PKG_TYPE_OAM: 385 pkg = "oam"; 386 break; 387 default: 388 pkg = "unknown"; 389 break; 390 } 391 392 return sysfs_emit(buf, "%s : %s\n", "type", pkg); 393 } 394 395 static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL); 396 397 static struct attribute *amdgpu_board_attrs[] = { 398 &dev_attr_board_info.attr, 399 NULL, 400 }; 401 402 static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj, 403 struct attribute *attr, int n) 404 { 405 struct device *dev = kobj_to_dev(kobj); 406 struct drm_device *ddev = dev_get_drvdata(dev); 407 struct amdgpu_device *adev = drm_to_adev(ddev); 408 409 if (adev->flags & AMD_IS_APU) 410 return 0; 411 412 return attr->mode; 413 } 414 415 static const struct attribute_group amdgpu_board_attrs_group = { 416 .attrs = amdgpu_board_attrs, 417 .is_visible = amdgpu_board_attrs_is_visible 418 }; 419 420 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 421 422 /** 423 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 424 * 425 * @adev: amdgpu device pointer 426 * 427 * Returns true if the device is a dGPU with ATPX power control, 428 * otherwise return false. 429 */ 430 bool amdgpu_device_supports_px(struct amdgpu_device *adev) 431 { 432 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 433 return true; 434 return false; 435 } 436 437 /** 438 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 439 * 440 * @adev: amdgpu device pointer 441 * 442 * Returns true if the device is a dGPU with ACPI power control, 443 * otherwise return false. 444 */ 445 bool amdgpu_device_supports_boco(struct amdgpu_device *adev) 446 { 447 if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE)) 448 return false; 449 450 if (adev->has_pr3 || 451 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 452 return true; 453 return false; 454 } 455 456 /** 457 * amdgpu_device_supports_baco - Does the device support BACO 458 * 459 * @adev: amdgpu device pointer 460 * 461 * Return: 462 * 1 if the device supports BACO; 463 * 3 if the device supports MACO (only works if BACO is supported) 464 * otherwise return 0. 
 */
int amdgpu_device_supports_baco(struct amdgpu_device *adev)
{
	return amdgpu_asic_supports_baco(adev);
}

void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
{
	int bamaco_support;

	adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
	bamaco_support = amdgpu_device_supports_baco(adev);

	switch (amdgpu_runtime_pm) {
	case 2:
		if (bamaco_support & MACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
			dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
		} else if (bamaco_support == BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Requested mode BAMACO not available, falling back to BACO\n");
		}
		break;
	case 1:
		if (bamaco_support & BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Forcing BACO for runtime pm\n");
		}
		break;
	case -1:
	case -2:
		if (amdgpu_device_supports_px(adev)) {
			/* enable PX as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
			dev_info(adev->dev, "Using ATPX for runtime pm\n");
		} else if (amdgpu_device_supports_boco(adev)) {
			/* enable BOCO as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
			dev_info(adev->dev, "Using BOCO for runtime pm\n");
		} else {
			if (!bamaco_support)
				goto no_runtime_pm;

			switch (adev->asic_type) {
			case CHIP_VEGA20:
			case CHIP_ARCTURUS:
				/* BACO is not supported on vega20 and arcturus */
				break;
			case CHIP_VEGA10:
				/* enable BACO as runpm mode if noretry=0 */
				if (!adev->gmc.noretry && !amdgpu_passthrough(adev))
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			default:
				/* enable BACO as runpm mode on CI+ */
				if (!amdgpu_passthrough(adev))
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			}

			if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
				if (bamaco_support & MACO_SUPPORT) {
					adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
					dev_info(adev->dev, "Using BAMACO for runtime pm\n");
				} else {
					dev_info(adev->dev, "Using BACO for runtime pm\n");
				}
			}
		}
		break;
	case 0:
		dev_info(adev->dev, "runtime pm is manually disabled\n");
		break;
	default:
		break;
	}

no_runtime_pm:
	if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
		dev_info(adev->dev, "Runtime PM not available\n");
}
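/*
 * Illustrative sketch only (not used by the driver): how a caller might
 * interpret the BACO/MACO support flags returned by
 * amdgpu_device_supports_baco(), assuming the BACO_SUPPORT and MACO_SUPPORT
 * bit definitions used in amdgpu_device_detect_runtime_pm_mode() above.
 */
static inline bool amdgpu_device_example_has_bamaco(struct amdgpu_device *adev)
{
	int bamaco_support = amdgpu_device_supports_baco(adev);

	/* MACO is only meaningful when BACO itself is supported */
	return (bamaco_support & BACO_SUPPORT) &&
	       (bamaco_support & MACO_SUPPORT);
}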
/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * Smart Shift support
 *
 * @adev: amdgpu device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct amdgpu_device *adev)
{
	return (amdgpu_device_supports_boco(adev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size in bytes; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size in bytes; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * Returns the number of bytes transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size in bytes; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the VRAM aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}
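/*
 * Illustrative sketch only (not used by the driver): reading a single dword
 * back from VRAM with the helper above. amdgpu_device_vram_access() tries
 * the CPU-visible aperture first and transparently falls back to the
 * MM_INDEX/MM_DATA path for any remainder; @pos and the size must be dword
 * aligned for the MM path.
 */
static inline uint32_t amdgpu_device_example_vram_rdword(struct amdgpu_device *adev,
							 loff_t pos)
{
	uint32_t val = 0;

	amdgpu_device_vram_access(adev, pos, &val, sizeof(val), false);
	return val;
}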
/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
				uint32_t reg, uint32_t acc_flags,
				uint32_t xcc_id)
{
	uint32_t ret, rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	return ret;
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
830 */ 831 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) 832 { 833 if (amdgpu_device_skip_hw_access(adev)) 834 return; 835 836 if (offset < adev->rmmio_size) 837 writeb(value, adev->rmmio + offset); 838 else 839 BUG(); 840 } 841 842 /** 843 * amdgpu_device_wreg - write to a memory mapped IO or indirect register 844 * 845 * @adev: amdgpu_device pointer 846 * @reg: dword aligned register offset 847 * @v: 32 bit value to write to the register 848 * @acc_flags: access flags which require special behavior 849 * 850 * Writes the value specified to the offset specified. 851 */ 852 void amdgpu_device_wreg(struct amdgpu_device *adev, 853 uint32_t reg, uint32_t v, 854 uint32_t acc_flags) 855 { 856 if (amdgpu_device_skip_hw_access(adev)) 857 return; 858 859 if ((reg * 4) < adev->rmmio_size) { 860 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 861 amdgpu_sriov_runtime(adev) && 862 down_read_trylock(&adev->reset_domain->sem)) { 863 amdgpu_kiq_wreg(adev, reg, v, 0); 864 up_read(&adev->reset_domain->sem); 865 } else { 866 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 867 } 868 } else { 869 adev->pcie_wreg(adev, reg * 4, v); 870 } 871 872 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 873 } 874 875 /** 876 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 877 * 878 * @adev: amdgpu_device pointer 879 * @reg: mmio/rlc register 880 * @v: value to write 881 * @xcc_id: xcc accelerated compute core id 882 * 883 * this function is invoked only for the debugfs register access 884 */ 885 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 886 uint32_t reg, uint32_t v, 887 uint32_t xcc_id) 888 { 889 if (amdgpu_device_skip_hw_access(adev)) 890 return; 891 892 if (amdgpu_sriov_fullaccess(adev) && 893 adev->gfx.rlc.funcs && 894 adev->gfx.rlc.funcs->is_rlcg_access_range) { 895 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 896 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id); 897 } else if ((reg * 4) >= adev->rmmio_size) { 898 adev->pcie_wreg(adev, reg * 4, v); 899 } else { 900 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 901 } 902 } 903 904 /** 905 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC 906 * 907 * @adev: amdgpu_device pointer 908 * @reg: dword aligned register offset 909 * @v: 32 bit value to write to the register 910 * @acc_flags: access flags which require special behavior 911 * @xcc_id: xcc accelerated compute core id 912 * 913 * Writes the value specified to the offset specified. 
914 */ 915 void amdgpu_device_xcc_wreg(struct amdgpu_device *adev, 916 uint32_t reg, uint32_t v, 917 uint32_t acc_flags, uint32_t xcc_id) 918 { 919 uint32_t rlcg_flag; 920 921 if (amdgpu_device_skip_hw_access(adev)) 922 return; 923 924 if ((reg * 4) < adev->rmmio_size) { 925 if (amdgpu_sriov_vf(adev) && 926 !amdgpu_sriov_runtime(adev) && 927 adev->gfx.rlc.rlcg_reg_access_supported && 928 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, 929 GC_HWIP, true, 930 &rlcg_flag)) { 931 amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id)); 932 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 933 amdgpu_sriov_runtime(adev) && 934 down_read_trylock(&adev->reset_domain->sem)) { 935 amdgpu_kiq_wreg(adev, reg, v, xcc_id); 936 up_read(&adev->reset_domain->sem); 937 } else { 938 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 939 } 940 } else { 941 adev->pcie_wreg(adev, reg * 4, v); 942 } 943 } 944 945 /** 946 * amdgpu_device_indirect_rreg - read an indirect register 947 * 948 * @adev: amdgpu_device pointer 949 * @reg_addr: indirect register address to read from 950 * 951 * Returns the value of indirect register @reg_addr 952 */ 953 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 954 u32 reg_addr) 955 { 956 unsigned long flags, pcie_index, pcie_data; 957 void __iomem *pcie_index_offset; 958 void __iomem *pcie_data_offset; 959 u32 r; 960 961 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 962 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 963 964 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 965 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 966 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 967 968 writel(reg_addr, pcie_index_offset); 969 readl(pcie_index_offset); 970 r = readl(pcie_data_offset); 971 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 972 973 return r; 974 } 975 976 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev, 977 u64 reg_addr) 978 { 979 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 980 u32 r; 981 void __iomem *pcie_index_offset; 982 void __iomem *pcie_index_hi_offset; 983 void __iomem *pcie_data_offset; 984 985 if (unlikely(!adev->nbio.funcs)) { 986 pcie_index = AMDGPU_PCIE_INDEX_FALLBACK; 987 pcie_data = AMDGPU_PCIE_DATA_FALLBACK; 988 } else { 989 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 990 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 991 } 992 993 if (reg_addr >> 32) { 994 if (unlikely(!adev->nbio.funcs)) 995 pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK; 996 else 997 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 998 } else { 999 pcie_index_hi = 0; 1000 } 1001 1002 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1003 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1004 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1005 if (pcie_index_hi != 0) 1006 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1007 pcie_index_hi * 4; 1008 1009 writel(reg_addr, pcie_index_offset); 1010 readl(pcie_index_offset); 1011 if (pcie_index_hi != 0) { 1012 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1013 readl(pcie_index_hi_offset); 1014 } 1015 r = readl(pcie_data_offset); 1016 1017 /* clear the high bits */ 1018 if (pcie_index_hi != 0) { 1019 writel(0, pcie_index_hi_offset); 1020 readl(pcie_index_hi_offset); 1021 } 1022 1023 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1024 1025 return r; 1026 } 1027 1028 /** 1029 * amdgpu_device_indirect_rreg64 - read a 64bits indirect 
register 1030 * 1031 * @adev: amdgpu_device pointer 1032 * @reg_addr: indirect register address to read from 1033 * 1034 * Returns the value of indirect register @reg_addr 1035 */ 1036 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 1037 u32 reg_addr) 1038 { 1039 unsigned long flags, pcie_index, pcie_data; 1040 void __iomem *pcie_index_offset; 1041 void __iomem *pcie_data_offset; 1042 u64 r; 1043 1044 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1045 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1046 1047 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1048 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1049 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1050 1051 /* read low 32 bits */ 1052 writel(reg_addr, pcie_index_offset); 1053 readl(pcie_index_offset); 1054 r = readl(pcie_data_offset); 1055 /* read high 32 bits */ 1056 writel(reg_addr + 4, pcie_index_offset); 1057 readl(pcie_index_offset); 1058 r |= ((u64)readl(pcie_data_offset) << 32); 1059 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1060 1061 return r; 1062 } 1063 1064 u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev, 1065 u64 reg_addr) 1066 { 1067 unsigned long flags, pcie_index, pcie_data; 1068 unsigned long pcie_index_hi = 0; 1069 void __iomem *pcie_index_offset; 1070 void __iomem *pcie_index_hi_offset; 1071 void __iomem *pcie_data_offset; 1072 u64 r; 1073 1074 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1075 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1076 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1077 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1078 1079 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1080 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1081 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1082 if (pcie_index_hi != 0) 1083 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1084 pcie_index_hi * 4; 1085 1086 /* read low 32 bits */ 1087 writel(reg_addr, pcie_index_offset); 1088 readl(pcie_index_offset); 1089 if (pcie_index_hi != 0) { 1090 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1091 readl(pcie_index_hi_offset); 1092 } 1093 r = readl(pcie_data_offset); 1094 /* read high 32 bits */ 1095 writel(reg_addr + 4, pcie_index_offset); 1096 readl(pcie_index_offset); 1097 if (pcie_index_hi != 0) { 1098 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1099 readl(pcie_index_hi_offset); 1100 } 1101 r |= ((u64)readl(pcie_data_offset) << 32); 1102 1103 /* clear the high bits */ 1104 if (pcie_index_hi != 0) { 1105 writel(0, pcie_index_hi_offset); 1106 readl(pcie_index_hi_offset); 1107 } 1108 1109 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1110 1111 return r; 1112 } 1113 1114 /** 1115 * amdgpu_device_indirect_wreg - write an indirect register address 1116 * 1117 * @adev: amdgpu_device pointer 1118 * @reg_addr: indirect register offset 1119 * @reg_data: indirect register data 1120 * 1121 */ 1122 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 1123 u32 reg_addr, u32 reg_data) 1124 { 1125 unsigned long flags, pcie_index, pcie_data; 1126 void __iomem *pcie_index_offset; 1127 void __iomem *pcie_data_offset; 1128 1129 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1130 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1131 1132 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1133 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1134 
pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1135 1136 writel(reg_addr, pcie_index_offset); 1137 readl(pcie_index_offset); 1138 writel(reg_data, pcie_data_offset); 1139 readl(pcie_data_offset); 1140 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1141 } 1142 1143 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev, 1144 u64 reg_addr, u32 reg_data) 1145 { 1146 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 1147 void __iomem *pcie_index_offset; 1148 void __iomem *pcie_index_hi_offset; 1149 void __iomem *pcie_data_offset; 1150 1151 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1152 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1153 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1154 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1155 else 1156 pcie_index_hi = 0; 1157 1158 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1159 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1160 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1161 if (pcie_index_hi != 0) 1162 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1163 pcie_index_hi * 4; 1164 1165 writel(reg_addr, pcie_index_offset); 1166 readl(pcie_index_offset); 1167 if (pcie_index_hi != 0) { 1168 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1169 readl(pcie_index_hi_offset); 1170 } 1171 writel(reg_data, pcie_data_offset); 1172 readl(pcie_data_offset); 1173 1174 /* clear the high bits */ 1175 if (pcie_index_hi != 0) { 1176 writel(0, pcie_index_hi_offset); 1177 readl(pcie_index_hi_offset); 1178 } 1179 1180 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1181 } 1182 1183 /** 1184 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 1185 * 1186 * @adev: amdgpu_device pointer 1187 * @reg_addr: indirect register offset 1188 * @reg_data: indirect register data 1189 * 1190 */ 1191 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 1192 u32 reg_addr, u64 reg_data) 1193 { 1194 unsigned long flags, pcie_index, pcie_data; 1195 void __iomem *pcie_index_offset; 1196 void __iomem *pcie_data_offset; 1197 1198 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1199 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1200 1201 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1202 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1203 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1204 1205 /* write low 32 bits */ 1206 writel(reg_addr, pcie_index_offset); 1207 readl(pcie_index_offset); 1208 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1209 readl(pcie_data_offset); 1210 /* write high 32 bits */ 1211 writel(reg_addr + 4, pcie_index_offset); 1212 readl(pcie_index_offset); 1213 writel((u32)(reg_data >> 32), pcie_data_offset); 1214 readl(pcie_data_offset); 1215 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1216 } 1217 1218 void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev, 1219 u64 reg_addr, u64 reg_data) 1220 { 1221 unsigned long flags, pcie_index, pcie_data; 1222 unsigned long pcie_index_hi = 0; 1223 void __iomem *pcie_index_offset; 1224 void __iomem *pcie_index_hi_offset; 1225 void __iomem *pcie_data_offset; 1226 1227 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1228 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1229 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1230 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1231 
	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
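/*
 * Illustrative sketch only (not used by the driver): a read-modify-write of
 * a register behind the PCIE index/data pair using the indirect helpers
 * above. The @mask and @value parameters are hypothetical; the helpers
 * themselves serialize on adev->pcie_idx_lock, so no extra locking is
 * needed here.
 */
static inline void amdgpu_device_example_indirect_rmw(struct amdgpu_device *adev,
						      u32 reg_addr, u32 mask,
						      u32 value)
{
	u32 tmp = amdgpu_device_indirect_rreg(adev, reg_addr);

	tmp = (tmp & ~mask) | (value & mask);
	amdgpu_device_indirect_wreg(adev, reg_addr, tmp);
}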
/**
 * amdgpu_device_get_rev_id - query device rev_id
 *
 * @adev: amdgpu_device pointer
 *
 * Return device rev_id
 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
{
	return adev->nbio.funcs->get_rev_id(adev);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	dev_err(adev->dev, "Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
{
	dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write register 0x%04X with 0x%08X\n", reg,
		v);
	BUG();
}

static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write register 0x%llX with 0x%08X\n", reg,
		v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	dev_err(adev->dev, "Invalid callback to read 64 bit register 0x%04X\n",
		reg);
	BUG();
	return 0;
}

static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
{
	dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy 64 bit reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		reg, v);
	BUG();
}

static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
		reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	dev_err(adev->dev,
		"Invalid callback to read register 0x%04X in block 0x%04X\n",
		reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	dev_err(adev->dev,
		"Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		reg, block, v);
	BUG();
}

static uint32_t amdgpu_device_get_vbios_flags(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
		return AMDGPU_VBIOS_SKIP;

	if (hweight32(adev->aid_mask) && amdgpu_passthrough(adev))
		return AMDGPU_VBIOS_OPTIONAL;

	return 0;
}
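/*
 * Illustrative sketch only (not used by the driver): how the VBIOS flags
 * returned above are typically consumed. This mirrors the checks done in
 * amdgpu_device_asic_init() and amdgpu_device_need_post() below.
 */
static inline bool amdgpu_device_example_vbios_required(struct amdgpu_device *adev)
{
	uint32_t flags = amdgpu_device_get_vbios_flags(adev);

	if (flags & AMDGPU_VBIOS_SKIP)
		return false;
	/* optional VBIOS: proceed without it when no image was found */
	if ((flags & AMDGPU_VBIOS_OPTIONAL) && !adev->bios)
		return false;

	return true;
}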
/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	uint32_t flags;
	bool optional;
	int ret;

	amdgpu_asic_pre_asic_init(adev);
	flags = amdgpu_device_get_vbios_flags(adev);
	optional = !!(flags & (AMDGPU_VBIOS_OPTIONAL | AMDGPU_VBIOS_SKIP));

	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
		amdgpu_psp_wait_for_bootloader(adev);
		if (optional && !adev->bios)
			return 0;

		ret = amdgpu_atomfirmware_asic_init(adev, true);
		return ret;
	} else {
		if (optional && !adev->bios)
			return 0;

		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
	}

	return 0;
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
1558 */ 1559 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 1560 { 1561 return pci_reset_function(adev->pdev); 1562 } 1563 1564 /* 1565 * amdgpu_device_wb_*() 1566 * Writeback is the method by which the GPU updates special pages in memory 1567 * with the status of certain GPU events (fences, ring pointers,etc.). 1568 */ 1569 1570 /** 1571 * amdgpu_device_wb_fini - Disable Writeback and free memory 1572 * 1573 * @adev: amdgpu_device pointer 1574 * 1575 * Disables Writeback and frees the Writeback memory (all asics). 1576 * Used at driver shutdown. 1577 */ 1578 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1579 { 1580 if (adev->wb.wb_obj) { 1581 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1582 &adev->wb.gpu_addr, 1583 (void **)&adev->wb.wb); 1584 adev->wb.wb_obj = NULL; 1585 } 1586 } 1587 1588 /** 1589 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory 1590 * 1591 * @adev: amdgpu_device pointer 1592 * 1593 * Initializes writeback and allocates writeback memory (all asics). 1594 * Used at driver startup. 1595 * Returns 0 on success or an -error on failure. 1596 */ 1597 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1598 { 1599 int r; 1600 1601 if (adev->wb.wb_obj == NULL) { 1602 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1603 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1604 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1605 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1606 (void **)&adev->wb.wb); 1607 if (r) { 1608 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1609 return r; 1610 } 1611 1612 adev->wb.num_wb = AMDGPU_MAX_WB; 1613 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1614 1615 /* clear wb memory */ 1616 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1617 } 1618 1619 return 0; 1620 } 1621 1622 /** 1623 * amdgpu_device_wb_get - Allocate a wb entry 1624 * 1625 * @adev: amdgpu_device pointer 1626 * @wb: wb index 1627 * 1628 * Allocate a wb slot for use by the driver (all asics). 1629 * Returns 0 on success or -EINVAL on failure. 1630 */ 1631 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1632 { 1633 unsigned long flags, offset; 1634 1635 spin_lock_irqsave(&adev->wb.lock, flags); 1636 offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1637 if (offset < adev->wb.num_wb) { 1638 __set_bit(offset, adev->wb.used); 1639 spin_unlock_irqrestore(&adev->wb.lock, flags); 1640 *wb = offset << 3; /* convert to dw offset */ 1641 return 0; 1642 } else { 1643 spin_unlock_irqrestore(&adev->wb.lock, flags); 1644 return -EINVAL; 1645 } 1646 } 1647 1648 /** 1649 * amdgpu_device_wb_free - Free a wb entry 1650 * 1651 * @adev: amdgpu_device pointer 1652 * @wb: wb index 1653 * 1654 * Free a wb slot allocated for use by the driver (all asics) 1655 */ 1656 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1657 { 1658 unsigned long flags; 1659 1660 wb >>= 3; 1661 spin_lock_irqsave(&adev->wb.lock, flags); 1662 if (wb < adev->wb.num_wb) 1663 __clear_bit(wb, adev->wb.used); 1664 spin_unlock_irqrestore(&adev->wb.lock, flags); 1665 } 1666 1667 /** 1668 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1669 * 1670 * @adev: amdgpu_device pointer 1671 * 1672 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1673 * to fail, but if any of the BARs is not accessible after the size we abort 1674 * driver loading by returning -ENODEV. 
1675 */ 1676 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1677 { 1678 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1679 struct pci_bus *root; 1680 struct resource *res; 1681 unsigned int i; 1682 u16 cmd; 1683 int r; 1684 1685 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) 1686 return 0; 1687 1688 /* Bypass for VF */ 1689 if (amdgpu_sriov_vf(adev)) 1690 return 0; 1691 1692 if (!amdgpu_rebar) 1693 return 0; 1694 1695 /* resizing on Dell G5 SE platforms causes problems with runtime pm */ 1696 if ((amdgpu_runtime_pm != 0) && 1697 adev->pdev->vendor == PCI_VENDOR_ID_ATI && 1698 adev->pdev->device == 0x731f && 1699 adev->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL) 1700 return 0; 1701 1702 /* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */ 1703 if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR)) 1704 dev_warn( 1705 adev->dev, 1706 "System can't access extended configuration space, please check!!\n"); 1707 1708 /* skip if the bios has already enabled large BAR */ 1709 if (adev->gmc.real_vram_size && 1710 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1711 return 0; 1712 1713 /* Check if the root BUS has 64bit memory resources */ 1714 root = adev->pdev->bus; 1715 while (root->parent) 1716 root = root->parent; 1717 1718 pci_bus_for_each_resource(root, res, i) { 1719 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1720 res->start > 0x100000000ull) 1721 break; 1722 } 1723 1724 /* Trying to resize is pointless without a root hub window above 4GB */ 1725 if (!res) 1726 return 0; 1727 1728 /* Limit the BAR size to what is available */ 1729 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1730 rbar_size); 1731 1732 /* Disable memory decoding while we change the BAR addresses and size */ 1733 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1734 pci_write_config_word(adev->pdev, PCI_COMMAND, 1735 cmd & ~PCI_COMMAND_MEMORY); 1736 1737 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1738 amdgpu_doorbell_fini(adev); 1739 if (adev->asic_type >= CHIP_BONAIRE) 1740 pci_release_resource(adev->pdev, 2); 1741 1742 pci_release_resource(adev->pdev, 0); 1743 1744 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1745 if (r == -ENOSPC) 1746 dev_info(adev->dev, 1747 "Not enough PCI address space for a large BAR."); 1748 else if (r && r != -ENOTSUPP) 1749 dev_err(adev->dev, "Problem resizing BAR0 (%d).", r); 1750 1751 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1752 1753 /* When the doorbell or fb BAR isn't available we have no chance of 1754 * using the device. 1755 */ 1756 r = amdgpu_doorbell_init(adev); 1757 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1758 return -ENODEV; 1759 1760 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1761 1762 return 0; 1763 } 1764 1765 /* 1766 * GPU helpers function. 1767 */ 1768 /** 1769 * amdgpu_device_need_post - check if the hw need post or not 1770 * 1771 * @adev: amdgpu_device pointer 1772 * 1773 * Check if the asic has been initialized (all asics) at driver startup 1774 * or post is needed if hw reset is performed. 1775 * Returns true if need or false if not. 
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg, flags;

	if (amdgpu_sriov_vf(adev))
		return false;

	flags = amdgpu_device_get_vbios_flags(adev);
	if (flags & AMDGPU_VBIOS_SKIP)
		return false;
	if ((flags & AMDGPU_VBIOS_OPTIONAL) && !adev->bios)
		return false;

	if (amdgpu_passthrough(adev)) {
		/* For FIJI: in the whole-GPU pass-through virtualization case, after a
		 * VM reboot some old SMC firmware still needs the driver to do vPost,
		 * otherwise the GPU hangs. SMC firmware versions above 22.15 don't have
		 * this flaw, so force vPost for SMC versions below 22.15.
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			release_firmware(adev->pm.fw);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/*
 * Check whether seamless boot is supported.
 *
 * So far we only support seamless boot on DCE 3.0 or later.
 * If users report that it works on older ASICs as well, we may
 * loosen this.
 */
bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
{
	switch (amdgpu_seamless) {
	case -1:
		break;
	case 1:
		return true;
	case 0:
		return false;
	default:
		dev_err(adev->dev, "Invalid value for amdgpu.seamless: %d\n",
			amdgpu_seamless);
		return false;
	}

	if (!(adev->flags & AMD_IS_APU))
		return false;

	if (adev->mman.keep_stolen_vga_memory)
		return false;

	return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
}

/*
 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
 * don't support dynamic speed switching. Until we have confirmation from Intel
 * that a specific host supports it, it's safer that we keep it disabled for all.
 *
 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
 */
static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	/* eGPUs change speeds based on USB4 fabric conditions */
	if (dev_is_removable(adev->dev))
		return true;

	if (c->x86_vendor == X86_VENDOR_INTEL)
		return false;
#endif
	return true;
}

static bool amdgpu_device_aspm_support_quirk(struct amdgpu_device *adev)
{
	/* Enabling ASPM causes random hangs on Tahiti and Oland on Zen4.
1891 * It's unclear if this is a platform-specific or GPU-specific issue. 1892 * Disable ASPM on SI for the time being. 1893 */ 1894 if (adev->family == AMDGPU_FAMILY_SI) 1895 return true; 1896 1897 #if IS_ENABLED(CONFIG_X86) 1898 struct cpuinfo_x86 *c = &cpu_data(0); 1899 1900 if (!(amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 0) || 1901 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 1))) 1902 return false; 1903 1904 if (c->x86 == 6 && 1905 adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5) { 1906 switch (c->x86_model) { 1907 case VFM_MODEL(INTEL_ALDERLAKE): 1908 case VFM_MODEL(INTEL_ALDERLAKE_L): 1909 case VFM_MODEL(INTEL_RAPTORLAKE): 1910 case VFM_MODEL(INTEL_RAPTORLAKE_P): 1911 case VFM_MODEL(INTEL_RAPTORLAKE_S): 1912 return true; 1913 default: 1914 return false; 1915 } 1916 } else { 1917 return false; 1918 } 1919 #else 1920 return false; 1921 #endif 1922 } 1923 1924 /** 1925 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1926 * 1927 * @adev: amdgpu_device pointer 1928 * 1929 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1930 * be set for this device. 1931 * 1932 * Returns true if it should be used or false if not. 1933 */ 1934 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1935 { 1936 switch (amdgpu_aspm) { 1937 case -1: 1938 break; 1939 case 0: 1940 return false; 1941 case 1: 1942 return true; 1943 default: 1944 return false; 1945 } 1946 if (adev->flags & AMD_IS_APU) 1947 return false; 1948 if (amdgpu_device_aspm_support_quirk(adev)) 1949 return false; 1950 return pcie_aspm_enabled(adev->pdev); 1951 } 1952 1953 /* if we get transitioned to only one device, take VGA back */ 1954 /** 1955 * amdgpu_device_vga_set_decode - enable/disable vga decode 1956 * 1957 * @pdev: PCI device pointer 1958 * @state: enable/disable vga decode 1959 * 1960 * Enable/disable vga decode (all asics). 1961 * Returns VGA resource flags. 1962 */ 1963 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1964 bool state) 1965 { 1966 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1967 1968 amdgpu_asic_set_vga_state(adev, state); 1969 if (state) 1970 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1971 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1972 else 1973 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1974 } 1975 1976 /** 1977 * amdgpu_device_check_block_size - validate the vm block size 1978 * 1979 * @adev: amdgpu_device pointer 1980 * 1981 * Validates the vm block size specified via module parameter. 1982 * The vm block size defines number of bits in page table versus page directory, 1983 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1984 * page table and the remaining bits are in the page directory. 1985 */ 1986 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1987 { 1988 /* defines number of bits in page table versus page directory, 1989 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1990 * page table and the remaining bits are in the page directory 1991 */ 1992 if (amdgpu_vm_block_size == -1) 1993 return; 1994 1995 if (amdgpu_vm_block_size < 9) { 1996 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1997 amdgpu_vm_block_size); 1998 amdgpu_vm_block_size = -1; 1999 } 2000 } 2001 2002 /** 2003 * amdgpu_device_check_vm_size - validate the vm size 2004 * 2005 * @adev: amdgpu_device pointer 2006 * 2007 * Validates the vm size in GB specified via module parameter. 
2008 * The VM size is the size of the GPU virtual memory space in GB. 2009 */ 2010 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 2011 { 2012 /* no need to check the default value */ 2013 if (amdgpu_vm_size == -1) 2014 return; 2015 2016 if (amdgpu_vm_size < 1) { 2017 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 2018 amdgpu_vm_size); 2019 amdgpu_vm_size = -1; 2020 } 2021 } 2022 2023 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 2024 { 2025 struct sysinfo si; 2026 bool is_os_64 = (sizeof(void *) == 8); 2027 uint64_t total_memory; 2028 uint64_t dram_size_seven_GB = 0x1B8000000; 2029 uint64_t dram_size_three_GB = 0xB8000000; 2030 2031 if (amdgpu_smu_memory_pool_size == 0) 2032 return; 2033 2034 if (!is_os_64) { 2035 dev_warn(adev->dev, "Not 64-bit OS, feature not supported\n"); 2036 goto def_value; 2037 } 2038 si_meminfo(&si); 2039 total_memory = (uint64_t)si.totalram * si.mem_unit; 2040 2041 if ((amdgpu_smu_memory_pool_size == 1) || 2042 (amdgpu_smu_memory_pool_size == 2)) { 2043 if (total_memory < dram_size_three_GB) 2044 goto def_value1; 2045 } else if ((amdgpu_smu_memory_pool_size == 4) || 2046 (amdgpu_smu_memory_pool_size == 8)) { 2047 if (total_memory < dram_size_seven_GB) 2048 goto def_value1; 2049 } else { 2050 dev_warn(adev->dev, "Smu memory pool size not supported\n"); 2051 goto def_value; 2052 } 2053 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 2054 2055 return; 2056 2057 def_value1: 2058 dev_warn(adev->dev, "No enough system memory\n"); 2059 def_value: 2060 adev->pm.smu_prv_buffer_size = 0; 2061 } 2062 2063 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 2064 { 2065 if (!(adev->flags & AMD_IS_APU) || 2066 adev->asic_type < CHIP_RAVEN) 2067 return 0; 2068 2069 switch (adev->asic_type) { 2070 case CHIP_RAVEN: 2071 if (adev->pdev->device == 0x15dd) 2072 adev->apu_flags |= AMD_APU_IS_RAVEN; 2073 if (adev->pdev->device == 0x15d8) 2074 adev->apu_flags |= AMD_APU_IS_PICASSO; 2075 break; 2076 case CHIP_RENOIR: 2077 if ((adev->pdev->device == 0x1636) || 2078 (adev->pdev->device == 0x164c)) 2079 adev->apu_flags |= AMD_APU_IS_RENOIR; 2080 else 2081 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 2082 break; 2083 case CHIP_VANGOGH: 2084 adev->apu_flags |= AMD_APU_IS_VANGOGH; 2085 break; 2086 case CHIP_YELLOW_CARP: 2087 break; 2088 case CHIP_CYAN_SKILLFISH: 2089 if ((adev->pdev->device == 0x13FE) || 2090 (adev->pdev->device == 0x143F)) 2091 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 2092 break; 2093 default: 2094 break; 2095 } 2096 2097 return 0; 2098 } 2099 2100 /** 2101 * amdgpu_device_check_arguments - validate module params 2102 * 2103 * @adev: amdgpu_device pointer 2104 * 2105 * Validates certain module parameters and updates 2106 * the associated values used by the driver (all asics). 
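 *
 * Worked example (illustrative, derived from the checks below): booting with
 * amdgpu.sched_jobs=2 raises the value to the minimum of 4,
 * amdgpu.sched_jobs=6 is rounded up to the next power of two (8), and a gart
 * size below 32M is rejected and falls back to the default (-1).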
2107 */ 2108 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 2109 { 2110 int i; 2111 2112 if (amdgpu_sched_jobs < 4) { 2113 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 2114 amdgpu_sched_jobs); 2115 amdgpu_sched_jobs = 4; 2116 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 2117 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 2118 amdgpu_sched_jobs); 2119 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 2120 } 2121 2122 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 2123 /* gart size must be greater or equal to 32M */ 2124 dev_warn(adev->dev, "gart size (%d) too small\n", 2125 amdgpu_gart_size); 2126 amdgpu_gart_size = -1; 2127 } 2128 2129 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 2130 /* gtt size must be greater or equal to 32M */ 2131 dev_warn(adev->dev, "gtt size (%d) too small\n", 2132 amdgpu_gtt_size); 2133 amdgpu_gtt_size = -1; 2134 } 2135 2136 /* valid range is between 4 and 9 inclusive */ 2137 if (amdgpu_vm_fragment_size != -1 && 2138 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 2139 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 2140 amdgpu_vm_fragment_size = -1; 2141 } 2142 2143 if (amdgpu_sched_hw_submission < 2) { 2144 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 2145 amdgpu_sched_hw_submission); 2146 amdgpu_sched_hw_submission = 2; 2147 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 2148 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 2149 amdgpu_sched_hw_submission); 2150 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 2151 } 2152 2153 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 2154 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 2155 amdgpu_reset_method = -1; 2156 } 2157 2158 amdgpu_device_check_smu_prv_buffer_size(adev); 2159 2160 amdgpu_device_check_vm_size(adev); 2161 2162 amdgpu_device_check_block_size(adev); 2163 2164 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 2165 2166 for (i = 0; i < MAX_XCP; i++) { 2167 switch (amdgpu_enforce_isolation) { 2168 case -1: 2169 case 0: 2170 default: 2171 /* disable */ 2172 adev->enforce_isolation[i] = AMDGPU_ENFORCE_ISOLATION_DISABLE; 2173 break; 2174 case 1: 2175 /* enable */ 2176 adev->enforce_isolation[i] = 2177 AMDGPU_ENFORCE_ISOLATION_ENABLE; 2178 break; 2179 case 2: 2180 /* enable legacy mode */ 2181 adev->enforce_isolation[i] = 2182 AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY; 2183 break; 2184 case 3: 2185 /* enable only process isolation without submitting cleaner shader */ 2186 adev->enforce_isolation[i] = 2187 AMDGPU_ENFORCE_ISOLATION_NO_CLEANER_SHADER; 2188 break; 2189 } 2190 } 2191 2192 return 0; 2193 } 2194 2195 /** 2196 * amdgpu_switcheroo_set_state - set switcheroo state 2197 * 2198 * @pdev: pci dev pointer 2199 * @state: vga_switcheroo state 2200 * 2201 * Callback for the switcheroo driver. Suspends or resumes 2202 * the asics before or after it is powered up using ACPI methods. 
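 *
 * Minimal registration sketch (illustrative; this is not the exact call site
 * used during device init, and error handling is omitted): the callback only
 * takes effect once the client ops are registered with vga_switcheroo, e.g.
 *
 *   bool px = amdgpu_device_supports_px(adev);
 *
 *   vga_switcheroo_register_client(adev->pdev, &amdgpu_switcheroo_ops, px);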
2203 */ 2204 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 2205 enum vga_switcheroo_state state) 2206 { 2207 struct drm_device *dev = pci_get_drvdata(pdev); 2208 int r; 2209 2210 if (amdgpu_device_supports_px(drm_to_adev(dev)) && 2211 state == VGA_SWITCHEROO_OFF) 2212 return; 2213 2214 if (state == VGA_SWITCHEROO_ON) { 2215 pr_info("switched on\n"); 2216 /* don't suspend or resume card normally */ 2217 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2218 2219 pci_set_power_state(pdev, PCI_D0); 2220 amdgpu_device_load_pci_state(pdev); 2221 r = pci_enable_device(pdev); 2222 if (r) 2223 dev_warn(&pdev->dev, "pci_enable_device failed (%d)\n", 2224 r); 2225 amdgpu_device_resume(dev, true); 2226 2227 dev->switch_power_state = DRM_SWITCH_POWER_ON; 2228 } else { 2229 dev_info(&pdev->dev, "switched off\n"); 2230 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2231 amdgpu_device_prepare(dev); 2232 amdgpu_device_suspend(dev, true); 2233 amdgpu_device_cache_pci_state(pdev); 2234 /* Shut down the device */ 2235 pci_disable_device(pdev); 2236 pci_set_power_state(pdev, PCI_D3cold); 2237 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 2238 } 2239 } 2240 2241 /** 2242 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 2243 * 2244 * @pdev: pci dev pointer 2245 * 2246 * Callback for the switcheroo driver. Check of the switcheroo 2247 * state can be changed. 2248 * Returns true if the state can be changed, false if not. 2249 */ 2250 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 2251 { 2252 struct drm_device *dev = pci_get_drvdata(pdev); 2253 2254 /* 2255 * FIXME: open_count is protected by drm_global_mutex but that would lead to 2256 * locking inversion with the driver load path. And the access here is 2257 * completely racy anyway. So don't bother with locking for now. 2258 */ 2259 return atomic_read(&dev->open_count) == 0; 2260 } 2261 2262 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 2263 .set_gpu_state = amdgpu_switcheroo_set_state, 2264 .reprobe = NULL, 2265 .can_switch = amdgpu_switcheroo_can_switch, 2266 }; 2267 2268 /** 2269 * amdgpu_device_ip_set_clockgating_state - set the CG state 2270 * 2271 * @dev: amdgpu_device pointer 2272 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2273 * @state: clockgating state (gate or ungate) 2274 * 2275 * Sets the requested clockgating state for all instances of 2276 * the hardware IP specified. 2277 * Returns the error code from the last instance. 2278 */ 2279 int amdgpu_device_ip_set_clockgating_state(void *dev, 2280 enum amd_ip_block_type block_type, 2281 enum amd_clockgating_state state) 2282 { 2283 struct amdgpu_device *adev = dev; 2284 int i, r = 0; 2285 2286 for (i = 0; i < adev->num_ip_blocks; i++) { 2287 if (!adev->ip_blocks[i].status.valid) 2288 continue; 2289 if (adev->ip_blocks[i].version->type != block_type) 2290 continue; 2291 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 2292 continue; 2293 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 2294 &adev->ip_blocks[i], state); 2295 if (r) 2296 dev_err(adev->dev, 2297 "set_clockgating_state of IP block <%s> failed %d\n", 2298 adev->ip_blocks[i].version->funcs->name, r); 2299 } 2300 return r; 2301 } 2302 2303 /** 2304 * amdgpu_device_ip_set_powergating_state - set the PG state 2305 * 2306 * @dev: amdgpu_device pointer 2307 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 
2308 * @state: powergating state (gate or ungate) 2309 * 2310 * Sets the requested powergating state for all instances of 2311 * the hardware IP specified. 2312 * Returns the error code from the last instance. 2313 */ 2314 int amdgpu_device_ip_set_powergating_state(void *dev, 2315 enum amd_ip_block_type block_type, 2316 enum amd_powergating_state state) 2317 { 2318 struct amdgpu_device *adev = dev; 2319 int i, r = 0; 2320 2321 for (i = 0; i < adev->num_ip_blocks; i++) { 2322 if (!adev->ip_blocks[i].status.valid) 2323 continue; 2324 if (adev->ip_blocks[i].version->type != block_type) 2325 continue; 2326 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 2327 continue; 2328 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 2329 &adev->ip_blocks[i], state); 2330 if (r) 2331 dev_err(adev->dev, 2332 "set_powergating_state of IP block <%s> failed %d\n", 2333 adev->ip_blocks[i].version->funcs->name, r); 2334 } 2335 return r; 2336 } 2337 2338 /** 2339 * amdgpu_device_ip_get_clockgating_state - get the CG state 2340 * 2341 * @adev: amdgpu_device pointer 2342 * @flags: clockgating feature flags 2343 * 2344 * Walks the list of IPs on the device and updates the clockgating 2345 * flags for each IP. 2346 * Updates @flags with the feature flags for each hardware IP where 2347 * clockgating is enabled. 2348 */ 2349 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 2350 u64 *flags) 2351 { 2352 int i; 2353 2354 for (i = 0; i < adev->num_ip_blocks; i++) { 2355 if (!adev->ip_blocks[i].status.valid) 2356 continue; 2357 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 2358 adev->ip_blocks[i].version->funcs->get_clockgating_state( 2359 &adev->ip_blocks[i], flags); 2360 } 2361 } 2362 2363 /** 2364 * amdgpu_device_ip_wait_for_idle - wait for idle 2365 * 2366 * @adev: amdgpu_device pointer 2367 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2368 * 2369 * Waits for the request hardware IP to be idle. 2370 * Returns 0 for success or a negative error code on failure. 2371 */ 2372 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 2373 enum amd_ip_block_type block_type) 2374 { 2375 int i, r; 2376 2377 for (i = 0; i < adev->num_ip_blocks; i++) { 2378 if (!adev->ip_blocks[i].status.valid) 2379 continue; 2380 if (adev->ip_blocks[i].version->type == block_type) { 2381 if (adev->ip_blocks[i].version->funcs->wait_for_idle) { 2382 r = adev->ip_blocks[i].version->funcs->wait_for_idle( 2383 &adev->ip_blocks[i]); 2384 if (r) 2385 return r; 2386 } 2387 break; 2388 } 2389 } 2390 return 0; 2391 2392 } 2393 2394 /** 2395 * amdgpu_device_ip_is_hw - is the hardware IP enabled 2396 * 2397 * @adev: amdgpu_device pointer 2398 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2399 * 2400 * Check if the hardware IP is enable or not. 2401 * Returns true if it the IP is enable, false if not. 2402 */ 2403 bool amdgpu_device_ip_is_hw(struct amdgpu_device *adev, 2404 enum amd_ip_block_type block_type) 2405 { 2406 int i; 2407 2408 for (i = 0; i < adev->num_ip_blocks; i++) { 2409 if (adev->ip_blocks[i].version->type == block_type) 2410 return adev->ip_blocks[i].status.hw; 2411 } 2412 return false; 2413 } 2414 2415 /** 2416 * amdgpu_device_ip_is_valid - is the hardware IP valid 2417 * 2418 * @adev: amdgpu_device pointer 2419 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2420 * 2421 * Check if the hardware IP is valid or not. 2422 * Returns true if it the IP is valid, false if not. 
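 *
 * Illustrative guard (not a call site copied from this file): code that must
 * not touch a block which was never added, was harvested, or was disabled via
 * amdgpu_ip_block_mask can check validity first, e.g.
 *
 *   if (!amdgpu_device_ip_is_valid(adev, AMD_IP_BLOCK_TYPE_VCN))
 *           return 0;
 *
 * and skip any VCN-specific setup in that case.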
2423 */ 2424 bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev, 2425 enum amd_ip_block_type block_type) 2426 { 2427 int i; 2428 2429 for (i = 0; i < adev->num_ip_blocks; i++) { 2430 if (adev->ip_blocks[i].version->type == block_type) 2431 return adev->ip_blocks[i].status.valid; 2432 } 2433 return false; 2434 2435 } 2436 2437 /** 2438 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 2439 * 2440 * @adev: amdgpu_device pointer 2441 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 2442 * 2443 * Returns a pointer to the hardware IP block structure 2444 * if it exists for the asic, otherwise NULL. 2445 */ 2446 struct amdgpu_ip_block * 2447 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 2448 enum amd_ip_block_type type) 2449 { 2450 int i; 2451 2452 for (i = 0; i < adev->num_ip_blocks; i++) 2453 if (adev->ip_blocks[i].version->type == type) 2454 return &adev->ip_blocks[i]; 2455 2456 return NULL; 2457 } 2458 2459 /** 2460 * amdgpu_device_ip_block_version_cmp 2461 * 2462 * @adev: amdgpu_device pointer 2463 * @type: enum amd_ip_block_type 2464 * @major: major version 2465 * @minor: minor version 2466 * 2467 * return 0 if equal or greater 2468 * return 1 if smaller or the ip_block doesn't exist 2469 */ 2470 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 2471 enum amd_ip_block_type type, 2472 u32 major, u32 minor) 2473 { 2474 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 2475 2476 if (ip_block && ((ip_block->version->major > major) || 2477 ((ip_block->version->major == major) && 2478 (ip_block->version->minor >= minor)))) 2479 return 0; 2480 2481 return 1; 2482 } 2483 2484 static const char *ip_block_names[] = { 2485 [AMD_IP_BLOCK_TYPE_COMMON] = "common", 2486 [AMD_IP_BLOCK_TYPE_GMC] = "gmc", 2487 [AMD_IP_BLOCK_TYPE_IH] = "ih", 2488 [AMD_IP_BLOCK_TYPE_SMC] = "smu", 2489 [AMD_IP_BLOCK_TYPE_PSP] = "psp", 2490 [AMD_IP_BLOCK_TYPE_DCE] = "dce", 2491 [AMD_IP_BLOCK_TYPE_GFX] = "gfx", 2492 [AMD_IP_BLOCK_TYPE_SDMA] = "sdma", 2493 [AMD_IP_BLOCK_TYPE_UVD] = "uvd", 2494 [AMD_IP_BLOCK_TYPE_VCE] = "vce", 2495 [AMD_IP_BLOCK_TYPE_ACP] = "acp", 2496 [AMD_IP_BLOCK_TYPE_VCN] = "vcn", 2497 [AMD_IP_BLOCK_TYPE_MES] = "mes", 2498 [AMD_IP_BLOCK_TYPE_JPEG] = "jpeg", 2499 [AMD_IP_BLOCK_TYPE_VPE] = "vpe", 2500 [AMD_IP_BLOCK_TYPE_UMSCH_MM] = "umsch_mm", 2501 [AMD_IP_BLOCK_TYPE_ISP] = "isp", 2502 [AMD_IP_BLOCK_TYPE_RAS] = "ras", 2503 }; 2504 2505 static const char *ip_block_name(struct amdgpu_device *adev, enum amd_ip_block_type type) 2506 { 2507 int idx = (int)type; 2508 2509 return idx < ARRAY_SIZE(ip_block_names) ? ip_block_names[idx] : "unknown"; 2510 } 2511 2512 /** 2513 * amdgpu_device_ip_block_add 2514 * 2515 * @adev: amdgpu_device pointer 2516 * @ip_block_version: pointer to the IP to add 2517 * 2518 * Adds the IP block driver information to the collection of IPs 2519 * on the asic. 
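 *
 * Typical use, sketched (block name illustrative; each per-ASIC set_ip_blocks
 * helper registers its own version structures): an ASIC setup routine adds
 * its blocks one by one and checks every add, e.g.
 *
 *   r = amdgpu_device_ip_block_add(adev, &gfx_v9_0_ip_block);
 *   if (r)
 *           return r;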
2520 */ 2521 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 2522 const struct amdgpu_ip_block_version *ip_block_version) 2523 { 2524 if (!ip_block_version) 2525 return -EINVAL; 2526 2527 switch (ip_block_version->type) { 2528 case AMD_IP_BLOCK_TYPE_VCN: 2529 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2530 return 0; 2531 break; 2532 case AMD_IP_BLOCK_TYPE_JPEG: 2533 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2534 return 0; 2535 break; 2536 default: 2537 break; 2538 } 2539 2540 dev_info(adev->dev, "detected ip block number %d <%s_v%d_%d_%d> (%s)\n", 2541 adev->num_ip_blocks, 2542 ip_block_name(adev, ip_block_version->type), 2543 ip_block_version->major, 2544 ip_block_version->minor, 2545 ip_block_version->rev, 2546 ip_block_version->funcs->name); 2547 2548 adev->ip_blocks[adev->num_ip_blocks].adev = adev; 2549 2550 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2551 2552 return 0; 2553 } 2554 2555 /** 2556 * amdgpu_device_enable_virtual_display - enable virtual display feature 2557 * 2558 * @adev: amdgpu_device pointer 2559 * 2560 * Enabled the virtual display feature if the user has enabled it via 2561 * the module parameter virtual_display. This feature provides a virtual 2562 * display hardware on headless boards or in virtualized environments. 2563 * This function parses and validates the configuration string specified by 2564 * the user and configures the virtual display configuration (number of 2565 * virtual connectors, crtcs, etc.) specified. 2566 */ 2567 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2568 { 2569 adev->enable_virtual_display = false; 2570 2571 if (amdgpu_virtual_display) { 2572 const char *pci_address_name = pci_name(adev->pdev); 2573 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2574 2575 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2576 pciaddstr_tmp = pciaddstr; 2577 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2578 pciaddname = strsep(&pciaddname_tmp, ","); 2579 if (!strcmp("all", pciaddname) 2580 || !strcmp(pci_address_name, pciaddname)) { 2581 long num_crtc; 2582 int res = -1; 2583 2584 adev->enable_virtual_display = true; 2585 2586 if (pciaddname_tmp) 2587 res = kstrtol(pciaddname_tmp, 10, 2588 &num_crtc); 2589 2590 if (!res) { 2591 if (num_crtc < 1) 2592 num_crtc = 1; 2593 if (num_crtc > 6) 2594 num_crtc = 6; 2595 adev->mode_info.num_crtc = num_crtc; 2596 } else { 2597 adev->mode_info.num_crtc = 1; 2598 } 2599 break; 2600 } 2601 } 2602 2603 dev_info( 2604 adev->dev, 2605 "virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2606 amdgpu_virtual_display, pci_address_name, 2607 adev->enable_virtual_display, adev->mode_info.num_crtc); 2608 2609 kfree(pciaddstr); 2610 } 2611 } 2612 2613 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2614 { 2615 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2616 adev->mode_info.num_crtc = 1; 2617 adev->enable_virtual_display = true; 2618 dev_info(adev->dev, "virtual_display:%d, num_crtc:%d\n", 2619 adev->enable_virtual_display, 2620 adev->mode_info.num_crtc); 2621 } 2622 } 2623 2624 /** 2625 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2626 * 2627 * @adev: amdgpu_device pointer 2628 * 2629 * Parses the asic configuration parameters specified in the gpu info 2630 * firmware and makes them available to the driver for use in configuring 2631 * the asic. 2632 * Returns 0 on success, -EINVAL on failure. 
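 *
 * For example (derived from the naming scheme used below), a Vega10 board
 * ends up requesting the optional firmware file
 *
 *   amdgpu/vega10_gpu_info.bin
 *
 * while ASICs that provide an IP discovery table (e.g. Navi12 with
 * adev->discovery.bin set) skip the gpu_info firmware entirely.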
2633 */ 2634 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2635 { 2636 const char *chip_name; 2637 int err; 2638 const struct gpu_info_firmware_header_v1_0 *hdr; 2639 2640 adev->firmware.gpu_info_fw = NULL; 2641 2642 switch (adev->asic_type) { 2643 default: 2644 return 0; 2645 case CHIP_VEGA10: 2646 chip_name = "vega10"; 2647 break; 2648 case CHIP_VEGA12: 2649 chip_name = "vega12"; 2650 break; 2651 case CHIP_RAVEN: 2652 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2653 chip_name = "raven2"; 2654 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2655 chip_name = "picasso"; 2656 else 2657 chip_name = "raven"; 2658 break; 2659 case CHIP_ARCTURUS: 2660 chip_name = "arcturus"; 2661 break; 2662 case CHIP_NAVI12: 2663 if (adev->discovery.bin) 2664 return 0; 2665 chip_name = "navi12"; 2666 break; 2667 case CHIP_CYAN_SKILLFISH: 2668 if (adev->discovery.bin) 2669 return 0; 2670 chip_name = "cyan_skillfish"; 2671 break; 2672 } 2673 2674 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, 2675 AMDGPU_UCODE_OPTIONAL, 2676 "amdgpu/%s_gpu_info.bin", chip_name); 2677 if (err) { 2678 dev_err(adev->dev, 2679 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n", 2680 chip_name); 2681 goto out; 2682 } 2683 2684 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2685 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2686 2687 switch (hdr->version_major) { 2688 case 1: 2689 { 2690 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2691 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2692 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2693 2694 /* 2695 * Should be dropped when DAL no longer needs it. 2696 */ 2697 if (adev->asic_type == CHIP_NAVI12) 2698 goto parse_soc_bounding_box; 2699 2700 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2701 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2702 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2703 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2704 adev->gfx.config.max_texture_channel_caches = 2705 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2706 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2707 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2708 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2709 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2710 adev->gfx.config.double_offchip_lds_buf = 2711 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2712 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2713 adev->gfx.cu_info.max_waves_per_simd = 2714 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2715 adev->gfx.cu_info.max_scratch_slots_per_cu = 2716 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2717 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2718 if (hdr->version_minor >= 1) { 2719 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2720 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2721 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2722 adev->gfx.config.num_sc_per_sh = 2723 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2724 adev->gfx.config.num_packer_per_sc = 2725 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2726 } 2727 2728 parse_soc_bounding_box: 2729 /* 2730 * soc bounding box info is not integrated in disocovery table, 2731 * we 
always need to parse it from gpu info firmware if needed. 2732 */ 2733 if (hdr->version_minor == 2) { 2734 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2735 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2736 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2737 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2738 } 2739 break; 2740 } 2741 default: 2742 dev_err(adev->dev, 2743 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2744 err = -EINVAL; 2745 goto out; 2746 } 2747 out: 2748 return err; 2749 } 2750 2751 static void amdgpu_uid_init(struct amdgpu_device *adev) 2752 { 2753 /* Initialize the UID for the device */ 2754 adev->uid_info = kzalloc(sizeof(struct amdgpu_uid), GFP_KERNEL); 2755 if (!adev->uid_info) { 2756 dev_warn(adev->dev, "Failed to allocate memory for UID\n"); 2757 return; 2758 } 2759 adev->uid_info->adev = adev; 2760 } 2761 2762 static void amdgpu_uid_fini(struct amdgpu_device *adev) 2763 { 2764 /* Free the UID memory */ 2765 kfree(adev->uid_info); 2766 adev->uid_info = NULL; 2767 } 2768 2769 /** 2770 * amdgpu_device_ip_early_init - run early init for hardware IPs 2771 * 2772 * @adev: amdgpu_device pointer 2773 * 2774 * Early initialization pass for hardware IPs. The hardware IPs that make 2775 * up each asic are discovered each IP's early_init callback is run. This 2776 * is the first stage in initializing the asic. 2777 * Returns 0 on success, negative error code on failure. 2778 */ 2779 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2780 { 2781 struct amdgpu_ip_block *ip_block; 2782 struct pci_dev *parent; 2783 bool total, skip_bios; 2784 uint32_t bios_flags; 2785 int i, r; 2786 2787 amdgpu_device_enable_virtual_display(adev); 2788 2789 if (amdgpu_sriov_vf(adev)) { 2790 r = amdgpu_virt_request_full_gpu(adev, true); 2791 if (r) 2792 return r; 2793 2794 r = amdgpu_virt_init_critical_region(adev); 2795 if (r) 2796 return r; 2797 } 2798 2799 switch (adev->asic_type) { 2800 #ifdef CONFIG_DRM_AMDGPU_SI 2801 case CHIP_VERDE: 2802 case CHIP_TAHITI: 2803 case CHIP_PITCAIRN: 2804 case CHIP_OLAND: 2805 case CHIP_HAINAN: 2806 adev->family = AMDGPU_FAMILY_SI; 2807 r = si_set_ip_blocks(adev); 2808 if (r) 2809 return r; 2810 break; 2811 #endif 2812 #ifdef CONFIG_DRM_AMDGPU_CIK 2813 case CHIP_BONAIRE: 2814 case CHIP_HAWAII: 2815 case CHIP_KAVERI: 2816 case CHIP_KABINI: 2817 case CHIP_MULLINS: 2818 if (adev->flags & AMD_IS_APU) 2819 adev->family = AMDGPU_FAMILY_KV; 2820 else 2821 adev->family = AMDGPU_FAMILY_CI; 2822 2823 r = cik_set_ip_blocks(adev); 2824 if (r) 2825 return r; 2826 break; 2827 #endif 2828 case CHIP_TOPAZ: 2829 case CHIP_TONGA: 2830 case CHIP_FIJI: 2831 case CHIP_POLARIS10: 2832 case CHIP_POLARIS11: 2833 case CHIP_POLARIS12: 2834 case CHIP_VEGAM: 2835 case CHIP_CARRIZO: 2836 case CHIP_STONEY: 2837 if (adev->flags & AMD_IS_APU) 2838 adev->family = AMDGPU_FAMILY_CZ; 2839 else 2840 adev->family = AMDGPU_FAMILY_VI; 2841 2842 r = vi_set_ip_blocks(adev); 2843 if (r) 2844 return r; 2845 break; 2846 default: 2847 r = amdgpu_discovery_set_ip_blocks(adev); 2848 if (r) 2849 return r; 2850 break; 2851 } 2852 2853 /* Check for IP version 9.4.3 with A0 hardware */ 2854 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) && 2855 !amdgpu_device_get_rev_id(adev)) { 2856 dev_err(adev->dev, "Unsupported A0 hardware\n"); 2857 return -ENODEV; /* device unsupported - no device error */ 2858 } 2859 2860 if (amdgpu_has_atpx() && 2861 (amdgpu_is_atpx_hybrid() || 2862 amdgpu_has_atpx_dgpu_power_cntl()) && 
2863 ((adev->flags & AMD_IS_APU) == 0) && 2864 !dev_is_removable(&adev->pdev->dev)) 2865 adev->flags |= AMD_IS_PX; 2866 2867 if (!(adev->flags & AMD_IS_APU)) { 2868 parent = pcie_find_root_port(adev->pdev); 2869 adev->has_pr3 = parent ? pci_pr3_present(parent) : false; 2870 } 2871 2872 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2873 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2874 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2875 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2876 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2877 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2878 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2879 2880 adev->virt.is_xgmi_node_migrate_enabled = false; 2881 if (amdgpu_sriov_vf(adev)) { 2882 adev->virt.is_xgmi_node_migrate_enabled = 2883 amdgpu_ip_version((adev), GC_HWIP, 0) == IP_VERSION(9, 4, 4); 2884 } 2885 2886 total = true; 2887 for (i = 0; i < adev->num_ip_blocks; i++) { 2888 ip_block = &adev->ip_blocks[i]; 2889 2890 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2891 dev_warn(adev->dev, "disabled ip block: %d <%s>\n", i, 2892 adev->ip_blocks[i].version->funcs->name); 2893 adev->ip_blocks[i].status.valid = false; 2894 } else if (ip_block->version->funcs->early_init) { 2895 r = ip_block->version->funcs->early_init(ip_block); 2896 if (r == -ENOENT) { 2897 adev->ip_blocks[i].status.valid = false; 2898 } else if (r) { 2899 dev_err(adev->dev, 2900 "early_init of IP block <%s> failed %d\n", 2901 adev->ip_blocks[i].version->funcs->name, 2902 r); 2903 total = false; 2904 } else { 2905 adev->ip_blocks[i].status.valid = true; 2906 } 2907 } else { 2908 adev->ip_blocks[i].status.valid = true; 2909 } 2910 /* get the vbios after the asic_funcs are set up */ 2911 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2912 r = amdgpu_device_parse_gpu_info_fw(adev); 2913 if (r) 2914 return r; 2915 2916 bios_flags = amdgpu_device_get_vbios_flags(adev); 2917 skip_bios = !!(bios_flags & AMDGPU_VBIOS_SKIP); 2918 /* Read BIOS */ 2919 if (!skip_bios) { 2920 bool optional = 2921 !!(bios_flags & AMDGPU_VBIOS_OPTIONAL); 2922 if (!amdgpu_get_bios(adev) && !optional) 2923 return -EINVAL; 2924 2925 if (optional && !adev->bios) 2926 dev_info( 2927 adev->dev, 2928 "VBIOS image optional, proceeding without VBIOS image"); 2929 2930 if (adev->bios) { 2931 r = amdgpu_atombios_init(adev); 2932 if (r) { 2933 dev_err(adev->dev, 2934 "amdgpu_atombios_init failed\n"); 2935 amdgpu_vf_error_put( 2936 adev, 2937 AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 2938 0, 0); 2939 return r; 2940 } 2941 } 2942 } 2943 2944 /*get pf2vf msg info at it's earliest time*/ 2945 if (amdgpu_sriov_vf(adev)) 2946 amdgpu_virt_init_data_exchange(adev); 2947 2948 } 2949 } 2950 if (!total) 2951 return -ENODEV; 2952 2953 if (adev->gmc.xgmi.supported) 2954 amdgpu_xgmi_early_init(adev); 2955 2956 if (amdgpu_is_multi_aid(adev)) 2957 amdgpu_uid_init(adev); 2958 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 2959 if (ip_block->status.valid != false) 2960 amdgpu_amdkfd_device_probe(adev); 2961 2962 adev->cg_flags &= amdgpu_cg_mask; 2963 adev->pg_flags &= amdgpu_pg_mask; 2964 2965 return 0; 2966 } 2967 2968 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2969 { 2970 int i, r; 2971 2972 for (i = 0; i < adev->num_ip_blocks; i++) { 2973 if (!adev->ip_blocks[i].status.sw) 2974 continue; 2975 if (adev->ip_blocks[i].status.hw) 2976 continue; 2977 if (!amdgpu_ip_member_of_hwini( 2978 adev, adev->ip_blocks[i].version->type)) 2979 
continue; 2980 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2981 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2982 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2983 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2984 if (r) { 2985 dev_err(adev->dev, 2986 "hw_init of IP block <%s> failed %d\n", 2987 adev->ip_blocks[i].version->funcs->name, 2988 r); 2989 return r; 2990 } 2991 adev->ip_blocks[i].status.hw = true; 2992 } 2993 } 2994 2995 return 0; 2996 } 2997 2998 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2999 { 3000 int i, r; 3001 3002 for (i = 0; i < adev->num_ip_blocks; i++) { 3003 if (!adev->ip_blocks[i].status.sw) 3004 continue; 3005 if (adev->ip_blocks[i].status.hw) 3006 continue; 3007 if (!amdgpu_ip_member_of_hwini( 3008 adev, adev->ip_blocks[i].version->type)) 3009 continue; 3010 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3011 if (r) { 3012 dev_err(adev->dev, 3013 "hw_init of IP block <%s> failed %d\n", 3014 adev->ip_blocks[i].version->funcs->name, r); 3015 return r; 3016 } 3017 adev->ip_blocks[i].status.hw = true; 3018 } 3019 3020 return 0; 3021 } 3022 3023 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 3024 { 3025 int r = 0; 3026 int i; 3027 uint32_t smu_version; 3028 3029 if (adev->asic_type >= CHIP_VEGA10) { 3030 for (i = 0; i < adev->num_ip_blocks; i++) { 3031 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 3032 continue; 3033 3034 if (!amdgpu_ip_member_of_hwini(adev, 3035 AMD_IP_BLOCK_TYPE_PSP)) 3036 break; 3037 3038 if (!adev->ip_blocks[i].status.sw) 3039 continue; 3040 3041 /* no need to do the fw loading again if already done*/ 3042 if (adev->ip_blocks[i].status.hw == true) 3043 break; 3044 3045 if (amdgpu_in_reset(adev) || adev->in_suspend) { 3046 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3047 if (r) 3048 return r; 3049 } else { 3050 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3051 if (r) { 3052 dev_err(adev->dev, 3053 "hw_init of IP block <%s> failed %d\n", 3054 adev->ip_blocks[i] 3055 .version->funcs->name, 3056 r); 3057 return r; 3058 } 3059 adev->ip_blocks[i].status.hw = true; 3060 } 3061 break; 3062 } 3063 } 3064 3065 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 3066 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 3067 3068 return r; 3069 } 3070 3071 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 3072 { 3073 struct drm_sched_init_args args = { 3074 .ops = &amdgpu_sched_ops, 3075 .num_rqs = DRM_SCHED_PRIORITY_COUNT, 3076 .timeout_wq = adev->reset_domain->wq, 3077 .dev = adev->dev, 3078 }; 3079 long timeout; 3080 int r, i; 3081 3082 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 3083 struct amdgpu_ring *ring = adev->rings[i]; 3084 3085 /* No need to setup the GPU scheduler for rings that don't need it */ 3086 if (!ring || ring->no_scheduler) 3087 continue; 3088 3089 switch (ring->funcs->type) { 3090 case AMDGPU_RING_TYPE_GFX: 3091 timeout = adev->gfx_timeout; 3092 break; 3093 case AMDGPU_RING_TYPE_COMPUTE: 3094 timeout = adev->compute_timeout; 3095 break; 3096 case AMDGPU_RING_TYPE_SDMA: 3097 timeout = adev->sdma_timeout; 3098 break; 3099 default: 3100 timeout = adev->video_timeout; 3101 break; 3102 } 3103 3104 args.timeout = timeout; 3105 args.credit_limit = ring->num_hw_submission; 3106 args.score = ring->sched_score; 3107 args.name = ring->name; 3108 3109 r = drm_sched_init(&ring->sched, &args); 3110 if (r) { 3111 
dev_err(adev->dev, 3112 "Failed to create scheduler on ring %s.\n", 3113 ring->name); 3114 return r; 3115 } 3116 r = amdgpu_uvd_entity_init(adev, ring); 3117 if (r) { 3118 dev_err(adev->dev, 3119 "Failed to create UVD scheduling entity on ring %s.\n", 3120 ring->name); 3121 return r; 3122 } 3123 r = amdgpu_vce_entity_init(adev, ring); 3124 if (r) { 3125 dev_err(adev->dev, 3126 "Failed to create VCE scheduling entity on ring %s.\n", 3127 ring->name); 3128 return r; 3129 } 3130 } 3131 3132 if (adev->xcp_mgr) 3133 amdgpu_xcp_update_partition_sched_list(adev); 3134 3135 return 0; 3136 } 3137 3138 3139 /** 3140 * amdgpu_device_ip_init - run init for hardware IPs 3141 * 3142 * @adev: amdgpu_device pointer 3143 * 3144 * Main initialization pass for hardware IPs. The list of all the hardware 3145 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 3146 * are run. sw_init initializes the software state associated with each IP 3147 * and hw_init initializes the hardware associated with each IP. 3148 * Returns 0 on success, negative error code on failure. 3149 */ 3150 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 3151 { 3152 bool init_badpage; 3153 int i, r; 3154 3155 r = amdgpu_ras_init(adev); 3156 if (r) 3157 return r; 3158 3159 for (i = 0; i < adev->num_ip_blocks; i++) { 3160 if (!adev->ip_blocks[i].status.valid) 3161 continue; 3162 if (adev->ip_blocks[i].version->funcs->sw_init) { 3163 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]); 3164 if (r) { 3165 dev_err(adev->dev, 3166 "sw_init of IP block <%s> failed %d\n", 3167 adev->ip_blocks[i].version->funcs->name, 3168 r); 3169 goto init_failed; 3170 } 3171 } 3172 adev->ip_blocks[i].status.sw = true; 3173 3174 if (!amdgpu_ip_member_of_hwini( 3175 adev, adev->ip_blocks[i].version->type)) 3176 continue; 3177 3178 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 3179 /* need to do common hw init early so everything is set up for gmc */ 3180 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3181 if (r) { 3182 dev_err(adev->dev, "hw_init %d failed %d\n", i, 3183 r); 3184 goto init_failed; 3185 } 3186 adev->ip_blocks[i].status.hw = true; 3187 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3188 /* need to do gmc hw init early so we can allocate gpu mem */ 3189 /* Try to reserve bad pages early */ 3190 if (amdgpu_sriov_vf(adev)) 3191 amdgpu_virt_exchange_data(adev); 3192 3193 r = amdgpu_device_mem_scratch_init(adev); 3194 if (r) { 3195 dev_err(adev->dev, 3196 "amdgpu_mem_scratch_init failed %d\n", 3197 r); 3198 goto init_failed; 3199 } 3200 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3201 if (r) { 3202 dev_err(adev->dev, "hw_init %d failed %d\n", i, 3203 r); 3204 goto init_failed; 3205 } 3206 r = amdgpu_device_wb_init(adev); 3207 if (r) { 3208 dev_err(adev->dev, 3209 "amdgpu_device_wb_init failed %d\n", r); 3210 goto init_failed; 3211 } 3212 adev->ip_blocks[i].status.hw = true; 3213 3214 /* right after GMC hw init, we create CSA */ 3215 if (adev->gfx.mcbp) { 3216 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 3217 AMDGPU_GEM_DOMAIN_VRAM | 3218 AMDGPU_GEM_DOMAIN_GTT, 3219 AMDGPU_CSA_SIZE); 3220 if (r) { 3221 dev_err(adev->dev, 3222 "allocate CSA failed %d\n", r); 3223 goto init_failed; 3224 } 3225 } 3226 3227 r = amdgpu_seq64_init(adev); 3228 if (r) { 3229 dev_err(adev->dev, "allocate seq64 failed %d\n", 3230 r); 3231 goto init_failed; 3232 } 3233 } 3234 } 3235 3236 if (amdgpu_sriov_vf(adev)) 3237 
amdgpu_virt_init_data_exchange(adev); 3238 3239 r = amdgpu_ib_pool_init(adev); 3240 if (r) { 3241 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 3242 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 3243 goto init_failed; 3244 } 3245 3246 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 3247 if (r) 3248 goto init_failed; 3249 3250 r = amdgpu_device_ip_hw_init_phase1(adev); 3251 if (r) 3252 goto init_failed; 3253 3254 r = amdgpu_device_fw_loading(adev); 3255 if (r) 3256 goto init_failed; 3257 3258 r = amdgpu_device_ip_hw_init_phase2(adev); 3259 if (r) 3260 goto init_failed; 3261 3262 /* 3263 * retired pages will be loaded from eeprom and reserved here, 3264 * it should be called after amdgpu_device_ip_hw_init_phase2 since 3265 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 3266 * for I2C communication which only true at this point. 3267 * 3268 * amdgpu_ras_recovery_init may fail, but the upper only cares the 3269 * failure from bad gpu situation and stop amdgpu init process 3270 * accordingly. For other failed cases, it will still release all 3271 * the resource and print error message, rather than returning one 3272 * negative value to upper level. 3273 * 3274 * Note: theoretically, this should be called before all vram allocations 3275 * to protect retired page from abusing 3276 */ 3277 init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 3278 r = amdgpu_ras_recovery_init(adev, init_badpage); 3279 if (r) 3280 goto init_failed; 3281 3282 /** 3283 * In case of XGMI grab extra reference for reset domain for this device 3284 */ 3285 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3286 if (amdgpu_xgmi_add_device(adev) == 0) { 3287 if (!amdgpu_sriov_vf(adev)) { 3288 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3289 3290 if (WARN_ON(!hive)) { 3291 r = -ENOENT; 3292 goto init_failed; 3293 } 3294 3295 if (!hive->reset_domain || 3296 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 3297 r = -ENOENT; 3298 amdgpu_put_xgmi_hive(hive); 3299 goto init_failed; 3300 } 3301 3302 /* Drop the early temporary reset domain we created for device */ 3303 amdgpu_reset_put_reset_domain(adev->reset_domain); 3304 adev->reset_domain = hive->reset_domain; 3305 amdgpu_put_xgmi_hive(hive); 3306 } 3307 } 3308 } 3309 3310 r = amdgpu_device_init_schedulers(adev); 3311 if (r) 3312 goto init_failed; 3313 3314 if (adev->mman.buffer_funcs_ring->sched.ready) 3315 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3316 3317 /* Don't init kfd if whole hive need to be reset during init */ 3318 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 3319 kgd2kfd_init_zone_device(adev); 3320 amdgpu_amdkfd_device_init(adev); 3321 } 3322 3323 amdgpu_fru_get_product_info(adev); 3324 3325 if (!amdgpu_sriov_vf(adev) || amdgpu_sriov_ras_cper_en(adev)) 3326 r = amdgpu_cper_init(adev); 3327 3328 init_failed: 3329 3330 return r; 3331 } 3332 3333 /** 3334 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 3335 * 3336 * @adev: amdgpu_device pointer 3337 * 3338 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 3339 * this function before a GPU reset. If the value is retained after a 3340 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 
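 *
 * Minimal usage sketch (illustrative only; the real reset paths do more): the
 * magic is written ahead of time and compared after a reset with
 * amdgpu_device_check_vram_lost() to decide whether VRAM contents must be
 * restored:
 *
 *   amdgpu_device_fill_reset_magic(adev);
 *   ...GPU reset happens later...
 *   if (amdgpu_device_check_vram_lost(adev))
 *           ...re-upload firmware and buffer contents...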
3341 */ 3342 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 3343 { 3344 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 3345 } 3346 3347 /** 3348 * amdgpu_device_check_vram_lost - check if vram is valid 3349 * 3350 * @adev: amdgpu_device pointer 3351 * 3352 * Checks the reset magic value written to the gart pointer in VRAM. 3353 * The driver calls this after a GPU reset to see if the contents of 3354 * VRAM is lost or now. 3355 * returns true if vram is lost, false if not. 3356 */ 3357 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 3358 { 3359 if (memcmp(adev->gart.ptr, adev->reset_magic, 3360 AMDGPU_RESET_MAGIC_NUM)) 3361 return true; 3362 3363 if (!amdgpu_in_reset(adev)) 3364 return false; 3365 3366 /* 3367 * For all ASICs with baco/mode1 reset, the VRAM is 3368 * always assumed to be lost. 3369 */ 3370 switch (amdgpu_asic_reset_method(adev)) { 3371 case AMD_RESET_METHOD_LEGACY: 3372 case AMD_RESET_METHOD_LINK: 3373 case AMD_RESET_METHOD_BACO: 3374 case AMD_RESET_METHOD_MODE1: 3375 return true; 3376 default: 3377 return false; 3378 } 3379 } 3380 3381 /** 3382 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 3383 * 3384 * @adev: amdgpu_device pointer 3385 * @state: clockgating state (gate or ungate) 3386 * 3387 * The list of all the hardware IPs that make up the asic is walked and the 3388 * set_clockgating_state callbacks are run. 3389 * Late initialization pass enabling clockgating for hardware IPs. 3390 * Fini or suspend, pass disabling clockgating for hardware IPs. 3391 * Returns 0 on success, negative error code on failure. 3392 */ 3393 3394 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 3395 enum amd_clockgating_state state) 3396 { 3397 int i, j, r; 3398 3399 if (amdgpu_emu_mode == 1) 3400 return 0; 3401 3402 for (j = 0; j < adev->num_ip_blocks; j++) { 3403 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3404 if (!adev->ip_blocks[i].status.late_initialized) 3405 continue; 3406 /* skip CG for GFX, SDMA on S0ix */ 3407 if (adev->in_s0ix && 3408 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3409 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3410 continue; 3411 /* skip CG for VCE/UVD, it's handled specially */ 3412 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3413 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3414 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3415 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3416 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 3417 /* enable clockgating to save power */ 3418 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i], 3419 state); 3420 if (r) { 3421 dev_err(adev->dev, 3422 "set_clockgating_state(gate) of IP block <%s> failed %d\n", 3423 adev->ip_blocks[i].version->funcs->name, 3424 r); 3425 return r; 3426 } 3427 } 3428 } 3429 3430 return 0; 3431 } 3432 3433 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 3434 enum amd_powergating_state state) 3435 { 3436 int i, j, r; 3437 3438 if (amdgpu_emu_mode == 1) 3439 return 0; 3440 3441 for (j = 0; j < adev->num_ip_blocks; j++) { 3442 i = state == AMD_PG_STATE_GATE ? 
j : adev->num_ip_blocks - j - 1; 3443 if (!adev->ip_blocks[i].status.late_initialized) 3444 continue; 3445 /* skip PG for GFX, SDMA on S0ix */ 3446 if (adev->in_s0ix && 3447 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3448 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3449 continue; 3450 /* skip CG for VCE/UVD/VPE, it's handled specially */ 3451 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3452 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3453 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3454 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VPE && 3455 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3456 adev->ip_blocks[i].version->funcs->set_powergating_state) { 3457 /* enable powergating to save power */ 3458 r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i], 3459 state); 3460 if (r) { 3461 dev_err(adev->dev, 3462 "set_powergating_state(gate) of IP block <%s> failed %d\n", 3463 adev->ip_blocks[i].version->funcs->name, 3464 r); 3465 return r; 3466 } 3467 } 3468 } 3469 return 0; 3470 } 3471 3472 static int amdgpu_device_enable_mgpu_fan_boost(void) 3473 { 3474 struct amdgpu_gpu_instance *gpu_ins; 3475 struct amdgpu_device *adev; 3476 int i, ret = 0; 3477 3478 mutex_lock(&mgpu_info.mutex); 3479 3480 /* 3481 * MGPU fan boost feature should be enabled 3482 * only when there are two or more dGPUs in 3483 * the system 3484 */ 3485 if (mgpu_info.num_dgpu < 2) 3486 goto out; 3487 3488 for (i = 0; i < mgpu_info.num_dgpu; i++) { 3489 gpu_ins = &(mgpu_info.gpu_ins[i]); 3490 adev = gpu_ins->adev; 3491 if (!(adev->flags & AMD_IS_APU || amdgpu_sriov_multi_vf_mode(adev)) && 3492 !gpu_ins->mgpu_fan_enabled) { 3493 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 3494 if (ret) 3495 break; 3496 3497 gpu_ins->mgpu_fan_enabled = 1; 3498 } 3499 } 3500 3501 out: 3502 mutex_unlock(&mgpu_info.mutex); 3503 3504 return ret; 3505 } 3506 3507 /** 3508 * amdgpu_device_ip_late_init - run late init for hardware IPs 3509 * 3510 * @adev: amdgpu_device pointer 3511 * 3512 * Late initialization pass for hardware IPs. The list of all the hardware 3513 * IPs that make up the asic is walked and the late_init callbacks are run. 3514 * late_init covers any special initialization that an IP requires 3515 * after all of the have been initialized or something that needs to happen 3516 * late in the init process. 3517 * Returns 0 on success, negative error code on failure. 
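 *
 * Roughly, the sequence implemented below is (summary only, see the code for
 * the exact conditions):
 *   1. run each hw-initialized IP block's late_init callback
 *   2. RAS late init and enabling of error queries
 *   3. clockgating and powergating enablement (CG/PG gate)
 *   4. record the VRAM reset magic
 *   5. multi-GPU housekeeping (fan boost, XGMI p-state)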
3518 */ 3519 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 3520 { 3521 struct amdgpu_gpu_instance *gpu_instance; 3522 int i = 0, r; 3523 3524 for (i = 0; i < adev->num_ip_blocks; i++) { 3525 if (!adev->ip_blocks[i].status.hw) 3526 continue; 3527 if (adev->ip_blocks[i].version->funcs->late_init) { 3528 r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]); 3529 if (r) { 3530 dev_err(adev->dev, 3531 "late_init of IP block <%s> failed %d\n", 3532 adev->ip_blocks[i].version->funcs->name, 3533 r); 3534 return r; 3535 } 3536 } 3537 adev->ip_blocks[i].status.late_initialized = true; 3538 } 3539 3540 r = amdgpu_ras_late_init(adev); 3541 if (r) { 3542 dev_err(adev->dev, "amdgpu_ras_late_init failed %d", r); 3543 return r; 3544 } 3545 3546 if (!amdgpu_reset_in_recovery(adev)) 3547 amdgpu_ras_set_error_query_ready(adev, true); 3548 3549 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3550 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3551 3552 amdgpu_device_fill_reset_magic(adev); 3553 3554 r = amdgpu_device_enable_mgpu_fan_boost(); 3555 if (r) 3556 dev_err(adev->dev, "enable mgpu fan boost failed (%d).\n", r); 3557 3558 /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */ 3559 if (amdgpu_passthrough(adev) && 3560 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 3561 adev->asic_type == CHIP_ALDEBARAN)) 3562 amdgpu_dpm_handle_passthrough_sbr(adev, true); 3563 3564 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3565 mutex_lock(&mgpu_info.mutex); 3566 3567 /* 3568 * Reset device p-state to low as this was booted with high. 3569 * 3570 * This should be performed only after all devices from the same 3571 * hive get initialized. 3572 * 3573 * However, it's unknown how many device in the hive in advance. 3574 * As this is counted one by one during devices initializations. 3575 * 3576 * So, we wait for all XGMI interlinked devices initialized. 3577 * This may bring some delays as those devices may come from 3578 * different hives. But that should be OK. 
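 *
 * Concrete illustration (hypothetical topology): in a system whose only dGPUs
 * form one 4-device hive, mgpu_info.num_dgpu reaches
 * adev->gmc.xgmi.num_physical_nodes (4) only when the last device runs its
 * late init, so the p-state drop below is effectively performed once, by
 * whichever device finishes initialization last.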
3579 */ 3580 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3581 for (i = 0; i < mgpu_info.num_gpu; i++) { 3582 gpu_instance = &(mgpu_info.gpu_ins[i]); 3583 if (gpu_instance->adev->flags & AMD_IS_APU) 3584 continue; 3585 3586 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3587 AMDGPU_XGMI_PSTATE_MIN); 3588 if (r) { 3589 dev_err(adev->dev, 3590 "pstate setting failed (%d).\n", 3591 r); 3592 break; 3593 } 3594 } 3595 } 3596 3597 mutex_unlock(&mgpu_info.mutex); 3598 } 3599 3600 return 0; 3601 } 3602 3603 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block) 3604 { 3605 struct amdgpu_device *adev = ip_block->adev; 3606 int r; 3607 3608 if (!ip_block->version->funcs->hw_fini) { 3609 dev_err(adev->dev, "hw_fini of IP block <%s> not defined\n", 3610 ip_block->version->funcs->name); 3611 } else { 3612 r = ip_block->version->funcs->hw_fini(ip_block); 3613 /* XXX handle errors */ 3614 if (r) { 3615 dev_dbg(adev->dev, 3616 "hw_fini of IP block <%s> failed %d\n", 3617 ip_block->version->funcs->name, r); 3618 } 3619 } 3620 3621 ip_block->status.hw = false; 3622 } 3623 3624 /** 3625 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3626 * 3627 * @adev: amdgpu_device pointer 3628 * 3629 * For ASICs need to disable SMC first 3630 */ 3631 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3632 { 3633 int i; 3634 3635 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3636 return; 3637 3638 for (i = 0; i < adev->num_ip_blocks; i++) { 3639 if (!adev->ip_blocks[i].status.hw) 3640 continue; 3641 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3642 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3643 break; 3644 } 3645 } 3646 } 3647 3648 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3649 { 3650 int i, r; 3651 3652 for (i = 0; i < adev->num_ip_blocks; i++) { 3653 if (!adev->ip_blocks[i].version->funcs->early_fini) 3654 continue; 3655 3656 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]); 3657 if (r) { 3658 dev_dbg(adev->dev, 3659 "early_fini of IP block <%s> failed %d\n", 3660 adev->ip_blocks[i].version->funcs->name, r); 3661 } 3662 } 3663 3664 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3665 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3666 3667 amdgpu_amdkfd_suspend(adev, true); 3668 amdgpu_userq_suspend(adev); 3669 3670 /* Workaround for ASICs need to disable SMC first */ 3671 amdgpu_device_smu_fini_early(adev); 3672 3673 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3674 if (!adev->ip_blocks[i].status.hw) 3675 continue; 3676 3677 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3678 } 3679 3680 if (amdgpu_sriov_vf(adev)) { 3681 if (amdgpu_virt_release_full_gpu(adev, false)) 3682 dev_err(adev->dev, 3683 "failed to release exclusive mode on fini\n"); 3684 } 3685 3686 /* 3687 * Driver reload on the APU can fail due to firmware validation because 3688 * the PSP is always running, as it is shared across the whole SoC. 3689 * This same issue does not occur on dGPU because it has a mechanism 3690 * that checks whether the PSP is running. A solution for those issues 3691 * in the APU is to trigger a GPU reset, but this should be done during 3692 * the unload phase to avoid adding boot latency and screen flicker. 
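 *
 * Concretely (illustrative scenario): an rmmod/modprobe cycle of amdgpu on
 * such an APU could otherwise fail PSP firmware validation on the second
 * load, because the PSP kept running across the reload; the reset below
 * returns it to a clean state before the next probe.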
3693 */ 3694 if ((adev->flags & AMD_IS_APU) && !adev->gmc.is_app_apu) { 3695 r = amdgpu_asic_reset(adev); 3696 if (r) 3697 dev_err(adev->dev, "asic reset on %s failed\n", __func__); 3698 } 3699 3700 return 0; 3701 } 3702 3703 /** 3704 * amdgpu_device_ip_fini - run fini for hardware IPs 3705 * 3706 * @adev: amdgpu_device pointer 3707 * 3708 * Main teardown pass for hardware IPs. The list of all the hardware 3709 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3710 * are run. hw_fini tears down the hardware associated with each IP 3711 * and sw_fini tears down any software state associated with each IP. 3712 * Returns 0 on success, negative error code on failure. 3713 */ 3714 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3715 { 3716 int i, r; 3717 3718 amdgpu_cper_fini(adev); 3719 3720 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3721 amdgpu_virt_release_ras_err_handler_data(adev); 3722 3723 if (adev->gmc.xgmi.num_physical_nodes > 1) 3724 amdgpu_xgmi_remove_device(adev); 3725 3726 amdgpu_amdkfd_device_fini_sw(adev); 3727 3728 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3729 if (!adev->ip_blocks[i].status.sw) 3730 continue; 3731 3732 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3733 amdgpu_ucode_free_bo(adev); 3734 amdgpu_free_static_csa(&adev->virt.csa_obj); 3735 amdgpu_device_wb_fini(adev); 3736 amdgpu_device_mem_scratch_fini(adev); 3737 amdgpu_ib_pool_fini(adev); 3738 amdgpu_seq64_fini(adev); 3739 amdgpu_doorbell_fini(adev); 3740 } 3741 if (adev->ip_blocks[i].version->funcs->sw_fini) { 3742 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]); 3743 /* XXX handle errors */ 3744 if (r) { 3745 dev_dbg(adev->dev, 3746 "sw_fini of IP block <%s> failed %d\n", 3747 adev->ip_blocks[i].version->funcs->name, 3748 r); 3749 } 3750 } 3751 adev->ip_blocks[i].status.sw = false; 3752 adev->ip_blocks[i].status.valid = false; 3753 } 3754 3755 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3756 if (!adev->ip_blocks[i].status.late_initialized) 3757 continue; 3758 if (adev->ip_blocks[i].version->funcs->late_fini) 3759 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]); 3760 adev->ip_blocks[i].status.late_initialized = false; 3761 } 3762 3763 amdgpu_ras_fini(adev); 3764 amdgpu_uid_fini(adev); 3765 3766 return 0; 3767 } 3768 3769 /** 3770 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3771 * 3772 * @work: work_struct. 3773 */ 3774 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3775 { 3776 struct amdgpu_device *adev = 3777 container_of(work, struct amdgpu_device, delayed_init_work.work); 3778 int r; 3779 3780 r = amdgpu_ib_ring_tests(adev); 3781 if (r) 3782 dev_err(adev->dev, "ib ring test failed (%d).\n", r); 3783 } 3784 3785 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3786 { 3787 struct amdgpu_device *adev = 3788 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3789 3790 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3791 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3792 3793 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true, 0)) 3794 adev->gfx.gfx_off_state = true; 3795 } 3796 3797 /** 3798 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3799 * 3800 * @adev: amdgpu_device pointer 3801 * 3802 * Main suspend function for hardware IPs. 
The list of all the hardware 3803 * IPs that make up the asic is walked, clockgating is disabled and the 3804 * suspend callbacks are run. suspend puts the hardware and software state 3805 * in each IP into a state suitable for suspend. 3806 * Returns 0 on success, negative error code on failure. 3807 */ 3808 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3809 { 3810 int i, r, rec; 3811 3812 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3813 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3814 3815 /* 3816 * Per PMFW team's suggestion, driver needs to handle gfxoff 3817 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3818 * scenario. Add the missing df cstate disablement here. 3819 */ 3820 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3821 dev_warn(adev->dev, "Failed to disallow df cstate"); 3822 3823 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3824 if (!adev->ip_blocks[i].status.valid) 3825 continue; 3826 3827 /* displays are handled separately */ 3828 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3829 continue; 3830 3831 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3832 if (r) 3833 goto unwind; 3834 } 3835 3836 return 0; 3837 unwind: 3838 rec = amdgpu_device_ip_resume_phase3(adev); 3839 if (rec) 3840 dev_err(adev->dev, 3841 "amdgpu_device_ip_resume_phase3 failed during unwind: %d\n", 3842 rec); 3843 3844 amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW); 3845 3846 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3847 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3848 3849 return r; 3850 } 3851 3852 /** 3853 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3854 * 3855 * @adev: amdgpu_device pointer 3856 * 3857 * Main suspend function for hardware IPs. The list of all the hardware 3858 * IPs that make up the asic is walked, clockgating is disabled and the 3859 * suspend callbacks are run. suspend puts the hardware and software state 3860 * in each IP into a state suitable for suspend. 3861 * Returns 0 on success, negative error code on failure. 3862 */ 3863 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3864 { 3865 int i, r, rec; 3866 3867 if (adev->in_s0ix) 3868 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3869 3870 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3871 if (!adev->ip_blocks[i].status.valid) 3872 continue; 3873 /* displays are handled in phase1 */ 3874 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3875 continue; 3876 /* PSP lost connection when err_event_athub occurs */ 3877 if (amdgpu_ras_intr_triggered() && 3878 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3879 adev->ip_blocks[i].status.hw = false; 3880 continue; 3881 } 3882 3883 /* skip unnecessary suspend if we do not initialize them yet */ 3884 if (!amdgpu_ip_member_of_hwini( 3885 adev, adev->ip_blocks[i].version->type)) 3886 continue; 3887 3888 /* Since we skip suspend for S0i3, we need to cancel the delayed 3889 * idle work here as the suspend callback never gets called. 3890 */ 3891 if (adev->in_s0ix && 3892 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX && 3893 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 0, 0)) 3894 cancel_delayed_work_sync(&adev->gfx.idle_work); 3895 /* skip suspend of gfx/mes and psp for S0ix 3896 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3897 * like at runtime. PSP is also part of the always on hardware 3898 * so no need to suspend it. 
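 * Together with the SDMA check further below this means that for S0ix the
 * GFX, MES, PSP and (on SDMA 5.x+) SDMA blocks keep their hardware state,
 * and only the remaining IPs go through their suspend callbacks.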
3899 */ 3900 if (adev->in_s0ix && 3901 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3902 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3903 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3904 continue; 3905 3906 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3907 if (adev->in_s0ix && 3908 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3909 IP_VERSION(5, 0, 0)) && 3910 (adev->ip_blocks[i].version->type == 3911 AMD_IP_BLOCK_TYPE_SDMA)) 3912 continue; 3913 3914 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3915 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3916 * from this location and RLC Autoload automatically also gets loaded 3917 * from here based on PMFW -> PSP message during re-init sequence. 3918 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3919 * the TMR and reload FWs again for IMU enabled APU ASICs. 3920 */ 3921 if (amdgpu_in_reset(adev) && 3922 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3923 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3924 continue; 3925 3926 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3927 if (r) 3928 goto unwind; 3929 3930 /* handle putting the SMC in the appropriate state */ 3931 if (!amdgpu_sriov_vf(adev)) { 3932 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3933 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3934 if (r) { 3935 dev_err(adev->dev, 3936 "SMC failed to set mp1 state %d, %d\n", 3937 adev->mp1_state, r); 3938 goto unwind; 3939 } 3940 } 3941 } 3942 } 3943 3944 return 0; 3945 unwind: 3946 /* suspend phase 2 = resume phase 1 + resume phase 2 */ 3947 rec = amdgpu_device_ip_resume_phase1(adev); 3948 if (rec) { 3949 dev_err(adev->dev, 3950 "amdgpu_device_ip_resume_phase1 failed during unwind: %d\n", 3951 rec); 3952 return r; 3953 } 3954 3955 rec = amdgpu_device_fw_loading(adev); 3956 if (rec) { 3957 dev_err(adev->dev, 3958 "amdgpu_device_fw_loading failed during unwind: %d\n", 3959 rec); 3960 return r; 3961 } 3962 3963 rec = amdgpu_device_ip_resume_phase2(adev); 3964 if (rec) { 3965 dev_err(adev->dev, 3966 "amdgpu_device_ip_resume_phase2 failed during unwind: %d\n", 3967 rec); 3968 return r; 3969 } 3970 3971 return r; 3972 } 3973 3974 /** 3975 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3976 * 3977 * @adev: amdgpu_device pointer 3978 * 3979 * Main suspend function for hardware IPs. The list of all the hardware 3980 * IPs that make up the asic is walked, clockgating is disabled and the 3981 * suspend callbacks are run. suspend puts the hardware and software state 3982 * in each IP into a state suitable for suspend. 3983 * Returns 0 on success, negative error code on failure. 
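 *
 * Illustrative sketch of what the suspend wrapper does with these phases
 * (it mirrors the existing code, it is not additional behaviour):
 *
 *   r = amdgpu_device_ip_suspend_phase1(adev);       // displays (DCE) only
 *   if (!r)
 *           r = amdgpu_device_ip_suspend_phase2(adev); // all remaining IPs
 *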
3984 */ 3985 static int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3986 { 3987 int r; 3988 3989 if (amdgpu_sriov_vf(adev)) { 3990 amdgpu_virt_fini_data_exchange(adev); 3991 amdgpu_virt_request_full_gpu(adev, false); 3992 } 3993 3994 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3995 3996 r = amdgpu_device_ip_suspend_phase1(adev); 3997 if (r) 3998 return r; 3999 r = amdgpu_device_ip_suspend_phase2(adev); 4000 4001 if (amdgpu_sriov_vf(adev)) 4002 amdgpu_virt_release_full_gpu(adev, false); 4003 4004 return r; 4005 } 4006 4007 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 4008 { 4009 int i, r; 4010 4011 static enum amd_ip_block_type ip_order[] = { 4012 AMD_IP_BLOCK_TYPE_COMMON, 4013 AMD_IP_BLOCK_TYPE_GMC, 4014 AMD_IP_BLOCK_TYPE_PSP, 4015 AMD_IP_BLOCK_TYPE_IH, 4016 }; 4017 4018 for (i = 0; i < adev->num_ip_blocks; i++) { 4019 int j; 4020 struct amdgpu_ip_block *block; 4021 4022 block = &adev->ip_blocks[i]; 4023 block->status.hw = false; 4024 4025 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 4026 4027 if (block->version->type != ip_order[j] || 4028 !block->status.valid) 4029 continue; 4030 4031 r = block->version->funcs->hw_init(&adev->ip_blocks[i]); 4032 if (r) { 4033 dev_err(adev->dev, "RE-INIT-early: %s failed\n", 4034 block->version->funcs->name); 4035 return r; 4036 } 4037 block->status.hw = true; 4038 } 4039 } 4040 4041 return 0; 4042 } 4043 4044 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 4045 { 4046 struct amdgpu_ip_block *block; 4047 int i, r = 0; 4048 4049 static enum amd_ip_block_type ip_order[] = { 4050 AMD_IP_BLOCK_TYPE_SMC, 4051 AMD_IP_BLOCK_TYPE_DCE, 4052 AMD_IP_BLOCK_TYPE_GFX, 4053 AMD_IP_BLOCK_TYPE_SDMA, 4054 AMD_IP_BLOCK_TYPE_MES, 4055 AMD_IP_BLOCK_TYPE_UVD, 4056 AMD_IP_BLOCK_TYPE_VCE, 4057 AMD_IP_BLOCK_TYPE_VCN, 4058 AMD_IP_BLOCK_TYPE_JPEG 4059 }; 4060 4061 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 4062 block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]); 4063 4064 if (!block) 4065 continue; 4066 4067 if (block->status.valid && !block->status.hw) { 4068 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) { 4069 r = amdgpu_ip_block_resume(block); 4070 } else { 4071 r = block->version->funcs->hw_init(block); 4072 } 4073 4074 if (r) { 4075 dev_err(adev->dev, "RE-INIT-late: %s failed\n", 4076 block->version->funcs->name); 4077 break; 4078 } 4079 block->status.hw = true; 4080 } 4081 } 4082 4083 return r; 4084 } 4085 4086 /** 4087 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 4088 * 4089 * @adev: amdgpu_device pointer 4090 * 4091 * First resume function for hardware IPs. The list of all the hardware 4092 * IPs that make up the asic is walked and the resume callbacks are run for 4093 * COMMON, GMC, and IH. resume puts the hardware into a functional state 4094 * after a suspend and updates the software state as necessary. This 4095 * function is also used for restoring the GPU after a GPU reset. 4096 * Returns 0 on success, negative error code on failure. 
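 *
 * Note that for SR-IOV virtual functions the PSP block is resumed in this
 * phase as well, as can be seen in the type check below.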
4097 */ 4098 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 4099 { 4100 int i, r; 4101 4102 for (i = 0; i < adev->num_ip_blocks; i++) { 4103 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 4104 continue; 4105 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 4106 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 4107 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 4108 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 4109 4110 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 4111 if (r) 4112 return r; 4113 } 4114 } 4115 4116 return 0; 4117 } 4118 4119 /** 4120 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 4121 * 4122 * @adev: amdgpu_device pointer 4123 * 4124 * Second resume function for hardware IPs. The list of all the hardware 4125 * IPs that make up the asic is walked and the resume callbacks are run for 4126 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 4127 * functional state after a suspend and updates the software state as 4128 * necessary. This function is also used for restoring the GPU after a GPU 4129 * reset. 4130 * Returns 0 on success, negative error code on failure. 4131 */ 4132 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 4133 { 4134 int i, r; 4135 4136 for (i = 0; i < adev->num_ip_blocks; i++) { 4137 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 4138 continue; 4139 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 4140 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 4141 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 4142 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE || 4143 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 4144 continue; 4145 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 4146 if (r) 4147 return r; 4148 } 4149 4150 return 0; 4151 } 4152 4153 /** 4154 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs 4155 * 4156 * @adev: amdgpu_device pointer 4157 * 4158 * Third resume function for hardware IPs. The list of all the hardware 4159 * IPs that make up the asic is walked and the resume callbacks are run for 4160 * all DCE. resume puts the hardware into a functional state after a suspend 4161 * and updates the software state as necessary. This function is also used 4162 * for restoring the GPU after a GPU reset. 4163 * 4164 * Returns 0 on success, negative error code on failure. 4165 */ 4166 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev) 4167 { 4168 int i, r; 4169 4170 for (i = 0; i < adev->num_ip_blocks; i++) { 4171 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 4172 continue; 4173 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) { 4174 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 4175 if (r) 4176 return r; 4177 } 4178 } 4179 4180 return 0; 4181 } 4182 4183 /** 4184 * amdgpu_device_ip_resume - run resume for hardware IPs 4185 * 4186 * @adev: amdgpu_device pointer 4187 * 4188 * Main resume function for hardware IPs. The hardware IPs 4189 * are split into two resume functions because they are 4190 * also used in recovering from a GPU reset and some additional 4191 * steps need to be take between them. In this case (S3/S4) they are 4192 * run sequentially. 4193 * Returns 0 on success, negative error code on failure. 
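 *
 * Rough ordering performed below (illustrative summary only):
 *
 *   amdgpu_device_ip_resume_phase1(adev);  // COMMON, GMC, IH (+PSP on SR-IOV)
 *   amdgpu_device_fw_loading(adev);
 *   amdgpu_device_ip_resume_phase2(adev);  // everything except COMMON/GMC/IH/DCE/PSP
 *   amdgpu_fence_driver_hw_init(adev);
 *   amdgpu_device_ip_resume_phase3(adev);  // DCE (displays)
 *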
4194 */ 4195 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 4196 { 4197 int r; 4198 4199 r = amdgpu_device_ip_resume_phase1(adev); 4200 if (r) 4201 return r; 4202 4203 r = amdgpu_device_fw_loading(adev); 4204 if (r) 4205 return r; 4206 4207 r = amdgpu_device_ip_resume_phase2(adev); 4208 4209 if (adev->mman.buffer_funcs_ring->sched.ready) 4210 amdgpu_ttm_set_buffer_funcs_status(adev, true); 4211 4212 if (r) 4213 return r; 4214 4215 amdgpu_fence_driver_hw_init(adev); 4216 4217 r = amdgpu_device_ip_resume_phase3(adev); 4218 4219 return r; 4220 } 4221 4222 /** 4223 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 4224 * 4225 * @adev: amdgpu_device pointer 4226 * 4227 * Query the VBIOS data tables to determine if the board supports SR-IOV. 4228 */ 4229 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 4230 { 4231 if (amdgpu_sriov_vf(adev)) { 4232 if (adev->is_atom_fw) { 4233 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 4234 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 4235 } else { 4236 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 4237 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 4238 } 4239 4240 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 4241 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 4242 } 4243 } 4244 4245 /** 4246 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 4247 * 4248 * @pdev: pci device context 4249 * @asic_type: AMD asic type 4250 * 4251 * Check if there is DC (new modesetting infrastructure) support for an asic. 4252 * Returns true if DC has support, false if not. 4253 */ 4254 bool amdgpu_device_asic_has_dc_support(struct pci_dev *pdev, 4255 enum amd_asic_type asic_type) 4256 { 4257 switch (asic_type) { 4258 #ifdef CONFIG_DRM_AMDGPU_SI 4259 case CHIP_HAINAN: 4260 #endif 4261 case CHIP_TOPAZ: 4262 /* chips with no display hardware */ 4263 return false; 4264 #if defined(CONFIG_DRM_AMD_DC) 4265 case CHIP_TAHITI: 4266 case CHIP_PITCAIRN: 4267 case CHIP_VERDE: 4268 case CHIP_OLAND: 4269 return amdgpu_dc != 0 && IS_ENABLED(CONFIG_DRM_AMD_DC_SI); 4270 case CHIP_KAVERI: 4271 case CHIP_KABINI: 4272 case CHIP_MULLINS: 4273 /* 4274 * We have systems in the wild with these ASICs that require 4275 * TRAVIS and NUTMEG support which is not supported with DC. 4276 * 4277 * Fall back to the non-DC driver here by default so as not to 4278 * cause regressions.
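 * (amdgpu_dc is the driver's "dc" module parameter, where a negative value
 * means auto-detect; the "amdgpu_dc > 0" check below therefore only opts
 * these chips into DC when the user explicitly requests it.)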
4279 */ 4280 return amdgpu_dc > 0; 4281 default: 4282 return amdgpu_dc != 0; 4283 #else 4284 default: 4285 if (amdgpu_dc > 0) 4286 dev_info_once( 4287 &pdev->dev, 4288 "Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 4289 return false; 4290 #endif 4291 } 4292 } 4293 4294 /** 4295 * amdgpu_device_has_dc_support - check if dc is supported 4296 * 4297 * @adev: amdgpu_device pointer 4298 * 4299 * Returns true for supported, false for not supported 4300 */ 4301 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 4302 { 4303 if (adev->enable_virtual_display || 4304 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 4305 return false; 4306 4307 return amdgpu_device_asic_has_dc_support(adev->pdev, adev->asic_type); 4308 } 4309 4310 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 4311 { 4312 struct amdgpu_device *adev = 4313 container_of(__work, struct amdgpu_device, xgmi_reset_work); 4314 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 4315 4316 /* It's a bug to not have a hive within this function */ 4317 if (WARN_ON(!hive)) 4318 return; 4319 4320 /* 4321 * Use task barrier to synchronize all xgmi reset works across the 4322 * hive. task_barrier_enter and task_barrier_exit will block 4323 * until all the threads running the xgmi reset works reach 4324 * those points. task_barrier_full will do both blocks. 4325 */ 4326 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 4327 4328 task_barrier_enter(&hive->tb); 4329 adev->asic_reset_res = amdgpu_device_baco_enter(adev); 4330 4331 if (adev->asic_reset_res) 4332 goto fail; 4333 4334 task_barrier_exit(&hive->tb); 4335 adev->asic_reset_res = amdgpu_device_baco_exit(adev); 4336 4337 if (adev->asic_reset_res) 4338 goto fail; 4339 4340 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 4341 } else { 4342 4343 task_barrier_full(&hive->tb); 4344 adev->asic_reset_res = amdgpu_asic_reset(adev); 4345 } 4346 4347 fail: 4348 if (adev->asic_reset_res) 4349 dev_warn(adev->dev, 4350 "ASIC reset failed with error, %d for drm dev, %s", 4351 adev->asic_reset_res, adev_to_drm(adev)->unique); 4352 amdgpu_put_xgmi_hive(hive); 4353 } 4354 4355 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 4356 { 4357 char *input = amdgpu_lockup_timeout; 4358 char *timeout_setting = NULL; 4359 int index = 0; 4360 long timeout; 4361 int ret = 0; 4362 4363 /* By default timeout for all queues is 2 sec */ 4364 adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout = 4365 adev->video_timeout = msecs_to_jiffies(2000); 4366 4367 if (!strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) 4368 return 0; 4369 4370 while ((timeout_setting = strsep(&input, ",")) && 4371 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4372 ret = kstrtol(timeout_setting, 0, &timeout); 4373 if (ret) 4374 return ret; 4375 4376 if (timeout == 0) { 4377 index++; 4378 continue; 4379 } else if (timeout < 0) { 4380 timeout = MAX_SCHEDULE_TIMEOUT; 4381 dev_warn(adev->dev, "lockup timeout disabled"); 4382 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 4383 } else { 4384 timeout = msecs_to_jiffies(timeout); 4385 } 4386 4387 switch (index++) { 4388 case 0: 4389 adev->gfx_timeout = timeout; 4390 break; 4391 case 1: 4392 adev->compute_timeout = timeout; 4393 break; 4394 case 2: 4395 adev->sdma_timeout = timeout; 4396 break; 4397 case 3: 4398 adev->video_timeout = timeout; 4399 break; 4400 default: 4401 break; 4402 } 4403 } 4404 4405 /* When only one value specified apply it 
to all queues. */ 4406 if (index == 1) 4407 adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout = 4408 adev->video_timeout = timeout; 4409 4410 return ret; 4411 } 4412 4413 /** 4414 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 4415 * 4416 * @adev: amdgpu_device pointer 4417 * 4418 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 4419 */ 4420 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 4421 { 4422 struct iommu_domain *domain; 4423 4424 domain = iommu_get_domain_for_dev(adev->dev); 4425 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 4426 adev->ram_is_direct_mapped = true; 4427 } 4428 4429 #if defined(CONFIG_HSA_AMD_P2P) 4430 /** 4431 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled. 4432 * 4433 * @adev: amdgpu_device pointer 4434 * 4435 * return if IOMMU remapping bar address 4436 */ 4437 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev) 4438 { 4439 struct iommu_domain *domain; 4440 4441 domain = iommu_get_domain_for_dev(adev->dev); 4442 if (domain && (domain->type == IOMMU_DOMAIN_DMA || 4443 domain->type == IOMMU_DOMAIN_DMA_FQ)) 4444 return true; 4445 4446 return false; 4447 } 4448 #endif 4449 4450 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 4451 { 4452 if (amdgpu_mcbp == 1) 4453 adev->gfx.mcbp = true; 4454 else if (amdgpu_mcbp == 0) 4455 adev->gfx.mcbp = false; 4456 4457 if (amdgpu_sriov_vf(adev)) 4458 adev->gfx.mcbp = true; 4459 4460 if (adev->gfx.mcbp) 4461 dev_info(adev->dev, "MCBP is enabled\n"); 4462 } 4463 4464 static int amdgpu_device_sys_interface_init(struct amdgpu_device *adev) 4465 { 4466 int r; 4467 4468 r = amdgpu_atombios_sysfs_init(adev); 4469 if (r) 4470 drm_err(&adev->ddev, 4471 "registering atombios sysfs failed (%d).\n", r); 4472 4473 r = amdgpu_pm_sysfs_init(adev); 4474 if (r) 4475 dev_err(adev->dev, "registering pm sysfs failed (%d).\n", r); 4476 4477 r = amdgpu_ucode_sysfs_init(adev); 4478 if (r) { 4479 adev->ucode_sysfs_en = false; 4480 dev_err(adev->dev, "Creating firmware sysfs failed (%d).\n", r); 4481 } else 4482 adev->ucode_sysfs_en = true; 4483 4484 r = amdgpu_device_attr_sysfs_init(adev); 4485 if (r) 4486 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4487 4488 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4489 if (r) 4490 dev_err(adev->dev, 4491 "Could not create amdgpu board attributes\n"); 4492 4493 amdgpu_fru_sysfs_init(adev); 4494 amdgpu_reg_state_sysfs_init(adev); 4495 amdgpu_xcp_sysfs_init(adev); 4496 4497 return r; 4498 } 4499 4500 static void amdgpu_device_sys_interface_fini(struct amdgpu_device *adev) 4501 { 4502 if (adev->pm.sysfs_initialized) 4503 amdgpu_pm_sysfs_fini(adev); 4504 if (adev->ucode_sysfs_en) 4505 amdgpu_ucode_sysfs_fini(adev); 4506 amdgpu_device_attr_sysfs_fini(adev); 4507 amdgpu_fru_sysfs_fini(adev); 4508 4509 amdgpu_reg_state_sysfs_fini(adev); 4510 amdgpu_xcp_sysfs_fini(adev); 4511 } 4512 4513 /** 4514 * amdgpu_device_init - initialize the driver 4515 * 4516 * @adev: amdgpu_device pointer 4517 * @flags: driver flags 4518 * 4519 * Initializes the driver info and hw (all asics). 4520 * Returns 0 for success or an error on failure. 4521 * Called at driver startup. 
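 * This is normally reached from the PCI probe path; on failure the partial
 * initialization is unwound through the error labels at the end of the
 * function.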
4522 */ 4523 int amdgpu_device_init(struct amdgpu_device *adev, 4524 uint32_t flags) 4525 { 4526 struct pci_dev *pdev = adev->pdev; 4527 int r, i; 4528 bool px = false; 4529 u32 max_MBps; 4530 int tmp; 4531 4532 adev->shutdown = false; 4533 adev->flags = flags; 4534 4535 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 4536 adev->asic_type = amdgpu_force_asic_type; 4537 else 4538 adev->asic_type = flags & AMD_ASIC_MASK; 4539 4540 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 4541 if (amdgpu_emu_mode == 1) 4542 adev->usec_timeout *= 10; 4543 adev->gmc.gart_size = 512 * 1024 * 1024; 4544 adev->accel_working = false; 4545 adev->num_rings = 0; 4546 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 4547 adev->mman.buffer_funcs = NULL; 4548 adev->mman.buffer_funcs_ring = NULL; 4549 adev->vm_manager.vm_pte_funcs = NULL; 4550 adev->vm_manager.vm_pte_num_scheds = 0; 4551 adev->gmc.gmc_funcs = NULL; 4552 adev->harvest_ip_mask = 0x0; 4553 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 4554 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 4555 4556 adev->smc_rreg = &amdgpu_invalid_rreg; 4557 adev->smc_wreg = &amdgpu_invalid_wreg; 4558 adev->pcie_rreg = &amdgpu_invalid_rreg; 4559 adev->pcie_wreg = &amdgpu_invalid_wreg; 4560 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 4561 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 4562 adev->pciep_rreg = &amdgpu_invalid_rreg; 4563 adev->pciep_wreg = &amdgpu_invalid_wreg; 4564 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 4565 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 4566 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 4567 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 4568 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 4569 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 4570 adev->didt_rreg = &amdgpu_invalid_rreg; 4571 adev->didt_wreg = &amdgpu_invalid_wreg; 4572 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 4573 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 4574 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 4575 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 4576 4577 dev_info( 4578 adev->dev, 4579 "initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 4580 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 4581 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 4582 4583 /* mutex initialization are all done here so we 4584 * can recall function without having locking issues 4585 */ 4586 mutex_init(&adev->firmware.mutex); 4587 mutex_init(&adev->pm.mutex); 4588 mutex_init(&adev->gfx.gpu_clock_mutex); 4589 mutex_init(&adev->srbm_mutex); 4590 mutex_init(&adev->gfx.pipe_reserve_mutex); 4591 mutex_init(&adev->gfx.gfx_off_mutex); 4592 mutex_init(&adev->gfx.partition_mutex); 4593 mutex_init(&adev->grbm_idx_mutex); 4594 mutex_init(&adev->mn_lock); 4595 mutex_init(&adev->virt.vf_errors.lock); 4596 hash_init(adev->mn_hash); 4597 mutex_init(&adev->psp.mutex); 4598 mutex_init(&adev->notifier_lock); 4599 mutex_init(&adev->pm.stable_pstate_ctx_lock); 4600 mutex_init(&adev->benchmark_mutex); 4601 mutex_init(&adev->gfx.reset_sem_mutex); 4602 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */ 4603 mutex_init(&adev->enforce_isolation_mutex); 4604 for (i = 0; i < MAX_XCP; ++i) { 4605 adev->isolation[i].spearhead = dma_fence_get_stub(); 4606 amdgpu_sync_create(&adev->isolation[i].active); 4607 amdgpu_sync_create(&adev->isolation[i].prev); 4608 } 4609 mutex_init(&adev->gfx.userq_sch_mutex); 4610 
mutex_init(&adev->gfx.workload_profile_mutex); 4611 mutex_init(&adev->vcn.workload_profile_mutex); 4612 4613 amdgpu_device_init_apu_flags(adev); 4614 4615 r = amdgpu_device_check_arguments(adev); 4616 if (r) 4617 return r; 4618 4619 spin_lock_init(&adev->mmio_idx_lock); 4620 spin_lock_init(&adev->smc_idx_lock); 4621 spin_lock_init(&adev->pcie_idx_lock); 4622 spin_lock_init(&adev->uvd_ctx_idx_lock); 4623 spin_lock_init(&adev->didt_idx_lock); 4624 spin_lock_init(&adev->gc_cac_idx_lock); 4625 spin_lock_init(&adev->se_cac_idx_lock); 4626 spin_lock_init(&adev->audio_endpt_idx_lock); 4627 spin_lock_init(&adev->mm_stats.lock); 4628 spin_lock_init(&adev->virt.rlcg_reg_lock); 4629 spin_lock_init(&adev->wb.lock); 4630 4631 xa_init_flags(&adev->userq_xa, XA_FLAGS_LOCK_IRQ); 4632 4633 INIT_LIST_HEAD(&adev->reset_list); 4634 4635 INIT_LIST_HEAD(&adev->ras_list); 4636 4637 INIT_LIST_HEAD(&adev->pm.od_kobj_list); 4638 4639 xa_init(&adev->userq_doorbell_xa); 4640 4641 INIT_DELAYED_WORK(&adev->delayed_init_work, 4642 amdgpu_device_delayed_init_work_handler); 4643 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 4644 amdgpu_device_delay_enable_gfx_off); 4645 /* 4646 * Initialize the enforce_isolation work structures for each XCP 4647 * partition. This work handler is responsible for enforcing shader 4648 * isolation on AMD GPUs. It counts the number of emitted fences for 4649 * each GFX and compute ring. If there are any fences, it schedules 4650 * the `enforce_isolation_work` to be run after a delay. If there are 4651 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the 4652 * runqueue. 4653 */ 4654 for (i = 0; i < MAX_XCP; i++) { 4655 INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work, 4656 amdgpu_gfx_enforce_isolation_handler); 4657 adev->gfx.enforce_isolation[i].adev = adev; 4658 adev->gfx.enforce_isolation[i].xcp_id = i; 4659 } 4660 4661 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 4662 INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work); 4663 4664 adev->gfx.gfx_off_req_count = 1; 4665 adev->gfx.gfx_off_residency = 0; 4666 adev->gfx.gfx_off_entrycount = 0; 4667 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 4668 4669 atomic_set(&adev->throttling_logging_enabled, 1); 4670 /* 4671 * If throttling continues, logging will be performed every minute 4672 * to avoid log flooding. "-1" is subtracted since the thermal 4673 * throttling interrupt comes every second. Thus, the total logging 4674 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting 4675 * for throttling interrupt) = 60 seconds.
4676 */ 4677 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 4678 4679 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 4680 4681 /* Registers mapping */ 4682 /* TODO: block userspace mapping of io register */ 4683 if (adev->asic_type >= CHIP_BONAIRE) { 4684 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 4685 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 4686 } else { 4687 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 4688 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 4689 } 4690 4691 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 4692 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 4693 4694 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 4695 if (!adev->rmmio) 4696 return -ENOMEM; 4697 4698 dev_info(adev->dev, "register mmio base: 0x%08X\n", 4699 (uint32_t)adev->rmmio_base); 4700 dev_info(adev->dev, "register mmio size: %u\n", 4701 (unsigned int)adev->rmmio_size); 4702 4703 /* 4704 * Reset domain needs to be present early, before XGMI hive discovered 4705 * (if any) and initialized to use reset sem and in_gpu reset flag 4706 * early on during init and before calling to RREG32. 4707 */ 4708 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 4709 if (!adev->reset_domain) 4710 return -ENOMEM; 4711 4712 /* detect hw virtualization here */ 4713 amdgpu_virt_init(adev); 4714 4715 amdgpu_device_get_pcie_info(adev); 4716 4717 r = amdgpu_device_get_job_timeout_settings(adev); 4718 if (r) { 4719 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 4720 return r; 4721 } 4722 4723 amdgpu_device_set_mcbp(adev); 4724 4725 /* 4726 * By default, use default mode where all blocks are expected to be 4727 * initialized. At present a 'swinit' of blocks is required to be 4728 * completed before the need for a different level is detected. 4729 */ 4730 amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT); 4731 /* early init functions */ 4732 r = amdgpu_device_ip_early_init(adev); 4733 if (r) 4734 return r; 4735 4736 /* 4737 * No need to remove conflicting FBs for non-display class devices. 4738 * This prevents the sysfb from being freed accidentally. 4739 */ 4740 if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA || 4741 (pdev->class >> 8) == PCI_CLASS_DISPLAY_OTHER) { 4742 /* Get rid of things like offb */ 4743 r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name); 4744 if (r) 4745 return r; 4746 } 4747 4748 /* Enable TMZ based on IP_VERSION */ 4749 amdgpu_gmc_tmz_set(adev); 4750 4751 if (amdgpu_sriov_vf(adev) && 4752 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0)) 4753 /* VF MMIO access (except mailbox range) from CPU 4754 * will be blocked during sriov runtime 4755 */ 4756 adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT; 4757 4758 amdgpu_gmc_noretry_set(adev); 4759 /* Need to get xgmi info early to decide the reset behavior */ 4760 if (adev->gmc.xgmi.supported) { 4761 r = adev->gfxhub.funcs->get_xgmi_info(adev); 4762 if (r) 4763 return r; 4764 } 4765 4766 /* enable PCIE atomic ops */ 4767 if (amdgpu_sriov_vf(adev)) { 4768 if (adev->virt.fw_reserve.p_pf2vf) 4769 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 4770 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 4771 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4772 /* APUs with gfx9 and onwards don't rely on PCIe atomics; instead an 4773 * internal path natively supports atomics, so set have_atomics_support to true.
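 * Bare-metal dGPUs, by contrast, probe the root port through
 * pci_enable_atomic_ops_to_root() in the else branch below.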
4774 */ 4775 } else if ((adev->flags & AMD_IS_APU) && 4776 (amdgpu_ip_version(adev, GC_HWIP, 0) > 4777 IP_VERSION(9, 0, 0))) { 4778 adev->have_atomics_support = true; 4779 } else { 4780 adev->have_atomics_support = 4781 !pci_enable_atomic_ops_to_root(adev->pdev, 4782 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 4783 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4784 } 4785 4786 if (!adev->have_atomics_support) 4787 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 4788 4789 /* doorbell bar mapping and doorbell index init*/ 4790 amdgpu_doorbell_init(adev); 4791 4792 if (amdgpu_emu_mode == 1) { 4793 /* post the asic on emulation mode */ 4794 emu_soc_asic_init(adev); 4795 goto fence_driver_init; 4796 } 4797 4798 amdgpu_reset_init(adev); 4799 4800 /* detect if we are with an SRIOV vbios */ 4801 if (adev->bios) 4802 amdgpu_device_detect_sriov_bios(adev); 4803 4804 /* check if we need to reset the asic 4805 * E.g., driver was not cleanly unloaded previously, etc. 4806 */ 4807 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4808 if (adev->gmc.xgmi.num_physical_nodes) { 4809 dev_info(adev->dev, "Pending hive reset.\n"); 4810 amdgpu_set_init_level(adev, 4811 AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 4812 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && 4813 !amdgpu_device_has_display_hardware(adev)) { 4814 r = psp_gpu_reset(adev); 4815 } else { 4816 tmp = amdgpu_reset_method; 4817 /* It should do a default reset when loading or reloading the driver, 4818 * regardless of the module parameter reset_method. 4819 */ 4820 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4821 r = amdgpu_asic_reset(adev); 4822 amdgpu_reset_method = tmp; 4823 } 4824 4825 if (r) { 4826 dev_err(adev->dev, "asic reset on init failed\n"); 4827 goto failed; 4828 } 4829 } 4830 4831 /* Post card if necessary */ 4832 if (amdgpu_device_need_post(adev)) { 4833 if (!adev->bios) { 4834 dev_err(adev->dev, "no vBIOS found\n"); 4835 r = -EINVAL; 4836 goto failed; 4837 } 4838 dev_info(adev->dev, "GPU posting now...\n"); 4839 r = amdgpu_device_asic_init(adev); 4840 if (r) { 4841 dev_err(adev->dev, "gpu post error!\n"); 4842 goto failed; 4843 } 4844 } 4845 4846 if (adev->bios) { 4847 if (adev->is_atom_fw) { 4848 /* Initialize clocks */ 4849 r = amdgpu_atomfirmware_get_clock_info(adev); 4850 if (r) { 4851 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4852 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4853 goto failed; 4854 } 4855 } else { 4856 /* Initialize clocks */ 4857 r = amdgpu_atombios_get_clock_info(adev); 4858 if (r) { 4859 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4860 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4861 goto failed; 4862 } 4863 /* init i2c buses */ 4864 amdgpu_i2c_init(adev); 4865 } 4866 } 4867 4868 fence_driver_init: 4869 /* Fence driver */ 4870 r = amdgpu_fence_driver_sw_init(adev); 4871 if (r) { 4872 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4873 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4874 goto failed; 4875 } 4876 4877 /* init the mode config */ 4878 drm_mode_config_init(adev_to_drm(adev)); 4879 4880 r = amdgpu_device_ip_init(adev); 4881 if (r) { 4882 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4883 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4884 goto release_ras_con; 4885 } 4886 4887 amdgpu_fence_driver_hw_init(adev); 4888 4889 dev_info(adev->dev, 4890 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4891 
adev->gfx.config.max_shader_engines, 4892 adev->gfx.config.max_sh_per_se, 4893 adev->gfx.config.max_cu_per_sh, 4894 adev->gfx.cu_info.number); 4895 4896 adev->accel_working = true; 4897 4898 amdgpu_vm_check_compute_bug(adev); 4899 4900 /* Initialize the buffer migration limit. */ 4901 if (amdgpu_moverate >= 0) 4902 max_MBps = amdgpu_moverate; 4903 else 4904 max_MBps = 8; /* Allow 8 MB/s. */ 4905 /* Get a log2 for easy divisions. */ 4906 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4907 4908 /* 4909 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4910 * Otherwise the mgpu fan boost feature will be skipped due to the 4911 * gpu instance is counted less. 4912 */ 4913 amdgpu_register_gpu_instance(adev); 4914 4915 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4916 * explicit gating rather than handling it automatically. 4917 */ 4918 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 4919 r = amdgpu_device_ip_late_init(adev); 4920 if (r) { 4921 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4922 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4923 goto release_ras_con; 4924 } 4925 /* must succeed. */ 4926 amdgpu_ras_resume(adev); 4927 queue_delayed_work(system_wq, &adev->delayed_init_work, 4928 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4929 } 4930 4931 if (amdgpu_sriov_vf(adev)) { 4932 amdgpu_virt_release_full_gpu(adev, true); 4933 flush_delayed_work(&adev->delayed_init_work); 4934 } 4935 4936 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 4937 amdgpu_xgmi_reset_on_init(adev); 4938 /* 4939 * Place those sysfs registering after `late_init`. As some of those 4940 * operations performed in `late_init` might affect the sysfs 4941 * interfaces creating. 4942 */ 4943 r = amdgpu_device_sys_interface_init(adev); 4944 4945 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4946 r = amdgpu_pmu_init(adev); 4947 if (r) 4948 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4949 4950 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4951 if (amdgpu_device_cache_pci_state(adev->pdev)) 4952 pci_restore_state(pdev); 4953 4954 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4955 /* this will fail for cards that aren't VGA class devices, just 4956 * ignore it 4957 */ 4958 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4959 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4960 4961 px = amdgpu_device_supports_px(adev); 4962 4963 if (px || (!dev_is_removable(&adev->pdev->dev) && 4964 apple_gmux_detect(NULL, NULL))) 4965 vga_switcheroo_register_client(adev->pdev, 4966 &amdgpu_switcheroo_ops, px); 4967 4968 if (px) 4969 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4970 4971 amdgpu_device_check_iommu_direct_map(adev); 4972 4973 adev->pm_nb.notifier_call = amdgpu_device_pm_notifier; 4974 r = register_pm_notifier(&adev->pm_nb); 4975 if (r) 4976 goto failed; 4977 4978 return 0; 4979 4980 release_ras_con: 4981 if (amdgpu_sriov_vf(adev)) 4982 amdgpu_virt_release_full_gpu(adev, true); 4983 4984 /* failed in exclusive mode due to timeout */ 4985 if (amdgpu_sriov_vf(adev) && 4986 !amdgpu_sriov_runtime(adev) && 4987 amdgpu_virt_mmio_blocked(adev) && 4988 !amdgpu_virt_wait_reset(adev)) { 4989 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4990 /* Don't send request since VF is inactive. 
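 * Clearing the runtime cap and the virt ops right below ensures no further
 * requests are sent to the host from this inactive VF before the error is
 * propagated.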
*/ 4991 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4992 adev->virt.ops = NULL; 4993 r = -EAGAIN; 4994 } 4995 amdgpu_release_ras_context(adev); 4996 4997 failed: 4998 amdgpu_vf_error_trans_all(adev); 4999 5000 return r; 5001 } 5002 5003 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 5004 { 5005 5006 /* Clear all CPU mappings pointing to this device */ 5007 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 5008 5009 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 5010 amdgpu_doorbell_fini(adev); 5011 5012 iounmap(adev->rmmio); 5013 adev->rmmio = NULL; 5014 if (adev->mman.aper_base_kaddr) 5015 iounmap(adev->mman.aper_base_kaddr); 5016 adev->mman.aper_base_kaddr = NULL; 5017 5018 /* Memory manager related */ 5019 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 5020 arch_phys_wc_del(adev->gmc.vram_mtrr); 5021 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 5022 } 5023 } 5024 5025 /** 5026 * amdgpu_device_fini_hw - tear down the driver 5027 * 5028 * @adev: amdgpu_device pointer 5029 * 5030 * Tear down the driver info (all asics). 5031 * Called at driver shutdown. 5032 */ 5033 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 5034 { 5035 dev_info(adev->dev, "amdgpu: finishing device.\n"); 5036 flush_delayed_work(&adev->delayed_init_work); 5037 5038 if (adev->mman.initialized) 5039 drain_workqueue(adev->mman.bdev.wq); 5040 adev->shutdown = true; 5041 5042 unregister_pm_notifier(&adev->pm_nb); 5043 5044 /* make sure IB test finished before entering exclusive mode 5045 * to avoid preemption on IB test 5046 */ 5047 if (amdgpu_sriov_vf(adev)) { 5048 amdgpu_virt_request_full_gpu(adev, false); 5049 amdgpu_virt_fini_data_exchange(adev); 5050 } 5051 5052 /* disable all interrupts */ 5053 amdgpu_irq_disable_all(adev); 5054 if (adev->mode_info.mode_config_initialized) { 5055 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 5056 drm_helper_force_disable_all(adev_to_drm(adev)); 5057 else 5058 drm_atomic_helper_shutdown(adev_to_drm(adev)); 5059 } 5060 amdgpu_fence_driver_hw_fini(adev); 5061 5062 amdgpu_device_sys_interface_fini(adev); 5063 5064 /* disable ras feature must before hw fini */ 5065 amdgpu_ras_pre_fini(adev); 5066 5067 amdgpu_ttm_set_buffer_funcs_status(adev, false); 5068 5069 amdgpu_device_ip_fini_early(adev); 5070 5071 amdgpu_irq_fini_hw(adev); 5072 5073 if (adev->mman.initialized) 5074 ttm_device_clear_dma_mappings(&adev->mman.bdev); 5075 5076 amdgpu_gart_dummy_page_fini(adev); 5077 5078 if (drm_dev_is_unplugged(adev_to_drm(adev))) 5079 amdgpu_device_unmap_mmio(adev); 5080 5081 } 5082 5083 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 5084 { 5085 int i, idx; 5086 bool px; 5087 5088 amdgpu_device_ip_fini(adev); 5089 amdgpu_fence_driver_sw_fini(adev); 5090 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 5091 adev->accel_working = false; 5092 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 5093 for (i = 0; i < MAX_XCP; ++i) { 5094 dma_fence_put(adev->isolation[i].spearhead); 5095 amdgpu_sync_free(&adev->isolation[i].active); 5096 amdgpu_sync_free(&adev->isolation[i].prev); 5097 } 5098 5099 amdgpu_reset_fini(adev); 5100 5101 /* free i2c buses */ 5102 amdgpu_i2c_fini(adev); 5103 5104 if (adev->bios) { 5105 if (amdgpu_emu_mode != 1) 5106 amdgpu_atombios_fini(adev); 5107 amdgpu_bios_release(adev); 5108 } 5109 5110 kfree(adev->fru_info); 5111 adev->fru_info = NULL; 5112 5113 kfree(adev->xcp_mgr); 5114 adev->xcp_mgr = NULL; 5115 5116 px = amdgpu_device_supports_px(adev); 5117 5118 if 
(px || (!dev_is_removable(&adev->pdev->dev) && 5119 apple_gmux_detect(NULL, NULL))) 5120 vga_switcheroo_unregister_client(adev->pdev); 5121 5122 if (px) 5123 vga_switcheroo_fini_domain_pm_ops(adev->dev); 5124 5125 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 5126 vga_client_unregister(adev->pdev); 5127 5128 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 5129 5130 iounmap(adev->rmmio); 5131 adev->rmmio = NULL; 5132 drm_dev_exit(idx); 5133 } 5134 5135 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 5136 amdgpu_pmu_fini(adev); 5137 if (adev->discovery.bin) 5138 amdgpu_discovery_fini(adev); 5139 5140 amdgpu_reset_put_reset_domain(adev->reset_domain); 5141 adev->reset_domain = NULL; 5142 5143 kfree(adev->pci_state); 5144 kfree(adev->pcie_reset_ctx.swds_pcistate); 5145 kfree(adev->pcie_reset_ctx.swus_pcistate); 5146 } 5147 5148 /** 5149 * amdgpu_device_evict_resources - evict device resources 5150 * @adev: amdgpu device object 5151 * 5152 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 5153 * of the vram memory type. Mainly used for evicting device resources 5154 * at suspend time. 5155 * 5156 */ 5157 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 5158 { 5159 int ret; 5160 5161 /* No need to evict vram on APUs unless going to S4 */ 5162 if (!adev->in_s4 && (adev->flags & AMD_IS_APU)) 5163 return 0; 5164 5165 /* No need to evict when going to S5 through S4 callbacks */ 5166 if (system_state == SYSTEM_POWER_OFF) 5167 return 0; 5168 5169 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 5170 if (ret) { 5171 dev_warn(adev->dev, "evicting device resources failed\n"); 5172 return ret; 5173 } 5174 5175 if (adev->in_s4) { 5176 ret = ttm_device_prepare_hibernation(&adev->mman.bdev); 5177 if (ret) 5178 dev_err(adev->dev, "prepare hibernation failed, %d\n", ret); 5179 } 5180 return ret; 5181 } 5182 5183 /* 5184 * Suspend & resume. 5185 */ 5186 /** 5187 * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events 5188 * @nb: notifier block 5189 * @mode: suspend mode 5190 * @data: data 5191 * 5192 * This function is called when the system is about to suspend or hibernate. 5193 * It is used to set the appropriate flags so that eviction can be optimized 5194 * in the pm prepare callback. 5195 */ 5196 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 5197 void *data) 5198 { 5199 struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb); 5200 5201 switch (mode) { 5202 case PM_HIBERNATION_PREPARE: 5203 adev->in_s4 = true; 5204 break; 5205 case PM_POST_HIBERNATION: 5206 adev->in_s4 = false; 5207 break; 5208 } 5209 5210 return NOTIFY_DONE; 5211 } 5212 5213 /** 5214 * amdgpu_device_prepare - prepare for device suspend 5215 * 5216 * @dev: drm dev pointer 5217 * 5218 * Prepare to put the hw in the suspend state (all asics). 5219 * Returns 0 for success or an error on failure. 5220 * Called at driver suspend. 
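 *
 * Illustrative placement in the suspend/resume sequence (assumed typical PM
 * flow, not additional driver logic):
 *
 *   amdgpu_device_prepare(dev);       // evict resources, per-IP prepare_suspend
 *   amdgpu_device_suspend(dev, true);
 *   // ...system sleep...
 *   amdgpu_device_resume(dev, true);
 *   amdgpu_device_complete(dev);      // runs on all resume transitions
 *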
5221 */ 5222 int amdgpu_device_prepare(struct drm_device *dev) 5223 { 5224 struct amdgpu_device *adev = drm_to_adev(dev); 5225 int i, r; 5226 5227 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5228 return 0; 5229 5230 /* Evict the majority of BOs before starting suspend sequence */ 5231 r = amdgpu_device_evict_resources(adev); 5232 if (r) 5233 return r; 5234 5235 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 5236 5237 for (i = 0; i < adev->num_ip_blocks; i++) { 5238 if (!adev->ip_blocks[i].status.valid) 5239 continue; 5240 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 5241 continue; 5242 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]); 5243 if (r) 5244 return r; 5245 } 5246 5247 return 0; 5248 } 5249 5250 /** 5251 * amdgpu_device_complete - complete power state transition 5252 * 5253 * @dev: drm dev pointer 5254 * 5255 * Undo the changes from amdgpu_device_prepare. This will be 5256 * called on all resume transitions, including those that failed. 5257 */ 5258 void amdgpu_device_complete(struct drm_device *dev) 5259 { 5260 struct amdgpu_device *adev = drm_to_adev(dev); 5261 int i; 5262 5263 for (i = 0; i < adev->num_ip_blocks; i++) { 5264 if (!adev->ip_blocks[i].status.valid) 5265 continue; 5266 if (!adev->ip_blocks[i].version->funcs->complete) 5267 continue; 5268 adev->ip_blocks[i].version->funcs->complete(&adev->ip_blocks[i]); 5269 } 5270 } 5271 5272 /** 5273 * amdgpu_device_suspend - initiate device suspend 5274 * 5275 * @dev: drm dev pointer 5276 * @notify_clients: notify in-kernel DRM clients 5277 * 5278 * Puts the hw in the suspend state (all asics). 5279 * Returns 0 for success or an error on failure. 5280 * Called at driver suspend. 5281 */ 5282 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) 5283 { 5284 struct amdgpu_device *adev = drm_to_adev(dev); 5285 int r, rec; 5286 5287 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5288 return 0; 5289 5290 adev->in_suspend = true; 5291 5292 if (amdgpu_sriov_vf(adev)) { 5293 if (!adev->in_runpm) 5294 amdgpu_amdkfd_suspend_process(adev); 5295 amdgpu_virt_fini_data_exchange(adev); 5296 r = amdgpu_virt_request_full_gpu(adev, false); 5297 if (r) 5298 return r; 5299 } 5300 5301 r = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D3); 5302 if (r) 5303 goto unwind_sriov; 5304 5305 if (notify_clients) 5306 drm_client_dev_suspend(adev_to_drm(adev)); 5307 5308 cancel_delayed_work_sync(&adev->delayed_init_work); 5309 5310 amdgpu_ras_suspend(adev); 5311 5312 r = amdgpu_device_ip_suspend_phase1(adev); 5313 if (r) 5314 goto unwind_smartshift; 5315 5316 amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); 5317 r = amdgpu_userq_suspend(adev); 5318 if (r) 5319 goto unwind_ip_phase1; 5320 5321 r = amdgpu_device_evict_resources(adev); 5322 if (r) 5323 goto unwind_userq; 5324 5325 amdgpu_ttm_set_buffer_funcs_status(adev, false); 5326 5327 amdgpu_fence_driver_hw_fini(adev); 5328 5329 r = amdgpu_device_ip_suspend_phase2(adev); 5330 if (r) 5331 goto unwind_evict; 5332 5333 if (amdgpu_sriov_vf(adev)) 5334 amdgpu_virt_release_full_gpu(adev, false); 5335 5336 return 0; 5337 5338 unwind_evict: 5339 if (adev->mman.buffer_funcs_ring->sched.ready) 5340 amdgpu_ttm_set_buffer_funcs_status(adev, true); 5341 amdgpu_fence_driver_hw_init(adev); 5342 5343 unwind_userq: 5344 rec = amdgpu_userq_resume(adev); 5345 if (rec) { 5346 dev_warn(adev->dev, "failed to re-initialize user queues: %d\n", rec); 5347 return r; 5348 } 5349 rec = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) 
&& !adev->in_runpm); 5350 if (rec) { 5351 dev_warn(adev->dev, "failed to re-initialize kfd: %d\n", rec); 5352 return r; 5353 } 5354 5355 unwind_ip_phase1: 5356 /* suspend phase 1 = resume phase 3 */ 5357 rec = amdgpu_device_ip_resume_phase3(adev); 5358 if (rec) { 5359 dev_warn(adev->dev, "failed to re-initialize IPs phase1: %d\n", rec); 5360 return r; 5361 } 5362 5363 unwind_smartshift: 5364 rec = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0); 5365 if (rec) { 5366 dev_warn(adev->dev, "failed to re-update smart shift: %d\n", rec); 5367 return r; 5368 } 5369 5370 if (notify_clients) 5371 drm_client_dev_resume(adev_to_drm(adev)); 5372 5373 amdgpu_ras_resume(adev); 5374 5375 unwind_sriov: 5376 if (amdgpu_sriov_vf(adev)) { 5377 rec = amdgpu_virt_request_full_gpu(adev, true); 5378 if (rec) { 5379 dev_warn(adev->dev, "failed to reinitialize sriov: %d\n", rec); 5380 return r; 5381 } 5382 } 5383 5384 adev->in_suspend = adev->in_s0ix = adev->in_s3 = false; 5385 5386 return r; 5387 } 5388 5389 static inline int amdgpu_virt_resume(struct amdgpu_device *adev) 5390 { 5391 int r; 5392 unsigned int prev_physical_node_id = adev->gmc.xgmi.physical_node_id; 5393 5394 /* During VM resume, QEMU programming of VF MSIX table (register GFXMSIX_VECT0_ADDR_LO) 5395 * may not work. The access could be blocked by nBIF protection as VF isn't in 5396 * exclusive access mode. Exclusive access is enabled now, disable/enable MSIX 5397 * so that QEMU reprograms MSIX table. 5398 */ 5399 amdgpu_restore_msix(adev); 5400 5401 r = adev->gfxhub.funcs->get_xgmi_info(adev); 5402 if (r) 5403 return r; 5404 5405 dev_info(adev->dev, "xgmi node, old id %d, new id %d\n", 5406 prev_physical_node_id, adev->gmc.xgmi.physical_node_id); 5407 5408 adev->vm_manager.vram_base_offset = adev->gfxhub.funcs->get_mc_fb_offset(adev); 5409 adev->vm_manager.vram_base_offset += 5410 adev->gmc.xgmi.physical_node_id * adev->gmc.xgmi.node_segment_size; 5411 5412 return 0; 5413 } 5414 5415 /** 5416 * amdgpu_device_resume - initiate device resume 5417 * 5418 * @dev: drm dev pointer 5419 * @notify_clients: notify in-kernel DRM clients 5420 * 5421 * Bring the hw back to operating state (all asics). 5422 * Returns 0 for success or an error on failure. 5423 * Called at driver resume. 
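 *
 * For SR-IOV virtual functions the whole sequence below runs with full GPU
 * access requested from the host and released again before returning.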
5424 */ 5425 int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) 5426 { 5427 struct amdgpu_device *adev = drm_to_adev(dev); 5428 int r = 0; 5429 5430 if (amdgpu_sriov_vf(adev)) { 5431 r = amdgpu_virt_request_full_gpu(adev, true); 5432 if (r) 5433 return r; 5434 } 5435 5436 if (amdgpu_virt_xgmi_migrate_enabled(adev)) { 5437 r = amdgpu_virt_resume(adev); 5438 if (r) 5439 goto exit; 5440 } 5441 5442 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5443 return 0; 5444 5445 if (adev->in_s0ix) 5446 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 5447 5448 /* post card */ 5449 if (amdgpu_device_need_post(adev)) { 5450 r = amdgpu_device_asic_init(adev); 5451 if (r) 5452 dev_err(adev->dev, "amdgpu asic init failed\n"); 5453 } 5454 5455 r = amdgpu_device_ip_resume(adev); 5456 5457 if (r) { 5458 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 5459 goto exit; 5460 } 5461 5462 r = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); 5463 if (r) 5464 goto exit; 5465 5466 r = amdgpu_userq_resume(adev); 5467 if (r) 5468 goto exit; 5469 5470 r = amdgpu_device_ip_late_init(adev); 5471 if (r) 5472 goto exit; 5473 5474 queue_delayed_work(system_wq, &adev->delayed_init_work, 5475 msecs_to_jiffies(AMDGPU_RESUME_MS)); 5476 exit: 5477 if (amdgpu_sriov_vf(adev)) { 5478 amdgpu_virt_init_data_exchange(adev); 5479 amdgpu_virt_release_full_gpu(adev, true); 5480 5481 if (!r && !adev->in_runpm) 5482 r = amdgpu_amdkfd_resume_process(adev); 5483 } 5484 5485 if (r) 5486 return r; 5487 5488 /* Make sure IB tests flushed */ 5489 flush_delayed_work(&adev->delayed_init_work); 5490 5491 if (notify_clients) 5492 drm_client_dev_resume(adev_to_drm(adev)); 5493 5494 amdgpu_ras_resume(adev); 5495 5496 if (adev->mode_info.num_crtc) { 5497 /* 5498 * Most of the connector probing functions try to acquire runtime pm 5499 * refs to ensure that the GPU is powered on when connector polling is 5500 * performed. Since we're calling this from a runtime PM callback, 5501 * trying to acquire rpm refs will cause us to deadlock. 5502 * 5503 * Since we're guaranteed to be holding the rpm lock, it's safe to 5504 * temporarily disable the rpm helpers so this doesn't deadlock us. 5505 */ 5506 #ifdef CONFIG_PM 5507 dev->dev->power.disable_depth++; 5508 #endif 5509 if (!adev->dc_enabled) 5510 drm_helper_hpd_irq_event(dev); 5511 else 5512 drm_kms_helper_hotplug_event(dev); 5513 #ifdef CONFIG_PM 5514 dev->dev->power.disable_depth--; 5515 #endif 5516 } 5517 5518 amdgpu_vram_mgr_clear_reset_blocks(adev); 5519 adev->in_suspend = false; 5520 5521 if (amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0)) 5522 dev_warn(adev->dev, "smart shift update failed\n"); 5523 5524 return 0; 5525 } 5526 5527 /** 5528 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 5529 * 5530 * @adev: amdgpu_device pointer 5531 * 5532 * The list of all the hardware IPs that make up the asic is walked and 5533 * the check_soft_reset callbacks are run. check_soft_reset determines 5534 * if the asic is still hung or not. 5535 * Returns true if any of the IPs are still in a hung state, false if not. 
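 *
 * SR-IOV virtual functions and ASICs that report needing a full reset are
 * treated as hung unconditionally; see the early returns below.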
5536 */ 5537 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 5538 { 5539 int i; 5540 bool asic_hang = false; 5541 5542 if (amdgpu_sriov_vf(adev)) 5543 return true; 5544 5545 if (amdgpu_asic_need_full_reset(adev)) 5546 return true; 5547 5548 for (i = 0; i < adev->num_ip_blocks; i++) { 5549 if (!adev->ip_blocks[i].status.valid) 5550 continue; 5551 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 5552 adev->ip_blocks[i].status.hang = 5553 adev->ip_blocks[i].version->funcs->check_soft_reset( 5554 &adev->ip_blocks[i]); 5555 if (adev->ip_blocks[i].status.hang) { 5556 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 5557 asic_hang = true; 5558 } 5559 } 5560 return asic_hang; 5561 } 5562 5563 /** 5564 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 5565 * 5566 * @adev: amdgpu_device pointer 5567 * 5568 * The list of all the hardware IPs that make up the asic is walked and the 5569 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 5570 * handles any IP specific hardware or software state changes that are 5571 * necessary for a soft reset to succeed. 5572 * Returns 0 on success, negative error code on failure. 5573 */ 5574 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 5575 { 5576 int i, r = 0; 5577 5578 for (i = 0; i < adev->num_ip_blocks; i++) { 5579 if (!adev->ip_blocks[i].status.valid) 5580 continue; 5581 if (adev->ip_blocks[i].status.hang && 5582 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 5583 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]); 5584 if (r) 5585 return r; 5586 } 5587 } 5588 5589 return 0; 5590 } 5591 5592 /** 5593 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 5594 * 5595 * @adev: amdgpu_device pointer 5596 * 5597 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 5598 * reset is necessary to recover. 5599 * Returns true if a full asic reset is required, false if not. 5600 */ 5601 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 5602 { 5603 int i; 5604 5605 if (amdgpu_asic_need_full_reset(adev)) 5606 return true; 5607 5608 for (i = 0; i < adev->num_ip_blocks; i++) { 5609 if (!adev->ip_blocks[i].status.valid) 5610 continue; 5611 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 5612 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 5613 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 5614 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 5615 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 5616 if (adev->ip_blocks[i].status.hang) { 5617 dev_info(adev->dev, "Some block need full reset!\n"); 5618 return true; 5619 } 5620 } 5621 } 5622 return false; 5623 } 5624 5625 /** 5626 * amdgpu_device_ip_soft_reset - do a soft reset 5627 * 5628 * @adev: amdgpu_device pointer 5629 * 5630 * The list of all the hardware IPs that make up the asic is walked and the 5631 * soft_reset callbacks are run if the block is hung. soft_reset handles any 5632 * IP specific hardware or software state changes that are necessary to soft 5633 * reset the IP. 5634 * Returns 0 on success, negative error code on failure. 
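 *
 * Illustrative ordering of the soft-reset helpers in this file (a sketch of
 * the intended sequence, not a verbatim caller):
 *
 *   if (amdgpu_device_ip_check_soft_reset(adev) &&
 *       !amdgpu_device_ip_need_full_reset(adev)) {
 *           amdgpu_device_ip_pre_soft_reset(adev);
 *           amdgpu_device_ip_soft_reset(adev);
 *           amdgpu_device_ip_post_soft_reset(adev);
 *   }
 *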
5635 */ 5636 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 5637 { 5638 int i, r = 0; 5639 5640 for (i = 0; i < adev->num_ip_blocks; i++) { 5641 if (!adev->ip_blocks[i].status.valid) 5642 continue; 5643 if (adev->ip_blocks[i].status.hang && 5644 adev->ip_blocks[i].version->funcs->soft_reset) { 5645 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]); 5646 if (r) 5647 return r; 5648 } 5649 } 5650 5651 return 0; 5652 } 5653 5654 /** 5655 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 5656 * 5657 * @adev: amdgpu_device pointer 5658 * 5659 * The list of all the hardware IPs that make up the asic is walked and the 5660 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 5661 * handles any IP specific hardware or software state changes that are 5662 * necessary after the IP has been soft reset. 5663 * Returns 0 on success, negative error code on failure. 5664 */ 5665 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 5666 { 5667 int i, r = 0; 5668 5669 for (i = 0; i < adev->num_ip_blocks; i++) { 5670 if (!adev->ip_blocks[i].status.valid) 5671 continue; 5672 if (adev->ip_blocks[i].status.hang && 5673 adev->ip_blocks[i].version->funcs->post_soft_reset) 5674 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]); 5675 if (r) 5676 return r; 5677 } 5678 5679 return 0; 5680 } 5681 5682 /** 5683 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 5684 * 5685 * @adev: amdgpu_device pointer 5686 * @reset_context: amdgpu reset context pointer 5687 * 5688 * do VF FLR and reinitialize Asic 5689 * return 0 means succeeded otherwise failed 5690 */ 5691 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 5692 struct amdgpu_reset_context *reset_context) 5693 { 5694 int r; 5695 struct amdgpu_hive_info *hive = NULL; 5696 5697 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { 5698 if (!amdgpu_ras_get_fed_status(adev)) 5699 amdgpu_virt_ready_to_reset(adev); 5700 amdgpu_virt_wait_reset(adev); 5701 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5702 r = amdgpu_virt_request_full_gpu(adev, true); 5703 } else { 5704 r = amdgpu_virt_reset_gpu(adev); 5705 } 5706 if (r) 5707 return r; 5708 5709 amdgpu_ras_clear_err_state(adev); 5710 amdgpu_irq_gpu_reset_resume_helper(adev); 5711 5712 /* some sw clean up VF needs to do before recover */ 5713 amdgpu_virt_post_reset(adev); 5714 5715 /* Resume IP prior to SMC */ 5716 r = amdgpu_device_ip_reinit_early_sriov(adev); 5717 if (r) 5718 return r; 5719 5720 amdgpu_virt_init_data_exchange(adev); 5721 5722 r = amdgpu_device_fw_loading(adev); 5723 if (r) 5724 return r; 5725 5726 /* now we are okay to resume SMC/CP/SDMA */ 5727 r = amdgpu_device_ip_reinit_late_sriov(adev); 5728 if (r) 5729 return r; 5730 5731 hive = amdgpu_get_xgmi_hive(adev); 5732 /* Update PSP FW topology after reset */ 5733 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5734 r = amdgpu_xgmi_update_topology(hive, adev); 5735 if (hive) 5736 amdgpu_put_xgmi_hive(hive); 5737 if (r) 5738 return r; 5739 5740 r = amdgpu_ib_ring_tests(adev); 5741 if (r) 5742 return r; 5743 5744 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) 5745 amdgpu_inc_vram_lost(adev); 5746 5747 /* need to be called during full access so we can't do it later like 5748 * bare-metal does. 
5749 */ 5750 amdgpu_amdkfd_post_reset(adev); 5751 amdgpu_virt_release_full_gpu(adev, true); 5752 5753 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so RAS needs to be resumed during reset */ 5754 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) || 5755 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 5756 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 5757 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) || 5758 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5759 amdgpu_ras_resume(adev); 5760 5761 amdgpu_virt_ras_telemetry_post_reset(adev); 5762 5763 return 0; 5764 } 5765 5766 /** 5767 * amdgpu_device_has_job_running - check if there is any unfinished job 5768 * 5769 * @adev: amdgpu_device pointer 5770 * 5771 * Check if there is any job running on the device when the guest driver receives 5772 * an FLR notification from the host driver. If there are still jobs running, the 5773 * guest driver will not respond to the FLR reset. Instead, it lets the job hit 5774 * the timeout and then issues the reset request. 5775 */ 5776 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5777 { 5778 int i; 5779 5780 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5781 struct amdgpu_ring *ring = adev->rings[i]; 5782 5783 if (!amdgpu_ring_sched_ready(ring)) 5784 continue; 5785 5786 if (amdgpu_fence_count_emitted(ring)) 5787 return true; 5788 } 5789 return false; 5790 } 5791 5792 /** 5793 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5794 * 5795 * @adev: amdgpu_device pointer 5796 * 5797 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5798 * a hung GPU. 5799 */ 5800 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5801 { 5802 5803 if (amdgpu_gpu_recovery == 0) 5804 goto disabled; 5805 5806 /* Skip soft reset check in fatal error mode */ 5807 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5808 return true; 5809 5810 if (amdgpu_sriov_vf(adev)) 5811 return true; 5812 5813 if (amdgpu_gpu_recovery == -1) { 5814 switch (adev->asic_type) { 5815 #ifdef CONFIG_DRM_AMDGPU_SI 5816 case CHIP_VERDE: 5817 case CHIP_TAHITI: 5818 case CHIP_PITCAIRN: 5819 case CHIP_OLAND: 5820 case CHIP_HAINAN: 5821 #endif 5822 #ifdef CONFIG_DRM_AMDGPU_CIK 5823 case CHIP_KAVERI: 5824 case CHIP_KABINI: 5825 case CHIP_MULLINS: 5826 #endif 5827 case CHIP_CARRIZO: 5828 case CHIP_STONEY: 5829 case CHIP_CYAN_SKILLFISH: 5830 goto disabled; 5831 default: 5832 break; 5833 } 5834 } 5835 5836 return true; 5837 5838 disabled: 5839 dev_info(adev->dev, "GPU recovery disabled.\n"); 5840 return false; 5841 } 5842 5843 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5844 { 5845 u32 i; 5846 int ret = 0; 5847 5848 if (adev->bios) 5849 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5850 5851 dev_info(adev->dev, "GPU mode1 reset\n"); 5852 5853 /* Cache the state before bus master disable. The saved config space 5854 * values are used in other cases like restore after mode-2 reset.
5855 */ 5856 amdgpu_device_cache_pci_state(adev->pdev); 5857 5858 /* disable BM */ 5859 pci_clear_master(adev->pdev); 5860 5861 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5862 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5863 ret = amdgpu_dpm_mode1_reset(adev); 5864 } else { 5865 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5866 ret = psp_gpu_reset(adev); 5867 } 5868 5869 if (ret) 5870 goto mode1_reset_failed; 5871 5872 amdgpu_device_load_pci_state(adev->pdev); 5873 ret = amdgpu_psp_wait_for_bootloader(adev); 5874 if (ret) 5875 goto mode1_reset_failed; 5876 5877 /* wait for asic to come out of reset */ 5878 for (i = 0; i < adev->usec_timeout; i++) { 5879 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5880 5881 if (memsize != 0xffffffff) 5882 break; 5883 udelay(1); 5884 } 5885 5886 if (i >= adev->usec_timeout) { 5887 ret = -ETIMEDOUT; 5888 goto mode1_reset_failed; 5889 } 5890 5891 if (adev->bios) 5892 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5893 5894 return 0; 5895 5896 mode1_reset_failed: 5897 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5898 return ret; 5899 } 5900 5901 int amdgpu_device_link_reset(struct amdgpu_device *adev) 5902 { 5903 int ret = 0; 5904 5905 dev_info(adev->dev, "GPU link reset\n"); 5906 5907 if (!amdgpu_reset_in_dpc(adev)) 5908 ret = amdgpu_dpm_link_reset(adev); 5909 5910 if (ret) 5911 goto link_reset_failed; 5912 5913 ret = amdgpu_psp_wait_for_bootloader(adev); 5914 if (ret) 5915 goto link_reset_failed; 5916 5917 return 0; 5918 5919 link_reset_failed: 5920 dev_err(adev->dev, "GPU link reset failed\n"); 5921 return ret; 5922 } 5923 5924 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5925 struct amdgpu_reset_context *reset_context) 5926 { 5927 int i, r = 0; 5928 struct amdgpu_job *job = NULL; 5929 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; 5930 bool need_full_reset = 5931 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5932 5933 if (reset_context->reset_req_dev == adev) 5934 job = reset_context->job; 5935 5936 if (amdgpu_sriov_vf(adev)) 5937 amdgpu_virt_pre_reset(adev); 5938 5939 amdgpu_fence_driver_isr_toggle(adev, true); 5940 5941 /* block all schedulers and reset given job's ring */ 5942 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5943 struct amdgpu_ring *ring = adev->rings[i]; 5944 5945 if (!amdgpu_ring_sched_ready(ring)) 5946 continue; 5947 5948 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5949 amdgpu_fence_driver_force_completion(ring); 5950 } 5951 5952 amdgpu_fence_driver_isr_toggle(adev, false); 5953 5954 if (job && job->vm) 5955 drm_sched_increase_karma(&job->base); 5956 5957 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5958 /* If reset handler not implemented, continue; otherwise return */ 5959 if (r == -EOPNOTSUPP) 5960 r = 0; 5961 else 5962 return r; 5963 5964 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5965 if (!amdgpu_sriov_vf(adev)) { 5966 5967 if (!need_full_reset) 5968 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5969 5970 if (!need_full_reset && amdgpu_gpu_recovery && 5971 amdgpu_device_ip_check_soft_reset(adev)) { 5972 amdgpu_device_ip_pre_soft_reset(adev); 5973 r = amdgpu_device_ip_soft_reset(adev); 5974 amdgpu_device_ip_post_soft_reset(adev); 5975 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5976 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5977 need_full_reset = true; 5978 } 5979 } 5980 5981 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { 
5982 dev_info(tmp_adev->dev, "Dumping IP State\n"); 5983 /* Trigger ip dump before we reset the asic */ 5984 for (i = 0; i < tmp_adev->num_ip_blocks; i++) 5985 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) 5986 tmp_adev->ip_blocks[i].version->funcs 5987 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]); 5988 dev_info(tmp_adev->dev, "Dumping IP State Completed\n"); 5989 } 5990 5991 if (need_full_reset) 5992 r = amdgpu_device_ip_suspend(adev); 5993 if (need_full_reset) 5994 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5995 else 5996 clear_bit(AMDGPU_NEED_FULL_RESET, 5997 &reset_context->flags); 5998 } 5999 6000 return r; 6001 } 6002 6003 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) 6004 { 6005 struct list_head *device_list_handle; 6006 bool full_reset, vram_lost = false; 6007 struct amdgpu_device *tmp_adev; 6008 int r, init_level; 6009 6010 device_list_handle = reset_context->reset_device_list; 6011 6012 if (!device_list_handle) 6013 return -EINVAL; 6014 6015 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 6016 6017 /** 6018 * If it's reset on init, it's default init level, otherwise keep level 6019 * as recovery level. 6020 */ 6021 if (reset_context->method == AMD_RESET_METHOD_ON_INIT) 6022 init_level = AMDGPU_INIT_LEVEL_DEFAULT; 6023 else 6024 init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY; 6025 6026 r = 0; 6027 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6028 amdgpu_set_init_level(tmp_adev, init_level); 6029 if (full_reset) { 6030 /* post card */ 6031 amdgpu_reset_set_dpc_status(tmp_adev, false); 6032 amdgpu_ras_clear_err_state(tmp_adev); 6033 r = amdgpu_device_asic_init(tmp_adev); 6034 if (r) { 6035 dev_warn(tmp_adev->dev, "asic atom init failed!"); 6036 } else { 6037 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 6038 6039 r = amdgpu_device_ip_resume_phase1(tmp_adev); 6040 if (r) 6041 goto out; 6042 6043 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 6044 6045 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 6046 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job); 6047 6048 if (vram_lost) { 6049 dev_info( 6050 tmp_adev->dev, 6051 "VRAM is lost due to GPU reset!\n"); 6052 amdgpu_inc_vram_lost(tmp_adev); 6053 } 6054 6055 r = amdgpu_device_fw_loading(tmp_adev); 6056 if (r) 6057 return r; 6058 6059 r = amdgpu_xcp_restore_partition_mode( 6060 tmp_adev->xcp_mgr); 6061 if (r) 6062 goto out; 6063 6064 r = amdgpu_device_ip_resume_phase2(tmp_adev); 6065 if (r) 6066 goto out; 6067 6068 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 6069 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 6070 6071 r = amdgpu_device_ip_resume_phase3(tmp_adev); 6072 if (r) 6073 goto out; 6074 6075 if (vram_lost) 6076 amdgpu_device_fill_reset_magic(tmp_adev); 6077 6078 /* 6079 * Add this ASIC as tracked as reset was already 6080 * complete successfully. 6081 */ 6082 amdgpu_register_gpu_instance(tmp_adev); 6083 6084 if (!reset_context->hive && 6085 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 6086 amdgpu_xgmi_add_device(tmp_adev); 6087 6088 r = amdgpu_device_ip_late_init(tmp_adev); 6089 if (r) 6090 goto out; 6091 6092 r = amdgpu_userq_post_reset(tmp_adev, vram_lost); 6093 if (r) 6094 goto out; 6095 6096 drm_client_dev_resume(adev_to_drm(tmp_adev)); 6097 6098 /* 6099 * The GPU enters bad state once faulty pages 6100 * by ECC has reached the threshold, and ras 6101 * recovery is scheduled next. 
So add one check 6102 * here to break recovery if it indeed exceeds 6103 * bad page threshold, and remind user to 6104 * retire this GPU or setting one bigger 6105 * bad_page_threshold value to fix this once 6106 * probing driver again. 6107 */ 6108 if (!amdgpu_ras_is_rma(tmp_adev)) { 6109 /* must succeed. */ 6110 amdgpu_ras_resume(tmp_adev); 6111 } else { 6112 r = -EINVAL; 6113 goto out; 6114 } 6115 6116 /* Update PSP FW topology after reset */ 6117 if (reset_context->hive && 6118 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 6119 r = amdgpu_xgmi_update_topology( 6120 reset_context->hive, tmp_adev); 6121 } 6122 } 6123 6124 out: 6125 if (!r) { 6126 /* IP init is complete now, set level as default */ 6127 amdgpu_set_init_level(tmp_adev, 6128 AMDGPU_INIT_LEVEL_DEFAULT); 6129 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 6130 r = amdgpu_ib_ring_tests(tmp_adev); 6131 if (r) { 6132 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 6133 r = -EAGAIN; 6134 goto end; 6135 } 6136 } 6137 6138 if (r) 6139 tmp_adev->asic_reset_res = r; 6140 } 6141 6142 end: 6143 return r; 6144 } 6145 6146 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 6147 struct amdgpu_reset_context *reset_context) 6148 { 6149 struct amdgpu_device *tmp_adev = NULL; 6150 bool need_full_reset, skip_hw_reset; 6151 int r = 0; 6152 6153 /* Try reset handler method first */ 6154 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 6155 reset_list); 6156 6157 reset_context->reset_device_list = device_list_handle; 6158 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 6159 /* If reset handler not implemented, continue; otherwise return */ 6160 if (r == -EOPNOTSUPP) 6161 r = 0; 6162 else 6163 return r; 6164 6165 /* Reset handler not implemented, use the default method */ 6166 need_full_reset = 6167 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 6168 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 6169 6170 /* 6171 * ASIC reset has to be done on all XGMI hive nodes ASAP 6172 * to allow proper links negotiation in FW (within 1 sec) 6173 */ 6174 if (!skip_hw_reset && need_full_reset) { 6175 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6176 /* For XGMI run all resets in parallel to speed up the process */ 6177 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 6178 if (!queue_work(system_unbound_wq, 6179 &tmp_adev->xgmi_reset_work)) 6180 r = -EALREADY; 6181 } else 6182 r = amdgpu_asic_reset(tmp_adev); 6183 6184 if (r) { 6185 dev_err(tmp_adev->dev, 6186 "ASIC reset failed with error, %d for drm dev, %s", 6187 r, adev_to_drm(tmp_adev)->unique); 6188 goto out; 6189 } 6190 } 6191 6192 /* For XGMI wait for all resets to complete before proceed */ 6193 if (!r) { 6194 list_for_each_entry(tmp_adev, device_list_handle, 6195 reset_list) { 6196 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 6197 flush_work(&tmp_adev->xgmi_reset_work); 6198 r = tmp_adev->asic_reset_res; 6199 if (r) 6200 break; 6201 } 6202 } 6203 } 6204 } 6205 6206 if (!r && amdgpu_ras_intr_triggered()) { 6207 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6208 amdgpu_ras_reset_error_count(tmp_adev, 6209 AMDGPU_RAS_BLOCK__MMHUB); 6210 } 6211 6212 amdgpu_ras_intr_cleared(); 6213 } 6214 6215 r = amdgpu_device_reinit_after_reset(reset_context); 6216 if (r == -EAGAIN) 6217 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 6218 else 6219 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 6220 6221 out: 6222 return r; 6223 } 6224 6225 static void 
amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 6226 { 6227 6228 switch (amdgpu_asic_reset_method(adev)) { 6229 case AMD_RESET_METHOD_MODE1: 6230 case AMD_RESET_METHOD_LINK: 6231 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 6232 break; 6233 case AMD_RESET_METHOD_MODE2: 6234 adev->mp1_state = PP_MP1_STATE_RESET; 6235 break; 6236 default: 6237 adev->mp1_state = PP_MP1_STATE_NONE; 6238 break; 6239 } 6240 } 6241 6242 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 6243 { 6244 amdgpu_vf_error_trans_all(adev); 6245 adev->mp1_state = PP_MP1_STATE_NONE; 6246 } 6247 6248 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 6249 { 6250 struct pci_dev *p = NULL; 6251 6252 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 6253 adev->pdev->bus->number, 1); 6254 if (p) { 6255 pm_runtime_enable(&(p->dev)); 6256 pm_runtime_resume(&(p->dev)); 6257 } 6258 6259 pci_dev_put(p); 6260 } 6261 6262 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 6263 { 6264 enum amd_reset_method reset_method; 6265 struct pci_dev *p = NULL; 6266 u64 expires; 6267 6268 /* 6269 * For now, only BACO and mode1 reset are confirmed to 6270 * suffer from the audio issue if the audio device is not properly suspended. 6271 */ 6272 reset_method = amdgpu_asic_reset_method(adev); 6273 if ((reset_method != AMD_RESET_METHOD_BACO) && 6274 (reset_method != AMD_RESET_METHOD_MODE1)) 6275 return -EINVAL; 6276 6277 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 6278 adev->pdev->bus->number, 1); 6279 if (!p) 6280 return -ENODEV; 6281 6282 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 6283 if (!expires) 6284 /* 6285 * If we cannot get the audio device autosuspend delay, 6286 * a fixed 4s interval is used. Since 3s is the audio 6287 * controller's default autosuspend delay setting, the 6288 * 4s used here is guaranteed to cover it. 6289 */ 6290 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 6291 6292 while (!pm_runtime_status_suspended(&(p->dev))) { 6293 if (!pm_runtime_suspend(&(p->dev))) 6294 break; 6295 6296 if (expires < ktime_get_mono_fast_ns()) { 6297 dev_warn(adev->dev, "failed to suspend display audio\n"); 6298 pci_dev_put(p); 6299 /* TODO: abort the succeeding gpu reset? */ 6300 return -ETIMEDOUT; 6301 } 6302 } 6303 6304 pm_runtime_disable(&(p->dev)); 6305 6306 pci_dev_put(p); 6307 return 0; 6308 } 6309 6310 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 6311 { 6312 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 6313 6314 #if defined(CONFIG_DEBUG_FS) 6315 if (!amdgpu_sriov_vf(adev)) 6316 cancel_work(&adev->reset_work); 6317 #endif 6318 cancel_work(&adev->userq_reset_work); 6319 6320 if (adev->kfd.dev) 6321 cancel_work(&adev->kfd.reset_work); 6322 6323 if (amdgpu_sriov_vf(adev)) 6324 cancel_work(&adev->virt.flr_work); 6325 6326 if (con && adev->ras_enabled) 6327 cancel_work(&con->recovery_work); 6328 6329 } 6330 6331 static int amdgpu_device_health_check(struct list_head *device_list_handle) 6332 { 6333 struct amdgpu_device *tmp_adev; 6334 int ret = 0; 6335 6336 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6337 ret |= amdgpu_device_bus_status_check(tmp_adev); 6338 } 6339 6340 return ret; 6341 } 6342 6343 static void amdgpu_device_recovery_prepare(struct amdgpu_device *adev, 6344 struct list_head *device_list, 6345 struct amdgpu_hive_info *hive) 6346 { 6347 struct amdgpu_device *tmp_adev = NULL; 6348 6349 /* 6350 * Build list of devices to reset.
6351 * In case we are in XGMI hive mode, re-sort the device list 6352 * to put adev in the 1st position. 6353 */ 6354 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) { 6355 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 6356 list_add_tail(&tmp_adev->reset_list, device_list); 6357 if (adev->shutdown) 6358 tmp_adev->shutdown = true; 6359 if (amdgpu_reset_in_dpc(adev)) 6360 tmp_adev->pcie_reset_ctx.in_link_reset = true; 6361 } 6362 if (!list_is_first(&adev->reset_list, device_list)) 6363 list_rotate_to_front(&adev->reset_list, device_list); 6364 } else { 6365 list_add_tail(&adev->reset_list, device_list); 6366 } 6367 } 6368 6369 static void amdgpu_device_recovery_get_reset_lock(struct amdgpu_device *adev, 6370 struct list_head *device_list) 6371 { 6372 struct amdgpu_device *tmp_adev = NULL; 6373 6374 if (list_empty(device_list)) 6375 return; 6376 tmp_adev = 6377 list_first_entry(device_list, struct amdgpu_device, reset_list); 6378 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 6379 } 6380 6381 static void amdgpu_device_recovery_put_reset_lock(struct amdgpu_device *adev, 6382 struct list_head *device_list) 6383 { 6384 struct amdgpu_device *tmp_adev = NULL; 6385 6386 if (list_empty(device_list)) 6387 return; 6388 tmp_adev = 6389 list_first_entry(device_list, struct amdgpu_device, reset_list); 6390 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 6391 } 6392 6393 static void amdgpu_device_halt_activities(struct amdgpu_device *adev, 6394 struct amdgpu_job *job, 6395 struct amdgpu_reset_context *reset_context, 6396 struct list_head *device_list, 6397 struct amdgpu_hive_info *hive, 6398 bool need_emergency_restart) 6399 { 6400 struct amdgpu_device *tmp_adev = NULL; 6401 int i; 6402 6403 /* block all schedulers and reset given job's ring */ 6404 list_for_each_entry(tmp_adev, device_list, reset_list) { 6405 amdgpu_device_set_mp1_state(tmp_adev); 6406 6407 /* 6408 * Try to put the audio codec into the suspended state 6409 * before the GPU reset starts. 6410 * 6411 * The power domain of the graphics device is shared 6412 * with the AZ power domain, so without this we may 6413 * change the audio hardware behind the audio 6414 * driver's back, which will trigger 6415 * audio codec errors. 6416 */ 6417 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 6418 tmp_adev->pcie_reset_ctx.audio_suspended = true; 6419 6420 amdgpu_ras_set_error_query_ready(tmp_adev, false); 6421 6422 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 6423 6424 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context); 6425 6426 /* 6427 * Mark these ASICs to be reset as untracked first, 6428 * and add them back after the reset completes 6429 */ 6430 amdgpu_unregister_gpu_instance(tmp_adev); 6431 6432 drm_client_dev_suspend(adev_to_drm(tmp_adev)); 6433 6434 /* disable ras on ALL IPs */ 6435 if (!need_emergency_restart && !amdgpu_reset_in_dpc(adev) && 6436 amdgpu_device_ip_need_full_reset(tmp_adev)) 6437 amdgpu_ras_suspend(tmp_adev); 6438 6439 amdgpu_userq_pre_reset(tmp_adev); 6440 6441 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6442 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6443 6444 if (!amdgpu_ring_sched_ready(ring)) 6445 continue; 6446 6447 drm_sched_stop(&ring->sched, job ?
&job->base : NULL); 6448 6449 if (need_emergency_restart) 6450 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 6451 } 6452 atomic_inc(&tmp_adev->gpu_reset_counter); 6453 } 6454 } 6455 6456 static int amdgpu_device_asic_reset(struct amdgpu_device *adev, 6457 struct list_head *device_list, 6458 struct amdgpu_reset_context *reset_context) 6459 { 6460 struct amdgpu_device *tmp_adev = NULL; 6461 int retry_limit = AMDGPU_MAX_RETRY_LIMIT; 6462 int r = 0; 6463 6464 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 6465 list_for_each_entry(tmp_adev, device_list, reset_list) { 6466 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 6467 /*TODO Should we stop ?*/ 6468 if (r) { 6469 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 6470 r, adev_to_drm(tmp_adev)->unique); 6471 tmp_adev->asic_reset_res = r; 6472 } 6473 } 6474 6475 /* Actual ASIC resets if needed.*/ 6476 /* Host driver will handle XGMI hive reset for SRIOV */ 6477 if (amdgpu_sriov_vf(adev)) { 6478 6479 /* Bail out of reset early */ 6480 if (amdgpu_ras_is_rma(adev)) 6481 return -ENODEV; 6482 6483 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { 6484 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); 6485 amdgpu_ras_set_fed(adev, true); 6486 set_bit(AMDGPU_HOST_FLR, &reset_context->flags); 6487 } 6488 6489 r = amdgpu_device_reset_sriov(adev, reset_context); 6490 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) { 6491 amdgpu_virt_release_full_gpu(adev, true); 6492 goto retry; 6493 } 6494 if (r) 6495 adev->asic_reset_res = r; 6496 } else { 6497 r = amdgpu_do_asic_reset(device_list, reset_context); 6498 if (r && r == -EAGAIN) 6499 goto retry; 6500 } 6501 6502 list_for_each_entry(tmp_adev, device_list, reset_list) { 6503 /* 6504 * Drop any pending non scheduler resets queued before reset is done. 6505 * Any reset scheduled after this point would be valid. Scheduler resets 6506 * were already dropped during drm_sched_stop and no new ones can come 6507 * in before drm_sched_start. 6508 */ 6509 amdgpu_device_stop_pending_resets(tmp_adev); 6510 } 6511 6512 return r; 6513 } 6514 6515 static int amdgpu_device_sched_resume(struct list_head *device_list, 6516 struct amdgpu_reset_context *reset_context, 6517 bool job_signaled) 6518 { 6519 struct amdgpu_device *tmp_adev = NULL; 6520 int i, r = 0; 6521 6522 /* Post ASIC reset for all devs .*/ 6523 list_for_each_entry(tmp_adev, device_list, reset_list) { 6524 6525 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6526 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6527 6528 if (!amdgpu_ring_sched_ready(ring)) 6529 continue; 6530 6531 drm_sched_start(&ring->sched, 0); 6532 } 6533 6534 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 6535 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 6536 6537 if (tmp_adev->asic_reset_res) { 6538 /* bad news, how to tell it to userspace ? 
6539 * for ras error, we should report GPU bad status instead of 6540 * reset failure 6541 */ 6542 if (reset_context->src != AMDGPU_RESET_SRC_RAS || 6543 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) 6544 dev_info( 6545 tmp_adev->dev, 6546 "GPU reset(%d) failed with error %d \n", 6547 atomic_read( 6548 &tmp_adev->gpu_reset_counter), 6549 tmp_adev->asic_reset_res); 6550 amdgpu_vf_error_put(tmp_adev, 6551 AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, 6552 tmp_adev->asic_reset_res); 6553 if (!r) 6554 r = tmp_adev->asic_reset_res; 6555 tmp_adev->asic_reset_res = 0; 6556 } else { 6557 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", 6558 atomic_read(&tmp_adev->gpu_reset_counter)); 6559 if (amdgpu_acpi_smart_shift_update(tmp_adev, 6560 AMDGPU_SS_DEV_D0)) 6561 dev_warn(tmp_adev->dev, 6562 "smart shift update failed\n"); 6563 } 6564 } 6565 6566 return r; 6567 } 6568 6569 static void amdgpu_device_gpu_resume(struct amdgpu_device *adev, 6570 struct list_head *device_list, 6571 bool need_emergency_restart) 6572 { 6573 struct amdgpu_device *tmp_adev = NULL; 6574 6575 list_for_each_entry(tmp_adev, device_list, reset_list) { 6576 /* unlock kfd: SRIOV would do it separately */ 6577 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 6578 amdgpu_amdkfd_post_reset(tmp_adev); 6579 6580 /* kfd_post_reset will do nothing if kfd device is not initialized, 6581 * need to bring up kfd here if it's not be initialized before 6582 */ 6583 if (!adev->kfd.init_complete) 6584 amdgpu_amdkfd_device_init(adev); 6585 6586 if (tmp_adev->pcie_reset_ctx.audio_suspended) 6587 amdgpu_device_resume_display_audio(tmp_adev); 6588 6589 amdgpu_device_unset_mp1_state(tmp_adev); 6590 6591 amdgpu_ras_set_error_query_ready(tmp_adev, true); 6592 6593 } 6594 } 6595 6596 6597 /** 6598 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 6599 * 6600 * @adev: amdgpu_device pointer 6601 * @job: which job trigger hang 6602 * @reset_context: amdgpu reset context pointer 6603 * 6604 * Attempt to reset the GPU if it has hung (all asics). 6605 * Attempt to do soft-reset or full-reset and reinitialize Asic 6606 * Returns 0 for success or an error on failure. 6607 */ 6608 6609 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 6610 struct amdgpu_job *job, 6611 struct amdgpu_reset_context *reset_context) 6612 { 6613 struct list_head device_list; 6614 bool job_signaled = false; 6615 struct amdgpu_hive_info *hive = NULL; 6616 int r = 0; 6617 bool need_emergency_restart = false; 6618 6619 /* 6620 * If it reaches here because of hang/timeout and a RAS error is 6621 * detected at the same time, let RAS recovery take care of it. 6622 */ 6623 if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) && 6624 !amdgpu_sriov_vf(adev) && 6625 reset_context->src != AMDGPU_RESET_SRC_RAS) { 6626 dev_dbg(adev->dev, 6627 "Gpu recovery from source: %d yielding to RAS error recovery handling", 6628 reset_context->src); 6629 return 0; 6630 } 6631 6632 /* 6633 * Special case: RAS triggered and full reset isn't supported 6634 */ 6635 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 6636 6637 /* 6638 * Flush RAM to disk so that after reboot 6639 * the user can read log and see why the system rebooted. 6640 */ 6641 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 6642 amdgpu_ras_get_context(adev)->reboot) { 6643 dev_warn(adev->dev, "Emergency reboot."); 6644 6645 ksys_sync_helper(); 6646 emergency_restart(); 6647 } 6648 6649 dev_info(adev->dev, "GPU %s begin!. Source: %d\n", 6650 need_emergency_restart ? 
"jobs stop" : "reset", 6651 reset_context->src); 6652 6653 if (!amdgpu_sriov_vf(adev)) 6654 hive = amdgpu_get_xgmi_hive(adev); 6655 if (hive) 6656 mutex_lock(&hive->hive_lock); 6657 6658 reset_context->job = job; 6659 reset_context->hive = hive; 6660 INIT_LIST_HEAD(&device_list); 6661 6662 amdgpu_device_recovery_prepare(adev, &device_list, hive); 6663 6664 if (!amdgpu_sriov_vf(adev)) { 6665 r = amdgpu_device_health_check(&device_list); 6666 if (r) 6667 goto end_reset; 6668 } 6669 6670 /* Cannot be called after locking reset domain */ 6671 amdgpu_ras_pre_reset(adev, &device_list); 6672 6673 /* We need to lock reset domain only once both for XGMI and single device */ 6674 amdgpu_device_recovery_get_reset_lock(adev, &device_list); 6675 6676 amdgpu_device_halt_activities(adev, job, reset_context, &device_list, 6677 hive, need_emergency_restart); 6678 if (need_emergency_restart) 6679 goto skip_sched_resume; 6680 /* 6681 * Must check guilty signal here since after this point all old 6682 * HW fences are force signaled. 6683 * 6684 * job->base holds a reference to parent fence 6685 */ 6686 if (job && dma_fence_is_signaled(&job->hw_fence->base)) { 6687 job_signaled = true; 6688 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 6689 goto skip_hw_reset; 6690 } 6691 6692 r = amdgpu_device_asic_reset(adev, &device_list, reset_context); 6693 if (r) 6694 goto reset_unlock; 6695 skip_hw_reset: 6696 r = amdgpu_device_sched_resume(&device_list, reset_context, job_signaled); 6697 if (r) 6698 goto reset_unlock; 6699 skip_sched_resume: 6700 amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart); 6701 reset_unlock: 6702 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 6703 amdgpu_ras_post_reset(adev, &device_list); 6704 end_reset: 6705 if (hive) { 6706 mutex_unlock(&hive->hive_lock); 6707 amdgpu_put_xgmi_hive(hive); 6708 } 6709 6710 if (r) 6711 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 6712 6713 atomic_set(&adev->reset_domain->reset_res, r); 6714 6715 if (!r) { 6716 struct amdgpu_task_info *ti = NULL; 6717 6718 if (job) 6719 ti = amdgpu_vm_get_task_info_pasid(adev, job->pasid); 6720 6721 drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, 6722 ti ? &ti->task : NULL); 6723 6724 amdgpu_vm_put_task_info(ti); 6725 } 6726 6727 return r; 6728 } 6729 6730 /** 6731 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner 6732 * 6733 * @adev: amdgpu_device pointer 6734 * @speed: pointer to the speed of the link 6735 * @width: pointer to the width of the link 6736 * 6737 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6738 * first physical partner to an AMD dGPU. 6739 * This will exclude any virtual switches and links. 
6740 */ 6741 static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev, 6742 enum pci_bus_speed *speed, 6743 enum pcie_link_width *width) 6744 { 6745 struct pci_dev *parent = adev->pdev; 6746 6747 if (!speed || !width) 6748 return; 6749 6750 *speed = PCI_SPEED_UNKNOWN; 6751 *width = PCIE_LNK_WIDTH_UNKNOWN; 6752 6753 if (amdgpu_device_pcie_dynamic_switching_supported(adev)) { 6754 while ((parent = pci_upstream_bridge(parent))) { 6755 /* skip upstream/downstream switches internal to dGPU */ 6756 if (parent->vendor == PCI_VENDOR_ID_ATI) 6757 continue; 6758 *speed = pcie_get_speed_cap(parent); 6759 *width = pcie_get_width_cap(parent); 6760 break; 6761 } 6762 } else { 6763 /* use the current speeds rather than max if switching is not supported */ 6764 pcie_bandwidth_available(adev->pdev, NULL, speed, width); 6765 } 6766 } 6767 6768 /** 6769 * amdgpu_device_gpu_bandwidth - find the bandwidth of the GPU 6770 * 6771 * @adev: amdgpu_device pointer 6772 * @speed: pointer to the speed of the link 6773 * @width: pointer to the width of the link 6774 * 6775 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6776 * AMD dGPU which may be a virtual upstream bridge. 6777 */ 6778 static void amdgpu_device_gpu_bandwidth(struct amdgpu_device *adev, 6779 enum pci_bus_speed *speed, 6780 enum pcie_link_width *width) 6781 { 6782 struct pci_dev *parent = adev->pdev; 6783 6784 if (!speed || !width) 6785 return; 6786 6787 parent = pci_upstream_bridge(parent); 6788 if (parent && parent->vendor == PCI_VENDOR_ID_ATI) { 6789 /* use the upstream/downstream switches internal to dGPU */ 6790 *speed = pcie_get_speed_cap(parent); 6791 *width = pcie_get_width_cap(parent); 6792 while ((parent = pci_upstream_bridge(parent))) { 6793 if (parent->vendor == PCI_VENDOR_ID_ATI) { 6794 /* use the upstream/downstream switches internal to dGPU */ 6795 *speed = pcie_get_speed_cap(parent); 6796 *width = pcie_get_width_cap(parent); 6797 } 6798 } 6799 } else { 6800 /* use the device itself */ 6801 *speed = pcie_get_speed_cap(adev->pdev); 6802 *width = pcie_get_width_cap(adev->pdev); 6803 } 6804 } 6805 6806 /** 6807 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot 6808 * 6809 * @adev: amdgpu_device pointer 6810 * 6811 * Fetches and stores in the driver the PCIE capabilities (gen speed 6812 * and lanes) of the slot the device is in. Handles APUs and 6813 * virtualized environments where PCIE config space may not be available.
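 *
 * Note (sketch; assuming the usual pcie_gen_cap / pcie_lane_cap module
 * parameter names from amdgpu_drv.c): the probed masks can be
 * overridden from the command line, e.g.
 *
 *   modprobe amdgpu pcie_gen_cap=<mask> pcie_lane_cap=<mask>
 *
 * in which case the corresponding amdgpu_pcie_gen_cap /
 * amdgpu_pcie_lane_cap values below are used verbatim, and the rest of
 * the probing is skipped when both masks are set.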
6814 */ 6815 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 6816 { 6817 enum pci_bus_speed speed_cap, platform_speed_cap; 6818 enum pcie_link_width platform_link_width, link_width; 6819 6820 if (amdgpu_pcie_gen_cap) 6821 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 6822 6823 if (amdgpu_pcie_lane_cap) 6824 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 6825 6826 /* covers APUs as well */ 6827 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 6828 if (adev->pm.pcie_gen_mask == 0) 6829 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 6830 if (adev->pm.pcie_mlw_mask == 0) 6831 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 6832 return; 6833 } 6834 6835 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 6836 return; 6837 6838 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap, 6839 &platform_link_width); 6840 amdgpu_device_gpu_bandwidth(adev, &speed_cap, &link_width); 6841 6842 if (adev->pm.pcie_gen_mask == 0) { 6843 /* asic caps */ 6844 if (speed_cap == PCI_SPEED_UNKNOWN) { 6845 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6846 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6847 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6848 } else { 6849 if (speed_cap == PCIE_SPEED_32_0GT) 6850 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6851 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6852 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6853 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6854 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 6855 else if (speed_cap == PCIE_SPEED_16_0GT) 6856 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6857 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6858 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6859 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 6860 else if (speed_cap == PCIE_SPEED_8_0GT) 6861 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6862 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6863 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6864 else if (speed_cap == PCIE_SPEED_5_0GT) 6865 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6866 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 6867 else 6868 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 6869 } 6870 /* platform caps */ 6871 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 6872 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6873 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6874 } else { 6875 if (platform_speed_cap == PCIE_SPEED_32_0GT) 6876 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6877 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6878 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6879 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6880 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 6881 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 6882 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6883 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6884 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6885 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 6886 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 6887 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6888 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6889 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 6890 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 6891 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6892 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6893 else 6894 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 6895 6896 } 6897 } 6898 if (adev->pm.pcie_mlw_mask == 0) { 6899 /* asic caps */ 6900 if (link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6901 
adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_ASIC_PCIE_MLW_MASK; 6902 } else { 6903 switch (link_width) { 6904 case PCIE_LNK_X32: 6905 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X32 | 6906 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6907 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6908 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6909 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6910 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6911 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6912 break; 6913 case PCIE_LNK_X16: 6914 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6915 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6916 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6917 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6918 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6919 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6920 break; 6921 case PCIE_LNK_X12: 6922 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6923 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6924 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6925 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6926 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6927 break; 6928 case PCIE_LNK_X8: 6929 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6930 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6931 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6932 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6933 break; 6934 case PCIE_LNK_X4: 6935 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6936 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6937 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6938 break; 6939 case PCIE_LNK_X2: 6940 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6941 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6942 break; 6943 case PCIE_LNK_X1: 6944 adev->pm.pcie_mlw_mask |= CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1; 6945 break; 6946 default: 6947 break; 6948 } 6949 } 6950 /* platform caps */ 6951 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6952 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 6953 } else { 6954 switch (platform_link_width) { 6955 case PCIE_LNK_X32: 6956 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 6957 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6958 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6959 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6960 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6961 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6962 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6963 break; 6964 case PCIE_LNK_X16: 6965 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6966 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6967 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6968 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6969 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6970 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6971 break; 6972 case PCIE_LNK_X12: 6973 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6974 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6975 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6976 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6977 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6978 break; 6979 case PCIE_LNK_X8: 6980 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6981 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6982 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6983 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6984 break; 6985 case PCIE_LNK_X4: 6986 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6987 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6988 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6989 break; 6990 case PCIE_LNK_X2: 6991 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6992 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6993 break; 6994 case PCIE_LNK_X1: 6995 adev->pm.pcie_mlw_mask |= CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 6996 break; 6997 
default: 6998 break; 6999 } 7000 } 7001 } 7002 } 7003 7004 /** 7005 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 7006 * 7007 * @adev: amdgpu_device pointer 7008 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 7009 * 7010 * Return true if @peer_adev can access (DMA) @adev through the PCIe 7011 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 7012 * @peer_adev. 7013 */ 7014 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 7015 struct amdgpu_device *peer_adev) 7016 { 7017 #ifdef CONFIG_HSA_AMD_P2P 7018 bool p2p_access = 7019 !adev->gmc.xgmi.connected_to_cpu && 7020 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 7021 if (!p2p_access) 7022 dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n", 7023 pci_name(peer_adev->pdev)); 7024 7025 bool is_large_bar = adev->gmc.visible_vram_size && 7026 adev->gmc.real_vram_size == adev->gmc.visible_vram_size; 7027 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev); 7028 7029 if (!p2p_addressable) { 7030 uint64_t address_mask = peer_adev->dev->dma_mask ? 7031 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 7032 resource_size_t aper_limit = 7033 adev->gmc.aper_base + adev->gmc.aper_size - 1; 7034 7035 p2p_addressable = !(adev->gmc.aper_base & address_mask || 7036 aper_limit & address_mask); 7037 } 7038 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable; 7039 #else 7040 return false; 7041 #endif 7042 } 7043 7044 int amdgpu_device_baco_enter(struct amdgpu_device *adev) 7045 { 7046 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 7047 7048 if (!amdgpu_device_supports_baco(adev)) 7049 return -ENOTSUPP; 7050 7051 if (ras && adev->ras_enabled && 7052 adev->nbio.funcs->enable_doorbell_interrupt) 7053 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 7054 7055 return amdgpu_dpm_baco_enter(adev); 7056 } 7057 7058 int amdgpu_device_baco_exit(struct amdgpu_device *adev) 7059 { 7060 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 7061 int ret = 0; 7062 7063 if (!amdgpu_device_supports_baco(adev)) 7064 return -ENOTSUPP; 7065 7066 ret = amdgpu_dpm_baco_exit(adev); 7067 if (ret) 7068 return ret; 7069 7070 if (ras && adev->ras_enabled && 7071 adev->nbio.funcs->enable_doorbell_interrupt) 7072 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 7073 7074 if (amdgpu_passthrough(adev) && adev->nbio.funcs && 7075 adev->nbio.funcs->clear_doorbell_interrupt) 7076 adev->nbio.funcs->clear_doorbell_interrupt(adev); 7077 7078 return 0; 7079 } 7080 7081 /** 7082 * amdgpu_pci_error_detected - Called when a PCI error is detected. 7083 * @pdev: PCI device struct 7084 * @state: PCI channel state 7085 * 7086 * Description: Called when a PCI error is detected. 7087 * 7088 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
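 *
 * These AER callbacks (error_detected, mmio_enabled, slot_reset, resume)
 * are handed to the PCI core through a struct pci_error_handlers table
 * in the driver registration code; a sketch of that wiring (see
 * amdgpu_drv.c for the actual table):
 *
 *   static const struct pci_error_handlers amdgpu_pci_err_handler = {
 *           .error_detected = amdgpu_pci_error_detected,
 *           .mmio_enabled   = amdgpu_pci_mmio_enabled,
 *           .slot_reset     = amdgpu_pci_slot_reset,
 *           .resume         = amdgpu_pci_resume,
 *   };
 *
 * which is then referenced from the driver's struct pci_driver via its
 * .err_handler field.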
7089 */ 7090 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 7091 { 7092 struct drm_device *dev = pci_get_drvdata(pdev); 7093 struct amdgpu_device *adev = drm_to_adev(dev); 7094 struct amdgpu_hive_info *hive __free(xgmi_put_hive) = 7095 amdgpu_get_xgmi_hive(adev); 7096 struct amdgpu_reset_context reset_context; 7097 struct list_head device_list; 7098 7099 dev_info(adev->dev, "PCI error: detected callback!!\n"); 7100 7101 adev->pci_channel_state = state; 7102 7103 switch (state) { 7104 case pci_channel_io_normal: 7105 dev_info(adev->dev, "pci_channel_io_normal: state(%d)!!\n", state); 7106 return PCI_ERS_RESULT_CAN_RECOVER; 7107 case pci_channel_io_frozen: 7108 /* Fatal error, prepare for slot reset */ 7109 dev_info(adev->dev, "pci_channel_io_frozen: state(%d)!!\n", state); 7110 if (hive) { 7111 /* Hive devices should be able to support FW based 7112 * link reset on other devices, if not return. 7113 */ 7114 if (!amdgpu_dpm_is_link_reset_supported(adev)) { 7115 dev_warn(adev->dev, 7116 "No support for XGMI hive yet...\n"); 7117 return PCI_ERS_RESULT_DISCONNECT; 7118 } 7119 /* Set dpc status only if device is part of hive 7120 * Non-hive devices should be able to recover after 7121 * link reset. 7122 */ 7123 amdgpu_reset_set_dpc_status(adev, true); 7124 7125 mutex_lock(&hive->hive_lock); 7126 } 7127 memset(&reset_context, 0, sizeof(reset_context)); 7128 INIT_LIST_HEAD(&device_list); 7129 7130 amdgpu_device_recovery_prepare(adev, &device_list, hive); 7131 amdgpu_device_recovery_get_reset_lock(adev, &device_list); 7132 amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list, 7133 hive, false); 7134 if (hive) 7135 mutex_unlock(&hive->hive_lock); 7136 return PCI_ERS_RESULT_NEED_RESET; 7137 case pci_channel_io_perm_failure: 7138 /* Permanent error, prepare for device removal */ 7139 dev_info(adev->dev, "pci_channel_io_perm_failure: state(%d)!!\n", state); 7140 return PCI_ERS_RESULT_DISCONNECT; 7141 } 7142 7143 return PCI_ERS_RESULT_NEED_RESET; 7144 } 7145 7146 /** 7147 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 7148 * @pdev: pointer to PCI device 7149 */ 7150 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 7151 { 7152 struct drm_device *dev = pci_get_drvdata(pdev); 7153 struct amdgpu_device *adev = drm_to_adev(dev); 7154 7155 dev_info(adev->dev, "PCI error: mmio enabled callback!!\n"); 7156 7157 /* TODO - dump whatever for debugging purposes */ 7158 7159 /* This called only if amdgpu_pci_error_detected returns 7160 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 7161 * works, no need to reset slot. 7162 */ 7163 7164 return PCI_ERS_RESULT_RECOVERED; 7165 } 7166 7167 /** 7168 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 7169 * @pdev: PCI device struct 7170 * 7171 * Description: This routine is called by the pci error recovery 7172 * code after the PCI slot has been reset, just before we 7173 * should resume normal operations. 
7174 */ 7175 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 7176 { 7177 struct drm_device *dev = pci_get_drvdata(pdev); 7178 struct amdgpu_device *adev = drm_to_adev(dev); 7179 struct amdgpu_reset_context reset_context; 7180 struct amdgpu_device *tmp_adev; 7181 struct amdgpu_hive_info *hive; 7182 struct list_head device_list; 7183 struct pci_dev *link_dev; 7184 int r = 0, i, timeout; 7185 u32 memsize; 7186 u16 status; 7187 7188 dev_info(adev->dev, "PCI error: slot reset callback!!\n"); 7189 7190 memset(&reset_context, 0, sizeof(reset_context)); 7191 7192 if (adev->pcie_reset_ctx.swus) 7193 link_dev = adev->pcie_reset_ctx.swus; 7194 else 7195 link_dev = adev->pdev; 7196 /* wait for asic to come out of reset, timeout = 10s */ 7197 timeout = 10000; 7198 do { 7199 usleep_range(10000, 10500); 7200 r = pci_read_config_word(link_dev, PCI_VENDOR_ID, &status); 7201 timeout -= 10; 7202 } while (timeout > 0 && (status != PCI_VENDOR_ID_ATI) && 7203 (status != PCI_VENDOR_ID_AMD)); 7204 7205 if ((status != PCI_VENDOR_ID_ATI) && (status != PCI_VENDOR_ID_AMD)) { 7206 r = -ETIME; 7207 goto out; 7208 } 7209 7210 amdgpu_device_load_switch_state(adev); 7211 /* Restore PCI confspace */ 7212 amdgpu_device_load_pci_state(pdev); 7213 7214 /* confirm ASIC came out of reset */ 7215 for (i = 0; i < adev->usec_timeout; i++) { 7216 memsize = amdgpu_asic_get_config_memsize(adev); 7217 7218 if (memsize != 0xffffffff) 7219 break; 7220 udelay(1); 7221 } 7222 if (memsize == 0xffffffff) { 7223 r = -ETIME; 7224 goto out; 7225 } 7226 7227 reset_context.method = AMD_RESET_METHOD_NONE; 7228 reset_context.reset_req_dev = adev; 7229 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 7230 set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags); 7231 INIT_LIST_HEAD(&device_list); 7232 7233 hive = amdgpu_get_xgmi_hive(adev); 7234 if (hive) { 7235 mutex_lock(&hive->hive_lock); 7236 reset_context.hive = hive; 7237 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 7238 tmp_adev->pcie_reset_ctx.in_link_reset = true; 7239 list_add_tail(&tmp_adev->reset_list, &device_list); 7240 } 7241 } else { 7242 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 7243 list_add_tail(&adev->reset_list, &device_list); 7244 } 7245 7246 r = amdgpu_device_asic_reset(adev, &device_list, &reset_context); 7247 out: 7248 if (!r) { 7249 if (amdgpu_device_cache_pci_state(adev->pdev)) 7250 pci_restore_state(adev->pdev); 7251 dev_info(adev->dev, "PCIe error recovery succeeded\n"); 7252 } else { 7253 dev_err(adev->dev, "PCIe error recovery failed, err:%d\n", r); 7254 if (hive) { 7255 list_for_each_entry(tmp_adev, &device_list, reset_list) 7256 amdgpu_device_unset_mp1_state(tmp_adev); 7257 } 7258 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 7259 } 7260 7261 if (hive) { 7262 mutex_unlock(&hive->hive_lock); 7263 amdgpu_put_xgmi_hive(hive); 7264 } 7265 7266 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 7267 } 7268 7269 /** 7270 * amdgpu_pci_resume() - resume normal ops after PCI reset 7271 * @pdev: pointer to PCI device 7272 * 7273 * Called when the error recovery driver tells us that its 7274 * OK to resume normal operation. 
7275 */ 7276 void amdgpu_pci_resume(struct pci_dev *pdev) 7277 { 7278 struct drm_device *dev = pci_get_drvdata(pdev); 7279 struct amdgpu_device *adev = drm_to_adev(dev); 7280 struct list_head device_list; 7281 struct amdgpu_hive_info *hive = NULL; 7282 struct amdgpu_device *tmp_adev = NULL; 7283 7284 dev_info(adev->dev, "PCI error: resume callback!!\n"); 7285 7286 /* Only continue execution for the case of pci_channel_io_frozen */ 7287 if (adev->pci_channel_state != pci_channel_io_frozen) 7288 return; 7289 7290 INIT_LIST_HEAD(&device_list); 7291 7292 hive = amdgpu_get_xgmi_hive(adev); 7293 if (hive) { 7294 mutex_lock(&hive->hive_lock); 7295 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 7296 tmp_adev->pcie_reset_ctx.in_link_reset = false; 7297 list_add_tail(&tmp_adev->reset_list, &device_list); 7298 } 7299 } else 7300 list_add_tail(&adev->reset_list, &device_list); 7301 7302 amdgpu_device_sched_resume(&device_list, NULL, NULL); 7303 amdgpu_device_gpu_resume(adev, &device_list, false); 7304 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 7305 7306 if (hive) { 7307 mutex_unlock(&hive->hive_lock); 7308 amdgpu_put_xgmi_hive(hive); 7309 } 7310 } 7311 7312 static void amdgpu_device_cache_switch_state(struct amdgpu_device *adev) 7313 { 7314 struct pci_dev *swus, *swds; 7315 int r; 7316 7317 swds = pci_upstream_bridge(adev->pdev); 7318 if (!swds || swds->vendor != PCI_VENDOR_ID_ATI || 7319 pci_pcie_type(swds) != PCI_EXP_TYPE_DOWNSTREAM) 7320 return; 7321 swus = pci_upstream_bridge(swds); 7322 if (!swus || 7323 (swus->vendor != PCI_VENDOR_ID_ATI && 7324 swus->vendor != PCI_VENDOR_ID_AMD) || 7325 pci_pcie_type(swus) != PCI_EXP_TYPE_UPSTREAM) 7326 return; 7327 7328 /* If already saved, return */ 7329 if (adev->pcie_reset_ctx.swus) 7330 return; 7331 /* Upstream bridge is ATI, assume it's SWUS/DS architecture */ 7332 r = pci_save_state(swds); 7333 if (r) 7334 return; 7335 adev->pcie_reset_ctx.swds_pcistate = pci_store_saved_state(swds); 7336 7337 r = pci_save_state(swus); 7338 if (r) 7339 return; 7340 adev->pcie_reset_ctx.swus_pcistate = pci_store_saved_state(swus); 7341 7342 adev->pcie_reset_ctx.swus = swus; 7343 } 7344 7345 static void amdgpu_device_load_switch_state(struct amdgpu_device *adev) 7346 { 7347 struct pci_dev *pdev; 7348 int r; 7349 7350 if (!adev->pcie_reset_ctx.swds_pcistate || 7351 !adev->pcie_reset_ctx.swus_pcistate) 7352 return; 7353 7354 pdev = adev->pcie_reset_ctx.swus; 7355 r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swus_pcistate); 7356 if (!r) { 7357 pci_restore_state(pdev); 7358 } else { 7359 dev_warn(adev->dev, "Failed to load SWUS state, err:%d\n", r); 7360 return; 7361 } 7362 7363 pdev = pci_upstream_bridge(adev->pdev); 7364 r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swds_pcistate); 7365 if (!r) 7366 pci_restore_state(pdev); 7367 else 7368 dev_warn(adev->dev, "Failed to load SWDS state, err:%d\n", r); 7369 } 7370 7371 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 7372 { 7373 struct drm_device *dev = pci_get_drvdata(pdev); 7374 struct amdgpu_device *adev = drm_to_adev(dev); 7375 int r; 7376 7377 if (amdgpu_sriov_vf(adev)) 7378 return false; 7379 7380 r = pci_save_state(pdev); 7381 if (!r) { 7382 kfree(adev->pci_state); 7383 7384 adev->pci_state = pci_store_saved_state(pdev); 7385 7386 if (!adev->pci_state) { 7387 dev_err(adev->dev, "Failed to store PCI saved state"); 7388 return false; 7389 } 7390 } else { 7391 dev_warn(adev->dev, "Failed to save PCI state, err:%d\n", r); 7392 return false; 7393 } 7394 7395 
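	/* Also snapshot the upstream SWUS/SWDS bridge config space (when
	 * present) so that amdgpu_device_load_switch_state() can restore it
	 * after a link reset; see amdgpu_device_cache_switch_state() above.
	 */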
amdgpu_device_cache_switch_state(adev); 7396 7397 return true; 7398 } 7399 7400 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 7401 { 7402 struct drm_device *dev = pci_get_drvdata(pdev); 7403 struct amdgpu_device *adev = drm_to_adev(dev); 7404 int r; 7405 7406 if (!adev->pci_state) 7407 return false; 7408 7409 r = pci_load_saved_state(pdev, adev->pci_state); 7410 7411 if (!r) { 7412 pci_restore_state(pdev); 7413 } else { 7414 dev_warn(adev->dev, "Failed to load PCI state, err:%d\n", r); 7415 return false; 7416 } 7417 7418 return true; 7419 } 7420 7421 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 7422 struct amdgpu_ring *ring) 7423 { 7424 #ifdef CONFIG_X86_64 7425 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 7426 return; 7427 #endif 7428 if (adev->gmc.xgmi.connected_to_cpu) 7429 return; 7430 7431 if (ring && ring->funcs->emit_hdp_flush) { 7432 amdgpu_ring_emit_hdp_flush(ring); 7433 return; 7434 } 7435 7436 if (!ring && amdgpu_sriov_runtime(adev)) { 7437 if (!amdgpu_kiq_hdp_flush(adev)) 7438 return; 7439 } 7440 7441 amdgpu_hdp_flush(adev, ring); 7442 } 7443 7444 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 7445 struct amdgpu_ring *ring) 7446 { 7447 #ifdef CONFIG_X86_64 7448 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 7449 return; 7450 #endif 7451 if (adev->gmc.xgmi.connected_to_cpu) 7452 return; 7453 7454 amdgpu_hdp_invalidate(adev, ring); 7455 } 7456 7457 int amdgpu_in_reset(struct amdgpu_device *adev) 7458 { 7459 return atomic_read(&adev->reset_domain->in_gpu_reset); 7460 } 7461 7462 /** 7463 * amdgpu_device_halt() - bring hardware to some kind of halt state 7464 * 7465 * @adev: amdgpu_device pointer 7466 * 7467 * Bring hardware to some kind of halt state so that no one can touch it 7468 * any more. It helps to maintain the error context when an error occurs. 7469 * Compared to a simple hang, the system will stay stable at least for SSH 7470 * access. Then it should be trivial to inspect the hardware state and 7471 * see what's going on. Implemented as follows: 7472 * 7473 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc), 7474 * clears all CPU mappings to the device, disallows remappings through page faults 7475 * 2. amdgpu_irq_disable_all() disables all interrupts 7476 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences 7477 * 4. set adev->no_hw_access to avoid potential crashes after step 5 7478 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings 7479 * 6. 

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that nothing can touch it
 * any more. This helps to preserve the error context when an error occurs.
 * Compared with a simple hang, the system stays stable at least for SSH
 * access, so it is then straightforward to inspect the hardware state and
 * see what is going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs,
 *    etc), clears all CPU mappings to the device and disallows remappings
 *    through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction() flush any
 *    in-flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
        struct pci_dev *pdev = adev->pdev;
        struct drm_device *ddev = adev_to_drm(adev);

        amdgpu_xcp_dev_unplug(adev);
        drm_dev_unplug(ddev);

        amdgpu_irq_disable_all(adev);

        amdgpu_fence_driver_hw_fini(adev);

        adev->no_hw_access = true;

        amdgpu_device_unmap_mmio(adev);

        pci_disable_device(pdev);
        pci_wait_for_pending_transaction(pdev);
}

u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
                                 u32 reg)
{
        unsigned long flags, address, data;
        u32 r;

        address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
        data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

        spin_lock_irqsave(&adev->pcie_idx_lock, flags);
        WREG32(address, reg * 4);
        (void)RREG32(address);
        r = RREG32(data);
        spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
        return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
                                  u32 reg, u32 v)
{
        unsigned long flags, address, data;

        address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
        data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

        spin_lock_irqsave(&adev->pcie_idx_lock, flags);
        WREG32(address, reg * 4);
        (void)RREG32(address);
        WREG32(data, v);
        (void)RREG32(data);
        spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_gang - return a reference to the current gang
 * @adev: amdgpu_device pointer
 *
 * Returns: A new reference to the current gang leader.
 */
struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev)
{
        struct dma_fence *fence;

        rcu_read_lock();
        fence = dma_fence_get_rcu_safe(&adev->gang_submit);
        rcu_read_unlock();
        return fence;
}

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
                                            struct dma_fence *gang)
{
        struct dma_fence *old = NULL;

        dma_fence_get(gang);
        do {
                dma_fence_put(old);
                old = amdgpu_device_get_gang(adev);
                if (old == gang)
                        break;

                if (!dma_fence_is_signaled(old)) {
                        dma_fence_put(gang);
                        return old;
                }

        } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
                         old, gang) != old);

        /*
         * Drop it once for the exchanged reference in adev and once for the
         * thread local reference acquired in amdgpu_device_get_gang().
         */
        dma_fence_put(old);
        dma_fence_put(old);
        return NULL;
}
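
/*
 * Illustrative sketch (not part of the driver): the intended calling pattern
 * for amdgpu_device_switch_gang(). A caller that wants to install @gang as
 * the new gang leader has to wait for whatever fence is returned and retry,
 * for example:
 *
 *	struct dma_fence *old;
 *
 *	old = amdgpu_device_switch_gang(adev, gang);
 *	if (old) {
 *		dma_fence_wait(old, false);	// previous gang still active
 *		dma_fence_put(old);
 *	}
 *
 * The real callers typically hand the returned fence back to the scheduler
 * as a dependency instead of waiting on it synchronously.
 */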

/**
 * amdgpu_device_enforce_isolation - enforce HW isolation
 * @adev: the amdgpu device pointer
 * @ring: the HW ring the job is supposed to run on
 * @job: the job which is about to be pushed to the HW ring
 *
 * Makes sure that only one client at a time can use the GFX block.
 * Returns: The dependency to wait on before the job can be pushed to the HW.
 * The function is called multiple times until NULL is returned.
 */
struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev,
                                                  struct amdgpu_ring *ring,
                                                  struct amdgpu_job *job)
{
        struct amdgpu_isolation *isolation = &adev->isolation[ring->xcp_id];
        struct drm_sched_fence *f = job->base.s_fence;
        struct dma_fence *dep;
        void *owner;
        int r;

        /*
         * For now enforce isolation only for the GFX block since we only need
         * the cleaner shader on those rings.
         */
        if (ring->funcs->type != AMDGPU_RING_TYPE_GFX &&
            ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE)
                return NULL;

        /*
         * All submissions where enforce isolation is false are handled as if
         * they come from a single client. Use ~0l as the owner to distinguish
         * them from kernel submissions where the owner is NULL.
         */
        owner = job->enforce_isolation ? f->owner : (void *)~0l;

        mutex_lock(&adev->enforce_isolation_mutex);

        /*
         * The "spearhead" submission is the first one which changes the
         * ownership to its client. We always need to wait for it to be
         * pushed to the HW before proceeding with anything.
         */
        if (&f->scheduled != isolation->spearhead &&
            !dma_fence_is_signaled(isolation->spearhead)) {
                dep = isolation->spearhead;
                goto out_grab_ref;
        }

        if (isolation->owner != owner) {
                /*
                 * Wait for any gang to be assembled before switching to a
                 * different owner or otherwise we could deadlock the
                 * submissions.
                 */
                if (!job->gang_submit) {
                        dep = amdgpu_device_get_gang(adev);
                        if (!dma_fence_is_signaled(dep))
                                goto out_return_dep;
                        dma_fence_put(dep);
                }

                dma_fence_put(isolation->spearhead);
                isolation->spearhead = dma_fence_get(&f->scheduled);
                amdgpu_sync_move(&isolation->active, &isolation->prev);
                trace_amdgpu_isolation(isolation->owner, owner);
                isolation->owner = owner;
        }

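        /*
         * At this point the submission's client owns the block. Return the
         * last not-yet-signaled fence of the previous owner on this ring (if
         * any) as the dependency and record this submission in the active set.
         */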
        /*
         * Specifying the ring here helps to pipeline submissions even when
         * isolation is enabled. If that is not desired for testing NULL can be
         * used instead of the ring to enforce a CPU round trip while switching
         * between clients.
         */
        dep = amdgpu_sync_peek_fence(&isolation->prev, ring);
        r = amdgpu_sync_fence(&isolation->active, &f->finished, GFP_NOWAIT);
        if (r)
                dev_warn(adev->dev, "OOM tracking isolation\n");

out_grab_ref:
        dma_fence_get(dep);
out_return_dep:
        mutex_unlock(&adev->enforce_isolation_mutex);
        return dep;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
        switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
        case CHIP_HAINAN:
#endif
        case CHIP_TOPAZ:
                /* chips with no display hardware */
                return false;
#ifdef CONFIG_DRM_AMDGPU_SI
        case CHIP_TAHITI:
        case CHIP_PITCAIRN:
        case CHIP_VERDE:
        case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
        case CHIP_BONAIRE:
        case CHIP_HAWAII:
        case CHIP_KAVERI:
        case CHIP_KABINI:
        case CHIP_MULLINS:
#endif
        case CHIP_TONGA:
        case CHIP_FIJI:
        case CHIP_POLARIS10:
        case CHIP_POLARIS11:
        case CHIP_POLARIS12:
        case CHIP_VEGAM:
        case CHIP_CARRIZO:
        case CHIP_STONEY:
                /* chips with display hardware */
                return true;
        default:
                /* IP discovery */
                if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
                    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
                        return false;
                return true;
        }
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
                                    uint32_t inst, uint32_t reg_addr, char reg_name[],
                                    uint32_t expected_value, uint32_t mask)
{
        uint32_t ret = 0;
        uint32_t old_ = 0;
        uint32_t tmp_ = RREG32(reg_addr);
        uint32_t loop = adev->usec_timeout;

        while ((tmp_ & (mask)) != (expected_value)) {
                if (old_ != tmp_) {
                        loop = adev->usec_timeout;
                        old_ = tmp_;
                } else
                        udelay(1);
                tmp_ = RREG32(reg_addr);
                loop--;
                if (!loop) {
                        dev_warn(
                                adev->dev,
                                "Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
                                inst, reg_name, (uint32_t)expected_value,
                                (uint32_t)(tmp_ & (mask)));
                        ret = -ETIMEDOUT;
                        break;
                }
        }
        return ret;
}

ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
{
        ssize_t size = 0;

        if (!ring || !ring->adev)
                return size;

        if (amdgpu_device_should_recover_gpu(ring->adev))
                size |= AMDGPU_RESET_TYPE_FULL;

        if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
            !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery)
                size |= AMDGPU_RESET_TYPE_SOFT_RESET;

        return size;
}

ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
{
        ssize_t size = 0;

        if (supported_reset == 0) {
                size += sysfs_emit_at(buf, size, "unsupported");
                size += sysfs_emit_at(buf, size, "\n");
                return size;
        }

        if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
                size += sysfs_emit_at(buf, size, "soft ");

        if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
                size += sysfs_emit_at(buf, size, "queue ");

        if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)
                size += sysfs_emit_at(buf, size, "pipe ");

        if (supported_reset & AMDGPU_RESET_TYPE_FULL)
                size += sysfs_emit_at(buf, size, "full ");

        size += sysfs_emit_at(buf, size, "\n");
        return size;
}
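
/*
 * Illustrative sketch (not part of the driver): how a sysfs "reset mask"
 * attribute would typically be backed by the helpers above. The attribute
 * name and the gfx_supported_reset field are only used as an example here:
 *
 *	static ssize_t gfx_reset_mask_show(struct device *dev,
 *					   struct device_attribute *attr,
 *					   char *buf)
 *	{
 *		struct drm_device *ddev = dev_get_drvdata(dev);
 *		struct amdgpu_device *adev = drm_to_adev(ddev);
 *
 *		return amdgpu_show_reset_mask(buf, adev->gfx.gfx_supported_reset);
 *	}
 *
 * The supported mask is usually seeded from amdgpu_get_soft_full_reset_mask()
 * on one of the block's rings and extended with per-queue/per-pipe reset
 * capabilities where the IP block supports them.
 */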

void amdgpu_device_set_uid(struct amdgpu_uid *uid_info,
                           enum amdgpu_uid_type type, uint8_t inst,
                           uint64_t uid)
{
        if (!uid_info)
                return;

        if (type >= AMDGPU_UID_TYPE_MAX) {
                dev_err_once(uid_info->adev->dev, "Invalid UID type %d\n",
                             type);
                return;
        }

        if (inst >= AMDGPU_UID_INST_MAX) {
                dev_err_once(uid_info->adev->dev, "Invalid UID instance %d\n",
                             inst);
                return;
        }

        if (uid_info->uid[type][inst] != 0) {
                dev_warn_once(
                        uid_info->adev->dev,
                        "Overwriting existing UID %llu for type %d instance %d\n",
                        uid_info->uid[type][inst], type, inst);
        }

        uid_info->uid[type][inst] = uid;
}

u64 amdgpu_device_get_uid(struct amdgpu_uid *uid_info,
                          enum amdgpu_uid_type type, uint8_t inst)
{
        if (!uid_info)
                return 0;

        if (type >= AMDGPU_UID_TYPE_MAX) {
                dev_err_once(uid_info->adev->dev, "Invalid UID type %d\n",
                             type);
                return 0;
        }

        if (inst >= AMDGPU_UID_INST_MAX) {
                dev_err_once(uid_info->adev->dev, "Invalid UID instance %d\n",
                             inst);
                return 0;
        }

        return uid_info->uid[type][inst];
}
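
/*
 * Illustrative sketch (not part of the driver): typical use of the UID
 * helpers above. A discovery/firmware path records a per-instance unique ID
 * once, and a query path reads it back later. The uid_info pointer, UID type
 * and instance index below are placeholders for the example:
 *
 *	amdgpu_device_set_uid(uid_info, type, inst, uid);
 *	...
 *	uid = amdgpu_device_get_uid(uid_info, type, inst);
 *
 * amdgpu_device_get_uid() returns 0 when no UID has been recorded for the
 * given type/instance.
 */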