/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */

#include <linux/aperture.h>
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/pci-p2pdma.h>
#include <linux/apple-gmux.h>

#include <drm/drm_atomic_helper.h>
#include <drm/drm_client_event.h>
#include <drm/drm_crtc_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/device.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"
#include "amdgpu_virt.h"
#include "amdgpu_dev_coredump.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

#if IS_ENABLED(CONFIG_X86)
#include <asm/intel-family.h>
#include <asm/cpu_device_id.h>
#endif

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/cyan_skillfish_gpu_info.bin");

#define AMDGPU_RESUME_MS 2000
#define AMDGPU_MAX_RETRY_LIMIT 2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
#define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
#define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
#define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)

#define AMDGPU_VBIOS_SKIP (1U << 0)
#define AMDGPU_VBIOS_OPTIONAL (1U << 1)

static const struct drm_driver amdgpu_kms_driver;

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"ALDEBARAN",
	"NAVI10",
	"CYAN_SKILLFISH",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"VANGOGH",
	"DIMGREY_CAVEFISH",
	"BEIGE_GOBY",
	"YELLOW_CARP",
	"IP DISCOVERY",
	"LAST",
};

#define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0)
/*
 * Default init level where all blocks are expected to be initialized. This is
 * the level of initialization expected by default and also after a full reset
 * of the device.
 */
struct amdgpu_init_level amdgpu_init_default = {
	.level = AMDGPU_INIT_LEVEL_DEFAULT,
	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

struct amdgpu_init_level amdgpu_init_recovery = {
	.level = AMDGPU_INIT_LEVEL_RESET_RECOVERY,
	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

/*
 * Minimal blocks needed to be initialized before an XGMI hive can be reset.
 * This is used for cases like reset on initialization where the entire hive
 * needs to be reset before first use.
 */
struct amdgpu_init_level amdgpu_init_minimal_xgmi = {
	.level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI,
	.hwini_ip_block_mask =
		BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) |
		BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) |
		BIT(AMD_IP_BLOCK_TYPE_PSP)
};

static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev);
static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev);
static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev);

static void amdgpu_device_load_switch_state(struct amdgpu_device *adev);

static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev,
					     enum amd_ip_block_type block)
{
	return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0;
}

void amdgpu_set_init_level(struct amdgpu_device *adev,
			   enum amdgpu_init_lvl_id lvl)
{
	switch (lvl) {
	case AMDGPU_INIT_LEVEL_MINIMAL_XGMI:
		adev->init_lvl = &amdgpu_init_minimal_xgmi;
		break;
	case AMDGPU_INIT_LEVEL_RESET_RECOVERY:
		adev->init_lvl = &amdgpu_init_recovery;
		break;
	case AMDGPU_INIT_LEVEL_DEFAULT:
		fallthrough;
	default:
		adev->init_lvl = &amdgpu_init_default;
		break;
	}
}

static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
				     void *data);

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
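 *
 * The value can be read from userspace, for example (illustrative; the card
 * index may differ per system):
 *   cat /sys/class/drm/card0/device/pcie_replay_count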
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return sysfs_emit(buf, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, 0444,
		   amdgpu_device_get_pcie_replay_count, NULL);

static int amdgpu_device_attr_sysfs_init(struct amdgpu_device *adev)
{
	int ret = 0;

	if (amdgpu_nbio_is_replay_cnt_supported(adev))
		ret = sysfs_create_file(&adev->dev->kobj,
					&dev_attr_pcie_replay_count.attr);

	return ret;
}

static void amdgpu_device_attr_sysfs_fini(struct amdgpu_device *adev)
{
	if (amdgpu_nbio_is_replay_cnt_supported(adev))
		sysfs_remove_file(&adev->dev->kobj,
				  &dev_attr_pcie_replay_count.attr);
}

static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
					  const struct bin_attribute *attr, char *buf,
					  loff_t ppos, size_t count)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	ssize_t bytes_read;

	switch (ppos) {
	case AMDGPU_SYS_REG_STATE_XGMI:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_WAFL:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_PCIE:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR_1:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count);
		break;
	default:
		return -EINVAL;
	}

	return bytes_read;
}

static const BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL,
		      AMDGPU_SYS_REG_STATE_END);

int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev)
{
	int ret;

	if (!amdgpu_asic_get_reg_state_supported(adev))
		return 0;

	ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state);

	return ret;
}

void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
{
	if (!amdgpu_asic_get_reg_state_supported(adev))
		return;
	sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
}

int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block)
{
	int r;

	if (ip_block->version->funcs->suspend) {
		r = ip_block->version->funcs->suspend(ip_block);
		if (r) {
			dev_err(ip_block->adev->dev,
				"suspend of IP block <%s> failed %d\n",
				ip_block->version->funcs->name, r);
			return r;
		}
	}

	ip_block->status.hw = false;
	return 0;
}

int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block)
{
	int r;

	if (ip_block->version->funcs->resume) {
		r = ip_block->version->funcs->resume(ip_block);
		if (r) {
			dev_err(ip_block->adev->dev,
				"resume of IP block <%s> failed %d\n",
				ip_block->version->funcs->name, r);
			return r;
		}
	}

	ip_block->status.hw = true;
	return 0;
}

/**
 * DOC: board_info
 *
 * The amdgpu driver provides a sysfs API for giving board related information.
 * It provides the form factor information in the format
 *
 *   type : form factor
 *
 * Possible form factor values
 *
 * - "cem"     - PCIE CEM card
 * - "oam"     - Open Compute Accelerator Module
 * - "unknown" - Not known
 *
 */

static ssize_t amdgpu_device_get_board_info(struct device *dev,
					    struct device_attribute *attr,
					    char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
	const char *pkg;

	if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
		pkg_type = adev->smuio.funcs->get_pkg_type(adev);

	switch (pkg_type) {
	case AMDGPU_PKG_TYPE_CEM:
		pkg = "cem";
		break;
	case AMDGPU_PKG_TYPE_OAM:
		pkg = "oam";
		break;
	default:
		pkg = "unknown";
		break;
	}

	return sysfs_emit(buf, "%s : %s\n", "type", pkg);
}

static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);

static struct attribute *amdgpu_board_attrs[] = {
	&dev_attr_board_info.attr,
	NULL,
};

static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
					     struct attribute *attr, int n)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	if (adev->flags & AMD_IS_APU)
		return 0;

	return attr->mode;
}

static const struct attribute_group amdgpu_board_attrs_group = {
	.attrs = amdgpu_board_attrs,
	.is_visible = amdgpu_board_attrs_is_visible
};

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);

/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @adev: amdgpu device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_px(struct amdgpu_device *adev)
{
	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @adev: amdgpu device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct amdgpu_device *adev)
{
	if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE))
		return false;

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @adev: amdgpu device pointer
 *
 * Return:
 * 1 if the device supports BACO;
 * 3 if the device supports MACO (only works if BACO is supported);
 * otherwise return 0.
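 *
 * Illustrative interpretation, based on how the BACO_SUPPORT/MACO_SUPPORT bits
 * are tested in amdgpu_device_detect_runtime_pm_mode() below: a return of 3
 * means both BACO and MACO are usable, 1 means BACO only, 0 means neither.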
 */
int amdgpu_device_supports_baco(struct amdgpu_device *adev)
{
	return amdgpu_asic_supports_baco(adev);
}

void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
{
	int bamaco_support;

	adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
	bamaco_support = amdgpu_device_supports_baco(adev);

	switch (amdgpu_runtime_pm) {
	case 2:
		if (bamaco_support & MACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
			dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
		} else if (bamaco_support == BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Requested mode BAMACO not available, falling back to BACO\n");
		}
		break;
	case 1:
		if (bamaco_support & BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Forcing BACO for runtime pm\n");
		}
		break;
	case -1:
	case -2:
		if (amdgpu_device_supports_px(adev)) {
			/* enable PX as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
			dev_info(adev->dev, "Using ATPX for runtime pm\n");
		} else if (amdgpu_device_supports_boco(adev)) {
			/* enable boco as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
			dev_info(adev->dev, "Using BOCO for runtime pm\n");
		} else {
			if (!bamaco_support)
				goto no_runtime_pm;

			switch (adev->asic_type) {
			case CHIP_VEGA20:
			case CHIP_ARCTURUS:
				/* BACO is not supported on vega20 and arcturus */
				break;
			case CHIP_VEGA10:
				/* enable BACO as runpm mode if noretry=0 */
				if (!adev->gmc.noretry && !amdgpu_passthrough(adev))
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			default:
				/* enable BACO as runpm mode on CI+ */
				if (!amdgpu_passthrough(adev))
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			}

			if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
				if (bamaco_support & MACO_SUPPORT) {
					adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
					dev_info(adev->dev, "Using BAMACO for runtime pm\n");
				} else {
					dev_info(adev->dev, "Using BACO for runtime pm\n");
				}
			}
		}
		break;
	case 0:
		dev_info(adev->dev, "runtime pm is manually disabled\n");
		break;
	default:
		break;
	}

no_runtime_pm:
	if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
		dev_info(adev->dev, "Runtime PM not available\n");
}
/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * Smart Shift support
 *
 * @adev: amdgpu device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct amdgpu_device *adev)
{
	return (amdgpu_device_supports_boco(adev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer pointed to by @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer pointed to by @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * Returns the number of bytes that have been transferred.
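 * A short count is expected when the requested range extends beyond the CPU
 * visible VRAM window; callers such as amdgpu_device_vram_access() below fall
 * back to MM_INDEX/MM_DATA access for the remainder.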
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer pointed to by @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the VRAM aperture to access VRAM first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM to access the rest of VRAM */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
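 *
 * Note: most register accesses in the driver go through wrapper macros (for
 * example the RREG32()/WREG32() pair used by
 * amdgpu_device_program_register_sequence() below) rather than calling this
 * helper directly.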
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}


/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
				uint32_t reg, uint32_t acc_flags,
				uint32_t xcc_id)
{
	uint32_t ret, rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	return ret;
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/**
 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 * @xcc_id: xcc accelerated compute core id
 *
 * This function is invoked only for debugfs register access.
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v,
			     uint32_t xcc_id)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_xcc_wreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t v,
			    uint32_t acc_flags, uint32_t xcc_id)
{
	uint32_t rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, true,
							 &rlcg_flag)) {
			amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u32 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
				    u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	if (unlikely(!adev->nbio.funcs)) {
		pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
		pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
	} else {
		pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
		pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	}

	if (reg_addr >> 32) {
		if (unlikely(!adev->nbio.funcs))
			pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
		else
			pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	} else {
		pcie_index_hi = 0;
	}

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				       pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64 bit indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
				      u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				       pcie_index_hi * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r |= ((u64)readl(pcie_data_offset) << 32);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
				     u64 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				       pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64 bit indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
				       u64 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				       pcie_index_hi * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_rev_id - query device rev_id
 *
 * @adev: amdgpu_device pointer
 *
 * Return device rev_id
 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
{
	return adev->nbio.funcs->get_rev_id(adev);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	dev_err(adev->dev, "Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
{
	dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write register 0x%04X with 0x%08X\n", reg,
		v);
	BUG();
}

static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write register 0x%llX with 0x%08X\n", reg,
		v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	dev_err(adev->dev, "Invalid callback to read 64 bit register 0x%04X\n",
		reg);
	BUG();
	return 0;
}

static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
{
	dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		reg, v);
	BUG();
}

static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
		reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	dev_err(adev->dev,
		"Invalid callback to read register 0x%04X in block 0x%04X\n",
		reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	dev_err(adev->dev,
		"Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		reg, block, v);
	BUG();
}

static uint32_t amdgpu_device_get_vbios_flags(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
		return AMDGPU_VBIOS_SKIP;

	if (hweight32(adev->aid_mask) && amdgpu_passthrough(adev))
		return AMDGPU_VBIOS_OPTIONAL;

	return 0;
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	uint32_t flags;
	bool optional;
	int ret;

	amdgpu_asic_pre_asic_init(adev);
	flags = amdgpu_device_get_vbios_flags(adev);
	optional = !!(flags & (AMDGPU_VBIOS_OPTIONAL | AMDGPU_VBIOS_SKIP));

	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
		amdgpu_psp_wait_for_bootloader(adev);
		if (optional && !adev->bios)
			return 0;

		ret = amdgpu_atomfirmware_asic_init(adev, true);
		return ret;
	} else {
		if (optional && !adev->bios)
			return 0;

		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
	}

	return 0;
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with and/or masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long flags, offset;

	spin_lock_irqsave(&adev->wb.lock, flags);
	offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	unsigned long flags;

	wb >>= 3;
	spin_lock_irqsave(&adev->wb.lock, flags);
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
	spin_unlock_irqrestore(&adev->wb.lock, flags);
}

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
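 *
 * Illustrative sizing example: pci_rebar_bytes_to_size() encodes the BAR size
 * as log2(bytes) - 20, so 256 MB encodes as 8 and 8 GB as 13; the request is
 * then clamped below to the largest size the device actually advertises.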
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned int i;
	u16 cmd;
	int r;

	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
		return 0;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	if (!amdgpu_rebar)
		return 0;

	/* resizing on Dell G5 SE platforms causes problems with runtime pm */
	if ((amdgpu_runtime_pm != 0) &&
	    adev->pdev->vendor == PCI_VENDOR_ID_ATI &&
	    adev->pdev->device == 0x731f &&
	    adev->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL)
		return 0;

	/* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
	if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
		dev_warn(
			adev->dev,
			"System can't access extended configuration space, please check!!\n");

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		dev_info(adev->dev,
			 "Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		dev_err(adev->dev, "Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

/*
 * GPU helper functions.
 */
/**
 * amdgpu_device_need_post - check if the hw need post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if post is needed or false if not.
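 *
 * For example, on CIK and newer parts this reduces to checking the ATOM BIOS
 * scratch registers via amdgpu_atombios_scratch_need_asic_init(), as done
 * below.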
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg, flags;

	if (amdgpu_sriov_vf(adev))
		return false;

	flags = amdgpu_device_get_vbios_flags(adev);
	if (flags & AMDGPU_VBIOS_SKIP)
		return false;
	if ((flags & AMDGPU_VBIOS_OPTIONAL) && !adev->bios)
		return false;

	if (amdgpu_passthrough(adev)) {
		/* For FIJI: In whole GPU pass-through virtualization case, after VM reboot
		 * some old smc fw still need the driver to do vPost, otherwise the gpu hangs.
		 * SMC fw versions above 22.15 don't have this flaw, so we force
		 * vPost to be executed for smc versions below 22.15.
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			release_firmware(adev->pm.fw);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/*
 * Check whether seamless boot is supported.
 *
 * So far we only support seamless boot on DCE 3.0 or later.
 * If users report that it works on older ASICS as well, we may
 * loosen this.
 */
bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
{
	switch (amdgpu_seamless) {
	case -1:
		break;
	case 1:
		return true;
	case 0:
		return false;
	default:
		dev_err(adev->dev, "Invalid value for amdgpu.seamless: %d\n",
			amdgpu_seamless);
		return false;
	}

	if (!(adev->flags & AMD_IS_APU))
		return false;

	if (adev->mman.keep_stolen_vga_memory)
		return false;

	return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
}

/*
 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
 * don't support dynamic speed switching. Until we have confirmation from Intel
 * that a specific host supports it, it's safer that we keep it disabled for all.
 *
 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
 */
static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	/* eGPUs change speeds based on USB4 fabric conditions */
	if (dev_is_removable(adev->dev))
		return true;

	if (c->x86_vendor == X86_VENDOR_INTEL)
		return false;
#endif
	return true;
}

static bool amdgpu_device_aspm_support_quirk(struct amdgpu_device *adev)
{
	/* Enabling ASPM causes random hangs on Tahiti and Oland on Zen4.
1890 * It's unclear if this is a platform-specific or GPU-specific issue. 1891 * Disable ASPM on SI for the time being. 1892 */ 1893 if (adev->family == AMDGPU_FAMILY_SI) 1894 return true; 1895 1896 #if IS_ENABLED(CONFIG_X86) 1897 struct cpuinfo_x86 *c = &cpu_data(0); 1898 1899 if (!(amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 0) || 1900 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 1))) 1901 return false; 1902 1903 if (c->x86 == 6 && 1904 adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5) { 1905 switch (c->x86_model) { 1906 case VFM_MODEL(INTEL_ALDERLAKE): 1907 case VFM_MODEL(INTEL_ALDERLAKE_L): 1908 case VFM_MODEL(INTEL_RAPTORLAKE): 1909 case VFM_MODEL(INTEL_RAPTORLAKE_P): 1910 case VFM_MODEL(INTEL_RAPTORLAKE_S): 1911 return true; 1912 default: 1913 return false; 1914 } 1915 } else { 1916 return false; 1917 } 1918 #else 1919 return false; 1920 #endif 1921 } 1922 1923 /** 1924 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1925 * 1926 * @adev: amdgpu_device pointer 1927 * 1928 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1929 * be set for this device. 1930 * 1931 * Returns true if it should be used or false if not. 1932 */ 1933 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1934 { 1935 switch (amdgpu_aspm) { 1936 case -1: 1937 break; 1938 case 0: 1939 return false; 1940 case 1: 1941 return true; 1942 default: 1943 return false; 1944 } 1945 if (adev->flags & AMD_IS_APU) 1946 return false; 1947 if (amdgpu_device_aspm_support_quirk(adev)) 1948 return false; 1949 return pcie_aspm_enabled(adev->pdev); 1950 } 1951 1952 /* if we get transitioned to only one device, take VGA back */ 1953 /** 1954 * amdgpu_device_vga_set_decode - enable/disable vga decode 1955 * 1956 * @pdev: PCI device pointer 1957 * @state: enable/disable vga decode 1958 * 1959 * Enable/disable vga decode (all asics). 1960 * Returns VGA resource flags. 1961 */ 1962 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1963 bool state) 1964 { 1965 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1966 1967 amdgpu_asic_set_vga_state(adev, state); 1968 if (state) 1969 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1970 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1971 else 1972 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1973 } 1974 1975 /** 1976 * amdgpu_device_check_block_size - validate the vm block size 1977 * 1978 * @adev: amdgpu_device pointer 1979 * 1980 * Validates the vm block size specified via module parameter. 1981 * The vm block size defines number of bits in page table versus page directory, 1982 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1983 * page table and the remaining bits are in the page directory. 1984 */ 1985 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1986 { 1987 /* defines number of bits in page table versus page directory, 1988 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1989 * page table and the remaining bits are in the page directory 1990 */ 1991 if (amdgpu_vm_block_size == -1) 1992 return; 1993 1994 if (amdgpu_vm_block_size < 9) { 1995 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1996 amdgpu_vm_block_size); 1997 amdgpu_vm_block_size = -1; 1998 } 1999 } 2000 2001 /** 2002 * amdgpu_device_check_vm_size - validate the vm size 2003 * 2004 * @adev: amdgpu_device pointer 2005 * 2006 * Validates the vm size in GB specified via module parameter. 
2007 * The VM size is the size of the GPU virtual memory space in GB. 2008 */ 2009 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 2010 { 2011 /* no need to check the default value */ 2012 if (amdgpu_vm_size == -1) 2013 return; 2014 2015 if (amdgpu_vm_size < 1) { 2016 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 2017 amdgpu_vm_size); 2018 amdgpu_vm_size = -1; 2019 } 2020 } 2021 2022 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 2023 { 2024 struct sysinfo si; 2025 bool is_os_64 = (sizeof(void *) == 8); 2026 uint64_t total_memory; 2027 uint64_t dram_size_seven_GB = 0x1B8000000; 2028 uint64_t dram_size_three_GB = 0xB8000000; 2029 2030 if (amdgpu_smu_memory_pool_size == 0) 2031 return; 2032 2033 if (!is_os_64) { 2034 dev_warn(adev->dev, "Not 64-bit OS, feature not supported\n"); 2035 goto def_value; 2036 } 2037 si_meminfo(&si); 2038 total_memory = (uint64_t)si.totalram * si.mem_unit; 2039 2040 if ((amdgpu_smu_memory_pool_size == 1) || 2041 (amdgpu_smu_memory_pool_size == 2)) { 2042 if (total_memory < dram_size_three_GB) 2043 goto def_value1; 2044 } else if ((amdgpu_smu_memory_pool_size == 4) || 2045 (amdgpu_smu_memory_pool_size == 8)) { 2046 if (total_memory < dram_size_seven_GB) 2047 goto def_value1; 2048 } else { 2049 dev_warn(adev->dev, "Smu memory pool size not supported\n"); 2050 goto def_value; 2051 } 2052 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 2053 2054 return; 2055 2056 def_value1: 2057 dev_warn(adev->dev, "No enough system memory\n"); 2058 def_value: 2059 adev->pm.smu_prv_buffer_size = 0; 2060 } 2061 2062 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 2063 { 2064 if (!(adev->flags & AMD_IS_APU) || 2065 adev->asic_type < CHIP_RAVEN) 2066 return 0; 2067 2068 switch (adev->asic_type) { 2069 case CHIP_RAVEN: 2070 if (adev->pdev->device == 0x15dd) 2071 adev->apu_flags |= AMD_APU_IS_RAVEN; 2072 if (adev->pdev->device == 0x15d8) 2073 adev->apu_flags |= AMD_APU_IS_PICASSO; 2074 break; 2075 case CHIP_RENOIR: 2076 if ((adev->pdev->device == 0x1636) || 2077 (adev->pdev->device == 0x164c)) 2078 adev->apu_flags |= AMD_APU_IS_RENOIR; 2079 else 2080 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 2081 break; 2082 case CHIP_VANGOGH: 2083 adev->apu_flags |= AMD_APU_IS_VANGOGH; 2084 break; 2085 case CHIP_YELLOW_CARP: 2086 break; 2087 case CHIP_CYAN_SKILLFISH: 2088 if ((adev->pdev->device == 0x13FE) || 2089 (adev->pdev->device == 0x143F)) 2090 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 2091 break; 2092 default: 2093 break; 2094 } 2095 2096 return 0; 2097 } 2098 2099 /** 2100 * amdgpu_device_check_arguments - validate module params 2101 * 2102 * @adev: amdgpu_device pointer 2103 * 2104 * Validates certain module parameters and updates 2105 * the associated values used by the driver (all asics). 
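* Invalid values are not fatal: they are warned about and then clamped or reset to their defaults, which is why this function as written always returns 0.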
2106 */ 2107 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 2108 { 2109 int i; 2110 2111 if (amdgpu_sched_jobs < 4) { 2112 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 2113 amdgpu_sched_jobs); 2114 amdgpu_sched_jobs = 4; 2115 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 2116 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 2117 amdgpu_sched_jobs); 2118 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 2119 } 2120 2121 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 2122 /* gart size must be greater or equal to 32M */ 2123 dev_warn(adev->dev, "gart size (%d) too small\n", 2124 amdgpu_gart_size); 2125 amdgpu_gart_size = -1; 2126 } 2127 2128 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 2129 /* gtt size must be greater or equal to 32M */ 2130 dev_warn(adev->dev, "gtt size (%d) too small\n", 2131 amdgpu_gtt_size); 2132 amdgpu_gtt_size = -1; 2133 } 2134 2135 /* valid range is between 4 and 9 inclusive */ 2136 if (amdgpu_vm_fragment_size != -1 && 2137 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 2138 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 2139 amdgpu_vm_fragment_size = -1; 2140 } 2141 2142 if (amdgpu_sched_hw_submission < 2) { 2143 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 2144 amdgpu_sched_hw_submission); 2145 amdgpu_sched_hw_submission = 2; 2146 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 2147 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 2148 amdgpu_sched_hw_submission); 2149 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 2150 } 2151 2152 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 2153 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 2154 amdgpu_reset_method = -1; 2155 } 2156 2157 amdgpu_device_check_smu_prv_buffer_size(adev); 2158 2159 amdgpu_device_check_vm_size(adev); 2160 2161 amdgpu_device_check_block_size(adev); 2162 2163 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 2164 2165 for (i = 0; i < MAX_XCP; i++) { 2166 switch (amdgpu_enforce_isolation) { 2167 case -1: 2168 case 0: 2169 default: 2170 /* disable */ 2171 adev->enforce_isolation[i] = AMDGPU_ENFORCE_ISOLATION_DISABLE; 2172 break; 2173 case 1: 2174 /* enable */ 2175 adev->enforce_isolation[i] = 2176 AMDGPU_ENFORCE_ISOLATION_ENABLE; 2177 break; 2178 case 2: 2179 /* enable legacy mode */ 2180 adev->enforce_isolation[i] = 2181 AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY; 2182 break; 2183 case 3: 2184 /* enable only process isolation without submitting cleaner shader */ 2185 adev->enforce_isolation[i] = 2186 AMDGPU_ENFORCE_ISOLATION_NO_CLEANER_SHADER; 2187 break; 2188 } 2189 } 2190 2191 return 0; 2192 } 2193 2194 /** 2195 * amdgpu_switcheroo_set_state - set switcheroo state 2196 * 2197 * @pdev: pci dev pointer 2198 * @state: vga_switcheroo state 2199 * 2200 * Callback for the switcheroo driver. Suspends or resumes 2201 * the asics before or after it is powered up using ACPI methods. 
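* Together with amdgpu_switcheroo_can_switch() below, this callback is exposed through the amdgpu_switcheroo_ops table, which is presumably registered with the vga_switcheroo framework (vga_switcheroo_register_client()) elsewhere in the driver.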
2202 */ 2203 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 2204 enum vga_switcheroo_state state) 2205 { 2206 struct drm_device *dev = pci_get_drvdata(pdev); 2207 int r; 2208 2209 if (amdgpu_device_supports_px(drm_to_adev(dev)) && 2210 state == VGA_SWITCHEROO_OFF) 2211 return; 2212 2213 if (state == VGA_SWITCHEROO_ON) { 2214 pr_info("switched on\n"); 2215 /* don't suspend or resume card normally */ 2216 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2217 2218 pci_set_power_state(pdev, PCI_D0); 2219 amdgpu_device_load_pci_state(pdev); 2220 r = pci_enable_device(pdev); 2221 if (r) 2222 dev_warn(&pdev->dev, "pci_enable_device failed (%d)\n", 2223 r); 2224 amdgpu_device_resume(dev, true); 2225 2226 dev->switch_power_state = DRM_SWITCH_POWER_ON; 2227 } else { 2228 dev_info(&pdev->dev, "switched off\n"); 2229 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2230 amdgpu_device_prepare(dev); 2231 amdgpu_device_suspend(dev, true); 2232 amdgpu_device_cache_pci_state(pdev); 2233 /* Shut down the device */ 2234 pci_disable_device(pdev); 2235 pci_set_power_state(pdev, PCI_D3cold); 2236 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 2237 } 2238 } 2239 2240 /** 2241 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 2242 * 2243 * @pdev: pci dev pointer 2244 * 2245 * Callback for the switcheroo driver. Check of the switcheroo 2246 * state can be changed. 2247 * Returns true if the state can be changed, false if not. 2248 */ 2249 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 2250 { 2251 struct drm_device *dev = pci_get_drvdata(pdev); 2252 2253 /* 2254 * FIXME: open_count is protected by drm_global_mutex but that would lead to 2255 * locking inversion with the driver load path. And the access here is 2256 * completely racy anyway. So don't bother with locking for now. 2257 */ 2258 return atomic_read(&dev->open_count) == 0; 2259 } 2260 2261 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 2262 .set_gpu_state = amdgpu_switcheroo_set_state, 2263 .reprobe = NULL, 2264 .can_switch = amdgpu_switcheroo_can_switch, 2265 }; 2266 2267 /** 2268 * amdgpu_device_ip_set_clockgating_state - set the CG state 2269 * 2270 * @dev: amdgpu_device pointer 2271 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2272 * @state: clockgating state (gate or ungate) 2273 * 2274 * Sets the requested clockgating state for all instances of 2275 * the hardware IP specified. 2276 * Returns the error code from the last instance. 2277 */ 2278 int amdgpu_device_ip_set_clockgating_state(void *dev, 2279 enum amd_ip_block_type block_type, 2280 enum amd_clockgating_state state) 2281 { 2282 struct amdgpu_device *adev = dev; 2283 int i, r = 0; 2284 2285 for (i = 0; i < adev->num_ip_blocks; i++) { 2286 if (!adev->ip_blocks[i].status.valid) 2287 continue; 2288 if (adev->ip_blocks[i].version->type != block_type) 2289 continue; 2290 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 2291 continue; 2292 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 2293 &adev->ip_blocks[i], state); 2294 if (r) 2295 dev_err(adev->dev, 2296 "set_clockgating_state of IP block <%s> failed %d\n", 2297 adev->ip_blocks[i].version->funcs->name, r); 2298 } 2299 return r; 2300 } 2301 2302 /** 2303 * amdgpu_device_ip_set_powergating_state - set the PG state 2304 * 2305 * @dev: amdgpu_device pointer 2306 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 
2307 * @state: powergating state (gate or ungate) 2308 * 2309 * Sets the requested powergating state for all instances of 2310 * the hardware IP specified. 2311 * Returns the error code from the last instance. 2312 */ 2313 int amdgpu_device_ip_set_powergating_state(void *dev, 2314 enum amd_ip_block_type block_type, 2315 enum amd_powergating_state state) 2316 { 2317 struct amdgpu_device *adev = dev; 2318 int i, r = 0; 2319 2320 for (i = 0; i < adev->num_ip_blocks; i++) { 2321 if (!adev->ip_blocks[i].status.valid) 2322 continue; 2323 if (adev->ip_blocks[i].version->type != block_type) 2324 continue; 2325 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 2326 continue; 2327 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 2328 &adev->ip_blocks[i], state); 2329 if (r) 2330 dev_err(adev->dev, 2331 "set_powergating_state of IP block <%s> failed %d\n", 2332 adev->ip_blocks[i].version->funcs->name, r); 2333 } 2334 return r; 2335 } 2336 2337 /** 2338 * amdgpu_device_ip_get_clockgating_state - get the CG state 2339 * 2340 * @adev: amdgpu_device pointer 2341 * @flags: clockgating feature flags 2342 * 2343 * Walks the list of IPs on the device and updates the clockgating 2344 * flags for each IP. 2345 * Updates @flags with the feature flags for each hardware IP where 2346 * clockgating is enabled. 2347 */ 2348 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 2349 u64 *flags) 2350 { 2351 int i; 2352 2353 for (i = 0; i < adev->num_ip_blocks; i++) { 2354 if (!adev->ip_blocks[i].status.valid) 2355 continue; 2356 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 2357 adev->ip_blocks[i].version->funcs->get_clockgating_state( 2358 &adev->ip_blocks[i], flags); 2359 } 2360 } 2361 2362 /** 2363 * amdgpu_device_ip_wait_for_idle - wait for idle 2364 * 2365 * @adev: amdgpu_device pointer 2366 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2367 * 2368 * Waits for the requested hardware IP to be idle. 2369 * Returns 0 for success or a negative error code on failure. 2370 */ 2371 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 2372 enum amd_ip_block_type block_type) 2373 { 2374 int i, r; 2375 2376 for (i = 0; i < adev->num_ip_blocks; i++) { 2377 if (!adev->ip_blocks[i].status.valid) 2378 continue; 2379 if (adev->ip_blocks[i].version->type == block_type) { 2380 if (adev->ip_blocks[i].version->funcs->wait_for_idle) { 2381 r = adev->ip_blocks[i].version->funcs->wait_for_idle( 2382 &adev->ip_blocks[i]); 2383 if (r) 2384 return r; 2385 } 2386 break; 2387 } 2388 } 2389 return 0; 2390 2391 } 2392 2393 /** 2394 * amdgpu_device_ip_is_hw - is the hardware IP enabled 2395 * 2396 * @adev: amdgpu_device pointer 2397 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2398 * 2399 * Check if the hardware IP is enabled or not. 2400 * Returns true if the IP is enabled, false if not. 2401 */ 2402 bool amdgpu_device_ip_is_hw(struct amdgpu_device *adev, 2403 enum amd_ip_block_type block_type) 2404 { 2405 int i; 2406 2407 for (i = 0; i < adev->num_ip_blocks; i++) { 2408 if (adev->ip_blocks[i].version->type == block_type) 2409 return adev->ip_blocks[i].status.hw; 2410 } 2411 return false; 2412 } 2413 2414 /** 2415 * amdgpu_device_ip_is_valid - is the hardware IP valid 2416 * 2417 * @adev: amdgpu_device pointer 2418 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2419 * 2420 * Check if the hardware IP is valid or not. 2421 * Returns true if the IP is valid, false if not.
2422 */ 2423 bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev, 2424 enum amd_ip_block_type block_type) 2425 { 2426 int i; 2427 2428 for (i = 0; i < adev->num_ip_blocks; i++) { 2429 if (adev->ip_blocks[i].version->type == block_type) 2430 return adev->ip_blocks[i].status.valid; 2431 } 2432 return false; 2433 2434 } 2435 2436 /** 2437 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 2438 * 2439 * @adev: amdgpu_device pointer 2440 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 2441 * 2442 * Returns a pointer to the hardware IP block structure 2443 * if it exists for the asic, otherwise NULL. 2444 */ 2445 struct amdgpu_ip_block * 2446 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 2447 enum amd_ip_block_type type) 2448 { 2449 int i; 2450 2451 for (i = 0; i < adev->num_ip_blocks; i++) 2452 if (adev->ip_blocks[i].version->type == type) 2453 return &adev->ip_blocks[i]; 2454 2455 return NULL; 2456 } 2457 2458 /** 2459 * amdgpu_device_ip_block_version_cmp 2460 * 2461 * @adev: amdgpu_device pointer 2462 * @type: enum amd_ip_block_type 2463 * @major: major version 2464 * @minor: minor version 2465 * 2466 * return 0 if equal or greater 2467 * return 1 if smaller or the ip_block doesn't exist 2468 */ 2469 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 2470 enum amd_ip_block_type type, 2471 u32 major, u32 minor) 2472 { 2473 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 2474 2475 if (ip_block && ((ip_block->version->major > major) || 2476 ((ip_block->version->major == major) && 2477 (ip_block->version->minor >= minor)))) 2478 return 0; 2479 2480 return 1; 2481 } 2482 2483 static const char *ip_block_names[] = { 2484 [AMD_IP_BLOCK_TYPE_COMMON] = "common", 2485 [AMD_IP_BLOCK_TYPE_GMC] = "gmc", 2486 [AMD_IP_BLOCK_TYPE_IH] = "ih", 2487 [AMD_IP_BLOCK_TYPE_SMC] = "smu", 2488 [AMD_IP_BLOCK_TYPE_PSP] = "psp", 2489 [AMD_IP_BLOCK_TYPE_DCE] = "dce", 2490 [AMD_IP_BLOCK_TYPE_GFX] = "gfx", 2491 [AMD_IP_BLOCK_TYPE_SDMA] = "sdma", 2492 [AMD_IP_BLOCK_TYPE_UVD] = "uvd", 2493 [AMD_IP_BLOCK_TYPE_VCE] = "vce", 2494 [AMD_IP_BLOCK_TYPE_ACP] = "acp", 2495 [AMD_IP_BLOCK_TYPE_VCN] = "vcn", 2496 [AMD_IP_BLOCK_TYPE_MES] = "mes", 2497 [AMD_IP_BLOCK_TYPE_JPEG] = "jpeg", 2498 [AMD_IP_BLOCK_TYPE_VPE] = "vpe", 2499 [AMD_IP_BLOCK_TYPE_UMSCH_MM] = "umsch_mm", 2500 [AMD_IP_BLOCK_TYPE_ISP] = "isp", 2501 }; 2502 2503 static const char *ip_block_name(struct amdgpu_device *adev, enum amd_ip_block_type type) 2504 { 2505 int idx = (int)type; 2506 2507 return idx < ARRAY_SIZE(ip_block_names) ? ip_block_names[idx] : "unknown"; 2508 } 2509 2510 /** 2511 * amdgpu_device_ip_block_add 2512 * 2513 * @adev: amdgpu_device pointer 2514 * @ip_block_version: pointer to the IP to add 2515 * 2516 * Adds the IP block driver information to the collection of IPs 2517 * on the asic. 
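* Harvested VCN/JPEG blocks are silently skipped. Returns 0 on success, or -EINVAL if @ip_block_version is NULL.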
2518 */ 2519 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 2520 const struct amdgpu_ip_block_version *ip_block_version) 2521 { 2522 if (!ip_block_version) 2523 return -EINVAL; 2524 2525 switch (ip_block_version->type) { 2526 case AMD_IP_BLOCK_TYPE_VCN: 2527 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2528 return 0; 2529 break; 2530 case AMD_IP_BLOCK_TYPE_JPEG: 2531 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2532 return 0; 2533 break; 2534 default: 2535 break; 2536 } 2537 2538 dev_info(adev->dev, "detected ip block number %d <%s_v%d_%d_%d> (%s)\n", 2539 adev->num_ip_blocks, 2540 ip_block_name(adev, ip_block_version->type), 2541 ip_block_version->major, 2542 ip_block_version->minor, 2543 ip_block_version->rev, 2544 ip_block_version->funcs->name); 2545 2546 adev->ip_blocks[adev->num_ip_blocks].adev = adev; 2547 2548 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2549 2550 return 0; 2551 } 2552 2553 /** 2554 * amdgpu_device_enable_virtual_display - enable virtual display feature 2555 * 2556 * @adev: amdgpu_device pointer 2557 * 2558 * Enabled the virtual display feature if the user has enabled it via 2559 * the module parameter virtual_display. This feature provides a virtual 2560 * display hardware on headless boards or in virtualized environments. 2561 * This function parses and validates the configuration string specified by 2562 * the user and configures the virtual display configuration (number of 2563 * virtual connectors, crtcs, etc.) specified. 2564 */ 2565 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2566 { 2567 adev->enable_virtual_display = false; 2568 2569 if (amdgpu_virtual_display) { 2570 const char *pci_address_name = pci_name(adev->pdev); 2571 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2572 2573 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2574 pciaddstr_tmp = pciaddstr; 2575 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2576 pciaddname = strsep(&pciaddname_tmp, ","); 2577 if (!strcmp("all", pciaddname) 2578 || !strcmp(pci_address_name, pciaddname)) { 2579 long num_crtc; 2580 int res = -1; 2581 2582 adev->enable_virtual_display = true; 2583 2584 if (pciaddname_tmp) 2585 res = kstrtol(pciaddname_tmp, 10, 2586 &num_crtc); 2587 2588 if (!res) { 2589 if (num_crtc < 1) 2590 num_crtc = 1; 2591 if (num_crtc > 6) 2592 num_crtc = 6; 2593 adev->mode_info.num_crtc = num_crtc; 2594 } else { 2595 adev->mode_info.num_crtc = 1; 2596 } 2597 break; 2598 } 2599 } 2600 2601 dev_info( 2602 adev->dev, 2603 "virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2604 amdgpu_virtual_display, pci_address_name, 2605 adev->enable_virtual_display, adev->mode_info.num_crtc); 2606 2607 kfree(pciaddstr); 2608 } 2609 } 2610 2611 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2612 { 2613 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2614 adev->mode_info.num_crtc = 1; 2615 adev->enable_virtual_display = true; 2616 dev_info(adev->dev, "virtual_display:%d, num_crtc:%d\n", 2617 adev->enable_virtual_display, 2618 adev->mode_info.num_crtc); 2619 } 2620 } 2621 2622 /** 2623 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2624 * 2625 * @adev: amdgpu_device pointer 2626 * 2627 * Parses the asic configuration parameters specified in the gpu info 2628 * firmware and makes them available to the driver for use in configuring 2629 * the asic. 2630 * Returns 0 on success, -EINVAL on failure. 
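* Only the legacy ASICs listed in the switch below ship a gpu_info firmware; for every other ASIC this is a no-op that returns 0.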
2631 */ 2632 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2633 { 2634 const char *chip_name; 2635 int err; 2636 const struct gpu_info_firmware_header_v1_0 *hdr; 2637 2638 adev->firmware.gpu_info_fw = NULL; 2639 2640 switch (adev->asic_type) { 2641 default: 2642 return 0; 2643 case CHIP_VEGA10: 2644 chip_name = "vega10"; 2645 break; 2646 case CHIP_VEGA12: 2647 chip_name = "vega12"; 2648 break; 2649 case CHIP_RAVEN: 2650 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2651 chip_name = "raven2"; 2652 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2653 chip_name = "picasso"; 2654 else 2655 chip_name = "raven"; 2656 break; 2657 case CHIP_ARCTURUS: 2658 chip_name = "arcturus"; 2659 break; 2660 case CHIP_NAVI12: 2661 if (adev->discovery.bin) 2662 return 0; 2663 chip_name = "navi12"; 2664 break; 2665 case CHIP_CYAN_SKILLFISH: 2666 chip_name = "cyan_skillfish"; 2667 break; 2668 } 2669 2670 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, 2671 AMDGPU_UCODE_OPTIONAL, 2672 "amdgpu/%s_gpu_info.bin", chip_name); 2673 if (err) { 2674 dev_err(adev->dev, 2675 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n", 2676 chip_name); 2677 goto out; 2678 } 2679 2680 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2681 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2682 2683 switch (hdr->version_major) { 2684 case 1: 2685 { 2686 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2687 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2688 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2689 2690 /* 2691 * Should be dropped when DAL no longer needs it. 2692 */ 2693 if (adev->asic_type == CHIP_NAVI12) 2694 goto parse_soc_bounding_box; 2695 2696 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2697 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2698 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2699 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2700 adev->gfx.config.max_texture_channel_caches = 2701 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2702 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2703 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2704 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2705 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2706 adev->gfx.config.double_offchip_lds_buf = 2707 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2708 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2709 adev->gfx.cu_info.max_waves_per_simd = 2710 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2711 adev->gfx.cu_info.max_scratch_slots_per_cu = 2712 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2713 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2714 if (hdr->version_minor >= 1) { 2715 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2716 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2717 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2718 adev->gfx.config.num_sc_per_sh = 2719 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2720 adev->gfx.config.num_packer_per_sc = 2721 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2722 } 2723 2724 parse_soc_bounding_box: 2725 /* 2726 * soc bounding box info is not integrated in disocovery table, 2727 * we always need to parse it from gpu info 
firmware if needed. 2728 */ 2729 if (hdr->version_minor == 2) { 2730 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2731 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2732 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2733 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2734 } 2735 break; 2736 } 2737 default: 2738 dev_err(adev->dev, 2739 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2740 err = -EINVAL; 2741 goto out; 2742 } 2743 out: 2744 return err; 2745 } 2746 2747 static void amdgpu_uid_init(struct amdgpu_device *adev) 2748 { 2749 /* Initialize the UID for the device */ 2750 adev->uid_info = kzalloc(sizeof(struct amdgpu_uid), GFP_KERNEL); 2751 if (!adev->uid_info) { 2752 dev_warn(adev->dev, "Failed to allocate memory for UID\n"); 2753 return; 2754 } 2755 adev->uid_info->adev = adev; 2756 } 2757 2758 static void amdgpu_uid_fini(struct amdgpu_device *adev) 2759 { 2760 /* Free the UID memory */ 2761 kfree(adev->uid_info); 2762 adev->uid_info = NULL; 2763 } 2764 2765 /** 2766 * amdgpu_device_ip_early_init - run early init for hardware IPs 2767 * 2768 * @adev: amdgpu_device pointer 2769 * 2770 * Early initialization pass for hardware IPs. The hardware IPs that make 2771 * up each asic are discovered each IP's early_init callback is run. This 2772 * is the first stage in initializing the asic. 2773 * Returns 0 on success, negative error code on failure. 2774 */ 2775 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2776 { 2777 struct amdgpu_ip_block *ip_block; 2778 struct pci_dev *parent; 2779 bool total, skip_bios; 2780 uint32_t bios_flags; 2781 int i, r; 2782 2783 amdgpu_device_enable_virtual_display(adev); 2784 2785 if (amdgpu_sriov_vf(adev)) { 2786 r = amdgpu_virt_request_full_gpu(adev, true); 2787 if (r) 2788 return r; 2789 2790 r = amdgpu_virt_init_critical_region(adev); 2791 if (r) 2792 return r; 2793 } 2794 2795 switch (adev->asic_type) { 2796 #ifdef CONFIG_DRM_AMDGPU_SI 2797 case CHIP_VERDE: 2798 case CHIP_TAHITI: 2799 case CHIP_PITCAIRN: 2800 case CHIP_OLAND: 2801 case CHIP_HAINAN: 2802 adev->family = AMDGPU_FAMILY_SI; 2803 r = si_set_ip_blocks(adev); 2804 if (r) 2805 return r; 2806 break; 2807 #endif 2808 #ifdef CONFIG_DRM_AMDGPU_CIK 2809 case CHIP_BONAIRE: 2810 case CHIP_HAWAII: 2811 case CHIP_KAVERI: 2812 case CHIP_KABINI: 2813 case CHIP_MULLINS: 2814 if (adev->flags & AMD_IS_APU) 2815 adev->family = AMDGPU_FAMILY_KV; 2816 else 2817 adev->family = AMDGPU_FAMILY_CI; 2818 2819 r = cik_set_ip_blocks(adev); 2820 if (r) 2821 return r; 2822 break; 2823 #endif 2824 case CHIP_TOPAZ: 2825 case CHIP_TONGA: 2826 case CHIP_FIJI: 2827 case CHIP_POLARIS10: 2828 case CHIP_POLARIS11: 2829 case CHIP_POLARIS12: 2830 case CHIP_VEGAM: 2831 case CHIP_CARRIZO: 2832 case CHIP_STONEY: 2833 if (adev->flags & AMD_IS_APU) 2834 adev->family = AMDGPU_FAMILY_CZ; 2835 else 2836 adev->family = AMDGPU_FAMILY_VI; 2837 2838 r = vi_set_ip_blocks(adev); 2839 if (r) 2840 return r; 2841 break; 2842 default: 2843 r = amdgpu_discovery_set_ip_blocks(adev); 2844 if (r) 2845 return r; 2846 break; 2847 } 2848 2849 /* Check for IP version 9.4.3 with A0 hardware */ 2850 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) && 2851 !amdgpu_device_get_rev_id(adev)) { 2852 dev_err(adev->dev, "Unsupported A0 hardware\n"); 2853 return -ENODEV; /* device unsupported - no device error */ 2854 } 2855 2856 if (amdgpu_has_atpx() && 2857 (amdgpu_is_atpx_hybrid() || 2858 amdgpu_has_atpx_dgpu_power_cntl()) && 2859 ((adev->flags & AMD_IS_APU) == 
0) && 2860 !dev_is_removable(&adev->pdev->dev)) 2861 adev->flags |= AMD_IS_PX; 2862 2863 if (!(adev->flags & AMD_IS_APU)) { 2864 parent = pcie_find_root_port(adev->pdev); 2865 adev->has_pr3 = parent ? pci_pr3_present(parent) : false; 2866 } 2867 2868 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2869 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2870 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2871 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2872 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2873 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2874 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2875 2876 adev->virt.is_xgmi_node_migrate_enabled = false; 2877 if (amdgpu_sriov_vf(adev)) { 2878 adev->virt.is_xgmi_node_migrate_enabled = 2879 amdgpu_ip_version((adev), GC_HWIP, 0) == IP_VERSION(9, 4, 4); 2880 } 2881 2882 total = true; 2883 for (i = 0; i < adev->num_ip_blocks; i++) { 2884 ip_block = &adev->ip_blocks[i]; 2885 2886 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2887 dev_warn(adev->dev, "disabled ip block: %d <%s>\n", i, 2888 adev->ip_blocks[i].version->funcs->name); 2889 adev->ip_blocks[i].status.valid = false; 2890 } else if (ip_block->version->funcs->early_init) { 2891 r = ip_block->version->funcs->early_init(ip_block); 2892 if (r == -ENOENT) { 2893 adev->ip_blocks[i].status.valid = false; 2894 } else if (r) { 2895 dev_err(adev->dev, 2896 "early_init of IP block <%s> failed %d\n", 2897 adev->ip_blocks[i].version->funcs->name, 2898 r); 2899 total = false; 2900 } else { 2901 adev->ip_blocks[i].status.valid = true; 2902 } 2903 } else { 2904 adev->ip_blocks[i].status.valid = true; 2905 } 2906 /* get the vbios after the asic_funcs are set up */ 2907 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2908 r = amdgpu_device_parse_gpu_info_fw(adev); 2909 if (r) 2910 return r; 2911 2912 bios_flags = amdgpu_device_get_vbios_flags(adev); 2913 skip_bios = !!(bios_flags & AMDGPU_VBIOS_SKIP); 2914 /* Read BIOS */ 2915 if (!skip_bios) { 2916 bool optional = 2917 !!(bios_flags & AMDGPU_VBIOS_OPTIONAL); 2918 if (!amdgpu_get_bios(adev) && !optional) 2919 return -EINVAL; 2920 2921 if (optional && !adev->bios) 2922 dev_info( 2923 adev->dev, 2924 "VBIOS image optional, proceeding without VBIOS image"); 2925 2926 if (adev->bios) { 2927 r = amdgpu_atombios_init(adev); 2928 if (r) { 2929 dev_err(adev->dev, 2930 "amdgpu_atombios_init failed\n"); 2931 amdgpu_vf_error_put( 2932 adev, 2933 AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 2934 0, 0); 2935 return r; 2936 } 2937 } 2938 } 2939 2940 /*get pf2vf msg info at it's earliest time*/ 2941 if (amdgpu_sriov_vf(adev)) 2942 amdgpu_virt_init_data_exchange(adev); 2943 2944 } 2945 } 2946 if (!total) 2947 return -ENODEV; 2948 2949 if (adev->gmc.xgmi.supported) 2950 amdgpu_xgmi_early_init(adev); 2951 2952 if (amdgpu_is_multi_aid(adev)) 2953 amdgpu_uid_init(adev); 2954 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 2955 if (ip_block->status.valid != false) 2956 amdgpu_amdkfd_device_probe(adev); 2957 2958 adev->cg_flags &= amdgpu_cg_mask; 2959 adev->pg_flags &= amdgpu_pg_mask; 2960 2961 return 0; 2962 } 2963 2964 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2965 { 2966 int i, r; 2967 2968 for (i = 0; i < adev->num_ip_blocks; i++) { 2969 if (!adev->ip_blocks[i].status.sw) 2970 continue; 2971 if (adev->ip_blocks[i].status.hw) 2972 continue; 2973 if (!amdgpu_ip_member_of_hwini( 2974 adev, adev->ip_blocks[i].version->type)) 2975 continue; 2976 if 
(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2977 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2978 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2979 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2980 if (r) { 2981 dev_err(adev->dev, 2982 "hw_init of IP block <%s> failed %d\n", 2983 adev->ip_blocks[i].version->funcs->name, 2984 r); 2985 return r; 2986 } 2987 adev->ip_blocks[i].status.hw = true; 2988 } 2989 } 2990 2991 return 0; 2992 } 2993 2994 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2995 { 2996 int i, r; 2997 2998 for (i = 0; i < adev->num_ip_blocks; i++) { 2999 if (!adev->ip_blocks[i].status.sw) 3000 continue; 3001 if (adev->ip_blocks[i].status.hw) 3002 continue; 3003 if (!amdgpu_ip_member_of_hwini( 3004 adev, adev->ip_blocks[i].version->type)) 3005 continue; 3006 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3007 if (r) { 3008 dev_err(adev->dev, 3009 "hw_init of IP block <%s> failed %d\n", 3010 adev->ip_blocks[i].version->funcs->name, r); 3011 return r; 3012 } 3013 adev->ip_blocks[i].status.hw = true; 3014 } 3015 3016 return 0; 3017 } 3018 3019 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 3020 { 3021 int r = 0; 3022 int i; 3023 uint32_t smu_version; 3024 3025 if (adev->asic_type >= CHIP_VEGA10) { 3026 for (i = 0; i < adev->num_ip_blocks; i++) { 3027 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 3028 continue; 3029 3030 if (!amdgpu_ip_member_of_hwini(adev, 3031 AMD_IP_BLOCK_TYPE_PSP)) 3032 break; 3033 3034 if (!adev->ip_blocks[i].status.sw) 3035 continue; 3036 3037 /* no need to do the fw loading again if already done*/ 3038 if (adev->ip_blocks[i].status.hw == true) 3039 break; 3040 3041 if (amdgpu_in_reset(adev) || adev->in_suspend) { 3042 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3043 if (r) 3044 return r; 3045 } else { 3046 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3047 if (r) { 3048 dev_err(adev->dev, 3049 "hw_init of IP block <%s> failed %d\n", 3050 adev->ip_blocks[i] 3051 .version->funcs->name, 3052 r); 3053 return r; 3054 } 3055 adev->ip_blocks[i].status.hw = true; 3056 } 3057 break; 3058 } 3059 } 3060 3061 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 3062 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 3063 3064 return r; 3065 } 3066 3067 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 3068 { 3069 struct drm_sched_init_args args = { 3070 .ops = &amdgpu_sched_ops, 3071 .num_rqs = DRM_SCHED_PRIORITY_COUNT, 3072 .timeout_wq = adev->reset_domain->wq, 3073 .dev = adev->dev, 3074 }; 3075 long timeout; 3076 int r, i; 3077 3078 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 3079 struct amdgpu_ring *ring = adev->rings[i]; 3080 3081 /* No need to setup the GPU scheduler for rings that don't need it */ 3082 if (!ring || ring->no_scheduler) 3083 continue; 3084 3085 switch (ring->funcs->type) { 3086 case AMDGPU_RING_TYPE_GFX: 3087 timeout = adev->gfx_timeout; 3088 break; 3089 case AMDGPU_RING_TYPE_COMPUTE: 3090 timeout = adev->compute_timeout; 3091 break; 3092 case AMDGPU_RING_TYPE_SDMA: 3093 timeout = adev->sdma_timeout; 3094 break; 3095 default: 3096 timeout = adev->video_timeout; 3097 break; 3098 } 3099 3100 args.timeout = timeout; 3101 args.credit_limit = ring->num_hw_submission; 3102 args.score = ring->sched_score; 3103 args.name = ring->name; 3104 3105 r = drm_sched_init(&ring->sched, &args); 3106 if (r) { 3107 
dev_err(adev->dev, 3108 "Failed to create scheduler on ring %s.\n", 3109 ring->name); 3110 return r; 3111 } 3112 r = amdgpu_uvd_entity_init(adev, ring); 3113 if (r) { 3114 dev_err(adev->dev, 3115 "Failed to create UVD scheduling entity on ring %s.\n", 3116 ring->name); 3117 return r; 3118 } 3119 r = amdgpu_vce_entity_init(adev, ring); 3120 if (r) { 3121 dev_err(adev->dev, 3122 "Failed to create VCE scheduling entity on ring %s.\n", 3123 ring->name); 3124 return r; 3125 } 3126 } 3127 3128 if (adev->xcp_mgr) 3129 amdgpu_xcp_update_partition_sched_list(adev); 3130 3131 return 0; 3132 } 3133 3134 3135 /** 3136 * amdgpu_device_ip_init - run init for hardware IPs 3137 * 3138 * @adev: amdgpu_device pointer 3139 * 3140 * Main initialization pass for hardware IPs. The list of all the hardware 3141 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 3142 * are run. sw_init initializes the software state associated with each IP 3143 * and hw_init initializes the hardware associated with each IP. 3144 * Returns 0 on success, negative error code on failure. 3145 */ 3146 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 3147 { 3148 bool init_badpage; 3149 int i, r; 3150 3151 r = amdgpu_ras_init(adev); 3152 if (r) 3153 return r; 3154 3155 for (i = 0; i < adev->num_ip_blocks; i++) { 3156 if (!adev->ip_blocks[i].status.valid) 3157 continue; 3158 if (adev->ip_blocks[i].version->funcs->sw_init) { 3159 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]); 3160 if (r) { 3161 dev_err(adev->dev, 3162 "sw_init of IP block <%s> failed %d\n", 3163 adev->ip_blocks[i].version->funcs->name, 3164 r); 3165 goto init_failed; 3166 } 3167 } 3168 adev->ip_blocks[i].status.sw = true; 3169 3170 if (!amdgpu_ip_member_of_hwini( 3171 adev, adev->ip_blocks[i].version->type)) 3172 continue; 3173 3174 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 3175 /* need to do common hw init early so everything is set up for gmc */ 3176 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3177 if (r) { 3178 dev_err(adev->dev, "hw_init %d failed %d\n", i, 3179 r); 3180 goto init_failed; 3181 } 3182 adev->ip_blocks[i].status.hw = true; 3183 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3184 /* need to do gmc hw init early so we can allocate gpu mem */ 3185 /* Try to reserve bad pages early */ 3186 if (amdgpu_sriov_vf(adev)) 3187 amdgpu_virt_exchange_data(adev); 3188 3189 r = amdgpu_device_mem_scratch_init(adev); 3190 if (r) { 3191 dev_err(adev->dev, 3192 "amdgpu_mem_scratch_init failed %d\n", 3193 r); 3194 goto init_failed; 3195 } 3196 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3197 if (r) { 3198 dev_err(adev->dev, "hw_init %d failed %d\n", i, 3199 r); 3200 goto init_failed; 3201 } 3202 r = amdgpu_device_wb_init(adev); 3203 if (r) { 3204 dev_err(adev->dev, 3205 "amdgpu_device_wb_init failed %d\n", r); 3206 goto init_failed; 3207 } 3208 adev->ip_blocks[i].status.hw = true; 3209 3210 /* right after GMC hw init, we create CSA */ 3211 if (adev->gfx.mcbp) { 3212 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 3213 AMDGPU_GEM_DOMAIN_VRAM | 3214 AMDGPU_GEM_DOMAIN_GTT, 3215 AMDGPU_CSA_SIZE); 3216 if (r) { 3217 dev_err(adev->dev, 3218 "allocate CSA failed %d\n", r); 3219 goto init_failed; 3220 } 3221 } 3222 3223 r = amdgpu_seq64_init(adev); 3224 if (r) { 3225 dev_err(adev->dev, "allocate seq64 failed %d\n", 3226 r); 3227 goto init_failed; 3228 } 3229 } 3230 } 3231 3232 if (amdgpu_sriov_vf(adev)) 3233 
amdgpu_virt_init_data_exchange(adev); 3234 3235 r = amdgpu_ib_pool_init(adev); 3236 if (r) { 3237 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 3238 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 3239 goto init_failed; 3240 } 3241 3242 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 3243 if (r) 3244 goto init_failed; 3245 3246 r = amdgpu_device_ip_hw_init_phase1(adev); 3247 if (r) 3248 goto init_failed; 3249 3250 r = amdgpu_device_fw_loading(adev); 3251 if (r) 3252 goto init_failed; 3253 3254 r = amdgpu_device_ip_hw_init_phase2(adev); 3255 if (r) 3256 goto init_failed; 3257 3258 /* 3259 * retired pages will be loaded from eeprom and reserved here, 3260 * it should be called after amdgpu_device_ip_hw_init_phase2 since 3261 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 3262 * for I2C communication which only true at this point. 3263 * 3264 * amdgpu_ras_recovery_init may fail, but the upper only cares the 3265 * failure from bad gpu situation and stop amdgpu init process 3266 * accordingly. For other failed cases, it will still release all 3267 * the resource and print error message, rather than returning one 3268 * negative value to upper level. 3269 * 3270 * Note: theoretically, this should be called before all vram allocations 3271 * to protect retired page from abusing 3272 */ 3273 init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 3274 r = amdgpu_ras_recovery_init(adev, init_badpage); 3275 if (r) 3276 goto init_failed; 3277 3278 /** 3279 * In case of XGMI grab extra reference for reset domain for this device 3280 */ 3281 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3282 if (amdgpu_xgmi_add_device(adev) == 0) { 3283 if (!amdgpu_sriov_vf(adev)) { 3284 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3285 3286 if (WARN_ON(!hive)) { 3287 r = -ENOENT; 3288 goto init_failed; 3289 } 3290 3291 if (!hive->reset_domain || 3292 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 3293 r = -ENOENT; 3294 amdgpu_put_xgmi_hive(hive); 3295 goto init_failed; 3296 } 3297 3298 /* Drop the early temporary reset domain we created for device */ 3299 amdgpu_reset_put_reset_domain(adev->reset_domain); 3300 adev->reset_domain = hive->reset_domain; 3301 amdgpu_put_xgmi_hive(hive); 3302 } 3303 } 3304 } 3305 3306 r = amdgpu_device_init_schedulers(adev); 3307 if (r) 3308 goto init_failed; 3309 3310 if (adev->mman.buffer_funcs_ring->sched.ready) 3311 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3312 3313 /* Don't init kfd if whole hive need to be reset during init */ 3314 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 3315 kgd2kfd_init_zone_device(adev); 3316 amdgpu_amdkfd_device_init(adev); 3317 } 3318 3319 amdgpu_fru_get_product_info(adev); 3320 3321 if (!amdgpu_sriov_vf(adev) || amdgpu_sriov_ras_cper_en(adev)) 3322 r = amdgpu_cper_init(adev); 3323 3324 init_failed: 3325 3326 return r; 3327 } 3328 3329 /** 3330 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 3331 * 3332 * @adev: amdgpu_device pointer 3333 * 3334 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 3335 * this function before a GPU reset. If the value is retained after a 3336 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 
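* The stored value is compared against the GART contents again in amdgpu_device_check_vram_lost() once the reset has completed.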
3337 */ 3338 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 3339 { 3340 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 3341 } 3342 3343 /** 3344 * amdgpu_device_check_vram_lost - check if vram is still valid 3345 * 3346 * @adev: amdgpu_device pointer 3347 * 3348 * Checks the reset magic value written to the gart pointer in VRAM. 3349 * The driver calls this after a GPU reset to see if the contents of 3350 * VRAM are lost or not. 3351 * Returns true if vram is lost, false if not. 3352 */ 3353 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 3354 { 3355 if (memcmp(adev->gart.ptr, adev->reset_magic, 3356 AMDGPU_RESET_MAGIC_NUM)) 3357 return true; 3358 3359 if (!amdgpu_in_reset(adev)) 3360 return false; 3361 3362 /* 3363 * For all ASICs with baco/mode1 reset, the VRAM is 3364 * always assumed to be lost. 3365 */ 3366 switch (amdgpu_asic_reset_method(adev)) { 3367 case AMD_RESET_METHOD_LEGACY: 3368 case AMD_RESET_METHOD_LINK: 3369 case AMD_RESET_METHOD_BACO: 3370 case AMD_RESET_METHOD_MODE1: 3371 return true; 3372 default: 3373 return false; 3374 } 3375 } 3376 3377 /** 3378 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 3379 * 3380 * @adev: amdgpu_device pointer 3381 * @state: clockgating state (gate or ungate) 3382 * 3383 * Walks the list of all the hardware IPs that make up the asic and runs 3384 * their set_clockgating_state callbacks. 3385 * During late init this pass enables clockgating for hardware IPs; 3386 * during fini or suspend it disables clockgating. 3387 * Returns 0 on success, negative error code on failure. 3388 */ 3389 3390 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 3391 enum amd_clockgating_state state) 3392 { 3393 int i, j, r; 3394 3395 if (amdgpu_emu_mode == 1) 3396 return 0; 3397 3398 for (j = 0; j < adev->num_ip_blocks; j++) { 3399 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3400 if (!adev->ip_blocks[i].status.late_initialized) 3401 continue; 3402 /* skip CG for GFX, SDMA on S0ix */ 3403 if (adev->in_s0ix && 3404 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3405 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3406 continue; 3407 /* skip CG for VCE/UVD, it's handled specially */ 3408 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3409 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3410 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3411 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3412 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 3413 /* enable clockgating to save power */ 3414 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i], 3415 state); 3416 if (r) { 3417 dev_err(adev->dev, 3418 "set_clockgating_state(gate) of IP block <%s> failed %d\n", 3419 adev->ip_blocks[i].version->funcs->name, 3420 r); 3421 return r; 3422 } 3423 } 3424 } 3425 3426 return 0; 3427 } 3428 3429 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 3430 enum amd_powergating_state state) 3431 { 3432 int i, j, r; 3433 3434 if (amdgpu_emu_mode == 1) 3435 return 0; 3436 3437 for (j = 0; j < adev->num_ip_blocks; j++) { 3438 i = state == AMD_PG_STATE_GATE ?
j : adev->num_ip_blocks - j - 1; 3439 if (!adev->ip_blocks[i].status.late_initialized) 3440 continue; 3441 /* skip PG for GFX, SDMA on S0ix */ 3442 if (adev->in_s0ix && 3443 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3444 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3445 continue; 3446 /* skip PG for VCE/UVD, it's handled specially */ 3447 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3448 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3449 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3450 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3451 adev->ip_blocks[i].version->funcs->set_powergating_state) { 3452 /* enable powergating to save power */ 3453 r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i], 3454 state); 3455 if (r) { 3456 dev_err(adev->dev, 3457 "set_powergating_state(gate) of IP block <%s> failed %d\n", 3458 adev->ip_blocks[i].version->funcs->name, 3459 r); 3460 return r; 3461 } 3462 } 3463 } 3464 return 0; 3465 } 3466 3467 static int amdgpu_device_enable_mgpu_fan_boost(void) 3468 { 3469 struct amdgpu_gpu_instance *gpu_ins; 3470 struct amdgpu_device *adev; 3471 int i, ret = 0; 3472 3473 mutex_lock(&mgpu_info.mutex); 3474 3475 /* 3476 * MGPU fan boost feature should be enabled 3477 * only when there are two or more dGPUs in 3478 * the system 3479 */ 3480 if (mgpu_info.num_dgpu < 2) 3481 goto out; 3482 3483 for (i = 0; i < mgpu_info.num_dgpu; i++) { 3484 gpu_ins = &(mgpu_info.gpu_ins[i]); 3485 adev = gpu_ins->adev; 3486 if (!(adev->flags & AMD_IS_APU || amdgpu_sriov_multi_vf_mode(adev)) && 3487 !gpu_ins->mgpu_fan_enabled) { 3488 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 3489 if (ret) 3490 break; 3491 3492 gpu_ins->mgpu_fan_enabled = 1; 3493 } 3494 } 3495 3496 out: 3497 mutex_unlock(&mgpu_info.mutex); 3498 3499 return ret; 3500 } 3501 3502 /** 3503 * amdgpu_device_ip_late_init - run late init for hardware IPs 3504 * 3505 * @adev: amdgpu_device pointer 3506 * 3507 * Late initialization pass for hardware IPs. The list of all the hardware 3508 * IPs that make up the asic is walked and the late_init callbacks are run. 3509 * late_init covers any special initialization that an IP requires 3510 * after all of them have been initialized or something that needs to happen 3511 * late in the init process. 3512 * Returns 0 on success, negative error code on failure.
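* This is also where RAS late init, clock/power gating, the reset magic and MGPU fan boost are set up, and where the XGMI p-state is lowered once every device in the hive has initialized.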
3513 */ 3514 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 3515 { 3516 struct amdgpu_gpu_instance *gpu_instance; 3517 int i = 0, r; 3518 3519 for (i = 0; i < adev->num_ip_blocks; i++) { 3520 if (!adev->ip_blocks[i].status.hw) 3521 continue; 3522 if (adev->ip_blocks[i].version->funcs->late_init) { 3523 r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]); 3524 if (r) { 3525 dev_err(adev->dev, 3526 "late_init of IP block <%s> failed %d\n", 3527 adev->ip_blocks[i].version->funcs->name, 3528 r); 3529 return r; 3530 } 3531 } 3532 adev->ip_blocks[i].status.late_initialized = true; 3533 } 3534 3535 r = amdgpu_ras_late_init(adev); 3536 if (r) { 3537 dev_err(adev->dev, "amdgpu_ras_late_init failed %d", r); 3538 return r; 3539 } 3540 3541 if (!amdgpu_reset_in_recovery(adev)) 3542 amdgpu_ras_set_error_query_ready(adev, true); 3543 3544 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3545 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3546 3547 amdgpu_device_fill_reset_magic(adev); 3548 3549 r = amdgpu_device_enable_mgpu_fan_boost(); 3550 if (r) 3551 dev_err(adev->dev, "enable mgpu fan boost failed (%d).\n", r); 3552 3553 /* For passthrough configuration on arcturus and aldebaran, enable special handling for SBR */ 3554 if (amdgpu_passthrough(adev) && 3555 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 3556 adev->asic_type == CHIP_ALDEBARAN)) 3557 amdgpu_dpm_handle_passthrough_sbr(adev, true); 3558 3559 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3560 mutex_lock(&mgpu_info.mutex); 3561 3562 /* 3563 * Reset the device p-state to low, as this was booted with high. 3564 * 3565 * This should be performed only after all devices from the same 3566 * hive have been initialized. 3567 * 3568 * However, the number of devices in the hive is not known in 3569 * advance; it is counted one by one as the devices initialize. 3570 * 3571 * So, wait until all XGMI interlinked devices are initialized. 3572 * This may add some delay, as those devices may come from 3573 * different hives. But that should be OK.
3574 */ 3575 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3576 for (i = 0; i < mgpu_info.num_gpu; i++) { 3577 gpu_instance = &(mgpu_info.gpu_ins[i]); 3578 if (gpu_instance->adev->flags & AMD_IS_APU) 3579 continue; 3580 3581 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3582 AMDGPU_XGMI_PSTATE_MIN); 3583 if (r) { 3584 dev_err(adev->dev, 3585 "pstate setting failed (%d).\n", 3586 r); 3587 break; 3588 } 3589 } 3590 } 3591 3592 mutex_unlock(&mgpu_info.mutex); 3593 } 3594 3595 return 0; 3596 } 3597 3598 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block) 3599 { 3600 struct amdgpu_device *adev = ip_block->adev; 3601 int r; 3602 3603 if (!ip_block->version->funcs->hw_fini) { 3604 dev_err(adev->dev, "hw_fini of IP block <%s> not defined\n", 3605 ip_block->version->funcs->name); 3606 } else { 3607 r = ip_block->version->funcs->hw_fini(ip_block); 3608 /* XXX handle errors */ 3609 if (r) { 3610 dev_dbg(adev->dev, 3611 "hw_fini of IP block <%s> failed %d\n", 3612 ip_block->version->funcs->name, r); 3613 } 3614 } 3615 3616 ip_block->status.hw = false; 3617 } 3618 3619 /** 3620 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3621 * 3622 * @adev: amdgpu_device pointer 3623 * 3624 * For ASICs need to disable SMC first 3625 */ 3626 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3627 { 3628 int i; 3629 3630 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3631 return; 3632 3633 for (i = 0; i < adev->num_ip_blocks; i++) { 3634 if (!adev->ip_blocks[i].status.hw) 3635 continue; 3636 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3637 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3638 break; 3639 } 3640 } 3641 } 3642 3643 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3644 { 3645 int i, r; 3646 3647 for (i = 0; i < adev->num_ip_blocks; i++) { 3648 if (!adev->ip_blocks[i].version->funcs->early_fini) 3649 continue; 3650 3651 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]); 3652 if (r) { 3653 dev_dbg(adev->dev, 3654 "early_fini of IP block <%s> failed %d\n", 3655 adev->ip_blocks[i].version->funcs->name, r); 3656 } 3657 } 3658 3659 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3660 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3661 3662 amdgpu_amdkfd_suspend(adev, true); 3663 amdgpu_userq_suspend(adev); 3664 3665 /* Workaround for ASICs need to disable SMC first */ 3666 amdgpu_device_smu_fini_early(adev); 3667 3668 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3669 if (!adev->ip_blocks[i].status.hw) 3670 continue; 3671 3672 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3673 } 3674 3675 if (amdgpu_sriov_vf(adev)) { 3676 if (amdgpu_virt_release_full_gpu(adev, false)) 3677 dev_err(adev->dev, 3678 "failed to release exclusive mode on fini\n"); 3679 } 3680 3681 return 0; 3682 } 3683 3684 /** 3685 * amdgpu_device_ip_fini - run fini for hardware IPs 3686 * 3687 * @adev: amdgpu_device pointer 3688 * 3689 * Main teardown pass for hardware IPs. The list of all the hardware 3690 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3691 * are run. hw_fini tears down the hardware associated with each IP 3692 * and sw_fini tears down any software state associated with each IP. 3693 * Returns 0 on success, negative error code on failure. 
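* Teardown runs in reverse IP order; when the GMC block is reached, the shared resources (ucode BO, static CSA, writeback, scratch memory, IB pool, seq64 and doorbells) are freed as well.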
3694 */ 3695 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3696 { 3697 int i, r; 3698 3699 amdgpu_cper_fini(adev); 3700 3701 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3702 amdgpu_virt_release_ras_err_handler_data(adev); 3703 3704 if (adev->gmc.xgmi.num_physical_nodes > 1) 3705 amdgpu_xgmi_remove_device(adev); 3706 3707 amdgpu_amdkfd_device_fini_sw(adev); 3708 3709 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3710 if (!adev->ip_blocks[i].status.sw) 3711 continue; 3712 3713 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3714 amdgpu_ucode_free_bo(adev); 3715 amdgpu_free_static_csa(&adev->virt.csa_obj); 3716 amdgpu_device_wb_fini(adev); 3717 amdgpu_device_mem_scratch_fini(adev); 3718 amdgpu_ib_pool_fini(adev); 3719 amdgpu_seq64_fini(adev); 3720 amdgpu_doorbell_fini(adev); 3721 } 3722 if (adev->ip_blocks[i].version->funcs->sw_fini) { 3723 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]); 3724 /* XXX handle errors */ 3725 if (r) { 3726 dev_dbg(adev->dev, 3727 "sw_fini of IP block <%s> failed %d\n", 3728 adev->ip_blocks[i].version->funcs->name, 3729 r); 3730 } 3731 } 3732 adev->ip_blocks[i].status.sw = false; 3733 adev->ip_blocks[i].status.valid = false; 3734 } 3735 3736 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3737 if (!adev->ip_blocks[i].status.late_initialized) 3738 continue; 3739 if (adev->ip_blocks[i].version->funcs->late_fini) 3740 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]); 3741 adev->ip_blocks[i].status.late_initialized = false; 3742 } 3743 3744 amdgpu_ras_fini(adev); 3745 amdgpu_uid_fini(adev); 3746 3747 return 0; 3748 } 3749 3750 /** 3751 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3752 * 3753 * @work: work_struct. 3754 */ 3755 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3756 { 3757 struct amdgpu_device *adev = 3758 container_of(work, struct amdgpu_device, delayed_init_work.work); 3759 int r; 3760 3761 r = amdgpu_ib_ring_tests(adev); 3762 if (r) 3763 dev_err(adev->dev, "ib ring test failed (%d).\n", r); 3764 } 3765 3766 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3767 { 3768 struct amdgpu_device *adev = 3769 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3770 3771 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3772 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3773 3774 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true, 0)) 3775 adev->gfx.gfx_off_state = true; 3776 } 3777 3778 /** 3779 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3780 * 3781 * @adev: amdgpu_device pointer 3782 * 3783 * Main suspend function for hardware IPs. The list of all the hardware 3784 * IPs that make up the asic is walked, clockgating is disabled and the 3785 * suspend callbacks are run. suspend puts the hardware and software state 3786 * in each IP into a state suitable for suspend. 3787 * Returns 0 on success, negative error code on failure. 3788 */ 3789 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3790 { 3791 int i, r, rec; 3792 3793 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3794 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3795 3796 /* 3797 * Per PMFW team's suggestion, driver needs to handle gfxoff 3798 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3799 * scenario. Add the missing df cstate disablement here. 
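* The disallow below is paired with a DF_CSTATE_ALLOW on the error unwind path of this function.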
3800 */ 3801 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3802 dev_warn(adev->dev, "Failed to disallow df cstate"); 3803 3804 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3805 if (!adev->ip_blocks[i].status.valid) 3806 continue; 3807 3808 /* displays are handled separately */ 3809 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3810 continue; 3811 3812 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3813 if (r) 3814 goto unwind; 3815 } 3816 3817 return 0; 3818 unwind: 3819 rec = amdgpu_device_ip_resume_phase3(adev); 3820 if (rec) 3821 dev_err(adev->dev, 3822 "amdgpu_device_ip_resume_phase3 failed during unwind: %d\n", 3823 rec); 3824 3825 amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW); 3826 3827 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3828 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3829 3830 return r; 3831 } 3832 3833 /** 3834 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3835 * 3836 * @adev: amdgpu_device pointer 3837 * 3838 * Main suspend function for hardware IPs. The list of all the hardware 3839 * IPs that make up the asic is walked, clockgating is disabled and the 3840 * suspend callbacks are run. suspend puts the hardware and software state 3841 * in each IP into a state suitable for suspend. 3842 * Returns 0 on success, negative error code on failure. 3843 */ 3844 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3845 { 3846 int i, r, rec; 3847 3848 if (adev->in_s0ix) 3849 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3850 3851 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3852 if (!adev->ip_blocks[i].status.valid) 3853 continue; 3854 /* displays are handled in phase1 */ 3855 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3856 continue; 3857 /* PSP lost connection when err_event_athub occurs */ 3858 if (amdgpu_ras_intr_triggered() && 3859 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3860 adev->ip_blocks[i].status.hw = false; 3861 continue; 3862 } 3863 3864 /* skip unnecessary suspend if we do not initialize them yet */ 3865 if (!amdgpu_ip_member_of_hwini( 3866 adev, adev->ip_blocks[i].version->type)) 3867 continue; 3868 3869 /* Since we skip suspend for S0i3, we need to cancel the delayed 3870 * idle work here as the suspend callback never gets called. 3871 */ 3872 if (adev->in_s0ix && 3873 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX && 3874 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 0, 0)) 3875 cancel_delayed_work_sync(&adev->gfx.idle_work); 3876 /* skip suspend of gfx/mes and psp for S0ix 3877 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3878 * like at runtime. PSP is also part of the always on hardware 3879 * so no need to suspend it. 3880 */ 3881 if (adev->in_s0ix && 3882 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3883 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3884 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3885 continue; 3886 3887 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3888 if (adev->in_s0ix && 3889 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3890 IP_VERSION(5, 0, 0)) && 3891 (adev->ip_blocks[i].version->type == 3892 AMD_IP_BLOCK_TYPE_SDMA)) 3893 continue; 3894 3895 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 
3896 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3897 * from this location and RLC Autoload automatically also gets loaded 3898 * from here based on PMFW -> PSP message during re-init sequence. 3899 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3900 * the TMR and reload FWs again for IMU enabled APU ASICs. 3901 */ 3902 if (amdgpu_in_reset(adev) && 3903 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3904 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3905 continue; 3906 3907 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3908 if (r) 3909 goto unwind; 3910 3911 /* handle putting the SMC in the appropriate state */ 3912 if (!amdgpu_sriov_vf(adev)) { 3913 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3914 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3915 if (r) { 3916 dev_err(adev->dev, 3917 "SMC failed to set mp1 state %d, %d\n", 3918 adev->mp1_state, r); 3919 goto unwind; 3920 } 3921 } 3922 } 3923 } 3924 3925 return 0; 3926 unwind: 3927 /* suspend phase 2 = resume phase 1 + resume phase 2 */ 3928 rec = amdgpu_device_ip_resume_phase1(adev); 3929 if (rec) { 3930 dev_err(adev->dev, 3931 "amdgpu_device_ip_resume_phase1 failed during unwind: %d\n", 3932 rec); 3933 return r; 3934 } 3935 3936 rec = amdgpu_device_fw_loading(adev); 3937 if (rec) { 3938 dev_err(adev->dev, 3939 "amdgpu_device_fw_loading failed during unwind: %d\n", 3940 rec); 3941 return r; 3942 } 3943 3944 rec = amdgpu_device_ip_resume_phase2(adev); 3945 if (rec) { 3946 dev_err(adev->dev, 3947 "amdgpu_device_ip_resume_phase2 failed during unwind: %d\n", 3948 rec); 3949 return r; 3950 } 3951 3952 return r; 3953 } 3954 3955 /** 3956 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3957 * 3958 * @adev: amdgpu_device pointer 3959 * 3960 * Main suspend function for hardware IPs. The list of all the hardware 3961 * IPs that make up the asic is walked, clockgating is disabled and the 3962 * suspend callbacks are run. suspend puts the hardware and software state 3963 * in each IP into a state suitable for suspend. 3964 * Returns 0 on success, negative error code on failure. 
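 * For SR-IOV VFs, data exchange with the host is stopped and full GPU access
 * is requested before the suspend and released afterwards.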
3965 */ 3966 static int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3967 { 3968 int r; 3969 3970 if (amdgpu_sriov_vf(adev)) { 3971 amdgpu_virt_fini_data_exchange(adev); 3972 amdgpu_virt_request_full_gpu(adev, false); 3973 } 3974 3975 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3976 3977 r = amdgpu_device_ip_suspend_phase1(adev); 3978 if (r) 3979 return r; 3980 r = amdgpu_device_ip_suspend_phase2(adev); 3981 3982 if (amdgpu_sriov_vf(adev)) 3983 amdgpu_virt_release_full_gpu(adev, false); 3984 3985 return r; 3986 } 3987 3988 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3989 { 3990 int i, r; 3991 3992 static enum amd_ip_block_type ip_order[] = { 3993 AMD_IP_BLOCK_TYPE_COMMON, 3994 AMD_IP_BLOCK_TYPE_GMC, 3995 AMD_IP_BLOCK_TYPE_PSP, 3996 AMD_IP_BLOCK_TYPE_IH, 3997 }; 3998 3999 for (i = 0; i < adev->num_ip_blocks; i++) { 4000 int j; 4001 struct amdgpu_ip_block *block; 4002 4003 block = &adev->ip_blocks[i]; 4004 block->status.hw = false; 4005 4006 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 4007 4008 if (block->version->type != ip_order[j] || 4009 !block->status.valid) 4010 continue; 4011 4012 r = block->version->funcs->hw_init(&adev->ip_blocks[i]); 4013 if (r) { 4014 dev_err(adev->dev, "RE-INIT-early: %s failed\n", 4015 block->version->funcs->name); 4016 return r; 4017 } 4018 block->status.hw = true; 4019 } 4020 } 4021 4022 return 0; 4023 } 4024 4025 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 4026 { 4027 struct amdgpu_ip_block *block; 4028 int i, r = 0; 4029 4030 static enum amd_ip_block_type ip_order[] = { 4031 AMD_IP_BLOCK_TYPE_SMC, 4032 AMD_IP_BLOCK_TYPE_DCE, 4033 AMD_IP_BLOCK_TYPE_GFX, 4034 AMD_IP_BLOCK_TYPE_SDMA, 4035 AMD_IP_BLOCK_TYPE_MES, 4036 AMD_IP_BLOCK_TYPE_UVD, 4037 AMD_IP_BLOCK_TYPE_VCE, 4038 AMD_IP_BLOCK_TYPE_VCN, 4039 AMD_IP_BLOCK_TYPE_JPEG 4040 }; 4041 4042 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 4043 block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]); 4044 4045 if (!block) 4046 continue; 4047 4048 if (block->status.valid && !block->status.hw) { 4049 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) { 4050 r = amdgpu_ip_block_resume(block); 4051 } else { 4052 r = block->version->funcs->hw_init(block); 4053 } 4054 4055 if (r) { 4056 dev_err(adev->dev, "RE-INIT-late: %s failed\n", 4057 block->version->funcs->name); 4058 break; 4059 } 4060 block->status.hw = true; 4061 } 4062 } 4063 4064 return r; 4065 } 4066 4067 /** 4068 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 4069 * 4070 * @adev: amdgpu_device pointer 4071 * 4072 * First resume function for hardware IPs. The list of all the hardware 4073 * IPs that make up the asic is walked and the resume callbacks are run for 4074 * COMMON, GMC, and IH. resume puts the hardware into a functional state 4075 * after a suspend and updates the software state as necessary. This 4076 * function is also used for restoring the GPU after a GPU reset. 4077 * Returns 0 on success, negative error code on failure. 
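 * For SR-IOV VFs the PSP block is also resumed in this phase.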
4078 */ 4079 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 4080 { 4081 int i, r; 4082 4083 for (i = 0; i < adev->num_ip_blocks; i++) { 4084 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 4085 continue; 4086 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 4087 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 4088 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 4089 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 4090 4091 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 4092 if (r) 4093 return r; 4094 } 4095 } 4096 4097 return 0; 4098 } 4099 4100 /** 4101 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 4102 * 4103 * @adev: amdgpu_device pointer 4104 * 4105 * Second resume function for hardware IPs. The list of all the hardware 4106 * IPs that make up the asic is walked and the resume callbacks are run for 4107 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 4108 * functional state after a suspend and updates the software state as 4109 * necessary. This function is also used for restoring the GPU after a GPU 4110 * reset. 4111 * Returns 0 on success, negative error code on failure. 4112 */ 4113 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 4114 { 4115 int i, r; 4116 4117 for (i = 0; i < adev->num_ip_blocks; i++) { 4118 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 4119 continue; 4120 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 4121 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 4122 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 4123 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE || 4124 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 4125 continue; 4126 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 4127 if (r) 4128 return r; 4129 } 4130 4131 return 0; 4132 } 4133 4134 /** 4135 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs 4136 * 4137 * @adev: amdgpu_device pointer 4138 * 4139 * Third resume function for hardware IPs. The list of all the hardware 4140 * IPs that make up the asic is walked and the resume callbacks are run for 4141 * all DCE. resume puts the hardware into a functional state after a suspend 4142 * and updates the software state as necessary. This function is also used 4143 * for restoring the GPU after a GPU reset. 4144 * 4145 * Returns 0 on success, negative error code on failure. 4146 */ 4147 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev) 4148 { 4149 int i, r; 4150 4151 for (i = 0; i < adev->num_ip_blocks; i++) { 4152 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 4153 continue; 4154 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) { 4155 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 4156 if (r) 4157 return r; 4158 } 4159 } 4160 4161 return 0; 4162 } 4163 4164 /** 4165 * amdgpu_device_ip_resume - run resume for hardware IPs 4166 * 4167 * @adev: amdgpu_device pointer 4168 * 4169 * Main resume function for hardware IPs. The hardware IPs 4170 * are split into two resume functions because they are 4171 * also used in recovering from a GPU reset and some additional 4172 * steps need to be take between them. In this case (S3/S4) they are 4173 * run sequentially. 4174 * Returns 0 on success, negative error code on failure. 
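 * The full sequence is: resume phase1 (COMMON, GMC, IH), firmware loading,
 * resume phase2, fence driver hardware init, then resume phase3 (DCE).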
4175  */
4176 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
4177 {
4178 	int r;
4179 
4180 	r = amdgpu_device_ip_resume_phase1(adev);
4181 	if (r)
4182 		return r;
4183 
4184 	r = amdgpu_device_fw_loading(adev);
4185 	if (r)
4186 		return r;
4187 
4188 	r = amdgpu_device_ip_resume_phase2(adev);
4189 
4190 	if (adev->mman.buffer_funcs_ring->sched.ready)
4191 		amdgpu_ttm_set_buffer_funcs_status(adev, true);
4192 
4193 	if (r)
4194 		return r;
4195 
4196 	amdgpu_fence_driver_hw_init(adev);
4197 
4198 	r = amdgpu_device_ip_resume_phase3(adev);
4199 
4200 	return r;
4201 }
4202 
4203 /**
4204  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
4205  *
4206  * @adev: amdgpu_device pointer
4207  *
4208  * Query the VBIOS data tables to determine if the board supports SR-IOV.
4209  */
4210 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
4211 {
4212 	if (amdgpu_sriov_vf(adev)) {
4213 		if (adev->is_atom_fw) {
4214 			if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
4215 				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
4216 		} else {
4217 			if (amdgpu_atombios_has_gpu_virtualization_table(adev))
4218 				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
4219 		}
4220 
4221 		if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
4222 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
4223 	}
4224 }
4225 
4226 /**
4227  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
4228  *
4229  * @pdev: pci device context
4230  * @asic_type: AMD asic type
4231  *
4232  * Check if there is DC (new modesetting infrastructure) support for an asic.
4233  * Returns true if DC has support, false if not.
4234  */
4235 bool amdgpu_device_asic_has_dc_support(struct pci_dev *pdev,
4236 				       enum amd_asic_type asic_type)
4237 {
4238 	switch (asic_type) {
4239 #ifdef CONFIG_DRM_AMDGPU_SI
4240 	case CHIP_HAINAN:
4241 #endif
4242 	case CHIP_TOPAZ:
4243 		/* chips with no display hardware */
4244 		return false;
4245 #if defined(CONFIG_DRM_AMD_DC)
4246 	case CHIP_TAHITI:
4247 	case CHIP_PITCAIRN:
4248 	case CHIP_VERDE:
4249 	case CHIP_OLAND:
4250 		/*
4251 		 * We have systems in the wild with these ASICs that require
4252 		 * LVDS and VGA support which is not supported with DC.
4253 		 *
4254 		 * Fall back to the non-DC driver here by default so as not to
4255 		 * cause regressions.
4256 		 */
4257 #if defined(CONFIG_DRM_AMD_DC_SI)
4258 		return amdgpu_dc > 0;
4259 #else
4260 		return false;
4261 #endif
4262 	case CHIP_KAVERI:
4263 	case CHIP_KABINI:
4264 	case CHIP_MULLINS:
4265 		/*
4266 		 * We have systems in the wild with these ASICs that require
4267 		 * VGA support which is not supported with DC.
4268 		 *
4269 		 * Fall back to the non-DC driver here by default so as not to
4270 		 * cause regressions.
4271 */ 4272 return amdgpu_dc > 0; 4273 default: 4274 return amdgpu_dc != 0; 4275 #else 4276 default: 4277 if (amdgpu_dc > 0) 4278 dev_info_once( 4279 &pdev->dev, 4280 "Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 4281 return false; 4282 #endif 4283 } 4284 } 4285 4286 /** 4287 * amdgpu_device_has_dc_support - check if dc is supported 4288 * 4289 * @adev: amdgpu_device pointer 4290 * 4291 * Returns true for supported, false for not supported 4292 */ 4293 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 4294 { 4295 if (adev->enable_virtual_display || 4296 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 4297 return false; 4298 4299 return amdgpu_device_asic_has_dc_support(adev->pdev, adev->asic_type); 4300 } 4301 4302 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 4303 { 4304 struct amdgpu_device *adev = 4305 container_of(__work, struct amdgpu_device, xgmi_reset_work); 4306 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 4307 4308 /* It's a bug to not have a hive within this function */ 4309 if (WARN_ON(!hive)) 4310 return; 4311 4312 /* 4313 * Use task barrier to synchronize all xgmi reset works across the 4314 * hive. task_barrier_enter and task_barrier_exit will block 4315 * until all the threads running the xgmi reset works reach 4316 * those points. task_barrier_full will do both blocks. 4317 */ 4318 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 4319 4320 task_barrier_enter(&hive->tb); 4321 adev->asic_reset_res = amdgpu_device_baco_enter(adev); 4322 4323 if (adev->asic_reset_res) 4324 goto fail; 4325 4326 task_barrier_exit(&hive->tb); 4327 adev->asic_reset_res = amdgpu_device_baco_exit(adev); 4328 4329 if (adev->asic_reset_res) 4330 goto fail; 4331 4332 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 4333 } else { 4334 4335 task_barrier_full(&hive->tb); 4336 adev->asic_reset_res = amdgpu_asic_reset(adev); 4337 } 4338 4339 fail: 4340 if (adev->asic_reset_res) 4341 dev_warn(adev->dev, 4342 "ASIC reset failed with error, %d for drm dev, %s", 4343 adev->asic_reset_res, adev_to_drm(adev)->unique); 4344 amdgpu_put_xgmi_hive(hive); 4345 } 4346 4347 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 4348 { 4349 char *input = amdgpu_lockup_timeout; 4350 char *timeout_setting = NULL; 4351 int index = 0; 4352 long timeout; 4353 int ret = 0; 4354 4355 /* By default timeout for all queues is 2 sec */ 4356 adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout = 4357 adev->video_timeout = msecs_to_jiffies(2000); 4358 4359 if (!strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) 4360 return 0; 4361 4362 while ((timeout_setting = strsep(&input, ",")) && 4363 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4364 ret = kstrtol(timeout_setting, 0, &timeout); 4365 if (ret) 4366 return ret; 4367 4368 if (timeout == 0) { 4369 index++; 4370 continue; 4371 } else if (timeout < 0) { 4372 timeout = MAX_SCHEDULE_TIMEOUT; 4373 dev_warn(adev->dev, "lockup timeout disabled"); 4374 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 4375 } else { 4376 timeout = msecs_to_jiffies(timeout); 4377 } 4378 4379 switch (index++) { 4380 case 0: 4381 adev->gfx_timeout = timeout; 4382 break; 4383 case 1: 4384 adev->compute_timeout = timeout; 4385 break; 4386 case 2: 4387 adev->sdma_timeout = timeout; 4388 break; 4389 case 3: 4390 adev->video_timeout = timeout; 4391 break; 4392 default: 4393 break; 4394 } 4395 } 4396 4397 /* When only one value specified apply it 
to all queues. */
4398 	if (index == 1)
4399 		adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout =
4400 			adev->video_timeout = timeout;
4401 
4402 	return ret;
4403 }
4404 
4405 /**
4406  * amdgpu_device_check_iommu_direct_map - check if RAM is direct mapped to the GPU
4407  *
4408  * @adev: amdgpu_device pointer
4409  *
4410  * RAM is direct mapped to the GPU if the IOMMU is not enabled or is in passthrough mode.
4411  */
4412 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
4413 {
4414 	struct iommu_domain *domain;
4415 
4416 	domain = iommu_get_domain_for_dev(adev->dev);
4417 	if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
4418 		adev->ram_is_direct_mapped = true;
4419 }
4420 
4421 #if defined(CONFIG_HSA_AMD_P2P)
4422 /**
4423  * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled.
4424  *
4425  * @adev: amdgpu_device pointer
4426  *
4427  * Returns true if the IOMMU is remapping the BAR address.
4428  */
4429 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev)
4430 {
4431 	struct iommu_domain *domain;
4432 
4433 	domain = iommu_get_domain_for_dev(adev->dev);
4434 	if (domain && (domain->type == IOMMU_DOMAIN_DMA ||
4435 		       domain->type == IOMMU_DOMAIN_DMA_FQ))
4436 		return true;
4437 
4438 	return false;
4439 }
4440 #endif
4441 
4442 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
4443 {
4444 	if (amdgpu_mcbp == 1)
4445 		adev->gfx.mcbp = true;
4446 	else if (amdgpu_mcbp == 0)
4447 		adev->gfx.mcbp = false;
4448 
4449 	if (amdgpu_sriov_vf(adev))
4450 		adev->gfx.mcbp = true;
4451 
4452 	if (adev->gfx.mcbp)
4453 		dev_info(adev->dev, "MCBP is enabled\n");
4454 }
4455 
4456 static int amdgpu_device_sys_interface_init(struct amdgpu_device *adev)
4457 {
4458 	int r;
4459 
4460 	r = amdgpu_atombios_sysfs_init(adev);
4461 	if (r)
4462 		drm_err(&adev->ddev,
4463 			"registering atombios sysfs failed (%d).\n", r);
4464 
4465 	r = amdgpu_pm_sysfs_init(adev);
4466 	if (r)
4467 		dev_err(adev->dev, "registering pm sysfs failed (%d).\n", r);
4468 
4469 	r = amdgpu_ucode_sysfs_init(adev);
4470 	if (r) {
4471 		adev->ucode_sysfs_en = false;
4472 		dev_err(adev->dev, "Creating firmware sysfs failed (%d).\n", r);
4473 	} else
4474 		adev->ucode_sysfs_en = true;
4475 
4476 	r = amdgpu_device_attr_sysfs_init(adev);
4477 	if (r)
4478 		dev_err(adev->dev, "Could not create amdgpu device attr\n");
4479 
4480 	r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group);
4481 	if (r)
4482 		dev_err(adev->dev,
4483 			"Could not create amdgpu board attributes\n");
4484 
4485 	amdgpu_fru_sysfs_init(adev);
4486 	amdgpu_reg_state_sysfs_init(adev);
4487 	amdgpu_xcp_sysfs_init(adev);
4488 
4489 	return r;
4490 }
4491 
4492 static void amdgpu_device_sys_interface_fini(struct amdgpu_device *adev)
4493 {
4494 	if (adev->pm.sysfs_initialized)
4495 		amdgpu_pm_sysfs_fini(adev);
4496 	if (adev->ucode_sysfs_en)
4497 		amdgpu_ucode_sysfs_fini(adev);
4498 	amdgpu_device_attr_sysfs_fini(adev);
4499 	amdgpu_fru_sysfs_fini(adev);
4500 
4501 	amdgpu_reg_state_sysfs_fini(adev);
4502 	amdgpu_xcp_sysfs_fini(adev);
4503 }
4504 
4505 /**
4506  * amdgpu_device_init - initialize the driver
4507  *
4508  * @adev: amdgpu_device pointer
4509  * @flags: driver flags
4510  *
4511  * Initializes the driver info and hw (all asics).
4512  * Returns 0 for success or an error on failure.
4513  * Called at driver startup.
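 * The low bits of @flags (AMD_ASIC_MASK) carry the ASIC type unless it is
 * overridden via amdgpu_force_asic_type.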
4514 */ 4515 int amdgpu_device_init(struct amdgpu_device *adev, 4516 uint32_t flags) 4517 { 4518 struct pci_dev *pdev = adev->pdev; 4519 int r, i; 4520 bool px = false; 4521 u32 max_MBps; 4522 int tmp; 4523 4524 adev->shutdown = false; 4525 adev->flags = flags; 4526 4527 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 4528 adev->asic_type = amdgpu_force_asic_type; 4529 else 4530 adev->asic_type = flags & AMD_ASIC_MASK; 4531 4532 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 4533 if (amdgpu_emu_mode == 1) 4534 adev->usec_timeout *= 10; 4535 adev->gmc.gart_size = 512 * 1024 * 1024; 4536 adev->accel_working = false; 4537 adev->num_rings = 0; 4538 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 4539 adev->mman.buffer_funcs = NULL; 4540 adev->mman.buffer_funcs_ring = NULL; 4541 adev->vm_manager.vm_pte_funcs = NULL; 4542 adev->vm_manager.vm_pte_num_scheds = 0; 4543 adev->gmc.gmc_funcs = NULL; 4544 adev->harvest_ip_mask = 0x0; 4545 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 4546 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 4547 4548 adev->smc_rreg = &amdgpu_invalid_rreg; 4549 adev->smc_wreg = &amdgpu_invalid_wreg; 4550 adev->pcie_rreg = &amdgpu_invalid_rreg; 4551 adev->pcie_wreg = &amdgpu_invalid_wreg; 4552 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 4553 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 4554 adev->pciep_rreg = &amdgpu_invalid_rreg; 4555 adev->pciep_wreg = &amdgpu_invalid_wreg; 4556 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 4557 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 4558 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 4559 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 4560 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 4561 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 4562 adev->didt_rreg = &amdgpu_invalid_rreg; 4563 adev->didt_wreg = &amdgpu_invalid_wreg; 4564 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 4565 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 4566 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 4567 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 4568 4569 dev_info( 4570 adev->dev, 4571 "initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 4572 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 4573 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 4574 4575 /* mutex initialization are all done here so we 4576 * can recall function without having locking issues 4577 */ 4578 mutex_init(&adev->firmware.mutex); 4579 mutex_init(&adev->pm.mutex); 4580 mutex_init(&adev->gfx.gpu_clock_mutex); 4581 mutex_init(&adev->srbm_mutex); 4582 mutex_init(&adev->gfx.pipe_reserve_mutex); 4583 mutex_init(&adev->gfx.gfx_off_mutex); 4584 mutex_init(&adev->gfx.partition_mutex); 4585 mutex_init(&adev->grbm_idx_mutex); 4586 mutex_init(&adev->mn_lock); 4587 mutex_init(&adev->virt.vf_errors.lock); 4588 hash_init(adev->mn_hash); 4589 mutex_init(&adev->psp.mutex); 4590 mutex_init(&adev->notifier_lock); 4591 mutex_init(&adev->pm.stable_pstate_ctx_lock); 4592 mutex_init(&adev->benchmark_mutex); 4593 mutex_init(&adev->gfx.reset_sem_mutex); 4594 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */ 4595 mutex_init(&adev->enforce_isolation_mutex); 4596 for (i = 0; i < MAX_XCP; ++i) { 4597 adev->isolation[i].spearhead = dma_fence_get_stub(); 4598 amdgpu_sync_create(&adev->isolation[i].active); 4599 amdgpu_sync_create(&adev->isolation[i].prev); 4600 } 4601 mutex_init(&adev->gfx.userq_sch_mutex); 4602 
mutex_init(&adev->gfx.workload_profile_mutex);
4603 	mutex_init(&adev->vcn.workload_profile_mutex);
4604 
4605 	amdgpu_device_init_apu_flags(adev);
4606 
4607 	r = amdgpu_device_check_arguments(adev);
4608 	if (r)
4609 		return r;
4610 
4611 	spin_lock_init(&adev->mmio_idx_lock);
4612 	spin_lock_init(&adev->smc_idx_lock);
4613 	spin_lock_init(&adev->pcie_idx_lock);
4614 	spin_lock_init(&adev->uvd_ctx_idx_lock);
4615 	spin_lock_init(&adev->didt_idx_lock);
4616 	spin_lock_init(&adev->gc_cac_idx_lock);
4617 	spin_lock_init(&adev->se_cac_idx_lock);
4618 	spin_lock_init(&adev->audio_endpt_idx_lock);
4619 	spin_lock_init(&adev->mm_stats.lock);
4620 	spin_lock_init(&adev->virt.rlcg_reg_lock);
4621 	spin_lock_init(&adev->wb.lock);
4622 
4623 	xa_init_flags(&adev->userq_xa, XA_FLAGS_LOCK_IRQ);
4624 
4625 	INIT_LIST_HEAD(&adev->reset_list);
4626 
4627 	INIT_LIST_HEAD(&adev->ras_list);
4628 
4629 	INIT_LIST_HEAD(&adev->pm.od_kobj_list);
4630 
4631 	xa_init(&adev->userq_doorbell_xa);
4632 
4633 	INIT_DELAYED_WORK(&adev->delayed_init_work,
4634 			  amdgpu_device_delayed_init_work_handler);
4635 	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
4636 			  amdgpu_device_delay_enable_gfx_off);
4637 	/*
4638 	 * Initialize the enforce_isolation work structures for each XCP
4639 	 * partition. This work handler is responsible for enforcing shader
4640 	 * isolation on AMD GPUs. It counts the number of emitted fences for
4641 	 * each GFX and compute ring. If there are any fences, it schedules
4642 	 * the `enforce_isolation_work` to be run after a delay. If there are
4643 	 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the
4644 	 * runqueue.
4645 	 */
4646 	for (i = 0; i < MAX_XCP; i++) {
4647 		INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work,
4648 				  amdgpu_gfx_enforce_isolation_handler);
4649 		adev->gfx.enforce_isolation[i].adev = adev;
4650 		adev->gfx.enforce_isolation[i].xcp_id = i;
4651 	}
4652 
4653 	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
4654 	INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work);
4655 
4656 	adev->gfx.gfx_off_req_count = 1;
4657 	adev->gfx.gfx_off_residency = 0;
4658 	adev->gfx.gfx_off_entrycount = 0;
4659 	adev->pm.ac_power = power_supply_is_system_supplied() > 0;
4660 
4661 	atomic_set(&adev->throttling_logging_enabled, 1);
4662 	/*
4663 	 * If throttling continues, logging will be performed every minute
4664 	 * to avoid log flooding. "-1" is subtracted since the thermal
4665 	 * throttling interrupt comes every second. Thus, the total logging
4666 	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
4667 	 * for throttling interrupt) = 60 seconds.
4668 	 */
4669 	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
4670 
4671 	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
4672 
4673 	/* Registers mapping */
4674 	/* TODO: block userspace mapping of io register */
4675 	if (adev->asic_type >= CHIP_BONAIRE) {
4676 		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
4677 		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
4678 	} else {
4679 		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
4680 		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
4681 	}
4682 
4683 	for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
4684 		atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
4685 
4686 	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
4687 	if (!adev->rmmio)
4688 		return -ENOMEM;
4689 
4690 	dev_info(adev->dev, "register mmio base: 0x%08X\n",
4691 		 (uint32_t)adev->rmmio_base);
4692 	dev_info(adev->dev, "register mmio size: %u\n",
4693 		 (unsigned int)adev->rmmio_size);
4694 
4695 	/*
4696 	 * The reset domain needs to be present early, before any XGMI hive is
4697 	 * discovered and initialized, so that the reset sem and in_gpu reset
4698 	 * flag can be used early in init and before the first call to RREG32.
4699 	 */
4700 	adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
4701 	if (!adev->reset_domain)
4702 		return -ENOMEM;
4703 
4704 	/* detect hw virtualization here */
4705 	amdgpu_virt_init(adev);
4706 
4707 	amdgpu_device_get_pcie_info(adev);
4708 
4709 	r = amdgpu_device_get_job_timeout_settings(adev);
4710 	if (r) {
4711 		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
4712 		return r;
4713 	}
4714 
4715 	amdgpu_device_set_mcbp(adev);
4716 
4717 	/*
4718 	 * By default, use the default level where all blocks are expected to
4719 	 * be initialized. At present a 'swinit' of blocks is required to be
4720 	 * completed before the need for a different level is detected.
4721 	 */
4722 	amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT);
4723 	/* early init functions */
4724 	r = amdgpu_device_ip_early_init(adev);
4725 	if (r)
4726 		return r;
4727 
4728 	/*
4729 	 * No need to remove conflicting FBs for non-display class devices.
4730 	 * This prevents the sysfb from being freed accidentally.
4731 	 */
4732 	if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA ||
4733 	    (pdev->class >> 8) == PCI_CLASS_DISPLAY_OTHER) {
4734 		/* Get rid of things like offb */
4735 		r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name);
4736 		if (r)
4737 			return r;
4738 	}
4739 
4740 	/* Enable TMZ based on IP_VERSION */
4741 	amdgpu_gmc_tmz_set(adev);
4742 
4743 	if (amdgpu_sriov_vf(adev) &&
4744 	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
4745 		/* VF MMIO access (except mailbox range) from CPU
4746 		 * will be blocked during sriov runtime
4747 		 */
4748 		adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;
4749 
4750 	amdgpu_gmc_noretry_set(adev);
4751 	/* Need to get xgmi info early to decide the reset behavior */
4752 	if (adev->gmc.xgmi.supported) {
4753 		r = adev->gfxhub.funcs->get_xgmi_info(adev);
4754 		if (r)
4755 			return r;
4756 	}
4757 
4758 	/* enable PCIE atomic ops */
4759 	if (amdgpu_sriov_vf(adev)) {
4760 		if (adev->virt.fw_reserve.p_pf2vf)
4761 			adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
4762 				adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
4763 				(PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
4764 	/* APUs with gfx9 and newer don't rely on PCIe atomics; an internal
4765 	 * path natively supports atomics, so set have_atomics_support to true.
4766 */ 4767 } else if ((adev->flags & AMD_IS_APU) && 4768 (amdgpu_ip_version(adev, GC_HWIP, 0) > 4769 IP_VERSION(9, 0, 0))) { 4770 adev->have_atomics_support = true; 4771 } else { 4772 adev->have_atomics_support = 4773 !pci_enable_atomic_ops_to_root(adev->pdev, 4774 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 4775 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4776 } 4777 4778 if (!adev->have_atomics_support) 4779 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 4780 4781 /* doorbell bar mapping and doorbell index init*/ 4782 amdgpu_doorbell_init(adev); 4783 4784 if (amdgpu_emu_mode == 1) { 4785 /* post the asic on emulation mode */ 4786 emu_soc_asic_init(adev); 4787 goto fence_driver_init; 4788 } 4789 4790 amdgpu_reset_init(adev); 4791 4792 /* detect if we are with an SRIOV vbios */ 4793 if (adev->bios) 4794 amdgpu_device_detect_sriov_bios(adev); 4795 4796 /* check if we need to reset the asic 4797 * E.g., driver was not cleanly unloaded previously, etc. 4798 */ 4799 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4800 if (adev->gmc.xgmi.num_physical_nodes) { 4801 dev_info(adev->dev, "Pending hive reset.\n"); 4802 amdgpu_set_init_level(adev, 4803 AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 4804 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && 4805 !amdgpu_device_has_display_hardware(adev)) { 4806 r = psp_gpu_reset(adev); 4807 } else { 4808 tmp = amdgpu_reset_method; 4809 /* It should do a default reset when loading or reloading the driver, 4810 * regardless of the module parameter reset_method. 4811 */ 4812 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4813 r = amdgpu_asic_reset(adev); 4814 amdgpu_reset_method = tmp; 4815 } 4816 4817 if (r) { 4818 dev_err(adev->dev, "asic reset on init failed\n"); 4819 goto failed; 4820 } 4821 } 4822 4823 /* Post card if necessary */ 4824 if (amdgpu_device_need_post(adev)) { 4825 if (!adev->bios) { 4826 dev_err(adev->dev, "no vBIOS found\n"); 4827 r = -EINVAL; 4828 goto failed; 4829 } 4830 dev_info(adev->dev, "GPU posting now...\n"); 4831 r = amdgpu_device_asic_init(adev); 4832 if (r) { 4833 dev_err(adev->dev, "gpu post error!\n"); 4834 goto failed; 4835 } 4836 } 4837 4838 if (adev->bios) { 4839 if (adev->is_atom_fw) { 4840 /* Initialize clocks */ 4841 r = amdgpu_atomfirmware_get_clock_info(adev); 4842 if (r) { 4843 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4844 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4845 goto failed; 4846 } 4847 } else { 4848 /* Initialize clocks */ 4849 r = amdgpu_atombios_get_clock_info(adev); 4850 if (r) { 4851 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4852 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4853 goto failed; 4854 } 4855 /* init i2c buses */ 4856 amdgpu_i2c_init(adev); 4857 } 4858 } 4859 4860 fence_driver_init: 4861 /* Fence driver */ 4862 r = amdgpu_fence_driver_sw_init(adev); 4863 if (r) { 4864 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4865 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4866 goto failed; 4867 } 4868 4869 /* init the mode config */ 4870 drm_mode_config_init(adev_to_drm(adev)); 4871 4872 r = amdgpu_device_ip_init(adev); 4873 if (r) { 4874 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4875 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4876 goto release_ras_con; 4877 } 4878 4879 amdgpu_fence_driver_hw_init(adev); 4880 4881 dev_info(adev->dev, 4882 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4883 
adev->gfx.config.max_shader_engines,
4884 		 adev->gfx.config.max_sh_per_se,
4885 		 adev->gfx.config.max_cu_per_sh,
4886 		 adev->gfx.cu_info.number);
4887 
4888 	adev->accel_working = true;
4889 
4890 	amdgpu_vm_check_compute_bug(adev);
4891 
4892 	/* Initialize the buffer migration limit. */
4893 	if (amdgpu_moverate >= 0)
4894 		max_MBps = amdgpu_moverate;
4895 	else
4896 		max_MBps = 8; /* Allow 8 MB/s. */
4897 	/* Get a log2 for easy divisions. */
4898 	adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
4899 
4900 	/*
4901 	 * Register the gpu instance before amdgpu_device_enable_mgpu_fan_boost.
4902 	 * Otherwise the mgpu fan boost feature will be skipped because the
4903 	 * gpu instance count would be too low.
4904 	 */
4905 	amdgpu_register_gpu_instance(adev);
4906 
4907 	/* enable clockgating, etc. after ib tests, etc. since some blocks require
4908 	 * explicit gating rather than handling it automatically.
4909 	 */
4910 	if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
4911 		r = amdgpu_device_ip_late_init(adev);
4912 		if (r) {
4913 			dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
4914 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
4915 			goto release_ras_con;
4916 		}
4917 		/* must succeed. */
4918 		amdgpu_ras_resume(adev);
4919 		queue_delayed_work(system_wq, &adev->delayed_init_work,
4920 				   msecs_to_jiffies(AMDGPU_RESUME_MS));
4921 	}
4922 
4923 	if (amdgpu_sriov_vf(adev)) {
4924 		amdgpu_virt_release_full_gpu(adev, true);
4925 		flush_delayed_work(&adev->delayed_init_work);
4926 	}
4927 
4928 	if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
4929 		amdgpu_xgmi_reset_on_init(adev);
4930 	/*
4931 	 * Place the sysfs registration after `late_init`, as some of the
4932 	 * operations performed in `late_init` might affect how the sysfs
4933 	 * interfaces are created.
4934 	 */
4935 	r = amdgpu_device_sys_interface_init(adev);
4936 
4937 	if (IS_ENABLED(CONFIG_PERF_EVENTS))
4938 		r = amdgpu_pmu_init(adev);
4939 	if (r)
4940 		dev_err(adev->dev, "amdgpu_pmu_init failed\n");
4941 
4942 	/* Have stored pci confspace at hand for restore in sudden PCI error */
4943 	if (amdgpu_device_cache_pci_state(adev->pdev))
4944 		pci_restore_state(pdev);
4945 
4946 	/* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
4947 	/* this will fail for cards that aren't VGA class devices, just
4948 	 * ignore it
4949 	 */
4950 	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4951 		vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
4952 
4953 	px = amdgpu_device_supports_px(adev);
4954 
4955 	if (px || (!dev_is_removable(&adev->pdev->dev) &&
4956 		   apple_gmux_detect(NULL, NULL)))
4957 		vga_switcheroo_register_client(adev->pdev,
4958 					       &amdgpu_switcheroo_ops, px);
4959 
4960 	if (px)
4961 		vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
4962 
4963 	amdgpu_device_check_iommu_direct_map(adev);
4964 
4965 	adev->pm_nb.notifier_call = amdgpu_device_pm_notifier;
4966 	r = register_pm_notifier(&adev->pm_nb);
4967 	if (r)
4968 		goto failed;
4969 
4970 	return 0;
4971 
4972 release_ras_con:
4973 	if (amdgpu_sriov_vf(adev))
4974 		amdgpu_virt_release_full_gpu(adev, true);
4975 
4976 	/* failed in exclusive mode due to timeout */
4977 	if (amdgpu_sriov_vf(adev) &&
4978 	    !amdgpu_sriov_runtime(adev) &&
4979 	    amdgpu_virt_mmio_blocked(adev) &&
4980 	    !amdgpu_virt_wait_reset(adev)) {
4981 		dev_err(adev->dev, "VF exclusive mode timeout\n");
4982 		/* Don't send request since VF is inactive.
*/ 4983 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4984 adev->virt.ops = NULL; 4985 r = -EAGAIN; 4986 } 4987 amdgpu_release_ras_context(adev); 4988 4989 failed: 4990 amdgpu_vf_error_trans_all(adev); 4991 4992 return r; 4993 } 4994 4995 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4996 { 4997 4998 /* Clear all CPU mappings pointing to this device */ 4999 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 5000 5001 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 5002 amdgpu_doorbell_fini(adev); 5003 5004 iounmap(adev->rmmio); 5005 adev->rmmio = NULL; 5006 if (adev->mman.aper_base_kaddr) 5007 iounmap(adev->mman.aper_base_kaddr); 5008 adev->mman.aper_base_kaddr = NULL; 5009 5010 /* Memory manager related */ 5011 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 5012 arch_phys_wc_del(adev->gmc.vram_mtrr); 5013 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 5014 } 5015 } 5016 5017 /** 5018 * amdgpu_device_fini_hw - tear down the driver 5019 * 5020 * @adev: amdgpu_device pointer 5021 * 5022 * Tear down the driver info (all asics). 5023 * Called at driver shutdown. 5024 */ 5025 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 5026 { 5027 dev_info(adev->dev, "amdgpu: finishing device.\n"); 5028 flush_delayed_work(&adev->delayed_init_work); 5029 5030 if (adev->mman.initialized) 5031 drain_workqueue(adev->mman.bdev.wq); 5032 adev->shutdown = true; 5033 5034 unregister_pm_notifier(&adev->pm_nb); 5035 5036 /* make sure IB test finished before entering exclusive mode 5037 * to avoid preemption on IB test 5038 */ 5039 if (amdgpu_sriov_vf(adev)) { 5040 amdgpu_virt_request_full_gpu(adev, false); 5041 amdgpu_virt_fini_data_exchange(adev); 5042 } 5043 5044 /* disable all interrupts */ 5045 amdgpu_irq_disable_all(adev); 5046 if (adev->mode_info.mode_config_initialized) { 5047 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 5048 drm_helper_force_disable_all(adev_to_drm(adev)); 5049 else 5050 drm_atomic_helper_shutdown(adev_to_drm(adev)); 5051 } 5052 amdgpu_fence_driver_hw_fini(adev); 5053 5054 amdgpu_device_sys_interface_fini(adev); 5055 5056 /* disable ras feature must before hw fini */ 5057 amdgpu_ras_pre_fini(adev); 5058 5059 amdgpu_ttm_set_buffer_funcs_status(adev, false); 5060 5061 amdgpu_device_ip_fini_early(adev); 5062 5063 amdgpu_irq_fini_hw(adev); 5064 5065 if (adev->mman.initialized) 5066 ttm_device_clear_dma_mappings(&adev->mman.bdev); 5067 5068 amdgpu_gart_dummy_page_fini(adev); 5069 5070 if (drm_dev_is_unplugged(adev_to_drm(adev))) 5071 amdgpu_device_unmap_mmio(adev); 5072 5073 } 5074 5075 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 5076 { 5077 int i, idx; 5078 bool px; 5079 5080 amdgpu_device_ip_fini(adev); 5081 amdgpu_fence_driver_sw_fini(adev); 5082 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 5083 adev->accel_working = false; 5084 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 5085 for (i = 0; i < MAX_XCP; ++i) { 5086 dma_fence_put(adev->isolation[i].spearhead); 5087 amdgpu_sync_free(&adev->isolation[i].active); 5088 amdgpu_sync_free(&adev->isolation[i].prev); 5089 } 5090 5091 amdgpu_reset_fini(adev); 5092 5093 /* free i2c buses */ 5094 amdgpu_i2c_fini(adev); 5095 5096 if (adev->bios) { 5097 if (amdgpu_emu_mode != 1) 5098 amdgpu_atombios_fini(adev); 5099 amdgpu_bios_release(adev); 5100 } 5101 5102 kfree(adev->fru_info); 5103 adev->fru_info = NULL; 5104 5105 kfree(adev->xcp_mgr); 5106 adev->xcp_mgr = NULL; 5107 5108 px = amdgpu_device_supports_px(adev); 5109 5110 if 
(px || (!dev_is_removable(&adev->pdev->dev) && 5111 apple_gmux_detect(NULL, NULL))) 5112 vga_switcheroo_unregister_client(adev->pdev); 5113 5114 if (px) 5115 vga_switcheroo_fini_domain_pm_ops(adev->dev); 5116 5117 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 5118 vga_client_unregister(adev->pdev); 5119 5120 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 5121 5122 iounmap(adev->rmmio); 5123 adev->rmmio = NULL; 5124 drm_dev_exit(idx); 5125 } 5126 5127 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 5128 amdgpu_pmu_fini(adev); 5129 if (adev->discovery.bin) 5130 amdgpu_discovery_fini(adev); 5131 5132 amdgpu_reset_put_reset_domain(adev->reset_domain); 5133 adev->reset_domain = NULL; 5134 5135 kfree(adev->pci_state); 5136 kfree(adev->pcie_reset_ctx.swds_pcistate); 5137 kfree(adev->pcie_reset_ctx.swus_pcistate); 5138 } 5139 5140 /** 5141 * amdgpu_device_evict_resources - evict device resources 5142 * @adev: amdgpu device object 5143 * 5144 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 5145 * of the vram memory type. Mainly used for evicting device resources 5146 * at suspend time. 5147 * 5148 */ 5149 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 5150 { 5151 int ret; 5152 5153 /* No need to evict vram on APUs unless going to S4 */ 5154 if (!adev->in_s4 && (adev->flags & AMD_IS_APU)) 5155 return 0; 5156 5157 /* No need to evict when going to S5 through S4 callbacks */ 5158 if (system_state == SYSTEM_POWER_OFF) 5159 return 0; 5160 5161 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 5162 if (ret) { 5163 dev_warn(adev->dev, "evicting device resources failed\n"); 5164 return ret; 5165 } 5166 5167 if (adev->in_s4) { 5168 ret = ttm_device_prepare_hibernation(&adev->mman.bdev); 5169 if (ret) 5170 dev_err(adev->dev, "prepare hibernation failed, %d\n", ret); 5171 } 5172 return ret; 5173 } 5174 5175 /* 5176 * Suspend & resume. 5177 */ 5178 /** 5179 * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events 5180 * @nb: notifier block 5181 * @mode: suspend mode 5182 * @data: data 5183 * 5184 * This function is called when the system is about to suspend or hibernate. 5185 * It is used to set the appropriate flags so that eviction can be optimized 5186 * in the pm prepare callback. 5187 */ 5188 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 5189 void *data) 5190 { 5191 struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb); 5192 5193 switch (mode) { 5194 case PM_HIBERNATION_PREPARE: 5195 adev->in_s4 = true; 5196 break; 5197 case PM_POST_HIBERNATION: 5198 adev->in_s4 = false; 5199 break; 5200 } 5201 5202 return NOTIFY_DONE; 5203 } 5204 5205 /** 5206 * amdgpu_device_prepare - prepare for device suspend 5207 * 5208 * @dev: drm dev pointer 5209 * 5210 * Prepare to put the hw in the suspend state (all asics). 5211 * Returns 0 for success or an error on failure. 5212 * Called at driver suspend. 
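 * Device resources are evicted first, then each IP block's prepare_suspend
 * callback (if implemented) is invoked.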
5213 */ 5214 int amdgpu_device_prepare(struct drm_device *dev) 5215 { 5216 struct amdgpu_device *adev = drm_to_adev(dev); 5217 int i, r; 5218 5219 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5220 return 0; 5221 5222 /* Evict the majority of BOs before starting suspend sequence */ 5223 r = amdgpu_device_evict_resources(adev); 5224 if (r) 5225 return r; 5226 5227 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 5228 5229 for (i = 0; i < adev->num_ip_blocks; i++) { 5230 if (!adev->ip_blocks[i].status.valid) 5231 continue; 5232 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 5233 continue; 5234 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]); 5235 if (r) 5236 return r; 5237 } 5238 5239 return 0; 5240 } 5241 5242 /** 5243 * amdgpu_device_complete - complete power state transition 5244 * 5245 * @dev: drm dev pointer 5246 * 5247 * Undo the changes from amdgpu_device_prepare. This will be 5248 * called on all resume transitions, including those that failed. 5249 */ 5250 void amdgpu_device_complete(struct drm_device *dev) 5251 { 5252 struct amdgpu_device *adev = drm_to_adev(dev); 5253 int i; 5254 5255 for (i = 0; i < adev->num_ip_blocks; i++) { 5256 if (!adev->ip_blocks[i].status.valid) 5257 continue; 5258 if (!adev->ip_blocks[i].version->funcs->complete) 5259 continue; 5260 adev->ip_blocks[i].version->funcs->complete(&adev->ip_blocks[i]); 5261 } 5262 } 5263 5264 /** 5265 * amdgpu_device_suspend - initiate device suspend 5266 * 5267 * @dev: drm dev pointer 5268 * @notify_clients: notify in-kernel DRM clients 5269 * 5270 * Puts the hw in the suspend state (all asics). 5271 * Returns 0 for success or an error on failure. 5272 * Called at driver suspend. 5273 */ 5274 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) 5275 { 5276 struct amdgpu_device *adev = drm_to_adev(dev); 5277 int r, rec; 5278 5279 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5280 return 0; 5281 5282 adev->in_suspend = true; 5283 5284 if (amdgpu_sriov_vf(adev)) { 5285 if (!adev->in_runpm) 5286 amdgpu_amdkfd_suspend_process(adev); 5287 amdgpu_virt_fini_data_exchange(adev); 5288 r = amdgpu_virt_request_full_gpu(adev, false); 5289 if (r) 5290 return r; 5291 } 5292 5293 r = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D3); 5294 if (r) 5295 goto unwind_sriov; 5296 5297 if (notify_clients) 5298 drm_client_dev_suspend(adev_to_drm(adev)); 5299 5300 cancel_delayed_work_sync(&adev->delayed_init_work); 5301 5302 amdgpu_ras_suspend(adev); 5303 5304 r = amdgpu_device_ip_suspend_phase1(adev); 5305 if (r) 5306 goto unwind_smartshift; 5307 5308 amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); 5309 r = amdgpu_userq_suspend(adev); 5310 if (r) 5311 goto unwind_ip_phase1; 5312 5313 r = amdgpu_device_evict_resources(adev); 5314 if (r) 5315 goto unwind_userq; 5316 5317 amdgpu_ttm_set_buffer_funcs_status(adev, false); 5318 5319 amdgpu_fence_driver_hw_fini(adev); 5320 5321 r = amdgpu_device_ip_suspend_phase2(adev); 5322 if (r) 5323 goto unwind_evict; 5324 5325 if (amdgpu_sriov_vf(adev)) 5326 amdgpu_virt_release_full_gpu(adev, false); 5327 5328 return 0; 5329 5330 unwind_evict: 5331 if (adev->mman.buffer_funcs_ring->sched.ready) 5332 amdgpu_ttm_set_buffer_funcs_status(adev, true); 5333 amdgpu_fence_driver_hw_init(adev); 5334 5335 unwind_userq: 5336 rec = amdgpu_userq_resume(adev); 5337 if (rec) { 5338 dev_warn(adev->dev, "failed to re-initialize user queues: %d\n", rec); 5339 return r; 5340 } 5341 rec = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) 
&& !adev->in_runpm); 5342 if (rec) { 5343 dev_warn(adev->dev, "failed to re-initialize kfd: %d\n", rec); 5344 return r; 5345 } 5346 5347 unwind_ip_phase1: 5348 /* suspend phase 1 = resume phase 3 */ 5349 rec = amdgpu_device_ip_resume_phase3(adev); 5350 if (rec) { 5351 dev_warn(adev->dev, "failed to re-initialize IPs phase1: %d\n", rec); 5352 return r; 5353 } 5354 5355 unwind_smartshift: 5356 rec = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0); 5357 if (rec) { 5358 dev_warn(adev->dev, "failed to re-update smart shift: %d\n", rec); 5359 return r; 5360 } 5361 5362 if (notify_clients) 5363 drm_client_dev_resume(adev_to_drm(adev)); 5364 5365 amdgpu_ras_resume(adev); 5366 5367 unwind_sriov: 5368 if (amdgpu_sriov_vf(adev)) { 5369 rec = amdgpu_virt_request_full_gpu(adev, true); 5370 if (rec) { 5371 dev_warn(adev->dev, "failed to reinitialize sriov: %d\n", rec); 5372 return r; 5373 } 5374 } 5375 5376 adev->in_suspend = adev->in_s0ix = adev->in_s3 = false; 5377 5378 return r; 5379 } 5380 5381 static inline int amdgpu_virt_resume(struct amdgpu_device *adev) 5382 { 5383 int r; 5384 unsigned int prev_physical_node_id = adev->gmc.xgmi.physical_node_id; 5385 5386 /* During VM resume, QEMU programming of VF MSIX table (register GFXMSIX_VECT0_ADDR_LO) 5387 * may not work. The access could be blocked by nBIF protection as VF isn't in 5388 * exclusive access mode. Exclusive access is enabled now, disable/enable MSIX 5389 * so that QEMU reprograms MSIX table. 5390 */ 5391 amdgpu_restore_msix(adev); 5392 5393 r = adev->gfxhub.funcs->get_xgmi_info(adev); 5394 if (r) 5395 return r; 5396 5397 dev_info(adev->dev, "xgmi node, old id %d, new id %d\n", 5398 prev_physical_node_id, adev->gmc.xgmi.physical_node_id); 5399 5400 adev->vm_manager.vram_base_offset = adev->gfxhub.funcs->get_mc_fb_offset(adev); 5401 adev->vm_manager.vram_base_offset += 5402 adev->gmc.xgmi.physical_node_id * adev->gmc.xgmi.node_segment_size; 5403 5404 return 0; 5405 } 5406 5407 /** 5408 * amdgpu_device_resume - initiate device resume 5409 * 5410 * @dev: drm dev pointer 5411 * @notify_clients: notify in-kernel DRM clients 5412 * 5413 * Bring the hw back to operating state (all asics). 5414 * Returns 0 for success or an error on failure. 5415 * Called at driver resume. 
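 * For SR-IOV VFs, full GPU access is held for the duration of the resume.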
5416 */ 5417 int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) 5418 { 5419 struct amdgpu_device *adev = drm_to_adev(dev); 5420 int r = 0; 5421 5422 if (amdgpu_sriov_vf(adev)) { 5423 r = amdgpu_virt_request_full_gpu(adev, true); 5424 if (r) 5425 return r; 5426 } 5427 5428 if (amdgpu_virt_xgmi_migrate_enabled(adev)) { 5429 r = amdgpu_virt_resume(adev); 5430 if (r) 5431 goto exit; 5432 } 5433 5434 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5435 return 0; 5436 5437 if (adev->in_s0ix) 5438 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 5439 5440 /* post card */ 5441 if (amdgpu_device_need_post(adev)) { 5442 r = amdgpu_device_asic_init(adev); 5443 if (r) 5444 dev_err(adev->dev, "amdgpu asic init failed\n"); 5445 } 5446 5447 r = amdgpu_device_ip_resume(adev); 5448 5449 if (r) { 5450 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 5451 goto exit; 5452 } 5453 5454 r = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); 5455 if (r) 5456 goto exit; 5457 5458 r = amdgpu_userq_resume(adev); 5459 if (r) 5460 goto exit; 5461 5462 r = amdgpu_device_ip_late_init(adev); 5463 if (r) 5464 goto exit; 5465 5466 queue_delayed_work(system_wq, &adev->delayed_init_work, 5467 msecs_to_jiffies(AMDGPU_RESUME_MS)); 5468 exit: 5469 if (amdgpu_sriov_vf(adev)) { 5470 amdgpu_virt_init_data_exchange(adev); 5471 amdgpu_virt_release_full_gpu(adev, true); 5472 5473 if (!r && !adev->in_runpm) 5474 r = amdgpu_amdkfd_resume_process(adev); 5475 } 5476 5477 if (r) 5478 return r; 5479 5480 /* Make sure IB tests flushed */ 5481 flush_delayed_work(&adev->delayed_init_work); 5482 5483 if (notify_clients) 5484 drm_client_dev_resume(adev_to_drm(adev)); 5485 5486 amdgpu_ras_resume(adev); 5487 5488 if (adev->mode_info.num_crtc) { 5489 /* 5490 * Most of the connector probing functions try to acquire runtime pm 5491 * refs to ensure that the GPU is powered on when connector polling is 5492 * performed. Since we're calling this from a runtime PM callback, 5493 * trying to acquire rpm refs will cause us to deadlock. 5494 * 5495 * Since we're guaranteed to be holding the rpm lock, it's safe to 5496 * temporarily disable the rpm helpers so this doesn't deadlock us. 5497 */ 5498 #ifdef CONFIG_PM 5499 dev->dev->power.disable_depth++; 5500 #endif 5501 if (!adev->dc_enabled) 5502 drm_helper_hpd_irq_event(dev); 5503 else 5504 drm_kms_helper_hotplug_event(dev); 5505 #ifdef CONFIG_PM 5506 dev->dev->power.disable_depth--; 5507 #endif 5508 } 5509 5510 amdgpu_vram_mgr_clear_reset_blocks(adev); 5511 adev->in_suspend = false; 5512 5513 if (amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0)) 5514 dev_warn(adev->dev, "smart shift update failed\n"); 5515 5516 return 0; 5517 } 5518 5519 /** 5520 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 5521 * 5522 * @adev: amdgpu_device pointer 5523 * 5524 * The list of all the hardware IPs that make up the asic is walked and 5525 * the check_soft_reset callbacks are run. check_soft_reset determines 5526 * if the asic is still hung or not. 5527 * Returns true if any of the IPs are still in a hung state, false if not. 
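 * SR-IOV VFs and ASICs that require a full reset are reported as hung
 * unconditionally.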
5528 */ 5529 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 5530 { 5531 int i; 5532 bool asic_hang = false; 5533 5534 if (amdgpu_sriov_vf(adev)) 5535 return true; 5536 5537 if (amdgpu_asic_need_full_reset(adev)) 5538 return true; 5539 5540 for (i = 0; i < adev->num_ip_blocks; i++) { 5541 if (!adev->ip_blocks[i].status.valid) 5542 continue; 5543 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 5544 adev->ip_blocks[i].status.hang = 5545 adev->ip_blocks[i].version->funcs->check_soft_reset( 5546 &adev->ip_blocks[i]); 5547 if (adev->ip_blocks[i].status.hang) { 5548 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 5549 asic_hang = true; 5550 } 5551 } 5552 return asic_hang; 5553 } 5554 5555 /** 5556 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 5557 * 5558 * @adev: amdgpu_device pointer 5559 * 5560 * The list of all the hardware IPs that make up the asic is walked and the 5561 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 5562 * handles any IP specific hardware or software state changes that are 5563 * necessary for a soft reset to succeed. 5564 * Returns 0 on success, negative error code on failure. 5565 */ 5566 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 5567 { 5568 int i, r = 0; 5569 5570 for (i = 0; i < adev->num_ip_blocks; i++) { 5571 if (!adev->ip_blocks[i].status.valid) 5572 continue; 5573 if (adev->ip_blocks[i].status.hang && 5574 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 5575 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]); 5576 if (r) 5577 return r; 5578 } 5579 } 5580 5581 return 0; 5582 } 5583 5584 /** 5585 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 5586 * 5587 * @adev: amdgpu_device pointer 5588 * 5589 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 5590 * reset is necessary to recover. 5591 * Returns true if a full asic reset is required, false if not. 5592 */ 5593 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 5594 { 5595 int i; 5596 5597 if (amdgpu_asic_need_full_reset(adev)) 5598 return true; 5599 5600 for (i = 0; i < adev->num_ip_blocks; i++) { 5601 if (!adev->ip_blocks[i].status.valid) 5602 continue; 5603 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 5604 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 5605 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 5606 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 5607 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 5608 if (adev->ip_blocks[i].status.hang) { 5609 dev_info(adev->dev, "Some block need full reset!\n"); 5610 return true; 5611 } 5612 } 5613 } 5614 return false; 5615 } 5616 5617 /** 5618 * amdgpu_device_ip_soft_reset - do a soft reset 5619 * 5620 * @adev: amdgpu_device pointer 5621 * 5622 * The list of all the hardware IPs that make up the asic is walked and the 5623 * soft_reset callbacks are run if the block is hung. soft_reset handles any 5624 * IP specific hardware or software state changes that are necessary to soft 5625 * reset the IP. 5626 * Returns 0 on success, negative error code on failure. 
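 * Only IP blocks flagged as hung by check_soft_reset have their soft_reset
 * callback invoked.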
5627 */ 5628 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 5629 { 5630 int i, r = 0; 5631 5632 for (i = 0; i < adev->num_ip_blocks; i++) { 5633 if (!adev->ip_blocks[i].status.valid) 5634 continue; 5635 if (adev->ip_blocks[i].status.hang && 5636 adev->ip_blocks[i].version->funcs->soft_reset) { 5637 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]); 5638 if (r) 5639 return r; 5640 } 5641 } 5642 5643 return 0; 5644 } 5645 5646 /** 5647 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 5648 * 5649 * @adev: amdgpu_device pointer 5650 * 5651 * The list of all the hardware IPs that make up the asic is walked and the 5652 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 5653 * handles any IP specific hardware or software state changes that are 5654 * necessary after the IP has been soft reset. 5655 * Returns 0 on success, negative error code on failure. 5656 */ 5657 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 5658 { 5659 int i, r = 0; 5660 5661 for (i = 0; i < adev->num_ip_blocks; i++) { 5662 if (!adev->ip_blocks[i].status.valid) 5663 continue; 5664 if (adev->ip_blocks[i].status.hang && 5665 adev->ip_blocks[i].version->funcs->post_soft_reset) 5666 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]); 5667 if (r) 5668 return r; 5669 } 5670 5671 return 0; 5672 } 5673 5674 /** 5675 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 5676 * 5677 * @adev: amdgpu_device pointer 5678 * @reset_context: amdgpu reset context pointer 5679 * 5680 * do VF FLR and reinitialize Asic 5681 * return 0 means succeeded otherwise failed 5682 */ 5683 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 5684 struct amdgpu_reset_context *reset_context) 5685 { 5686 int r; 5687 struct amdgpu_hive_info *hive = NULL; 5688 5689 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { 5690 if (!amdgpu_ras_get_fed_status(adev)) 5691 amdgpu_virt_ready_to_reset(adev); 5692 amdgpu_virt_wait_reset(adev); 5693 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5694 r = amdgpu_virt_request_full_gpu(adev, true); 5695 } else { 5696 r = amdgpu_virt_reset_gpu(adev); 5697 } 5698 if (r) 5699 return r; 5700 5701 amdgpu_ras_clear_err_state(adev); 5702 amdgpu_irq_gpu_reset_resume_helper(adev); 5703 5704 /* some sw clean up VF needs to do before recover */ 5705 amdgpu_virt_post_reset(adev); 5706 5707 /* Resume IP prior to SMC */ 5708 r = amdgpu_device_ip_reinit_early_sriov(adev); 5709 if (r) 5710 return r; 5711 5712 amdgpu_virt_init_data_exchange(adev); 5713 5714 r = amdgpu_device_fw_loading(adev); 5715 if (r) 5716 return r; 5717 5718 /* now we are okay to resume SMC/CP/SDMA */ 5719 r = amdgpu_device_ip_reinit_late_sriov(adev); 5720 if (r) 5721 return r; 5722 5723 hive = amdgpu_get_xgmi_hive(adev); 5724 /* Update PSP FW topology after reset */ 5725 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5726 r = amdgpu_xgmi_update_topology(hive, adev); 5727 if (hive) 5728 amdgpu_put_xgmi_hive(hive); 5729 if (r) 5730 return r; 5731 5732 r = amdgpu_ib_ring_tests(adev); 5733 if (r) 5734 return r; 5735 5736 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) 5737 amdgpu_inc_vram_lost(adev); 5738 5739 /* need to be called during full access so we can't do it later like 5740 * bare-metal does. 
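 * Full access here refers to the window between the amdgpu_virt_request_full_gpu()
 * call above and the amdgpu_virt_release_full_gpu() call right below.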
5741 */ 5742 amdgpu_amdkfd_post_reset(adev); 5743 amdgpu_virt_release_full_gpu(adev, true); 5744 5745 /* Aldebaran and gfx_11_0_3 support RAS in SRIOV, so RAS needs to be resumed during reset */ 5746 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) || 5747 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 5748 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 5749 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) || 5750 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5751 amdgpu_ras_resume(adev); 5752 5753 amdgpu_virt_ras_telemetry_post_reset(adev); 5754 5755 return 0; 5756 } 5757 5758 /** 5759 * amdgpu_device_has_job_running - check if there is any unfinished job 5760 * 5761 * @adev: amdgpu_device pointer 5762 * 5763 * Check if there is any job running on the device when the guest driver 5764 * receives an FLR notification from the host driver. If there are still jobs 5765 * running, the guest driver will not respond to the FLR; instead, the jobs are 5766 * left to hit their timeout and the guest driver then issues the reset request. 5767 */ 5768 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5769 { 5770 int i; 5771 5772 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5773 struct amdgpu_ring *ring = adev->rings[i]; 5774 5775 if (!amdgpu_ring_sched_ready(ring)) 5776 continue; 5777 5778 if (amdgpu_fence_count_emitted(ring)) 5779 return true; 5780 } 5781 return false; 5782 } 5783 5784 /** 5785 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5786 * 5787 * @adev: amdgpu_device pointer 5788 * 5789 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5790 * a hung GPU. 5791 */ 5792 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5793 { 5794 5795 if (amdgpu_gpu_recovery == 0) 5796 goto disabled; 5797 5798 /* Skip soft reset check in fatal error mode */ 5799 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5800 return true; 5801 5802 if (amdgpu_sriov_vf(adev)) 5803 return true; 5804 5805 if (amdgpu_gpu_recovery == -1) { 5806 switch (adev->asic_type) { 5807 #ifdef CONFIG_DRM_AMDGPU_SI 5808 case CHIP_VERDE: 5809 case CHIP_TAHITI: 5810 case CHIP_PITCAIRN: 5811 case CHIP_OLAND: 5812 case CHIP_HAINAN: 5813 #endif 5814 #ifdef CONFIG_DRM_AMDGPU_CIK 5815 case CHIP_KAVERI: 5816 case CHIP_KABINI: 5817 case CHIP_MULLINS: 5818 #endif 5819 case CHIP_CARRIZO: 5820 case CHIP_STONEY: 5821 case CHIP_CYAN_SKILLFISH: 5822 goto disabled; 5823 default: 5824 break; 5825 } 5826 } 5827 5828 return true; 5829 5830 disabled: 5831 dev_info(adev->dev, "GPU recovery disabled.\n"); 5832 return false; 5833 } 5834 5835 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5836 { 5837 u32 i; 5838 int ret = 0; 5839 5840 if (adev->bios) 5841 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5842 5843 dev_info(adev->dev, "GPU mode1 reset\n"); 5844 5845 /* Cache the state before disabling bus mastering. The saved config space 5846 * values are also used in other cases, like the restore after a mode-2 reset.
5847 */ 5848 amdgpu_device_cache_pci_state(adev->pdev); 5849 5850 /* disable BM */ 5851 pci_clear_master(adev->pdev); 5852 5853 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5854 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5855 ret = amdgpu_dpm_mode1_reset(adev); 5856 } else { 5857 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5858 ret = psp_gpu_reset(adev); 5859 } 5860 5861 if (ret) 5862 goto mode1_reset_failed; 5863 5864 amdgpu_device_load_pci_state(adev->pdev); 5865 ret = amdgpu_psp_wait_for_bootloader(adev); 5866 if (ret) 5867 goto mode1_reset_failed; 5868 5869 /* wait for asic to come out of reset */ 5870 for (i = 0; i < adev->usec_timeout; i++) { 5871 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5872 5873 if (memsize != 0xffffffff) 5874 break; 5875 udelay(1); 5876 } 5877 5878 if (i >= adev->usec_timeout) { 5879 ret = -ETIMEDOUT; 5880 goto mode1_reset_failed; 5881 } 5882 5883 if (adev->bios) 5884 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5885 5886 return 0; 5887 5888 mode1_reset_failed: 5889 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5890 return ret; 5891 } 5892 5893 int amdgpu_device_link_reset(struct amdgpu_device *adev) 5894 { 5895 int ret = 0; 5896 5897 dev_info(adev->dev, "GPU link reset\n"); 5898 5899 if (!amdgpu_reset_in_dpc(adev)) 5900 ret = amdgpu_dpm_link_reset(adev); 5901 5902 if (ret) 5903 goto link_reset_failed; 5904 5905 ret = amdgpu_psp_wait_for_bootloader(adev); 5906 if (ret) 5907 goto link_reset_failed; 5908 5909 return 0; 5910 5911 link_reset_failed: 5912 dev_err(adev->dev, "GPU link reset failed\n"); 5913 return ret; 5914 } 5915 5916 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5917 struct amdgpu_reset_context *reset_context) 5918 { 5919 int i, r = 0; 5920 struct amdgpu_job *job = NULL; 5921 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; 5922 bool need_full_reset = 5923 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5924 5925 if (reset_context->reset_req_dev == adev) 5926 job = reset_context->job; 5927 5928 if (amdgpu_sriov_vf(adev)) 5929 amdgpu_virt_pre_reset(adev); 5930 5931 amdgpu_fence_driver_isr_toggle(adev, true); 5932 5933 /* block all schedulers and reset given job's ring */ 5934 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5935 struct amdgpu_ring *ring = adev->rings[i]; 5936 5937 if (!amdgpu_ring_sched_ready(ring)) 5938 continue; 5939 5940 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5941 amdgpu_fence_driver_force_completion(ring); 5942 } 5943 5944 amdgpu_fence_driver_isr_toggle(adev, false); 5945 5946 if (job && job->vm) 5947 drm_sched_increase_karma(&job->base); 5948 5949 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5950 /* If reset handler not implemented, continue; otherwise return */ 5951 if (r == -EOPNOTSUPP) 5952 r = 0; 5953 else 5954 return r; 5955 5956 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5957 if (!amdgpu_sriov_vf(adev)) { 5958 5959 if (!need_full_reset) 5960 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5961 5962 if (!need_full_reset && amdgpu_gpu_recovery && 5963 amdgpu_device_ip_check_soft_reset(adev)) { 5964 amdgpu_device_ip_pre_soft_reset(adev); 5965 r = amdgpu_device_ip_soft_reset(adev); 5966 amdgpu_device_ip_post_soft_reset(adev); 5967 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5968 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5969 need_full_reset = true; 5970 } 5971 } 5972 5973 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { 
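/* Snapshot the per-IP register state while it is still intact, before the
 * ASIC itself is reset; the AMDGPU_SKIP_COREDUMP flag suppresses this dump as
 * well as the core dump taken later in amdgpu_device_reinit_after_reset().
 */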
5974 dev_info(tmp_adev->dev, "Dumping IP State\n"); 5975 /* Trigger ip dump before we reset the asic */ 5976 for (i = 0; i < tmp_adev->num_ip_blocks; i++) 5977 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) 5978 tmp_adev->ip_blocks[i].version->funcs 5979 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]); 5980 dev_info(tmp_adev->dev, "Dumping IP State Completed\n"); 5981 } 5982 5983 if (need_full_reset) 5984 r = amdgpu_device_ip_suspend(adev); 5985 if (need_full_reset) 5986 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5987 else 5988 clear_bit(AMDGPU_NEED_FULL_RESET, 5989 &reset_context->flags); 5990 } 5991 5992 return r; 5993 } 5994 5995 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) 5996 { 5997 struct list_head *device_list_handle; 5998 bool full_reset, vram_lost = false; 5999 struct amdgpu_device *tmp_adev; 6000 int r, init_level; 6001 6002 device_list_handle = reset_context->reset_device_list; 6003 6004 if (!device_list_handle) 6005 return -EINVAL; 6006 6007 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 6008 6009 /** 6010 * If it's reset on init, it's default init level, otherwise keep level 6011 * as recovery level. 6012 */ 6013 if (reset_context->method == AMD_RESET_METHOD_ON_INIT) 6014 init_level = AMDGPU_INIT_LEVEL_DEFAULT; 6015 else 6016 init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY; 6017 6018 r = 0; 6019 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6020 amdgpu_set_init_level(tmp_adev, init_level); 6021 if (full_reset) { 6022 /* post card */ 6023 amdgpu_reset_set_dpc_status(tmp_adev, false); 6024 amdgpu_ras_clear_err_state(tmp_adev); 6025 r = amdgpu_device_asic_init(tmp_adev); 6026 if (r) { 6027 dev_warn(tmp_adev->dev, "asic atom init failed!"); 6028 } else { 6029 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 6030 6031 r = amdgpu_device_ip_resume_phase1(tmp_adev); 6032 if (r) 6033 goto out; 6034 6035 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 6036 6037 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 6038 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job); 6039 6040 if (vram_lost) { 6041 dev_info( 6042 tmp_adev->dev, 6043 "VRAM is lost due to GPU reset!\n"); 6044 amdgpu_inc_vram_lost(tmp_adev); 6045 } 6046 6047 r = amdgpu_device_fw_loading(tmp_adev); 6048 if (r) 6049 return r; 6050 6051 r = amdgpu_xcp_restore_partition_mode( 6052 tmp_adev->xcp_mgr); 6053 if (r) 6054 goto out; 6055 6056 r = amdgpu_device_ip_resume_phase2(tmp_adev); 6057 if (r) 6058 goto out; 6059 6060 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 6061 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 6062 6063 r = amdgpu_device_ip_resume_phase3(tmp_adev); 6064 if (r) 6065 goto out; 6066 6067 if (vram_lost) 6068 amdgpu_device_fill_reset_magic(tmp_adev); 6069 6070 /* 6071 * Add this ASIC as tracked as reset was already 6072 * complete successfully. 6073 */ 6074 amdgpu_register_gpu_instance(tmp_adev); 6075 6076 if (!reset_context->hive && 6077 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 6078 amdgpu_xgmi_add_device(tmp_adev); 6079 6080 r = amdgpu_device_ip_late_init(tmp_adev); 6081 if (r) 6082 goto out; 6083 6084 r = amdgpu_userq_post_reset(tmp_adev, vram_lost); 6085 if (r) 6086 goto out; 6087 6088 drm_client_dev_resume(adev_to_drm(tmp_adev)); 6089 6090 /* 6091 * The GPU enters bad state once faulty pages 6092 * by ECC has reached the threshold, and ras 6093 * recovery is scheduled next. 
So add one check 6094 * here to break recovery if it indeed exceeds 6095 * bad page threshold, and remind user to 6096 * retire this GPU or setting one bigger 6097 * bad_page_threshold value to fix this once 6098 * probing driver again. 6099 */ 6100 if (!amdgpu_ras_is_rma(tmp_adev)) { 6101 /* must succeed. */ 6102 amdgpu_ras_resume(tmp_adev); 6103 } else { 6104 r = -EINVAL; 6105 goto out; 6106 } 6107 6108 /* Update PSP FW topology after reset */ 6109 if (reset_context->hive && 6110 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 6111 r = amdgpu_xgmi_update_topology( 6112 reset_context->hive, tmp_adev); 6113 } 6114 } 6115 6116 out: 6117 if (!r) { 6118 /* IP init is complete now, set level as default */ 6119 amdgpu_set_init_level(tmp_adev, 6120 AMDGPU_INIT_LEVEL_DEFAULT); 6121 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 6122 r = amdgpu_ib_ring_tests(tmp_adev); 6123 if (r) { 6124 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 6125 r = -EAGAIN; 6126 goto end; 6127 } 6128 } 6129 6130 if (r) 6131 tmp_adev->asic_reset_res = r; 6132 } 6133 6134 end: 6135 return r; 6136 } 6137 6138 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 6139 struct amdgpu_reset_context *reset_context) 6140 { 6141 struct amdgpu_device *tmp_adev = NULL; 6142 bool need_full_reset, skip_hw_reset; 6143 int r = 0; 6144 6145 /* Try reset handler method first */ 6146 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 6147 reset_list); 6148 6149 reset_context->reset_device_list = device_list_handle; 6150 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 6151 /* If reset handler not implemented, continue; otherwise return */ 6152 if (r == -EOPNOTSUPP) 6153 r = 0; 6154 else 6155 return r; 6156 6157 /* Reset handler not implemented, use the default method */ 6158 need_full_reset = 6159 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 6160 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 6161 6162 /* 6163 * ASIC reset has to be done on all XGMI hive nodes ASAP 6164 * to allow proper links negotiation in FW (within 1 sec) 6165 */ 6166 if (!skip_hw_reset && need_full_reset) { 6167 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6168 /* For XGMI run all resets in parallel to speed up the process */ 6169 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 6170 if (!queue_work(system_unbound_wq, 6171 &tmp_adev->xgmi_reset_work)) 6172 r = -EALREADY; 6173 } else 6174 r = amdgpu_asic_reset(tmp_adev); 6175 6176 if (r) { 6177 dev_err(tmp_adev->dev, 6178 "ASIC reset failed with error, %d for drm dev, %s", 6179 r, adev_to_drm(tmp_adev)->unique); 6180 goto out; 6181 } 6182 } 6183 6184 /* For XGMI wait for all resets to complete before proceed */ 6185 if (!r) { 6186 list_for_each_entry(tmp_adev, device_list_handle, 6187 reset_list) { 6188 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 6189 flush_work(&tmp_adev->xgmi_reset_work); 6190 r = tmp_adev->asic_reset_res; 6191 if (r) 6192 break; 6193 } 6194 } 6195 } 6196 } 6197 6198 if (!r && amdgpu_ras_intr_triggered()) { 6199 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6200 amdgpu_ras_reset_error_count(tmp_adev, 6201 AMDGPU_RAS_BLOCK__MMHUB); 6202 } 6203 6204 amdgpu_ras_intr_cleared(); 6205 } 6206 6207 r = amdgpu_device_reinit_after_reset(reset_context); 6208 if (r == -EAGAIN) 6209 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 6210 else 6211 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 6212 6213 out: 6214 return r; 6215 } 6216 6217 static void 
amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 6218 { 6219 6220 switch (amdgpu_asic_reset_method(adev)) { 6221 case AMD_RESET_METHOD_MODE1: 6222 case AMD_RESET_METHOD_LINK: 6223 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 6224 break; 6225 case AMD_RESET_METHOD_MODE2: 6226 adev->mp1_state = PP_MP1_STATE_RESET; 6227 break; 6228 default: 6229 adev->mp1_state = PP_MP1_STATE_NONE; 6230 break; 6231 } 6232 } 6233 6234 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 6235 { 6236 amdgpu_vf_error_trans_all(adev); 6237 adev->mp1_state = PP_MP1_STATE_NONE; 6238 } 6239 6240 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 6241 { 6242 struct pci_dev *p = NULL; 6243 6244 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 6245 adev->pdev->bus->number, 1); 6246 if (p) { 6247 pm_runtime_enable(&(p->dev)); 6248 pm_runtime_resume(&(p->dev)); 6249 } 6250 6251 pci_dev_put(p); 6252 } 6253 6254 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 6255 { 6256 enum amd_reset_method reset_method; 6257 struct pci_dev *p = NULL; 6258 u64 expires; 6259 6260 /* 6261 * For now, only BACO and mode1 reset are confirmed 6262 * to suffer the audio issue without proper suspended. 6263 */ 6264 reset_method = amdgpu_asic_reset_method(adev); 6265 if ((reset_method != AMD_RESET_METHOD_BACO) && 6266 (reset_method != AMD_RESET_METHOD_MODE1)) 6267 return -EINVAL; 6268 6269 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 6270 adev->pdev->bus->number, 1); 6271 if (!p) 6272 return -ENODEV; 6273 6274 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 6275 if (!expires) 6276 /* 6277 * If we cannot get the audio device autosuspend delay, 6278 * a fixed 4S interval will be used. Considering 3S is 6279 * the audio controller default autosuspend delay setting. 6280 * 4S used here is guaranteed to cover that. 6281 */ 6282 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 6283 6284 while (!pm_runtime_status_suspended(&(p->dev))) { 6285 if (!pm_runtime_suspend(&(p->dev))) 6286 break; 6287 6288 if (expires < ktime_get_mono_fast_ns()) { 6289 dev_warn(adev->dev, "failed to suspend display audio\n"); 6290 pci_dev_put(p); 6291 /* TODO: abort the succeeding gpu reset? */ 6292 return -ETIMEDOUT; 6293 } 6294 } 6295 6296 pm_runtime_disable(&(p->dev)); 6297 6298 pci_dev_put(p); 6299 return 0; 6300 } 6301 6302 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 6303 { 6304 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 6305 6306 #if defined(CONFIG_DEBUG_FS) 6307 if (!amdgpu_sriov_vf(adev)) 6308 cancel_work(&adev->reset_work); 6309 #endif 6310 cancel_work(&adev->userq_reset_work); 6311 6312 if (adev->kfd.dev) 6313 cancel_work(&adev->kfd.reset_work); 6314 6315 if (amdgpu_sriov_vf(adev)) 6316 cancel_work(&adev->virt.flr_work); 6317 6318 if (con && adev->ras_enabled) 6319 cancel_work(&con->recovery_work); 6320 6321 } 6322 6323 static int amdgpu_device_health_check(struct list_head *device_list_handle) 6324 { 6325 struct amdgpu_device *tmp_adev; 6326 int ret = 0; 6327 6328 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6329 ret |= amdgpu_device_bus_status_check(tmp_adev); 6330 } 6331 6332 return ret; 6333 } 6334 6335 static void amdgpu_device_recovery_prepare(struct amdgpu_device *adev, 6336 struct list_head *device_list, 6337 struct amdgpu_hive_info *hive) 6338 { 6339 struct amdgpu_device *tmp_adev = NULL; 6340 6341 /* 6342 * Build list of devices to reset. 
6343 * In case we are in XGMI hive mode, resort the device list 6344 * to put adev in the 1st position. 6345 */ 6346 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) { 6347 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 6348 list_add_tail(&tmp_adev->reset_list, device_list); 6349 if (adev->shutdown) 6350 tmp_adev->shutdown = true; 6351 if (amdgpu_reset_in_dpc(adev)) 6352 tmp_adev->pcie_reset_ctx.in_link_reset = true; 6353 } 6354 if (!list_is_first(&adev->reset_list, device_list)) 6355 list_rotate_to_front(&adev->reset_list, device_list); 6356 } else { 6357 list_add_tail(&adev->reset_list, device_list); 6358 } 6359 } 6360 6361 static void amdgpu_device_recovery_get_reset_lock(struct amdgpu_device *adev, 6362 struct list_head *device_list) 6363 { 6364 struct amdgpu_device *tmp_adev = NULL; 6365 6366 if (list_empty(device_list)) 6367 return; 6368 tmp_adev = 6369 list_first_entry(device_list, struct amdgpu_device, reset_list); 6370 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 6371 } 6372 6373 static void amdgpu_device_recovery_put_reset_lock(struct amdgpu_device *adev, 6374 struct list_head *device_list) 6375 { 6376 struct amdgpu_device *tmp_adev = NULL; 6377 6378 if (list_empty(device_list)) 6379 return; 6380 tmp_adev = 6381 list_first_entry(device_list, struct amdgpu_device, reset_list); 6382 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 6383 } 6384 6385 static void amdgpu_device_halt_activities(struct amdgpu_device *adev, 6386 struct amdgpu_job *job, 6387 struct amdgpu_reset_context *reset_context, 6388 struct list_head *device_list, 6389 struct amdgpu_hive_info *hive, 6390 bool need_emergency_restart) 6391 { 6392 struct amdgpu_device *tmp_adev = NULL; 6393 int i; 6394 6395 /* block all schedulers and reset given job's ring */ 6396 list_for_each_entry(tmp_adev, device_list, reset_list) { 6397 amdgpu_device_set_mp1_state(tmp_adev); 6398 6399 /* 6400 * Try to put the audio codec into suspend state 6401 * before gpu reset started. 6402 * 6403 * Due to the power domain of the graphics device 6404 * is shared with AZ power domain. Without this, 6405 * we may change the audio hardware from behind 6406 * the audio driver's back. That will trigger 6407 * some audio codec errors. 6408 */ 6409 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 6410 tmp_adev->pcie_reset_ctx.audio_suspended = true; 6411 6412 amdgpu_ras_set_error_query_ready(tmp_adev, false); 6413 6414 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 6415 6416 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context); 6417 6418 /* 6419 * Mark these ASICs to be reset as untracked first 6420 * And add them back after reset completed 6421 */ 6422 amdgpu_unregister_gpu_instance(tmp_adev); 6423 6424 drm_client_dev_suspend(adev_to_drm(tmp_adev)); 6425 6426 /* disable ras on ALL IPs */ 6427 if (!need_emergency_restart && !amdgpu_reset_in_dpc(adev) && 6428 amdgpu_device_ip_need_full_reset(tmp_adev)) 6429 amdgpu_ras_suspend(tmp_adev); 6430 6431 amdgpu_userq_pre_reset(tmp_adev); 6432 6433 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6434 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6435 6436 if (!amdgpu_ring_sched_ready(ring)) 6437 continue; 6438 6439 drm_sched_stop(&ring->sched, job ? 
&job->base : NULL); 6440 6441 if (need_emergency_restart) 6442 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 6443 } 6444 atomic_inc(&tmp_adev->gpu_reset_counter); 6445 } 6446 } 6447 6448 static int amdgpu_device_asic_reset(struct amdgpu_device *adev, 6449 struct list_head *device_list, 6450 struct amdgpu_reset_context *reset_context) 6451 { 6452 struct amdgpu_device *tmp_adev = NULL; 6453 int retry_limit = AMDGPU_MAX_RETRY_LIMIT; 6454 int r = 0; 6455 6456 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 6457 list_for_each_entry(tmp_adev, device_list, reset_list) { 6458 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 6459 /*TODO Should we stop ?*/ 6460 if (r) { 6461 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 6462 r, adev_to_drm(tmp_adev)->unique); 6463 tmp_adev->asic_reset_res = r; 6464 } 6465 } 6466 6467 /* Actual ASIC resets if needed.*/ 6468 /* Host driver will handle XGMI hive reset for SRIOV */ 6469 if (amdgpu_sriov_vf(adev)) { 6470 6471 /* Bail out of reset early */ 6472 if (amdgpu_ras_is_rma(adev)) 6473 return -ENODEV; 6474 6475 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { 6476 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); 6477 amdgpu_ras_set_fed(adev, true); 6478 set_bit(AMDGPU_HOST_FLR, &reset_context->flags); 6479 } 6480 6481 r = amdgpu_device_reset_sriov(adev, reset_context); 6482 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) { 6483 amdgpu_virt_release_full_gpu(adev, true); 6484 goto retry; 6485 } 6486 if (r) 6487 adev->asic_reset_res = r; 6488 } else { 6489 r = amdgpu_do_asic_reset(device_list, reset_context); 6490 if (r && r == -EAGAIN) 6491 goto retry; 6492 } 6493 6494 list_for_each_entry(tmp_adev, device_list, reset_list) { 6495 /* 6496 * Drop any pending non scheduler resets queued before reset is done. 6497 * Any reset scheduled after this point would be valid. Scheduler resets 6498 * were already dropped during drm_sched_stop and no new ones can come 6499 * in before drm_sched_start. 6500 */ 6501 amdgpu_device_stop_pending_resets(tmp_adev); 6502 } 6503 6504 return r; 6505 } 6506 6507 static int amdgpu_device_sched_resume(struct list_head *device_list, 6508 struct amdgpu_reset_context *reset_context, 6509 bool job_signaled) 6510 { 6511 struct amdgpu_device *tmp_adev = NULL; 6512 int i, r = 0; 6513 6514 /* Post ASIC reset for all devs .*/ 6515 list_for_each_entry(tmp_adev, device_list, reset_list) { 6516 6517 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6518 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6519 6520 if (!amdgpu_ring_sched_ready(ring)) 6521 continue; 6522 6523 drm_sched_start(&ring->sched, 0); 6524 } 6525 6526 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 6527 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 6528 6529 if (tmp_adev->asic_reset_res) { 6530 /* bad news, how to tell it to userspace ? 
6531 * for ras error, we should report GPU bad status instead of 6532 * reset failure 6533 */ 6534 if (reset_context->src != AMDGPU_RESET_SRC_RAS || 6535 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) 6536 dev_info( 6537 tmp_adev->dev, 6538 "GPU reset(%d) failed with error %d \n", 6539 atomic_read( 6540 &tmp_adev->gpu_reset_counter), 6541 tmp_adev->asic_reset_res); 6542 amdgpu_vf_error_put(tmp_adev, 6543 AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, 6544 tmp_adev->asic_reset_res); 6545 if (!r) 6546 r = tmp_adev->asic_reset_res; 6547 tmp_adev->asic_reset_res = 0; 6548 } else { 6549 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", 6550 atomic_read(&tmp_adev->gpu_reset_counter)); 6551 if (amdgpu_acpi_smart_shift_update(tmp_adev, 6552 AMDGPU_SS_DEV_D0)) 6553 dev_warn(tmp_adev->dev, 6554 "smart shift update failed\n"); 6555 } 6556 } 6557 6558 return r; 6559 } 6560 6561 static void amdgpu_device_gpu_resume(struct amdgpu_device *adev, 6562 struct list_head *device_list, 6563 bool need_emergency_restart) 6564 { 6565 struct amdgpu_device *tmp_adev = NULL; 6566 6567 list_for_each_entry(tmp_adev, device_list, reset_list) { 6568 /* unlock kfd: SRIOV would do it separately */ 6569 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 6570 amdgpu_amdkfd_post_reset(tmp_adev); 6571 6572 /* kfd_post_reset will do nothing if kfd device is not initialized, 6573 * need to bring up kfd here if it's not be initialized before 6574 */ 6575 if (!adev->kfd.init_complete) 6576 amdgpu_amdkfd_device_init(adev); 6577 6578 if (tmp_adev->pcie_reset_ctx.audio_suspended) 6579 amdgpu_device_resume_display_audio(tmp_adev); 6580 6581 amdgpu_device_unset_mp1_state(tmp_adev); 6582 6583 amdgpu_ras_set_error_query_ready(tmp_adev, true); 6584 6585 } 6586 } 6587 6588 6589 /** 6590 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 6591 * 6592 * @adev: amdgpu_device pointer 6593 * @job: which job trigger hang 6594 * @reset_context: amdgpu reset context pointer 6595 * 6596 * Attempt to reset the GPU if it has hung (all asics). 6597 * Attempt to do soft-reset or full-reset and reinitialize Asic 6598 * Returns 0 for success or an error on failure. 6599 */ 6600 6601 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 6602 struct amdgpu_job *job, 6603 struct amdgpu_reset_context *reset_context) 6604 { 6605 struct list_head device_list; 6606 bool job_signaled = false; 6607 struct amdgpu_hive_info *hive = NULL; 6608 int r = 0; 6609 bool need_emergency_restart = false; 6610 6611 /* 6612 * If it reaches here because of hang/timeout and a RAS error is 6613 * detected at the same time, let RAS recovery take care of it. 6614 */ 6615 if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) && 6616 !amdgpu_sriov_vf(adev) && 6617 reset_context->src != AMDGPU_RESET_SRC_RAS) { 6618 dev_dbg(adev->dev, 6619 "Gpu recovery from source: %d yielding to RAS error recovery handling", 6620 reset_context->src); 6621 return 0; 6622 } 6623 6624 /* 6625 * Special case: RAS triggered and full reset isn't supported 6626 */ 6627 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 6628 6629 /* 6630 * Flush RAM to disk so that after reboot 6631 * the user can read log and see why the system rebooted. 6632 */ 6633 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 6634 amdgpu_ras_get_context(adev)->reboot) { 6635 dev_warn(adev->dev, "Emergency reboot."); 6636 6637 ksys_sync_helper(); 6638 emergency_restart(); 6639 } 6640 6641 dev_info(adev->dev, "GPU %s begin!. Source: %d\n", 6642 need_emergency_restart ? 
"jobs stop" : "reset", 6643 reset_context->src); 6644 6645 if (!amdgpu_sriov_vf(adev)) 6646 hive = amdgpu_get_xgmi_hive(adev); 6647 if (hive) 6648 mutex_lock(&hive->hive_lock); 6649 6650 reset_context->job = job; 6651 reset_context->hive = hive; 6652 INIT_LIST_HEAD(&device_list); 6653 6654 amdgpu_device_recovery_prepare(adev, &device_list, hive); 6655 6656 if (!amdgpu_sriov_vf(adev)) { 6657 r = amdgpu_device_health_check(&device_list); 6658 if (r) 6659 goto end_reset; 6660 } 6661 6662 /* We need to lock reset domain only once both for XGMI and single device */ 6663 amdgpu_device_recovery_get_reset_lock(adev, &device_list); 6664 6665 amdgpu_device_halt_activities(adev, job, reset_context, &device_list, 6666 hive, need_emergency_restart); 6667 if (need_emergency_restart) 6668 goto skip_sched_resume; 6669 /* 6670 * Must check guilty signal here since after this point all old 6671 * HW fences are force signaled. 6672 * 6673 * job->base holds a reference to parent fence 6674 */ 6675 if (job && dma_fence_is_signaled(&job->hw_fence->base)) { 6676 job_signaled = true; 6677 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 6678 goto skip_hw_reset; 6679 } 6680 6681 r = amdgpu_device_asic_reset(adev, &device_list, reset_context); 6682 if (r) 6683 goto reset_unlock; 6684 skip_hw_reset: 6685 r = amdgpu_device_sched_resume(&device_list, reset_context, job_signaled); 6686 if (r) 6687 goto reset_unlock; 6688 skip_sched_resume: 6689 amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart); 6690 reset_unlock: 6691 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 6692 end_reset: 6693 if (hive) { 6694 mutex_unlock(&hive->hive_lock); 6695 amdgpu_put_xgmi_hive(hive); 6696 } 6697 6698 if (r) 6699 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 6700 6701 atomic_set(&adev->reset_domain->reset_res, r); 6702 6703 if (!r) { 6704 struct amdgpu_task_info *ti = NULL; 6705 6706 if (job) 6707 ti = amdgpu_vm_get_task_info_pasid(adev, job->pasid); 6708 6709 drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, 6710 ti ? &ti->task : NULL); 6711 6712 amdgpu_vm_put_task_info(ti); 6713 } 6714 6715 return r; 6716 } 6717 6718 /** 6719 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner 6720 * 6721 * @adev: amdgpu_device pointer 6722 * @speed: pointer to the speed of the link 6723 * @width: pointer to the width of the link 6724 * 6725 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6726 * first physical partner to an AMD dGPU. 6727 * This will exclude any virtual switches and links. 
6728 */ 6729 static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev, 6730 enum pci_bus_speed *speed, 6731 enum pcie_link_width *width) 6732 { 6733 struct pci_dev *parent = adev->pdev; 6734 6735 if (!speed || !width) 6736 return; 6737 6738 *speed = PCI_SPEED_UNKNOWN; 6739 *width = PCIE_LNK_WIDTH_UNKNOWN; 6740 6741 if (amdgpu_device_pcie_dynamic_switching_supported(adev)) { 6742 while ((parent = pci_upstream_bridge(parent))) { 6743 /* skip upstream/downstream switches internal to dGPU */ 6744 if (parent->vendor == PCI_VENDOR_ID_ATI) 6745 continue; 6746 *speed = pcie_get_speed_cap(parent); 6747 *width = pcie_get_width_cap(parent); 6748 break; 6749 } 6750 } else { 6751 /* use the current speeds rather than max if switching is not supported */ 6752 pcie_bandwidth_available(adev->pdev, NULL, speed, width); 6753 } 6754 } 6755 6756 /** 6757 * amdgpu_device_gpu_bandwidth - find the bandwidth of the GPU 6758 * 6759 * @adev: amdgpu_device pointer 6760 * @speed: pointer to the speed of the link 6761 * @width: pointer to the width of the link 6762 * 6763 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6764 * AMD dGPU which may be a virtual upstream bridge. 6765 */ 6766 static void amdgpu_device_gpu_bandwidth(struct amdgpu_device *adev, 6767 enum pci_bus_speed *speed, 6768 enum pcie_link_width *width) 6769 { 6770 struct pci_dev *parent = adev->pdev; 6771 6772 if (!speed || !width) 6773 return; 6774 6775 parent = pci_upstream_bridge(parent); 6776 if (parent && parent->vendor == PCI_VENDOR_ID_ATI) { 6777 /* use the upstream/downstream switches internal to dGPU */ 6778 *speed = pcie_get_speed_cap(parent); 6779 *width = pcie_get_width_cap(parent); 6780 while ((parent = pci_upstream_bridge(parent))) { 6781 if (parent->vendor == PCI_VENDOR_ID_ATI) { 6782 /* use the upstream/downstream switches internal to dGPU */ 6783 *speed = pcie_get_speed_cap(parent); 6784 *width = pcie_get_width_cap(parent); 6785 } 6786 } 6787 } else { 6788 /* use the device itself */ 6789 *speed = pcie_get_speed_cap(adev->pdev); 6790 *width = pcie_get_width_cap(adev->pdev); 6791 } 6792 } 6793 6794 /** 6795 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot 6796 * 6797 * @adev: amdgpu_device pointer 6798 * 6799 * Fetches and stores in the driver the PCIE capabilities (gen speed 6800 * and lanes) of the slot the device is in. Handles APUs and 6801 * virtualized environments where PCIE config space may not be available.
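 * A non-zero amdgpu_pcie_gen_cap or amdgpu_pcie_lane_cap overrides the
 * corresponding detected mask.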
6802 */ 6803 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 6804 { 6805 enum pci_bus_speed speed_cap, platform_speed_cap; 6806 enum pcie_link_width platform_link_width, link_width; 6807 6808 if (amdgpu_pcie_gen_cap) 6809 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 6810 6811 if (amdgpu_pcie_lane_cap) 6812 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 6813 6814 /* covers APUs as well */ 6815 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 6816 if (adev->pm.pcie_gen_mask == 0) 6817 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 6818 if (adev->pm.pcie_mlw_mask == 0) 6819 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 6820 return; 6821 } 6822 6823 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 6824 return; 6825 6826 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap, 6827 &platform_link_width); 6828 amdgpu_device_gpu_bandwidth(adev, &speed_cap, &link_width); 6829 6830 if (adev->pm.pcie_gen_mask == 0) { 6831 /* asic caps */ 6832 if (speed_cap == PCI_SPEED_UNKNOWN) { 6833 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6834 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6835 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6836 } else { 6837 if (speed_cap == PCIE_SPEED_32_0GT) 6838 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6839 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6840 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6841 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6842 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 6843 else if (speed_cap == PCIE_SPEED_16_0GT) 6844 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6845 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6846 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6847 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 6848 else if (speed_cap == PCIE_SPEED_8_0GT) 6849 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6850 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6851 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6852 else if (speed_cap == PCIE_SPEED_5_0GT) 6853 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6854 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 6855 else 6856 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 6857 } 6858 /* platform caps */ 6859 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 6860 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6861 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6862 } else { 6863 if (platform_speed_cap == PCIE_SPEED_32_0GT) 6864 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6865 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6866 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6867 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6868 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 6869 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 6870 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6871 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6872 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6873 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 6874 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 6875 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6876 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6877 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 6878 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 6879 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6880 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6881 else 6882 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 6883 6884 } 6885 } 6886 if (adev->pm.pcie_mlw_mask == 0) { 6887 /* asic caps */ 6888 if (link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6889 
adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_ASIC_PCIE_MLW_MASK; 6890 } else { 6891 switch (link_width) { 6892 case PCIE_LNK_X32: 6893 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X32 | 6894 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6895 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6896 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6897 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6898 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6899 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6900 break; 6901 case PCIE_LNK_X16: 6902 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6903 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6904 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6905 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6906 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6907 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6908 break; 6909 case PCIE_LNK_X12: 6910 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6911 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6912 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6913 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6914 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6915 break; 6916 case PCIE_LNK_X8: 6917 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6918 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6919 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6920 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6921 break; 6922 case PCIE_LNK_X4: 6923 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6924 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6925 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6926 break; 6927 case PCIE_LNK_X2: 6928 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6929 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6930 break; 6931 case PCIE_LNK_X1: 6932 adev->pm.pcie_mlw_mask |= CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1; 6933 break; 6934 default: 6935 break; 6936 } 6937 } 6938 /* platform caps */ 6939 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6940 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 6941 } else { 6942 switch (platform_link_width) { 6943 case PCIE_LNK_X32: 6944 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 6945 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6946 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6947 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6948 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6949 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6950 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6951 break; 6952 case PCIE_LNK_X16: 6953 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6954 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6955 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6956 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6957 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6958 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6959 break; 6960 case PCIE_LNK_X12: 6961 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6962 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6963 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6964 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6965 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6966 break; 6967 case PCIE_LNK_X8: 6968 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6969 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6970 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6971 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6972 break; 6973 case PCIE_LNK_X4: 6974 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6975 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6976 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6977 break; 6978 case PCIE_LNK_X2: 6979 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6980 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6981 break; 6982 case PCIE_LNK_X1: 6983 adev->pm.pcie_mlw_mask |= CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 6984 break; 6985 
default: 6986 break; 6987 } 6988 } 6989 } 6990 } 6991 6992 /** 6993 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 6994 * 6995 * @adev: amdgpu_device pointer 6996 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 6997 * 6998 * Return true if @peer_adev can access (DMA) @adev through the PCIe 6999 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 7000 * @peer_adev. 7001 */ 7002 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 7003 struct amdgpu_device *peer_adev) 7004 { 7005 #ifdef CONFIG_HSA_AMD_P2P 7006 bool p2p_access = 7007 !adev->gmc.xgmi.connected_to_cpu && 7008 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 7009 if (!p2p_access) 7010 dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n", 7011 pci_name(peer_adev->pdev)); 7012 7013 bool is_large_bar = adev->gmc.visible_vram_size && 7014 adev->gmc.real_vram_size == adev->gmc.visible_vram_size; 7015 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev); 7016 7017 if (!p2p_addressable) { 7018 uint64_t address_mask = peer_adev->dev->dma_mask ? 7019 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 7020 resource_size_t aper_limit = 7021 adev->gmc.aper_base + adev->gmc.aper_size - 1; 7022 7023 p2p_addressable = !(adev->gmc.aper_base & address_mask || 7024 aper_limit & address_mask); 7025 } 7026 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable; 7027 #else 7028 return false; 7029 #endif 7030 } 7031 7032 int amdgpu_device_baco_enter(struct amdgpu_device *adev) 7033 { 7034 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 7035 7036 if (!amdgpu_device_supports_baco(adev)) 7037 return -ENOTSUPP; 7038 7039 if (ras && adev->ras_enabled && 7040 adev->nbio.funcs->enable_doorbell_interrupt) 7041 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 7042 7043 return amdgpu_dpm_baco_enter(adev); 7044 } 7045 7046 int amdgpu_device_baco_exit(struct amdgpu_device *adev) 7047 { 7048 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 7049 int ret = 0; 7050 7051 if (!amdgpu_device_supports_baco(adev)) 7052 return -ENOTSUPP; 7053 7054 ret = amdgpu_dpm_baco_exit(adev); 7055 if (ret) 7056 return ret; 7057 7058 if (ras && adev->ras_enabled && 7059 adev->nbio.funcs->enable_doorbell_interrupt) 7060 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 7061 7062 if (amdgpu_passthrough(adev) && adev->nbio.funcs && 7063 adev->nbio.funcs->clear_doorbell_interrupt) 7064 adev->nbio.funcs->clear_doorbell_interrupt(adev); 7065 7066 return 0; 7067 } 7068 7069 /** 7070 * amdgpu_pci_error_detected - Called when a PCI error is detected. 7071 * @pdev: PCI device struct 7072 * @state: PCI channel state 7073 * 7074 * Description: Called when a PCI error is detected. 7075 * 7076 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
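 * PCI_ERS_RESULT_CAN_RECOVER is returned for pci_channel_io_normal errors.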
7077 */ 7078 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 7079 { 7080 struct drm_device *dev = pci_get_drvdata(pdev); 7081 struct amdgpu_device *adev = drm_to_adev(dev); 7082 struct amdgpu_hive_info *hive __free(xgmi_put_hive) = 7083 amdgpu_get_xgmi_hive(adev); 7084 struct amdgpu_reset_context reset_context; 7085 struct list_head device_list; 7086 7087 dev_info(adev->dev, "PCI error: detected callback!!\n"); 7088 7089 adev->pci_channel_state = state; 7090 7091 switch (state) { 7092 case pci_channel_io_normal: 7093 dev_info(adev->dev, "pci_channel_io_normal: state(%d)!!\n", state); 7094 return PCI_ERS_RESULT_CAN_RECOVER; 7095 case pci_channel_io_frozen: 7096 /* Fatal error, prepare for slot reset */ 7097 dev_info(adev->dev, "pci_channel_io_frozen: state(%d)!!\n", state); 7098 if (hive) { 7099 /* Hive devices should be able to support FW based 7100 * link reset on other devices, if not return. 7101 */ 7102 if (!amdgpu_dpm_is_link_reset_supported(adev)) { 7103 dev_warn(adev->dev, 7104 "No support for XGMI hive yet...\n"); 7105 return PCI_ERS_RESULT_DISCONNECT; 7106 } 7107 /* Set dpc status only if device is part of hive 7108 * Non-hive devices should be able to recover after 7109 * link reset. 7110 */ 7111 amdgpu_reset_set_dpc_status(adev, true); 7112 7113 mutex_lock(&hive->hive_lock); 7114 } 7115 memset(&reset_context, 0, sizeof(reset_context)); 7116 INIT_LIST_HEAD(&device_list); 7117 7118 amdgpu_device_recovery_prepare(adev, &device_list, hive); 7119 amdgpu_device_recovery_get_reset_lock(adev, &device_list); 7120 amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list, 7121 hive, false); 7122 if (hive) 7123 mutex_unlock(&hive->hive_lock); 7124 return PCI_ERS_RESULT_NEED_RESET; 7125 case pci_channel_io_perm_failure: 7126 /* Permanent error, prepare for device removal */ 7127 dev_info(adev->dev, "pci_channel_io_perm_failure: state(%d)!!\n", state); 7128 return PCI_ERS_RESULT_DISCONNECT; 7129 } 7130 7131 return PCI_ERS_RESULT_NEED_RESET; 7132 } 7133 7134 /** 7135 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 7136 * @pdev: pointer to PCI device 7137 */ 7138 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 7139 { 7140 struct drm_device *dev = pci_get_drvdata(pdev); 7141 struct amdgpu_device *adev = drm_to_adev(dev); 7142 7143 dev_info(adev->dev, "PCI error: mmio enabled callback!!\n"); 7144 7145 /* TODO - dump whatever for debugging purposes */ 7146 7147 /* This called only if amdgpu_pci_error_detected returns 7148 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 7149 * works, no need to reset slot. 7150 */ 7151 7152 return PCI_ERS_RESULT_RECOVERED; 7153 } 7154 7155 /** 7156 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 7157 * @pdev: PCI device struct 7158 * 7159 * Description: This routine is called by the pci error recovery 7160 * code after the PCI slot has been reset, just before we 7161 * should resume normal operations. 
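 *
 * Return: PCI_ERS_RESULT_RECOVERED on success, PCI_ERS_RESULT_DISCONNECT
 * otherwise.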
7162 */ 7163 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 7164 { 7165 struct drm_device *dev = pci_get_drvdata(pdev); 7166 struct amdgpu_device *adev = drm_to_adev(dev); 7167 struct amdgpu_reset_context reset_context; 7168 struct amdgpu_device *tmp_adev; 7169 struct amdgpu_hive_info *hive; 7170 struct list_head device_list; 7171 struct pci_dev *link_dev; 7172 int r = 0, i, timeout; 7173 u32 memsize; 7174 u16 status; 7175 7176 dev_info(adev->dev, "PCI error: slot reset callback!!\n"); 7177 7178 memset(&reset_context, 0, sizeof(reset_context)); 7179 7180 if (adev->pcie_reset_ctx.swus) 7181 link_dev = adev->pcie_reset_ctx.swus; 7182 else 7183 link_dev = adev->pdev; 7184 /* wait for asic to come out of reset, timeout = 10s */ 7185 timeout = 10000; 7186 do { 7187 usleep_range(10000, 10500); 7188 r = pci_read_config_word(link_dev, PCI_VENDOR_ID, &status); 7189 timeout -= 10; 7190 } while (timeout > 0 && (status != PCI_VENDOR_ID_ATI) && 7191 (status != PCI_VENDOR_ID_AMD)); 7192 7193 if ((status != PCI_VENDOR_ID_ATI) && (status != PCI_VENDOR_ID_AMD)) { 7194 r = -ETIME; 7195 goto out; 7196 } 7197 7198 amdgpu_device_load_switch_state(adev); 7199 /* Restore PCI confspace */ 7200 amdgpu_device_load_pci_state(pdev); 7201 7202 /* confirm ASIC came out of reset */ 7203 for (i = 0; i < adev->usec_timeout; i++) { 7204 memsize = amdgpu_asic_get_config_memsize(adev); 7205 7206 if (memsize != 0xffffffff) 7207 break; 7208 udelay(1); 7209 } 7210 if (memsize == 0xffffffff) { 7211 r = -ETIME; 7212 goto out; 7213 } 7214 7215 reset_context.method = AMD_RESET_METHOD_NONE; 7216 reset_context.reset_req_dev = adev; 7217 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 7218 set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags); 7219 INIT_LIST_HEAD(&device_list); 7220 7221 hive = amdgpu_get_xgmi_hive(adev); 7222 if (hive) { 7223 mutex_lock(&hive->hive_lock); 7224 reset_context.hive = hive; 7225 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 7226 tmp_adev->pcie_reset_ctx.in_link_reset = true; 7227 list_add_tail(&tmp_adev->reset_list, &device_list); 7228 } 7229 } else { 7230 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 7231 list_add_tail(&adev->reset_list, &device_list); 7232 } 7233 7234 r = amdgpu_device_asic_reset(adev, &device_list, &reset_context); 7235 out: 7236 if (!r) { 7237 if (amdgpu_device_cache_pci_state(adev->pdev)) 7238 pci_restore_state(adev->pdev); 7239 dev_info(adev->dev, "PCIe error recovery succeeded\n"); 7240 } else { 7241 dev_err(adev->dev, "PCIe error recovery failed, err:%d\n", r); 7242 if (hive) { 7243 list_for_each_entry(tmp_adev, &device_list, reset_list) 7244 amdgpu_device_unset_mp1_state(tmp_adev); 7245 } 7246 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 7247 } 7248 7249 if (hive) { 7250 mutex_unlock(&hive->hive_lock); 7251 amdgpu_put_xgmi_hive(hive); 7252 } 7253 7254 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 7255 } 7256 7257 /** 7258 * amdgpu_pci_resume() - resume normal ops after PCI reset 7259 * @pdev: pointer to PCI device 7260 * 7261 * Called when the error recovery driver tells us that its 7262 * OK to resume normal operation. 
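 * Scheduler and KFD resume is only performed when the recorded channel state
 * is pci_channel_io_frozen; for any other state this returns immediately.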
7263 */ 7264 void amdgpu_pci_resume(struct pci_dev *pdev) 7265 { 7266 struct drm_device *dev = pci_get_drvdata(pdev); 7267 struct amdgpu_device *adev = drm_to_adev(dev); 7268 struct list_head device_list; 7269 struct amdgpu_hive_info *hive = NULL; 7270 struct amdgpu_device *tmp_adev = NULL; 7271 7272 dev_info(adev->dev, "PCI error: resume callback!!\n"); 7273 7274 /* Only continue execution for the case of pci_channel_io_frozen */ 7275 if (adev->pci_channel_state != pci_channel_io_frozen) 7276 return; 7277 7278 INIT_LIST_HEAD(&device_list); 7279 7280 hive = amdgpu_get_xgmi_hive(adev); 7281 if (hive) { 7282 mutex_lock(&hive->hive_lock); 7283 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 7284 tmp_adev->pcie_reset_ctx.in_link_reset = false; 7285 list_add_tail(&tmp_adev->reset_list, &device_list); 7286 } 7287 } else 7288 list_add_tail(&adev->reset_list, &device_list); 7289 7290 amdgpu_device_sched_resume(&device_list, NULL, NULL); 7291 amdgpu_device_gpu_resume(adev, &device_list, false); 7292 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 7293 7294 if (hive) { 7295 mutex_unlock(&hive->hive_lock); 7296 amdgpu_put_xgmi_hive(hive); 7297 } 7298 } 7299 7300 static void amdgpu_device_cache_switch_state(struct amdgpu_device *adev) 7301 { 7302 struct pci_dev *swus, *swds; 7303 int r; 7304 7305 swds = pci_upstream_bridge(adev->pdev); 7306 if (!swds || swds->vendor != PCI_VENDOR_ID_ATI || 7307 pci_pcie_type(swds) != PCI_EXP_TYPE_DOWNSTREAM) 7308 return; 7309 swus = pci_upstream_bridge(swds); 7310 if (!swus || 7311 (swus->vendor != PCI_VENDOR_ID_ATI && 7312 swus->vendor != PCI_VENDOR_ID_AMD) || 7313 pci_pcie_type(swus) != PCI_EXP_TYPE_UPSTREAM) 7314 return; 7315 7316 /* If already saved, return */ 7317 if (adev->pcie_reset_ctx.swus) 7318 return; 7319 /* Upstream bridge is ATI, assume it's SWUS/DS architecture */ 7320 r = pci_save_state(swds); 7321 if (r) 7322 return; 7323 adev->pcie_reset_ctx.swds_pcistate = pci_store_saved_state(swds); 7324 7325 r = pci_save_state(swus); 7326 if (r) 7327 return; 7328 adev->pcie_reset_ctx.swus_pcistate = pci_store_saved_state(swus); 7329 7330 adev->pcie_reset_ctx.swus = swus; 7331 } 7332 7333 static void amdgpu_device_load_switch_state(struct amdgpu_device *adev) 7334 { 7335 struct pci_dev *pdev; 7336 int r; 7337 7338 if (!adev->pcie_reset_ctx.swds_pcistate || 7339 !adev->pcie_reset_ctx.swus_pcistate) 7340 return; 7341 7342 pdev = adev->pcie_reset_ctx.swus; 7343 r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swus_pcistate); 7344 if (!r) { 7345 pci_restore_state(pdev); 7346 } else { 7347 dev_warn(adev->dev, "Failed to load SWUS state, err:%d\n", r); 7348 return; 7349 } 7350 7351 pdev = pci_upstream_bridge(adev->pdev); 7352 r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swds_pcistate); 7353 if (!r) 7354 pci_restore_state(pdev); 7355 else 7356 dev_warn(adev->dev, "Failed to load SWDS state, err:%d\n", r); 7357 } 7358 7359 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 7360 { 7361 struct drm_device *dev = pci_get_drvdata(pdev); 7362 struct amdgpu_device *adev = drm_to_adev(dev); 7363 int r; 7364 7365 if (amdgpu_sriov_vf(adev)) 7366 return false; 7367 7368 r = pci_save_state(pdev); 7369 if (!r) { 7370 kfree(adev->pci_state); 7371 7372 adev->pci_state = pci_store_saved_state(pdev); 7373 7374 if (!adev->pci_state) { 7375 dev_err(adev->dev, "Failed to store PCI saved state"); 7376 return false; 7377 } 7378 } else { 7379 dev_warn(adev->dev, "Failed to save PCI state, err:%d\n", r); 7380 return false; 7381 } 7382 7383 
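/* Also cache the config space of the AMD/ATI PCIe switch above the GPU (if
 * present) so that amdgpu_device_load_switch_state() can restore it after a
 * PCIe reset.
 */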
amdgpu_device_cache_switch_state(adev); 7384 7385 return true; 7386 } 7387 7388 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 7389 { 7390 struct drm_device *dev = pci_get_drvdata(pdev); 7391 struct amdgpu_device *adev = drm_to_adev(dev); 7392 int r; 7393 7394 if (!adev->pci_state) 7395 return false; 7396 7397 r = pci_load_saved_state(pdev, adev->pci_state); 7398 7399 if (!r) { 7400 pci_restore_state(pdev); 7401 } else { 7402 dev_warn(adev->dev, "Failed to load PCI state, err:%d\n", r); 7403 return false; 7404 } 7405 7406 return true; 7407 } 7408 7409 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 7410 struct amdgpu_ring *ring) 7411 { 7412 #ifdef CONFIG_X86_64 7413 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 7414 return; 7415 #endif 7416 if (adev->gmc.xgmi.connected_to_cpu) 7417 return; 7418 7419 if (ring && ring->funcs->emit_hdp_flush) { 7420 amdgpu_ring_emit_hdp_flush(ring); 7421 return; 7422 } 7423 7424 if (!ring && amdgpu_sriov_runtime(adev)) { 7425 if (!amdgpu_kiq_hdp_flush(adev)) 7426 return; 7427 } 7428 7429 amdgpu_hdp_flush(adev, ring); 7430 } 7431 7432 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 7433 struct amdgpu_ring *ring) 7434 { 7435 #ifdef CONFIG_X86_64 7436 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 7437 return; 7438 #endif 7439 if (adev->gmc.xgmi.connected_to_cpu) 7440 return; 7441 7442 amdgpu_hdp_invalidate(adev, ring); 7443 } 7444 7445 int amdgpu_in_reset(struct amdgpu_device *adev) 7446 { 7447 return atomic_read(&adev->reset_domain->in_gpu_reset); 7448 } 7449 7450 /** 7451 * amdgpu_device_halt() - bring hardware to some kind of halt state 7452 * 7453 * @adev: amdgpu_device pointer 7454 * 7455 * Bring hardware to some kind of halt state so that no one can touch it 7456 * any more. It helps to maintain the error context when an error occurs. 7457 * Compared to a simple hang, the system will stay stable at least for SSH 7458 * access. Then it should be trivial to inspect the hardware state and 7459 * see what's going on. Implemented as follows: 7460 * 7461 * 1. drm_dev_unplug() makes device inaccessible to user space (IOCTLs, etc.), 7462 * clears all CPU mappings to device, disallows remappings through page faults 7463 * 2. amdgpu_irq_disable_all() disables all interrupts 7464 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences 7465 * 4. set adev->no_hw_access to avoid potential crashes after step 5 7466 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings 7467 * 6.
/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring the hardware to some kind of halt state so that nothing can touch it
 * any more. This helps to preserve the error context when an error occurs.
 * Compared to a simple hang, the system stays stable at least for SSH access,
 * so it should be trivial to inspect the hardware state and see what is going
 * on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc),
 *    clears all CPU mappings to the device and disallows remappings through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in-flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	amdgpu_xcp_dev_unplug(adev);
	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				 u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				  u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

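/*
 * Illustrative sketch (comment only, not compiled): the two helpers above
 * implement the usual index/data pair access. The register index is written
 * to the index offset and read back so the write is posted, then the data
 * offset is accessed, all under pcie_idx_lock. A read-modify-write of a PCIe
 * port register therefore looks like the snippet below; PORT_REG and
 * ENABLE_BIT are hypothetical names, not real register definitions.
 *
 *	u32 tmp;
 *
 *	tmp = amdgpu_device_pcie_port_rreg(adev, PORT_REG);
 *	tmp |= ENABLE_BIT;
 *	amdgpu_device_pcie_port_wreg(adev, PORT_REG, tmp);
 */
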
/**
 * amdgpu_device_get_gang - return a reference to the current gang
 * @adev: amdgpu_device pointer
 *
 * Returns: A new reference to the current gang leader.
 */
struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev)
{
	struct dma_fence *fence;

	rcu_read_lock();
	fence = dma_fence_get_rcu_safe(&adev->gang_submit);
	rcu_read_unlock();
	return fence;
}

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	dma_fence_get(gang);
	do {
		dma_fence_put(old);
		old = amdgpu_device_get_gang(adev);
		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old)) {
			dma_fence_put(gang);
			return old;
		}

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	/*
	 * Drop it once for the exchanged reference in adev and once for the
	 * thread local reference acquired in amdgpu_device_get_gang().
	 */
	dma_fence_put(old);
	dma_fence_put(old);
	return NULL;
}

/**
 * amdgpu_device_enforce_isolation - enforce HW isolation
 * @adev: the amdgpu device pointer
 * @ring: the HW ring the job is supposed to run on
 * @job: the job which is about to be pushed to the HW ring
 *
 * Makes sure that only one client at a time can use the GFX block.
 * Returns: The dependency to wait on before the job can be pushed to the HW.
 * The function is called multiple times until NULL is returned.
 */
struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev,
						  struct amdgpu_ring *ring,
						  struct amdgpu_job *job)
{
	struct amdgpu_isolation *isolation = &adev->isolation[ring->xcp_id];
	struct drm_sched_fence *f = job->base.s_fence;
	struct dma_fence *dep;
	void *owner;
	int r;

	/*
	 * For now enforce isolation only for the GFX block since we only need
	 * the cleaner shader on those rings.
	 */
	if (ring->funcs->type != AMDGPU_RING_TYPE_GFX &&
	    ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE)
		return NULL;

	/*
	 * All submissions where enforce isolation is false are handled as if
	 * they come from a single client. Use ~0l as the owner to distinguish
	 * them from kernel submissions where the owner is NULL.
	 */
	owner = job->enforce_isolation ? f->owner : (void *)~0l;

	mutex_lock(&adev->enforce_isolation_mutex);

	/*
	 * The "spearhead" submission is the first one which changes the
	 * ownership to its client. We always need to wait for it to be
	 * pushed to the HW before proceeding with anything.
	 */
	if (&f->scheduled != isolation->spearhead &&
	    !dma_fence_is_signaled(isolation->spearhead)) {
		dep = isolation->spearhead;
		goto out_grab_ref;
	}

	if (isolation->owner != owner) {

		/*
		 * Wait for any gang to be assembled before switching to a
		 * different owner or otherwise we could deadlock the
		 * submissions.
		 */
		if (!job->gang_submit) {
			dep = amdgpu_device_get_gang(adev);
			if (!dma_fence_is_signaled(dep))
				goto out_return_dep;
			dma_fence_put(dep);
		}

		dma_fence_put(isolation->spearhead);
		isolation->spearhead = dma_fence_get(&f->scheduled);
		amdgpu_sync_move(&isolation->active, &isolation->prev);
		trace_amdgpu_isolation(isolation->owner, owner);
		isolation->owner = owner;
	}

	/*
	 * Specifying the ring here helps to pipeline submissions even when
	 * isolation is enabled. If that is not desired for testing, NULL can
	 * be used instead of the ring to enforce a CPU round trip while
	 * switching between clients.
	 */
	dep = amdgpu_sync_peek_fence(&isolation->prev, ring);
	r = amdgpu_sync_fence(&isolation->active, &f->finished, GFP_NOWAIT);
	if (r)
		dev_warn(adev->dev, "OOM tracking isolation\n");

out_grab_ref:
	dma_fence_get(dep);
out_return_dep:
	mutex_unlock(&adev->enforce_isolation_mutex);
	return dep;
}

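/*
 * Illustrative sketch (comment only, not compiled): one way a submission path
 * might honour the contract of amdgpu_device_switch_gang() above. A non-NULL
 * return is a reference to the still-running gang leader that has to be
 * waited on (and released) before retrying; NULL means the switch succeeded.
 * This is only a sketch of the documented return convention, not a copy of
 * the actual caller.
 *
 *	struct dma_fence *old;
 *
 *	while ((old = amdgpu_device_switch_gang(adev, gang))) {
 *		dma_fence_wait(old, false);
 *		dma_fence_put(old);
 *	}
 */
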
bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
		uint32_t inst, uint32_t reg_addr, char reg_name[],
		uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else {
			udelay(1);
		}
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			dev_warn(
				adev->dev,
				"Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				inst, reg_name, (uint32_t)expected_value,
				(uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}

ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
{
	ssize_t size = 0;

	if (!ring || !ring->adev)
		return size;

	if (amdgpu_device_should_recover_gpu(ring->adev))
		size |= AMDGPU_RESET_TYPE_FULL;

	if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
	    !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery)
		size |= AMDGPU_RESET_TYPE_SOFT_RESET;

	return size;
}

ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
{
	ssize_t size = 0;

	if (supported_reset == 0) {
		size += sysfs_emit_at(buf, size, "unsupported");
		size += sysfs_emit_at(buf, size, "\n");
		return size;
	}

	if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
		size += sysfs_emit_at(buf, size, "soft ");

	if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
		size += sysfs_emit_at(buf, size, "queue ");

	if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)
		size += sysfs_emit_at(buf, size, "pipe ");

	if (supported_reset & AMDGPU_RESET_TYPE_FULL)
		size += sysfs_emit_at(buf, size, "full ");

	size += sysfs_emit_at(buf, size, "\n");
	return size;
}

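/*
 * Illustrative sketch (comment only, not compiled): amdgpu_show_reset_mask()
 * above is written against the sysfs_emit_at() contract, so a sysfs "show"
 * callback can simply forward its buffer together with a supported-reset
 * mask. The callback name and the mask field used below are hypothetical
 * stand-ins, not real driver symbols.
 *
 *	static ssize_t example_reset_mask_show(struct device *dev,
 *					       struct device_attribute *attr,
 *					       char *buf)
 *	{
 *		struct drm_device *ddev = dev_get_drvdata(dev);
 *		struct amdgpu_device *adev = drm_to_adev(ddev);
 *
 *		return amdgpu_show_reset_mask(buf, adev->example_supported_reset);
 *	}
 */
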
void amdgpu_device_set_uid(struct amdgpu_uid *uid_info,
			   enum amdgpu_uid_type type, uint8_t inst,
			   uint64_t uid)
{
	if (!uid_info)
		return;

	if (type >= AMDGPU_UID_TYPE_MAX) {
		dev_err_once(uid_info->adev->dev, "Invalid UID type %d\n",
			     type);
		return;
	}

	if (inst >= AMDGPU_UID_INST_MAX) {
		dev_err_once(uid_info->adev->dev, "Invalid UID instance %d\n",
			     inst);
		return;
	}

	if (uid_info->uid[type][inst] != 0) {
		dev_warn_once(
			uid_info->adev->dev,
			"Overwriting existing UID %llu for type %d instance %d\n",
			uid_info->uid[type][inst], type, inst);
	}

	uid_info->uid[type][inst] = uid;
}

u64 amdgpu_device_get_uid(struct amdgpu_uid *uid_info,
			  enum amdgpu_uid_type type, uint8_t inst)
{
	if (!uid_info)
		return 0;

	if (type >= AMDGPU_UID_TYPE_MAX) {
		dev_err_once(uid_info->adev->dev, "Invalid UID type %d\n",
			     type);
		return 0;
	}

	if (inst >= AMDGPU_UID_INST_MAX) {
		dev_err_once(uid_info->adev->dev, "Invalid UID instance %d\n",
			     inst);
		return 0;
	}

	return uid_info->uid[type][inst];
}

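/*
 * Illustrative sketch (comment only, not compiled): the UID helpers above are
 * a simple typed/instanced store. Code that discovers a unique ID records it
 * once and any later consumer looks it up; a zero return from the getter
 * means nothing was recorded or the type/instance was invalid. The variables
 * below are placeholders, not real driver state.
 *
 *	// at discovery time:
 *	amdgpu_device_set_uid(uid_info, type, inst, discovered_uid);
 *
 *	// later, from a consumer:
 *	u64 uid = amdgpu_device_get_uid(uid_info, type, inst);
 *	if (!uid)
 *		return;	// nothing recorded, or invalid type/instance
 */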