1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 29 #include <linux/aperture.h> 30 #include <linux/power_supply.h> 31 #include <linux/kthread.h> 32 #include <linux/module.h> 33 #include <linux/console.h> 34 #include <linux/slab.h> 35 #include <linux/iommu.h> 36 #include <linux/pci.h> 37 #include <linux/pci-p2pdma.h> 38 #include <linux/apple-gmux.h> 39 40 #include <drm/drm_atomic_helper.h> 41 #include <drm/drm_client_event.h> 42 #include <drm/drm_crtc_helper.h> 43 #include <drm/drm_probe_helper.h> 44 #include <drm/amdgpu_drm.h> 45 #include <linux/device.h> 46 #include <linux/vgaarb.h> 47 #include <linux/vga_switcheroo.h> 48 #include <linux/efi.h> 49 #include "amdgpu.h" 50 #include "amdgpu_trace.h" 51 #include "amdgpu_i2c.h" 52 #include "atom.h" 53 #include "amdgpu_atombios.h" 54 #include "amdgpu_atomfirmware.h" 55 #include "amd_pcie.h" 56 #ifdef CONFIG_DRM_AMDGPU_SI 57 #include "si.h" 58 #endif 59 #ifdef CONFIG_DRM_AMDGPU_CIK 60 #include "cik.h" 61 #endif 62 #include "vi.h" 63 #include "soc15.h" 64 #include "nv.h" 65 #include "bif/bif_4_1_d.h" 66 #include <linux/firmware.h> 67 #include "amdgpu_vf_error.h" 68 69 #include "amdgpu_amdkfd.h" 70 #include "amdgpu_pm.h" 71 72 #include "amdgpu_xgmi.h" 73 #include "amdgpu_ras.h" 74 #include "amdgpu_ras_mgr.h" 75 #include "amdgpu_pmu.h" 76 #include "amdgpu_fru_eeprom.h" 77 #include "amdgpu_reset.h" 78 #include "amdgpu_virt.h" 79 #include "amdgpu_dev_coredump.h" 80 81 #include <linux/suspend.h> 82 #include <drm/task_barrier.h> 83 #include <linux/pm_runtime.h> 84 85 #include <drm/drm_drv.h> 86 87 #if IS_ENABLED(CONFIG_X86) 88 #include <asm/intel-family.h> 89 #include <asm/cpu_device_id.h> 90 #endif 91 92 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 93 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 94 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 95 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 96 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 97 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 98 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 99 MODULE_FIRMWARE("amdgpu/cyan_skillfish_gpu_info.bin"); 100 101 #define AMDGPU_RESUME_MS 2000 102 #define AMDGPU_MAX_RETRY_LIMIT 2 103 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 104 #define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2) 
105 #define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2) 106 #define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2) 107 108 #define AMDGPU_VBIOS_SKIP (1U << 0) 109 #define AMDGPU_VBIOS_OPTIONAL (1U << 1) 110 111 static const struct drm_driver amdgpu_kms_driver; 112 113 const char *amdgpu_asic_name[] = { 114 "TAHITI", 115 "PITCAIRN", 116 "VERDE", 117 "OLAND", 118 "HAINAN", 119 "BONAIRE", 120 "KAVERI", 121 "KABINI", 122 "HAWAII", 123 "MULLINS", 124 "TOPAZ", 125 "TONGA", 126 "FIJI", 127 "CARRIZO", 128 "STONEY", 129 "POLARIS10", 130 "POLARIS11", 131 "POLARIS12", 132 "VEGAM", 133 "VEGA10", 134 "VEGA12", 135 "VEGA20", 136 "RAVEN", 137 "ARCTURUS", 138 "RENOIR", 139 "ALDEBARAN", 140 "NAVI10", 141 "CYAN_SKILLFISH", 142 "NAVI14", 143 "NAVI12", 144 "SIENNA_CICHLID", 145 "NAVY_FLOUNDER", 146 "VANGOGH", 147 "DIMGREY_CAVEFISH", 148 "BEIGE_GOBY", 149 "YELLOW_CARP", 150 "IP DISCOVERY", 151 "LAST", 152 }; 153 154 #define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0) 155 /* 156 * Default init level where all blocks are expected to be initialized. This is 157 * the level of initialization expected by default and also after a full reset 158 * of the device. 159 */ 160 struct amdgpu_init_level amdgpu_init_default = { 161 .level = AMDGPU_INIT_LEVEL_DEFAULT, 162 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL, 163 }; 164 165 struct amdgpu_init_level amdgpu_init_recovery = { 166 .level = AMDGPU_INIT_LEVEL_RESET_RECOVERY, 167 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL, 168 }; 169 170 /* 171 * Minimal blocks needed to be initialized before a XGMI hive can be reset. This 172 * is used for cases like reset on initialization where the entire hive needs to 173 * be reset before first use. 174 */ 175 struct amdgpu_init_level amdgpu_init_minimal_xgmi = { 176 .level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI, 177 .hwini_ip_block_mask = 178 BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) | 179 BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) | 180 BIT(AMD_IP_BLOCK_TYPE_PSP) 181 }; 182 183 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev); 184 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev); 185 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev); 186 187 static void amdgpu_device_load_switch_state(struct amdgpu_device *adev); 188 189 static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev, 190 enum amd_ip_block_type block) 191 { 192 return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0; 193 } 194 195 void amdgpu_set_init_level(struct amdgpu_device *adev, 196 enum amdgpu_init_lvl_id lvl) 197 { 198 switch (lvl) { 199 case AMDGPU_INIT_LEVEL_MINIMAL_XGMI: 200 adev->init_lvl = &amdgpu_init_minimal_xgmi; 201 break; 202 case AMDGPU_INIT_LEVEL_RESET_RECOVERY: 203 adev->init_lvl = &amdgpu_init_recovery; 204 break; 205 case AMDGPU_INIT_LEVEL_DEFAULT: 206 fallthrough; 207 default: 208 adev->init_lvl = &amdgpu_init_default; 209 break; 210 } 211 } 212 213 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev); 214 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 215 void *data); 216 217 /** 218 * DOC: pcie_replay_count 219 * 220 * The amdgpu driver provides a sysfs API for reporting the total number 221 * of PCIe replays (NAKs). 222 * The file pcie_replay_count is used for this and returns the total 223 * number of replays as a sum of the NAKs generated and NAKs received. 
224 */ 225 226 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 227 struct device_attribute *attr, char *buf) 228 { 229 struct drm_device *ddev = dev_get_drvdata(dev); 230 struct amdgpu_device *adev = drm_to_adev(ddev); 231 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 232 233 return sysfs_emit(buf, "%llu\n", cnt); 234 } 235 236 static DEVICE_ATTR(pcie_replay_count, 0444, 237 amdgpu_device_get_pcie_replay_count, NULL); 238 239 static int amdgpu_device_attr_sysfs_init(struct amdgpu_device *adev) 240 { 241 int ret = 0; 242 243 if (amdgpu_nbio_is_replay_cnt_supported(adev)) 244 ret = sysfs_create_file(&adev->dev->kobj, 245 &dev_attr_pcie_replay_count.attr); 246 247 return ret; 248 } 249 250 static void amdgpu_device_attr_sysfs_fini(struct amdgpu_device *adev) 251 { 252 if (amdgpu_nbio_is_replay_cnt_supported(adev)) 253 sysfs_remove_file(&adev->dev->kobj, 254 &dev_attr_pcie_replay_count.attr); 255 } 256 257 static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj, 258 const struct bin_attribute *attr, char *buf, 259 loff_t ppos, size_t count) 260 { 261 struct device *dev = kobj_to_dev(kobj); 262 struct drm_device *ddev = dev_get_drvdata(dev); 263 struct amdgpu_device *adev = drm_to_adev(ddev); 264 ssize_t bytes_read; 265 266 switch (ppos) { 267 case AMDGPU_SYS_REG_STATE_XGMI: 268 bytes_read = amdgpu_asic_get_reg_state( 269 adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count); 270 break; 271 case AMDGPU_SYS_REG_STATE_WAFL: 272 bytes_read = amdgpu_asic_get_reg_state( 273 adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count); 274 break; 275 case AMDGPU_SYS_REG_STATE_PCIE: 276 bytes_read = amdgpu_asic_get_reg_state( 277 adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count); 278 break; 279 case AMDGPU_SYS_REG_STATE_USR: 280 bytes_read = amdgpu_asic_get_reg_state( 281 adev, AMDGPU_REG_STATE_TYPE_USR, buf, count); 282 break; 283 case AMDGPU_SYS_REG_STATE_USR_1: 284 bytes_read = amdgpu_asic_get_reg_state( 285 adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count); 286 break; 287 default: 288 return -EINVAL; 289 } 290 291 return bytes_read; 292 } 293 294 static const BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL, 295 AMDGPU_SYS_REG_STATE_END); 296 297 int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev) 298 { 299 int ret; 300 301 if (!amdgpu_asic_get_reg_state_supported(adev)) 302 return 0; 303 304 ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 305 306 return ret; 307 } 308 309 void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev) 310 { 311 if (!amdgpu_asic_get_reg_state_supported(adev)) 312 return; 313 sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 314 } 315 316 int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block) 317 { 318 int r; 319 320 if (ip_block->version->funcs->suspend) { 321 r = ip_block->version->funcs->suspend(ip_block); 322 if (r) { 323 dev_err(ip_block->adev->dev, 324 "suspend of IP block <%s> failed %d\n", 325 ip_block->version->funcs->name, r); 326 return r; 327 } 328 } 329 330 ip_block->status.hw = false; 331 return 0; 332 } 333 334 int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block) 335 { 336 int r; 337 338 if (ip_block->version->funcs->resume) { 339 r = ip_block->version->funcs->resume(ip_block); 340 if (r) { 341 dev_err(ip_block->adev->dev, 342 "resume of IP block <%s> failed %d\n", 343 ip_block->version->funcs->name, r); 344 return r; 345 } 346 } 347 348 ip_block->status.hw = true; 349 return 0; 350 } 351 352 /** 353 * DOC: board_info 354 * 355 * The amdgpu driver provides a sysfs 
API for giving board related information. 356 * It provides the form factor information in the format 357 * 358 * type : form factor 359 * 360 * Possible form factor values 361 * 362 * - "cem" - PCIE CEM card 363 * - "oam" - Open Compute Accelerator Module 364 * - "unknown" - Not known 365 * 366 */ 367 368 static ssize_t amdgpu_device_get_board_info(struct device *dev, 369 struct device_attribute *attr, 370 char *buf) 371 { 372 struct drm_device *ddev = dev_get_drvdata(dev); 373 struct amdgpu_device *adev = drm_to_adev(ddev); 374 enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM; 375 const char *pkg; 376 377 if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type) 378 pkg_type = adev->smuio.funcs->get_pkg_type(adev); 379 380 switch (pkg_type) { 381 case AMDGPU_PKG_TYPE_CEM: 382 pkg = "cem"; 383 break; 384 case AMDGPU_PKG_TYPE_OAM: 385 pkg = "oam"; 386 break; 387 default: 388 pkg = "unknown"; 389 break; 390 } 391 392 return sysfs_emit(buf, "%s : %s\n", "type", pkg); 393 } 394 395 static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL); 396 397 static struct attribute *amdgpu_board_attrs[] = { 398 &dev_attr_board_info.attr, 399 NULL, 400 }; 401 402 static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj, 403 struct attribute *attr, int n) 404 { 405 struct device *dev = kobj_to_dev(kobj); 406 struct drm_device *ddev = dev_get_drvdata(dev); 407 struct amdgpu_device *adev = drm_to_adev(ddev); 408 409 if (adev->flags & AMD_IS_APU) 410 return 0; 411 412 return attr->mode; 413 } 414 415 static const struct attribute_group amdgpu_board_attrs_group = { 416 .attrs = amdgpu_board_attrs, 417 .is_visible = amdgpu_board_attrs_is_visible 418 }; 419 420 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 421 422 /** 423 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 424 * 425 * @adev: amdgpu device pointer 426 * 427 * Returns true if the device is a dGPU with ATPX power control, 428 * otherwise return false. 429 */ 430 bool amdgpu_device_supports_px(struct amdgpu_device *adev) 431 { 432 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 433 return true; 434 return false; 435 } 436 437 /** 438 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 439 * 440 * @adev: amdgpu device pointer 441 * 442 * Returns true if the device is a dGPU with ACPI power control, 443 * otherwise return false. 444 */ 445 bool amdgpu_device_supports_boco(struct amdgpu_device *adev) 446 { 447 if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE)) 448 return false; 449 450 if (adev->has_pr3 || 451 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 452 return true; 453 return false; 454 } 455 456 /** 457 * amdgpu_device_supports_baco - Does the device support BACO 458 * 459 * @adev: amdgpu device pointer 460 * 461 * Return: 462 * 1 if the device supports BACO; 463 * 3 if the device supports MACO (only works if BACO is supported) 464 * otherwise return 0. 
465 */ 466 int amdgpu_device_supports_baco(struct amdgpu_device *adev) 467 { 468 return amdgpu_asic_supports_baco(adev); 469 } 470 471 void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev) 472 { 473 int bamaco_support; 474 475 adev->pm.rpm_mode = AMDGPU_RUNPM_NONE; 476 bamaco_support = amdgpu_device_supports_baco(adev); 477 478 switch (amdgpu_runtime_pm) { 479 case 2: 480 if (bamaco_support & MACO_SUPPORT) { 481 adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO; 482 dev_info(adev->dev, "Forcing BAMACO for runtime pm\n"); 483 } else if (bamaco_support == BACO_SUPPORT) { 484 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO; 485 dev_info(adev->dev, "Requested mode BAMACO not available,fallback to use BACO\n"); 486 } 487 break; 488 case 1: 489 if (bamaco_support & BACO_SUPPORT) { 490 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO; 491 dev_info(adev->dev, "Forcing BACO for runtime pm\n"); 492 } 493 break; 494 case -1: 495 case -2: 496 if (amdgpu_device_supports_px(adev)) { 497 /* enable PX as runtime mode */ 498 adev->pm.rpm_mode = AMDGPU_RUNPM_PX; 499 dev_info(adev->dev, "Using ATPX for runtime pm\n"); 500 } else if (amdgpu_device_supports_boco(adev)) { 501 /* enable boco as runtime mode */ 502 adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO; 503 dev_info(adev->dev, "Using BOCO for runtime pm\n"); 504 } else { 505 if (!bamaco_support) 506 goto no_runtime_pm; 507 508 switch (adev->asic_type) { 509 case CHIP_VEGA20: 510 case CHIP_ARCTURUS: 511 /* BACO are not supported on vega20 and arctrus */ 512 break; 513 case CHIP_VEGA10: 514 /* enable BACO as runpm mode if noretry=0 */ 515 if (!adev->gmc.noretry && !amdgpu_passthrough(adev)) 516 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO; 517 break; 518 default: 519 /* enable BACO as runpm mode on CI+ */ 520 if (!amdgpu_passthrough(adev)) 521 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO; 522 break; 523 } 524 525 if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) { 526 if (bamaco_support & MACO_SUPPORT) { 527 adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO; 528 dev_info(adev->dev, "Using BAMACO for runtime pm\n"); 529 } else { 530 dev_info(adev->dev, "Using BACO for runtime pm\n"); 531 } 532 } 533 } 534 break; 535 case 0: 536 dev_info(adev->dev, "runtime pm is manually disabled\n"); 537 break; 538 default: 539 break; 540 } 541 542 no_runtime_pm: 543 if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE) 544 dev_info(adev->dev, "Runtime PM not available\n"); 545 } 546 /** 547 * amdgpu_device_supports_smart_shift - Is the device dGPU with 548 * smart shift support 549 * 550 * @adev: amdgpu device pointer 551 * 552 * Returns true if the device is a dGPU with Smart Shift support, 553 * otherwise returns false. 
554 */ 555 bool amdgpu_device_supports_smart_shift(struct amdgpu_device *adev) 556 { 557 return (amdgpu_device_supports_boco(adev) && 558 amdgpu_acpi_is_power_shift_control_supported()); 559 } 560 561 /* 562 * VRAM access helper functions 563 */ 564 565 /** 566 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA 567 * 568 * @adev: amdgpu_device pointer 569 * @pos: offset of the buffer in vram 570 * @buf: virtual address of the buffer in system memory 571 * @size: read/write size, sizeof(@buf) must > @size 572 * @write: true - write to vram, otherwise - read from vram 573 */ 574 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos, 575 void *buf, size_t size, bool write) 576 { 577 unsigned long flags; 578 uint32_t hi = ~0, tmp = 0; 579 uint32_t *data = buf; 580 uint64_t last; 581 int idx; 582 583 if (!drm_dev_enter(adev_to_drm(adev), &idx)) 584 return; 585 586 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4)); 587 588 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 589 for (last = pos + size; pos < last; pos += 4) { 590 tmp = pos >> 31; 591 592 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000); 593 if (tmp != hi) { 594 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp); 595 hi = tmp; 596 } 597 if (write) 598 WREG32_NO_KIQ(mmMM_DATA, *data++); 599 else 600 *data++ = RREG32_NO_KIQ(mmMM_DATA); 601 } 602 603 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 604 drm_dev_exit(idx); 605 } 606 607 /** 608 * amdgpu_device_aper_access - access vram by vram aperture 609 * 610 * @adev: amdgpu_device pointer 611 * @pos: offset of the buffer in vram 612 * @buf: virtual address of the buffer in system memory 613 * @size: read/write size, sizeof(@buf) must > @size 614 * @write: true - write to vram, otherwise - read from vram 615 * 616 * The return value means how many bytes have been transferred. 
617 */ 618 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos, 619 void *buf, size_t size, bool write) 620 { 621 #ifdef CONFIG_64BIT 622 void __iomem *addr; 623 size_t count = 0; 624 uint64_t last; 625 626 if (!adev->mman.aper_base_kaddr) 627 return 0; 628 629 last = min(pos + size, adev->gmc.visible_vram_size); 630 if (last > pos) { 631 addr = adev->mman.aper_base_kaddr + pos; 632 count = last - pos; 633 634 if (write) { 635 memcpy_toio(addr, buf, count); 636 /* Make sure HDP write cache flush happens without any reordering 637 * after the system memory contents are sent over PCIe device 638 */ 639 mb(); 640 amdgpu_device_flush_hdp(adev, NULL); 641 } else { 642 amdgpu_device_invalidate_hdp(adev, NULL); 643 /* Make sure HDP read cache is invalidated before issuing a read 644 * to the PCIe device 645 */ 646 mb(); 647 memcpy_fromio(buf, addr, count); 648 } 649 650 } 651 652 return count; 653 #else 654 return 0; 655 #endif 656 } 657 658 /** 659 * amdgpu_device_vram_access - read/write a buffer in vram 660 * 661 * @adev: amdgpu_device pointer 662 * @pos: offset of the buffer in vram 663 * @buf: virtual address of the buffer in system memory 664 * @size: read/write size, sizeof(@buf) must > @size 665 * @write: true - write to vram, otherwise - read from vram 666 */ 667 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos, 668 void *buf, size_t size, bool write) 669 { 670 size_t count; 671 672 /* try to using vram apreature to access vram first */ 673 count = amdgpu_device_aper_access(adev, pos, buf, size, write); 674 size -= count; 675 if (size) { 676 /* using MM to access rest vram */ 677 pos += count; 678 buf += count; 679 amdgpu_device_mm_access(adev, pos, buf, size, write); 680 } 681 } 682 683 /* 684 * register access helper functions. 685 */ 686 687 /* Check if hw access should be skipped because of hotplug or device error */ 688 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev) 689 { 690 if (adev->no_hw_access) 691 return true; 692 693 #ifdef CONFIG_LOCKDEP 694 /* 695 * This is a bit complicated to understand, so worth a comment. What we assert 696 * here is that the GPU reset is not running on another thread in parallel. 697 * 698 * For this we trylock the read side of the reset semaphore, if that succeeds 699 * we know that the reset is not running in parallel. 700 * 701 * If the trylock fails we assert that we are either already holding the read 702 * side of the lock or are the reset thread itself and hold the write side of 703 * the lock. 704 */ 705 if (in_task()) { 706 if (down_read_trylock(&adev->reset_domain->sem)) 707 up_read(&adev->reset_domain->sem); 708 else 709 lockdep_assert_held(&adev->reset_domain->sem); 710 } 711 #endif 712 return false; 713 } 714 715 /** 716 * amdgpu_device_rreg - read a memory mapped IO or indirect register 717 * 718 * @adev: amdgpu_device pointer 719 * @reg: dword aligned register offset 720 * @acc_flags: access flags which require special behavior 721 * 722 * Returns the 32 bit value from the offset specified. 
723 */ 724 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, 725 uint32_t reg, uint32_t acc_flags) 726 { 727 uint32_t ret; 728 729 if (amdgpu_device_skip_hw_access(adev)) 730 return 0; 731 732 if ((reg * 4) < adev->rmmio_size) { 733 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 734 amdgpu_sriov_runtime(adev) && 735 down_read_trylock(&adev->reset_domain->sem)) { 736 ret = amdgpu_kiq_rreg(adev, reg, 0); 737 up_read(&adev->reset_domain->sem); 738 } else { 739 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); 740 } 741 } else { 742 ret = adev->pcie_rreg(adev, reg * 4); 743 } 744 745 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret); 746 747 return ret; 748 } 749 750 /* 751 * MMIO register read with bytes helper functions 752 * @offset:bytes offset from MMIO start 753 */ 754 755 /** 756 * amdgpu_mm_rreg8 - read a memory mapped IO register 757 * 758 * @adev: amdgpu_device pointer 759 * @offset: byte aligned register offset 760 * 761 * Returns the 8 bit value from the offset specified. 762 */ 763 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) 764 { 765 if (amdgpu_device_skip_hw_access(adev)) 766 return 0; 767 768 if (offset < adev->rmmio_size) 769 return (readb(adev->rmmio + offset)); 770 BUG(); 771 } 772 773 774 /** 775 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC 776 * 777 * @adev: amdgpu_device pointer 778 * @reg: dword aligned register offset 779 * @acc_flags: access flags which require special behavior 780 * @xcc_id: xcc accelerated compute core id 781 * 782 * Returns the 32 bit value from the offset specified. 783 */ 784 uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev, 785 uint32_t reg, uint32_t acc_flags, 786 uint32_t xcc_id) 787 { 788 uint32_t ret, rlcg_flag; 789 790 if (amdgpu_device_skip_hw_access(adev)) 791 return 0; 792 793 if ((reg * 4) < adev->rmmio_size) { 794 if (amdgpu_sriov_vf(adev) && 795 !amdgpu_sriov_runtime(adev) && 796 adev->gfx.rlc.rlcg_reg_access_supported && 797 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, 798 GC_HWIP, false, 799 &rlcg_flag)) { 800 ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id)); 801 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 802 amdgpu_sriov_runtime(adev) && 803 down_read_trylock(&adev->reset_domain->sem)) { 804 ret = amdgpu_kiq_rreg(adev, reg, xcc_id); 805 up_read(&adev->reset_domain->sem); 806 } else { 807 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); 808 } 809 } else { 810 ret = adev->pcie_rreg(adev, reg * 4); 811 } 812 813 return ret; 814 } 815 816 /* 817 * MMIO register write with bytes helper functions 818 * @offset:bytes offset from MMIO start 819 * @value: the value want to be written to the register 820 */ 821 822 /** 823 * amdgpu_mm_wreg8 - read a memory mapped IO register 824 * 825 * @adev: amdgpu_device pointer 826 * @offset: byte aligned register offset 827 * @value: 8 bit value to write 828 * 829 * Writes the value specified to the offset specified. 
830 */ 831 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) 832 { 833 if (amdgpu_device_skip_hw_access(adev)) 834 return; 835 836 if (offset < adev->rmmio_size) 837 writeb(value, adev->rmmio + offset); 838 else 839 BUG(); 840 } 841 842 /** 843 * amdgpu_device_wreg - write to a memory mapped IO or indirect register 844 * 845 * @adev: amdgpu_device pointer 846 * @reg: dword aligned register offset 847 * @v: 32 bit value to write to the register 848 * @acc_flags: access flags which require special behavior 849 * 850 * Writes the value specified to the offset specified. 851 */ 852 void amdgpu_device_wreg(struct amdgpu_device *adev, 853 uint32_t reg, uint32_t v, 854 uint32_t acc_flags) 855 { 856 if (amdgpu_device_skip_hw_access(adev)) 857 return; 858 859 if ((reg * 4) < adev->rmmio_size) { 860 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 861 amdgpu_sriov_runtime(adev) && 862 down_read_trylock(&adev->reset_domain->sem)) { 863 amdgpu_kiq_wreg(adev, reg, v, 0); 864 up_read(&adev->reset_domain->sem); 865 } else { 866 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 867 } 868 } else { 869 adev->pcie_wreg(adev, reg * 4, v); 870 } 871 872 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 873 } 874 875 /** 876 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 877 * 878 * @adev: amdgpu_device pointer 879 * @reg: mmio/rlc register 880 * @v: value to write 881 * @xcc_id: xcc accelerated compute core id 882 * 883 * this function is invoked only for the debugfs register access 884 */ 885 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 886 uint32_t reg, uint32_t v, 887 uint32_t xcc_id) 888 { 889 if (amdgpu_device_skip_hw_access(adev)) 890 return; 891 892 if (amdgpu_sriov_fullaccess(adev) && 893 adev->gfx.rlc.funcs && 894 adev->gfx.rlc.funcs->is_rlcg_access_range) { 895 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 896 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id); 897 } else if ((reg * 4) >= adev->rmmio_size) { 898 adev->pcie_wreg(adev, reg * 4, v); 899 } else { 900 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 901 } 902 } 903 904 /** 905 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC 906 * 907 * @adev: amdgpu_device pointer 908 * @reg: dword aligned register offset 909 * @v: 32 bit value to write to the register 910 * @acc_flags: access flags which require special behavior 911 * @xcc_id: xcc accelerated compute core id 912 * 913 * Writes the value specified to the offset specified. 
914 */ 915 void amdgpu_device_xcc_wreg(struct amdgpu_device *adev, 916 uint32_t reg, uint32_t v, 917 uint32_t acc_flags, uint32_t xcc_id) 918 { 919 uint32_t rlcg_flag; 920 921 if (amdgpu_device_skip_hw_access(adev)) 922 return; 923 924 if ((reg * 4) < adev->rmmio_size) { 925 if (amdgpu_sriov_vf(adev) && 926 !amdgpu_sriov_runtime(adev) && 927 adev->gfx.rlc.rlcg_reg_access_supported && 928 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, 929 GC_HWIP, true, 930 &rlcg_flag)) { 931 amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id)); 932 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 933 amdgpu_sriov_runtime(adev) && 934 down_read_trylock(&adev->reset_domain->sem)) { 935 amdgpu_kiq_wreg(adev, reg, v, xcc_id); 936 up_read(&adev->reset_domain->sem); 937 } else { 938 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 939 } 940 } else { 941 adev->pcie_wreg(adev, reg * 4, v); 942 } 943 } 944 945 /** 946 * amdgpu_device_indirect_rreg - read an indirect register 947 * 948 * @adev: amdgpu_device pointer 949 * @reg_addr: indirect register address to read from 950 * 951 * Returns the value of indirect register @reg_addr 952 */ 953 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 954 u32 reg_addr) 955 { 956 unsigned long flags, pcie_index, pcie_data; 957 void __iomem *pcie_index_offset; 958 void __iomem *pcie_data_offset; 959 u32 r; 960 961 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 962 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 963 964 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 965 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 966 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 967 968 writel(reg_addr, pcie_index_offset); 969 readl(pcie_index_offset); 970 r = readl(pcie_data_offset); 971 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 972 973 return r; 974 } 975 976 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev, 977 u64 reg_addr) 978 { 979 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 980 u32 r; 981 void __iomem *pcie_index_offset; 982 void __iomem *pcie_index_hi_offset; 983 void __iomem *pcie_data_offset; 984 985 if (unlikely(!adev->nbio.funcs)) { 986 pcie_index = AMDGPU_PCIE_INDEX_FALLBACK; 987 pcie_data = AMDGPU_PCIE_DATA_FALLBACK; 988 } else { 989 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 990 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 991 } 992 993 if (reg_addr >> 32) { 994 if (unlikely(!adev->nbio.funcs)) 995 pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK; 996 else 997 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 998 } else { 999 pcie_index_hi = 0; 1000 } 1001 1002 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1003 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1004 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1005 if (pcie_index_hi != 0) 1006 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1007 pcie_index_hi * 4; 1008 1009 writel(reg_addr, pcie_index_offset); 1010 readl(pcie_index_offset); 1011 if (pcie_index_hi != 0) { 1012 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1013 readl(pcie_index_hi_offset); 1014 } 1015 r = readl(pcie_data_offset); 1016 1017 /* clear the high bits */ 1018 if (pcie_index_hi != 0) { 1019 writel(0, pcie_index_hi_offset); 1020 readl(pcie_index_hi_offset); 1021 } 1022 1023 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1024 1025 return r; 1026 } 1027 1028 /** 1029 * amdgpu_device_indirect_rreg64 - read a 64bits indirect 
register 1030 * 1031 * @adev: amdgpu_device pointer 1032 * @reg_addr: indirect register address to read from 1033 * 1034 * Returns the value of indirect register @reg_addr 1035 */ 1036 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 1037 u32 reg_addr) 1038 { 1039 unsigned long flags, pcie_index, pcie_data; 1040 void __iomem *pcie_index_offset; 1041 void __iomem *pcie_data_offset; 1042 u64 r; 1043 1044 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1045 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1046 1047 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1048 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1049 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1050 1051 /* read low 32 bits */ 1052 writel(reg_addr, pcie_index_offset); 1053 readl(pcie_index_offset); 1054 r = readl(pcie_data_offset); 1055 /* read high 32 bits */ 1056 writel(reg_addr + 4, pcie_index_offset); 1057 readl(pcie_index_offset); 1058 r |= ((u64)readl(pcie_data_offset) << 32); 1059 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1060 1061 return r; 1062 } 1063 1064 u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev, 1065 u64 reg_addr) 1066 { 1067 unsigned long flags, pcie_index, pcie_data; 1068 unsigned long pcie_index_hi = 0; 1069 void __iomem *pcie_index_offset; 1070 void __iomem *pcie_index_hi_offset; 1071 void __iomem *pcie_data_offset; 1072 u64 r; 1073 1074 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1075 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1076 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1077 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1078 1079 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1080 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1081 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1082 if (pcie_index_hi != 0) 1083 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1084 pcie_index_hi * 4; 1085 1086 /* read low 32 bits */ 1087 writel(reg_addr, pcie_index_offset); 1088 readl(pcie_index_offset); 1089 if (pcie_index_hi != 0) { 1090 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1091 readl(pcie_index_hi_offset); 1092 } 1093 r = readl(pcie_data_offset); 1094 /* read high 32 bits */ 1095 writel(reg_addr + 4, pcie_index_offset); 1096 readl(pcie_index_offset); 1097 if (pcie_index_hi != 0) { 1098 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1099 readl(pcie_index_hi_offset); 1100 } 1101 r |= ((u64)readl(pcie_data_offset) << 32); 1102 1103 /* clear the high bits */ 1104 if (pcie_index_hi != 0) { 1105 writel(0, pcie_index_hi_offset); 1106 readl(pcie_index_hi_offset); 1107 } 1108 1109 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1110 1111 return r; 1112 } 1113 1114 /** 1115 * amdgpu_device_indirect_wreg - write an indirect register address 1116 * 1117 * @adev: amdgpu_device pointer 1118 * @reg_addr: indirect register offset 1119 * @reg_data: indirect register data 1120 * 1121 */ 1122 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 1123 u32 reg_addr, u32 reg_data) 1124 { 1125 unsigned long flags, pcie_index, pcie_data; 1126 void __iomem *pcie_index_offset; 1127 void __iomem *pcie_data_offset; 1128 1129 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1130 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1131 1132 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1133 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1134 
pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1135 1136 writel(reg_addr, pcie_index_offset); 1137 readl(pcie_index_offset); 1138 writel(reg_data, pcie_data_offset); 1139 readl(pcie_data_offset); 1140 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1141 } 1142 1143 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev, 1144 u64 reg_addr, u32 reg_data) 1145 { 1146 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 1147 void __iomem *pcie_index_offset; 1148 void __iomem *pcie_index_hi_offset; 1149 void __iomem *pcie_data_offset; 1150 1151 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1152 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1153 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1154 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1155 else 1156 pcie_index_hi = 0; 1157 1158 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1159 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1160 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1161 if (pcie_index_hi != 0) 1162 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1163 pcie_index_hi * 4; 1164 1165 writel(reg_addr, pcie_index_offset); 1166 readl(pcie_index_offset); 1167 if (pcie_index_hi != 0) { 1168 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1169 readl(pcie_index_hi_offset); 1170 } 1171 writel(reg_data, pcie_data_offset); 1172 readl(pcie_data_offset); 1173 1174 /* clear the high bits */ 1175 if (pcie_index_hi != 0) { 1176 writel(0, pcie_index_hi_offset); 1177 readl(pcie_index_hi_offset); 1178 } 1179 1180 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1181 } 1182 1183 /** 1184 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 1185 * 1186 * @adev: amdgpu_device pointer 1187 * @reg_addr: indirect register offset 1188 * @reg_data: indirect register data 1189 * 1190 */ 1191 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 1192 u32 reg_addr, u64 reg_data) 1193 { 1194 unsigned long flags, pcie_index, pcie_data; 1195 void __iomem *pcie_index_offset; 1196 void __iomem *pcie_data_offset; 1197 1198 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1199 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1200 1201 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1202 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1203 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1204 1205 /* write low 32 bits */ 1206 writel(reg_addr, pcie_index_offset); 1207 readl(pcie_index_offset); 1208 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1209 readl(pcie_data_offset); 1210 /* write high 32 bits */ 1211 writel(reg_addr + 4, pcie_index_offset); 1212 readl(pcie_index_offset); 1213 writel((u32)(reg_data >> 32), pcie_data_offset); 1214 readl(pcie_data_offset); 1215 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1216 } 1217 1218 void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev, 1219 u64 reg_addr, u64 reg_data) 1220 { 1221 unsigned long flags, pcie_index, pcie_data; 1222 unsigned long pcie_index_hi = 0; 1223 void __iomem *pcie_index_offset; 1224 void __iomem *pcie_index_hi_offset; 1225 void __iomem *pcie_data_offset; 1226 1227 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1228 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1229 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1230 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1231 
1232 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1233 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1234 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1235 if (pcie_index_hi != 0) 1236 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1237 pcie_index_hi * 4; 1238 1239 /* write low 32 bits */ 1240 writel(reg_addr, pcie_index_offset); 1241 readl(pcie_index_offset); 1242 if (pcie_index_hi != 0) { 1243 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1244 readl(pcie_index_hi_offset); 1245 } 1246 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1247 readl(pcie_data_offset); 1248 /* write high 32 bits */ 1249 writel(reg_addr + 4, pcie_index_offset); 1250 readl(pcie_index_offset); 1251 if (pcie_index_hi != 0) { 1252 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1253 readl(pcie_index_hi_offset); 1254 } 1255 writel((u32)(reg_data >> 32), pcie_data_offset); 1256 readl(pcie_data_offset); 1257 1258 /* clear the high bits */ 1259 if (pcie_index_hi != 0) { 1260 writel(0, pcie_index_hi_offset); 1261 readl(pcie_index_hi_offset); 1262 } 1263 1264 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1265 } 1266 1267 /** 1268 * amdgpu_device_get_rev_id - query device rev_id 1269 * 1270 * @adev: amdgpu_device pointer 1271 * 1272 * Return device rev_id 1273 */ 1274 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev) 1275 { 1276 return adev->nbio.funcs->get_rev_id(adev); 1277 } 1278 1279 /** 1280 * amdgpu_invalid_rreg - dummy reg read function 1281 * 1282 * @adev: amdgpu_device pointer 1283 * @reg: offset of register 1284 * 1285 * Dummy register read function. Used for register blocks 1286 * that certain asics don't have (all asics). 1287 * Returns the value in the register. 1288 */ 1289 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 1290 { 1291 dev_err(adev->dev, "Invalid callback to read register 0x%04X\n", reg); 1292 BUG(); 1293 return 0; 1294 } 1295 1296 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg) 1297 { 1298 dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg); 1299 BUG(); 1300 return 0; 1301 } 1302 1303 /** 1304 * amdgpu_invalid_wreg - dummy reg write function 1305 * 1306 * @adev: amdgpu_device pointer 1307 * @reg: offset of register 1308 * @v: value to write to the register 1309 * 1310 * Dummy register read function. Used for register blocks 1311 * that certain asics don't have (all asics). 1312 */ 1313 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 1314 { 1315 dev_err(adev->dev, 1316 "Invalid callback to write register 0x%04X with 0x%08X\n", reg, 1317 v); 1318 BUG(); 1319 } 1320 1321 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v) 1322 { 1323 dev_err(adev->dev, 1324 "Invalid callback to write register 0x%llX with 0x%08X\n", reg, 1325 v); 1326 BUG(); 1327 } 1328 1329 /** 1330 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 1331 * 1332 * @adev: amdgpu_device pointer 1333 * @reg: offset of register 1334 * 1335 * Dummy register read function. Used for register blocks 1336 * that certain asics don't have (all asics). 1337 * Returns the value in the register. 
1338 */ 1339 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 1340 { 1341 dev_err(adev->dev, "Invalid callback to read 64 bit register 0x%04X\n", 1342 reg); 1343 BUG(); 1344 return 0; 1345 } 1346 1347 static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg) 1348 { 1349 dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg); 1350 BUG(); 1351 return 0; 1352 } 1353 1354 /** 1355 * amdgpu_invalid_wreg64 - dummy reg write function 1356 * 1357 * @adev: amdgpu_device pointer 1358 * @reg: offset of register 1359 * @v: value to write to the register 1360 * 1361 * Dummy register read function. Used for register blocks 1362 * that certain asics don't have (all asics). 1363 */ 1364 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 1365 { 1366 dev_err(adev->dev, 1367 "Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 1368 reg, v); 1369 BUG(); 1370 } 1371 1372 static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v) 1373 { 1374 dev_err(adev->dev, 1375 "Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n", 1376 reg, v); 1377 BUG(); 1378 } 1379 1380 /** 1381 * amdgpu_block_invalid_rreg - dummy reg read function 1382 * 1383 * @adev: amdgpu_device pointer 1384 * @block: offset of instance 1385 * @reg: offset of register 1386 * 1387 * Dummy register read function. Used for register blocks 1388 * that certain asics don't have (all asics). 1389 * Returns the value in the register. 1390 */ 1391 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 1392 uint32_t block, uint32_t reg) 1393 { 1394 dev_err(adev->dev, 1395 "Invalid callback to read register 0x%04X in block 0x%04X\n", 1396 reg, block); 1397 BUG(); 1398 return 0; 1399 } 1400 1401 /** 1402 * amdgpu_block_invalid_wreg - dummy reg write function 1403 * 1404 * @adev: amdgpu_device pointer 1405 * @block: offset of instance 1406 * @reg: offset of register 1407 * @v: value to write to the register 1408 * 1409 * Dummy register read function. Used for register blocks 1410 * that certain asics don't have (all asics). 1411 */ 1412 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 1413 uint32_t block, 1414 uint32_t reg, uint32_t v) 1415 { 1416 dev_err(adev->dev, 1417 "Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 1418 reg, block, v); 1419 BUG(); 1420 } 1421 1422 static uint32_t amdgpu_device_get_vbios_flags(struct amdgpu_device *adev) 1423 { 1424 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU)) 1425 return AMDGPU_VBIOS_SKIP; 1426 1427 if (hweight32(adev->aid_mask) && amdgpu_passthrough(adev)) 1428 return AMDGPU_VBIOS_OPTIONAL; 1429 1430 return 0; 1431 } 1432 1433 /** 1434 * amdgpu_device_asic_init - Wrapper for atom asic_init 1435 * 1436 * @adev: amdgpu_device pointer 1437 * 1438 * Does any asic specific work and then calls atom asic init. 
1439 */ 1440 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 1441 { 1442 uint32_t flags; 1443 bool optional; 1444 int ret; 1445 1446 amdgpu_asic_pre_asic_init(adev); 1447 flags = amdgpu_device_get_vbios_flags(adev); 1448 optional = !!(flags & (AMDGPU_VBIOS_OPTIONAL | AMDGPU_VBIOS_SKIP)); 1449 1450 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 1451 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 1452 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) || 1453 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) { 1454 amdgpu_psp_wait_for_bootloader(adev); 1455 if (optional && !adev->bios) 1456 return 0; 1457 1458 ret = amdgpu_atomfirmware_asic_init(adev, true); 1459 return ret; 1460 } else { 1461 if (optional && !adev->bios) 1462 return 0; 1463 1464 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 1465 } 1466 1467 return 0; 1468 } 1469 1470 /** 1471 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page 1472 * 1473 * @adev: amdgpu_device pointer 1474 * 1475 * Allocates a scratch page of VRAM for use by various things in the 1476 * driver. 1477 */ 1478 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev) 1479 { 1480 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE, 1481 AMDGPU_GEM_DOMAIN_VRAM | 1482 AMDGPU_GEM_DOMAIN_GTT, 1483 &adev->mem_scratch.robj, 1484 &adev->mem_scratch.gpu_addr, 1485 (void **)&adev->mem_scratch.ptr); 1486 } 1487 1488 /** 1489 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page 1490 * 1491 * @adev: amdgpu_device pointer 1492 * 1493 * Frees the VRAM scratch page. 1494 */ 1495 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev) 1496 { 1497 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL); 1498 } 1499 1500 /** 1501 * amdgpu_device_program_register_sequence - program an array of registers. 1502 * 1503 * @adev: amdgpu_device pointer 1504 * @registers: pointer to the register array 1505 * @array_size: size of the register array 1506 * 1507 * Programs an array or registers with and or masks. 1508 * This is a helper for setting golden registers. 1509 */ 1510 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 1511 const u32 *registers, 1512 const u32 array_size) 1513 { 1514 u32 tmp, reg, and_mask, or_mask; 1515 int i; 1516 1517 if (array_size % 3) 1518 return; 1519 1520 for (i = 0; i < array_size; i += 3) { 1521 reg = registers[i + 0]; 1522 and_mask = registers[i + 1]; 1523 or_mask = registers[i + 2]; 1524 1525 if (and_mask == 0xffffffff) { 1526 tmp = or_mask; 1527 } else { 1528 tmp = RREG32(reg); 1529 tmp &= ~and_mask; 1530 if (adev->family >= AMDGPU_FAMILY_AI) 1531 tmp |= (or_mask & and_mask); 1532 else 1533 tmp |= or_mask; 1534 } 1535 WREG32(reg, tmp); 1536 } 1537 } 1538 1539 /** 1540 * amdgpu_device_pci_config_reset - reset the GPU 1541 * 1542 * @adev: amdgpu_device pointer 1543 * 1544 * Resets the GPU using the pci config reset sequence. 1545 * Only applicable to asics prior to vega10. 1546 */ 1547 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 1548 { 1549 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 1550 } 1551 1552 /** 1553 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 1554 * 1555 * @adev: amdgpu_device pointer 1556 * 1557 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 
1558 */ 1559 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 1560 { 1561 return pci_reset_function(adev->pdev); 1562 } 1563 1564 /* 1565 * amdgpu_device_wb_*() 1566 * Writeback is the method by which the GPU updates special pages in memory 1567 * with the status of certain GPU events (fences, ring pointers,etc.). 1568 */ 1569 1570 /** 1571 * amdgpu_device_wb_fini - Disable Writeback and free memory 1572 * 1573 * @adev: amdgpu_device pointer 1574 * 1575 * Disables Writeback and frees the Writeback memory (all asics). 1576 * Used at driver shutdown. 1577 */ 1578 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1579 { 1580 if (adev->wb.wb_obj) { 1581 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1582 &adev->wb.gpu_addr, 1583 (void **)&adev->wb.wb); 1584 adev->wb.wb_obj = NULL; 1585 } 1586 } 1587 1588 /** 1589 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory 1590 * 1591 * @adev: amdgpu_device pointer 1592 * 1593 * Initializes writeback and allocates writeback memory (all asics). 1594 * Used at driver startup. 1595 * Returns 0 on success or an -error on failure. 1596 */ 1597 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1598 { 1599 int r; 1600 1601 if (adev->wb.wb_obj == NULL) { 1602 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1603 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1604 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1605 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1606 (void **)&adev->wb.wb); 1607 if (r) { 1608 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1609 return r; 1610 } 1611 1612 adev->wb.num_wb = AMDGPU_MAX_WB; 1613 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1614 1615 /* clear wb memory */ 1616 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1617 } 1618 1619 return 0; 1620 } 1621 1622 /** 1623 * amdgpu_device_wb_get - Allocate a wb entry 1624 * 1625 * @adev: amdgpu_device pointer 1626 * @wb: wb index 1627 * 1628 * Allocate a wb slot for use by the driver (all asics). 1629 * Returns 0 on success or -EINVAL on failure. 1630 */ 1631 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1632 { 1633 unsigned long flags, offset; 1634 1635 spin_lock_irqsave(&adev->wb.lock, flags); 1636 offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1637 if (offset < adev->wb.num_wb) { 1638 __set_bit(offset, adev->wb.used); 1639 spin_unlock_irqrestore(&adev->wb.lock, flags); 1640 *wb = offset << 3; /* convert to dw offset */ 1641 return 0; 1642 } else { 1643 spin_unlock_irqrestore(&adev->wb.lock, flags); 1644 return -EINVAL; 1645 } 1646 } 1647 1648 /** 1649 * amdgpu_device_wb_free - Free a wb entry 1650 * 1651 * @adev: amdgpu_device pointer 1652 * @wb: wb index 1653 * 1654 * Free a wb slot allocated for use by the driver (all asics) 1655 */ 1656 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1657 { 1658 unsigned long flags; 1659 1660 wb >>= 3; 1661 spin_lock_irqsave(&adev->wb.lock, flags); 1662 if (wb < adev->wb.num_wb) 1663 __clear_bit(wb, adev->wb.used); 1664 spin_unlock_irqrestore(&adev->wb.lock, flags); 1665 } 1666 1667 /** 1668 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1669 * 1670 * @adev: amdgpu_device pointer 1671 * 1672 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1673 * to fail, but if any of the BARs is not accessible after the size we abort 1674 * driver loading by returning -ENODEV. 
1675 */ 1676 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1677 { 1678 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1679 struct pci_bus *root; 1680 struct resource *res; 1681 unsigned int i; 1682 u16 cmd; 1683 int r; 1684 1685 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) 1686 return 0; 1687 1688 /* Bypass for VF */ 1689 if (amdgpu_sriov_vf(adev)) 1690 return 0; 1691 1692 if (!amdgpu_rebar) 1693 return 0; 1694 1695 /* resizing on Dell G5 SE platforms causes problems with runtime pm */ 1696 if ((amdgpu_runtime_pm != 0) && 1697 adev->pdev->vendor == PCI_VENDOR_ID_ATI && 1698 adev->pdev->device == 0x731f && 1699 adev->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL) 1700 return 0; 1701 1702 /* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */ 1703 if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR)) 1704 dev_warn( 1705 adev->dev, 1706 "System can't access extended configuration space, please check!!\n"); 1707 1708 /* skip if the bios has already enabled large BAR */ 1709 if (adev->gmc.real_vram_size && 1710 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1711 return 0; 1712 1713 /* Check if the root BUS has 64bit memory resources */ 1714 root = adev->pdev->bus; 1715 while (root->parent) 1716 root = root->parent; 1717 1718 pci_bus_for_each_resource(root, res, i) { 1719 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1720 res->start > 0x100000000ull) 1721 break; 1722 } 1723 1724 /* Trying to resize is pointless without a root hub window above 4GB */ 1725 if (!res) 1726 return 0; 1727 1728 /* Limit the BAR size to what is available */ 1729 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1730 rbar_size); 1731 1732 /* Disable memory decoding while we change the BAR addresses and size */ 1733 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1734 pci_write_config_word(adev->pdev, PCI_COMMAND, 1735 cmd & ~PCI_COMMAND_MEMORY); 1736 1737 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1738 amdgpu_doorbell_fini(adev); 1739 if (adev->asic_type >= CHIP_BONAIRE) 1740 pci_release_resource(adev->pdev, 2); 1741 1742 pci_release_resource(adev->pdev, 0); 1743 1744 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1745 if (r == -ENOSPC) 1746 dev_info(adev->dev, 1747 "Not enough PCI address space for a large BAR."); 1748 else if (r && r != -ENOTSUPP) 1749 dev_err(adev->dev, "Problem resizing BAR0 (%d).", r); 1750 1751 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1752 1753 /* When the doorbell or fb BAR isn't available we have no chance of 1754 * using the device. 1755 */ 1756 r = amdgpu_doorbell_init(adev); 1757 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1758 return -ENODEV; 1759 1760 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1761 1762 return 0; 1763 } 1764 1765 /* 1766 * GPU helpers function. 1767 */ 1768 /** 1769 * amdgpu_device_need_post - check if the hw need post or not 1770 * 1771 * @adev: amdgpu_device pointer 1772 * 1773 * Check if the asic has been initialized (all asics) at driver startup 1774 * or post is needed if hw reset is performed. 1775 * Returns true if need or false if not. 
1776 */ 1777 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1778 { 1779 uint32_t reg, flags; 1780 1781 if (amdgpu_sriov_vf(adev)) 1782 return false; 1783 1784 flags = amdgpu_device_get_vbios_flags(adev); 1785 if (flags & AMDGPU_VBIOS_SKIP) 1786 return false; 1787 if ((flags & AMDGPU_VBIOS_OPTIONAL) && !adev->bios) 1788 return false; 1789 1790 if (amdgpu_passthrough(adev)) { 1791 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1792 * some old smc fw still need driver do vPost otherwise gpu hang, while 1793 * those smc fw version above 22.15 doesn't have this flaw, so we force 1794 * vpost executed for smc version below 22.15 1795 */ 1796 if (adev->asic_type == CHIP_FIJI) { 1797 int err; 1798 uint32_t fw_ver; 1799 1800 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1801 /* force vPost if error occurred */ 1802 if (err) 1803 return true; 1804 1805 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1806 release_firmware(adev->pm.fw); 1807 if (fw_ver < 0x00160e00) 1808 return true; 1809 } 1810 } 1811 1812 /* Don't post if we need to reset whole hive on init */ 1813 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 1814 return false; 1815 1816 if (adev->has_hw_reset) { 1817 adev->has_hw_reset = false; 1818 return true; 1819 } 1820 1821 /* bios scratch used on CIK+ */ 1822 if (adev->asic_type >= CHIP_BONAIRE) 1823 return amdgpu_atombios_scratch_need_asic_init(adev); 1824 1825 /* check MEM_SIZE for older asics */ 1826 reg = amdgpu_asic_get_config_memsize(adev); 1827 1828 if ((reg != 0) && (reg != 0xffffffff)) 1829 return false; 1830 1831 return true; 1832 } 1833 1834 /* 1835 * Check whether seamless boot is supported. 1836 * 1837 * So far we only support seamless boot on DCE 3.0 or later. 1838 * If users report that it works on older ASICS as well, we may 1839 * loosen this. 1840 */ 1841 bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev) 1842 { 1843 switch (amdgpu_seamless) { 1844 case -1: 1845 break; 1846 case 1: 1847 return true; 1848 case 0: 1849 return false; 1850 default: 1851 dev_err(adev->dev, "Invalid value for amdgpu.seamless: %d\n", 1852 amdgpu_seamless); 1853 return false; 1854 } 1855 1856 if (!(adev->flags & AMD_IS_APU)) 1857 return false; 1858 1859 if (adev->mman.keep_stolen_vga_memory) 1860 return false; 1861 1862 return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0); 1863 } 1864 1865 /* 1866 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids 1867 * don't support dynamic speed switching. Until we have confirmation from Intel 1868 * that a specific host supports it, it's safer that we keep it disabled for all. 1869 * 1870 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/ 1871 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663 1872 */ 1873 static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev) 1874 { 1875 #if IS_ENABLED(CONFIG_X86) 1876 struct cpuinfo_x86 *c = &cpu_data(0); 1877 1878 /* eGPU change speeds based on USB4 fabric conditions */ 1879 if (dev_is_removable(adev->dev)) 1880 return true; 1881 1882 if (c->x86_vendor == X86_VENDOR_INTEL) 1883 return false; 1884 #endif 1885 return true; 1886 } 1887 1888 static bool amdgpu_device_aspm_support_quirk(struct amdgpu_device *adev) 1889 { 1890 /* Enabling ASPM causes randoms hangs on Tahiti and Oland on Zen4. 
1891 * It's unclear if this is a platform-specific or GPU-specific issue. 1892 * Disable ASPM on SI for the time being. 1893 */ 1894 if (adev->family == AMDGPU_FAMILY_SI) 1895 return true; 1896 1897 #if IS_ENABLED(CONFIG_X86) 1898 struct cpuinfo_x86 *c = &cpu_data(0); 1899 1900 if (!(amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 0) || 1901 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 1))) 1902 return false; 1903 1904 if (c->x86 == 6 && 1905 adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5) { 1906 switch (c->x86_model) { 1907 case VFM_MODEL(INTEL_ALDERLAKE): 1908 case VFM_MODEL(INTEL_ALDERLAKE_L): 1909 case VFM_MODEL(INTEL_RAPTORLAKE): 1910 case VFM_MODEL(INTEL_RAPTORLAKE_P): 1911 case VFM_MODEL(INTEL_RAPTORLAKE_S): 1912 return true; 1913 default: 1914 return false; 1915 } 1916 } else { 1917 return false; 1918 } 1919 #else 1920 return false; 1921 #endif 1922 } 1923 1924 /** 1925 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1926 * 1927 * @adev: amdgpu_device pointer 1928 * 1929 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1930 * be set for this device. 1931 * 1932 * Returns true if it should be used or false if not. 1933 */ 1934 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1935 { 1936 switch (amdgpu_aspm) { 1937 case -1: 1938 break; 1939 case 0: 1940 return false; 1941 case 1: 1942 return true; 1943 default: 1944 return false; 1945 } 1946 if (adev->flags & AMD_IS_APU) 1947 return false; 1948 if (amdgpu_device_aspm_support_quirk(adev)) 1949 return false; 1950 return pcie_aspm_enabled(adev->pdev); 1951 } 1952 1953 /* if we get transitioned to only one device, take VGA back */ 1954 /** 1955 * amdgpu_device_vga_set_decode - enable/disable vga decode 1956 * 1957 * @pdev: PCI device pointer 1958 * @state: enable/disable vga decode 1959 * 1960 * Enable/disable vga decode (all asics). 1961 * Returns VGA resource flags. 1962 */ 1963 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1964 bool state) 1965 { 1966 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1967 1968 amdgpu_asic_set_vga_state(adev, state); 1969 if (state) 1970 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1971 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1972 else 1973 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1974 } 1975 1976 /** 1977 * amdgpu_device_check_block_size - validate the vm block size 1978 * 1979 * @adev: amdgpu_device pointer 1980 * 1981 * Validates the vm block size specified via module parameter. 1982 * The vm block size defines number of bits in page table versus page directory, 1983 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1984 * page table and the remaining bits are in the page directory. 1985 */ 1986 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1987 { 1988 /* defines number of bits in page table versus page directory, 1989 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1990 * page table and the remaining bits are in the page directory 1991 */ 1992 if (amdgpu_vm_block_size == -1) 1993 return; 1994 1995 if (amdgpu_vm_block_size < 9) { 1996 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1997 amdgpu_vm_block_size); 1998 amdgpu_vm_block_size = -1; 1999 } 2000 } 2001 2002 /** 2003 * amdgpu_device_check_vm_size - validate the vm size 2004 * 2005 * @adev: amdgpu_device pointer 2006 * 2007 * Validates the vm size in GB specified via module parameter. 
2008 * The VM size is the size of the GPU virtual memory space in GB. 2009 */ 2010 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 2011 { 2012 /* no need to check the default value */ 2013 if (amdgpu_vm_size == -1) 2014 return; 2015 2016 if (amdgpu_vm_size < 1) { 2017 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 2018 amdgpu_vm_size); 2019 amdgpu_vm_size = -1; 2020 } 2021 } 2022 2023 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 2024 { 2025 struct sysinfo si; 2026 bool is_os_64 = (sizeof(void *) == 8); 2027 uint64_t total_memory; 2028 uint64_t dram_size_seven_GB = 0x1B8000000; 2029 uint64_t dram_size_three_GB = 0xB8000000; 2030 2031 if (amdgpu_smu_memory_pool_size == 0) 2032 return; 2033 2034 if (!is_os_64) { 2035 dev_warn(adev->dev, "Not 64-bit OS, feature not supported\n"); 2036 goto def_value; 2037 } 2038 si_meminfo(&si); 2039 total_memory = (uint64_t)si.totalram * si.mem_unit; 2040 2041 if ((amdgpu_smu_memory_pool_size == 1) || 2042 (amdgpu_smu_memory_pool_size == 2)) { 2043 if (total_memory < dram_size_three_GB) 2044 goto def_value1; 2045 } else if ((amdgpu_smu_memory_pool_size == 4) || 2046 (amdgpu_smu_memory_pool_size == 8)) { 2047 if (total_memory < dram_size_seven_GB) 2048 goto def_value1; 2049 } else { 2050 dev_warn(adev->dev, "Smu memory pool size not supported\n"); 2051 goto def_value; 2052 } 2053 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 2054 2055 return; 2056 2057 def_value1: 2058 dev_warn(adev->dev, "No enough system memory\n"); 2059 def_value: 2060 adev->pm.smu_prv_buffer_size = 0; 2061 } 2062 2063 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 2064 { 2065 if (!(adev->flags & AMD_IS_APU) || 2066 adev->asic_type < CHIP_RAVEN) 2067 return 0; 2068 2069 switch (adev->asic_type) { 2070 case CHIP_RAVEN: 2071 if (adev->pdev->device == 0x15dd) 2072 adev->apu_flags |= AMD_APU_IS_RAVEN; 2073 if (adev->pdev->device == 0x15d8) 2074 adev->apu_flags |= AMD_APU_IS_PICASSO; 2075 break; 2076 case CHIP_RENOIR: 2077 if ((adev->pdev->device == 0x1636) || 2078 (adev->pdev->device == 0x164c)) 2079 adev->apu_flags |= AMD_APU_IS_RENOIR; 2080 else 2081 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 2082 break; 2083 case CHIP_VANGOGH: 2084 adev->apu_flags |= AMD_APU_IS_VANGOGH; 2085 break; 2086 case CHIP_YELLOW_CARP: 2087 break; 2088 case CHIP_CYAN_SKILLFISH: 2089 if ((adev->pdev->device == 0x13FE) || 2090 (adev->pdev->device == 0x143F)) 2091 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 2092 break; 2093 default: 2094 break; 2095 } 2096 2097 return 0; 2098 } 2099 2100 /** 2101 * amdgpu_device_check_arguments - validate module params 2102 * 2103 * @adev: amdgpu_device pointer 2104 * 2105 * Validates certain module parameters and updates 2106 * the associated values used by the driver (all asics). 
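 *
 * Worked example of the checks below: amdgpu.sched_jobs=6 is not a power of
 * two and is rounded up via roundup_pow_of_two() to 8, while
 * amdgpu.sched_jobs=2 is below the minimum and is bumped to 4;
 * amdgpu.vm_fragment_size=12 is outside the valid range of 4..9 and falls
 * back to -1 (driver default).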
2107 */ 2108 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 2109 { 2110 int i; 2111 2112 if (amdgpu_sched_jobs < 4) { 2113 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 2114 amdgpu_sched_jobs); 2115 amdgpu_sched_jobs = 4; 2116 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 2117 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 2118 amdgpu_sched_jobs); 2119 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 2120 } 2121 2122 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 2123 /* gart size must be greater or equal to 32M */ 2124 dev_warn(adev->dev, "gart size (%d) too small\n", 2125 amdgpu_gart_size); 2126 amdgpu_gart_size = -1; 2127 } 2128 2129 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 2130 /* gtt size must be greater or equal to 32M */ 2131 dev_warn(adev->dev, "gtt size (%d) too small\n", 2132 amdgpu_gtt_size); 2133 amdgpu_gtt_size = -1; 2134 } 2135 2136 /* valid range is between 4 and 9 inclusive */ 2137 if (amdgpu_vm_fragment_size != -1 && 2138 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 2139 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 2140 amdgpu_vm_fragment_size = -1; 2141 } 2142 2143 if (amdgpu_sched_hw_submission < 2) { 2144 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 2145 amdgpu_sched_hw_submission); 2146 amdgpu_sched_hw_submission = 2; 2147 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 2148 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 2149 amdgpu_sched_hw_submission); 2150 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 2151 } 2152 2153 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 2154 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 2155 amdgpu_reset_method = -1; 2156 } 2157 2158 amdgpu_device_check_smu_prv_buffer_size(adev); 2159 2160 amdgpu_device_check_vm_size(adev); 2161 2162 amdgpu_device_check_block_size(adev); 2163 2164 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 2165 2166 for (i = 0; i < MAX_XCP; i++) { 2167 switch (amdgpu_enforce_isolation) { 2168 case -1: 2169 case 0: 2170 default: 2171 /* disable */ 2172 adev->enforce_isolation[i] = AMDGPU_ENFORCE_ISOLATION_DISABLE; 2173 break; 2174 case 1: 2175 /* enable */ 2176 adev->enforce_isolation[i] = 2177 AMDGPU_ENFORCE_ISOLATION_ENABLE; 2178 break; 2179 case 2: 2180 /* enable legacy mode */ 2181 adev->enforce_isolation[i] = 2182 AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY; 2183 break; 2184 case 3: 2185 /* enable only process isolation without submitting cleaner shader */ 2186 adev->enforce_isolation[i] = 2187 AMDGPU_ENFORCE_ISOLATION_NO_CLEANER_SHADER; 2188 break; 2189 } 2190 } 2191 2192 return 0; 2193 } 2194 2195 /** 2196 * amdgpu_switcheroo_set_state - set switcheroo state 2197 * 2198 * @pdev: pci dev pointer 2199 * @state: vga_switcheroo state 2200 * 2201 * Callback for the switcheroo driver. Suspends or resumes 2202 * the asics before or after it is powered up using ACPI methods. 
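 *
 * These callbacks are exposed through the amdgpu_switcheroo_ops table below;
 * the registration itself happens elsewhere in the driver. A minimal sketch
 * of how such a table is typically hooked up (illustrative only, "px" stands
 * in for the real PX capability check):
 *
 *	vga_switcheroo_register_client(adev->pdev, &amdgpu_switcheroo_ops, px);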
2203 */ 2204 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 2205 enum vga_switcheroo_state state) 2206 { 2207 struct drm_device *dev = pci_get_drvdata(pdev); 2208 int r; 2209 2210 if (amdgpu_device_supports_px(drm_to_adev(dev)) && 2211 state == VGA_SWITCHEROO_OFF) 2212 return; 2213 2214 if (state == VGA_SWITCHEROO_ON) { 2215 pr_info("switched on\n"); 2216 /* don't suspend or resume card normally */ 2217 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2218 2219 pci_set_power_state(pdev, PCI_D0); 2220 amdgpu_device_load_pci_state(pdev); 2221 r = pci_enable_device(pdev); 2222 if (r) 2223 dev_warn(&pdev->dev, "pci_enable_device failed (%d)\n", 2224 r); 2225 amdgpu_device_resume(dev, true); 2226 2227 dev->switch_power_state = DRM_SWITCH_POWER_ON; 2228 } else { 2229 dev_info(&pdev->dev, "switched off\n"); 2230 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2231 amdgpu_device_prepare(dev); 2232 amdgpu_device_suspend(dev, true); 2233 amdgpu_device_cache_pci_state(pdev); 2234 /* Shut down the device */ 2235 pci_disable_device(pdev); 2236 pci_set_power_state(pdev, PCI_D3cold); 2237 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 2238 } 2239 } 2240 2241 /** 2242 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 2243 * 2244 * @pdev: pci dev pointer 2245 * 2246 * Callback for the switcheroo driver. Check of the switcheroo 2247 * state can be changed. 2248 * Returns true if the state can be changed, false if not. 2249 */ 2250 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 2251 { 2252 struct drm_device *dev = pci_get_drvdata(pdev); 2253 2254 /* 2255 * FIXME: open_count is protected by drm_global_mutex but that would lead to 2256 * locking inversion with the driver load path. And the access here is 2257 * completely racy anyway. So don't bother with locking for now. 2258 */ 2259 return atomic_read(&dev->open_count) == 0; 2260 } 2261 2262 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 2263 .set_gpu_state = amdgpu_switcheroo_set_state, 2264 .reprobe = NULL, 2265 .can_switch = amdgpu_switcheroo_can_switch, 2266 }; 2267 2268 /** 2269 * amdgpu_device_ip_set_clockgating_state - set the CG state 2270 * 2271 * @dev: amdgpu_device pointer 2272 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2273 * @state: clockgating state (gate or ungate) 2274 * 2275 * Sets the requested clockgating state for all instances of 2276 * the hardware IP specified. 2277 * Returns the error code from the last instance. 2278 */ 2279 int amdgpu_device_ip_set_clockgating_state(void *dev, 2280 enum amd_ip_block_type block_type, 2281 enum amd_clockgating_state state) 2282 { 2283 struct amdgpu_device *adev = dev; 2284 int i, r = 0; 2285 2286 for (i = 0; i < adev->num_ip_blocks; i++) { 2287 if (!adev->ip_blocks[i].status.valid) 2288 continue; 2289 if (adev->ip_blocks[i].version->type != block_type) 2290 continue; 2291 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 2292 continue; 2293 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 2294 &adev->ip_blocks[i], state); 2295 if (r) 2296 dev_err(adev->dev, 2297 "set_clockgating_state of IP block <%s> failed %d\n", 2298 adev->ip_blocks[i].version->funcs->name, r); 2299 } 2300 return r; 2301 } 2302 2303 /** 2304 * amdgpu_device_ip_set_powergating_state - set the PG state 2305 * 2306 * @dev: amdgpu_device pointer 2307 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 
2308 * @state: powergating state (gate or ungate) 2309 * 2310 * Sets the requested powergating state for all instances of 2311 * the hardware IP specified. 2312 * Returns the error code from the last instance. 2313 */ 2314 int amdgpu_device_ip_set_powergating_state(void *dev, 2315 enum amd_ip_block_type block_type, 2316 enum amd_powergating_state state) 2317 { 2318 struct amdgpu_device *adev = dev; 2319 int i, r = 0; 2320 2321 for (i = 0; i < adev->num_ip_blocks; i++) { 2322 if (!adev->ip_blocks[i].status.valid) 2323 continue; 2324 if (adev->ip_blocks[i].version->type != block_type) 2325 continue; 2326 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 2327 continue; 2328 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 2329 &adev->ip_blocks[i], state); 2330 if (r) 2331 dev_err(adev->dev, 2332 "set_powergating_state of IP block <%s> failed %d\n", 2333 adev->ip_blocks[i].version->funcs->name, r); 2334 } 2335 return r; 2336 } 2337 2338 /** 2339 * amdgpu_device_ip_get_clockgating_state - get the CG state 2340 * 2341 * @adev: amdgpu_device pointer 2342 * @flags: clockgating feature flags 2343 * 2344 * Walks the list of IPs on the device and updates the clockgating 2345 * flags for each IP. 2346 * Updates @flags with the feature flags for each hardware IP where 2347 * clockgating is enabled. 2348 */ 2349 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 2350 u64 *flags) 2351 { 2352 int i; 2353 2354 for (i = 0; i < adev->num_ip_blocks; i++) { 2355 if (!adev->ip_blocks[i].status.valid) 2356 continue; 2357 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 2358 adev->ip_blocks[i].version->funcs->get_clockgating_state( 2359 &adev->ip_blocks[i], flags); 2360 } 2361 } 2362 2363 /** 2364 * amdgpu_device_ip_wait_for_idle - wait for idle 2365 * 2366 * @adev: amdgpu_device pointer 2367 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2368 * 2369 * Waits for the request hardware IP to be idle. 2370 * Returns 0 for success or a negative error code on failure. 2371 */ 2372 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 2373 enum amd_ip_block_type block_type) 2374 { 2375 int i, r; 2376 2377 for (i = 0; i < adev->num_ip_blocks; i++) { 2378 if (!adev->ip_blocks[i].status.valid) 2379 continue; 2380 if (adev->ip_blocks[i].version->type == block_type) { 2381 if (adev->ip_blocks[i].version->funcs->wait_for_idle) { 2382 r = adev->ip_blocks[i].version->funcs->wait_for_idle( 2383 &adev->ip_blocks[i]); 2384 if (r) 2385 return r; 2386 } 2387 break; 2388 } 2389 } 2390 return 0; 2391 2392 } 2393 2394 /** 2395 * amdgpu_device_ip_is_hw - is the hardware IP enabled 2396 * 2397 * @adev: amdgpu_device pointer 2398 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2399 * 2400 * Check if the hardware IP is enable or not. 2401 * Returns true if it the IP is enable, false if not. 2402 */ 2403 bool amdgpu_device_ip_is_hw(struct amdgpu_device *adev, 2404 enum amd_ip_block_type block_type) 2405 { 2406 int i; 2407 2408 for (i = 0; i < adev->num_ip_blocks; i++) { 2409 if (adev->ip_blocks[i].version->type == block_type) 2410 return adev->ip_blocks[i].status.hw; 2411 } 2412 return false; 2413 } 2414 2415 /** 2416 * amdgpu_device_ip_is_valid - is the hardware IP valid 2417 * 2418 * @adev: amdgpu_device pointer 2419 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2420 * 2421 * Check if the hardware IP is valid or not. 2422 * Returns true if it the IP is valid, false if not. 
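 *
 * Hypothetical usage sketch combining the query helpers above (not taken
 * from this file): only wait for the GFX block if it is actually valid:
 *
 *	if (amdgpu_device_ip_is_valid(adev, AMD_IP_BLOCK_TYPE_GFX))
 *		r = amdgpu_device_ip_wait_for_idle(adev, AMD_IP_BLOCK_TYPE_GFX);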
2423 */ 2424 bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev, 2425 enum amd_ip_block_type block_type) 2426 { 2427 int i; 2428 2429 for (i = 0; i < adev->num_ip_blocks; i++) { 2430 if (adev->ip_blocks[i].version->type == block_type) 2431 return adev->ip_blocks[i].status.valid; 2432 } 2433 return false; 2434 2435 } 2436 2437 /** 2438 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 2439 * 2440 * @adev: amdgpu_device pointer 2441 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 2442 * 2443 * Returns a pointer to the hardware IP block structure 2444 * if it exists for the asic, otherwise NULL. 2445 */ 2446 struct amdgpu_ip_block * 2447 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 2448 enum amd_ip_block_type type) 2449 { 2450 int i; 2451 2452 for (i = 0; i < adev->num_ip_blocks; i++) 2453 if (adev->ip_blocks[i].version->type == type) 2454 return &adev->ip_blocks[i]; 2455 2456 return NULL; 2457 } 2458 2459 /** 2460 * amdgpu_device_ip_block_version_cmp 2461 * 2462 * @adev: amdgpu_device pointer 2463 * @type: enum amd_ip_block_type 2464 * @major: major version 2465 * @minor: minor version 2466 * 2467 * return 0 if equal or greater 2468 * return 1 if smaller or the ip_block doesn't exist 2469 */ 2470 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 2471 enum amd_ip_block_type type, 2472 u32 major, u32 minor) 2473 { 2474 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 2475 2476 if (ip_block && ((ip_block->version->major > major) || 2477 ((ip_block->version->major == major) && 2478 (ip_block->version->minor >= minor)))) 2479 return 0; 2480 2481 return 1; 2482 } 2483 2484 static const char *ip_block_names[] = { 2485 [AMD_IP_BLOCK_TYPE_COMMON] = "common", 2486 [AMD_IP_BLOCK_TYPE_GMC] = "gmc", 2487 [AMD_IP_BLOCK_TYPE_IH] = "ih", 2488 [AMD_IP_BLOCK_TYPE_SMC] = "smu", 2489 [AMD_IP_BLOCK_TYPE_PSP] = "psp", 2490 [AMD_IP_BLOCK_TYPE_DCE] = "dce", 2491 [AMD_IP_BLOCK_TYPE_GFX] = "gfx", 2492 [AMD_IP_BLOCK_TYPE_SDMA] = "sdma", 2493 [AMD_IP_BLOCK_TYPE_UVD] = "uvd", 2494 [AMD_IP_BLOCK_TYPE_VCE] = "vce", 2495 [AMD_IP_BLOCK_TYPE_ACP] = "acp", 2496 [AMD_IP_BLOCK_TYPE_VCN] = "vcn", 2497 [AMD_IP_BLOCK_TYPE_MES] = "mes", 2498 [AMD_IP_BLOCK_TYPE_JPEG] = "jpeg", 2499 [AMD_IP_BLOCK_TYPE_VPE] = "vpe", 2500 [AMD_IP_BLOCK_TYPE_UMSCH_MM] = "umsch_mm", 2501 [AMD_IP_BLOCK_TYPE_ISP] = "isp", 2502 [AMD_IP_BLOCK_TYPE_RAS] = "ras", 2503 }; 2504 2505 static const char *ip_block_name(struct amdgpu_device *adev, enum amd_ip_block_type type) 2506 { 2507 int idx = (int)type; 2508 2509 return idx < ARRAY_SIZE(ip_block_names) ? ip_block_names[idx] : "unknown"; 2510 } 2511 2512 /** 2513 * amdgpu_device_ip_block_add 2514 * 2515 * @adev: amdgpu_device pointer 2516 * @ip_block_version: pointer to the IP to add 2517 * 2518 * Adds the IP block driver information to the collection of IPs 2519 * on the asic. 
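 *
 * A typical caller is an ASIC setup routine that registers its IP list in
 * order, for example (sketch only, the block names are illustrative):
 *
 *	amdgpu_device_ip_block_add(adev, &vi_common_ip_block);
 *	amdgpu_device_ip_block_add(adev, &gmc_v8_0_ip_block);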
2520 */ 2521 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 2522 const struct amdgpu_ip_block_version *ip_block_version) 2523 { 2524 if (!ip_block_version) 2525 return -EINVAL; 2526 2527 switch (ip_block_version->type) { 2528 case AMD_IP_BLOCK_TYPE_VCN: 2529 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2530 return 0; 2531 break; 2532 case AMD_IP_BLOCK_TYPE_JPEG: 2533 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2534 return 0; 2535 break; 2536 default: 2537 break; 2538 } 2539 2540 dev_info(adev->dev, "detected ip block number %d <%s_v%d_%d_%d> (%s)\n", 2541 adev->num_ip_blocks, 2542 ip_block_name(adev, ip_block_version->type), 2543 ip_block_version->major, 2544 ip_block_version->minor, 2545 ip_block_version->rev, 2546 ip_block_version->funcs->name); 2547 2548 adev->ip_blocks[adev->num_ip_blocks].adev = adev; 2549 2550 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2551 2552 return 0; 2553 } 2554 2555 /** 2556 * amdgpu_device_enable_virtual_display - enable virtual display feature 2557 * 2558 * @adev: amdgpu_device pointer 2559 * 2560 * Enabled the virtual display feature if the user has enabled it via 2561 * the module parameter virtual_display. This feature provides a virtual 2562 * display hardware on headless boards or in virtualized environments. 2563 * This function parses and validates the configuration string specified by 2564 * the user and configures the virtual display configuration (number of 2565 * virtual connectors, crtcs, etc.) specified. 2566 */ 2567 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2568 { 2569 adev->enable_virtual_display = false; 2570 2571 if (amdgpu_virtual_display) { 2572 const char *pci_address_name = pci_name(adev->pdev); 2573 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2574 2575 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2576 pciaddstr_tmp = pciaddstr; 2577 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2578 pciaddname = strsep(&pciaddname_tmp, ","); 2579 if (!strcmp("all", pciaddname) 2580 || !strcmp(pci_address_name, pciaddname)) { 2581 long num_crtc; 2582 int res = -1; 2583 2584 adev->enable_virtual_display = true; 2585 2586 if (pciaddname_tmp) 2587 res = kstrtol(pciaddname_tmp, 10, 2588 &num_crtc); 2589 2590 if (!res) { 2591 if (num_crtc < 1) 2592 num_crtc = 1; 2593 if (num_crtc > 6) 2594 num_crtc = 6; 2595 adev->mode_info.num_crtc = num_crtc; 2596 } else { 2597 adev->mode_info.num_crtc = 1; 2598 } 2599 break; 2600 } 2601 } 2602 2603 dev_info( 2604 adev->dev, 2605 "virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2606 amdgpu_virtual_display, pci_address_name, 2607 adev->enable_virtual_display, adev->mode_info.num_crtc); 2608 2609 kfree(pciaddstr); 2610 } 2611 } 2612 2613 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2614 { 2615 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2616 adev->mode_info.num_crtc = 1; 2617 adev->enable_virtual_display = true; 2618 dev_info(adev->dev, "virtual_display:%d, num_crtc:%d\n", 2619 adev->enable_virtual_display, 2620 adev->mode_info.num_crtc); 2621 } 2622 } 2623 2624 /** 2625 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2626 * 2627 * @adev: amdgpu_device pointer 2628 * 2629 * Parses the asic configuration parameters specified in the gpu info 2630 * firmware and makes them available to the driver for use in configuring 2631 * the asic. 2632 * Returns 0 on success, -EINVAL on failure. 
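 *
 * The firmware file is requested as "amdgpu/%s_gpu_info.bin"; on Vega10, for
 * example, that resolves to "amdgpu/vega10_gpu_info.bin". ASICs that carry
 * an IP discovery table (e.g. Navi12, Cyan Skillfish) skip the request
 * entirely.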
2633 */ 2634 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2635 { 2636 const char *chip_name; 2637 int err; 2638 const struct gpu_info_firmware_header_v1_0 *hdr; 2639 2640 adev->firmware.gpu_info_fw = NULL; 2641 2642 switch (adev->asic_type) { 2643 default: 2644 return 0; 2645 case CHIP_VEGA10: 2646 chip_name = "vega10"; 2647 break; 2648 case CHIP_VEGA12: 2649 chip_name = "vega12"; 2650 break; 2651 case CHIP_RAVEN: 2652 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2653 chip_name = "raven2"; 2654 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2655 chip_name = "picasso"; 2656 else 2657 chip_name = "raven"; 2658 break; 2659 case CHIP_ARCTURUS: 2660 chip_name = "arcturus"; 2661 break; 2662 case CHIP_NAVI12: 2663 if (adev->discovery.bin) 2664 return 0; 2665 chip_name = "navi12"; 2666 break; 2667 case CHIP_CYAN_SKILLFISH: 2668 if (adev->discovery.bin) 2669 return 0; 2670 chip_name = "cyan_skillfish"; 2671 break; 2672 } 2673 2674 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, 2675 AMDGPU_UCODE_OPTIONAL, 2676 "amdgpu/%s_gpu_info.bin", chip_name); 2677 if (err) { 2678 dev_err(adev->dev, 2679 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n", 2680 chip_name); 2681 goto out; 2682 } 2683 2684 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2685 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2686 2687 switch (hdr->version_major) { 2688 case 1: 2689 { 2690 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2691 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2692 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2693 2694 /* 2695 * Should be dropped when DAL no longer needs it. 2696 */ 2697 if (adev->asic_type == CHIP_NAVI12) 2698 goto parse_soc_bounding_box; 2699 2700 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2701 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2702 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2703 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2704 adev->gfx.config.max_texture_channel_caches = 2705 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2706 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2707 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2708 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2709 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2710 adev->gfx.config.double_offchip_lds_buf = 2711 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2712 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2713 adev->gfx.cu_info.max_waves_per_simd = 2714 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2715 adev->gfx.cu_info.max_scratch_slots_per_cu = 2716 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2717 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2718 if (hdr->version_minor >= 1) { 2719 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2720 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2721 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2722 adev->gfx.config.num_sc_per_sh = 2723 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2724 adev->gfx.config.num_packer_per_sc = 2725 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2726 } 2727 2728 parse_soc_bounding_box: 2729 /* 2730 * soc bounding box info is not integrated in disocovery table, 2731 * we 
always need to parse it from gpu info firmware if needed. 2732 */ 2733 if (hdr->version_minor == 2) { 2734 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2735 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2736 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2737 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2738 } 2739 break; 2740 } 2741 default: 2742 dev_err(adev->dev, 2743 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2744 err = -EINVAL; 2745 goto out; 2746 } 2747 out: 2748 return err; 2749 } 2750 2751 static void amdgpu_uid_init(struct amdgpu_device *adev) 2752 { 2753 /* Initialize the UID for the device */ 2754 adev->uid_info = kzalloc(sizeof(struct amdgpu_uid), GFP_KERNEL); 2755 if (!adev->uid_info) { 2756 dev_warn(adev->dev, "Failed to allocate memory for UID\n"); 2757 return; 2758 } 2759 adev->uid_info->adev = adev; 2760 } 2761 2762 static void amdgpu_uid_fini(struct amdgpu_device *adev) 2763 { 2764 /* Free the UID memory */ 2765 kfree(adev->uid_info); 2766 adev->uid_info = NULL; 2767 } 2768 2769 /** 2770 * amdgpu_device_ip_early_init - run early init for hardware IPs 2771 * 2772 * @adev: amdgpu_device pointer 2773 * 2774 * Early initialization pass for hardware IPs. The hardware IPs that make 2775 * up each asic are discovered each IP's early_init callback is run. This 2776 * is the first stage in initializing the asic. 2777 * Returns 0 on success, negative error code on failure. 2778 */ 2779 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2780 { 2781 struct amdgpu_ip_block *ip_block; 2782 struct pci_dev *parent; 2783 bool total, skip_bios; 2784 uint32_t bios_flags; 2785 int i, r; 2786 2787 amdgpu_device_enable_virtual_display(adev); 2788 2789 if (amdgpu_sriov_vf(adev)) { 2790 r = amdgpu_virt_request_full_gpu(adev, true); 2791 if (r) 2792 return r; 2793 2794 r = amdgpu_virt_init_critical_region(adev); 2795 if (r) 2796 return r; 2797 } 2798 2799 switch (adev->asic_type) { 2800 #ifdef CONFIG_DRM_AMDGPU_SI 2801 case CHIP_VERDE: 2802 case CHIP_TAHITI: 2803 case CHIP_PITCAIRN: 2804 case CHIP_OLAND: 2805 case CHIP_HAINAN: 2806 adev->family = AMDGPU_FAMILY_SI; 2807 r = si_set_ip_blocks(adev); 2808 if (r) 2809 return r; 2810 break; 2811 #endif 2812 #ifdef CONFIG_DRM_AMDGPU_CIK 2813 case CHIP_BONAIRE: 2814 case CHIP_HAWAII: 2815 case CHIP_KAVERI: 2816 case CHIP_KABINI: 2817 case CHIP_MULLINS: 2818 if (adev->flags & AMD_IS_APU) 2819 adev->family = AMDGPU_FAMILY_KV; 2820 else 2821 adev->family = AMDGPU_FAMILY_CI; 2822 2823 r = cik_set_ip_blocks(adev); 2824 if (r) 2825 return r; 2826 break; 2827 #endif 2828 case CHIP_TOPAZ: 2829 case CHIP_TONGA: 2830 case CHIP_FIJI: 2831 case CHIP_POLARIS10: 2832 case CHIP_POLARIS11: 2833 case CHIP_POLARIS12: 2834 case CHIP_VEGAM: 2835 case CHIP_CARRIZO: 2836 case CHIP_STONEY: 2837 if (adev->flags & AMD_IS_APU) 2838 adev->family = AMDGPU_FAMILY_CZ; 2839 else 2840 adev->family = AMDGPU_FAMILY_VI; 2841 2842 r = vi_set_ip_blocks(adev); 2843 if (r) 2844 return r; 2845 break; 2846 default: 2847 r = amdgpu_discovery_set_ip_blocks(adev); 2848 if (r) 2849 return r; 2850 break; 2851 } 2852 2853 /* Check for IP version 9.4.3 with A0 hardware */ 2854 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) && 2855 !amdgpu_device_get_rev_id(adev)) { 2856 dev_err(adev->dev, "Unsupported A0 hardware\n"); 2857 return -ENODEV; /* device unsupported - no device error */ 2858 } 2859 2860 if (amdgpu_has_atpx() && 2861 (amdgpu_is_atpx_hybrid() || 2862 amdgpu_has_atpx_dgpu_power_cntl()) && 
2863 ((adev->flags & AMD_IS_APU) == 0) && 2864 !dev_is_removable(&adev->pdev->dev)) 2865 adev->flags |= AMD_IS_PX; 2866 2867 if (!(adev->flags & AMD_IS_APU)) { 2868 parent = pcie_find_root_port(adev->pdev); 2869 adev->has_pr3 = parent ? pci_pr3_present(parent) : false; 2870 } 2871 2872 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2873 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2874 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2875 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2876 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2877 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2878 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2879 2880 adev->virt.is_xgmi_node_migrate_enabled = false; 2881 if (amdgpu_sriov_vf(adev)) { 2882 adev->virt.is_xgmi_node_migrate_enabled = 2883 amdgpu_ip_version((adev), GC_HWIP, 0) == IP_VERSION(9, 4, 4); 2884 } 2885 2886 total = true; 2887 for (i = 0; i < adev->num_ip_blocks; i++) { 2888 ip_block = &adev->ip_blocks[i]; 2889 2890 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2891 dev_warn(adev->dev, "disabled ip block: %d <%s>\n", i, 2892 adev->ip_blocks[i].version->funcs->name); 2893 adev->ip_blocks[i].status.valid = false; 2894 } else if (ip_block->version->funcs->early_init) { 2895 r = ip_block->version->funcs->early_init(ip_block); 2896 if (r == -ENOENT) { 2897 adev->ip_blocks[i].status.valid = false; 2898 } else if (r) { 2899 dev_err(adev->dev, 2900 "early_init of IP block <%s> failed %d\n", 2901 adev->ip_blocks[i].version->funcs->name, 2902 r); 2903 total = false; 2904 } else { 2905 adev->ip_blocks[i].status.valid = true; 2906 } 2907 } else { 2908 adev->ip_blocks[i].status.valid = true; 2909 } 2910 /* get the vbios after the asic_funcs are set up */ 2911 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2912 r = amdgpu_device_parse_gpu_info_fw(adev); 2913 if (r) 2914 return r; 2915 2916 bios_flags = amdgpu_device_get_vbios_flags(adev); 2917 skip_bios = !!(bios_flags & AMDGPU_VBIOS_SKIP); 2918 /* Read BIOS */ 2919 if (!skip_bios) { 2920 bool optional = 2921 !!(bios_flags & AMDGPU_VBIOS_OPTIONAL); 2922 if (!amdgpu_get_bios(adev) && !optional) 2923 return -EINVAL; 2924 2925 if (optional && !adev->bios) 2926 dev_info( 2927 adev->dev, 2928 "VBIOS image optional, proceeding without VBIOS image"); 2929 2930 if (adev->bios) { 2931 r = amdgpu_atombios_init(adev); 2932 if (r) { 2933 dev_err(adev->dev, 2934 "amdgpu_atombios_init failed\n"); 2935 amdgpu_vf_error_put( 2936 adev, 2937 AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 2938 0, 0); 2939 return r; 2940 } 2941 } 2942 } 2943 2944 /*get pf2vf msg info at it's earliest time*/ 2945 if (amdgpu_sriov_vf(adev)) 2946 amdgpu_virt_init_data_exchange(adev); 2947 2948 } 2949 } 2950 if (!total) 2951 return -ENODEV; 2952 2953 if (adev->gmc.xgmi.supported) 2954 amdgpu_xgmi_early_init(adev); 2955 2956 if (amdgpu_is_multi_aid(adev)) 2957 amdgpu_uid_init(adev); 2958 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 2959 if (ip_block->status.valid != false) 2960 amdgpu_amdkfd_device_probe(adev); 2961 2962 adev->cg_flags &= amdgpu_cg_mask; 2963 adev->pg_flags &= amdgpu_pg_mask; 2964 2965 return 0; 2966 } 2967 2968 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2969 { 2970 int i, r; 2971 2972 for (i = 0; i < adev->num_ip_blocks; i++) { 2973 if (!adev->ip_blocks[i].status.sw) 2974 continue; 2975 if (adev->ip_blocks[i].status.hw) 2976 continue; 2977 if (!amdgpu_ip_member_of_hwini( 2978 adev, adev->ip_blocks[i].version->type)) 2979 
continue; 2980 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2981 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2982 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2983 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2984 if (r) { 2985 dev_err(adev->dev, 2986 "hw_init of IP block <%s> failed %d\n", 2987 adev->ip_blocks[i].version->funcs->name, 2988 r); 2989 return r; 2990 } 2991 adev->ip_blocks[i].status.hw = true; 2992 } 2993 } 2994 2995 return 0; 2996 } 2997 2998 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2999 { 3000 int i, r; 3001 3002 for (i = 0; i < adev->num_ip_blocks; i++) { 3003 if (!adev->ip_blocks[i].status.sw) 3004 continue; 3005 if (adev->ip_blocks[i].status.hw) 3006 continue; 3007 if (!amdgpu_ip_member_of_hwini( 3008 adev, adev->ip_blocks[i].version->type)) 3009 continue; 3010 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3011 if (r) { 3012 dev_err(adev->dev, 3013 "hw_init of IP block <%s> failed %d\n", 3014 adev->ip_blocks[i].version->funcs->name, r); 3015 return r; 3016 } 3017 adev->ip_blocks[i].status.hw = true; 3018 } 3019 3020 return 0; 3021 } 3022 3023 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 3024 { 3025 int r = 0; 3026 int i; 3027 uint32_t smu_version; 3028 3029 if (adev->asic_type >= CHIP_VEGA10) { 3030 for (i = 0; i < adev->num_ip_blocks; i++) { 3031 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 3032 continue; 3033 3034 if (!amdgpu_ip_member_of_hwini(adev, 3035 AMD_IP_BLOCK_TYPE_PSP)) 3036 break; 3037 3038 if (!adev->ip_blocks[i].status.sw) 3039 continue; 3040 3041 /* no need to do the fw loading again if already done*/ 3042 if (adev->ip_blocks[i].status.hw == true) 3043 break; 3044 3045 if (amdgpu_in_reset(adev) || adev->in_suspend) { 3046 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3047 if (r) 3048 return r; 3049 } else { 3050 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3051 if (r) { 3052 dev_err(adev->dev, 3053 "hw_init of IP block <%s> failed %d\n", 3054 adev->ip_blocks[i] 3055 .version->funcs->name, 3056 r); 3057 return r; 3058 } 3059 adev->ip_blocks[i].status.hw = true; 3060 } 3061 break; 3062 } 3063 } 3064 3065 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 3066 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 3067 3068 return r; 3069 } 3070 3071 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 3072 { 3073 struct drm_sched_init_args args = { 3074 .ops = &amdgpu_sched_ops, 3075 .num_rqs = DRM_SCHED_PRIORITY_COUNT, 3076 .timeout_wq = adev->reset_domain->wq, 3077 .dev = adev->dev, 3078 }; 3079 long timeout; 3080 int r, i; 3081 3082 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 3083 struct amdgpu_ring *ring = adev->rings[i]; 3084 3085 /* No need to setup the GPU scheduler for rings that don't need it */ 3086 if (!ring || ring->no_scheduler) 3087 continue; 3088 3089 switch (ring->funcs->type) { 3090 case AMDGPU_RING_TYPE_GFX: 3091 timeout = adev->gfx_timeout; 3092 break; 3093 case AMDGPU_RING_TYPE_COMPUTE: 3094 timeout = adev->compute_timeout; 3095 break; 3096 case AMDGPU_RING_TYPE_SDMA: 3097 timeout = adev->sdma_timeout; 3098 break; 3099 default: 3100 timeout = adev->video_timeout; 3101 break; 3102 } 3103 3104 args.timeout = timeout; 3105 args.credit_limit = ring->num_hw_submission; 3106 args.score = ring->sched_score; 3107 args.name = ring->name; 3108 3109 r = drm_sched_init(&ring->sched, &args); 3110 if (r) { 3111 
dev_err(adev->dev, 3112 "Failed to create scheduler on ring %s.\n", 3113 ring->name); 3114 return r; 3115 } 3116 r = amdgpu_uvd_entity_init(adev, ring); 3117 if (r) { 3118 dev_err(adev->dev, 3119 "Failed to create UVD scheduling entity on ring %s.\n", 3120 ring->name); 3121 return r; 3122 } 3123 r = amdgpu_vce_entity_init(adev, ring); 3124 if (r) { 3125 dev_err(adev->dev, 3126 "Failed to create VCE scheduling entity on ring %s.\n", 3127 ring->name); 3128 return r; 3129 } 3130 } 3131 3132 if (adev->xcp_mgr) 3133 amdgpu_xcp_update_partition_sched_list(adev); 3134 3135 return 0; 3136 } 3137 3138 3139 /** 3140 * amdgpu_device_ip_init - run init for hardware IPs 3141 * 3142 * @adev: amdgpu_device pointer 3143 * 3144 * Main initialization pass for hardware IPs. The list of all the hardware 3145 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 3146 * are run. sw_init initializes the software state associated with each IP 3147 * and hw_init initializes the hardware associated with each IP. 3148 * Returns 0 on success, negative error code on failure. 3149 */ 3150 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 3151 { 3152 bool init_badpage; 3153 int i, r; 3154 3155 r = amdgpu_ras_init(adev); 3156 if (r) 3157 return r; 3158 3159 for (i = 0; i < adev->num_ip_blocks; i++) { 3160 if (!adev->ip_blocks[i].status.valid) 3161 continue; 3162 if (adev->ip_blocks[i].version->funcs->sw_init) { 3163 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]); 3164 if (r) { 3165 dev_err(adev->dev, 3166 "sw_init of IP block <%s> failed %d\n", 3167 adev->ip_blocks[i].version->funcs->name, 3168 r); 3169 goto init_failed; 3170 } 3171 } 3172 adev->ip_blocks[i].status.sw = true; 3173 3174 if (!amdgpu_ip_member_of_hwini( 3175 adev, adev->ip_blocks[i].version->type)) 3176 continue; 3177 3178 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 3179 /* need to do common hw init early so everything is set up for gmc */ 3180 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3181 if (r) { 3182 dev_err(adev->dev, "hw_init %d failed %d\n", i, 3183 r); 3184 goto init_failed; 3185 } 3186 adev->ip_blocks[i].status.hw = true; 3187 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3188 /* need to do gmc hw init early so we can allocate gpu mem */ 3189 /* Try to reserve bad pages early */ 3190 if (amdgpu_sriov_vf(adev)) 3191 amdgpu_virt_exchange_data(adev); 3192 3193 r = amdgpu_device_mem_scratch_init(adev); 3194 if (r) { 3195 dev_err(adev->dev, 3196 "amdgpu_mem_scratch_init failed %d\n", 3197 r); 3198 goto init_failed; 3199 } 3200 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3201 if (r) { 3202 dev_err(adev->dev, "hw_init %d failed %d\n", i, 3203 r); 3204 goto init_failed; 3205 } 3206 r = amdgpu_device_wb_init(adev); 3207 if (r) { 3208 dev_err(adev->dev, 3209 "amdgpu_device_wb_init failed %d\n", r); 3210 goto init_failed; 3211 } 3212 adev->ip_blocks[i].status.hw = true; 3213 3214 /* right after GMC hw init, we create CSA */ 3215 if (adev->gfx.mcbp) { 3216 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 3217 AMDGPU_GEM_DOMAIN_VRAM | 3218 AMDGPU_GEM_DOMAIN_GTT, 3219 AMDGPU_CSA_SIZE); 3220 if (r) { 3221 dev_err(adev->dev, 3222 "allocate CSA failed %d\n", r); 3223 goto init_failed; 3224 } 3225 } 3226 3227 r = amdgpu_seq64_init(adev); 3228 if (r) { 3229 dev_err(adev->dev, "allocate seq64 failed %d\n", 3230 r); 3231 goto init_failed; 3232 } 3233 } 3234 } 3235 3236 if (amdgpu_sriov_vf(adev)) 3237 
amdgpu_virt_init_data_exchange(adev); 3238 3239 r = amdgpu_ib_pool_init(adev); 3240 if (r) { 3241 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 3242 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 3243 goto init_failed; 3244 } 3245 3246 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 3247 if (r) 3248 goto init_failed; 3249 3250 r = amdgpu_device_ip_hw_init_phase1(adev); 3251 if (r) 3252 goto init_failed; 3253 3254 r = amdgpu_device_fw_loading(adev); 3255 if (r) 3256 goto init_failed; 3257 3258 r = amdgpu_device_ip_hw_init_phase2(adev); 3259 if (r) 3260 goto init_failed; 3261 3262 /* 3263 * retired pages will be loaded from eeprom and reserved here, 3264 * it should be called after amdgpu_device_ip_hw_init_phase2 since 3265 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 3266 * for I2C communication which only true at this point. 3267 * 3268 * amdgpu_ras_recovery_init may fail, but the upper only cares the 3269 * failure from bad gpu situation and stop amdgpu init process 3270 * accordingly. For other failed cases, it will still release all 3271 * the resource and print error message, rather than returning one 3272 * negative value to upper level. 3273 * 3274 * Note: theoretically, this should be called before all vram allocations 3275 * to protect retired page from abusing 3276 */ 3277 init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 3278 r = amdgpu_ras_recovery_init(adev, init_badpage); 3279 if (r) 3280 goto init_failed; 3281 3282 /** 3283 * In case of XGMI grab extra reference for reset domain for this device 3284 */ 3285 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3286 if (amdgpu_xgmi_add_device(adev) == 0) { 3287 if (!amdgpu_sriov_vf(adev)) { 3288 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3289 3290 if (WARN_ON(!hive)) { 3291 r = -ENOENT; 3292 goto init_failed; 3293 } 3294 3295 if (!hive->reset_domain || 3296 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 3297 r = -ENOENT; 3298 amdgpu_put_xgmi_hive(hive); 3299 goto init_failed; 3300 } 3301 3302 /* Drop the early temporary reset domain we created for device */ 3303 amdgpu_reset_put_reset_domain(adev->reset_domain); 3304 adev->reset_domain = hive->reset_domain; 3305 amdgpu_put_xgmi_hive(hive); 3306 } 3307 } 3308 } 3309 3310 r = amdgpu_device_init_schedulers(adev); 3311 if (r) 3312 goto init_failed; 3313 3314 if (adev->mman.buffer_funcs_ring->sched.ready) 3315 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3316 3317 /* Don't init kfd if whole hive need to be reset during init */ 3318 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 3319 kgd2kfd_init_zone_device(adev); 3320 amdgpu_amdkfd_device_init(adev); 3321 } 3322 3323 amdgpu_fru_get_product_info(adev); 3324 3325 if (!amdgpu_sriov_vf(adev) || amdgpu_sriov_ras_cper_en(adev)) 3326 r = amdgpu_cper_init(adev); 3327 3328 init_failed: 3329 3330 return r; 3331 } 3332 3333 /** 3334 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 3335 * 3336 * @adev: amdgpu_device pointer 3337 * 3338 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 3339 * this function before a GPU reset. If the value is retained after a 3340 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 
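 *
 * Concretely, the magic is the first AMDGPU_RESET_MAGIC_NUM bytes of the
 * GART table copied into adev->reset_magic; after a reset,
 * amdgpu_device_check_vram_lost() compares the same region to decide whether
 * VRAM contents survived.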
3341 */ 3342 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 3343 { 3344 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 3345 } 3346 3347 /** 3348 * amdgpu_device_check_vram_lost - check if vram is valid 3349 * 3350 * @adev: amdgpu_device pointer 3351 * 3352 * Checks the reset magic value written to the gart pointer in VRAM. 3353 * The driver calls this after a GPU reset to see if the contents of 3354 * VRAM is lost or now. 3355 * returns true if vram is lost, false if not. 3356 */ 3357 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 3358 { 3359 if (memcmp(adev->gart.ptr, adev->reset_magic, 3360 AMDGPU_RESET_MAGIC_NUM)) 3361 return true; 3362 3363 if (!amdgpu_in_reset(adev)) 3364 return false; 3365 3366 /* 3367 * For all ASICs with baco/mode1 reset, the VRAM is 3368 * always assumed to be lost. 3369 */ 3370 switch (amdgpu_asic_reset_method(adev)) { 3371 case AMD_RESET_METHOD_LEGACY: 3372 case AMD_RESET_METHOD_LINK: 3373 case AMD_RESET_METHOD_BACO: 3374 case AMD_RESET_METHOD_MODE1: 3375 return true; 3376 default: 3377 return false; 3378 } 3379 } 3380 3381 /** 3382 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 3383 * 3384 * @adev: amdgpu_device pointer 3385 * @state: clockgating state (gate or ungate) 3386 * 3387 * The list of all the hardware IPs that make up the asic is walked and the 3388 * set_clockgating_state callbacks are run. 3389 * Late initialization pass enabling clockgating for hardware IPs. 3390 * Fini or suspend, pass disabling clockgating for hardware IPs. 3391 * Returns 0 on success, negative error code on failure. 3392 */ 3393 3394 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 3395 enum amd_clockgating_state state) 3396 { 3397 int i, j, r; 3398 3399 if (amdgpu_emu_mode == 1) 3400 return 0; 3401 3402 for (j = 0; j < adev->num_ip_blocks; j++) { 3403 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3404 if (!adev->ip_blocks[i].status.late_initialized) 3405 continue; 3406 /* skip CG for GFX, SDMA on S0ix */ 3407 if (adev->in_s0ix && 3408 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3409 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3410 continue; 3411 /* skip CG for VCE/UVD, it's handled specially */ 3412 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3413 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3414 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3415 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3416 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 3417 /* enable clockgating to save power */ 3418 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i], 3419 state); 3420 if (r) { 3421 dev_err(adev->dev, 3422 "set_clockgating_state(gate) of IP block <%s> failed %d\n", 3423 adev->ip_blocks[i].version->funcs->name, 3424 r); 3425 return r; 3426 } 3427 } 3428 } 3429 3430 return 0; 3431 } 3432 3433 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 3434 enum amd_powergating_state state) 3435 { 3436 int i, j, r; 3437 3438 if (amdgpu_emu_mode == 1) 3439 return 0; 3440 3441 for (j = 0; j < adev->num_ip_blocks; j++) { 3442 i = state == AMD_PG_STATE_GATE ? 
j : adev->num_ip_blocks - j - 1;
3443 if (!adev->ip_blocks[i].status.late_initialized)
3444 continue;
3445 /* skip PG for GFX, SDMA on S0ix */
3446 if (adev->in_s0ix &&
3447 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3448 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3449 continue;
3450 /* skip PG for VCE/UVD, it's handled specially */
3451 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
3452 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
3453 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
3454 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
3455 adev->ip_blocks[i].version->funcs->set_powergating_state) {
3456 /* enable powergating to save power */
3457 r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i],
3458 state);
3459 if (r) {
3460 dev_err(adev->dev,
3461 "set_powergating_state(gate) of IP block <%s> failed %d\n",
3462 adev->ip_blocks[i].version->funcs->name,
3463 r);
3464 return r;
3465 }
3466 }
3467 }
3468 return 0;
3469 }
3470
3471 static int amdgpu_device_enable_mgpu_fan_boost(void)
3472 {
3473 struct amdgpu_gpu_instance *gpu_ins;
3474 struct amdgpu_device *adev;
3475 int i, ret = 0;
3476
3477 mutex_lock(&mgpu_info.mutex);
3478
3479 /*
3480 * MGPU fan boost feature should be enabled
3481 * only when there are two or more dGPUs in
3482 * the system
3483 */
3484 if (mgpu_info.num_dgpu < 2)
3485 goto out;
3486
3487 for (i = 0; i < mgpu_info.num_dgpu; i++) {
3488 gpu_ins = &(mgpu_info.gpu_ins[i]);
3489 adev = gpu_ins->adev;
3490 if (!(adev->flags & AMD_IS_APU || amdgpu_sriov_multi_vf_mode(adev)) &&
3491 !gpu_ins->mgpu_fan_enabled) {
3492 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
3493 if (ret)
3494 break;
3495
3496 gpu_ins->mgpu_fan_enabled = 1;
3497 }
3498 }
3499
3500 out:
3501 mutex_unlock(&mgpu_info.mutex);
3502
3503 return ret;
3504 }
3505
3506 /**
3507 * amdgpu_device_ip_late_init - run late init for hardware IPs
3508 *
3509 * @adev: amdgpu_device pointer
3510 *
3511 * Late initialization pass for hardware IPs. The list of all the hardware
3512 * IPs that make up the asic is walked and the late_init callbacks are run.
3513 * late_init covers any special initialization that an IP requires
3514 * after all of them have been initialized or something that needs to happen
3515 * late in the init process.
3516 * Returns 0 on success, negative error code on failure.
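 *
 * Note that this is also the point where clockgating and powergating are
 * enabled for the whole device, via amdgpu_device_set_cg_state(adev,
 * AMD_CG_STATE_GATE) and amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE),
 * once every block's late_init has completed.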
3517 */
3518 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
3519 {
3520 struct amdgpu_gpu_instance *gpu_instance;
3521 int i = 0, r;
3522
3523 for (i = 0; i < adev->num_ip_blocks; i++) {
3524 if (!adev->ip_blocks[i].status.hw)
3525 continue;
3526 if (adev->ip_blocks[i].version->funcs->late_init) {
3527 r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]);
3528 if (r) {
3529 dev_err(adev->dev,
3530 "late_init of IP block <%s> failed %d\n",
3531 adev->ip_blocks[i].version->funcs->name,
3532 r);
3533 return r;
3534 }
3535 }
3536 adev->ip_blocks[i].status.late_initialized = true;
3537 }
3538
3539 r = amdgpu_ras_late_init(adev);
3540 if (r) {
3541 dev_err(adev->dev, "amdgpu_ras_late_init failed %d", r);
3542 return r;
3543 }
3544
3545 if (!amdgpu_reset_in_recovery(adev))
3546 amdgpu_ras_set_error_query_ready(adev, true);
3547
3548 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
3549 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
3550
3551 amdgpu_device_fill_reset_magic(adev);
3552
3553 r = amdgpu_device_enable_mgpu_fan_boost();
3554 if (r)
3555 dev_err(adev->dev, "enable mgpu fan boost failed (%d).\n", r);
3556
3557 /* For passthrough configuration on arcturus and aldebaran, enable special handling of SBR */
3558 if (amdgpu_passthrough(adev) &&
3559 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
3560 adev->asic_type == CHIP_ALDEBARAN))
3561 amdgpu_dpm_handle_passthrough_sbr(adev, true);
3562
3563 if (adev->gmc.xgmi.num_physical_nodes > 1) {
3564 mutex_lock(&mgpu_info.mutex);
3565
3566 /*
3567 * Reset the device p-state to low, as it was booted with high.
3568 *
3569 * This should be performed only after all devices from the same
3570 * hive get initialized.
3571 *
3572 * However, the number of devices in the hive is not known in
3573 * advance; it is counted one by one as the devices initialize.
3574 *
3575 * So, we wait for all XGMI interlinked devices to be initialized.
3576 * This may bring some delays as those devices may come from
3577 * different hives. But that should be OK.
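 *
 * Concretely, the p-state downgrade below only runs once mgpu_info.num_dgpu
 * matches adev->gmc.xgmi.num_physical_nodes, i.e. once as many dGPUs have
 * registered as there are nodes in this hive.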
3578 */ 3579 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3580 for (i = 0; i < mgpu_info.num_gpu; i++) { 3581 gpu_instance = &(mgpu_info.gpu_ins[i]); 3582 if (gpu_instance->adev->flags & AMD_IS_APU) 3583 continue; 3584 3585 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3586 AMDGPU_XGMI_PSTATE_MIN); 3587 if (r) { 3588 dev_err(adev->dev, 3589 "pstate setting failed (%d).\n", 3590 r); 3591 break; 3592 } 3593 } 3594 } 3595 3596 mutex_unlock(&mgpu_info.mutex); 3597 } 3598 3599 return 0; 3600 } 3601 3602 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block) 3603 { 3604 struct amdgpu_device *adev = ip_block->adev; 3605 int r; 3606 3607 if (!ip_block->version->funcs->hw_fini) { 3608 dev_err(adev->dev, "hw_fini of IP block <%s> not defined\n", 3609 ip_block->version->funcs->name); 3610 } else { 3611 r = ip_block->version->funcs->hw_fini(ip_block); 3612 /* XXX handle errors */ 3613 if (r) { 3614 dev_dbg(adev->dev, 3615 "hw_fini of IP block <%s> failed %d\n", 3616 ip_block->version->funcs->name, r); 3617 } 3618 } 3619 3620 ip_block->status.hw = false; 3621 } 3622 3623 /** 3624 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3625 * 3626 * @adev: amdgpu_device pointer 3627 * 3628 * For ASICs need to disable SMC first 3629 */ 3630 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3631 { 3632 int i; 3633 3634 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3635 return; 3636 3637 for (i = 0; i < adev->num_ip_blocks; i++) { 3638 if (!adev->ip_blocks[i].status.hw) 3639 continue; 3640 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3641 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3642 break; 3643 } 3644 } 3645 } 3646 3647 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3648 { 3649 int i, r; 3650 3651 for (i = 0; i < adev->num_ip_blocks; i++) { 3652 if (!adev->ip_blocks[i].version->funcs->early_fini) 3653 continue; 3654 3655 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]); 3656 if (r) { 3657 dev_dbg(adev->dev, 3658 "early_fini of IP block <%s> failed %d\n", 3659 adev->ip_blocks[i].version->funcs->name, r); 3660 } 3661 } 3662 3663 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3664 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3665 3666 amdgpu_amdkfd_suspend(adev, true); 3667 amdgpu_userq_suspend(adev); 3668 3669 /* Workaround for ASICs need to disable SMC first */ 3670 amdgpu_device_smu_fini_early(adev); 3671 3672 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3673 if (!adev->ip_blocks[i].status.hw) 3674 continue; 3675 3676 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3677 } 3678 3679 if (amdgpu_sriov_vf(adev)) { 3680 if (amdgpu_virt_release_full_gpu(adev, false)) 3681 dev_err(adev->dev, 3682 "failed to release exclusive mode on fini\n"); 3683 } 3684 3685 /* 3686 * Driver reload on the APU can fail due to firmware validation because 3687 * the PSP is always running, as it is shared across the whole SoC. 3688 * This same issue does not occur on dGPU because it has a mechanism 3689 * that checks whether the PSP is running. A solution for those issues 3690 * in the APU is to trigger a GPU reset, but this should be done during 3691 * the unload phase to avoid adding boot latency and screen flicker. 
3692 */ 3693 if ((adev->flags & AMD_IS_APU) && !adev->gmc.is_app_apu) { 3694 r = amdgpu_asic_reset(adev); 3695 if (r) 3696 dev_err(adev->dev, "asic reset on %s failed\n", __func__); 3697 } 3698 3699 return 0; 3700 } 3701 3702 /** 3703 * amdgpu_device_ip_fini - run fini for hardware IPs 3704 * 3705 * @adev: amdgpu_device pointer 3706 * 3707 * Main teardown pass for hardware IPs. The list of all the hardware 3708 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3709 * are run. hw_fini tears down the hardware associated with each IP 3710 * and sw_fini tears down any software state associated with each IP. 3711 * Returns 0 on success, negative error code on failure. 3712 */ 3713 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3714 { 3715 int i, r; 3716 3717 amdgpu_cper_fini(adev); 3718 3719 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3720 amdgpu_virt_release_ras_err_handler_data(adev); 3721 3722 if (adev->gmc.xgmi.num_physical_nodes > 1) 3723 amdgpu_xgmi_remove_device(adev); 3724 3725 amdgpu_amdkfd_device_fini_sw(adev); 3726 3727 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3728 if (!adev->ip_blocks[i].status.sw) 3729 continue; 3730 3731 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3732 amdgpu_ucode_free_bo(adev); 3733 amdgpu_free_static_csa(&adev->virt.csa_obj); 3734 amdgpu_device_wb_fini(adev); 3735 amdgpu_device_mem_scratch_fini(adev); 3736 amdgpu_ib_pool_fini(adev); 3737 amdgpu_seq64_fini(adev); 3738 amdgpu_doorbell_fini(adev); 3739 } 3740 if (adev->ip_blocks[i].version->funcs->sw_fini) { 3741 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]); 3742 /* XXX handle errors */ 3743 if (r) { 3744 dev_dbg(adev->dev, 3745 "sw_fini of IP block <%s> failed %d\n", 3746 adev->ip_blocks[i].version->funcs->name, 3747 r); 3748 } 3749 } 3750 adev->ip_blocks[i].status.sw = false; 3751 adev->ip_blocks[i].status.valid = false; 3752 } 3753 3754 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3755 if (!adev->ip_blocks[i].status.late_initialized) 3756 continue; 3757 if (adev->ip_blocks[i].version->funcs->late_fini) 3758 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]); 3759 adev->ip_blocks[i].status.late_initialized = false; 3760 } 3761 3762 amdgpu_ras_fini(adev); 3763 amdgpu_uid_fini(adev); 3764 3765 return 0; 3766 } 3767 3768 /** 3769 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3770 * 3771 * @work: work_struct. 3772 */ 3773 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3774 { 3775 struct amdgpu_device *adev = 3776 container_of(work, struct amdgpu_device, delayed_init_work.work); 3777 int r; 3778 3779 r = amdgpu_ib_ring_tests(adev); 3780 if (r) 3781 dev_err(adev->dev, "ib ring test failed (%d).\n", r); 3782 } 3783 3784 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3785 { 3786 struct amdgpu_device *adev = 3787 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3788 3789 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3790 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3791 3792 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true, 0)) 3793 adev->gfx.gfx_off_state = true; 3794 } 3795 3796 /** 3797 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3798 * 3799 * @adev: amdgpu_device pointer 3800 * 3801 * Main suspend function for hardware IPs. 
The list of all the hardware 3802 * IPs that make up the asic is walked, clockgating is disabled and the 3803 * suspend callbacks are run. suspend puts the hardware and software state 3804 * in each IP into a state suitable for suspend. 3805 * Returns 0 on success, negative error code on failure. 3806 */ 3807 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3808 { 3809 int i, r, rec; 3810 3811 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3812 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3813 3814 /* 3815 * Per PMFW team's suggestion, driver needs to handle gfxoff 3816 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3817 * scenario. Add the missing df cstate disablement here. 3818 */ 3819 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3820 dev_warn(adev->dev, "Failed to disallow df cstate"); 3821 3822 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3823 if (!adev->ip_blocks[i].status.valid) 3824 continue; 3825 3826 /* displays are handled separately */ 3827 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3828 continue; 3829 3830 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3831 if (r) 3832 goto unwind; 3833 } 3834 3835 return 0; 3836 unwind: 3837 rec = amdgpu_device_ip_resume_phase3(adev); 3838 if (rec) 3839 dev_err(adev->dev, 3840 "amdgpu_device_ip_resume_phase3 failed during unwind: %d\n", 3841 rec); 3842 3843 amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW); 3844 3845 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3846 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3847 3848 return r; 3849 } 3850 3851 /** 3852 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3853 * 3854 * @adev: amdgpu_device pointer 3855 * 3856 * Main suspend function for hardware IPs. The list of all the hardware 3857 * IPs that make up the asic is walked, clockgating is disabled and the 3858 * suspend callbacks are run. suspend puts the hardware and software state 3859 * in each IP into a state suitable for suspend. 3860 * Returns 0 on success, negative error code on failure. 3861 */ 3862 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3863 { 3864 int i, r, rec; 3865 3866 if (adev->in_s0ix) 3867 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3868 3869 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3870 if (!adev->ip_blocks[i].status.valid) 3871 continue; 3872 /* displays are handled in phase1 */ 3873 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3874 continue; 3875 /* PSP lost connection when err_event_athub occurs */ 3876 if (amdgpu_ras_intr_triggered() && 3877 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3878 adev->ip_blocks[i].status.hw = false; 3879 continue; 3880 } 3881 3882 /* skip unnecessary suspend if we do not initialize them yet */ 3883 if (!amdgpu_ip_member_of_hwini( 3884 adev, adev->ip_blocks[i].version->type)) 3885 continue; 3886 3887 /* Since we skip suspend for S0i3, we need to cancel the delayed 3888 * idle work here as the suspend callback never gets called. 3889 */ 3890 if (adev->in_s0ix && 3891 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX && 3892 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 0, 0)) 3893 cancel_delayed_work_sync(&adev->gfx.idle_work); 3894 /* skip suspend of gfx/mes and psp for S0ix 3895 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3896 * like at runtime. PSP is also part of the always on hardware 3897 * so no need to suspend it. 
3898 */ 3899 if (adev->in_s0ix && 3900 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3901 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3902 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3903 continue; 3904 3905 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3906 if (adev->in_s0ix && 3907 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3908 IP_VERSION(5, 0, 0)) && 3909 (adev->ip_blocks[i].version->type == 3910 AMD_IP_BLOCK_TYPE_SDMA)) 3911 continue; 3912 3913 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3914 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3915 * from this location and RLC Autoload automatically also gets loaded 3916 * from here based on PMFW -> PSP message during re-init sequence. 3917 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3918 * the TMR and reload FWs again for IMU enabled APU ASICs. 3919 */ 3920 if (amdgpu_in_reset(adev) && 3921 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3922 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3923 continue; 3924 3925 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3926 if (r) 3927 goto unwind; 3928 3929 /* handle putting the SMC in the appropriate state */ 3930 if (!amdgpu_sriov_vf(adev)) { 3931 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3932 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3933 if (r) { 3934 dev_err(adev->dev, 3935 "SMC failed to set mp1 state %d, %d\n", 3936 adev->mp1_state, r); 3937 goto unwind; 3938 } 3939 } 3940 } 3941 } 3942 3943 return 0; 3944 unwind: 3945 /* suspend phase 2 = resume phase 1 + resume phase 2 */ 3946 rec = amdgpu_device_ip_resume_phase1(adev); 3947 if (rec) { 3948 dev_err(adev->dev, 3949 "amdgpu_device_ip_resume_phase1 failed during unwind: %d\n", 3950 rec); 3951 return r; 3952 } 3953 3954 rec = amdgpu_device_fw_loading(adev); 3955 if (rec) { 3956 dev_err(adev->dev, 3957 "amdgpu_device_fw_loading failed during unwind: %d\n", 3958 rec); 3959 return r; 3960 } 3961 3962 rec = amdgpu_device_ip_resume_phase2(adev); 3963 if (rec) { 3964 dev_err(adev->dev, 3965 "amdgpu_device_ip_resume_phase2 failed during unwind: %d\n", 3966 rec); 3967 return r; 3968 } 3969 3970 return r; 3971 } 3972 3973 /** 3974 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3975 * 3976 * @adev: amdgpu_device pointer 3977 * 3978 * Main suspend function for hardware IPs. The list of all the hardware 3979 * IPs that make up the asic is walked, clockgating is disabled and the 3980 * suspend callbacks are run. suspend puts the hardware and software state 3981 * in each IP into a state suitable for suspend. 3982 * Returns 0 on success, negative error code on failure. 
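* This wrapper disables the TTM buffer functions, then runs suspend phase 1
* (display IPs) followed by phase 2 (all other IPs); on SR-IOV VFs full GPU
* access is requested and released around the sequence.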
3983 */ 3984 static int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3985 { 3986 int r; 3987 3988 if (amdgpu_sriov_vf(adev)) { 3989 amdgpu_virt_fini_data_exchange(adev); 3990 amdgpu_virt_request_full_gpu(adev, false); 3991 } 3992 3993 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3994 3995 r = amdgpu_device_ip_suspend_phase1(adev); 3996 if (r) 3997 return r; 3998 r = amdgpu_device_ip_suspend_phase2(adev); 3999 4000 if (amdgpu_sriov_vf(adev)) 4001 amdgpu_virt_release_full_gpu(adev, false); 4002 4003 return r; 4004 } 4005 4006 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 4007 { 4008 int i, r; 4009 4010 static enum amd_ip_block_type ip_order[] = { 4011 AMD_IP_BLOCK_TYPE_COMMON, 4012 AMD_IP_BLOCK_TYPE_GMC, 4013 AMD_IP_BLOCK_TYPE_PSP, 4014 AMD_IP_BLOCK_TYPE_IH, 4015 }; 4016 4017 for (i = 0; i < adev->num_ip_blocks; i++) { 4018 int j; 4019 struct amdgpu_ip_block *block; 4020 4021 block = &adev->ip_blocks[i]; 4022 block->status.hw = false; 4023 4024 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 4025 4026 if (block->version->type != ip_order[j] || 4027 !block->status.valid) 4028 continue; 4029 4030 r = block->version->funcs->hw_init(&adev->ip_blocks[i]); 4031 if (r) { 4032 dev_err(adev->dev, "RE-INIT-early: %s failed\n", 4033 block->version->funcs->name); 4034 return r; 4035 } 4036 block->status.hw = true; 4037 } 4038 } 4039 4040 return 0; 4041 } 4042 4043 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 4044 { 4045 struct amdgpu_ip_block *block; 4046 int i, r = 0; 4047 4048 static enum amd_ip_block_type ip_order[] = { 4049 AMD_IP_BLOCK_TYPE_SMC, 4050 AMD_IP_BLOCK_TYPE_DCE, 4051 AMD_IP_BLOCK_TYPE_GFX, 4052 AMD_IP_BLOCK_TYPE_SDMA, 4053 AMD_IP_BLOCK_TYPE_MES, 4054 AMD_IP_BLOCK_TYPE_UVD, 4055 AMD_IP_BLOCK_TYPE_VCE, 4056 AMD_IP_BLOCK_TYPE_VCN, 4057 AMD_IP_BLOCK_TYPE_JPEG 4058 }; 4059 4060 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 4061 block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]); 4062 4063 if (!block) 4064 continue; 4065 4066 if (block->status.valid && !block->status.hw) { 4067 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) { 4068 r = amdgpu_ip_block_resume(block); 4069 } else { 4070 r = block->version->funcs->hw_init(block); 4071 } 4072 4073 if (r) { 4074 dev_err(adev->dev, "RE-INIT-late: %s failed\n", 4075 block->version->funcs->name); 4076 break; 4077 } 4078 block->status.hw = true; 4079 } 4080 } 4081 4082 return r; 4083 } 4084 4085 /** 4086 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 4087 * 4088 * @adev: amdgpu_device pointer 4089 * 4090 * First resume function for hardware IPs. The list of all the hardware 4091 * IPs that make up the asic is walked and the resume callbacks are run for 4092 * COMMON, GMC, and IH. resume puts the hardware into a functional state 4093 * after a suspend and updates the software state as necessary. This 4094 * function is also used for restoring the GPU after a GPU reset. 4095 * Returns 0 on success, negative error code on failure. 
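* On SR-IOV VFs the PSP block is also resumed in this phase.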
4096 */ 4097 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 4098 { 4099 int i, r; 4100 4101 for (i = 0; i < adev->num_ip_blocks; i++) { 4102 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 4103 continue; 4104 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 4105 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 4106 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 4107 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 4108 4109 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 4110 if (r) 4111 return r; 4112 } 4113 } 4114 4115 return 0; 4116 } 4117 4118 /** 4119 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 4120 * 4121 * @adev: amdgpu_device pointer 4122 * 4123 * Second resume function for hardware IPs. The list of all the hardware 4124 * IPs that make up the asic is walked and the resume callbacks are run for 4125 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 4126 * functional state after a suspend and updates the software state as 4127 * necessary. This function is also used for restoring the GPU after a GPU 4128 * reset. 4129 * Returns 0 on success, negative error code on failure. 4130 */ 4131 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 4132 { 4133 int i, r; 4134 4135 for (i = 0; i < adev->num_ip_blocks; i++) { 4136 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 4137 continue; 4138 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 4139 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 4140 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 4141 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE || 4142 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 4143 continue; 4144 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 4145 if (r) 4146 return r; 4147 } 4148 4149 return 0; 4150 } 4151 4152 /** 4153 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs 4154 * 4155 * @adev: amdgpu_device pointer 4156 * 4157 * Third resume function for hardware IPs. The list of all the hardware 4158 * IPs that make up the asic is walked and the resume callbacks are run for 4159 * all DCE. resume puts the hardware into a functional state after a suspend 4160 * and updates the software state as necessary. This function is also used 4161 * for restoring the GPU after a GPU reset. 4162 * 4163 * Returns 0 on success, negative error code on failure. 4164 */ 4165 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev) 4166 { 4167 int i, r; 4168 4169 for (i = 0; i < adev->num_ip_blocks; i++) { 4170 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 4171 continue; 4172 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) { 4173 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 4174 if (r) 4175 return r; 4176 } 4177 } 4178 4179 return 0; 4180 } 4181 4182 /** 4183 * amdgpu_device_ip_resume - run resume for hardware IPs 4184 * 4185 * @adev: amdgpu_device pointer 4186 * 4187 * Main resume function for hardware IPs. The hardware IPs 4188 * are split into two resume functions because they are 4189 * also used in recovering from a GPU reset and some additional 4190 * steps need to be take between them. In this case (S3/S4) they are 4191 * run sequentially. 4192 * Returns 0 on success, negative error code on failure. 
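* The order here is: resume phase 1 (COMMON/GMC/IH), firmware loading,
* resume phase 2, fence driver hw init, and finally resume phase 3 (DCE).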
4193 */
4194 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
4195 {
4196 int r;
4197
4198 r = amdgpu_device_ip_resume_phase1(adev);
4199 if (r)
4200 return r;
4201
4202 r = amdgpu_device_fw_loading(adev);
4203 if (r)
4204 return r;
4205
4206 r = amdgpu_device_ip_resume_phase2(adev);
4207
4208 if (adev->mman.buffer_funcs_ring->sched.ready)
4209 amdgpu_ttm_set_buffer_funcs_status(adev, true);
4210
4211 if (r)
4212 return r;
4213
4214 amdgpu_fence_driver_hw_init(adev);
4215
4216 r = amdgpu_device_ip_resume_phase3(adev);
4217
4218 return r;
4219 }
4220
4221 /**
4222 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
4223 *
4224 * @adev: amdgpu_device pointer
4225 *
4226 * Query the VBIOS data tables to determine if the board supports SR-IOV.
4227 */
4228 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
4229 {
4230 if (amdgpu_sriov_vf(adev)) {
4231 if (adev->is_atom_fw) {
4232 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
4233 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
4234 } else {
4235 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
4236 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
4237 }
4238
4239 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
4240 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
4241 }
4242 }
4243
4244 /**
4245 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
4246 *
4247 * @pdev : pci device context
4248 * @asic_type: AMD asic type
4249 *
4250 * Check if there is DC (new modesetting infrastructure) support for an asic.
4251 * Returns true if DC has support, false if not.
4252 */
4253 bool amdgpu_device_asic_has_dc_support(struct pci_dev *pdev,
4254 enum amd_asic_type asic_type)
4255 {
4256 switch (asic_type) {
4257 #ifdef CONFIG_DRM_AMDGPU_SI
4258 case CHIP_HAINAN:
4259 #endif
4260 case CHIP_TOPAZ:
4261 /* chips with no display hardware */
4262 return false;
4263 #if defined(CONFIG_DRM_AMD_DC)
4264 case CHIP_TAHITI:
4265 case CHIP_PITCAIRN:
4266 case CHIP_VERDE:
4267 case CHIP_OLAND:
4268 return amdgpu_dc != 0 && IS_ENABLED(CONFIG_DRM_AMD_DC_SI);
4269 case CHIP_KAVERI:
4270 case CHIP_KABINI:
4271 case CHIP_MULLINS:
4272 /*
4273 * We have systems in the wild with these ASICs that require
4274 * TRAVIS and NUTMEG support which is not supported with DC.
4275 *
4276 * Fall back to the non-DC driver here by default so as not to
4277 * cause regressions.
4278 */ 4279 return amdgpu_dc > 0; 4280 default: 4281 return amdgpu_dc != 0; 4282 #else 4283 default: 4284 if (amdgpu_dc > 0) 4285 dev_info_once( 4286 &pdev->dev, 4287 "Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 4288 return false; 4289 #endif 4290 } 4291 } 4292 4293 /** 4294 * amdgpu_device_has_dc_support - check if dc is supported 4295 * 4296 * @adev: amdgpu_device pointer 4297 * 4298 * Returns true for supported, false for not supported 4299 */ 4300 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 4301 { 4302 if (adev->enable_virtual_display || 4303 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 4304 return false; 4305 4306 return amdgpu_device_asic_has_dc_support(adev->pdev, adev->asic_type); 4307 } 4308 4309 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 4310 { 4311 struct amdgpu_device *adev = 4312 container_of(__work, struct amdgpu_device, xgmi_reset_work); 4313 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 4314 4315 /* It's a bug to not have a hive within this function */ 4316 if (WARN_ON(!hive)) 4317 return; 4318 4319 /* 4320 * Use task barrier to synchronize all xgmi reset works across the 4321 * hive. task_barrier_enter and task_barrier_exit will block 4322 * until all the threads running the xgmi reset works reach 4323 * those points. task_barrier_full will do both blocks. 4324 */ 4325 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 4326 4327 task_barrier_enter(&hive->tb); 4328 adev->asic_reset_res = amdgpu_device_baco_enter(adev); 4329 4330 if (adev->asic_reset_res) 4331 goto fail; 4332 4333 task_barrier_exit(&hive->tb); 4334 adev->asic_reset_res = amdgpu_device_baco_exit(adev); 4335 4336 if (adev->asic_reset_res) 4337 goto fail; 4338 4339 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 4340 } else { 4341 4342 task_barrier_full(&hive->tb); 4343 adev->asic_reset_res = amdgpu_asic_reset(adev); 4344 } 4345 4346 fail: 4347 if (adev->asic_reset_res) 4348 dev_warn(adev->dev, 4349 "ASIC reset failed with error, %d for drm dev, %s", 4350 adev->asic_reset_res, adev_to_drm(adev)->unique); 4351 amdgpu_put_xgmi_hive(hive); 4352 } 4353 4354 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 4355 { 4356 char *input = amdgpu_lockup_timeout; 4357 char *timeout_setting = NULL; 4358 int index = 0; 4359 long timeout; 4360 int ret = 0; 4361 4362 /* By default timeout for all queues is 2 sec */ 4363 adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout = 4364 adev->video_timeout = msecs_to_jiffies(2000); 4365 4366 if (!strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) 4367 return 0; 4368 4369 while ((timeout_setting = strsep(&input, ",")) && 4370 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4371 ret = kstrtol(timeout_setting, 0, &timeout); 4372 if (ret) 4373 return ret; 4374 4375 if (timeout == 0) { 4376 index++; 4377 continue; 4378 } else if (timeout < 0) { 4379 timeout = MAX_SCHEDULE_TIMEOUT; 4380 dev_warn(adev->dev, "lockup timeout disabled"); 4381 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 4382 } else { 4383 timeout = msecs_to_jiffies(timeout); 4384 } 4385 4386 switch (index++) { 4387 case 0: 4388 adev->gfx_timeout = timeout; 4389 break; 4390 case 1: 4391 adev->compute_timeout = timeout; 4392 break; 4393 case 2: 4394 adev->sdma_timeout = timeout; 4395 break; 4396 case 3: 4397 adev->video_timeout = timeout; 4398 break; 4399 default: 4400 break; 4401 } 4402 } 4403 4404 /* When only one value specified apply it 
to all queues. */ 4405 if (index == 1) 4406 adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout = 4407 adev->video_timeout = timeout; 4408 4409 return ret; 4410 } 4411 4412 /** 4413 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 4414 * 4415 * @adev: amdgpu_device pointer 4416 * 4417 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 4418 */ 4419 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 4420 { 4421 struct iommu_domain *domain; 4422 4423 domain = iommu_get_domain_for_dev(adev->dev); 4424 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 4425 adev->ram_is_direct_mapped = true; 4426 } 4427 4428 #if defined(CONFIG_HSA_AMD_P2P) 4429 /** 4430 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled. 4431 * 4432 * @adev: amdgpu_device pointer 4433 * 4434 * return if IOMMU remapping bar address 4435 */ 4436 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev) 4437 { 4438 struct iommu_domain *domain; 4439 4440 domain = iommu_get_domain_for_dev(adev->dev); 4441 if (domain && (domain->type == IOMMU_DOMAIN_DMA || 4442 domain->type == IOMMU_DOMAIN_DMA_FQ)) 4443 return true; 4444 4445 return false; 4446 } 4447 #endif 4448 4449 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 4450 { 4451 if (amdgpu_mcbp == 1) 4452 adev->gfx.mcbp = true; 4453 else if (amdgpu_mcbp == 0) 4454 adev->gfx.mcbp = false; 4455 4456 if (amdgpu_sriov_vf(adev)) 4457 adev->gfx.mcbp = true; 4458 4459 if (adev->gfx.mcbp) 4460 dev_info(adev->dev, "MCBP is enabled\n"); 4461 } 4462 4463 static int amdgpu_device_sys_interface_init(struct amdgpu_device *adev) 4464 { 4465 int r; 4466 4467 r = amdgpu_atombios_sysfs_init(adev); 4468 if (r) 4469 drm_err(&adev->ddev, 4470 "registering atombios sysfs failed (%d).\n", r); 4471 4472 r = amdgpu_pm_sysfs_init(adev); 4473 if (r) 4474 dev_err(adev->dev, "registering pm sysfs failed (%d).\n", r); 4475 4476 r = amdgpu_ucode_sysfs_init(adev); 4477 if (r) { 4478 adev->ucode_sysfs_en = false; 4479 dev_err(adev->dev, "Creating firmware sysfs failed (%d).\n", r); 4480 } else 4481 adev->ucode_sysfs_en = true; 4482 4483 r = amdgpu_device_attr_sysfs_init(adev); 4484 if (r) 4485 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4486 4487 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4488 if (r) 4489 dev_err(adev->dev, 4490 "Could not create amdgpu board attributes\n"); 4491 4492 amdgpu_fru_sysfs_init(adev); 4493 amdgpu_reg_state_sysfs_init(adev); 4494 amdgpu_xcp_sysfs_init(adev); 4495 4496 return r; 4497 } 4498 4499 static void amdgpu_device_sys_interface_fini(struct amdgpu_device *adev) 4500 { 4501 if (adev->pm.sysfs_initialized) 4502 amdgpu_pm_sysfs_fini(adev); 4503 if (adev->ucode_sysfs_en) 4504 amdgpu_ucode_sysfs_fini(adev); 4505 amdgpu_device_attr_sysfs_fini(adev); 4506 amdgpu_fru_sysfs_fini(adev); 4507 4508 amdgpu_reg_state_sysfs_fini(adev); 4509 amdgpu_xcp_sysfs_fini(adev); 4510 } 4511 4512 /** 4513 * amdgpu_device_init - initialize the driver 4514 * 4515 * @adev: amdgpu_device pointer 4516 * @flags: driver flags 4517 * 4518 * Initializes the driver info and hw (all asics). 4519 * Returns 0 for success or an error on failure. 4520 * Called at driver startup. 
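* This sets up the register access callbacks, locks and work items, maps the
* MMIO registers, optionally resets or posts the ASIC, and then runs the IP
* block init sequence.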
4521 */ 4522 int amdgpu_device_init(struct amdgpu_device *adev, 4523 uint32_t flags) 4524 { 4525 struct pci_dev *pdev = adev->pdev; 4526 int r, i; 4527 bool px = false; 4528 u32 max_MBps; 4529 int tmp; 4530 4531 adev->shutdown = false; 4532 adev->flags = flags; 4533 4534 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 4535 adev->asic_type = amdgpu_force_asic_type; 4536 else 4537 adev->asic_type = flags & AMD_ASIC_MASK; 4538 4539 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 4540 if (amdgpu_emu_mode == 1) 4541 adev->usec_timeout *= 10; 4542 adev->gmc.gart_size = 512 * 1024 * 1024; 4543 adev->accel_working = false; 4544 adev->num_rings = 0; 4545 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 4546 adev->mman.buffer_funcs = NULL; 4547 adev->mman.buffer_funcs_ring = NULL; 4548 adev->vm_manager.vm_pte_funcs = NULL; 4549 adev->vm_manager.vm_pte_num_scheds = 0; 4550 adev->gmc.gmc_funcs = NULL; 4551 adev->harvest_ip_mask = 0x0; 4552 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 4553 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 4554 4555 adev->smc_rreg = &amdgpu_invalid_rreg; 4556 adev->smc_wreg = &amdgpu_invalid_wreg; 4557 adev->pcie_rreg = &amdgpu_invalid_rreg; 4558 adev->pcie_wreg = &amdgpu_invalid_wreg; 4559 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 4560 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 4561 adev->pciep_rreg = &amdgpu_invalid_rreg; 4562 adev->pciep_wreg = &amdgpu_invalid_wreg; 4563 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 4564 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 4565 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 4566 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 4567 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 4568 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 4569 adev->didt_rreg = &amdgpu_invalid_rreg; 4570 adev->didt_wreg = &amdgpu_invalid_wreg; 4571 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 4572 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 4573 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 4574 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 4575 4576 dev_info( 4577 adev->dev, 4578 "initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 4579 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 4580 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 4581 4582 /* mutex initialization are all done here so we 4583 * can recall function without having locking issues 4584 */ 4585 mutex_init(&adev->firmware.mutex); 4586 mutex_init(&adev->pm.mutex); 4587 mutex_init(&adev->gfx.gpu_clock_mutex); 4588 mutex_init(&adev->srbm_mutex); 4589 mutex_init(&adev->gfx.pipe_reserve_mutex); 4590 mutex_init(&adev->gfx.gfx_off_mutex); 4591 mutex_init(&adev->gfx.partition_mutex); 4592 mutex_init(&adev->grbm_idx_mutex); 4593 mutex_init(&adev->mn_lock); 4594 mutex_init(&adev->virt.vf_errors.lock); 4595 hash_init(adev->mn_hash); 4596 mutex_init(&adev->psp.mutex); 4597 mutex_init(&adev->notifier_lock); 4598 mutex_init(&adev->pm.stable_pstate_ctx_lock); 4599 mutex_init(&adev->benchmark_mutex); 4600 mutex_init(&adev->gfx.reset_sem_mutex); 4601 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */ 4602 mutex_init(&adev->enforce_isolation_mutex); 4603 for (i = 0; i < MAX_XCP; ++i) { 4604 adev->isolation[i].spearhead = dma_fence_get_stub(); 4605 amdgpu_sync_create(&adev->isolation[i].active); 4606 amdgpu_sync_create(&adev->isolation[i].prev); 4607 } 4608 mutex_init(&adev->gfx.userq_sch_mutex); 4609 
mutex_init(&adev->gfx.workload_profile_mutex);
4610 mutex_init(&adev->vcn.workload_profile_mutex);
4611
4612 amdgpu_device_init_apu_flags(adev);
4613
4614 r = amdgpu_device_check_arguments(adev);
4615 if (r)
4616 return r;
4617
4618 spin_lock_init(&adev->mmio_idx_lock);
4619 spin_lock_init(&adev->smc_idx_lock);
4620 spin_lock_init(&adev->pcie_idx_lock);
4621 spin_lock_init(&adev->uvd_ctx_idx_lock);
4622 spin_lock_init(&adev->didt_idx_lock);
4623 spin_lock_init(&adev->gc_cac_idx_lock);
4624 spin_lock_init(&adev->se_cac_idx_lock);
4625 spin_lock_init(&adev->audio_endpt_idx_lock);
4626 spin_lock_init(&adev->mm_stats.lock);
4627 spin_lock_init(&adev->virt.rlcg_reg_lock);
4628 spin_lock_init(&adev->wb.lock);
4629
4630 xa_init_flags(&adev->userq_xa, XA_FLAGS_LOCK_IRQ);
4631
4632 INIT_LIST_HEAD(&adev->reset_list);
4633
4634 INIT_LIST_HEAD(&adev->ras_list);
4635
4636 INIT_LIST_HEAD(&adev->pm.od_kobj_list);
4637
4638 xa_init(&adev->userq_doorbell_xa);
4639
4640 INIT_DELAYED_WORK(&adev->delayed_init_work,
4641 amdgpu_device_delayed_init_work_handler);
4642 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
4643 amdgpu_device_delay_enable_gfx_off);
4644 /*
4645 * Initialize the enforce_isolation work structures for each XCP
4646 * partition. This work handler is responsible for enforcing shader
4647 * isolation on AMD GPUs. It counts the number of emitted fences for
4648 * each GFX and compute ring. If there are any fences, it schedules
4649 * the `enforce_isolation_work` to be run after a delay. If there are
4650 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the
4651 * runqueue.
4652 */
4653 for (i = 0; i < MAX_XCP; i++) {
4654 INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work,
4655 amdgpu_gfx_enforce_isolation_handler);
4656 adev->gfx.enforce_isolation[i].adev = adev;
4657 adev->gfx.enforce_isolation[i].xcp_id = i;
4658 }
4659
4660 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
4661 INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work);
4662
4663 adev->gfx.gfx_off_req_count = 1;
4664 adev->gfx.gfx_off_residency = 0;
4665 adev->gfx.gfx_off_entrycount = 0;
4666 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
4667
4668 atomic_set(&adev->throttling_logging_enabled, 1);
4669 /*
4670 * If throttling continues, logging will be performed every minute
4671 * to avoid log flooding. "-1" is subtracted since the thermal
4672 * throttling interrupt comes every second. Thus, the total logging
4673 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
4674 * for throttling interrupt) = 60 seconds.
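* The ratelimit below is therefore configured with an interval of
* (60 - 1) * HZ and a burst of 1.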
4675 */
4676 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
4677
4678 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
4679
4680 /* Registers mapping */
4681 /* TODO: block userspace mapping of io register */
4682 if (adev->asic_type >= CHIP_BONAIRE) {
4683 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
4684 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
4685 } else {
4686 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
4687 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
4688 }
4689
4690 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
4691 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
4692
4693 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
4694 if (!adev->rmmio)
4695 return -ENOMEM;
4696
4697 dev_info(adev->dev, "register mmio base: 0x%08X\n",
4698 (uint32_t)adev->rmmio_base);
4699 dev_info(adev->dev, "register mmio size: %u\n",
4700 (unsigned int)adev->rmmio_size);
4701
4702 /*
4703 * Reset domain needs to be present early, before XGMI hive discovered
4704 * (if any) and initialized to use reset sem and in_gpu reset flag
4705 * early on during init and before calling to RREG32.
4706 */
4707 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
4708 if (!adev->reset_domain)
4709 return -ENOMEM;
4710
4711 /* detect hw virtualization here */
4712 amdgpu_virt_init(adev);
4713
4714 amdgpu_device_get_pcie_info(adev);
4715
4716 r = amdgpu_device_get_job_timeout_settings(adev);
4717 if (r) {
4718 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
4719 return r;
4720 }
4721
4722 amdgpu_device_set_mcbp(adev);
4723
4724 /*
4725 * By default, use default mode where all blocks are expected to be
4726 * initialized. At present a 'swinit' of blocks is required to be
4727 * completed before the need for a different level is detected.
4728 */
4729 amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT);
4730 /* early init functions */
4731 r = amdgpu_device_ip_early_init(adev);
4732 if (r)
4733 return r;
4734
4735 /*
4736 * No need to remove conflicting FBs for non-display class devices.
4737 * This prevents the sysfb from being freed accidentally.
4738 */
4739 if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA ||
4740 (pdev->class >> 8) == PCI_CLASS_DISPLAY_OTHER) {
4741 /* Get rid of things like offb */
4742 r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name);
4743 if (r)
4744 return r;
4745 }
4746
4747 /* Enable TMZ based on IP_VERSION */
4748 amdgpu_gmc_tmz_set(adev);
4749
4750 if (amdgpu_sriov_vf(adev) &&
4751 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
4752 /* VF MMIO access (except mailbox range) from CPU
4753 * will be blocked during sriov runtime
4754 */
4755 adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;
4756
4757 amdgpu_gmc_noretry_set(adev);
4758 /* Need to get xgmi info early to decide the reset behavior */
4759 if (adev->gmc.xgmi.supported) {
4760 r = adev->gfxhub.funcs->get_xgmi_info(adev);
4761 if (r)
4762 return r;
4763 }
4764
4765 /* enable PCIE atomic ops */
4766 if (amdgpu_sriov_vf(adev)) {
4767 if (adev->virt.fw_reserve.p_pf2vf)
4768 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
4769 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
4770 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
4771 /* APUs with gfx9 and newer don't rely on PCIe atomics; atomics are
4772 * supported natively over an internal path, so set have_atomics_support to true.
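* For other bare-metal devices the capability is instead negotiated with
* the PCIe root port via pci_enable_atomic_ops_to_root() below.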
4773 */ 4774 } else if ((adev->flags & AMD_IS_APU) && 4775 (amdgpu_ip_version(adev, GC_HWIP, 0) > 4776 IP_VERSION(9, 0, 0))) { 4777 adev->have_atomics_support = true; 4778 } else { 4779 adev->have_atomics_support = 4780 !pci_enable_atomic_ops_to_root(adev->pdev, 4781 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 4782 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4783 } 4784 4785 if (!adev->have_atomics_support) 4786 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 4787 4788 /* doorbell bar mapping and doorbell index init*/ 4789 amdgpu_doorbell_init(adev); 4790 4791 if (amdgpu_emu_mode == 1) { 4792 /* post the asic on emulation mode */ 4793 emu_soc_asic_init(adev); 4794 goto fence_driver_init; 4795 } 4796 4797 amdgpu_reset_init(adev); 4798 4799 /* detect if we are with an SRIOV vbios */ 4800 if (adev->bios) 4801 amdgpu_device_detect_sriov_bios(adev); 4802 4803 /* check if we need to reset the asic 4804 * E.g., driver was not cleanly unloaded previously, etc. 4805 */ 4806 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4807 if (adev->gmc.xgmi.num_physical_nodes) { 4808 dev_info(adev->dev, "Pending hive reset.\n"); 4809 amdgpu_set_init_level(adev, 4810 AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 4811 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && 4812 !amdgpu_device_has_display_hardware(adev)) { 4813 r = psp_gpu_reset(adev); 4814 } else { 4815 tmp = amdgpu_reset_method; 4816 /* It should do a default reset when loading or reloading the driver, 4817 * regardless of the module parameter reset_method. 4818 */ 4819 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4820 r = amdgpu_asic_reset(adev); 4821 amdgpu_reset_method = tmp; 4822 } 4823 4824 if (r) { 4825 dev_err(adev->dev, "asic reset on init failed\n"); 4826 goto failed; 4827 } 4828 } 4829 4830 /* Post card if necessary */ 4831 if (amdgpu_device_need_post(adev)) { 4832 if (!adev->bios) { 4833 dev_err(adev->dev, "no vBIOS found\n"); 4834 r = -EINVAL; 4835 goto failed; 4836 } 4837 dev_info(adev->dev, "GPU posting now...\n"); 4838 r = amdgpu_device_asic_init(adev); 4839 if (r) { 4840 dev_err(adev->dev, "gpu post error!\n"); 4841 goto failed; 4842 } 4843 } 4844 4845 if (adev->bios) { 4846 if (adev->is_atom_fw) { 4847 /* Initialize clocks */ 4848 r = amdgpu_atomfirmware_get_clock_info(adev); 4849 if (r) { 4850 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4851 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4852 goto failed; 4853 } 4854 } else { 4855 /* Initialize clocks */ 4856 r = amdgpu_atombios_get_clock_info(adev); 4857 if (r) { 4858 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4859 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4860 goto failed; 4861 } 4862 /* init i2c buses */ 4863 amdgpu_i2c_init(adev); 4864 } 4865 } 4866 4867 fence_driver_init: 4868 /* Fence driver */ 4869 r = amdgpu_fence_driver_sw_init(adev); 4870 if (r) { 4871 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4872 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4873 goto failed; 4874 } 4875 4876 /* init the mode config */ 4877 drm_mode_config_init(adev_to_drm(adev)); 4878 4879 r = amdgpu_device_ip_init(adev); 4880 if (r) { 4881 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4882 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4883 goto release_ras_con; 4884 } 4885 4886 amdgpu_fence_driver_hw_init(adev); 4887 4888 dev_info(adev->dev, 4889 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4890 
adev->gfx.config.max_shader_engines,
4891 adev->gfx.config.max_sh_per_se,
4892 adev->gfx.config.max_cu_per_sh,
4893 adev->gfx.cu_info.number);
4894
4895 adev->accel_working = true;
4896
4897 amdgpu_vm_check_compute_bug(adev);
4898
4899 /* Initialize the buffer migration limit. */
4900 if (amdgpu_moverate >= 0)
4901 max_MBps = amdgpu_moverate;
4902 else
4903 max_MBps = 8; /* Allow 8 MB/s. */
4904 /* Get a log2 for easy divisions. */
4905 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
4906
4907 /*
4908 * Register the gpu instance before amdgpu_device_enable_mgpu_fan_boost.
4909 * Otherwise the mgpu fan boost feature will be skipped because the
4910 * gpu instance count would be too low.
4911 */
4912 amdgpu_register_gpu_instance(adev);
4913
4914 /* enable clockgating, etc. after ib tests, etc. since some blocks require
4915 * explicit gating rather than handling it automatically.
4916 */
4917 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
4918 r = amdgpu_device_ip_late_init(adev);
4919 if (r) {
4920 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
4921 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
4922 goto release_ras_con;
4923 }
4924 /* must succeed. */
4925 amdgpu_ras_resume(adev);
4926 queue_delayed_work(system_wq, &adev->delayed_init_work,
4927 msecs_to_jiffies(AMDGPU_RESUME_MS));
4928 }
4929
4930 if (amdgpu_sriov_vf(adev)) {
4931 amdgpu_virt_release_full_gpu(adev, true);
4932 flush_delayed_work(&adev->delayed_init_work);
4933 }
4934
4935 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
4936 amdgpu_xgmi_reset_on_init(adev);
4937 /*
4938 * Register these sysfs interfaces after `late_init`, as some of the
4939 * operations performed in `late_init` might affect how the sysfs
4940 * interfaces are created.
4941 */
4942 r = amdgpu_device_sys_interface_init(adev);
4943
4944 if (IS_ENABLED(CONFIG_PERF_EVENTS))
4945 r = amdgpu_pmu_init(adev);
4946 if (r)
4947 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
4948
4949 /* Keep the stored PCI config space at hand for restore after a sudden PCI error */
4950 if (amdgpu_device_cache_pci_state(adev->pdev))
4951 pci_restore_state(pdev);
4952
4953 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
4954 /* this will fail for cards that aren't VGA class devices, just
4955 * ignore it
4956 */
4957 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4958 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
4959
4960 px = amdgpu_device_supports_px(adev);
4961
4962 if (px || (!dev_is_removable(&adev->pdev->dev) &&
4963 apple_gmux_detect(NULL, NULL)))
4964 vga_switcheroo_register_client(adev->pdev,
4965 &amdgpu_switcheroo_ops, px);
4966
4967 if (px)
4968 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
4969
4970 amdgpu_device_check_iommu_direct_map(adev);
4971
4972 adev->pm_nb.notifier_call = amdgpu_device_pm_notifier;
4973 r = register_pm_notifier(&adev->pm_nb);
4974 if (r)
4975 goto failed;
4976
4977 return 0;
4978
4979 release_ras_con:
4980 if (amdgpu_sriov_vf(adev))
4981 amdgpu_virt_release_full_gpu(adev, true);
4982
4983 /* failed in exclusive mode due to timeout */
4984 if (amdgpu_sriov_vf(adev) &&
4985 !amdgpu_sriov_runtime(adev) &&
4986 amdgpu_virt_mmio_blocked(adev) &&
4987 !amdgpu_virt_wait_reset(adev)) {
4988 dev_err(adev->dev, "VF exclusive mode timeout\n");
4989 /* Don't send request since VF is inactive.
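* Clear the runtime cap and virt ops below and return -EAGAIN so that the
* caller may retry initialization.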
*/ 4990 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4991 adev->virt.ops = NULL; 4992 r = -EAGAIN; 4993 } 4994 amdgpu_release_ras_context(adev); 4995 4996 failed: 4997 amdgpu_vf_error_trans_all(adev); 4998 4999 return r; 5000 } 5001 5002 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 5003 { 5004 5005 /* Clear all CPU mappings pointing to this device */ 5006 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 5007 5008 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 5009 amdgpu_doorbell_fini(adev); 5010 5011 iounmap(adev->rmmio); 5012 adev->rmmio = NULL; 5013 if (adev->mman.aper_base_kaddr) 5014 iounmap(adev->mman.aper_base_kaddr); 5015 adev->mman.aper_base_kaddr = NULL; 5016 5017 /* Memory manager related */ 5018 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 5019 arch_phys_wc_del(adev->gmc.vram_mtrr); 5020 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 5021 } 5022 } 5023 5024 /** 5025 * amdgpu_device_fini_hw - tear down the driver 5026 * 5027 * @adev: amdgpu_device pointer 5028 * 5029 * Tear down the driver info (all asics). 5030 * Called at driver shutdown. 5031 */ 5032 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 5033 { 5034 dev_info(adev->dev, "amdgpu: finishing device.\n"); 5035 flush_delayed_work(&adev->delayed_init_work); 5036 5037 if (adev->mman.initialized) 5038 drain_workqueue(adev->mman.bdev.wq); 5039 adev->shutdown = true; 5040 5041 unregister_pm_notifier(&adev->pm_nb); 5042 5043 /* make sure IB test finished before entering exclusive mode 5044 * to avoid preemption on IB test 5045 */ 5046 if (amdgpu_sriov_vf(adev)) { 5047 amdgpu_virt_request_full_gpu(adev, false); 5048 amdgpu_virt_fini_data_exchange(adev); 5049 } 5050 5051 /* disable all interrupts */ 5052 amdgpu_irq_disable_all(adev); 5053 if (adev->mode_info.mode_config_initialized) { 5054 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 5055 drm_helper_force_disable_all(adev_to_drm(adev)); 5056 else 5057 drm_atomic_helper_shutdown(adev_to_drm(adev)); 5058 } 5059 amdgpu_fence_driver_hw_fini(adev); 5060 5061 amdgpu_device_sys_interface_fini(adev); 5062 5063 /* disable ras feature must before hw fini */ 5064 amdgpu_ras_pre_fini(adev); 5065 5066 amdgpu_ttm_set_buffer_funcs_status(adev, false); 5067 5068 amdgpu_device_ip_fini_early(adev); 5069 5070 amdgpu_irq_fini_hw(adev); 5071 5072 if (adev->mman.initialized) 5073 ttm_device_clear_dma_mappings(&adev->mman.bdev); 5074 5075 amdgpu_gart_dummy_page_fini(adev); 5076 5077 if (drm_dev_is_unplugged(adev_to_drm(adev))) 5078 amdgpu_device_unmap_mmio(adev); 5079 5080 } 5081 5082 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 5083 { 5084 int i, idx; 5085 bool px; 5086 5087 amdgpu_device_ip_fini(adev); 5088 amdgpu_fence_driver_sw_fini(adev); 5089 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 5090 adev->accel_working = false; 5091 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 5092 for (i = 0; i < MAX_XCP; ++i) { 5093 dma_fence_put(adev->isolation[i].spearhead); 5094 amdgpu_sync_free(&adev->isolation[i].active); 5095 amdgpu_sync_free(&adev->isolation[i].prev); 5096 } 5097 5098 amdgpu_reset_fini(adev); 5099 5100 /* free i2c buses */ 5101 amdgpu_i2c_fini(adev); 5102 5103 if (adev->bios) { 5104 if (amdgpu_emu_mode != 1) 5105 amdgpu_atombios_fini(adev); 5106 amdgpu_bios_release(adev); 5107 } 5108 5109 kfree(adev->fru_info); 5110 adev->fru_info = NULL; 5111 5112 kfree(adev->xcp_mgr); 5113 adev->xcp_mgr = NULL; 5114 5115 px = amdgpu_device_supports_px(adev); 5116 5117 if 
(px || (!dev_is_removable(&adev->pdev->dev) && 5118 apple_gmux_detect(NULL, NULL))) 5119 vga_switcheroo_unregister_client(adev->pdev); 5120 5121 if (px) 5122 vga_switcheroo_fini_domain_pm_ops(adev->dev); 5123 5124 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 5125 vga_client_unregister(adev->pdev); 5126 5127 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 5128 5129 iounmap(adev->rmmio); 5130 adev->rmmio = NULL; 5131 drm_dev_exit(idx); 5132 } 5133 5134 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 5135 amdgpu_pmu_fini(adev); 5136 if (adev->discovery.bin) 5137 amdgpu_discovery_fini(adev); 5138 5139 amdgpu_reset_put_reset_domain(adev->reset_domain); 5140 adev->reset_domain = NULL; 5141 5142 kfree(adev->pci_state); 5143 kfree(adev->pcie_reset_ctx.swds_pcistate); 5144 kfree(adev->pcie_reset_ctx.swus_pcistate); 5145 } 5146 5147 /** 5148 * amdgpu_device_evict_resources - evict device resources 5149 * @adev: amdgpu device object 5150 * 5151 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 5152 * of the vram memory type. Mainly used for evicting device resources 5153 * at suspend time. 5154 * 5155 */ 5156 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 5157 { 5158 int ret; 5159 5160 /* No need to evict vram on APUs unless going to S4 */ 5161 if (!adev->in_s4 && (adev->flags & AMD_IS_APU)) 5162 return 0; 5163 5164 /* No need to evict when going to S5 through S4 callbacks */ 5165 if (system_state == SYSTEM_POWER_OFF) 5166 return 0; 5167 5168 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 5169 if (ret) { 5170 dev_warn(adev->dev, "evicting device resources failed\n"); 5171 return ret; 5172 } 5173 5174 if (adev->in_s4) { 5175 ret = ttm_device_prepare_hibernation(&adev->mman.bdev); 5176 if (ret) 5177 dev_err(adev->dev, "prepare hibernation failed, %d\n", ret); 5178 } 5179 return ret; 5180 } 5181 5182 /* 5183 * Suspend & resume. 5184 */ 5185 /** 5186 * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events 5187 * @nb: notifier block 5188 * @mode: suspend mode 5189 * @data: data 5190 * 5191 * This function is called when the system is about to suspend or hibernate. 5192 * It is used to set the appropriate flags so that eviction can be optimized 5193 * in the pm prepare callback. 5194 */ 5195 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 5196 void *data) 5197 { 5198 struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb); 5199 5200 switch (mode) { 5201 case PM_HIBERNATION_PREPARE: 5202 adev->in_s4 = true; 5203 break; 5204 case PM_POST_HIBERNATION: 5205 adev->in_s4 = false; 5206 break; 5207 } 5208 5209 return NOTIFY_DONE; 5210 } 5211 5212 /** 5213 * amdgpu_device_prepare - prepare for device suspend 5214 * 5215 * @dev: drm dev pointer 5216 * 5217 * Prepare to put the hw in the suspend state (all asics). 5218 * Returns 0 for success or an error on failure. 5219 * Called at driver suspend. 
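* Evicts most buffer objects first and then runs each IP block's
* prepare_suspend callback.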
5220 */ 5221 int amdgpu_device_prepare(struct drm_device *dev) 5222 { 5223 struct amdgpu_device *adev = drm_to_adev(dev); 5224 int i, r; 5225 5226 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5227 return 0; 5228 5229 /* Evict the majority of BOs before starting suspend sequence */ 5230 r = amdgpu_device_evict_resources(adev); 5231 if (r) 5232 return r; 5233 5234 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 5235 5236 for (i = 0; i < adev->num_ip_blocks; i++) { 5237 if (!adev->ip_blocks[i].status.valid) 5238 continue; 5239 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 5240 continue; 5241 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]); 5242 if (r) 5243 return r; 5244 } 5245 5246 return 0; 5247 } 5248 5249 /** 5250 * amdgpu_device_complete - complete power state transition 5251 * 5252 * @dev: drm dev pointer 5253 * 5254 * Undo the changes from amdgpu_device_prepare. This will be 5255 * called on all resume transitions, including those that failed. 5256 */ 5257 void amdgpu_device_complete(struct drm_device *dev) 5258 { 5259 struct amdgpu_device *adev = drm_to_adev(dev); 5260 int i; 5261 5262 for (i = 0; i < adev->num_ip_blocks; i++) { 5263 if (!adev->ip_blocks[i].status.valid) 5264 continue; 5265 if (!adev->ip_blocks[i].version->funcs->complete) 5266 continue; 5267 adev->ip_blocks[i].version->funcs->complete(&adev->ip_blocks[i]); 5268 } 5269 } 5270 5271 /** 5272 * amdgpu_device_suspend - initiate device suspend 5273 * 5274 * @dev: drm dev pointer 5275 * @notify_clients: notify in-kernel DRM clients 5276 * 5277 * Puts the hw in the suspend state (all asics). 5278 * Returns 0 for success or an error on failure. 5279 * Called at driver suspend. 5280 */ 5281 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) 5282 { 5283 struct amdgpu_device *adev = drm_to_adev(dev); 5284 int r, rec; 5285 5286 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5287 return 0; 5288 5289 adev->in_suspend = true; 5290 5291 if (amdgpu_sriov_vf(adev)) { 5292 if (!adev->in_runpm) 5293 amdgpu_amdkfd_suspend_process(adev); 5294 amdgpu_virt_fini_data_exchange(adev); 5295 r = amdgpu_virt_request_full_gpu(adev, false); 5296 if (r) 5297 return r; 5298 } 5299 5300 r = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D3); 5301 if (r) 5302 goto unwind_sriov; 5303 5304 if (notify_clients) 5305 drm_client_dev_suspend(adev_to_drm(adev)); 5306 5307 cancel_delayed_work_sync(&adev->delayed_init_work); 5308 5309 amdgpu_ras_suspend(adev); 5310 5311 r = amdgpu_device_ip_suspend_phase1(adev); 5312 if (r) 5313 goto unwind_smartshift; 5314 5315 amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); 5316 r = amdgpu_userq_suspend(adev); 5317 if (r) 5318 goto unwind_ip_phase1; 5319 5320 r = amdgpu_device_evict_resources(adev); 5321 if (r) 5322 goto unwind_userq; 5323 5324 amdgpu_ttm_set_buffer_funcs_status(adev, false); 5325 5326 amdgpu_fence_driver_hw_fini(adev); 5327 5328 r = amdgpu_device_ip_suspend_phase2(adev); 5329 if (r) 5330 goto unwind_evict; 5331 5332 if (amdgpu_sriov_vf(adev)) 5333 amdgpu_virt_release_full_gpu(adev, false); 5334 5335 return 0; 5336 5337 unwind_evict: 5338 if (adev->mman.buffer_funcs_ring->sched.ready) 5339 amdgpu_ttm_set_buffer_funcs_status(adev, true); 5340 amdgpu_fence_driver_hw_init(adev); 5341 5342 unwind_userq: 5343 rec = amdgpu_userq_resume(adev); 5344 if (rec) { 5345 dev_warn(adev->dev, "failed to re-initialize user queues: %d\n", rec); 5346 return r; 5347 } 5348 rec = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) 
&& !adev->in_runpm); 5349 if (rec) { 5350 dev_warn(adev->dev, "failed to re-initialize kfd: %d\n", rec); 5351 return r; 5352 } 5353 5354 unwind_ip_phase1: 5355 /* suspend phase 1 = resume phase 3 */ 5356 rec = amdgpu_device_ip_resume_phase3(adev); 5357 if (rec) { 5358 dev_warn(adev->dev, "failed to re-initialize IPs phase1: %d\n", rec); 5359 return r; 5360 } 5361 5362 unwind_smartshift: 5363 rec = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0); 5364 if (rec) { 5365 dev_warn(adev->dev, "failed to re-update smart shift: %d\n", rec); 5366 return r; 5367 } 5368 5369 if (notify_clients) 5370 drm_client_dev_resume(adev_to_drm(adev)); 5371 5372 amdgpu_ras_resume(adev); 5373 5374 unwind_sriov: 5375 if (amdgpu_sriov_vf(adev)) { 5376 rec = amdgpu_virt_request_full_gpu(adev, true); 5377 if (rec) { 5378 dev_warn(adev->dev, "failed to reinitialize sriov: %d\n", rec); 5379 return r; 5380 } 5381 } 5382 5383 adev->in_suspend = adev->in_s0ix = adev->in_s3 = false; 5384 5385 return r; 5386 } 5387 5388 static inline int amdgpu_virt_resume(struct amdgpu_device *adev) 5389 { 5390 int r; 5391 unsigned int prev_physical_node_id = adev->gmc.xgmi.physical_node_id; 5392 5393 /* During VM resume, QEMU programming of VF MSIX table (register GFXMSIX_VECT0_ADDR_LO) 5394 * may not work. The access could be blocked by nBIF protection as VF isn't in 5395 * exclusive access mode. Exclusive access is enabled now, disable/enable MSIX 5396 * so that QEMU reprograms MSIX table. 5397 */ 5398 amdgpu_restore_msix(adev); 5399 5400 r = adev->gfxhub.funcs->get_xgmi_info(adev); 5401 if (r) 5402 return r; 5403 5404 dev_info(adev->dev, "xgmi node, old id %d, new id %d\n", 5405 prev_physical_node_id, adev->gmc.xgmi.physical_node_id); 5406 5407 adev->vm_manager.vram_base_offset = adev->gfxhub.funcs->get_mc_fb_offset(adev); 5408 adev->vm_manager.vram_base_offset += 5409 adev->gmc.xgmi.physical_node_id * adev->gmc.xgmi.node_segment_size; 5410 5411 return 0; 5412 } 5413 5414 /** 5415 * amdgpu_device_resume - initiate device resume 5416 * 5417 * @dev: drm dev pointer 5418 * @notify_clients: notify in-kernel DRM clients 5419 * 5420 * Bring the hw back to operating state (all asics). 5421 * Returns 0 for success or an error on failure. 5422 * Called at driver resume. 
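* Resume order: ASIC post if needed, IP blocks, KFD, user queues, then late
* init, with the delayed IB tests queued at the end.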
5423 */ 5424 int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) 5425 { 5426 struct amdgpu_device *adev = drm_to_adev(dev); 5427 int r = 0; 5428 5429 if (amdgpu_sriov_vf(adev)) { 5430 r = amdgpu_virt_request_full_gpu(adev, true); 5431 if (r) 5432 return r; 5433 } 5434 5435 if (amdgpu_virt_xgmi_migrate_enabled(adev)) { 5436 r = amdgpu_virt_resume(adev); 5437 if (r) 5438 goto exit; 5439 } 5440 5441 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5442 return 0; 5443 5444 if (adev->in_s0ix) 5445 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 5446 5447 /* post card */ 5448 if (amdgpu_device_need_post(adev)) { 5449 r = amdgpu_device_asic_init(adev); 5450 if (r) 5451 dev_err(adev->dev, "amdgpu asic init failed\n"); 5452 } 5453 5454 r = amdgpu_device_ip_resume(adev); 5455 5456 if (r) { 5457 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 5458 goto exit; 5459 } 5460 5461 r = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); 5462 if (r) 5463 goto exit; 5464 5465 r = amdgpu_userq_resume(adev); 5466 if (r) 5467 goto exit; 5468 5469 r = amdgpu_device_ip_late_init(adev); 5470 if (r) 5471 goto exit; 5472 5473 queue_delayed_work(system_wq, &adev->delayed_init_work, 5474 msecs_to_jiffies(AMDGPU_RESUME_MS)); 5475 exit: 5476 if (amdgpu_sriov_vf(adev)) { 5477 amdgpu_virt_init_data_exchange(adev); 5478 amdgpu_virt_release_full_gpu(adev, true); 5479 5480 if (!r && !adev->in_runpm) 5481 r = amdgpu_amdkfd_resume_process(adev); 5482 } 5483 5484 if (r) 5485 return r; 5486 5487 /* Make sure IB tests flushed */ 5488 flush_delayed_work(&adev->delayed_init_work); 5489 5490 if (notify_clients) 5491 drm_client_dev_resume(adev_to_drm(adev)); 5492 5493 amdgpu_ras_resume(adev); 5494 5495 if (adev->mode_info.num_crtc) { 5496 /* 5497 * Most of the connector probing functions try to acquire runtime pm 5498 * refs to ensure that the GPU is powered on when connector polling is 5499 * performed. Since we're calling this from a runtime PM callback, 5500 * trying to acquire rpm refs will cause us to deadlock. 5501 * 5502 * Since we're guaranteed to be holding the rpm lock, it's safe to 5503 * temporarily disable the rpm helpers so this doesn't deadlock us. 5504 */ 5505 #ifdef CONFIG_PM 5506 dev->dev->power.disable_depth++; 5507 #endif 5508 if (!adev->dc_enabled) 5509 drm_helper_hpd_irq_event(dev); 5510 else 5511 drm_kms_helper_hotplug_event(dev); 5512 #ifdef CONFIG_PM 5513 dev->dev->power.disable_depth--; 5514 #endif 5515 } 5516 5517 amdgpu_vram_mgr_clear_reset_blocks(adev); 5518 adev->in_suspend = false; 5519 5520 if (amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0)) 5521 dev_warn(adev->dev, "smart shift update failed\n"); 5522 5523 return 0; 5524 } 5525 5526 /** 5527 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 5528 * 5529 * @adev: amdgpu_device pointer 5530 * 5531 * The list of all the hardware IPs that make up the asic is walked and 5532 * the check_soft_reset callbacks are run. check_soft_reset determines 5533 * if the asic is still hung or not. 5534 * Returns true if any of the IPs are still in a hung state, false if not. 
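* SR-IOV VFs and ASICs that require a full reset are always reported as hung.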
5535 */ 5536 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 5537 { 5538 int i; 5539 bool asic_hang = false; 5540 5541 if (amdgpu_sriov_vf(adev)) 5542 return true; 5543 5544 if (amdgpu_asic_need_full_reset(adev)) 5545 return true; 5546 5547 for (i = 0; i < adev->num_ip_blocks; i++) { 5548 if (!adev->ip_blocks[i].status.valid) 5549 continue; 5550 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 5551 adev->ip_blocks[i].status.hang = 5552 adev->ip_blocks[i].version->funcs->check_soft_reset( 5553 &adev->ip_blocks[i]); 5554 if (adev->ip_blocks[i].status.hang) { 5555 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 5556 asic_hang = true; 5557 } 5558 } 5559 return asic_hang; 5560 } 5561 5562 /** 5563 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 5564 * 5565 * @adev: amdgpu_device pointer 5566 * 5567 * The list of all the hardware IPs that make up the asic is walked and the 5568 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 5569 * handles any IP specific hardware or software state changes that are 5570 * necessary for a soft reset to succeed. 5571 * Returns 0 on success, negative error code on failure. 5572 */ 5573 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 5574 { 5575 int i, r = 0; 5576 5577 for (i = 0; i < adev->num_ip_blocks; i++) { 5578 if (!adev->ip_blocks[i].status.valid) 5579 continue; 5580 if (adev->ip_blocks[i].status.hang && 5581 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 5582 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]); 5583 if (r) 5584 return r; 5585 } 5586 } 5587 5588 return 0; 5589 } 5590 5591 /** 5592 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 5593 * 5594 * @adev: amdgpu_device pointer 5595 * 5596 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 5597 * reset is necessary to recover. 5598 * Returns true if a full asic reset is required, false if not. 5599 */ 5600 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 5601 { 5602 int i; 5603 5604 if (amdgpu_asic_need_full_reset(adev)) 5605 return true; 5606 5607 for (i = 0; i < adev->num_ip_blocks; i++) { 5608 if (!adev->ip_blocks[i].status.valid) 5609 continue; 5610 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 5611 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 5612 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 5613 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 5614 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 5615 if (adev->ip_blocks[i].status.hang) { 5616 dev_info(adev->dev, "Some block need full reset!\n"); 5617 return true; 5618 } 5619 } 5620 } 5621 return false; 5622 } 5623 5624 /** 5625 * amdgpu_device_ip_soft_reset - do a soft reset 5626 * 5627 * @adev: amdgpu_device pointer 5628 * 5629 * The list of all the hardware IPs that make up the asic is walked and the 5630 * soft_reset callbacks are run if the block is hung. soft_reset handles any 5631 * IP specific hardware or software state changes that are necessary to soft 5632 * reset the IP. 5633 * Returns 0 on success, negative error code on failure. 
5634 */ 5635 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 5636 { 5637 int i, r = 0; 5638 5639 for (i = 0; i < adev->num_ip_blocks; i++) { 5640 if (!adev->ip_blocks[i].status.valid) 5641 continue; 5642 if (adev->ip_blocks[i].status.hang && 5643 adev->ip_blocks[i].version->funcs->soft_reset) { 5644 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]); 5645 if (r) 5646 return r; 5647 } 5648 } 5649 5650 return 0; 5651 } 5652 5653 /** 5654 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 5655 * 5656 * @adev: amdgpu_device pointer 5657 * 5658 * The list of all the hardware IPs that make up the asic is walked and the 5659 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 5660 * handles any IP specific hardware or software state changes that are 5661 * necessary after the IP has been soft reset. 5662 * Returns 0 on success, negative error code on failure. 5663 */ 5664 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 5665 { 5666 int i, r = 0; 5667 5668 for (i = 0; i < adev->num_ip_blocks; i++) { 5669 if (!adev->ip_blocks[i].status.valid) 5670 continue; 5671 if (adev->ip_blocks[i].status.hang && 5672 adev->ip_blocks[i].version->funcs->post_soft_reset) 5673 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]); 5674 if (r) 5675 return r; 5676 } 5677 5678 return 0; 5679 } 5680 5681 /** 5682 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 5683 * 5684 * @adev: amdgpu_device pointer 5685 * @reset_context: amdgpu reset context pointer 5686 * 5687 * do VF FLR and reinitialize Asic 5688 * return 0 means succeeded otherwise failed 5689 */ 5690 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 5691 struct amdgpu_reset_context *reset_context) 5692 { 5693 int r; 5694 struct amdgpu_hive_info *hive = NULL; 5695 5696 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { 5697 if (!amdgpu_ras_get_fed_status(adev)) 5698 amdgpu_virt_ready_to_reset(adev); 5699 amdgpu_virt_wait_reset(adev); 5700 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5701 r = amdgpu_virt_request_full_gpu(adev, true); 5702 } else { 5703 r = amdgpu_virt_reset_gpu(adev); 5704 } 5705 if (r) 5706 return r; 5707 5708 amdgpu_ras_clear_err_state(adev); 5709 amdgpu_irq_gpu_reset_resume_helper(adev); 5710 5711 /* some sw clean up VF needs to do before recover */ 5712 amdgpu_virt_post_reset(adev); 5713 5714 /* Resume IP prior to SMC */ 5715 r = amdgpu_device_ip_reinit_early_sriov(adev); 5716 if (r) 5717 return r; 5718 5719 amdgpu_virt_init_data_exchange(adev); 5720 5721 r = amdgpu_device_fw_loading(adev); 5722 if (r) 5723 return r; 5724 5725 /* now we are okay to resume SMC/CP/SDMA */ 5726 r = amdgpu_device_ip_reinit_late_sriov(adev); 5727 if (r) 5728 return r; 5729 5730 hive = amdgpu_get_xgmi_hive(adev); 5731 /* Update PSP FW topology after reset */ 5732 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5733 r = amdgpu_xgmi_update_topology(hive, adev); 5734 if (hive) 5735 amdgpu_put_xgmi_hive(hive); 5736 if (r) 5737 return r; 5738 5739 r = amdgpu_ib_ring_tests(adev); 5740 if (r) 5741 return r; 5742 5743 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) 5744 amdgpu_inc_vram_lost(adev); 5745 5746 /* need to be called during full access so we can't do it later like 5747 * bare-metal does. 
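 * On bare metal the equivalent amdgpu_amdkfd_post_reset() call happens
 * later instead, from amdgpu_device_gpu_resume().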
5748 */ 5749 amdgpu_amdkfd_post_reset(adev); 5750 amdgpu_virt_release_full_gpu(adev, true); 5751 5752 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5753 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) || 5754 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 5755 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 5756 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) || 5757 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5758 amdgpu_ras_resume(adev); 5759 5760 amdgpu_virt_ras_telemetry_post_reset(adev); 5761 5762 return 0; 5763 } 5764 5765 /** 5766 * amdgpu_device_has_job_running - check if there is any unfinished job 5767 * 5768 * @adev: amdgpu_device pointer 5769 * 5770 * check if there is any job running on the device when guest driver receives 5771 * FLR notification from host driver. If there are still jobs running, then 5772 * the guest driver will not respond the FLR reset. Instead, let the job hit 5773 * the timeout and guest driver then issue the reset request. 5774 */ 5775 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5776 { 5777 int i; 5778 5779 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5780 struct amdgpu_ring *ring = adev->rings[i]; 5781 5782 if (!amdgpu_ring_sched_ready(ring)) 5783 continue; 5784 5785 if (amdgpu_fence_count_emitted(ring)) 5786 return true; 5787 } 5788 return false; 5789 } 5790 5791 /** 5792 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5793 * 5794 * @adev: amdgpu_device pointer 5795 * 5796 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5797 * a hung GPU. 5798 */ 5799 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5800 { 5801 5802 if (amdgpu_gpu_recovery == 0) 5803 goto disabled; 5804 5805 /* Skip soft reset check in fatal error mode */ 5806 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5807 return true; 5808 5809 if (amdgpu_sriov_vf(adev)) 5810 return true; 5811 5812 if (amdgpu_gpu_recovery == -1) { 5813 switch (adev->asic_type) { 5814 #ifdef CONFIG_DRM_AMDGPU_SI 5815 case CHIP_VERDE: 5816 case CHIP_TAHITI: 5817 case CHIP_PITCAIRN: 5818 case CHIP_OLAND: 5819 case CHIP_HAINAN: 5820 #endif 5821 #ifdef CONFIG_DRM_AMDGPU_CIK 5822 case CHIP_KAVERI: 5823 case CHIP_KABINI: 5824 case CHIP_MULLINS: 5825 #endif 5826 case CHIP_CARRIZO: 5827 case CHIP_STONEY: 5828 case CHIP_CYAN_SKILLFISH: 5829 goto disabled; 5830 default: 5831 break; 5832 } 5833 } 5834 5835 return true; 5836 5837 disabled: 5838 dev_info(adev->dev, "GPU recovery disabled.\n"); 5839 return false; 5840 } 5841 5842 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5843 { 5844 u32 i; 5845 int ret = 0; 5846 5847 if (adev->bios) 5848 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5849 5850 dev_info(adev->dev, "GPU mode1 reset\n"); 5851 5852 /* Cache the state before bus master disable. The saved config space 5853 * values are used in other cases like restore after mode-2 reset. 
5854 */ 5855 amdgpu_device_cache_pci_state(adev->pdev); 5856 5857 /* disable BM */ 5858 pci_clear_master(adev->pdev); 5859 5860 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5861 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5862 ret = amdgpu_dpm_mode1_reset(adev); 5863 } else { 5864 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5865 ret = psp_gpu_reset(adev); 5866 } 5867 5868 if (ret) 5869 goto mode1_reset_failed; 5870 5871 amdgpu_device_load_pci_state(adev->pdev); 5872 ret = amdgpu_psp_wait_for_bootloader(adev); 5873 if (ret) 5874 goto mode1_reset_failed; 5875 5876 /* wait for asic to come out of reset */ 5877 for (i = 0; i < adev->usec_timeout; i++) { 5878 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5879 5880 if (memsize != 0xffffffff) 5881 break; 5882 udelay(1); 5883 } 5884 5885 if (i >= adev->usec_timeout) { 5886 ret = -ETIMEDOUT; 5887 goto mode1_reset_failed; 5888 } 5889 5890 if (adev->bios) 5891 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5892 5893 return 0; 5894 5895 mode1_reset_failed: 5896 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5897 return ret; 5898 } 5899 5900 int amdgpu_device_link_reset(struct amdgpu_device *adev) 5901 { 5902 int ret = 0; 5903 5904 dev_info(adev->dev, "GPU link reset\n"); 5905 5906 if (!amdgpu_reset_in_dpc(adev)) 5907 ret = amdgpu_dpm_link_reset(adev); 5908 5909 if (ret) 5910 goto link_reset_failed; 5911 5912 ret = amdgpu_psp_wait_for_bootloader(adev); 5913 if (ret) 5914 goto link_reset_failed; 5915 5916 return 0; 5917 5918 link_reset_failed: 5919 dev_err(adev->dev, "GPU link reset failed\n"); 5920 return ret; 5921 } 5922 5923 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5924 struct amdgpu_reset_context *reset_context) 5925 { 5926 int i, r = 0; 5927 struct amdgpu_job *job = NULL; 5928 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; 5929 bool need_full_reset = 5930 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5931 5932 if (reset_context->reset_req_dev == adev) 5933 job = reset_context->job; 5934 5935 if (amdgpu_sriov_vf(adev)) 5936 amdgpu_virt_pre_reset(adev); 5937 5938 amdgpu_fence_driver_isr_toggle(adev, true); 5939 5940 /* block all schedulers and reset given job's ring */ 5941 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5942 struct amdgpu_ring *ring = adev->rings[i]; 5943 5944 if (!amdgpu_ring_sched_ready(ring)) 5945 continue; 5946 5947 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5948 amdgpu_fence_driver_force_completion(ring); 5949 } 5950 5951 amdgpu_fence_driver_isr_toggle(adev, false); 5952 5953 if (job && job->vm) 5954 drm_sched_increase_karma(&job->base); 5955 5956 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5957 /* If reset handler not implemented, continue; otherwise return */ 5958 if (r == -EOPNOTSUPP) 5959 r = 0; 5960 else 5961 return r; 5962 5963 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5964 if (!amdgpu_sriov_vf(adev)) { 5965 5966 if (!need_full_reset) 5967 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5968 5969 if (!need_full_reset && amdgpu_gpu_recovery && 5970 amdgpu_device_ip_check_soft_reset(adev)) { 5971 amdgpu_device_ip_pre_soft_reset(adev); 5972 r = amdgpu_device_ip_soft_reset(adev); 5973 amdgpu_device_ip_post_soft_reset(adev); 5974 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5975 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5976 need_full_reset = true; 5977 } 5978 } 5979 5980 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { 
5981 dev_info(tmp_adev->dev, "Dumping IP State\n"); 5982 /* Trigger ip dump before we reset the asic */ 5983 for (i = 0; i < tmp_adev->num_ip_blocks; i++) 5984 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) 5985 tmp_adev->ip_blocks[i].version->funcs 5986 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]); 5987 dev_info(tmp_adev->dev, "Dumping IP State Completed\n"); 5988 } 5989 5990 if (need_full_reset) 5991 r = amdgpu_device_ip_suspend(adev); 5992 if (need_full_reset) 5993 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5994 else 5995 clear_bit(AMDGPU_NEED_FULL_RESET, 5996 &reset_context->flags); 5997 } 5998 5999 return r; 6000 } 6001 6002 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) 6003 { 6004 struct list_head *device_list_handle; 6005 bool full_reset, vram_lost = false; 6006 struct amdgpu_device *tmp_adev; 6007 int r, init_level; 6008 6009 device_list_handle = reset_context->reset_device_list; 6010 6011 if (!device_list_handle) 6012 return -EINVAL; 6013 6014 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 6015 6016 /** 6017 * If it's reset on init, it's default init level, otherwise keep level 6018 * as recovery level. 6019 */ 6020 if (reset_context->method == AMD_RESET_METHOD_ON_INIT) 6021 init_level = AMDGPU_INIT_LEVEL_DEFAULT; 6022 else 6023 init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY; 6024 6025 r = 0; 6026 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6027 amdgpu_set_init_level(tmp_adev, init_level); 6028 if (full_reset) { 6029 /* post card */ 6030 amdgpu_reset_set_dpc_status(tmp_adev, false); 6031 amdgpu_ras_clear_err_state(tmp_adev); 6032 r = amdgpu_device_asic_init(tmp_adev); 6033 if (r) { 6034 dev_warn(tmp_adev->dev, "asic atom init failed!"); 6035 } else { 6036 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 6037 6038 r = amdgpu_device_ip_resume_phase1(tmp_adev); 6039 if (r) 6040 goto out; 6041 6042 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 6043 6044 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 6045 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job); 6046 6047 if (vram_lost) { 6048 dev_info( 6049 tmp_adev->dev, 6050 "VRAM is lost due to GPU reset!\n"); 6051 amdgpu_inc_vram_lost(tmp_adev); 6052 } 6053 6054 r = amdgpu_device_fw_loading(tmp_adev); 6055 if (r) 6056 return r; 6057 6058 r = amdgpu_xcp_restore_partition_mode( 6059 tmp_adev->xcp_mgr); 6060 if (r) 6061 goto out; 6062 6063 r = amdgpu_device_ip_resume_phase2(tmp_adev); 6064 if (r) 6065 goto out; 6066 6067 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 6068 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 6069 6070 r = amdgpu_device_ip_resume_phase3(tmp_adev); 6071 if (r) 6072 goto out; 6073 6074 if (vram_lost) 6075 amdgpu_device_fill_reset_magic(tmp_adev); 6076 6077 /* 6078 * Add this ASIC as tracked as reset was already 6079 * complete successfully. 6080 */ 6081 amdgpu_register_gpu_instance(tmp_adev); 6082 6083 if (!reset_context->hive && 6084 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 6085 amdgpu_xgmi_add_device(tmp_adev); 6086 6087 r = amdgpu_device_ip_late_init(tmp_adev); 6088 if (r) 6089 goto out; 6090 6091 r = amdgpu_userq_post_reset(tmp_adev, vram_lost); 6092 if (r) 6093 goto out; 6094 6095 drm_client_dev_resume(adev_to_drm(tmp_adev)); 6096 6097 /* 6098 * The GPU enters bad state once faulty pages 6099 * by ECC has reached the threshold, and ras 6100 * recovery is scheduled next. 
So add one check 6101 * here to break recovery if it indeed exceeds 6102 * bad page threshold, and remind user to 6103 * retire this GPU or setting one bigger 6104 * bad_page_threshold value to fix this once 6105 * probing driver again. 6106 */ 6107 if (!amdgpu_ras_is_rma(tmp_adev)) { 6108 /* must succeed. */ 6109 amdgpu_ras_resume(tmp_adev); 6110 } else { 6111 r = -EINVAL; 6112 goto out; 6113 } 6114 6115 /* Update PSP FW topology after reset */ 6116 if (reset_context->hive && 6117 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 6118 r = amdgpu_xgmi_update_topology( 6119 reset_context->hive, tmp_adev); 6120 } 6121 } 6122 6123 out: 6124 if (!r) { 6125 /* IP init is complete now, set level as default */ 6126 amdgpu_set_init_level(tmp_adev, 6127 AMDGPU_INIT_LEVEL_DEFAULT); 6128 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 6129 r = amdgpu_ib_ring_tests(tmp_adev); 6130 if (r) { 6131 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 6132 r = -EAGAIN; 6133 goto end; 6134 } 6135 } 6136 6137 if (r) 6138 tmp_adev->asic_reset_res = r; 6139 } 6140 6141 end: 6142 return r; 6143 } 6144 6145 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 6146 struct amdgpu_reset_context *reset_context) 6147 { 6148 struct amdgpu_device *tmp_adev = NULL; 6149 bool need_full_reset, skip_hw_reset; 6150 int r = 0; 6151 6152 /* Try reset handler method first */ 6153 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 6154 reset_list); 6155 6156 reset_context->reset_device_list = device_list_handle; 6157 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 6158 /* If reset handler not implemented, continue; otherwise return */ 6159 if (r == -EOPNOTSUPP) 6160 r = 0; 6161 else 6162 return r; 6163 6164 /* Reset handler not implemented, use the default method */ 6165 need_full_reset = 6166 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 6167 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 6168 6169 /* 6170 * ASIC reset has to be done on all XGMI hive nodes ASAP 6171 * to allow proper links negotiation in FW (within 1 sec) 6172 */ 6173 if (!skip_hw_reset && need_full_reset) { 6174 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6175 /* For XGMI run all resets in parallel to speed up the process */ 6176 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 6177 if (!queue_work(system_unbound_wq, 6178 &tmp_adev->xgmi_reset_work)) 6179 r = -EALREADY; 6180 } else 6181 r = amdgpu_asic_reset(tmp_adev); 6182 6183 if (r) { 6184 dev_err(tmp_adev->dev, 6185 "ASIC reset failed with error, %d for drm dev, %s", 6186 r, adev_to_drm(tmp_adev)->unique); 6187 goto out; 6188 } 6189 } 6190 6191 /* For XGMI wait for all resets to complete before proceed */ 6192 if (!r) { 6193 list_for_each_entry(tmp_adev, device_list_handle, 6194 reset_list) { 6195 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 6196 flush_work(&tmp_adev->xgmi_reset_work); 6197 r = tmp_adev->asic_reset_res; 6198 if (r) 6199 break; 6200 } 6201 } 6202 } 6203 } 6204 6205 if (!r && amdgpu_ras_intr_triggered()) { 6206 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6207 amdgpu_ras_reset_error_count(tmp_adev, 6208 AMDGPU_RAS_BLOCK__MMHUB); 6209 } 6210 6211 amdgpu_ras_intr_cleared(); 6212 } 6213 6214 r = amdgpu_device_reinit_after_reset(reset_context); 6215 if (r == -EAGAIN) 6216 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 6217 else 6218 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 6219 6220 out: 6221 return r; 6222 } 6223 6224 static void 
amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 6225 { 6226 6227 switch (amdgpu_asic_reset_method(adev)) { 6228 case AMD_RESET_METHOD_MODE1: 6229 case AMD_RESET_METHOD_LINK: 6230 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 6231 break; 6232 case AMD_RESET_METHOD_MODE2: 6233 adev->mp1_state = PP_MP1_STATE_RESET; 6234 break; 6235 default: 6236 adev->mp1_state = PP_MP1_STATE_NONE; 6237 break; 6238 } 6239 } 6240 6241 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 6242 { 6243 amdgpu_vf_error_trans_all(adev); 6244 adev->mp1_state = PP_MP1_STATE_NONE; 6245 } 6246 6247 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 6248 { 6249 struct pci_dev *p = NULL; 6250 6251 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 6252 adev->pdev->bus->number, 1); 6253 if (p) { 6254 pm_runtime_enable(&(p->dev)); 6255 pm_runtime_resume(&(p->dev)); 6256 } 6257 6258 pci_dev_put(p); 6259 } 6260 6261 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 6262 { 6263 enum amd_reset_method reset_method; 6264 struct pci_dev *p = NULL; 6265 u64 expires; 6266 6267 /* 6268 * For now, only BACO and mode1 reset are confirmed 6269 * to suffer the audio issue without proper suspended. 6270 */ 6271 reset_method = amdgpu_asic_reset_method(adev); 6272 if ((reset_method != AMD_RESET_METHOD_BACO) && 6273 (reset_method != AMD_RESET_METHOD_MODE1)) 6274 return -EINVAL; 6275 6276 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 6277 adev->pdev->bus->number, 1); 6278 if (!p) 6279 return -ENODEV; 6280 6281 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 6282 if (!expires) 6283 /* 6284 * If we cannot get the audio device autosuspend delay, 6285 * a fixed 4S interval will be used. Considering 3S is 6286 * the audio controller default autosuspend delay setting. 6287 * 4S used here is guaranteed to cover that. 6288 */ 6289 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 6290 6291 while (!pm_runtime_status_suspended(&(p->dev))) { 6292 if (!pm_runtime_suspend(&(p->dev))) 6293 break; 6294 6295 if (expires < ktime_get_mono_fast_ns()) { 6296 dev_warn(adev->dev, "failed to suspend display audio\n"); 6297 pci_dev_put(p); 6298 /* TODO: abort the succeeding gpu reset? */ 6299 return -ETIMEDOUT; 6300 } 6301 } 6302 6303 pm_runtime_disable(&(p->dev)); 6304 6305 pci_dev_put(p); 6306 return 0; 6307 } 6308 6309 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 6310 { 6311 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 6312 6313 #if defined(CONFIG_DEBUG_FS) 6314 if (!amdgpu_sriov_vf(adev)) 6315 cancel_work(&adev->reset_work); 6316 #endif 6317 cancel_work(&adev->userq_reset_work); 6318 6319 if (adev->kfd.dev) 6320 cancel_work(&adev->kfd.reset_work); 6321 6322 if (amdgpu_sriov_vf(adev)) 6323 cancel_work(&adev->virt.flr_work); 6324 6325 if (con && adev->ras_enabled) 6326 cancel_work(&con->recovery_work); 6327 6328 } 6329 6330 static int amdgpu_device_health_check(struct list_head *device_list_handle) 6331 { 6332 struct amdgpu_device *tmp_adev; 6333 int ret = 0; 6334 6335 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6336 ret |= amdgpu_device_bus_status_check(tmp_adev); 6337 } 6338 6339 return ret; 6340 } 6341 6342 static void amdgpu_device_recovery_prepare(struct amdgpu_device *adev, 6343 struct list_head *device_list, 6344 struct amdgpu_hive_info *hive) 6345 { 6346 struct amdgpu_device *tmp_adev = NULL; 6347 6348 /* 6349 * Build list of devices to reset. 
6350 * In case we are in XGMI hive mode, resort the device list 6351 * to put adev in the 1st position. 6352 */ 6353 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) { 6354 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 6355 list_add_tail(&tmp_adev->reset_list, device_list); 6356 if (adev->shutdown) 6357 tmp_adev->shutdown = true; 6358 if (amdgpu_reset_in_dpc(adev)) 6359 tmp_adev->pcie_reset_ctx.in_link_reset = true; 6360 } 6361 if (!list_is_first(&adev->reset_list, device_list)) 6362 list_rotate_to_front(&adev->reset_list, device_list); 6363 } else { 6364 list_add_tail(&adev->reset_list, device_list); 6365 } 6366 } 6367 6368 static void amdgpu_device_recovery_get_reset_lock(struct amdgpu_device *adev, 6369 struct list_head *device_list) 6370 { 6371 struct amdgpu_device *tmp_adev = NULL; 6372 6373 if (list_empty(device_list)) 6374 return; 6375 tmp_adev = 6376 list_first_entry(device_list, struct amdgpu_device, reset_list); 6377 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 6378 } 6379 6380 static void amdgpu_device_recovery_put_reset_lock(struct amdgpu_device *adev, 6381 struct list_head *device_list) 6382 { 6383 struct amdgpu_device *tmp_adev = NULL; 6384 6385 if (list_empty(device_list)) 6386 return; 6387 tmp_adev = 6388 list_first_entry(device_list, struct amdgpu_device, reset_list); 6389 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 6390 } 6391 6392 static void amdgpu_device_halt_activities(struct amdgpu_device *adev, 6393 struct amdgpu_job *job, 6394 struct amdgpu_reset_context *reset_context, 6395 struct list_head *device_list, 6396 struct amdgpu_hive_info *hive, 6397 bool need_emergency_restart) 6398 { 6399 struct amdgpu_device *tmp_adev = NULL; 6400 int i; 6401 6402 /* block all schedulers and reset given job's ring */ 6403 list_for_each_entry(tmp_adev, device_list, reset_list) { 6404 amdgpu_device_set_mp1_state(tmp_adev); 6405 6406 /* 6407 * Try to put the audio codec into suspend state 6408 * before gpu reset started. 6409 * 6410 * Due to the power domain of the graphics device 6411 * is shared with AZ power domain. Without this, 6412 * we may change the audio hardware from behind 6413 * the audio driver's back. That will trigger 6414 * some audio codec errors. 6415 */ 6416 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 6417 tmp_adev->pcie_reset_ctx.audio_suspended = true; 6418 6419 amdgpu_ras_set_error_query_ready(tmp_adev, false); 6420 6421 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 6422 6423 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context); 6424 6425 /* 6426 * Mark these ASICs to be reset as untracked first 6427 * And add them back after reset completed 6428 */ 6429 amdgpu_unregister_gpu_instance(tmp_adev); 6430 6431 drm_client_dev_suspend(adev_to_drm(tmp_adev)); 6432 6433 /* disable ras on ALL IPs */ 6434 if (!need_emergency_restart && !amdgpu_reset_in_dpc(adev) && 6435 amdgpu_device_ip_need_full_reset(tmp_adev)) 6436 amdgpu_ras_suspend(tmp_adev); 6437 6438 amdgpu_userq_pre_reset(tmp_adev); 6439 6440 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6441 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6442 6443 if (!amdgpu_ring_sched_ready(ring)) 6444 continue; 6445 6446 drm_sched_stop(&ring->sched, job ? 
&job->base : NULL); 6447 6448 if (need_emergency_restart) 6449 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 6450 } 6451 atomic_inc(&tmp_adev->gpu_reset_counter); 6452 } 6453 } 6454 6455 static int amdgpu_device_asic_reset(struct amdgpu_device *adev, 6456 struct list_head *device_list, 6457 struct amdgpu_reset_context *reset_context) 6458 { 6459 struct amdgpu_device *tmp_adev = NULL; 6460 int retry_limit = AMDGPU_MAX_RETRY_LIMIT; 6461 int r = 0; 6462 6463 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 6464 list_for_each_entry(tmp_adev, device_list, reset_list) { 6465 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 6466 /*TODO Should we stop ?*/ 6467 if (r) { 6468 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 6469 r, adev_to_drm(tmp_adev)->unique); 6470 tmp_adev->asic_reset_res = r; 6471 } 6472 } 6473 6474 /* Actual ASIC resets if needed.*/ 6475 /* Host driver will handle XGMI hive reset for SRIOV */ 6476 if (amdgpu_sriov_vf(adev)) { 6477 6478 /* Bail out of reset early */ 6479 if (amdgpu_ras_is_rma(adev)) 6480 return -ENODEV; 6481 6482 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { 6483 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); 6484 amdgpu_ras_set_fed(adev, true); 6485 set_bit(AMDGPU_HOST_FLR, &reset_context->flags); 6486 } 6487 6488 r = amdgpu_device_reset_sriov(adev, reset_context); 6489 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) { 6490 amdgpu_virt_release_full_gpu(adev, true); 6491 goto retry; 6492 } 6493 if (r) 6494 adev->asic_reset_res = r; 6495 } else { 6496 r = amdgpu_do_asic_reset(device_list, reset_context); 6497 if (r && r == -EAGAIN) 6498 goto retry; 6499 } 6500 6501 list_for_each_entry(tmp_adev, device_list, reset_list) { 6502 /* 6503 * Drop any pending non scheduler resets queued before reset is done. 6504 * Any reset scheduled after this point would be valid. Scheduler resets 6505 * were already dropped during drm_sched_stop and no new ones can come 6506 * in before drm_sched_start. 6507 */ 6508 amdgpu_device_stop_pending_resets(tmp_adev); 6509 } 6510 6511 return r; 6512 } 6513 6514 static int amdgpu_device_sched_resume(struct list_head *device_list, 6515 struct amdgpu_reset_context *reset_context, 6516 bool job_signaled) 6517 { 6518 struct amdgpu_device *tmp_adev = NULL; 6519 int i, r = 0; 6520 6521 /* Post ASIC reset for all devs .*/ 6522 list_for_each_entry(tmp_adev, device_list, reset_list) { 6523 6524 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6525 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6526 6527 if (!amdgpu_ring_sched_ready(ring)) 6528 continue; 6529 6530 drm_sched_start(&ring->sched, 0); 6531 } 6532 6533 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 6534 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 6535 6536 if (tmp_adev->asic_reset_res) { 6537 /* bad news, how to tell it to userspace ? 
6538 * for ras error, we should report GPU bad status instead of 6539 * reset failure 6540 */ 6541 if (reset_context->src != AMDGPU_RESET_SRC_RAS || 6542 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) 6543 dev_info( 6544 tmp_adev->dev, 6545 "GPU reset(%d) failed with error %d \n", 6546 atomic_read( 6547 &tmp_adev->gpu_reset_counter), 6548 tmp_adev->asic_reset_res); 6549 amdgpu_vf_error_put(tmp_adev, 6550 AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, 6551 tmp_adev->asic_reset_res); 6552 if (!r) 6553 r = tmp_adev->asic_reset_res; 6554 tmp_adev->asic_reset_res = 0; 6555 } else { 6556 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", 6557 atomic_read(&tmp_adev->gpu_reset_counter)); 6558 if (amdgpu_acpi_smart_shift_update(tmp_adev, 6559 AMDGPU_SS_DEV_D0)) 6560 dev_warn(tmp_adev->dev, 6561 "smart shift update failed\n"); 6562 } 6563 } 6564 6565 return r; 6566 } 6567 6568 static void amdgpu_device_gpu_resume(struct amdgpu_device *adev, 6569 struct list_head *device_list, 6570 bool need_emergency_restart) 6571 { 6572 struct amdgpu_device *tmp_adev = NULL; 6573 6574 list_for_each_entry(tmp_adev, device_list, reset_list) { 6575 /* unlock kfd: SRIOV would do it separately */ 6576 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 6577 amdgpu_amdkfd_post_reset(tmp_adev); 6578 6579 /* kfd_post_reset will do nothing if kfd device is not initialized, 6580 * need to bring up kfd here if it's not be initialized before 6581 */ 6582 if (!adev->kfd.init_complete) 6583 amdgpu_amdkfd_device_init(adev); 6584 6585 if (tmp_adev->pcie_reset_ctx.audio_suspended) 6586 amdgpu_device_resume_display_audio(tmp_adev); 6587 6588 amdgpu_device_unset_mp1_state(tmp_adev); 6589 6590 amdgpu_ras_set_error_query_ready(tmp_adev, true); 6591 6592 } 6593 } 6594 6595 6596 /** 6597 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 6598 * 6599 * @adev: amdgpu_device pointer 6600 * @job: which job trigger hang 6601 * @reset_context: amdgpu reset context pointer 6602 * 6603 * Attempt to reset the GPU if it has hung (all asics). 6604 * Attempt to do soft-reset or full-reset and reinitialize Asic 6605 * Returns 0 for success or an error on failure. 6606 */ 6607 6608 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 6609 struct amdgpu_job *job, 6610 struct amdgpu_reset_context *reset_context) 6611 { 6612 struct list_head device_list; 6613 bool job_signaled = false; 6614 struct amdgpu_hive_info *hive = NULL; 6615 int r = 0; 6616 bool need_emergency_restart = false; 6617 6618 /* 6619 * If it reaches here because of hang/timeout and a RAS error is 6620 * detected at the same time, let RAS recovery take care of it. 6621 */ 6622 if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) && 6623 !amdgpu_sriov_vf(adev) && 6624 reset_context->src != AMDGPU_RESET_SRC_RAS) { 6625 dev_dbg(adev->dev, 6626 "Gpu recovery from source: %d yielding to RAS error recovery handling", 6627 reset_context->src); 6628 return 0; 6629 } 6630 6631 /* 6632 * Special case: RAS triggered and full reset isn't supported 6633 */ 6634 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 6635 6636 /* 6637 * Flush RAM to disk so that after reboot 6638 * the user can read log and see why the system rebooted. 6639 */ 6640 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 6641 amdgpu_ras_get_context(adev)->reboot) { 6642 dev_warn(adev->dev, "Emergency reboot."); 6643 6644 ksys_sync_helper(); 6645 emergency_restart(); 6646 } 6647 6648 dev_info(adev->dev, "GPU %s begin!. Source: %d\n", 6649 need_emergency_restart ? 
"jobs stop" : "reset", 6650 reset_context->src); 6651 6652 if (!amdgpu_sriov_vf(adev)) 6653 hive = amdgpu_get_xgmi_hive(adev); 6654 if (hive) 6655 mutex_lock(&hive->hive_lock); 6656 6657 reset_context->job = job; 6658 reset_context->hive = hive; 6659 INIT_LIST_HEAD(&device_list); 6660 6661 amdgpu_device_recovery_prepare(adev, &device_list, hive); 6662 6663 if (!amdgpu_sriov_vf(adev)) { 6664 r = amdgpu_device_health_check(&device_list); 6665 if (r) 6666 goto end_reset; 6667 } 6668 6669 /* Cannot be called after locking reset domain */ 6670 amdgpu_ras_pre_reset(adev, &device_list); 6671 6672 /* We need to lock reset domain only once both for XGMI and single device */ 6673 amdgpu_device_recovery_get_reset_lock(adev, &device_list); 6674 6675 amdgpu_device_halt_activities(adev, job, reset_context, &device_list, 6676 hive, need_emergency_restart); 6677 if (need_emergency_restart) 6678 goto skip_sched_resume; 6679 /* 6680 * Must check guilty signal here since after this point all old 6681 * HW fences are force signaled. 6682 * 6683 * job->base holds a reference to parent fence 6684 */ 6685 if (job && dma_fence_is_signaled(&job->hw_fence->base)) { 6686 job_signaled = true; 6687 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 6688 goto skip_hw_reset; 6689 } 6690 6691 r = amdgpu_device_asic_reset(adev, &device_list, reset_context); 6692 if (r) 6693 goto reset_unlock; 6694 skip_hw_reset: 6695 r = amdgpu_device_sched_resume(&device_list, reset_context, job_signaled); 6696 if (r) 6697 goto reset_unlock; 6698 skip_sched_resume: 6699 amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart); 6700 reset_unlock: 6701 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 6702 amdgpu_ras_post_reset(adev, &device_list); 6703 end_reset: 6704 if (hive) { 6705 mutex_unlock(&hive->hive_lock); 6706 amdgpu_put_xgmi_hive(hive); 6707 } 6708 6709 if (r) 6710 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 6711 6712 atomic_set(&adev->reset_domain->reset_res, r); 6713 6714 if (!r) { 6715 struct amdgpu_task_info *ti = NULL; 6716 6717 if (job) 6718 ti = amdgpu_vm_get_task_info_pasid(adev, job->pasid); 6719 6720 drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, 6721 ti ? &ti->task : NULL); 6722 6723 amdgpu_vm_put_task_info(ti); 6724 } 6725 6726 return r; 6727 } 6728 6729 /** 6730 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner 6731 * 6732 * @adev: amdgpu_device pointer 6733 * @speed: pointer to the speed of the link 6734 * @width: pointer to the width of the link 6735 * 6736 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6737 * first physical partner to an AMD dGPU. 6738 * This will exclude any virtual switches and links. 
6739 */ 6740 static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev, 6741 enum pci_bus_speed *speed, 6742 enum pcie_link_width *width) 6743 { 6744 struct pci_dev *parent = adev->pdev; 6745 6746 if (!speed || !width) 6747 return; 6748 6749 *speed = PCI_SPEED_UNKNOWN; 6750 *width = PCIE_LNK_WIDTH_UNKNOWN; 6751 6752 if (amdgpu_device_pcie_dynamic_switching_supported(adev)) { 6753 while ((parent = pci_upstream_bridge(parent))) { 6754 /* skip upstream/downstream switches internal to dGPU */ 6755 if (parent->vendor == PCI_VENDOR_ID_ATI) 6756 continue; 6757 *speed = pcie_get_speed_cap(parent); 6758 *width = pcie_get_width_cap(parent); 6759 break; 6760 } 6761 } else { 6762 /* use the current speeds rather than max if switching is not supported */ 6763 pcie_bandwidth_available(adev->pdev, NULL, speed, width); 6764 } 6765 } 6766 6767 /** 6768 * amdgpu_device_gpu_bandwidth - find the bandwidth of the GPU 6769 * 6770 * @adev: amdgpu_device pointer 6771 * @speed: pointer to the speed of the link 6772 * @width: pointer to the width of the link 6773 * 6774 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6775 * AMD dGPU which may be a virtual upstream bridge. 6776 */ 6777 static void amdgpu_device_gpu_bandwidth(struct amdgpu_device *adev, 6778 enum pci_bus_speed *speed, 6779 enum pcie_link_width *width) 6780 { 6781 struct pci_dev *parent = adev->pdev; 6782 6783 if (!speed || !width) 6784 return; 6785 6786 parent = pci_upstream_bridge(parent); 6787 if (parent && parent->vendor == PCI_VENDOR_ID_ATI) { 6788 /* use the upstream/downstream switches internal to dGPU */ 6789 *speed = pcie_get_speed_cap(parent); 6790 *width = pcie_get_width_cap(parent); 6791 while ((parent = pci_upstream_bridge(parent))) { 6792 if (parent->vendor == PCI_VENDOR_ID_ATI) { 6793 /* use the upstream/downstream switches internal to dGPU */ 6794 *speed = pcie_get_speed_cap(parent); 6795 *width = pcie_get_width_cap(parent); 6796 } 6797 } 6798 } else { 6799 /* use the device itself */ 6800 *speed = pcie_get_speed_cap(adev->pdev); 6801 *width = pcie_get_width_cap(adev->pdev); 6802 } 6803 } 6804 6805 /** 6806 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot 6807 * 6808 * @adev: amdgpu_device pointer 6809 * 6810 * Fetches and stores in the driver the PCIE capabilities (gen speed 6811 * and lanes) of the slot the device is in. Handles APUs and 6812 * virtualized environments where PCIE config space may not be available.
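 *
 * The result is stored as CAIL_(ASIC_)PCIE_LINK_SPEED_SUPPORT_* and
 * CAIL_(ASIC_)PCIE_LINK_WIDTH_SUPPORT_* bits in adev->pm.pcie_gen_mask and
 * adev->pm.pcie_mlw_mask; both masks can be overridden up front via the
 * amdgpu_pcie_gen_cap and amdgpu_pcie_lane_cap module options.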
6813 */ 6814 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 6815 { 6816 enum pci_bus_speed speed_cap, platform_speed_cap; 6817 enum pcie_link_width platform_link_width, link_width; 6818 6819 if (amdgpu_pcie_gen_cap) 6820 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 6821 6822 if (amdgpu_pcie_lane_cap) 6823 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 6824 6825 /* covers APUs as well */ 6826 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 6827 if (adev->pm.pcie_gen_mask == 0) 6828 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 6829 if (adev->pm.pcie_mlw_mask == 0) 6830 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 6831 return; 6832 } 6833 6834 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 6835 return; 6836 6837 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap, 6838 &platform_link_width); 6839 amdgpu_device_gpu_bandwidth(adev, &speed_cap, &link_width); 6840 6841 if (adev->pm.pcie_gen_mask == 0) { 6842 /* asic caps */ 6843 if (speed_cap == PCI_SPEED_UNKNOWN) { 6844 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6845 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6846 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6847 } else { 6848 if (speed_cap == PCIE_SPEED_32_0GT) 6849 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6850 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6851 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6852 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6853 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 6854 else if (speed_cap == PCIE_SPEED_16_0GT) 6855 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6856 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6857 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6858 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 6859 else if (speed_cap == PCIE_SPEED_8_0GT) 6860 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6861 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6862 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6863 else if (speed_cap == PCIE_SPEED_5_0GT) 6864 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6865 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 6866 else 6867 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 6868 } 6869 /* platform caps */ 6870 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 6871 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6872 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6873 } else { 6874 if (platform_speed_cap == PCIE_SPEED_32_0GT) 6875 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6876 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6877 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6878 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6879 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 6880 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 6881 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6882 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6883 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6884 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 6885 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 6886 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6887 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6888 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 6889 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 6890 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6891 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6892 else 6893 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 6894 6895 } 6896 } 6897 if (adev->pm.pcie_mlw_mask == 0) { 6898 /* asic caps */ 6899 if (link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6900 
adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_ASIC_PCIE_MLW_MASK; 6901 } else { 6902 switch (link_width) { 6903 case PCIE_LNK_X32: 6904 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X32 | 6905 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6906 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6907 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6908 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6909 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6910 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6911 break; 6912 case PCIE_LNK_X16: 6913 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6914 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6915 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6916 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6917 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6918 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6919 break; 6920 case PCIE_LNK_X12: 6921 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6922 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6923 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6924 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6925 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6926 break; 6927 case PCIE_LNK_X8: 6928 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6929 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6930 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6931 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6932 break; 6933 case PCIE_LNK_X4: 6934 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6935 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6936 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6937 break; 6938 case PCIE_LNK_X2: 6939 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6940 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6941 break; 6942 case PCIE_LNK_X1: 6943 adev->pm.pcie_mlw_mask |= CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1; 6944 break; 6945 default: 6946 break; 6947 } 6948 } 6949 /* platform caps */ 6950 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6951 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 6952 } else { 6953 switch (platform_link_width) { 6954 case PCIE_LNK_X32: 6955 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 6956 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6957 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6958 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6959 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6960 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6961 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6962 break; 6963 case PCIE_LNK_X16: 6964 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6965 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6966 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6967 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6968 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6969 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6970 break; 6971 case PCIE_LNK_X12: 6972 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6973 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6974 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6975 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6976 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6977 break; 6978 case PCIE_LNK_X8: 6979 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6980 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6981 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6982 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6983 break; 6984 case PCIE_LNK_X4: 6985 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6986 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6987 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6988 break; 6989 case PCIE_LNK_X2: 6990 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6991 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6992 break; 6993 case PCIE_LNK_X1: 6994 adev->pm.pcie_mlw_mask |= CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 6995 break; 6996 
default: 6997 break; 6998 } 6999 } 7000 } 7001 } 7002 7003 /** 7004 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 7005 * 7006 * @adev: amdgpu_device pointer 7007 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 7008 * 7009 * Return true if @peer_adev can access (DMA) @adev through the PCIe 7010 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 7011 * @peer_adev. 7012 */ 7013 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 7014 struct amdgpu_device *peer_adev) 7015 { 7016 #ifdef CONFIG_HSA_AMD_P2P 7017 bool p2p_access = 7018 !adev->gmc.xgmi.connected_to_cpu && 7019 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 7020 if (!p2p_access) 7021 dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n", 7022 pci_name(peer_adev->pdev)); 7023 7024 bool is_large_bar = adev->gmc.visible_vram_size && 7025 adev->gmc.real_vram_size == adev->gmc.visible_vram_size; 7026 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev); 7027 7028 if (!p2p_addressable) { 7029 uint64_t address_mask = peer_adev->dev->dma_mask ? 7030 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 7031 resource_size_t aper_limit = 7032 adev->gmc.aper_base + adev->gmc.aper_size - 1; 7033 7034 p2p_addressable = !(adev->gmc.aper_base & address_mask || 7035 aper_limit & address_mask); 7036 } 7037 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable; 7038 #else 7039 return false; 7040 #endif 7041 } 7042 7043 int amdgpu_device_baco_enter(struct amdgpu_device *adev) 7044 { 7045 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 7046 7047 if (!amdgpu_device_supports_baco(adev)) 7048 return -ENOTSUPP; 7049 7050 if (ras && adev->ras_enabled && 7051 adev->nbio.funcs->enable_doorbell_interrupt) 7052 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 7053 7054 return amdgpu_dpm_baco_enter(adev); 7055 } 7056 7057 int amdgpu_device_baco_exit(struct amdgpu_device *adev) 7058 { 7059 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 7060 int ret = 0; 7061 7062 if (!amdgpu_device_supports_baco(adev)) 7063 return -ENOTSUPP; 7064 7065 ret = amdgpu_dpm_baco_exit(adev); 7066 if (ret) 7067 return ret; 7068 7069 if (ras && adev->ras_enabled && 7070 adev->nbio.funcs->enable_doorbell_interrupt) 7071 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 7072 7073 if (amdgpu_passthrough(adev) && adev->nbio.funcs && 7074 adev->nbio.funcs->clear_doorbell_interrupt) 7075 adev->nbio.funcs->clear_doorbell_interrupt(adev); 7076 7077 return 0; 7078 } 7079 7080 /** 7081 * amdgpu_pci_error_detected - Called when a PCI error is detected. 7082 * @pdev: PCI device struct 7083 * @state: PCI channel state 7084 * 7085 * Description: Called when a PCI error is detected. 7086 * 7087 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
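 *
 * Rough sequence as driven by the PCI error recovery core (assuming a frozen
 * channel): amdgpu_pci_error_detected() -> amdgpu_pci_slot_reset() ->
 * amdgpu_pci_resume(). For pci_channel_io_normal only the lighter
 * amdgpu_pci_mmio_enabled() callback runs and no slot reset is performed.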
7088 */ 7089 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 7090 { 7091 struct drm_device *dev = pci_get_drvdata(pdev); 7092 struct amdgpu_device *adev = drm_to_adev(dev); 7093 struct amdgpu_hive_info *hive __free(xgmi_put_hive) = 7094 amdgpu_get_xgmi_hive(adev); 7095 struct amdgpu_reset_context reset_context; 7096 struct list_head device_list; 7097 7098 dev_info(adev->dev, "PCI error: detected callback!!\n"); 7099 7100 adev->pci_channel_state = state; 7101 7102 switch (state) { 7103 case pci_channel_io_normal: 7104 dev_info(adev->dev, "pci_channel_io_normal: state(%d)!!\n", state); 7105 return PCI_ERS_RESULT_CAN_RECOVER; 7106 case pci_channel_io_frozen: 7107 /* Fatal error, prepare for slot reset */ 7108 dev_info(adev->dev, "pci_channel_io_frozen: state(%d)!!\n", state); 7109 if (hive) { 7110 /* Hive devices should be able to support FW based 7111 * link reset on other devices, if not return. 7112 */ 7113 if (!amdgpu_dpm_is_link_reset_supported(adev)) { 7114 dev_warn(adev->dev, 7115 "No support for XGMI hive yet...\n"); 7116 return PCI_ERS_RESULT_DISCONNECT; 7117 } 7118 /* Set dpc status only if device is part of hive 7119 * Non-hive devices should be able to recover after 7120 * link reset. 7121 */ 7122 amdgpu_reset_set_dpc_status(adev, true); 7123 7124 mutex_lock(&hive->hive_lock); 7125 } 7126 memset(&reset_context, 0, sizeof(reset_context)); 7127 INIT_LIST_HEAD(&device_list); 7128 7129 amdgpu_device_recovery_prepare(adev, &device_list, hive); 7130 amdgpu_device_recovery_get_reset_lock(adev, &device_list); 7131 amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list, 7132 hive, false); 7133 if (hive) 7134 mutex_unlock(&hive->hive_lock); 7135 return PCI_ERS_RESULT_NEED_RESET; 7136 case pci_channel_io_perm_failure: 7137 /* Permanent error, prepare for device removal */ 7138 dev_info(adev->dev, "pci_channel_io_perm_failure: state(%d)!!\n", state); 7139 return PCI_ERS_RESULT_DISCONNECT; 7140 } 7141 7142 return PCI_ERS_RESULT_NEED_RESET; 7143 } 7144 7145 /** 7146 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 7147 * @pdev: pointer to PCI device 7148 */ 7149 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 7150 { 7151 struct drm_device *dev = pci_get_drvdata(pdev); 7152 struct amdgpu_device *adev = drm_to_adev(dev); 7153 7154 dev_info(adev->dev, "PCI error: mmio enabled callback!!\n"); 7155 7156 /* TODO - dump whatever for debugging purposes */ 7157 7158 /* This called only if amdgpu_pci_error_detected returns 7159 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 7160 * works, no need to reset slot. 7161 */ 7162 7163 return PCI_ERS_RESULT_RECOVERED; 7164 } 7165 7166 /** 7167 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 7168 * @pdev: PCI device struct 7169 * 7170 * Description: This routine is called by the pci error recovery 7171 * code after the PCI slot has been reset, just before we 7172 * should resume normal operations. 
7173 */ 7174 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 7175 { 7176 struct drm_device *dev = pci_get_drvdata(pdev); 7177 struct amdgpu_device *adev = drm_to_adev(dev); 7178 struct amdgpu_reset_context reset_context; 7179 struct amdgpu_device *tmp_adev; 7180 struct amdgpu_hive_info *hive; 7181 struct list_head device_list; 7182 struct pci_dev *link_dev; 7183 int r = 0, i, timeout; 7184 u32 memsize; 7185 u16 status; 7186 7187 dev_info(adev->dev, "PCI error: slot reset callback!!\n"); 7188 7189 memset(&reset_context, 0, sizeof(reset_context)); 7190 7191 if (adev->pcie_reset_ctx.swus) 7192 link_dev = adev->pcie_reset_ctx.swus; 7193 else 7194 link_dev = adev->pdev; 7195 /* wait for asic to come out of reset, timeout = 10s */ 7196 timeout = 10000; 7197 do { 7198 usleep_range(10000, 10500); 7199 r = pci_read_config_word(link_dev, PCI_VENDOR_ID, &status); 7200 timeout -= 10; 7201 } while (timeout > 0 && (status != PCI_VENDOR_ID_ATI) && 7202 (status != PCI_VENDOR_ID_AMD)); 7203 7204 if ((status != PCI_VENDOR_ID_ATI) && (status != PCI_VENDOR_ID_AMD)) { 7205 r = -ETIME; 7206 goto out; 7207 } 7208 7209 amdgpu_device_load_switch_state(adev); 7210 /* Restore PCI confspace */ 7211 amdgpu_device_load_pci_state(pdev); 7212 7213 /* confirm ASIC came out of reset */ 7214 for (i = 0; i < adev->usec_timeout; i++) { 7215 memsize = amdgpu_asic_get_config_memsize(adev); 7216 7217 if (memsize != 0xffffffff) 7218 break; 7219 udelay(1); 7220 } 7221 if (memsize == 0xffffffff) { 7222 r = -ETIME; 7223 goto out; 7224 } 7225 7226 reset_context.method = AMD_RESET_METHOD_NONE; 7227 reset_context.reset_req_dev = adev; 7228 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 7229 set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags); 7230 INIT_LIST_HEAD(&device_list); 7231 7232 hive = amdgpu_get_xgmi_hive(adev); 7233 if (hive) { 7234 mutex_lock(&hive->hive_lock); 7235 reset_context.hive = hive; 7236 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 7237 tmp_adev->pcie_reset_ctx.in_link_reset = true; 7238 list_add_tail(&tmp_adev->reset_list, &device_list); 7239 } 7240 } else { 7241 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 7242 list_add_tail(&adev->reset_list, &device_list); 7243 } 7244 7245 r = amdgpu_device_asic_reset(adev, &device_list, &reset_context); 7246 out: 7247 if (!r) { 7248 if (amdgpu_device_cache_pci_state(adev->pdev)) 7249 pci_restore_state(adev->pdev); 7250 dev_info(adev->dev, "PCIe error recovery succeeded\n"); 7251 } else { 7252 dev_err(adev->dev, "PCIe error recovery failed, err:%d\n", r); 7253 if (hive) { 7254 list_for_each_entry(tmp_adev, &device_list, reset_list) 7255 amdgpu_device_unset_mp1_state(tmp_adev); 7256 } 7257 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 7258 } 7259 7260 if (hive) { 7261 mutex_unlock(&hive->hive_lock); 7262 amdgpu_put_xgmi_hive(hive); 7263 } 7264 7265 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 7266 } 7267 7268 /** 7269 * amdgpu_pci_resume() - resume normal ops after PCI reset 7270 * @pdev: pointer to PCI device 7271 * 7272 * Called when the error recovery driver tells us that its 7273 * OK to resume normal operation. 
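 * Only the pci_channel_io_frozen case needs any work here; for other channel
 * states the schedulers were never stopped, so there is nothing to resume.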
7274 */ 7275 void amdgpu_pci_resume(struct pci_dev *pdev) 7276 { 7277 struct drm_device *dev = pci_get_drvdata(pdev); 7278 struct amdgpu_device *adev = drm_to_adev(dev); 7279 struct list_head device_list; 7280 struct amdgpu_hive_info *hive = NULL; 7281 struct amdgpu_device *tmp_adev = NULL; 7282 7283 dev_info(adev->dev, "PCI error: resume callback!!\n"); 7284 7285 /* Only continue execution for the case of pci_channel_io_frozen */ 7286 if (adev->pci_channel_state != pci_channel_io_frozen) 7287 return; 7288 7289 INIT_LIST_HEAD(&device_list); 7290 7291 hive = amdgpu_get_xgmi_hive(adev); 7292 if (hive) { 7293 mutex_lock(&hive->hive_lock); 7294 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 7295 tmp_adev->pcie_reset_ctx.in_link_reset = false; 7296 list_add_tail(&tmp_adev->reset_list, &device_list); 7297 } 7298 } else 7299 list_add_tail(&adev->reset_list, &device_list); 7300 7301 amdgpu_device_sched_resume(&device_list, NULL, NULL); 7302 amdgpu_device_gpu_resume(adev, &device_list, false); 7303 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 7304 7305 if (hive) { 7306 mutex_unlock(&hive->hive_lock); 7307 amdgpu_put_xgmi_hive(hive); 7308 } 7309 } 7310 7311 static void amdgpu_device_cache_switch_state(struct amdgpu_device *adev) 7312 { 7313 struct pci_dev *swus, *swds; 7314 int r; 7315 7316 swds = pci_upstream_bridge(adev->pdev); 7317 if (!swds || swds->vendor != PCI_VENDOR_ID_ATI || 7318 pci_pcie_type(swds) != PCI_EXP_TYPE_DOWNSTREAM) 7319 return; 7320 swus = pci_upstream_bridge(swds); 7321 if (!swus || 7322 (swus->vendor != PCI_VENDOR_ID_ATI && 7323 swus->vendor != PCI_VENDOR_ID_AMD) || 7324 pci_pcie_type(swus) != PCI_EXP_TYPE_UPSTREAM) 7325 return; 7326 7327 /* If already saved, return */ 7328 if (adev->pcie_reset_ctx.swus) 7329 return; 7330 /* Upstream bridge is ATI, assume it's SWUS/DS architecture */ 7331 r = pci_save_state(swds); 7332 if (r) 7333 return; 7334 adev->pcie_reset_ctx.swds_pcistate = pci_store_saved_state(swds); 7335 7336 r = pci_save_state(swus); 7337 if (r) 7338 return; 7339 adev->pcie_reset_ctx.swus_pcistate = pci_store_saved_state(swus); 7340 7341 adev->pcie_reset_ctx.swus = swus; 7342 } 7343 7344 static void amdgpu_device_load_switch_state(struct amdgpu_device *adev) 7345 { 7346 struct pci_dev *pdev; 7347 int r; 7348 7349 if (!adev->pcie_reset_ctx.swds_pcistate || 7350 !adev->pcie_reset_ctx.swus_pcistate) 7351 return; 7352 7353 pdev = adev->pcie_reset_ctx.swus; 7354 r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swus_pcistate); 7355 if (!r) { 7356 pci_restore_state(pdev); 7357 } else { 7358 dev_warn(adev->dev, "Failed to load SWUS state, err:%d\n", r); 7359 return; 7360 } 7361 7362 pdev = pci_upstream_bridge(adev->pdev); 7363 r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swds_pcistate); 7364 if (!r) 7365 pci_restore_state(pdev); 7366 else 7367 dev_warn(adev->dev, "Failed to load SWDS state, err:%d\n", r); 7368 } 7369 7370 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 7371 { 7372 struct drm_device *dev = pci_get_drvdata(pdev); 7373 struct amdgpu_device *adev = drm_to_adev(dev); 7374 int r; 7375 7376 if (amdgpu_sriov_vf(adev)) 7377 return false; 7378 7379 r = pci_save_state(pdev); 7380 if (!r) { 7381 kfree(adev->pci_state); 7382 7383 adev->pci_state = pci_store_saved_state(pdev); 7384 7385 if (!adev->pci_state) { 7386 dev_err(adev->dev, "Failed to store PCI saved state"); 7387 return false; 7388 } 7389 } else { 7390 dev_warn(adev->dev, "Failed to save PCI state, err:%d\n", r); 7391 return false; 7392 } 7393 7394 
amdgpu_device_cache_switch_state(adev); 7395 7396 return true; 7397 } 7398 7399 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 7400 { 7401 struct drm_device *dev = pci_get_drvdata(pdev); 7402 struct amdgpu_device *adev = drm_to_adev(dev); 7403 int r; 7404 7405 if (!adev->pci_state) 7406 return false; 7407 7408 r = pci_load_saved_state(pdev, adev->pci_state); 7409 7410 if (!r) { 7411 pci_restore_state(pdev); 7412 } else { 7413 dev_warn(adev->dev, "Failed to load PCI state, err:%d\n", r); 7414 return false; 7415 } 7416 7417 return true; 7418 } 7419 7420 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 7421 struct amdgpu_ring *ring) 7422 { 7423 #ifdef CONFIG_X86_64 7424 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 7425 return; 7426 #endif 7427 if (adev->gmc.xgmi.connected_to_cpu) 7428 return; 7429 7430 if (ring && ring->funcs->emit_hdp_flush) { 7431 amdgpu_ring_emit_hdp_flush(ring); 7432 return; 7433 } 7434 7435 if (!ring && amdgpu_sriov_runtime(adev)) { 7436 if (!amdgpu_kiq_hdp_flush(adev)) 7437 return; 7438 } 7439 7440 amdgpu_hdp_flush(adev, ring); 7441 } 7442 7443 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 7444 struct amdgpu_ring *ring) 7445 { 7446 #ifdef CONFIG_X86_64 7447 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 7448 return; 7449 #endif 7450 if (adev->gmc.xgmi.connected_to_cpu) 7451 return; 7452 7453 amdgpu_hdp_invalidate(adev, ring); 7454 } 7455 7456 int amdgpu_in_reset(struct amdgpu_device *adev) 7457 { 7458 return atomic_read(&adev->reset_domain->in_gpu_reset); 7459 } 7460 7461 /** 7462 * amdgpu_device_halt() - bring hardware to some kind of halt state 7463 * 7464 * @adev: amdgpu_device pointer 7465 * 7466 * Bring hardware to some kind of halt state so that no one can touch it 7467 * any more. It helps to preserve the error context when an error occurs. 7468 * Compared to a simple hang, the system will stay stable at least for SSH 7469 * access. Then it should be trivial to inspect the hardware state and 7470 * see what's going on. Implemented as follows: 7471 * 7472 * 1. drm_dev_unplug() makes device inaccessible to user space (IOCTLs, etc.), 7473 * clears all CPU mappings to the device, disallows remappings through page faults 7474 * 2. amdgpu_irq_disable_all() disables all interrupts 7475 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences 7476 * 4. set adev->no_hw_access to avoid potential crashes after step 5 7477 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings 7478 * 6.

void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
			     struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush) {
		amdgpu_ring_emit_hdp_flush(ring);
		return;
	}

	if (!ring && amdgpu_sriov_runtime(adev)) {
		if (!amdgpu_kiq_hdp_flush(adev))
			return;
	}

	amdgpu_hdp_flush(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
				  struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_hdp_invalidate(adev, ring);
}

int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. It helps to maintain error context when an error occurs.
 * Compared to a simple hang, the system will stay stable at least for SSH
 * access. Then it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs,
 *    etc.), clears all CPU mappings to the device and disallows remappings
 *    through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in-flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	amdgpu_xcp_dev_unplug(adev);
	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				 u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				  u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
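
/*
 * Editor's sketch (hypothetical register and bit, not driver code): the two
 * accessors above implement the usual index/data pair protocol for PCIe
 * port registers, so a read-modify-write looks like:
 *
 *	u32 val = amdgpu_device_pcie_port_rreg(adev, port_reg);
 *	amdgpu_device_pcie_port_wreg(adev, port_reg, val | SOME_BIT);
 *
 * Both accessors take adev->pcie_idx_lock, but the index register is
 * reprogrammed on every call, so the read and the write above are only
 * individually atomic, not atomic as a pair.
 */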

/**
 * amdgpu_device_get_gang - return a reference to the current gang
 * @adev: amdgpu_device pointer
 *
 * Returns: A new reference to the current gang leader.
 */
struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev)
{
	struct dma_fence *fence;

	rcu_read_lock();
	fence = dma_fence_get_rcu_safe(&adev->gang_submit);
	rcu_read_unlock();
	return fence;
}

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang, or a reference to the
 * current gang leader otherwise.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	dma_fence_get(gang);
	do {
		dma_fence_put(old);
		old = amdgpu_device_get_gang(adev);
		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old)) {
			dma_fence_put(gang);
			return old;
		}

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	/*
	 * Drop it once for the exchanged reference in adev and once for the
	 * thread local reference acquired in amdgpu_device_get_gang().
	 */
	dma_fence_put(old);
	dma_fence_put(old);
	return NULL;
}

/**
 * amdgpu_device_enforce_isolation - enforce HW isolation
 * @adev: the amdgpu device pointer
 * @ring: the HW ring the job is supposed to run on
 * @job: the job which is about to be pushed to the HW ring
 *
 * Makes sure that only one client at a time can use the GFX block.
 * Returns: The dependency to wait on before the job can be pushed to the HW.
 * The function is called multiple times until NULL is returned.
 */
struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev,
						  struct amdgpu_ring *ring,
						  struct amdgpu_job *job)
{
	struct amdgpu_isolation *isolation = &adev->isolation[ring->xcp_id];
	struct drm_sched_fence *f = job->base.s_fence;
	struct dma_fence *dep;
	void *owner;
	int r;

	/*
	 * For now enforce isolation only for the GFX block since we only need
	 * the cleaner shader on those rings.
	 */
	if (ring->funcs->type != AMDGPU_RING_TYPE_GFX &&
	    ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE)
		return NULL;

	/*
	 * All submissions where enforce isolation is false are handled as if
	 * they come from a single client. Use ~0l as the owner to distinguish
	 * them from kernel submissions where the owner is NULL.
	 */
	owner = job->enforce_isolation ? f->owner : (void *)~0l;

	mutex_lock(&adev->enforce_isolation_mutex);

	/*
	 * The "spearhead" submission is the first one which changes the
	 * ownership to its client. We always need to wait for it to be
	 * pushed to the HW before proceeding with anything.
	 */
	if (&f->scheduled != isolation->spearhead &&
	    !dma_fence_is_signaled(isolation->spearhead)) {
		dep = isolation->spearhead;
		goto out_grab_ref;
	}

	if (isolation->owner != owner) {

		/*
		 * Wait for any gang to be assembled before switching to a
		 * different owner or otherwise we could deadlock the
		 * submissions.
		 */
		if (!job->gang_submit) {
			dep = amdgpu_device_get_gang(adev);
			if (!dma_fence_is_signaled(dep))
				goto out_return_dep;
			dma_fence_put(dep);
		}

		dma_fence_put(isolation->spearhead);
		isolation->spearhead = dma_fence_get(&f->scheduled);
		amdgpu_sync_move(&isolation->active, &isolation->prev);
		trace_amdgpu_isolation(isolation->owner, owner);
		isolation->owner = owner;
	}

	/*
	 * Specifying the ring here helps to pipeline submissions even when
	 * isolation is enabled. If that is not desired for testing, NULL can
	 * be used instead of the ring to enforce a CPU round trip while
	 * switching between clients.
	 */
	dep = amdgpu_sync_peek_fence(&isolation->prev, ring);
	r = amdgpu_sync_fence(&isolation->active, &f->finished, GFP_NOWAIT);
	if (r)
		dev_warn(adev->dev, "OOM tracking isolation\n");

out_grab_ref:
	dma_fence_get(dep);
out_return_dep:
	mutex_unlock(&adev->enforce_isolation_mutex);
	return dep;
}
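
/*
 * Editor's sketch (hypothetical caller, not driver code): as the kernel-doc
 * above notes, amdgpu_device_enforce_isolation() is queried repeatedly until
 * it stops returning dependencies, roughly:
 *
 *	while ((dep = amdgpu_device_enforce_isolation(adev, ring, job))) {
 *		... wait for or schedule against dep ...
 *		dma_fence_put(dep);
 *	}
 *
 * Each non-NULL return carries a reference, so the caller is expected to
 * drop it once the dependency has been handled.
 */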

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
				    uint32_t inst, uint32_t reg_addr, char reg_name[],
				    uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			dev_warn(
				adev->dev,
				"Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				inst, reg_name, (uint32_t)expected_value,
				(uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}

ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
{
	ssize_t size = 0;

	if (!ring || !ring->adev)
		return size;

	if (amdgpu_device_should_recover_gpu(ring->adev))
		size |= AMDGPU_RESET_TYPE_FULL;

	if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
	    !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery)
		size |= AMDGPU_RESET_TYPE_SOFT_RESET;

	return size;
}

ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
{
	ssize_t size = 0;

	if (supported_reset == 0) {
		size += sysfs_emit_at(buf, size, "unsupported");
		size += sysfs_emit_at(buf, size, "\n");
		return size;
	}

	if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
		size += sysfs_emit_at(buf, size, "soft ");

	if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
		size += sysfs_emit_at(buf, size, "queue ");

	if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)
		size += sysfs_emit_at(buf, size, "pipe ");

	if (supported_reset & AMDGPU_RESET_TYPE_FULL)
		size += sysfs_emit_at(buf, size, "full ");

	size += sysfs_emit_at(buf, size, "\n");
	return size;
}
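
/*
 * Editor's note (illustrative output, not driver code): a sysfs show handler
 * that passes amdgpu_show_reset_mask() a mask with all four reset bits set
 * would emit the single line
 *
 *	soft queue pipe full
 *
 * while a mask of 0 prints "unsupported".
 */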

void amdgpu_device_set_uid(struct amdgpu_uid *uid_info,
			   enum amdgpu_uid_type type, uint8_t inst,
			   uint64_t uid)
{
	if (!uid_info)
		return;

	if (type >= AMDGPU_UID_TYPE_MAX) {
		dev_err_once(uid_info->adev->dev, "Invalid UID type %d\n",
			     type);
		return;
	}

	if (inst >= AMDGPU_UID_INST_MAX) {
		dev_err_once(uid_info->adev->dev, "Invalid UID instance %d\n",
			     inst);
		return;
	}

	if (uid_info->uid[type][inst] != 0) {
		dev_warn_once(
			uid_info->adev->dev,
			"Overwriting existing UID %llu for type %d instance %d\n",
			uid_info->uid[type][inst], type, inst);
	}

	uid_info->uid[type][inst] = uid;
}

u64 amdgpu_device_get_uid(struct amdgpu_uid *uid_info,
			  enum amdgpu_uid_type type, uint8_t inst)
{
	if (!uid_info)
		return 0;

	if (type >= AMDGPU_UID_TYPE_MAX) {
		dev_err_once(uid_info->adev->dev, "Invalid UID type %d\n",
			     type);
		return 0;
	}

	if (inst >= AMDGPU_UID_INST_MAX) {
		dev_err_once(uid_info->adev->dev, "Invalid UID instance %d\n",
			     inst);
		return 0;
	}

	return uid_info->uid[type][inst];
}
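
/*
 * Editor's sketch (hypothetical uid_info, type and value, not driver code):
 * an IP block that has read a hardware unique id could record it once and
 * query it later, e.g.
 *
 *	amdgpu_device_set_uid(uid_info, <some amdgpu_uid_type>, 0, hw_uid);
 *	...
 *	uid = amdgpu_device_get_uid(uid_info, <some amdgpu_uid_type>, 0);
 *
 * Out-of-range types or instances are rejected with a one-time error, and
 * setting a slot that already holds a non-zero UID only warns before
 * overwriting it.
 */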