1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 29 #include <linux/aperture.h> 30 #include <linux/power_supply.h> 31 #include <linux/kthread.h> 32 #include <linux/module.h> 33 #include <linux/console.h> 34 #include <linux/slab.h> 35 #include <linux/iommu.h> 36 #include <linux/pci.h> 37 #include <linux/pci-p2pdma.h> 38 #include <linux/apple-gmux.h> 39 40 #include <drm/drm_atomic_helper.h> 41 #include <drm/drm_client_event.h> 42 #include <drm/drm_crtc_helper.h> 43 #include <drm/drm_probe_helper.h> 44 #include <drm/amdgpu_drm.h> 45 #include <linux/device.h> 46 #include <linux/vgaarb.h> 47 #include <linux/vga_switcheroo.h> 48 #include <linux/efi.h> 49 #include "amdgpu.h" 50 #include "amdgpu_trace.h" 51 #include "amdgpu_i2c.h" 52 #include "atom.h" 53 #include "amdgpu_atombios.h" 54 #include "amdgpu_atomfirmware.h" 55 #include "amd_pcie.h" 56 #ifdef CONFIG_DRM_AMDGPU_SI 57 #include "si.h" 58 #endif 59 #ifdef CONFIG_DRM_AMDGPU_CIK 60 #include "cik.h" 61 #endif 62 #include "vi.h" 63 #include "soc15.h" 64 #include "nv.h" 65 #include "bif/bif_4_1_d.h" 66 #include <linux/firmware.h> 67 #include "amdgpu_vf_error.h" 68 69 #include "amdgpu_amdkfd.h" 70 #include "amdgpu_pm.h" 71 72 #include "amdgpu_xgmi.h" 73 #include "amdgpu_ras.h" 74 #include "amdgpu_pmu.h" 75 #include "amdgpu_fru_eeprom.h" 76 #include "amdgpu_reset.h" 77 #include "amdgpu_virt.h" 78 #include "amdgpu_dev_coredump.h" 79 80 #include <linux/suspend.h> 81 #include <drm/task_barrier.h> 82 #include <linux/pm_runtime.h> 83 84 #include <drm/drm_drv.h> 85 86 #if IS_ENABLED(CONFIG_X86) 87 #include <asm/intel-family.h> 88 #include <asm/cpu_device_id.h> 89 #endif 90 91 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 92 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 93 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 94 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 95 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 96 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 97 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 98 MODULE_FIRMWARE("amdgpu/cyan_skillfish_gpu_info.bin"); 99 100 #define AMDGPU_RESUME_MS 2000 101 #define AMDGPU_MAX_RETRY_LIMIT 2 102 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 103 #define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2) 104 #define 
AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2) 105 #define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2) 106 107 #define AMDGPU_VBIOS_SKIP (1U << 0) 108 #define AMDGPU_VBIOS_OPTIONAL (1U << 1) 109 110 static const struct drm_driver amdgpu_kms_driver; 111 112 const char *amdgpu_asic_name[] = { 113 "TAHITI", 114 "PITCAIRN", 115 "VERDE", 116 "OLAND", 117 "HAINAN", 118 "BONAIRE", 119 "KAVERI", 120 "KABINI", 121 "HAWAII", 122 "MULLINS", 123 "TOPAZ", 124 "TONGA", 125 "FIJI", 126 "CARRIZO", 127 "STONEY", 128 "POLARIS10", 129 "POLARIS11", 130 "POLARIS12", 131 "VEGAM", 132 "VEGA10", 133 "VEGA12", 134 "VEGA20", 135 "RAVEN", 136 "ARCTURUS", 137 "RENOIR", 138 "ALDEBARAN", 139 "NAVI10", 140 "CYAN_SKILLFISH", 141 "NAVI14", 142 "NAVI12", 143 "SIENNA_CICHLID", 144 "NAVY_FLOUNDER", 145 "VANGOGH", 146 "DIMGREY_CAVEFISH", 147 "BEIGE_GOBY", 148 "YELLOW_CARP", 149 "IP DISCOVERY", 150 "LAST", 151 }; 152 153 #define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0) 154 /* 155 * Default init level where all blocks are expected to be initialized. This is 156 * the level of initialization expected by default and also after a full reset 157 * of the device. 158 */ 159 struct amdgpu_init_level amdgpu_init_default = { 160 .level = AMDGPU_INIT_LEVEL_DEFAULT, 161 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL, 162 }; 163 164 struct amdgpu_init_level amdgpu_init_recovery = { 165 .level = AMDGPU_INIT_LEVEL_RESET_RECOVERY, 166 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL, 167 }; 168 169 /* 170 * Minimal blocks needed to be initialized before a XGMI hive can be reset. This 171 * is used for cases like reset on initialization where the entire hive needs to 172 * be reset before first use. 173 */ 174 struct amdgpu_init_level amdgpu_init_minimal_xgmi = { 175 .level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI, 176 .hwini_ip_block_mask = 177 BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) | 178 BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) | 179 BIT(AMD_IP_BLOCK_TYPE_PSP) 180 }; 181 182 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev); 183 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev); 184 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev); 185 186 static void amdgpu_device_load_switch_state(struct amdgpu_device *adev); 187 188 static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev, 189 enum amd_ip_block_type block) 190 { 191 return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0; 192 } 193 194 void amdgpu_set_init_level(struct amdgpu_device *adev, 195 enum amdgpu_init_lvl_id lvl) 196 { 197 switch (lvl) { 198 case AMDGPU_INIT_LEVEL_MINIMAL_XGMI: 199 adev->init_lvl = &amdgpu_init_minimal_xgmi; 200 break; 201 case AMDGPU_INIT_LEVEL_RESET_RECOVERY: 202 adev->init_lvl = &amdgpu_init_recovery; 203 break; 204 case AMDGPU_INIT_LEVEL_DEFAULT: 205 fallthrough; 206 default: 207 adev->init_lvl = &amdgpu_init_default; 208 break; 209 } 210 } 211 212 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev); 213 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 214 void *data); 215 216 /** 217 * DOC: pcie_replay_count 218 * 219 * The amdgpu driver provides a sysfs API for reporting the total number 220 * of PCIe replays (NAKs). 221 * The file pcie_replay_count is used for this and returns the total 222 * number of replays as a sum of the NAKs generated and NAKs received. 
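 *
 * A minimal userspace sketch of how the value can be read; the sysfs path
 * assumes the GPU is card0 and is only an illustration, not something taken
 * from this file:
 *
 *   FILE *f = fopen("/sys/class/drm/card0/device/pcie_replay_count", "r");
 *   unsigned long long replays = 0;
 *
 *   if (f) {
 *       if (fscanf(f, "%llu", &replays) != 1)
 *           replays = 0;
 *       fclose(f);
 *   }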
223 */ 224 225 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 226 struct device_attribute *attr, char *buf) 227 { 228 struct drm_device *ddev = dev_get_drvdata(dev); 229 struct amdgpu_device *adev = drm_to_adev(ddev); 230 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 231 232 return sysfs_emit(buf, "%llu\n", cnt); 233 } 234 235 static DEVICE_ATTR(pcie_replay_count, 0444, 236 amdgpu_device_get_pcie_replay_count, NULL); 237 238 static int amdgpu_device_attr_sysfs_init(struct amdgpu_device *adev) 239 { 240 int ret = 0; 241 242 if (amdgpu_nbio_is_replay_cnt_supported(adev)) 243 ret = sysfs_create_file(&adev->dev->kobj, 244 &dev_attr_pcie_replay_count.attr); 245 246 return ret; 247 } 248 249 static void amdgpu_device_attr_sysfs_fini(struct amdgpu_device *adev) 250 { 251 if (amdgpu_nbio_is_replay_cnt_supported(adev)) 252 sysfs_remove_file(&adev->dev->kobj, 253 &dev_attr_pcie_replay_count.attr); 254 } 255 256 static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj, 257 const struct bin_attribute *attr, char *buf, 258 loff_t ppos, size_t count) 259 { 260 struct device *dev = kobj_to_dev(kobj); 261 struct drm_device *ddev = dev_get_drvdata(dev); 262 struct amdgpu_device *adev = drm_to_adev(ddev); 263 ssize_t bytes_read; 264 265 switch (ppos) { 266 case AMDGPU_SYS_REG_STATE_XGMI: 267 bytes_read = amdgpu_asic_get_reg_state( 268 adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count); 269 break; 270 case AMDGPU_SYS_REG_STATE_WAFL: 271 bytes_read = amdgpu_asic_get_reg_state( 272 adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count); 273 break; 274 case AMDGPU_SYS_REG_STATE_PCIE: 275 bytes_read = amdgpu_asic_get_reg_state( 276 adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count); 277 break; 278 case AMDGPU_SYS_REG_STATE_USR: 279 bytes_read = amdgpu_asic_get_reg_state( 280 adev, AMDGPU_REG_STATE_TYPE_USR, buf, count); 281 break; 282 case AMDGPU_SYS_REG_STATE_USR_1: 283 bytes_read = amdgpu_asic_get_reg_state( 284 adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count); 285 break; 286 default: 287 return -EINVAL; 288 } 289 290 return bytes_read; 291 } 292 293 static const BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL, 294 AMDGPU_SYS_REG_STATE_END); 295 296 int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev) 297 { 298 int ret; 299 300 if (!amdgpu_asic_get_reg_state_supported(adev)) 301 return 0; 302 303 ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 304 305 return ret; 306 } 307 308 void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev) 309 { 310 if (!amdgpu_asic_get_reg_state_supported(adev)) 311 return; 312 sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 313 } 314 315 int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block) 316 { 317 int r; 318 319 if (ip_block->version->funcs->suspend) { 320 r = ip_block->version->funcs->suspend(ip_block); 321 if (r) { 322 dev_err(ip_block->adev->dev, 323 "suspend of IP block <%s> failed %d\n", 324 ip_block->version->funcs->name, r); 325 return r; 326 } 327 } 328 329 ip_block->status.hw = false; 330 return 0; 331 } 332 333 int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block) 334 { 335 int r; 336 337 if (ip_block->version->funcs->resume) { 338 r = ip_block->version->funcs->resume(ip_block); 339 if (r) { 340 dev_err(ip_block->adev->dev, 341 "resume of IP block <%s> failed %d\n", 342 ip_block->version->funcs->name, r); 343 return r; 344 } 345 } 346 347 ip_block->status.hw = true; 348 return 0; 349 } 350 351 /** 352 * DOC: board_info 353 * 354 * The amdgpu driver provides a sysfs 
API for giving board related information. 355 * It provides the form factor information in the format 356 * 357 * type : form factor 358 * 359 * Possible form factor values 360 * 361 * - "cem" - PCIE CEM card 362 * - "oam" - Open Compute Accelerator Module 363 * - "unknown" - Not known 364 * 365 */ 366 367 static ssize_t amdgpu_device_get_board_info(struct device *dev, 368 struct device_attribute *attr, 369 char *buf) 370 { 371 struct drm_device *ddev = dev_get_drvdata(dev); 372 struct amdgpu_device *adev = drm_to_adev(ddev); 373 enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM; 374 const char *pkg; 375 376 if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type) 377 pkg_type = adev->smuio.funcs->get_pkg_type(adev); 378 379 switch (pkg_type) { 380 case AMDGPU_PKG_TYPE_CEM: 381 pkg = "cem"; 382 break; 383 case AMDGPU_PKG_TYPE_OAM: 384 pkg = "oam"; 385 break; 386 default: 387 pkg = "unknown"; 388 break; 389 } 390 391 return sysfs_emit(buf, "%s : %s\n", "type", pkg); 392 } 393 394 static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL); 395 396 static struct attribute *amdgpu_board_attrs[] = { 397 &dev_attr_board_info.attr, 398 NULL, 399 }; 400 401 static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj, 402 struct attribute *attr, int n) 403 { 404 struct device *dev = kobj_to_dev(kobj); 405 struct drm_device *ddev = dev_get_drvdata(dev); 406 struct amdgpu_device *adev = drm_to_adev(ddev); 407 408 if (adev->flags & AMD_IS_APU) 409 return 0; 410 411 return attr->mode; 412 } 413 414 static const struct attribute_group amdgpu_board_attrs_group = { 415 .attrs = amdgpu_board_attrs, 416 .is_visible = amdgpu_board_attrs_is_visible 417 }; 418 419 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 420 421 /** 422 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 423 * 424 * @adev: amdgpu device pointer 425 * 426 * Returns true if the device is a dGPU with ATPX power control, 427 * otherwise return false. 428 */ 429 bool amdgpu_device_supports_px(struct amdgpu_device *adev) 430 { 431 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 432 return true; 433 return false; 434 } 435 436 /** 437 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 438 * 439 * @adev: amdgpu device pointer 440 * 441 * Returns true if the device is a dGPU with ACPI power control, 442 * otherwise return false. 443 */ 444 bool amdgpu_device_supports_boco(struct amdgpu_device *adev) 445 { 446 if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE)) 447 return false; 448 449 if (adev->has_pr3 || 450 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 451 return true; 452 return false; 453 } 454 455 /** 456 * amdgpu_device_supports_baco - Does the device support BACO 457 * 458 * @adev: amdgpu device pointer 459 * 460 * Return: 461 * 1 if the device supports BACO; 462 * 3 if the device supports MACO (only works if BACO is supported) 463 * otherwise return 0. 
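 *
 * A rough illustrative sketch of how callers interpret the value (the helper
 * names used here are placeholders; amdgpu_device_detect_runtime_pm_mode()
 * below does the real thing):
 *
 *   int support = amdgpu_device_supports_baco(adev);
 *
 *   if (support & MACO_SUPPORT)        // 3: BACO plus MACO ("BAMACO")
 *       pick_bamaco();
 *   else if (support == BACO_SUPPORT)  // 1: plain BACO
 *       pick_baco();
 *   else                               // 0: neither
 *       pick_none();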
 */
int amdgpu_device_supports_baco(struct amdgpu_device *adev)
{
	return amdgpu_asic_supports_baco(adev);
}

void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
{
	int bamaco_support;

	adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
	bamaco_support = amdgpu_device_supports_baco(adev);

	switch (amdgpu_runtime_pm) {
	case 2:
		if (bamaco_support & MACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
			dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
		} else if (bamaco_support == BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Requested mode BAMACO not available, falling back to BACO\n");
		}
		break;
	case 1:
		if (bamaco_support & BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Forcing BACO for runtime pm\n");
		}
		break;
	case -1:
	case -2:
		if (amdgpu_device_supports_px(adev)) {
			/* enable PX as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
			dev_info(adev->dev, "Using ATPX for runtime pm\n");
		} else if (amdgpu_device_supports_boco(adev)) {
			/* enable boco as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
			dev_info(adev->dev, "Using BOCO for runtime pm\n");
		} else {
			if (!bamaco_support)
				goto no_runtime_pm;

			switch (adev->asic_type) {
			case CHIP_VEGA20:
			case CHIP_ARCTURUS:
				/* BACO is not supported on vega20 and arcturus */
				break;
			case CHIP_VEGA10:
				/* enable BACO as runpm mode if noretry=0 */
				if (!adev->gmc.noretry && !amdgpu_passthrough(adev))
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			default:
				/* enable BACO as runpm mode on CI+ */
				if (!amdgpu_passthrough(adev))
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			}

			if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
				if (bamaco_support & MACO_SUPPORT) {
					adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
					dev_info(adev->dev, "Using BAMACO for runtime pm\n");
				} else {
					dev_info(adev->dev, "Using BACO for runtime pm\n");
				}
			}
		}
		break;
	case 0:
		dev_info(adev->dev, "runtime pm is manually disabled\n");
		break;
	default:
		break;
	}

no_runtime_pm:
	if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
		dev_info(adev->dev, "Runtime PM not available\n");
}
/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * smart shift support
 *
 * @adev: amdgpu device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
553 */ 554 bool amdgpu_device_supports_smart_shift(struct amdgpu_device *adev) 555 { 556 return (amdgpu_device_supports_boco(adev) && 557 amdgpu_acpi_is_power_shift_control_supported()); 558 } 559 560 /* 561 * VRAM access helper functions 562 */ 563 564 /** 565 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA 566 * 567 * @adev: amdgpu_device pointer 568 * @pos: offset of the buffer in vram 569 * @buf: virtual address of the buffer in system memory 570 * @size: read/write size, sizeof(@buf) must > @size 571 * @write: true - write to vram, otherwise - read from vram 572 */ 573 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos, 574 void *buf, size_t size, bool write) 575 { 576 unsigned long flags; 577 uint32_t hi = ~0, tmp = 0; 578 uint32_t *data = buf; 579 uint64_t last; 580 int idx; 581 582 if (!drm_dev_enter(adev_to_drm(adev), &idx)) 583 return; 584 585 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4)); 586 587 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 588 for (last = pos + size; pos < last; pos += 4) { 589 tmp = pos >> 31; 590 591 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000); 592 if (tmp != hi) { 593 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp); 594 hi = tmp; 595 } 596 if (write) 597 WREG32_NO_KIQ(mmMM_DATA, *data++); 598 else 599 *data++ = RREG32_NO_KIQ(mmMM_DATA); 600 } 601 602 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 603 drm_dev_exit(idx); 604 } 605 606 /** 607 * amdgpu_device_aper_access - access vram by vram aperture 608 * 609 * @adev: amdgpu_device pointer 610 * @pos: offset of the buffer in vram 611 * @buf: virtual address of the buffer in system memory 612 * @size: read/write size, sizeof(@buf) must > @size 613 * @write: true - write to vram, otherwise - read from vram 614 * 615 * The return value means how many bytes have been transferred. 
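 *
 * A short illustrative sketch: the helper only covers the CPU-visible part of
 * VRAM, so callers have to cope with a partial transfer, mirroring what
 * amdgpu_device_vram_access() below does:
 *
 *   size_t done = amdgpu_device_aper_access(adev, pos, buf, size, write);
 *
 *   if (done < size)
 *       amdgpu_device_mm_access(adev, pos + done, buf + done,
 *                               size - done, write);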
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must > @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try to use the VRAM aperture to access VRAM first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM access for the rest of VRAM */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
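 *
 * A brief illustrative sketch: offsets inside the MMIO BAR are read directly
 * (or through KIQ when running as an SR-IOV VF at runtime), anything beyond
 * rmmio_size goes through the indirect PCIE interface, so callers simply do:
 *
 *   u32 v       = amdgpu_device_rreg(adev, reg, 0);
 *   u32 v_nokiq = amdgpu_device_rreg(adev, reg, AMDGPU_REGS_NO_KIQ);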
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with byte helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}


/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
				uint32_t reg, uint32_t acc_flags,
				uint32_t xcc_id)
{
	uint32_t ret, rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	return ret;
}

/*
 * MMIO register write with byte helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
829 */ 830 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) 831 { 832 if (amdgpu_device_skip_hw_access(adev)) 833 return; 834 835 if (offset < adev->rmmio_size) 836 writeb(value, adev->rmmio + offset); 837 else 838 BUG(); 839 } 840 841 /** 842 * amdgpu_device_wreg - write to a memory mapped IO or indirect register 843 * 844 * @adev: amdgpu_device pointer 845 * @reg: dword aligned register offset 846 * @v: 32 bit value to write to the register 847 * @acc_flags: access flags which require special behavior 848 * 849 * Writes the value specified to the offset specified. 850 */ 851 void amdgpu_device_wreg(struct amdgpu_device *adev, 852 uint32_t reg, uint32_t v, 853 uint32_t acc_flags) 854 { 855 if (amdgpu_device_skip_hw_access(adev)) 856 return; 857 858 if ((reg * 4) < adev->rmmio_size) { 859 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 860 amdgpu_sriov_runtime(adev) && 861 down_read_trylock(&adev->reset_domain->sem)) { 862 amdgpu_kiq_wreg(adev, reg, v, 0); 863 up_read(&adev->reset_domain->sem); 864 } else { 865 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 866 } 867 } else { 868 adev->pcie_wreg(adev, reg * 4, v); 869 } 870 871 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 872 } 873 874 /** 875 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 876 * 877 * @adev: amdgpu_device pointer 878 * @reg: mmio/rlc register 879 * @v: value to write 880 * @xcc_id: xcc accelerated compute core id 881 * 882 * this function is invoked only for the debugfs register access 883 */ 884 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 885 uint32_t reg, uint32_t v, 886 uint32_t xcc_id) 887 { 888 if (amdgpu_device_skip_hw_access(adev)) 889 return; 890 891 if (amdgpu_sriov_fullaccess(adev) && 892 adev->gfx.rlc.funcs && 893 adev->gfx.rlc.funcs->is_rlcg_access_range) { 894 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 895 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id); 896 } else if ((reg * 4) >= adev->rmmio_size) { 897 adev->pcie_wreg(adev, reg * 4, v); 898 } else { 899 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 900 } 901 } 902 903 /** 904 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC 905 * 906 * @adev: amdgpu_device pointer 907 * @reg: dword aligned register offset 908 * @v: 32 bit value to write to the register 909 * @acc_flags: access flags which require special behavior 910 * @xcc_id: xcc accelerated compute core id 911 * 912 * Writes the value specified to the offset specified. 
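 *
 * A short illustrative sketch: on SR-IOV VFs the write is routed through the
 * RLCG interface when amdgpu_virt_get_rlcg_reg_access_flag() reports that the
 * register needs it; otherwise it behaves like amdgpu_device_wreg() but is
 * steered to one compute partition:
 *
 *   amdgpu_device_xcc_wreg(adev, reg, val, 0, xcc_id);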
913 */ 914 void amdgpu_device_xcc_wreg(struct amdgpu_device *adev, 915 uint32_t reg, uint32_t v, 916 uint32_t acc_flags, uint32_t xcc_id) 917 { 918 uint32_t rlcg_flag; 919 920 if (amdgpu_device_skip_hw_access(adev)) 921 return; 922 923 if ((reg * 4) < adev->rmmio_size) { 924 if (amdgpu_sriov_vf(adev) && 925 !amdgpu_sriov_runtime(adev) && 926 adev->gfx.rlc.rlcg_reg_access_supported && 927 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, 928 GC_HWIP, true, 929 &rlcg_flag)) { 930 amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id)); 931 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 932 amdgpu_sriov_runtime(adev) && 933 down_read_trylock(&adev->reset_domain->sem)) { 934 amdgpu_kiq_wreg(adev, reg, v, xcc_id); 935 up_read(&adev->reset_domain->sem); 936 } else { 937 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 938 } 939 } else { 940 adev->pcie_wreg(adev, reg * 4, v); 941 } 942 } 943 944 /** 945 * amdgpu_device_indirect_rreg - read an indirect register 946 * 947 * @adev: amdgpu_device pointer 948 * @reg_addr: indirect register address to read from 949 * 950 * Returns the value of indirect register @reg_addr 951 */ 952 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 953 u32 reg_addr) 954 { 955 unsigned long flags, pcie_index, pcie_data; 956 void __iomem *pcie_index_offset; 957 void __iomem *pcie_data_offset; 958 u32 r; 959 960 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 961 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 962 963 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 964 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 965 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 966 967 writel(reg_addr, pcie_index_offset); 968 readl(pcie_index_offset); 969 r = readl(pcie_data_offset); 970 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 971 972 return r; 973 } 974 975 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev, 976 u64 reg_addr) 977 { 978 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 979 u32 r; 980 void __iomem *pcie_index_offset; 981 void __iomem *pcie_index_hi_offset; 982 void __iomem *pcie_data_offset; 983 984 if (unlikely(!adev->nbio.funcs)) { 985 pcie_index = AMDGPU_PCIE_INDEX_FALLBACK; 986 pcie_data = AMDGPU_PCIE_DATA_FALLBACK; 987 } else { 988 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 989 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 990 } 991 992 if (reg_addr >> 32) { 993 if (unlikely(!adev->nbio.funcs)) 994 pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK; 995 else 996 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 997 } else { 998 pcie_index_hi = 0; 999 } 1000 1001 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1002 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1003 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1004 if (pcie_index_hi != 0) 1005 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1006 pcie_index_hi * 4; 1007 1008 writel(reg_addr, pcie_index_offset); 1009 readl(pcie_index_offset); 1010 if (pcie_index_hi != 0) { 1011 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1012 readl(pcie_index_hi_offset); 1013 } 1014 r = readl(pcie_data_offset); 1015 1016 /* clear the high bits */ 1017 if (pcie_index_hi != 0) { 1018 writel(0, pcie_index_hi_offset); 1019 readl(pcie_index_hi_offset); 1020 } 1021 1022 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1023 1024 return r; 1025 } 1026 1027 /** 1028 * amdgpu_device_indirect_rreg64 - read a 64bits indirect 
register 1029 * 1030 * @adev: amdgpu_device pointer 1031 * @reg_addr: indirect register address to read from 1032 * 1033 * Returns the value of indirect register @reg_addr 1034 */ 1035 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 1036 u32 reg_addr) 1037 { 1038 unsigned long flags, pcie_index, pcie_data; 1039 void __iomem *pcie_index_offset; 1040 void __iomem *pcie_data_offset; 1041 u64 r; 1042 1043 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1044 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1045 1046 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1047 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1048 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1049 1050 /* read low 32 bits */ 1051 writel(reg_addr, pcie_index_offset); 1052 readl(pcie_index_offset); 1053 r = readl(pcie_data_offset); 1054 /* read high 32 bits */ 1055 writel(reg_addr + 4, pcie_index_offset); 1056 readl(pcie_index_offset); 1057 r |= ((u64)readl(pcie_data_offset) << 32); 1058 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1059 1060 return r; 1061 } 1062 1063 u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev, 1064 u64 reg_addr) 1065 { 1066 unsigned long flags, pcie_index, pcie_data; 1067 unsigned long pcie_index_hi = 0; 1068 void __iomem *pcie_index_offset; 1069 void __iomem *pcie_index_hi_offset; 1070 void __iomem *pcie_data_offset; 1071 u64 r; 1072 1073 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1074 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1075 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1076 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1077 1078 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1079 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1080 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1081 if (pcie_index_hi != 0) 1082 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1083 pcie_index_hi * 4; 1084 1085 /* read low 32 bits */ 1086 writel(reg_addr, pcie_index_offset); 1087 readl(pcie_index_offset); 1088 if (pcie_index_hi != 0) { 1089 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1090 readl(pcie_index_hi_offset); 1091 } 1092 r = readl(pcie_data_offset); 1093 /* read high 32 bits */ 1094 writel(reg_addr + 4, pcie_index_offset); 1095 readl(pcie_index_offset); 1096 if (pcie_index_hi != 0) { 1097 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1098 readl(pcie_index_hi_offset); 1099 } 1100 r |= ((u64)readl(pcie_data_offset) << 32); 1101 1102 /* clear the high bits */ 1103 if (pcie_index_hi != 0) { 1104 writel(0, pcie_index_hi_offset); 1105 readl(pcie_index_hi_offset); 1106 } 1107 1108 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1109 1110 return r; 1111 } 1112 1113 /** 1114 * amdgpu_device_indirect_wreg - write an indirect register address 1115 * 1116 * @adev: amdgpu_device pointer 1117 * @reg_addr: indirect register offset 1118 * @reg_data: indirect register data 1119 * 1120 */ 1121 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 1122 u32 reg_addr, u32 reg_data) 1123 { 1124 unsigned long flags, pcie_index, pcie_data; 1125 void __iomem *pcie_index_offset; 1126 void __iomem *pcie_data_offset; 1127 1128 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1129 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1130 1131 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1132 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1133 
pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1134 1135 writel(reg_addr, pcie_index_offset); 1136 readl(pcie_index_offset); 1137 writel(reg_data, pcie_data_offset); 1138 readl(pcie_data_offset); 1139 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1140 } 1141 1142 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev, 1143 u64 reg_addr, u32 reg_data) 1144 { 1145 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 1146 void __iomem *pcie_index_offset; 1147 void __iomem *pcie_index_hi_offset; 1148 void __iomem *pcie_data_offset; 1149 1150 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1151 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1152 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1153 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1154 else 1155 pcie_index_hi = 0; 1156 1157 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1158 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1159 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1160 if (pcie_index_hi != 0) 1161 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1162 pcie_index_hi * 4; 1163 1164 writel(reg_addr, pcie_index_offset); 1165 readl(pcie_index_offset); 1166 if (pcie_index_hi != 0) { 1167 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1168 readl(pcie_index_hi_offset); 1169 } 1170 writel(reg_data, pcie_data_offset); 1171 readl(pcie_data_offset); 1172 1173 /* clear the high bits */ 1174 if (pcie_index_hi != 0) { 1175 writel(0, pcie_index_hi_offset); 1176 readl(pcie_index_hi_offset); 1177 } 1178 1179 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1180 } 1181 1182 /** 1183 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 1184 * 1185 * @adev: amdgpu_device pointer 1186 * @reg_addr: indirect register offset 1187 * @reg_data: indirect register data 1188 * 1189 */ 1190 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 1191 u32 reg_addr, u64 reg_data) 1192 { 1193 unsigned long flags, pcie_index, pcie_data; 1194 void __iomem *pcie_index_offset; 1195 void __iomem *pcie_data_offset; 1196 1197 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1198 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1199 1200 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1201 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1202 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1203 1204 /* write low 32 bits */ 1205 writel(reg_addr, pcie_index_offset); 1206 readl(pcie_index_offset); 1207 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1208 readl(pcie_data_offset); 1209 /* write high 32 bits */ 1210 writel(reg_addr + 4, pcie_index_offset); 1211 readl(pcie_index_offset); 1212 writel((u32)(reg_data >> 32), pcie_data_offset); 1213 readl(pcie_data_offset); 1214 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1215 } 1216 1217 void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev, 1218 u64 reg_addr, u64 reg_data) 1219 { 1220 unsigned long flags, pcie_index, pcie_data; 1221 unsigned long pcie_index_hi = 0; 1222 void __iomem *pcie_index_offset; 1223 void __iomem *pcie_index_hi_offset; 1224 void __iomem *pcie_data_offset; 1225 1226 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1227 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1228 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1229 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1230 
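	/*
	 * The indirect interface is an index/data pair: under pcie_idx_lock the
	 * low 32 bits of the register address go into the pcie_index register
	 * (and the upper bits, if present, into pcie_index_hi), each index
	 * write is posted by reading it back, and only then is the pcie_data
	 * register accessed. The HI register is cleared again before the lock
	 * is dropped so that subsequent 32-bit accesses are not redirected.
	 */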
1231 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1232 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1233 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1234 if (pcie_index_hi != 0) 1235 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1236 pcie_index_hi * 4; 1237 1238 /* write low 32 bits */ 1239 writel(reg_addr, pcie_index_offset); 1240 readl(pcie_index_offset); 1241 if (pcie_index_hi != 0) { 1242 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1243 readl(pcie_index_hi_offset); 1244 } 1245 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1246 readl(pcie_data_offset); 1247 /* write high 32 bits */ 1248 writel(reg_addr + 4, pcie_index_offset); 1249 readl(pcie_index_offset); 1250 if (pcie_index_hi != 0) { 1251 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1252 readl(pcie_index_hi_offset); 1253 } 1254 writel((u32)(reg_data >> 32), pcie_data_offset); 1255 readl(pcie_data_offset); 1256 1257 /* clear the high bits */ 1258 if (pcie_index_hi != 0) { 1259 writel(0, pcie_index_hi_offset); 1260 readl(pcie_index_hi_offset); 1261 } 1262 1263 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1264 } 1265 1266 /** 1267 * amdgpu_device_get_rev_id - query device rev_id 1268 * 1269 * @adev: amdgpu_device pointer 1270 * 1271 * Return device rev_id 1272 */ 1273 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev) 1274 { 1275 return adev->nbio.funcs->get_rev_id(adev); 1276 } 1277 1278 /** 1279 * amdgpu_invalid_rreg - dummy reg read function 1280 * 1281 * @adev: amdgpu_device pointer 1282 * @reg: offset of register 1283 * 1284 * Dummy register read function. Used for register blocks 1285 * that certain asics don't have (all asics). 1286 * Returns the value in the register. 1287 */ 1288 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 1289 { 1290 dev_err(adev->dev, "Invalid callback to read register 0x%04X\n", reg); 1291 BUG(); 1292 return 0; 1293 } 1294 1295 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg) 1296 { 1297 dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg); 1298 BUG(); 1299 return 0; 1300 } 1301 1302 /** 1303 * amdgpu_invalid_wreg - dummy reg write function 1304 * 1305 * @adev: amdgpu_device pointer 1306 * @reg: offset of register 1307 * @v: value to write to the register 1308 * 1309 * Dummy register read function. Used for register blocks 1310 * that certain asics don't have (all asics). 1311 */ 1312 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 1313 { 1314 dev_err(adev->dev, 1315 "Invalid callback to write register 0x%04X with 0x%08X\n", reg, 1316 v); 1317 BUG(); 1318 } 1319 1320 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v) 1321 { 1322 dev_err(adev->dev, 1323 "Invalid callback to write register 0x%llX with 0x%08X\n", reg, 1324 v); 1325 BUG(); 1326 } 1327 1328 /** 1329 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 1330 * 1331 * @adev: amdgpu_device pointer 1332 * @reg: offset of register 1333 * 1334 * Dummy register read function. Used for register blocks 1335 * that certain asics don't have (all asics). 1336 * Returns the value in the register. 
1337 */ 1338 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 1339 { 1340 dev_err(adev->dev, "Invalid callback to read 64 bit register 0x%04X\n", 1341 reg); 1342 BUG(); 1343 return 0; 1344 } 1345 1346 static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg) 1347 { 1348 dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg); 1349 BUG(); 1350 return 0; 1351 } 1352 1353 /** 1354 * amdgpu_invalid_wreg64 - dummy reg write function 1355 * 1356 * @adev: amdgpu_device pointer 1357 * @reg: offset of register 1358 * @v: value to write to the register 1359 * 1360 * Dummy register read function. Used for register blocks 1361 * that certain asics don't have (all asics). 1362 */ 1363 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 1364 { 1365 dev_err(adev->dev, 1366 "Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 1367 reg, v); 1368 BUG(); 1369 } 1370 1371 static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v) 1372 { 1373 dev_err(adev->dev, 1374 "Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n", 1375 reg, v); 1376 BUG(); 1377 } 1378 1379 /** 1380 * amdgpu_block_invalid_rreg - dummy reg read function 1381 * 1382 * @adev: amdgpu_device pointer 1383 * @block: offset of instance 1384 * @reg: offset of register 1385 * 1386 * Dummy register read function. Used for register blocks 1387 * that certain asics don't have (all asics). 1388 * Returns the value in the register. 1389 */ 1390 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 1391 uint32_t block, uint32_t reg) 1392 { 1393 dev_err(adev->dev, 1394 "Invalid callback to read register 0x%04X in block 0x%04X\n", 1395 reg, block); 1396 BUG(); 1397 return 0; 1398 } 1399 1400 /** 1401 * amdgpu_block_invalid_wreg - dummy reg write function 1402 * 1403 * @adev: amdgpu_device pointer 1404 * @block: offset of instance 1405 * @reg: offset of register 1406 * @v: value to write to the register 1407 * 1408 * Dummy register read function. Used for register blocks 1409 * that certain asics don't have (all asics). 1410 */ 1411 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 1412 uint32_t block, 1413 uint32_t reg, uint32_t v) 1414 { 1415 dev_err(adev->dev, 1416 "Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 1417 reg, block, v); 1418 BUG(); 1419 } 1420 1421 static uint32_t amdgpu_device_get_vbios_flags(struct amdgpu_device *adev) 1422 { 1423 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU)) 1424 return AMDGPU_VBIOS_SKIP; 1425 1426 if (hweight32(adev->aid_mask) && amdgpu_passthrough(adev)) 1427 return AMDGPU_VBIOS_OPTIONAL; 1428 1429 return 0; 1430 } 1431 1432 /** 1433 * amdgpu_device_asic_init - Wrapper for atom asic_init 1434 * 1435 * @adev: amdgpu_device pointer 1436 * 1437 * Does any asic specific work and then calls atom asic init. 
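 *
 * A rough illustrative sketch of the flow implemented below:
 *
 *   flags    = amdgpu_device_get_vbios_flags(adev);
 *   optional = flags & (AMDGPU_VBIOS_OPTIONAL | AMDGPU_VBIOS_SKIP);
 *   // newer GC IPs wait for the PSP bootloader and then use the
 *   // atomfirmware path, older ones run the legacy atom interpreter;
 *   // both are skipped when the VBIOS is optional and absent.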
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	uint32_t flags;
	bool optional;
	int ret;

	amdgpu_asic_pre_asic_init(adev);
	flags = amdgpu_device_get_vbios_flags(adev);
	optional = !!(flags & (AMDGPU_VBIOS_OPTIONAL | AMDGPU_VBIOS_SKIP));

	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
		amdgpu_psp_wait_for_bootloader(adev);
		if (optional && !adev->bios)
			return 0;

		ret = amdgpu_atomfirmware_asic_init(adev, true);
		return ret;
	} else {
		if (optional && !adev->bios)
			return 0;

		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
	}

	return 0;
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
1557 */ 1558 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 1559 { 1560 return pci_reset_function(adev->pdev); 1561 } 1562 1563 /* 1564 * amdgpu_device_wb_*() 1565 * Writeback is the method by which the GPU updates special pages in memory 1566 * with the status of certain GPU events (fences, ring pointers,etc.). 1567 */ 1568 1569 /** 1570 * amdgpu_device_wb_fini - Disable Writeback and free memory 1571 * 1572 * @adev: amdgpu_device pointer 1573 * 1574 * Disables Writeback and frees the Writeback memory (all asics). 1575 * Used at driver shutdown. 1576 */ 1577 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1578 { 1579 if (adev->wb.wb_obj) { 1580 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1581 &adev->wb.gpu_addr, 1582 (void **)&adev->wb.wb); 1583 adev->wb.wb_obj = NULL; 1584 } 1585 } 1586 1587 /** 1588 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory 1589 * 1590 * @adev: amdgpu_device pointer 1591 * 1592 * Initializes writeback and allocates writeback memory (all asics). 1593 * Used at driver startup. 1594 * Returns 0 on success or an -error on failure. 1595 */ 1596 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1597 { 1598 int r; 1599 1600 if (adev->wb.wb_obj == NULL) { 1601 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1602 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1603 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1604 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1605 (void **)&adev->wb.wb); 1606 if (r) { 1607 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1608 return r; 1609 } 1610 1611 adev->wb.num_wb = AMDGPU_MAX_WB; 1612 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1613 1614 /* clear wb memory */ 1615 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1616 } 1617 1618 return 0; 1619 } 1620 1621 /** 1622 * amdgpu_device_wb_get - Allocate a wb entry 1623 * 1624 * @adev: amdgpu_device pointer 1625 * @wb: wb index 1626 * 1627 * Allocate a wb slot for use by the driver (all asics). 1628 * Returns 0 on success or -EINVAL on failure. 1629 */ 1630 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1631 { 1632 unsigned long flags, offset; 1633 1634 spin_lock_irqsave(&adev->wb.lock, flags); 1635 offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1636 if (offset < adev->wb.num_wb) { 1637 __set_bit(offset, adev->wb.used); 1638 spin_unlock_irqrestore(&adev->wb.lock, flags); 1639 *wb = offset << 3; /* convert to dw offset */ 1640 return 0; 1641 } else { 1642 spin_unlock_irqrestore(&adev->wb.lock, flags); 1643 return -EINVAL; 1644 } 1645 } 1646 1647 /** 1648 * amdgpu_device_wb_free - Free a wb entry 1649 * 1650 * @adev: amdgpu_device pointer 1651 * @wb: wb index 1652 * 1653 * Free a wb slot allocated for use by the driver (all asics) 1654 */ 1655 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1656 { 1657 unsigned long flags; 1658 1659 wb >>= 3; 1660 spin_lock_irqsave(&adev->wb.lock, flags); 1661 if (wb < adev->wb.num_wb) 1662 __clear_bit(wb, adev->wb.used); 1663 spin_unlock_irqrestore(&adev->wb.lock, flags); 1664 } 1665 1666 /** 1667 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1668 * 1669 * @adev: amdgpu_device pointer 1670 * 1671 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1672 * to fail, but if any of the BARs is not accessible after the size we abort 1673 * driver loading by returning -ENODEV. 
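 *
 * An illustrative note on the size computation done below: the requested size
 * is the smallest resizable-BAR encoding that covers VRAM, clamped to what the
 * device advertises, e.g. for a 16 GB board:
 *
 *   rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);   // 14 == 16 GB
 *   rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
 *                   rbar_size);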
1674 */ 1675 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1676 { 1677 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1678 struct pci_bus *root; 1679 struct resource *res; 1680 unsigned int i; 1681 u16 cmd; 1682 int r; 1683 1684 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) 1685 return 0; 1686 1687 /* Bypass for VF */ 1688 if (amdgpu_sriov_vf(adev)) 1689 return 0; 1690 1691 if (!amdgpu_rebar) 1692 return 0; 1693 1694 /* resizing on Dell G5 SE platforms causes problems with runtime pm */ 1695 if ((amdgpu_runtime_pm != 0) && 1696 adev->pdev->vendor == PCI_VENDOR_ID_ATI && 1697 adev->pdev->device == 0x731f && 1698 adev->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL) 1699 return 0; 1700 1701 /* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */ 1702 if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR)) 1703 dev_warn( 1704 adev->dev, 1705 "System can't access extended configuration space, please check!!\n"); 1706 1707 /* skip if the bios has already enabled large BAR */ 1708 if (adev->gmc.real_vram_size && 1709 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1710 return 0; 1711 1712 /* Check if the root BUS has 64bit memory resources */ 1713 root = adev->pdev->bus; 1714 while (root->parent) 1715 root = root->parent; 1716 1717 pci_bus_for_each_resource(root, res, i) { 1718 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1719 res->start > 0x100000000ull) 1720 break; 1721 } 1722 1723 /* Trying to resize is pointless without a root hub window above 4GB */ 1724 if (!res) 1725 return 0; 1726 1727 /* Limit the BAR size to what is available */ 1728 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1729 rbar_size); 1730 1731 /* Disable memory decoding while we change the BAR addresses and size */ 1732 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1733 pci_write_config_word(adev->pdev, PCI_COMMAND, 1734 cmd & ~PCI_COMMAND_MEMORY); 1735 1736 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1737 amdgpu_doorbell_fini(adev); 1738 if (adev->asic_type >= CHIP_BONAIRE) 1739 pci_release_resource(adev->pdev, 2); 1740 1741 pci_release_resource(adev->pdev, 0); 1742 1743 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1744 if (r == -ENOSPC) 1745 dev_info(adev->dev, 1746 "Not enough PCI address space for a large BAR."); 1747 else if (r && r != -ENOTSUPP) 1748 dev_err(adev->dev, "Problem resizing BAR0 (%d).", r); 1749 1750 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1751 1752 /* When the doorbell or fb BAR isn't available we have no chance of 1753 * using the device. 1754 */ 1755 r = amdgpu_doorbell_init(adev); 1756 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1757 return -ENODEV; 1758 1759 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1760 1761 return 0; 1762 } 1763 1764 /* 1765 * GPU helpers function. 1766 */ 1767 /** 1768 * amdgpu_device_need_post - check if the hw need post or not 1769 * 1770 * @adev: amdgpu_device pointer 1771 * 1772 * Check if the asic has been initialized (all asics) at driver startup 1773 * or post is needed if hw reset is performed. 1774 * Returns true if need or false if not. 
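 *
 * A rough illustrative sketch of the typical caller (the exact call site may
 * differ; this is an assumption for illustration only):
 *
 *   if (amdgpu_device_need_post(adev)) {
 *       r = amdgpu_device_asic_init(adev);  // run asic_init from the vbios
 *       if (r)
 *           return r;
 *   }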
1775 */ 1776 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1777 { 1778 uint32_t reg, flags; 1779 1780 if (amdgpu_sriov_vf(adev)) 1781 return false; 1782 1783 flags = amdgpu_device_get_vbios_flags(adev); 1784 if (flags & AMDGPU_VBIOS_SKIP) 1785 return false; 1786 if ((flags & AMDGPU_VBIOS_OPTIONAL) && !adev->bios) 1787 return false; 1788 1789 if (amdgpu_passthrough(adev)) { 1790 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1791 * some old smc fw still need driver do vPost otherwise gpu hang, while 1792 * those smc fw version above 22.15 doesn't have this flaw, so we force 1793 * vpost executed for smc version below 22.15 1794 */ 1795 if (adev->asic_type == CHIP_FIJI) { 1796 int err; 1797 uint32_t fw_ver; 1798 1799 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1800 /* force vPost if error occurred */ 1801 if (err) 1802 return true; 1803 1804 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1805 release_firmware(adev->pm.fw); 1806 if (fw_ver < 0x00160e00) 1807 return true; 1808 } 1809 } 1810 1811 /* Don't post if we need to reset whole hive on init */ 1812 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 1813 return false; 1814 1815 if (adev->has_hw_reset) { 1816 adev->has_hw_reset = false; 1817 return true; 1818 } 1819 1820 /* bios scratch used on CIK+ */ 1821 if (adev->asic_type >= CHIP_BONAIRE) 1822 return amdgpu_atombios_scratch_need_asic_init(adev); 1823 1824 /* check MEM_SIZE for older asics */ 1825 reg = amdgpu_asic_get_config_memsize(adev); 1826 1827 if ((reg != 0) && (reg != 0xffffffff)) 1828 return false; 1829 1830 return true; 1831 } 1832 1833 /* 1834 * Check whether seamless boot is supported. 1835 * 1836 * So far we only support seamless boot on DCE 3.0 or later. 1837 * If users report that it works on older ASICS as well, we may 1838 * loosen this. 1839 */ 1840 bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev) 1841 { 1842 switch (amdgpu_seamless) { 1843 case -1: 1844 break; 1845 case 1: 1846 return true; 1847 case 0: 1848 return false; 1849 default: 1850 dev_err(adev->dev, "Invalid value for amdgpu.seamless: %d\n", 1851 amdgpu_seamless); 1852 return false; 1853 } 1854 1855 if (!(adev->flags & AMD_IS_APU)) 1856 return false; 1857 1858 if (adev->mman.keep_stolen_vga_memory) 1859 return false; 1860 1861 return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0); 1862 } 1863 1864 /* 1865 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids 1866 * don't support dynamic speed switching. Until we have confirmation from Intel 1867 * that a specific host supports it, it's safer that we keep it disabled for all. 1868 * 1869 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/ 1870 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663 1871 */ 1872 static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev) 1873 { 1874 #if IS_ENABLED(CONFIG_X86) 1875 struct cpuinfo_x86 *c = &cpu_data(0); 1876 1877 /* eGPU change speeds based on USB4 fabric conditions */ 1878 if (dev_is_removable(adev->dev)) 1879 return true; 1880 1881 if (c->x86_vendor == X86_VENDOR_INTEL) 1882 return false; 1883 #endif 1884 return true; 1885 } 1886 1887 static bool amdgpu_device_aspm_support_quirk(struct amdgpu_device *adev) 1888 { 1889 /* Enabling ASPM causes randoms hangs on Tahiti and Oland on Zen4. 
1890 * It's unclear if this is a platform-specific or GPU-specific issue. 1891 * Disable ASPM on SI for the time being. 1892 */ 1893 if (adev->family == AMDGPU_FAMILY_SI) 1894 return true; 1895 1896 #if IS_ENABLED(CONFIG_X86) 1897 struct cpuinfo_x86 *c = &cpu_data(0); 1898 1899 if (!(amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 0) || 1900 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 1))) 1901 return false; 1902 1903 if (c->x86 == 6 && 1904 adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5) { 1905 switch (c->x86_model) { 1906 case VFM_MODEL(INTEL_ALDERLAKE): 1907 case VFM_MODEL(INTEL_ALDERLAKE_L): 1908 case VFM_MODEL(INTEL_RAPTORLAKE): 1909 case VFM_MODEL(INTEL_RAPTORLAKE_P): 1910 case VFM_MODEL(INTEL_RAPTORLAKE_S): 1911 return true; 1912 default: 1913 return false; 1914 } 1915 } else { 1916 return false; 1917 } 1918 #else 1919 return false; 1920 #endif 1921 } 1922 1923 /** 1924 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1925 * 1926 * @adev: amdgpu_device pointer 1927 * 1928 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1929 * be set for this device. 1930 * 1931 * Returns true if it should be used or false if not. 1932 */ 1933 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1934 { 1935 switch (amdgpu_aspm) { 1936 case -1: 1937 break; 1938 case 0: 1939 return false; 1940 case 1: 1941 return true; 1942 default: 1943 return false; 1944 } 1945 if (adev->flags & AMD_IS_APU) 1946 return false; 1947 if (amdgpu_device_aspm_support_quirk(adev)) 1948 return false; 1949 return pcie_aspm_enabled(adev->pdev); 1950 } 1951 1952 /* if we get transitioned to only one device, take VGA back */ 1953 /** 1954 * amdgpu_device_vga_set_decode - enable/disable vga decode 1955 * 1956 * @pdev: PCI device pointer 1957 * @state: enable/disable vga decode 1958 * 1959 * Enable/disable vga decode (all asics). 1960 * Returns VGA resource flags. 1961 */ 1962 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1963 bool state) 1964 { 1965 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1966 1967 amdgpu_asic_set_vga_state(adev, state); 1968 if (state) 1969 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1970 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1971 else 1972 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1973 } 1974 1975 /** 1976 * amdgpu_device_check_block_size - validate the vm block size 1977 * 1978 * @adev: amdgpu_device pointer 1979 * 1980 * Validates the vm block size specified via module parameter. 1981 * The vm block size defines number of bits in page table versus page directory, 1982 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1983 * page table and the remaining bits are in the page directory. 1984 */ 1985 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1986 { 1987 /* defines number of bits in page table versus page directory, 1988 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1989 * page table and the remaining bits are in the page directory 1990 */ 1991 if (amdgpu_vm_block_size == -1) 1992 return; 1993 1994 if (amdgpu_vm_block_size < 9) { 1995 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1996 amdgpu_vm_block_size); 1997 amdgpu_vm_block_size = -1; 1998 } 1999 } 2000 2001 /** 2002 * amdgpu_device_check_vm_size - validate the vm size 2003 * 2004 * @adev: amdgpu_device pointer 2005 * 2006 * Validates the vm size in GB specified via module parameter. 
2007 * The VM size is the size of the GPU virtual memory space in GB. 2008 */ 2009 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 2010 { 2011 /* no need to check the default value */ 2012 if (amdgpu_vm_size == -1) 2013 return; 2014 2015 if (amdgpu_vm_size < 1) { 2016 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 2017 amdgpu_vm_size); 2018 amdgpu_vm_size = -1; 2019 } 2020 } 2021 2022 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 2023 { 2024 struct sysinfo si; 2025 bool is_os_64 = (sizeof(void *) == 8); 2026 uint64_t total_memory; 2027 uint64_t dram_size_seven_GB = 0x1B8000000; 2028 uint64_t dram_size_three_GB = 0xB8000000; 2029 2030 if (amdgpu_smu_memory_pool_size == 0) 2031 return; 2032 2033 if (!is_os_64) { 2034 dev_warn(adev->dev, "Not 64-bit OS, feature not supported\n"); 2035 goto def_value; 2036 } 2037 si_meminfo(&si); 2038 total_memory = (uint64_t)si.totalram * si.mem_unit; 2039 2040 if ((amdgpu_smu_memory_pool_size == 1) || 2041 (amdgpu_smu_memory_pool_size == 2)) { 2042 if (total_memory < dram_size_three_GB) 2043 goto def_value1; 2044 } else if ((amdgpu_smu_memory_pool_size == 4) || 2045 (amdgpu_smu_memory_pool_size == 8)) { 2046 if (total_memory < dram_size_seven_GB) 2047 goto def_value1; 2048 } else { 2049 dev_warn(adev->dev, "Smu memory pool size not supported\n"); 2050 goto def_value; 2051 } 2052 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 2053 2054 return; 2055 2056 def_value1: 2057 dev_warn(adev->dev, "No enough system memory\n"); 2058 def_value: 2059 adev->pm.smu_prv_buffer_size = 0; 2060 } 2061 2062 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 2063 { 2064 if (!(adev->flags & AMD_IS_APU) || 2065 adev->asic_type < CHIP_RAVEN) 2066 return 0; 2067 2068 switch (adev->asic_type) { 2069 case CHIP_RAVEN: 2070 if (adev->pdev->device == 0x15dd) 2071 adev->apu_flags |= AMD_APU_IS_RAVEN; 2072 if (adev->pdev->device == 0x15d8) 2073 adev->apu_flags |= AMD_APU_IS_PICASSO; 2074 break; 2075 case CHIP_RENOIR: 2076 if ((adev->pdev->device == 0x1636) || 2077 (adev->pdev->device == 0x164c)) 2078 adev->apu_flags |= AMD_APU_IS_RENOIR; 2079 else 2080 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 2081 break; 2082 case CHIP_VANGOGH: 2083 adev->apu_flags |= AMD_APU_IS_VANGOGH; 2084 break; 2085 case CHIP_YELLOW_CARP: 2086 break; 2087 case CHIP_CYAN_SKILLFISH: 2088 if ((adev->pdev->device == 0x13FE) || 2089 (adev->pdev->device == 0x143F)) 2090 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 2091 break; 2092 default: 2093 break; 2094 } 2095 2096 return 0; 2097 } 2098 2099 /** 2100 * amdgpu_device_check_arguments - validate module params 2101 * 2102 * @adev: amdgpu_device pointer 2103 * 2104 * Validates certain module parameters and updates 2105 * the associated values used by the driver (all asics). 
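 *
 * As a worked example of the clamping performed below: amdgpu.sched_jobs=6
 * is not a power of two, so it is rounded up to 8, while any value below 4
 * is raised to 4 rather than failing the probe.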
2106 */ 2107 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 2108 { 2109 int i; 2110 2111 if (amdgpu_sched_jobs < 4) { 2112 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 2113 amdgpu_sched_jobs); 2114 amdgpu_sched_jobs = 4; 2115 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 2116 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 2117 amdgpu_sched_jobs); 2118 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 2119 } 2120 2121 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 2122 /* gart size must be greater or equal to 32M */ 2123 dev_warn(adev->dev, "gart size (%d) too small\n", 2124 amdgpu_gart_size); 2125 amdgpu_gart_size = -1; 2126 } 2127 2128 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 2129 /* gtt size must be greater or equal to 32M */ 2130 dev_warn(adev->dev, "gtt size (%d) too small\n", 2131 amdgpu_gtt_size); 2132 amdgpu_gtt_size = -1; 2133 } 2134 2135 /* valid range is between 4 and 9 inclusive */ 2136 if (amdgpu_vm_fragment_size != -1 && 2137 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 2138 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 2139 amdgpu_vm_fragment_size = -1; 2140 } 2141 2142 if (amdgpu_sched_hw_submission < 2) { 2143 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 2144 amdgpu_sched_hw_submission); 2145 amdgpu_sched_hw_submission = 2; 2146 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 2147 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 2148 amdgpu_sched_hw_submission); 2149 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 2150 } 2151 2152 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 2153 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 2154 amdgpu_reset_method = -1; 2155 } 2156 2157 amdgpu_device_check_smu_prv_buffer_size(adev); 2158 2159 amdgpu_device_check_vm_size(adev); 2160 2161 amdgpu_device_check_block_size(adev); 2162 2163 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 2164 2165 for (i = 0; i < MAX_XCP; i++) { 2166 switch (amdgpu_enforce_isolation) { 2167 case -1: 2168 case 0: 2169 default: 2170 /* disable */ 2171 adev->enforce_isolation[i] = AMDGPU_ENFORCE_ISOLATION_DISABLE; 2172 break; 2173 case 1: 2174 /* enable */ 2175 adev->enforce_isolation[i] = 2176 AMDGPU_ENFORCE_ISOLATION_ENABLE; 2177 break; 2178 case 2: 2179 /* enable legacy mode */ 2180 adev->enforce_isolation[i] = 2181 AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY; 2182 break; 2183 case 3: 2184 /* enable only process isolation without submitting cleaner shader */ 2185 adev->enforce_isolation[i] = 2186 AMDGPU_ENFORCE_ISOLATION_NO_CLEANER_SHADER; 2187 break; 2188 } 2189 } 2190 2191 return 0; 2192 } 2193 2194 /** 2195 * amdgpu_switcheroo_set_state - set switcheroo state 2196 * 2197 * @pdev: pci dev pointer 2198 * @state: vga_switcheroo state 2199 * 2200 * Callback for the switcheroo driver. Suspends or resumes 2201 * the asics before or after it is powered up using ACPI methods. 
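 *
 * These callbacks are wired up through &amdgpu_switcheroo_ops; registration
 * happens elsewhere in this file, roughly as in this sketch (the exact call
 * site and the runtime power-control flag are omitted here):
 *
 *   vga_switcheroo_register_client(adev->pdev, &amdgpu_switcheroo_ops, px);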
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
					enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	if (amdgpu_device_supports_px(drm_to_adev(dev)) &&
	    state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		pci_set_power_state(pdev, PCI_D0);
		amdgpu_device_load_pci_state(pdev);
		r = pci_enable_device(pdev);
		if (r)
			dev_warn(&pdev->dev, "pci_enable_device failed (%d)\n",
				 r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
	} else {
		dev_info(&pdev->dev, "switched off\n");
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_prepare(dev);
		amdgpu_device_suspend(dev, true);
		amdgpu_device_cache_pci_state(pdev);
		/* Shut down the device */
		pci_disable_device(pdev);
		pci_set_power_state(pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver. Checks whether the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return atomic_read(&dev->open_count) == 0;
}

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
};

/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			&adev->ip_blocks[i], state);
		if (r)
			dev_err(adev->dev,
				"set_clockgating_state of IP block <%s> failed %d\n",
				adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			&adev->ip_blocks[i], state);
		if (r)
			dev_err(adev->dev,
				"set_powergating_state of IP block <%s> failed %d\n",
				adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u64 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state(
				&adev->ip_blocks[i], flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			if (adev->ip_blocks[i].version->funcs->wait_for_idle) {
				r = adev->ip_blocks[i].version->funcs->wait_for_idle(
					&adev->ip_blocks[i]);
				if (r)
					return r;
			}
			break;
		}
	}
	return 0;
}

/**
 * amdgpu_device_ip_is_hw - is the hardware IP enabled
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check whether the hardware IP is enabled or not.
 * Returns true if the IP is enabled, false if not.
 */
bool amdgpu_device_ip_is_hw(struct amdgpu_device *adev,
			    enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].status.hw;
	}
	return false;
}

/**
 * amdgpu_device_ip_is_valid - is the hardware IP valid
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check whether the hardware IP is valid or not.
 * Returns true if the IP is valid, false if not.
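 *
 * Typical use is as a cheap guard before touching a block, e.g. this purely
 * illustrative check:
 *
 *   gfx_ready = amdgpu_device_ip_is_valid(adev, AMD_IP_BLOCK_TYPE_GFX);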
2422 */ 2423 bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev, 2424 enum amd_ip_block_type block_type) 2425 { 2426 int i; 2427 2428 for (i = 0; i < adev->num_ip_blocks; i++) { 2429 if (adev->ip_blocks[i].version->type == block_type) 2430 return adev->ip_blocks[i].status.valid; 2431 } 2432 return false; 2433 2434 } 2435 2436 /** 2437 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 2438 * 2439 * @adev: amdgpu_device pointer 2440 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 2441 * 2442 * Returns a pointer to the hardware IP block structure 2443 * if it exists for the asic, otherwise NULL. 2444 */ 2445 struct amdgpu_ip_block * 2446 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 2447 enum amd_ip_block_type type) 2448 { 2449 int i; 2450 2451 for (i = 0; i < adev->num_ip_blocks; i++) 2452 if (adev->ip_blocks[i].version->type == type) 2453 return &adev->ip_blocks[i]; 2454 2455 return NULL; 2456 } 2457 2458 /** 2459 * amdgpu_device_ip_block_version_cmp 2460 * 2461 * @adev: amdgpu_device pointer 2462 * @type: enum amd_ip_block_type 2463 * @major: major version 2464 * @minor: minor version 2465 * 2466 * return 0 if equal or greater 2467 * return 1 if smaller or the ip_block doesn't exist 2468 */ 2469 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 2470 enum amd_ip_block_type type, 2471 u32 major, u32 minor) 2472 { 2473 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 2474 2475 if (ip_block && ((ip_block->version->major > major) || 2476 ((ip_block->version->major == major) && 2477 (ip_block->version->minor >= minor)))) 2478 return 0; 2479 2480 return 1; 2481 } 2482 2483 static const char *ip_block_names[] = { 2484 [AMD_IP_BLOCK_TYPE_COMMON] = "common", 2485 [AMD_IP_BLOCK_TYPE_GMC] = "gmc", 2486 [AMD_IP_BLOCK_TYPE_IH] = "ih", 2487 [AMD_IP_BLOCK_TYPE_SMC] = "smu", 2488 [AMD_IP_BLOCK_TYPE_PSP] = "psp", 2489 [AMD_IP_BLOCK_TYPE_DCE] = "dce", 2490 [AMD_IP_BLOCK_TYPE_GFX] = "gfx", 2491 [AMD_IP_BLOCK_TYPE_SDMA] = "sdma", 2492 [AMD_IP_BLOCK_TYPE_UVD] = "uvd", 2493 [AMD_IP_BLOCK_TYPE_VCE] = "vce", 2494 [AMD_IP_BLOCK_TYPE_ACP] = "acp", 2495 [AMD_IP_BLOCK_TYPE_VCN] = "vcn", 2496 [AMD_IP_BLOCK_TYPE_MES] = "mes", 2497 [AMD_IP_BLOCK_TYPE_JPEG] = "jpeg", 2498 [AMD_IP_BLOCK_TYPE_VPE] = "vpe", 2499 [AMD_IP_BLOCK_TYPE_UMSCH_MM] = "umsch_mm", 2500 [AMD_IP_BLOCK_TYPE_ISP] = "isp", 2501 [AMD_IP_BLOCK_TYPE_RAS] = "ras", 2502 }; 2503 2504 static const char *ip_block_name(struct amdgpu_device *adev, enum amd_ip_block_type type) 2505 { 2506 int idx = (int)type; 2507 2508 return idx < ARRAY_SIZE(ip_block_names) ? ip_block_names[idx] : "unknown"; 2509 } 2510 2511 /** 2512 * amdgpu_device_ip_block_add 2513 * 2514 * @adev: amdgpu_device pointer 2515 * @ip_block_version: pointer to the IP to add 2516 * 2517 * Adds the IP block driver information to the collection of IPs 2518 * on the asic. 
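 *
 * SOC specific code adds blocks in the order they should be initialized,
 * for instance (sketch only; the block version symbol is illustrative):
 *
 *   r = amdgpu_device_ip_block_add(adev, &gmc_v9_0_ip_block);
 *   if (r)
 *       return r;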
2519 */ 2520 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 2521 const struct amdgpu_ip_block_version *ip_block_version) 2522 { 2523 if (!ip_block_version) 2524 return -EINVAL; 2525 2526 switch (ip_block_version->type) { 2527 case AMD_IP_BLOCK_TYPE_VCN: 2528 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2529 return 0; 2530 break; 2531 case AMD_IP_BLOCK_TYPE_JPEG: 2532 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2533 return 0; 2534 break; 2535 default: 2536 break; 2537 } 2538 2539 dev_info(adev->dev, "detected ip block number %d <%s_v%d_%d_%d> (%s)\n", 2540 adev->num_ip_blocks, 2541 ip_block_name(adev, ip_block_version->type), 2542 ip_block_version->major, 2543 ip_block_version->minor, 2544 ip_block_version->rev, 2545 ip_block_version->funcs->name); 2546 2547 adev->ip_blocks[adev->num_ip_blocks].adev = adev; 2548 2549 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2550 2551 return 0; 2552 } 2553 2554 /** 2555 * amdgpu_device_enable_virtual_display - enable virtual display feature 2556 * 2557 * @adev: amdgpu_device pointer 2558 * 2559 * Enabled the virtual display feature if the user has enabled it via 2560 * the module parameter virtual_display. This feature provides a virtual 2561 * display hardware on headless boards or in virtualized environments. 2562 * This function parses and validates the configuration string specified by 2563 * the user and configures the virtual display configuration (number of 2564 * virtual connectors, crtcs, etc.) specified. 2565 */ 2566 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2567 { 2568 adev->enable_virtual_display = false; 2569 2570 if (amdgpu_virtual_display) { 2571 const char *pci_address_name = pci_name(adev->pdev); 2572 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2573 2574 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2575 pciaddstr_tmp = pciaddstr; 2576 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2577 pciaddname = strsep(&pciaddname_tmp, ","); 2578 if (!strcmp("all", pciaddname) 2579 || !strcmp(pci_address_name, pciaddname)) { 2580 long num_crtc; 2581 int res = -1; 2582 2583 adev->enable_virtual_display = true; 2584 2585 if (pciaddname_tmp) 2586 res = kstrtol(pciaddname_tmp, 10, 2587 &num_crtc); 2588 2589 if (!res) { 2590 if (num_crtc < 1) 2591 num_crtc = 1; 2592 if (num_crtc > 6) 2593 num_crtc = 6; 2594 adev->mode_info.num_crtc = num_crtc; 2595 } else { 2596 adev->mode_info.num_crtc = 1; 2597 } 2598 break; 2599 } 2600 } 2601 2602 dev_info( 2603 adev->dev, 2604 "virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2605 amdgpu_virtual_display, pci_address_name, 2606 adev->enable_virtual_display, adev->mode_info.num_crtc); 2607 2608 kfree(pciaddstr); 2609 } 2610 } 2611 2612 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2613 { 2614 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2615 adev->mode_info.num_crtc = 1; 2616 adev->enable_virtual_display = true; 2617 dev_info(adev->dev, "virtual_display:%d, num_crtc:%d\n", 2618 adev->enable_virtual_display, 2619 adev->mode_info.num_crtc); 2620 } 2621 } 2622 2623 /** 2624 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2625 * 2626 * @adev: amdgpu_device pointer 2627 * 2628 * Parses the asic configuration parameters specified in the gpu info 2629 * firmware and makes them available to the driver for use in configuring 2630 * the asic. 2631 * Returns 0 on success, -EINVAL on failure. 
2632 */ 2633 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2634 { 2635 const char *chip_name; 2636 int err; 2637 const struct gpu_info_firmware_header_v1_0 *hdr; 2638 2639 adev->firmware.gpu_info_fw = NULL; 2640 2641 switch (adev->asic_type) { 2642 default: 2643 return 0; 2644 case CHIP_VEGA10: 2645 chip_name = "vega10"; 2646 break; 2647 case CHIP_VEGA12: 2648 chip_name = "vega12"; 2649 break; 2650 case CHIP_RAVEN: 2651 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2652 chip_name = "raven2"; 2653 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2654 chip_name = "picasso"; 2655 else 2656 chip_name = "raven"; 2657 break; 2658 case CHIP_ARCTURUS: 2659 chip_name = "arcturus"; 2660 break; 2661 case CHIP_NAVI12: 2662 if (adev->discovery.bin) 2663 return 0; 2664 chip_name = "navi12"; 2665 break; 2666 case CHIP_CYAN_SKILLFISH: 2667 chip_name = "cyan_skillfish"; 2668 break; 2669 } 2670 2671 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, 2672 AMDGPU_UCODE_OPTIONAL, 2673 "amdgpu/%s_gpu_info.bin", chip_name); 2674 if (err) { 2675 dev_err(adev->dev, 2676 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n", 2677 chip_name); 2678 goto out; 2679 } 2680 2681 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2682 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2683 2684 switch (hdr->version_major) { 2685 case 1: 2686 { 2687 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2688 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2689 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2690 2691 /* 2692 * Should be dropped when DAL no longer needs it. 2693 */ 2694 if (adev->asic_type == CHIP_NAVI12) 2695 goto parse_soc_bounding_box; 2696 2697 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2698 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2699 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2700 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2701 adev->gfx.config.max_texture_channel_caches = 2702 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2703 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2704 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2705 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2706 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2707 adev->gfx.config.double_offchip_lds_buf = 2708 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2709 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2710 adev->gfx.cu_info.max_waves_per_simd = 2711 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2712 adev->gfx.cu_info.max_scratch_slots_per_cu = 2713 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2714 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2715 if (hdr->version_minor >= 1) { 2716 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2717 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2718 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2719 adev->gfx.config.num_sc_per_sh = 2720 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2721 adev->gfx.config.num_packer_per_sc = 2722 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2723 } 2724 2725 parse_soc_bounding_box: 2726 /* 2727 * soc bounding box info is not integrated in disocovery table, 2728 * we always need to parse it from gpu info 
firmware if needed. 2729 */ 2730 if (hdr->version_minor == 2) { 2731 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2732 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2733 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2734 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2735 } 2736 break; 2737 } 2738 default: 2739 dev_err(adev->dev, 2740 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2741 err = -EINVAL; 2742 goto out; 2743 } 2744 out: 2745 return err; 2746 } 2747 2748 static void amdgpu_uid_init(struct amdgpu_device *adev) 2749 { 2750 /* Initialize the UID for the device */ 2751 adev->uid_info = kzalloc(sizeof(struct amdgpu_uid), GFP_KERNEL); 2752 if (!adev->uid_info) { 2753 dev_warn(adev->dev, "Failed to allocate memory for UID\n"); 2754 return; 2755 } 2756 adev->uid_info->adev = adev; 2757 } 2758 2759 static void amdgpu_uid_fini(struct amdgpu_device *adev) 2760 { 2761 /* Free the UID memory */ 2762 kfree(adev->uid_info); 2763 adev->uid_info = NULL; 2764 } 2765 2766 /** 2767 * amdgpu_device_ip_early_init - run early init for hardware IPs 2768 * 2769 * @adev: amdgpu_device pointer 2770 * 2771 * Early initialization pass for hardware IPs. The hardware IPs that make 2772 * up each asic are discovered each IP's early_init callback is run. This 2773 * is the first stage in initializing the asic. 2774 * Returns 0 on success, negative error code on failure. 2775 */ 2776 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2777 { 2778 struct amdgpu_ip_block *ip_block; 2779 struct pci_dev *parent; 2780 bool total, skip_bios; 2781 uint32_t bios_flags; 2782 int i, r; 2783 2784 amdgpu_device_enable_virtual_display(adev); 2785 2786 if (amdgpu_sriov_vf(adev)) { 2787 r = amdgpu_virt_request_full_gpu(adev, true); 2788 if (r) 2789 return r; 2790 2791 r = amdgpu_virt_init_critical_region(adev); 2792 if (r) 2793 return r; 2794 } 2795 2796 switch (adev->asic_type) { 2797 #ifdef CONFIG_DRM_AMDGPU_SI 2798 case CHIP_VERDE: 2799 case CHIP_TAHITI: 2800 case CHIP_PITCAIRN: 2801 case CHIP_OLAND: 2802 case CHIP_HAINAN: 2803 adev->family = AMDGPU_FAMILY_SI; 2804 r = si_set_ip_blocks(adev); 2805 if (r) 2806 return r; 2807 break; 2808 #endif 2809 #ifdef CONFIG_DRM_AMDGPU_CIK 2810 case CHIP_BONAIRE: 2811 case CHIP_HAWAII: 2812 case CHIP_KAVERI: 2813 case CHIP_KABINI: 2814 case CHIP_MULLINS: 2815 if (adev->flags & AMD_IS_APU) 2816 adev->family = AMDGPU_FAMILY_KV; 2817 else 2818 adev->family = AMDGPU_FAMILY_CI; 2819 2820 r = cik_set_ip_blocks(adev); 2821 if (r) 2822 return r; 2823 break; 2824 #endif 2825 case CHIP_TOPAZ: 2826 case CHIP_TONGA: 2827 case CHIP_FIJI: 2828 case CHIP_POLARIS10: 2829 case CHIP_POLARIS11: 2830 case CHIP_POLARIS12: 2831 case CHIP_VEGAM: 2832 case CHIP_CARRIZO: 2833 case CHIP_STONEY: 2834 if (adev->flags & AMD_IS_APU) 2835 adev->family = AMDGPU_FAMILY_CZ; 2836 else 2837 adev->family = AMDGPU_FAMILY_VI; 2838 2839 r = vi_set_ip_blocks(adev); 2840 if (r) 2841 return r; 2842 break; 2843 default: 2844 r = amdgpu_discovery_set_ip_blocks(adev); 2845 if (r) 2846 return r; 2847 break; 2848 } 2849 2850 /* Check for IP version 9.4.3 with A0 hardware */ 2851 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) && 2852 !amdgpu_device_get_rev_id(adev)) { 2853 dev_err(adev->dev, "Unsupported A0 hardware\n"); 2854 return -ENODEV; /* device unsupported - no device error */ 2855 } 2856 2857 if (amdgpu_has_atpx() && 2858 (amdgpu_is_atpx_hybrid() || 2859 amdgpu_has_atpx_dgpu_power_cntl()) && 2860 ((adev->flags & AMD_IS_APU) == 
0) && 2861 !dev_is_removable(&adev->pdev->dev)) 2862 adev->flags |= AMD_IS_PX; 2863 2864 if (!(adev->flags & AMD_IS_APU)) { 2865 parent = pcie_find_root_port(adev->pdev); 2866 adev->has_pr3 = parent ? pci_pr3_present(parent) : false; 2867 } 2868 2869 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2870 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2871 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2872 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2873 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2874 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2875 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2876 2877 adev->virt.is_xgmi_node_migrate_enabled = false; 2878 if (amdgpu_sriov_vf(adev)) { 2879 adev->virt.is_xgmi_node_migrate_enabled = 2880 amdgpu_ip_version((adev), GC_HWIP, 0) == IP_VERSION(9, 4, 4); 2881 } 2882 2883 total = true; 2884 for (i = 0; i < adev->num_ip_blocks; i++) { 2885 ip_block = &adev->ip_blocks[i]; 2886 2887 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2888 dev_warn(adev->dev, "disabled ip block: %d <%s>\n", i, 2889 adev->ip_blocks[i].version->funcs->name); 2890 adev->ip_blocks[i].status.valid = false; 2891 } else if (ip_block->version->funcs->early_init) { 2892 r = ip_block->version->funcs->early_init(ip_block); 2893 if (r == -ENOENT) { 2894 adev->ip_blocks[i].status.valid = false; 2895 } else if (r) { 2896 dev_err(adev->dev, 2897 "early_init of IP block <%s> failed %d\n", 2898 adev->ip_blocks[i].version->funcs->name, 2899 r); 2900 total = false; 2901 } else { 2902 adev->ip_blocks[i].status.valid = true; 2903 } 2904 } else { 2905 adev->ip_blocks[i].status.valid = true; 2906 } 2907 /* get the vbios after the asic_funcs are set up */ 2908 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2909 r = amdgpu_device_parse_gpu_info_fw(adev); 2910 if (r) 2911 return r; 2912 2913 bios_flags = amdgpu_device_get_vbios_flags(adev); 2914 skip_bios = !!(bios_flags & AMDGPU_VBIOS_SKIP); 2915 /* Read BIOS */ 2916 if (!skip_bios) { 2917 bool optional = 2918 !!(bios_flags & AMDGPU_VBIOS_OPTIONAL); 2919 if (!amdgpu_get_bios(adev) && !optional) 2920 return -EINVAL; 2921 2922 if (optional && !adev->bios) 2923 dev_info( 2924 adev->dev, 2925 "VBIOS image optional, proceeding without VBIOS image"); 2926 2927 if (adev->bios) { 2928 r = amdgpu_atombios_init(adev); 2929 if (r) { 2930 dev_err(adev->dev, 2931 "amdgpu_atombios_init failed\n"); 2932 amdgpu_vf_error_put( 2933 adev, 2934 AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 2935 0, 0); 2936 return r; 2937 } 2938 } 2939 } 2940 2941 /*get pf2vf msg info at it's earliest time*/ 2942 if (amdgpu_sriov_vf(adev)) 2943 amdgpu_virt_init_data_exchange(adev); 2944 2945 } 2946 } 2947 if (!total) 2948 return -ENODEV; 2949 2950 if (adev->gmc.xgmi.supported) 2951 amdgpu_xgmi_early_init(adev); 2952 2953 if (amdgpu_is_multi_aid(adev)) 2954 amdgpu_uid_init(adev); 2955 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 2956 if (ip_block->status.valid != false) 2957 amdgpu_amdkfd_device_probe(adev); 2958 2959 adev->cg_flags &= amdgpu_cg_mask; 2960 adev->pg_flags &= amdgpu_pg_mask; 2961 2962 return 0; 2963 } 2964 2965 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2966 { 2967 int i, r; 2968 2969 for (i = 0; i < adev->num_ip_blocks; i++) { 2970 if (!adev->ip_blocks[i].status.sw) 2971 continue; 2972 if (adev->ip_blocks[i].status.hw) 2973 continue; 2974 if (!amdgpu_ip_member_of_hwini( 2975 adev, adev->ip_blocks[i].version->type)) 2976 continue; 2977 if 
(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2978 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2979 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2980 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2981 if (r) { 2982 dev_err(adev->dev, 2983 "hw_init of IP block <%s> failed %d\n", 2984 adev->ip_blocks[i].version->funcs->name, 2985 r); 2986 return r; 2987 } 2988 adev->ip_blocks[i].status.hw = true; 2989 } 2990 } 2991 2992 return 0; 2993 } 2994 2995 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2996 { 2997 int i, r; 2998 2999 for (i = 0; i < adev->num_ip_blocks; i++) { 3000 if (!adev->ip_blocks[i].status.sw) 3001 continue; 3002 if (adev->ip_blocks[i].status.hw) 3003 continue; 3004 if (!amdgpu_ip_member_of_hwini( 3005 adev, adev->ip_blocks[i].version->type)) 3006 continue; 3007 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3008 if (r) { 3009 dev_err(adev->dev, 3010 "hw_init of IP block <%s> failed %d\n", 3011 adev->ip_blocks[i].version->funcs->name, r); 3012 return r; 3013 } 3014 adev->ip_blocks[i].status.hw = true; 3015 } 3016 3017 return 0; 3018 } 3019 3020 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 3021 { 3022 int r = 0; 3023 int i; 3024 uint32_t smu_version; 3025 3026 if (adev->asic_type >= CHIP_VEGA10) { 3027 for (i = 0; i < adev->num_ip_blocks; i++) { 3028 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 3029 continue; 3030 3031 if (!amdgpu_ip_member_of_hwini(adev, 3032 AMD_IP_BLOCK_TYPE_PSP)) 3033 break; 3034 3035 if (!adev->ip_blocks[i].status.sw) 3036 continue; 3037 3038 /* no need to do the fw loading again if already done*/ 3039 if (adev->ip_blocks[i].status.hw == true) 3040 break; 3041 3042 if (amdgpu_in_reset(adev) || adev->in_suspend) { 3043 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3044 if (r) 3045 return r; 3046 } else { 3047 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3048 if (r) { 3049 dev_err(adev->dev, 3050 "hw_init of IP block <%s> failed %d\n", 3051 adev->ip_blocks[i] 3052 .version->funcs->name, 3053 r); 3054 return r; 3055 } 3056 adev->ip_blocks[i].status.hw = true; 3057 } 3058 break; 3059 } 3060 } 3061 3062 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 3063 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 3064 3065 return r; 3066 } 3067 3068 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 3069 { 3070 struct drm_sched_init_args args = { 3071 .ops = &amdgpu_sched_ops, 3072 .num_rqs = DRM_SCHED_PRIORITY_COUNT, 3073 .timeout_wq = adev->reset_domain->wq, 3074 .dev = adev->dev, 3075 }; 3076 long timeout; 3077 int r, i; 3078 3079 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 3080 struct amdgpu_ring *ring = adev->rings[i]; 3081 3082 /* No need to setup the GPU scheduler for rings that don't need it */ 3083 if (!ring || ring->no_scheduler) 3084 continue; 3085 3086 switch (ring->funcs->type) { 3087 case AMDGPU_RING_TYPE_GFX: 3088 timeout = adev->gfx_timeout; 3089 break; 3090 case AMDGPU_RING_TYPE_COMPUTE: 3091 timeout = adev->compute_timeout; 3092 break; 3093 case AMDGPU_RING_TYPE_SDMA: 3094 timeout = adev->sdma_timeout; 3095 break; 3096 default: 3097 timeout = adev->video_timeout; 3098 break; 3099 } 3100 3101 args.timeout = timeout; 3102 args.credit_limit = ring->num_hw_submission; 3103 args.score = ring->sched_score; 3104 args.name = ring->name; 3105 3106 r = drm_sched_init(&ring->sched, &args); 3107 if (r) { 3108 
dev_err(adev->dev, 3109 "Failed to create scheduler on ring %s.\n", 3110 ring->name); 3111 return r; 3112 } 3113 r = amdgpu_uvd_entity_init(adev, ring); 3114 if (r) { 3115 dev_err(adev->dev, 3116 "Failed to create UVD scheduling entity on ring %s.\n", 3117 ring->name); 3118 return r; 3119 } 3120 r = amdgpu_vce_entity_init(adev, ring); 3121 if (r) { 3122 dev_err(adev->dev, 3123 "Failed to create VCE scheduling entity on ring %s.\n", 3124 ring->name); 3125 return r; 3126 } 3127 } 3128 3129 if (adev->xcp_mgr) 3130 amdgpu_xcp_update_partition_sched_list(adev); 3131 3132 return 0; 3133 } 3134 3135 3136 /** 3137 * amdgpu_device_ip_init - run init for hardware IPs 3138 * 3139 * @adev: amdgpu_device pointer 3140 * 3141 * Main initialization pass for hardware IPs. The list of all the hardware 3142 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 3143 * are run. sw_init initializes the software state associated with each IP 3144 * and hw_init initializes the hardware associated with each IP. 3145 * Returns 0 on success, negative error code on failure. 3146 */ 3147 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 3148 { 3149 bool init_badpage; 3150 int i, r; 3151 3152 r = amdgpu_ras_init(adev); 3153 if (r) 3154 return r; 3155 3156 for (i = 0; i < adev->num_ip_blocks; i++) { 3157 if (!adev->ip_blocks[i].status.valid) 3158 continue; 3159 if (adev->ip_blocks[i].version->funcs->sw_init) { 3160 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]); 3161 if (r) { 3162 dev_err(adev->dev, 3163 "sw_init of IP block <%s> failed %d\n", 3164 adev->ip_blocks[i].version->funcs->name, 3165 r); 3166 goto init_failed; 3167 } 3168 } 3169 adev->ip_blocks[i].status.sw = true; 3170 3171 if (!amdgpu_ip_member_of_hwini( 3172 adev, adev->ip_blocks[i].version->type)) 3173 continue; 3174 3175 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 3176 /* need to do common hw init early so everything is set up for gmc */ 3177 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3178 if (r) { 3179 dev_err(adev->dev, "hw_init %d failed %d\n", i, 3180 r); 3181 goto init_failed; 3182 } 3183 adev->ip_blocks[i].status.hw = true; 3184 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3185 /* need to do gmc hw init early so we can allocate gpu mem */ 3186 /* Try to reserve bad pages early */ 3187 if (amdgpu_sriov_vf(adev)) 3188 amdgpu_virt_exchange_data(adev); 3189 3190 r = amdgpu_device_mem_scratch_init(adev); 3191 if (r) { 3192 dev_err(adev->dev, 3193 "amdgpu_mem_scratch_init failed %d\n", 3194 r); 3195 goto init_failed; 3196 } 3197 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3198 if (r) { 3199 dev_err(adev->dev, "hw_init %d failed %d\n", i, 3200 r); 3201 goto init_failed; 3202 } 3203 r = amdgpu_device_wb_init(adev); 3204 if (r) { 3205 dev_err(adev->dev, 3206 "amdgpu_device_wb_init failed %d\n", r); 3207 goto init_failed; 3208 } 3209 adev->ip_blocks[i].status.hw = true; 3210 3211 /* right after GMC hw init, we create CSA */ 3212 if (adev->gfx.mcbp) { 3213 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 3214 AMDGPU_GEM_DOMAIN_VRAM | 3215 AMDGPU_GEM_DOMAIN_GTT, 3216 AMDGPU_CSA_SIZE); 3217 if (r) { 3218 dev_err(adev->dev, 3219 "allocate CSA failed %d\n", r); 3220 goto init_failed; 3221 } 3222 } 3223 3224 r = amdgpu_seq64_init(adev); 3225 if (r) { 3226 dev_err(adev->dev, "allocate seq64 failed %d\n", 3227 r); 3228 goto init_failed; 3229 } 3230 } 3231 } 3232 3233 if (amdgpu_sriov_vf(adev)) 3234 
amdgpu_virt_init_data_exchange(adev); 3235 3236 r = amdgpu_ib_pool_init(adev); 3237 if (r) { 3238 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 3239 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 3240 goto init_failed; 3241 } 3242 3243 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 3244 if (r) 3245 goto init_failed; 3246 3247 r = amdgpu_device_ip_hw_init_phase1(adev); 3248 if (r) 3249 goto init_failed; 3250 3251 r = amdgpu_device_fw_loading(adev); 3252 if (r) 3253 goto init_failed; 3254 3255 r = amdgpu_device_ip_hw_init_phase2(adev); 3256 if (r) 3257 goto init_failed; 3258 3259 /* 3260 * retired pages will be loaded from eeprom and reserved here, 3261 * it should be called after amdgpu_device_ip_hw_init_phase2 since 3262 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 3263 * for I2C communication which only true at this point. 3264 * 3265 * amdgpu_ras_recovery_init may fail, but the upper only cares the 3266 * failure from bad gpu situation and stop amdgpu init process 3267 * accordingly. For other failed cases, it will still release all 3268 * the resource and print error message, rather than returning one 3269 * negative value to upper level. 3270 * 3271 * Note: theoretically, this should be called before all vram allocations 3272 * to protect retired page from abusing 3273 */ 3274 init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 3275 r = amdgpu_ras_recovery_init(adev, init_badpage); 3276 if (r) 3277 goto init_failed; 3278 3279 /** 3280 * In case of XGMI grab extra reference for reset domain for this device 3281 */ 3282 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3283 if (amdgpu_xgmi_add_device(adev) == 0) { 3284 if (!amdgpu_sriov_vf(adev)) { 3285 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3286 3287 if (WARN_ON(!hive)) { 3288 r = -ENOENT; 3289 goto init_failed; 3290 } 3291 3292 if (!hive->reset_domain || 3293 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 3294 r = -ENOENT; 3295 amdgpu_put_xgmi_hive(hive); 3296 goto init_failed; 3297 } 3298 3299 /* Drop the early temporary reset domain we created for device */ 3300 amdgpu_reset_put_reset_domain(adev->reset_domain); 3301 adev->reset_domain = hive->reset_domain; 3302 amdgpu_put_xgmi_hive(hive); 3303 } 3304 } 3305 } 3306 3307 r = amdgpu_device_init_schedulers(adev); 3308 if (r) 3309 goto init_failed; 3310 3311 if (adev->mman.buffer_funcs_ring->sched.ready) 3312 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3313 3314 /* Don't init kfd if whole hive need to be reset during init */ 3315 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 3316 kgd2kfd_init_zone_device(adev); 3317 amdgpu_amdkfd_device_init(adev); 3318 } 3319 3320 amdgpu_fru_get_product_info(adev); 3321 3322 if (!amdgpu_sriov_vf(adev) || amdgpu_sriov_ras_cper_en(adev)) 3323 r = amdgpu_cper_init(adev); 3324 3325 init_failed: 3326 3327 return r; 3328 } 3329 3330 /** 3331 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 3332 * 3333 * @adev: amdgpu_device pointer 3334 * 3335 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 3336 * this function before a GPU reset. If the value is retained after a 3337 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 
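 *
 * The magic written here is consumed by amdgpu_device_check_vram_lost()
 * after the reset, roughly in this order (illustrative sketch):
 *
 *   amdgpu_device_fill_reset_magic(adev);
 *   ...reset the ASIC...
 *   vram_lost = amdgpu_device_check_vram_lost(adev);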
 */
static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
{
	memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
}

/**
 * amdgpu_device_check_vram_lost - check if vram is valid
 *
 * @adev: amdgpu_device pointer
 *
 * Checks the reset magic value written to the gart pointer in VRAM.
 * The driver calls this after a GPU reset to see if the contents of
 * VRAM are lost or not.
 * Returns true if vram is lost, false if not.
 */
static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
{
	if (memcmp(adev->gart.ptr, adev->reset_magic,
		   AMDGPU_RESET_MAGIC_NUM))
		return true;

	if (!amdgpu_in_reset(adev))
		return false;

	/*
	 * For all ASICs with baco/mode1 reset, the VRAM is
	 * always assumed to be lost.
	 */
	switch (amdgpu_asic_reset_method(adev)) {
	case AMD_RESET_METHOD_LEGACY:
	case AMD_RESET_METHOD_LINK:
	case AMD_RESET_METHOD_BACO:
	case AMD_RESET_METHOD_MODE1:
		return true;
	default:
		return false;
	}
}

/**
 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
 *
 * @adev: amdgpu_device pointer
 * @state: clockgating state (gate or ungate)
 *
 * The list of all the hardware IPs that make up the asic is walked and the
 * set_clockgating_state callbacks are run. During the late init pass this
 * enables clockgating for the hardware IPs; during fini or suspend it
 * disables clockgating.
 * Returns 0 on success, negative error code on failure.
 */
int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
			       enum amd_clockgating_state state)
{
	int i, j, r;

	if (amdgpu_emu_mode == 1)
		return 0;

	for (j = 0; j < adev->num_ip_blocks; j++) {
		i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
		if (!adev->ip_blocks[i].status.late_initialized)
			continue;
		/* skip CG for GFX, SDMA on S0ix */
		if (adev->in_s0ix &&
		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
			continue;
		/* skip CG for VCE/UVD, it's handled specially */
		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
		    adev->ip_blocks[i].version->funcs->set_clockgating_state) {
			/* enable clockgating to save power */
			r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i],
										      state);
			if (r) {
				dev_err(adev->dev,
					"set_clockgating_state(gate) of IP block <%s> failed %d\n",
					adev->ip_blocks[i].version->funcs->name,
					r);
				return r;
			}
		}
	}

	return 0;
}

int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
			       enum amd_powergating_state state)
{
	int i, j, r;

	if (amdgpu_emu_mode == 1)
		return 0;

	for (j = 0; j < adev->num_ip_blocks; j++) {
		i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
		if (!adev->ip_blocks[i].status.late_initialized)
			continue;
		/* skip PG for GFX, SDMA on S0ix */
		if (adev->in_s0ix &&
		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
			continue;
		/* skip PG for VCE/UVD, it's handled specially */
		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
		    adev->ip_blocks[i].version->funcs->set_powergating_state) {
			/* enable powergating to save power */
			r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i],
										      state);
			if (r) {
				dev_err(adev->dev,
					"set_powergating_state(gate) of IP block <%s> failed %d\n",
					adev->ip_blocks[i].version->funcs->name,
					r);
				return r;
			}
		}
	}
	return 0;
}

static int amdgpu_device_enable_mgpu_fan_boost(void)
{
	struct amdgpu_gpu_instance *gpu_ins;
	struct amdgpu_device *adev;
	int i, ret = 0;

	mutex_lock(&mgpu_info.mutex);

	/*
	 * MGPU fan boost feature should be enabled
	 * only when there are two or more dGPUs in
	 * the system
	 */
	if (mgpu_info.num_dgpu < 2)
		goto out;

	for (i = 0; i < mgpu_info.num_dgpu; i++) {
		gpu_ins = &(mgpu_info.gpu_ins[i]);
		adev = gpu_ins->adev;
		if (!(adev->flags & AMD_IS_APU || amdgpu_sriov_multi_vf_mode(adev)) &&
		    !gpu_ins->mgpu_fan_enabled) {
			ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
			if (ret)
				break;

			gpu_ins->mgpu_fan_enabled = 1;
		}
	}

out:
	mutex_unlock(&mgpu_info.mutex);

	return ret;
}

/**
 * amdgpu_device_ip_late_init - run late init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Late initialization pass for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the late_init callbacks are run.
 * late_init covers any special initialization that an IP requires
 * after all of the IPs have been initialized or something that needs to
 * happen late in the init process.
 * Returns 0 on success, negative error code on failure.
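 *
 * For example, this is the point where clockgating and powergating are
 * enabled for the whole device, via the helpers defined above:
 *
 *   amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
 *   amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);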
 */
static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
{
	struct amdgpu_gpu_instance *gpu_instance;
	int i = 0, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->funcs->late_init) {
			r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]);
			if (r) {
				dev_err(adev->dev,
					"late_init of IP block <%s> failed %d\n",
					adev->ip_blocks[i].version->funcs->name,
					r);
				return r;
			}
		}
		adev->ip_blocks[i].status.late_initialized = true;
	}

	r = amdgpu_ras_late_init(adev);
	if (r) {
		dev_err(adev->dev, "amdgpu_ras_late_init failed %d", r);
		return r;
	}

	if (!amdgpu_reset_in_recovery(adev))
		amdgpu_ras_set_error_query_ready(adev, true);

	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);

	amdgpu_device_fill_reset_magic(adev);

	r = amdgpu_device_enable_mgpu_fan_boost();
	if (r)
		dev_err(adev->dev, "enable mgpu fan boost failed (%d).\n", r);

	/* For passthrough configuration on arcturus and aldebaran, enable special handling for SBR */
	if (amdgpu_passthrough(adev) &&
	    ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
	     adev->asic_type == CHIP_ALDEBARAN))
		amdgpu_dpm_handle_passthrough_sbr(adev, true);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		mutex_lock(&mgpu_info.mutex);

		/*
		 * Reset device p-state to low as this was booted with high.
		 *
		 * This should be performed only after all devices from the same
		 * hive get initialized.
		 *
		 * However, the number of devices in the hive is not known in
		 * advance; it is counted one by one as the devices initialize.
		 *
		 * So we wait until all XGMI interlinked devices have initialized.
		 * This may add some delay since those devices may come from
		 * different hives, but that should be OK.
3575 */ 3576 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3577 for (i = 0; i < mgpu_info.num_gpu; i++) { 3578 gpu_instance = &(mgpu_info.gpu_ins[i]); 3579 if (gpu_instance->adev->flags & AMD_IS_APU) 3580 continue; 3581 3582 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3583 AMDGPU_XGMI_PSTATE_MIN); 3584 if (r) { 3585 dev_err(adev->dev, 3586 "pstate setting failed (%d).\n", 3587 r); 3588 break; 3589 } 3590 } 3591 } 3592 3593 mutex_unlock(&mgpu_info.mutex); 3594 } 3595 3596 return 0; 3597 } 3598 3599 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block) 3600 { 3601 struct amdgpu_device *adev = ip_block->adev; 3602 int r; 3603 3604 if (!ip_block->version->funcs->hw_fini) { 3605 dev_err(adev->dev, "hw_fini of IP block <%s> not defined\n", 3606 ip_block->version->funcs->name); 3607 } else { 3608 r = ip_block->version->funcs->hw_fini(ip_block); 3609 /* XXX handle errors */ 3610 if (r) { 3611 dev_dbg(adev->dev, 3612 "hw_fini of IP block <%s> failed %d\n", 3613 ip_block->version->funcs->name, r); 3614 } 3615 } 3616 3617 ip_block->status.hw = false; 3618 } 3619 3620 /** 3621 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3622 * 3623 * @adev: amdgpu_device pointer 3624 * 3625 * For ASICs need to disable SMC first 3626 */ 3627 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3628 { 3629 int i; 3630 3631 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3632 return; 3633 3634 for (i = 0; i < adev->num_ip_blocks; i++) { 3635 if (!adev->ip_blocks[i].status.hw) 3636 continue; 3637 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3638 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3639 break; 3640 } 3641 } 3642 } 3643 3644 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3645 { 3646 int i, r; 3647 3648 for (i = 0; i < adev->num_ip_blocks; i++) { 3649 if (!adev->ip_blocks[i].version->funcs->early_fini) 3650 continue; 3651 3652 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]); 3653 if (r) { 3654 dev_dbg(adev->dev, 3655 "early_fini of IP block <%s> failed %d\n", 3656 adev->ip_blocks[i].version->funcs->name, r); 3657 } 3658 } 3659 3660 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3661 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3662 3663 amdgpu_amdkfd_suspend(adev, true); 3664 amdgpu_userq_suspend(adev); 3665 3666 /* Workaround for ASICs need to disable SMC first */ 3667 amdgpu_device_smu_fini_early(adev); 3668 3669 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3670 if (!adev->ip_blocks[i].status.hw) 3671 continue; 3672 3673 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3674 } 3675 3676 if (amdgpu_sriov_vf(adev)) { 3677 if (amdgpu_virt_release_full_gpu(adev, false)) 3678 dev_err(adev->dev, 3679 "failed to release exclusive mode on fini\n"); 3680 } 3681 3682 return 0; 3683 } 3684 3685 /** 3686 * amdgpu_device_ip_fini - run fini for hardware IPs 3687 * 3688 * @adev: amdgpu_device pointer 3689 * 3690 * Main teardown pass for hardware IPs. The list of all the hardware 3691 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3692 * are run. hw_fini tears down the hardware associated with each IP 3693 * and sw_fini tears down any software state associated with each IP. 3694 * Returns 0 on success, negative error code on failure. 
3695 */ 3696 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3697 { 3698 int i, r; 3699 3700 amdgpu_cper_fini(adev); 3701 3702 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3703 amdgpu_virt_release_ras_err_handler_data(adev); 3704 3705 if (adev->gmc.xgmi.num_physical_nodes > 1) 3706 amdgpu_xgmi_remove_device(adev); 3707 3708 amdgpu_amdkfd_device_fini_sw(adev); 3709 3710 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3711 if (!adev->ip_blocks[i].status.sw) 3712 continue; 3713 3714 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3715 amdgpu_ucode_free_bo(adev); 3716 amdgpu_free_static_csa(&adev->virt.csa_obj); 3717 amdgpu_device_wb_fini(adev); 3718 amdgpu_device_mem_scratch_fini(adev); 3719 amdgpu_ib_pool_fini(adev); 3720 amdgpu_seq64_fini(adev); 3721 amdgpu_doorbell_fini(adev); 3722 } 3723 if (adev->ip_blocks[i].version->funcs->sw_fini) { 3724 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]); 3725 /* XXX handle errors */ 3726 if (r) { 3727 dev_dbg(adev->dev, 3728 "sw_fini of IP block <%s> failed %d\n", 3729 adev->ip_blocks[i].version->funcs->name, 3730 r); 3731 } 3732 } 3733 adev->ip_blocks[i].status.sw = false; 3734 adev->ip_blocks[i].status.valid = false; 3735 } 3736 3737 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3738 if (!adev->ip_blocks[i].status.late_initialized) 3739 continue; 3740 if (adev->ip_blocks[i].version->funcs->late_fini) 3741 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]); 3742 adev->ip_blocks[i].status.late_initialized = false; 3743 } 3744 3745 amdgpu_ras_fini(adev); 3746 amdgpu_uid_fini(adev); 3747 3748 return 0; 3749 } 3750 3751 /** 3752 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3753 * 3754 * @work: work_struct. 3755 */ 3756 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3757 { 3758 struct amdgpu_device *adev = 3759 container_of(work, struct amdgpu_device, delayed_init_work.work); 3760 int r; 3761 3762 r = amdgpu_ib_ring_tests(adev); 3763 if (r) 3764 dev_err(adev->dev, "ib ring test failed (%d).\n", r); 3765 } 3766 3767 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3768 { 3769 struct amdgpu_device *adev = 3770 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3771 3772 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3773 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3774 3775 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true, 0)) 3776 adev->gfx.gfx_off_state = true; 3777 } 3778 3779 /** 3780 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3781 * 3782 * @adev: amdgpu_device pointer 3783 * 3784 * Main suspend function for hardware IPs. The list of all the hardware 3785 * IPs that make up the asic is walked, clockgating is disabled and the 3786 * suspend callbacks are run. suspend puts the hardware and software state 3787 * in each IP into a state suitable for suspend. 3788 * Returns 0 on success, negative error code on failure. 3789 */ 3790 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3791 { 3792 int i, r, rec; 3793 3794 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3795 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3796 3797 /* 3798 * Per PMFW team's suggestion, driver needs to handle gfxoff 3799 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3800 * scenario. Add the missing df cstate disablement here. 
3801 */ 3802 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3803 dev_warn(adev->dev, "Failed to disallow df cstate"); 3804 3805 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3806 if (!adev->ip_blocks[i].status.valid) 3807 continue; 3808 3809 /* displays are handled separately */ 3810 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3811 continue; 3812 3813 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3814 if (r) 3815 goto unwind; 3816 } 3817 3818 return 0; 3819 unwind: 3820 rec = amdgpu_device_ip_resume_phase3(adev); 3821 if (rec) 3822 dev_err(adev->dev, 3823 "amdgpu_device_ip_resume_phase3 failed during unwind: %d\n", 3824 rec); 3825 3826 amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW); 3827 3828 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3829 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3830 3831 return r; 3832 } 3833 3834 /** 3835 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3836 * 3837 * @adev: amdgpu_device pointer 3838 * 3839 * Main suspend function for hardware IPs. The list of all the hardware 3840 * IPs that make up the asic is walked, clockgating is disabled and the 3841 * suspend callbacks are run. suspend puts the hardware and software state 3842 * in each IP into a state suitable for suspend. 3843 * Returns 0 on success, negative error code on failure. 3844 */ 3845 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3846 { 3847 int i, r, rec; 3848 3849 if (adev->in_s0ix) 3850 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3851 3852 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3853 if (!adev->ip_blocks[i].status.valid) 3854 continue; 3855 /* displays are handled in phase1 */ 3856 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3857 continue; 3858 /* PSP lost connection when err_event_athub occurs */ 3859 if (amdgpu_ras_intr_triggered() && 3860 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3861 adev->ip_blocks[i].status.hw = false; 3862 continue; 3863 } 3864 3865 /* skip unnecessary suspend if we do not initialize them yet */ 3866 if (!amdgpu_ip_member_of_hwini( 3867 adev, adev->ip_blocks[i].version->type)) 3868 continue; 3869 3870 /* Since we skip suspend for S0i3, we need to cancel the delayed 3871 * idle work here as the suspend callback never gets called. 3872 */ 3873 if (adev->in_s0ix && 3874 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX && 3875 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 0, 0)) 3876 cancel_delayed_work_sync(&adev->gfx.idle_work); 3877 /* skip suspend of gfx/mes and psp for S0ix 3878 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3879 * like at runtime. PSP is also part of the always on hardware 3880 * so no need to suspend it. 3881 */ 3882 if (adev->in_s0ix && 3883 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3884 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3885 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3886 continue; 3887 3888 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3889 if (adev->in_s0ix && 3890 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3891 IP_VERSION(5, 0, 0)) && 3892 (adev->ip_blocks[i].version->type == 3893 AMD_IP_BLOCK_TYPE_SDMA)) 3894 continue; 3895 3896 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 
3897 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3898 * from this location and RLC Autoload automatically also gets loaded 3899 * from here based on PMFW -> PSP message during re-init sequence. 3900 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3901 * the TMR and reload FWs again for IMU enabled APU ASICs. 3902 */ 3903 if (amdgpu_in_reset(adev) && 3904 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3905 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3906 continue; 3907 3908 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3909 if (r) 3910 goto unwind; 3911 3912 /* handle putting the SMC in the appropriate state */ 3913 if (!amdgpu_sriov_vf(adev)) { 3914 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3915 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3916 if (r) { 3917 dev_err(adev->dev, 3918 "SMC failed to set mp1 state %d, %d\n", 3919 adev->mp1_state, r); 3920 goto unwind; 3921 } 3922 } 3923 } 3924 } 3925 3926 return 0; 3927 unwind: 3928 /* suspend phase 2 = resume phase 1 + resume phase 2 */ 3929 rec = amdgpu_device_ip_resume_phase1(adev); 3930 if (rec) { 3931 dev_err(adev->dev, 3932 "amdgpu_device_ip_resume_phase1 failed during unwind: %d\n", 3933 rec); 3934 return r; 3935 } 3936 3937 rec = amdgpu_device_fw_loading(adev); 3938 if (rec) { 3939 dev_err(adev->dev, 3940 "amdgpu_device_fw_loading failed during unwind: %d\n", 3941 rec); 3942 return r; 3943 } 3944 3945 rec = amdgpu_device_ip_resume_phase2(adev); 3946 if (rec) { 3947 dev_err(adev->dev, 3948 "amdgpu_device_ip_resume_phase2 failed during unwind: %d\n", 3949 rec); 3950 return r; 3951 } 3952 3953 return r; 3954 } 3955 3956 /** 3957 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3958 * 3959 * @adev: amdgpu_device pointer 3960 * 3961 * Main suspend function for hardware IPs. The list of all the hardware 3962 * IPs that make up the asic is walked, clockgating is disabled and the 3963 * suspend callbacks are run. suspend puts the hardware and software state 3964 * in each IP into a state suitable for suspend. 3965 * Returns 0 on success, negative error code on failure. 
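 *
 * Rough call sequence, as implemented below (illustrative only):
 *   amdgpu_virt_fini_data_exchange() / amdgpu_virt_request_full_gpu()  (SR-IOV)
 *   amdgpu_ttm_set_buffer_funcs_status(adev, false)
 *   amdgpu_device_ip_suspend_phase1()  - display (DCE) blocks
 *   amdgpu_device_ip_suspend_phase2()  - all remaining blocks
 *   amdgpu_virt_release_full_gpu()  (SR-IOV)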
3966 */ 3967 static int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3968 { 3969 int r; 3970 3971 if (amdgpu_sriov_vf(adev)) { 3972 amdgpu_virt_fini_data_exchange(adev); 3973 amdgpu_virt_request_full_gpu(adev, false); 3974 } 3975 3976 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3977 3978 r = amdgpu_device_ip_suspend_phase1(adev); 3979 if (r) 3980 return r; 3981 r = amdgpu_device_ip_suspend_phase2(adev); 3982 3983 if (amdgpu_sriov_vf(adev)) 3984 amdgpu_virt_release_full_gpu(adev, false); 3985 3986 return r; 3987 } 3988 3989 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3990 { 3991 int i, r; 3992 3993 static enum amd_ip_block_type ip_order[] = { 3994 AMD_IP_BLOCK_TYPE_COMMON, 3995 AMD_IP_BLOCK_TYPE_GMC, 3996 AMD_IP_BLOCK_TYPE_PSP, 3997 AMD_IP_BLOCK_TYPE_IH, 3998 }; 3999 4000 for (i = 0; i < adev->num_ip_blocks; i++) { 4001 int j; 4002 struct amdgpu_ip_block *block; 4003 4004 block = &adev->ip_blocks[i]; 4005 block->status.hw = false; 4006 4007 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 4008 4009 if (block->version->type != ip_order[j] || 4010 !block->status.valid) 4011 continue; 4012 4013 r = block->version->funcs->hw_init(&adev->ip_blocks[i]); 4014 if (r) { 4015 dev_err(adev->dev, "RE-INIT-early: %s failed\n", 4016 block->version->funcs->name); 4017 return r; 4018 } 4019 block->status.hw = true; 4020 } 4021 } 4022 4023 return 0; 4024 } 4025 4026 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 4027 { 4028 struct amdgpu_ip_block *block; 4029 int i, r = 0; 4030 4031 static enum amd_ip_block_type ip_order[] = { 4032 AMD_IP_BLOCK_TYPE_SMC, 4033 AMD_IP_BLOCK_TYPE_DCE, 4034 AMD_IP_BLOCK_TYPE_GFX, 4035 AMD_IP_BLOCK_TYPE_SDMA, 4036 AMD_IP_BLOCK_TYPE_MES, 4037 AMD_IP_BLOCK_TYPE_UVD, 4038 AMD_IP_BLOCK_TYPE_VCE, 4039 AMD_IP_BLOCK_TYPE_VCN, 4040 AMD_IP_BLOCK_TYPE_JPEG 4041 }; 4042 4043 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 4044 block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]); 4045 4046 if (!block) 4047 continue; 4048 4049 if (block->status.valid && !block->status.hw) { 4050 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) { 4051 r = amdgpu_ip_block_resume(block); 4052 } else { 4053 r = block->version->funcs->hw_init(block); 4054 } 4055 4056 if (r) { 4057 dev_err(adev->dev, "RE-INIT-late: %s failed\n", 4058 block->version->funcs->name); 4059 break; 4060 } 4061 block->status.hw = true; 4062 } 4063 } 4064 4065 return r; 4066 } 4067 4068 /** 4069 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 4070 * 4071 * @adev: amdgpu_device pointer 4072 * 4073 * First resume function for hardware IPs. The list of all the hardware 4074 * IPs that make up the asic is walked and the resume callbacks are run for 4075 * COMMON, GMC, and IH. resume puts the hardware into a functional state 4076 * after a suspend and updates the software state as necessary. This 4077 * function is also used for restoring the GPU after a GPU reset. 4078 * Returns 0 on success, negative error code on failure. 
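 * When running as an SR-IOV VF, the PSP block is also resumed in this phase
 * (see the type check below).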
4079 */ 4080 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 4081 { 4082 int i, r; 4083 4084 for (i = 0; i < adev->num_ip_blocks; i++) { 4085 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 4086 continue; 4087 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 4088 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 4089 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 4090 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 4091 4092 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 4093 if (r) 4094 return r; 4095 } 4096 } 4097 4098 return 0; 4099 } 4100 4101 /** 4102 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 4103 * 4104 * @adev: amdgpu_device pointer 4105 * 4106 * Second resume function for hardware IPs. The list of all the hardware 4107 * IPs that make up the asic is walked and the resume callbacks are run for 4108 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 4109 * functional state after a suspend and updates the software state as 4110 * necessary. This function is also used for restoring the GPU after a GPU 4111 * reset. 4112 * Returns 0 on success, negative error code on failure. 4113 */ 4114 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 4115 { 4116 int i, r; 4117 4118 for (i = 0; i < adev->num_ip_blocks; i++) { 4119 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 4120 continue; 4121 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 4122 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 4123 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 4124 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE || 4125 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 4126 continue; 4127 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 4128 if (r) 4129 return r; 4130 } 4131 4132 return 0; 4133 } 4134 4135 /** 4136 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs 4137 * 4138 * @adev: amdgpu_device pointer 4139 * 4140 * Third resume function for hardware IPs. The list of all the hardware 4141 * IPs that make up the asic is walked and the resume callbacks are run for 4142 * all DCE. resume puts the hardware into a functional state after a suspend 4143 * and updates the software state as necessary. This function is also used 4144 * for restoring the GPU after a GPU reset. 4145 * 4146 * Returns 0 on success, negative error code on failure. 4147 */ 4148 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev) 4149 { 4150 int i, r; 4151 4152 for (i = 0; i < adev->num_ip_blocks; i++) { 4153 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 4154 continue; 4155 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) { 4156 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 4157 if (r) 4158 return r; 4159 } 4160 } 4161 4162 return 0; 4163 } 4164 4165 /** 4166 * amdgpu_device_ip_resume - run resume for hardware IPs 4167 * 4168 * @adev: amdgpu_device pointer 4169 * 4170 * Main resume function for hardware IPs. The hardware IPs 4171 * are split into two resume functions because they are 4172 * also used in recovering from a GPU reset and some additional 4173 * steps need to be take between them. In this case (S3/S4) they are 4174 * run sequentially. 4175 * Returns 0 on success, negative error code on failure. 
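 *
 * Rough order of operations, as implemented below (illustrative only):
 *   amdgpu_device_ip_resume_phase1()  - COMMON, GMC, IH (and PSP for SR-IOV)
 *   amdgpu_device_fw_loading()
 *   amdgpu_device_ip_resume_phase2()  - remaining blocks except DCE
 *   amdgpu_ttm_set_buffer_funcs_status(adev, true)  - if the buffer ring is ready
 *   amdgpu_fence_driver_hw_init()
 *   amdgpu_device_ip_resume_phase3()  - DCE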
4176 */ 4177 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 4178 { 4179 int r; 4180 4181 r = amdgpu_device_ip_resume_phase1(adev); 4182 if (r) 4183 return r; 4184 4185 r = amdgpu_device_fw_loading(adev); 4186 if (r) 4187 return r; 4188 4189 r = amdgpu_device_ip_resume_phase2(adev); 4190 4191 if (adev->mman.buffer_funcs_ring->sched.ready) 4192 amdgpu_ttm_set_buffer_funcs_status(adev, true); 4193 4194 if (r) 4195 return r; 4196 4197 amdgpu_fence_driver_hw_init(adev); 4198 4199 r = amdgpu_device_ip_resume_phase3(adev); 4200 4201 return r; 4202 } 4203 4204 /** 4205 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 4206 * 4207 * @adev: amdgpu_device pointer 4208 * 4209 * Query the VBIOS data tables to determine if the board supports SR-IOV. 4210 */ 4211 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 4212 { 4213 if (amdgpu_sriov_vf(adev)) { 4214 if (adev->is_atom_fw) { 4215 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 4216 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 4217 } else { 4218 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 4219 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 4220 } 4221 4222 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 4223 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 4224 } 4225 } 4226 4227 /** 4228 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 4229 * 4230 * @pdev : pci device context 4231 * @asic_type: AMD asic type 4232 * 4233 * Check if there is DC (new modesetting infrastructure) support for an asic. 4234 * Returns true if DC has support, false if not. 4235 */ 4236 bool amdgpu_device_asic_has_dc_support(struct pci_dev *pdev, 4237 enum amd_asic_type asic_type) 4238 { 4239 switch (asic_type) { 4240 #ifdef CONFIG_DRM_AMDGPU_SI 4241 case CHIP_HAINAN: 4242 #endif 4243 case CHIP_TOPAZ: 4244 /* chips with no display hardware */ 4245 return false; 4246 #if defined(CONFIG_DRM_AMD_DC) 4247 case CHIP_TAHITI: 4248 case CHIP_PITCAIRN: 4249 case CHIP_VERDE: 4250 case CHIP_OLAND: 4251 /* 4252 * We have systems in the wild with these ASICs that require 4253 * LVDS and VGA support which is not supported with DC. 4254 * 4255 * Fall back to the non-DC driver here by default so as not to 4256 * cause regressions. 4257 */ 4258 #if defined(CONFIG_DRM_AMD_DC_SI) 4259 return amdgpu_dc > 0; 4260 #else 4261 return false; 4262 #endif 4263 case CHIP_KAVERI: 4264 case CHIP_KABINI: 4265 case CHIP_MULLINS: 4266 /* 4267 * We have systems in the wild with these ASICs that require 4268 * VGA support which is not supported with DC. 4269 * 4270 * Fall back to the non-DC driver here by default so as not to 4271 * cause regressions.
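 *
 * In practice DC is only used on these parts when it is explicitly
 * requested with amdgpu.dc=1 (see the amdgpu_dc > 0 check below).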
4272 */ 4273 return amdgpu_dc > 0; 4274 default: 4275 return amdgpu_dc != 0; 4276 #else 4277 default: 4278 if (amdgpu_dc > 0) 4279 dev_info_once( 4280 &pdev->dev, 4281 "Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 4282 return false; 4283 #endif 4284 } 4285 } 4286 4287 /** 4288 * amdgpu_device_has_dc_support - check if dc is supported 4289 * 4290 * @adev: amdgpu_device pointer 4291 * 4292 * Returns true for supported, false for not supported 4293 */ 4294 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 4295 { 4296 if (adev->enable_virtual_display || 4297 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 4298 return false; 4299 4300 return amdgpu_device_asic_has_dc_support(adev->pdev, adev->asic_type); 4301 } 4302 4303 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 4304 { 4305 struct amdgpu_device *adev = 4306 container_of(__work, struct amdgpu_device, xgmi_reset_work); 4307 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 4308 4309 /* It's a bug to not have a hive within this function */ 4310 if (WARN_ON(!hive)) 4311 return; 4312 4313 /* 4314 * Use task barrier to synchronize all xgmi reset works across the 4315 * hive. task_barrier_enter and task_barrier_exit will block 4316 * until all the threads running the xgmi reset works reach 4317 * those points. task_barrier_full will do both blocks. 4318 */ 4319 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 4320 4321 task_barrier_enter(&hive->tb); 4322 adev->asic_reset_res = amdgpu_device_baco_enter(adev); 4323 4324 if (adev->asic_reset_res) 4325 goto fail; 4326 4327 task_barrier_exit(&hive->tb); 4328 adev->asic_reset_res = amdgpu_device_baco_exit(adev); 4329 4330 if (adev->asic_reset_res) 4331 goto fail; 4332 4333 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 4334 } else { 4335 4336 task_barrier_full(&hive->tb); 4337 adev->asic_reset_res = amdgpu_asic_reset(adev); 4338 } 4339 4340 fail: 4341 if (adev->asic_reset_res) 4342 dev_warn(adev->dev, 4343 "ASIC reset failed with error, %d for drm dev, %s", 4344 adev->asic_reset_res, adev_to_drm(adev)->unique); 4345 amdgpu_put_xgmi_hive(hive); 4346 } 4347 4348 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 4349 { 4350 char *input = amdgpu_lockup_timeout; 4351 char *timeout_setting = NULL; 4352 int index = 0; 4353 long timeout; 4354 int ret = 0; 4355 4356 /* By default timeout for all queues is 2 sec */ 4357 adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout = 4358 adev->video_timeout = msecs_to_jiffies(2000); 4359 4360 if (!strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) 4361 return 0; 4362 4363 while ((timeout_setting = strsep(&input, ",")) && 4364 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4365 ret = kstrtol(timeout_setting, 0, &timeout); 4366 if (ret) 4367 return ret; 4368 4369 if (timeout == 0) { 4370 index++; 4371 continue; 4372 } else if (timeout < 0) { 4373 timeout = MAX_SCHEDULE_TIMEOUT; 4374 dev_warn(adev->dev, "lockup timeout disabled"); 4375 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 4376 } else { 4377 timeout = msecs_to_jiffies(timeout); 4378 } 4379 4380 switch (index++) { 4381 case 0: 4382 adev->gfx_timeout = timeout; 4383 break; 4384 case 1: 4385 adev->compute_timeout = timeout; 4386 break; 4387 case 2: 4388 adev->sdma_timeout = timeout; 4389 break; 4390 case 3: 4391 adev->video_timeout = timeout; 4392 break; 4393 default: 4394 break; 4395 } 4396 } 4397 4398 /* When only one value specified apply it 
to all queues. */ 4399 if (index == 1) 4400 adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout = 4401 adev->video_timeout = timeout; 4402 4403 return ret; 4404 } 4405 4406 /** 4407 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 4408 * 4409 * @adev: amdgpu_device pointer 4410 * 4411 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 4412 */ 4413 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 4414 { 4415 struct iommu_domain *domain; 4416 4417 domain = iommu_get_domain_for_dev(adev->dev); 4418 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 4419 adev->ram_is_direct_mapped = true; 4420 } 4421 4422 #if defined(CONFIG_HSA_AMD_P2P) 4423 /** 4424 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled. 4425 * 4426 * @adev: amdgpu_device pointer 4427 * 4428 * return if IOMMU remapping bar address 4429 */ 4430 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev) 4431 { 4432 struct iommu_domain *domain; 4433 4434 domain = iommu_get_domain_for_dev(adev->dev); 4435 if (domain && (domain->type == IOMMU_DOMAIN_DMA || 4436 domain->type == IOMMU_DOMAIN_DMA_FQ)) 4437 return true; 4438 4439 return false; 4440 } 4441 #endif 4442 4443 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 4444 { 4445 if (amdgpu_mcbp == 1) 4446 adev->gfx.mcbp = true; 4447 else if (amdgpu_mcbp == 0) 4448 adev->gfx.mcbp = false; 4449 4450 if (amdgpu_sriov_vf(adev)) 4451 adev->gfx.mcbp = true; 4452 4453 if (adev->gfx.mcbp) 4454 dev_info(adev->dev, "MCBP is enabled\n"); 4455 } 4456 4457 static int amdgpu_device_sys_interface_init(struct amdgpu_device *adev) 4458 { 4459 int r; 4460 4461 r = amdgpu_atombios_sysfs_init(adev); 4462 if (r) 4463 drm_err(&adev->ddev, 4464 "registering atombios sysfs failed (%d).\n", r); 4465 4466 r = amdgpu_pm_sysfs_init(adev); 4467 if (r) 4468 dev_err(adev->dev, "registering pm sysfs failed (%d).\n", r); 4469 4470 r = amdgpu_ucode_sysfs_init(adev); 4471 if (r) { 4472 adev->ucode_sysfs_en = false; 4473 dev_err(adev->dev, "Creating firmware sysfs failed (%d).\n", r); 4474 } else 4475 adev->ucode_sysfs_en = true; 4476 4477 r = amdgpu_device_attr_sysfs_init(adev); 4478 if (r) 4479 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4480 4481 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4482 if (r) 4483 dev_err(adev->dev, 4484 "Could not create amdgpu board attributes\n"); 4485 4486 amdgpu_fru_sysfs_init(adev); 4487 amdgpu_reg_state_sysfs_init(adev); 4488 amdgpu_xcp_sysfs_init(adev); 4489 4490 return r; 4491 } 4492 4493 static void amdgpu_device_sys_interface_fini(struct amdgpu_device *adev) 4494 { 4495 if (adev->pm.sysfs_initialized) 4496 amdgpu_pm_sysfs_fini(adev); 4497 if (adev->ucode_sysfs_en) 4498 amdgpu_ucode_sysfs_fini(adev); 4499 amdgpu_device_attr_sysfs_fini(adev); 4500 amdgpu_fru_sysfs_fini(adev); 4501 4502 amdgpu_reg_state_sysfs_fini(adev); 4503 amdgpu_xcp_sysfs_fini(adev); 4504 } 4505 4506 /** 4507 * amdgpu_device_init - initialize the driver 4508 * 4509 * @adev: amdgpu_device pointer 4510 * @flags: driver flags 4511 * 4512 * Initializes the driver info and hw (all asics). 4513 * Returns 0 for success or an error on failure. 4514 * Called at driver startup. 
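 *
 * Very roughly, the sequence below is (illustrative, not exhaustive):
 *   MMIO mapping, locks, work items and the reset domain
 *   amdgpu_device_ip_early_init()
 *   optional ASIC reset on init, vBIOS post, clock and i2c setup
 *   amdgpu_fence_driver_sw_init(), drm_mode_config_init(), amdgpu_device_ip_init()
 *   amdgpu_device_ip_late_init(), sysfs/PMU registration, vga_switcheroo and
 *   the PM notifier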
4515 */ 4516 int amdgpu_device_init(struct amdgpu_device *adev, 4517 uint32_t flags) 4518 { 4519 struct pci_dev *pdev = adev->pdev; 4520 int r, i; 4521 bool px = false; 4522 u32 max_MBps; 4523 int tmp; 4524 4525 adev->shutdown = false; 4526 adev->flags = flags; 4527 4528 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 4529 adev->asic_type = amdgpu_force_asic_type; 4530 else 4531 adev->asic_type = flags & AMD_ASIC_MASK; 4532 4533 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 4534 if (amdgpu_emu_mode == 1) 4535 adev->usec_timeout *= 10; 4536 adev->gmc.gart_size = 512 * 1024 * 1024; 4537 adev->accel_working = false; 4538 adev->num_rings = 0; 4539 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 4540 adev->mman.buffer_funcs = NULL; 4541 adev->mman.buffer_funcs_ring = NULL; 4542 adev->vm_manager.vm_pte_funcs = NULL; 4543 adev->vm_manager.vm_pte_num_scheds = 0; 4544 adev->gmc.gmc_funcs = NULL; 4545 adev->harvest_ip_mask = 0x0; 4546 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 4547 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 4548 4549 adev->smc_rreg = &amdgpu_invalid_rreg; 4550 adev->smc_wreg = &amdgpu_invalid_wreg; 4551 adev->pcie_rreg = &amdgpu_invalid_rreg; 4552 adev->pcie_wreg = &amdgpu_invalid_wreg; 4553 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 4554 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 4555 adev->pciep_rreg = &amdgpu_invalid_rreg; 4556 adev->pciep_wreg = &amdgpu_invalid_wreg; 4557 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 4558 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 4559 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 4560 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 4561 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 4562 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 4563 adev->didt_rreg = &amdgpu_invalid_rreg; 4564 adev->didt_wreg = &amdgpu_invalid_wreg; 4565 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 4566 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 4567 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 4568 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 4569 4570 dev_info( 4571 adev->dev, 4572 "initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 4573 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 4574 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 4575 4576 /* mutex initialization are all done here so we 4577 * can recall function without having locking issues 4578 */ 4579 mutex_init(&adev->firmware.mutex); 4580 mutex_init(&adev->pm.mutex); 4581 mutex_init(&adev->gfx.gpu_clock_mutex); 4582 mutex_init(&adev->srbm_mutex); 4583 mutex_init(&adev->gfx.pipe_reserve_mutex); 4584 mutex_init(&adev->gfx.gfx_off_mutex); 4585 mutex_init(&adev->gfx.partition_mutex); 4586 mutex_init(&adev->grbm_idx_mutex); 4587 mutex_init(&adev->mn_lock); 4588 mutex_init(&adev->virt.vf_errors.lock); 4589 hash_init(adev->mn_hash); 4590 mutex_init(&adev->psp.mutex); 4591 mutex_init(&adev->notifier_lock); 4592 mutex_init(&adev->pm.stable_pstate_ctx_lock); 4593 mutex_init(&adev->benchmark_mutex); 4594 mutex_init(&adev->gfx.reset_sem_mutex); 4595 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */ 4596 mutex_init(&adev->enforce_isolation_mutex); 4597 for (i = 0; i < MAX_XCP; ++i) { 4598 adev->isolation[i].spearhead = dma_fence_get_stub(); 4599 amdgpu_sync_create(&adev->isolation[i].active); 4600 amdgpu_sync_create(&adev->isolation[i].prev); 4601 } 4602 mutex_init(&adev->gfx.userq_sch_mutex); 4603 
mutex_init(&adev->gfx.workload_profile_mutex); 4604 mutex_init(&adev->vcn.workload_profile_mutex); 4605 4606 amdgpu_device_init_apu_flags(adev); 4607 4608 r = amdgpu_device_check_arguments(adev); 4609 if (r) 4610 return r; 4611 4612 spin_lock_init(&adev->mmio_idx_lock); 4613 spin_lock_init(&adev->smc_idx_lock); 4614 spin_lock_init(&adev->pcie_idx_lock); 4615 spin_lock_init(&adev->uvd_ctx_idx_lock); 4616 spin_lock_init(&adev->didt_idx_lock); 4617 spin_lock_init(&adev->gc_cac_idx_lock); 4618 spin_lock_init(&adev->se_cac_idx_lock); 4619 spin_lock_init(&adev->audio_endpt_idx_lock); 4620 spin_lock_init(&adev->mm_stats.lock); 4621 spin_lock_init(&adev->virt.rlcg_reg_lock); 4622 spin_lock_init(&adev->wb.lock); 4623 4624 xa_init_flags(&adev->userq_xa, XA_FLAGS_LOCK_IRQ); 4625 4626 INIT_LIST_HEAD(&adev->reset_list); 4627 4628 INIT_LIST_HEAD(&adev->ras_list); 4629 4630 INIT_LIST_HEAD(&adev->pm.od_kobj_list); 4631 4632 xa_init(&adev->userq_doorbell_xa); 4633 4634 INIT_DELAYED_WORK(&adev->delayed_init_work, 4635 amdgpu_device_delayed_init_work_handler); 4636 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 4637 amdgpu_device_delay_enable_gfx_off); 4638 /* 4639 * Initialize the enforce_isolation work structures for each XCP 4640 * partition. This work handler is responsible for enforcing shader 4641 * isolation on AMD GPUs. It counts the number of emitted fences for 4642 * each GFX and compute ring. If there are any fences, it schedules 4643 * the `enforce_isolation_work` to be run after a delay. If there are 4644 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the 4645 * runqueue. 4646 */ 4647 for (i = 0; i < MAX_XCP; i++) { 4648 INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work, 4649 amdgpu_gfx_enforce_isolation_handler); 4650 adev->gfx.enforce_isolation[i].adev = adev; 4651 adev->gfx.enforce_isolation[i].xcp_id = i; 4652 } 4653 4654 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 4655 INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work); 4656 4657 adev->gfx.gfx_off_req_count = 1; 4658 adev->gfx.gfx_off_residency = 0; 4659 adev->gfx.gfx_off_entrycount = 0; 4660 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 4661 4662 atomic_set(&adev->throttling_logging_enabled, 1); 4663 /* 4664 * If throttling continues, logging will be performed every minute 4665 * to avoid log flooding. "-1" is subtracted since the thermal 4666 * throttling interrupt comes every second. Thus, the total logging 4667 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting 4668 * for throttling interrupt) = 60 seconds.
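 * In code this is the ratelimit_state_init() call right below, which
 * allows at most one such message per 59 second window.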
4669 */ 4670 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 4671 4672 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 4673 4674 /* Registers mapping */ 4675 /* TODO: block userspace mapping of io register */ 4676 if (adev->asic_type >= CHIP_BONAIRE) { 4677 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 4678 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 4679 } else { 4680 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 4681 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 4682 } 4683 4684 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 4685 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 4686 4687 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 4688 if (!adev->rmmio) 4689 return -ENOMEM; 4690 4691 dev_info(adev->dev, "register mmio base: 0x%08X\n", 4692 (uint32_t)adev->rmmio_base); 4693 dev_info(adev->dev, "register mmio size: %u\n", 4694 (unsigned int)adev->rmmio_size); 4695 4696 /* 4697 * The reset domain needs to be present early, before the XGMI hive (if any) 4698 * is discovered and initialized, so that the reset sem and in_gpu reset flag 4699 * can be used early on during init and before calling RREG32. 4700 */ 4701 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 4702 if (!adev->reset_domain) 4703 return -ENOMEM; 4704 4705 /* detect hw virtualization here */ 4706 amdgpu_virt_init(adev); 4707 4708 amdgpu_device_get_pcie_info(adev); 4709 4710 r = amdgpu_device_get_job_timeout_settings(adev); 4711 if (r) { 4712 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 4713 return r; 4714 } 4715 4716 amdgpu_device_set_mcbp(adev); 4717 4718 /* 4719 * By default, use default mode where all blocks are expected to be 4720 * initialized. At present a 'swinit' of blocks is required to be 4721 * completed before the need for a different level is detected. 4722 */ 4723 amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT); 4724 /* early init functions */ 4725 r = amdgpu_device_ip_early_init(adev); 4726 if (r) 4727 return r; 4728 4729 /* 4730 * No need to remove conflicting FBs for non-display class devices. 4731 * This prevents the sysfb from being freed accidentally. 4732 */ 4733 if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA || 4734 (pdev->class >> 8) == PCI_CLASS_DISPLAY_OTHER) { 4735 /* Get rid of things like offb */ 4736 r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name); 4737 if (r) 4738 return r; 4739 } 4740 4741 /* Enable TMZ based on IP_VERSION */ 4742 amdgpu_gmc_tmz_set(adev); 4743 4744 if (amdgpu_sriov_vf(adev) && 4745 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0)) 4746 /* VF MMIO access (except mailbox range) from CPU 4747 * will be blocked during sriov runtime 4748 */ 4749 adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT; 4750 4751 amdgpu_gmc_noretry_set(adev); 4752 /* Need to get xgmi info early to decide the reset behavior */ 4753 if (adev->gmc.xgmi.supported) { 4754 r = adev->gfxhub.funcs->get_xgmi_info(adev); 4755 if (r) 4756 return r; 4757 } 4758 4759 /* enable PCIE atomic ops */ 4760 if (amdgpu_sriov_vf(adev)) { 4761 if (adev->virt.fw_reserve.p_pf2vf) 4762 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 4763 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 4764 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4765 /* APUs with gfx9 onwards don't rely on PCIe atomics; rather, an 4766 * internal path natively supports atomics, so set have_atomics_support to true.
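 * dGPUs instead take the pci_enable_atomic_ops_to_root() path below and
 * request 32- and 64-bit AtomicOp completer support from the root port.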
4767 */ 4768 } else if ((adev->flags & AMD_IS_APU) && 4769 (amdgpu_ip_version(adev, GC_HWIP, 0) > 4770 IP_VERSION(9, 0, 0))) { 4771 adev->have_atomics_support = true; 4772 } else { 4773 adev->have_atomics_support = 4774 !pci_enable_atomic_ops_to_root(adev->pdev, 4775 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 4776 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4777 } 4778 4779 if (!adev->have_atomics_support) 4780 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 4781 4782 /* doorbell bar mapping and doorbell index init*/ 4783 amdgpu_doorbell_init(adev); 4784 4785 if (amdgpu_emu_mode == 1) { 4786 /* post the asic on emulation mode */ 4787 emu_soc_asic_init(adev); 4788 goto fence_driver_init; 4789 } 4790 4791 amdgpu_reset_init(adev); 4792 4793 /* detect if we are with an SRIOV vbios */ 4794 if (adev->bios) 4795 amdgpu_device_detect_sriov_bios(adev); 4796 4797 /* check if we need to reset the asic 4798 * E.g., driver was not cleanly unloaded previously, etc. 4799 */ 4800 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4801 if (adev->gmc.xgmi.num_physical_nodes) { 4802 dev_info(adev->dev, "Pending hive reset.\n"); 4803 amdgpu_set_init_level(adev, 4804 AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 4805 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && 4806 !amdgpu_device_has_display_hardware(adev)) { 4807 r = psp_gpu_reset(adev); 4808 } else { 4809 tmp = amdgpu_reset_method; 4810 /* It should do a default reset when loading or reloading the driver, 4811 * regardless of the module parameter reset_method. 4812 */ 4813 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4814 r = amdgpu_asic_reset(adev); 4815 amdgpu_reset_method = tmp; 4816 } 4817 4818 if (r) { 4819 dev_err(adev->dev, "asic reset on init failed\n"); 4820 goto failed; 4821 } 4822 } 4823 4824 /* Post card if necessary */ 4825 if (amdgpu_device_need_post(adev)) { 4826 if (!adev->bios) { 4827 dev_err(adev->dev, "no vBIOS found\n"); 4828 r = -EINVAL; 4829 goto failed; 4830 } 4831 dev_info(adev->dev, "GPU posting now...\n"); 4832 r = amdgpu_device_asic_init(adev); 4833 if (r) { 4834 dev_err(adev->dev, "gpu post error!\n"); 4835 goto failed; 4836 } 4837 } 4838 4839 if (adev->bios) { 4840 if (adev->is_atom_fw) { 4841 /* Initialize clocks */ 4842 r = amdgpu_atomfirmware_get_clock_info(adev); 4843 if (r) { 4844 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4845 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4846 goto failed; 4847 } 4848 } else { 4849 /* Initialize clocks */ 4850 r = amdgpu_atombios_get_clock_info(adev); 4851 if (r) { 4852 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4853 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4854 goto failed; 4855 } 4856 /* init i2c buses */ 4857 amdgpu_i2c_init(adev); 4858 } 4859 } 4860 4861 fence_driver_init: 4862 /* Fence driver */ 4863 r = amdgpu_fence_driver_sw_init(adev); 4864 if (r) { 4865 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4866 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4867 goto failed; 4868 } 4869 4870 /* init the mode config */ 4871 drm_mode_config_init(adev_to_drm(adev)); 4872 4873 r = amdgpu_device_ip_init(adev); 4874 if (r) { 4875 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4876 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4877 goto release_ras_con; 4878 } 4879 4880 amdgpu_fence_driver_hw_init(adev); 4881 4882 dev_info(adev->dev, 4883 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4884 
adev->gfx.config.max_shader_engines, 4885 adev->gfx.config.max_sh_per_se, 4886 adev->gfx.config.max_cu_per_sh, 4887 adev->gfx.cu_info.number); 4888 4889 adev->accel_working = true; 4890 4891 amdgpu_vm_check_compute_bug(adev); 4892 4893 /* Initialize the buffer migration limit. */ 4894 if (amdgpu_moverate >= 0) 4895 max_MBps = amdgpu_moverate; 4896 else 4897 max_MBps = 8; /* Allow 8 MB/s. */ 4898 /* Get a log2 for easy divisions. */ 4899 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4900 4901 /* 4902 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4903 * Otherwise the mgpu fan boost feature will be skipped because the 4904 * gpu instance count would be too low. 4905 */ 4906 amdgpu_register_gpu_instance(adev); 4907 4908 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4909 * explicit gating rather than handling it automatically. 4910 */ 4911 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 4912 r = amdgpu_device_ip_late_init(adev); 4913 if (r) { 4914 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4915 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4916 goto release_ras_con; 4917 } 4918 /* must succeed. */ 4919 amdgpu_ras_resume(adev); 4920 queue_delayed_work(system_wq, &adev->delayed_init_work, 4921 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4922 } 4923 4924 if (amdgpu_sriov_vf(adev)) { 4925 amdgpu_virt_release_full_gpu(adev, true); 4926 flush_delayed_work(&adev->delayed_init_work); 4927 } 4928 4929 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 4930 amdgpu_xgmi_reset_on_init(adev); 4931 /* 4932 * Place the sysfs registration after `late_init`, as some of the 4933 * operations performed in `late_init` can affect how the sysfs 4934 * interfaces are created. 4935 */ 4936 r = amdgpu_device_sys_interface_init(adev); 4937 4938 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4939 r = amdgpu_pmu_init(adev); 4940 if (r) 4941 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4942 4943 /* Keep the stored pci config space at hand for restore on a sudden PCI error */ 4944 if (amdgpu_device_cache_pci_state(adev->pdev)) 4945 pci_restore_state(pdev); 4946 4947 /* if we have > 1 VGA card, then disable the amdgpu VGA resources */ 4948 /* this will fail for cards that aren't VGA class devices, just 4949 * ignore it 4950 */ 4951 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4952 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4953 4954 px = amdgpu_device_supports_px(adev); 4955 4956 if (px || (!dev_is_removable(&adev->pdev->dev) && 4957 apple_gmux_detect(NULL, NULL))) 4958 vga_switcheroo_register_client(adev->pdev, 4959 &amdgpu_switcheroo_ops, px); 4960 4961 if (px) 4962 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4963 4964 amdgpu_device_check_iommu_direct_map(adev); 4965 4966 adev->pm_nb.notifier_call = amdgpu_device_pm_notifier; 4967 r = register_pm_notifier(&adev->pm_nb); 4968 if (r) 4969 goto failed; 4970 4971 return 0; 4972 4973 release_ras_con: 4974 if (amdgpu_sriov_vf(adev)) 4975 amdgpu_virt_release_full_gpu(adev, true); 4976 4977 /* failed in exclusive mode due to timeout */ 4978 if (amdgpu_sriov_vf(adev) && 4979 !amdgpu_sriov_runtime(adev) && 4980 amdgpu_virt_mmio_blocked(adev) && 4981 !amdgpu_virt_wait_reset(adev)) { 4982 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4983 /* Don't send request since VF is inactive.
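 * Clearing the RUNTIME cap and the virt ops below means no further
 * requests are attempted while the VF is inactive.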
*/ 4984 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4985 adev->virt.ops = NULL; 4986 r = -EAGAIN; 4987 } 4988 amdgpu_release_ras_context(adev); 4989 4990 failed: 4991 amdgpu_vf_error_trans_all(adev); 4992 4993 return r; 4994 } 4995 4996 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4997 { 4998 4999 /* Clear all CPU mappings pointing to this device */ 5000 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 5001 5002 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 5003 amdgpu_doorbell_fini(adev); 5004 5005 iounmap(adev->rmmio); 5006 adev->rmmio = NULL; 5007 if (adev->mman.aper_base_kaddr) 5008 iounmap(adev->mman.aper_base_kaddr); 5009 adev->mman.aper_base_kaddr = NULL; 5010 5011 /* Memory manager related */ 5012 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 5013 arch_phys_wc_del(adev->gmc.vram_mtrr); 5014 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 5015 } 5016 } 5017 5018 /** 5019 * amdgpu_device_fini_hw - tear down the driver 5020 * 5021 * @adev: amdgpu_device pointer 5022 * 5023 * Tear down the driver info (all asics). 5024 * Called at driver shutdown. 5025 */ 5026 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 5027 { 5028 dev_info(adev->dev, "amdgpu: finishing device.\n"); 5029 flush_delayed_work(&adev->delayed_init_work); 5030 5031 if (adev->mman.initialized) 5032 drain_workqueue(adev->mman.bdev.wq); 5033 adev->shutdown = true; 5034 5035 unregister_pm_notifier(&adev->pm_nb); 5036 5037 /* make sure IB test finished before entering exclusive mode 5038 * to avoid preemption on IB test 5039 */ 5040 if (amdgpu_sriov_vf(adev)) { 5041 amdgpu_virt_request_full_gpu(adev, false); 5042 amdgpu_virt_fini_data_exchange(adev); 5043 } 5044 5045 /* disable all interrupts */ 5046 amdgpu_irq_disable_all(adev); 5047 if (adev->mode_info.mode_config_initialized) { 5048 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 5049 drm_helper_force_disable_all(adev_to_drm(adev)); 5050 else 5051 drm_atomic_helper_shutdown(adev_to_drm(adev)); 5052 } 5053 amdgpu_fence_driver_hw_fini(adev); 5054 5055 amdgpu_device_sys_interface_fini(adev); 5056 5057 /* disable ras feature must before hw fini */ 5058 amdgpu_ras_pre_fini(adev); 5059 5060 amdgpu_ttm_set_buffer_funcs_status(adev, false); 5061 5062 amdgpu_device_ip_fini_early(adev); 5063 5064 amdgpu_irq_fini_hw(adev); 5065 5066 if (adev->mman.initialized) 5067 ttm_device_clear_dma_mappings(&adev->mman.bdev); 5068 5069 amdgpu_gart_dummy_page_fini(adev); 5070 5071 if (drm_dev_is_unplugged(adev_to_drm(adev))) 5072 amdgpu_device_unmap_mmio(adev); 5073 5074 } 5075 5076 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 5077 { 5078 int i, idx; 5079 bool px; 5080 5081 amdgpu_device_ip_fini(adev); 5082 amdgpu_fence_driver_sw_fini(adev); 5083 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 5084 adev->accel_working = false; 5085 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 5086 for (i = 0; i < MAX_XCP; ++i) { 5087 dma_fence_put(adev->isolation[i].spearhead); 5088 amdgpu_sync_free(&adev->isolation[i].active); 5089 amdgpu_sync_free(&adev->isolation[i].prev); 5090 } 5091 5092 amdgpu_reset_fini(adev); 5093 5094 /* free i2c buses */ 5095 amdgpu_i2c_fini(adev); 5096 5097 if (adev->bios) { 5098 if (amdgpu_emu_mode != 1) 5099 amdgpu_atombios_fini(adev); 5100 amdgpu_bios_release(adev); 5101 } 5102 5103 kfree(adev->fru_info); 5104 adev->fru_info = NULL; 5105 5106 kfree(adev->xcp_mgr); 5107 adev->xcp_mgr = NULL; 5108 5109 px = amdgpu_device_supports_px(adev); 5110 5111 if 
(px || (!dev_is_removable(&adev->pdev->dev) && 5112 apple_gmux_detect(NULL, NULL))) 5113 vga_switcheroo_unregister_client(adev->pdev); 5114 5115 if (px) 5116 vga_switcheroo_fini_domain_pm_ops(adev->dev); 5117 5118 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 5119 vga_client_unregister(adev->pdev); 5120 5121 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 5122 5123 iounmap(adev->rmmio); 5124 adev->rmmio = NULL; 5125 drm_dev_exit(idx); 5126 } 5127 5128 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 5129 amdgpu_pmu_fini(adev); 5130 if (adev->discovery.bin) 5131 amdgpu_discovery_fini(adev); 5132 5133 amdgpu_reset_put_reset_domain(adev->reset_domain); 5134 adev->reset_domain = NULL; 5135 5136 kfree(adev->pci_state); 5137 kfree(adev->pcie_reset_ctx.swds_pcistate); 5138 kfree(adev->pcie_reset_ctx.swus_pcistate); 5139 } 5140 5141 /** 5142 * amdgpu_device_evict_resources - evict device resources 5143 * @adev: amdgpu device object 5144 * 5145 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 5146 * of the vram memory type. Mainly used for evicting device resources 5147 * at suspend time. 5148 * 5149 */ 5150 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 5151 { 5152 int ret; 5153 5154 /* No need to evict vram on APUs unless going to S4 */ 5155 if (!adev->in_s4 && (adev->flags & AMD_IS_APU)) 5156 return 0; 5157 5158 /* No need to evict when going to S5 through S4 callbacks */ 5159 if (system_state == SYSTEM_POWER_OFF) 5160 return 0; 5161 5162 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 5163 if (ret) { 5164 dev_warn(adev->dev, "evicting device resources failed\n"); 5165 return ret; 5166 } 5167 5168 if (adev->in_s4) { 5169 ret = ttm_device_prepare_hibernation(&adev->mman.bdev); 5170 if (ret) 5171 dev_err(adev->dev, "prepare hibernation failed, %d\n", ret); 5172 } 5173 return ret; 5174 } 5175 5176 /* 5177 * Suspend & resume. 5178 */ 5179 /** 5180 * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events 5181 * @nb: notifier block 5182 * @mode: suspend mode 5183 * @data: data 5184 * 5185 * This function is called when the system is about to suspend or hibernate. 5186 * It is used to set the appropriate flags so that eviction can be optimized 5187 * in the pm prepare callback. 5188 */ 5189 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 5190 void *data) 5191 { 5192 struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb); 5193 5194 switch (mode) { 5195 case PM_HIBERNATION_PREPARE: 5196 adev->in_s4 = true; 5197 break; 5198 case PM_POST_HIBERNATION: 5199 adev->in_s4 = false; 5200 break; 5201 } 5202 5203 return NOTIFY_DONE; 5204 } 5205 5206 /** 5207 * amdgpu_device_prepare - prepare for device suspend 5208 * 5209 * @dev: drm dev pointer 5210 * 5211 * Prepare to put the hw in the suspend state (all asics). 5212 * Returns 0 for success or an error on failure. 5213 * Called at driver suspend. 
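 *
 * Sketch of what the implementation below does (illustrative only):
 *   evict most BOs via amdgpu_device_evict_resources()
 *   flush the delayed GFXOFF work
 *   invoke each IP block's prepare_suspend() callback, where provided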
5214 */ 5215 int amdgpu_device_prepare(struct drm_device *dev) 5216 { 5217 struct amdgpu_device *adev = drm_to_adev(dev); 5218 int i, r; 5219 5220 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5221 return 0; 5222 5223 /* Evict the majority of BOs before starting suspend sequence */ 5224 r = amdgpu_device_evict_resources(adev); 5225 if (r) 5226 return r; 5227 5228 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 5229 5230 for (i = 0; i < adev->num_ip_blocks; i++) { 5231 if (!adev->ip_blocks[i].status.valid) 5232 continue; 5233 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 5234 continue; 5235 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]); 5236 if (r) 5237 return r; 5238 } 5239 5240 return 0; 5241 } 5242 5243 /** 5244 * amdgpu_device_complete - complete power state transition 5245 * 5246 * @dev: drm dev pointer 5247 * 5248 * Undo the changes from amdgpu_device_prepare. This will be 5249 * called on all resume transitions, including those that failed. 5250 */ 5251 void amdgpu_device_complete(struct drm_device *dev) 5252 { 5253 struct amdgpu_device *adev = drm_to_adev(dev); 5254 int i; 5255 5256 for (i = 0; i < adev->num_ip_blocks; i++) { 5257 if (!adev->ip_blocks[i].status.valid) 5258 continue; 5259 if (!adev->ip_blocks[i].version->funcs->complete) 5260 continue; 5261 adev->ip_blocks[i].version->funcs->complete(&adev->ip_blocks[i]); 5262 } 5263 } 5264 5265 /** 5266 * amdgpu_device_suspend - initiate device suspend 5267 * 5268 * @dev: drm dev pointer 5269 * @notify_clients: notify in-kernel DRM clients 5270 * 5271 * Puts the hw in the suspend state (all asics). 5272 * Returns 0 for success or an error on failure. 5273 * Called at driver suspend. 5274 */ 5275 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) 5276 { 5277 struct amdgpu_device *adev = drm_to_adev(dev); 5278 int r, rec; 5279 5280 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5281 return 0; 5282 5283 adev->in_suspend = true; 5284 5285 if (amdgpu_sriov_vf(adev)) { 5286 if (!adev->in_runpm) 5287 amdgpu_amdkfd_suspend_process(adev); 5288 amdgpu_virt_fini_data_exchange(adev); 5289 r = amdgpu_virt_request_full_gpu(adev, false); 5290 if (r) 5291 return r; 5292 } 5293 5294 r = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D3); 5295 if (r) 5296 goto unwind_sriov; 5297 5298 if (notify_clients) 5299 drm_client_dev_suspend(adev_to_drm(adev)); 5300 5301 cancel_delayed_work_sync(&adev->delayed_init_work); 5302 5303 amdgpu_ras_suspend(adev); 5304 5305 r = amdgpu_device_ip_suspend_phase1(adev); 5306 if (r) 5307 goto unwind_smartshift; 5308 5309 amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); 5310 r = amdgpu_userq_suspend(adev); 5311 if (r) 5312 goto unwind_ip_phase1; 5313 5314 r = amdgpu_device_evict_resources(adev); 5315 if (r) 5316 goto unwind_userq; 5317 5318 amdgpu_ttm_set_buffer_funcs_status(adev, false); 5319 5320 amdgpu_fence_driver_hw_fini(adev); 5321 5322 r = amdgpu_device_ip_suspend_phase2(adev); 5323 if (r) 5324 goto unwind_evict; 5325 5326 if (amdgpu_sriov_vf(adev)) 5327 amdgpu_virt_release_full_gpu(adev, false); 5328 5329 return 0; 5330 5331 unwind_evict: 5332 if (adev->mman.buffer_funcs_ring->sched.ready) 5333 amdgpu_ttm_set_buffer_funcs_status(adev, true); 5334 amdgpu_fence_driver_hw_init(adev); 5335 5336 unwind_userq: 5337 rec = amdgpu_userq_resume(adev); 5338 if (rec) { 5339 dev_warn(adev->dev, "failed to re-initialize user queues: %d\n", rec); 5340 return r; 5341 } 5342 rec = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) 
&& !adev->in_runpm); 5343 if (rec) { 5344 dev_warn(adev->dev, "failed to re-initialize kfd: %d\n", rec); 5345 return r; 5346 } 5347 5348 unwind_ip_phase1: 5349 /* suspend phase 1 = resume phase 3 */ 5350 rec = amdgpu_device_ip_resume_phase3(adev); 5351 if (rec) { 5352 dev_warn(adev->dev, "failed to re-initialize IPs phase1: %d\n", rec); 5353 return r; 5354 } 5355 5356 unwind_smartshift: 5357 rec = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0); 5358 if (rec) { 5359 dev_warn(adev->dev, "failed to re-update smart shift: %d\n", rec); 5360 return r; 5361 } 5362 5363 if (notify_clients) 5364 drm_client_dev_resume(adev_to_drm(adev)); 5365 5366 amdgpu_ras_resume(adev); 5367 5368 unwind_sriov: 5369 if (amdgpu_sriov_vf(adev)) { 5370 rec = amdgpu_virt_request_full_gpu(adev, true); 5371 if (rec) { 5372 dev_warn(adev->dev, "failed to reinitialize sriov: %d\n", rec); 5373 return r; 5374 } 5375 } 5376 5377 adev->in_suspend = adev->in_s0ix = adev->in_s3 = false; 5378 5379 return r; 5380 } 5381 5382 static inline int amdgpu_virt_resume(struct amdgpu_device *adev) 5383 { 5384 int r; 5385 unsigned int prev_physical_node_id = adev->gmc.xgmi.physical_node_id; 5386 5387 /* During VM resume, QEMU programming of VF MSIX table (register GFXMSIX_VECT0_ADDR_LO) 5388 * may not work. The access could be blocked by nBIF protection as VF isn't in 5389 * exclusive access mode. Exclusive access is enabled now, disable/enable MSIX 5390 * so that QEMU reprograms MSIX table. 5391 */ 5392 amdgpu_restore_msix(adev); 5393 5394 r = adev->gfxhub.funcs->get_xgmi_info(adev); 5395 if (r) 5396 return r; 5397 5398 dev_info(adev->dev, "xgmi node, old id %d, new id %d\n", 5399 prev_physical_node_id, adev->gmc.xgmi.physical_node_id); 5400 5401 adev->vm_manager.vram_base_offset = adev->gfxhub.funcs->get_mc_fb_offset(adev); 5402 adev->vm_manager.vram_base_offset += 5403 adev->gmc.xgmi.physical_node_id * adev->gmc.xgmi.node_segment_size; 5404 5405 return 0; 5406 } 5407 5408 /** 5409 * amdgpu_device_resume - initiate device resume 5410 * 5411 * @dev: drm dev pointer 5412 * @notify_clients: notify in-kernel DRM clients 5413 * 5414 * Bring the hw back to operating state (all asics). 5415 * Returns 0 for success or an error on failure. 5416 * Called at driver resume. 
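 *
 * Rough resume flow, as implemented below (illustrative only):
 *   amdgpu_virt_request_full_gpu()  (SR-IOV)
 *   optional vBIOS post via amdgpu_device_asic_init()
 *   amdgpu_device_ip_resume()
 *   amdgpu_amdkfd_resume() and amdgpu_userq_resume()
 *   amdgpu_device_ip_late_init(), then the delayed IB-test work is queued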
5417 */ 5418 int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) 5419 { 5420 struct amdgpu_device *adev = drm_to_adev(dev); 5421 int r = 0; 5422 5423 if (amdgpu_sriov_vf(adev)) { 5424 r = amdgpu_virt_request_full_gpu(adev, true); 5425 if (r) 5426 return r; 5427 } 5428 5429 if (amdgpu_virt_xgmi_migrate_enabled(adev)) { 5430 r = amdgpu_virt_resume(adev); 5431 if (r) 5432 goto exit; 5433 } 5434 5435 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5436 return 0; 5437 5438 if (adev->in_s0ix) 5439 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 5440 5441 /* post card */ 5442 if (amdgpu_device_need_post(adev)) { 5443 r = amdgpu_device_asic_init(adev); 5444 if (r) 5445 dev_err(adev->dev, "amdgpu asic init failed\n"); 5446 } 5447 5448 r = amdgpu_device_ip_resume(adev); 5449 5450 if (r) { 5451 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 5452 goto exit; 5453 } 5454 5455 r = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); 5456 if (r) 5457 goto exit; 5458 5459 r = amdgpu_userq_resume(adev); 5460 if (r) 5461 goto exit; 5462 5463 r = amdgpu_device_ip_late_init(adev); 5464 if (r) 5465 goto exit; 5466 5467 queue_delayed_work(system_wq, &adev->delayed_init_work, 5468 msecs_to_jiffies(AMDGPU_RESUME_MS)); 5469 exit: 5470 if (amdgpu_sriov_vf(adev)) { 5471 amdgpu_virt_init_data_exchange(adev); 5472 amdgpu_virt_release_full_gpu(adev, true); 5473 5474 if (!r && !adev->in_runpm) 5475 r = amdgpu_amdkfd_resume_process(adev); 5476 } 5477 5478 if (r) 5479 return r; 5480 5481 /* Make sure IB tests flushed */ 5482 flush_delayed_work(&adev->delayed_init_work); 5483 5484 if (notify_clients) 5485 drm_client_dev_resume(adev_to_drm(adev)); 5486 5487 amdgpu_ras_resume(adev); 5488 5489 if (adev->mode_info.num_crtc) { 5490 /* 5491 * Most of the connector probing functions try to acquire runtime pm 5492 * refs to ensure that the GPU is powered on when connector polling is 5493 * performed. Since we're calling this from a runtime PM callback, 5494 * trying to acquire rpm refs will cause us to deadlock. 5495 * 5496 * Since we're guaranteed to be holding the rpm lock, it's safe to 5497 * temporarily disable the rpm helpers so this doesn't deadlock us. 5498 */ 5499 #ifdef CONFIG_PM 5500 dev->dev->power.disable_depth++; 5501 #endif 5502 if (!adev->dc_enabled) 5503 drm_helper_hpd_irq_event(dev); 5504 else 5505 drm_kms_helper_hotplug_event(dev); 5506 #ifdef CONFIG_PM 5507 dev->dev->power.disable_depth--; 5508 #endif 5509 } 5510 5511 amdgpu_vram_mgr_clear_reset_blocks(adev); 5512 adev->in_suspend = false; 5513 5514 if (amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0)) 5515 dev_warn(adev->dev, "smart shift update failed\n"); 5516 5517 return 0; 5518 } 5519 5520 /** 5521 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 5522 * 5523 * @adev: amdgpu_device pointer 5524 * 5525 * The list of all the hardware IPs that make up the asic is walked and 5526 * the check_soft_reset callbacks are run. check_soft_reset determines 5527 * if the asic is still hung or not. 5528 * Returns true if any of the IPs are still in a hung state, false if not. 
5529 */ 5530 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 5531 { 5532 int i; 5533 bool asic_hang = false; 5534 5535 if (amdgpu_sriov_vf(adev)) 5536 return true; 5537 5538 if (amdgpu_asic_need_full_reset(adev)) 5539 return true; 5540 5541 for (i = 0; i < adev->num_ip_blocks; i++) { 5542 if (!adev->ip_blocks[i].status.valid) 5543 continue; 5544 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 5545 adev->ip_blocks[i].status.hang = 5546 adev->ip_blocks[i].version->funcs->check_soft_reset( 5547 &adev->ip_blocks[i]); 5548 if (adev->ip_blocks[i].status.hang) { 5549 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 5550 asic_hang = true; 5551 } 5552 } 5553 return asic_hang; 5554 } 5555 5556 /** 5557 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 5558 * 5559 * @adev: amdgpu_device pointer 5560 * 5561 * The list of all the hardware IPs that make up the asic is walked and the 5562 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 5563 * handles any IP specific hardware or software state changes that are 5564 * necessary for a soft reset to succeed. 5565 * Returns 0 on success, negative error code on failure. 5566 */ 5567 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 5568 { 5569 int i, r = 0; 5570 5571 for (i = 0; i < adev->num_ip_blocks; i++) { 5572 if (!adev->ip_blocks[i].status.valid) 5573 continue; 5574 if (adev->ip_blocks[i].status.hang && 5575 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 5576 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]); 5577 if (r) 5578 return r; 5579 } 5580 } 5581 5582 return 0; 5583 } 5584 5585 /** 5586 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 5587 * 5588 * @adev: amdgpu_device pointer 5589 * 5590 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 5591 * reset is necessary to recover. 5592 * Returns true if a full asic reset is required, false if not. 5593 */ 5594 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 5595 { 5596 int i; 5597 5598 if (amdgpu_asic_need_full_reset(adev)) 5599 return true; 5600 5601 for (i = 0; i < adev->num_ip_blocks; i++) { 5602 if (!adev->ip_blocks[i].status.valid) 5603 continue; 5604 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 5605 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 5606 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 5607 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 5608 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 5609 if (adev->ip_blocks[i].status.hang) { 5610 dev_info(adev->dev, "Some block need full reset!\n"); 5611 return true; 5612 } 5613 } 5614 } 5615 return false; 5616 } 5617 5618 /** 5619 * amdgpu_device_ip_soft_reset - do a soft reset 5620 * 5621 * @adev: amdgpu_device pointer 5622 * 5623 * The list of all the hardware IPs that make up the asic is walked and the 5624 * soft_reset callbacks are run if the block is hung. soft_reset handles any 5625 * IP specific hardware or software state changes that are necessary to soft 5626 * reset the IP. 5627 * Returns 0 on success, negative error code on failure. 
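 *
 * Together with the neighbouring helpers the expected order is roughly:
 *   check_soft_reset -> pre_soft_reset -> soft_reset -> post_soft_reset
 * (illustrative; the reset path drives the actual sequencing).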
5628 */ 5629 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 5630 { 5631 int i, r = 0; 5632 5633 for (i = 0; i < adev->num_ip_blocks; i++) { 5634 if (!adev->ip_blocks[i].status.valid) 5635 continue; 5636 if (adev->ip_blocks[i].status.hang && 5637 adev->ip_blocks[i].version->funcs->soft_reset) { 5638 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]); 5639 if (r) 5640 return r; 5641 } 5642 } 5643 5644 return 0; 5645 } 5646 5647 /** 5648 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 5649 * 5650 * @adev: amdgpu_device pointer 5651 * 5652 * The list of all the hardware IPs that make up the asic is walked and the 5653 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 5654 * handles any IP specific hardware or software state changes that are 5655 * necessary after the IP has been soft reset. 5656 * Returns 0 on success, negative error code on failure. 5657 */ 5658 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 5659 { 5660 int i, r = 0; 5661 5662 for (i = 0; i < adev->num_ip_blocks; i++) { 5663 if (!adev->ip_blocks[i].status.valid) 5664 continue; 5665 if (adev->ip_blocks[i].status.hang && 5666 adev->ip_blocks[i].version->funcs->post_soft_reset) 5667 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]); 5668 if (r) 5669 return r; 5670 } 5671 5672 return 0; 5673 } 5674 5675 /** 5676 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 5677 * 5678 * @adev: amdgpu_device pointer 5679 * @reset_context: amdgpu reset context pointer 5680 * 5681 * do VF FLR and reinitialize Asic 5682 * return 0 means succeeded otherwise failed 5683 */ 5684 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 5685 struct amdgpu_reset_context *reset_context) 5686 { 5687 int r; 5688 struct amdgpu_hive_info *hive = NULL; 5689 5690 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { 5691 if (!amdgpu_ras_get_fed_status(adev)) 5692 amdgpu_virt_ready_to_reset(adev); 5693 amdgpu_virt_wait_reset(adev); 5694 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5695 r = amdgpu_virt_request_full_gpu(adev, true); 5696 } else { 5697 r = amdgpu_virt_reset_gpu(adev); 5698 } 5699 if (r) 5700 return r; 5701 5702 amdgpu_ras_clear_err_state(adev); 5703 amdgpu_irq_gpu_reset_resume_helper(adev); 5704 5705 /* some sw clean up VF needs to do before recover */ 5706 amdgpu_virt_post_reset(adev); 5707 5708 /* Resume IP prior to SMC */ 5709 r = amdgpu_device_ip_reinit_early_sriov(adev); 5710 if (r) 5711 return r; 5712 5713 amdgpu_virt_init_data_exchange(adev); 5714 5715 r = amdgpu_device_fw_loading(adev); 5716 if (r) 5717 return r; 5718 5719 /* now we are okay to resume SMC/CP/SDMA */ 5720 r = amdgpu_device_ip_reinit_late_sriov(adev); 5721 if (r) 5722 return r; 5723 5724 hive = amdgpu_get_xgmi_hive(adev); 5725 /* Update PSP FW topology after reset */ 5726 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5727 r = amdgpu_xgmi_update_topology(hive, adev); 5728 if (hive) 5729 amdgpu_put_xgmi_hive(hive); 5730 if (r) 5731 return r; 5732 5733 r = amdgpu_ib_ring_tests(adev); 5734 if (r) 5735 return r; 5736 5737 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) 5738 amdgpu_inc_vram_lost(adev); 5739 5740 /* need to be called during full access so we can't do it later like 5741 * bare-metal does. 
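	 * Specifically, amdgpu_amdkfd_post_reset() has to run while the VF
	 * still holds full GPU access, i.e. before
	 * amdgpu_virt_release_full_gpu() below.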
5742 */ 5743 amdgpu_amdkfd_post_reset(adev); 5744 amdgpu_virt_release_full_gpu(adev, true); 5745 5746 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5747 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) || 5748 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 5749 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 5750 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) || 5751 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5752 amdgpu_ras_resume(adev); 5753 5754 amdgpu_virt_ras_telemetry_post_reset(adev); 5755 5756 return 0; 5757 } 5758 5759 /** 5760 * amdgpu_device_has_job_running - check if there is any unfinished job 5761 * 5762 * @adev: amdgpu_device pointer 5763 * 5764 * check if there is any job running on the device when guest driver receives 5765 * FLR notification from host driver. If there are still jobs running, then 5766 * the guest driver will not respond the FLR reset. Instead, let the job hit 5767 * the timeout and guest driver then issue the reset request. 5768 */ 5769 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5770 { 5771 int i; 5772 5773 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5774 struct amdgpu_ring *ring = adev->rings[i]; 5775 5776 if (!amdgpu_ring_sched_ready(ring)) 5777 continue; 5778 5779 if (amdgpu_fence_count_emitted(ring)) 5780 return true; 5781 } 5782 return false; 5783 } 5784 5785 /** 5786 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5787 * 5788 * @adev: amdgpu_device pointer 5789 * 5790 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5791 * a hung GPU. 5792 */ 5793 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5794 { 5795 5796 if (amdgpu_gpu_recovery == 0) 5797 goto disabled; 5798 5799 /* Skip soft reset check in fatal error mode */ 5800 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5801 return true; 5802 5803 if (amdgpu_sriov_vf(adev)) 5804 return true; 5805 5806 if (amdgpu_gpu_recovery == -1) { 5807 switch (adev->asic_type) { 5808 #ifdef CONFIG_DRM_AMDGPU_SI 5809 case CHIP_VERDE: 5810 case CHIP_TAHITI: 5811 case CHIP_PITCAIRN: 5812 case CHIP_OLAND: 5813 case CHIP_HAINAN: 5814 #endif 5815 #ifdef CONFIG_DRM_AMDGPU_CIK 5816 case CHIP_KAVERI: 5817 case CHIP_KABINI: 5818 case CHIP_MULLINS: 5819 #endif 5820 case CHIP_CARRIZO: 5821 case CHIP_STONEY: 5822 case CHIP_CYAN_SKILLFISH: 5823 goto disabled; 5824 default: 5825 break; 5826 } 5827 } 5828 5829 return true; 5830 5831 disabled: 5832 dev_info(adev->dev, "GPU recovery disabled.\n"); 5833 return false; 5834 } 5835 5836 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5837 { 5838 u32 i; 5839 int ret = 0; 5840 5841 if (adev->bios) 5842 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5843 5844 dev_info(adev->dev, "GPU mode1 reset\n"); 5845 5846 /* Cache the state before bus master disable. The saved config space 5847 * values are used in other cases like restore after mode-2 reset. 
5848 */ 5849 amdgpu_device_cache_pci_state(adev->pdev); 5850 5851 /* disable BM */ 5852 pci_clear_master(adev->pdev); 5853 5854 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5855 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5856 ret = amdgpu_dpm_mode1_reset(adev); 5857 } else { 5858 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5859 ret = psp_gpu_reset(adev); 5860 } 5861 5862 if (ret) 5863 goto mode1_reset_failed; 5864 5865 amdgpu_device_load_pci_state(adev->pdev); 5866 ret = amdgpu_psp_wait_for_bootloader(adev); 5867 if (ret) 5868 goto mode1_reset_failed; 5869 5870 /* wait for asic to come out of reset */ 5871 for (i = 0; i < adev->usec_timeout; i++) { 5872 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5873 5874 if (memsize != 0xffffffff) 5875 break; 5876 udelay(1); 5877 } 5878 5879 if (i >= adev->usec_timeout) { 5880 ret = -ETIMEDOUT; 5881 goto mode1_reset_failed; 5882 } 5883 5884 if (adev->bios) 5885 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5886 5887 return 0; 5888 5889 mode1_reset_failed: 5890 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5891 return ret; 5892 } 5893 5894 int amdgpu_device_link_reset(struct amdgpu_device *adev) 5895 { 5896 int ret = 0; 5897 5898 dev_info(adev->dev, "GPU link reset\n"); 5899 5900 if (!amdgpu_reset_in_dpc(adev)) 5901 ret = amdgpu_dpm_link_reset(adev); 5902 5903 if (ret) 5904 goto link_reset_failed; 5905 5906 ret = amdgpu_psp_wait_for_bootloader(adev); 5907 if (ret) 5908 goto link_reset_failed; 5909 5910 return 0; 5911 5912 link_reset_failed: 5913 dev_err(adev->dev, "GPU link reset failed\n"); 5914 return ret; 5915 } 5916 5917 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5918 struct amdgpu_reset_context *reset_context) 5919 { 5920 int i, r = 0; 5921 struct amdgpu_job *job = NULL; 5922 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; 5923 bool need_full_reset = 5924 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5925 5926 if (reset_context->reset_req_dev == adev) 5927 job = reset_context->job; 5928 5929 if (amdgpu_sriov_vf(adev)) 5930 amdgpu_virt_pre_reset(adev); 5931 5932 amdgpu_fence_driver_isr_toggle(adev, true); 5933 5934 /* block all schedulers and reset given job's ring */ 5935 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5936 struct amdgpu_ring *ring = adev->rings[i]; 5937 5938 if (!amdgpu_ring_sched_ready(ring)) 5939 continue; 5940 5941 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5942 amdgpu_fence_driver_force_completion(ring); 5943 } 5944 5945 amdgpu_fence_driver_isr_toggle(adev, false); 5946 5947 if (job && job->vm) 5948 drm_sched_increase_karma(&job->base); 5949 5950 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5951 /* If reset handler not implemented, continue; otherwise return */ 5952 if (r == -EOPNOTSUPP) 5953 r = 0; 5954 else 5955 return r; 5956 5957 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5958 if (!amdgpu_sriov_vf(adev)) { 5959 5960 if (!need_full_reset) 5961 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5962 5963 if (!need_full_reset && amdgpu_gpu_recovery && 5964 amdgpu_device_ip_check_soft_reset(adev)) { 5965 amdgpu_device_ip_pre_soft_reset(adev); 5966 r = amdgpu_device_ip_soft_reset(adev); 5967 amdgpu_device_ip_post_soft_reset(adev); 5968 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5969 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5970 need_full_reset = true; 5971 } 5972 } 5973 5974 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { 
5975 dev_info(tmp_adev->dev, "Dumping IP State\n"); 5976 /* Trigger ip dump before we reset the asic */ 5977 for (i = 0; i < tmp_adev->num_ip_blocks; i++) 5978 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) 5979 tmp_adev->ip_blocks[i].version->funcs 5980 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]); 5981 dev_info(tmp_adev->dev, "Dumping IP State Completed\n"); 5982 } 5983 5984 if (need_full_reset) 5985 r = amdgpu_device_ip_suspend(adev); 5986 if (need_full_reset) 5987 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5988 else 5989 clear_bit(AMDGPU_NEED_FULL_RESET, 5990 &reset_context->flags); 5991 } 5992 5993 return r; 5994 } 5995 5996 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) 5997 { 5998 struct list_head *device_list_handle; 5999 bool full_reset, vram_lost = false; 6000 struct amdgpu_device *tmp_adev; 6001 int r, init_level; 6002 6003 device_list_handle = reset_context->reset_device_list; 6004 6005 if (!device_list_handle) 6006 return -EINVAL; 6007 6008 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 6009 6010 /** 6011 * If it's reset on init, it's default init level, otherwise keep level 6012 * as recovery level. 6013 */ 6014 if (reset_context->method == AMD_RESET_METHOD_ON_INIT) 6015 init_level = AMDGPU_INIT_LEVEL_DEFAULT; 6016 else 6017 init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY; 6018 6019 r = 0; 6020 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6021 amdgpu_set_init_level(tmp_adev, init_level); 6022 if (full_reset) { 6023 /* post card */ 6024 amdgpu_reset_set_dpc_status(tmp_adev, false); 6025 amdgpu_ras_clear_err_state(tmp_adev); 6026 r = amdgpu_device_asic_init(tmp_adev); 6027 if (r) { 6028 dev_warn(tmp_adev->dev, "asic atom init failed!"); 6029 } else { 6030 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 6031 6032 r = amdgpu_device_ip_resume_phase1(tmp_adev); 6033 if (r) 6034 goto out; 6035 6036 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 6037 6038 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 6039 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job); 6040 6041 if (vram_lost) { 6042 dev_info( 6043 tmp_adev->dev, 6044 "VRAM is lost due to GPU reset!\n"); 6045 amdgpu_inc_vram_lost(tmp_adev); 6046 } 6047 6048 r = amdgpu_device_fw_loading(tmp_adev); 6049 if (r) 6050 return r; 6051 6052 r = amdgpu_xcp_restore_partition_mode( 6053 tmp_adev->xcp_mgr); 6054 if (r) 6055 goto out; 6056 6057 r = amdgpu_device_ip_resume_phase2(tmp_adev); 6058 if (r) 6059 goto out; 6060 6061 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 6062 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 6063 6064 r = amdgpu_device_ip_resume_phase3(tmp_adev); 6065 if (r) 6066 goto out; 6067 6068 if (vram_lost) 6069 amdgpu_device_fill_reset_magic(tmp_adev); 6070 6071 /* 6072 * Add this ASIC as tracked as reset was already 6073 * complete successfully. 6074 */ 6075 amdgpu_register_gpu_instance(tmp_adev); 6076 6077 if (!reset_context->hive && 6078 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 6079 amdgpu_xgmi_add_device(tmp_adev); 6080 6081 r = amdgpu_device_ip_late_init(tmp_adev); 6082 if (r) 6083 goto out; 6084 6085 r = amdgpu_userq_post_reset(tmp_adev, vram_lost); 6086 if (r) 6087 goto out; 6088 6089 drm_client_dev_resume(adev_to_drm(tmp_adev)); 6090 6091 /* 6092 * The GPU enters bad state once faulty pages 6093 * by ECC has reached the threshold, and ras 6094 * recovery is scheduled next. 
So add one check 6095 * here to break recovery if it indeed exceeds 6096 * bad page threshold, and remind user to 6097 * retire this GPU or setting one bigger 6098 * bad_page_threshold value to fix this once 6099 * probing driver again. 6100 */ 6101 if (!amdgpu_ras_is_rma(tmp_adev)) { 6102 /* must succeed. */ 6103 amdgpu_ras_resume(tmp_adev); 6104 } else { 6105 r = -EINVAL; 6106 goto out; 6107 } 6108 6109 /* Update PSP FW topology after reset */ 6110 if (reset_context->hive && 6111 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 6112 r = amdgpu_xgmi_update_topology( 6113 reset_context->hive, tmp_adev); 6114 } 6115 } 6116 6117 out: 6118 if (!r) { 6119 /* IP init is complete now, set level as default */ 6120 amdgpu_set_init_level(tmp_adev, 6121 AMDGPU_INIT_LEVEL_DEFAULT); 6122 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 6123 r = amdgpu_ib_ring_tests(tmp_adev); 6124 if (r) { 6125 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 6126 r = -EAGAIN; 6127 goto end; 6128 } 6129 } 6130 6131 if (r) 6132 tmp_adev->asic_reset_res = r; 6133 } 6134 6135 end: 6136 return r; 6137 } 6138 6139 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 6140 struct amdgpu_reset_context *reset_context) 6141 { 6142 struct amdgpu_device *tmp_adev = NULL; 6143 bool need_full_reset, skip_hw_reset; 6144 int r = 0; 6145 6146 /* Try reset handler method first */ 6147 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 6148 reset_list); 6149 6150 reset_context->reset_device_list = device_list_handle; 6151 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 6152 /* If reset handler not implemented, continue; otherwise return */ 6153 if (r == -EOPNOTSUPP) 6154 r = 0; 6155 else 6156 return r; 6157 6158 /* Reset handler not implemented, use the default method */ 6159 need_full_reset = 6160 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 6161 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 6162 6163 /* 6164 * ASIC reset has to be done on all XGMI hive nodes ASAP 6165 * to allow proper links negotiation in FW (within 1 sec) 6166 */ 6167 if (!skip_hw_reset && need_full_reset) { 6168 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6169 /* For XGMI run all resets in parallel to speed up the process */ 6170 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 6171 if (!queue_work(system_unbound_wq, 6172 &tmp_adev->xgmi_reset_work)) 6173 r = -EALREADY; 6174 } else 6175 r = amdgpu_asic_reset(tmp_adev); 6176 6177 if (r) { 6178 dev_err(tmp_adev->dev, 6179 "ASIC reset failed with error, %d for drm dev, %s", 6180 r, adev_to_drm(tmp_adev)->unique); 6181 goto out; 6182 } 6183 } 6184 6185 /* For XGMI wait for all resets to complete before proceed */ 6186 if (!r) { 6187 list_for_each_entry(tmp_adev, device_list_handle, 6188 reset_list) { 6189 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 6190 flush_work(&tmp_adev->xgmi_reset_work); 6191 r = tmp_adev->asic_reset_res; 6192 if (r) 6193 break; 6194 } 6195 } 6196 } 6197 } 6198 6199 if (!r && amdgpu_ras_intr_triggered()) { 6200 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6201 amdgpu_ras_reset_error_count(tmp_adev, 6202 AMDGPU_RAS_BLOCK__MMHUB); 6203 } 6204 6205 amdgpu_ras_intr_cleared(); 6206 } 6207 6208 r = amdgpu_device_reinit_after_reset(reset_context); 6209 if (r == -EAGAIN) 6210 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 6211 else 6212 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 6213 6214 out: 6215 return r; 6216 } 6217 6218 static void 
amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 6219 { 6220 6221 switch (amdgpu_asic_reset_method(adev)) { 6222 case AMD_RESET_METHOD_MODE1: 6223 case AMD_RESET_METHOD_LINK: 6224 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 6225 break; 6226 case AMD_RESET_METHOD_MODE2: 6227 adev->mp1_state = PP_MP1_STATE_RESET; 6228 break; 6229 default: 6230 adev->mp1_state = PP_MP1_STATE_NONE; 6231 break; 6232 } 6233 } 6234 6235 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 6236 { 6237 amdgpu_vf_error_trans_all(adev); 6238 adev->mp1_state = PP_MP1_STATE_NONE; 6239 } 6240 6241 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 6242 { 6243 struct pci_dev *p = NULL; 6244 6245 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 6246 adev->pdev->bus->number, 1); 6247 if (p) { 6248 pm_runtime_enable(&(p->dev)); 6249 pm_runtime_resume(&(p->dev)); 6250 } 6251 6252 pci_dev_put(p); 6253 } 6254 6255 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 6256 { 6257 enum amd_reset_method reset_method; 6258 struct pci_dev *p = NULL; 6259 u64 expires; 6260 6261 /* 6262 * For now, only BACO and mode1 reset are confirmed 6263 * to suffer the audio issue without proper suspended. 6264 */ 6265 reset_method = amdgpu_asic_reset_method(adev); 6266 if ((reset_method != AMD_RESET_METHOD_BACO) && 6267 (reset_method != AMD_RESET_METHOD_MODE1)) 6268 return -EINVAL; 6269 6270 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 6271 adev->pdev->bus->number, 1); 6272 if (!p) 6273 return -ENODEV; 6274 6275 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 6276 if (!expires) 6277 /* 6278 * If we cannot get the audio device autosuspend delay, 6279 * a fixed 4S interval will be used. Considering 3S is 6280 * the audio controller default autosuspend delay setting. 6281 * 4S used here is guaranteed to cover that. 6282 */ 6283 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 6284 6285 while (!pm_runtime_status_suspended(&(p->dev))) { 6286 if (!pm_runtime_suspend(&(p->dev))) 6287 break; 6288 6289 if (expires < ktime_get_mono_fast_ns()) { 6290 dev_warn(adev->dev, "failed to suspend display audio\n"); 6291 pci_dev_put(p); 6292 /* TODO: abort the succeeding gpu reset? */ 6293 return -ETIMEDOUT; 6294 } 6295 } 6296 6297 pm_runtime_disable(&(p->dev)); 6298 6299 pci_dev_put(p); 6300 return 0; 6301 } 6302 6303 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 6304 { 6305 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 6306 6307 #if defined(CONFIG_DEBUG_FS) 6308 if (!amdgpu_sriov_vf(adev)) 6309 cancel_work(&adev->reset_work); 6310 #endif 6311 cancel_work(&adev->userq_reset_work); 6312 6313 if (adev->kfd.dev) 6314 cancel_work(&adev->kfd.reset_work); 6315 6316 if (amdgpu_sriov_vf(adev)) 6317 cancel_work(&adev->virt.flr_work); 6318 6319 if (con && adev->ras_enabled) 6320 cancel_work(&con->recovery_work); 6321 6322 } 6323 6324 static int amdgpu_device_health_check(struct list_head *device_list_handle) 6325 { 6326 struct amdgpu_device *tmp_adev; 6327 int ret = 0; 6328 6329 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6330 ret |= amdgpu_device_bus_status_check(tmp_adev); 6331 } 6332 6333 return ret; 6334 } 6335 6336 static void amdgpu_device_recovery_prepare(struct amdgpu_device *adev, 6337 struct list_head *device_list, 6338 struct amdgpu_hive_info *hive) 6339 { 6340 struct amdgpu_device *tmp_adev = NULL; 6341 6342 /* 6343 * Build list of devices to reset. 
6344 * In case we are in XGMI hive mode, resort the device list 6345 * to put adev in the 1st position. 6346 */ 6347 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) { 6348 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 6349 list_add_tail(&tmp_adev->reset_list, device_list); 6350 if (adev->shutdown) 6351 tmp_adev->shutdown = true; 6352 if (amdgpu_reset_in_dpc(adev)) 6353 tmp_adev->pcie_reset_ctx.in_link_reset = true; 6354 } 6355 if (!list_is_first(&adev->reset_list, device_list)) 6356 list_rotate_to_front(&adev->reset_list, device_list); 6357 } else { 6358 list_add_tail(&adev->reset_list, device_list); 6359 } 6360 } 6361 6362 static void amdgpu_device_recovery_get_reset_lock(struct amdgpu_device *adev, 6363 struct list_head *device_list) 6364 { 6365 struct amdgpu_device *tmp_adev = NULL; 6366 6367 if (list_empty(device_list)) 6368 return; 6369 tmp_adev = 6370 list_first_entry(device_list, struct amdgpu_device, reset_list); 6371 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 6372 } 6373 6374 static void amdgpu_device_recovery_put_reset_lock(struct amdgpu_device *adev, 6375 struct list_head *device_list) 6376 { 6377 struct amdgpu_device *tmp_adev = NULL; 6378 6379 if (list_empty(device_list)) 6380 return; 6381 tmp_adev = 6382 list_first_entry(device_list, struct amdgpu_device, reset_list); 6383 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 6384 } 6385 6386 static void amdgpu_device_halt_activities(struct amdgpu_device *adev, 6387 struct amdgpu_job *job, 6388 struct amdgpu_reset_context *reset_context, 6389 struct list_head *device_list, 6390 struct amdgpu_hive_info *hive, 6391 bool need_emergency_restart) 6392 { 6393 struct amdgpu_device *tmp_adev = NULL; 6394 int i; 6395 6396 /* block all schedulers and reset given job's ring */ 6397 list_for_each_entry(tmp_adev, device_list, reset_list) { 6398 amdgpu_device_set_mp1_state(tmp_adev); 6399 6400 /* 6401 * Try to put the audio codec into suspend state 6402 * before gpu reset started. 6403 * 6404 * Due to the power domain of the graphics device 6405 * is shared with AZ power domain. Without this, 6406 * we may change the audio hardware from behind 6407 * the audio driver's back. That will trigger 6408 * some audio codec errors. 6409 */ 6410 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 6411 tmp_adev->pcie_reset_ctx.audio_suspended = true; 6412 6413 amdgpu_ras_set_error_query_ready(tmp_adev, false); 6414 6415 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 6416 6417 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context); 6418 6419 /* 6420 * Mark these ASICs to be reset as untracked first 6421 * And add them back after reset completed 6422 */ 6423 amdgpu_unregister_gpu_instance(tmp_adev); 6424 6425 drm_client_dev_suspend(adev_to_drm(tmp_adev)); 6426 6427 /* disable ras on ALL IPs */ 6428 if (!need_emergency_restart && !amdgpu_reset_in_dpc(adev) && 6429 amdgpu_device_ip_need_full_reset(tmp_adev)) 6430 amdgpu_ras_suspend(tmp_adev); 6431 6432 amdgpu_userq_pre_reset(tmp_adev); 6433 6434 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6435 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6436 6437 if (!amdgpu_ring_sched_ready(ring)) 6438 continue; 6439 6440 drm_sched_stop(&ring->sched, job ? 
&job->base : NULL); 6441 6442 if (need_emergency_restart) 6443 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 6444 } 6445 atomic_inc(&tmp_adev->gpu_reset_counter); 6446 } 6447 } 6448 6449 static int amdgpu_device_asic_reset(struct amdgpu_device *adev, 6450 struct list_head *device_list, 6451 struct amdgpu_reset_context *reset_context) 6452 { 6453 struct amdgpu_device *tmp_adev = NULL; 6454 int retry_limit = AMDGPU_MAX_RETRY_LIMIT; 6455 int r = 0; 6456 6457 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 6458 list_for_each_entry(tmp_adev, device_list, reset_list) { 6459 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 6460 /*TODO Should we stop ?*/ 6461 if (r) { 6462 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 6463 r, adev_to_drm(tmp_adev)->unique); 6464 tmp_adev->asic_reset_res = r; 6465 } 6466 } 6467 6468 /* Actual ASIC resets if needed.*/ 6469 /* Host driver will handle XGMI hive reset for SRIOV */ 6470 if (amdgpu_sriov_vf(adev)) { 6471 6472 /* Bail out of reset early */ 6473 if (amdgpu_ras_is_rma(adev)) 6474 return -ENODEV; 6475 6476 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { 6477 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); 6478 amdgpu_ras_set_fed(adev, true); 6479 set_bit(AMDGPU_HOST_FLR, &reset_context->flags); 6480 } 6481 6482 r = amdgpu_device_reset_sriov(adev, reset_context); 6483 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) { 6484 amdgpu_virt_release_full_gpu(adev, true); 6485 goto retry; 6486 } 6487 if (r) 6488 adev->asic_reset_res = r; 6489 } else { 6490 r = amdgpu_do_asic_reset(device_list, reset_context); 6491 if (r && r == -EAGAIN) 6492 goto retry; 6493 } 6494 6495 list_for_each_entry(tmp_adev, device_list, reset_list) { 6496 /* 6497 * Drop any pending non scheduler resets queued before reset is done. 6498 * Any reset scheduled after this point would be valid. Scheduler resets 6499 * were already dropped during drm_sched_stop and no new ones can come 6500 * in before drm_sched_start. 6501 */ 6502 amdgpu_device_stop_pending_resets(tmp_adev); 6503 } 6504 6505 return r; 6506 } 6507 6508 static int amdgpu_device_sched_resume(struct list_head *device_list, 6509 struct amdgpu_reset_context *reset_context, 6510 bool job_signaled) 6511 { 6512 struct amdgpu_device *tmp_adev = NULL; 6513 int i, r = 0; 6514 6515 /* Post ASIC reset for all devs .*/ 6516 list_for_each_entry(tmp_adev, device_list, reset_list) { 6517 6518 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6519 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6520 6521 if (!amdgpu_ring_sched_ready(ring)) 6522 continue; 6523 6524 drm_sched_start(&ring->sched, 0); 6525 } 6526 6527 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 6528 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 6529 6530 if (tmp_adev->asic_reset_res) { 6531 /* bad news, how to tell it to userspace ? 
6532 * for ras error, we should report GPU bad status instead of 6533 * reset failure 6534 */ 6535 if (reset_context->src != AMDGPU_RESET_SRC_RAS || 6536 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) 6537 dev_info( 6538 tmp_adev->dev, 6539 "GPU reset(%d) failed with error %d \n", 6540 atomic_read( 6541 &tmp_adev->gpu_reset_counter), 6542 tmp_adev->asic_reset_res); 6543 amdgpu_vf_error_put(tmp_adev, 6544 AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, 6545 tmp_adev->asic_reset_res); 6546 if (!r) 6547 r = tmp_adev->asic_reset_res; 6548 tmp_adev->asic_reset_res = 0; 6549 } else { 6550 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", 6551 atomic_read(&tmp_adev->gpu_reset_counter)); 6552 if (amdgpu_acpi_smart_shift_update(tmp_adev, 6553 AMDGPU_SS_DEV_D0)) 6554 dev_warn(tmp_adev->dev, 6555 "smart shift update failed\n"); 6556 } 6557 } 6558 6559 return r; 6560 } 6561 6562 static void amdgpu_device_gpu_resume(struct amdgpu_device *adev, 6563 struct list_head *device_list, 6564 bool need_emergency_restart) 6565 { 6566 struct amdgpu_device *tmp_adev = NULL; 6567 6568 list_for_each_entry(tmp_adev, device_list, reset_list) { 6569 /* unlock kfd: SRIOV would do it separately */ 6570 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 6571 amdgpu_amdkfd_post_reset(tmp_adev); 6572 6573 /* kfd_post_reset will do nothing if kfd device is not initialized, 6574 * need to bring up kfd here if it's not be initialized before 6575 */ 6576 if (!adev->kfd.init_complete) 6577 amdgpu_amdkfd_device_init(adev); 6578 6579 if (tmp_adev->pcie_reset_ctx.audio_suspended) 6580 amdgpu_device_resume_display_audio(tmp_adev); 6581 6582 amdgpu_device_unset_mp1_state(tmp_adev); 6583 6584 amdgpu_ras_set_error_query_ready(tmp_adev, true); 6585 6586 } 6587 } 6588 6589 6590 /** 6591 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 6592 * 6593 * @adev: amdgpu_device pointer 6594 * @job: which job trigger hang 6595 * @reset_context: amdgpu reset context pointer 6596 * 6597 * Attempt to reset the GPU if it has hung (all asics). 6598 * Attempt to do soft-reset or full-reset and reinitialize Asic 6599 * Returns 0 for success or an error on failure. 6600 */ 6601 6602 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 6603 struct amdgpu_job *job, 6604 struct amdgpu_reset_context *reset_context) 6605 { 6606 struct list_head device_list; 6607 bool job_signaled = false; 6608 struct amdgpu_hive_info *hive = NULL; 6609 int r = 0; 6610 bool need_emergency_restart = false; 6611 6612 /* 6613 * If it reaches here because of hang/timeout and a RAS error is 6614 * detected at the same time, let RAS recovery take care of it. 6615 */ 6616 if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) && 6617 !amdgpu_sriov_vf(adev) && 6618 reset_context->src != AMDGPU_RESET_SRC_RAS) { 6619 dev_dbg(adev->dev, 6620 "Gpu recovery from source: %d yielding to RAS error recovery handling", 6621 reset_context->src); 6622 return 0; 6623 } 6624 6625 /* 6626 * Special case: RAS triggered and full reset isn't supported 6627 */ 6628 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 6629 6630 /* 6631 * Flush RAM to disk so that after reboot 6632 * the user can read log and see why the system rebooted. 6633 */ 6634 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 6635 amdgpu_ras_get_context(adev)->reboot) { 6636 dev_warn(adev->dev, "Emergency reboot."); 6637 6638 ksys_sync_helper(); 6639 emergency_restart(); 6640 } 6641 6642 dev_info(adev->dev, "GPU %s begin!. Source: %d\n", 6643 need_emergency_restart ? 
"jobs stop" : "reset", 6644 reset_context->src); 6645 6646 if (!amdgpu_sriov_vf(adev)) 6647 hive = amdgpu_get_xgmi_hive(adev); 6648 if (hive) 6649 mutex_lock(&hive->hive_lock); 6650 6651 reset_context->job = job; 6652 reset_context->hive = hive; 6653 INIT_LIST_HEAD(&device_list); 6654 6655 amdgpu_device_recovery_prepare(adev, &device_list, hive); 6656 6657 if (!amdgpu_sriov_vf(adev)) { 6658 r = amdgpu_device_health_check(&device_list); 6659 if (r) 6660 goto end_reset; 6661 } 6662 6663 /* We need to lock reset domain only once both for XGMI and single device */ 6664 amdgpu_device_recovery_get_reset_lock(adev, &device_list); 6665 6666 amdgpu_device_halt_activities(adev, job, reset_context, &device_list, 6667 hive, need_emergency_restart); 6668 if (need_emergency_restart) 6669 goto skip_sched_resume; 6670 /* 6671 * Must check guilty signal here since after this point all old 6672 * HW fences are force signaled. 6673 * 6674 * job->base holds a reference to parent fence 6675 */ 6676 if (job && dma_fence_is_signaled(&job->hw_fence->base)) { 6677 job_signaled = true; 6678 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 6679 goto skip_hw_reset; 6680 } 6681 6682 r = amdgpu_device_asic_reset(adev, &device_list, reset_context); 6683 if (r) 6684 goto reset_unlock; 6685 skip_hw_reset: 6686 r = amdgpu_device_sched_resume(&device_list, reset_context, job_signaled); 6687 if (r) 6688 goto reset_unlock; 6689 skip_sched_resume: 6690 amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart); 6691 reset_unlock: 6692 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 6693 end_reset: 6694 if (hive) { 6695 mutex_unlock(&hive->hive_lock); 6696 amdgpu_put_xgmi_hive(hive); 6697 } 6698 6699 if (r) 6700 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 6701 6702 atomic_set(&adev->reset_domain->reset_res, r); 6703 6704 if (!r) { 6705 struct amdgpu_task_info *ti = NULL; 6706 6707 if (job) 6708 ti = amdgpu_vm_get_task_info_pasid(adev, job->pasid); 6709 6710 drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, 6711 ti ? &ti->task : NULL); 6712 6713 amdgpu_vm_put_task_info(ti); 6714 } 6715 6716 return r; 6717 } 6718 6719 /** 6720 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner 6721 * 6722 * @adev: amdgpu_device pointer 6723 * @speed: pointer to the speed of the link 6724 * @width: pointer to the width of the link 6725 * 6726 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6727 * first physical partner to an AMD dGPU. 6728 * This will exclude any virtual switches and links. 
 */
static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
					    enum pci_bus_speed *speed,
					    enum pcie_link_width *width)
{
	struct pci_dev *parent = adev->pdev;

	if (!speed || !width)
		return;

	*speed = PCI_SPEED_UNKNOWN;
	*width = PCIE_LNK_WIDTH_UNKNOWN;

	if (amdgpu_device_pcie_dynamic_switching_supported(adev)) {
		while ((parent = pci_upstream_bridge(parent))) {
			/* skip upstream/downstream switches internal to dGPU */
			if (parent->vendor == PCI_VENDOR_ID_ATI)
				continue;
			*speed = pcie_get_speed_cap(parent);
			*width = pcie_get_width_cap(parent);
			break;
		}
	} else {
		/* use the current speeds rather than max if switching is not supported */
		pcie_bandwidth_available(adev->pdev, NULL, speed, width);
	}
}

/**
 * amdgpu_device_gpu_bandwidth - find the bandwidth of the GPU
 *
 * @adev: amdgpu_device pointer
 * @speed: pointer to the speed of the link
 * @width: pointer to the width of the link
 *
 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
 * AMD dGPU which may be a virtual upstream bridge.
 */
static void amdgpu_device_gpu_bandwidth(struct amdgpu_device *adev,
					enum pci_bus_speed *speed,
					enum pcie_link_width *width)
{
	struct pci_dev *parent = adev->pdev;

	if (!speed || !width)
		return;

	parent = pci_upstream_bridge(parent);
	if (parent && parent->vendor == PCI_VENDOR_ID_ATI) {
		/* use the upstream/downstream switches internal to dGPU */
		*speed = pcie_get_speed_cap(parent);
		*width = pcie_get_width_cap(parent);
		while ((parent = pci_upstream_bridge(parent))) {
			if (parent->vendor == PCI_VENDOR_ID_ATI) {
				/* use the upstream/downstream switches internal to dGPU */
				*speed = pcie_get_speed_cap(parent);
				*width = pcie_get_width_cap(parent);
			}
		}
	} else {
		/* use the device itself */
		*speed = pcie_get_speed_cap(adev->pdev);
		*width = pcie_get_width_cap(adev->pdev);
	}
}

/**
 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIE capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIE config space may not be available.
6803 */ 6804 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 6805 { 6806 enum pci_bus_speed speed_cap, platform_speed_cap; 6807 enum pcie_link_width platform_link_width, link_width; 6808 6809 if (amdgpu_pcie_gen_cap) 6810 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 6811 6812 if (amdgpu_pcie_lane_cap) 6813 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 6814 6815 /* covers APUs as well */ 6816 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 6817 if (adev->pm.pcie_gen_mask == 0) 6818 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 6819 if (adev->pm.pcie_mlw_mask == 0) 6820 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 6821 return; 6822 } 6823 6824 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 6825 return; 6826 6827 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap, 6828 &platform_link_width); 6829 amdgpu_device_gpu_bandwidth(adev, &speed_cap, &link_width); 6830 6831 if (adev->pm.pcie_gen_mask == 0) { 6832 /* asic caps */ 6833 if (speed_cap == PCI_SPEED_UNKNOWN) { 6834 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6835 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6836 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6837 } else { 6838 if (speed_cap == PCIE_SPEED_32_0GT) 6839 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6840 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6841 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6842 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6843 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 6844 else if (speed_cap == PCIE_SPEED_16_0GT) 6845 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6846 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6847 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6848 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 6849 else if (speed_cap == PCIE_SPEED_8_0GT) 6850 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6851 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6852 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6853 else if (speed_cap == PCIE_SPEED_5_0GT) 6854 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6855 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 6856 else 6857 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 6858 } 6859 /* platform caps */ 6860 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 6861 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6862 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6863 } else { 6864 if (platform_speed_cap == PCIE_SPEED_32_0GT) 6865 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6866 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6867 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6868 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6869 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 6870 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 6871 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6872 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6873 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6874 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 6875 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 6876 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6877 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6878 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 6879 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 6880 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6881 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6882 else 6883 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 6884 6885 } 6886 } 6887 if (adev->pm.pcie_mlw_mask == 0) { 6888 /* asic caps */ 6889 if (link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6890 
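			/* ASIC lane width unknown, fall back to the default lane width mask */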
adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_ASIC_PCIE_MLW_MASK; 6891 } else { 6892 switch (link_width) { 6893 case PCIE_LNK_X32: 6894 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X32 | 6895 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6896 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6897 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6898 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6899 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6900 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6901 break; 6902 case PCIE_LNK_X16: 6903 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6904 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6905 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6906 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6907 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6908 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6909 break; 6910 case PCIE_LNK_X12: 6911 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6912 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6913 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6914 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6915 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6916 break; 6917 case PCIE_LNK_X8: 6918 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6919 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6920 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6921 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6922 break; 6923 case PCIE_LNK_X4: 6924 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6925 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6926 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6927 break; 6928 case PCIE_LNK_X2: 6929 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6930 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6931 break; 6932 case PCIE_LNK_X1: 6933 adev->pm.pcie_mlw_mask |= CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1; 6934 break; 6935 default: 6936 break; 6937 } 6938 } 6939 /* platform caps */ 6940 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6941 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 6942 } else { 6943 switch (platform_link_width) { 6944 case PCIE_LNK_X32: 6945 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 6946 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6947 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6948 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6949 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6950 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6951 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6952 break; 6953 case PCIE_LNK_X16: 6954 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6955 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6956 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6957 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6958 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6959 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6960 break; 6961 case PCIE_LNK_X12: 6962 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6963 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6964 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6965 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6966 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6967 break; 6968 case PCIE_LNK_X8: 6969 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6970 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6971 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6972 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6973 break; 6974 case PCIE_LNK_X4: 6975 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6976 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6977 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6978 break; 6979 case PCIE_LNK_X2: 6980 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6981 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6982 break; 6983 case PCIE_LNK_X1: 6984 adev->pm.pcie_mlw_mask |= CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 6985 break; 6986 
default: 6987 break; 6988 } 6989 } 6990 } 6991 } 6992 6993 /** 6994 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 6995 * 6996 * @adev: amdgpu_device pointer 6997 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 6998 * 6999 * Return true if @peer_adev can access (DMA) @adev through the PCIe 7000 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 7001 * @peer_adev. 7002 */ 7003 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 7004 struct amdgpu_device *peer_adev) 7005 { 7006 #ifdef CONFIG_HSA_AMD_P2P 7007 bool p2p_access = 7008 !adev->gmc.xgmi.connected_to_cpu && 7009 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 7010 if (!p2p_access) 7011 dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n", 7012 pci_name(peer_adev->pdev)); 7013 7014 bool is_large_bar = adev->gmc.visible_vram_size && 7015 adev->gmc.real_vram_size == adev->gmc.visible_vram_size; 7016 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev); 7017 7018 if (!p2p_addressable) { 7019 uint64_t address_mask = peer_adev->dev->dma_mask ? 7020 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 7021 resource_size_t aper_limit = 7022 adev->gmc.aper_base + adev->gmc.aper_size - 1; 7023 7024 p2p_addressable = !(adev->gmc.aper_base & address_mask || 7025 aper_limit & address_mask); 7026 } 7027 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable; 7028 #else 7029 return false; 7030 #endif 7031 } 7032 7033 int amdgpu_device_baco_enter(struct amdgpu_device *adev) 7034 { 7035 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 7036 7037 if (!amdgpu_device_supports_baco(adev)) 7038 return -ENOTSUPP; 7039 7040 if (ras && adev->ras_enabled && 7041 adev->nbio.funcs->enable_doorbell_interrupt) 7042 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 7043 7044 return amdgpu_dpm_baco_enter(adev); 7045 } 7046 7047 int amdgpu_device_baco_exit(struct amdgpu_device *adev) 7048 { 7049 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 7050 int ret = 0; 7051 7052 if (!amdgpu_device_supports_baco(adev)) 7053 return -ENOTSUPP; 7054 7055 ret = amdgpu_dpm_baco_exit(adev); 7056 if (ret) 7057 return ret; 7058 7059 if (ras && adev->ras_enabled && 7060 adev->nbio.funcs->enable_doorbell_interrupt) 7061 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 7062 7063 if (amdgpu_passthrough(adev) && adev->nbio.funcs && 7064 adev->nbio.funcs->clear_doorbell_interrupt) 7065 adev->nbio.funcs->clear_doorbell_interrupt(adev); 7066 7067 return 0; 7068 } 7069 7070 /** 7071 * amdgpu_pci_error_detected - Called when a PCI error is detected. 7072 * @pdev: PCI device struct 7073 * @state: PCI channel state 7074 * 7075 * Description: Called when a PCI error is detected. 7076 * 7077 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
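 * PCI_ERS_RESULT_CAN_RECOVER is returned for pci_channel_io_normal, where
 * register access still works and no slot reset is required.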
7078 */ 7079 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 7080 { 7081 struct drm_device *dev = pci_get_drvdata(pdev); 7082 struct amdgpu_device *adev = drm_to_adev(dev); 7083 struct amdgpu_hive_info *hive __free(xgmi_put_hive) = 7084 amdgpu_get_xgmi_hive(adev); 7085 struct amdgpu_reset_context reset_context; 7086 struct list_head device_list; 7087 7088 dev_info(adev->dev, "PCI error: detected callback!!\n"); 7089 7090 adev->pci_channel_state = state; 7091 7092 switch (state) { 7093 case pci_channel_io_normal: 7094 dev_info(adev->dev, "pci_channel_io_normal: state(%d)!!\n", state); 7095 return PCI_ERS_RESULT_CAN_RECOVER; 7096 case pci_channel_io_frozen: 7097 /* Fatal error, prepare for slot reset */ 7098 dev_info(adev->dev, "pci_channel_io_frozen: state(%d)!!\n", state); 7099 if (hive) { 7100 /* Hive devices should be able to support FW based 7101 * link reset on other devices, if not return. 7102 */ 7103 if (!amdgpu_dpm_is_link_reset_supported(adev)) { 7104 dev_warn(adev->dev, 7105 "No support for XGMI hive yet...\n"); 7106 return PCI_ERS_RESULT_DISCONNECT; 7107 } 7108 /* Set dpc status only if device is part of hive 7109 * Non-hive devices should be able to recover after 7110 * link reset. 7111 */ 7112 amdgpu_reset_set_dpc_status(adev, true); 7113 7114 mutex_lock(&hive->hive_lock); 7115 } 7116 memset(&reset_context, 0, sizeof(reset_context)); 7117 INIT_LIST_HEAD(&device_list); 7118 7119 amdgpu_device_recovery_prepare(adev, &device_list, hive); 7120 amdgpu_device_recovery_get_reset_lock(adev, &device_list); 7121 amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list, 7122 hive, false); 7123 if (hive) 7124 mutex_unlock(&hive->hive_lock); 7125 return PCI_ERS_RESULT_NEED_RESET; 7126 case pci_channel_io_perm_failure: 7127 /* Permanent error, prepare for device removal */ 7128 dev_info(adev->dev, "pci_channel_io_perm_failure: state(%d)!!\n", state); 7129 return PCI_ERS_RESULT_DISCONNECT; 7130 } 7131 7132 return PCI_ERS_RESULT_NEED_RESET; 7133 } 7134 7135 /** 7136 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 7137 * @pdev: pointer to PCI device 7138 */ 7139 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 7140 { 7141 struct drm_device *dev = pci_get_drvdata(pdev); 7142 struct amdgpu_device *adev = drm_to_adev(dev); 7143 7144 dev_info(adev->dev, "PCI error: mmio enabled callback!!\n"); 7145 7146 /* TODO - dump whatever for debugging purposes */ 7147 7148 /* This called only if amdgpu_pci_error_detected returns 7149 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 7150 * works, no need to reset slot. 7151 */ 7152 7153 return PCI_ERS_RESULT_RECOVERED; 7154 } 7155 7156 /** 7157 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 7158 * @pdev: PCI device struct 7159 * 7160 * Description: This routine is called by the pci error recovery 7161 * code after the PCI slot has been reset, just before we 7162 * should resume normal operations. 
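 *
 * Return: PCI_ERS_RESULT_RECOVERED if the ASIC came back and reinitialization
 * succeeded, PCI_ERS_RESULT_DISCONNECT otherwise.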
7163 */ 7164 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 7165 { 7166 struct drm_device *dev = pci_get_drvdata(pdev); 7167 struct amdgpu_device *adev = drm_to_adev(dev); 7168 struct amdgpu_reset_context reset_context; 7169 struct amdgpu_device *tmp_adev; 7170 struct amdgpu_hive_info *hive; 7171 struct list_head device_list; 7172 struct pci_dev *link_dev; 7173 int r = 0, i, timeout; 7174 u32 memsize; 7175 u16 status; 7176 7177 dev_info(adev->dev, "PCI error: slot reset callback!!\n"); 7178 7179 memset(&reset_context, 0, sizeof(reset_context)); 7180 7181 if (adev->pcie_reset_ctx.swus) 7182 link_dev = adev->pcie_reset_ctx.swus; 7183 else 7184 link_dev = adev->pdev; 7185 /* wait for asic to come out of reset, timeout = 10s */ 7186 timeout = 10000; 7187 do { 7188 usleep_range(10000, 10500); 7189 r = pci_read_config_word(link_dev, PCI_VENDOR_ID, &status); 7190 timeout -= 10; 7191 } while (timeout > 0 && (status != PCI_VENDOR_ID_ATI) && 7192 (status != PCI_VENDOR_ID_AMD)); 7193 7194 if ((status != PCI_VENDOR_ID_ATI) && (status != PCI_VENDOR_ID_AMD)) { 7195 r = -ETIME; 7196 goto out; 7197 } 7198 7199 amdgpu_device_load_switch_state(adev); 7200 /* Restore PCI confspace */ 7201 amdgpu_device_load_pci_state(pdev); 7202 7203 /* confirm ASIC came out of reset */ 7204 for (i = 0; i < adev->usec_timeout; i++) { 7205 memsize = amdgpu_asic_get_config_memsize(adev); 7206 7207 if (memsize != 0xffffffff) 7208 break; 7209 udelay(1); 7210 } 7211 if (memsize == 0xffffffff) { 7212 r = -ETIME; 7213 goto out; 7214 } 7215 7216 reset_context.method = AMD_RESET_METHOD_NONE; 7217 reset_context.reset_req_dev = adev; 7218 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 7219 set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags); 7220 INIT_LIST_HEAD(&device_list); 7221 7222 hive = amdgpu_get_xgmi_hive(adev); 7223 if (hive) { 7224 mutex_lock(&hive->hive_lock); 7225 reset_context.hive = hive; 7226 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 7227 tmp_adev->pcie_reset_ctx.in_link_reset = true; 7228 list_add_tail(&tmp_adev->reset_list, &device_list); 7229 } 7230 } else { 7231 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 7232 list_add_tail(&adev->reset_list, &device_list); 7233 } 7234 7235 r = amdgpu_device_asic_reset(adev, &device_list, &reset_context); 7236 out: 7237 if (!r) { 7238 if (amdgpu_device_cache_pci_state(adev->pdev)) 7239 pci_restore_state(adev->pdev); 7240 dev_info(adev->dev, "PCIe error recovery succeeded\n"); 7241 } else { 7242 dev_err(adev->dev, "PCIe error recovery failed, err:%d\n", r); 7243 if (hive) { 7244 list_for_each_entry(tmp_adev, &device_list, reset_list) 7245 amdgpu_device_unset_mp1_state(tmp_adev); 7246 } 7247 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 7248 } 7249 7250 if (hive) { 7251 mutex_unlock(&hive->hive_lock); 7252 amdgpu_put_xgmi_hive(hive); 7253 } 7254 7255 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 7256 } 7257 7258 /** 7259 * amdgpu_pci_resume() - resume normal ops after PCI reset 7260 * @pdev: pointer to PCI device 7261 * 7262 * Called when the error recovery driver tells us that its 7263 * OK to resume normal operation. 
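 * Only the pci_channel_io_frozen case needs handling here; for other channel
 * states this callback returns without doing anything.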
7264 */ 7265 void amdgpu_pci_resume(struct pci_dev *pdev) 7266 { 7267 struct drm_device *dev = pci_get_drvdata(pdev); 7268 struct amdgpu_device *adev = drm_to_adev(dev); 7269 struct list_head device_list; 7270 struct amdgpu_hive_info *hive = NULL; 7271 struct amdgpu_device *tmp_adev = NULL; 7272 7273 dev_info(adev->dev, "PCI error: resume callback!!\n"); 7274 7275 /* Only continue execution for the case of pci_channel_io_frozen */ 7276 if (adev->pci_channel_state != pci_channel_io_frozen) 7277 return; 7278 7279 INIT_LIST_HEAD(&device_list); 7280 7281 hive = amdgpu_get_xgmi_hive(adev); 7282 if (hive) { 7283 mutex_lock(&hive->hive_lock); 7284 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 7285 tmp_adev->pcie_reset_ctx.in_link_reset = false; 7286 list_add_tail(&tmp_adev->reset_list, &device_list); 7287 } 7288 } else 7289 list_add_tail(&adev->reset_list, &device_list); 7290 7291 amdgpu_device_sched_resume(&device_list, NULL, NULL); 7292 amdgpu_device_gpu_resume(adev, &device_list, false); 7293 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 7294 7295 if (hive) { 7296 mutex_unlock(&hive->hive_lock); 7297 amdgpu_put_xgmi_hive(hive); 7298 } 7299 } 7300 7301 static void amdgpu_device_cache_switch_state(struct amdgpu_device *adev) 7302 { 7303 struct pci_dev *swus, *swds; 7304 int r; 7305 7306 swds = pci_upstream_bridge(adev->pdev); 7307 if (!swds || swds->vendor != PCI_VENDOR_ID_ATI || 7308 pci_pcie_type(swds) != PCI_EXP_TYPE_DOWNSTREAM) 7309 return; 7310 swus = pci_upstream_bridge(swds); 7311 if (!swus || 7312 (swus->vendor != PCI_VENDOR_ID_ATI && 7313 swus->vendor != PCI_VENDOR_ID_AMD) || 7314 pci_pcie_type(swus) != PCI_EXP_TYPE_UPSTREAM) 7315 return; 7316 7317 /* If already saved, return */ 7318 if (adev->pcie_reset_ctx.swus) 7319 return; 7320 /* Upstream bridge is ATI, assume it's SWUS/DS architecture */ 7321 r = pci_save_state(swds); 7322 if (r) 7323 return; 7324 adev->pcie_reset_ctx.swds_pcistate = pci_store_saved_state(swds); 7325 7326 r = pci_save_state(swus); 7327 if (r) 7328 return; 7329 adev->pcie_reset_ctx.swus_pcistate = pci_store_saved_state(swus); 7330 7331 adev->pcie_reset_ctx.swus = swus; 7332 } 7333 7334 static void amdgpu_device_load_switch_state(struct amdgpu_device *adev) 7335 { 7336 struct pci_dev *pdev; 7337 int r; 7338 7339 if (!adev->pcie_reset_ctx.swds_pcistate || 7340 !adev->pcie_reset_ctx.swus_pcistate) 7341 return; 7342 7343 pdev = adev->pcie_reset_ctx.swus; 7344 r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swus_pcistate); 7345 if (!r) { 7346 pci_restore_state(pdev); 7347 } else { 7348 dev_warn(adev->dev, "Failed to load SWUS state, err:%d\n", r); 7349 return; 7350 } 7351 7352 pdev = pci_upstream_bridge(adev->pdev); 7353 r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swds_pcistate); 7354 if (!r) 7355 pci_restore_state(pdev); 7356 else 7357 dev_warn(adev->dev, "Failed to load SWDS state, err:%d\n", r); 7358 } 7359 7360 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 7361 { 7362 struct drm_device *dev = pci_get_drvdata(pdev); 7363 struct amdgpu_device *adev = drm_to_adev(dev); 7364 int r; 7365 7366 if (amdgpu_sriov_vf(adev)) 7367 return false; 7368 7369 r = pci_save_state(pdev); 7370 if (!r) { 7371 kfree(adev->pci_state); 7372 7373 adev->pci_state = pci_store_saved_state(pdev); 7374 7375 if (!adev->pci_state) { 7376 dev_err(adev->dev, "Failed to store PCI saved state"); 7377 return false; 7378 } 7379 } else { 7380 dev_warn(adev->dev, "Failed to save PCI state, err:%d\n", r); 7381 return false; 7382 } 7383 7384 
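	/* Also cache the config space of any AMD/ATI PCIe switch above the GPU
	 * so it can be restored after a link reset.
	 */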
	amdgpu_device_cache_switch_state(adev);

	return true;
}

bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		dev_warn(adev->dev, "Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush) {
		amdgpu_ring_emit_hdp_flush(ring);
		return;
	}

	if (!ring && amdgpu_sriov_runtime(adev)) {
		if (!amdgpu_kiq_hdp_flush(adev))
			return;
	}

	amdgpu_hdp_flush(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_hdp_invalidate(adev, ring);
}

int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. It helps to maintain the error context when an error occurs.
 * Compared to a simple hang, the system stays stable at least for SSH
 * access. Then it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc),
 *    clears all CPU mappings to the device, disallows remappings through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6.
int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. It helps to maintain error context when an error occurs.
 * Compared to a simple hang, the system will stay stable at least for SSH
 * access. Then it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc),
 *    clears all CPU mappings to the device, disallows remappings through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in-flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	amdgpu_xcp_dev_unplug(adev);
	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				 u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				  u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_gang - return a reference to the current gang
 * @adev: amdgpu_device pointer
 *
 * Returns: A new reference to the current gang leader.
 */
struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev)
{
	struct dma_fence *fence;

	rcu_read_lock();
	fence = dma_fence_get_rcu_safe(&adev->gang_submit);
	rcu_read_unlock();
	return fence;
}

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	dma_fence_get(gang);
	do {
		dma_fence_put(old);
		old = amdgpu_device_get_gang(adev);
		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old)) {
			dma_fence_put(gang);
			return old;
		}

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	/*
	 * Drop it once for the exchanged reference in adev and once for the
	 * thread local reference acquired in amdgpu_device_get_gang().
	 */
	dma_fence_put(old);
	dma_fence_put(old);
	return NULL;
}
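/*
 * Illustrative sketch, not driver code: a hypothetical submission path could
 * consume amdgpu_device_switch_gang() by treating a non-NULL return value as
 * the dependency that must signal before its own gang may run, e.g.
 *
 *	fence = amdgpu_device_switch_gang(adev, job->gang_submit);
 *	if (fence)
 *		return fence;	// wait on the old gang leader, then try again
 *
 * The caller owns the returned reference and must eventually drop it with
 * dma_fence_put() once it has been consumed.
 */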
/**
 * amdgpu_device_enforce_isolation - enforce HW isolation
 * @adev: the amdgpu device pointer
 * @ring: the HW ring the job is supposed to run on
 * @job: the job which is about to be pushed to the HW ring
 *
 * Makes sure that only one client at a time can use the GFX block.
 * Returns: The dependency to wait on before the job can be pushed to the HW.
 * The function is called multiple times until NULL is returned.
 */
struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev,
						  struct amdgpu_ring *ring,
						  struct amdgpu_job *job)
{
	struct amdgpu_isolation *isolation = &adev->isolation[ring->xcp_id];
	struct drm_sched_fence *f = job->base.s_fence;
	struct dma_fence *dep;
	void *owner;
	int r;

	/*
	 * For now enforce isolation only for the GFX block since we only need
	 * the cleaner shader on those rings.
	 */
	if (ring->funcs->type != AMDGPU_RING_TYPE_GFX &&
	    ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE)
		return NULL;

	/*
	 * All submissions where enforce isolation is false are handled as if
	 * they come from a single client. Use ~0l as the owner to distinguish
	 * them from kernel submissions where the owner is NULL.
	 */
	owner = job->enforce_isolation ? f->owner : (void *)~0l;

	mutex_lock(&adev->enforce_isolation_mutex);

	/*
	 * The "spearhead" submission is the first one which changes the
	 * ownership to its client. We always need to wait for it to be
	 * pushed to the HW before proceeding with anything.
	 */
	if (&f->scheduled != isolation->spearhead &&
	    !dma_fence_is_signaled(isolation->spearhead)) {
		dep = isolation->spearhead;
		goto out_grab_ref;
	}

	if (isolation->owner != owner) {
		/*
		 * Wait for any gang to be assembled before switching to a
		 * different owner or otherwise we could deadlock the
		 * submissions.
		 */
		if (!job->gang_submit) {
			dep = amdgpu_device_get_gang(adev);
			if (!dma_fence_is_signaled(dep))
				goto out_return_dep;
			dma_fence_put(dep);
		}

		dma_fence_put(isolation->spearhead);
		isolation->spearhead = dma_fence_get(&f->scheduled);
		amdgpu_sync_move(&isolation->active, &isolation->prev);
		trace_amdgpu_isolation(isolation->owner, owner);
		isolation->owner = owner;
	}

	/*
	 * Specifying the ring here helps to pipeline submissions even when
	 * isolation is enabled. If that is not desired for testing, NULL can
	 * be used instead of the ring to enforce a CPU round trip while
	 * switching between clients.
	 */
	dep = amdgpu_sync_peek_fence(&isolation->prev, ring);
	r = amdgpu_sync_fence(&isolation->active, &f->finished, GFP_NOWAIT);
	if (r)
		dev_warn(adev->dev, "OOM tracking isolation\n");

out_grab_ref:
	dma_fence_get(dep);
out_return_dep:
	mutex_unlock(&adev->enforce_isolation_mutex);
	return dep;
}
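/*
 * Illustrative sketch only: per the kernel-doc above, a hypothetical
 * prepare_job-style callback would keep handing the returned dependency back
 * to the scheduler until this function returns NULL, e.g.
 *
 *	dep = amdgpu_device_enforce_isolation(adev, ring, job);
 *	if (dep)
 *		return dep;	// scheduler waits on it, then asks again
 *
 *	// NULL: the job may now be pushed to the HW ring
 */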
bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
				    uint32_t inst, uint32_t reg_addr, char reg_name[],
				    uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else {
			udelay(1);
		}
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			dev_warn(
				adev->dev,
				"Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				inst, reg_name, (uint32_t)expected_value,
				(uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}

ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
{
	ssize_t size = 0;

	if (!ring || !ring->adev)
		return size;

	if (amdgpu_device_should_recover_gpu(ring->adev))
		size |= AMDGPU_RESET_TYPE_FULL;

	if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
	    !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery)
		size |= AMDGPU_RESET_TYPE_SOFT_RESET;

	return size;
}

ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
{
	ssize_t size = 0;

	if (supported_reset == 0) {
		size += sysfs_emit_at(buf, size, "unsupported");
		size += sysfs_emit_at(buf, size, "\n");
		return size;
	}

	if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
		size += sysfs_emit_at(buf, size, "soft ");

	if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
		size += sysfs_emit_at(buf, size, "queue ");

	if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)
		size += sysfs_emit_at(buf, size, "pipe ");

	if (supported_reset & AMDGPU_RESET_TYPE_FULL)
		size += sysfs_emit_at(buf, size, "full ");

	size += sysfs_emit_at(buf, size, "\n");
	return size;
}
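/*
 * Illustrative sketch only: a hypothetical per-IP sysfs attribute could emit
 * its supported reset types with the helper above. The mask source shown
 * here (adev->gfx.gfx_supported_reset) is an assumption for the example.
 *
 *	static ssize_t reset_mask_show(struct device *dev,
 *				       struct device_attribute *attr, char *buf)
 *	{
 *		struct drm_device *ddev = dev_get_drvdata(dev);
 *		struct amdgpu_device *adev = drm_to_adev(ddev);
 *
 *		return amdgpu_show_reset_mask(buf, adev->gfx.gfx_supported_reset);
 *	}
 */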
void amdgpu_device_set_uid(struct amdgpu_uid *uid_info,
			   enum amdgpu_uid_type type, uint8_t inst,
			   uint64_t uid)
{
	if (!uid_info)
		return;

	if (type >= AMDGPU_UID_TYPE_MAX) {
		dev_err_once(uid_info->adev->dev, "Invalid UID type %d\n",
			     type);
		return;
	}

	if (inst >= AMDGPU_UID_INST_MAX) {
		dev_err_once(uid_info->adev->dev, "Invalid UID instance %d\n",
			     inst);
		return;
	}

	if (uid_info->uid[type][inst] != 0) {
		dev_warn_once(
			uid_info->adev->dev,
			"Overwriting existing UID %llu for type %d instance %d\n",
			uid_info->uid[type][inst], type, inst);
	}

	uid_info->uid[type][inst] = uid;
}

u64 amdgpu_device_get_uid(struct amdgpu_uid *uid_info,
			  enum amdgpu_uid_type type, uint8_t inst)
{
	if (!uid_info)
		return 0;

	if (type >= AMDGPU_UID_TYPE_MAX) {
		dev_err_once(uid_info->adev->dev, "Invalid UID type %d\n",
			     type);
		return 0;
	}

	if (inst >= AMDGPU_UID_INST_MAX) {
		dev_err_once(uid_info->adev->dev, "Invalid UID instance %d\n",
			     inst);
		return 0;
	}

	return uid_info->uid[type][inst];
}
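/*
 * Illustrative sketch only: given a struct amdgpu_uid *uid_info, an IP
 * discovery or firmware parsing path could record and later query a
 * per-instance unique ID through the helpers above. The enumerator
 * AMDGPU_UID_TYPE_SOC used below is a hypothetical example value, not
 * necessarily one of the defined amdgpu_uid_type entries.
 *
 *	amdgpu_device_set_uid(uid_info, AMDGPU_UID_TYPE_SOC, 0, soc_uid);
 *	...
 *	soc_uid = amdgpu_device_get_uid(uid_info, AMDGPU_UID_TYPE_SOC, 0);
 */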