1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 29 #include <linux/aperture.h> 30 #include <linux/power_supply.h> 31 #include <linux/kthread.h> 32 #include <linux/module.h> 33 #include <linux/console.h> 34 #include <linux/slab.h> 35 #include <linux/iommu.h> 36 #include <linux/pci.h> 37 #include <linux/pci-p2pdma.h> 38 #include <linux/apple-gmux.h> 39 40 #include <drm/drm_atomic_helper.h> 41 #include <drm/drm_client_event.h> 42 #include <drm/drm_crtc_helper.h> 43 #include <drm/drm_probe_helper.h> 44 #include <drm/amdgpu_drm.h> 45 #include <linux/device.h> 46 #include <linux/vgaarb.h> 47 #include <linux/vga_switcheroo.h> 48 #include <linux/efi.h> 49 #include "amdgpu.h" 50 #include "amdgpu_trace.h" 51 #include "amdgpu_i2c.h" 52 #include "atom.h" 53 #include "amdgpu_atombios.h" 54 #include "amdgpu_atomfirmware.h" 55 #include "amd_pcie.h" 56 #ifdef CONFIG_DRM_AMDGPU_SI 57 #include "si.h" 58 #endif 59 #ifdef CONFIG_DRM_AMDGPU_CIK 60 #include "cik.h" 61 #endif 62 #include "vi.h" 63 #include "soc15.h" 64 #include "nv.h" 65 #include "bif/bif_4_1_d.h" 66 #include <linux/firmware.h> 67 #include "amdgpu_vf_error.h" 68 69 #include "amdgpu_amdkfd.h" 70 #include "amdgpu_pm.h" 71 72 #include "amdgpu_xgmi.h" 73 #include "amdgpu_ras.h" 74 #include "amdgpu_ras_mgr.h" 75 #include "amdgpu_pmu.h" 76 #include "amdgpu_fru_eeprom.h" 77 #include "amdgpu_reset.h" 78 #include "amdgpu_virt.h" 79 #include "amdgpu_dev_coredump.h" 80 81 #include <linux/suspend.h> 82 #include <drm/task_barrier.h> 83 #include <linux/pm_runtime.h> 84 85 #include <drm/drm_drv.h> 86 87 #if IS_ENABLED(CONFIG_X86) 88 #include <asm/intel-family.h> 89 #include <asm/cpu_device_id.h> 90 #endif 91 92 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 93 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 94 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 95 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 96 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 97 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 98 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 99 MODULE_FIRMWARE("amdgpu/cyan_skillfish_gpu_info.bin"); 100 101 #define AMDGPU_RESUME_MS 2000 102 #define AMDGPU_MAX_RETRY_LIMIT 2 103 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 104 #define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2) 
105 #define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2) 106 #define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2) 107 108 #define AMDGPU_VBIOS_SKIP (1U << 0) 109 #define AMDGPU_VBIOS_OPTIONAL (1U << 1) 110 111 static const struct drm_driver amdgpu_kms_driver; 112 113 const char *amdgpu_asic_name[] = { 114 "TAHITI", 115 "PITCAIRN", 116 "VERDE", 117 "OLAND", 118 "HAINAN", 119 "BONAIRE", 120 "KAVERI", 121 "KABINI", 122 "HAWAII", 123 "MULLINS", 124 "TOPAZ", 125 "TONGA", 126 "FIJI", 127 "CARRIZO", 128 "STONEY", 129 "POLARIS10", 130 "POLARIS11", 131 "POLARIS12", 132 "VEGAM", 133 "VEGA10", 134 "VEGA12", 135 "VEGA20", 136 "RAVEN", 137 "ARCTURUS", 138 "RENOIR", 139 "ALDEBARAN", 140 "NAVI10", 141 "CYAN_SKILLFISH", 142 "NAVI14", 143 "NAVI12", 144 "SIENNA_CICHLID", 145 "NAVY_FLOUNDER", 146 "VANGOGH", 147 "DIMGREY_CAVEFISH", 148 "BEIGE_GOBY", 149 "YELLOW_CARP", 150 "IP DISCOVERY", 151 "LAST", 152 }; 153 154 #define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0) 155 /* 156 * Default init level where all blocks are expected to be initialized. This is 157 * the level of initialization expected by default and also after a full reset 158 * of the device. 159 */ 160 struct amdgpu_init_level amdgpu_init_default = { 161 .level = AMDGPU_INIT_LEVEL_DEFAULT, 162 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL, 163 }; 164 165 struct amdgpu_init_level amdgpu_init_recovery = { 166 .level = AMDGPU_INIT_LEVEL_RESET_RECOVERY, 167 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL, 168 }; 169 170 /* 171 * Minimal blocks needed to be initialized before a XGMI hive can be reset. This 172 * is used for cases like reset on initialization where the entire hive needs to 173 * be reset before first use. 174 */ 175 struct amdgpu_init_level amdgpu_init_minimal_xgmi = { 176 .level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI, 177 .hwini_ip_block_mask = 178 BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) | 179 BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) | 180 BIT(AMD_IP_BLOCK_TYPE_PSP) 181 }; 182 183 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev); 184 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev); 185 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev); 186 187 static void amdgpu_device_load_switch_state(struct amdgpu_device *adev); 188 189 static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev, 190 enum amd_ip_block_type block) 191 { 192 return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0; 193 } 194 195 void amdgpu_set_init_level(struct amdgpu_device *adev, 196 enum amdgpu_init_lvl_id lvl) 197 { 198 switch (lvl) { 199 case AMDGPU_INIT_LEVEL_MINIMAL_XGMI: 200 adev->init_lvl = &amdgpu_init_minimal_xgmi; 201 break; 202 case AMDGPU_INIT_LEVEL_RESET_RECOVERY: 203 adev->init_lvl = &amdgpu_init_recovery; 204 break; 205 case AMDGPU_INIT_LEVEL_DEFAULT: 206 fallthrough; 207 default: 208 adev->init_lvl = &amdgpu_init_default; 209 break; 210 } 211 } 212 213 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev); 214 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 215 void *data); 216 217 /** 218 * DOC: pcie_replay_count 219 * 220 * The amdgpu driver provides a sysfs API for reporting the total number 221 * of PCIe replays (NAKs). 222 * The file pcie_replay_count is used for this and returns the total 223 * number of replays as a sum of the NAKs generated and NAKs received. 
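 *
 * For example, the count can be read from the PCI device's sysfs directory
 * (path shown for illustration only; the exact BDF depends on the system):
 *
 *   cat /sys/bus/pci/devices/0000:03:00.0/pcie_replay_count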
224 */ 225 226 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 227 struct device_attribute *attr, char *buf) 228 { 229 struct drm_device *ddev = dev_get_drvdata(dev); 230 struct amdgpu_device *adev = drm_to_adev(ddev); 231 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 232 233 return sysfs_emit(buf, "%llu\n", cnt); 234 } 235 236 static DEVICE_ATTR(pcie_replay_count, 0444, 237 amdgpu_device_get_pcie_replay_count, NULL); 238 239 static int amdgpu_device_attr_sysfs_init(struct amdgpu_device *adev) 240 { 241 int ret = 0; 242 243 if (amdgpu_nbio_is_replay_cnt_supported(adev)) 244 ret = sysfs_create_file(&adev->dev->kobj, 245 &dev_attr_pcie_replay_count.attr); 246 247 return ret; 248 } 249 250 static void amdgpu_device_attr_sysfs_fini(struct amdgpu_device *adev) 251 { 252 if (amdgpu_nbio_is_replay_cnt_supported(adev)) 253 sysfs_remove_file(&adev->dev->kobj, 254 &dev_attr_pcie_replay_count.attr); 255 } 256 257 static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj, 258 const struct bin_attribute *attr, char *buf, 259 loff_t ppos, size_t count) 260 { 261 struct device *dev = kobj_to_dev(kobj); 262 struct drm_device *ddev = dev_get_drvdata(dev); 263 struct amdgpu_device *adev = drm_to_adev(ddev); 264 ssize_t bytes_read; 265 266 switch (ppos) { 267 case AMDGPU_SYS_REG_STATE_XGMI: 268 bytes_read = amdgpu_asic_get_reg_state( 269 adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count); 270 break; 271 case AMDGPU_SYS_REG_STATE_WAFL: 272 bytes_read = amdgpu_asic_get_reg_state( 273 adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count); 274 break; 275 case AMDGPU_SYS_REG_STATE_PCIE: 276 bytes_read = amdgpu_asic_get_reg_state( 277 adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count); 278 break; 279 case AMDGPU_SYS_REG_STATE_USR: 280 bytes_read = amdgpu_asic_get_reg_state( 281 adev, AMDGPU_REG_STATE_TYPE_USR, buf, count); 282 break; 283 case AMDGPU_SYS_REG_STATE_USR_1: 284 bytes_read = amdgpu_asic_get_reg_state( 285 adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count); 286 break; 287 default: 288 return -EINVAL; 289 } 290 291 return bytes_read; 292 } 293 294 static const BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL, 295 AMDGPU_SYS_REG_STATE_END); 296 297 int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev) 298 { 299 int ret; 300 301 if (!amdgpu_asic_get_reg_state_supported(adev)) 302 return 0; 303 304 ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 305 306 return ret; 307 } 308 309 void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev) 310 { 311 if (!amdgpu_asic_get_reg_state_supported(adev)) 312 return; 313 sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 314 } 315 316 int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block) 317 { 318 int r; 319 320 if (ip_block->version->funcs->suspend) { 321 r = ip_block->version->funcs->suspend(ip_block); 322 if (r) { 323 dev_err(ip_block->adev->dev, 324 "suspend of IP block <%s> failed %d\n", 325 ip_block->version->funcs->name, r); 326 return r; 327 } 328 } 329 330 ip_block->status.hw = false; 331 return 0; 332 } 333 334 int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block) 335 { 336 int r; 337 338 if (ip_block->version->funcs->resume) { 339 r = ip_block->version->funcs->resume(ip_block); 340 if (r) { 341 dev_err(ip_block->adev->dev, 342 "resume of IP block <%s> failed %d\n", 343 ip_block->version->funcs->name, r); 344 return r; 345 } 346 } 347 348 ip_block->status.hw = true; 349 return 0; 350 } 351 352 /** 353 * DOC: board_info 354 * 355 * The amdgpu driver provides a sysfs 
API for giving board related information. 356 * It provides the form factor information in the format 357 * 358 * type : form factor 359 * 360 * Possible form factor values 361 * 362 * - "cem" - PCIE CEM card 363 * - "oam" - Open Compute Accelerator Module 364 * - "unknown" - Not known 365 * 366 */ 367 368 static ssize_t amdgpu_device_get_board_info(struct device *dev, 369 struct device_attribute *attr, 370 char *buf) 371 { 372 struct drm_device *ddev = dev_get_drvdata(dev); 373 struct amdgpu_device *adev = drm_to_adev(ddev); 374 enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM; 375 const char *pkg; 376 377 if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type) 378 pkg_type = adev->smuio.funcs->get_pkg_type(adev); 379 380 switch (pkg_type) { 381 case AMDGPU_PKG_TYPE_CEM: 382 pkg = "cem"; 383 break; 384 case AMDGPU_PKG_TYPE_OAM: 385 pkg = "oam"; 386 break; 387 default: 388 pkg = "unknown"; 389 break; 390 } 391 392 return sysfs_emit(buf, "%s : %s\n", "type", pkg); 393 } 394 395 static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL); 396 397 static struct attribute *amdgpu_board_attrs[] = { 398 &dev_attr_board_info.attr, 399 NULL, 400 }; 401 402 static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj, 403 struct attribute *attr, int n) 404 { 405 struct device *dev = kobj_to_dev(kobj); 406 struct drm_device *ddev = dev_get_drvdata(dev); 407 struct amdgpu_device *adev = drm_to_adev(ddev); 408 409 if (adev->flags & AMD_IS_APU) 410 return 0; 411 412 return attr->mode; 413 } 414 415 static const struct attribute_group amdgpu_board_attrs_group = { 416 .attrs = amdgpu_board_attrs, 417 .is_visible = amdgpu_board_attrs_is_visible 418 }; 419 420 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 421 422 /** 423 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 424 * 425 * @adev: amdgpu device pointer 426 * 427 * Returns true if the device is a dGPU with ATPX power control, 428 * otherwise return false. 429 */ 430 bool amdgpu_device_supports_px(struct amdgpu_device *adev) 431 { 432 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 433 return true; 434 return false; 435 } 436 437 /** 438 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 439 * 440 * @adev: amdgpu device pointer 441 * 442 * Returns true if the device is a dGPU with ACPI power control, 443 * otherwise return false. 444 */ 445 bool amdgpu_device_supports_boco(struct amdgpu_device *adev) 446 { 447 if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE)) 448 return false; 449 450 if (adev->has_pr3 || 451 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 452 return true; 453 return false; 454 } 455 456 /** 457 * amdgpu_device_supports_baco - Does the device support BACO 458 * 459 * @adev: amdgpu device pointer 460 * 461 * Return: 462 * 1 if the device supports BACO; 463 * 3 if the device supports MACO (only works if BACO is supported) 464 * otherwise return 0. 
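 *
 * The value is effectively a mask of BACO_SUPPORT and MACO_SUPPORT bits, and
 * is consumed as such by amdgpu_device_detect_runtime_pm_mode() below.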
 */
int amdgpu_device_supports_baco(struct amdgpu_device *adev)
{
	return amdgpu_asic_supports_baco(adev);
}

void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
{
	int bamaco_support;

	adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
	bamaco_support = amdgpu_device_supports_baco(adev);

	switch (amdgpu_runtime_pm) {
	case 2:
		if (bamaco_support & MACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
			dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
		} else if (bamaco_support == BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Requested mode BAMACO not available, falling back to BACO\n");
		}
		break;
	case 1:
		if (bamaco_support & BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Forcing BACO for runtime pm\n");
		}
		break;
	case -1:
	case -2:
		if (amdgpu_device_supports_px(adev)) {
			/* enable PX as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
			dev_info(adev->dev, "Using ATPX for runtime pm\n");
		} else if (amdgpu_device_supports_boco(adev)) {
			/* enable boco as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
			dev_info(adev->dev, "Using BOCO for runtime pm\n");
		} else {
			if (!bamaco_support)
				goto no_runtime_pm;

			switch (adev->asic_type) {
			case CHIP_VEGA20:
			case CHIP_ARCTURUS:
				/* BACO is not supported on vega20 and arcturus */
				break;
			case CHIP_VEGA10:
				/* enable BACO as runpm mode if noretry=0 */
				if (!adev->gmc.noretry && !amdgpu_passthrough(adev))
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			default:
				/* enable BACO as runpm mode on CI+ */
				if (!amdgpu_passthrough(adev))
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			}

			if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
				if (bamaco_support & MACO_SUPPORT) {
					adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
					dev_info(adev->dev, "Using BAMACO for runtime pm\n");
				} else {
					dev_info(adev->dev, "Using BACO for runtime pm\n");
				}
			}
		}
		break;
	case 0:
		dev_info(adev->dev, "runtime pm is manually disabled\n");
		break;
	default:
		break;
	}

no_runtime_pm:
	if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
		dev_info(adev->dev, "Runtime PM not available\n");
}

/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * smart shift support
 *
 * @adev: amdgpu device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct amdgpu_device *adev)
{
	return (amdgpu_device_supports_boco(adev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size in bytes, @buf must be at least @size bytes long
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size in bytes, @buf must be at least @size bytes long
 * @write: true - write to vram, otherwise - read from vram
 *
 * Returns the number of bytes transferred.
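 *
 * Only the part of the request that falls within CPU-visible VRAM is copied;
 * amdgpu_device_vram_access() falls back to MM_INDEX/MM_DATA access for any
 * remainder.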
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure the HDP write cache flush happens without any
			 * reordering after the system memory contents are sent over
			 * PCIe to the device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size in bytes, @buf must be at least @size bytes long
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM access for the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
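 *
 * Offsets beyond the MMIO aperture are read through the indirect PCIE path,
 * and accesses under SR-IOV at runtime are typically routed through the KIQ
 * unless AMDGPU_REGS_NO_KIQ is set in @acc_flags.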
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with byte helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
				uint32_t reg, uint32_t acc_flags,
				uint32_t xcc_id)
{
	uint32_t ret, rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	return ret;
}

/*
 * MMIO register write with byte helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
830 */ 831 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) 832 { 833 if (amdgpu_device_skip_hw_access(adev)) 834 return; 835 836 if (offset < adev->rmmio_size) 837 writeb(value, adev->rmmio + offset); 838 else 839 BUG(); 840 } 841 842 /** 843 * amdgpu_device_wreg - write to a memory mapped IO or indirect register 844 * 845 * @adev: amdgpu_device pointer 846 * @reg: dword aligned register offset 847 * @v: 32 bit value to write to the register 848 * @acc_flags: access flags which require special behavior 849 * 850 * Writes the value specified to the offset specified. 851 */ 852 void amdgpu_device_wreg(struct amdgpu_device *adev, 853 uint32_t reg, uint32_t v, 854 uint32_t acc_flags) 855 { 856 if (amdgpu_device_skip_hw_access(adev)) 857 return; 858 859 if ((reg * 4) < adev->rmmio_size) { 860 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 861 amdgpu_sriov_runtime(adev) && 862 down_read_trylock(&adev->reset_domain->sem)) { 863 amdgpu_kiq_wreg(adev, reg, v, 0); 864 up_read(&adev->reset_domain->sem); 865 } else { 866 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 867 } 868 } else { 869 adev->pcie_wreg(adev, reg * 4, v); 870 } 871 872 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 873 } 874 875 /** 876 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 877 * 878 * @adev: amdgpu_device pointer 879 * @reg: mmio/rlc register 880 * @v: value to write 881 * @xcc_id: xcc accelerated compute core id 882 * 883 * this function is invoked only for the debugfs register access 884 */ 885 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 886 uint32_t reg, uint32_t v, 887 uint32_t xcc_id) 888 { 889 if (amdgpu_device_skip_hw_access(adev)) 890 return; 891 892 if (amdgpu_sriov_fullaccess(adev) && 893 adev->gfx.rlc.funcs && 894 adev->gfx.rlc.funcs->is_rlcg_access_range) { 895 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 896 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id); 897 } else if ((reg * 4) >= adev->rmmio_size) { 898 adev->pcie_wreg(adev, reg * 4, v); 899 } else { 900 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 901 } 902 } 903 904 /** 905 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC 906 * 907 * @adev: amdgpu_device pointer 908 * @reg: dword aligned register offset 909 * @v: 32 bit value to write to the register 910 * @acc_flags: access flags which require special behavior 911 * @xcc_id: xcc accelerated compute core id 912 * 913 * Writes the value specified to the offset specified. 
914 */ 915 void amdgpu_device_xcc_wreg(struct amdgpu_device *adev, 916 uint32_t reg, uint32_t v, 917 uint32_t acc_flags, uint32_t xcc_id) 918 { 919 uint32_t rlcg_flag; 920 921 if (amdgpu_device_skip_hw_access(adev)) 922 return; 923 924 if ((reg * 4) < adev->rmmio_size) { 925 if (amdgpu_sriov_vf(adev) && 926 !amdgpu_sriov_runtime(adev) && 927 adev->gfx.rlc.rlcg_reg_access_supported && 928 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, 929 GC_HWIP, true, 930 &rlcg_flag)) { 931 amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id)); 932 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 933 amdgpu_sriov_runtime(adev) && 934 down_read_trylock(&adev->reset_domain->sem)) { 935 amdgpu_kiq_wreg(adev, reg, v, xcc_id); 936 up_read(&adev->reset_domain->sem); 937 } else { 938 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 939 } 940 } else { 941 adev->pcie_wreg(adev, reg * 4, v); 942 } 943 } 944 945 /** 946 * amdgpu_device_indirect_rreg - read an indirect register 947 * 948 * @adev: amdgpu_device pointer 949 * @reg_addr: indirect register address to read from 950 * 951 * Returns the value of indirect register @reg_addr 952 */ 953 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 954 u32 reg_addr) 955 { 956 unsigned long flags, pcie_index, pcie_data; 957 void __iomem *pcie_index_offset; 958 void __iomem *pcie_data_offset; 959 u32 r; 960 961 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 962 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 963 964 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 965 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 966 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 967 968 writel(reg_addr, pcie_index_offset); 969 readl(pcie_index_offset); 970 r = readl(pcie_data_offset); 971 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 972 973 return r; 974 } 975 976 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev, 977 u64 reg_addr) 978 { 979 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 980 u32 r; 981 void __iomem *pcie_index_offset; 982 void __iomem *pcie_index_hi_offset; 983 void __iomem *pcie_data_offset; 984 985 if (unlikely(!adev->nbio.funcs)) { 986 pcie_index = AMDGPU_PCIE_INDEX_FALLBACK; 987 pcie_data = AMDGPU_PCIE_DATA_FALLBACK; 988 } else { 989 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 990 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 991 } 992 993 if (reg_addr >> 32) { 994 if (unlikely(!adev->nbio.funcs)) 995 pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK; 996 else 997 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 998 } else { 999 pcie_index_hi = 0; 1000 } 1001 1002 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1003 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1004 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1005 if (pcie_index_hi != 0) 1006 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1007 pcie_index_hi * 4; 1008 1009 writel(reg_addr, pcie_index_offset); 1010 readl(pcie_index_offset); 1011 if (pcie_index_hi != 0) { 1012 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1013 readl(pcie_index_hi_offset); 1014 } 1015 r = readl(pcie_data_offset); 1016 1017 /* clear the high bits */ 1018 if (pcie_index_hi != 0) { 1019 writel(0, pcie_index_hi_offset); 1020 readl(pcie_index_hi_offset); 1021 } 1022 1023 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1024 1025 return r; 1026 } 1027 1028 /** 1029 * amdgpu_device_indirect_rreg64 - read a 64bits indirect 
register 1030 * 1031 * @adev: amdgpu_device pointer 1032 * @reg_addr: indirect register address to read from 1033 * 1034 * Returns the value of indirect register @reg_addr 1035 */ 1036 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 1037 u32 reg_addr) 1038 { 1039 unsigned long flags, pcie_index, pcie_data; 1040 void __iomem *pcie_index_offset; 1041 void __iomem *pcie_data_offset; 1042 u64 r; 1043 1044 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1045 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1046 1047 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1048 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1049 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1050 1051 /* read low 32 bits */ 1052 writel(reg_addr, pcie_index_offset); 1053 readl(pcie_index_offset); 1054 r = readl(pcie_data_offset); 1055 /* read high 32 bits */ 1056 writel(reg_addr + 4, pcie_index_offset); 1057 readl(pcie_index_offset); 1058 r |= ((u64)readl(pcie_data_offset) << 32); 1059 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1060 1061 return r; 1062 } 1063 1064 u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev, 1065 u64 reg_addr) 1066 { 1067 unsigned long flags, pcie_index, pcie_data; 1068 unsigned long pcie_index_hi = 0; 1069 void __iomem *pcie_index_offset; 1070 void __iomem *pcie_index_hi_offset; 1071 void __iomem *pcie_data_offset; 1072 u64 r; 1073 1074 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1075 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1076 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1077 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1078 1079 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1080 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1081 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1082 if (pcie_index_hi != 0) 1083 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1084 pcie_index_hi * 4; 1085 1086 /* read low 32 bits */ 1087 writel(reg_addr, pcie_index_offset); 1088 readl(pcie_index_offset); 1089 if (pcie_index_hi != 0) { 1090 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1091 readl(pcie_index_hi_offset); 1092 } 1093 r = readl(pcie_data_offset); 1094 /* read high 32 bits */ 1095 writel(reg_addr + 4, pcie_index_offset); 1096 readl(pcie_index_offset); 1097 if (pcie_index_hi != 0) { 1098 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1099 readl(pcie_index_hi_offset); 1100 } 1101 r |= ((u64)readl(pcie_data_offset) << 32); 1102 1103 /* clear the high bits */ 1104 if (pcie_index_hi != 0) { 1105 writel(0, pcie_index_hi_offset); 1106 readl(pcie_index_hi_offset); 1107 } 1108 1109 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1110 1111 return r; 1112 } 1113 1114 /** 1115 * amdgpu_device_indirect_wreg - write an indirect register address 1116 * 1117 * @adev: amdgpu_device pointer 1118 * @reg_addr: indirect register offset 1119 * @reg_data: indirect register data 1120 * 1121 */ 1122 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 1123 u32 reg_addr, u32 reg_data) 1124 { 1125 unsigned long flags, pcie_index, pcie_data; 1126 void __iomem *pcie_index_offset; 1127 void __iomem *pcie_data_offset; 1128 1129 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1130 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1131 1132 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1133 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1134 
pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1135 1136 writel(reg_addr, pcie_index_offset); 1137 readl(pcie_index_offset); 1138 writel(reg_data, pcie_data_offset); 1139 readl(pcie_data_offset); 1140 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1141 } 1142 1143 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev, 1144 u64 reg_addr, u32 reg_data) 1145 { 1146 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 1147 void __iomem *pcie_index_offset; 1148 void __iomem *pcie_index_hi_offset; 1149 void __iomem *pcie_data_offset; 1150 1151 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1152 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1153 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1154 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1155 else 1156 pcie_index_hi = 0; 1157 1158 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1159 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1160 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1161 if (pcie_index_hi != 0) 1162 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1163 pcie_index_hi * 4; 1164 1165 writel(reg_addr, pcie_index_offset); 1166 readl(pcie_index_offset); 1167 if (pcie_index_hi != 0) { 1168 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1169 readl(pcie_index_hi_offset); 1170 } 1171 writel(reg_data, pcie_data_offset); 1172 readl(pcie_data_offset); 1173 1174 /* clear the high bits */ 1175 if (pcie_index_hi != 0) { 1176 writel(0, pcie_index_hi_offset); 1177 readl(pcie_index_hi_offset); 1178 } 1179 1180 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1181 } 1182 1183 /** 1184 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 1185 * 1186 * @adev: amdgpu_device pointer 1187 * @reg_addr: indirect register offset 1188 * @reg_data: indirect register data 1189 * 1190 */ 1191 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 1192 u32 reg_addr, u64 reg_data) 1193 { 1194 unsigned long flags, pcie_index, pcie_data; 1195 void __iomem *pcie_index_offset; 1196 void __iomem *pcie_data_offset; 1197 1198 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1199 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1200 1201 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1202 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1203 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1204 1205 /* write low 32 bits */ 1206 writel(reg_addr, pcie_index_offset); 1207 readl(pcie_index_offset); 1208 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1209 readl(pcie_data_offset); 1210 /* write high 32 bits */ 1211 writel(reg_addr + 4, pcie_index_offset); 1212 readl(pcie_index_offset); 1213 writel((u32)(reg_data >> 32), pcie_data_offset); 1214 readl(pcie_data_offset); 1215 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1216 } 1217 1218 void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev, 1219 u64 reg_addr, u64 reg_data) 1220 { 1221 unsigned long flags, pcie_index, pcie_data; 1222 unsigned long pcie_index_hi = 0; 1223 void __iomem *pcie_index_offset; 1224 void __iomem *pcie_index_hi_offset; 1225 void __iomem *pcie_data_offset; 1226 1227 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1228 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1229 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1230 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1231 
	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
			pcie_index_hi * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_rev_id - query device rev_id
 *
 * @adev: amdgpu_device pointer
 *
 * Return device rev_id
 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
{
	return adev->nbio.funcs->get_rev_id(adev);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	dev_err(adev->dev, "Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
{
	dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write register 0x%04X with 0x%08X\n", reg,
		v);
	BUG();
}

static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write register 0x%llX with 0x%08X\n", reg,
		v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	dev_err(adev->dev, "Invalid callback to read 64 bit register 0x%04X\n",
		reg);
	BUG();
	return 0;
}

static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
{
	dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		reg, v);
	BUG();
}

static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
		reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	dev_err(adev->dev,
		"Invalid callback to read register 0x%04X in block 0x%04X\n",
		reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	dev_err(adev->dev,
		"Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		reg, block, v);
	BUG();
}

static uint32_t amdgpu_device_get_vbios_flags(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
		return AMDGPU_VBIOS_SKIP;

	if (hweight32(adev->aid_mask) && amdgpu_passthrough(adev))
		return AMDGPU_VBIOS_OPTIONAL;

	return 0;
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	uint32_t flags;
	bool optional;
	int ret;

	amdgpu_asic_pre_asic_init(adev);
	flags = amdgpu_device_get_vbios_flags(adev);
	optional = !!(flags & (AMDGPU_VBIOS_OPTIONAL | AMDGPU_VBIOS_SKIP));

	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
		amdgpu_psp_wait_for_bootloader(adev);
		if (optional && !adev->bios)
			return 0;

		ret = amdgpu_atomfirmware_asic_init(adev, true);
		return ret;
	} else {
		if (optional && !adev->bios)
			return 0;

		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
	}

	return 0;
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND/OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
1558 */ 1559 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 1560 { 1561 return pci_reset_function(adev->pdev); 1562 } 1563 1564 /* 1565 * amdgpu_device_wb_*() 1566 * Writeback is the method by which the GPU updates special pages in memory 1567 * with the status of certain GPU events (fences, ring pointers,etc.). 1568 */ 1569 1570 /** 1571 * amdgpu_device_wb_fini - Disable Writeback and free memory 1572 * 1573 * @adev: amdgpu_device pointer 1574 * 1575 * Disables Writeback and frees the Writeback memory (all asics). 1576 * Used at driver shutdown. 1577 */ 1578 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1579 { 1580 if (adev->wb.wb_obj) { 1581 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1582 &adev->wb.gpu_addr, 1583 (void **)&adev->wb.wb); 1584 adev->wb.wb_obj = NULL; 1585 } 1586 } 1587 1588 /** 1589 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory 1590 * 1591 * @adev: amdgpu_device pointer 1592 * 1593 * Initializes writeback and allocates writeback memory (all asics). 1594 * Used at driver startup. 1595 * Returns 0 on success or an -error on failure. 1596 */ 1597 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1598 { 1599 int r; 1600 1601 if (adev->wb.wb_obj == NULL) { 1602 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1603 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1604 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1605 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1606 (void **)&adev->wb.wb); 1607 if (r) { 1608 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1609 return r; 1610 } 1611 1612 adev->wb.num_wb = AMDGPU_MAX_WB; 1613 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1614 1615 /* clear wb memory */ 1616 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1617 } 1618 1619 return 0; 1620 } 1621 1622 /** 1623 * amdgpu_device_wb_get - Allocate a wb entry 1624 * 1625 * @adev: amdgpu_device pointer 1626 * @wb: wb index 1627 * 1628 * Allocate a wb slot for use by the driver (all asics). 1629 * Returns 0 on success or -EINVAL on failure. 1630 */ 1631 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1632 { 1633 unsigned long flags, offset; 1634 1635 spin_lock_irqsave(&adev->wb.lock, flags); 1636 offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1637 if (offset < adev->wb.num_wb) { 1638 __set_bit(offset, adev->wb.used); 1639 spin_unlock_irqrestore(&adev->wb.lock, flags); 1640 *wb = offset << 3; /* convert to dw offset */ 1641 return 0; 1642 } else { 1643 spin_unlock_irqrestore(&adev->wb.lock, flags); 1644 return -EINVAL; 1645 } 1646 } 1647 1648 /** 1649 * amdgpu_device_wb_free - Free a wb entry 1650 * 1651 * @adev: amdgpu_device pointer 1652 * @wb: wb index 1653 * 1654 * Free a wb slot allocated for use by the driver (all asics) 1655 */ 1656 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1657 { 1658 unsigned long flags; 1659 1660 wb >>= 3; 1661 spin_lock_irqsave(&adev->wb.lock, flags); 1662 if (wb < adev->wb.num_wb) 1663 __clear_bit(wb, adev->wb.used); 1664 spin_unlock_irqrestore(&adev->wb.lock, flags); 1665 } 1666 1667 /** 1668 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1669 * 1670 * @adev: amdgpu_device pointer 1671 * 1672 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1673 * to fail, but if any of the BARs is not accessible after the size we abort 1674 * driver loading by returning -ENODEV. 
1675 */ 1676 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1677 { 1678 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1679 struct pci_bus *root; 1680 struct resource *res; 1681 unsigned int i; 1682 u16 cmd; 1683 int r; 1684 1685 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) 1686 return 0; 1687 1688 /* Bypass for VF */ 1689 if (amdgpu_sriov_vf(adev)) 1690 return 0; 1691 1692 if (!amdgpu_rebar) 1693 return 0; 1694 1695 /* resizing on Dell G5 SE platforms causes problems with runtime pm */ 1696 if ((amdgpu_runtime_pm != 0) && 1697 adev->pdev->vendor == PCI_VENDOR_ID_ATI && 1698 adev->pdev->device == 0x731f && 1699 adev->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL) 1700 return 0; 1701 1702 /* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */ 1703 if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR)) 1704 dev_warn( 1705 adev->dev, 1706 "System can't access extended configuration space, please check!!\n"); 1707 1708 /* skip if the bios has already enabled large BAR */ 1709 if (adev->gmc.real_vram_size && 1710 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1711 return 0; 1712 1713 /* Check if the root BUS has 64bit memory resources */ 1714 root = adev->pdev->bus; 1715 while (root->parent) 1716 root = root->parent; 1717 1718 pci_bus_for_each_resource(root, res, i) { 1719 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1720 res->start > 0x100000000ull) 1721 break; 1722 } 1723 1724 /* Trying to resize is pointless without a root hub window above 4GB */ 1725 if (!res) 1726 return 0; 1727 1728 /* Limit the BAR size to what is available */ 1729 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1730 rbar_size); 1731 1732 /* Disable memory decoding while we change the BAR addresses and size */ 1733 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1734 pci_write_config_word(adev->pdev, PCI_COMMAND, 1735 cmd & ~PCI_COMMAND_MEMORY); 1736 1737 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1738 amdgpu_doorbell_fini(adev); 1739 if (adev->asic_type >= CHIP_BONAIRE) 1740 pci_release_resource(adev->pdev, 2); 1741 1742 pci_release_resource(adev->pdev, 0); 1743 1744 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1745 if (r == -ENOSPC) 1746 dev_info(adev->dev, 1747 "Not enough PCI address space for a large BAR."); 1748 else if (r && r != -ENOTSUPP) 1749 dev_err(adev->dev, "Problem resizing BAR0 (%d).", r); 1750 1751 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1752 1753 /* When the doorbell or fb BAR isn't available we have no chance of 1754 * using the device. 1755 */ 1756 r = amdgpu_doorbell_init(adev); 1757 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1758 return -ENODEV; 1759 1760 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1761 1762 return 0; 1763 } 1764 1765 /* 1766 * GPU helpers function. 1767 */ 1768 /** 1769 * amdgpu_device_need_post - check if the hw need post or not 1770 * 1771 * @adev: amdgpu_device pointer 1772 * 1773 * Check if the asic has been initialized (all asics) at driver startup 1774 * or post is needed if hw reset is performed. 1775 * Returns true if need or false if not. 
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg, flags;

	if (amdgpu_sriov_vf(adev))
		return false;

	flags = amdgpu_device_get_vbios_flags(adev);
	if (flags & AMDGPU_VBIOS_SKIP)
		return false;
	if ((flags & AMDGPU_VBIOS_OPTIONAL) && !adev->bios)
		return false;

	if (amdgpu_passthrough(adev)) {
		/* For FIJI: In the whole-GPU pass-through virtualization case, after a
		 * VM reboot some old SMC firmware still needs the driver to do a vPost
		 * or the GPU hangs. SMC firmware versions above 22.15 don't have this
		 * flaw, so force a vPost only for SMC versions below 22.15.
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			release_firmware(adev->pm.fw);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/*
 * Check whether seamless boot is supported.
 *
 * So far we only support seamless boot on DCE 3.0 or later.
 * If users report that it works on older ASICS as well, we may
 * loosen this.
 */
bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
{
	switch (amdgpu_seamless) {
	case -1:
		break;
	case 1:
		return true;
	case 0:
		return false;
	default:
		dev_err(adev->dev, "Invalid value for amdgpu.seamless: %d\n",
			amdgpu_seamless);
		return false;
	}

	if (!(adev->flags & AMD_IS_APU))
		return false;

	if (adev->mman.keep_stolen_vga_memory)
		return false;

	return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
}

/*
 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
 * don't support dynamic speed switching. Until we have confirmation from Intel
 * that a specific host supports it, it's safer that we keep it disabled for all.
 *
 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
 */
static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	/* eGPUs change speeds based on USB4 fabric conditions */
	if (dev_is_removable(adev->dev))
		return true;

	if (c->x86_vendor == X86_VENDOR_INTEL)
		return false;
#endif
	return true;
}

static bool amdgpu_device_aspm_support_quirk(struct amdgpu_device *adev)
{
	/* Enabling ASPM causes random hangs on Tahiti and Oland on Zen4.
1891 * It's unclear if this is a platform-specific or GPU-specific issue. 1892 * Disable ASPM on SI for the time being. 1893 */ 1894 if (adev->family == AMDGPU_FAMILY_SI) 1895 return true; 1896 1897 #if IS_ENABLED(CONFIG_X86) 1898 struct cpuinfo_x86 *c = &cpu_data(0); 1899 1900 if (!(amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 0) || 1901 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 1))) 1902 return false; 1903 1904 if (c->x86 == 6 && 1905 adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5) { 1906 switch (c->x86_model) { 1907 case VFM_MODEL(INTEL_ALDERLAKE): 1908 case VFM_MODEL(INTEL_ALDERLAKE_L): 1909 case VFM_MODEL(INTEL_RAPTORLAKE): 1910 case VFM_MODEL(INTEL_RAPTORLAKE_P): 1911 case VFM_MODEL(INTEL_RAPTORLAKE_S): 1912 return true; 1913 default: 1914 return false; 1915 } 1916 } else { 1917 return false; 1918 } 1919 #else 1920 return false; 1921 #endif 1922 } 1923 1924 /** 1925 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1926 * 1927 * @adev: amdgpu_device pointer 1928 * 1929 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1930 * be set for this device. 1931 * 1932 * Returns true if it should be used or false if not. 1933 */ 1934 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1935 { 1936 switch (amdgpu_aspm) { 1937 case -1: 1938 break; 1939 case 0: 1940 return false; 1941 case 1: 1942 return true; 1943 default: 1944 return false; 1945 } 1946 if (adev->flags & AMD_IS_APU) 1947 return false; 1948 if (amdgpu_device_aspm_support_quirk(adev)) 1949 return false; 1950 return pcie_aspm_enabled(adev->pdev); 1951 } 1952 1953 /* if we get transitioned to only one device, take VGA back */ 1954 /** 1955 * amdgpu_device_vga_set_decode - enable/disable vga decode 1956 * 1957 * @pdev: PCI device pointer 1958 * @state: enable/disable vga decode 1959 * 1960 * Enable/disable vga decode (all asics). 1961 * Returns VGA resource flags. 1962 */ 1963 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1964 bool state) 1965 { 1966 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1967 1968 amdgpu_asic_set_vga_state(adev, state); 1969 if (state) 1970 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1971 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1972 else 1973 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1974 } 1975 1976 /** 1977 * amdgpu_device_check_block_size - validate the vm block size 1978 * 1979 * @adev: amdgpu_device pointer 1980 * 1981 * Validates the vm block size specified via module parameter. 1982 * The vm block size defines number of bits in page table versus page directory, 1983 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1984 * page table and the remaining bits are in the page directory. 1985 */ 1986 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1987 { 1988 /* defines number of bits in page table versus page directory, 1989 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1990 * page table and the remaining bits are in the page directory 1991 */ 1992 if (amdgpu_vm_block_size == -1) 1993 return; 1994 1995 if (amdgpu_vm_block_size < 9) { 1996 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1997 amdgpu_vm_block_size); 1998 amdgpu_vm_block_size = -1; 1999 } 2000 } 2001 2002 /** 2003 * amdgpu_device_check_vm_size - validate the vm size 2004 * 2005 * @adev: amdgpu_device pointer 2006 * 2007 * Validates the vm size in GB specified via module parameter. 
2008 * The VM size is the size of the GPU virtual memory space in GB. 2009 */ 2010 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 2011 { 2012 /* no need to check the default value */ 2013 if (amdgpu_vm_size == -1) 2014 return; 2015 2016 if (amdgpu_vm_size < 1) { 2017 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 2018 amdgpu_vm_size); 2019 amdgpu_vm_size = -1; 2020 } 2021 } 2022 2023 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 2024 { 2025 struct sysinfo si; 2026 bool is_os_64 = (sizeof(void *) == 8); 2027 uint64_t total_memory; 2028 uint64_t dram_size_seven_GB = 0x1B8000000; 2029 uint64_t dram_size_three_GB = 0xB8000000; 2030 2031 if (amdgpu_smu_memory_pool_size == 0) 2032 return; 2033 2034 if (!is_os_64) { 2035 dev_warn(adev->dev, "Not 64-bit OS, feature not supported\n"); 2036 goto def_value; 2037 } 2038 si_meminfo(&si); 2039 total_memory = (uint64_t)si.totalram * si.mem_unit; 2040 2041 if ((amdgpu_smu_memory_pool_size == 1) || 2042 (amdgpu_smu_memory_pool_size == 2)) { 2043 if (total_memory < dram_size_three_GB) 2044 goto def_value1; 2045 } else if ((amdgpu_smu_memory_pool_size == 4) || 2046 (amdgpu_smu_memory_pool_size == 8)) { 2047 if (total_memory < dram_size_seven_GB) 2048 goto def_value1; 2049 } else { 2050 dev_warn(adev->dev, "Smu memory pool size not supported\n"); 2051 goto def_value; 2052 } 2053 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 2054 2055 return; 2056 2057 def_value1: 2058 dev_warn(adev->dev, "No enough system memory\n"); 2059 def_value: 2060 adev->pm.smu_prv_buffer_size = 0; 2061 } 2062 2063 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 2064 { 2065 if (!(adev->flags & AMD_IS_APU) || 2066 adev->asic_type < CHIP_RAVEN) 2067 return 0; 2068 2069 switch (adev->asic_type) { 2070 case CHIP_RAVEN: 2071 if (adev->pdev->device == 0x15dd) 2072 adev->apu_flags |= AMD_APU_IS_RAVEN; 2073 if (adev->pdev->device == 0x15d8) 2074 adev->apu_flags |= AMD_APU_IS_PICASSO; 2075 break; 2076 case CHIP_RENOIR: 2077 if ((adev->pdev->device == 0x1636) || 2078 (adev->pdev->device == 0x164c)) 2079 adev->apu_flags |= AMD_APU_IS_RENOIR; 2080 else 2081 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 2082 break; 2083 case CHIP_VANGOGH: 2084 adev->apu_flags |= AMD_APU_IS_VANGOGH; 2085 break; 2086 case CHIP_YELLOW_CARP: 2087 break; 2088 case CHIP_CYAN_SKILLFISH: 2089 if ((adev->pdev->device == 0x13FE) || 2090 (adev->pdev->device == 0x143F)) 2091 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 2092 break; 2093 default: 2094 break; 2095 } 2096 2097 return 0; 2098 } 2099 2100 /** 2101 * amdgpu_device_check_arguments - validate module params 2102 * 2103 * @adev: amdgpu_device pointer 2104 * 2105 * Validates certain module parameters and updates 2106 * the associated values used by the driver (all asics). 
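 *
 * Illustrative effect of the clamping performed below (example values):
 *
 *   amdgpu.sched_jobs=6         ->  8 (rounded up to a power of two)
 *   amdgpu.sched_jobs=2         ->  4 (raised to the minimum of 4)
 *   amdgpu.vm_fragment_size=12  -> -1 (outside 4..9, reverts to default)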
2107 */ 2108 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 2109 { 2110 int i; 2111 2112 if (amdgpu_sched_jobs < 4) { 2113 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 2114 amdgpu_sched_jobs); 2115 amdgpu_sched_jobs = 4; 2116 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 2117 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 2118 amdgpu_sched_jobs); 2119 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 2120 } 2121 2122 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 2123 /* gart size must be greater or equal to 32M */ 2124 dev_warn(adev->dev, "gart size (%d) too small\n", 2125 amdgpu_gart_size); 2126 amdgpu_gart_size = -1; 2127 } 2128 2129 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 2130 /* gtt size must be greater or equal to 32M */ 2131 dev_warn(adev->dev, "gtt size (%d) too small\n", 2132 amdgpu_gtt_size); 2133 amdgpu_gtt_size = -1; 2134 } 2135 2136 /* valid range is between 4 and 9 inclusive */ 2137 if (amdgpu_vm_fragment_size != -1 && 2138 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 2139 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 2140 amdgpu_vm_fragment_size = -1; 2141 } 2142 2143 if (amdgpu_sched_hw_submission < 2) { 2144 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 2145 amdgpu_sched_hw_submission); 2146 amdgpu_sched_hw_submission = 2; 2147 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 2148 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 2149 amdgpu_sched_hw_submission); 2150 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 2151 } 2152 2153 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 2154 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 2155 amdgpu_reset_method = -1; 2156 } 2157 2158 amdgpu_device_check_smu_prv_buffer_size(adev); 2159 2160 amdgpu_device_check_vm_size(adev); 2161 2162 amdgpu_device_check_block_size(adev); 2163 2164 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 2165 2166 for (i = 0; i < MAX_XCP; i++) { 2167 switch (amdgpu_enforce_isolation) { 2168 case -1: 2169 case 0: 2170 default: 2171 /* disable */ 2172 adev->enforce_isolation[i] = AMDGPU_ENFORCE_ISOLATION_DISABLE; 2173 break; 2174 case 1: 2175 /* enable */ 2176 adev->enforce_isolation[i] = 2177 AMDGPU_ENFORCE_ISOLATION_ENABLE; 2178 break; 2179 case 2: 2180 /* enable legacy mode */ 2181 adev->enforce_isolation[i] = 2182 AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY; 2183 break; 2184 case 3: 2185 /* enable only process isolation without submitting cleaner shader */ 2186 adev->enforce_isolation[i] = 2187 AMDGPU_ENFORCE_ISOLATION_NO_CLEANER_SHADER; 2188 break; 2189 } 2190 } 2191 2192 return 0; 2193 } 2194 2195 /** 2196 * amdgpu_switcheroo_set_state - set switcheroo state 2197 * 2198 * @pdev: pci dev pointer 2199 * @state: vga_switcheroo state 2200 * 2201 * Callback for the switcheroo driver. Suspends or resumes 2202 * the asics before or after it is powered up using ACPI methods. 
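 *
 * In short (see the body below): switching ON puts the PCI device back
 * into D0, restores the cached PCI state, re-enables the device and
 * resumes it; switching OFF prepares and suspends the device, caches
 * the PCI state and drops it into D3cold. On PX-capable platforms the
 * OFF request is simply ignored here.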
2203 */ 2204 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 2205 enum vga_switcheroo_state state) 2206 { 2207 struct drm_device *dev = pci_get_drvdata(pdev); 2208 int r; 2209 2210 if (amdgpu_device_supports_px(drm_to_adev(dev)) && 2211 state == VGA_SWITCHEROO_OFF) 2212 return; 2213 2214 if (state == VGA_SWITCHEROO_ON) { 2215 pr_info("switched on\n"); 2216 /* don't suspend or resume card normally */ 2217 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2218 2219 pci_set_power_state(pdev, PCI_D0); 2220 amdgpu_device_load_pci_state(pdev); 2221 r = pci_enable_device(pdev); 2222 if (r) 2223 dev_warn(&pdev->dev, "pci_enable_device failed (%d)\n", 2224 r); 2225 amdgpu_device_resume(dev, true); 2226 2227 dev->switch_power_state = DRM_SWITCH_POWER_ON; 2228 } else { 2229 dev_info(&pdev->dev, "switched off\n"); 2230 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2231 amdgpu_device_prepare(dev); 2232 amdgpu_device_suspend(dev, true); 2233 amdgpu_device_cache_pci_state(pdev); 2234 /* Shut down the device */ 2235 pci_disable_device(pdev); 2236 pci_set_power_state(pdev, PCI_D3cold); 2237 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 2238 } 2239 } 2240 2241 /** 2242 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 2243 * 2244 * @pdev: pci dev pointer 2245 * 2246 * Callback for the switcheroo driver. Check of the switcheroo 2247 * state can be changed. 2248 * Returns true if the state can be changed, false if not. 2249 */ 2250 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 2251 { 2252 struct drm_device *dev = pci_get_drvdata(pdev); 2253 2254 /* 2255 * FIXME: open_count is protected by drm_global_mutex but that would lead to 2256 * locking inversion with the driver load path. And the access here is 2257 * completely racy anyway. So don't bother with locking for now. 2258 */ 2259 return atomic_read(&dev->open_count) == 0; 2260 } 2261 2262 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 2263 .set_gpu_state = amdgpu_switcheroo_set_state, 2264 .reprobe = NULL, 2265 .can_switch = amdgpu_switcheroo_can_switch, 2266 }; 2267 2268 /** 2269 * amdgpu_device_ip_set_clockgating_state - set the CG state 2270 * 2271 * @dev: amdgpu_device pointer 2272 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2273 * @state: clockgating state (gate or ungate) 2274 * 2275 * Sets the requested clockgating state for all instances of 2276 * the hardware IP specified. 2277 * Returns the error code from the last instance. 2278 */ 2279 int amdgpu_device_ip_set_clockgating_state(void *dev, 2280 enum amd_ip_block_type block_type, 2281 enum amd_clockgating_state state) 2282 { 2283 struct amdgpu_device *adev = dev; 2284 int i, r = 0; 2285 2286 for (i = 0; i < adev->num_ip_blocks; i++) { 2287 if (!adev->ip_blocks[i].status.valid) 2288 continue; 2289 if (adev->ip_blocks[i].version->type != block_type) 2290 continue; 2291 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 2292 continue; 2293 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 2294 &adev->ip_blocks[i], state); 2295 if (r) 2296 dev_err(adev->dev, 2297 "set_clockgating_state of IP block <%s> failed %d\n", 2298 adev->ip_blocks[i].version->funcs->name, r); 2299 } 2300 return r; 2301 } 2302 2303 /** 2304 * amdgpu_device_ip_set_powergating_state - set the PG state 2305 * 2306 * @dev: amdgpu_device pointer 2307 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 
2308 * @state: powergating state (gate or ungate)
2309 *
2310 * Sets the requested powergating state for all instances of
2311 * the hardware IP specified.
2312 * Returns the error code from the last instance.
2313 */
2314 int amdgpu_device_ip_set_powergating_state(void *dev,
2315 enum amd_ip_block_type block_type,
2316 enum amd_powergating_state state)
2317 {
2318 struct amdgpu_device *adev = dev;
2319 int i, r = 0;
2320
2321 for (i = 0; i < adev->num_ip_blocks; i++) {
2322 if (!adev->ip_blocks[i].status.valid)
2323 continue;
2324 if (adev->ip_blocks[i].version->type != block_type)
2325 continue;
2326 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
2327 continue;
2328 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
2329 &adev->ip_blocks[i], state);
2330 if (r)
2331 dev_err(adev->dev,
2332 "set_powergating_state of IP block <%s> failed %d\n",
2333 adev->ip_blocks[i].version->funcs->name, r);
2334 }
2335 return r;
2336 }
2337
2338 /**
2339 * amdgpu_device_ip_get_clockgating_state - get the CG state
2340 *
2341 * @adev: amdgpu_device pointer
2342 * @flags: clockgating feature flags
2343 *
2344 * Walks the list of IPs on the device and updates the clockgating
2345 * flags for each IP.
2346 * Updates @flags with the feature flags for each hardware IP where
2347 * clockgating is enabled.
2348 */
2349 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
2350 u64 *flags)
2351 {
2352 int i;
2353
2354 for (i = 0; i < adev->num_ip_blocks; i++) {
2355 if (!adev->ip_blocks[i].status.valid)
2356 continue;
2357 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
2358 adev->ip_blocks[i].version->funcs->get_clockgating_state(
2359 &adev->ip_blocks[i], flags);
2360 }
2361 }
2362
2363 /**
2364 * amdgpu_device_ip_wait_for_idle - wait for idle
2365 *
2366 * @adev: amdgpu_device pointer
2367 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2368 *
2369 * Waits for the requested hardware IP to be idle.
2370 * Returns 0 for success or a negative error code on failure.
2371 */
2372 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
2373 enum amd_ip_block_type block_type)
2374 {
2375 int i, r;
2376
2377 for (i = 0; i < adev->num_ip_blocks; i++) {
2378 if (!adev->ip_blocks[i].status.valid)
2379 continue;
2380 if (adev->ip_blocks[i].version->type == block_type) {
2381 if (adev->ip_blocks[i].version->funcs->wait_for_idle) {
2382 r = adev->ip_blocks[i].version->funcs->wait_for_idle(
2383 &adev->ip_blocks[i]);
2384 if (r)
2385 return r;
2386 }
2387 break;
2388 }
2389 }
2390 return 0;
2391
2392 }
2393
2394 /**
2395 * amdgpu_device_ip_is_hw - is the hardware IP enabled
2396 *
2397 * @adev: amdgpu_device pointer
2398 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2399 *
2400 * Check if the hardware IP is enabled or not.
2401 * Returns true if the IP is enabled, false if not.
2402 */
2403 bool amdgpu_device_ip_is_hw(struct amdgpu_device *adev,
2404 enum amd_ip_block_type block_type)
2405 {
2406 int i;
2407
2408 for (i = 0; i < adev->num_ip_blocks; i++) {
2409 if (adev->ip_blocks[i].version->type == block_type)
2410 return adev->ip_blocks[i].status.hw;
2411 }
2412 return false;
2413 }
2414
2415 /**
2416 * amdgpu_device_ip_is_valid - is the hardware IP valid
2417 *
2418 * @adev: amdgpu_device pointer
2419 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2420 *
2421 * Check if the hardware IP is valid or not.
2422 * Returns true if the IP is valid, false if not.
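 *
 * Note the difference from amdgpu_device_ip_is_hw(): a block becomes
 * "valid" once early init has accepted it (see the early_init handling),
 * while the hw status is only set after hw_init, so a block can be
 * valid while its hardware is not yet up.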
2423 */ 2424 bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev, 2425 enum amd_ip_block_type block_type) 2426 { 2427 int i; 2428 2429 for (i = 0; i < adev->num_ip_blocks; i++) { 2430 if (adev->ip_blocks[i].version->type == block_type) 2431 return adev->ip_blocks[i].status.valid; 2432 } 2433 return false; 2434 2435 } 2436 2437 /** 2438 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 2439 * 2440 * @adev: amdgpu_device pointer 2441 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 2442 * 2443 * Returns a pointer to the hardware IP block structure 2444 * if it exists for the asic, otherwise NULL. 2445 */ 2446 struct amdgpu_ip_block * 2447 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 2448 enum amd_ip_block_type type) 2449 { 2450 int i; 2451 2452 for (i = 0; i < adev->num_ip_blocks; i++) 2453 if (adev->ip_blocks[i].version->type == type) 2454 return &adev->ip_blocks[i]; 2455 2456 return NULL; 2457 } 2458 2459 /** 2460 * amdgpu_device_ip_block_version_cmp 2461 * 2462 * @adev: amdgpu_device pointer 2463 * @type: enum amd_ip_block_type 2464 * @major: major version 2465 * @minor: minor version 2466 * 2467 * return 0 if equal or greater 2468 * return 1 if smaller or the ip_block doesn't exist 2469 */ 2470 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 2471 enum amd_ip_block_type type, 2472 u32 major, u32 minor) 2473 { 2474 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 2475 2476 if (ip_block && ((ip_block->version->major > major) || 2477 ((ip_block->version->major == major) && 2478 (ip_block->version->minor >= minor)))) 2479 return 0; 2480 2481 return 1; 2482 } 2483 2484 static const char *ip_block_names[] = { 2485 [AMD_IP_BLOCK_TYPE_COMMON] = "common", 2486 [AMD_IP_BLOCK_TYPE_GMC] = "gmc", 2487 [AMD_IP_BLOCK_TYPE_IH] = "ih", 2488 [AMD_IP_BLOCK_TYPE_SMC] = "smu", 2489 [AMD_IP_BLOCK_TYPE_PSP] = "psp", 2490 [AMD_IP_BLOCK_TYPE_DCE] = "dce", 2491 [AMD_IP_BLOCK_TYPE_GFX] = "gfx", 2492 [AMD_IP_BLOCK_TYPE_SDMA] = "sdma", 2493 [AMD_IP_BLOCK_TYPE_UVD] = "uvd", 2494 [AMD_IP_BLOCK_TYPE_VCE] = "vce", 2495 [AMD_IP_BLOCK_TYPE_ACP] = "acp", 2496 [AMD_IP_BLOCK_TYPE_VCN] = "vcn", 2497 [AMD_IP_BLOCK_TYPE_MES] = "mes", 2498 [AMD_IP_BLOCK_TYPE_JPEG] = "jpeg", 2499 [AMD_IP_BLOCK_TYPE_VPE] = "vpe", 2500 [AMD_IP_BLOCK_TYPE_UMSCH_MM] = "umsch_mm", 2501 [AMD_IP_BLOCK_TYPE_ISP] = "isp", 2502 [AMD_IP_BLOCK_TYPE_RAS] = "ras", 2503 }; 2504 2505 static const char *ip_block_name(struct amdgpu_device *adev, enum amd_ip_block_type type) 2506 { 2507 int idx = (int)type; 2508 2509 return idx < ARRAY_SIZE(ip_block_names) ? ip_block_names[idx] : "unknown"; 2510 } 2511 2512 /** 2513 * amdgpu_device_ip_block_add 2514 * 2515 * @adev: amdgpu_device pointer 2516 * @ip_block_version: pointer to the IP to add 2517 * 2518 * Adds the IP block driver information to the collection of IPs 2519 * on the asic. 
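 *
 * Harvested VCN/JPEG instances are silently skipped (the function
 * still returns 0 without adding the block), while a NULL
 * @ip_block_version is rejected with -EINVAL; each accepted block is
 * logged with its name and version.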
2520 */ 2521 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 2522 const struct amdgpu_ip_block_version *ip_block_version) 2523 { 2524 if (!ip_block_version) 2525 return -EINVAL; 2526 2527 switch (ip_block_version->type) { 2528 case AMD_IP_BLOCK_TYPE_VCN: 2529 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2530 return 0; 2531 break; 2532 case AMD_IP_BLOCK_TYPE_JPEG: 2533 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2534 return 0; 2535 break; 2536 default: 2537 break; 2538 } 2539 2540 dev_info(adev->dev, "detected ip block number %d <%s_v%d_%d_%d> (%s)\n", 2541 adev->num_ip_blocks, 2542 ip_block_name(adev, ip_block_version->type), 2543 ip_block_version->major, 2544 ip_block_version->minor, 2545 ip_block_version->rev, 2546 ip_block_version->funcs->name); 2547 2548 adev->ip_blocks[adev->num_ip_blocks].adev = adev; 2549 2550 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2551 2552 return 0; 2553 } 2554 2555 /** 2556 * amdgpu_device_enable_virtual_display - enable virtual display feature 2557 * 2558 * @adev: amdgpu_device pointer 2559 * 2560 * Enabled the virtual display feature if the user has enabled it via 2561 * the module parameter virtual_display. This feature provides a virtual 2562 * display hardware on headless boards or in virtualized environments. 2563 * This function parses and validates the configuration string specified by 2564 * the user and configures the virtual display configuration (number of 2565 * virtual connectors, crtcs, etc.) specified. 2566 */ 2567 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2568 { 2569 adev->enable_virtual_display = false; 2570 2571 if (amdgpu_virtual_display) { 2572 const char *pci_address_name = pci_name(adev->pdev); 2573 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2574 2575 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2576 pciaddstr_tmp = pciaddstr; 2577 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2578 pciaddname = strsep(&pciaddname_tmp, ","); 2579 if (!strcmp("all", pciaddname) 2580 || !strcmp(pci_address_name, pciaddname)) { 2581 long num_crtc; 2582 int res = -1; 2583 2584 adev->enable_virtual_display = true; 2585 2586 if (pciaddname_tmp) 2587 res = kstrtol(pciaddname_tmp, 10, 2588 &num_crtc); 2589 2590 if (!res) { 2591 if (num_crtc < 1) 2592 num_crtc = 1; 2593 if (num_crtc > 6) 2594 num_crtc = 6; 2595 adev->mode_info.num_crtc = num_crtc; 2596 } else { 2597 adev->mode_info.num_crtc = 1; 2598 } 2599 break; 2600 } 2601 } 2602 2603 dev_info( 2604 adev->dev, 2605 "virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2606 amdgpu_virtual_display, pci_address_name, 2607 adev->enable_virtual_display, adev->mode_info.num_crtc); 2608 2609 kfree(pciaddstr); 2610 } 2611 } 2612 2613 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2614 { 2615 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2616 adev->mode_info.num_crtc = 1; 2617 adev->enable_virtual_display = true; 2618 dev_info(adev->dev, "virtual_display:%d, num_crtc:%d\n", 2619 adev->enable_virtual_display, 2620 adev->mode_info.num_crtc); 2621 } 2622 } 2623 2624 /** 2625 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2626 * 2627 * @adev: amdgpu_device pointer 2628 * 2629 * Parses the asic configuration parameters specified in the gpu info 2630 * firmware and makes them available to the driver for use in configuring 2631 * the asic. 2632 * Returns 0 on success, -EINVAL on failure. 
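 *
 * Only a few legacy ASICs ship a <chip>_gpu_info.bin (Vega10/12,
 * Raven/Picasso/Raven2, Arcturus, Navi12 without an IP discovery
 * table, and Cyan Skillfish); for all other ASICs this is a no-op
 * returning 0.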
2633 */ 2634 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2635 { 2636 const char *chip_name; 2637 int err; 2638 const struct gpu_info_firmware_header_v1_0 *hdr; 2639 2640 adev->firmware.gpu_info_fw = NULL; 2641 2642 switch (adev->asic_type) { 2643 default: 2644 return 0; 2645 case CHIP_VEGA10: 2646 chip_name = "vega10"; 2647 break; 2648 case CHIP_VEGA12: 2649 chip_name = "vega12"; 2650 break; 2651 case CHIP_RAVEN: 2652 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2653 chip_name = "raven2"; 2654 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2655 chip_name = "picasso"; 2656 else 2657 chip_name = "raven"; 2658 break; 2659 case CHIP_ARCTURUS: 2660 chip_name = "arcturus"; 2661 break; 2662 case CHIP_NAVI12: 2663 if (adev->discovery.bin) 2664 return 0; 2665 chip_name = "navi12"; 2666 break; 2667 case CHIP_CYAN_SKILLFISH: 2668 chip_name = "cyan_skillfish"; 2669 break; 2670 } 2671 2672 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, 2673 AMDGPU_UCODE_OPTIONAL, 2674 "amdgpu/%s_gpu_info.bin", chip_name); 2675 if (err) { 2676 dev_err(adev->dev, 2677 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n", 2678 chip_name); 2679 goto out; 2680 } 2681 2682 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2683 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2684 2685 switch (hdr->version_major) { 2686 case 1: 2687 { 2688 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2689 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2690 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2691 2692 /* 2693 * Should be dropped when DAL no longer needs it. 2694 */ 2695 if (adev->asic_type == CHIP_NAVI12) 2696 goto parse_soc_bounding_box; 2697 2698 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2699 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2700 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2701 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2702 adev->gfx.config.max_texture_channel_caches = 2703 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2704 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2705 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2706 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2707 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2708 adev->gfx.config.double_offchip_lds_buf = 2709 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2710 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2711 adev->gfx.cu_info.max_waves_per_simd = 2712 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2713 adev->gfx.cu_info.max_scratch_slots_per_cu = 2714 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2715 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2716 if (hdr->version_minor >= 1) { 2717 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2718 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2719 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2720 adev->gfx.config.num_sc_per_sh = 2721 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2722 adev->gfx.config.num_packer_per_sc = 2723 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2724 } 2725 2726 parse_soc_bounding_box: 2727 /* 2728 * soc bounding box info is not integrated in disocovery table, 2729 * we always need to parse it from gpu info 
firmware if needed. 2730 */ 2731 if (hdr->version_minor == 2) { 2732 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2733 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2734 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2735 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2736 } 2737 break; 2738 } 2739 default: 2740 dev_err(adev->dev, 2741 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2742 err = -EINVAL; 2743 goto out; 2744 } 2745 out: 2746 return err; 2747 } 2748 2749 static void amdgpu_uid_init(struct amdgpu_device *adev) 2750 { 2751 /* Initialize the UID for the device */ 2752 adev->uid_info = kzalloc(sizeof(struct amdgpu_uid), GFP_KERNEL); 2753 if (!adev->uid_info) { 2754 dev_warn(adev->dev, "Failed to allocate memory for UID\n"); 2755 return; 2756 } 2757 adev->uid_info->adev = adev; 2758 } 2759 2760 static void amdgpu_uid_fini(struct amdgpu_device *adev) 2761 { 2762 /* Free the UID memory */ 2763 kfree(adev->uid_info); 2764 adev->uid_info = NULL; 2765 } 2766 2767 /** 2768 * amdgpu_device_ip_early_init - run early init for hardware IPs 2769 * 2770 * @adev: amdgpu_device pointer 2771 * 2772 * Early initialization pass for hardware IPs. The hardware IPs that make 2773 * up each asic are discovered each IP's early_init callback is run. This 2774 * is the first stage in initializing the asic. 2775 * Returns 0 on success, negative error code on failure. 2776 */ 2777 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2778 { 2779 struct amdgpu_ip_block *ip_block; 2780 struct pci_dev *parent; 2781 bool total, skip_bios; 2782 uint32_t bios_flags; 2783 int i, r; 2784 2785 amdgpu_device_enable_virtual_display(adev); 2786 2787 if (amdgpu_sriov_vf(adev)) { 2788 r = amdgpu_virt_request_full_gpu(adev, true); 2789 if (r) 2790 return r; 2791 2792 r = amdgpu_virt_init_critical_region(adev); 2793 if (r) 2794 return r; 2795 } 2796 2797 switch (adev->asic_type) { 2798 #ifdef CONFIG_DRM_AMDGPU_SI 2799 case CHIP_VERDE: 2800 case CHIP_TAHITI: 2801 case CHIP_PITCAIRN: 2802 case CHIP_OLAND: 2803 case CHIP_HAINAN: 2804 adev->family = AMDGPU_FAMILY_SI; 2805 r = si_set_ip_blocks(adev); 2806 if (r) 2807 return r; 2808 break; 2809 #endif 2810 #ifdef CONFIG_DRM_AMDGPU_CIK 2811 case CHIP_BONAIRE: 2812 case CHIP_HAWAII: 2813 case CHIP_KAVERI: 2814 case CHIP_KABINI: 2815 case CHIP_MULLINS: 2816 if (adev->flags & AMD_IS_APU) 2817 adev->family = AMDGPU_FAMILY_KV; 2818 else 2819 adev->family = AMDGPU_FAMILY_CI; 2820 2821 r = cik_set_ip_blocks(adev); 2822 if (r) 2823 return r; 2824 break; 2825 #endif 2826 case CHIP_TOPAZ: 2827 case CHIP_TONGA: 2828 case CHIP_FIJI: 2829 case CHIP_POLARIS10: 2830 case CHIP_POLARIS11: 2831 case CHIP_POLARIS12: 2832 case CHIP_VEGAM: 2833 case CHIP_CARRIZO: 2834 case CHIP_STONEY: 2835 if (adev->flags & AMD_IS_APU) 2836 adev->family = AMDGPU_FAMILY_CZ; 2837 else 2838 adev->family = AMDGPU_FAMILY_VI; 2839 2840 r = vi_set_ip_blocks(adev); 2841 if (r) 2842 return r; 2843 break; 2844 default: 2845 r = amdgpu_discovery_set_ip_blocks(adev); 2846 if (r) 2847 return r; 2848 break; 2849 } 2850 2851 /* Check for IP version 9.4.3 with A0 hardware */ 2852 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) && 2853 !amdgpu_device_get_rev_id(adev)) { 2854 dev_err(adev->dev, "Unsupported A0 hardware\n"); 2855 return -ENODEV; /* device unsupported - no device error */ 2856 } 2857 2858 if (amdgpu_has_atpx() && 2859 (amdgpu_is_atpx_hybrid() || 2860 amdgpu_has_atpx_dgpu_power_cntl()) && 2861 ((adev->flags & AMD_IS_APU) == 
0) && 2862 !dev_is_removable(&adev->pdev->dev)) 2863 adev->flags |= AMD_IS_PX; 2864 2865 if (!(adev->flags & AMD_IS_APU)) { 2866 parent = pcie_find_root_port(adev->pdev); 2867 adev->has_pr3 = parent ? pci_pr3_present(parent) : false; 2868 } 2869 2870 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2871 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2872 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2873 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2874 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2875 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2876 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2877 2878 adev->virt.is_xgmi_node_migrate_enabled = false; 2879 if (amdgpu_sriov_vf(adev)) { 2880 adev->virt.is_xgmi_node_migrate_enabled = 2881 amdgpu_ip_version((adev), GC_HWIP, 0) == IP_VERSION(9, 4, 4); 2882 } 2883 2884 total = true; 2885 for (i = 0; i < adev->num_ip_blocks; i++) { 2886 ip_block = &adev->ip_blocks[i]; 2887 2888 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2889 dev_warn(adev->dev, "disabled ip block: %d <%s>\n", i, 2890 adev->ip_blocks[i].version->funcs->name); 2891 adev->ip_blocks[i].status.valid = false; 2892 } else if (ip_block->version->funcs->early_init) { 2893 r = ip_block->version->funcs->early_init(ip_block); 2894 if (r == -ENOENT) { 2895 adev->ip_blocks[i].status.valid = false; 2896 } else if (r) { 2897 dev_err(adev->dev, 2898 "early_init of IP block <%s> failed %d\n", 2899 adev->ip_blocks[i].version->funcs->name, 2900 r); 2901 total = false; 2902 } else { 2903 adev->ip_blocks[i].status.valid = true; 2904 } 2905 } else { 2906 adev->ip_blocks[i].status.valid = true; 2907 } 2908 /* get the vbios after the asic_funcs are set up */ 2909 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2910 r = amdgpu_device_parse_gpu_info_fw(adev); 2911 if (r) 2912 return r; 2913 2914 bios_flags = amdgpu_device_get_vbios_flags(adev); 2915 skip_bios = !!(bios_flags & AMDGPU_VBIOS_SKIP); 2916 /* Read BIOS */ 2917 if (!skip_bios) { 2918 bool optional = 2919 !!(bios_flags & AMDGPU_VBIOS_OPTIONAL); 2920 if (!amdgpu_get_bios(adev) && !optional) 2921 return -EINVAL; 2922 2923 if (optional && !adev->bios) 2924 dev_info( 2925 adev->dev, 2926 "VBIOS image optional, proceeding without VBIOS image"); 2927 2928 if (adev->bios) { 2929 r = amdgpu_atombios_init(adev); 2930 if (r) { 2931 dev_err(adev->dev, 2932 "amdgpu_atombios_init failed\n"); 2933 amdgpu_vf_error_put( 2934 adev, 2935 AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 2936 0, 0); 2937 return r; 2938 } 2939 } 2940 } 2941 2942 /*get pf2vf msg info at it's earliest time*/ 2943 if (amdgpu_sriov_vf(adev)) 2944 amdgpu_virt_init_data_exchange(adev); 2945 2946 } 2947 } 2948 if (!total) 2949 return -ENODEV; 2950 2951 if (adev->gmc.xgmi.supported) 2952 amdgpu_xgmi_early_init(adev); 2953 2954 if (amdgpu_is_multi_aid(adev)) 2955 amdgpu_uid_init(adev); 2956 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 2957 if (ip_block->status.valid != false) 2958 amdgpu_amdkfd_device_probe(adev); 2959 2960 adev->cg_flags &= amdgpu_cg_mask; 2961 adev->pg_flags &= amdgpu_pg_mask; 2962 2963 return 0; 2964 } 2965 2966 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2967 { 2968 int i, r; 2969 2970 for (i = 0; i < adev->num_ip_blocks; i++) { 2971 if (!adev->ip_blocks[i].status.sw) 2972 continue; 2973 if (adev->ip_blocks[i].status.hw) 2974 continue; 2975 if (!amdgpu_ip_member_of_hwini( 2976 adev, adev->ip_blocks[i].version->type)) 2977 continue; 2978 if 
(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2979 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2980 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2981 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2982 if (r) { 2983 dev_err(adev->dev, 2984 "hw_init of IP block <%s> failed %d\n", 2985 adev->ip_blocks[i].version->funcs->name, 2986 r); 2987 return r; 2988 } 2989 adev->ip_blocks[i].status.hw = true; 2990 } 2991 } 2992 2993 return 0; 2994 } 2995 2996 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2997 { 2998 int i, r; 2999 3000 for (i = 0; i < adev->num_ip_blocks; i++) { 3001 if (!adev->ip_blocks[i].status.sw) 3002 continue; 3003 if (adev->ip_blocks[i].status.hw) 3004 continue; 3005 if (!amdgpu_ip_member_of_hwini( 3006 adev, adev->ip_blocks[i].version->type)) 3007 continue; 3008 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3009 if (r) { 3010 dev_err(adev->dev, 3011 "hw_init of IP block <%s> failed %d\n", 3012 adev->ip_blocks[i].version->funcs->name, r); 3013 return r; 3014 } 3015 adev->ip_blocks[i].status.hw = true; 3016 } 3017 3018 return 0; 3019 } 3020 3021 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 3022 { 3023 int r = 0; 3024 int i; 3025 uint32_t smu_version; 3026 3027 if (adev->asic_type >= CHIP_VEGA10) { 3028 for (i = 0; i < adev->num_ip_blocks; i++) { 3029 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 3030 continue; 3031 3032 if (!amdgpu_ip_member_of_hwini(adev, 3033 AMD_IP_BLOCK_TYPE_PSP)) 3034 break; 3035 3036 if (!adev->ip_blocks[i].status.sw) 3037 continue; 3038 3039 /* no need to do the fw loading again if already done*/ 3040 if (adev->ip_blocks[i].status.hw == true) 3041 break; 3042 3043 if (amdgpu_in_reset(adev) || adev->in_suspend) { 3044 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3045 if (r) 3046 return r; 3047 } else { 3048 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3049 if (r) { 3050 dev_err(adev->dev, 3051 "hw_init of IP block <%s> failed %d\n", 3052 adev->ip_blocks[i] 3053 .version->funcs->name, 3054 r); 3055 return r; 3056 } 3057 adev->ip_blocks[i].status.hw = true; 3058 } 3059 break; 3060 } 3061 } 3062 3063 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 3064 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 3065 3066 return r; 3067 } 3068 3069 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 3070 { 3071 struct drm_sched_init_args args = { 3072 .ops = &amdgpu_sched_ops, 3073 .num_rqs = DRM_SCHED_PRIORITY_COUNT, 3074 .timeout_wq = adev->reset_domain->wq, 3075 .dev = adev->dev, 3076 }; 3077 long timeout; 3078 int r, i; 3079 3080 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 3081 struct amdgpu_ring *ring = adev->rings[i]; 3082 3083 /* No need to setup the GPU scheduler for rings that don't need it */ 3084 if (!ring || ring->no_scheduler) 3085 continue; 3086 3087 switch (ring->funcs->type) { 3088 case AMDGPU_RING_TYPE_GFX: 3089 timeout = adev->gfx_timeout; 3090 break; 3091 case AMDGPU_RING_TYPE_COMPUTE: 3092 timeout = adev->compute_timeout; 3093 break; 3094 case AMDGPU_RING_TYPE_SDMA: 3095 timeout = adev->sdma_timeout; 3096 break; 3097 default: 3098 timeout = adev->video_timeout; 3099 break; 3100 } 3101 3102 args.timeout = timeout; 3103 args.credit_limit = ring->num_hw_submission; 3104 args.score = ring->sched_score; 3105 args.name = ring->name; 3106 3107 r = drm_sched_init(&ring->sched, &args); 3108 if (r) { 3109 
dev_err(adev->dev, 3110 "Failed to create scheduler on ring %s.\n", 3111 ring->name); 3112 return r; 3113 } 3114 r = amdgpu_uvd_entity_init(adev, ring); 3115 if (r) { 3116 dev_err(adev->dev, 3117 "Failed to create UVD scheduling entity on ring %s.\n", 3118 ring->name); 3119 return r; 3120 } 3121 r = amdgpu_vce_entity_init(adev, ring); 3122 if (r) { 3123 dev_err(adev->dev, 3124 "Failed to create VCE scheduling entity on ring %s.\n", 3125 ring->name); 3126 return r; 3127 } 3128 } 3129 3130 if (adev->xcp_mgr) 3131 amdgpu_xcp_update_partition_sched_list(adev); 3132 3133 return 0; 3134 } 3135 3136 3137 /** 3138 * amdgpu_device_ip_init - run init for hardware IPs 3139 * 3140 * @adev: amdgpu_device pointer 3141 * 3142 * Main initialization pass for hardware IPs. The list of all the hardware 3143 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 3144 * are run. sw_init initializes the software state associated with each IP 3145 * and hw_init initializes the hardware associated with each IP. 3146 * Returns 0 on success, negative error code on failure. 3147 */ 3148 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 3149 { 3150 bool init_badpage; 3151 int i, r; 3152 3153 r = amdgpu_ras_init(adev); 3154 if (r) 3155 return r; 3156 3157 for (i = 0; i < adev->num_ip_blocks; i++) { 3158 if (!adev->ip_blocks[i].status.valid) 3159 continue; 3160 if (adev->ip_blocks[i].version->funcs->sw_init) { 3161 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]); 3162 if (r) { 3163 dev_err(adev->dev, 3164 "sw_init of IP block <%s> failed %d\n", 3165 adev->ip_blocks[i].version->funcs->name, 3166 r); 3167 goto init_failed; 3168 } 3169 } 3170 adev->ip_blocks[i].status.sw = true; 3171 3172 if (!amdgpu_ip_member_of_hwini( 3173 adev, adev->ip_blocks[i].version->type)) 3174 continue; 3175 3176 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 3177 /* need to do common hw init early so everything is set up for gmc */ 3178 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3179 if (r) { 3180 dev_err(adev->dev, "hw_init %d failed %d\n", i, 3181 r); 3182 goto init_failed; 3183 } 3184 adev->ip_blocks[i].status.hw = true; 3185 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3186 /* need to do gmc hw init early so we can allocate gpu mem */ 3187 /* Try to reserve bad pages early */ 3188 if (amdgpu_sriov_vf(adev)) 3189 amdgpu_virt_exchange_data(adev); 3190 3191 r = amdgpu_device_mem_scratch_init(adev); 3192 if (r) { 3193 dev_err(adev->dev, 3194 "amdgpu_mem_scratch_init failed %d\n", 3195 r); 3196 goto init_failed; 3197 } 3198 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3199 if (r) { 3200 dev_err(adev->dev, "hw_init %d failed %d\n", i, 3201 r); 3202 goto init_failed; 3203 } 3204 r = amdgpu_device_wb_init(adev); 3205 if (r) { 3206 dev_err(adev->dev, 3207 "amdgpu_device_wb_init failed %d\n", r); 3208 goto init_failed; 3209 } 3210 adev->ip_blocks[i].status.hw = true; 3211 3212 /* right after GMC hw init, we create CSA */ 3213 if (adev->gfx.mcbp) { 3214 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 3215 AMDGPU_GEM_DOMAIN_VRAM | 3216 AMDGPU_GEM_DOMAIN_GTT, 3217 AMDGPU_CSA_SIZE); 3218 if (r) { 3219 dev_err(adev->dev, 3220 "allocate CSA failed %d\n", r); 3221 goto init_failed; 3222 } 3223 } 3224 3225 r = amdgpu_seq64_init(adev); 3226 if (r) { 3227 dev_err(adev->dev, "allocate seq64 failed %d\n", 3228 r); 3229 goto init_failed; 3230 } 3231 } 3232 } 3233 3234 if (amdgpu_sriov_vf(adev)) 3235 
amdgpu_virt_init_data_exchange(adev); 3236 3237 r = amdgpu_ib_pool_init(adev); 3238 if (r) { 3239 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 3240 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 3241 goto init_failed; 3242 } 3243 3244 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 3245 if (r) 3246 goto init_failed; 3247 3248 r = amdgpu_device_ip_hw_init_phase1(adev); 3249 if (r) 3250 goto init_failed; 3251 3252 r = amdgpu_device_fw_loading(adev); 3253 if (r) 3254 goto init_failed; 3255 3256 r = amdgpu_device_ip_hw_init_phase2(adev); 3257 if (r) 3258 goto init_failed; 3259 3260 /* 3261 * retired pages will be loaded from eeprom and reserved here, 3262 * it should be called after amdgpu_device_ip_hw_init_phase2 since 3263 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 3264 * for I2C communication which only true at this point. 3265 * 3266 * amdgpu_ras_recovery_init may fail, but the upper only cares the 3267 * failure from bad gpu situation and stop amdgpu init process 3268 * accordingly. For other failed cases, it will still release all 3269 * the resource and print error message, rather than returning one 3270 * negative value to upper level. 3271 * 3272 * Note: theoretically, this should be called before all vram allocations 3273 * to protect retired page from abusing 3274 */ 3275 init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 3276 r = amdgpu_ras_recovery_init(adev, init_badpage); 3277 if (r) 3278 goto init_failed; 3279 3280 /** 3281 * In case of XGMI grab extra reference for reset domain for this device 3282 */ 3283 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3284 if (amdgpu_xgmi_add_device(adev) == 0) { 3285 if (!amdgpu_sriov_vf(adev)) { 3286 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3287 3288 if (WARN_ON(!hive)) { 3289 r = -ENOENT; 3290 goto init_failed; 3291 } 3292 3293 if (!hive->reset_domain || 3294 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 3295 r = -ENOENT; 3296 amdgpu_put_xgmi_hive(hive); 3297 goto init_failed; 3298 } 3299 3300 /* Drop the early temporary reset domain we created for device */ 3301 amdgpu_reset_put_reset_domain(adev->reset_domain); 3302 adev->reset_domain = hive->reset_domain; 3303 amdgpu_put_xgmi_hive(hive); 3304 } 3305 } 3306 } 3307 3308 r = amdgpu_device_init_schedulers(adev); 3309 if (r) 3310 goto init_failed; 3311 3312 if (adev->mman.buffer_funcs_ring->sched.ready) 3313 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3314 3315 /* Don't init kfd if whole hive need to be reset during init */ 3316 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 3317 kgd2kfd_init_zone_device(adev); 3318 amdgpu_amdkfd_device_init(adev); 3319 } 3320 3321 amdgpu_fru_get_product_info(adev); 3322 3323 if (!amdgpu_sriov_vf(adev) || amdgpu_sriov_ras_cper_en(adev)) 3324 r = amdgpu_cper_init(adev); 3325 3326 init_failed: 3327 3328 return r; 3329 } 3330 3331 /** 3332 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 3333 * 3334 * @adev: amdgpu_device pointer 3335 * 3336 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 3337 * this function before a GPU reset. If the value is retained after a 3338 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 
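 *
 * Paired with amdgpu_device_check_vram_lost() below: the GART-visible
 * bytes are snapshotted here before a reset and compared afterwards;
 * a mismatch, or a BACO/mode1/link/legacy reset (which is assumed to
 * clobber VRAM), is reported as VRAM loss.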
3339 */ 3340 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 3341 { 3342 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 3343 } 3344 3345 /** 3346 * amdgpu_device_check_vram_lost - check if vram is valid 3347 * 3348 * @adev: amdgpu_device pointer 3349 * 3350 * Checks the reset magic value written to the gart pointer in VRAM. 3351 * The driver calls this after a GPU reset to see if the contents of 3352 * VRAM is lost or now. 3353 * returns true if vram is lost, false if not. 3354 */ 3355 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 3356 { 3357 if (memcmp(adev->gart.ptr, adev->reset_magic, 3358 AMDGPU_RESET_MAGIC_NUM)) 3359 return true; 3360 3361 if (!amdgpu_in_reset(adev)) 3362 return false; 3363 3364 /* 3365 * For all ASICs with baco/mode1 reset, the VRAM is 3366 * always assumed to be lost. 3367 */ 3368 switch (amdgpu_asic_reset_method(adev)) { 3369 case AMD_RESET_METHOD_LEGACY: 3370 case AMD_RESET_METHOD_LINK: 3371 case AMD_RESET_METHOD_BACO: 3372 case AMD_RESET_METHOD_MODE1: 3373 return true; 3374 default: 3375 return false; 3376 } 3377 } 3378 3379 /** 3380 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 3381 * 3382 * @adev: amdgpu_device pointer 3383 * @state: clockgating state (gate or ungate) 3384 * 3385 * The list of all the hardware IPs that make up the asic is walked and the 3386 * set_clockgating_state callbacks are run. 3387 * Late initialization pass enabling clockgating for hardware IPs. 3388 * Fini or suspend, pass disabling clockgating for hardware IPs. 3389 * Returns 0 on success, negative error code on failure. 3390 */ 3391 3392 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 3393 enum amd_clockgating_state state) 3394 { 3395 int i, j, r; 3396 3397 if (amdgpu_emu_mode == 1) 3398 return 0; 3399 3400 for (j = 0; j < adev->num_ip_blocks; j++) { 3401 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3402 if (!adev->ip_blocks[i].status.late_initialized) 3403 continue; 3404 /* skip CG for GFX, SDMA on S0ix */ 3405 if (adev->in_s0ix && 3406 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3407 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3408 continue; 3409 /* skip CG for VCE/UVD, it's handled specially */ 3410 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3411 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3412 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3413 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3414 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 3415 /* enable clockgating to save power */ 3416 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i], 3417 state); 3418 if (r) { 3419 dev_err(adev->dev, 3420 "set_clockgating_state(gate) of IP block <%s> failed %d\n", 3421 adev->ip_blocks[i].version->funcs->name, 3422 r); 3423 return r; 3424 } 3425 } 3426 } 3427 3428 return 0; 3429 } 3430 3431 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 3432 enum amd_powergating_state state) 3433 { 3434 int i, j, r; 3435 3436 if (amdgpu_emu_mode == 1) 3437 return 0; 3438 3439 for (j = 0; j < adev->num_ip_blocks; j++) { 3440 i = state == AMD_PG_STATE_GATE ? 
j : adev->num_ip_blocks - j - 1;
3441 if (!adev->ip_blocks[i].status.late_initialized)
3442 continue;
3443 /* skip PG for GFX, SDMA on S0ix */
3444 if (adev->in_s0ix &&
3445 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3446 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3447 continue;
3448 /* skip PG for VCE/UVD, it's handled specially */
3449 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
3450 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
3451 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
3452 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
3453 adev->ip_blocks[i].version->funcs->set_powergating_state) {
3454 /* enable powergating to save power */
3455 r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i],
3456 state);
3457 if (r) {
3458 dev_err(adev->dev,
3459 "set_powergating_state(gate) of IP block <%s> failed %d\n",
3460 adev->ip_blocks[i].version->funcs->name,
3461 r);
3462 return r;
3463 }
3464 }
3465 }
3466 return 0;
3467 }
3468
3469 static int amdgpu_device_enable_mgpu_fan_boost(void)
3470 {
3471 struct amdgpu_gpu_instance *gpu_ins;
3472 struct amdgpu_device *adev;
3473 int i, ret = 0;
3474
3475 mutex_lock(&mgpu_info.mutex);
3476
3477 /*
3478 * MGPU fan boost feature should be enabled
3479 * only when there are two or more dGPUs in
3480 * the system
3481 */
3482 if (mgpu_info.num_dgpu < 2)
3483 goto out;
3484
3485 for (i = 0; i < mgpu_info.num_dgpu; i++) {
3486 gpu_ins = &(mgpu_info.gpu_ins[i]);
3487 adev = gpu_ins->adev;
3488 if (!(adev->flags & AMD_IS_APU || amdgpu_sriov_multi_vf_mode(adev)) &&
3489 !gpu_ins->mgpu_fan_enabled) {
3490 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
3491 if (ret)
3492 break;
3493
3494 gpu_ins->mgpu_fan_enabled = 1;
3495 }
3496 }
3497
3498 out:
3499 mutex_unlock(&mgpu_info.mutex);
3500
3501 return ret;
3502 }
3503
3504 /**
3505 * amdgpu_device_ip_late_init - run late init for hardware IPs
3506 *
3507 * @adev: amdgpu_device pointer
3508 *
3509 * Late initialization pass for hardware IPs. The list of all the hardware
3510 * IPs that make up the asic is walked and the late_init callbacks are run.
3511 * late_init covers any special initialization that an IP requires
3512 * after all of them have been initialized or something that needs to happen
3513 * late in the init process.
3514 * Returns 0 on success, negative error code on failure.
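 *
 * Besides the per-IP late_init callbacks, this is also where RAS late
 * init runs, clockgating/powergating are enabled, the VRAM reset magic
 * is recorded and, for XGMI hives, the link p-state is lowered once all
 * interlinked devices have initialized (see the body below).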
3515 */
3516 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
3517 {
3518 struct amdgpu_gpu_instance *gpu_instance;
3519 int i = 0, r;
3520
3521 for (i = 0; i < adev->num_ip_blocks; i++) {
3522 if (!adev->ip_blocks[i].status.hw)
3523 continue;
3524 if (adev->ip_blocks[i].version->funcs->late_init) {
3525 r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]);
3526 if (r) {
3527 dev_err(adev->dev,
3528 "late_init of IP block <%s> failed %d\n",
3529 adev->ip_blocks[i].version->funcs->name,
3530 r);
3531 return r;
3532 }
3533 }
3534 adev->ip_blocks[i].status.late_initialized = true;
3535 }
3536
3537 r = amdgpu_ras_late_init(adev);
3538 if (r) {
3539 dev_err(adev->dev, "amdgpu_ras_late_init failed %d", r);
3540 return r;
3541 }
3542
3543 if (!amdgpu_reset_in_recovery(adev))
3544 amdgpu_ras_set_error_query_ready(adev, true);
3545
3546 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
3547 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
3548
3549 amdgpu_device_fill_reset_magic(adev);
3550
3551 r = amdgpu_device_enable_mgpu_fan_boost();
3552 if (r)
3553 dev_err(adev->dev, "enable mgpu fan boost failed (%d).\n", r);
3554
3555 /* For passthrough configurations on Arcturus and Aldebaran, enable special SBR handling */
3556 if (amdgpu_passthrough(adev) &&
3557 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
3558 adev->asic_type == CHIP_ALDEBARAN))
3559 amdgpu_dpm_handle_passthrough_sbr(adev, true);
3560
3561 if (adev->gmc.xgmi.num_physical_nodes > 1) {
3562 mutex_lock(&mgpu_info.mutex);
3563
3564 /*
3565 * Reset the device p-state to low, as it was booted in the high p-state.
3566 *
3567 * This should be performed only after all devices from the same
3568 * hive have been initialized.
3569 *
3570 * However, the number of devices in the hive is not known in advance;
3571 * it is counted one by one as each device initializes.
3572 *
3573 * So we wait until all XGMI interlinked devices have initialized.
3574 * This may add some delay, as those devices may come from
3575 * different hives. But that should be OK.
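 *
 * Concretely: on a system with a single 4-node hive, the check below
 * only passes once mgpu_info.num_dgpu reaches 4, i.e. when the last
 * interlinked device finishes init; that device then lowers the XGMI
 * p-state for every dGPU tracked in mgpu_info.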
3576 */ 3577 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3578 for (i = 0; i < mgpu_info.num_gpu; i++) { 3579 gpu_instance = &(mgpu_info.gpu_ins[i]); 3580 if (gpu_instance->adev->flags & AMD_IS_APU) 3581 continue; 3582 3583 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3584 AMDGPU_XGMI_PSTATE_MIN); 3585 if (r) { 3586 dev_err(adev->dev, 3587 "pstate setting failed (%d).\n", 3588 r); 3589 break; 3590 } 3591 } 3592 } 3593 3594 mutex_unlock(&mgpu_info.mutex); 3595 } 3596 3597 return 0; 3598 } 3599 3600 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block) 3601 { 3602 struct amdgpu_device *adev = ip_block->adev; 3603 int r; 3604 3605 if (!ip_block->version->funcs->hw_fini) { 3606 dev_err(adev->dev, "hw_fini of IP block <%s> not defined\n", 3607 ip_block->version->funcs->name); 3608 } else { 3609 r = ip_block->version->funcs->hw_fini(ip_block); 3610 /* XXX handle errors */ 3611 if (r) { 3612 dev_dbg(adev->dev, 3613 "hw_fini of IP block <%s> failed %d\n", 3614 ip_block->version->funcs->name, r); 3615 } 3616 } 3617 3618 ip_block->status.hw = false; 3619 } 3620 3621 /** 3622 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3623 * 3624 * @adev: amdgpu_device pointer 3625 * 3626 * For ASICs need to disable SMC first 3627 */ 3628 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3629 { 3630 int i; 3631 3632 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3633 return; 3634 3635 for (i = 0; i < adev->num_ip_blocks; i++) { 3636 if (!adev->ip_blocks[i].status.hw) 3637 continue; 3638 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3639 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3640 break; 3641 } 3642 } 3643 } 3644 3645 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3646 { 3647 int i, r; 3648 3649 for (i = 0; i < adev->num_ip_blocks; i++) { 3650 if (!adev->ip_blocks[i].version->funcs->early_fini) 3651 continue; 3652 3653 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]); 3654 if (r) { 3655 dev_dbg(adev->dev, 3656 "early_fini of IP block <%s> failed %d\n", 3657 adev->ip_blocks[i].version->funcs->name, r); 3658 } 3659 } 3660 3661 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3662 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3663 3664 amdgpu_amdkfd_suspend(adev, true); 3665 amdgpu_userq_suspend(adev); 3666 3667 /* Workaround for ASICs need to disable SMC first */ 3668 amdgpu_device_smu_fini_early(adev); 3669 3670 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3671 if (!adev->ip_blocks[i].status.hw) 3672 continue; 3673 3674 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3675 } 3676 3677 if (amdgpu_sriov_vf(adev)) { 3678 if (amdgpu_virt_release_full_gpu(adev, false)) 3679 dev_err(adev->dev, 3680 "failed to release exclusive mode on fini\n"); 3681 } 3682 3683 return 0; 3684 } 3685 3686 /** 3687 * amdgpu_device_ip_fini - run fini for hardware IPs 3688 * 3689 * @adev: amdgpu_device pointer 3690 * 3691 * Main teardown pass for hardware IPs. The list of all the hardware 3692 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3693 * are run. hw_fini tears down the hardware associated with each IP 3694 * and sw_fini tears down any software state associated with each IP. 3695 * Returns 0 on success, negative error code on failure. 
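 *
 * Teardown walks the IP list in reverse order; when the GMC block is
 * reached, the buffers shared across IPs (ucode BO, static CSA,
 * writeback, scratch memory, IB pool, seq64 and doorbells) are freed
 * as well, and a final reverse pass invokes any late_fini callbacks.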
3696 */ 3697 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3698 { 3699 int i, r; 3700 3701 amdgpu_cper_fini(adev); 3702 3703 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3704 amdgpu_virt_release_ras_err_handler_data(adev); 3705 3706 if (adev->gmc.xgmi.num_physical_nodes > 1) 3707 amdgpu_xgmi_remove_device(adev); 3708 3709 amdgpu_amdkfd_device_fini_sw(adev); 3710 3711 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3712 if (!adev->ip_blocks[i].status.sw) 3713 continue; 3714 3715 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3716 amdgpu_ucode_free_bo(adev); 3717 amdgpu_free_static_csa(&adev->virt.csa_obj); 3718 amdgpu_device_wb_fini(adev); 3719 amdgpu_device_mem_scratch_fini(adev); 3720 amdgpu_ib_pool_fini(adev); 3721 amdgpu_seq64_fini(adev); 3722 amdgpu_doorbell_fini(adev); 3723 } 3724 if (adev->ip_blocks[i].version->funcs->sw_fini) { 3725 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]); 3726 /* XXX handle errors */ 3727 if (r) { 3728 dev_dbg(adev->dev, 3729 "sw_fini of IP block <%s> failed %d\n", 3730 adev->ip_blocks[i].version->funcs->name, 3731 r); 3732 } 3733 } 3734 adev->ip_blocks[i].status.sw = false; 3735 adev->ip_blocks[i].status.valid = false; 3736 } 3737 3738 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3739 if (!adev->ip_blocks[i].status.late_initialized) 3740 continue; 3741 if (adev->ip_blocks[i].version->funcs->late_fini) 3742 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]); 3743 adev->ip_blocks[i].status.late_initialized = false; 3744 } 3745 3746 amdgpu_ras_fini(adev); 3747 amdgpu_uid_fini(adev); 3748 3749 return 0; 3750 } 3751 3752 /** 3753 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3754 * 3755 * @work: work_struct. 3756 */ 3757 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3758 { 3759 struct amdgpu_device *adev = 3760 container_of(work, struct amdgpu_device, delayed_init_work.work); 3761 int r; 3762 3763 r = amdgpu_ib_ring_tests(adev); 3764 if (r) 3765 dev_err(adev->dev, "ib ring test failed (%d).\n", r); 3766 } 3767 3768 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3769 { 3770 struct amdgpu_device *adev = 3771 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3772 3773 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3774 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3775 3776 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true, 0)) 3777 adev->gfx.gfx_off_state = true; 3778 } 3779 3780 /** 3781 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3782 * 3783 * @adev: amdgpu_device pointer 3784 * 3785 * Main suspend function for hardware IPs. The list of all the hardware 3786 * IPs that make up the asic is walked, clockgating is disabled and the 3787 * suspend callbacks are run. suspend puts the hardware and software state 3788 * in each IP into a state suitable for suspend. 3789 * Returns 0 on success, negative error code on failure. 3790 */ 3791 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3792 { 3793 int i, r, rec; 3794 3795 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3796 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3797 3798 /* 3799 * Per PMFW team's suggestion, driver needs to handle gfxoff 3800 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3801 * scenario. Add the missing df cstate disablement here. 
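 *
 * Note that phase 1 below only suspends the display (DCE) blocks; all
 * other IPs are handled in phase 2. If a display suspend fails, the
 * unwind path runs resume phase 3 and re-enables DF cstates and
 * CG/PG gating before returning the error.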
3802 */ 3803 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3804 dev_warn(adev->dev, "Failed to disallow df cstate"); 3805 3806 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3807 if (!adev->ip_blocks[i].status.valid) 3808 continue; 3809 3810 /* displays are handled separately */ 3811 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3812 continue; 3813 3814 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3815 if (r) 3816 goto unwind; 3817 } 3818 3819 return 0; 3820 unwind: 3821 rec = amdgpu_device_ip_resume_phase3(adev); 3822 if (rec) 3823 dev_err(adev->dev, 3824 "amdgpu_device_ip_resume_phase3 failed during unwind: %d\n", 3825 rec); 3826 3827 amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW); 3828 3829 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3830 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3831 3832 return r; 3833 } 3834 3835 /** 3836 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3837 * 3838 * @adev: amdgpu_device pointer 3839 * 3840 * Main suspend function for hardware IPs. The list of all the hardware 3841 * IPs that make up the asic is walked, clockgating is disabled and the 3842 * suspend callbacks are run. suspend puts the hardware and software state 3843 * in each IP into a state suitable for suspend. 3844 * Returns 0 on success, negative error code on failure. 3845 */ 3846 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3847 { 3848 int i, r, rec; 3849 3850 if (adev->in_s0ix) 3851 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3852 3853 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3854 if (!adev->ip_blocks[i].status.valid) 3855 continue; 3856 /* displays are handled in phase1 */ 3857 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3858 continue; 3859 /* PSP lost connection when err_event_athub occurs */ 3860 if (amdgpu_ras_intr_triggered() && 3861 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3862 adev->ip_blocks[i].status.hw = false; 3863 continue; 3864 } 3865 3866 /* skip unnecessary suspend if we do not initialize them yet */ 3867 if (!amdgpu_ip_member_of_hwini( 3868 adev, adev->ip_blocks[i].version->type)) 3869 continue; 3870 3871 /* Since we skip suspend for S0i3, we need to cancel the delayed 3872 * idle work here as the suspend callback never gets called. 3873 */ 3874 if (adev->in_s0ix && 3875 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX && 3876 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 0, 0)) 3877 cancel_delayed_work_sync(&adev->gfx.idle_work); 3878 /* skip suspend of gfx/mes and psp for S0ix 3879 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3880 * like at runtime. PSP is also part of the always on hardware 3881 * so no need to suspend it. 3882 */ 3883 if (adev->in_s0ix && 3884 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3885 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3886 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3887 continue; 3888 3889 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3890 if (adev->in_s0ix && 3891 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3892 IP_VERSION(5, 0, 0)) && 3893 (adev->ip_blocks[i].version->type == 3894 AMD_IP_BLOCK_TYPE_SDMA)) 3895 continue; 3896 3897 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 
3898 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3899 * from this location and RLC Autoload automatically also gets loaded 3900 * from here based on PMFW -> PSP message during re-init sequence. 3901 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3902 * the TMR and reload FWs again for IMU enabled APU ASICs. 3903 */ 3904 if (amdgpu_in_reset(adev) && 3905 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3906 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3907 continue; 3908 3909 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3910 if (r) 3911 goto unwind; 3912 3913 /* handle putting the SMC in the appropriate state */ 3914 if (!amdgpu_sriov_vf(adev)) { 3915 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3916 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3917 if (r) { 3918 dev_err(adev->dev, 3919 "SMC failed to set mp1 state %d, %d\n", 3920 adev->mp1_state, r); 3921 goto unwind; 3922 } 3923 } 3924 } 3925 } 3926 3927 return 0; 3928 unwind: 3929 /* suspend phase 2 = resume phase 1 + resume phase 2 */ 3930 rec = amdgpu_device_ip_resume_phase1(adev); 3931 if (rec) { 3932 dev_err(adev->dev, 3933 "amdgpu_device_ip_resume_phase1 failed during unwind: %d\n", 3934 rec); 3935 return r; 3936 } 3937 3938 rec = amdgpu_device_fw_loading(adev); 3939 if (rec) { 3940 dev_err(adev->dev, 3941 "amdgpu_device_fw_loading failed during unwind: %d\n", 3942 rec); 3943 return r; 3944 } 3945 3946 rec = amdgpu_device_ip_resume_phase2(adev); 3947 if (rec) { 3948 dev_err(adev->dev, 3949 "amdgpu_device_ip_resume_phase2 failed during unwind: %d\n", 3950 rec); 3951 return r; 3952 } 3953 3954 return r; 3955 } 3956 3957 /** 3958 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3959 * 3960 * @adev: amdgpu_device pointer 3961 * 3962 * Main suspend function for hardware IPs. The list of all the hardware 3963 * IPs that make up the asic is walked, clockgating is disabled and the 3964 * suspend callbacks are run. suspend puts the hardware and software state 3965 * in each IP into a state suitable for suspend. 3966 * Returns 0 on success, negative error code on failure. 
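 * Under SR-IOV, the data exchange with the host is shut down first and
 * full GPU access is requested for the duration of both suspend phases,
 * then released again afterwards.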
3967 */ 3968 static int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3969 { 3970 int r; 3971 3972 if (amdgpu_sriov_vf(adev)) { 3973 amdgpu_virt_fini_data_exchange(adev); 3974 amdgpu_virt_request_full_gpu(adev, false); 3975 } 3976 3977 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3978 3979 r = amdgpu_device_ip_suspend_phase1(adev); 3980 if (r) 3981 return r; 3982 r = amdgpu_device_ip_suspend_phase2(adev); 3983 3984 if (amdgpu_sriov_vf(adev)) 3985 amdgpu_virt_release_full_gpu(adev, false); 3986 3987 return r; 3988 } 3989 3990 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3991 { 3992 int i, r; 3993 3994 static enum amd_ip_block_type ip_order[] = { 3995 AMD_IP_BLOCK_TYPE_COMMON, 3996 AMD_IP_BLOCK_TYPE_GMC, 3997 AMD_IP_BLOCK_TYPE_PSP, 3998 AMD_IP_BLOCK_TYPE_IH, 3999 }; 4000 4001 for (i = 0; i < adev->num_ip_blocks; i++) { 4002 int j; 4003 struct amdgpu_ip_block *block; 4004 4005 block = &adev->ip_blocks[i]; 4006 block->status.hw = false; 4007 4008 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 4009 4010 if (block->version->type != ip_order[j] || 4011 !block->status.valid) 4012 continue; 4013 4014 r = block->version->funcs->hw_init(&adev->ip_blocks[i]); 4015 if (r) { 4016 dev_err(adev->dev, "RE-INIT-early: %s failed\n", 4017 block->version->funcs->name); 4018 return r; 4019 } 4020 block->status.hw = true; 4021 } 4022 } 4023 4024 return 0; 4025 } 4026 4027 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 4028 { 4029 struct amdgpu_ip_block *block; 4030 int i, r = 0; 4031 4032 static enum amd_ip_block_type ip_order[] = { 4033 AMD_IP_BLOCK_TYPE_SMC, 4034 AMD_IP_BLOCK_TYPE_DCE, 4035 AMD_IP_BLOCK_TYPE_GFX, 4036 AMD_IP_BLOCK_TYPE_SDMA, 4037 AMD_IP_BLOCK_TYPE_MES, 4038 AMD_IP_BLOCK_TYPE_UVD, 4039 AMD_IP_BLOCK_TYPE_VCE, 4040 AMD_IP_BLOCK_TYPE_VCN, 4041 AMD_IP_BLOCK_TYPE_JPEG 4042 }; 4043 4044 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 4045 block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]); 4046 4047 if (!block) 4048 continue; 4049 4050 if (block->status.valid && !block->status.hw) { 4051 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) { 4052 r = amdgpu_ip_block_resume(block); 4053 } else { 4054 r = block->version->funcs->hw_init(block); 4055 } 4056 4057 if (r) { 4058 dev_err(adev->dev, "RE-INIT-late: %s failed\n", 4059 block->version->funcs->name); 4060 break; 4061 } 4062 block->status.hw = true; 4063 } 4064 } 4065 4066 return r; 4067 } 4068 4069 /** 4070 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 4071 * 4072 * @adev: amdgpu_device pointer 4073 * 4074 * First resume function for hardware IPs. The list of all the hardware 4075 * IPs that make up the asic is walked and the resume callbacks are run for 4076 * COMMON, GMC, and IH. resume puts the hardware into a functional state 4077 * after a suspend and updates the software state as necessary. This 4078 * function is also used for restoring the GPU after a GPU reset. 4079 * Returns 0 on success, negative error code on failure. 
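 * Under SR-IOV the PSP block is resumed in this phase as well, ahead of
 * the firmware loading step that runs before phase 2.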
4080 */ 4081 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 4082 { 4083 int i, r; 4084 4085 for (i = 0; i < adev->num_ip_blocks; i++) { 4086 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 4087 continue; 4088 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 4089 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 4090 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 4091 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 4092 4093 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 4094 if (r) 4095 return r; 4096 } 4097 } 4098 4099 return 0; 4100 } 4101 4102 /** 4103 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 4104 * 4105 * @adev: amdgpu_device pointer 4106 * 4107 * Second resume function for hardware IPs. The list of all the hardware 4108 * IPs that make up the asic is walked and the resume callbacks are run for 4109 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 4110 * functional state after a suspend and updates the software state as 4111 * necessary. This function is also used for restoring the GPU after a GPU 4112 * reset. 4113 * Returns 0 on success, negative error code on failure. 4114 */ 4115 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 4116 { 4117 int i, r; 4118 4119 for (i = 0; i < adev->num_ip_blocks; i++) { 4120 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 4121 continue; 4122 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 4123 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 4124 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 4125 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE || 4126 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 4127 continue; 4128 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 4129 if (r) 4130 return r; 4131 } 4132 4133 return 0; 4134 } 4135 4136 /** 4137 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs 4138 * 4139 * @adev: amdgpu_device pointer 4140 * 4141 * Third resume function for hardware IPs. The list of all the hardware 4142 * IPs that make up the asic is walked and the resume callbacks are run for 4143 * all DCE. resume puts the hardware into a functional state after a suspend 4144 * and updates the software state as necessary. This function is also used 4145 * for restoring the GPU after a GPU reset. 4146 * 4147 * Returns 0 on success, negative error code on failure. 4148 */ 4149 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev) 4150 { 4151 int i, r; 4152 4153 for (i = 0; i < adev->num_ip_blocks; i++) { 4154 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 4155 continue; 4156 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) { 4157 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 4158 if (r) 4159 return r; 4160 } 4161 } 4162 4163 return 0; 4164 } 4165 4166 /** 4167 * amdgpu_device_ip_resume - run resume for hardware IPs 4168 * 4169 * @adev: amdgpu_device pointer 4170 * 4171 * Main resume function for hardware IPs. The hardware IPs 4172 * are split into two resume functions because they are 4173 * also used in recovering from a GPU reset and some additional 4174 * steps need to be take between them. In this case (S3/S4) they are 4175 * run sequentially. 4176 * Returns 0 on success, negative error code on failure. 
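 * Firmware loading happens between phase 1 and phase 2; buffer functions
 * and the fence driver hardware state are restored before displays
 * (phase 3) are brought back up.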
4177 */ 4178 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 4179 { 4180 int r; 4181 4182 r = amdgpu_device_ip_resume_phase1(adev); 4183 if (r) 4184 return r; 4185 4186 r = amdgpu_device_fw_loading(adev); 4187 if (r) 4188 return r; 4189 4190 r = amdgpu_device_ip_resume_phase2(adev); 4191 4192 if (adev->mman.buffer_funcs_ring->sched.ready) 4193 amdgpu_ttm_set_buffer_funcs_status(adev, true); 4194 4195 if (r) 4196 return r; 4197 4198 amdgpu_fence_driver_hw_init(adev); 4199 4200 r = amdgpu_device_ip_resume_phase3(adev); 4201 4202 return r; 4203 } 4204 4205 /** 4206 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 4207 * 4208 * @adev: amdgpu_device pointer 4209 * 4210 * Query the VBIOS data tables to determine if the board supports SR-IOV. 4211 */ 4212 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 4213 { 4214 if (amdgpu_sriov_vf(adev)) { 4215 if (adev->is_atom_fw) { 4216 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 4217 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 4218 } else { 4219 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 4220 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 4221 } 4222 4223 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 4224 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 4225 } 4226 } 4227 4228 /** 4229 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 4230 * 4231 * @pdev : pci device context 4232 * @asic_type: AMD asic type 4233 * 4234 * Check if there is DC (new modesetting infrastructre) support for an asic. 4235 * returns true if DC has support, false if not. 4236 */ 4237 bool amdgpu_device_asic_has_dc_support(struct pci_dev *pdev, 4238 enum amd_asic_type asic_type) 4239 { 4240 switch (asic_type) { 4241 #ifdef CONFIG_DRM_AMDGPU_SI 4242 case CHIP_HAINAN: 4243 #endif 4244 case CHIP_TOPAZ: 4245 /* chips with no display hardware */ 4246 return false; 4247 #if defined(CONFIG_DRM_AMD_DC) 4248 case CHIP_TAHITI: 4249 case CHIP_PITCAIRN: 4250 case CHIP_VERDE: 4251 case CHIP_OLAND: 4252 /* 4253 * We have systems in the wild with these ASICs that require 4254 * LVDS and VGA support which is not supported with DC. 4255 * 4256 * Fallback to the non-DC driver here by default so as not to 4257 * cause regressions. 4258 */ 4259 #if defined(CONFIG_DRM_AMD_DC_SI) 4260 return amdgpu_dc > 0; 4261 #else 4262 return false; 4263 #endif 4264 case CHIP_KAVERI: 4265 case CHIP_KABINI: 4266 case CHIP_MULLINS: 4267 /* 4268 * We have systems in the wild with these ASICs that require 4269 * VGA support which is not supported with DC. 4270 * 4271 * Fallback to the non-DC driver here by default so as not to 4272 * cause regressions. 
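 * DC can still be opted into explicitly on these parts with the
 * amdgpu.dc=1 module parameter.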
4273 */ 4274 return amdgpu_dc > 0; 4275 default: 4276 return amdgpu_dc != 0; 4277 #else 4278 default: 4279 if (amdgpu_dc > 0) 4280 dev_info_once( 4281 &pdev->dev, 4282 "Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 4283 return false; 4284 #endif 4285 } 4286 } 4287 4288 /** 4289 * amdgpu_device_has_dc_support - check if dc is supported 4290 * 4291 * @adev: amdgpu_device pointer 4292 * 4293 * Returns true for supported, false for not supported 4294 */ 4295 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 4296 { 4297 if (adev->enable_virtual_display || 4298 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 4299 return false; 4300 4301 return amdgpu_device_asic_has_dc_support(adev->pdev, adev->asic_type); 4302 } 4303 4304 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 4305 { 4306 struct amdgpu_device *adev = 4307 container_of(__work, struct amdgpu_device, xgmi_reset_work); 4308 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 4309 4310 /* It's a bug to not have a hive within this function */ 4311 if (WARN_ON(!hive)) 4312 return; 4313 4314 /* 4315 * Use task barrier to synchronize all xgmi reset works across the 4316 * hive. task_barrier_enter and task_barrier_exit will block 4317 * until all the threads running the xgmi reset works reach 4318 * those points. task_barrier_full will do both blocks. 4319 */ 4320 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 4321 4322 task_barrier_enter(&hive->tb); 4323 adev->asic_reset_res = amdgpu_device_baco_enter(adev); 4324 4325 if (adev->asic_reset_res) 4326 goto fail; 4327 4328 task_barrier_exit(&hive->tb); 4329 adev->asic_reset_res = amdgpu_device_baco_exit(adev); 4330 4331 if (adev->asic_reset_res) 4332 goto fail; 4333 4334 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 4335 } else { 4336 4337 task_barrier_full(&hive->tb); 4338 adev->asic_reset_res = amdgpu_asic_reset(adev); 4339 } 4340 4341 fail: 4342 if (adev->asic_reset_res) 4343 dev_warn(adev->dev, 4344 "ASIC reset failed with error, %d for drm dev, %s", 4345 adev->asic_reset_res, adev_to_drm(adev)->unique); 4346 amdgpu_put_xgmi_hive(hive); 4347 } 4348 4349 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 4350 { 4351 char *input = amdgpu_lockup_timeout; 4352 char *timeout_setting = NULL; 4353 int index = 0; 4354 long timeout; 4355 int ret = 0; 4356 4357 /* By default timeout for all queues is 2 sec */ 4358 adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout = 4359 adev->video_timeout = msecs_to_jiffies(2000); 4360 4361 if (!strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) 4362 return 0; 4363 4364 while ((timeout_setting = strsep(&input, ",")) && 4365 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4366 ret = kstrtol(timeout_setting, 0, &timeout); 4367 if (ret) 4368 return ret; 4369 4370 if (timeout == 0) { 4371 index++; 4372 continue; 4373 } else if (timeout < 0) { 4374 timeout = MAX_SCHEDULE_TIMEOUT; 4375 dev_warn(adev->dev, "lockup timeout disabled"); 4376 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 4377 } else { 4378 timeout = msecs_to_jiffies(timeout); 4379 } 4380 4381 switch (index++) { 4382 case 0: 4383 adev->gfx_timeout = timeout; 4384 break; 4385 case 1: 4386 adev->compute_timeout = timeout; 4387 break; 4388 case 2: 4389 adev->sdma_timeout = timeout; 4390 break; 4391 case 3: 4392 adev->video_timeout = timeout; 4393 break; 4394 default: 4395 break; 4396 } 4397 } 4398 4399 /* When only one value specified apply it 
to all queues. */ 4400 if (index == 1) 4401 adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout = 4402 adev->video_timeout = timeout; 4403 4404 return ret; 4405 } 4406 4407 /** 4408 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 4409 * 4410 * @adev: amdgpu_device pointer 4411 * 4412 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 4413 */ 4414 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 4415 { 4416 struct iommu_domain *domain; 4417 4418 domain = iommu_get_domain_for_dev(adev->dev); 4419 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 4420 adev->ram_is_direct_mapped = true; 4421 } 4422 4423 #if defined(CONFIG_HSA_AMD_P2P) 4424 /** 4425 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled. 4426 * 4427 * @adev: amdgpu_device pointer 4428 * 4429 * return if IOMMU remapping bar address 4430 */ 4431 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev) 4432 { 4433 struct iommu_domain *domain; 4434 4435 domain = iommu_get_domain_for_dev(adev->dev); 4436 if (domain && (domain->type == IOMMU_DOMAIN_DMA || 4437 domain->type == IOMMU_DOMAIN_DMA_FQ)) 4438 return true; 4439 4440 return false; 4441 } 4442 #endif 4443 4444 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 4445 { 4446 if (amdgpu_mcbp == 1) 4447 adev->gfx.mcbp = true; 4448 else if (amdgpu_mcbp == 0) 4449 adev->gfx.mcbp = false; 4450 4451 if (amdgpu_sriov_vf(adev)) 4452 adev->gfx.mcbp = true; 4453 4454 if (adev->gfx.mcbp) 4455 dev_info(adev->dev, "MCBP is enabled\n"); 4456 } 4457 4458 static int amdgpu_device_sys_interface_init(struct amdgpu_device *adev) 4459 { 4460 int r; 4461 4462 r = amdgpu_atombios_sysfs_init(adev); 4463 if (r) 4464 drm_err(&adev->ddev, 4465 "registering atombios sysfs failed (%d).\n", r); 4466 4467 r = amdgpu_pm_sysfs_init(adev); 4468 if (r) 4469 dev_err(adev->dev, "registering pm sysfs failed (%d).\n", r); 4470 4471 r = amdgpu_ucode_sysfs_init(adev); 4472 if (r) { 4473 adev->ucode_sysfs_en = false; 4474 dev_err(adev->dev, "Creating firmware sysfs failed (%d).\n", r); 4475 } else 4476 adev->ucode_sysfs_en = true; 4477 4478 r = amdgpu_device_attr_sysfs_init(adev); 4479 if (r) 4480 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4481 4482 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4483 if (r) 4484 dev_err(adev->dev, 4485 "Could not create amdgpu board attributes\n"); 4486 4487 amdgpu_fru_sysfs_init(adev); 4488 amdgpu_reg_state_sysfs_init(adev); 4489 amdgpu_xcp_sysfs_init(adev); 4490 4491 return r; 4492 } 4493 4494 static void amdgpu_device_sys_interface_fini(struct amdgpu_device *adev) 4495 { 4496 if (adev->pm.sysfs_initialized) 4497 amdgpu_pm_sysfs_fini(adev); 4498 if (adev->ucode_sysfs_en) 4499 amdgpu_ucode_sysfs_fini(adev); 4500 amdgpu_device_attr_sysfs_fini(adev); 4501 amdgpu_fru_sysfs_fini(adev); 4502 4503 amdgpu_reg_state_sysfs_fini(adev); 4504 amdgpu_xcp_sysfs_fini(adev); 4505 } 4506 4507 /** 4508 * amdgpu_device_init - initialize the driver 4509 * 4510 * @adev: amdgpu_device pointer 4511 * @flags: driver flags 4512 * 4513 * Initializes the driver info and hw (all asics). 4514 * Returns 0 for success or an error on failure. 4515 * Called at driver startup. 
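 * Register accessor callbacks start out pointing at the amdgpu_invalid_*
 * stubs so that stray register accesses are caught until the ASIC-specific
 * code installs the real implementations.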
4516 */ 4517 int amdgpu_device_init(struct amdgpu_device *adev, 4518 uint32_t flags) 4519 { 4520 struct pci_dev *pdev = adev->pdev; 4521 int r, i; 4522 bool px = false; 4523 u32 max_MBps; 4524 int tmp; 4525 4526 adev->shutdown = false; 4527 adev->flags = flags; 4528 4529 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 4530 adev->asic_type = amdgpu_force_asic_type; 4531 else 4532 adev->asic_type = flags & AMD_ASIC_MASK; 4533 4534 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 4535 if (amdgpu_emu_mode == 1) 4536 adev->usec_timeout *= 10; 4537 adev->gmc.gart_size = 512 * 1024 * 1024; 4538 adev->accel_working = false; 4539 adev->num_rings = 0; 4540 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 4541 adev->mman.buffer_funcs = NULL; 4542 adev->mman.buffer_funcs_ring = NULL; 4543 adev->vm_manager.vm_pte_funcs = NULL; 4544 adev->vm_manager.vm_pte_num_scheds = 0; 4545 adev->gmc.gmc_funcs = NULL; 4546 adev->harvest_ip_mask = 0x0; 4547 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 4548 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 4549 4550 adev->smc_rreg = &amdgpu_invalid_rreg; 4551 adev->smc_wreg = &amdgpu_invalid_wreg; 4552 adev->pcie_rreg = &amdgpu_invalid_rreg; 4553 adev->pcie_wreg = &amdgpu_invalid_wreg; 4554 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 4555 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 4556 adev->pciep_rreg = &amdgpu_invalid_rreg; 4557 adev->pciep_wreg = &amdgpu_invalid_wreg; 4558 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 4559 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 4560 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 4561 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 4562 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 4563 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 4564 adev->didt_rreg = &amdgpu_invalid_rreg; 4565 adev->didt_wreg = &amdgpu_invalid_wreg; 4566 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 4567 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 4568 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 4569 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 4570 4571 dev_info( 4572 adev->dev, 4573 "initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 4574 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 4575 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 4576 4577 /* mutex initialization are all done here so we 4578 * can recall function without having locking issues 4579 */ 4580 mutex_init(&adev->firmware.mutex); 4581 mutex_init(&adev->pm.mutex); 4582 mutex_init(&adev->gfx.gpu_clock_mutex); 4583 mutex_init(&adev->srbm_mutex); 4584 mutex_init(&adev->gfx.pipe_reserve_mutex); 4585 mutex_init(&adev->gfx.gfx_off_mutex); 4586 mutex_init(&adev->gfx.partition_mutex); 4587 mutex_init(&adev->grbm_idx_mutex); 4588 mutex_init(&adev->mn_lock); 4589 mutex_init(&adev->virt.vf_errors.lock); 4590 hash_init(adev->mn_hash); 4591 mutex_init(&adev->psp.mutex); 4592 mutex_init(&adev->notifier_lock); 4593 mutex_init(&adev->pm.stable_pstate_ctx_lock); 4594 mutex_init(&adev->benchmark_mutex); 4595 mutex_init(&adev->gfx.reset_sem_mutex); 4596 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */ 4597 mutex_init(&adev->enforce_isolation_mutex); 4598 for (i = 0; i < MAX_XCP; ++i) { 4599 adev->isolation[i].spearhead = dma_fence_get_stub(); 4600 amdgpu_sync_create(&adev->isolation[i].active); 4601 amdgpu_sync_create(&adev->isolation[i].prev); 4602 } 4603 mutex_init(&adev->gfx.userq_sch_mutex); 4604 
mutex_init(&adev->gfx.workload_profile_mutex); 4605 mutex_init(&adev->vcn.workload_profile_mutex); 4606 4607 amdgpu_device_init_apu_flags(adev); 4608 4609 r = amdgpu_device_check_arguments(adev); 4610 if (r) 4611 return r; 4612 4613 spin_lock_init(&adev->mmio_idx_lock); 4614 spin_lock_init(&adev->smc_idx_lock); 4615 spin_lock_init(&adev->pcie_idx_lock); 4616 spin_lock_init(&adev->uvd_ctx_idx_lock); 4617 spin_lock_init(&adev->didt_idx_lock); 4618 spin_lock_init(&adev->gc_cac_idx_lock); 4619 spin_lock_init(&adev->se_cac_idx_lock); 4620 spin_lock_init(&adev->audio_endpt_idx_lock); 4621 spin_lock_init(&adev->mm_stats.lock); 4622 spin_lock_init(&adev->virt.rlcg_reg_lock); 4623 spin_lock_init(&adev->wb.lock); 4624 4625 xa_init_flags(&adev->userq_xa, XA_FLAGS_LOCK_IRQ); 4626 4627 INIT_LIST_HEAD(&adev->reset_list); 4628 4629 INIT_LIST_HEAD(&adev->ras_list); 4630 4631 INIT_LIST_HEAD(&adev->pm.od_kobj_list); 4632 4633 xa_init(&adev->userq_doorbell_xa); 4634 4635 INIT_DELAYED_WORK(&adev->delayed_init_work, 4636 amdgpu_device_delayed_init_work_handler); 4637 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 4638 amdgpu_device_delay_enable_gfx_off); 4639 /* 4640 * Initialize the enforce_isolation work structures for each XCP 4641 * partition. This work handler is responsible for enforcing shader 4642 * isolation on AMD GPUs. It counts the number of emitted fences for 4643 * each GFX and compute ring. If there are any fences, it schedules 4644 * the `enforce_isolation_work` to be run after a delay. If there are 4645 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the 4646 * runqueue. 4647 */ 4648 for (i = 0; i < MAX_XCP; i++) { 4649 INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work, 4650 amdgpu_gfx_enforce_isolation_handler); 4651 adev->gfx.enforce_isolation[i].adev = adev; 4652 adev->gfx.enforce_isolation[i].xcp_id = i; 4653 } 4654 4655 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 4656 INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work); 4657 4658 adev->gfx.gfx_off_req_count = 1; 4659 adev->gfx.gfx_off_residency = 0; 4660 adev->gfx.gfx_off_entrycount = 0; 4661 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 4662 4663 atomic_set(&adev->throttling_logging_enabled, 1); 4664 /* 4665 * If throttling continues, logging will be performed every minute 4666 * to avoid log flooding. "-1" is subtracted since the thermal 4667 * throttling interrupt comes every second. Thus, the total logging 4668 * interval is 59 seconds(retelimited printk interval) + 1(waiting 4669 * for throttling interrupt) = 60 seconds. 
4670 */ 4671 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 4672 4673 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 4674 4675 /* Registers mapping */ 4676 /* TODO: block userspace mapping of io register */ 4677 if (adev->asic_type >= CHIP_BONAIRE) { 4678 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 4679 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 4680 } else { 4681 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 4682 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 4683 } 4684 4685 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 4686 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 4687 4688 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 4689 if (!adev->rmmio) 4690 return -ENOMEM; 4691 4692 dev_info(adev->dev, "register mmio base: 0x%08X\n", 4693 (uint32_t)adev->rmmio_base); 4694 dev_info(adev->dev, "register mmio size: %u\n", 4695 (unsigned int)adev->rmmio_size); 4696 4697 /* 4698 * Reset domain needs to be present early, before XGMI hive discovered 4699 * (if any) and initialized to use reset sem and in_gpu reset flag 4700 * early on during init and before calling to RREG32. 4701 */ 4702 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 4703 if (!adev->reset_domain) 4704 return -ENOMEM; 4705 4706 /* detect hw virtualization here */ 4707 amdgpu_virt_init(adev); 4708 4709 amdgpu_device_get_pcie_info(adev); 4710 4711 r = amdgpu_device_get_job_timeout_settings(adev); 4712 if (r) { 4713 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 4714 return r; 4715 } 4716 4717 amdgpu_device_set_mcbp(adev); 4718 4719 /* 4720 * By default, use default mode where all blocks are expected to be 4721 * initialized. At present a 'swinit' of blocks is required to be 4722 * completed before the need for a different level is detected. 4723 */ 4724 amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT); 4725 /* early init functions */ 4726 r = amdgpu_device_ip_early_init(adev); 4727 if (r) 4728 return r; 4729 4730 /* 4731 * No need to remove conflicting FBs for non-display class devices. 4732 * This prevents the sysfb from being freed accidently. 4733 */ 4734 if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA || 4735 (pdev->class >> 8) == PCI_CLASS_DISPLAY_OTHER) { 4736 /* Get rid of things like offb */ 4737 r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name); 4738 if (r) 4739 return r; 4740 } 4741 4742 /* Enable TMZ based on IP_VERSION */ 4743 amdgpu_gmc_tmz_set(adev); 4744 4745 if (amdgpu_sriov_vf(adev) && 4746 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0)) 4747 /* VF MMIO access (except mailbox range) from CPU 4748 * will be blocked during sriov runtime 4749 */ 4750 adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT; 4751 4752 amdgpu_gmc_noretry_set(adev); 4753 /* Need to get xgmi info early to decide the reset behavior*/ 4754 if (adev->gmc.xgmi.supported) { 4755 r = adev->gfxhub.funcs->get_xgmi_info(adev); 4756 if (r) 4757 return r; 4758 } 4759 4760 /* enable PCIE atomic ops */ 4761 if (amdgpu_sriov_vf(adev)) { 4762 if (adev->virt.fw_reserve.p_pf2vf) 4763 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 4764 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 4765 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4766 /* APUs w/ gfx9 onwards doesn't reply on PCIe atomics, rather it is a 4767 * internal path natively support atomics, set have_atomics_support to true. 
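 * In other words, APUs with GFX9 and newer do not rely on PCIe atomics;
 * their internal path supports atomics natively.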
4768 */ 4769 } else if ((adev->flags & AMD_IS_APU) && 4770 (amdgpu_ip_version(adev, GC_HWIP, 0) > 4771 IP_VERSION(9, 0, 0))) { 4772 adev->have_atomics_support = true; 4773 } else { 4774 adev->have_atomics_support = 4775 !pci_enable_atomic_ops_to_root(adev->pdev, 4776 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 4777 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4778 } 4779 4780 if (!adev->have_atomics_support) 4781 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 4782 4783 /* doorbell bar mapping and doorbell index init*/ 4784 amdgpu_doorbell_init(adev); 4785 4786 if (amdgpu_emu_mode == 1) { 4787 /* post the asic on emulation mode */ 4788 emu_soc_asic_init(adev); 4789 goto fence_driver_init; 4790 } 4791 4792 amdgpu_reset_init(adev); 4793 4794 /* detect if we are with an SRIOV vbios */ 4795 if (adev->bios) 4796 amdgpu_device_detect_sriov_bios(adev); 4797 4798 /* check if we need to reset the asic 4799 * E.g., driver was not cleanly unloaded previously, etc. 4800 */ 4801 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4802 if (adev->gmc.xgmi.num_physical_nodes) { 4803 dev_info(adev->dev, "Pending hive reset.\n"); 4804 amdgpu_set_init_level(adev, 4805 AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 4806 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && 4807 !amdgpu_device_has_display_hardware(adev)) { 4808 r = psp_gpu_reset(adev); 4809 } else { 4810 tmp = amdgpu_reset_method; 4811 /* It should do a default reset when loading or reloading the driver, 4812 * regardless of the module parameter reset_method. 4813 */ 4814 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4815 r = amdgpu_asic_reset(adev); 4816 amdgpu_reset_method = tmp; 4817 } 4818 4819 if (r) { 4820 dev_err(adev->dev, "asic reset on init failed\n"); 4821 goto failed; 4822 } 4823 } 4824 4825 /* Post card if necessary */ 4826 if (amdgpu_device_need_post(adev)) { 4827 if (!adev->bios) { 4828 dev_err(adev->dev, "no vBIOS found\n"); 4829 r = -EINVAL; 4830 goto failed; 4831 } 4832 dev_info(adev->dev, "GPU posting now...\n"); 4833 r = amdgpu_device_asic_init(adev); 4834 if (r) { 4835 dev_err(adev->dev, "gpu post error!\n"); 4836 goto failed; 4837 } 4838 } 4839 4840 if (adev->bios) { 4841 if (adev->is_atom_fw) { 4842 /* Initialize clocks */ 4843 r = amdgpu_atomfirmware_get_clock_info(adev); 4844 if (r) { 4845 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4846 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4847 goto failed; 4848 } 4849 } else { 4850 /* Initialize clocks */ 4851 r = amdgpu_atombios_get_clock_info(adev); 4852 if (r) { 4853 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4854 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4855 goto failed; 4856 } 4857 /* init i2c buses */ 4858 amdgpu_i2c_init(adev); 4859 } 4860 } 4861 4862 fence_driver_init: 4863 /* Fence driver */ 4864 r = amdgpu_fence_driver_sw_init(adev); 4865 if (r) { 4866 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4867 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4868 goto failed; 4869 } 4870 4871 /* init the mode config */ 4872 drm_mode_config_init(adev_to_drm(adev)); 4873 4874 r = amdgpu_device_ip_init(adev); 4875 if (r) { 4876 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4877 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4878 goto release_ras_con; 4879 } 4880 4881 amdgpu_fence_driver_hw_init(adev); 4882 4883 dev_info(adev->dev, 4884 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4885 
adev->gfx.config.max_shader_engines, 4886 adev->gfx.config.max_sh_per_se, 4887 adev->gfx.config.max_cu_per_sh, 4888 adev->gfx.cu_info.number); 4889 4890 adev->accel_working = true; 4891 4892 amdgpu_vm_check_compute_bug(adev); 4893 4894 /* Initialize the buffer migration limit. */ 4895 if (amdgpu_moverate >= 0) 4896 max_MBps = amdgpu_moverate; 4897 else 4898 max_MBps = 8; /* Allow 8 MB/s. */ 4899 /* Get a log2 for easy divisions. */ 4900 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4901 4902 /* 4903 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4904 * Otherwise the mgpu fan boost feature will be skipped due to the 4905 * gpu instance is counted less. 4906 */ 4907 amdgpu_register_gpu_instance(adev); 4908 4909 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4910 * explicit gating rather than handling it automatically. 4911 */ 4912 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 4913 r = amdgpu_device_ip_late_init(adev); 4914 if (r) { 4915 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4916 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4917 goto release_ras_con; 4918 } 4919 /* must succeed. */ 4920 amdgpu_ras_resume(adev); 4921 queue_delayed_work(system_wq, &adev->delayed_init_work, 4922 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4923 } 4924 4925 if (amdgpu_sriov_vf(adev)) { 4926 amdgpu_virt_release_full_gpu(adev, true); 4927 flush_delayed_work(&adev->delayed_init_work); 4928 } 4929 4930 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 4931 amdgpu_xgmi_reset_on_init(adev); 4932 /* 4933 * Place those sysfs registering after `late_init`. As some of those 4934 * operations performed in `late_init` might affect the sysfs 4935 * interfaces creating. 4936 */ 4937 r = amdgpu_device_sys_interface_init(adev); 4938 4939 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4940 r = amdgpu_pmu_init(adev); 4941 if (r) 4942 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4943 4944 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4945 if (amdgpu_device_cache_pci_state(adev->pdev)) 4946 pci_restore_state(pdev); 4947 4948 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4949 /* this will fail for cards that aren't VGA class devices, just 4950 * ignore it 4951 */ 4952 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4953 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4954 4955 px = amdgpu_device_supports_px(adev); 4956 4957 if (px || (!dev_is_removable(&adev->pdev->dev) && 4958 apple_gmux_detect(NULL, NULL))) 4959 vga_switcheroo_register_client(adev->pdev, 4960 &amdgpu_switcheroo_ops, px); 4961 4962 if (px) 4963 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4964 4965 amdgpu_device_check_iommu_direct_map(adev); 4966 4967 adev->pm_nb.notifier_call = amdgpu_device_pm_notifier; 4968 r = register_pm_notifier(&adev->pm_nb); 4969 if (r) 4970 goto failed; 4971 4972 return 0; 4973 4974 release_ras_con: 4975 if (amdgpu_sriov_vf(adev)) 4976 amdgpu_virt_release_full_gpu(adev, true); 4977 4978 /* failed in exclusive mode due to timeout */ 4979 if (amdgpu_sriov_vf(adev) && 4980 !amdgpu_sriov_runtime(adev) && 4981 amdgpu_virt_mmio_blocked(adev) && 4982 !amdgpu_virt_wait_reset(adev)) { 4983 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4984 /* Don't send request since VF is inactive. 
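 * Clearing the runtime cap and the virt ops below ensures no further
 * requests are sent to the host while the VF is inactive; init then
 * fails with -EAGAIN.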
*/ 4985 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4986 adev->virt.ops = NULL; 4987 r = -EAGAIN; 4988 } 4989 amdgpu_release_ras_context(adev); 4990 4991 failed: 4992 amdgpu_vf_error_trans_all(adev); 4993 4994 return r; 4995 } 4996 4997 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4998 { 4999 5000 /* Clear all CPU mappings pointing to this device */ 5001 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 5002 5003 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 5004 amdgpu_doorbell_fini(adev); 5005 5006 iounmap(adev->rmmio); 5007 adev->rmmio = NULL; 5008 if (adev->mman.aper_base_kaddr) 5009 iounmap(adev->mman.aper_base_kaddr); 5010 adev->mman.aper_base_kaddr = NULL; 5011 5012 /* Memory manager related */ 5013 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 5014 arch_phys_wc_del(adev->gmc.vram_mtrr); 5015 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 5016 } 5017 } 5018 5019 /** 5020 * amdgpu_device_fini_hw - tear down the driver 5021 * 5022 * @adev: amdgpu_device pointer 5023 * 5024 * Tear down the driver info (all asics). 5025 * Called at driver shutdown. 5026 */ 5027 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 5028 { 5029 dev_info(adev->dev, "amdgpu: finishing device.\n"); 5030 flush_delayed_work(&adev->delayed_init_work); 5031 5032 if (adev->mman.initialized) 5033 drain_workqueue(adev->mman.bdev.wq); 5034 adev->shutdown = true; 5035 5036 unregister_pm_notifier(&adev->pm_nb); 5037 5038 /* make sure IB test finished before entering exclusive mode 5039 * to avoid preemption on IB test 5040 */ 5041 if (amdgpu_sriov_vf(adev)) { 5042 amdgpu_virt_request_full_gpu(adev, false); 5043 amdgpu_virt_fini_data_exchange(adev); 5044 } 5045 5046 /* disable all interrupts */ 5047 amdgpu_irq_disable_all(adev); 5048 if (adev->mode_info.mode_config_initialized) { 5049 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 5050 drm_helper_force_disable_all(adev_to_drm(adev)); 5051 else 5052 drm_atomic_helper_shutdown(adev_to_drm(adev)); 5053 } 5054 amdgpu_fence_driver_hw_fini(adev); 5055 5056 amdgpu_device_sys_interface_fini(adev); 5057 5058 /* disable ras feature must before hw fini */ 5059 amdgpu_ras_pre_fini(adev); 5060 5061 amdgpu_ttm_set_buffer_funcs_status(adev, false); 5062 5063 amdgpu_device_ip_fini_early(adev); 5064 5065 amdgpu_irq_fini_hw(adev); 5066 5067 if (adev->mman.initialized) 5068 ttm_device_clear_dma_mappings(&adev->mman.bdev); 5069 5070 amdgpu_gart_dummy_page_fini(adev); 5071 5072 if (drm_dev_is_unplugged(adev_to_drm(adev))) 5073 amdgpu_device_unmap_mmio(adev); 5074 5075 } 5076 5077 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 5078 { 5079 int i, idx; 5080 bool px; 5081 5082 amdgpu_device_ip_fini(adev); 5083 amdgpu_fence_driver_sw_fini(adev); 5084 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 5085 adev->accel_working = false; 5086 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 5087 for (i = 0; i < MAX_XCP; ++i) { 5088 dma_fence_put(adev->isolation[i].spearhead); 5089 amdgpu_sync_free(&adev->isolation[i].active); 5090 amdgpu_sync_free(&adev->isolation[i].prev); 5091 } 5092 5093 amdgpu_reset_fini(adev); 5094 5095 /* free i2c buses */ 5096 amdgpu_i2c_fini(adev); 5097 5098 if (adev->bios) { 5099 if (amdgpu_emu_mode != 1) 5100 amdgpu_atombios_fini(adev); 5101 amdgpu_bios_release(adev); 5102 } 5103 5104 kfree(adev->fru_info); 5105 adev->fru_info = NULL; 5106 5107 kfree(adev->xcp_mgr); 5108 adev->xcp_mgr = NULL; 5109 5110 px = amdgpu_device_supports_px(adev); 5111 5112 if 
(px || (!dev_is_removable(&adev->pdev->dev) && 5113 apple_gmux_detect(NULL, NULL))) 5114 vga_switcheroo_unregister_client(adev->pdev); 5115 5116 if (px) 5117 vga_switcheroo_fini_domain_pm_ops(adev->dev); 5118 5119 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 5120 vga_client_unregister(adev->pdev); 5121 5122 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 5123 5124 iounmap(adev->rmmio); 5125 adev->rmmio = NULL; 5126 drm_dev_exit(idx); 5127 } 5128 5129 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 5130 amdgpu_pmu_fini(adev); 5131 if (adev->discovery.bin) 5132 amdgpu_discovery_fini(adev); 5133 5134 amdgpu_reset_put_reset_domain(adev->reset_domain); 5135 adev->reset_domain = NULL; 5136 5137 kfree(adev->pci_state); 5138 kfree(adev->pcie_reset_ctx.swds_pcistate); 5139 kfree(adev->pcie_reset_ctx.swus_pcistate); 5140 } 5141 5142 /** 5143 * amdgpu_device_evict_resources - evict device resources 5144 * @adev: amdgpu device object 5145 * 5146 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 5147 * of the vram memory type. Mainly used for evicting device resources 5148 * at suspend time. 5149 * 5150 */ 5151 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 5152 { 5153 int ret; 5154 5155 /* No need to evict vram on APUs unless going to S4 */ 5156 if (!adev->in_s4 && (adev->flags & AMD_IS_APU)) 5157 return 0; 5158 5159 /* No need to evict when going to S5 through S4 callbacks */ 5160 if (system_state == SYSTEM_POWER_OFF) 5161 return 0; 5162 5163 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 5164 if (ret) { 5165 dev_warn(adev->dev, "evicting device resources failed\n"); 5166 return ret; 5167 } 5168 5169 if (adev->in_s4) { 5170 ret = ttm_device_prepare_hibernation(&adev->mman.bdev); 5171 if (ret) 5172 dev_err(adev->dev, "prepare hibernation failed, %d\n", ret); 5173 } 5174 return ret; 5175 } 5176 5177 /* 5178 * Suspend & resume. 5179 */ 5180 /** 5181 * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events 5182 * @nb: notifier block 5183 * @mode: suspend mode 5184 * @data: data 5185 * 5186 * This function is called when the system is about to suspend or hibernate. 5187 * It is used to set the appropriate flags so that eviction can be optimized 5188 * in the pm prepare callback. 5189 */ 5190 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 5191 void *data) 5192 { 5193 struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb); 5194 5195 switch (mode) { 5196 case PM_HIBERNATION_PREPARE: 5197 adev->in_s4 = true; 5198 break; 5199 case PM_POST_HIBERNATION: 5200 adev->in_s4 = false; 5201 break; 5202 } 5203 5204 return NOTIFY_DONE; 5205 } 5206 5207 /** 5208 * amdgpu_device_prepare - prepare for device suspend 5209 * 5210 * @dev: drm dev pointer 5211 * 5212 * Prepare to put the hw in the suspend state (all asics). 5213 * Returns 0 for success or an error on failure. 5214 * Called at driver suspend. 
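 * The bulk of the buffer objects is evicted and any pending GFXOFF delay
 * work is flushed before the per-IP prepare_suspend callbacks run.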
5215 */ 5216 int amdgpu_device_prepare(struct drm_device *dev) 5217 { 5218 struct amdgpu_device *adev = drm_to_adev(dev); 5219 int i, r; 5220 5221 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5222 return 0; 5223 5224 /* Evict the majority of BOs before starting suspend sequence */ 5225 r = amdgpu_device_evict_resources(adev); 5226 if (r) 5227 return r; 5228 5229 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 5230 5231 for (i = 0; i < adev->num_ip_blocks; i++) { 5232 if (!adev->ip_blocks[i].status.valid) 5233 continue; 5234 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 5235 continue; 5236 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]); 5237 if (r) 5238 return r; 5239 } 5240 5241 return 0; 5242 } 5243 5244 /** 5245 * amdgpu_device_complete - complete power state transition 5246 * 5247 * @dev: drm dev pointer 5248 * 5249 * Undo the changes from amdgpu_device_prepare. This will be 5250 * called on all resume transitions, including those that failed. 5251 */ 5252 void amdgpu_device_complete(struct drm_device *dev) 5253 { 5254 struct amdgpu_device *adev = drm_to_adev(dev); 5255 int i; 5256 5257 for (i = 0; i < adev->num_ip_blocks; i++) { 5258 if (!adev->ip_blocks[i].status.valid) 5259 continue; 5260 if (!adev->ip_blocks[i].version->funcs->complete) 5261 continue; 5262 adev->ip_blocks[i].version->funcs->complete(&adev->ip_blocks[i]); 5263 } 5264 } 5265 5266 /** 5267 * amdgpu_device_suspend - initiate device suspend 5268 * 5269 * @dev: drm dev pointer 5270 * @notify_clients: notify in-kernel DRM clients 5271 * 5272 * Puts the hw in the suspend state (all asics). 5273 * Returns 0 for success or an error on failure. 5274 * Called at driver suspend. 5275 */ 5276 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) 5277 { 5278 struct amdgpu_device *adev = drm_to_adev(dev); 5279 int r, rec; 5280 5281 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5282 return 0; 5283 5284 adev->in_suspend = true; 5285 5286 if (amdgpu_sriov_vf(adev)) { 5287 if (!adev->in_runpm) 5288 amdgpu_amdkfd_suspend_process(adev); 5289 amdgpu_virt_fini_data_exchange(adev); 5290 r = amdgpu_virt_request_full_gpu(adev, false); 5291 if (r) 5292 return r; 5293 } 5294 5295 r = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D3); 5296 if (r) 5297 goto unwind_sriov; 5298 5299 if (notify_clients) 5300 drm_client_dev_suspend(adev_to_drm(adev)); 5301 5302 cancel_delayed_work_sync(&adev->delayed_init_work); 5303 5304 amdgpu_ras_suspend(adev); 5305 5306 r = amdgpu_device_ip_suspend_phase1(adev); 5307 if (r) 5308 goto unwind_smartshift; 5309 5310 amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); 5311 r = amdgpu_userq_suspend(adev); 5312 if (r) 5313 goto unwind_ip_phase1; 5314 5315 r = amdgpu_device_evict_resources(adev); 5316 if (r) 5317 goto unwind_userq; 5318 5319 amdgpu_ttm_set_buffer_funcs_status(adev, false); 5320 5321 amdgpu_fence_driver_hw_fini(adev); 5322 5323 r = amdgpu_device_ip_suspend_phase2(adev); 5324 if (r) 5325 goto unwind_evict; 5326 5327 if (amdgpu_sriov_vf(adev)) 5328 amdgpu_virt_release_full_gpu(adev, false); 5329 5330 return 0; 5331 5332 unwind_evict: 5333 if (adev->mman.buffer_funcs_ring->sched.ready) 5334 amdgpu_ttm_set_buffer_funcs_status(adev, true); 5335 amdgpu_fence_driver_hw_init(adev); 5336 5337 unwind_userq: 5338 rec = amdgpu_userq_resume(adev); 5339 if (rec) { 5340 dev_warn(adev->dev, "failed to re-initialize user queues: %d\n", rec); 5341 return r; 5342 } 5343 rec = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) 
&& !adev->in_runpm); 5344 if (rec) { 5345 dev_warn(adev->dev, "failed to re-initialize kfd: %d\n", rec); 5346 return r; 5347 } 5348 5349 unwind_ip_phase1: 5350 /* suspend phase 1 = resume phase 3 */ 5351 rec = amdgpu_device_ip_resume_phase3(adev); 5352 if (rec) { 5353 dev_warn(adev->dev, "failed to re-initialize IPs phase1: %d\n", rec); 5354 return r; 5355 } 5356 5357 unwind_smartshift: 5358 rec = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0); 5359 if (rec) { 5360 dev_warn(adev->dev, "failed to re-update smart shift: %d\n", rec); 5361 return r; 5362 } 5363 5364 if (notify_clients) 5365 drm_client_dev_resume(adev_to_drm(adev)); 5366 5367 amdgpu_ras_resume(adev); 5368 5369 unwind_sriov: 5370 if (amdgpu_sriov_vf(adev)) { 5371 rec = amdgpu_virt_request_full_gpu(adev, true); 5372 if (rec) { 5373 dev_warn(adev->dev, "failed to reinitialize sriov: %d\n", rec); 5374 return r; 5375 } 5376 } 5377 5378 adev->in_suspend = adev->in_s0ix = adev->in_s3 = false; 5379 5380 return r; 5381 } 5382 5383 static inline int amdgpu_virt_resume(struct amdgpu_device *adev) 5384 { 5385 int r; 5386 unsigned int prev_physical_node_id = adev->gmc.xgmi.physical_node_id; 5387 5388 /* During VM resume, QEMU programming of VF MSIX table (register GFXMSIX_VECT0_ADDR_LO) 5389 * may not work. The access could be blocked by nBIF protection as VF isn't in 5390 * exclusive access mode. Exclusive access is enabled now, disable/enable MSIX 5391 * so that QEMU reprograms MSIX table. 5392 */ 5393 amdgpu_restore_msix(adev); 5394 5395 r = adev->gfxhub.funcs->get_xgmi_info(adev); 5396 if (r) 5397 return r; 5398 5399 dev_info(adev->dev, "xgmi node, old id %d, new id %d\n", 5400 prev_physical_node_id, adev->gmc.xgmi.physical_node_id); 5401 5402 adev->vm_manager.vram_base_offset = adev->gfxhub.funcs->get_mc_fb_offset(adev); 5403 adev->vm_manager.vram_base_offset += 5404 adev->gmc.xgmi.physical_node_id * adev->gmc.xgmi.node_segment_size; 5405 5406 return 0; 5407 } 5408 5409 /** 5410 * amdgpu_device_resume - initiate device resume 5411 * 5412 * @dev: drm dev pointer 5413 * @notify_clients: notify in-kernel DRM clients 5414 * 5415 * Bring the hw back to operating state (all asics). 5416 * Returns 0 for success or an error on failure. 5417 * Called at driver resume. 
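 * For SR-IOV, full GPU access is requested up front and released again on
 * the common exit path, whether or not the IP resume succeeded.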
5418 */ 5419 int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) 5420 { 5421 struct amdgpu_device *adev = drm_to_adev(dev); 5422 int r = 0; 5423 5424 if (amdgpu_sriov_vf(adev)) { 5425 r = amdgpu_virt_request_full_gpu(adev, true); 5426 if (r) 5427 return r; 5428 } 5429 5430 if (amdgpu_virt_xgmi_migrate_enabled(adev)) { 5431 r = amdgpu_virt_resume(adev); 5432 if (r) 5433 goto exit; 5434 } 5435 5436 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5437 return 0; 5438 5439 if (adev->in_s0ix) 5440 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 5441 5442 /* post card */ 5443 if (amdgpu_device_need_post(adev)) { 5444 r = amdgpu_device_asic_init(adev); 5445 if (r) 5446 dev_err(adev->dev, "amdgpu asic init failed\n"); 5447 } 5448 5449 r = amdgpu_device_ip_resume(adev); 5450 5451 if (r) { 5452 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 5453 goto exit; 5454 } 5455 5456 r = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); 5457 if (r) 5458 goto exit; 5459 5460 r = amdgpu_userq_resume(adev); 5461 if (r) 5462 goto exit; 5463 5464 r = amdgpu_device_ip_late_init(adev); 5465 if (r) 5466 goto exit; 5467 5468 queue_delayed_work(system_wq, &adev->delayed_init_work, 5469 msecs_to_jiffies(AMDGPU_RESUME_MS)); 5470 exit: 5471 if (amdgpu_sriov_vf(adev)) { 5472 amdgpu_virt_init_data_exchange(adev); 5473 amdgpu_virt_release_full_gpu(adev, true); 5474 5475 if (!r && !adev->in_runpm) 5476 r = amdgpu_amdkfd_resume_process(adev); 5477 } 5478 5479 if (r) 5480 return r; 5481 5482 /* Make sure IB tests flushed */ 5483 flush_delayed_work(&adev->delayed_init_work); 5484 5485 if (notify_clients) 5486 drm_client_dev_resume(adev_to_drm(adev)); 5487 5488 amdgpu_ras_resume(adev); 5489 5490 if (adev->mode_info.num_crtc) { 5491 /* 5492 * Most of the connector probing functions try to acquire runtime pm 5493 * refs to ensure that the GPU is powered on when connector polling is 5494 * performed. Since we're calling this from a runtime PM callback, 5495 * trying to acquire rpm refs will cause us to deadlock. 5496 * 5497 * Since we're guaranteed to be holding the rpm lock, it's safe to 5498 * temporarily disable the rpm helpers so this doesn't deadlock us. 5499 */ 5500 #ifdef CONFIG_PM 5501 dev->dev->power.disable_depth++; 5502 #endif 5503 if (!adev->dc_enabled) 5504 drm_helper_hpd_irq_event(dev); 5505 else 5506 drm_kms_helper_hotplug_event(dev); 5507 #ifdef CONFIG_PM 5508 dev->dev->power.disable_depth--; 5509 #endif 5510 } 5511 5512 amdgpu_vram_mgr_clear_reset_blocks(adev); 5513 adev->in_suspend = false; 5514 5515 if (amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0)) 5516 dev_warn(adev->dev, "smart shift update failed\n"); 5517 5518 return 0; 5519 } 5520 5521 /** 5522 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 5523 * 5524 * @adev: amdgpu_device pointer 5525 * 5526 * The list of all the hardware IPs that make up the asic is walked and 5527 * the check_soft_reset callbacks are run. check_soft_reset determines 5528 * if the asic is still hung or not. 5529 * Returns true if any of the IPs are still in a hung state, false if not. 
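 * SR-IOV and ASICs that report needing a full reset are treated as hung
 * without polling the individual IP blocks.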
5530 */ 5531 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 5532 { 5533 int i; 5534 bool asic_hang = false; 5535 5536 if (amdgpu_sriov_vf(adev)) 5537 return true; 5538 5539 if (amdgpu_asic_need_full_reset(adev)) 5540 return true; 5541 5542 for (i = 0; i < adev->num_ip_blocks; i++) { 5543 if (!adev->ip_blocks[i].status.valid) 5544 continue; 5545 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 5546 adev->ip_blocks[i].status.hang = 5547 adev->ip_blocks[i].version->funcs->check_soft_reset( 5548 &adev->ip_blocks[i]); 5549 if (adev->ip_blocks[i].status.hang) { 5550 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 5551 asic_hang = true; 5552 } 5553 } 5554 return asic_hang; 5555 } 5556 5557 /** 5558 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 5559 * 5560 * @adev: amdgpu_device pointer 5561 * 5562 * The list of all the hardware IPs that make up the asic is walked and the 5563 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 5564 * handles any IP specific hardware or software state changes that are 5565 * necessary for a soft reset to succeed. 5566 * Returns 0 on success, negative error code on failure. 5567 */ 5568 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 5569 { 5570 int i, r = 0; 5571 5572 for (i = 0; i < adev->num_ip_blocks; i++) { 5573 if (!adev->ip_blocks[i].status.valid) 5574 continue; 5575 if (adev->ip_blocks[i].status.hang && 5576 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 5577 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]); 5578 if (r) 5579 return r; 5580 } 5581 } 5582 5583 return 0; 5584 } 5585 5586 /** 5587 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 5588 * 5589 * @adev: amdgpu_device pointer 5590 * 5591 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 5592 * reset is necessary to recover. 5593 * Returns true if a full asic reset is required, false if not. 5594 */ 5595 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 5596 { 5597 int i; 5598 5599 if (amdgpu_asic_need_full_reset(adev)) 5600 return true; 5601 5602 for (i = 0; i < adev->num_ip_blocks; i++) { 5603 if (!adev->ip_blocks[i].status.valid) 5604 continue; 5605 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 5606 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 5607 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 5608 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 5609 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 5610 if (adev->ip_blocks[i].status.hang) { 5611 dev_info(adev->dev, "Some block need full reset!\n"); 5612 return true; 5613 } 5614 } 5615 } 5616 return false; 5617 } 5618 5619 /** 5620 * amdgpu_device_ip_soft_reset - do a soft reset 5621 * 5622 * @adev: amdgpu_device pointer 5623 * 5624 * The list of all the hardware IPs that make up the asic is walked and the 5625 * soft_reset callbacks are run if the block is hung. soft_reset handles any 5626 * IP specific hardware or software state changes that are necessary to soft 5627 * reset the IP. 5628 * Returns 0 on success, negative error code on failure. 
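 * Only blocks that were flagged as hung by check_soft_reset and that
 * implement a soft_reset callback are reset here.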
5629 */ 5630 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 5631 { 5632 int i, r = 0; 5633 5634 for (i = 0; i < adev->num_ip_blocks; i++) { 5635 if (!adev->ip_blocks[i].status.valid) 5636 continue; 5637 if (adev->ip_blocks[i].status.hang && 5638 adev->ip_blocks[i].version->funcs->soft_reset) { 5639 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]); 5640 if (r) 5641 return r; 5642 } 5643 } 5644 5645 return 0; 5646 } 5647 5648 /** 5649 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 5650 * 5651 * @adev: amdgpu_device pointer 5652 * 5653 * The list of all the hardware IPs that make up the asic is walked and the 5654 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 5655 * handles any IP specific hardware or software state changes that are 5656 * necessary after the IP has been soft reset. 5657 * Returns 0 on success, negative error code on failure. 5658 */ 5659 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 5660 { 5661 int i, r = 0; 5662 5663 for (i = 0; i < adev->num_ip_blocks; i++) { 5664 if (!adev->ip_blocks[i].status.valid) 5665 continue; 5666 if (adev->ip_blocks[i].status.hang && 5667 adev->ip_blocks[i].version->funcs->post_soft_reset) 5668 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]); 5669 if (r) 5670 return r; 5671 } 5672 5673 return 0; 5674 } 5675 5676 /** 5677 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 5678 * 5679 * @adev: amdgpu_device pointer 5680 * @reset_context: amdgpu reset context pointer 5681 * 5682 * do VF FLR and reinitialize Asic 5683 * return 0 means succeeded otherwise failed 5684 */ 5685 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 5686 struct amdgpu_reset_context *reset_context) 5687 { 5688 int r; 5689 struct amdgpu_hive_info *hive = NULL; 5690 5691 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { 5692 if (!amdgpu_ras_get_fed_status(adev)) 5693 amdgpu_virt_ready_to_reset(adev); 5694 amdgpu_virt_wait_reset(adev); 5695 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5696 r = amdgpu_virt_request_full_gpu(adev, true); 5697 } else { 5698 r = amdgpu_virt_reset_gpu(adev); 5699 } 5700 if (r) 5701 return r; 5702 5703 amdgpu_ras_clear_err_state(adev); 5704 amdgpu_irq_gpu_reset_resume_helper(adev); 5705 5706 /* some sw clean up VF needs to do before recover */ 5707 amdgpu_virt_post_reset(adev); 5708 5709 /* Resume IP prior to SMC */ 5710 r = amdgpu_device_ip_reinit_early_sriov(adev); 5711 if (r) 5712 return r; 5713 5714 amdgpu_virt_init_data_exchange(adev); 5715 5716 r = amdgpu_device_fw_loading(adev); 5717 if (r) 5718 return r; 5719 5720 /* now we are okay to resume SMC/CP/SDMA */ 5721 r = amdgpu_device_ip_reinit_late_sriov(adev); 5722 if (r) 5723 return r; 5724 5725 hive = amdgpu_get_xgmi_hive(adev); 5726 /* Update PSP FW topology after reset */ 5727 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5728 r = amdgpu_xgmi_update_topology(hive, adev); 5729 if (hive) 5730 amdgpu_put_xgmi_hive(hive); 5731 if (r) 5732 return r; 5733 5734 r = amdgpu_ib_ring_tests(adev); 5735 if (r) 5736 return r; 5737 5738 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) 5739 amdgpu_inc_vram_lost(adev); 5740 5741 /* need to be called during full access so we can't do it later like 5742 * bare-metal does. 
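 * Exclusive (full) GPU access is handed back to the host by the
 * amdgpu_virt_release_full_gpu() call right below, which is why the KFD
 * post-reset work is done at this point.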
5743 */ 5744 amdgpu_amdkfd_post_reset(adev); 5745 amdgpu_virt_release_full_gpu(adev, true); 5746 5747 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5748 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) || 5749 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 5750 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 5751 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) || 5752 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5753 amdgpu_ras_resume(adev); 5754 5755 amdgpu_virt_ras_telemetry_post_reset(adev); 5756 5757 return 0; 5758 } 5759 5760 /** 5761 * amdgpu_device_has_job_running - check if there is any unfinished job 5762 * 5763 * @adev: amdgpu_device pointer 5764 * 5765 * check if there is any job running on the device when guest driver receives 5766 * FLR notification from host driver. If there are still jobs running, then 5767 * the guest driver will not respond the FLR reset. Instead, let the job hit 5768 * the timeout and guest driver then issue the reset request. 5769 */ 5770 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5771 { 5772 int i; 5773 5774 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5775 struct amdgpu_ring *ring = adev->rings[i]; 5776 5777 if (!amdgpu_ring_sched_ready(ring)) 5778 continue; 5779 5780 if (amdgpu_fence_count_emitted(ring)) 5781 return true; 5782 } 5783 return false; 5784 } 5785 5786 /** 5787 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5788 * 5789 * @adev: amdgpu_device pointer 5790 * 5791 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5792 * a hung GPU. 5793 */ 5794 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5795 { 5796 5797 if (amdgpu_gpu_recovery == 0) 5798 goto disabled; 5799 5800 /* Skip soft reset check in fatal error mode */ 5801 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5802 return true; 5803 5804 if (amdgpu_sriov_vf(adev)) 5805 return true; 5806 5807 if (amdgpu_gpu_recovery == -1) { 5808 switch (adev->asic_type) { 5809 #ifdef CONFIG_DRM_AMDGPU_SI 5810 case CHIP_VERDE: 5811 case CHIP_TAHITI: 5812 case CHIP_PITCAIRN: 5813 case CHIP_OLAND: 5814 case CHIP_HAINAN: 5815 #endif 5816 #ifdef CONFIG_DRM_AMDGPU_CIK 5817 case CHIP_KAVERI: 5818 case CHIP_KABINI: 5819 case CHIP_MULLINS: 5820 #endif 5821 case CHIP_CARRIZO: 5822 case CHIP_STONEY: 5823 case CHIP_CYAN_SKILLFISH: 5824 goto disabled; 5825 default: 5826 break; 5827 } 5828 } 5829 5830 return true; 5831 5832 disabled: 5833 dev_info(adev->dev, "GPU recovery disabled.\n"); 5834 return false; 5835 } 5836 5837 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5838 { 5839 u32 i; 5840 int ret = 0; 5841 5842 if (adev->bios) 5843 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5844 5845 dev_info(adev->dev, "GPU mode1 reset\n"); 5846 5847 /* Cache the state before bus master disable. The saved config space 5848 * values are used in other cases like restore after mode-2 reset. 
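 * The state cached here is restored with amdgpu_device_load_pci_state()
 * further down in this function once the ASIC has come out of reset.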
5849 */ 5850 amdgpu_device_cache_pci_state(adev->pdev); 5851 5852 /* disable BM */ 5853 pci_clear_master(adev->pdev); 5854 5855 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5856 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5857 ret = amdgpu_dpm_mode1_reset(adev); 5858 } else { 5859 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5860 ret = psp_gpu_reset(adev); 5861 } 5862 5863 if (ret) 5864 goto mode1_reset_failed; 5865 5866 amdgpu_device_load_pci_state(adev->pdev); 5867 ret = amdgpu_psp_wait_for_bootloader(adev); 5868 if (ret) 5869 goto mode1_reset_failed; 5870 5871 /* wait for asic to come out of reset */ 5872 for (i = 0; i < adev->usec_timeout; i++) { 5873 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5874 5875 if (memsize != 0xffffffff) 5876 break; 5877 udelay(1); 5878 } 5879 5880 if (i >= adev->usec_timeout) { 5881 ret = -ETIMEDOUT; 5882 goto mode1_reset_failed; 5883 } 5884 5885 if (adev->bios) 5886 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5887 5888 return 0; 5889 5890 mode1_reset_failed: 5891 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5892 return ret; 5893 } 5894 5895 int amdgpu_device_link_reset(struct amdgpu_device *adev) 5896 { 5897 int ret = 0; 5898 5899 dev_info(adev->dev, "GPU link reset\n"); 5900 5901 if (!amdgpu_reset_in_dpc(adev)) 5902 ret = amdgpu_dpm_link_reset(adev); 5903 5904 if (ret) 5905 goto link_reset_failed; 5906 5907 ret = amdgpu_psp_wait_for_bootloader(adev); 5908 if (ret) 5909 goto link_reset_failed; 5910 5911 return 0; 5912 5913 link_reset_failed: 5914 dev_err(adev->dev, "GPU link reset failed\n"); 5915 return ret; 5916 } 5917 5918 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5919 struct amdgpu_reset_context *reset_context) 5920 { 5921 int i, r = 0; 5922 struct amdgpu_job *job = NULL; 5923 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; 5924 bool need_full_reset = 5925 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5926 5927 if (reset_context->reset_req_dev == adev) 5928 job = reset_context->job; 5929 5930 if (amdgpu_sriov_vf(adev)) 5931 amdgpu_virt_pre_reset(adev); 5932 5933 amdgpu_fence_driver_isr_toggle(adev, true); 5934 5935 /* block all schedulers and reset given job's ring */ 5936 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5937 struct amdgpu_ring *ring = adev->rings[i]; 5938 5939 if (!amdgpu_ring_sched_ready(ring)) 5940 continue; 5941 5942 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5943 amdgpu_fence_driver_force_completion(ring); 5944 } 5945 5946 amdgpu_fence_driver_isr_toggle(adev, false); 5947 5948 if (job && job->vm) 5949 drm_sched_increase_karma(&job->base); 5950 5951 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5952 /* If reset handler not implemented, continue; otherwise return */ 5953 if (r == -EOPNOTSUPP) 5954 r = 0; 5955 else 5956 return r; 5957 5958 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5959 if (!amdgpu_sriov_vf(adev)) { 5960 5961 if (!need_full_reset) 5962 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5963 5964 if (!need_full_reset && amdgpu_gpu_recovery && 5965 amdgpu_device_ip_check_soft_reset(adev)) { 5966 amdgpu_device_ip_pre_soft_reset(adev); 5967 r = amdgpu_device_ip_soft_reset(adev); 5968 amdgpu_device_ip_post_soft_reset(adev); 5969 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5970 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5971 need_full_reset = true; 5972 } 5973 } 5974 5975 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { 
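/*
 * The registers captured here are kept in the per-IP dump buffers and are
 * expected to end up in the devcoredump generated later in the recovery
 * path (see amdgpu_device_reinit_after_reset()).
 */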
5976 dev_info(tmp_adev->dev, "Dumping IP State\n"); 5977 /* Trigger ip dump before we reset the asic */ 5978 for (i = 0; i < tmp_adev->num_ip_blocks; i++) 5979 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) 5980 tmp_adev->ip_blocks[i].version->funcs 5981 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]); 5982 dev_info(tmp_adev->dev, "Dumping IP State Completed\n"); 5983 } 5984 5985 if (need_full_reset) 5986 r = amdgpu_device_ip_suspend(adev); 5987 if (need_full_reset) 5988 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5989 else 5990 clear_bit(AMDGPU_NEED_FULL_RESET, 5991 &reset_context->flags); 5992 } 5993 5994 return r; 5995 } 5996 5997 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) 5998 { 5999 struct list_head *device_list_handle; 6000 bool full_reset, vram_lost = false; 6001 struct amdgpu_device *tmp_adev; 6002 int r, init_level; 6003 6004 device_list_handle = reset_context->reset_device_list; 6005 6006 if (!device_list_handle) 6007 return -EINVAL; 6008 6009 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 6010 6011 /** 6012 * If it's reset on init, it's default init level, otherwise keep level 6013 * as recovery level. 6014 */ 6015 if (reset_context->method == AMD_RESET_METHOD_ON_INIT) 6016 init_level = AMDGPU_INIT_LEVEL_DEFAULT; 6017 else 6018 init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY; 6019 6020 r = 0; 6021 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6022 amdgpu_set_init_level(tmp_adev, init_level); 6023 if (full_reset) { 6024 /* post card */ 6025 amdgpu_reset_set_dpc_status(tmp_adev, false); 6026 amdgpu_ras_clear_err_state(tmp_adev); 6027 r = amdgpu_device_asic_init(tmp_adev); 6028 if (r) { 6029 dev_warn(tmp_adev->dev, "asic atom init failed!"); 6030 } else { 6031 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 6032 6033 r = amdgpu_device_ip_resume_phase1(tmp_adev); 6034 if (r) 6035 goto out; 6036 6037 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 6038 6039 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 6040 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job); 6041 6042 if (vram_lost) { 6043 dev_info( 6044 tmp_adev->dev, 6045 "VRAM is lost due to GPU reset!\n"); 6046 amdgpu_inc_vram_lost(tmp_adev); 6047 } 6048 6049 r = amdgpu_device_fw_loading(tmp_adev); 6050 if (r) 6051 return r; 6052 6053 r = amdgpu_xcp_restore_partition_mode( 6054 tmp_adev->xcp_mgr); 6055 if (r) 6056 goto out; 6057 6058 r = amdgpu_device_ip_resume_phase2(tmp_adev); 6059 if (r) 6060 goto out; 6061 6062 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 6063 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 6064 6065 r = amdgpu_device_ip_resume_phase3(tmp_adev); 6066 if (r) 6067 goto out; 6068 6069 if (vram_lost) 6070 amdgpu_device_fill_reset_magic(tmp_adev); 6071 6072 /* 6073 * Add this ASIC as tracked as reset was already 6074 * complete successfully. 6075 */ 6076 amdgpu_register_gpu_instance(tmp_adev); 6077 6078 if (!reset_context->hive && 6079 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 6080 amdgpu_xgmi_add_device(tmp_adev); 6081 6082 r = amdgpu_device_ip_late_init(tmp_adev); 6083 if (r) 6084 goto out; 6085 6086 r = amdgpu_userq_post_reset(tmp_adev, vram_lost); 6087 if (r) 6088 goto out; 6089 6090 drm_client_dev_resume(adev_to_drm(tmp_adev)); 6091 6092 /* 6093 * The GPU enters bad state once faulty pages 6094 * by ECC has reached the threshold, and ras 6095 * recovery is scheduled next. 
So add one check 6096 * here to break recovery if it indeed exceeds 6097 * bad page threshold, and remind user to 6098 * retire this GPU or setting one bigger 6099 * bad_page_threshold value to fix this once 6100 * probing driver again. 6101 */ 6102 if (!amdgpu_ras_is_rma(tmp_adev)) { 6103 /* must succeed. */ 6104 amdgpu_ras_resume(tmp_adev); 6105 } else { 6106 r = -EINVAL; 6107 goto out; 6108 } 6109 6110 /* Update PSP FW topology after reset */ 6111 if (reset_context->hive && 6112 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 6113 r = amdgpu_xgmi_update_topology( 6114 reset_context->hive, tmp_adev); 6115 } 6116 } 6117 6118 out: 6119 if (!r) { 6120 /* IP init is complete now, set level as default */ 6121 amdgpu_set_init_level(tmp_adev, 6122 AMDGPU_INIT_LEVEL_DEFAULT); 6123 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 6124 r = amdgpu_ib_ring_tests(tmp_adev); 6125 if (r) { 6126 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 6127 r = -EAGAIN; 6128 goto end; 6129 } 6130 } 6131 6132 if (r) 6133 tmp_adev->asic_reset_res = r; 6134 } 6135 6136 end: 6137 return r; 6138 } 6139 6140 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 6141 struct amdgpu_reset_context *reset_context) 6142 { 6143 struct amdgpu_device *tmp_adev = NULL; 6144 bool need_full_reset, skip_hw_reset; 6145 int r = 0; 6146 6147 /* Try reset handler method first */ 6148 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 6149 reset_list); 6150 6151 reset_context->reset_device_list = device_list_handle; 6152 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 6153 /* If reset handler not implemented, continue; otherwise return */ 6154 if (r == -EOPNOTSUPP) 6155 r = 0; 6156 else 6157 return r; 6158 6159 /* Reset handler not implemented, use the default method */ 6160 need_full_reset = 6161 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 6162 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 6163 6164 /* 6165 * ASIC reset has to be done on all XGMI hive nodes ASAP 6166 * to allow proper links negotiation in FW (within 1 sec) 6167 */ 6168 if (!skip_hw_reset && need_full_reset) { 6169 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6170 /* For XGMI run all resets in parallel to speed up the process */ 6171 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 6172 if (!queue_work(system_unbound_wq, 6173 &tmp_adev->xgmi_reset_work)) 6174 r = -EALREADY; 6175 } else 6176 r = amdgpu_asic_reset(tmp_adev); 6177 6178 if (r) { 6179 dev_err(tmp_adev->dev, 6180 "ASIC reset failed with error, %d for drm dev, %s", 6181 r, adev_to_drm(tmp_adev)->unique); 6182 goto out; 6183 } 6184 } 6185 6186 /* For XGMI wait for all resets to complete before proceed */ 6187 if (!r) { 6188 list_for_each_entry(tmp_adev, device_list_handle, 6189 reset_list) { 6190 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 6191 flush_work(&tmp_adev->xgmi_reset_work); 6192 r = tmp_adev->asic_reset_res; 6193 if (r) 6194 break; 6195 } 6196 } 6197 } 6198 } 6199 6200 if (!r && amdgpu_ras_intr_triggered()) { 6201 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6202 amdgpu_ras_reset_error_count(tmp_adev, 6203 AMDGPU_RAS_BLOCK__MMHUB); 6204 } 6205 6206 amdgpu_ras_intr_cleared(); 6207 } 6208 6209 r = amdgpu_device_reinit_after_reset(reset_context); 6210 if (r == -EAGAIN) 6211 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 6212 else 6213 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 6214 6215 out: 6216 return r; 6217 } 6218 6219 static void 
amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 6220 { 6221 6222 switch (amdgpu_asic_reset_method(adev)) { 6223 case AMD_RESET_METHOD_MODE1: 6224 case AMD_RESET_METHOD_LINK: 6225 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 6226 break; 6227 case AMD_RESET_METHOD_MODE2: 6228 adev->mp1_state = PP_MP1_STATE_RESET; 6229 break; 6230 default: 6231 adev->mp1_state = PP_MP1_STATE_NONE; 6232 break; 6233 } 6234 } 6235 6236 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 6237 { 6238 amdgpu_vf_error_trans_all(adev); 6239 adev->mp1_state = PP_MP1_STATE_NONE; 6240 } 6241 6242 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 6243 { 6244 struct pci_dev *p = NULL; 6245 6246 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 6247 adev->pdev->bus->number, 1); 6248 if (p) { 6249 pm_runtime_enable(&(p->dev)); 6250 pm_runtime_resume(&(p->dev)); 6251 } 6252 6253 pci_dev_put(p); 6254 } 6255 6256 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 6257 { 6258 enum amd_reset_method reset_method; 6259 struct pci_dev *p = NULL; 6260 u64 expires; 6261 6262 /* 6263 * For now, only BACO and mode1 reset are confirmed 6264 * to suffer the audio issue without proper suspended. 6265 */ 6266 reset_method = amdgpu_asic_reset_method(adev); 6267 if ((reset_method != AMD_RESET_METHOD_BACO) && 6268 (reset_method != AMD_RESET_METHOD_MODE1)) 6269 return -EINVAL; 6270 6271 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 6272 adev->pdev->bus->number, 1); 6273 if (!p) 6274 return -ENODEV; 6275 6276 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 6277 if (!expires) 6278 /* 6279 * If we cannot get the audio device autosuspend delay, 6280 * a fixed 4S interval will be used. Considering 3S is 6281 * the audio controller default autosuspend delay setting. 6282 * 4S used here is guaranteed to cover that. 6283 */ 6284 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 6285 6286 while (!pm_runtime_status_suspended(&(p->dev))) { 6287 if (!pm_runtime_suspend(&(p->dev))) 6288 break; 6289 6290 if (expires < ktime_get_mono_fast_ns()) { 6291 dev_warn(adev->dev, "failed to suspend display audio\n"); 6292 pci_dev_put(p); 6293 /* TODO: abort the succeeding gpu reset? */ 6294 return -ETIMEDOUT; 6295 } 6296 } 6297 6298 pm_runtime_disable(&(p->dev)); 6299 6300 pci_dev_put(p); 6301 return 0; 6302 } 6303 6304 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 6305 { 6306 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 6307 6308 #if defined(CONFIG_DEBUG_FS) 6309 if (!amdgpu_sriov_vf(adev)) 6310 cancel_work(&adev->reset_work); 6311 #endif 6312 cancel_work(&adev->userq_reset_work); 6313 6314 if (adev->kfd.dev) 6315 cancel_work(&adev->kfd.reset_work); 6316 6317 if (amdgpu_sriov_vf(adev)) 6318 cancel_work(&adev->virt.flr_work); 6319 6320 if (con && adev->ras_enabled) 6321 cancel_work(&con->recovery_work); 6322 6323 } 6324 6325 static int amdgpu_device_health_check(struct list_head *device_list_handle) 6326 { 6327 struct amdgpu_device *tmp_adev; 6328 int ret = 0; 6329 6330 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6331 ret |= amdgpu_device_bus_status_check(tmp_adev); 6332 } 6333 6334 return ret; 6335 } 6336 6337 static void amdgpu_device_recovery_prepare(struct amdgpu_device *adev, 6338 struct list_head *device_list, 6339 struct amdgpu_hive_info *hive) 6340 { 6341 struct amdgpu_device *tmp_adev = NULL; 6342 6343 /* 6344 * Build list of devices to reset. 
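 * All devices that share an XGMI hive are reset together, so for a hive
 * every member is collected into the list.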
6345 * In case we are in XGMI hive mode, resort the device list 6346 * to put adev in the 1st position. 6347 */ 6348 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) { 6349 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 6350 list_add_tail(&tmp_adev->reset_list, device_list); 6351 if (adev->shutdown) 6352 tmp_adev->shutdown = true; 6353 if (amdgpu_reset_in_dpc(adev)) 6354 tmp_adev->pcie_reset_ctx.in_link_reset = true; 6355 } 6356 if (!list_is_first(&adev->reset_list, device_list)) 6357 list_rotate_to_front(&adev->reset_list, device_list); 6358 } else { 6359 list_add_tail(&adev->reset_list, device_list); 6360 } 6361 } 6362 6363 static void amdgpu_device_recovery_get_reset_lock(struct amdgpu_device *adev, 6364 struct list_head *device_list) 6365 { 6366 struct amdgpu_device *tmp_adev = NULL; 6367 6368 if (list_empty(device_list)) 6369 return; 6370 tmp_adev = 6371 list_first_entry(device_list, struct amdgpu_device, reset_list); 6372 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 6373 } 6374 6375 static void amdgpu_device_recovery_put_reset_lock(struct amdgpu_device *adev, 6376 struct list_head *device_list) 6377 { 6378 struct amdgpu_device *tmp_adev = NULL; 6379 6380 if (list_empty(device_list)) 6381 return; 6382 tmp_adev = 6383 list_first_entry(device_list, struct amdgpu_device, reset_list); 6384 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 6385 } 6386 6387 static void amdgpu_device_halt_activities(struct amdgpu_device *adev, 6388 struct amdgpu_job *job, 6389 struct amdgpu_reset_context *reset_context, 6390 struct list_head *device_list, 6391 struct amdgpu_hive_info *hive, 6392 bool need_emergency_restart) 6393 { 6394 struct amdgpu_device *tmp_adev = NULL; 6395 int i; 6396 6397 /* block all schedulers and reset given job's ring */ 6398 list_for_each_entry(tmp_adev, device_list, reset_list) { 6399 amdgpu_device_set_mp1_state(tmp_adev); 6400 6401 /* 6402 * Try to put the audio codec into suspend state 6403 * before gpu reset started. 6404 * 6405 * Due to the power domain of the graphics device 6406 * is shared with AZ power domain. Without this, 6407 * we may change the audio hardware from behind 6408 * the audio driver's back. That will trigger 6409 * some audio codec errors. 6410 */ 6411 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 6412 tmp_adev->pcie_reset_ctx.audio_suspended = true; 6413 6414 amdgpu_ras_set_error_query_ready(tmp_adev, false); 6415 6416 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 6417 6418 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context); 6419 6420 /* 6421 * Mark these ASICs to be reset as untracked first 6422 * And add them back after reset completed 6423 */ 6424 amdgpu_unregister_gpu_instance(tmp_adev); 6425 6426 drm_client_dev_suspend(adev_to_drm(tmp_adev)); 6427 6428 /* disable ras on ALL IPs */ 6429 if (!need_emergency_restart && !amdgpu_reset_in_dpc(adev) && 6430 amdgpu_device_ip_need_full_reset(tmp_adev)) 6431 amdgpu_ras_suspend(tmp_adev); 6432 6433 amdgpu_userq_pre_reset(tmp_adev); 6434 6435 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6436 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6437 6438 if (!amdgpu_ring_sched_ready(ring)) 6439 continue; 6440 6441 drm_sched_stop(&ring->sched, job ? 
&job->base : NULL); 6442 6443 if (need_emergency_restart) 6444 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 6445 } 6446 atomic_inc(&tmp_adev->gpu_reset_counter); 6447 } 6448 } 6449 6450 static int amdgpu_device_asic_reset(struct amdgpu_device *adev, 6451 struct list_head *device_list, 6452 struct amdgpu_reset_context *reset_context) 6453 { 6454 struct amdgpu_device *tmp_adev = NULL; 6455 int retry_limit = AMDGPU_MAX_RETRY_LIMIT; 6456 int r = 0; 6457 6458 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 6459 list_for_each_entry(tmp_adev, device_list, reset_list) { 6460 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 6461 /*TODO Should we stop ?*/ 6462 if (r) { 6463 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 6464 r, adev_to_drm(tmp_adev)->unique); 6465 tmp_adev->asic_reset_res = r; 6466 } 6467 } 6468 6469 /* Actual ASIC resets if needed.*/ 6470 /* Host driver will handle XGMI hive reset for SRIOV */ 6471 if (amdgpu_sriov_vf(adev)) { 6472 6473 /* Bail out of reset early */ 6474 if (amdgpu_ras_is_rma(adev)) 6475 return -ENODEV; 6476 6477 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { 6478 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); 6479 amdgpu_ras_set_fed(adev, true); 6480 set_bit(AMDGPU_HOST_FLR, &reset_context->flags); 6481 } 6482 6483 r = amdgpu_device_reset_sriov(adev, reset_context); 6484 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) { 6485 amdgpu_virt_release_full_gpu(adev, true); 6486 goto retry; 6487 } 6488 if (r) 6489 adev->asic_reset_res = r; 6490 } else { 6491 r = amdgpu_do_asic_reset(device_list, reset_context); 6492 if (r && r == -EAGAIN) 6493 goto retry; 6494 } 6495 6496 list_for_each_entry(tmp_adev, device_list, reset_list) { 6497 /* 6498 * Drop any pending non scheduler resets queued before reset is done. 6499 * Any reset scheduled after this point would be valid. Scheduler resets 6500 * were already dropped during drm_sched_stop and no new ones can come 6501 * in before drm_sched_start. 6502 */ 6503 amdgpu_device_stop_pending_resets(tmp_adev); 6504 } 6505 6506 return r; 6507 } 6508 6509 static int amdgpu_device_sched_resume(struct list_head *device_list, 6510 struct amdgpu_reset_context *reset_context, 6511 bool job_signaled) 6512 { 6513 struct amdgpu_device *tmp_adev = NULL; 6514 int i, r = 0; 6515 6516 /* Post ASIC reset for all devs .*/ 6517 list_for_each_entry(tmp_adev, device_list, reset_list) { 6518 6519 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6520 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6521 6522 if (!amdgpu_ring_sched_ready(ring)) 6523 continue; 6524 6525 drm_sched_start(&ring->sched, 0); 6526 } 6527 6528 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 6529 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 6530 6531 if (tmp_adev->asic_reset_res) { 6532 /* bad news, how to tell it to userspace ? 
6533 * for ras error, we should report GPU bad status instead of 6534 * reset failure 6535 */ 6536 if (reset_context->src != AMDGPU_RESET_SRC_RAS || 6537 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) 6538 dev_info( 6539 tmp_adev->dev, 6540 "GPU reset(%d) failed with error %d \n", 6541 atomic_read( 6542 &tmp_adev->gpu_reset_counter), 6543 tmp_adev->asic_reset_res); 6544 amdgpu_vf_error_put(tmp_adev, 6545 AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, 6546 tmp_adev->asic_reset_res); 6547 if (!r) 6548 r = tmp_adev->asic_reset_res; 6549 tmp_adev->asic_reset_res = 0; 6550 } else { 6551 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", 6552 atomic_read(&tmp_adev->gpu_reset_counter)); 6553 if (amdgpu_acpi_smart_shift_update(tmp_adev, 6554 AMDGPU_SS_DEV_D0)) 6555 dev_warn(tmp_adev->dev, 6556 "smart shift update failed\n"); 6557 } 6558 } 6559 6560 return r; 6561 } 6562 6563 static void amdgpu_device_gpu_resume(struct amdgpu_device *adev, 6564 struct list_head *device_list, 6565 bool need_emergency_restart) 6566 { 6567 struct amdgpu_device *tmp_adev = NULL; 6568 6569 list_for_each_entry(tmp_adev, device_list, reset_list) { 6570 /* unlock kfd: SRIOV would do it separately */ 6571 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 6572 amdgpu_amdkfd_post_reset(tmp_adev); 6573 6574 /* kfd_post_reset will do nothing if kfd device is not initialized, 6575 * need to bring up kfd here if it's not be initialized before 6576 */ 6577 if (!adev->kfd.init_complete) 6578 amdgpu_amdkfd_device_init(adev); 6579 6580 if (tmp_adev->pcie_reset_ctx.audio_suspended) 6581 amdgpu_device_resume_display_audio(tmp_adev); 6582 6583 amdgpu_device_unset_mp1_state(tmp_adev); 6584 6585 amdgpu_ras_set_error_query_ready(tmp_adev, true); 6586 6587 } 6588 } 6589 6590 6591 /** 6592 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 6593 * 6594 * @adev: amdgpu_device pointer 6595 * @job: which job trigger hang 6596 * @reset_context: amdgpu reset context pointer 6597 * 6598 * Attempt to reset the GPU if it has hung (all asics). 6599 * Attempt to do soft-reset or full-reset and reinitialize Asic 6600 * Returns 0 for success or an error on failure. 6601 */ 6602 6603 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 6604 struct amdgpu_job *job, 6605 struct amdgpu_reset_context *reset_context) 6606 { 6607 struct list_head device_list; 6608 bool job_signaled = false; 6609 struct amdgpu_hive_info *hive = NULL; 6610 int r = 0; 6611 bool need_emergency_restart = false; 6612 6613 /* 6614 * If it reaches here because of hang/timeout and a RAS error is 6615 * detected at the same time, let RAS recovery take care of it. 6616 */ 6617 if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) && 6618 !amdgpu_sriov_vf(adev) && 6619 reset_context->src != AMDGPU_RESET_SRC_RAS) { 6620 dev_dbg(adev->dev, 6621 "Gpu recovery from source: %d yielding to RAS error recovery handling", 6622 reset_context->src); 6623 return 0; 6624 } 6625 6626 /* 6627 * Special case: RAS triggered and full reset isn't supported 6628 */ 6629 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 6630 6631 /* 6632 * Flush RAM to disk so that after reboot 6633 * the user can read log and see why the system rebooted. 6634 */ 6635 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 6636 amdgpu_ras_get_context(adev)->reboot) { 6637 dev_warn(adev->dev, "Emergency reboot."); 6638 6639 ksys_sync_helper(); 6640 emergency_restart(); 6641 } 6642 6643 dev_info(adev->dev, "GPU %s begin!. Source: %d\n", 6644 need_emergency_restart ? 
"jobs stop" : "reset", 6645 reset_context->src); 6646 6647 if (!amdgpu_sriov_vf(adev)) 6648 hive = amdgpu_get_xgmi_hive(adev); 6649 if (hive) 6650 mutex_lock(&hive->hive_lock); 6651 6652 reset_context->job = job; 6653 reset_context->hive = hive; 6654 INIT_LIST_HEAD(&device_list); 6655 6656 amdgpu_device_recovery_prepare(adev, &device_list, hive); 6657 6658 if (!amdgpu_sriov_vf(adev)) { 6659 r = amdgpu_device_health_check(&device_list); 6660 if (r) 6661 goto end_reset; 6662 } 6663 6664 /* Cannot be called after locking reset domain */ 6665 amdgpu_ras_pre_reset(adev, &device_list); 6666 6667 /* We need to lock reset domain only once both for XGMI and single device */ 6668 amdgpu_device_recovery_get_reset_lock(adev, &device_list); 6669 6670 amdgpu_device_halt_activities(adev, job, reset_context, &device_list, 6671 hive, need_emergency_restart); 6672 if (need_emergency_restart) 6673 goto skip_sched_resume; 6674 /* 6675 * Must check guilty signal here since after this point all old 6676 * HW fences are force signaled. 6677 * 6678 * job->base holds a reference to parent fence 6679 */ 6680 if (job && dma_fence_is_signaled(&job->hw_fence->base)) { 6681 job_signaled = true; 6682 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 6683 goto skip_hw_reset; 6684 } 6685 6686 r = amdgpu_device_asic_reset(adev, &device_list, reset_context); 6687 if (r) 6688 goto reset_unlock; 6689 skip_hw_reset: 6690 r = amdgpu_device_sched_resume(&device_list, reset_context, job_signaled); 6691 if (r) 6692 goto reset_unlock; 6693 skip_sched_resume: 6694 amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart); 6695 reset_unlock: 6696 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 6697 amdgpu_ras_post_reset(adev, &device_list); 6698 end_reset: 6699 if (hive) { 6700 mutex_unlock(&hive->hive_lock); 6701 amdgpu_put_xgmi_hive(hive); 6702 } 6703 6704 if (r) 6705 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 6706 6707 atomic_set(&adev->reset_domain->reset_res, r); 6708 6709 if (!r) { 6710 struct amdgpu_task_info *ti = NULL; 6711 6712 if (job) 6713 ti = amdgpu_vm_get_task_info_pasid(adev, job->pasid); 6714 6715 drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, 6716 ti ? &ti->task : NULL); 6717 6718 amdgpu_vm_put_task_info(ti); 6719 } 6720 6721 return r; 6722 } 6723 6724 /** 6725 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner 6726 * 6727 * @adev: amdgpu_device pointer 6728 * @speed: pointer to the speed of the link 6729 * @width: pointer to the width of the link 6730 * 6731 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6732 * first physical partner to an AMD dGPU. 6733 * This will exclude any virtual switches and links. 
6734 */ 6735 static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev, 6736 enum pci_bus_speed *speed, 6737 enum pcie_link_width *width)
6738 { 6739 struct pci_dev *parent = adev->pdev; 6740 6741 if (!speed || !width) 6742 return; 6743 6744 *speed = PCI_SPEED_UNKNOWN; 6745 *width = PCIE_LNK_WIDTH_UNKNOWN;
6746 6747 if (amdgpu_device_pcie_dynamic_switching_supported(adev)) { 6748 while ((parent = pci_upstream_bridge(parent))) { 6749 /* skip upstream/downstream switches internal to dGPU */
6750 if (parent->vendor == PCI_VENDOR_ID_ATI) 6751 continue; 6752 *speed = pcie_get_speed_cap(parent); 6753 *width = pcie_get_width_cap(parent); 6754 break; 6755 }
6756 } else { 6757 /* use the current speeds rather than max if switching is not supported */ 6758 pcie_bandwidth_available(adev->pdev, NULL, speed, width); 6759 } 6760 } 6761
6762 /** 6763 * amdgpu_device_gpu_bandwidth - find the bandwidth of the GPU 6764 * 6765 * @adev: amdgpu_device pointer 6766 * @speed: pointer to the speed of the link 6767 * @width: pointer to the width of the link 6768 *
6769 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6770 * AMD dGPU which may be a virtual upstream bridge. 6771 */
6772 static void amdgpu_device_gpu_bandwidth(struct amdgpu_device *adev, 6773 enum pci_bus_speed *speed, 6774 enum pcie_link_width *width)
6775 { 6776 struct pci_dev *parent = adev->pdev; 6777 6778 if (!speed || !width) 6779 return; 6780 6781 parent = pci_upstream_bridge(parent);
6782 if (parent && parent->vendor == PCI_VENDOR_ID_ATI) { 6783 /* use the upstream/downstream switches internal to dGPU */ 6784 *speed = pcie_get_speed_cap(parent); 6785 *width = pcie_get_width_cap(parent);
6786 while ((parent = pci_upstream_bridge(parent))) { 6787 if (parent->vendor == PCI_VENDOR_ID_ATI) { 6788 /* use the upstream/downstream switches internal to dGPU */ 6789 *speed = pcie_get_speed_cap(parent); 6790 *width = pcie_get_width_cap(parent); 6791 } 6792 }
6793 } else { 6794 /* use the device itself */ 6795 *speed = pcie_get_speed_cap(adev->pdev); 6796 *width = pcie_get_width_cap(adev->pdev); 6797 } 6798 } 6799
6800 /** 6801 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot 6802 * 6803 * @adev: amdgpu_device pointer 6804 *
6805 * Fetches and stores in the driver the PCIE capabilities (gen speed 6806 * and lanes) of the slot the device is in. Handles APUs and 6807 * virtualized environments where PCIE config space may not be available.
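 * The result is stored in adev->pm.pcie_gen_mask and adev->pm.pcie_mlw_mask;
 * both can be overridden with the amdgpu_pcie_gen_cap and
 * amdgpu_pcie_lane_cap module parameters.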
6808 */ 6809 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 6810 { 6811 enum pci_bus_speed speed_cap, platform_speed_cap; 6812 enum pcie_link_width platform_link_width, link_width; 6813 6814 if (amdgpu_pcie_gen_cap) 6815 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 6816 6817 if (amdgpu_pcie_lane_cap) 6818 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 6819 6820 /* covers APUs as well */ 6821 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 6822 if (adev->pm.pcie_gen_mask == 0) 6823 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 6824 if (adev->pm.pcie_mlw_mask == 0) 6825 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 6826 return; 6827 } 6828 6829 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 6830 return; 6831 6832 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap, 6833 &platform_link_width); 6834 amdgpu_device_gpu_bandwidth(adev, &speed_cap, &link_width); 6835 6836 if (adev->pm.pcie_gen_mask == 0) { 6837 /* asic caps */ 6838 if (speed_cap == PCI_SPEED_UNKNOWN) { 6839 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6840 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6841 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6842 } else { 6843 if (speed_cap == PCIE_SPEED_32_0GT) 6844 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6845 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6846 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6847 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6848 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 6849 else if (speed_cap == PCIE_SPEED_16_0GT) 6850 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6851 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6852 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6853 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 6854 else if (speed_cap == PCIE_SPEED_8_0GT) 6855 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6856 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6857 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6858 else if (speed_cap == PCIE_SPEED_5_0GT) 6859 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6860 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 6861 else 6862 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 6863 } 6864 /* platform caps */ 6865 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 6866 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6867 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6868 } else { 6869 if (platform_speed_cap == PCIE_SPEED_32_0GT) 6870 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6871 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6872 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6873 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6874 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 6875 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 6876 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6877 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6878 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6879 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 6880 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 6881 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6882 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6883 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 6884 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 6885 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6886 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6887 else 6888 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 6889 6890 } 6891 } 6892 if (adev->pm.pcie_mlw_mask == 0) { 6893 /* asic caps */ 6894 if (link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6895 
adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_ASIC_PCIE_MLW_MASK; 6896 } else { 6897 switch (link_width) { 6898 case PCIE_LNK_X32: 6899 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X32 | 6900 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6901 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6902 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6903 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6904 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6905 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6906 break; 6907 case PCIE_LNK_X16: 6908 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6909 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6910 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6911 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6912 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6913 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6914 break; 6915 case PCIE_LNK_X12: 6916 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6917 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6918 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6919 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6920 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6921 break; 6922 case PCIE_LNK_X8: 6923 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6924 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6925 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6926 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6927 break; 6928 case PCIE_LNK_X4: 6929 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6930 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6931 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6932 break; 6933 case PCIE_LNK_X2: 6934 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6935 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6936 break; 6937 case PCIE_LNK_X1: 6938 adev->pm.pcie_mlw_mask |= CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1; 6939 break; 6940 default: 6941 break; 6942 } 6943 } 6944 /* platform caps */ 6945 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6946 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 6947 } else { 6948 switch (platform_link_width) { 6949 case PCIE_LNK_X32: 6950 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 6951 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6952 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6953 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6954 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6955 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6956 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6957 break; 6958 case PCIE_LNK_X16: 6959 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6960 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6961 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6962 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6963 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6964 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6965 break; 6966 case PCIE_LNK_X12: 6967 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6968 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6969 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6970 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6971 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6972 break; 6973 case PCIE_LNK_X8: 6974 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6975 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6976 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6977 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6978 break; 6979 case PCIE_LNK_X4: 6980 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6981 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6982 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6983 break; 6984 case PCIE_LNK_X2: 6985 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6986 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6987 break; 6988 case PCIE_LNK_X1: 6989 adev->pm.pcie_mlw_mask |= CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 6990 break; 6991 
default: 6992 break; 6993 } 6994 } 6995 } 6996 } 6997 6998 /** 6999 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 7000 * 7001 * @adev: amdgpu_device pointer 7002 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 7003 * 7004 * Return true if @peer_adev can access (DMA) @adev through the PCIe 7005 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 7006 * @peer_adev. 7007 */ 7008 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 7009 struct amdgpu_device *peer_adev) 7010 { 7011 #ifdef CONFIG_HSA_AMD_P2P 7012 bool p2p_access = 7013 !adev->gmc.xgmi.connected_to_cpu && 7014 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 7015 if (!p2p_access) 7016 dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n", 7017 pci_name(peer_adev->pdev)); 7018 7019 bool is_large_bar = adev->gmc.visible_vram_size && 7020 adev->gmc.real_vram_size == adev->gmc.visible_vram_size; 7021 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev); 7022 7023 if (!p2p_addressable) { 7024 uint64_t address_mask = peer_adev->dev->dma_mask ? 7025 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 7026 resource_size_t aper_limit = 7027 adev->gmc.aper_base + adev->gmc.aper_size - 1; 7028 7029 p2p_addressable = !(adev->gmc.aper_base & address_mask || 7030 aper_limit & address_mask); 7031 } 7032 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable; 7033 #else 7034 return false; 7035 #endif 7036 } 7037 7038 int amdgpu_device_baco_enter(struct amdgpu_device *adev) 7039 { 7040 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 7041 7042 if (!amdgpu_device_supports_baco(adev)) 7043 return -ENOTSUPP; 7044 7045 if (ras && adev->ras_enabled && 7046 adev->nbio.funcs->enable_doorbell_interrupt) 7047 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 7048 7049 return amdgpu_dpm_baco_enter(adev); 7050 } 7051 7052 int amdgpu_device_baco_exit(struct amdgpu_device *adev) 7053 { 7054 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 7055 int ret = 0; 7056 7057 if (!amdgpu_device_supports_baco(adev)) 7058 return -ENOTSUPP; 7059 7060 ret = amdgpu_dpm_baco_exit(adev); 7061 if (ret) 7062 return ret; 7063 7064 if (ras && adev->ras_enabled && 7065 adev->nbio.funcs->enable_doorbell_interrupt) 7066 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 7067 7068 if (amdgpu_passthrough(adev) && adev->nbio.funcs && 7069 adev->nbio.funcs->clear_doorbell_interrupt) 7070 adev->nbio.funcs->clear_doorbell_interrupt(adev); 7071 7072 return 0; 7073 } 7074 7075 /** 7076 * amdgpu_pci_error_detected - Called when a PCI error is detected. 7077 * @pdev: PCI device struct 7078 * @state: PCI channel state 7079 * 7080 * Description: Called when a PCI error is detected. 7081 * 7082 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
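 * PCI_ERS_RESULT_CAN_RECOVER is returned for pci_channel_io_normal, in which
 * case the device stays accessible and amdgpu_pci_mmio_enabled() is invoked
 * next instead of a slot reset.
 *
 * These callbacks are hooked up via the driver's struct pci_error_handlers,
 * roughly (sketch of the registration, see amdgpu_drv.c):
 *
 *   .error_detected = amdgpu_pci_error_detected,
 *   .mmio_enabled   = amdgpu_pci_mmio_enabled,
 *   .slot_reset     = amdgpu_pci_slot_reset,
 *   .resume         = amdgpu_pci_resume,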
7083 */ 7084 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 7085 { 7086 struct drm_device *dev = pci_get_drvdata(pdev); 7087 struct amdgpu_device *adev = drm_to_adev(dev); 7088 struct amdgpu_hive_info *hive __free(xgmi_put_hive) = 7089 amdgpu_get_xgmi_hive(adev); 7090 struct amdgpu_reset_context reset_context; 7091 struct list_head device_list; 7092 7093 dev_info(adev->dev, "PCI error: detected callback!!\n"); 7094 7095 adev->pci_channel_state = state; 7096 7097 switch (state) { 7098 case pci_channel_io_normal: 7099 dev_info(adev->dev, "pci_channel_io_normal: state(%d)!!\n", state); 7100 return PCI_ERS_RESULT_CAN_RECOVER; 7101 case pci_channel_io_frozen: 7102 /* Fatal error, prepare for slot reset */ 7103 dev_info(adev->dev, "pci_channel_io_frozen: state(%d)!!\n", state); 7104 if (hive) { 7105 /* Hive devices should be able to support FW based 7106 * link reset on other devices, if not return. 7107 */ 7108 if (!amdgpu_dpm_is_link_reset_supported(adev)) { 7109 dev_warn(adev->dev, 7110 "No support for XGMI hive yet...\n"); 7111 return PCI_ERS_RESULT_DISCONNECT; 7112 } 7113 /* Set dpc status only if device is part of hive 7114 * Non-hive devices should be able to recover after 7115 * link reset. 7116 */ 7117 amdgpu_reset_set_dpc_status(adev, true); 7118 7119 mutex_lock(&hive->hive_lock); 7120 } 7121 memset(&reset_context, 0, sizeof(reset_context)); 7122 INIT_LIST_HEAD(&device_list); 7123 7124 amdgpu_device_recovery_prepare(adev, &device_list, hive); 7125 amdgpu_device_recovery_get_reset_lock(adev, &device_list); 7126 amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list, 7127 hive, false); 7128 if (hive) 7129 mutex_unlock(&hive->hive_lock); 7130 return PCI_ERS_RESULT_NEED_RESET; 7131 case pci_channel_io_perm_failure: 7132 /* Permanent error, prepare for device removal */ 7133 dev_info(adev->dev, "pci_channel_io_perm_failure: state(%d)!!\n", state); 7134 return PCI_ERS_RESULT_DISCONNECT; 7135 } 7136 7137 return PCI_ERS_RESULT_NEED_RESET; 7138 } 7139 7140 /** 7141 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 7142 * @pdev: pointer to PCI device 7143 */ 7144 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 7145 { 7146 struct drm_device *dev = pci_get_drvdata(pdev); 7147 struct amdgpu_device *adev = drm_to_adev(dev); 7148 7149 dev_info(adev->dev, "PCI error: mmio enabled callback!!\n"); 7150 7151 /* TODO - dump whatever for debugging purposes */ 7152 7153 /* This called only if amdgpu_pci_error_detected returns 7154 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 7155 * works, no need to reset slot. 7156 */ 7157 7158 return PCI_ERS_RESULT_RECOVERED; 7159 } 7160 7161 /** 7162 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 7163 * @pdev: PCI device struct 7164 * 7165 * Description: This routine is called by the pci error recovery 7166 * code after the PCI slot has been reset, just before we 7167 * should resume normal operations. 
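 * Restores the cached PCIe switch and config space state, waits for the
 * ASIC to respond again and then runs a full ASIC reset/reinit for this
 * device (or, for XGMI, for every device in the hive).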
7168 */ 7169 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 7170 { 7171 struct drm_device *dev = pci_get_drvdata(pdev); 7172 struct amdgpu_device *adev = drm_to_adev(dev); 7173 struct amdgpu_reset_context reset_context; 7174 struct amdgpu_device *tmp_adev; 7175 struct amdgpu_hive_info *hive; 7176 struct list_head device_list; 7177 struct pci_dev *link_dev; 7178 int r = 0, i, timeout; 7179 u32 memsize; 7180 u16 status; 7181 7182 dev_info(adev->dev, "PCI error: slot reset callback!!\n"); 7183 7184 memset(&reset_context, 0, sizeof(reset_context)); 7185 7186 if (adev->pcie_reset_ctx.swus) 7187 link_dev = adev->pcie_reset_ctx.swus; 7188 else 7189 link_dev = adev->pdev; 7190 /* wait for asic to come out of reset, timeout = 10s */ 7191 timeout = 10000; 7192 do { 7193 usleep_range(10000, 10500); 7194 r = pci_read_config_word(link_dev, PCI_VENDOR_ID, &status); 7195 timeout -= 10; 7196 } while (timeout > 0 && (status != PCI_VENDOR_ID_ATI) && 7197 (status != PCI_VENDOR_ID_AMD)); 7198 7199 if ((status != PCI_VENDOR_ID_ATI) && (status != PCI_VENDOR_ID_AMD)) { 7200 r = -ETIME; 7201 goto out; 7202 } 7203 7204 amdgpu_device_load_switch_state(adev); 7205 /* Restore PCI confspace */ 7206 amdgpu_device_load_pci_state(pdev); 7207 7208 /* confirm ASIC came out of reset */ 7209 for (i = 0; i < adev->usec_timeout; i++) { 7210 memsize = amdgpu_asic_get_config_memsize(adev); 7211 7212 if (memsize != 0xffffffff) 7213 break; 7214 udelay(1); 7215 } 7216 if (memsize == 0xffffffff) { 7217 r = -ETIME; 7218 goto out; 7219 } 7220 7221 reset_context.method = AMD_RESET_METHOD_NONE; 7222 reset_context.reset_req_dev = adev; 7223 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 7224 set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags); 7225 INIT_LIST_HEAD(&device_list); 7226 7227 hive = amdgpu_get_xgmi_hive(adev); 7228 if (hive) { 7229 mutex_lock(&hive->hive_lock); 7230 reset_context.hive = hive; 7231 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 7232 tmp_adev->pcie_reset_ctx.in_link_reset = true; 7233 list_add_tail(&tmp_adev->reset_list, &device_list); 7234 } 7235 } else { 7236 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 7237 list_add_tail(&adev->reset_list, &device_list); 7238 } 7239 7240 r = amdgpu_device_asic_reset(adev, &device_list, &reset_context); 7241 out: 7242 if (!r) { 7243 if (amdgpu_device_cache_pci_state(adev->pdev)) 7244 pci_restore_state(adev->pdev); 7245 dev_info(adev->dev, "PCIe error recovery succeeded\n"); 7246 } else { 7247 dev_err(adev->dev, "PCIe error recovery failed, err:%d\n", r); 7248 if (hive) { 7249 list_for_each_entry(tmp_adev, &device_list, reset_list) 7250 amdgpu_device_unset_mp1_state(tmp_adev); 7251 } 7252 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 7253 } 7254 7255 if (hive) { 7256 mutex_unlock(&hive->hive_lock); 7257 amdgpu_put_xgmi_hive(hive); 7258 } 7259 7260 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 7261 } 7262 7263 /** 7264 * amdgpu_pci_resume() - resume normal ops after PCI reset 7265 * @pdev: pointer to PCI device 7266 * 7267 * Called when the error recovery driver tells us that its 7268 * OK to resume normal operation. 
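 * Only acts if the previously reported channel state was
 * pci_channel_io_frozen: restarts the schedulers, resumes KFD and display
 * audio and drops the reset lock taken in amdgpu_pci_error_detected().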
7269 */ 7270 void amdgpu_pci_resume(struct pci_dev *pdev) 7271 { 7272 struct drm_device *dev = pci_get_drvdata(pdev); 7273 struct amdgpu_device *adev = drm_to_adev(dev); 7274 struct list_head device_list; 7275 struct amdgpu_hive_info *hive = NULL; 7276 struct amdgpu_device *tmp_adev = NULL; 7277 7278 dev_info(adev->dev, "PCI error: resume callback!!\n"); 7279 7280 /* Only continue execution for the case of pci_channel_io_frozen */ 7281 if (adev->pci_channel_state != pci_channel_io_frozen) 7282 return; 7283 7284 INIT_LIST_HEAD(&device_list); 7285 7286 hive = amdgpu_get_xgmi_hive(adev); 7287 if (hive) { 7288 mutex_lock(&hive->hive_lock); 7289 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 7290 tmp_adev->pcie_reset_ctx.in_link_reset = false; 7291 list_add_tail(&tmp_adev->reset_list, &device_list); 7292 } 7293 } else 7294 list_add_tail(&adev->reset_list, &device_list); 7295 7296 amdgpu_device_sched_resume(&device_list, NULL, NULL); 7297 amdgpu_device_gpu_resume(adev, &device_list, false); 7298 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 7299 7300 if (hive) { 7301 mutex_unlock(&hive->hive_lock); 7302 amdgpu_put_xgmi_hive(hive); 7303 } 7304 } 7305 7306 static void amdgpu_device_cache_switch_state(struct amdgpu_device *adev) 7307 { 7308 struct pci_dev *swus, *swds; 7309 int r; 7310 7311 swds = pci_upstream_bridge(adev->pdev); 7312 if (!swds || swds->vendor != PCI_VENDOR_ID_ATI || 7313 pci_pcie_type(swds) != PCI_EXP_TYPE_DOWNSTREAM) 7314 return; 7315 swus = pci_upstream_bridge(swds); 7316 if (!swus || 7317 (swus->vendor != PCI_VENDOR_ID_ATI && 7318 swus->vendor != PCI_VENDOR_ID_AMD) || 7319 pci_pcie_type(swus) != PCI_EXP_TYPE_UPSTREAM) 7320 return; 7321 7322 /* If already saved, return */ 7323 if (adev->pcie_reset_ctx.swus) 7324 return; 7325 /* Upstream bridge is ATI, assume it's SWUS/DS architecture */ 7326 r = pci_save_state(swds); 7327 if (r) 7328 return; 7329 adev->pcie_reset_ctx.swds_pcistate = pci_store_saved_state(swds); 7330 7331 r = pci_save_state(swus); 7332 if (r) 7333 return; 7334 adev->pcie_reset_ctx.swus_pcistate = pci_store_saved_state(swus); 7335 7336 adev->pcie_reset_ctx.swus = swus; 7337 } 7338 7339 static void amdgpu_device_load_switch_state(struct amdgpu_device *adev) 7340 { 7341 struct pci_dev *pdev; 7342 int r; 7343 7344 if (!adev->pcie_reset_ctx.swds_pcistate || 7345 !adev->pcie_reset_ctx.swus_pcistate) 7346 return; 7347 7348 pdev = adev->pcie_reset_ctx.swus; 7349 r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swus_pcistate); 7350 if (!r) { 7351 pci_restore_state(pdev); 7352 } else { 7353 dev_warn(adev->dev, "Failed to load SWUS state, err:%d\n", r); 7354 return; 7355 } 7356 7357 pdev = pci_upstream_bridge(adev->pdev); 7358 r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swds_pcistate); 7359 if (!r) 7360 pci_restore_state(pdev); 7361 else 7362 dev_warn(adev->dev, "Failed to load SWDS state, err:%d\n", r); 7363 } 7364 7365 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 7366 { 7367 struct drm_device *dev = pci_get_drvdata(pdev); 7368 struct amdgpu_device *adev = drm_to_adev(dev); 7369 int r; 7370 7371 if (amdgpu_sriov_vf(adev)) 7372 return false; 7373 7374 r = pci_save_state(pdev); 7375 if (!r) { 7376 kfree(adev->pci_state); 7377 7378 adev->pci_state = pci_store_saved_state(pdev); 7379 7380 if (!adev->pci_state) { 7381 dev_err(adev->dev, "Failed to store PCI saved state"); 7382 return false; 7383 } 7384 } else { 7385 dev_warn(adev->dev, "Failed to save PCI state, err:%d\n", r); 7386 return false; 7387 } 7388 7389 
amdgpu_device_cache_switch_state(adev); 7390 7391 return true; 7392 } 7393 7394 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
7395 { 7396 struct drm_device *dev = pci_get_drvdata(pdev); 7397 struct amdgpu_device *adev = drm_to_adev(dev); 7398 int r; 7399 7400 if (!adev->pci_state) 7401 return false; 7402
7403 r = pci_load_saved_state(pdev, adev->pci_state); 7404 7405 if (!r) { 7406 pci_restore_state(pdev); 7407 } else { 7408 dev_warn(adev->dev, "Failed to load PCI state, err:%d\n", r); 7409 return false; 7410 } 7411 7412 return true; 7413 } 7414
7415 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 7416 struct amdgpu_ring *ring)
7417 { 7418 #ifdef CONFIG_X86_64 7419 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 7420 return; 7421 #endif 7422 if (adev->gmc.xgmi.connected_to_cpu) 7423 return; 7424
7425 if (ring && ring->funcs->emit_hdp_flush) { 7426 amdgpu_ring_emit_hdp_flush(ring); 7427 return; 7428 } 7429 7430 if (!ring && amdgpu_sriov_runtime(adev)) { 7431 if (!amdgpu_kiq_hdp_flush(adev)) 7432 return; 7433 } 7434 7435 amdgpu_hdp_flush(adev, ring); 7436 } 7437
7438 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 7439 struct amdgpu_ring *ring)
7440 { 7441 #ifdef CONFIG_X86_64 7442 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 7443 return; 7444 #endif 7445 if (adev->gmc.xgmi.connected_to_cpu) 7446 return; 7447 7448 amdgpu_hdp_invalidate(adev, ring); 7449 } 7450
7451 int amdgpu_in_reset(struct amdgpu_device *adev) 7452 { 7453 return atomic_read(&adev->reset_domain->in_gpu_reset); 7454 } 7455
7456 /** 7457 * amdgpu_device_halt() - bring hardware to some kind of halt state 7458 * 7459 * @adev: amdgpu_device pointer 7460 *
7461 * Bring hardware to some kind of halt state so that no one can touch it 7462 * any more. It helps to maintain the error context when an error occurs.
7463 * Compared to a simple hang, the system stays stable at least for SSH 7464 * access. Then it should be trivial to inspect the hardware state and 7465 * see what's going on. Implemented as follows: 7466 *
7467 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc.), 7468 * clears all CPU mappings to the device and disallows remappings through page faults
7469 * 2. amdgpu_irq_disable_all() disables all interrupts 7470 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
7471 * 4. set adev->no_hw_access to avoid potential crashes after step 5 7472 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings 7473 * 6.
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 *    flush any in-flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	amdgpu_xcp_dev_unplug(adev);
	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				 u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				  u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_gang - return a reference to the current gang
 * @adev: amdgpu_device pointer
 *
 * Returns: A new reference to the current gang leader.
 */
struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev)
{
	struct dma_fence *fence;

	rcu_read_lock();
	fence = dma_fence_get_rcu_safe(&adev->gang_submit);
	rcu_read_unlock();
	return fence;
}

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	dma_fence_get(gang);
	do {
		dma_fence_put(old);
		old = amdgpu_device_get_gang(adev);
		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old)) {
			dma_fence_put(gang);
			return old;
		}

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	/*
	 * Drop it once for the exchanged reference in adev and once for the
	 * thread local reference acquired in amdgpu_device_get_gang().
	 */
	dma_fence_put(old);
	dma_fence_put(old);
	return NULL;
}

/**
 * amdgpu_device_enforce_isolation - enforce HW isolation
 * @adev: the amdgpu device pointer
 * @ring: the HW ring the job is supposed to run on
 * @job: the job which is about to be pushed to the HW ring
 *
 * Makes sure that only one client at a time can use the GFX block.
 * Returns: The dependency to wait on before the job can be pushed to the HW.
 * The function is called multiple times until NULL is returned.
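 *
 * Note: the isolation state (spearhead fence, owner and the prev/active
 * fence containers) is only updated while holding
 * adev->enforce_isolation_mutex.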
 */
struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev,
						  struct amdgpu_ring *ring,
						  struct amdgpu_job *job)
{
	struct amdgpu_isolation *isolation = &adev->isolation[ring->xcp_id];
	struct drm_sched_fence *f = job->base.s_fence;
	struct dma_fence *dep;
	void *owner;
	int r;

	/*
	 * For now enforce isolation only for the GFX block since we only need
	 * the cleaner shader on those rings.
	 */
	if (ring->funcs->type != AMDGPU_RING_TYPE_GFX &&
	    ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE)
		return NULL;

	/*
	 * All submissions where enforce isolation is false are handled as if
	 * they come from a single client. Use ~0l as the owner to distinguish
	 * them from kernel submissions where the owner is NULL.
	 */
	owner = job->enforce_isolation ? f->owner : (void *)~0l;

	mutex_lock(&adev->enforce_isolation_mutex);

	/*
	 * The "spearhead" submission is the first one which changes the
	 * ownership to its client. We always need to wait for it to be
	 * pushed to the HW before proceeding with anything.
	 */
	if (&f->scheduled != isolation->spearhead &&
	    !dma_fence_is_signaled(isolation->spearhead)) {
		dep = isolation->spearhead;
		goto out_grab_ref;
	}

	if (isolation->owner != owner) {

		/*
		 * Wait for any gang to be assembled before switching to a
		 * different owner, otherwise we could deadlock the
		 * submissions.
		 */
		if (!job->gang_submit) {
			dep = amdgpu_device_get_gang(adev);
			if (!dma_fence_is_signaled(dep))
				goto out_return_dep;
			dma_fence_put(dep);
		}

		dma_fence_put(isolation->spearhead);
		isolation->spearhead = dma_fence_get(&f->scheduled);
		amdgpu_sync_move(&isolation->active, &isolation->prev);
		trace_amdgpu_isolation(isolation->owner, owner);
		isolation->owner = owner;
	}

	/*
	 * Specifying the ring here helps to pipeline submissions even when
	 * isolation is enabled. If that is not desired for testing, NULL can
	 * be used instead of the ring to enforce a CPU round trip while
	 * switching between clients.
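	 * The dependency peeked below comes from the previous owner's
	 * submissions tracked in isolation->prev.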
	 */
	dep = amdgpu_sync_peek_fence(&isolation->prev, ring);
	r = amdgpu_sync_fence(&isolation->active, &f->finished, GFP_NOWAIT);
	if (r)
		dev_warn(adev->dev, "OOM tracking isolation\n");

out_grab_ref:
	dma_fence_get(dep);
out_return_dep:
	mutex_unlock(&adev->enforce_isolation_mutex);
	return dep;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
				    uint32_t inst, uint32_t reg_addr, char reg_name[],
				    uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			dev_warn(
				adev->dev,
				"Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				inst, reg_name, (uint32_t)expected_value,
				(uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}

ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
{
	ssize_t size = 0;

	if (!ring || !ring->adev)
		return size;

	if (amdgpu_device_should_recover_gpu(ring->adev))
		size |= AMDGPU_RESET_TYPE_FULL;

	if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
	    !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery)
		size |= AMDGPU_RESET_TYPE_SOFT_RESET;

	return size;
}

ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
{
	ssize_t size = 0;

	if (supported_reset == 0) {
		size += sysfs_emit_at(buf, size, "unsupported");
		size += sysfs_emit_at(buf, size, "\n");
		return size;
	}

	if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
		size += sysfs_emit_at(buf, size, "soft ");

	if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
		size += sysfs_emit_at(buf, size, "queue ");

	if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)
		size += sysfs_emit_at(buf, size, "pipe ");

	if (supported_reset & AMDGPU_RESET_TYPE_FULL)
		size += sysfs_emit_at(buf, size, "full ");

	size += sysfs_emit_at(buf, size, "\n");
	return size;
}

void amdgpu_device_set_uid(struct amdgpu_uid *uid_info,
			   enum amdgpu_uid_type type, uint8_t inst,
			   uint64_t uid)
{
	if (!uid_info)
		return;

	if (type >= AMDGPU_UID_TYPE_MAX) {
		dev_err_once(uid_info->adev->dev, "Invalid UID type %d\n",
			     type);
		return;
	}

	if (inst >= AMDGPU_UID_INST_MAX) {
		dev_err_once(uid_info->adev->dev, "Invalid UID instance %d\n",
			     inst);
		return;
	}

	if (uid_info->uid[type][inst] != 0) {
		dev_warn_once(
			uid_info->adev->dev,
			"Overwriting existing UID %llu for type %d instance %d\n",
			uid_info->uid[type][inst], type, inst);
	}

	uid_info->uid[type][inst] = uid;
}

u64 amdgpu_device_get_uid(struct amdgpu_uid *uid_info,
			  enum amdgpu_uid_type type, uint8_t inst)
{
	if (!uid_info)
		return 0;

	if (type >= AMDGPU_UID_TYPE_MAX) {
		dev_err_once(uid_info->adev->dev, "Invalid UID type %d\n",
			     type);
		return 0;
	}

	if (inst >= AMDGPU_UID_INST_MAX) {
		dev_err_once(uid_info->adev->dev, "Invalid UID instance %d\n",
			     inst);
		return 0;
	}

	return uid_info->uid[type][inst];
}