/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */

#include <linux/aperture.h>
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/pci-p2pdma.h>
#include <linux/apple-gmux.h>

#include <drm/drm_atomic_helper.h>
#include <drm/drm_client_event.h>
#include <drm/drm_crtc_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/device.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_ras_mgr.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"
#include "amdgpu_virt.h"
#include "amdgpu_dev_coredump.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

#if IS_ENABLED(CONFIG_X86)
#include <asm/intel-family.h>
#include <asm/cpu_device_id.h>
#endif

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/cyan_skillfish_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
#define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
#define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
#define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)

#define AMDGPU_VBIOS_SKIP (1U << 0)
#define AMDGPU_VBIOS_OPTIONAL (1U << 1)

static const struct drm_driver amdgpu_kms_driver;

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"ALDEBARAN",
	"NAVI10",
	"CYAN_SKILLFISH",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"VANGOGH",
	"DIMGREY_CAVEFISH",
	"BEIGE_GOBY",
	"YELLOW_CARP",
	"IP DISCOVERY",
	"LAST",
};

#define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0)
/*
 * Default init level where all blocks are expected to be initialized. This is
 * the level of initialization expected by default and also after a full reset
 * of the device.
 */
struct amdgpu_init_level amdgpu_init_default = {
	.level = AMDGPU_INIT_LEVEL_DEFAULT,
	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

struct amdgpu_init_level amdgpu_init_recovery = {
	.level = AMDGPU_INIT_LEVEL_RESET_RECOVERY,
	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

/*
 * Minimal blocks needed to be initialized before an XGMI hive can be reset.
 * This is used for cases like reset on initialization where the entire hive
 * needs to be reset before first use.
 */
struct amdgpu_init_level amdgpu_init_minimal_xgmi = {
	.level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI,
	.hwini_ip_block_mask =
		BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) |
		BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) |
		BIT(AMD_IP_BLOCK_TYPE_PSP)
};

static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev);
static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev);
static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev);

static void amdgpu_device_load_switch_state(struct amdgpu_device *adev);

static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev,
					     enum amd_ip_block_type block)
{
	return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0;
}

void amdgpu_set_init_level(struct amdgpu_device *adev,
			   enum amdgpu_init_lvl_id lvl)
{
	switch (lvl) {
	case AMDGPU_INIT_LEVEL_MINIMAL_XGMI:
		adev->init_lvl = &amdgpu_init_minimal_xgmi;
		break;
	case AMDGPU_INIT_LEVEL_RESET_RECOVERY:
		adev->init_lvl = &amdgpu_init_recovery;
		break;
	case AMDGPU_INIT_LEVEL_DEFAULT:
		fallthrough;
	default:
		adev->init_lvl = &amdgpu_init_default;
		break;
	}
}

static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
				     void *data);

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
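 *
 * Reading the file is enough to query the counter, for example (the exact
 * PCI BDF in the sysfs path is only illustrative and depends on the system):
 *
 *   cat /sys/bus/pci/devices/0000:03:00.0/pcie_replay_count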
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return sysfs_emit(buf, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, 0444,
		   amdgpu_device_get_pcie_replay_count, NULL);

static int amdgpu_device_attr_sysfs_init(struct amdgpu_device *adev)
{
	int ret = 0;

	if (amdgpu_nbio_is_replay_cnt_supported(adev))
		ret = sysfs_create_file(&adev->dev->kobj,
					&dev_attr_pcie_replay_count.attr);

	return ret;
}

static void amdgpu_device_attr_sysfs_fini(struct amdgpu_device *adev)
{
	if (amdgpu_nbio_is_replay_cnt_supported(adev))
		sysfs_remove_file(&adev->dev->kobj,
				  &dev_attr_pcie_replay_count.attr);
}

static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
					  const struct bin_attribute *attr, char *buf,
					  loff_t ppos, size_t count)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	ssize_t bytes_read;

	switch (ppos) {
	case AMDGPU_SYS_REG_STATE_XGMI:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_WAFL:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_PCIE:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR_1:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count);
		break;
	default:
		return -EINVAL;
	}

	return bytes_read;
}

static const BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL,
		      AMDGPU_SYS_REG_STATE_END);

int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev)
{
	int ret;

	if (!amdgpu_asic_get_reg_state_supported(adev))
		return 0;

	ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state);

	return ret;
}

void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
{
	if (!amdgpu_asic_get_reg_state_supported(adev))
		return;
	sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
}

int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block)
{
	int r;

	if (ip_block->version->funcs->suspend) {
		r = ip_block->version->funcs->suspend(ip_block);
		if (r) {
			dev_err(ip_block->adev->dev,
				"suspend of IP block <%s> failed %d\n",
				ip_block->version->funcs->name, r);
			return r;
		}
	}

	ip_block->status.hw = false;
	return 0;
}

int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block)
{
	int r;

	if (ip_block->version->funcs->resume) {
		r = ip_block->version->funcs->resume(ip_block);
		if (r) {
			dev_err(ip_block->adev->dev,
				"resume of IP block <%s> failed %d\n",
				ip_block->version->funcs->name, r);
			return r;
		}
	}

	ip_block->status.hw = true;
	return 0;
}

/**
 * DOC: board_info
 *
 * The amdgpu driver provides a sysfs
 * API for providing board related information.
 * It provides the form factor information in the format
 *
 *   type : form factor
 *
 * Possible form factor values
 *
 * - "cem"     - PCIE CEM card
 * - "oam"     - Open Compute Accelerator Module
 * - "unknown" - Not known
 *
 */

static ssize_t amdgpu_device_get_board_info(struct device *dev,
					    struct device_attribute *attr,
					    char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
	const char *pkg;

	if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
		pkg_type = adev->smuio.funcs->get_pkg_type(adev);

	switch (pkg_type) {
	case AMDGPU_PKG_TYPE_CEM:
		pkg = "cem";
		break;
	case AMDGPU_PKG_TYPE_OAM:
		pkg = "oam";
		break;
	default:
		pkg = "unknown";
		break;
	}

	return sysfs_emit(buf, "%s : %s\n", "type", pkg);
}

static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);

static struct attribute *amdgpu_board_attrs[] = {
	&dev_attr_board_info.attr,
	NULL,
};

static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
					     struct attribute *attr, int n)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	if (adev->flags & AMD_IS_APU)
		return 0;

	return attr->mode;
}

static const struct attribute_group amdgpu_board_attrs_group = {
	.attrs = amdgpu_board_attrs,
	.is_visible = amdgpu_board_attrs_is_visible
};

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);

/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @adev: amdgpu device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_px(struct amdgpu_device *adev)
{
	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @adev: amdgpu device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_boco(struct amdgpu_device *adev)
{
	if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE))
		return false;

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @adev: amdgpu device pointer
 *
 * Return:
 * 1 if the device supports BACO;
 * 3 if the device supports MACO (only works if BACO is supported);
 * otherwise returns 0.
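 *
 * The value is used as a bitmask: callers such as
 * amdgpu_device_detect_runtime_pm_mode() test it against BACO_SUPPORT and
 * MACO_SUPPORT to pick between the BACO and BAMACO runtime pm modes.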
 */
int amdgpu_device_supports_baco(struct amdgpu_device *adev)
{
	return amdgpu_asic_supports_baco(adev);
}

void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
{
	int bamaco_support;

	adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
	bamaco_support = amdgpu_device_supports_baco(adev);

	switch (amdgpu_runtime_pm) {
	case 2:
		if (bamaco_support & MACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
			dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
		} else if (bamaco_support == BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Requested mode BAMACO not available, falling back to BACO\n");
		}
		break;
	case 1:
		if (bamaco_support & BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Forcing BACO for runtime pm\n");
		}
		break;
	case -1:
	case -2:
		if (amdgpu_device_supports_px(adev)) {
			/* enable PX as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
			dev_info(adev->dev, "Using ATPX for runtime pm\n");
		} else if (amdgpu_device_supports_boco(adev)) {
			/* enable boco as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
			dev_info(adev->dev, "Using BOCO for runtime pm\n");
		} else {
			if (!bamaco_support)
				goto no_runtime_pm;

			switch (adev->asic_type) {
			case CHIP_VEGA20:
			case CHIP_ARCTURUS:
				/* BACO is not supported on vega20 and arcturus */
				break;
			case CHIP_VEGA10:
				/* enable BACO as runpm mode if noretry=0 */
				if (!adev->gmc.noretry && !amdgpu_passthrough(adev))
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			default:
				/* enable BACO as runpm mode on CI+ */
				if (!amdgpu_passthrough(adev))
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			}

			if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
				if (bamaco_support & MACO_SUPPORT) {
					adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
					dev_info(adev->dev, "Using BAMACO for runtime pm\n");
				} else {
					dev_info(adev->dev, "Using BACO for runtime pm\n");
				}
			}
		}
		break;
	case 0:
		dev_info(adev->dev, "runtime pm is manually disabled\n");
		break;
	default:
		break;
	}

no_runtime_pm:
	if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
		dev_info(adev->dev, "Runtime PM not available\n");
}

/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * Smart Shift support
 *
 * @adev: amdgpu device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct amdgpu_device *adev)
{
	return (amdgpu_device_supports_boco(adev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * Returns the number of bytes transferred.
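 *
 * Only the part of the request that falls inside the CPU visible VRAM
 * aperture is handled here; amdgpu_device_vram_access() falls back to the
 * MM_INDEX/MM_DATA path for whatever remains.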
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the VRAM aperture to access VRAM first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of VRAM */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
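 *
 * Offsets inside the MMIO aperture (reg * 4 < adev->rmmio_size) are read
 * directly, or through the KIQ when running as an SR-IOV guest in runtime
 * mode; anything beyond that goes through the indirect PCIe register path
 * (adev->pcie_rreg).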
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}


/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
				uint32_t reg, uint32_t acc_flags,
				uint32_t xcc_id)
{
	uint32_t ret, rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	return ret;
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to write to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/**
 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 * @xcc_id: xcc accelerated compute core id
 *
 * This function is invoked only for debugfs register access.
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v,
			     uint32_t xcc_id)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_xcc_wreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t v,
			    uint32_t acc_flags, uint32_t xcc_id)
{
	uint32_t rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, true,
							 &rlcg_flag)) {
			amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u32 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
				    u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	if (unlikely(!adev->nbio.funcs)) {
		pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
		pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
	} else {
		pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
		pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	}

	if (reg_addr >> 32) {
		if (unlikely(!adev->nbio.funcs))
			pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
		else
			pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	} else {
		pcie_index_hi = 0;
	}

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				       pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64bits indirect
 * register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
				      u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				       pcie_index_hi * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r |= ((u64)readl(pcie_data_offset) << 32);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
				     u64 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				       pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
				       u64 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				       pcie_index_hi * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_rev_id - query device rev_id
 *
 * @adev: amdgpu_device pointer
 *
 * Return device rev_id
 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
{
	return adev->nbio.funcs->get_rev_id(adev);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	dev_err(adev->dev, "Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
{
	dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write register 0x%04X with 0x%08X\n", reg,
		v);
	BUG();
}

static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write register 0x%llX with 0x%08X\n", reg,
		v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	dev_err(adev->dev, "Invalid callback to read 64 bit register 0x%04X\n",
		reg);
	BUG();
	return 0;
}

static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
{
	dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		reg, v);
	BUG();
}

static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
		reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	dev_err(adev->dev,
		"Invalid callback to read register 0x%04X in block 0x%04X\n",
		reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	dev_err(adev->dev,
		"Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		reg, block, v);
	BUG();
}

static uint32_t amdgpu_device_get_vbios_flags(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
		return AMDGPU_VBIOS_SKIP;

	if (hweight32(adev->aid_mask) && amdgpu_passthrough(adev))
		return AMDGPU_VBIOS_OPTIONAL;

	return 0;
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	uint32_t flags;
	bool optional;
	int ret;

	amdgpu_asic_pre_asic_init(adev);
	flags = amdgpu_device_get_vbios_flags(adev);
	optional = !!(flags & (AMDGPU_VBIOS_OPTIONAL | AMDGPU_VBIOS_SKIP));

	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
		amdgpu_psp_wait_for_bootloader(adev);
		if (optional && !adev->bios)
			return 0;

		ret = amdgpu_atomfirmware_asic_init(adev, true);
		return ret;
	} else {
		if (optional && !adev->bios)
			return 0;

		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
	}

	return 0;
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with and/or masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or an error code on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long flags, offset;

	spin_lock_irqsave(&adev->wb.lock, flags);
	offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	unsigned long flags;

	wb >>= 3;
	spin_lock_irqsave(&adev->wb.lock, flags);
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
	spin_unlock_irqrestore(&adev->wb.lock, flags);
}

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
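 *
 * Resizing is skipped entirely for SR-IOV VFs, when it is disabled by the
 * amdgpu_rebar option, and on known problematic platforms (see the quirks
 * in the function body).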
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	int max_size, r;
	unsigned int i;
	u16 cmd;

	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
		return 0;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	if (!amdgpu_rebar)
		return 0;

	/* resizing on Dell G5 SE platforms causes problems with runtime pm */
	if ((amdgpu_runtime_pm != 0) &&
	    adev->pdev->vendor == PCI_VENDOR_ID_ATI &&
	    adev->pdev->device == 0x731f &&
	    adev->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL)
		return 0;

	/* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
	if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
		dev_warn(
			adev->dev,
			"System can't access extended configuration space, please check!!\n");

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	max_size = pci_rebar_get_max_size(adev->pdev, 0);
	if (max_size < 0)
		return 0;
	rbar_size = min(max_size, rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Tear down doorbell as resizing will release BARs */
	amdgpu_doorbell_fini(adev);

	r = pci_resize_resource(adev->pdev, 0, rbar_size,
				(adev->asic_type >= CHIP_BONAIRE) ? 1 << 5
								  : 1 << 2);
	if (r == -ENOSPC)
		dev_info(adev->dev,
			 "Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		dev_err(adev->dev, "Problem resizing BAR0 (%d).", r);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

/*
 * GPU helper functions.
 */
/**
 * amdgpu_device_need_post - check if the hw needs post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if post is needed, false if not.
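 *
 * Posting is also skipped for SR-IOV VFs and when the VBIOS is flagged as
 * skippable, or optional and absent, by amdgpu_device_get_vbios_flags().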
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg, flags;

	if (amdgpu_sriov_vf(adev))
		return false;

	flags = amdgpu_device_get_vbios_flags(adev);
	if (flags & AMDGPU_VBIOS_SKIP)
		return false;
	if ((flags & AMDGPU_VBIOS_OPTIONAL) && !adev->bios)
		return false;

	if (amdgpu_passthrough(adev)) {
		/* For FIJI: in the whole-GPU pass-through virtualization case, after a VM
		 * reboot some old SMC firmware still needs the driver to do vPost or the
		 * GPU hangs. SMC firmware versions above 22.15 don't have this flaw, so
		 * force vPost for SMC versions below 22.15.
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			release_firmware(adev->pm.fw);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/*
 * Check whether seamless boot is supported.
 *
 * So far we only support seamless boot on DCE 3.0 or later.
 * If users report that it works on older ASICS as well, we may
 * loosen this.
 */
bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
{
	switch (amdgpu_seamless) {
	case -1:
		break;
	case 1:
		return true;
	case 0:
		return false;
	default:
		dev_err(adev->dev, "Invalid value for amdgpu.seamless: %d\n",
			amdgpu_seamless);
		return false;
	}

	if (!(adev->flags & AMD_IS_APU))
		return false;

	if (adev->mman.keep_stolen_vga_memory)
		return false;

	return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
}

/*
 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
 * don't support dynamic speed switching. Until we have confirmation from Intel
 * that a specific host supports it, it's safer that we keep it disabled for all.
 *
 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
 */
static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	/* eGPUs change speeds based on USB4 fabric conditions */
	if (dev_is_removable(adev->dev))
		return true;

	if (c->x86_vendor == X86_VENDOR_INTEL)
		return false;
#endif
	return true;
}

static bool amdgpu_device_aspm_support_quirk(struct amdgpu_device *adev)
{
	/* Enabling ASPM causes random hangs on Tahiti and Oland on Zen4.
1889 * It's unclear if this is a platform-specific or GPU-specific issue. 1890 * Disable ASPM on SI for the time being. 1891 */ 1892 if (adev->family == AMDGPU_FAMILY_SI) 1893 return true; 1894 1895 #if IS_ENABLED(CONFIG_X86) 1896 struct cpuinfo_x86 *c = &cpu_data(0); 1897 1898 if (!(amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 0) || 1899 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 1))) 1900 return false; 1901 1902 if (c->x86 == 6 && 1903 adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5) { 1904 switch (c->x86_model) { 1905 case VFM_MODEL(INTEL_ALDERLAKE): 1906 case VFM_MODEL(INTEL_ALDERLAKE_L): 1907 case VFM_MODEL(INTEL_RAPTORLAKE): 1908 case VFM_MODEL(INTEL_RAPTORLAKE_P): 1909 case VFM_MODEL(INTEL_RAPTORLAKE_S): 1910 return true; 1911 default: 1912 return false; 1913 } 1914 } else { 1915 return false; 1916 } 1917 #else 1918 return false; 1919 #endif 1920 } 1921 1922 /** 1923 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1924 * 1925 * @adev: amdgpu_device pointer 1926 * 1927 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1928 * be set for this device. 1929 * 1930 * Returns true if it should be used or false if not. 1931 */ 1932 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1933 { 1934 switch (amdgpu_aspm) { 1935 case -1: 1936 break; 1937 case 0: 1938 return false; 1939 case 1: 1940 return true; 1941 default: 1942 return false; 1943 } 1944 if (adev->flags & AMD_IS_APU) 1945 return false; 1946 if (amdgpu_device_aspm_support_quirk(adev)) 1947 return false; 1948 return pcie_aspm_enabled(adev->pdev); 1949 } 1950 1951 /* if we get transitioned to only one device, take VGA back */ 1952 /** 1953 * amdgpu_device_vga_set_decode - enable/disable vga decode 1954 * 1955 * @pdev: PCI device pointer 1956 * @state: enable/disable vga decode 1957 * 1958 * Enable/disable vga decode (all asics). 1959 * Returns VGA resource flags. 1960 */ 1961 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1962 bool state) 1963 { 1964 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1965 1966 amdgpu_asic_set_vga_state(adev, state); 1967 if (state) 1968 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1969 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1970 else 1971 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1972 } 1973 1974 /** 1975 * amdgpu_device_check_block_size - validate the vm block size 1976 * 1977 * @adev: amdgpu_device pointer 1978 * 1979 * Validates the vm block size specified via module parameter. 1980 * The vm block size defines number of bits in page table versus page directory, 1981 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1982 * page table and the remaining bits are in the page directory. 1983 */ 1984 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1985 { 1986 /* defines number of bits in page table versus page directory, 1987 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1988 * page table and the remaining bits are in the page directory 1989 */ 1990 if (amdgpu_vm_block_size == -1) 1991 return; 1992 1993 if (amdgpu_vm_block_size < 9) { 1994 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1995 amdgpu_vm_block_size); 1996 amdgpu_vm_block_size = -1; 1997 } 1998 } 1999 2000 /** 2001 * amdgpu_device_check_vm_size - validate the vm size 2002 * 2003 * @adev: amdgpu_device pointer 2004 * 2005 * Validates the vm size in GB specified via module parameter. 
2006 * The VM size is the size of the GPU virtual memory space in GB. 2007 */ 2008 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 2009 { 2010 /* no need to check the default value */ 2011 if (amdgpu_vm_size == -1) 2012 return; 2013 2014 if (amdgpu_vm_size < 1) { 2015 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 2016 amdgpu_vm_size); 2017 amdgpu_vm_size = -1; 2018 } 2019 } 2020 2021 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 2022 { 2023 struct sysinfo si; 2024 bool is_os_64 = (sizeof(void *) == 8); 2025 uint64_t total_memory; 2026 uint64_t dram_size_seven_GB = 0x1B8000000; 2027 uint64_t dram_size_three_GB = 0xB8000000; 2028 2029 if (amdgpu_smu_memory_pool_size == 0) 2030 return; 2031 2032 if (!is_os_64) { 2033 dev_warn(adev->dev, "Not 64-bit OS, feature not supported\n"); 2034 goto def_value; 2035 } 2036 si_meminfo(&si); 2037 total_memory = (uint64_t)si.totalram * si.mem_unit; 2038 2039 if ((amdgpu_smu_memory_pool_size == 1) || 2040 (amdgpu_smu_memory_pool_size == 2)) { 2041 if (total_memory < dram_size_three_GB) 2042 goto def_value1; 2043 } else if ((amdgpu_smu_memory_pool_size == 4) || 2044 (amdgpu_smu_memory_pool_size == 8)) { 2045 if (total_memory < dram_size_seven_GB) 2046 goto def_value1; 2047 } else { 2048 dev_warn(adev->dev, "Smu memory pool size not supported\n"); 2049 goto def_value; 2050 } 2051 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 2052 2053 return; 2054 2055 def_value1: 2056 dev_warn(adev->dev, "No enough system memory\n"); 2057 def_value: 2058 adev->pm.smu_prv_buffer_size = 0; 2059 } 2060 2061 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 2062 { 2063 if (!(adev->flags & AMD_IS_APU) || 2064 adev->asic_type < CHIP_RAVEN) 2065 return 0; 2066 2067 switch (adev->asic_type) { 2068 case CHIP_RAVEN: 2069 if (adev->pdev->device == 0x15dd) 2070 adev->apu_flags |= AMD_APU_IS_RAVEN; 2071 if (adev->pdev->device == 0x15d8) 2072 adev->apu_flags |= AMD_APU_IS_PICASSO; 2073 break; 2074 case CHIP_RENOIR: 2075 if ((adev->pdev->device == 0x1636) || 2076 (adev->pdev->device == 0x164c)) 2077 adev->apu_flags |= AMD_APU_IS_RENOIR; 2078 else 2079 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 2080 break; 2081 case CHIP_VANGOGH: 2082 adev->apu_flags |= AMD_APU_IS_VANGOGH; 2083 break; 2084 case CHIP_YELLOW_CARP: 2085 break; 2086 case CHIP_CYAN_SKILLFISH: 2087 if ((adev->pdev->device == 0x13FE) || 2088 (adev->pdev->device == 0x143F)) 2089 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 2090 break; 2091 default: 2092 break; 2093 } 2094 2095 return 0; 2096 } 2097 2098 /** 2099 * amdgpu_device_check_arguments - validate module params 2100 * 2101 * @adev: amdgpu_device pointer 2102 * 2103 * Validates certain module parameters and updates 2104 * the associated values used by the driver (all asics). 
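 *
 * For example (illustrative of the clamping done below): booting with
 * amdgpu.sched_jobs=6 rounds the value up to 8 (the next power of two),
 * while amdgpu.vm_size=0 is reset to -1 so the driver falls back to its
 * default VM size.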
2105 */ 2106 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 2107 { 2108 int i; 2109 2110 if (amdgpu_sched_jobs < 4) { 2111 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 2112 amdgpu_sched_jobs); 2113 amdgpu_sched_jobs = 4; 2114 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 2115 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 2116 amdgpu_sched_jobs); 2117 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 2118 } 2119 2120 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 2121 /* gart size must be greater or equal to 32M */ 2122 dev_warn(adev->dev, "gart size (%d) too small\n", 2123 amdgpu_gart_size); 2124 amdgpu_gart_size = -1; 2125 } 2126 2127 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 2128 /* gtt size must be greater or equal to 32M */ 2129 dev_warn(adev->dev, "gtt size (%d) too small\n", 2130 amdgpu_gtt_size); 2131 amdgpu_gtt_size = -1; 2132 } 2133 2134 /* valid range is between 4 and 9 inclusive */ 2135 if (amdgpu_vm_fragment_size != -1 && 2136 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 2137 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 2138 amdgpu_vm_fragment_size = -1; 2139 } 2140 2141 if (amdgpu_sched_hw_submission < 2) { 2142 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 2143 amdgpu_sched_hw_submission); 2144 amdgpu_sched_hw_submission = 2; 2145 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 2146 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 2147 amdgpu_sched_hw_submission); 2148 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 2149 } 2150 2151 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 2152 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 2153 amdgpu_reset_method = -1; 2154 } 2155 2156 amdgpu_device_check_smu_prv_buffer_size(adev); 2157 2158 amdgpu_device_check_vm_size(adev); 2159 2160 amdgpu_device_check_block_size(adev); 2161 2162 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 2163 2164 for (i = 0; i < MAX_XCP; i++) { 2165 switch (amdgpu_enforce_isolation) { 2166 case -1: 2167 case 0: 2168 default: 2169 /* disable */ 2170 adev->enforce_isolation[i] = AMDGPU_ENFORCE_ISOLATION_DISABLE; 2171 break; 2172 case 1: 2173 /* enable */ 2174 adev->enforce_isolation[i] = 2175 AMDGPU_ENFORCE_ISOLATION_ENABLE; 2176 break; 2177 case 2: 2178 /* enable legacy mode */ 2179 adev->enforce_isolation[i] = 2180 AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY; 2181 break; 2182 case 3: 2183 /* enable only process isolation without submitting cleaner shader */ 2184 adev->enforce_isolation[i] = 2185 AMDGPU_ENFORCE_ISOLATION_NO_CLEANER_SHADER; 2186 break; 2187 } 2188 } 2189 2190 return 0; 2191 } 2192 2193 /** 2194 * amdgpu_switcheroo_set_state - set switcheroo state 2195 * 2196 * @pdev: pci dev pointer 2197 * @state: vga_switcheroo state 2198 * 2199 * Callback for the switcheroo driver. Suspends or resumes 2200 * the asics before or after it is powered up using ACPI methods. 
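 *
 * This callback is reached through &amdgpu_switcheroo_ops; registration
 * happens elsewhere in the driver, roughly as in this illustrative sketch
 * (where "px" stands for the PX/runtime-PM capability of the device):
 *
 *   vga_switcheroo_register_client(adev->pdev, &amdgpu_switcheroo_ops, px);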
2201 */ 2202 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 2203 enum vga_switcheroo_state state) 2204 { 2205 struct drm_device *dev = pci_get_drvdata(pdev); 2206 int r; 2207 2208 if (amdgpu_device_supports_px(drm_to_adev(dev)) && 2209 state == VGA_SWITCHEROO_OFF) 2210 return; 2211 2212 if (state == VGA_SWITCHEROO_ON) { 2213 pr_info("switched on\n"); 2214 /* don't suspend or resume card normally */ 2215 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2216 2217 pci_set_power_state(pdev, PCI_D0); 2218 amdgpu_device_load_pci_state(pdev); 2219 r = pci_enable_device(pdev); 2220 if (r) 2221 dev_warn(&pdev->dev, "pci_enable_device failed (%d)\n", 2222 r); 2223 amdgpu_device_resume(dev, true); 2224 2225 dev->switch_power_state = DRM_SWITCH_POWER_ON; 2226 } else { 2227 dev_info(&pdev->dev, "switched off\n"); 2228 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2229 amdgpu_device_prepare(dev); 2230 amdgpu_device_suspend(dev, true); 2231 amdgpu_device_cache_pci_state(pdev); 2232 /* Shut down the device */ 2233 pci_disable_device(pdev); 2234 pci_set_power_state(pdev, PCI_D3cold); 2235 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 2236 } 2237 } 2238 2239 /** 2240 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 2241 * 2242 * @pdev: pci dev pointer 2243 * 2244 * Callback for the switcheroo driver. Check of the switcheroo 2245 * state can be changed. 2246 * Returns true if the state can be changed, false if not. 2247 */ 2248 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 2249 { 2250 struct drm_device *dev = pci_get_drvdata(pdev); 2251 2252 /* 2253 * FIXME: open_count is protected by drm_global_mutex but that would lead to 2254 * locking inversion with the driver load path. And the access here is 2255 * completely racy anyway. So don't bother with locking for now. 2256 */ 2257 return atomic_read(&dev->open_count) == 0; 2258 } 2259 2260 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 2261 .set_gpu_state = amdgpu_switcheroo_set_state, 2262 .reprobe = NULL, 2263 .can_switch = amdgpu_switcheroo_can_switch, 2264 }; 2265 2266 /** 2267 * amdgpu_device_ip_set_clockgating_state - set the CG state 2268 * 2269 * @dev: amdgpu_device pointer 2270 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2271 * @state: clockgating state (gate or ungate) 2272 * 2273 * Sets the requested clockgating state for all instances of 2274 * the hardware IP specified. 2275 * Returns the error code from the last instance. 2276 */ 2277 int amdgpu_device_ip_set_clockgating_state(void *dev, 2278 enum amd_ip_block_type block_type, 2279 enum amd_clockgating_state state) 2280 { 2281 struct amdgpu_device *adev = dev; 2282 int i, r = 0; 2283 2284 for (i = 0; i < adev->num_ip_blocks; i++) { 2285 if (!adev->ip_blocks[i].status.valid) 2286 continue; 2287 if (adev->ip_blocks[i].version->type != block_type) 2288 continue; 2289 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 2290 continue; 2291 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 2292 &adev->ip_blocks[i], state); 2293 if (r) 2294 dev_err(adev->dev, 2295 "set_clockgating_state of IP block <%s> failed %d\n", 2296 adev->ip_blocks[i].version->funcs->name, r); 2297 } 2298 return r; 2299 } 2300 2301 /** 2302 * amdgpu_device_ip_set_powergating_state - set the PG state 2303 * 2304 * @dev: amdgpu_device pointer 2305 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
                                           enum amd_ip_block_type block_type,
                                           enum amd_powergating_state state)
{
        struct amdgpu_device *adev = dev;
        int i, r = 0;

        for (i = 0; i < adev->num_ip_blocks; i++) {
                if (!adev->ip_blocks[i].status.valid)
                        continue;
                if (adev->ip_blocks[i].version->type != block_type)
                        continue;
                if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
                        continue;
                r = adev->ip_blocks[i].version->funcs->set_powergating_state(
                        &adev->ip_blocks[i], state);
                if (r)
                        dev_err(adev->dev,
                                "set_powergating_state of IP block <%s> failed %d\n",
                                adev->ip_blocks[i].version->funcs->name, r);
        }
        return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
                                            u64 *flags)
{
        int i;

        for (i = 0; i < adev->num_ip_blocks; i++) {
                if (!adev->ip_blocks[i].status.valid)
                        continue;
                if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
                        adev->ip_blocks[i].version->funcs->get_clockgating_state(
                                &adev->ip_blocks[i], flags);
        }
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
                                   enum amd_ip_block_type block_type)
{
        int i, r;

        for (i = 0; i < adev->num_ip_blocks; i++) {
                if (!adev->ip_blocks[i].status.valid)
                        continue;
                if (adev->ip_blocks[i].version->type == block_type) {
                        if (adev->ip_blocks[i].version->funcs->wait_for_idle) {
                                r = adev->ip_blocks[i].version->funcs->wait_for_idle(
                                        &adev->ip_blocks[i]);
                                if (r)
                                        return r;
                        }
                        break;
                }
        }
        return 0;
}

/**
 * amdgpu_device_ip_is_hw - is the hardware IP enabled
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is enabled or not.
 * Returns true if the IP is enabled, false if not.
 */
bool amdgpu_device_ip_is_hw(struct amdgpu_device *adev,
                            enum amd_ip_block_type block_type)
{
        int i;

        for (i = 0; i < adev->num_ip_blocks; i++) {
                if (adev->ip_blocks[i].version->type == block_type)
                        return adev->ip_blocks[i].status.hw;
        }
        return false;
}

/**
 * amdgpu_device_ip_is_valid - is the hardware IP valid
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is valid or not.
 * Returns true if the IP is valid, false if not.
2421 */ 2422 bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev, 2423 enum amd_ip_block_type block_type) 2424 { 2425 int i; 2426 2427 for (i = 0; i < adev->num_ip_blocks; i++) { 2428 if (adev->ip_blocks[i].version->type == block_type) 2429 return adev->ip_blocks[i].status.valid; 2430 } 2431 return false; 2432 2433 } 2434 2435 /** 2436 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 2437 * 2438 * @adev: amdgpu_device pointer 2439 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 2440 * 2441 * Returns a pointer to the hardware IP block structure 2442 * if it exists for the asic, otherwise NULL. 2443 */ 2444 struct amdgpu_ip_block * 2445 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 2446 enum amd_ip_block_type type) 2447 { 2448 int i; 2449 2450 for (i = 0; i < adev->num_ip_blocks; i++) 2451 if (adev->ip_blocks[i].version->type == type) 2452 return &adev->ip_blocks[i]; 2453 2454 return NULL; 2455 } 2456 2457 /** 2458 * amdgpu_device_ip_block_version_cmp 2459 * 2460 * @adev: amdgpu_device pointer 2461 * @type: enum amd_ip_block_type 2462 * @major: major version 2463 * @minor: minor version 2464 * 2465 * return 0 if equal or greater 2466 * return 1 if smaller or the ip_block doesn't exist 2467 */ 2468 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 2469 enum amd_ip_block_type type, 2470 u32 major, u32 minor) 2471 { 2472 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 2473 2474 if (ip_block && ((ip_block->version->major > major) || 2475 ((ip_block->version->major == major) && 2476 (ip_block->version->minor >= minor)))) 2477 return 0; 2478 2479 return 1; 2480 } 2481 2482 static const char *ip_block_names[] = { 2483 [AMD_IP_BLOCK_TYPE_COMMON] = "common", 2484 [AMD_IP_BLOCK_TYPE_GMC] = "gmc", 2485 [AMD_IP_BLOCK_TYPE_IH] = "ih", 2486 [AMD_IP_BLOCK_TYPE_SMC] = "smu", 2487 [AMD_IP_BLOCK_TYPE_PSP] = "psp", 2488 [AMD_IP_BLOCK_TYPE_DCE] = "dce", 2489 [AMD_IP_BLOCK_TYPE_GFX] = "gfx", 2490 [AMD_IP_BLOCK_TYPE_SDMA] = "sdma", 2491 [AMD_IP_BLOCK_TYPE_UVD] = "uvd", 2492 [AMD_IP_BLOCK_TYPE_VCE] = "vce", 2493 [AMD_IP_BLOCK_TYPE_ACP] = "acp", 2494 [AMD_IP_BLOCK_TYPE_VCN] = "vcn", 2495 [AMD_IP_BLOCK_TYPE_MES] = "mes", 2496 [AMD_IP_BLOCK_TYPE_JPEG] = "jpeg", 2497 [AMD_IP_BLOCK_TYPE_VPE] = "vpe", 2498 [AMD_IP_BLOCK_TYPE_UMSCH_MM] = "umsch_mm", 2499 [AMD_IP_BLOCK_TYPE_ISP] = "isp", 2500 [AMD_IP_BLOCK_TYPE_RAS] = "ras", 2501 }; 2502 2503 static const char *ip_block_name(struct amdgpu_device *adev, enum amd_ip_block_type type) 2504 { 2505 int idx = (int)type; 2506 2507 return idx < ARRAY_SIZE(ip_block_names) ? ip_block_names[idx] : "unknown"; 2508 } 2509 2510 /** 2511 * amdgpu_device_ip_block_add 2512 * 2513 * @adev: amdgpu_device pointer 2514 * @ip_block_version: pointer to the IP to add 2515 * 2516 * Adds the IP block driver information to the collection of IPs 2517 * on the asic. 
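 *
 * Typical use from the per-ASIC setup code looks roughly like the following
 * illustrative sketch (gmc_v8_0_ip_block is just an example block; real call
 * sites may or may not check the return value):
 *
 *   r = amdgpu_device_ip_block_add(adev, &gmc_v8_0_ip_block);
 *   if (r)
 *           return r;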
2518 */ 2519 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 2520 const struct amdgpu_ip_block_version *ip_block_version) 2521 { 2522 if (!ip_block_version) 2523 return -EINVAL; 2524 2525 switch (ip_block_version->type) { 2526 case AMD_IP_BLOCK_TYPE_VCN: 2527 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2528 return 0; 2529 break; 2530 case AMD_IP_BLOCK_TYPE_JPEG: 2531 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2532 return 0; 2533 break; 2534 default: 2535 break; 2536 } 2537 2538 dev_info(adev->dev, "detected ip block number %d <%s_v%d_%d_%d> (%s)\n", 2539 adev->num_ip_blocks, 2540 ip_block_name(adev, ip_block_version->type), 2541 ip_block_version->major, 2542 ip_block_version->minor, 2543 ip_block_version->rev, 2544 ip_block_version->funcs->name); 2545 2546 adev->ip_blocks[adev->num_ip_blocks].adev = adev; 2547 2548 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2549 2550 return 0; 2551 } 2552 2553 /** 2554 * amdgpu_device_enable_virtual_display - enable virtual display feature 2555 * 2556 * @adev: amdgpu_device pointer 2557 * 2558 * Enabled the virtual display feature if the user has enabled it via 2559 * the module parameter virtual_display. This feature provides a virtual 2560 * display hardware on headless boards or in virtualized environments. 2561 * This function parses and validates the configuration string specified by 2562 * the user and configures the virtual display configuration (number of 2563 * virtual connectors, crtcs, etc.) specified. 2564 */ 2565 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2566 { 2567 adev->enable_virtual_display = false; 2568 2569 if (amdgpu_virtual_display) { 2570 const char *pci_address_name = pci_name(adev->pdev); 2571 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2572 2573 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2574 pciaddstr_tmp = pciaddstr; 2575 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2576 pciaddname = strsep(&pciaddname_tmp, ","); 2577 if (!strcmp("all", pciaddname) 2578 || !strcmp(pci_address_name, pciaddname)) { 2579 long num_crtc; 2580 int res = -1; 2581 2582 adev->enable_virtual_display = true; 2583 2584 if (pciaddname_tmp) 2585 res = kstrtol(pciaddname_tmp, 10, 2586 &num_crtc); 2587 2588 if (!res) { 2589 if (num_crtc < 1) 2590 num_crtc = 1; 2591 if (num_crtc > 6) 2592 num_crtc = 6; 2593 adev->mode_info.num_crtc = num_crtc; 2594 } else { 2595 adev->mode_info.num_crtc = 1; 2596 } 2597 break; 2598 } 2599 } 2600 2601 dev_info( 2602 adev->dev, 2603 "virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2604 amdgpu_virtual_display, pci_address_name, 2605 adev->enable_virtual_display, adev->mode_info.num_crtc); 2606 2607 kfree(pciaddstr); 2608 } 2609 } 2610 2611 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2612 { 2613 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2614 adev->mode_info.num_crtc = 1; 2615 adev->enable_virtual_display = true; 2616 dev_info(adev->dev, "virtual_display:%d, num_crtc:%d\n", 2617 adev->enable_virtual_display, 2618 adev->mode_info.num_crtc); 2619 } 2620 } 2621 2622 /** 2623 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2624 * 2625 * @adev: amdgpu_device pointer 2626 * 2627 * Parses the asic configuration parameters specified in the gpu info 2628 * firmware and makes them available to the driver for use in configuring 2629 * the asic. 2630 * Returns 0 on success, -EINVAL on failure. 
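 *
 * The firmware image is selected by ASIC name through the
 * "amdgpu/%s_gpu_info.bin" pattern used below, e.g. CHIP_VEGA10 resolves to
 * "amdgpu/vega10_gpu_info.bin".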
2631 */ 2632 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2633 { 2634 const char *chip_name; 2635 int err; 2636 const struct gpu_info_firmware_header_v1_0 *hdr; 2637 2638 adev->firmware.gpu_info_fw = NULL; 2639 2640 switch (adev->asic_type) { 2641 default: 2642 return 0; 2643 case CHIP_VEGA10: 2644 chip_name = "vega10"; 2645 break; 2646 case CHIP_VEGA12: 2647 chip_name = "vega12"; 2648 break; 2649 case CHIP_RAVEN: 2650 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2651 chip_name = "raven2"; 2652 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2653 chip_name = "picasso"; 2654 else 2655 chip_name = "raven"; 2656 break; 2657 case CHIP_ARCTURUS: 2658 chip_name = "arcturus"; 2659 break; 2660 case CHIP_NAVI12: 2661 if (adev->discovery.bin) 2662 return 0; 2663 chip_name = "navi12"; 2664 break; 2665 case CHIP_CYAN_SKILLFISH: 2666 if (adev->discovery.bin) 2667 return 0; 2668 chip_name = "cyan_skillfish"; 2669 break; 2670 } 2671 2672 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, 2673 AMDGPU_UCODE_OPTIONAL, 2674 "amdgpu/%s_gpu_info.bin", chip_name); 2675 if (err) { 2676 dev_err(adev->dev, 2677 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n", 2678 chip_name); 2679 goto out; 2680 } 2681 2682 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2683 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2684 2685 switch (hdr->version_major) { 2686 case 1: 2687 { 2688 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2689 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2690 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2691 2692 /* 2693 * Should be dropped when DAL no longer needs it. 2694 */ 2695 if (adev->asic_type == CHIP_NAVI12) 2696 goto parse_soc_bounding_box; 2697 2698 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2699 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2700 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2701 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2702 adev->gfx.config.max_texture_channel_caches = 2703 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2704 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2705 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2706 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2707 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2708 adev->gfx.config.double_offchip_lds_buf = 2709 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2710 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2711 adev->gfx.cu_info.max_waves_per_simd = 2712 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2713 adev->gfx.cu_info.max_scratch_slots_per_cu = 2714 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2715 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2716 if (hdr->version_minor >= 1) { 2717 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2718 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2719 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2720 adev->gfx.config.num_sc_per_sh = 2721 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2722 adev->gfx.config.num_packer_per_sc = 2723 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2724 } 2725 2726 parse_soc_bounding_box: 2727 /* 2728 * soc bounding box info is not integrated in disocovery table, 2729 * we 
always need to parse it from gpu info firmware if needed. 2730 */ 2731 if (hdr->version_minor == 2) { 2732 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2733 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2734 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2735 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2736 } 2737 break; 2738 } 2739 default: 2740 dev_err(adev->dev, 2741 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2742 err = -EINVAL; 2743 goto out; 2744 } 2745 out: 2746 return err; 2747 } 2748 2749 static void amdgpu_uid_init(struct amdgpu_device *adev) 2750 { 2751 /* Initialize the UID for the device */ 2752 adev->uid_info = kzalloc(sizeof(struct amdgpu_uid), GFP_KERNEL); 2753 if (!adev->uid_info) { 2754 dev_warn(adev->dev, "Failed to allocate memory for UID\n"); 2755 return; 2756 } 2757 adev->uid_info->adev = adev; 2758 } 2759 2760 static void amdgpu_uid_fini(struct amdgpu_device *adev) 2761 { 2762 /* Free the UID memory */ 2763 kfree(adev->uid_info); 2764 adev->uid_info = NULL; 2765 } 2766 2767 /** 2768 * amdgpu_device_ip_early_init - run early init for hardware IPs 2769 * 2770 * @adev: amdgpu_device pointer 2771 * 2772 * Early initialization pass for hardware IPs. The hardware IPs that make 2773 * up each asic are discovered each IP's early_init callback is run. This 2774 * is the first stage in initializing the asic. 2775 * Returns 0 on success, negative error code on failure. 2776 */ 2777 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2778 { 2779 struct amdgpu_ip_block *ip_block; 2780 struct pci_dev *parent; 2781 bool total, skip_bios; 2782 uint32_t bios_flags; 2783 int i, r; 2784 2785 amdgpu_device_enable_virtual_display(adev); 2786 2787 if (amdgpu_sriov_vf(adev)) { 2788 r = amdgpu_virt_request_full_gpu(adev, true); 2789 if (r) 2790 return r; 2791 2792 r = amdgpu_virt_init_critical_region(adev); 2793 if (r) 2794 return r; 2795 } 2796 2797 switch (adev->asic_type) { 2798 #ifdef CONFIG_DRM_AMDGPU_SI 2799 case CHIP_VERDE: 2800 case CHIP_TAHITI: 2801 case CHIP_PITCAIRN: 2802 case CHIP_OLAND: 2803 case CHIP_HAINAN: 2804 adev->family = AMDGPU_FAMILY_SI; 2805 r = si_set_ip_blocks(adev); 2806 if (r) 2807 return r; 2808 break; 2809 #endif 2810 #ifdef CONFIG_DRM_AMDGPU_CIK 2811 case CHIP_BONAIRE: 2812 case CHIP_HAWAII: 2813 case CHIP_KAVERI: 2814 case CHIP_KABINI: 2815 case CHIP_MULLINS: 2816 if (adev->flags & AMD_IS_APU) 2817 adev->family = AMDGPU_FAMILY_KV; 2818 else 2819 adev->family = AMDGPU_FAMILY_CI; 2820 2821 r = cik_set_ip_blocks(adev); 2822 if (r) 2823 return r; 2824 break; 2825 #endif 2826 case CHIP_TOPAZ: 2827 case CHIP_TONGA: 2828 case CHIP_FIJI: 2829 case CHIP_POLARIS10: 2830 case CHIP_POLARIS11: 2831 case CHIP_POLARIS12: 2832 case CHIP_VEGAM: 2833 case CHIP_CARRIZO: 2834 case CHIP_STONEY: 2835 if (adev->flags & AMD_IS_APU) 2836 adev->family = AMDGPU_FAMILY_CZ; 2837 else 2838 adev->family = AMDGPU_FAMILY_VI; 2839 2840 r = vi_set_ip_blocks(adev); 2841 if (r) 2842 return r; 2843 break; 2844 default: 2845 r = amdgpu_discovery_set_ip_blocks(adev); 2846 if (r) 2847 return r; 2848 break; 2849 } 2850 2851 /* Check for IP version 9.4.3 with A0 hardware */ 2852 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) && 2853 !amdgpu_device_get_rev_id(adev)) { 2854 dev_err(adev->dev, "Unsupported A0 hardware\n"); 2855 return -ENODEV; /* device unsupported - no device error */ 2856 } 2857 2858 if (amdgpu_has_atpx() && 2859 (amdgpu_is_atpx_hybrid() || 2860 amdgpu_has_atpx_dgpu_power_cntl()) && 
2861 ((adev->flags & AMD_IS_APU) == 0) && 2862 !dev_is_removable(&adev->pdev->dev)) 2863 adev->flags |= AMD_IS_PX; 2864 2865 if (!(adev->flags & AMD_IS_APU)) { 2866 parent = pcie_find_root_port(adev->pdev); 2867 adev->has_pr3 = parent ? pci_pr3_present(parent) : false; 2868 } 2869 2870 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2871 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2872 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2873 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2874 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2875 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2876 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2877 2878 adev->virt.is_xgmi_node_migrate_enabled = false; 2879 if (amdgpu_sriov_vf(adev)) { 2880 adev->virt.is_xgmi_node_migrate_enabled = 2881 amdgpu_ip_version((adev), GC_HWIP, 0) == IP_VERSION(9, 4, 4); 2882 } 2883 2884 total = true; 2885 for (i = 0; i < adev->num_ip_blocks; i++) { 2886 ip_block = &adev->ip_blocks[i]; 2887 2888 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2889 dev_warn(adev->dev, "disabled ip block: %d <%s>\n", i, 2890 adev->ip_blocks[i].version->funcs->name); 2891 adev->ip_blocks[i].status.valid = false; 2892 } else if (ip_block->version->funcs->early_init) { 2893 r = ip_block->version->funcs->early_init(ip_block); 2894 if (r == -ENOENT) { 2895 adev->ip_blocks[i].status.valid = false; 2896 } else if (r) { 2897 dev_err(adev->dev, 2898 "early_init of IP block <%s> failed %d\n", 2899 adev->ip_blocks[i].version->funcs->name, 2900 r); 2901 total = false; 2902 } else { 2903 adev->ip_blocks[i].status.valid = true; 2904 } 2905 } else { 2906 adev->ip_blocks[i].status.valid = true; 2907 } 2908 /* get the vbios after the asic_funcs are set up */ 2909 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2910 r = amdgpu_device_parse_gpu_info_fw(adev); 2911 if (r) 2912 return r; 2913 2914 bios_flags = amdgpu_device_get_vbios_flags(adev); 2915 skip_bios = !!(bios_flags & AMDGPU_VBIOS_SKIP); 2916 /* Read BIOS */ 2917 if (!skip_bios) { 2918 bool optional = 2919 !!(bios_flags & AMDGPU_VBIOS_OPTIONAL); 2920 if (!amdgpu_get_bios(adev) && !optional) 2921 return -EINVAL; 2922 2923 if (optional && !adev->bios) 2924 dev_info( 2925 adev->dev, 2926 "VBIOS image optional, proceeding without VBIOS image"); 2927 2928 if (adev->bios) { 2929 r = amdgpu_atombios_init(adev); 2930 if (r) { 2931 dev_err(adev->dev, 2932 "amdgpu_atombios_init failed\n"); 2933 amdgpu_vf_error_put( 2934 adev, 2935 AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 2936 0, 0); 2937 return r; 2938 } 2939 } 2940 } 2941 2942 /*get pf2vf msg info at it's earliest time*/ 2943 if (amdgpu_sriov_vf(adev)) 2944 amdgpu_virt_init_data_exchange(adev); 2945 2946 } 2947 } 2948 if (!total) 2949 return -ENODEV; 2950 2951 if (adev->gmc.xgmi.supported) 2952 amdgpu_xgmi_early_init(adev); 2953 2954 if (amdgpu_is_multi_aid(adev)) 2955 amdgpu_uid_init(adev); 2956 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 2957 if (ip_block->status.valid != false) 2958 amdgpu_amdkfd_device_probe(adev); 2959 2960 adev->cg_flags &= amdgpu_cg_mask; 2961 adev->pg_flags &= amdgpu_pg_mask; 2962 2963 return 0; 2964 } 2965 2966 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2967 { 2968 int i, r; 2969 2970 for (i = 0; i < adev->num_ip_blocks; i++) { 2971 if (!adev->ip_blocks[i].status.sw) 2972 continue; 2973 if (adev->ip_blocks[i].status.hw) 2974 continue; 2975 if (!amdgpu_ip_member_of_hwini( 2976 adev, adev->ip_blocks[i].version->type)) 2977 
continue; 2978 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2979 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2980 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2981 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2982 if (r) { 2983 dev_err(adev->dev, 2984 "hw_init of IP block <%s> failed %d\n", 2985 adev->ip_blocks[i].version->funcs->name, 2986 r); 2987 return r; 2988 } 2989 adev->ip_blocks[i].status.hw = true; 2990 } 2991 } 2992 2993 return 0; 2994 } 2995 2996 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2997 { 2998 int i, r; 2999 3000 for (i = 0; i < adev->num_ip_blocks; i++) { 3001 if (!adev->ip_blocks[i].status.sw) 3002 continue; 3003 if (adev->ip_blocks[i].status.hw) 3004 continue; 3005 if (!amdgpu_ip_member_of_hwini( 3006 adev, adev->ip_blocks[i].version->type)) 3007 continue; 3008 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3009 if (r) { 3010 dev_err(adev->dev, 3011 "hw_init of IP block <%s> failed %d\n", 3012 adev->ip_blocks[i].version->funcs->name, r); 3013 return r; 3014 } 3015 adev->ip_blocks[i].status.hw = true; 3016 } 3017 3018 return 0; 3019 } 3020 3021 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 3022 { 3023 int r = 0; 3024 int i; 3025 uint32_t smu_version; 3026 3027 if (adev->asic_type >= CHIP_VEGA10) { 3028 for (i = 0; i < adev->num_ip_blocks; i++) { 3029 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 3030 continue; 3031 3032 if (!amdgpu_ip_member_of_hwini(adev, 3033 AMD_IP_BLOCK_TYPE_PSP)) 3034 break; 3035 3036 if (!adev->ip_blocks[i].status.sw) 3037 continue; 3038 3039 /* no need to do the fw loading again if already done*/ 3040 if (adev->ip_blocks[i].status.hw == true) 3041 break; 3042 3043 if (amdgpu_in_reset(adev) || adev->in_suspend) { 3044 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3045 if (r) 3046 return r; 3047 } else { 3048 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3049 if (r) { 3050 dev_err(adev->dev, 3051 "hw_init of IP block <%s> failed %d\n", 3052 adev->ip_blocks[i] 3053 .version->funcs->name, 3054 r); 3055 return r; 3056 } 3057 adev->ip_blocks[i].status.hw = true; 3058 } 3059 break; 3060 } 3061 } 3062 3063 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 3064 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 3065 3066 return r; 3067 } 3068 3069 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 3070 { 3071 struct drm_sched_init_args args = { 3072 .ops = &amdgpu_sched_ops, 3073 .num_rqs = DRM_SCHED_PRIORITY_COUNT, 3074 .timeout_wq = adev->reset_domain->wq, 3075 .dev = adev->dev, 3076 }; 3077 long timeout; 3078 int r, i; 3079 3080 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 3081 struct amdgpu_ring *ring = adev->rings[i]; 3082 3083 /* No need to setup the GPU scheduler for rings that don't need it */ 3084 if (!ring || ring->no_scheduler) 3085 continue; 3086 3087 switch (ring->funcs->type) { 3088 case AMDGPU_RING_TYPE_GFX: 3089 timeout = adev->gfx_timeout; 3090 break; 3091 case AMDGPU_RING_TYPE_COMPUTE: 3092 timeout = adev->compute_timeout; 3093 break; 3094 case AMDGPU_RING_TYPE_SDMA: 3095 timeout = adev->sdma_timeout; 3096 break; 3097 default: 3098 timeout = adev->video_timeout; 3099 break; 3100 } 3101 3102 args.timeout = timeout; 3103 args.credit_limit = ring->num_hw_submission; 3104 args.score = ring->sched_score; 3105 args.name = ring->name; 3106 3107 r = drm_sched_init(&ring->sched, &args); 3108 if (r) { 3109 
dev_err(adev->dev, 3110 "Failed to create scheduler on ring %s.\n", 3111 ring->name); 3112 return r; 3113 } 3114 r = amdgpu_uvd_entity_init(adev, ring); 3115 if (r) { 3116 dev_err(adev->dev, 3117 "Failed to create UVD scheduling entity on ring %s.\n", 3118 ring->name); 3119 return r; 3120 } 3121 r = amdgpu_vce_entity_init(adev, ring); 3122 if (r) { 3123 dev_err(adev->dev, 3124 "Failed to create VCE scheduling entity on ring %s.\n", 3125 ring->name); 3126 return r; 3127 } 3128 } 3129 3130 if (adev->xcp_mgr) 3131 amdgpu_xcp_update_partition_sched_list(adev); 3132 3133 return 0; 3134 } 3135 3136 3137 /** 3138 * amdgpu_device_ip_init - run init for hardware IPs 3139 * 3140 * @adev: amdgpu_device pointer 3141 * 3142 * Main initialization pass for hardware IPs. The list of all the hardware 3143 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 3144 * are run. sw_init initializes the software state associated with each IP 3145 * and hw_init initializes the hardware associated with each IP. 3146 * Returns 0 on success, negative error code on failure. 3147 */ 3148 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 3149 { 3150 bool init_badpage; 3151 int i, r; 3152 3153 r = amdgpu_ras_init(adev); 3154 if (r) 3155 return r; 3156 3157 for (i = 0; i < adev->num_ip_blocks; i++) { 3158 if (!adev->ip_blocks[i].status.valid) 3159 continue; 3160 if (adev->ip_blocks[i].version->funcs->sw_init) { 3161 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]); 3162 if (r) { 3163 dev_err(adev->dev, 3164 "sw_init of IP block <%s> failed %d\n", 3165 adev->ip_blocks[i].version->funcs->name, 3166 r); 3167 goto init_failed; 3168 } 3169 } 3170 adev->ip_blocks[i].status.sw = true; 3171 3172 if (!amdgpu_ip_member_of_hwini( 3173 adev, adev->ip_blocks[i].version->type)) 3174 continue; 3175 3176 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 3177 /* need to do common hw init early so everything is set up for gmc */ 3178 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3179 if (r) { 3180 dev_err(adev->dev, "hw_init %d failed %d\n", i, 3181 r); 3182 goto init_failed; 3183 } 3184 adev->ip_blocks[i].status.hw = true; 3185 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3186 /* need to do gmc hw init early so we can allocate gpu mem */ 3187 /* Try to reserve bad pages early */ 3188 if (amdgpu_sriov_vf(adev)) 3189 amdgpu_virt_exchange_data(adev); 3190 3191 r = amdgpu_device_mem_scratch_init(adev); 3192 if (r) { 3193 dev_err(adev->dev, 3194 "amdgpu_mem_scratch_init failed %d\n", 3195 r); 3196 goto init_failed; 3197 } 3198 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3199 if (r) { 3200 dev_err(adev->dev, "hw_init %d failed %d\n", i, 3201 r); 3202 goto init_failed; 3203 } 3204 r = amdgpu_device_wb_init(adev); 3205 if (r) { 3206 dev_err(adev->dev, 3207 "amdgpu_device_wb_init failed %d\n", r); 3208 goto init_failed; 3209 } 3210 adev->ip_blocks[i].status.hw = true; 3211 3212 /* right after GMC hw init, we create CSA */ 3213 if (adev->gfx.mcbp) { 3214 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 3215 AMDGPU_GEM_DOMAIN_VRAM | 3216 AMDGPU_GEM_DOMAIN_GTT, 3217 AMDGPU_CSA_SIZE); 3218 if (r) { 3219 dev_err(adev->dev, 3220 "allocate CSA failed %d\n", r); 3221 goto init_failed; 3222 } 3223 } 3224 3225 r = amdgpu_seq64_init(adev); 3226 if (r) { 3227 dev_err(adev->dev, "allocate seq64 failed %d\n", 3228 r); 3229 goto init_failed; 3230 } 3231 } 3232 } 3233 3234 if (amdgpu_sriov_vf(adev)) 3235 
amdgpu_virt_init_data_exchange(adev); 3236 3237 r = amdgpu_ib_pool_init(adev); 3238 if (r) { 3239 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 3240 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 3241 goto init_failed; 3242 } 3243 3244 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 3245 if (r) 3246 goto init_failed; 3247 3248 r = amdgpu_device_ip_hw_init_phase1(adev); 3249 if (r) 3250 goto init_failed; 3251 3252 r = amdgpu_device_fw_loading(adev); 3253 if (r) 3254 goto init_failed; 3255 3256 r = amdgpu_device_ip_hw_init_phase2(adev); 3257 if (r) 3258 goto init_failed; 3259 3260 /* 3261 * retired pages will be loaded from eeprom and reserved here, 3262 * it should be called after amdgpu_device_ip_hw_init_phase2 since 3263 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 3264 * for I2C communication which only true at this point. 3265 * 3266 * amdgpu_ras_recovery_init may fail, but the upper only cares the 3267 * failure from bad gpu situation and stop amdgpu init process 3268 * accordingly. For other failed cases, it will still release all 3269 * the resource and print error message, rather than returning one 3270 * negative value to upper level. 3271 * 3272 * Note: theoretically, this should be called before all vram allocations 3273 * to protect retired page from abusing 3274 */ 3275 init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 3276 r = amdgpu_ras_recovery_init(adev, init_badpage); 3277 if (r) 3278 goto init_failed; 3279 3280 /** 3281 * In case of XGMI grab extra reference for reset domain for this device 3282 */ 3283 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3284 if (amdgpu_xgmi_add_device(adev) == 0) { 3285 if (!amdgpu_sriov_vf(adev)) { 3286 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3287 3288 if (WARN_ON(!hive)) { 3289 r = -ENOENT; 3290 goto init_failed; 3291 } 3292 3293 if (!hive->reset_domain || 3294 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 3295 r = -ENOENT; 3296 amdgpu_put_xgmi_hive(hive); 3297 goto init_failed; 3298 } 3299 3300 /* Drop the early temporary reset domain we created for device */ 3301 amdgpu_reset_put_reset_domain(adev->reset_domain); 3302 adev->reset_domain = hive->reset_domain; 3303 amdgpu_put_xgmi_hive(hive); 3304 } 3305 } 3306 } 3307 3308 r = amdgpu_device_init_schedulers(adev); 3309 if (r) 3310 goto init_failed; 3311 3312 if (adev->mman.buffer_funcs_ring->sched.ready) 3313 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3314 3315 /* Don't init kfd if whole hive need to be reset during init */ 3316 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 3317 kgd2kfd_init_zone_device(adev); 3318 amdgpu_amdkfd_device_init(adev); 3319 } 3320 3321 amdgpu_fru_get_product_info(adev); 3322 3323 if (!amdgpu_sriov_vf(adev) || amdgpu_sriov_ras_cper_en(adev)) 3324 r = amdgpu_cper_init(adev); 3325 3326 init_failed: 3327 3328 return r; 3329 } 3330 3331 /** 3332 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 3333 * 3334 * @adev: amdgpu_device pointer 3335 * 3336 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 3337 * this function before a GPU reset. If the value is retained after a 3338 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 
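 *
 * Paired with amdgpu_device_check_vram_lost() below: if the magic value read
 * back after a reset no longer matches, VRAM contents are treated as lost.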
 */
static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
{
        memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
}

/**
 * amdgpu_device_check_vram_lost - check if vram is valid
 *
 * @adev: amdgpu_device pointer
 *
 * Checks the reset magic value written to the gart pointer in VRAM.
 * The driver calls this after a GPU reset to see if the contents of
 * VRAM are lost or not.
 * Returns true if vram is lost, false if not.
 */
static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
{
        if (memcmp(adev->gart.ptr, adev->reset_magic,
                   AMDGPU_RESET_MAGIC_NUM))
                return true;

        if (!amdgpu_in_reset(adev))
                return false;

        /*
         * For all ASICs with baco/mode1 reset, the VRAM is
         * always assumed to be lost.
         */
        switch (amdgpu_asic_reset_method(adev)) {
        case AMD_RESET_METHOD_LEGACY:
        case AMD_RESET_METHOD_LINK:
        case AMD_RESET_METHOD_BACO:
        case AMD_RESET_METHOD_MODE1:
                return true;
        default:
                return false;
        }
}

/**
 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
 *
 * @adev: amdgpu_device pointer
 * @state: clockgating state (gate or ungate)
 *
 * The list of all the hardware IPs that make up the asic is walked and the
 * set_clockgating_state callbacks are run. The late initialization pass
 * enables clockgating for hardware IPs, while the fini and suspend passes
 * disable it.
 * Returns 0 on success, negative error code on failure.
 */
int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
                               enum amd_clockgating_state state)
{
        int i, j, r;

        if (amdgpu_emu_mode == 1)
                return 0;

        for (j = 0; j < adev->num_ip_blocks; j++) {
                i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
                if (!adev->ip_blocks[i].status.late_initialized)
                        continue;
                /* skip CG for GFX, SDMA on S0ix */
                if (adev->in_s0ix &&
                    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
                        continue;
                /* skip CG for VCE/UVD, it's handled specially */
                if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
                    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
                    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
                    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
                    adev->ip_blocks[i].version->funcs->set_clockgating_state) {
                        /* enable clockgating to save power */
                        r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i],
                                                                                      state);
                        if (r) {
                                dev_err(adev->dev,
                                        "set_clockgating_state(gate) of IP block <%s> failed %d\n",
                                        adev->ip_blocks[i].version->funcs->name,
                                        r);
                                return r;
                        }
                }
        }

        return 0;
}
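/*
 * Note on the two helpers above and below: the walk direction depends on the
 * target state. Gating walks the IP list front to back, while ungating walks
 * it back to front, mirroring the init/fini ordering of the IP blocks.
 */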
int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
                               enum amd_powergating_state state)
{
        int i, j, r;

        if (amdgpu_emu_mode == 1)
                return 0;

        for (j = 0; j < adev->num_ip_blocks; j++) {
                i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
                if (!adev->ip_blocks[i].status.late_initialized)
                        continue;
                /* skip PG for GFX, SDMA on S0ix */
                if (adev->in_s0ix &&
                    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
                        continue;
                /* skip PG for VCE/UVD/VPE, it's handled specially */
                if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
                    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
                    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
                    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VPE &&
                    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
                    adev->ip_blocks[i].version->funcs->set_powergating_state) {
                        /* enable powergating to save power */
                        r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i],
                                                                                      state);
                        if (r) {
                                dev_err(adev->dev,
                                        "set_powergating_state(gate) of IP block <%s> failed %d\n",
                                        adev->ip_blocks[i].version->funcs->name,
                                        r);
                                return r;
                        }
                }
        }
        return 0;
}

static int amdgpu_device_enable_mgpu_fan_boost(void)
{
        struct amdgpu_gpu_instance *gpu_ins;
        struct amdgpu_device *adev;
        int i, ret = 0;

        mutex_lock(&mgpu_info.mutex);

        /*
         * MGPU fan boost feature should be enabled
         * only when there are two or more dGPUs in
         * the system
         */
        if (mgpu_info.num_dgpu < 2)
                goto out;

        for (i = 0; i < mgpu_info.num_dgpu; i++) {
                gpu_ins = &(mgpu_info.gpu_ins[i]);
                adev = gpu_ins->adev;
                if (!(adev->flags & AMD_IS_APU || amdgpu_sriov_multi_vf_mode(adev)) &&
                    !gpu_ins->mgpu_fan_enabled) {
                        ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
                        if (ret)
                                break;

                        gpu_ins->mgpu_fan_enabled = 1;
                }
        }

out:
        mutex_unlock(&mgpu_info.mutex);

        return ret;
}
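/*
 * amdgpu_device_enable_mgpu_fan_boost() above is called from
 * amdgpu_device_ip_late_init() below for every device that completes late
 * init; GPUs that already have the boost enabled are skipped via
 * mgpu_fan_enabled, so the boost takes effect once at least two dGPUs are
 * registered.
 */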
/**
 * amdgpu_device_ip_late_init - run late init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Late initialization pass for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the late_init callbacks are run.
 * late_init covers any special initialization that an IP requires
 * after all of the IPs have been initialized or something that needs to
 * happen late in the init process.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
{
        struct amdgpu_gpu_instance *gpu_instance;
        int i = 0, r;

        for (i = 0; i < adev->num_ip_blocks; i++) {
                if (!adev->ip_blocks[i].status.hw)
                        continue;
                if (adev->ip_blocks[i].version->funcs->late_init) {
                        r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]);
                        if (r) {
                                dev_err(adev->dev,
                                        "late_init of IP block <%s> failed %d\n",
                                        adev->ip_blocks[i].version->funcs->name,
                                        r);
                                return r;
                        }
                }
                adev->ip_blocks[i].status.late_initialized = true;
        }

        r = amdgpu_ras_late_init(adev);
        if (r) {
                dev_err(adev->dev, "amdgpu_ras_late_init failed %d", r);
                return r;
        }

        if (!amdgpu_reset_in_recovery(adev))
                amdgpu_ras_set_error_query_ready(adev, true);

        amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
        amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);

        amdgpu_device_fill_reset_magic(adev);

        r = amdgpu_device_enable_mgpu_fan_boost();
        if (r)
                dev_err(adev->dev, "enable mgpu fan boost failed (%d).\n", r);

        /* For passthrough configuration on arcturus and aldebaran, enable special SBR handling */
        if (amdgpu_passthrough(adev) &&
            ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
             adev->asic_type == CHIP_ALDEBARAN))
                amdgpu_dpm_handle_passthrough_sbr(adev, true);

        if (adev->gmc.xgmi.num_physical_nodes > 1) {
                mutex_lock(&mgpu_info.mutex);

                /*
                 * Reset the device p-state to low, as it was booted with high.
                 *
                 * This should be performed only after all devices from the same
                 * hive get initialized.
                 *
                 * However, it's not known in advance how many devices are in the
                 * hive, as they are counted one by one during device
                 * initialization.
                 *
                 * So we wait until all XGMI interlinked devices are initialized.
                 * This may bring some delay as those devices may come from
                 * different hives, but that should be OK.
3577 */ 3578 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3579 for (i = 0; i < mgpu_info.num_gpu; i++) { 3580 gpu_instance = &(mgpu_info.gpu_ins[i]); 3581 if (gpu_instance->adev->flags & AMD_IS_APU) 3582 continue; 3583 3584 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3585 AMDGPU_XGMI_PSTATE_MIN); 3586 if (r) { 3587 dev_err(adev->dev, 3588 "pstate setting failed (%d).\n", 3589 r); 3590 break; 3591 } 3592 } 3593 } 3594 3595 mutex_unlock(&mgpu_info.mutex); 3596 } 3597 3598 return 0; 3599 } 3600 3601 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block) 3602 { 3603 struct amdgpu_device *adev = ip_block->adev; 3604 int r; 3605 3606 if (!ip_block->version->funcs->hw_fini) { 3607 dev_err(adev->dev, "hw_fini of IP block <%s> not defined\n", 3608 ip_block->version->funcs->name); 3609 } else { 3610 r = ip_block->version->funcs->hw_fini(ip_block); 3611 /* XXX handle errors */ 3612 if (r) { 3613 dev_dbg(adev->dev, 3614 "hw_fini of IP block <%s> failed %d\n", 3615 ip_block->version->funcs->name, r); 3616 } 3617 } 3618 3619 ip_block->status.hw = false; 3620 } 3621 3622 /** 3623 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3624 * 3625 * @adev: amdgpu_device pointer 3626 * 3627 * For ASICs need to disable SMC first 3628 */ 3629 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3630 { 3631 int i; 3632 3633 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3634 return; 3635 3636 for (i = 0; i < adev->num_ip_blocks; i++) { 3637 if (!adev->ip_blocks[i].status.hw) 3638 continue; 3639 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3640 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3641 break; 3642 } 3643 } 3644 } 3645 3646 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3647 { 3648 int i, r; 3649 3650 for (i = 0; i < adev->num_ip_blocks; i++) { 3651 if (!adev->ip_blocks[i].version->funcs->early_fini) 3652 continue; 3653 3654 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]); 3655 if (r) { 3656 dev_dbg(adev->dev, 3657 "early_fini of IP block <%s> failed %d\n", 3658 adev->ip_blocks[i].version->funcs->name, r); 3659 } 3660 } 3661 3662 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3663 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3664 3665 amdgpu_amdkfd_suspend(adev, true); 3666 amdgpu_userq_suspend(adev); 3667 3668 /* Workaround for ASICs need to disable SMC first */ 3669 amdgpu_device_smu_fini_early(adev); 3670 3671 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3672 if (!adev->ip_blocks[i].status.hw) 3673 continue; 3674 3675 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3676 } 3677 3678 if (amdgpu_sriov_vf(adev)) { 3679 if (amdgpu_virt_release_full_gpu(adev, false)) 3680 dev_err(adev->dev, 3681 "failed to release exclusive mode on fini\n"); 3682 } 3683 3684 /* 3685 * Driver reload on the APU can fail due to firmware validation because 3686 * the PSP is always running, as it is shared across the whole SoC. 3687 * This same issue does not occur on dGPU because it has a mechanism 3688 * that checks whether the PSP is running. A solution for those issues 3689 * in the APU is to trigger a GPU reset, but this should be done during 3690 * the unload phase to avoid adding boot latency and screen flicker. 
3691 */ 3692 if ((adev->flags & AMD_IS_APU) && !adev->gmc.is_app_apu) { 3693 r = amdgpu_asic_reset(adev); 3694 if (r) 3695 dev_err(adev->dev, "asic reset on %s failed\n", __func__); 3696 } 3697 3698 return 0; 3699 } 3700 3701 /** 3702 * amdgpu_device_ip_fini - run fini for hardware IPs 3703 * 3704 * @adev: amdgpu_device pointer 3705 * 3706 * Main teardown pass for hardware IPs. The list of all the hardware 3707 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3708 * are run. hw_fini tears down the hardware associated with each IP 3709 * and sw_fini tears down any software state associated with each IP. 3710 * Returns 0 on success, negative error code on failure. 3711 */ 3712 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3713 { 3714 int i, r; 3715 3716 amdgpu_cper_fini(adev); 3717 3718 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3719 amdgpu_virt_release_ras_err_handler_data(adev); 3720 3721 if (adev->gmc.xgmi.num_physical_nodes > 1) 3722 amdgpu_xgmi_remove_device(adev); 3723 3724 amdgpu_amdkfd_device_fini_sw(adev); 3725 3726 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3727 if (!adev->ip_blocks[i].status.sw) 3728 continue; 3729 3730 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3731 amdgpu_ucode_free_bo(adev); 3732 amdgpu_free_static_csa(&adev->virt.csa_obj); 3733 amdgpu_device_wb_fini(adev); 3734 amdgpu_device_mem_scratch_fini(adev); 3735 amdgpu_ib_pool_fini(adev); 3736 amdgpu_seq64_fini(adev); 3737 amdgpu_doorbell_fini(adev); 3738 } 3739 if (adev->ip_blocks[i].version->funcs->sw_fini) { 3740 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]); 3741 /* XXX handle errors */ 3742 if (r) { 3743 dev_dbg(adev->dev, 3744 "sw_fini of IP block <%s> failed %d\n", 3745 adev->ip_blocks[i].version->funcs->name, 3746 r); 3747 } 3748 } 3749 adev->ip_blocks[i].status.sw = false; 3750 adev->ip_blocks[i].status.valid = false; 3751 } 3752 3753 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3754 if (!adev->ip_blocks[i].status.late_initialized) 3755 continue; 3756 if (adev->ip_blocks[i].version->funcs->late_fini) 3757 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]); 3758 adev->ip_blocks[i].status.late_initialized = false; 3759 } 3760 3761 amdgpu_ras_fini(adev); 3762 amdgpu_uid_fini(adev); 3763 3764 return 0; 3765 } 3766 3767 /** 3768 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3769 * 3770 * @work: work_struct. 3771 */ 3772 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3773 { 3774 struct amdgpu_device *adev = 3775 container_of(work, struct amdgpu_device, delayed_init_work.work); 3776 int r; 3777 3778 r = amdgpu_ib_ring_tests(adev); 3779 if (r) 3780 dev_err(adev->dev, "ib ring test failed (%d).\n", r); 3781 } 3782 3783 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3784 { 3785 struct amdgpu_device *adev = 3786 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3787 3788 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3789 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3790 3791 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true, 0)) 3792 adev->gfx.gfx_off_state = true; 3793 } 3794 3795 /** 3796 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3797 * 3798 * @adev: amdgpu_device pointer 3799 * 3800 * Main suspend function for hardware IPs. 
The list of all the hardware 3801 * IPs that make up the asic is walked, clockgating is disabled and the 3802 * suspend callbacks are run. suspend puts the hardware and software state 3803 * in each IP into a state suitable for suspend. 3804 * Returns 0 on success, negative error code on failure. 3805 */ 3806 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3807 { 3808 int i, r, rec; 3809 3810 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3811 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3812 3813 /* 3814 * Per PMFW team's suggestion, driver needs to handle gfxoff 3815 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3816 * scenario. Add the missing df cstate disablement here. 3817 */ 3818 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3819 dev_warn(adev->dev, "Failed to disallow df cstate"); 3820 3821 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3822 if (!adev->ip_blocks[i].status.valid) 3823 continue; 3824 3825 /* displays are handled separately */ 3826 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3827 continue; 3828 3829 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3830 if (r) 3831 goto unwind; 3832 } 3833 3834 return 0; 3835 unwind: 3836 rec = amdgpu_device_ip_resume_phase3(adev); 3837 if (rec) 3838 dev_err(adev->dev, 3839 "amdgpu_device_ip_resume_phase3 failed during unwind: %d\n", 3840 rec); 3841 3842 amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW); 3843 3844 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3845 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3846 3847 return r; 3848 } 3849 3850 /** 3851 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3852 * 3853 * @adev: amdgpu_device pointer 3854 * 3855 * Main suspend function for hardware IPs. The list of all the hardware 3856 * IPs that make up the asic is walked, clockgating is disabled and the 3857 * suspend callbacks are run. suspend puts the hardware and software state 3858 * in each IP into a state suitable for suspend. 3859 * Returns 0 on success, negative error code on failure. 3860 */ 3861 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3862 { 3863 int i, r, rec; 3864 3865 if (adev->in_s0ix) 3866 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3867 3868 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3869 if (!adev->ip_blocks[i].status.valid) 3870 continue; 3871 /* displays are handled in phase1 */ 3872 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3873 continue; 3874 /* PSP lost connection when err_event_athub occurs */ 3875 if (amdgpu_ras_intr_triggered() && 3876 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3877 adev->ip_blocks[i].status.hw = false; 3878 continue; 3879 } 3880 3881 /* skip unnecessary suspend if we do not initialize them yet */ 3882 if (!amdgpu_ip_member_of_hwini( 3883 adev, adev->ip_blocks[i].version->type)) 3884 continue; 3885 3886 /* Since we skip suspend for S0i3, we need to cancel the delayed 3887 * idle work here as the suspend callback never gets called. 3888 */ 3889 if (adev->in_s0ix && 3890 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX && 3891 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 0, 0)) 3892 cancel_delayed_work_sync(&adev->gfx.idle_work); 3893 /* skip suspend of gfx/mes and psp for S0ix 3894 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3895 * like at runtime. PSP is also part of the always on hardware 3896 * so no need to suspend it. 
3897 */ 3898 if (adev->in_s0ix && 3899 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3900 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3901 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3902 continue; 3903 3904 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3905 if (adev->in_s0ix && 3906 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3907 IP_VERSION(5, 0, 0)) && 3908 (adev->ip_blocks[i].version->type == 3909 AMD_IP_BLOCK_TYPE_SDMA)) 3910 continue; 3911 3912 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3913 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3914 * from this location and RLC Autoload automatically also gets loaded 3915 * from here based on PMFW -> PSP message during re-init sequence. 3916 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3917 * the TMR and reload FWs again for IMU enabled APU ASICs. 3918 */ 3919 if (amdgpu_in_reset(adev) && 3920 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3921 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3922 continue; 3923 3924 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3925 if (r) 3926 goto unwind; 3927 3928 /* handle putting the SMC in the appropriate state */ 3929 if (!amdgpu_sriov_vf(adev)) { 3930 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3931 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3932 if (r) { 3933 dev_err(adev->dev, 3934 "SMC failed to set mp1 state %d, %d\n", 3935 adev->mp1_state, r); 3936 goto unwind; 3937 } 3938 } 3939 } 3940 } 3941 3942 return 0; 3943 unwind: 3944 /* suspend phase 2 = resume phase 1 + resume phase 2 */ 3945 rec = amdgpu_device_ip_resume_phase1(adev); 3946 if (rec) { 3947 dev_err(adev->dev, 3948 "amdgpu_device_ip_resume_phase1 failed during unwind: %d\n", 3949 rec); 3950 return r; 3951 } 3952 3953 rec = amdgpu_device_fw_loading(adev); 3954 if (rec) { 3955 dev_err(adev->dev, 3956 "amdgpu_device_fw_loading failed during unwind: %d\n", 3957 rec); 3958 return r; 3959 } 3960 3961 rec = amdgpu_device_ip_resume_phase2(adev); 3962 if (rec) { 3963 dev_err(adev->dev, 3964 "amdgpu_device_ip_resume_phase2 failed during unwind: %d\n", 3965 rec); 3966 return r; 3967 } 3968 3969 return r; 3970 } 3971 3972 /** 3973 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3974 * 3975 * @adev: amdgpu_device pointer 3976 * 3977 * Main suspend function for hardware IPs. The list of all the hardware 3978 * IPs that make up the asic is walked, clockgating is disabled and the 3979 * suspend callbacks are run. suspend puts the hardware and software state 3980 * in each IP into a state suitable for suspend. 3981 * Returns 0 on success, negative error code on failure. 
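 * For SR-IOV virtual functions, exclusive (full) GPU access is requested
 * before the suspend phases run and released again afterwards.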
3982 */ 3983 static int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3984 { 3985 int r; 3986 3987 if (amdgpu_sriov_vf(adev)) { 3988 amdgpu_virt_fini_data_exchange(adev); 3989 amdgpu_virt_request_full_gpu(adev, false); 3990 } 3991 3992 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3993 3994 r = amdgpu_device_ip_suspend_phase1(adev); 3995 if (r) 3996 return r; 3997 r = amdgpu_device_ip_suspend_phase2(adev); 3998 3999 if (amdgpu_sriov_vf(adev)) 4000 amdgpu_virt_release_full_gpu(adev, false); 4001 4002 return r; 4003 } 4004 4005 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 4006 { 4007 int i, r; 4008 4009 static enum amd_ip_block_type ip_order[] = { 4010 AMD_IP_BLOCK_TYPE_COMMON, 4011 AMD_IP_BLOCK_TYPE_GMC, 4012 AMD_IP_BLOCK_TYPE_PSP, 4013 AMD_IP_BLOCK_TYPE_IH, 4014 }; 4015 4016 for (i = 0; i < adev->num_ip_blocks; i++) { 4017 int j; 4018 struct amdgpu_ip_block *block; 4019 4020 block = &adev->ip_blocks[i]; 4021 block->status.hw = false; 4022 4023 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 4024 4025 if (block->version->type != ip_order[j] || 4026 !block->status.valid) 4027 continue; 4028 4029 r = block->version->funcs->hw_init(&adev->ip_blocks[i]); 4030 if (r) { 4031 dev_err(adev->dev, "RE-INIT-early: %s failed\n", 4032 block->version->funcs->name); 4033 return r; 4034 } 4035 block->status.hw = true; 4036 } 4037 } 4038 4039 return 0; 4040 } 4041 4042 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 4043 { 4044 struct amdgpu_ip_block *block; 4045 int i, r = 0; 4046 4047 static enum amd_ip_block_type ip_order[] = { 4048 AMD_IP_BLOCK_TYPE_SMC, 4049 AMD_IP_BLOCK_TYPE_DCE, 4050 AMD_IP_BLOCK_TYPE_GFX, 4051 AMD_IP_BLOCK_TYPE_SDMA, 4052 AMD_IP_BLOCK_TYPE_MES, 4053 AMD_IP_BLOCK_TYPE_UVD, 4054 AMD_IP_BLOCK_TYPE_VCE, 4055 AMD_IP_BLOCK_TYPE_VCN, 4056 AMD_IP_BLOCK_TYPE_JPEG 4057 }; 4058 4059 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 4060 block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]); 4061 4062 if (!block) 4063 continue; 4064 4065 if (block->status.valid && !block->status.hw) { 4066 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) { 4067 r = amdgpu_ip_block_resume(block); 4068 } else { 4069 r = block->version->funcs->hw_init(block); 4070 } 4071 4072 if (r) { 4073 dev_err(adev->dev, "RE-INIT-late: %s failed\n", 4074 block->version->funcs->name); 4075 break; 4076 } 4077 block->status.hw = true; 4078 } 4079 } 4080 4081 return r; 4082 } 4083 4084 /** 4085 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 4086 * 4087 * @adev: amdgpu_device pointer 4088 * 4089 * First resume function for hardware IPs. The list of all the hardware 4090 * IPs that make up the asic is walked and the resume callbacks are run for 4091 * COMMON, GMC, and IH. resume puts the hardware into a functional state 4092 * after a suspend and updates the software state as necessary. This 4093 * function is also used for restoring the GPU after a GPU reset. 4094 * Returns 0 on success, negative error code on failure. 
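 * On SR-IOV virtual functions the PSP block is also resumed in this phase.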
4095 */ 4096 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 4097 { 4098 int i, r; 4099 4100 for (i = 0; i < adev->num_ip_blocks; i++) { 4101 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 4102 continue; 4103 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 4104 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 4105 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 4106 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 4107 4108 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 4109 if (r) 4110 return r; 4111 } 4112 } 4113 4114 return 0; 4115 } 4116 4117 /** 4118 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 4119 * 4120 * @adev: amdgpu_device pointer 4121 * 4122 * Second resume function for hardware IPs. The list of all the hardware 4123 * IPs that make up the asic is walked and the resume callbacks are run for 4124 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 4125 * functional state after a suspend and updates the software state as 4126 * necessary. This function is also used for restoring the GPU after a GPU 4127 * reset. 4128 * Returns 0 on success, negative error code on failure. 4129 */ 4130 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 4131 { 4132 int i, r; 4133 4134 for (i = 0; i < adev->num_ip_blocks; i++) { 4135 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 4136 continue; 4137 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 4138 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 4139 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 4140 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE || 4141 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 4142 continue; 4143 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 4144 if (r) 4145 return r; 4146 } 4147 4148 return 0; 4149 } 4150 4151 /** 4152 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs 4153 * 4154 * @adev: amdgpu_device pointer 4155 * 4156 * Third resume function for hardware IPs. The list of all the hardware 4157 * IPs that make up the asic is walked and the resume callbacks are run for 4158 * all DCE. resume puts the hardware into a functional state after a suspend 4159 * and updates the software state as necessary. This function is also used 4160 * for restoring the GPU after a GPU reset. 4161 * 4162 * Returns 0 on success, negative error code on failure. 4163 */ 4164 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev) 4165 { 4166 int i, r; 4167 4168 for (i = 0; i < adev->num_ip_blocks; i++) { 4169 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 4170 continue; 4171 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) { 4172 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 4173 if (r) 4174 return r; 4175 } 4176 } 4177 4178 return 0; 4179 } 4180 4181 /** 4182 * amdgpu_device_ip_resume - run resume for hardware IPs 4183 * 4184 * @adev: amdgpu_device pointer 4185 * 4186 * Main resume function for hardware IPs. The hardware IPs 4187 * are split into two resume functions because they are 4188 * also used in recovering from a GPU reset and some additional 4189 * steps need to be take between them. In this case (S3/S4) they are 4190 * run sequentially. 4191 * Returns 0 on success, negative error code on failure. 
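 * Note that display (DCE) hardware is resumed separately in
 * amdgpu_device_ip_resume_phase3(), which runs last, after the fence
 * driver has been re-armed.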
4192 */
4193 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
4194 {
4195 int r;
4196
4197 r = amdgpu_device_ip_resume_phase1(adev);
4198 if (r)
4199 return r;
4200
4201 r = amdgpu_device_fw_loading(adev);
4202 if (r)
4203 return r;
4204
4205 r = amdgpu_device_ip_resume_phase2(adev);
4206
4207 if (adev->mman.buffer_funcs_ring->sched.ready)
4208 amdgpu_ttm_set_buffer_funcs_status(adev, true);
4209
4210 if (r)
4211 return r;
4212
4213 amdgpu_fence_driver_hw_init(adev);
4214
4215 r = amdgpu_device_ip_resume_phase3(adev);
4216
4217 return r;
4218 }
4219
4220 /**
4221 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
4222 *
4223 * @adev: amdgpu_device pointer
4224 *
4225 * Query the VBIOS data tables to determine if the board supports SR-IOV.
4226 */
4227 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
4228 {
4229 if (amdgpu_sriov_vf(adev)) {
4230 if (adev->is_atom_fw) {
4231 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
4232 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
4233 } else {
4234 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
4235 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
4236 }
4237
4238 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
4239 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
4240 }
4241 }
4242
4243 /**
4244 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
4245 *
4246 * @pdev: pci device context
4247 * @asic_type: AMD asic type
4248 *
4249 * Check if there is DC (new modesetting infrastructure) support for an asic.
4250 * Returns true if DC has support, false if not.
4251 */
4252 bool amdgpu_device_asic_has_dc_support(struct pci_dev *pdev,
4253 enum amd_asic_type asic_type)
4254 {
4255 switch (asic_type) {
4256 #ifdef CONFIG_DRM_AMDGPU_SI
4257 case CHIP_HAINAN:
4258 #endif
4259 case CHIP_TOPAZ:
4260 /* chips with no display hardware */
4261 return false;
4262 #if defined(CONFIG_DRM_AMD_DC)
4263 case CHIP_TAHITI:
4264 case CHIP_PITCAIRN:
4265 case CHIP_VERDE:
4266 case CHIP_OLAND:
4267 return amdgpu_dc != 0 && IS_ENABLED(CONFIG_DRM_AMD_DC_SI);
4268 case CHIP_KAVERI:
4269 case CHIP_KABINI:
4270 case CHIP_MULLINS:
4271 /*
4272 * We have systems in the wild with these ASICs that require
4273 * TRAVIS and NUTMEG support which is not supported with DC.
4274 *
4275 * Fall back to the non-DC driver here by default so as not to
4276 * cause regressions.
4277 */ 4278 return amdgpu_dc > 0; 4279 default: 4280 return amdgpu_dc != 0; 4281 #else 4282 default: 4283 if (amdgpu_dc > 0) 4284 dev_info_once( 4285 &pdev->dev, 4286 "Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 4287 return false; 4288 #endif 4289 } 4290 } 4291 4292 /** 4293 * amdgpu_device_has_dc_support - check if dc is supported 4294 * 4295 * @adev: amdgpu_device pointer 4296 * 4297 * Returns true for supported, false for not supported 4298 */ 4299 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 4300 { 4301 if (adev->enable_virtual_display || 4302 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 4303 return false; 4304 4305 return amdgpu_device_asic_has_dc_support(adev->pdev, adev->asic_type); 4306 } 4307 4308 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 4309 { 4310 struct amdgpu_device *adev = 4311 container_of(__work, struct amdgpu_device, xgmi_reset_work); 4312 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 4313 4314 /* It's a bug to not have a hive within this function */ 4315 if (WARN_ON(!hive)) 4316 return; 4317 4318 /* 4319 * Use task barrier to synchronize all xgmi reset works across the 4320 * hive. task_barrier_enter and task_barrier_exit will block 4321 * until all the threads running the xgmi reset works reach 4322 * those points. task_barrier_full will do both blocks. 4323 */ 4324 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 4325 4326 task_barrier_enter(&hive->tb); 4327 adev->asic_reset_res = amdgpu_device_baco_enter(adev); 4328 4329 if (adev->asic_reset_res) 4330 goto fail; 4331 4332 task_barrier_exit(&hive->tb); 4333 adev->asic_reset_res = amdgpu_device_baco_exit(adev); 4334 4335 if (adev->asic_reset_res) 4336 goto fail; 4337 4338 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 4339 } else { 4340 4341 task_barrier_full(&hive->tb); 4342 adev->asic_reset_res = amdgpu_asic_reset(adev); 4343 } 4344 4345 fail: 4346 if (adev->asic_reset_res) 4347 dev_warn(adev->dev, 4348 "ASIC reset failed with error, %d for drm dev, %s", 4349 adev->asic_reset_res, adev_to_drm(adev)->unique); 4350 amdgpu_put_xgmi_hive(hive); 4351 } 4352 4353 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 4354 { 4355 char *input = amdgpu_lockup_timeout; 4356 char *timeout_setting = NULL; 4357 int index = 0; 4358 long timeout; 4359 int ret = 0; 4360 4361 /* By default timeout for all queues is 2 sec */ 4362 adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout = 4363 adev->video_timeout = msecs_to_jiffies(2000); 4364 4365 if (!strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) 4366 return 0; 4367 4368 while ((timeout_setting = strsep(&input, ",")) && 4369 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4370 ret = kstrtol(timeout_setting, 0, &timeout); 4371 if (ret) 4372 return ret; 4373 4374 if (timeout == 0) { 4375 index++; 4376 continue; 4377 } else if (timeout < 0) { 4378 timeout = MAX_SCHEDULE_TIMEOUT; 4379 dev_warn(adev->dev, "lockup timeout disabled"); 4380 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 4381 } else { 4382 timeout = msecs_to_jiffies(timeout); 4383 } 4384 4385 switch (index++) { 4386 case 0: 4387 adev->gfx_timeout = timeout; 4388 break; 4389 case 1: 4390 adev->compute_timeout = timeout; 4391 break; 4392 case 2: 4393 adev->sdma_timeout = timeout; 4394 break; 4395 case 3: 4396 adev->video_timeout = timeout; 4397 break; 4398 default: 4399 break; 4400 } 4401 } 4402 4403 /* When only one value specified apply it 
to all queues. */ 4404 if (index == 1) 4405 adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout = 4406 adev->video_timeout = timeout; 4407 4408 return ret; 4409 } 4410 4411 /** 4412 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 4413 * 4414 * @adev: amdgpu_device pointer 4415 * 4416 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 4417 */ 4418 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 4419 { 4420 struct iommu_domain *domain; 4421 4422 domain = iommu_get_domain_for_dev(adev->dev); 4423 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 4424 adev->ram_is_direct_mapped = true; 4425 } 4426 4427 #if defined(CONFIG_HSA_AMD_P2P) 4428 /** 4429 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled. 4430 * 4431 * @adev: amdgpu_device pointer 4432 * 4433 * return if IOMMU remapping bar address 4434 */ 4435 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev) 4436 { 4437 struct iommu_domain *domain; 4438 4439 domain = iommu_get_domain_for_dev(adev->dev); 4440 if (domain && (domain->type == IOMMU_DOMAIN_DMA || 4441 domain->type == IOMMU_DOMAIN_DMA_FQ)) 4442 return true; 4443 4444 return false; 4445 } 4446 #endif 4447 4448 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 4449 { 4450 if (amdgpu_mcbp == 1) 4451 adev->gfx.mcbp = true; 4452 else if (amdgpu_mcbp == 0) 4453 adev->gfx.mcbp = false; 4454 4455 if (amdgpu_sriov_vf(adev)) 4456 adev->gfx.mcbp = true; 4457 4458 if (adev->gfx.mcbp) 4459 dev_info(adev->dev, "MCBP is enabled\n"); 4460 } 4461 4462 static int amdgpu_device_sys_interface_init(struct amdgpu_device *adev) 4463 { 4464 int r; 4465 4466 r = amdgpu_atombios_sysfs_init(adev); 4467 if (r) 4468 drm_err(&adev->ddev, 4469 "registering atombios sysfs failed (%d).\n", r); 4470 4471 r = amdgpu_pm_sysfs_init(adev); 4472 if (r) 4473 dev_err(adev->dev, "registering pm sysfs failed (%d).\n", r); 4474 4475 r = amdgpu_ucode_sysfs_init(adev); 4476 if (r) { 4477 adev->ucode_sysfs_en = false; 4478 dev_err(adev->dev, "Creating firmware sysfs failed (%d).\n", r); 4479 } else 4480 adev->ucode_sysfs_en = true; 4481 4482 r = amdgpu_device_attr_sysfs_init(adev); 4483 if (r) 4484 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4485 4486 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4487 if (r) 4488 dev_err(adev->dev, 4489 "Could not create amdgpu board attributes\n"); 4490 4491 amdgpu_fru_sysfs_init(adev); 4492 amdgpu_reg_state_sysfs_init(adev); 4493 amdgpu_xcp_sysfs_init(adev); 4494 4495 return r; 4496 } 4497 4498 static void amdgpu_device_sys_interface_fini(struct amdgpu_device *adev) 4499 { 4500 if (adev->pm.sysfs_initialized) 4501 amdgpu_pm_sysfs_fini(adev); 4502 if (adev->ucode_sysfs_en) 4503 amdgpu_ucode_sysfs_fini(adev); 4504 amdgpu_device_attr_sysfs_fini(adev); 4505 amdgpu_fru_sysfs_fini(adev); 4506 4507 amdgpu_reg_state_sysfs_fini(adev); 4508 amdgpu_xcp_sysfs_fini(adev); 4509 } 4510 4511 /** 4512 * amdgpu_device_init - initialize the driver 4513 * 4514 * @adev: amdgpu_device pointer 4515 * @flags: driver flags 4516 * 4517 * Initializes the driver info and hw (all asics). 4518 * Returns 0 for success or an error on failure. 4519 * Called at driver startup. 
4520 */ 4521 int amdgpu_device_init(struct amdgpu_device *adev, 4522 uint32_t flags) 4523 { 4524 struct pci_dev *pdev = adev->pdev; 4525 int r, i; 4526 bool px = false; 4527 u32 max_MBps; 4528 int tmp; 4529 4530 adev->shutdown = false; 4531 adev->flags = flags; 4532 4533 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 4534 adev->asic_type = amdgpu_force_asic_type; 4535 else 4536 adev->asic_type = flags & AMD_ASIC_MASK; 4537 4538 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 4539 if (amdgpu_emu_mode == 1) 4540 adev->usec_timeout *= 10; 4541 adev->gmc.gart_size = 512 * 1024 * 1024; 4542 adev->accel_working = false; 4543 adev->num_rings = 0; 4544 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 4545 adev->mman.buffer_funcs = NULL; 4546 adev->mman.buffer_funcs_ring = NULL; 4547 adev->vm_manager.vm_pte_funcs = NULL; 4548 adev->vm_manager.vm_pte_num_scheds = 0; 4549 adev->gmc.gmc_funcs = NULL; 4550 adev->harvest_ip_mask = 0x0; 4551 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 4552 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 4553 4554 adev->smc_rreg = &amdgpu_invalid_rreg; 4555 adev->smc_wreg = &amdgpu_invalid_wreg; 4556 adev->pcie_rreg = &amdgpu_invalid_rreg; 4557 adev->pcie_wreg = &amdgpu_invalid_wreg; 4558 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 4559 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 4560 adev->pciep_rreg = &amdgpu_invalid_rreg; 4561 adev->pciep_wreg = &amdgpu_invalid_wreg; 4562 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 4563 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 4564 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 4565 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 4566 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 4567 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 4568 adev->didt_rreg = &amdgpu_invalid_rreg; 4569 adev->didt_wreg = &amdgpu_invalid_wreg; 4570 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 4571 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 4572 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 4573 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 4574 4575 dev_info( 4576 adev->dev, 4577 "initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 4578 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 4579 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 4580 4581 /* mutex initialization are all done here so we 4582 * can recall function without having locking issues 4583 */ 4584 mutex_init(&adev->firmware.mutex); 4585 mutex_init(&adev->pm.mutex); 4586 mutex_init(&adev->gfx.gpu_clock_mutex); 4587 mutex_init(&adev->srbm_mutex); 4588 mutex_init(&adev->gfx.pipe_reserve_mutex); 4589 mutex_init(&adev->gfx.gfx_off_mutex); 4590 mutex_init(&adev->gfx.partition_mutex); 4591 mutex_init(&adev->grbm_idx_mutex); 4592 mutex_init(&adev->mn_lock); 4593 mutex_init(&adev->virt.vf_errors.lock); 4594 hash_init(adev->mn_hash); 4595 mutex_init(&adev->psp.mutex); 4596 mutex_init(&adev->notifier_lock); 4597 mutex_init(&adev->pm.stable_pstate_ctx_lock); 4598 mutex_init(&adev->benchmark_mutex); 4599 mutex_init(&adev->gfx.reset_sem_mutex); 4600 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */ 4601 mutex_init(&adev->enforce_isolation_mutex); 4602 for (i = 0; i < MAX_XCP; ++i) { 4603 adev->isolation[i].spearhead = dma_fence_get_stub(); 4604 amdgpu_sync_create(&adev->isolation[i].active); 4605 amdgpu_sync_create(&adev->isolation[i].prev); 4606 } 4607 mutex_init(&adev->gfx.userq_sch_mutex); 4608 
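/* Serialize workload profile changes for the GFX and VCN blocks */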
mutex_init(&adev->gfx.workload_profile_mutex); 4609 mutex_init(&adev->vcn.workload_profile_mutex); 4610 4611 amdgpu_device_init_apu_flags(adev); 4612 4613 r = amdgpu_device_check_arguments(adev); 4614 if (r) 4615 return r; 4616 4617 spin_lock_init(&adev->mmio_idx_lock); 4618 spin_lock_init(&adev->smc_idx_lock); 4619 spin_lock_init(&adev->pcie_idx_lock); 4620 spin_lock_init(&adev->uvd_ctx_idx_lock); 4621 spin_lock_init(&adev->didt_idx_lock); 4622 spin_lock_init(&adev->gc_cac_idx_lock); 4623 spin_lock_init(&adev->se_cac_idx_lock); 4624 spin_lock_init(&adev->audio_endpt_idx_lock); 4625 spin_lock_init(&adev->mm_stats.lock); 4626 spin_lock_init(&adev->virt.rlcg_reg_lock); 4627 spin_lock_init(&adev->wb.lock); 4628 4629 xa_init_flags(&adev->userq_xa, XA_FLAGS_LOCK_IRQ); 4630 4631 INIT_LIST_HEAD(&adev->reset_list); 4632 4633 INIT_LIST_HEAD(&adev->ras_list); 4634 4635 INIT_LIST_HEAD(&adev->pm.od_kobj_list); 4636 4637 xa_init(&adev->userq_doorbell_xa); 4638 4639 INIT_DELAYED_WORK(&adev->delayed_init_work, 4640 amdgpu_device_delayed_init_work_handler); 4641 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 4642 amdgpu_device_delay_enable_gfx_off); 4643 /* 4644 * Initialize the enforce_isolation work structures for each XCP 4645 * partition. This work handler is responsible for enforcing shader 4646 * isolation on AMD GPUs. It counts the number of emitted fences for 4647 * each GFX and compute ring. If there are any fences, it schedules 4648 * the `enforce_isolation_work` to be run after a delay. If there are 4649 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the 4650 * runqueue. 4651 */ 4652 for (i = 0; i < MAX_XCP; i++) { 4653 INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work, 4654 amdgpu_gfx_enforce_isolation_handler); 4655 adev->gfx.enforce_isolation[i].adev = adev; 4656 adev->gfx.enforce_isolation[i].xcp_id = i; 4657 } 4658 4659 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 4660 INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work); 4661 4662 adev->gfx.gfx_off_req_count = 1; 4663 adev->gfx.gfx_off_residency = 0; 4664 adev->gfx.gfx_off_entrycount = 0; 4665 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 4666 4667 atomic_set(&adev->throttling_logging_enabled, 1); 4668 /* 4669 * If throttling continues, logging will be performed every minute 4670 * to avoid log flooding. "-1" is subtracted since the thermal 4671 * throttling interrupt comes every second. Thus, the total logging 4672 * interval is 59 seconds(retelimited printk interval) + 1(waiting 4673 * for throttling interrupt) = 60 seconds. 
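 * E.g. with CONFIG_HZ=250 this corresponds to a ratelimit interval of
 * (60 - 1) * 250 = 14750 jiffies.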
4674 */
4675 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
4676
4677 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
4678
4679 /* Registers mapping */
4680 /* TODO: block userspace mapping of io register */
4681 if (adev->asic_type >= CHIP_BONAIRE) {
4682 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
4683 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
4684 } else {
4685 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
4686 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
4687 }
4688
4689 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
4690 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
4691
4692 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
4693 if (!adev->rmmio)
4694 return -ENOMEM;
4695
4696 dev_info(adev->dev, "register mmio base: 0x%08X\n",
4697 (uint32_t)adev->rmmio_base);
4698 dev_info(adev->dev, "register mmio size: %u\n",
4699 (unsigned int)adev->rmmio_size);
4700
4701 /*
4702 * The reset domain needs to be present early, before the XGMI hive (if
4703 * any) is discovered and initialized, so that the reset semaphore and
4704 * in_gpu_reset flag can be used early during init and before calling RREG32.
4705 */
4706 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
4707 if (!adev->reset_domain)
4708 return -ENOMEM;
4709
4710 /* detect hw virtualization here */
4711 amdgpu_virt_init(adev);
4712
4713 amdgpu_device_get_pcie_info(adev);
4714
4715 r = amdgpu_device_get_job_timeout_settings(adev);
4716 if (r) {
4717 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
4718 return r;
4719 }
4720
4721 amdgpu_device_set_mcbp(adev);
4722
4723 /*
4724 * By default, use the default mode where all blocks are expected to be
4725 * initialized. At present a 'swinit' of blocks is required to be
4726 * completed before the need for a different level is detected.
4727 */
4728 amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT);
4729 /* early init functions */
4730 r = amdgpu_device_ip_early_init(adev);
4731 if (r)
4732 return r;
4733
4734 /*
4735 * No need to remove conflicting FBs for non-display class devices.
4736 * This prevents the sysfb from being freed accidentally.
4737 */
4738 if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA ||
4739 (pdev->class >> 8) == PCI_CLASS_DISPLAY_OTHER) {
4740 /* Get rid of things like offb */
4741 r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name);
4742 if (r)
4743 return r;
4744 }
4745
4746 /* Enable TMZ based on IP_VERSION */
4747 amdgpu_gmc_tmz_set(adev);
4748
4749 if (amdgpu_sriov_vf(adev) &&
4750 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
4751 /* VF MMIO access (except mailbox range) from CPU
4752 * will be blocked during sriov runtime
4753 */
4754 adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;
4755
4756 amdgpu_gmc_noretry_set(adev);
4757 /* Need to get xgmi info early to decide the reset behavior */
4758 if (adev->gmc.xgmi.supported) {
4759 r = adev->gfxhub.funcs->get_xgmi_info(adev);
4760 if (r)
4761 return r;
4762 }
4763
4764 /* enable PCIE atomic ops */
4765 if (amdgpu_sriov_vf(adev)) {
4766 if (adev->virt.fw_reserve.p_pf2vf)
4767 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
4768 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
4769 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
4770 /* APUs with gfx9 and newer don't rely on PCIe atomics; the internal
4771 * path natively supports atomics, so set have_atomics_support to true.
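 * (For VFs the PF reports support via pcie_atomic_ops_support_flags and both
 * PCI_EXP_DEVCAP2_ATOMIC_COMP32 and PCI_EXP_DEVCAP2_ATOMIC_COMP64 must be
 * set; bare-metal dGPUs likewise need both capabilities from the root port
 * via pci_enable_atomic_ops_to_root().)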
4772 */ 4773 } else if ((adev->flags & AMD_IS_APU) && 4774 (amdgpu_ip_version(adev, GC_HWIP, 0) > 4775 IP_VERSION(9, 0, 0))) { 4776 adev->have_atomics_support = true; 4777 } else { 4778 adev->have_atomics_support = 4779 !pci_enable_atomic_ops_to_root(adev->pdev, 4780 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 4781 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4782 } 4783 4784 if (!adev->have_atomics_support) 4785 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 4786 4787 /* doorbell bar mapping and doorbell index init*/ 4788 amdgpu_doorbell_init(adev); 4789 4790 if (amdgpu_emu_mode == 1) { 4791 /* post the asic on emulation mode */ 4792 emu_soc_asic_init(adev); 4793 goto fence_driver_init; 4794 } 4795 4796 amdgpu_reset_init(adev); 4797 4798 /* detect if we are with an SRIOV vbios */ 4799 if (adev->bios) 4800 amdgpu_device_detect_sriov_bios(adev); 4801 4802 /* check if we need to reset the asic 4803 * E.g., driver was not cleanly unloaded previously, etc. 4804 */ 4805 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4806 if (adev->gmc.xgmi.num_physical_nodes) { 4807 dev_info(adev->dev, "Pending hive reset.\n"); 4808 amdgpu_set_init_level(adev, 4809 AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 4810 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && 4811 !amdgpu_device_has_display_hardware(adev)) { 4812 r = psp_gpu_reset(adev); 4813 } else { 4814 tmp = amdgpu_reset_method; 4815 /* It should do a default reset when loading or reloading the driver, 4816 * regardless of the module parameter reset_method. 4817 */ 4818 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4819 r = amdgpu_asic_reset(adev); 4820 amdgpu_reset_method = tmp; 4821 } 4822 4823 if (r) { 4824 dev_err(adev->dev, "asic reset on init failed\n"); 4825 goto failed; 4826 } 4827 } 4828 4829 /* Post card if necessary */ 4830 if (amdgpu_device_need_post(adev)) { 4831 if (!adev->bios) { 4832 dev_err(adev->dev, "no vBIOS found\n"); 4833 r = -EINVAL; 4834 goto failed; 4835 } 4836 dev_info(adev->dev, "GPU posting now...\n"); 4837 r = amdgpu_device_asic_init(adev); 4838 if (r) { 4839 dev_err(adev->dev, "gpu post error!\n"); 4840 goto failed; 4841 } 4842 } 4843 4844 if (adev->bios) { 4845 if (adev->is_atom_fw) { 4846 /* Initialize clocks */ 4847 r = amdgpu_atomfirmware_get_clock_info(adev); 4848 if (r) { 4849 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4850 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4851 goto failed; 4852 } 4853 } else { 4854 /* Initialize clocks */ 4855 r = amdgpu_atombios_get_clock_info(adev); 4856 if (r) { 4857 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4858 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4859 goto failed; 4860 } 4861 /* init i2c buses */ 4862 amdgpu_i2c_init(adev); 4863 } 4864 } 4865 4866 fence_driver_init: 4867 /* Fence driver */ 4868 r = amdgpu_fence_driver_sw_init(adev); 4869 if (r) { 4870 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4871 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4872 goto failed; 4873 } 4874 4875 /* init the mode config */ 4876 drm_mode_config_init(adev_to_drm(adev)); 4877 4878 r = amdgpu_device_ip_init(adev); 4879 if (r) { 4880 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4881 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4882 goto release_ras_con; 4883 } 4884 4885 amdgpu_fence_driver_hw_init(adev); 4886 4887 dev_info(adev->dev, 4888 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4889 
adev->gfx.config.max_shader_engines,
4890 adev->gfx.config.max_sh_per_se,
4891 adev->gfx.config.max_cu_per_sh,
4892 adev->gfx.cu_info.number);
4893
4894 adev->accel_working = true;
4895
4896 amdgpu_vm_check_compute_bug(adev);
4897
4898 /* Initialize the buffer migration limit. */
4899 if (amdgpu_moverate >= 0)
4900 max_MBps = amdgpu_moverate;
4901 else
4902 max_MBps = 8; /* Allow 8 MB/s. */
4903 /* Get a log2 for easy divisions. */
4904 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
4905
4906 /*
4907 * Register the gpu instance before amdgpu_device_enable_mgpu_fan_boost.
4908 * Otherwise the mgpu fan boost feature will be skipped because the
4909 * gpu instance count would be too low.
4910 */
4911 amdgpu_register_gpu_instance(adev);
4912
4913 /* enable clockgating, etc. after ib tests, etc. since some blocks require
4914 * explicit gating rather than handling it automatically.
4915 */
4916 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
4917 r = amdgpu_device_ip_late_init(adev);
4918 if (r) {
4919 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
4920 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
4921 goto release_ras_con;
4922 }
4923 /* must succeed. */
4924 amdgpu_ras_resume(adev);
4925 queue_delayed_work(system_wq, &adev->delayed_init_work,
4926 msecs_to_jiffies(AMDGPU_RESUME_MS));
4927 }
4928
4929 if (amdgpu_sriov_vf(adev)) {
4930 amdgpu_virt_release_full_gpu(adev, true);
4931 flush_delayed_work(&adev->delayed_init_work);
4932 }
4933
4934 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
4935 amdgpu_xgmi_reset_on_init(adev);
4936 /*
4937 * Register these sysfs interfaces after `late_init`, as some of the
4938 * operations performed in `late_init` might affect how the sysfs
4939 * interfaces are created.
4940 */
4941 r = amdgpu_device_sys_interface_init(adev);
4942
4943 if (IS_ENABLED(CONFIG_PERF_EVENTS))
4944 r = amdgpu_pmu_init(adev);
4945 if (r)
4946 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
4947
4948 /* Keep the stored pci config space at hand for restore after a sudden PCI error */
4949 if (amdgpu_device_cache_pci_state(adev->pdev))
4950 pci_restore_state(pdev);
4951
4952 /* if we have > 1 VGA card, then disable the amdgpu VGA resources */
4953 /* this will fail for cards that aren't VGA class devices, just
4954 * ignore it
4955 */
4956 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4957 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
4958
4959 px = amdgpu_device_supports_px(adev);
4960
4961 if (px || (!dev_is_removable(&adev->pdev->dev) &&
4962 apple_gmux_detect(NULL, NULL)))
4963 vga_switcheroo_register_client(adev->pdev,
4964 &amdgpu_switcheroo_ops, px);
4965
4966 if (px)
4967 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
4968
4969 amdgpu_device_check_iommu_direct_map(adev);
4970
4971 adev->pm_nb.notifier_call = amdgpu_device_pm_notifier;
4972 r = register_pm_notifier(&adev->pm_nb);
4973 if (r)
4974 goto failed;
4975
4976 return 0;
4977
4978 release_ras_con:
4979 if (amdgpu_sriov_vf(adev))
4980 amdgpu_virt_release_full_gpu(adev, true);
4981
4982 /* failed in exclusive mode due to timeout */
4983 if (amdgpu_sriov_vf(adev) &&
4984 !amdgpu_sriov_runtime(adev) &&
4985 amdgpu_virt_mmio_blocked(adev) &&
4986 !amdgpu_virt_wait_reset(adev)) {
4987 dev_err(adev->dev, "VF exclusive mode timeout\n");
4988 /* Don't send request since VF is inactive.
*/ 4989 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4990 adev->virt.ops = NULL; 4991 r = -EAGAIN; 4992 } 4993 amdgpu_release_ras_context(adev); 4994 4995 failed: 4996 amdgpu_vf_error_trans_all(adev); 4997 4998 return r; 4999 } 5000 5001 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 5002 { 5003 5004 /* Clear all CPU mappings pointing to this device */ 5005 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 5006 5007 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 5008 amdgpu_doorbell_fini(adev); 5009 5010 iounmap(adev->rmmio); 5011 adev->rmmio = NULL; 5012 if (adev->mman.aper_base_kaddr) 5013 iounmap(adev->mman.aper_base_kaddr); 5014 adev->mman.aper_base_kaddr = NULL; 5015 5016 /* Memory manager related */ 5017 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 5018 arch_phys_wc_del(adev->gmc.vram_mtrr); 5019 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 5020 } 5021 } 5022 5023 /** 5024 * amdgpu_device_fini_hw - tear down the driver 5025 * 5026 * @adev: amdgpu_device pointer 5027 * 5028 * Tear down the driver info (all asics). 5029 * Called at driver shutdown. 5030 */ 5031 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 5032 { 5033 dev_info(adev->dev, "amdgpu: finishing device.\n"); 5034 flush_delayed_work(&adev->delayed_init_work); 5035 5036 if (adev->mman.initialized) 5037 drain_workqueue(adev->mman.bdev.wq); 5038 adev->shutdown = true; 5039 5040 unregister_pm_notifier(&adev->pm_nb); 5041 5042 /* make sure IB test finished before entering exclusive mode 5043 * to avoid preemption on IB test 5044 */ 5045 if (amdgpu_sriov_vf(adev)) { 5046 amdgpu_virt_request_full_gpu(adev, false); 5047 amdgpu_virt_fini_data_exchange(adev); 5048 } 5049 5050 /* disable all interrupts */ 5051 amdgpu_irq_disable_all(adev); 5052 if (adev->mode_info.mode_config_initialized) { 5053 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 5054 drm_helper_force_disable_all(adev_to_drm(adev)); 5055 else 5056 drm_atomic_helper_shutdown(adev_to_drm(adev)); 5057 } 5058 amdgpu_fence_driver_hw_fini(adev); 5059 5060 amdgpu_device_sys_interface_fini(adev); 5061 5062 /* disable ras feature must before hw fini */ 5063 amdgpu_ras_pre_fini(adev); 5064 5065 amdgpu_ttm_set_buffer_funcs_status(adev, false); 5066 5067 amdgpu_device_ip_fini_early(adev); 5068 5069 amdgpu_irq_fini_hw(adev); 5070 5071 if (adev->mman.initialized) 5072 ttm_device_clear_dma_mappings(&adev->mman.bdev); 5073 5074 amdgpu_gart_dummy_page_fini(adev); 5075 5076 if (drm_dev_is_unplugged(adev_to_drm(adev))) 5077 amdgpu_device_unmap_mmio(adev); 5078 5079 } 5080 5081 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 5082 { 5083 int i, idx; 5084 bool px; 5085 5086 amdgpu_device_ip_fini(adev); 5087 amdgpu_fence_driver_sw_fini(adev); 5088 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 5089 adev->accel_working = false; 5090 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 5091 for (i = 0; i < MAX_XCP; ++i) { 5092 dma_fence_put(adev->isolation[i].spearhead); 5093 amdgpu_sync_free(&adev->isolation[i].active); 5094 amdgpu_sync_free(&adev->isolation[i].prev); 5095 } 5096 5097 amdgpu_reset_fini(adev); 5098 5099 /* free i2c buses */ 5100 amdgpu_i2c_fini(adev); 5101 5102 if (adev->bios) { 5103 if (amdgpu_emu_mode != 1) 5104 amdgpu_atombios_fini(adev); 5105 amdgpu_bios_release(adev); 5106 } 5107 5108 kfree(adev->fru_info); 5109 adev->fru_info = NULL; 5110 5111 kfree(adev->xcp_mgr); 5112 adev->xcp_mgr = NULL; 5113 5114 px = amdgpu_device_supports_px(adev); 5115 5116 if 
(px || (!dev_is_removable(&adev->pdev->dev) && 5117 apple_gmux_detect(NULL, NULL))) 5118 vga_switcheroo_unregister_client(adev->pdev); 5119 5120 if (px) 5121 vga_switcheroo_fini_domain_pm_ops(adev->dev); 5122 5123 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 5124 vga_client_unregister(adev->pdev); 5125 5126 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 5127 5128 iounmap(adev->rmmio); 5129 adev->rmmio = NULL; 5130 drm_dev_exit(idx); 5131 } 5132 5133 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 5134 amdgpu_pmu_fini(adev); 5135 if (adev->discovery.bin) 5136 amdgpu_discovery_fini(adev); 5137 5138 amdgpu_reset_put_reset_domain(adev->reset_domain); 5139 adev->reset_domain = NULL; 5140 5141 kfree(adev->pci_state); 5142 kfree(adev->pcie_reset_ctx.swds_pcistate); 5143 kfree(adev->pcie_reset_ctx.swus_pcistate); 5144 } 5145 5146 /** 5147 * amdgpu_device_evict_resources - evict device resources 5148 * @adev: amdgpu device object 5149 * 5150 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 5151 * of the vram memory type. Mainly used for evicting device resources 5152 * at suspend time. 5153 * 5154 */ 5155 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 5156 { 5157 int ret; 5158 5159 /* No need to evict vram on APUs unless going to S4 */ 5160 if (!adev->in_s4 && (adev->flags & AMD_IS_APU)) 5161 return 0; 5162 5163 /* No need to evict when going to S5 through S4 callbacks */ 5164 if (system_state == SYSTEM_POWER_OFF) 5165 return 0; 5166 5167 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 5168 if (ret) { 5169 dev_warn(adev->dev, "evicting device resources failed\n"); 5170 return ret; 5171 } 5172 5173 if (adev->in_s4) { 5174 ret = ttm_device_prepare_hibernation(&adev->mman.bdev); 5175 if (ret) 5176 dev_err(adev->dev, "prepare hibernation failed, %d\n", ret); 5177 } 5178 return ret; 5179 } 5180 5181 /* 5182 * Suspend & resume. 5183 */ 5184 /** 5185 * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events 5186 * @nb: notifier block 5187 * @mode: suspend mode 5188 * @data: data 5189 * 5190 * This function is called when the system is about to suspend or hibernate. 5191 * It is used to set the appropriate flags so that eviction can be optimized 5192 * in the pm prepare callback. 5193 */ 5194 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 5195 void *data) 5196 { 5197 struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb); 5198 5199 switch (mode) { 5200 case PM_HIBERNATION_PREPARE: 5201 adev->in_s4 = true; 5202 break; 5203 case PM_POST_HIBERNATION: 5204 adev->in_s4 = false; 5205 break; 5206 } 5207 5208 return NOTIFY_DONE; 5209 } 5210 5211 /** 5212 * amdgpu_device_prepare - prepare for device suspend 5213 * 5214 * @dev: drm dev pointer 5215 * 5216 * Prepare to put the hw in the suspend state (all asics). 5217 * Returns 0 for success or an error on failure. 5218 * Called at driver suspend. 
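 * This flushes the gfxoff delay work and invokes each IP block's
 * prepare_suspend callback before the suspend phases proper.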
5219 */ 5220 int amdgpu_device_prepare(struct drm_device *dev) 5221 { 5222 struct amdgpu_device *adev = drm_to_adev(dev); 5223 int i, r; 5224 5225 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5226 return 0; 5227 5228 /* Evict the majority of BOs before starting suspend sequence */ 5229 r = amdgpu_device_evict_resources(adev); 5230 if (r) 5231 return r; 5232 5233 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 5234 5235 for (i = 0; i < adev->num_ip_blocks; i++) { 5236 if (!adev->ip_blocks[i].status.valid) 5237 continue; 5238 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 5239 continue; 5240 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]); 5241 if (r) 5242 return r; 5243 } 5244 5245 return 0; 5246 } 5247 5248 /** 5249 * amdgpu_device_complete - complete power state transition 5250 * 5251 * @dev: drm dev pointer 5252 * 5253 * Undo the changes from amdgpu_device_prepare. This will be 5254 * called on all resume transitions, including those that failed. 5255 */ 5256 void amdgpu_device_complete(struct drm_device *dev) 5257 { 5258 struct amdgpu_device *adev = drm_to_adev(dev); 5259 int i; 5260 5261 for (i = 0; i < adev->num_ip_blocks; i++) { 5262 if (!adev->ip_blocks[i].status.valid) 5263 continue; 5264 if (!adev->ip_blocks[i].version->funcs->complete) 5265 continue; 5266 adev->ip_blocks[i].version->funcs->complete(&adev->ip_blocks[i]); 5267 } 5268 } 5269 5270 /** 5271 * amdgpu_device_suspend - initiate device suspend 5272 * 5273 * @dev: drm dev pointer 5274 * @notify_clients: notify in-kernel DRM clients 5275 * 5276 * Puts the hw in the suspend state (all asics). 5277 * Returns 0 for success or an error on failure. 5278 * Called at driver suspend. 5279 */ 5280 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) 5281 { 5282 struct amdgpu_device *adev = drm_to_adev(dev); 5283 int r, rec; 5284 5285 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5286 return 0; 5287 5288 adev->in_suspend = true; 5289 5290 if (amdgpu_sriov_vf(adev)) { 5291 if (!adev->in_runpm) 5292 amdgpu_amdkfd_suspend_process(adev); 5293 amdgpu_virt_fini_data_exchange(adev); 5294 r = amdgpu_virt_request_full_gpu(adev, false); 5295 if (r) 5296 return r; 5297 } 5298 5299 r = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D3); 5300 if (r) 5301 goto unwind_sriov; 5302 5303 if (notify_clients) 5304 drm_client_dev_suspend(adev_to_drm(adev)); 5305 5306 cancel_delayed_work_sync(&adev->delayed_init_work); 5307 5308 amdgpu_ras_suspend(adev); 5309 5310 r = amdgpu_device_ip_suspend_phase1(adev); 5311 if (r) 5312 goto unwind_smartshift; 5313 5314 amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); 5315 r = amdgpu_userq_suspend(adev); 5316 if (r) 5317 goto unwind_ip_phase1; 5318 5319 r = amdgpu_device_evict_resources(adev); 5320 if (r) 5321 goto unwind_userq; 5322 5323 amdgpu_ttm_set_buffer_funcs_status(adev, false); 5324 5325 amdgpu_fence_driver_hw_fini(adev); 5326 5327 r = amdgpu_device_ip_suspend_phase2(adev); 5328 if (r) 5329 goto unwind_evict; 5330 5331 if (amdgpu_sriov_vf(adev)) 5332 amdgpu_virt_release_full_gpu(adev, false); 5333 5334 return 0; 5335 5336 unwind_evict: 5337 if (adev->mman.buffer_funcs_ring->sched.ready) 5338 amdgpu_ttm_set_buffer_funcs_status(adev, true); 5339 amdgpu_fence_driver_hw_init(adev); 5340 5341 unwind_userq: 5342 rec = amdgpu_userq_resume(adev); 5343 if (rec) { 5344 dev_warn(adev->dev, "failed to re-initialize user queues: %d\n", rec); 5345 return r; 5346 } 5347 rec = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) 
&& !adev->in_runpm); 5348 if (rec) { 5349 dev_warn(adev->dev, "failed to re-initialize kfd: %d\n", rec); 5350 return r; 5351 } 5352 5353 unwind_ip_phase1: 5354 /* suspend phase 1 = resume phase 3 */ 5355 rec = amdgpu_device_ip_resume_phase3(adev); 5356 if (rec) { 5357 dev_warn(adev->dev, "failed to re-initialize IPs phase1: %d\n", rec); 5358 return r; 5359 } 5360 5361 unwind_smartshift: 5362 rec = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0); 5363 if (rec) { 5364 dev_warn(adev->dev, "failed to re-update smart shift: %d\n", rec); 5365 return r; 5366 } 5367 5368 if (notify_clients) 5369 drm_client_dev_resume(adev_to_drm(adev)); 5370 5371 amdgpu_ras_resume(adev); 5372 5373 unwind_sriov: 5374 if (amdgpu_sriov_vf(adev)) { 5375 rec = amdgpu_virt_request_full_gpu(adev, true); 5376 if (rec) { 5377 dev_warn(adev->dev, "failed to reinitialize sriov: %d\n", rec); 5378 return r; 5379 } 5380 } 5381 5382 adev->in_suspend = adev->in_s0ix = adev->in_s3 = false; 5383 5384 return r; 5385 } 5386 5387 static inline int amdgpu_virt_resume(struct amdgpu_device *adev) 5388 { 5389 int r; 5390 unsigned int prev_physical_node_id = adev->gmc.xgmi.physical_node_id; 5391 5392 /* During VM resume, QEMU programming of VF MSIX table (register GFXMSIX_VECT0_ADDR_LO) 5393 * may not work. The access could be blocked by nBIF protection as VF isn't in 5394 * exclusive access mode. Exclusive access is enabled now, disable/enable MSIX 5395 * so that QEMU reprograms MSIX table. 5396 */ 5397 amdgpu_restore_msix(adev); 5398 5399 r = adev->gfxhub.funcs->get_xgmi_info(adev); 5400 if (r) 5401 return r; 5402 5403 dev_info(adev->dev, "xgmi node, old id %d, new id %d\n", 5404 prev_physical_node_id, adev->gmc.xgmi.physical_node_id); 5405 5406 adev->vm_manager.vram_base_offset = adev->gfxhub.funcs->get_mc_fb_offset(adev); 5407 adev->vm_manager.vram_base_offset += 5408 adev->gmc.xgmi.physical_node_id * adev->gmc.xgmi.node_segment_size; 5409 5410 return 0; 5411 } 5412 5413 /** 5414 * amdgpu_device_resume - initiate device resume 5415 * 5416 * @dev: drm dev pointer 5417 * @notify_clients: notify in-kernel DRM clients 5418 * 5419 * Bring the hw back to operating state (all asics). 5420 * Returns 0 for success or an error on failure. 5421 * Called at driver resume. 
5422 */ 5423 int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) 5424 { 5425 struct amdgpu_device *adev = drm_to_adev(dev); 5426 int r = 0; 5427 5428 if (amdgpu_sriov_vf(adev)) { 5429 r = amdgpu_virt_request_full_gpu(adev, true); 5430 if (r) 5431 return r; 5432 } 5433 5434 if (amdgpu_virt_xgmi_migrate_enabled(adev)) { 5435 r = amdgpu_virt_resume(adev); 5436 if (r) 5437 goto exit; 5438 } 5439 5440 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5441 return 0; 5442 5443 if (adev->in_s0ix) 5444 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 5445 5446 /* post card */ 5447 if (amdgpu_device_need_post(adev)) { 5448 r = amdgpu_device_asic_init(adev); 5449 if (r) 5450 dev_err(adev->dev, "amdgpu asic init failed\n"); 5451 } 5452 5453 r = amdgpu_device_ip_resume(adev); 5454 5455 if (r) { 5456 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 5457 goto exit; 5458 } 5459 5460 r = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); 5461 if (r) 5462 goto exit; 5463 5464 r = amdgpu_userq_resume(adev); 5465 if (r) 5466 goto exit; 5467 5468 r = amdgpu_device_ip_late_init(adev); 5469 if (r) 5470 goto exit; 5471 5472 queue_delayed_work(system_wq, &adev->delayed_init_work, 5473 msecs_to_jiffies(AMDGPU_RESUME_MS)); 5474 exit: 5475 if (amdgpu_sriov_vf(adev)) { 5476 amdgpu_virt_init_data_exchange(adev); 5477 amdgpu_virt_release_full_gpu(adev, true); 5478 5479 if (!r && !adev->in_runpm) 5480 r = amdgpu_amdkfd_resume_process(adev); 5481 } 5482 5483 if (r) 5484 return r; 5485 5486 /* Make sure IB tests flushed */ 5487 flush_delayed_work(&adev->delayed_init_work); 5488 5489 if (notify_clients) 5490 drm_client_dev_resume(adev_to_drm(adev)); 5491 5492 amdgpu_ras_resume(adev); 5493 5494 if (adev->mode_info.num_crtc) { 5495 /* 5496 * Most of the connector probing functions try to acquire runtime pm 5497 * refs to ensure that the GPU is powered on when connector polling is 5498 * performed. Since we're calling this from a runtime PM callback, 5499 * trying to acquire rpm refs will cause us to deadlock. 5500 * 5501 * Since we're guaranteed to be holding the rpm lock, it's safe to 5502 * temporarily disable the rpm helpers so this doesn't deadlock us. 5503 */ 5504 #ifdef CONFIG_PM 5505 dev->dev->power.disable_depth++; 5506 #endif 5507 if (!adev->dc_enabled) 5508 drm_helper_hpd_irq_event(dev); 5509 else 5510 drm_kms_helper_hotplug_event(dev); 5511 #ifdef CONFIG_PM 5512 dev->dev->power.disable_depth--; 5513 #endif 5514 } 5515 5516 amdgpu_vram_mgr_clear_reset_blocks(adev); 5517 adev->in_suspend = false; 5518 5519 if (amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0)) 5520 dev_warn(adev->dev, "smart shift update failed\n"); 5521 5522 return 0; 5523 } 5524 5525 /** 5526 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 5527 * 5528 * @adev: amdgpu_device pointer 5529 * 5530 * The list of all the hardware IPs that make up the asic is walked and 5531 * the check_soft_reset callbacks are run. check_soft_reset determines 5532 * if the asic is still hung or not. 5533 * Returns true if any of the IPs are still in a hung state, false if not. 
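 * SR-IOV VFs and ASICs that require a full reset always report as hung here,
 * without consulting the per-IP check_soft_reset callbacks.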
5534 */ 5535 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 5536 { 5537 int i; 5538 bool asic_hang = false; 5539 5540 if (amdgpu_sriov_vf(adev)) 5541 return true; 5542 5543 if (amdgpu_asic_need_full_reset(adev)) 5544 return true; 5545 5546 for (i = 0; i < adev->num_ip_blocks; i++) { 5547 if (!adev->ip_blocks[i].status.valid) 5548 continue; 5549 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 5550 adev->ip_blocks[i].status.hang = 5551 adev->ip_blocks[i].version->funcs->check_soft_reset( 5552 &adev->ip_blocks[i]); 5553 if (adev->ip_blocks[i].status.hang) { 5554 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 5555 asic_hang = true; 5556 } 5557 } 5558 return asic_hang; 5559 } 5560 5561 /** 5562 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 5563 * 5564 * @adev: amdgpu_device pointer 5565 * 5566 * The list of all the hardware IPs that make up the asic is walked and the 5567 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 5568 * handles any IP specific hardware or software state changes that are 5569 * necessary for a soft reset to succeed. 5570 * Returns 0 on success, negative error code on failure. 5571 */ 5572 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 5573 { 5574 int i, r = 0; 5575 5576 for (i = 0; i < adev->num_ip_blocks; i++) { 5577 if (!adev->ip_blocks[i].status.valid) 5578 continue; 5579 if (adev->ip_blocks[i].status.hang && 5580 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 5581 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]); 5582 if (r) 5583 return r; 5584 } 5585 } 5586 5587 return 0; 5588 } 5589 5590 /** 5591 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 5592 * 5593 * @adev: amdgpu_device pointer 5594 * 5595 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 5596 * reset is necessary to recover. 5597 * Returns true if a full asic reset is required, false if not. 5598 */ 5599 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 5600 { 5601 int i; 5602 5603 if (amdgpu_asic_need_full_reset(adev)) 5604 return true; 5605 5606 for (i = 0; i < adev->num_ip_blocks; i++) { 5607 if (!adev->ip_blocks[i].status.valid) 5608 continue; 5609 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 5610 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 5611 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 5612 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 5613 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 5614 if (adev->ip_blocks[i].status.hang) { 5615 dev_info(adev->dev, "Some block need full reset!\n"); 5616 return true; 5617 } 5618 } 5619 } 5620 return false; 5621 } 5622 5623 /** 5624 * amdgpu_device_ip_soft_reset - do a soft reset 5625 * 5626 * @adev: amdgpu_device pointer 5627 * 5628 * The list of all the hardware IPs that make up the asic is walked and the 5629 * soft_reset callbacks are run if the block is hung. soft_reset handles any 5630 * IP specific hardware or software state changes that are necessary to soft 5631 * reset the IP. 5632 * Returns 0 on success, negative error code on failure. 
5633 */ 5634 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 5635 { 5636 int i, r = 0; 5637 5638 for (i = 0; i < adev->num_ip_blocks; i++) { 5639 if (!adev->ip_blocks[i].status.valid) 5640 continue; 5641 if (adev->ip_blocks[i].status.hang && 5642 adev->ip_blocks[i].version->funcs->soft_reset) { 5643 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]); 5644 if (r) 5645 return r; 5646 } 5647 } 5648 5649 return 0; 5650 } 5651 5652 /** 5653 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 5654 * 5655 * @adev: amdgpu_device pointer 5656 * 5657 * The list of all the hardware IPs that make up the asic is walked and the 5658 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 5659 * handles any IP specific hardware or software state changes that are 5660 * necessary after the IP has been soft reset. 5661 * Returns 0 on success, negative error code on failure. 5662 */ 5663 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 5664 { 5665 int i, r = 0; 5666 5667 for (i = 0; i < adev->num_ip_blocks; i++) { 5668 if (!adev->ip_blocks[i].status.valid) 5669 continue; 5670 if (adev->ip_blocks[i].status.hang && 5671 adev->ip_blocks[i].version->funcs->post_soft_reset) 5672 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]); 5673 if (r) 5674 return r; 5675 } 5676 5677 return 0; 5678 } 5679 5680 /** 5681 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 5682 * 5683 * @adev: amdgpu_device pointer 5684 * @reset_context: amdgpu reset context pointer 5685 * 5686 * do VF FLR and reinitialize Asic 5687 * return 0 means succeeded otherwise failed 5688 */ 5689 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 5690 struct amdgpu_reset_context *reset_context) 5691 { 5692 int r; 5693 struct amdgpu_hive_info *hive = NULL; 5694 5695 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { 5696 if (!amdgpu_ras_get_fed_status(adev)) 5697 amdgpu_virt_ready_to_reset(adev); 5698 amdgpu_virt_wait_reset(adev); 5699 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5700 r = amdgpu_virt_request_full_gpu(adev, true); 5701 } else { 5702 r = amdgpu_virt_reset_gpu(adev); 5703 } 5704 if (r) 5705 return r; 5706 5707 amdgpu_ras_clear_err_state(adev); 5708 amdgpu_irq_gpu_reset_resume_helper(adev); 5709 5710 /* some sw clean up VF needs to do before recover */ 5711 amdgpu_virt_post_reset(adev); 5712 5713 /* Resume IP prior to SMC */ 5714 r = amdgpu_device_ip_reinit_early_sriov(adev); 5715 if (r) 5716 return r; 5717 5718 amdgpu_virt_init_data_exchange(adev); 5719 5720 r = amdgpu_device_fw_loading(adev); 5721 if (r) 5722 return r; 5723 5724 /* now we are okay to resume SMC/CP/SDMA */ 5725 r = amdgpu_device_ip_reinit_late_sriov(adev); 5726 if (r) 5727 return r; 5728 5729 hive = amdgpu_get_xgmi_hive(adev); 5730 /* Update PSP FW topology after reset */ 5731 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5732 r = amdgpu_xgmi_update_topology(hive, adev); 5733 if (hive) 5734 amdgpu_put_xgmi_hive(hive); 5735 if (r) 5736 return r; 5737 5738 r = amdgpu_ib_ring_tests(adev); 5739 if (r) 5740 return r; 5741 5742 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) 5743 amdgpu_inc_vram_lost(adev); 5744 5745 /* need to be called during full access so we can't do it later like 5746 * bare-metal does. 
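	 * On bare metal, amdgpu_amdkfd_post_reset() is instead called later
	 * from amdgpu_device_gpu_resume(), after the schedulers have been
	 * restarted.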
5747 */ 5748 amdgpu_amdkfd_post_reset(adev); 5749 amdgpu_virt_release_full_gpu(adev, true); 5750 5751 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5752 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) || 5753 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 5754 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 5755 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) || 5756 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5757 amdgpu_ras_resume(adev); 5758 5759 amdgpu_virt_ras_telemetry_post_reset(adev); 5760 5761 return 0; 5762 } 5763 5764 /** 5765 * amdgpu_device_has_job_running - check if there is any unfinished job 5766 * 5767 * @adev: amdgpu_device pointer 5768 * 5769 * check if there is any job running on the device when guest driver receives 5770 * FLR notification from host driver. If there are still jobs running, then 5771 * the guest driver will not respond the FLR reset. Instead, let the job hit 5772 * the timeout and guest driver then issue the reset request. 5773 */ 5774 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5775 { 5776 int i; 5777 5778 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5779 struct amdgpu_ring *ring = adev->rings[i]; 5780 5781 if (!amdgpu_ring_sched_ready(ring)) 5782 continue; 5783 5784 if (amdgpu_fence_count_emitted(ring)) 5785 return true; 5786 } 5787 return false; 5788 } 5789 5790 /** 5791 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5792 * 5793 * @adev: amdgpu_device pointer 5794 * 5795 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5796 * a hung GPU. 5797 */ 5798 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5799 { 5800 5801 if (amdgpu_gpu_recovery == 0) 5802 goto disabled; 5803 5804 /* Skip soft reset check in fatal error mode */ 5805 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5806 return true; 5807 5808 if (amdgpu_sriov_vf(adev)) 5809 return true; 5810 5811 if (amdgpu_gpu_recovery == -1) { 5812 switch (adev->asic_type) { 5813 #ifdef CONFIG_DRM_AMDGPU_SI 5814 case CHIP_VERDE: 5815 case CHIP_TAHITI: 5816 case CHIP_PITCAIRN: 5817 case CHIP_OLAND: 5818 case CHIP_HAINAN: 5819 #endif 5820 #ifdef CONFIG_DRM_AMDGPU_CIK 5821 case CHIP_KAVERI: 5822 case CHIP_KABINI: 5823 case CHIP_MULLINS: 5824 #endif 5825 case CHIP_CARRIZO: 5826 case CHIP_STONEY: 5827 case CHIP_CYAN_SKILLFISH: 5828 goto disabled; 5829 default: 5830 break; 5831 } 5832 } 5833 5834 return true; 5835 5836 disabled: 5837 dev_info(adev->dev, "GPU recovery disabled.\n"); 5838 return false; 5839 } 5840 5841 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5842 { 5843 u32 i; 5844 int ret = 0; 5845 5846 if (adev->bios) 5847 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5848 5849 dev_info(adev->dev, "GPU mode1 reset\n"); 5850 5851 /* Cache the state before bus master disable. The saved config space 5852 * values are used in other cases like restore after mode-2 reset. 
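	 * The state cached here is restored by amdgpu_device_load_pci_state()
	 * once the reset below completes.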
5853 */ 5854 amdgpu_device_cache_pci_state(adev->pdev); 5855 5856 /* disable BM */ 5857 pci_clear_master(adev->pdev); 5858 5859 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5860 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5861 ret = amdgpu_dpm_mode1_reset(adev); 5862 } else { 5863 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5864 ret = psp_gpu_reset(adev); 5865 } 5866 5867 if (ret) 5868 goto mode1_reset_failed; 5869 5870 amdgpu_device_load_pci_state(adev->pdev); 5871 ret = amdgpu_psp_wait_for_bootloader(adev); 5872 if (ret) 5873 goto mode1_reset_failed; 5874 5875 /* wait for asic to come out of reset */ 5876 for (i = 0; i < adev->usec_timeout; i++) { 5877 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5878 5879 if (memsize != 0xffffffff) 5880 break; 5881 udelay(1); 5882 } 5883 5884 if (i >= adev->usec_timeout) { 5885 ret = -ETIMEDOUT; 5886 goto mode1_reset_failed; 5887 } 5888 5889 if (adev->bios) 5890 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5891 5892 return 0; 5893 5894 mode1_reset_failed: 5895 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5896 return ret; 5897 } 5898 5899 int amdgpu_device_link_reset(struct amdgpu_device *adev) 5900 { 5901 int ret = 0; 5902 5903 dev_info(adev->dev, "GPU link reset\n"); 5904 5905 if (!amdgpu_reset_in_dpc(adev)) 5906 ret = amdgpu_dpm_link_reset(adev); 5907 5908 if (ret) 5909 goto link_reset_failed; 5910 5911 ret = amdgpu_psp_wait_for_bootloader(adev); 5912 if (ret) 5913 goto link_reset_failed; 5914 5915 return 0; 5916 5917 link_reset_failed: 5918 dev_err(adev->dev, "GPU link reset failed\n"); 5919 return ret; 5920 } 5921 5922 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5923 struct amdgpu_reset_context *reset_context) 5924 { 5925 int i, r = 0; 5926 struct amdgpu_job *job = NULL; 5927 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; 5928 bool need_full_reset = 5929 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5930 5931 if (reset_context->reset_req_dev == adev) 5932 job = reset_context->job; 5933 5934 if (amdgpu_sriov_vf(adev)) 5935 amdgpu_virt_pre_reset(adev); 5936 5937 amdgpu_fence_driver_isr_toggle(adev, true); 5938 5939 /* block all schedulers and reset given job's ring */ 5940 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5941 struct amdgpu_ring *ring = adev->rings[i]; 5942 5943 if (!amdgpu_ring_sched_ready(ring)) 5944 continue; 5945 5946 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5947 amdgpu_fence_driver_force_completion(ring); 5948 } 5949 5950 amdgpu_fence_driver_isr_toggle(adev, false); 5951 5952 if (job && job->vm) 5953 drm_sched_increase_karma(&job->base); 5954 5955 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5956 /* If reset handler not implemented, continue; otherwise return */ 5957 if (r == -EOPNOTSUPP) 5958 r = 0; 5959 else 5960 return r; 5961 5962 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5963 if (!amdgpu_sriov_vf(adev)) { 5964 5965 if (!need_full_reset) 5966 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5967 5968 if (!need_full_reset && amdgpu_gpu_recovery && 5969 amdgpu_device_ip_check_soft_reset(adev)) { 5970 amdgpu_device_ip_pre_soft_reset(adev); 5971 r = amdgpu_device_ip_soft_reset(adev); 5972 amdgpu_device_ip_post_soft_reset(adev); 5973 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5974 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5975 need_full_reset = true; 5976 } 5977 } 5978 5979 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { 
5980 dev_info(tmp_adev->dev, "Dumping IP State\n"); 5981 /* Trigger ip dump before we reset the asic */ 5982 for (i = 0; i < tmp_adev->num_ip_blocks; i++) 5983 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) 5984 tmp_adev->ip_blocks[i].version->funcs 5985 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]); 5986 dev_info(tmp_adev->dev, "Dumping IP State Completed\n"); 5987 } 5988 5989 if (need_full_reset) 5990 r = amdgpu_device_ip_suspend(adev); 5991 if (need_full_reset) 5992 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5993 else 5994 clear_bit(AMDGPU_NEED_FULL_RESET, 5995 &reset_context->flags); 5996 } 5997 5998 return r; 5999 } 6000 6001 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) 6002 { 6003 struct list_head *device_list_handle; 6004 bool full_reset, vram_lost = false; 6005 struct amdgpu_device *tmp_adev; 6006 int r, init_level; 6007 6008 device_list_handle = reset_context->reset_device_list; 6009 6010 if (!device_list_handle) 6011 return -EINVAL; 6012 6013 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 6014 6015 /** 6016 * If it's reset on init, it's default init level, otherwise keep level 6017 * as recovery level. 6018 */ 6019 if (reset_context->method == AMD_RESET_METHOD_ON_INIT) 6020 init_level = AMDGPU_INIT_LEVEL_DEFAULT; 6021 else 6022 init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY; 6023 6024 r = 0; 6025 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6026 amdgpu_set_init_level(tmp_adev, init_level); 6027 if (full_reset) { 6028 /* post card */ 6029 amdgpu_reset_set_dpc_status(tmp_adev, false); 6030 amdgpu_ras_clear_err_state(tmp_adev); 6031 r = amdgpu_device_asic_init(tmp_adev); 6032 if (r) { 6033 dev_warn(tmp_adev->dev, "asic atom init failed!"); 6034 } else { 6035 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 6036 6037 r = amdgpu_device_ip_resume_phase1(tmp_adev); 6038 if (r) 6039 goto out; 6040 6041 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 6042 6043 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 6044 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job); 6045 6046 if (vram_lost) { 6047 dev_info( 6048 tmp_adev->dev, 6049 "VRAM is lost due to GPU reset!\n"); 6050 amdgpu_inc_vram_lost(tmp_adev); 6051 } 6052 6053 r = amdgpu_device_fw_loading(tmp_adev); 6054 if (r) 6055 return r; 6056 6057 r = amdgpu_xcp_restore_partition_mode( 6058 tmp_adev->xcp_mgr); 6059 if (r) 6060 goto out; 6061 6062 r = amdgpu_device_ip_resume_phase2(tmp_adev); 6063 if (r) 6064 goto out; 6065 6066 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 6067 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 6068 6069 r = amdgpu_device_ip_resume_phase3(tmp_adev); 6070 if (r) 6071 goto out; 6072 6073 if (vram_lost) 6074 amdgpu_device_fill_reset_magic(tmp_adev); 6075 6076 /* 6077 * Add this ASIC as tracked as reset was already 6078 * complete successfully. 6079 */ 6080 amdgpu_register_gpu_instance(tmp_adev); 6081 6082 if (!reset_context->hive && 6083 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 6084 amdgpu_xgmi_add_device(tmp_adev); 6085 6086 r = amdgpu_device_ip_late_init(tmp_adev); 6087 if (r) 6088 goto out; 6089 6090 r = amdgpu_userq_post_reset(tmp_adev, vram_lost); 6091 if (r) 6092 goto out; 6093 6094 drm_client_dev_resume(adev_to_drm(tmp_adev)); 6095 6096 /* 6097 * The GPU enters bad state once faulty pages 6098 * by ECC has reached the threshold, and ras 6099 * recovery is scheduled next. 
So add one check 6100 * here to break recovery if it indeed exceeds 6101 * bad page threshold, and remind user to 6102 * retire this GPU or setting one bigger 6103 * bad_page_threshold value to fix this once 6104 * probing driver again. 6105 */ 6106 if (!amdgpu_ras_is_rma(tmp_adev)) { 6107 /* must succeed. */ 6108 amdgpu_ras_resume(tmp_adev); 6109 } else { 6110 r = -EINVAL; 6111 goto out; 6112 } 6113 6114 /* Update PSP FW topology after reset */ 6115 if (reset_context->hive && 6116 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 6117 r = amdgpu_xgmi_update_topology( 6118 reset_context->hive, tmp_adev); 6119 } 6120 } 6121 6122 out: 6123 if (!r) { 6124 /* IP init is complete now, set level as default */ 6125 amdgpu_set_init_level(tmp_adev, 6126 AMDGPU_INIT_LEVEL_DEFAULT); 6127 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 6128 r = amdgpu_ib_ring_tests(tmp_adev); 6129 if (r) { 6130 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 6131 r = -EAGAIN; 6132 goto end; 6133 } 6134 } 6135 6136 if (r) 6137 tmp_adev->asic_reset_res = r; 6138 } 6139 6140 end: 6141 return r; 6142 } 6143 6144 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 6145 struct amdgpu_reset_context *reset_context) 6146 { 6147 struct amdgpu_device *tmp_adev = NULL; 6148 bool need_full_reset, skip_hw_reset; 6149 int r = 0; 6150 6151 /* Try reset handler method first */ 6152 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 6153 reset_list); 6154 6155 reset_context->reset_device_list = device_list_handle; 6156 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 6157 /* If reset handler not implemented, continue; otherwise return */ 6158 if (r == -EOPNOTSUPP) 6159 r = 0; 6160 else 6161 return r; 6162 6163 /* Reset handler not implemented, use the default method */ 6164 need_full_reset = 6165 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 6166 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 6167 6168 /* 6169 * ASIC reset has to be done on all XGMI hive nodes ASAP 6170 * to allow proper links negotiation in FW (within 1 sec) 6171 */ 6172 if (!skip_hw_reset && need_full_reset) { 6173 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6174 /* For XGMI run all resets in parallel to speed up the process */ 6175 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 6176 if (!queue_work(system_unbound_wq, 6177 &tmp_adev->xgmi_reset_work)) 6178 r = -EALREADY; 6179 } else 6180 r = amdgpu_asic_reset(tmp_adev); 6181 6182 if (r) { 6183 dev_err(tmp_adev->dev, 6184 "ASIC reset failed with error, %d for drm dev, %s", 6185 r, adev_to_drm(tmp_adev)->unique); 6186 goto out; 6187 } 6188 } 6189 6190 /* For XGMI wait for all resets to complete before proceed */ 6191 if (!r) { 6192 list_for_each_entry(tmp_adev, device_list_handle, 6193 reset_list) { 6194 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 6195 flush_work(&tmp_adev->xgmi_reset_work); 6196 r = tmp_adev->asic_reset_res; 6197 if (r) 6198 break; 6199 } 6200 } 6201 } 6202 } 6203 6204 if (!r && amdgpu_ras_intr_triggered()) { 6205 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6206 amdgpu_ras_reset_error_count(tmp_adev, 6207 AMDGPU_RAS_BLOCK__MMHUB); 6208 } 6209 6210 amdgpu_ras_intr_cleared(); 6211 } 6212 6213 r = amdgpu_device_reinit_after_reset(reset_context); 6214 if (r == -EAGAIN) 6215 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 6216 else 6217 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 6218 6219 out: 6220 return r; 6221 } 6222 6223 static void 
amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 6224 { 6225 6226 switch (amdgpu_asic_reset_method(adev)) { 6227 case AMD_RESET_METHOD_MODE1: 6228 case AMD_RESET_METHOD_LINK: 6229 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 6230 break; 6231 case AMD_RESET_METHOD_MODE2: 6232 adev->mp1_state = PP_MP1_STATE_RESET; 6233 break; 6234 default: 6235 adev->mp1_state = PP_MP1_STATE_NONE; 6236 break; 6237 } 6238 } 6239 6240 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 6241 { 6242 amdgpu_vf_error_trans_all(adev); 6243 adev->mp1_state = PP_MP1_STATE_NONE; 6244 } 6245 6246 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 6247 { 6248 struct pci_dev *p = NULL; 6249 6250 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 6251 adev->pdev->bus->number, 1); 6252 if (p) { 6253 pm_runtime_enable(&(p->dev)); 6254 pm_runtime_resume(&(p->dev)); 6255 } 6256 6257 pci_dev_put(p); 6258 } 6259 6260 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 6261 { 6262 enum amd_reset_method reset_method; 6263 struct pci_dev *p = NULL; 6264 u64 expires; 6265 6266 /* 6267 * For now, only BACO and mode1 reset are confirmed 6268 * to suffer the audio issue without proper suspended. 6269 */ 6270 reset_method = amdgpu_asic_reset_method(adev); 6271 if ((reset_method != AMD_RESET_METHOD_BACO) && 6272 (reset_method != AMD_RESET_METHOD_MODE1)) 6273 return -EINVAL; 6274 6275 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 6276 adev->pdev->bus->number, 1); 6277 if (!p) 6278 return -ENODEV; 6279 6280 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 6281 if (!expires) 6282 /* 6283 * If we cannot get the audio device autosuspend delay, 6284 * a fixed 4S interval will be used. Considering 3S is 6285 * the audio controller default autosuspend delay setting. 6286 * 4S used here is guaranteed to cover that. 6287 */ 6288 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 6289 6290 while (!pm_runtime_status_suspended(&(p->dev))) { 6291 if (!pm_runtime_suspend(&(p->dev))) 6292 break; 6293 6294 if (expires < ktime_get_mono_fast_ns()) { 6295 dev_warn(adev->dev, "failed to suspend display audio\n"); 6296 pci_dev_put(p); 6297 /* TODO: abort the succeeding gpu reset? */ 6298 return -ETIMEDOUT; 6299 } 6300 } 6301 6302 pm_runtime_disable(&(p->dev)); 6303 6304 pci_dev_put(p); 6305 return 0; 6306 } 6307 6308 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 6309 { 6310 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 6311 6312 #if defined(CONFIG_DEBUG_FS) 6313 if (!amdgpu_sriov_vf(adev)) 6314 cancel_work(&adev->reset_work); 6315 #endif 6316 cancel_work(&adev->userq_reset_work); 6317 6318 if (adev->kfd.dev) 6319 cancel_work(&adev->kfd.reset_work); 6320 6321 if (amdgpu_sriov_vf(adev)) 6322 cancel_work(&adev->virt.flr_work); 6323 6324 if (con && adev->ras_enabled) 6325 cancel_work(&con->recovery_work); 6326 6327 } 6328 6329 static int amdgpu_device_health_check(struct list_head *device_list_handle) 6330 { 6331 struct amdgpu_device *tmp_adev; 6332 int ret = 0; 6333 6334 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6335 ret |= amdgpu_device_bus_status_check(tmp_adev); 6336 } 6337 6338 return ret; 6339 } 6340 6341 static void amdgpu_device_recovery_prepare(struct amdgpu_device *adev, 6342 struct list_head *device_list, 6343 struct amdgpu_hive_info *hive) 6344 { 6345 struct amdgpu_device *tmp_adev = NULL; 6346 6347 /* 6348 * Build list of devices to reset. 
	 * In case we are in XGMI hive mode, reorder the device list
	 * to put adev in the first position.
	 */
	if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			list_add_tail(&tmp_adev->reset_list, device_list);
			if (adev->shutdown)
				tmp_adev->shutdown = true;
			if (amdgpu_reset_in_dpc(adev))
				tmp_adev->pcie_reset_ctx.in_link_reset = true;
		}
		if (!list_is_first(&adev->reset_list, device_list))
			list_rotate_to_front(&adev->reset_list, device_list);
	} else {
		list_add_tail(&adev->reset_list, device_list);
	}
}

static void amdgpu_device_recovery_get_reset_lock(struct amdgpu_device *adev,
						  struct list_head *device_list)
{
	struct amdgpu_device *tmp_adev = NULL;

	if (list_empty(device_list))
		return;
	tmp_adev =
		list_first_entry(device_list, struct amdgpu_device, reset_list);
	amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
}

static void amdgpu_device_recovery_put_reset_lock(struct amdgpu_device *adev,
						  struct list_head *device_list)
{
	struct amdgpu_device *tmp_adev = NULL;

	if (list_empty(device_list))
		return;
	tmp_adev =
		list_first_entry(device_list, struct amdgpu_device, reset_list);
	amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
}

static void amdgpu_device_halt_activities(struct amdgpu_device *adev,
					  struct amdgpu_job *job,
					  struct amdgpu_reset_context *reset_context,
					  struct list_head *device_list,
					  struct amdgpu_hive_info *hive,
					  bool need_emergency_restart)
{
	struct amdgpu_device *tmp_adev = NULL;
	int i;

	/* block all schedulers and reset given job's ring */
	list_for_each_entry(tmp_adev, device_list, reset_list) {
		amdgpu_device_set_mp1_state(tmp_adev);

		/*
		 * Try to put the audio codec into a suspended state
		 * before the GPU reset starts.
		 *
		 * Because the graphics device shares its power domain
		 * with the AZ (audio) power domain, skipping this step
		 * could change the audio hardware behind the audio
		 * driver's back and trigger audio codec errors.
		 */
		if (!amdgpu_device_suspend_display_audio(tmp_adev))
			tmp_adev->pcie_reset_ctx.audio_suspended = true;

		amdgpu_ras_set_error_query_ready(tmp_adev, false);

		cancel_delayed_work_sync(&tmp_adev->delayed_init_work);

		amdgpu_amdkfd_pre_reset(tmp_adev, reset_context);

		/*
		 * Mark the ASICs to be reset as untracked first,
		 * and add them back after the reset completes.
		 */
		amdgpu_unregister_gpu_instance(tmp_adev);

		drm_client_dev_suspend(adev_to_drm(tmp_adev));

		/* disable ras on ALL IPs */
		if (!need_emergency_restart && !amdgpu_reset_in_dpc(adev) &&
		    amdgpu_device_ip_need_full_reset(tmp_adev))
			amdgpu_ras_suspend(tmp_adev);

		amdgpu_userq_pre_reset(tmp_adev);

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!amdgpu_ring_sched_ready(ring))
				continue;

			drm_sched_stop(&ring->sched, job ?
&job->base : NULL); 6446 6447 if (need_emergency_restart) 6448 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 6449 } 6450 atomic_inc(&tmp_adev->gpu_reset_counter); 6451 } 6452 } 6453 6454 static int amdgpu_device_asic_reset(struct amdgpu_device *adev, 6455 struct list_head *device_list, 6456 struct amdgpu_reset_context *reset_context) 6457 { 6458 struct amdgpu_device *tmp_adev = NULL; 6459 int retry_limit = AMDGPU_MAX_RETRY_LIMIT; 6460 int r = 0; 6461 6462 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 6463 list_for_each_entry(tmp_adev, device_list, reset_list) { 6464 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 6465 /*TODO Should we stop ?*/ 6466 if (r) { 6467 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 6468 r, adev_to_drm(tmp_adev)->unique); 6469 tmp_adev->asic_reset_res = r; 6470 } 6471 } 6472 6473 /* Actual ASIC resets if needed.*/ 6474 /* Host driver will handle XGMI hive reset for SRIOV */ 6475 if (amdgpu_sriov_vf(adev)) { 6476 6477 /* Bail out of reset early */ 6478 if (amdgpu_ras_is_rma(adev)) 6479 return -ENODEV; 6480 6481 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { 6482 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); 6483 amdgpu_ras_set_fed(adev, true); 6484 set_bit(AMDGPU_HOST_FLR, &reset_context->flags); 6485 } 6486 6487 r = amdgpu_device_reset_sriov(adev, reset_context); 6488 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) { 6489 amdgpu_virt_release_full_gpu(adev, true); 6490 goto retry; 6491 } 6492 if (r) 6493 adev->asic_reset_res = r; 6494 } else { 6495 r = amdgpu_do_asic_reset(device_list, reset_context); 6496 if (r && r == -EAGAIN) 6497 goto retry; 6498 } 6499 6500 list_for_each_entry(tmp_adev, device_list, reset_list) { 6501 /* 6502 * Drop any pending non scheduler resets queued before reset is done. 6503 * Any reset scheduled after this point would be valid. Scheduler resets 6504 * were already dropped during drm_sched_stop and no new ones can come 6505 * in before drm_sched_start. 6506 */ 6507 amdgpu_device_stop_pending_resets(tmp_adev); 6508 } 6509 6510 return r; 6511 } 6512 6513 static int amdgpu_device_sched_resume(struct list_head *device_list, 6514 struct amdgpu_reset_context *reset_context, 6515 bool job_signaled) 6516 { 6517 struct amdgpu_device *tmp_adev = NULL; 6518 int i, r = 0; 6519 6520 /* Post ASIC reset for all devs .*/ 6521 list_for_each_entry(tmp_adev, device_list, reset_list) { 6522 6523 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6524 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6525 6526 if (!amdgpu_ring_sched_ready(ring)) 6527 continue; 6528 6529 drm_sched_start(&ring->sched, 0); 6530 } 6531 6532 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 6533 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 6534 6535 if (tmp_adev->asic_reset_res) { 6536 /* bad news, how to tell it to userspace ? 
6537 * for ras error, we should report GPU bad status instead of 6538 * reset failure 6539 */ 6540 if (reset_context->src != AMDGPU_RESET_SRC_RAS || 6541 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) 6542 dev_info( 6543 tmp_adev->dev, 6544 "GPU reset(%d) failed with error %d \n", 6545 atomic_read( 6546 &tmp_adev->gpu_reset_counter), 6547 tmp_adev->asic_reset_res); 6548 amdgpu_vf_error_put(tmp_adev, 6549 AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, 6550 tmp_adev->asic_reset_res); 6551 if (!r) 6552 r = tmp_adev->asic_reset_res; 6553 tmp_adev->asic_reset_res = 0; 6554 } else { 6555 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", 6556 atomic_read(&tmp_adev->gpu_reset_counter)); 6557 if (amdgpu_acpi_smart_shift_update(tmp_adev, 6558 AMDGPU_SS_DEV_D0)) 6559 dev_warn(tmp_adev->dev, 6560 "smart shift update failed\n"); 6561 } 6562 } 6563 6564 return r; 6565 } 6566 6567 static void amdgpu_device_gpu_resume(struct amdgpu_device *adev, 6568 struct list_head *device_list, 6569 bool need_emergency_restart) 6570 { 6571 struct amdgpu_device *tmp_adev = NULL; 6572 6573 list_for_each_entry(tmp_adev, device_list, reset_list) { 6574 /* unlock kfd: SRIOV would do it separately */ 6575 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 6576 amdgpu_amdkfd_post_reset(tmp_adev); 6577 6578 /* kfd_post_reset will do nothing if kfd device is not initialized, 6579 * need to bring up kfd here if it's not be initialized before 6580 */ 6581 if (!adev->kfd.init_complete) 6582 amdgpu_amdkfd_device_init(adev); 6583 6584 if (tmp_adev->pcie_reset_ctx.audio_suspended) 6585 amdgpu_device_resume_display_audio(tmp_adev); 6586 6587 amdgpu_device_unset_mp1_state(tmp_adev); 6588 6589 amdgpu_ras_set_error_query_ready(tmp_adev, true); 6590 6591 } 6592 } 6593 6594 6595 /** 6596 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 6597 * 6598 * @adev: amdgpu_device pointer 6599 * @job: which job trigger hang 6600 * @reset_context: amdgpu reset context pointer 6601 * 6602 * Attempt to reset the GPU if it has hung (all asics). 6603 * Attempt to do soft-reset or full-reset and reinitialize Asic 6604 * Returns 0 for success or an error on failure. 6605 */ 6606 6607 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 6608 struct amdgpu_job *job, 6609 struct amdgpu_reset_context *reset_context) 6610 { 6611 struct list_head device_list; 6612 bool job_signaled = false; 6613 struct amdgpu_hive_info *hive = NULL; 6614 int r = 0; 6615 bool need_emergency_restart = false; 6616 6617 /* 6618 * If it reaches here because of hang/timeout and a RAS error is 6619 * detected at the same time, let RAS recovery take care of it. 6620 */ 6621 if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) && 6622 !amdgpu_sriov_vf(adev) && 6623 reset_context->src != AMDGPU_RESET_SRC_RAS) { 6624 dev_dbg(adev->dev, 6625 "Gpu recovery from source: %d yielding to RAS error recovery handling", 6626 reset_context->src); 6627 return 0; 6628 } 6629 6630 /* 6631 * Special case: RAS triggered and full reset isn't supported 6632 */ 6633 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 6634 6635 /* 6636 * Flush RAM to disk so that after reboot 6637 * the user can read log and see why the system rebooted. 6638 */ 6639 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 6640 amdgpu_ras_get_context(adev)->reboot) { 6641 dev_warn(adev->dev, "Emergency reboot."); 6642 6643 ksys_sync_helper(); 6644 emergency_restart(); 6645 } 6646 6647 dev_info(adev->dev, "GPU %s begin!. Source: %d\n", 6648 need_emergency_restart ? 
"jobs stop" : "reset", 6649 reset_context->src); 6650 6651 if (!amdgpu_sriov_vf(adev)) 6652 hive = amdgpu_get_xgmi_hive(adev); 6653 if (hive) 6654 mutex_lock(&hive->hive_lock); 6655 6656 reset_context->job = job; 6657 reset_context->hive = hive; 6658 INIT_LIST_HEAD(&device_list); 6659 6660 amdgpu_device_recovery_prepare(adev, &device_list, hive); 6661 6662 if (!amdgpu_sriov_vf(adev)) { 6663 r = amdgpu_device_health_check(&device_list); 6664 if (r) 6665 goto end_reset; 6666 } 6667 6668 /* Cannot be called after locking reset domain */ 6669 amdgpu_ras_pre_reset(adev, &device_list); 6670 6671 /* We need to lock reset domain only once both for XGMI and single device */ 6672 amdgpu_device_recovery_get_reset_lock(adev, &device_list); 6673 6674 amdgpu_device_halt_activities(adev, job, reset_context, &device_list, 6675 hive, need_emergency_restart); 6676 if (need_emergency_restart) 6677 goto skip_sched_resume; 6678 /* 6679 * Must check guilty signal here since after this point all old 6680 * HW fences are force signaled. 6681 * 6682 * job->base holds a reference to parent fence 6683 */ 6684 if (job && dma_fence_is_signaled(&job->hw_fence->base)) { 6685 job_signaled = true; 6686 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 6687 goto skip_hw_reset; 6688 } 6689 6690 r = amdgpu_device_asic_reset(adev, &device_list, reset_context); 6691 if (r) 6692 goto reset_unlock; 6693 skip_hw_reset: 6694 r = amdgpu_device_sched_resume(&device_list, reset_context, job_signaled); 6695 if (r) 6696 goto reset_unlock; 6697 skip_sched_resume: 6698 amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart); 6699 reset_unlock: 6700 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 6701 amdgpu_ras_post_reset(adev, &device_list); 6702 end_reset: 6703 if (hive) { 6704 mutex_unlock(&hive->hive_lock); 6705 amdgpu_put_xgmi_hive(hive); 6706 } 6707 6708 if (r) 6709 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 6710 6711 atomic_set(&adev->reset_domain->reset_res, r); 6712 6713 if (!r) { 6714 struct amdgpu_task_info *ti = NULL; 6715 6716 if (job) 6717 ti = amdgpu_vm_get_task_info_pasid(adev, job->pasid); 6718 6719 drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, 6720 ti ? &ti->task : NULL); 6721 6722 amdgpu_vm_put_task_info(ti); 6723 } 6724 6725 return r; 6726 } 6727 6728 /** 6729 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner 6730 * 6731 * @adev: amdgpu_device pointer 6732 * @speed: pointer to the speed of the link 6733 * @width: pointer to the width of the link 6734 * 6735 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6736 * first physical partner to an AMD dGPU. 6737 * This will exclude any virtual switches and links. 
 */
static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
					    enum pci_bus_speed *speed,
					    enum pcie_link_width *width)
{
	struct pci_dev *parent = adev->pdev;

	if (!speed || !width)
		return;

	*speed = PCI_SPEED_UNKNOWN;
	*width = PCIE_LNK_WIDTH_UNKNOWN;

	if (amdgpu_device_pcie_dynamic_switching_supported(adev)) {
		while ((parent = pci_upstream_bridge(parent))) {
			/* skip upstream/downstream switches internal to the dGPU */
			if (parent->vendor == PCI_VENDOR_ID_ATI)
				continue;
			*speed = pcie_get_speed_cap(parent);
			*width = pcie_get_width_cap(parent);
			break;
		}
	} else {
		/* use the current speeds rather than max if switching is not supported */
		pcie_bandwidth_available(adev->pdev, NULL, speed, width);
	}
}

/**
 * amdgpu_device_gpu_bandwidth - find the bandwidth of the GPU
 *
 * @adev: amdgpu_device pointer
 * @speed: pointer to the speed of the link
 * @width: pointer to the width of the link
 *
 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
 * AMD dGPU which may be a virtual upstream bridge.
 */
static void amdgpu_device_gpu_bandwidth(struct amdgpu_device *adev,
					enum pci_bus_speed *speed,
					enum pcie_link_width *width)
{
	struct pci_dev *parent = adev->pdev;

	if (!speed || !width)
		return;

	parent = pci_upstream_bridge(parent);
	if (parent && parent->vendor == PCI_VENDOR_ID_ATI) {
		/* use the upstream/downstream switches internal to the dGPU */
		*speed = pcie_get_speed_cap(parent);
		*width = pcie_get_width_cap(parent);
		while ((parent = pci_upstream_bridge(parent))) {
			if (parent->vendor == PCI_VENDOR_ID_ATI) {
				/* use the upstream/downstream switches internal to the dGPU */
				*speed = pcie_get_speed_cap(parent);
				*width = pcie_get_width_cap(parent);
			}
		}
	} else {
		/* use the device itself */
		*speed = pcie_get_speed_cap(adev->pdev);
		*width = pcie_get_width_cap(adev->pdev);
	}
}

/**
 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIE capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIE config space may not be available.
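 * Note: the computed masks can be overridden via the amdgpu_pcie_gen_cap and
 * amdgpu_pcie_lane_cap overrides, and devices on a root bus (e.g. APUs)
 * without passthrough fall back to the driver default masks.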
6812 */ 6813 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 6814 { 6815 enum pci_bus_speed speed_cap, platform_speed_cap; 6816 enum pcie_link_width platform_link_width, link_width; 6817 6818 if (amdgpu_pcie_gen_cap) 6819 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 6820 6821 if (amdgpu_pcie_lane_cap) 6822 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 6823 6824 /* covers APUs as well */ 6825 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 6826 if (adev->pm.pcie_gen_mask == 0) 6827 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 6828 if (adev->pm.pcie_mlw_mask == 0) 6829 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 6830 return; 6831 } 6832 6833 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 6834 return; 6835 6836 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap, 6837 &platform_link_width); 6838 amdgpu_device_gpu_bandwidth(adev, &speed_cap, &link_width); 6839 6840 if (adev->pm.pcie_gen_mask == 0) { 6841 /* asic caps */ 6842 if (speed_cap == PCI_SPEED_UNKNOWN) { 6843 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6844 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6845 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6846 } else { 6847 if (speed_cap == PCIE_SPEED_32_0GT) 6848 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6849 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6850 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6851 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6852 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 6853 else if (speed_cap == PCIE_SPEED_16_0GT) 6854 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6855 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6856 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6857 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 6858 else if (speed_cap == PCIE_SPEED_8_0GT) 6859 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6860 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6861 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6862 else if (speed_cap == PCIE_SPEED_5_0GT) 6863 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6864 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 6865 else 6866 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 6867 } 6868 /* platform caps */ 6869 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 6870 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6871 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6872 } else { 6873 if (platform_speed_cap == PCIE_SPEED_32_0GT) 6874 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6875 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6876 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6877 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6878 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 6879 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 6880 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6881 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6882 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6883 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 6884 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 6885 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6886 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6887 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 6888 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 6889 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6890 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6891 else 6892 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 6893 6894 } 6895 } 6896 if (adev->pm.pcie_mlw_mask == 0) { 6897 /* asic caps */ 6898 if (link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6899 
adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_ASIC_PCIE_MLW_MASK; 6900 } else { 6901 switch (link_width) { 6902 case PCIE_LNK_X32: 6903 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X32 | 6904 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6905 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6906 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6907 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6908 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6909 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6910 break; 6911 case PCIE_LNK_X16: 6912 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6913 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6914 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6915 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6916 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6917 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6918 break; 6919 case PCIE_LNK_X12: 6920 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6921 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6922 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6923 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6924 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6925 break; 6926 case PCIE_LNK_X8: 6927 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6928 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6929 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6930 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6931 break; 6932 case PCIE_LNK_X4: 6933 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6934 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6935 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6936 break; 6937 case PCIE_LNK_X2: 6938 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6939 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6940 break; 6941 case PCIE_LNK_X1: 6942 adev->pm.pcie_mlw_mask |= CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1; 6943 break; 6944 default: 6945 break; 6946 } 6947 } 6948 /* platform caps */ 6949 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6950 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 6951 } else { 6952 switch (platform_link_width) { 6953 case PCIE_LNK_X32: 6954 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 6955 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6956 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6957 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6958 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6959 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6960 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6961 break; 6962 case PCIE_LNK_X16: 6963 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6964 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6965 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6966 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6967 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6968 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6969 break; 6970 case PCIE_LNK_X12: 6971 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6972 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6973 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6974 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6975 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6976 break; 6977 case PCIE_LNK_X8: 6978 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6979 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6980 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6981 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6982 break; 6983 case PCIE_LNK_X4: 6984 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6985 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6986 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6987 break; 6988 case PCIE_LNK_X2: 6989 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6990 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6991 break; 6992 case PCIE_LNK_X1: 6993 adev->pm.pcie_mlw_mask |= CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 6994 break; 6995 
default: 6996 break; 6997 } 6998 } 6999 } 7000 } 7001 7002 /** 7003 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 7004 * 7005 * @adev: amdgpu_device pointer 7006 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 7007 * 7008 * Return true if @peer_adev can access (DMA) @adev through the PCIe 7009 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 7010 * @peer_adev. 7011 */ 7012 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 7013 struct amdgpu_device *peer_adev) 7014 { 7015 #ifdef CONFIG_HSA_AMD_P2P 7016 bool p2p_access = 7017 !adev->gmc.xgmi.connected_to_cpu && 7018 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 7019 if (!p2p_access) 7020 dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n", 7021 pci_name(peer_adev->pdev)); 7022 7023 bool is_large_bar = adev->gmc.visible_vram_size && 7024 adev->gmc.real_vram_size == adev->gmc.visible_vram_size; 7025 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev); 7026 7027 if (!p2p_addressable) { 7028 uint64_t address_mask = peer_adev->dev->dma_mask ? 7029 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 7030 resource_size_t aper_limit = 7031 adev->gmc.aper_base + adev->gmc.aper_size - 1; 7032 7033 p2p_addressable = !(adev->gmc.aper_base & address_mask || 7034 aper_limit & address_mask); 7035 } 7036 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable; 7037 #else 7038 return false; 7039 #endif 7040 } 7041 7042 int amdgpu_device_baco_enter(struct amdgpu_device *adev) 7043 { 7044 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 7045 7046 if (!amdgpu_device_supports_baco(adev)) 7047 return -ENOTSUPP; 7048 7049 if (ras && adev->ras_enabled && 7050 adev->nbio.funcs->enable_doorbell_interrupt) 7051 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 7052 7053 return amdgpu_dpm_baco_enter(adev); 7054 } 7055 7056 int amdgpu_device_baco_exit(struct amdgpu_device *adev) 7057 { 7058 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 7059 int ret = 0; 7060 7061 if (!amdgpu_device_supports_baco(adev)) 7062 return -ENOTSUPP; 7063 7064 ret = amdgpu_dpm_baco_exit(adev); 7065 if (ret) 7066 return ret; 7067 7068 if (ras && adev->ras_enabled && 7069 adev->nbio.funcs->enable_doorbell_interrupt) 7070 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 7071 7072 if (amdgpu_passthrough(adev) && adev->nbio.funcs && 7073 adev->nbio.funcs->clear_doorbell_interrupt) 7074 adev->nbio.funcs->clear_doorbell_interrupt(adev); 7075 7076 return 0; 7077 } 7078 7079 /** 7080 * amdgpu_pci_error_detected - Called when a PCI error is detected. 7081 * @pdev: PCI device struct 7082 * @state: PCI channel state 7083 * 7084 * Description: Called when a PCI error is detected. 7085 * 7086 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
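 * Note: pci_channel_io_normal reports PCI_ERS_RESULT_CAN_RECOVER,
 * pci_channel_io_frozen halts GPU activity and requests a slot reset, and
 * pci_channel_io_perm_failure reports PCI_ERS_RESULT_DISCONNECT.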
7087 */ 7088 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 7089 { 7090 struct drm_device *dev = pci_get_drvdata(pdev); 7091 struct amdgpu_device *adev = drm_to_adev(dev); 7092 struct amdgpu_hive_info *hive __free(xgmi_put_hive) = 7093 amdgpu_get_xgmi_hive(adev); 7094 struct amdgpu_reset_context reset_context; 7095 struct list_head device_list; 7096 7097 dev_info(adev->dev, "PCI error: detected callback!!\n"); 7098 7099 adev->pci_channel_state = state; 7100 7101 switch (state) { 7102 case pci_channel_io_normal: 7103 dev_info(adev->dev, "pci_channel_io_normal: state(%d)!!\n", state); 7104 return PCI_ERS_RESULT_CAN_RECOVER; 7105 case pci_channel_io_frozen: 7106 /* Fatal error, prepare for slot reset */ 7107 dev_info(adev->dev, "pci_channel_io_frozen: state(%d)!!\n", state); 7108 if (hive) { 7109 /* Hive devices should be able to support FW based 7110 * link reset on other devices, if not return. 7111 */ 7112 if (!amdgpu_dpm_is_link_reset_supported(adev)) { 7113 dev_warn(adev->dev, 7114 "No support for XGMI hive yet...\n"); 7115 return PCI_ERS_RESULT_DISCONNECT; 7116 } 7117 /* Set dpc status only if device is part of hive 7118 * Non-hive devices should be able to recover after 7119 * link reset. 7120 */ 7121 amdgpu_reset_set_dpc_status(adev, true); 7122 7123 mutex_lock(&hive->hive_lock); 7124 } 7125 memset(&reset_context, 0, sizeof(reset_context)); 7126 INIT_LIST_HEAD(&device_list); 7127 7128 amdgpu_device_recovery_prepare(adev, &device_list, hive); 7129 amdgpu_device_recovery_get_reset_lock(adev, &device_list); 7130 amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list, 7131 hive, false); 7132 if (hive) 7133 mutex_unlock(&hive->hive_lock); 7134 return PCI_ERS_RESULT_NEED_RESET; 7135 case pci_channel_io_perm_failure: 7136 /* Permanent error, prepare for device removal */ 7137 dev_info(adev->dev, "pci_channel_io_perm_failure: state(%d)!!\n", state); 7138 return PCI_ERS_RESULT_DISCONNECT; 7139 } 7140 7141 return PCI_ERS_RESULT_NEED_RESET; 7142 } 7143 7144 /** 7145 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 7146 * @pdev: pointer to PCI device 7147 */ 7148 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 7149 { 7150 struct drm_device *dev = pci_get_drvdata(pdev); 7151 struct amdgpu_device *adev = drm_to_adev(dev); 7152 7153 dev_info(adev->dev, "PCI error: mmio enabled callback!!\n"); 7154 7155 /* TODO - dump whatever for debugging purposes */ 7156 7157 /* This called only if amdgpu_pci_error_detected returns 7158 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 7159 * works, no need to reset slot. 7160 */ 7161 7162 return PCI_ERS_RESULT_RECOVERED; 7163 } 7164 7165 /** 7166 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 7167 * @pdev: PCI device struct 7168 * 7169 * Description: This routine is called by the pci error recovery 7170 * code after the PCI slot has been reset, just before we 7171 * should resume normal operations. 
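 * Note: waits up to 10 seconds for the ASIC (or its upstream switch) to
 * reappear on the bus, restores the cached PCI and bridge config space, and
 * then re-initializes the ASIC (for every device in the hive when applicable).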
7172 */ 7173 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 7174 { 7175 struct drm_device *dev = pci_get_drvdata(pdev); 7176 struct amdgpu_device *adev = drm_to_adev(dev); 7177 struct amdgpu_reset_context reset_context; 7178 struct amdgpu_device *tmp_adev; 7179 struct amdgpu_hive_info *hive; 7180 struct list_head device_list; 7181 struct pci_dev *link_dev; 7182 int r = 0, i, timeout; 7183 u32 memsize; 7184 u16 status; 7185 7186 dev_info(adev->dev, "PCI error: slot reset callback!!\n"); 7187 7188 memset(&reset_context, 0, sizeof(reset_context)); 7189 7190 if (adev->pcie_reset_ctx.swus) 7191 link_dev = adev->pcie_reset_ctx.swus; 7192 else 7193 link_dev = adev->pdev; 7194 /* wait for asic to come out of reset, timeout = 10s */ 7195 timeout = 10000; 7196 do { 7197 usleep_range(10000, 10500); 7198 r = pci_read_config_word(link_dev, PCI_VENDOR_ID, &status); 7199 timeout -= 10; 7200 } while (timeout > 0 && (status != PCI_VENDOR_ID_ATI) && 7201 (status != PCI_VENDOR_ID_AMD)); 7202 7203 if ((status != PCI_VENDOR_ID_ATI) && (status != PCI_VENDOR_ID_AMD)) { 7204 r = -ETIME; 7205 goto out; 7206 } 7207 7208 amdgpu_device_load_switch_state(adev); 7209 /* Restore PCI confspace */ 7210 amdgpu_device_load_pci_state(pdev); 7211 7212 /* confirm ASIC came out of reset */ 7213 for (i = 0; i < adev->usec_timeout; i++) { 7214 memsize = amdgpu_asic_get_config_memsize(adev); 7215 7216 if (memsize != 0xffffffff) 7217 break; 7218 udelay(1); 7219 } 7220 if (memsize == 0xffffffff) { 7221 r = -ETIME; 7222 goto out; 7223 } 7224 7225 reset_context.method = AMD_RESET_METHOD_NONE; 7226 reset_context.reset_req_dev = adev; 7227 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 7228 set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags); 7229 INIT_LIST_HEAD(&device_list); 7230 7231 hive = amdgpu_get_xgmi_hive(adev); 7232 if (hive) { 7233 mutex_lock(&hive->hive_lock); 7234 reset_context.hive = hive; 7235 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 7236 tmp_adev->pcie_reset_ctx.in_link_reset = true; 7237 list_add_tail(&tmp_adev->reset_list, &device_list); 7238 } 7239 } else { 7240 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 7241 list_add_tail(&adev->reset_list, &device_list); 7242 } 7243 7244 r = amdgpu_device_asic_reset(adev, &device_list, &reset_context); 7245 out: 7246 if (!r) { 7247 if (amdgpu_device_cache_pci_state(adev->pdev)) 7248 pci_restore_state(adev->pdev); 7249 dev_info(adev->dev, "PCIe error recovery succeeded\n"); 7250 } else { 7251 dev_err(adev->dev, "PCIe error recovery failed, err:%d\n", r); 7252 if (hive) { 7253 list_for_each_entry(tmp_adev, &device_list, reset_list) 7254 amdgpu_device_unset_mp1_state(tmp_adev); 7255 } 7256 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 7257 } 7258 7259 if (hive) { 7260 mutex_unlock(&hive->hive_lock); 7261 amdgpu_put_xgmi_hive(hive); 7262 } 7263 7264 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 7265 } 7266 7267 /** 7268 * amdgpu_pci_resume() - resume normal ops after PCI reset 7269 * @pdev: pointer to PCI device 7270 * 7271 * Called when the error recovery driver tells us that its 7272 * OK to resume normal operation. 
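 * Note: this only takes action when the channel state recorded in
 * amdgpu_pci_error_detected() was pci_channel_io_frozen; it restarts the
 * schedulers and drops the reset lock taken when the error was detected.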
7273 */ 7274 void amdgpu_pci_resume(struct pci_dev *pdev) 7275 { 7276 struct drm_device *dev = pci_get_drvdata(pdev); 7277 struct amdgpu_device *adev = drm_to_adev(dev); 7278 struct list_head device_list; 7279 struct amdgpu_hive_info *hive = NULL; 7280 struct amdgpu_device *tmp_adev = NULL; 7281 7282 dev_info(adev->dev, "PCI error: resume callback!!\n"); 7283 7284 /* Only continue execution for the case of pci_channel_io_frozen */ 7285 if (adev->pci_channel_state != pci_channel_io_frozen) 7286 return; 7287 7288 INIT_LIST_HEAD(&device_list); 7289 7290 hive = amdgpu_get_xgmi_hive(adev); 7291 if (hive) { 7292 mutex_lock(&hive->hive_lock); 7293 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 7294 tmp_adev->pcie_reset_ctx.in_link_reset = false; 7295 list_add_tail(&tmp_adev->reset_list, &device_list); 7296 } 7297 } else 7298 list_add_tail(&adev->reset_list, &device_list); 7299 7300 amdgpu_device_sched_resume(&device_list, NULL, NULL); 7301 amdgpu_device_gpu_resume(adev, &device_list, false); 7302 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 7303 7304 if (hive) { 7305 mutex_unlock(&hive->hive_lock); 7306 amdgpu_put_xgmi_hive(hive); 7307 } 7308 } 7309 7310 static void amdgpu_device_cache_switch_state(struct amdgpu_device *adev) 7311 { 7312 struct pci_dev *swus, *swds; 7313 int r; 7314 7315 swds = pci_upstream_bridge(adev->pdev); 7316 if (!swds || swds->vendor != PCI_VENDOR_ID_ATI || 7317 pci_pcie_type(swds) != PCI_EXP_TYPE_DOWNSTREAM) 7318 return; 7319 swus = pci_upstream_bridge(swds); 7320 if (!swus || 7321 (swus->vendor != PCI_VENDOR_ID_ATI && 7322 swus->vendor != PCI_VENDOR_ID_AMD) || 7323 pci_pcie_type(swus) != PCI_EXP_TYPE_UPSTREAM) 7324 return; 7325 7326 /* If already saved, return */ 7327 if (adev->pcie_reset_ctx.swus) 7328 return; 7329 /* Upstream bridge is ATI, assume it's SWUS/DS architecture */ 7330 r = pci_save_state(swds); 7331 if (r) 7332 return; 7333 adev->pcie_reset_ctx.swds_pcistate = pci_store_saved_state(swds); 7334 7335 r = pci_save_state(swus); 7336 if (r) 7337 return; 7338 adev->pcie_reset_ctx.swus_pcistate = pci_store_saved_state(swus); 7339 7340 adev->pcie_reset_ctx.swus = swus; 7341 } 7342 7343 static void amdgpu_device_load_switch_state(struct amdgpu_device *adev) 7344 { 7345 struct pci_dev *pdev; 7346 int r; 7347 7348 if (!adev->pcie_reset_ctx.swds_pcistate || 7349 !adev->pcie_reset_ctx.swus_pcistate) 7350 return; 7351 7352 pdev = adev->pcie_reset_ctx.swus; 7353 r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swus_pcistate); 7354 if (!r) { 7355 pci_restore_state(pdev); 7356 } else { 7357 dev_warn(adev->dev, "Failed to load SWUS state, err:%d\n", r); 7358 return; 7359 } 7360 7361 pdev = pci_upstream_bridge(adev->pdev); 7362 r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swds_pcistate); 7363 if (!r) 7364 pci_restore_state(pdev); 7365 else 7366 dev_warn(adev->dev, "Failed to load SWDS state, err:%d\n", r); 7367 } 7368 7369 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 7370 { 7371 struct drm_device *dev = pci_get_drvdata(pdev); 7372 struct amdgpu_device *adev = drm_to_adev(dev); 7373 int r; 7374 7375 if (amdgpu_sriov_vf(adev)) 7376 return false; 7377 7378 r = pci_save_state(pdev); 7379 if (!r) { 7380 kfree(adev->pci_state); 7381 7382 adev->pci_state = pci_store_saved_state(pdev); 7383 7384 if (!adev->pci_state) { 7385 dev_err(adev->dev, "Failed to store PCI saved state"); 7386 return false; 7387 } 7388 } else { 7389 dev_warn(adev->dev, "Failed to save PCI state, err:%d\n", r); 7390 return false; 7391 } 7392 7393 
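	/*
	 * Also cache the state of the SWUS/SWDS bridges above the GPU (when
	 * present) so that amdgpu_device_load_switch_state() can restore
	 * them after a link reset.
	 */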
	amdgpu_device_cache_switch_state(adev);

	return true;
}

bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		dev_warn(adev->dev, "Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush) {
		amdgpu_ring_emit_hdp_flush(ring);
		return;
	}

	if (!ring && amdgpu_sriov_runtime(adev)) {
		if (!amdgpu_kiq_hdp_flush(adev))
			return;
	}

	amdgpu_hdp_flush(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_hdp_invalidate(adev, ring);
}

int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. It helps to preserve the error context when an error has
 * occurred. Compared to a simple hang, the system stays stable at least
 * for SSH access, so it should be trivial to inspect the hardware state
 * and see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc.),
 *    clears all CPU mappings to the device, disallows remappings through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6.
int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring hardware to some kind of halt state so that no one can touch it
 * any more. It helps to maintain the error context when an error occurs.
 * Compared to a simple hang, the system will remain stable at least for SSH
 * access. Then it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs,
 * etc.), clears all CPU mappings to the device and disallows remappings
 * through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 * flush any in-flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	amdgpu_xcp_dev_unplug(adev);
	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				 u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				  u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
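/*
 * Illustrative sketch only (an assumption, not a call site in this file;
 * SOME_PCIE_REG and SOME_BIT are placeholders): the two accessors above are
 * meant for read-modify-write cycles on indirect PCIe port registers, with
 * the index/data pair serialized by pcie_idx_lock:
 *
 *	u32 tmp;
 *
 *	tmp = amdgpu_device_pcie_port_rreg(adev, SOME_PCIE_REG);
 *	tmp |= SOME_BIT;
 *	amdgpu_device_pcie_port_wreg(adev, SOME_PCIE_REG, tmp);
 */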
/**
 * amdgpu_device_get_gang - return a reference to the current gang
 * @adev: amdgpu_device pointer
 *
 * Returns: A new reference to the current gang leader.
 */
struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev)
{
	struct dma_fence *fence;

	rcu_read_lock();
	fence = dma_fence_get_rcu_safe(&adev->gang_submit);
	rcu_read_unlock();
	return fence;
}

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	dma_fence_get(gang);
	do {
		dma_fence_put(old);
		old = amdgpu_device_get_gang(adev);
		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old)) {
			dma_fence_put(gang);
			return old;
		}

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	/*
	 * Drop it once for the exchanged reference in adev and once for the
	 * thread local reference acquired in amdgpu_device_get_gang().
	 */
	dma_fence_put(old);
	dma_fence_put(old);
	return NULL;
}
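/*
 * Illustrative sketch only (an assumption, not how the driver's real call
 * sites are structured): per the kernel-doc above, a non-NULL return value
 * is the still-running gang leader and acts as the dependency blocking the
 * switch; one possible blocking pattern is to wait and retry until the
 * switch succeeds:
 *
 *	struct dma_fence *old;
 *
 *	while ((old = amdgpu_device_switch_gang(adev, gang))) {
 *		dma_fence_wait(old, false);
 *		dma_fence_put(old);
 *	}
 */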
/**
 * amdgpu_device_enforce_isolation - enforce HW isolation
 * @adev: the amdgpu device pointer
 * @ring: the HW ring the job is supposed to run on
 * @job: the job which is about to be pushed to the HW ring
 *
 * Makes sure that only one client at a time can use the GFX block.
 * Returns: The dependency to wait on before the job can be pushed to the HW.
 * The function is called multiple times until NULL is returned.
 */
struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev,
						  struct amdgpu_ring *ring,
						  struct amdgpu_job *job)
{
	struct amdgpu_isolation *isolation = &adev->isolation[ring->xcp_id];
	struct drm_sched_fence *f = job->base.s_fence;
	struct dma_fence *dep;
	void *owner;
	int r;

	/*
	 * For now enforce isolation only for the GFX block since we only need
	 * the cleaner shader on those rings.
	 */
	if (ring->funcs->type != AMDGPU_RING_TYPE_GFX &&
	    ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE)
		return NULL;

	/*
	 * All submissions where enforce isolation is false are handled as if
	 * they come from a single client. Use ~0l as the owner to distinguish
	 * it from kernel submissions where the owner is NULL.
	 */
	owner = job->enforce_isolation ? f->owner : (void *)~0l;

	mutex_lock(&adev->enforce_isolation_mutex);

	/*
	 * The "spearhead" submission is the first one which changes the
	 * ownership to its client. We always need to wait for it to be
	 * pushed to the HW before proceeding with anything.
	 */
	if (&f->scheduled != isolation->spearhead &&
	    !dma_fence_is_signaled(isolation->spearhead)) {
		dep = isolation->spearhead;
		goto out_grab_ref;
	}

	if (isolation->owner != owner) {

		/*
		 * Wait for any gang to be assembled before switching to a
		 * different owner, otherwise we could deadlock the
		 * submissions.
		 */
		if (!job->gang_submit) {
			dep = amdgpu_device_get_gang(adev);
			if (!dma_fence_is_signaled(dep))
				goto out_return_dep;
			dma_fence_put(dep);
		}

		dma_fence_put(isolation->spearhead);
		isolation->spearhead = dma_fence_get(&f->scheduled);
		amdgpu_sync_move(&isolation->active, &isolation->prev);
		trace_amdgpu_isolation(isolation->owner, owner);
		isolation->owner = owner;
	}

	/*
	 * Specifying the ring here helps to pipeline submissions even when
	 * isolation is enabled. If that is not desired (e.g. for testing),
	 * NULL can be used instead of the ring to enforce a CPU round trip
	 * while switching between clients.
	 */
	dep = amdgpu_sync_peek_fence(&isolation->prev, ring);
	r = amdgpu_sync_fence(&isolation->active, &f->finished, GFP_NOWAIT);
	if (r)
		dev_warn(adev->dev, "OOM tracking isolation\n");

out_grab_ref:
	dma_fence_get(dep);
out_return_dep:
	mutex_unlock(&adev->enforce_isolation_mutex);
	return dep;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
				    uint32_t inst, uint32_t reg_addr, char reg_name[],
				    uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			dev_warn(
				adev->dev,
				"Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				inst, reg_name, (uint32_t)expected_value,
				(uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}
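/*
 * Illustrative sketch only (an assumption; regSOME_STATUS, SOME_READY and
 * SOME_MASK are placeholders): polling a status register until the masked
 * value matches what we expect, with the register name only used in the
 * timeout warning:
 *
 *	if (amdgpu_device_wait_on_rreg(adev, 0, regSOME_STATUS, "SOME_STATUS",
 *				       SOME_READY, SOME_MASK))
 *		return -ETIMEDOUT;
 */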
ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
{
	ssize_t size = 0;

	if (!ring || !ring->adev)
		return size;

	if (amdgpu_device_should_recover_gpu(ring->adev))
		size |= AMDGPU_RESET_TYPE_FULL;

	if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
	    !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery)
		size |= AMDGPU_RESET_TYPE_SOFT_RESET;

	return size;
}

ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
{
	ssize_t size = 0;

	if (supported_reset == 0) {
		size += sysfs_emit_at(buf, size, "unsupported");
		size += sysfs_emit_at(buf, size, "\n");
		return size;
	}

	if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
		size += sysfs_emit_at(buf, size, "soft ");

	if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
		size += sysfs_emit_at(buf, size, "queue ");

	if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)
		size += sysfs_emit_at(buf, size, "pipe ");

	if (supported_reset & AMDGPU_RESET_TYPE_FULL)
		size += sysfs_emit_at(buf, size, "full ");

	size += sysfs_emit_at(buf, size, "\n");
	return size;
}

void amdgpu_device_set_uid(struct amdgpu_uid *uid_info,
			   enum amdgpu_uid_type type, uint8_t inst,
			   uint64_t uid)
{
	if (!uid_info)
		return;

	if (type >= AMDGPU_UID_TYPE_MAX) {
		dev_err_once(uid_info->adev->dev, "Invalid UID type %d\n",
			     type);
		return;
	}

	if (inst >= AMDGPU_UID_INST_MAX) {
		dev_err_once(uid_info->adev->dev, "Invalid UID instance %d\n",
			     inst);
		return;
	}

	if (uid_info->uid[type][inst] != 0) {
		dev_warn_once(
			uid_info->adev->dev,
			"Overwriting existing UID %llu for type %d instance %d\n",
			uid_info->uid[type][inst], type, inst);
	}

	uid_info->uid[type][inst] = uid;
}

u64 amdgpu_device_get_uid(struct amdgpu_uid *uid_info,
			  enum amdgpu_uid_type type, uint8_t inst)
{
	if (!uid_info)
		return 0;

	if (type >= AMDGPU_UID_TYPE_MAX) {
		dev_err_once(uid_info->adev->dev, "Invalid UID type %d\n",
			     type);
		return 0;
	}

	if (inst >= AMDGPU_UID_INST_MAX) {
		dev_err_once(uid_info->adev->dev, "Invalid UID instance %d\n",
			     inst);
		return 0;
	}

	return uid_info->uid[type][inst];
}
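/*
 * Illustrative sketch only (an assumption, not a call site in this file;
 * SOME_UID_TYPE stands in for a real enum amdgpu_uid_type value and uid_info
 * for however the struct amdgpu_uid pointer is obtained): recording a
 * discovered unique id for instance 0 and reading it back later:
 *
 *	amdgpu_device_set_uid(uid_info, SOME_UID_TYPE, 0, uid);
 *	...
 *	uid = amdgpu_device_get_uid(uid_info, SOME_UID_TYPE, 0);
 */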