/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */

#include <linux/aperture.h>
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/pci-p2pdma.h>
#include <linux/apple-gmux.h>

#include <drm/drm_atomic_helper.h>
#include <drm/drm_client_event.h>
#include <drm/drm_crtc_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/device.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_ras_mgr.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"
#include "amdgpu_virt.h"
#include "amdgpu_dev_coredump.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

#if IS_ENABLED(CONFIG_X86)
#include <asm/intel-family.h>
#include <asm/cpu_device_id.h>
#endif

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/cyan_skillfish_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
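/*
 * Fallback PCIE index/data register offsets (in dwords), used by
 * amdgpu_device_indirect_rreg_ext() when the NBIO callbacks are not yet
 * registered early during init.
 */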
#define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
#define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
#define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)

#define AMDGPU_VBIOS_SKIP (1U << 0)
#define AMDGPU_VBIOS_OPTIONAL (1U << 1)

static const struct drm_driver amdgpu_kms_driver;

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"ALDEBARAN",
	"NAVI10",
	"CYAN_SKILLFISH",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"VANGOGH",
	"DIMGREY_CAVEFISH",
	"BEIGE_GOBY",
	"YELLOW_CARP",
	"IP DISCOVERY",
	"LAST",
};

#define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0)
/*
 * Default init level where all blocks are expected to be initialized. This is
 * the level of initialization expected by default and also after a full reset
 * of the device.
 */
struct amdgpu_init_level amdgpu_init_default = {
	.level = AMDGPU_INIT_LEVEL_DEFAULT,
	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

struct amdgpu_init_level amdgpu_init_recovery = {
	.level = AMDGPU_INIT_LEVEL_RESET_RECOVERY,
	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

/*
 * Minimal blocks needed to be initialized before an XGMI hive can be reset.
 * This is used for cases like reset on initialization where the entire hive
 * needs to be reset before first use.
 */
struct amdgpu_init_level amdgpu_init_minimal_xgmi = {
	.level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI,
	.hwini_ip_block_mask =
		BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) |
		BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) |
		BIT(AMD_IP_BLOCK_TYPE_PSP)
};

static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev);
static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev);
static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev);

static void amdgpu_device_load_switch_state(struct amdgpu_device *adev);

static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev,
					     enum amd_ip_block_type block)
{
	return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0;
}

void amdgpu_set_init_level(struct amdgpu_device *adev,
			   enum amdgpu_init_lvl_id lvl)
{
	switch (lvl) {
	case AMDGPU_INIT_LEVEL_MINIMAL_XGMI:
		adev->init_lvl = &amdgpu_init_minimal_xgmi;
		break;
	case AMDGPU_INIT_LEVEL_RESET_RECOVERY:
		adev->init_lvl = &amdgpu_init_recovery;
		break;
	case AMDGPU_INIT_LEVEL_DEFAULT:
		fallthrough;
	default:
		adev->init_lvl = &amdgpu_init_default;
		break;
	}
}

static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
				     void *data);

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
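 *
 * For example (the card index varies by system):
 *
 *   $ cat /sys/class/drm/card0/device/pcie_replay_count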
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return sysfs_emit(buf, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, 0444,
		   amdgpu_device_get_pcie_replay_count, NULL);

static int amdgpu_device_attr_sysfs_init(struct amdgpu_device *adev)
{
	int ret = 0;

	if (amdgpu_nbio_is_replay_cnt_supported(adev))
		ret = sysfs_create_file(&adev->dev->kobj,
					&dev_attr_pcie_replay_count.attr);

	return ret;
}

static void amdgpu_device_attr_sysfs_fini(struct amdgpu_device *adev)
{
	if (amdgpu_nbio_is_replay_cnt_supported(adev))
		sysfs_remove_file(&adev->dev->kobj,
				  &dev_attr_pcie_replay_count.attr);
}

static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
					  const struct bin_attribute *attr, char *buf,
					  loff_t ppos, size_t count)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	ssize_t bytes_read;

	switch (ppos) {
	case AMDGPU_SYS_REG_STATE_XGMI:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_WAFL:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_PCIE:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR_1:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count);
		break;
	default:
		return -EINVAL;
	}

	return bytes_read;
}

static const BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL,
		      AMDGPU_SYS_REG_STATE_END);

int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev)
{
	int ret;

	if (!amdgpu_asic_get_reg_state_supported(adev))
		return 0;

	ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state);

	return ret;
}

void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
{
	if (!amdgpu_asic_get_reg_state_supported(adev))
		return;
	sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
}

int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block)
{
	int r;

	if (ip_block->version->funcs->suspend) {
		r = ip_block->version->funcs->suspend(ip_block);
		if (r) {
			dev_err(ip_block->adev->dev,
				"suspend of IP block <%s> failed %d\n",
				ip_block->version->funcs->name, r);
			return r;
		}
	}

	ip_block->status.hw = false;
	return 0;
}

int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block)
{
	int r;

	if (ip_block->version->funcs->resume) {
		r = ip_block->version->funcs->resume(ip_block);
		if (r) {
			dev_err(ip_block->adev->dev,
				"resume of IP block <%s> failed %d\n",
				ip_block->version->funcs->name, r);
			return r;
		}
	}

	ip_block->status.hw = true;
	return 0;
}

/**
 * DOC: board_info
 *
 * The amdgpu driver provides a sysfs API for reporting board related
 * information. It provides the form factor information in the format
 *
 *   type : form factor
 *
 * Possible form factor values
 *
 * - "cem"     - PCIE CEM card
 * - "oam"     - Open Compute Accelerator Module
 * - "unknown" - Not known
 *
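 * Reading the file might look like (the card index varies by system):
 *
 *   $ cat /sys/class/drm/card0/device/board_info
 *   type : oam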
 */

static ssize_t amdgpu_device_get_board_info(struct device *dev,
					    struct device_attribute *attr,
					    char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
	const char *pkg;

	if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
		pkg_type = adev->smuio.funcs->get_pkg_type(adev);

	switch (pkg_type) {
	case AMDGPU_PKG_TYPE_CEM:
		pkg = "cem";
		break;
	case AMDGPU_PKG_TYPE_OAM:
		pkg = "oam";
		break;
	default:
		pkg = "unknown";
		break;
	}

	return sysfs_emit(buf, "%s : %s\n", "type", pkg);
}

static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);

static struct attribute *amdgpu_board_attrs[] = {
	&dev_attr_board_info.attr,
	NULL,
};

static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
					     struct attribute *attr, int n)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	if (adev->flags & AMD_IS_APU)
		return 0;

	return attr->mode;
}

static const struct attribute_group amdgpu_board_attrs_group = {
	.attrs = amdgpu_board_attrs,
	.is_visible = amdgpu_board_attrs_is_visible
};

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);

/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @adev: amdgpu device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_px(struct amdgpu_device *adev)
{
	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @adev: amdgpu device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct amdgpu_device *adev)
{
	if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE))
		return false;

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @adev: amdgpu device pointer
 *
 * Return:
 * 1 if the device supports BACO;
 * 3 if the device supports MACO (only works if BACO is supported)
 * otherwise return 0.
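 *
 * The value is treated as a mask of BACO_SUPPORT/MACO_SUPPORT bits by
 * amdgpu_device_detect_runtime_pm_mode() below when picking between the
 * BACO and BAMACO runtime PM modes.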
 */
int amdgpu_device_supports_baco(struct amdgpu_device *adev)
{
	return amdgpu_asic_supports_baco(adev);
}

void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
{
	int bamaco_support;

	adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
	bamaco_support = amdgpu_device_supports_baco(adev);

	switch (amdgpu_runtime_pm) {
	case 2:
		if (bamaco_support & MACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
			dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
		} else if (bamaco_support == BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Requested mode BAMACO not available, fallback to BACO\n");
		}
		break;
	case 1:
		if (bamaco_support & BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Forcing BACO for runtime pm\n");
		}
		break;
	case -1:
	case -2:
		if (amdgpu_device_supports_px(adev)) {
			/* enable PX as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
			dev_info(adev->dev, "Using ATPX for runtime pm\n");
		} else if (amdgpu_device_supports_boco(adev)) {
			/* enable boco as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
			dev_info(adev->dev, "Using BOCO for runtime pm\n");
		} else {
			if (!bamaco_support)
				goto no_runtime_pm;

			switch (adev->asic_type) {
			case CHIP_VEGA20:
			case CHIP_ARCTURUS:
				/* BACO is not supported on vega20 and arcturus */
				break;
			case CHIP_VEGA10:
				/* enable BACO as runpm mode if noretry=0 */
				if (!adev->gmc.noretry && !amdgpu_passthrough(adev))
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			default:
				/* enable BACO as runpm mode on CI+ */
				if (!amdgpu_passthrough(adev))
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			}

			if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
				if (bamaco_support & MACO_SUPPORT) {
					adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
					dev_info(adev->dev, "Using BAMACO for runtime pm\n");
				} else {
					dev_info(adev->dev, "Using BACO for runtime pm\n");
				}
			}
		}
		break;
	case 0:
		dev_info(adev->dev, "runtime pm is manually disabled\n");
		break;
	default:
		break;
	}

no_runtime_pm:
	if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
		dev_info(adev->dev, "Runtime PM not available\n");
}

/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * smart shift support
 *
 * @adev: amdgpu device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct amdgpu_device *adev)
{
	return (amdgpu_device_supports_boco(adev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * Returns the number of bytes transferred.
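 *
 * Only available on 64-bit builds (returns 0 otherwise); callers such as
 * amdgpu_device_vram_access() fall back to MM_INDEX/MM_DATA access for any
 * remainder that could not be served through the aperture.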
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the VRAM aperture to access VRAM first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM access for the rest of the VRAM */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
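 *
 * For SR-IOV guests in runtime mode the access is routed through the KIQ,
 * and offsets beyond the MMIO BAR fall back to the indirect PCIE path.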
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
				uint32_t reg, uint32_t acc_flags,
				uint32_t xcc_id)
{
	uint32_t ret, rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	return ret;
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/**
 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 * @xcc_id: xcc accelerated compute core id
 *
 * This function is invoked only for debugfs register access.
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v,
			     uint32_t xcc_id)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Writes the value specified to the offset specified.
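 *
 * On SR-IOV VFs with RLCG register access support, writes to GC registers
 * are routed through amdgpu_virt_rlcg_reg_rw() for the given XCC instance.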
 */
void amdgpu_device_xcc_wreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t v,
			    uint32_t acc_flags, uint32_t xcc_id)
{
	uint32_t rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, true,
							 &rlcg_flag)) {
			amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u32 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
				    u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	if (unlikely(!adev->nbio.funcs)) {
		pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
		pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
	} else {
		pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
		pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	}

	if (reg_addr >> 32) {
		if (unlikely(!adev->nbio.funcs))
			pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
		else
			pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	} else {
		pcie_index_hi = 0;
	}

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				       pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}
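
/*
 * The *_ext variants of the indirect register helpers accept register
 * addresses wider than 32 bits and additionally program the PCIE index-hi
 * register; amdgpu_device_indirect_rreg_ext() also falls back to fixed
 * index/data offsets when the NBIO callbacks are not yet registered.
 */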

/**
 * amdgpu_device_indirect_rreg64 - read a 64-bit indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
				      u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				       pcie_index_hi * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r |= ((u64)readl(pcie_data_offset) << 32);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
				     u64 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				       pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64-bit indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
				       u64 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				       pcie_index_hi * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_rev_id - query device rev_id
 *
 * @adev: amdgpu_device pointer
 *
 * Return device rev_id
 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
{
	return adev->nbio.funcs->get_rev_id(adev);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	dev_err(adev->dev, "Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
{
	dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write register 0x%04X with 0x%08X\n", reg,
		v);
	BUG();
}

static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write register 0x%llX with 0x%08X\n", reg,
		v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	dev_err(adev->dev, "Invalid callback to read 64 bit register 0x%04X\n",
		reg);
	BUG();
	return 0;
}

static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
{
	dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		reg, v);
	BUG();
}

static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
		reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	dev_err(adev->dev,
		"Invalid callback to read register 0x%04X in block 0x%04X\n",
		reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	dev_err(adev->dev,
		"Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		reg, block, v);
	BUG();
}

static uint32_t amdgpu_device_get_vbios_flags(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
		return AMDGPU_VBIOS_SKIP;

	if (hweight32(adev->aid_mask) && amdgpu_passthrough(adev))
		return AMDGPU_VBIOS_OPTIONAL;

	return 0;
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
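 *
 * When the VBIOS is optional or skipped for this configuration (see
 * amdgpu_device_get_vbios_flags()) and no BIOS image is present, the atom
 * init call is skipped.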
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	uint32_t flags;
	bool optional;
	int ret;

	amdgpu_asic_pre_asic_init(adev);
	flags = amdgpu_device_get_vbios_flags(adev);
	optional = !!(flags & (AMDGPU_VBIOS_OPTIONAL | AMDGPU_VBIOS_SKIP));

	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
		amdgpu_psp_wait_for_bootloader(adev);
		if (optional && !adev->bios)
			return 0;

		ret = amdgpu_atomfirmware_asic_init(adev, true);
		return ret;
	} else {
		if (optional && !adev->bios)
			return 0;

		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
	}

	return 0;
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND/OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long flags, offset;

	spin_lock_irqsave(&adev->wb.lock, flags);
	offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	unsigned long flags;

	wb >>= 3;
	spin_lock_irqsave(&adev->wb.lock, flags);
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
	spin_unlock_irqrestore(&adev->wb.lock, flags);
}

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
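 *
 * Resizing is skipped for SR-IOV VFs, when ReBAR support is disabled via the
 * amdgpu.rebar module parameter, and on boards where it is known to interfere
 * with runtime PM.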
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned int i;
	u16 cmd;
	int r;

	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
		return 0;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	if (!amdgpu_rebar)
		return 0;

	/* resizing on Dell G5 SE platforms causes problems with runtime pm */
	if ((amdgpu_runtime_pm != 0) &&
	    adev->pdev->vendor == PCI_VENDOR_ID_ATI &&
	    adev->pdev->device == 0x731f &&
	    adev->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL)
		return 0;

	/* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
	if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
		dev_warn(
			adev->dev,
			"System can't access extended configuration space, please check!!\n");

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		dev_info(adev->dev,
			 "Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		dev_err(adev->dev, "Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

/*
 * GPU helpers function.
 */
/**
 * amdgpu_device_need_post - check if the hw needs post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if post is needed or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg, flags;

	if (amdgpu_sriov_vf(adev))
		return false;

	flags = amdgpu_device_get_vbios_flags(adev);
	if (flags & AMDGPU_VBIOS_SKIP)
		return false;
	if ((flags & AMDGPU_VBIOS_OPTIONAL) && !adev->bios)
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In the whole-GPU pass-through virtualization case, after a VM
		 * reboot some old SMC firmware still needs the driver to do a vPost,
		 * otherwise the GPU hangs. SMC firmware versions above 22.15 don't have
		 * this flaw, so force vPost for SMC versions below 22.15.
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			release_firmware(adev->pm.fw);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/*
 * Check whether seamless boot is supported.
 *
 * So far we only support seamless boot on DCE 3.0 or later.
 * If users report that it works on older ASICS as well, we may
 * loosen this.
 */
bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
{
	switch (amdgpu_seamless) {
	case -1:
		break;
	case 1:
		return true;
	case 0:
		return false;
	default:
		dev_err(adev->dev, "Invalid value for amdgpu.seamless: %d\n",
			amdgpu_seamless);
		return false;
	}

	if (!(adev->flags & AMD_IS_APU))
		return false;

	if (adev->mman.keep_stolen_vga_memory)
		return false;

	return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
}

/*
 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
 * don't support dynamic speed switching. Until we have confirmation from Intel
 * that a specific host supports it, it's safer that we keep it disabled for all.
 *
 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
 */
static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	/* eGPUs change speeds based on USB4 fabric conditions */
	if (dev_is_removable(adev->dev))
		return true;

	if (c->x86_vendor == X86_VENDOR_INTEL)
		return false;
#endif
	return true;
}

static bool amdgpu_device_aspm_support_quirk(struct amdgpu_device *adev)
{
	/* Enabling ASPM causes random hangs on Tahiti and Oland on Zen4.
1891 * It's unclear if this is a platform-specific or GPU-specific issue. 1892 * Disable ASPM on SI for the time being. 1893 */ 1894 if (adev->family == AMDGPU_FAMILY_SI) 1895 return true; 1896 1897 #if IS_ENABLED(CONFIG_X86) 1898 struct cpuinfo_x86 *c = &cpu_data(0); 1899 1900 if (!(amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 0) || 1901 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 1))) 1902 return false; 1903 1904 if (c->x86 == 6 && 1905 adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5) { 1906 switch (c->x86_model) { 1907 case VFM_MODEL(INTEL_ALDERLAKE): 1908 case VFM_MODEL(INTEL_ALDERLAKE_L): 1909 case VFM_MODEL(INTEL_RAPTORLAKE): 1910 case VFM_MODEL(INTEL_RAPTORLAKE_P): 1911 case VFM_MODEL(INTEL_RAPTORLAKE_S): 1912 return true; 1913 default: 1914 return false; 1915 } 1916 } else { 1917 return false; 1918 } 1919 #else 1920 return false; 1921 #endif 1922 } 1923 1924 /** 1925 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1926 * 1927 * @adev: amdgpu_device pointer 1928 * 1929 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1930 * be set for this device. 1931 * 1932 * Returns true if it should be used or false if not. 1933 */ 1934 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1935 { 1936 switch (amdgpu_aspm) { 1937 case -1: 1938 break; 1939 case 0: 1940 return false; 1941 case 1: 1942 return true; 1943 default: 1944 return false; 1945 } 1946 if (adev->flags & AMD_IS_APU) 1947 return false; 1948 if (amdgpu_device_aspm_support_quirk(adev)) 1949 return false; 1950 return pcie_aspm_enabled(adev->pdev); 1951 } 1952 1953 /* if we get transitioned to only one device, take VGA back */ 1954 /** 1955 * amdgpu_device_vga_set_decode - enable/disable vga decode 1956 * 1957 * @pdev: PCI device pointer 1958 * @state: enable/disable vga decode 1959 * 1960 * Enable/disable vga decode (all asics). 1961 * Returns VGA resource flags. 1962 */ 1963 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1964 bool state) 1965 { 1966 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1967 1968 amdgpu_asic_set_vga_state(adev, state); 1969 if (state) 1970 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1971 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1972 else 1973 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1974 } 1975 1976 /** 1977 * amdgpu_device_check_block_size - validate the vm block size 1978 * 1979 * @adev: amdgpu_device pointer 1980 * 1981 * Validates the vm block size specified via module parameter. 1982 * The vm block size defines number of bits in page table versus page directory, 1983 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1984 * page table and the remaining bits are in the page directory. 1985 */ 1986 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1987 { 1988 /* defines number of bits in page table versus page directory, 1989 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1990 * page table and the remaining bits are in the page directory 1991 */ 1992 if (amdgpu_vm_block_size == -1) 1993 return; 1994 1995 if (amdgpu_vm_block_size < 9) { 1996 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1997 amdgpu_vm_block_size); 1998 amdgpu_vm_block_size = -1; 1999 } 2000 } 2001 2002 /** 2003 * amdgpu_device_check_vm_size - validate the vm size 2004 * 2005 * @adev: amdgpu_device pointer 2006 * 2007 * Validates the vm size in GB specified via module parameter. 
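 * As an illustrative example (the module parameter name amdgpu.vm_size is
 * an assumption here): booting with amdgpu.vm_size=256 requests a 256 GB
 * GPU virtual address space, while any value below 1 is rejected by this
 * check and reverts to the default of -1 (automatic sizing).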
2008 * The VM size is the size of the GPU virtual memory space in GB. 2009 */ 2010 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 2011 { 2012 /* no need to check the default value */ 2013 if (amdgpu_vm_size == -1) 2014 return; 2015 2016 if (amdgpu_vm_size < 1) { 2017 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 2018 amdgpu_vm_size); 2019 amdgpu_vm_size = -1; 2020 } 2021 } 2022 2023 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 2024 { 2025 struct sysinfo si; 2026 bool is_os_64 = (sizeof(void *) == 8); 2027 uint64_t total_memory; 2028 uint64_t dram_size_seven_GB = 0x1B8000000; 2029 uint64_t dram_size_three_GB = 0xB8000000; 2030 2031 if (amdgpu_smu_memory_pool_size == 0) 2032 return; 2033 2034 if (!is_os_64) { 2035 dev_warn(adev->dev, "Not 64-bit OS, feature not supported\n"); 2036 goto def_value; 2037 } 2038 si_meminfo(&si); 2039 total_memory = (uint64_t)si.totalram * si.mem_unit; 2040 2041 if ((amdgpu_smu_memory_pool_size == 1) || 2042 (amdgpu_smu_memory_pool_size == 2)) { 2043 if (total_memory < dram_size_three_GB) 2044 goto def_value1; 2045 } else if ((amdgpu_smu_memory_pool_size == 4) || 2046 (amdgpu_smu_memory_pool_size == 8)) { 2047 if (total_memory < dram_size_seven_GB) 2048 goto def_value1; 2049 } else { 2050 dev_warn(adev->dev, "Smu memory pool size not supported\n"); 2051 goto def_value; 2052 } 2053 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 2054 2055 return; 2056 2057 def_value1: 2058 dev_warn(adev->dev, "No enough system memory\n"); 2059 def_value: 2060 adev->pm.smu_prv_buffer_size = 0; 2061 } 2062 2063 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 2064 { 2065 if (!(adev->flags & AMD_IS_APU) || 2066 adev->asic_type < CHIP_RAVEN) 2067 return 0; 2068 2069 switch (adev->asic_type) { 2070 case CHIP_RAVEN: 2071 if (adev->pdev->device == 0x15dd) 2072 adev->apu_flags |= AMD_APU_IS_RAVEN; 2073 if (adev->pdev->device == 0x15d8) 2074 adev->apu_flags |= AMD_APU_IS_PICASSO; 2075 break; 2076 case CHIP_RENOIR: 2077 if ((adev->pdev->device == 0x1636) || 2078 (adev->pdev->device == 0x164c)) 2079 adev->apu_flags |= AMD_APU_IS_RENOIR; 2080 else 2081 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 2082 break; 2083 case CHIP_VANGOGH: 2084 adev->apu_flags |= AMD_APU_IS_VANGOGH; 2085 break; 2086 case CHIP_YELLOW_CARP: 2087 break; 2088 case CHIP_CYAN_SKILLFISH: 2089 if ((adev->pdev->device == 0x13FE) || 2090 (adev->pdev->device == 0x143F)) 2091 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 2092 break; 2093 default: 2094 break; 2095 } 2096 2097 return 0; 2098 } 2099 2100 /** 2101 * amdgpu_device_check_arguments - validate module params 2102 * 2103 * @adev: amdgpu_device pointer 2104 * 2105 * Validates certain module parameters and updates 2106 * the associated values used by the driver (all asics). 
2107 */ 2108 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 2109 { 2110 int i; 2111 2112 if (amdgpu_sched_jobs < 4) { 2113 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 2114 amdgpu_sched_jobs); 2115 amdgpu_sched_jobs = 4; 2116 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 2117 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 2118 amdgpu_sched_jobs); 2119 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 2120 } 2121 2122 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 2123 /* gart size must be greater or equal to 32M */ 2124 dev_warn(adev->dev, "gart size (%d) too small\n", 2125 amdgpu_gart_size); 2126 amdgpu_gart_size = -1; 2127 } 2128 2129 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 2130 /* gtt size must be greater or equal to 32M */ 2131 dev_warn(adev->dev, "gtt size (%d) too small\n", 2132 amdgpu_gtt_size); 2133 amdgpu_gtt_size = -1; 2134 } 2135 2136 /* valid range is between 4 and 9 inclusive */ 2137 if (amdgpu_vm_fragment_size != -1 && 2138 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 2139 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 2140 amdgpu_vm_fragment_size = -1; 2141 } 2142 2143 if (amdgpu_sched_hw_submission < 2) { 2144 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 2145 amdgpu_sched_hw_submission); 2146 amdgpu_sched_hw_submission = 2; 2147 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 2148 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 2149 amdgpu_sched_hw_submission); 2150 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 2151 } 2152 2153 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 2154 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 2155 amdgpu_reset_method = -1; 2156 } 2157 2158 amdgpu_device_check_smu_prv_buffer_size(adev); 2159 2160 amdgpu_device_check_vm_size(adev); 2161 2162 amdgpu_device_check_block_size(adev); 2163 2164 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 2165 2166 for (i = 0; i < MAX_XCP; i++) { 2167 switch (amdgpu_enforce_isolation) { 2168 case -1: 2169 case 0: 2170 default: 2171 /* disable */ 2172 adev->enforce_isolation[i] = AMDGPU_ENFORCE_ISOLATION_DISABLE; 2173 break; 2174 case 1: 2175 /* enable */ 2176 adev->enforce_isolation[i] = 2177 AMDGPU_ENFORCE_ISOLATION_ENABLE; 2178 break; 2179 case 2: 2180 /* enable legacy mode */ 2181 adev->enforce_isolation[i] = 2182 AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY; 2183 break; 2184 case 3: 2185 /* enable only process isolation without submitting cleaner shader */ 2186 adev->enforce_isolation[i] = 2187 AMDGPU_ENFORCE_ISOLATION_NO_CLEANER_SHADER; 2188 break; 2189 } 2190 } 2191 2192 return 0; 2193 } 2194 2195 /** 2196 * amdgpu_switcheroo_set_state - set switcheroo state 2197 * 2198 * @pdev: pci dev pointer 2199 * @state: vga_switcheroo state 2200 * 2201 * Callback for the switcheroo driver. Suspends or resumes 2202 * the asics before or after it is powered up using ACPI methods. 
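 *
 * For context, a hedged sketch of how this callback reaches the
 * vga_switcheroo core (the actual registration site is outside this
 * excerpt, so the runtime flag below is an assumption):
 *
 *   vga_switcheroo_register_client(adev->pdev, &amdgpu_switcheroo_ops,
 *                                  runtime_pm);
 *
 * vga_switcheroo_register_client() is the standard kernel API for
 * hybrid-graphics clients; amdgpu_switcheroo_ops is defined below.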
2203 */ 2204 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 2205 enum vga_switcheroo_state state) 2206 { 2207 struct drm_device *dev = pci_get_drvdata(pdev); 2208 int r; 2209 2210 if (amdgpu_device_supports_px(drm_to_adev(dev)) && 2211 state == VGA_SWITCHEROO_OFF) 2212 return; 2213 2214 if (state == VGA_SWITCHEROO_ON) { 2215 pr_info("switched on\n"); 2216 /* don't suspend or resume card normally */ 2217 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2218 2219 pci_set_power_state(pdev, PCI_D0); 2220 amdgpu_device_load_pci_state(pdev); 2221 r = pci_enable_device(pdev); 2222 if (r) 2223 dev_warn(&pdev->dev, "pci_enable_device failed (%d)\n", 2224 r); 2225 amdgpu_device_resume(dev, true); 2226 2227 dev->switch_power_state = DRM_SWITCH_POWER_ON; 2228 } else { 2229 dev_info(&pdev->dev, "switched off\n"); 2230 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2231 amdgpu_device_prepare(dev); 2232 amdgpu_device_suspend(dev, true); 2233 amdgpu_device_cache_pci_state(pdev); 2234 /* Shut down the device */ 2235 pci_disable_device(pdev); 2236 pci_set_power_state(pdev, PCI_D3cold); 2237 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 2238 } 2239 } 2240 2241 /** 2242 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 2243 * 2244 * @pdev: pci dev pointer 2245 * 2246 * Callback for the switcheroo driver. Check of the switcheroo 2247 * state can be changed. 2248 * Returns true if the state can be changed, false if not. 2249 */ 2250 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 2251 { 2252 struct drm_device *dev = pci_get_drvdata(pdev); 2253 2254 /* 2255 * FIXME: open_count is protected by drm_global_mutex but that would lead to 2256 * locking inversion with the driver load path. And the access here is 2257 * completely racy anyway. So don't bother with locking for now. 2258 */ 2259 return atomic_read(&dev->open_count) == 0; 2260 } 2261 2262 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 2263 .set_gpu_state = amdgpu_switcheroo_set_state, 2264 .reprobe = NULL, 2265 .can_switch = amdgpu_switcheroo_can_switch, 2266 }; 2267 2268 /** 2269 * amdgpu_device_ip_set_clockgating_state - set the CG state 2270 * 2271 * @dev: amdgpu_device pointer 2272 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2273 * @state: clockgating state (gate or ungate) 2274 * 2275 * Sets the requested clockgating state for all instances of 2276 * the hardware IP specified. 2277 * Returns the error code from the last instance. 2278 */ 2279 int amdgpu_device_ip_set_clockgating_state(void *dev, 2280 enum amd_ip_block_type block_type, 2281 enum amd_clockgating_state state) 2282 { 2283 struct amdgpu_device *adev = dev; 2284 int i, r = 0; 2285 2286 for (i = 0; i < adev->num_ip_blocks; i++) { 2287 if (!adev->ip_blocks[i].status.valid) 2288 continue; 2289 if (adev->ip_blocks[i].version->type != block_type) 2290 continue; 2291 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 2292 continue; 2293 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 2294 &adev->ip_blocks[i], state); 2295 if (r) 2296 dev_err(adev->dev, 2297 "set_clockgating_state of IP block <%s> failed %d\n", 2298 adev->ip_blocks[i].version->funcs->name, r); 2299 } 2300 return r; 2301 } 2302 2303 /** 2304 * amdgpu_device_ip_set_powergating_state - set the PG state 2305 * 2306 * @dev: amdgpu_device pointer 2307 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			&adev->ip_blocks[i], state);
		if (r)
			dev_err(adev->dev,
				"set_powergating_state of IP block <%s> failed %d\n",
				adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u64 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state(
				&adev->ip_blocks[i], flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			if (adev->ip_blocks[i].version->funcs->wait_for_idle) {
				r = adev->ip_blocks[i].version->funcs->wait_for_idle(
					&adev->ip_blocks[i]);
				if (r)
					return r;
			}
			break;
		}
	}
	return 0;
}

/**
 * amdgpu_device_ip_is_hw - is the hardware IP enabled
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is enabled or not.
 * Returns true if the IP is enabled, false if not.
 */
bool amdgpu_device_ip_is_hw(struct amdgpu_device *adev,
			    enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].status.hw;
	}
	return false;
}

/**
 * amdgpu_device_ip_is_valid - is the hardware IP valid
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is valid or not.
 * Returns true if the IP is valid, false if not.
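 *
 * Illustrative sketch of how these query helpers compose (not a quote of
 * an actual call site; the GFX block is just an example type):
 *
 *   if (amdgpu_device_ip_is_valid(adev, AMD_IP_BLOCK_TYPE_GFX))
 *       r = amdgpu_device_ip_wait_for_idle(adev, AMD_IP_BLOCK_TYPE_GFX);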
2423 */ 2424 bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev, 2425 enum amd_ip_block_type block_type) 2426 { 2427 int i; 2428 2429 for (i = 0; i < adev->num_ip_blocks; i++) { 2430 if (adev->ip_blocks[i].version->type == block_type) 2431 return adev->ip_blocks[i].status.valid; 2432 } 2433 return false; 2434 2435 } 2436 2437 /** 2438 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 2439 * 2440 * @adev: amdgpu_device pointer 2441 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 2442 * 2443 * Returns a pointer to the hardware IP block structure 2444 * if it exists for the asic, otherwise NULL. 2445 */ 2446 struct amdgpu_ip_block * 2447 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 2448 enum amd_ip_block_type type) 2449 { 2450 int i; 2451 2452 for (i = 0; i < adev->num_ip_blocks; i++) 2453 if (adev->ip_blocks[i].version->type == type) 2454 return &adev->ip_blocks[i]; 2455 2456 return NULL; 2457 } 2458 2459 /** 2460 * amdgpu_device_ip_block_version_cmp 2461 * 2462 * @adev: amdgpu_device pointer 2463 * @type: enum amd_ip_block_type 2464 * @major: major version 2465 * @minor: minor version 2466 * 2467 * return 0 if equal or greater 2468 * return 1 if smaller or the ip_block doesn't exist 2469 */ 2470 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 2471 enum amd_ip_block_type type, 2472 u32 major, u32 minor) 2473 { 2474 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 2475 2476 if (ip_block && ((ip_block->version->major > major) || 2477 ((ip_block->version->major == major) && 2478 (ip_block->version->minor >= minor)))) 2479 return 0; 2480 2481 return 1; 2482 } 2483 2484 static const char *ip_block_names[] = { 2485 [AMD_IP_BLOCK_TYPE_COMMON] = "common", 2486 [AMD_IP_BLOCK_TYPE_GMC] = "gmc", 2487 [AMD_IP_BLOCK_TYPE_IH] = "ih", 2488 [AMD_IP_BLOCK_TYPE_SMC] = "smu", 2489 [AMD_IP_BLOCK_TYPE_PSP] = "psp", 2490 [AMD_IP_BLOCK_TYPE_DCE] = "dce", 2491 [AMD_IP_BLOCK_TYPE_GFX] = "gfx", 2492 [AMD_IP_BLOCK_TYPE_SDMA] = "sdma", 2493 [AMD_IP_BLOCK_TYPE_UVD] = "uvd", 2494 [AMD_IP_BLOCK_TYPE_VCE] = "vce", 2495 [AMD_IP_BLOCK_TYPE_ACP] = "acp", 2496 [AMD_IP_BLOCK_TYPE_VCN] = "vcn", 2497 [AMD_IP_BLOCK_TYPE_MES] = "mes", 2498 [AMD_IP_BLOCK_TYPE_JPEG] = "jpeg", 2499 [AMD_IP_BLOCK_TYPE_VPE] = "vpe", 2500 [AMD_IP_BLOCK_TYPE_UMSCH_MM] = "umsch_mm", 2501 [AMD_IP_BLOCK_TYPE_ISP] = "isp", 2502 [AMD_IP_BLOCK_TYPE_RAS] = "ras", 2503 }; 2504 2505 static const char *ip_block_name(struct amdgpu_device *adev, enum amd_ip_block_type type) 2506 { 2507 int idx = (int)type; 2508 2509 return idx < ARRAY_SIZE(ip_block_names) ? ip_block_names[idx] : "unknown"; 2510 } 2511 2512 /** 2513 * amdgpu_device_ip_block_add 2514 * 2515 * @adev: amdgpu_device pointer 2516 * @ip_block_version: pointer to the IP to add 2517 * 2518 * Adds the IP block driver information to the collection of IPs 2519 * on the asic. 
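 *
 * Illustrative sketch of how ASIC setup code adds its blocks one by one
 * (the version symbol below is a placeholder, not necessarily one defined
 * in the driver):
 *
 *   r = amdgpu_device_ip_block_add(adev, &example_common_ip_block);
 *   if (r)
 *       return r;
 *
 * Harvested VCN/JPEG instances are silently skipped, as handled in the
 * function body below.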
2520 */ 2521 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 2522 const struct amdgpu_ip_block_version *ip_block_version) 2523 { 2524 if (!ip_block_version) 2525 return -EINVAL; 2526 2527 switch (ip_block_version->type) { 2528 case AMD_IP_BLOCK_TYPE_VCN: 2529 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2530 return 0; 2531 break; 2532 case AMD_IP_BLOCK_TYPE_JPEG: 2533 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2534 return 0; 2535 break; 2536 default: 2537 break; 2538 } 2539 2540 dev_info(adev->dev, "detected ip block number %d <%s_v%d_%d_%d> (%s)\n", 2541 adev->num_ip_blocks, 2542 ip_block_name(adev, ip_block_version->type), 2543 ip_block_version->major, 2544 ip_block_version->minor, 2545 ip_block_version->rev, 2546 ip_block_version->funcs->name); 2547 2548 adev->ip_blocks[adev->num_ip_blocks].adev = adev; 2549 2550 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2551 2552 return 0; 2553 } 2554 2555 /** 2556 * amdgpu_device_enable_virtual_display - enable virtual display feature 2557 * 2558 * @adev: amdgpu_device pointer 2559 * 2560 * Enabled the virtual display feature if the user has enabled it via 2561 * the module parameter virtual_display. This feature provides a virtual 2562 * display hardware on headless boards or in virtualized environments. 2563 * This function parses and validates the configuration string specified by 2564 * the user and configures the virtual display configuration (number of 2565 * virtual connectors, crtcs, etc.) specified. 2566 */ 2567 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2568 { 2569 adev->enable_virtual_display = false; 2570 2571 if (amdgpu_virtual_display) { 2572 const char *pci_address_name = pci_name(adev->pdev); 2573 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2574 2575 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2576 pciaddstr_tmp = pciaddstr; 2577 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2578 pciaddname = strsep(&pciaddname_tmp, ","); 2579 if (!strcmp("all", pciaddname) 2580 || !strcmp(pci_address_name, pciaddname)) { 2581 long num_crtc; 2582 int res = -1; 2583 2584 adev->enable_virtual_display = true; 2585 2586 if (pciaddname_tmp) 2587 res = kstrtol(pciaddname_tmp, 10, 2588 &num_crtc); 2589 2590 if (!res) { 2591 if (num_crtc < 1) 2592 num_crtc = 1; 2593 if (num_crtc > 6) 2594 num_crtc = 6; 2595 adev->mode_info.num_crtc = num_crtc; 2596 } else { 2597 adev->mode_info.num_crtc = 1; 2598 } 2599 break; 2600 } 2601 } 2602 2603 dev_info( 2604 adev->dev, 2605 "virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2606 amdgpu_virtual_display, pci_address_name, 2607 adev->enable_virtual_display, adev->mode_info.num_crtc); 2608 2609 kfree(pciaddstr); 2610 } 2611 } 2612 2613 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2614 { 2615 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2616 adev->mode_info.num_crtc = 1; 2617 adev->enable_virtual_display = true; 2618 dev_info(adev->dev, "virtual_display:%d, num_crtc:%d\n", 2619 adev->enable_virtual_display, 2620 adev->mode_info.num_crtc); 2621 } 2622 } 2623 2624 /** 2625 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2626 * 2627 * @adev: amdgpu_device pointer 2628 * 2629 * Parses the asic configuration parameters specified in the gpu info 2630 * firmware and makes them available to the driver for use in configuring 2631 * the asic. 2632 * Returns 0 on success, -EINVAL on failure. 
2633 */ 2634 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2635 { 2636 const char *chip_name; 2637 int err; 2638 const struct gpu_info_firmware_header_v1_0 *hdr; 2639 2640 adev->firmware.gpu_info_fw = NULL; 2641 2642 switch (adev->asic_type) { 2643 default: 2644 return 0; 2645 case CHIP_VEGA10: 2646 chip_name = "vega10"; 2647 break; 2648 case CHIP_VEGA12: 2649 chip_name = "vega12"; 2650 break; 2651 case CHIP_RAVEN: 2652 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2653 chip_name = "raven2"; 2654 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2655 chip_name = "picasso"; 2656 else 2657 chip_name = "raven"; 2658 break; 2659 case CHIP_ARCTURUS: 2660 chip_name = "arcturus"; 2661 break; 2662 case CHIP_NAVI12: 2663 if (adev->discovery.bin) 2664 return 0; 2665 chip_name = "navi12"; 2666 break; 2667 case CHIP_CYAN_SKILLFISH: 2668 chip_name = "cyan_skillfish"; 2669 break; 2670 } 2671 2672 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, 2673 AMDGPU_UCODE_OPTIONAL, 2674 "amdgpu/%s_gpu_info.bin", chip_name); 2675 if (err) { 2676 dev_err(adev->dev, 2677 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n", 2678 chip_name); 2679 goto out; 2680 } 2681 2682 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2683 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2684 2685 switch (hdr->version_major) { 2686 case 1: 2687 { 2688 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2689 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2690 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2691 2692 /* 2693 * Should be dropped when DAL no longer needs it. 2694 */ 2695 if (adev->asic_type == CHIP_NAVI12) 2696 goto parse_soc_bounding_box; 2697 2698 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2699 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2700 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2701 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2702 adev->gfx.config.max_texture_channel_caches = 2703 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2704 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2705 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2706 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2707 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2708 adev->gfx.config.double_offchip_lds_buf = 2709 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2710 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2711 adev->gfx.cu_info.max_waves_per_simd = 2712 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2713 adev->gfx.cu_info.max_scratch_slots_per_cu = 2714 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2715 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2716 if (hdr->version_minor >= 1) { 2717 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2718 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2719 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2720 adev->gfx.config.num_sc_per_sh = 2721 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2722 adev->gfx.config.num_packer_per_sc = 2723 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2724 } 2725 2726 parse_soc_bounding_box: 2727 /* 2728 * soc bounding box info is not integrated in disocovery table, 2729 * we always need to parse it from gpu info 
firmware if needed. 2730 */ 2731 if (hdr->version_minor == 2) { 2732 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2733 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2734 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2735 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2736 } 2737 break; 2738 } 2739 default: 2740 dev_err(adev->dev, 2741 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2742 err = -EINVAL; 2743 goto out; 2744 } 2745 out: 2746 return err; 2747 } 2748 2749 static void amdgpu_uid_init(struct amdgpu_device *adev) 2750 { 2751 /* Initialize the UID for the device */ 2752 adev->uid_info = kzalloc(sizeof(struct amdgpu_uid), GFP_KERNEL); 2753 if (!adev->uid_info) { 2754 dev_warn(adev->dev, "Failed to allocate memory for UID\n"); 2755 return; 2756 } 2757 adev->uid_info->adev = adev; 2758 } 2759 2760 static void amdgpu_uid_fini(struct amdgpu_device *adev) 2761 { 2762 /* Free the UID memory */ 2763 kfree(adev->uid_info); 2764 adev->uid_info = NULL; 2765 } 2766 2767 /** 2768 * amdgpu_device_ip_early_init - run early init for hardware IPs 2769 * 2770 * @adev: amdgpu_device pointer 2771 * 2772 * Early initialization pass for hardware IPs. The hardware IPs that make 2773 * up each asic are discovered each IP's early_init callback is run. This 2774 * is the first stage in initializing the asic. 2775 * Returns 0 on success, negative error code on failure. 2776 */ 2777 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2778 { 2779 struct amdgpu_ip_block *ip_block; 2780 struct pci_dev *parent; 2781 bool total, skip_bios; 2782 uint32_t bios_flags; 2783 int i, r; 2784 2785 amdgpu_device_enable_virtual_display(adev); 2786 2787 if (amdgpu_sriov_vf(adev)) { 2788 r = amdgpu_virt_request_full_gpu(adev, true); 2789 if (r) 2790 return r; 2791 2792 r = amdgpu_virt_init_critical_region(adev); 2793 if (r) 2794 return r; 2795 } 2796 2797 switch (adev->asic_type) { 2798 #ifdef CONFIG_DRM_AMDGPU_SI 2799 case CHIP_VERDE: 2800 case CHIP_TAHITI: 2801 case CHIP_PITCAIRN: 2802 case CHIP_OLAND: 2803 case CHIP_HAINAN: 2804 adev->family = AMDGPU_FAMILY_SI; 2805 r = si_set_ip_blocks(adev); 2806 if (r) 2807 return r; 2808 break; 2809 #endif 2810 #ifdef CONFIG_DRM_AMDGPU_CIK 2811 case CHIP_BONAIRE: 2812 case CHIP_HAWAII: 2813 case CHIP_KAVERI: 2814 case CHIP_KABINI: 2815 case CHIP_MULLINS: 2816 if (adev->flags & AMD_IS_APU) 2817 adev->family = AMDGPU_FAMILY_KV; 2818 else 2819 adev->family = AMDGPU_FAMILY_CI; 2820 2821 r = cik_set_ip_blocks(adev); 2822 if (r) 2823 return r; 2824 break; 2825 #endif 2826 case CHIP_TOPAZ: 2827 case CHIP_TONGA: 2828 case CHIP_FIJI: 2829 case CHIP_POLARIS10: 2830 case CHIP_POLARIS11: 2831 case CHIP_POLARIS12: 2832 case CHIP_VEGAM: 2833 case CHIP_CARRIZO: 2834 case CHIP_STONEY: 2835 if (adev->flags & AMD_IS_APU) 2836 adev->family = AMDGPU_FAMILY_CZ; 2837 else 2838 adev->family = AMDGPU_FAMILY_VI; 2839 2840 r = vi_set_ip_blocks(adev); 2841 if (r) 2842 return r; 2843 break; 2844 default: 2845 r = amdgpu_discovery_set_ip_blocks(adev); 2846 if (r) 2847 return r; 2848 break; 2849 } 2850 2851 /* Check for IP version 9.4.3 with A0 hardware */ 2852 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) && 2853 !amdgpu_device_get_rev_id(adev)) { 2854 dev_err(adev->dev, "Unsupported A0 hardware\n"); 2855 return -ENODEV; /* device unsupported - no device error */ 2856 } 2857 2858 if (amdgpu_has_atpx() && 2859 (amdgpu_is_atpx_hybrid() || 2860 amdgpu_has_atpx_dgpu_power_cntl()) && 2861 ((adev->flags & AMD_IS_APU) == 
0) && 2862 !dev_is_removable(&adev->pdev->dev)) 2863 adev->flags |= AMD_IS_PX; 2864 2865 if (!(adev->flags & AMD_IS_APU)) { 2866 parent = pcie_find_root_port(adev->pdev); 2867 adev->has_pr3 = parent ? pci_pr3_present(parent) : false; 2868 } 2869 2870 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2871 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2872 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2873 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2874 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2875 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2876 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2877 2878 adev->virt.is_xgmi_node_migrate_enabled = false; 2879 if (amdgpu_sriov_vf(adev)) { 2880 adev->virt.is_xgmi_node_migrate_enabled = 2881 amdgpu_ip_version((adev), GC_HWIP, 0) == IP_VERSION(9, 4, 4); 2882 } 2883 2884 total = true; 2885 for (i = 0; i < adev->num_ip_blocks; i++) { 2886 ip_block = &adev->ip_blocks[i]; 2887 2888 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2889 dev_warn(adev->dev, "disabled ip block: %d <%s>\n", i, 2890 adev->ip_blocks[i].version->funcs->name); 2891 adev->ip_blocks[i].status.valid = false; 2892 } else if (ip_block->version->funcs->early_init) { 2893 r = ip_block->version->funcs->early_init(ip_block); 2894 if (r == -ENOENT) { 2895 adev->ip_blocks[i].status.valid = false; 2896 } else if (r) { 2897 dev_err(adev->dev, 2898 "early_init of IP block <%s> failed %d\n", 2899 adev->ip_blocks[i].version->funcs->name, 2900 r); 2901 total = false; 2902 } else { 2903 adev->ip_blocks[i].status.valid = true; 2904 } 2905 } else { 2906 adev->ip_blocks[i].status.valid = true; 2907 } 2908 /* get the vbios after the asic_funcs are set up */ 2909 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2910 r = amdgpu_device_parse_gpu_info_fw(adev); 2911 if (r) 2912 return r; 2913 2914 bios_flags = amdgpu_device_get_vbios_flags(adev); 2915 skip_bios = !!(bios_flags & AMDGPU_VBIOS_SKIP); 2916 /* Read BIOS */ 2917 if (!skip_bios) { 2918 bool optional = 2919 !!(bios_flags & AMDGPU_VBIOS_OPTIONAL); 2920 if (!amdgpu_get_bios(adev) && !optional) 2921 return -EINVAL; 2922 2923 if (optional && !adev->bios) 2924 dev_info( 2925 adev->dev, 2926 "VBIOS image optional, proceeding without VBIOS image"); 2927 2928 if (adev->bios) { 2929 r = amdgpu_atombios_init(adev); 2930 if (r) { 2931 dev_err(adev->dev, 2932 "amdgpu_atombios_init failed\n"); 2933 amdgpu_vf_error_put( 2934 adev, 2935 AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 2936 0, 0); 2937 return r; 2938 } 2939 } 2940 } 2941 2942 /*get pf2vf msg info at it's earliest time*/ 2943 if (amdgpu_sriov_vf(adev)) 2944 amdgpu_virt_init_data_exchange(adev); 2945 2946 } 2947 } 2948 if (!total) 2949 return -ENODEV; 2950 2951 if (adev->gmc.xgmi.supported) 2952 amdgpu_xgmi_early_init(adev); 2953 2954 if (amdgpu_is_multi_aid(adev)) 2955 amdgpu_uid_init(adev); 2956 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 2957 if (ip_block->status.valid != false) 2958 amdgpu_amdkfd_device_probe(adev); 2959 2960 adev->cg_flags &= amdgpu_cg_mask; 2961 adev->pg_flags &= amdgpu_pg_mask; 2962 2963 return 0; 2964 } 2965 2966 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2967 { 2968 int i, r; 2969 2970 for (i = 0; i < adev->num_ip_blocks; i++) { 2971 if (!adev->ip_blocks[i].status.sw) 2972 continue; 2973 if (adev->ip_blocks[i].status.hw) 2974 continue; 2975 if (!amdgpu_ip_member_of_hwini( 2976 adev, adev->ip_blocks[i].version->type)) 2977 continue; 2978 if 
(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2979 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2980 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2981 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2982 if (r) { 2983 dev_err(adev->dev, 2984 "hw_init of IP block <%s> failed %d\n", 2985 adev->ip_blocks[i].version->funcs->name, 2986 r); 2987 return r; 2988 } 2989 adev->ip_blocks[i].status.hw = true; 2990 } 2991 } 2992 2993 return 0; 2994 } 2995 2996 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2997 { 2998 int i, r; 2999 3000 for (i = 0; i < adev->num_ip_blocks; i++) { 3001 if (!adev->ip_blocks[i].status.sw) 3002 continue; 3003 if (adev->ip_blocks[i].status.hw) 3004 continue; 3005 if (!amdgpu_ip_member_of_hwini( 3006 adev, adev->ip_blocks[i].version->type)) 3007 continue; 3008 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3009 if (r) { 3010 dev_err(adev->dev, 3011 "hw_init of IP block <%s> failed %d\n", 3012 adev->ip_blocks[i].version->funcs->name, r); 3013 return r; 3014 } 3015 adev->ip_blocks[i].status.hw = true; 3016 } 3017 3018 return 0; 3019 } 3020 3021 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 3022 { 3023 int r = 0; 3024 int i; 3025 uint32_t smu_version; 3026 3027 if (adev->asic_type >= CHIP_VEGA10) { 3028 for (i = 0; i < adev->num_ip_blocks; i++) { 3029 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 3030 continue; 3031 3032 if (!amdgpu_ip_member_of_hwini(adev, 3033 AMD_IP_BLOCK_TYPE_PSP)) 3034 break; 3035 3036 if (!adev->ip_blocks[i].status.sw) 3037 continue; 3038 3039 /* no need to do the fw loading again if already done*/ 3040 if (adev->ip_blocks[i].status.hw == true) 3041 break; 3042 3043 if (amdgpu_in_reset(adev) || adev->in_suspend) { 3044 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3045 if (r) 3046 return r; 3047 } else { 3048 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3049 if (r) { 3050 dev_err(adev->dev, 3051 "hw_init of IP block <%s> failed %d\n", 3052 adev->ip_blocks[i] 3053 .version->funcs->name, 3054 r); 3055 return r; 3056 } 3057 adev->ip_blocks[i].status.hw = true; 3058 } 3059 break; 3060 } 3061 } 3062 3063 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 3064 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 3065 3066 return r; 3067 } 3068 3069 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 3070 { 3071 struct drm_sched_init_args args = { 3072 .ops = &amdgpu_sched_ops, 3073 .num_rqs = DRM_SCHED_PRIORITY_COUNT, 3074 .timeout_wq = adev->reset_domain->wq, 3075 .dev = adev->dev, 3076 }; 3077 long timeout; 3078 int r, i; 3079 3080 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 3081 struct amdgpu_ring *ring = adev->rings[i]; 3082 3083 /* No need to setup the GPU scheduler for rings that don't need it */ 3084 if (!ring || ring->no_scheduler) 3085 continue; 3086 3087 switch (ring->funcs->type) { 3088 case AMDGPU_RING_TYPE_GFX: 3089 timeout = adev->gfx_timeout; 3090 break; 3091 case AMDGPU_RING_TYPE_COMPUTE: 3092 timeout = adev->compute_timeout; 3093 break; 3094 case AMDGPU_RING_TYPE_SDMA: 3095 timeout = adev->sdma_timeout; 3096 break; 3097 default: 3098 timeout = adev->video_timeout; 3099 break; 3100 } 3101 3102 args.timeout = timeout; 3103 args.credit_limit = ring->num_hw_submission; 3104 args.score = ring->sched_score; 3105 args.name = ring->name; 3106 3107 r = drm_sched_init(&ring->sched, &args); 3108 if (r) { 3109 
dev_err(adev->dev, 3110 "Failed to create scheduler on ring %s.\n", 3111 ring->name); 3112 return r; 3113 } 3114 r = amdgpu_uvd_entity_init(adev, ring); 3115 if (r) { 3116 dev_err(adev->dev, 3117 "Failed to create UVD scheduling entity on ring %s.\n", 3118 ring->name); 3119 return r; 3120 } 3121 r = amdgpu_vce_entity_init(adev, ring); 3122 if (r) { 3123 dev_err(adev->dev, 3124 "Failed to create VCE scheduling entity on ring %s.\n", 3125 ring->name); 3126 return r; 3127 } 3128 } 3129 3130 if (adev->xcp_mgr) 3131 amdgpu_xcp_update_partition_sched_list(adev); 3132 3133 return 0; 3134 } 3135 3136 3137 /** 3138 * amdgpu_device_ip_init - run init for hardware IPs 3139 * 3140 * @adev: amdgpu_device pointer 3141 * 3142 * Main initialization pass for hardware IPs. The list of all the hardware 3143 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 3144 * are run. sw_init initializes the software state associated with each IP 3145 * and hw_init initializes the hardware associated with each IP. 3146 * Returns 0 on success, negative error code on failure. 3147 */ 3148 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 3149 { 3150 bool init_badpage; 3151 int i, r; 3152 3153 r = amdgpu_ras_init(adev); 3154 if (r) 3155 return r; 3156 3157 for (i = 0; i < adev->num_ip_blocks; i++) { 3158 if (!adev->ip_blocks[i].status.valid) 3159 continue; 3160 if (adev->ip_blocks[i].version->funcs->sw_init) { 3161 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]); 3162 if (r) { 3163 dev_err(adev->dev, 3164 "sw_init of IP block <%s> failed %d\n", 3165 adev->ip_blocks[i].version->funcs->name, 3166 r); 3167 goto init_failed; 3168 } 3169 } 3170 adev->ip_blocks[i].status.sw = true; 3171 3172 if (!amdgpu_ip_member_of_hwini( 3173 adev, adev->ip_blocks[i].version->type)) 3174 continue; 3175 3176 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 3177 /* need to do common hw init early so everything is set up for gmc */ 3178 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3179 if (r) { 3180 dev_err(adev->dev, "hw_init %d failed %d\n", i, 3181 r); 3182 goto init_failed; 3183 } 3184 adev->ip_blocks[i].status.hw = true; 3185 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3186 /* need to do gmc hw init early so we can allocate gpu mem */ 3187 /* Try to reserve bad pages early */ 3188 if (amdgpu_sriov_vf(adev)) 3189 amdgpu_virt_exchange_data(adev); 3190 3191 r = amdgpu_device_mem_scratch_init(adev); 3192 if (r) { 3193 dev_err(adev->dev, 3194 "amdgpu_mem_scratch_init failed %d\n", 3195 r); 3196 goto init_failed; 3197 } 3198 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3199 if (r) { 3200 dev_err(adev->dev, "hw_init %d failed %d\n", i, 3201 r); 3202 goto init_failed; 3203 } 3204 r = amdgpu_device_wb_init(adev); 3205 if (r) { 3206 dev_err(adev->dev, 3207 "amdgpu_device_wb_init failed %d\n", r); 3208 goto init_failed; 3209 } 3210 adev->ip_blocks[i].status.hw = true; 3211 3212 /* right after GMC hw init, we create CSA */ 3213 if (adev->gfx.mcbp) { 3214 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 3215 AMDGPU_GEM_DOMAIN_VRAM | 3216 AMDGPU_GEM_DOMAIN_GTT, 3217 AMDGPU_CSA_SIZE); 3218 if (r) { 3219 dev_err(adev->dev, 3220 "allocate CSA failed %d\n", r); 3221 goto init_failed; 3222 } 3223 } 3224 3225 r = amdgpu_seq64_init(adev); 3226 if (r) { 3227 dev_err(adev->dev, "allocate seq64 failed %d\n", 3228 r); 3229 goto init_failed; 3230 } 3231 } 3232 } 3233 3234 if (amdgpu_sriov_vf(adev)) 3235 
amdgpu_virt_init_data_exchange(adev); 3236 3237 r = amdgpu_ib_pool_init(adev); 3238 if (r) { 3239 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 3240 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 3241 goto init_failed; 3242 } 3243 3244 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 3245 if (r) 3246 goto init_failed; 3247 3248 r = amdgpu_device_ip_hw_init_phase1(adev); 3249 if (r) 3250 goto init_failed; 3251 3252 r = amdgpu_device_fw_loading(adev); 3253 if (r) 3254 goto init_failed; 3255 3256 r = amdgpu_device_ip_hw_init_phase2(adev); 3257 if (r) 3258 goto init_failed; 3259 3260 /* 3261 * retired pages will be loaded from eeprom and reserved here, 3262 * it should be called after amdgpu_device_ip_hw_init_phase2 since 3263 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 3264 * for I2C communication which only true at this point. 3265 * 3266 * amdgpu_ras_recovery_init may fail, but the upper only cares the 3267 * failure from bad gpu situation and stop amdgpu init process 3268 * accordingly. For other failed cases, it will still release all 3269 * the resource and print error message, rather than returning one 3270 * negative value to upper level. 3271 * 3272 * Note: theoretically, this should be called before all vram allocations 3273 * to protect retired page from abusing 3274 */ 3275 init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 3276 r = amdgpu_ras_recovery_init(adev, init_badpage); 3277 if (r) 3278 goto init_failed; 3279 3280 /** 3281 * In case of XGMI grab extra reference for reset domain for this device 3282 */ 3283 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3284 if (amdgpu_xgmi_add_device(adev) == 0) { 3285 if (!amdgpu_sriov_vf(adev)) { 3286 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3287 3288 if (WARN_ON(!hive)) { 3289 r = -ENOENT; 3290 goto init_failed; 3291 } 3292 3293 if (!hive->reset_domain || 3294 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 3295 r = -ENOENT; 3296 amdgpu_put_xgmi_hive(hive); 3297 goto init_failed; 3298 } 3299 3300 /* Drop the early temporary reset domain we created for device */ 3301 amdgpu_reset_put_reset_domain(adev->reset_domain); 3302 adev->reset_domain = hive->reset_domain; 3303 amdgpu_put_xgmi_hive(hive); 3304 } 3305 } 3306 } 3307 3308 r = amdgpu_device_init_schedulers(adev); 3309 if (r) 3310 goto init_failed; 3311 3312 if (adev->mman.buffer_funcs_ring->sched.ready) 3313 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3314 3315 /* Don't init kfd if whole hive need to be reset during init */ 3316 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 3317 kgd2kfd_init_zone_device(adev); 3318 amdgpu_amdkfd_device_init(adev); 3319 } 3320 3321 amdgpu_fru_get_product_info(adev); 3322 3323 if (!amdgpu_sriov_vf(adev) || amdgpu_sriov_ras_cper_en(adev)) 3324 r = amdgpu_cper_init(adev); 3325 3326 init_failed: 3327 3328 return r; 3329 } 3330 3331 /** 3332 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 3333 * 3334 * @adev: amdgpu_device pointer 3335 * 3336 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 3337 * this function before a GPU reset. If the value is retained after a 3338 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 
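 *
 * A minimal sketch of the intended pairing with the check below
 * (illustrative ordering only):
 *
 *   amdgpu_device_fill_reset_magic(adev);
 *   ...perform the GPU reset...
 *   vram_lost = amdgpu_device_check_vram_lost(adev);
 *
 * If vram_lost reads true, the driver is expected to restore VRAM
 * contents before resuming work.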
 */
static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
{
	memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
}

/**
 * amdgpu_device_check_vram_lost - check if vram is valid
 *
 * @adev: amdgpu_device pointer
 *
 * Checks the reset magic value written to the gart pointer in VRAM.
 * The driver calls this after a GPU reset to see if the contents of
 * VRAM are lost or not.
 * Returns true if vram is lost, false if not.
 */
static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
{
	if (memcmp(adev->gart.ptr, adev->reset_magic,
		   AMDGPU_RESET_MAGIC_NUM))
		return true;

	if (!amdgpu_in_reset(adev))
		return false;

	/*
	 * For all ASICs with baco/mode1 reset, the VRAM is
	 * always assumed to be lost.
	 */
	switch (amdgpu_asic_reset_method(adev)) {
	case AMD_RESET_METHOD_LEGACY:
	case AMD_RESET_METHOD_LINK:
	case AMD_RESET_METHOD_BACO:
	case AMD_RESET_METHOD_MODE1:
		return true;
	default:
		return false;
	}
}

/**
 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
 *
 * @adev: amdgpu_device pointer
 * @state: clockgating state (gate or ungate)
 *
 * The list of all the hardware IPs that make up the asic is walked and the
 * set_clockgating_state callbacks are run.
 * In the late initialization pass this enables clockgating for the hardware
 * IPs; on fini or suspend it disables clockgating again.
 * Returns 0 on success, negative error code on failure.
 */
int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
			       enum amd_clockgating_state state)
{
	int i, j, r;

	if (amdgpu_emu_mode == 1)
		return 0;

	for (j = 0; j < adev->num_ip_blocks; j++) {
		i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
		if (!adev->ip_blocks[i].status.late_initialized)
			continue;
		/* skip CG for GFX, SDMA on S0ix */
		if (adev->in_s0ix &&
		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
			continue;
		/* skip CG for VCE/UVD, it's handled specially */
		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
		    adev->ip_blocks[i].version->funcs->set_clockgating_state) {
			/* enable clockgating to save power */
			r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i],
										     state);
			if (r) {
				dev_err(adev->dev,
					"set_clockgating_state(gate) of IP block <%s> failed %d\n",
					adev->ip_blocks[i].version->funcs->name,
					r);
				return r;
			}
		}
	}

	return 0;
}

int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
			       enum amd_powergating_state state)
{
	int i, j, r;

	if (amdgpu_emu_mode == 1)
		return 0;

	for (j = 0; j < adev->num_ip_blocks; j++) {
		i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
		if (!adev->ip_blocks[i].status.late_initialized)
			continue;
		/* skip PG for GFX, SDMA on S0ix */
		if (adev->in_s0ix &&
		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
			continue;
		/* skip PG for VCE/UVD, it's handled specially */
		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
		    adev->ip_blocks[i].version->funcs->set_powergating_state) {
			/* enable powergating to save power */
			r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i],
										      state);
			if (r) {
				dev_err(adev->dev,
					"set_powergating_state(gate) of IP block <%s> failed %d\n",
					adev->ip_blocks[i].version->funcs->name,
					r);
				return r;
			}
		}
	}
	return 0;
}

static int amdgpu_device_enable_mgpu_fan_boost(void)
{
	struct amdgpu_gpu_instance *gpu_ins;
	struct amdgpu_device *adev;
	int i, ret = 0;

	mutex_lock(&mgpu_info.mutex);

	/*
	 * MGPU fan boost feature should be enabled
	 * only when there are two or more dGPUs in
	 * the system
	 */
	if (mgpu_info.num_dgpu < 2)
		goto out;

	for (i = 0; i < mgpu_info.num_dgpu; i++) {
		gpu_ins = &(mgpu_info.gpu_ins[i]);
		adev = gpu_ins->adev;
		if (!(adev->flags & AMD_IS_APU || amdgpu_sriov_multi_vf_mode(adev)) &&
		    !gpu_ins->mgpu_fan_enabled) {
			ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
			if (ret)
				break;

			gpu_ins->mgpu_fan_enabled = 1;
		}
	}

out:
	mutex_unlock(&mgpu_info.mutex);

	return ret;
}

/**
 * amdgpu_device_ip_late_init - run late init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Late initialization pass for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the late_init callbacks are run.
 * late_init covers any special initialization that an IP requires
 * after all of the IP blocks have been initialized or something that needs
 * to happen late in the init process.
 * Returns 0 on success, negative error code on failure.
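 *
 * A small worked example of the gating order used by the helpers above
 * (illustrative): with three IP blocks [A, B, C], gating walks them in
 * index order A, B, C, while ungating walks them in reverse, C, B, A,
 * because the loop index is computed as
 *
 *   i = (state == gate) ? j : adev->num_ip_blocks - j - 1;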
3515 */ 3516 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 3517 { 3518 struct amdgpu_gpu_instance *gpu_instance; 3519 int i = 0, r; 3520 3521 for (i = 0; i < adev->num_ip_blocks; i++) { 3522 if (!adev->ip_blocks[i].status.hw) 3523 continue; 3524 if (adev->ip_blocks[i].version->funcs->late_init) { 3525 r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]); 3526 if (r) { 3527 dev_err(adev->dev, 3528 "late_init of IP block <%s> failed %d\n", 3529 adev->ip_blocks[i].version->funcs->name, 3530 r); 3531 return r; 3532 } 3533 } 3534 adev->ip_blocks[i].status.late_initialized = true; 3535 } 3536 3537 r = amdgpu_ras_late_init(adev); 3538 if (r) { 3539 dev_err(adev->dev, "amdgpu_ras_late_init failed %d", r); 3540 return r; 3541 } 3542 3543 if (!amdgpu_reset_in_recovery(adev)) 3544 amdgpu_ras_set_error_query_ready(adev, true); 3545 3546 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3547 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3548 3549 amdgpu_device_fill_reset_magic(adev); 3550 3551 r = amdgpu_device_enable_mgpu_fan_boost(); 3552 if (r) 3553 dev_err(adev->dev, "enable mgpu fan boost failed (%d).\n", r); 3554 3555 /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */ 3556 if (amdgpu_passthrough(adev) && 3557 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 3558 adev->asic_type == CHIP_ALDEBARAN)) 3559 amdgpu_dpm_handle_passthrough_sbr(adev, true); 3560 3561 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3562 mutex_lock(&mgpu_info.mutex); 3563 3564 /* 3565 * Reset device p-state to low as this was booted with high. 3566 * 3567 * This should be performed only after all devices from the same 3568 * hive get initialized. 3569 * 3570 * However, it's unknown how many device in the hive in advance. 3571 * As this is counted one by one during devices initializations. 3572 * 3573 * So, we wait for all XGMI interlinked devices initialized. 3574 * This may bring some delays as those devices may come from 3575 * different hives. But that should be OK. 
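 *
 * As a concrete illustration: on a 4-GPU XGMI hive, num_physical_nodes
 * is 4, so the p-state is only lowered once the fourth interlinked
 * device has finished its init and mgpu_info.num_dgpu has reached 4.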
3576 */ 3577 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3578 for (i = 0; i < mgpu_info.num_gpu; i++) { 3579 gpu_instance = &(mgpu_info.gpu_ins[i]); 3580 if (gpu_instance->adev->flags & AMD_IS_APU) 3581 continue; 3582 3583 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3584 AMDGPU_XGMI_PSTATE_MIN); 3585 if (r) { 3586 dev_err(adev->dev, 3587 "pstate setting failed (%d).\n", 3588 r); 3589 break; 3590 } 3591 } 3592 } 3593 3594 mutex_unlock(&mgpu_info.mutex); 3595 } 3596 3597 return 0; 3598 } 3599 3600 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block) 3601 { 3602 struct amdgpu_device *adev = ip_block->adev; 3603 int r; 3604 3605 if (!ip_block->version->funcs->hw_fini) { 3606 dev_err(adev->dev, "hw_fini of IP block <%s> not defined\n", 3607 ip_block->version->funcs->name); 3608 } else { 3609 r = ip_block->version->funcs->hw_fini(ip_block); 3610 /* XXX handle errors */ 3611 if (r) { 3612 dev_dbg(adev->dev, 3613 "hw_fini of IP block <%s> failed %d\n", 3614 ip_block->version->funcs->name, r); 3615 } 3616 } 3617 3618 ip_block->status.hw = false; 3619 } 3620 3621 /** 3622 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3623 * 3624 * @adev: amdgpu_device pointer 3625 * 3626 * For ASICs need to disable SMC first 3627 */ 3628 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3629 { 3630 int i; 3631 3632 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3633 return; 3634 3635 for (i = 0; i < adev->num_ip_blocks; i++) { 3636 if (!adev->ip_blocks[i].status.hw) 3637 continue; 3638 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3639 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3640 break; 3641 } 3642 } 3643 } 3644 3645 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3646 { 3647 int i, r; 3648 3649 for (i = 0; i < adev->num_ip_blocks; i++) { 3650 if (!adev->ip_blocks[i].version->funcs->early_fini) 3651 continue; 3652 3653 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]); 3654 if (r) { 3655 dev_dbg(adev->dev, 3656 "early_fini of IP block <%s> failed %d\n", 3657 adev->ip_blocks[i].version->funcs->name, r); 3658 } 3659 } 3660 3661 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3662 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3663 3664 amdgpu_amdkfd_suspend(adev, true); 3665 amdgpu_userq_suspend(adev); 3666 3667 /* Workaround for ASICs need to disable SMC first */ 3668 amdgpu_device_smu_fini_early(adev); 3669 3670 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3671 if (!adev->ip_blocks[i].status.hw) 3672 continue; 3673 3674 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3675 } 3676 3677 if (amdgpu_sriov_vf(adev)) { 3678 if (amdgpu_virt_release_full_gpu(adev, false)) 3679 dev_err(adev->dev, 3680 "failed to release exclusive mode on fini\n"); 3681 } 3682 3683 return 0; 3684 } 3685 3686 /** 3687 * amdgpu_device_ip_fini - run fini for hardware IPs 3688 * 3689 * @adev: amdgpu_device pointer 3690 * 3691 * Main teardown pass for hardware IPs. The list of all the hardware 3692 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3693 * are run. hw_fini tears down the hardware associated with each IP 3694 * and sw_fini tears down any software state associated with each IP. 3695 * Returns 0 on success, negative error code on failure. 
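 *
 * Illustrative note on ordering: teardown walks the IP list in reverse,
 * so for blocks initialized as [COMMON, GMC, GFX] the sw_fini calls run
 * as GFX, GMC, COMMON, mirroring the loop in the function body below:
 *
 *   for (i = adev->num_ip_blocks - 1; i >= 0; i--)
 *       ...sw_fini the block...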
3696 */ 3697 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3698 { 3699 int i, r; 3700 3701 amdgpu_cper_fini(adev); 3702 3703 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3704 amdgpu_virt_release_ras_err_handler_data(adev); 3705 3706 if (adev->gmc.xgmi.num_physical_nodes > 1) 3707 amdgpu_xgmi_remove_device(adev); 3708 3709 amdgpu_amdkfd_device_fini_sw(adev); 3710 3711 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3712 if (!adev->ip_blocks[i].status.sw) 3713 continue; 3714 3715 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3716 amdgpu_ucode_free_bo(adev); 3717 amdgpu_free_static_csa(&adev->virt.csa_obj); 3718 amdgpu_device_wb_fini(adev); 3719 amdgpu_device_mem_scratch_fini(adev); 3720 amdgpu_ib_pool_fini(adev); 3721 amdgpu_seq64_fini(adev); 3722 amdgpu_doorbell_fini(adev); 3723 } 3724 if (adev->ip_blocks[i].version->funcs->sw_fini) { 3725 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]); 3726 /* XXX handle errors */ 3727 if (r) { 3728 dev_dbg(adev->dev, 3729 "sw_fini of IP block <%s> failed %d\n", 3730 adev->ip_blocks[i].version->funcs->name, 3731 r); 3732 } 3733 } 3734 adev->ip_blocks[i].status.sw = false; 3735 adev->ip_blocks[i].status.valid = false; 3736 } 3737 3738 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3739 if (!adev->ip_blocks[i].status.late_initialized) 3740 continue; 3741 if (adev->ip_blocks[i].version->funcs->late_fini) 3742 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]); 3743 adev->ip_blocks[i].status.late_initialized = false; 3744 } 3745 3746 amdgpu_ras_fini(adev); 3747 amdgpu_uid_fini(adev); 3748 3749 return 0; 3750 } 3751 3752 /** 3753 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3754 * 3755 * @work: work_struct. 3756 */ 3757 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3758 { 3759 struct amdgpu_device *adev = 3760 container_of(work, struct amdgpu_device, delayed_init_work.work); 3761 int r; 3762 3763 r = amdgpu_ib_ring_tests(adev); 3764 if (r) 3765 dev_err(adev->dev, "ib ring test failed (%d).\n", r); 3766 } 3767 3768 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3769 { 3770 struct amdgpu_device *adev = 3771 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3772 3773 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3774 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3775 3776 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true, 0)) 3777 adev->gfx.gfx_off_state = true; 3778 } 3779 3780 /** 3781 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3782 * 3783 * @adev: amdgpu_device pointer 3784 * 3785 * Main suspend function for hardware IPs. The list of all the hardware 3786 * IPs that make up the asic is walked, clockgating is disabled and the 3787 * suspend callbacks are run. suspend puts the hardware and software state 3788 * in each IP into a state suitable for suspend. 3789 * Returns 0 on success, negative error code on failure. 3790 */ 3791 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3792 { 3793 int i, r, rec; 3794 3795 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3796 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3797 3798 /* 3799 * Per PMFW team's suggestion, driver needs to handle gfxoff 3800 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3801 * scenario. Add the missing df cstate disablement here. 
3802 */ 3803 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3804 dev_warn(adev->dev, "Failed to disallow df cstate"); 3805 3806 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3807 if (!adev->ip_blocks[i].status.valid) 3808 continue; 3809 3810 /* displays are handled separately */ 3811 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3812 continue; 3813 3814 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3815 if (r) 3816 goto unwind; 3817 } 3818 3819 return 0; 3820 unwind: 3821 rec = amdgpu_device_ip_resume_phase3(adev); 3822 if (rec) 3823 dev_err(adev->dev, 3824 "amdgpu_device_ip_resume_phase3 failed during unwind: %d\n", 3825 rec); 3826 3827 amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW); 3828 3829 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3830 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3831 3832 return r; 3833 } 3834 3835 /** 3836 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3837 * 3838 * @adev: amdgpu_device pointer 3839 * 3840 * Main suspend function for hardware IPs. The list of all the hardware 3841 * IPs that make up the asic is walked, clockgating is disabled and the 3842 * suspend callbacks are run. suspend puts the hardware and software state 3843 * in each IP into a state suitable for suspend. 3844 * Returns 0 on success, negative error code on failure. 3845 */ 3846 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3847 { 3848 int i, r, rec; 3849 3850 if (adev->in_s0ix) 3851 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3852 3853 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3854 if (!adev->ip_blocks[i].status.valid) 3855 continue; 3856 /* displays are handled in phase1 */ 3857 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3858 continue; 3859 /* PSP lost connection when err_event_athub occurs */ 3860 if (amdgpu_ras_intr_triggered() && 3861 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3862 adev->ip_blocks[i].status.hw = false; 3863 continue; 3864 } 3865 3866 /* skip unnecessary suspend if we do not initialize them yet */ 3867 if (!amdgpu_ip_member_of_hwini( 3868 adev, adev->ip_blocks[i].version->type)) 3869 continue; 3870 3871 /* Since we skip suspend for S0i3, we need to cancel the delayed 3872 * idle work here as the suspend callback never gets called. 3873 */ 3874 if (adev->in_s0ix && 3875 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX && 3876 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 0, 0)) 3877 cancel_delayed_work_sync(&adev->gfx.idle_work); 3878 /* skip suspend of gfx/mes and psp for S0ix 3879 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3880 * like at runtime. PSP is also part of the always on hardware 3881 * so no need to suspend it. 3882 */ 3883 if (adev->in_s0ix && 3884 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3885 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3886 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3887 continue; 3888 3889 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3890 if (adev->in_s0ix && 3891 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3892 IP_VERSION(5, 0, 0)) && 3893 (adev->ip_blocks[i].version->type == 3894 AMD_IP_BLOCK_TYPE_SDMA)) 3895 continue; 3896 3897 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 
3898 * These are in TMR, hence they are expected to be reused by PSP-TOS to reload 3899 * from this location and RLC Autoload automatically also gets loaded 3900 * from here based on PMFW -> PSP message during re-init sequence. 3901 * Therefore, the psp suspend & resume should be skipped to avoid destroying 3902 * the TMR and reloading the FWs again for IMU enabled APU ASICs. 3903 */ 3904 if (amdgpu_in_reset(adev) && 3905 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3906 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3907 continue; 3908 3909 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3910 if (r) 3911 goto unwind; 3912 3913 /* handle putting the SMC in the appropriate state */ 3914 if (!amdgpu_sriov_vf(adev)) { 3915 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3916 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3917 if (r) { 3918 dev_err(adev->dev, 3919 "SMC failed to set mp1 state %d, %d\n", 3920 adev->mp1_state, r); 3921 goto unwind; 3922 } 3923 } 3924 } 3925 } 3926 3927 return 0; 3928 unwind: 3929 /* suspend phase 2 = resume phase 1 + resume phase 2 */ 3930 rec = amdgpu_device_ip_resume_phase1(adev); 3931 if (rec) { 3932 dev_err(adev->dev, 3933 "amdgpu_device_ip_resume_phase1 failed during unwind: %d\n", 3934 rec); 3935 return r; 3936 } 3937 3938 rec = amdgpu_device_fw_loading(adev); 3939 if (rec) { 3940 dev_err(adev->dev, 3941 "amdgpu_device_fw_loading failed during unwind: %d\n", 3942 rec); 3943 return r; 3944 } 3945 3946 rec = amdgpu_device_ip_resume_phase2(adev); 3947 if (rec) { 3948 dev_err(adev->dev, 3949 "amdgpu_device_ip_resume_phase2 failed during unwind: %d\n", 3950 rec); 3951 return r; 3952 } 3953 3954 return r; 3955 } 3956 3957 /** 3958 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3959 * 3960 * @adev: amdgpu_device pointer 3961 * 3962 * Main suspend function for hardware IPs. The list of all the hardware 3963 * IPs that make up the asic is walked, clockgating is disabled and the 3964 * suspend callbacks are run. suspend puts the hardware and software state 3965 * in each IP into a state suitable for suspend. 3966 * Returns 0 on success, negative error code on failure.
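 * Phase 1 suspends the display (DCE) blocks and phase 2 suspends everything else; under SR-IOV, full GPU access is requested before and released after both phases.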
3967 */ 3968 static int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3969 { 3970 int r; 3971 3972 if (amdgpu_sriov_vf(adev)) { 3973 amdgpu_virt_fini_data_exchange(adev); 3974 amdgpu_virt_request_full_gpu(adev, false); 3975 } 3976 3977 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3978 3979 r = amdgpu_device_ip_suspend_phase1(adev); 3980 if (r) 3981 return r; 3982 r = amdgpu_device_ip_suspend_phase2(adev); 3983 3984 if (amdgpu_sriov_vf(adev)) 3985 amdgpu_virt_release_full_gpu(adev, false); 3986 3987 return r; 3988 } 3989 3990 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3991 { 3992 int i, r; 3993 3994 static enum amd_ip_block_type ip_order[] = { 3995 AMD_IP_BLOCK_TYPE_COMMON, 3996 AMD_IP_BLOCK_TYPE_GMC, 3997 AMD_IP_BLOCK_TYPE_PSP, 3998 AMD_IP_BLOCK_TYPE_IH, 3999 }; 4000 4001 for (i = 0; i < adev->num_ip_blocks; i++) { 4002 int j; 4003 struct amdgpu_ip_block *block; 4004 4005 block = &adev->ip_blocks[i]; 4006 block->status.hw = false; 4007 4008 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 4009 4010 if (block->version->type != ip_order[j] || 4011 !block->status.valid) 4012 continue; 4013 4014 r = block->version->funcs->hw_init(&adev->ip_blocks[i]); 4015 if (r) { 4016 dev_err(adev->dev, "RE-INIT-early: %s failed\n", 4017 block->version->funcs->name); 4018 return r; 4019 } 4020 block->status.hw = true; 4021 } 4022 } 4023 4024 return 0; 4025 } 4026 4027 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 4028 { 4029 struct amdgpu_ip_block *block; 4030 int i, r = 0; 4031 4032 static enum amd_ip_block_type ip_order[] = { 4033 AMD_IP_BLOCK_TYPE_SMC, 4034 AMD_IP_BLOCK_TYPE_DCE, 4035 AMD_IP_BLOCK_TYPE_GFX, 4036 AMD_IP_BLOCK_TYPE_SDMA, 4037 AMD_IP_BLOCK_TYPE_MES, 4038 AMD_IP_BLOCK_TYPE_UVD, 4039 AMD_IP_BLOCK_TYPE_VCE, 4040 AMD_IP_BLOCK_TYPE_VCN, 4041 AMD_IP_BLOCK_TYPE_JPEG 4042 }; 4043 4044 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 4045 block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]); 4046 4047 if (!block) 4048 continue; 4049 4050 if (block->status.valid && !block->status.hw) { 4051 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) { 4052 r = amdgpu_ip_block_resume(block); 4053 } else { 4054 r = block->version->funcs->hw_init(block); 4055 } 4056 4057 if (r) { 4058 dev_err(adev->dev, "RE-INIT-late: %s failed\n", 4059 block->version->funcs->name); 4060 break; 4061 } 4062 block->status.hw = true; 4063 } 4064 } 4065 4066 return r; 4067 } 4068 4069 /** 4070 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 4071 * 4072 * @adev: amdgpu_device pointer 4073 * 4074 * First resume function for hardware IPs. The list of all the hardware 4075 * IPs that make up the asic is walked and the resume callbacks are run for 4076 * COMMON, GMC, and IH. resume puts the hardware into a functional state 4077 * after a suspend and updates the software state as necessary. This 4078 * function is also used for restoring the GPU after a GPU reset. 4079 * Returns 0 on success, negative error code on failure. 
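 * Under SR-IOV the PSP block is also resumed in this phase.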
4080 */ 4081 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 4082 { 4083 int i, r; 4084 4085 for (i = 0; i < adev->num_ip_blocks; i++) { 4086 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 4087 continue; 4088 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 4089 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 4090 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 4091 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 4092 4093 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 4094 if (r) 4095 return r; 4096 } 4097 } 4098 4099 return 0; 4100 } 4101 4102 /** 4103 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 4104 * 4105 * @adev: amdgpu_device pointer 4106 * 4107 * Second resume function for hardware IPs. The list of all the hardware 4108 * IPs that make up the asic is walked and the resume callbacks are run for 4109 * all blocks except COMMON, GMC, IH, DCE, and PSP. resume puts the hardware into a 4110 * functional state after a suspend and updates the software state as 4111 * necessary. This function is also used for restoring the GPU after a GPU 4112 * reset. 4113 * Returns 0 on success, negative error code on failure. 4114 */ 4115 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 4116 { 4117 int i, r; 4118 4119 for (i = 0; i < adev->num_ip_blocks; i++) { 4120 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 4121 continue; 4122 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 4123 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 4124 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 4125 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE || 4126 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 4127 continue; 4128 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 4129 if (r) 4130 return r; 4131 } 4132 4133 return 0; 4134 } 4135 4136 /** 4137 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs 4138 * 4139 * @adev: amdgpu_device pointer 4140 * 4141 * Third resume function for hardware IPs. The list of all the hardware 4142 * IPs that make up the asic is walked and the resume callbacks are run for 4143 * the DCE (display) blocks. resume puts the hardware into a functional state after a suspend 4144 * and updates the software state as necessary. This function is also used 4145 * for restoring the GPU after a GPU reset. 4146 * 4147 * Returns 0 on success, negative error code on failure. 4148 */ 4149 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev) 4150 { 4151 int i, r; 4152 4153 for (i = 0; i < adev->num_ip_blocks; i++) { 4154 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 4155 continue; 4156 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) { 4157 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 4158 if (r) 4159 return r; 4160 } 4161 } 4162 4163 return 0; 4164 } 4165 4166 /** 4167 * amdgpu_device_ip_resume - run resume for hardware IPs 4168 * 4169 * @adev: amdgpu_device pointer 4170 * 4171 * Main resume function for hardware IPs. The hardware IPs 4172 * are split into multiple resume functions because they are 4173 * also used in recovering from a GPU reset and some additional 4174 * steps need to be taken between them. In this case (S3/S4) they are 4175 * run sequentially. 4176 * Returns 0 on success, negative error code on failure.
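 * The sequence below is: resume phase 1 (COMMON/GMC/IH), firmware loading, resume phase 2, fence driver hw init, then resume phase 3 (DCE).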
4177 */ 4178 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 4179 { 4180 int r; 4181 4182 r = amdgpu_device_ip_resume_phase1(adev); 4183 if (r) 4184 return r; 4185 4186 r = amdgpu_device_fw_loading(adev); 4187 if (r) 4188 return r; 4189 4190 r = amdgpu_device_ip_resume_phase2(adev); 4191 4192 if (adev->mman.buffer_funcs_ring->sched.ready) 4193 amdgpu_ttm_set_buffer_funcs_status(adev, true); 4194 4195 if (r) 4196 return r; 4197 4198 amdgpu_fence_driver_hw_init(adev); 4199 4200 r = amdgpu_device_ip_resume_phase3(adev); 4201 4202 return r; 4203 } 4204 4205 /** 4206 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 4207 * 4208 * @adev: amdgpu_device pointer 4209 * 4210 * Query the VBIOS data tables to determine if the board supports SR-IOV. 4211 */ 4212 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 4213 { 4214 if (amdgpu_sriov_vf(adev)) { 4215 if (adev->is_atom_fw) { 4216 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 4217 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 4218 } else { 4219 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 4220 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 4221 } 4222 4223 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 4224 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 4225 } 4226 } 4227 4228 /** 4229 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 4230 * 4231 * @pdev: pci device context 4232 * @asic_type: AMD asic type 4233 * 4234 * Check if there is DC (new modesetting infrastructure) support for an asic. 4235 * Returns true if DC has support, false if not. 4236 */ 4237 bool amdgpu_device_asic_has_dc_support(struct pci_dev *pdev, 4238 enum amd_asic_type asic_type) 4239 { 4240 switch (asic_type) { 4241 #ifdef CONFIG_DRM_AMDGPU_SI 4242 case CHIP_HAINAN: 4243 #endif 4244 case CHIP_TOPAZ: 4245 /* chips with no display hardware */ 4246 return false; 4247 #if defined(CONFIG_DRM_AMD_DC) 4248 case CHIP_TAHITI: 4249 case CHIP_PITCAIRN: 4250 case CHIP_VERDE: 4251 case CHIP_OLAND: 4252 return amdgpu_dc != 0 && IS_ENABLED(CONFIG_DRM_AMD_DC_SI); 4253 case CHIP_KAVERI: 4254 case CHIP_KABINI: 4255 case CHIP_MULLINS: 4256 /* 4257 * We have systems in the wild with these ASICs that require 4258 * TRAVIS and NUTMEG support which is not supported with DC. 4259 * 4260 * Fall back to the non-DC driver here by default so as not to 4261 * cause regressions.
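 * DC can still be turned on for these parts by explicitly requesting it (amdgpu_dc > 0 below).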
4262 */ 4263 return amdgpu_dc > 0; 4264 default: 4265 return amdgpu_dc != 0; 4266 #else 4267 default: 4268 if (amdgpu_dc > 0) 4269 dev_info_once( 4270 &pdev->dev, 4271 "Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 4272 return false; 4273 #endif 4274 } 4275 } 4276 4277 /** 4278 * amdgpu_device_has_dc_support - check if dc is supported 4279 * 4280 * @adev: amdgpu_device pointer 4281 * 4282 * Returns true for supported, false for not supported 4283 */ 4284 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 4285 { 4286 if (adev->enable_virtual_display || 4287 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 4288 return false; 4289 4290 return amdgpu_device_asic_has_dc_support(adev->pdev, adev->asic_type); 4291 } 4292 4293 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 4294 { 4295 struct amdgpu_device *adev = 4296 container_of(__work, struct amdgpu_device, xgmi_reset_work); 4297 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 4298 4299 /* It's a bug to not have a hive within this function */ 4300 if (WARN_ON(!hive)) 4301 return; 4302 4303 /* 4304 * Use task barrier to synchronize all xgmi reset works across the 4305 * hive. task_barrier_enter and task_barrier_exit will block 4306 * until all the threads running the xgmi reset works reach 4307 * those points. task_barrier_full will do both blocks. 4308 */ 4309 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 4310 4311 task_barrier_enter(&hive->tb); 4312 adev->asic_reset_res = amdgpu_device_baco_enter(adev); 4313 4314 if (adev->asic_reset_res) 4315 goto fail; 4316 4317 task_barrier_exit(&hive->tb); 4318 adev->asic_reset_res = amdgpu_device_baco_exit(adev); 4319 4320 if (adev->asic_reset_res) 4321 goto fail; 4322 4323 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 4324 } else { 4325 4326 task_barrier_full(&hive->tb); 4327 adev->asic_reset_res = amdgpu_asic_reset(adev); 4328 } 4329 4330 fail: 4331 if (adev->asic_reset_res) 4332 dev_warn(adev->dev, 4333 "ASIC reset failed with error, %d for drm dev, %s", 4334 adev->asic_reset_res, adev_to_drm(adev)->unique); 4335 amdgpu_put_xgmi_hive(hive); 4336 } 4337 4338 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 4339 { 4340 char *input = amdgpu_lockup_timeout; 4341 char *timeout_setting = NULL; 4342 int index = 0; 4343 long timeout; 4344 int ret = 0; 4345 4346 /* By default timeout for all queues is 2 sec */ 4347 adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout = 4348 adev->video_timeout = msecs_to_jiffies(2000); 4349 4350 if (!strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) 4351 return 0; 4352 4353 while ((timeout_setting = strsep(&input, ",")) && 4354 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4355 ret = kstrtol(timeout_setting, 0, &timeout); 4356 if (ret) 4357 return ret; 4358 4359 if (timeout == 0) { 4360 index++; 4361 continue; 4362 } else if (timeout < 0) { 4363 timeout = MAX_SCHEDULE_TIMEOUT; 4364 dev_warn(adev->dev, "lockup timeout disabled"); 4365 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 4366 } else { 4367 timeout = msecs_to_jiffies(timeout); 4368 } 4369 4370 switch (index++) { 4371 case 0: 4372 adev->gfx_timeout = timeout; 4373 break; 4374 case 1: 4375 adev->compute_timeout = timeout; 4376 break; 4377 case 2: 4378 adev->sdma_timeout = timeout; 4379 break; 4380 case 3: 4381 adev->video_timeout = timeout; 4382 break; 4383 default: 4384 break; 4385 } 4386 } 4387 4388 /* When only one value specified apply it 
to all queues. */ 4389 if (index == 1) 4390 adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout = 4391 adev->video_timeout = timeout; 4392 4393 return ret; 4394 } 4395 4396 /** 4397 * amdgpu_device_check_iommu_direct_map - check if RAM is direct mapped to the GPU 4398 * 4399 * @adev: amdgpu_device pointer 4400 * 4401 * RAM is direct mapped to the GPU if the IOMMU is not enabled or is in passthrough mode 4402 */ 4403 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 4404 { 4405 struct iommu_domain *domain; 4406 4407 domain = iommu_get_domain_for_dev(adev->dev); 4408 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 4409 adev->ram_is_direct_mapped = true; 4410 } 4411 4412 #if defined(CONFIG_HSA_AMD_P2P) 4413 /** 4414 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled. 4415 * 4416 * @adev: amdgpu_device pointer 4417 * 4418 * Returns true if the IOMMU is remapping BAR addresses 4419 */ 4420 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev) 4421 { 4422 struct iommu_domain *domain; 4423 4424 domain = iommu_get_domain_for_dev(adev->dev); 4425 if (domain && (domain->type == IOMMU_DOMAIN_DMA || 4426 domain->type == IOMMU_DOMAIN_DMA_FQ)) 4427 return true; 4428 4429 return false; 4430 } 4431 #endif 4432 4433 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 4434 { 4435 if (amdgpu_mcbp == 1) 4436 adev->gfx.mcbp = true; 4437 else if (amdgpu_mcbp == 0) 4438 adev->gfx.mcbp = false; 4439 4440 if (amdgpu_sriov_vf(adev)) 4441 adev->gfx.mcbp = true; 4442 4443 if (adev->gfx.mcbp) 4444 dev_info(adev->dev, "MCBP is enabled\n"); 4445 } 4446 4447 static int amdgpu_device_sys_interface_init(struct amdgpu_device *adev) 4448 { 4449 int r; 4450 4451 r = amdgpu_atombios_sysfs_init(adev); 4452 if (r) 4453 drm_err(&adev->ddev, 4454 "registering atombios sysfs failed (%d).\n", r); 4455 4456 r = amdgpu_pm_sysfs_init(adev); 4457 if (r) 4458 dev_err(adev->dev, "registering pm sysfs failed (%d).\n", r); 4459 4460 r = amdgpu_ucode_sysfs_init(adev); 4461 if (r) { 4462 adev->ucode_sysfs_en = false; 4463 dev_err(adev->dev, "Creating firmware sysfs failed (%d).\n", r); 4464 } else 4465 adev->ucode_sysfs_en = true; 4466 4467 r = amdgpu_device_attr_sysfs_init(adev); 4468 if (r) 4469 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4470 4471 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4472 if (r) 4473 dev_err(adev->dev, 4474 "Could not create amdgpu board attributes\n"); 4475 4476 amdgpu_fru_sysfs_init(adev); 4477 amdgpu_reg_state_sysfs_init(adev); 4478 amdgpu_xcp_sysfs_init(adev); 4479 4480 return r; 4481 } 4482 4483 static void amdgpu_device_sys_interface_fini(struct amdgpu_device *adev) 4484 { 4485 if (adev->pm.sysfs_initialized) 4486 amdgpu_pm_sysfs_fini(adev); 4487 if (adev->ucode_sysfs_en) 4488 amdgpu_ucode_sysfs_fini(adev); 4489 amdgpu_device_attr_sysfs_fini(adev); 4490 amdgpu_fru_sysfs_fini(adev); 4491 4492 amdgpu_reg_state_sysfs_fini(adev); 4493 amdgpu_xcp_sysfs_fini(adev); 4494 } 4495 4496 /** 4497 * amdgpu_device_init - initialize the driver 4498 * 4499 * @adev: amdgpu_device pointer 4500 * @flags: driver flags 4501 * 4502 * Initializes the driver info and hw (all asics). 4503 * Returns 0 for success or an error on failure. 4504 * Called at driver startup.
4505 */ 4506 int amdgpu_device_init(struct amdgpu_device *adev, 4507 uint32_t flags) 4508 { 4509 struct pci_dev *pdev = adev->pdev; 4510 int r, i; 4511 bool px = false; 4512 u32 max_MBps; 4513 int tmp; 4514 4515 adev->shutdown = false; 4516 adev->flags = flags; 4517 4518 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 4519 adev->asic_type = amdgpu_force_asic_type; 4520 else 4521 adev->asic_type = flags & AMD_ASIC_MASK; 4522 4523 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 4524 if (amdgpu_emu_mode == 1) 4525 adev->usec_timeout *= 10; 4526 adev->gmc.gart_size = 512 * 1024 * 1024; 4527 adev->accel_working = false; 4528 adev->num_rings = 0; 4529 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 4530 adev->mman.buffer_funcs = NULL; 4531 adev->mman.buffer_funcs_ring = NULL; 4532 adev->vm_manager.vm_pte_funcs = NULL; 4533 adev->vm_manager.vm_pte_num_scheds = 0; 4534 adev->gmc.gmc_funcs = NULL; 4535 adev->harvest_ip_mask = 0x0; 4536 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 4537 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 4538 4539 adev->smc_rreg = &amdgpu_invalid_rreg; 4540 adev->smc_wreg = &amdgpu_invalid_wreg; 4541 adev->pcie_rreg = &amdgpu_invalid_rreg; 4542 adev->pcie_wreg = &amdgpu_invalid_wreg; 4543 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 4544 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 4545 adev->pciep_rreg = &amdgpu_invalid_rreg; 4546 adev->pciep_wreg = &amdgpu_invalid_wreg; 4547 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 4548 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 4549 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 4550 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 4551 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 4552 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 4553 adev->didt_rreg = &amdgpu_invalid_rreg; 4554 adev->didt_wreg = &amdgpu_invalid_wreg; 4555 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 4556 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 4557 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 4558 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 4559 4560 dev_info( 4561 adev->dev, 4562 "initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 4563 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 4564 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 4565 4566 /* mutex initialization are all done here so we 4567 * can recall function without having locking issues 4568 */ 4569 mutex_init(&adev->firmware.mutex); 4570 mutex_init(&adev->pm.mutex); 4571 mutex_init(&adev->gfx.gpu_clock_mutex); 4572 mutex_init(&adev->srbm_mutex); 4573 mutex_init(&adev->gfx.pipe_reserve_mutex); 4574 mutex_init(&adev->gfx.gfx_off_mutex); 4575 mutex_init(&adev->gfx.partition_mutex); 4576 mutex_init(&adev->grbm_idx_mutex); 4577 mutex_init(&adev->mn_lock); 4578 mutex_init(&adev->virt.vf_errors.lock); 4579 hash_init(adev->mn_hash); 4580 mutex_init(&adev->psp.mutex); 4581 mutex_init(&adev->notifier_lock); 4582 mutex_init(&adev->pm.stable_pstate_ctx_lock); 4583 mutex_init(&adev->benchmark_mutex); 4584 mutex_init(&adev->gfx.reset_sem_mutex); 4585 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */ 4586 mutex_init(&adev->enforce_isolation_mutex); 4587 for (i = 0; i < MAX_XCP; ++i) { 4588 adev->isolation[i].spearhead = dma_fence_get_stub(); 4589 amdgpu_sync_create(&adev->isolation[i].active); 4590 amdgpu_sync_create(&adev->isolation[i].prev); 4591 } 4592 mutex_init(&adev->gfx.userq_sch_mutex); 4593 
mutex_init(&adev->gfx.workload_profile_mutex); 4594 mutex_init(&adev->vcn.workload_profile_mutex); 4595 4596 amdgpu_device_init_apu_flags(adev); 4597 4598 r = amdgpu_device_check_arguments(adev); 4599 if (r) 4600 return r; 4601 4602 spin_lock_init(&adev->mmio_idx_lock); 4603 spin_lock_init(&adev->smc_idx_lock); 4604 spin_lock_init(&adev->pcie_idx_lock); 4605 spin_lock_init(&adev->uvd_ctx_idx_lock); 4606 spin_lock_init(&adev->didt_idx_lock); 4607 spin_lock_init(&adev->gc_cac_idx_lock); 4608 spin_lock_init(&adev->se_cac_idx_lock); 4609 spin_lock_init(&adev->audio_endpt_idx_lock); 4610 spin_lock_init(&adev->mm_stats.lock); 4611 spin_lock_init(&adev->virt.rlcg_reg_lock); 4612 spin_lock_init(&adev->wb.lock); 4613 4614 xa_init_flags(&adev->userq_xa, XA_FLAGS_LOCK_IRQ); 4615 4616 INIT_LIST_HEAD(&adev->reset_list); 4617 4618 INIT_LIST_HEAD(&adev->ras_list); 4619 4620 INIT_LIST_HEAD(&adev->pm.od_kobj_list); 4621 4622 xa_init(&adev->userq_doorbell_xa); 4623 4624 INIT_DELAYED_WORK(&adev->delayed_init_work, 4625 amdgpu_device_delayed_init_work_handler); 4626 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 4627 amdgpu_device_delay_enable_gfx_off); 4628 /* 4629 * Initialize the enforce_isolation work structures for each XCP 4630 * partition. This work handler is responsible for enforcing shader 4631 * isolation on AMD GPUs. It counts the number of emitted fences for 4632 * each GFX and compute ring. If there are any fences, it schedules 4633 * the `enforce_isolation_work` to be run after a delay. If there are 4634 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the 4635 * runqueue. 4636 */ 4637 for (i = 0; i < MAX_XCP; i++) { 4638 INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work, 4639 amdgpu_gfx_enforce_isolation_handler); 4640 adev->gfx.enforce_isolation[i].adev = adev; 4641 adev->gfx.enforce_isolation[i].xcp_id = i; 4642 } 4643 4644 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 4645 INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work); 4646 4647 adev->gfx.gfx_off_req_count = 1; 4648 adev->gfx.gfx_off_residency = 0; 4649 adev->gfx.gfx_off_entrycount = 0; 4650 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 4651 4652 atomic_set(&adev->throttling_logging_enabled, 1); 4653 /* 4654 * If throttling continues, logging will be performed every minute 4655 * to avoid log flooding. "-1" is subtracted since the thermal 4656 * throttling interrupt comes every second. Thus, the total logging 4657 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting 4658 * for throttling interrupt) = 60 seconds.
4659 */ 4660 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 4661 4662 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 4663 4664 /* Registers mapping */ 4665 /* TODO: block userspace mapping of io register */ 4666 if (adev->asic_type >= CHIP_BONAIRE) { 4667 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 4668 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 4669 } else { 4670 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 4671 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 4672 } 4673 4674 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 4675 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 4676 4677 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 4678 if (!adev->rmmio) 4679 return -ENOMEM; 4680 4681 dev_info(adev->dev, "register mmio base: 0x%08X\n", 4682 (uint32_t)adev->rmmio_base); 4683 dev_info(adev->dev, "register mmio size: %u\n", 4684 (unsigned int)adev->rmmio_size); 4685 4686 /* 4687 * Reset domain needs to be present early, before XGMI hive discovered 4688 * (if any) and initialized to use reset sem and in_gpu reset flag 4689 * early on during init and before calling to RREG32. 4690 */ 4691 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 4692 if (!adev->reset_domain) 4693 return -ENOMEM; 4694 4695 /* detect hw virtualization here */ 4696 amdgpu_virt_init(adev); 4697 4698 amdgpu_device_get_pcie_info(adev); 4699 4700 r = amdgpu_device_get_job_timeout_settings(adev); 4701 if (r) { 4702 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 4703 return r; 4704 } 4705 4706 amdgpu_device_set_mcbp(adev); 4707 4708 /* 4709 * By default, use default mode where all blocks are expected to be 4710 * initialized. At present a 'swinit' of blocks is required to be 4711 * completed before the need for a different level is detected. 4712 */ 4713 amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT); 4714 /* early init functions */ 4715 r = amdgpu_device_ip_early_init(adev); 4716 if (r) 4717 return r; 4718 4719 /* 4720 * No need to remove conflicting FBs for non-display class devices. 4721 * This prevents the sysfb from being freed accidentally. 4722 */ 4723 if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA || 4724 (pdev->class >> 8) == PCI_CLASS_DISPLAY_OTHER) { 4725 /* Get rid of things like offb */ 4726 r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name); 4727 if (r) 4728 return r; 4729 } 4730 4731 /* Enable TMZ based on IP_VERSION */ 4732 amdgpu_gmc_tmz_set(adev); 4733 4734 if (amdgpu_sriov_vf(adev) && 4735 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0)) 4736 /* VF MMIO access (except mailbox range) from CPU 4737 * will be blocked during sriov runtime 4738 */ 4739 adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT; 4740 4741 amdgpu_gmc_noretry_set(adev); 4742 /* Need to get xgmi info early to decide the reset behavior */ 4743 if (adev->gmc.xgmi.supported) { 4744 r = adev->gfxhub.funcs->get_xgmi_info(adev); 4745 if (r) 4746 return r; 4747 } 4748 4749 /* enable PCIE atomic ops */ 4750 if (amdgpu_sriov_vf(adev)) { 4751 if (adev->virt.fw_reserve.p_pf2vf) 4752 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 4753 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 4754 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4755 /* APUs with gfx9 onwards don't rely on PCIe atomics; their 4756 * internal path natively supports atomics, so set have_atomics_support to true.
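 * For other devices, atomics support is probed via pci_enable_atomic_ops_to_root() below.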
4757 */ 4758 } else if ((adev->flags & AMD_IS_APU) && 4759 (amdgpu_ip_version(adev, GC_HWIP, 0) > 4760 IP_VERSION(9, 0, 0))) { 4761 adev->have_atomics_support = true; 4762 } else { 4763 adev->have_atomics_support = 4764 !pci_enable_atomic_ops_to_root(adev->pdev, 4765 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 4766 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4767 } 4768 4769 if (!adev->have_atomics_support) 4770 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 4771 4772 /* doorbell bar mapping and doorbell index init*/ 4773 amdgpu_doorbell_init(adev); 4774 4775 if (amdgpu_emu_mode == 1) { 4776 /* post the asic on emulation mode */ 4777 emu_soc_asic_init(adev); 4778 goto fence_driver_init; 4779 } 4780 4781 amdgpu_reset_init(adev); 4782 4783 /* detect if we are with an SRIOV vbios */ 4784 if (adev->bios) 4785 amdgpu_device_detect_sriov_bios(adev); 4786 4787 /* check if we need to reset the asic 4788 * E.g., driver was not cleanly unloaded previously, etc. 4789 */ 4790 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4791 if (adev->gmc.xgmi.num_physical_nodes) { 4792 dev_info(adev->dev, "Pending hive reset.\n"); 4793 amdgpu_set_init_level(adev, 4794 AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 4795 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && 4796 !amdgpu_device_has_display_hardware(adev)) { 4797 r = psp_gpu_reset(adev); 4798 } else { 4799 tmp = amdgpu_reset_method; 4800 /* It should do a default reset when loading or reloading the driver, 4801 * regardless of the module parameter reset_method. 4802 */ 4803 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4804 r = amdgpu_asic_reset(adev); 4805 amdgpu_reset_method = tmp; 4806 } 4807 4808 if (r) { 4809 dev_err(adev->dev, "asic reset on init failed\n"); 4810 goto failed; 4811 } 4812 } 4813 4814 /* Post card if necessary */ 4815 if (amdgpu_device_need_post(adev)) { 4816 if (!adev->bios) { 4817 dev_err(adev->dev, "no vBIOS found\n"); 4818 r = -EINVAL; 4819 goto failed; 4820 } 4821 dev_info(adev->dev, "GPU posting now...\n"); 4822 r = amdgpu_device_asic_init(adev); 4823 if (r) { 4824 dev_err(adev->dev, "gpu post error!\n"); 4825 goto failed; 4826 } 4827 } 4828 4829 if (adev->bios) { 4830 if (adev->is_atom_fw) { 4831 /* Initialize clocks */ 4832 r = amdgpu_atomfirmware_get_clock_info(adev); 4833 if (r) { 4834 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4835 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4836 goto failed; 4837 } 4838 } else { 4839 /* Initialize clocks */ 4840 r = amdgpu_atombios_get_clock_info(adev); 4841 if (r) { 4842 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4843 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4844 goto failed; 4845 } 4846 /* init i2c buses */ 4847 amdgpu_i2c_init(adev); 4848 } 4849 } 4850 4851 fence_driver_init: 4852 /* Fence driver */ 4853 r = amdgpu_fence_driver_sw_init(adev); 4854 if (r) { 4855 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4856 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4857 goto failed; 4858 } 4859 4860 /* init the mode config */ 4861 drm_mode_config_init(adev_to_drm(adev)); 4862 4863 r = amdgpu_device_ip_init(adev); 4864 if (r) { 4865 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4866 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4867 goto release_ras_con; 4868 } 4869 4870 amdgpu_fence_driver_hw_init(adev); 4871 4872 dev_info(adev->dev, 4873 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4874 
adev->gfx.config.max_shader_engines, 4875 adev->gfx.config.max_sh_per_se, 4876 adev->gfx.config.max_cu_per_sh, 4877 adev->gfx.cu_info.number); 4878 4879 adev->accel_working = true; 4880 4881 amdgpu_vm_check_compute_bug(adev); 4882 4883 /* Initialize the buffer migration limit. */ 4884 if (amdgpu_moverate >= 0) 4885 max_MBps = amdgpu_moverate; 4886 else 4887 max_MBps = 8; /* Allow 8 MB/s. */ 4888 /* Get a log2 for easy divisions. */ 4889 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4890 4891 /* 4892 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4893 * Otherwise the mgpu fan boost feature will be skipped because the 4894 * gpu instance count would be too low. 4895 */ 4896 amdgpu_register_gpu_instance(adev); 4897 4898 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4899 * explicit gating rather than handling it automatically. 4900 */ 4901 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 4902 r = amdgpu_device_ip_late_init(adev); 4903 if (r) { 4904 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4905 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4906 goto release_ras_con; 4907 } 4908 /* must succeed. */ 4909 amdgpu_ras_resume(adev); 4910 queue_delayed_work(system_wq, &adev->delayed_init_work, 4911 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4912 } 4913 4914 if (amdgpu_sriov_vf(adev)) { 4915 amdgpu_virt_release_full_gpu(adev, true); 4916 flush_delayed_work(&adev->delayed_init_work); 4917 } 4918 4919 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 4920 amdgpu_xgmi_reset_on_init(adev); 4921 /* 4922 * Place the sysfs registration after `late_init`, as some of the 4923 * operations performed in `late_init` might affect the creation of 4924 * the sysfs interfaces. 4925 */ 4926 r = amdgpu_device_sys_interface_init(adev); 4927 4928 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4929 r = amdgpu_pmu_init(adev); 4930 if (r) 4931 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4932 4933 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4934 if (amdgpu_device_cache_pci_state(adev->pdev)) 4935 pci_restore_state(pdev); 4936 4937 /* if we have more than one VGA card, then disable the amdgpu VGA resources */ 4938 /* this will fail for cards that aren't VGA class devices, just 4939 * ignore it 4940 */ 4941 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4942 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4943 4944 px = amdgpu_device_supports_px(adev); 4945 4946 if (px || (!dev_is_removable(&adev->pdev->dev) && 4947 apple_gmux_detect(NULL, NULL))) 4948 vga_switcheroo_register_client(adev->pdev, 4949 &amdgpu_switcheroo_ops, px); 4950 4951 if (px) 4952 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4953 4954 amdgpu_device_check_iommu_direct_map(adev); 4955 4956 adev->pm_nb.notifier_call = amdgpu_device_pm_notifier; 4957 r = register_pm_notifier(&adev->pm_nb); 4958 if (r) 4959 goto failed; 4960 4961 return 0; 4962 4963 release_ras_con: 4964 if (amdgpu_sriov_vf(adev)) 4965 amdgpu_virt_release_full_gpu(adev, true); 4966 4967 /* failed in exclusive mode due to timeout */ 4968 if (amdgpu_sriov_vf(adev) && 4969 !amdgpu_sriov_runtime(adev) && 4970 amdgpu_virt_mmio_blocked(adev) && 4971 !amdgpu_virt_wait_reset(adev)) { 4972 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4973 /* Don't send request since VF is inactive.
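 * Clear the runtime cap and the virt ops below so no further requests are issued.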
*/ 4974 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4975 adev->virt.ops = NULL; 4976 r = -EAGAIN; 4977 } 4978 amdgpu_release_ras_context(adev); 4979 4980 failed: 4981 amdgpu_vf_error_trans_all(adev); 4982 4983 return r; 4984 } 4985 4986 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4987 { 4988 4989 /* Clear all CPU mappings pointing to this device */ 4990 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4991 4992 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4993 amdgpu_doorbell_fini(adev); 4994 4995 iounmap(adev->rmmio); 4996 adev->rmmio = NULL; 4997 if (adev->mman.aper_base_kaddr) 4998 iounmap(adev->mman.aper_base_kaddr); 4999 adev->mman.aper_base_kaddr = NULL; 5000 5001 /* Memory manager related */ 5002 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 5003 arch_phys_wc_del(adev->gmc.vram_mtrr); 5004 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 5005 } 5006 } 5007 5008 /** 5009 * amdgpu_device_fini_hw - tear down the driver 5010 * 5011 * @adev: amdgpu_device pointer 5012 * 5013 * Tear down the driver info (all asics). 5014 * Called at driver shutdown. 5015 */ 5016 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 5017 { 5018 dev_info(adev->dev, "amdgpu: finishing device.\n"); 5019 flush_delayed_work(&adev->delayed_init_work); 5020 5021 if (adev->mman.initialized) 5022 drain_workqueue(adev->mman.bdev.wq); 5023 adev->shutdown = true; 5024 5025 unregister_pm_notifier(&adev->pm_nb); 5026 5027 /* make sure IB test finished before entering exclusive mode 5028 * to avoid preemption on IB test 5029 */ 5030 if (amdgpu_sriov_vf(adev)) { 5031 amdgpu_virt_request_full_gpu(adev, false); 5032 amdgpu_virt_fini_data_exchange(adev); 5033 } 5034 5035 /* disable all interrupts */ 5036 amdgpu_irq_disable_all(adev); 5037 if (adev->mode_info.mode_config_initialized) { 5038 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 5039 drm_helper_force_disable_all(adev_to_drm(adev)); 5040 else 5041 drm_atomic_helper_shutdown(adev_to_drm(adev)); 5042 } 5043 amdgpu_fence_driver_hw_fini(adev); 5044 5045 amdgpu_device_sys_interface_fini(adev); 5046 5047 /* disable ras feature must before hw fini */ 5048 amdgpu_ras_pre_fini(adev); 5049 5050 amdgpu_ttm_set_buffer_funcs_status(adev, false); 5051 5052 amdgpu_device_ip_fini_early(adev); 5053 5054 amdgpu_irq_fini_hw(adev); 5055 5056 if (adev->mman.initialized) 5057 ttm_device_clear_dma_mappings(&adev->mman.bdev); 5058 5059 amdgpu_gart_dummy_page_fini(adev); 5060 5061 if (drm_dev_is_unplugged(adev_to_drm(adev))) 5062 amdgpu_device_unmap_mmio(adev); 5063 5064 } 5065 5066 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 5067 { 5068 int i, idx; 5069 bool px; 5070 5071 amdgpu_device_ip_fini(adev); 5072 amdgpu_fence_driver_sw_fini(adev); 5073 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 5074 adev->accel_working = false; 5075 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 5076 for (i = 0; i < MAX_XCP; ++i) { 5077 dma_fence_put(adev->isolation[i].spearhead); 5078 amdgpu_sync_free(&adev->isolation[i].active); 5079 amdgpu_sync_free(&adev->isolation[i].prev); 5080 } 5081 5082 amdgpu_reset_fini(adev); 5083 5084 /* free i2c buses */ 5085 amdgpu_i2c_fini(adev); 5086 5087 if (adev->bios) { 5088 if (amdgpu_emu_mode != 1) 5089 amdgpu_atombios_fini(adev); 5090 amdgpu_bios_release(adev); 5091 } 5092 5093 kfree(adev->fru_info); 5094 adev->fru_info = NULL; 5095 5096 kfree(adev->xcp_mgr); 5097 adev->xcp_mgr = NULL; 5098 5099 px = amdgpu_device_supports_px(adev); 5100 5101 if 
(px || (!dev_is_removable(&adev->pdev->dev) && 5102 apple_gmux_detect(NULL, NULL))) 5103 vga_switcheroo_unregister_client(adev->pdev); 5104 5105 if (px) 5106 vga_switcheroo_fini_domain_pm_ops(adev->dev); 5107 5108 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 5109 vga_client_unregister(adev->pdev); 5110 5111 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 5112 5113 iounmap(adev->rmmio); 5114 adev->rmmio = NULL; 5115 drm_dev_exit(idx); 5116 } 5117 5118 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 5119 amdgpu_pmu_fini(adev); 5120 if (adev->discovery.bin) 5121 amdgpu_discovery_fini(adev); 5122 5123 amdgpu_reset_put_reset_domain(adev->reset_domain); 5124 adev->reset_domain = NULL; 5125 5126 kfree(adev->pci_state); 5127 kfree(adev->pcie_reset_ctx.swds_pcistate); 5128 kfree(adev->pcie_reset_ctx.swus_pcistate); 5129 } 5130 5131 /** 5132 * amdgpu_device_evict_resources - evict device resources 5133 * @adev: amdgpu device object 5134 * 5135 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 5136 * of the vram memory type. Mainly used for evicting device resources 5137 * at suspend time. 5138 * 5139 */ 5140 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 5141 { 5142 int ret; 5143 5144 /* No need to evict vram on APUs unless going to S4 */ 5145 if (!adev->in_s4 && (adev->flags & AMD_IS_APU)) 5146 return 0; 5147 5148 /* No need to evict when going to S5 through S4 callbacks */ 5149 if (system_state == SYSTEM_POWER_OFF) 5150 return 0; 5151 5152 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 5153 if (ret) { 5154 dev_warn(adev->dev, "evicting device resources failed\n"); 5155 return ret; 5156 } 5157 5158 if (adev->in_s4) { 5159 ret = ttm_device_prepare_hibernation(&adev->mman.bdev); 5160 if (ret) 5161 dev_err(adev->dev, "prepare hibernation failed, %d\n", ret); 5162 } 5163 return ret; 5164 } 5165 5166 /* 5167 * Suspend & resume. 5168 */ 5169 /** 5170 * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events 5171 * @nb: notifier block 5172 * @mode: suspend mode 5173 * @data: data 5174 * 5175 * This function is called when the system is about to suspend or hibernate. 5176 * It is used to set the appropriate flags so that eviction can be optimized 5177 * in the pm prepare callback. 5178 */ 5179 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 5180 void *data) 5181 { 5182 struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb); 5183 5184 switch (mode) { 5185 case PM_HIBERNATION_PREPARE: 5186 adev->in_s4 = true; 5187 break; 5188 case PM_POST_HIBERNATION: 5189 adev->in_s4 = false; 5190 break; 5191 } 5192 5193 return NOTIFY_DONE; 5194 } 5195 5196 /** 5197 * amdgpu_device_prepare - prepare for device suspend 5198 * 5199 * @dev: drm dev pointer 5200 * 5201 * Prepare to put the hw in the suspend state (all asics). 5202 * Returns 0 for success or an error on failure. 5203 * Called at driver suspend. 
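 * The majority of BOs are evicted first, then the prepare_suspend callback of each valid IP block is run.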
5204 */ 5205 int amdgpu_device_prepare(struct drm_device *dev) 5206 { 5207 struct amdgpu_device *adev = drm_to_adev(dev); 5208 int i, r; 5209 5210 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5211 return 0; 5212 5213 /* Evict the majority of BOs before starting suspend sequence */ 5214 r = amdgpu_device_evict_resources(adev); 5215 if (r) 5216 return r; 5217 5218 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 5219 5220 for (i = 0; i < adev->num_ip_blocks; i++) { 5221 if (!adev->ip_blocks[i].status.valid) 5222 continue; 5223 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 5224 continue; 5225 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]); 5226 if (r) 5227 return r; 5228 } 5229 5230 return 0; 5231 } 5232 5233 /** 5234 * amdgpu_device_complete - complete power state transition 5235 * 5236 * @dev: drm dev pointer 5237 * 5238 * Undo the changes from amdgpu_device_prepare. This will be 5239 * called on all resume transitions, including those that failed. 5240 */ 5241 void amdgpu_device_complete(struct drm_device *dev) 5242 { 5243 struct amdgpu_device *adev = drm_to_adev(dev); 5244 int i; 5245 5246 for (i = 0; i < adev->num_ip_blocks; i++) { 5247 if (!adev->ip_blocks[i].status.valid) 5248 continue; 5249 if (!adev->ip_blocks[i].version->funcs->complete) 5250 continue; 5251 adev->ip_blocks[i].version->funcs->complete(&adev->ip_blocks[i]); 5252 } 5253 } 5254 5255 /** 5256 * amdgpu_device_suspend - initiate device suspend 5257 * 5258 * @dev: drm dev pointer 5259 * @notify_clients: notify in-kernel DRM clients 5260 * 5261 * Puts the hw in the suspend state (all asics). 5262 * Returns 0 for success or an error on failure. 5263 * Called at driver suspend. 5264 */ 5265 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) 5266 { 5267 struct amdgpu_device *adev = drm_to_adev(dev); 5268 int r, rec; 5269 5270 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5271 return 0; 5272 5273 adev->in_suspend = true; 5274 5275 if (amdgpu_sriov_vf(adev)) { 5276 if (!adev->in_runpm) 5277 amdgpu_amdkfd_suspend_process(adev); 5278 amdgpu_virt_fini_data_exchange(adev); 5279 r = amdgpu_virt_request_full_gpu(adev, false); 5280 if (r) 5281 return r; 5282 } 5283 5284 r = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D3); 5285 if (r) 5286 goto unwind_sriov; 5287 5288 if (notify_clients) 5289 drm_client_dev_suspend(adev_to_drm(adev)); 5290 5291 cancel_delayed_work_sync(&adev->delayed_init_work); 5292 5293 amdgpu_ras_suspend(adev); 5294 5295 r = amdgpu_device_ip_suspend_phase1(adev); 5296 if (r) 5297 goto unwind_smartshift; 5298 5299 amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); 5300 r = amdgpu_userq_suspend(adev); 5301 if (r) 5302 goto unwind_ip_phase1; 5303 5304 r = amdgpu_device_evict_resources(adev); 5305 if (r) 5306 goto unwind_userq; 5307 5308 amdgpu_ttm_set_buffer_funcs_status(adev, false); 5309 5310 amdgpu_fence_driver_hw_fini(adev); 5311 5312 r = amdgpu_device_ip_suspend_phase2(adev); 5313 if (r) 5314 goto unwind_evict; 5315 5316 if (amdgpu_sriov_vf(adev)) 5317 amdgpu_virt_release_full_gpu(adev, false); 5318 5319 return 0; 5320 5321 unwind_evict: 5322 if (adev->mman.buffer_funcs_ring->sched.ready) 5323 amdgpu_ttm_set_buffer_funcs_status(adev, true); 5324 amdgpu_fence_driver_hw_init(adev); 5325 5326 unwind_userq: 5327 rec = amdgpu_userq_resume(adev); 5328 if (rec) { 5329 dev_warn(adev->dev, "failed to re-initialize user queues: %d\n", rec); 5330 return r; 5331 } 5332 rec = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) 
&& !adev->in_runpm); 5333 if (rec) { 5334 dev_warn(adev->dev, "failed to re-initialize kfd: %d\n", rec); 5335 return r; 5336 } 5337 5338 unwind_ip_phase1: 5339 /* suspend phase 1 = resume phase 3 */ 5340 rec = amdgpu_device_ip_resume_phase3(adev); 5341 if (rec) { 5342 dev_warn(adev->dev, "failed to re-initialize IPs phase1: %d\n", rec); 5343 return r; 5344 } 5345 5346 unwind_smartshift: 5347 rec = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0); 5348 if (rec) { 5349 dev_warn(adev->dev, "failed to re-update smart shift: %d\n", rec); 5350 return r; 5351 } 5352 5353 if (notify_clients) 5354 drm_client_dev_resume(adev_to_drm(adev)); 5355 5356 amdgpu_ras_resume(adev); 5357 5358 unwind_sriov: 5359 if (amdgpu_sriov_vf(adev)) { 5360 rec = amdgpu_virt_request_full_gpu(adev, true); 5361 if (rec) { 5362 dev_warn(adev->dev, "failed to reinitialize sriov: %d\n", rec); 5363 return r; 5364 } 5365 } 5366 5367 adev->in_suspend = adev->in_s0ix = adev->in_s3 = false; 5368 5369 return r; 5370 } 5371 5372 static inline int amdgpu_virt_resume(struct amdgpu_device *adev) 5373 { 5374 int r; 5375 unsigned int prev_physical_node_id = adev->gmc.xgmi.physical_node_id; 5376 5377 /* During VM resume, QEMU programming of VF MSIX table (register GFXMSIX_VECT0_ADDR_LO) 5378 * may not work. The access could be blocked by nBIF protection as VF isn't in 5379 * exclusive access mode. Exclusive access is enabled now, disable/enable MSIX 5380 * so that QEMU reprograms MSIX table. 5381 */ 5382 amdgpu_restore_msix(adev); 5383 5384 r = adev->gfxhub.funcs->get_xgmi_info(adev); 5385 if (r) 5386 return r; 5387 5388 dev_info(adev->dev, "xgmi node, old id %d, new id %d\n", 5389 prev_physical_node_id, adev->gmc.xgmi.physical_node_id); 5390 5391 adev->vm_manager.vram_base_offset = adev->gfxhub.funcs->get_mc_fb_offset(adev); 5392 adev->vm_manager.vram_base_offset += 5393 adev->gmc.xgmi.physical_node_id * adev->gmc.xgmi.node_segment_size; 5394 5395 return 0; 5396 } 5397 5398 /** 5399 * amdgpu_device_resume - initiate device resume 5400 * 5401 * @dev: drm dev pointer 5402 * @notify_clients: notify in-kernel DRM clients 5403 * 5404 * Bring the hw back to operating state (all asics). 5405 * Returns 0 for success or an error on failure. 5406 * Called at driver resume. 
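 * Under SR-IOV, full GPU access is requested before the hardware is brought back up and released again once resume completes.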
5407 */ 5408 int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) 5409 { 5410 struct amdgpu_device *adev = drm_to_adev(dev); 5411 int r = 0; 5412 5413 if (amdgpu_sriov_vf(adev)) { 5414 r = amdgpu_virt_request_full_gpu(adev, true); 5415 if (r) 5416 return r; 5417 } 5418 5419 if (amdgpu_virt_xgmi_migrate_enabled(adev)) { 5420 r = amdgpu_virt_resume(adev); 5421 if (r) 5422 goto exit; 5423 } 5424 5425 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5426 return 0; 5427 5428 if (adev->in_s0ix) 5429 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 5430 5431 /* post card */ 5432 if (amdgpu_device_need_post(adev)) { 5433 r = amdgpu_device_asic_init(adev); 5434 if (r) 5435 dev_err(adev->dev, "amdgpu asic init failed\n"); 5436 } 5437 5438 r = amdgpu_device_ip_resume(adev); 5439 5440 if (r) { 5441 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 5442 goto exit; 5443 } 5444 5445 r = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); 5446 if (r) 5447 goto exit; 5448 5449 r = amdgpu_userq_resume(adev); 5450 if (r) 5451 goto exit; 5452 5453 r = amdgpu_device_ip_late_init(adev); 5454 if (r) 5455 goto exit; 5456 5457 queue_delayed_work(system_wq, &adev->delayed_init_work, 5458 msecs_to_jiffies(AMDGPU_RESUME_MS)); 5459 exit: 5460 if (amdgpu_sriov_vf(adev)) { 5461 amdgpu_virt_init_data_exchange(adev); 5462 amdgpu_virt_release_full_gpu(adev, true); 5463 5464 if (!r && !adev->in_runpm) 5465 r = amdgpu_amdkfd_resume_process(adev); 5466 } 5467 5468 if (r) 5469 return r; 5470 5471 /* Make sure IB tests flushed */ 5472 flush_delayed_work(&adev->delayed_init_work); 5473 5474 if (notify_clients) 5475 drm_client_dev_resume(adev_to_drm(adev)); 5476 5477 amdgpu_ras_resume(adev); 5478 5479 if (adev->mode_info.num_crtc) { 5480 /* 5481 * Most of the connector probing functions try to acquire runtime pm 5482 * refs to ensure that the GPU is powered on when connector polling is 5483 * performed. Since we're calling this from a runtime PM callback, 5484 * trying to acquire rpm refs will cause us to deadlock. 5485 * 5486 * Since we're guaranteed to be holding the rpm lock, it's safe to 5487 * temporarily disable the rpm helpers so this doesn't deadlock us. 5488 */ 5489 #ifdef CONFIG_PM 5490 dev->dev->power.disable_depth++; 5491 #endif 5492 if (!adev->dc_enabled) 5493 drm_helper_hpd_irq_event(dev); 5494 else 5495 drm_kms_helper_hotplug_event(dev); 5496 #ifdef CONFIG_PM 5497 dev->dev->power.disable_depth--; 5498 #endif 5499 } 5500 5501 amdgpu_vram_mgr_clear_reset_blocks(adev); 5502 adev->in_suspend = false; 5503 5504 if (amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0)) 5505 dev_warn(adev->dev, "smart shift update failed\n"); 5506 5507 return 0; 5508 } 5509 5510 /** 5511 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 5512 * 5513 * @adev: amdgpu_device pointer 5514 * 5515 * The list of all the hardware IPs that make up the asic is walked and 5516 * the check_soft_reset callbacks are run. check_soft_reset determines 5517 * if the asic is still hung or not. 5518 * Returns true if any of the IPs are still in a hung state, false if not. 
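 * A hang is always reported under SR-IOV or when the ASIC needs a full reset.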
5519 */ 5520 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 5521 { 5522 int i; 5523 bool asic_hang = false; 5524 5525 if (amdgpu_sriov_vf(adev)) 5526 return true; 5527 5528 if (amdgpu_asic_need_full_reset(adev)) 5529 return true; 5530 5531 for (i = 0; i < adev->num_ip_blocks; i++) { 5532 if (!adev->ip_blocks[i].status.valid) 5533 continue; 5534 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 5535 adev->ip_blocks[i].status.hang = 5536 adev->ip_blocks[i].version->funcs->check_soft_reset( 5537 &adev->ip_blocks[i]); 5538 if (adev->ip_blocks[i].status.hang) { 5539 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 5540 asic_hang = true; 5541 } 5542 } 5543 return asic_hang; 5544 } 5545 5546 /** 5547 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 5548 * 5549 * @adev: amdgpu_device pointer 5550 * 5551 * The list of all the hardware IPs that make up the asic is walked and the 5552 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 5553 * handles any IP specific hardware or software state changes that are 5554 * necessary for a soft reset to succeed. 5555 * Returns 0 on success, negative error code on failure. 5556 */ 5557 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 5558 { 5559 int i, r = 0; 5560 5561 for (i = 0; i < adev->num_ip_blocks; i++) { 5562 if (!adev->ip_blocks[i].status.valid) 5563 continue; 5564 if (adev->ip_blocks[i].status.hang && 5565 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 5566 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]); 5567 if (r) 5568 return r; 5569 } 5570 } 5571 5572 return 0; 5573 } 5574 5575 /** 5576 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 5577 * 5578 * @adev: amdgpu_device pointer 5579 * 5580 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 5581 * reset is necessary to recover. 5582 * Returns true if a full asic reset is required, false if not. 5583 */ 5584 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 5585 { 5586 int i; 5587 5588 if (amdgpu_asic_need_full_reset(adev)) 5589 return true; 5590 5591 for (i = 0; i < adev->num_ip_blocks; i++) { 5592 if (!adev->ip_blocks[i].status.valid) 5593 continue; 5594 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 5595 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 5596 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 5597 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 5598 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 5599 if (adev->ip_blocks[i].status.hang) { 5600 dev_info(adev->dev, "Some block need full reset!\n"); 5601 return true; 5602 } 5603 } 5604 } 5605 return false; 5606 } 5607 5608 /** 5609 * amdgpu_device_ip_soft_reset - do a soft reset 5610 * 5611 * @adev: amdgpu_device pointer 5612 * 5613 * The list of all the hardware IPs that make up the asic is walked and the 5614 * soft_reset callbacks are run if the block is hung. soft_reset handles any 5615 * IP specific hardware or software state changes that are necessary to soft 5616 * reset the IP. 5617 * Returns 0 on success, negative error code on failure. 
5618 */ 5619 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 5620 { 5621 int i, r = 0; 5622 5623 for (i = 0; i < adev->num_ip_blocks; i++) { 5624 if (!adev->ip_blocks[i].status.valid) 5625 continue; 5626 if (adev->ip_blocks[i].status.hang && 5627 adev->ip_blocks[i].version->funcs->soft_reset) { 5628 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]); 5629 if (r) 5630 return r; 5631 } 5632 } 5633 5634 return 0; 5635 } 5636 5637 /** 5638 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 5639 * 5640 * @adev: amdgpu_device pointer 5641 * 5642 * The list of all the hardware IPs that make up the asic is walked and the 5643 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 5644 * handles any IP specific hardware or software state changes that are 5645 * necessary after the IP has been soft reset. 5646 * Returns 0 on success, negative error code on failure. 5647 */ 5648 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 5649 { 5650 int i, r = 0; 5651 5652 for (i = 0; i < adev->num_ip_blocks; i++) { 5653 if (!adev->ip_blocks[i].status.valid) 5654 continue; 5655 if (adev->ip_blocks[i].status.hang && 5656 adev->ip_blocks[i].version->funcs->post_soft_reset) 5657 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]); 5658 if (r) 5659 return r; 5660 } 5661 5662 return 0; 5663 } 5664 5665 /** 5666 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 5667 * 5668 * @adev: amdgpu_device pointer 5669 * @reset_context: amdgpu reset context pointer 5670 * 5671 * do VF FLR and reinitialize Asic 5672 * return 0 means succeeded otherwise failed 5673 */ 5674 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 5675 struct amdgpu_reset_context *reset_context) 5676 { 5677 int r; 5678 struct amdgpu_hive_info *hive = NULL; 5679 5680 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { 5681 if (!amdgpu_ras_get_fed_status(adev)) 5682 amdgpu_virt_ready_to_reset(adev); 5683 amdgpu_virt_wait_reset(adev); 5684 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5685 r = amdgpu_virt_request_full_gpu(adev, true); 5686 } else { 5687 r = amdgpu_virt_reset_gpu(adev); 5688 } 5689 if (r) 5690 return r; 5691 5692 amdgpu_ras_clear_err_state(adev); 5693 amdgpu_irq_gpu_reset_resume_helper(adev); 5694 5695 /* some sw clean up VF needs to do before recover */ 5696 amdgpu_virt_post_reset(adev); 5697 5698 /* Resume IP prior to SMC */ 5699 r = amdgpu_device_ip_reinit_early_sriov(adev); 5700 if (r) 5701 return r; 5702 5703 amdgpu_virt_init_data_exchange(adev); 5704 5705 r = amdgpu_device_fw_loading(adev); 5706 if (r) 5707 return r; 5708 5709 /* now we are okay to resume SMC/CP/SDMA */ 5710 r = amdgpu_device_ip_reinit_late_sriov(adev); 5711 if (r) 5712 return r; 5713 5714 hive = amdgpu_get_xgmi_hive(adev); 5715 /* Update PSP FW topology after reset */ 5716 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5717 r = amdgpu_xgmi_update_topology(hive, adev); 5718 if (hive) 5719 amdgpu_put_xgmi_hive(hive); 5720 if (r) 5721 return r; 5722 5723 r = amdgpu_ib_ring_tests(adev); 5724 if (r) 5725 return r; 5726 5727 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) 5728 amdgpu_inc_vram_lost(adev); 5729 5730 /* need to be called during full access so we can't do it later like 5731 * bare-metal does. 
5732 */ 5733 amdgpu_amdkfd_post_reset(adev); 5734 amdgpu_virt_release_full_gpu(adev, true); 5735 5736 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5737 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) || 5738 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 5739 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 5740 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) || 5741 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5742 amdgpu_ras_resume(adev); 5743 5744 amdgpu_virt_ras_telemetry_post_reset(adev); 5745 5746 return 0; 5747 } 5748 5749 /** 5750 * amdgpu_device_has_job_running - check if there is any unfinished job 5751 * 5752 * @adev: amdgpu_device pointer 5753 * 5754 * check if there is any job running on the device when guest driver receives 5755 * FLR notification from host driver. If there are still jobs running, then 5756 * the guest driver will not respond the FLR reset. Instead, let the job hit 5757 * the timeout and guest driver then issue the reset request. 5758 */ 5759 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5760 { 5761 int i; 5762 5763 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5764 struct amdgpu_ring *ring = adev->rings[i]; 5765 5766 if (!amdgpu_ring_sched_ready(ring)) 5767 continue; 5768 5769 if (amdgpu_fence_count_emitted(ring)) 5770 return true; 5771 } 5772 return false; 5773 } 5774 5775 /** 5776 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5777 * 5778 * @adev: amdgpu_device pointer 5779 * 5780 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5781 * a hung GPU. 5782 */ 5783 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5784 { 5785 5786 if (amdgpu_gpu_recovery == 0) 5787 goto disabled; 5788 5789 /* Skip soft reset check in fatal error mode */ 5790 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5791 return true; 5792 5793 if (amdgpu_sriov_vf(adev)) 5794 return true; 5795 5796 if (amdgpu_gpu_recovery == -1) { 5797 switch (adev->asic_type) { 5798 #ifdef CONFIG_DRM_AMDGPU_SI 5799 case CHIP_VERDE: 5800 case CHIP_TAHITI: 5801 case CHIP_PITCAIRN: 5802 case CHIP_OLAND: 5803 case CHIP_HAINAN: 5804 #endif 5805 #ifdef CONFIG_DRM_AMDGPU_CIK 5806 case CHIP_KAVERI: 5807 case CHIP_KABINI: 5808 case CHIP_MULLINS: 5809 #endif 5810 case CHIP_CARRIZO: 5811 case CHIP_STONEY: 5812 case CHIP_CYAN_SKILLFISH: 5813 goto disabled; 5814 default: 5815 break; 5816 } 5817 } 5818 5819 return true; 5820 5821 disabled: 5822 dev_info(adev->dev, "GPU recovery disabled.\n"); 5823 return false; 5824 } 5825 5826 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5827 { 5828 u32 i; 5829 int ret = 0; 5830 5831 if (adev->bios) 5832 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5833 5834 dev_info(adev->dev, "GPU mode1 reset\n"); 5835 5836 /* Cache the state before bus master disable. The saved config space 5837 * values are used in other cases like restore after mode-2 reset. 
5838 */ 5839 amdgpu_device_cache_pci_state(adev->pdev); 5840 5841 /* disable BM */ 5842 pci_clear_master(adev->pdev); 5843 5844 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5845 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5846 ret = amdgpu_dpm_mode1_reset(adev); 5847 } else { 5848 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5849 ret = psp_gpu_reset(adev); 5850 } 5851 5852 if (ret) 5853 goto mode1_reset_failed; 5854 5855 amdgpu_device_load_pci_state(adev->pdev); 5856 ret = amdgpu_psp_wait_for_bootloader(adev); 5857 if (ret) 5858 goto mode1_reset_failed; 5859 5860 /* wait for asic to come out of reset */ 5861 for (i = 0; i < adev->usec_timeout; i++) { 5862 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5863 5864 if (memsize != 0xffffffff) 5865 break; 5866 udelay(1); 5867 } 5868 5869 if (i >= adev->usec_timeout) { 5870 ret = -ETIMEDOUT; 5871 goto mode1_reset_failed; 5872 } 5873 5874 if (adev->bios) 5875 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5876 5877 return 0; 5878 5879 mode1_reset_failed: 5880 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5881 return ret; 5882 } 5883 5884 int amdgpu_device_link_reset(struct amdgpu_device *adev) 5885 { 5886 int ret = 0; 5887 5888 dev_info(adev->dev, "GPU link reset\n"); 5889 5890 if (!amdgpu_reset_in_dpc(adev)) 5891 ret = amdgpu_dpm_link_reset(adev); 5892 5893 if (ret) 5894 goto link_reset_failed; 5895 5896 ret = amdgpu_psp_wait_for_bootloader(adev); 5897 if (ret) 5898 goto link_reset_failed; 5899 5900 return 0; 5901 5902 link_reset_failed: 5903 dev_err(adev->dev, "GPU link reset failed\n"); 5904 return ret; 5905 } 5906 5907 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5908 struct amdgpu_reset_context *reset_context) 5909 { 5910 int i, r = 0; 5911 struct amdgpu_job *job = NULL; 5912 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; 5913 bool need_full_reset = 5914 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5915 5916 if (reset_context->reset_req_dev == adev) 5917 job = reset_context->job; 5918 5919 if (amdgpu_sriov_vf(adev)) 5920 amdgpu_virt_pre_reset(adev); 5921 5922 amdgpu_fence_driver_isr_toggle(adev, true); 5923 5924 /* block all schedulers and reset given job's ring */ 5925 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5926 struct amdgpu_ring *ring = adev->rings[i]; 5927 5928 if (!amdgpu_ring_sched_ready(ring)) 5929 continue; 5930 5931 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5932 amdgpu_fence_driver_force_completion(ring); 5933 } 5934 5935 amdgpu_fence_driver_isr_toggle(adev, false); 5936 5937 if (job && job->vm) 5938 drm_sched_increase_karma(&job->base); 5939 5940 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5941 /* If reset handler not implemented, continue; otherwise return */ 5942 if (r == -EOPNOTSUPP) 5943 r = 0; 5944 else 5945 return r; 5946 5947 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5948 if (!amdgpu_sriov_vf(adev)) { 5949 5950 if (!need_full_reset) 5951 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5952 5953 if (!need_full_reset && amdgpu_gpu_recovery && 5954 amdgpu_device_ip_check_soft_reset(adev)) { 5955 amdgpu_device_ip_pre_soft_reset(adev); 5956 r = amdgpu_device_ip_soft_reset(adev); 5957 amdgpu_device_ip_post_soft_reset(adev); 5958 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5959 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5960 need_full_reset = true; 5961 } 5962 } 5963 5964 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { 
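		/*
		 * Capture per-IP register state while the hang is still
		 * present; the captured state is what the post-reset device
		 * coredump reports (see amdgpu_coredump() in
		 * amdgpu_device_reinit_after_reset() below).
		 */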
5965 dev_info(tmp_adev->dev, "Dumping IP State\n"); 5966 /* Trigger ip dump before we reset the asic */ 5967 for (i = 0; i < tmp_adev->num_ip_blocks; i++) 5968 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) 5969 tmp_adev->ip_blocks[i].version->funcs 5970 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]); 5971 dev_info(tmp_adev->dev, "Dumping IP State Completed\n"); 5972 } 5973 5974 if (need_full_reset) 5975 r = amdgpu_device_ip_suspend(adev); 5976 if (need_full_reset) 5977 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5978 else 5979 clear_bit(AMDGPU_NEED_FULL_RESET, 5980 &reset_context->flags); 5981 } 5982 5983 return r; 5984 } 5985 5986 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) 5987 { 5988 struct list_head *device_list_handle; 5989 bool full_reset, vram_lost = false; 5990 struct amdgpu_device *tmp_adev; 5991 int r, init_level; 5992 5993 device_list_handle = reset_context->reset_device_list; 5994 5995 if (!device_list_handle) 5996 return -EINVAL; 5997 5998 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5999 6000 /** 6001 * If it's reset on init, it's default init level, otherwise keep level 6002 * as recovery level. 6003 */ 6004 if (reset_context->method == AMD_RESET_METHOD_ON_INIT) 6005 init_level = AMDGPU_INIT_LEVEL_DEFAULT; 6006 else 6007 init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY; 6008 6009 r = 0; 6010 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6011 amdgpu_set_init_level(tmp_adev, init_level); 6012 if (full_reset) { 6013 /* post card */ 6014 amdgpu_reset_set_dpc_status(tmp_adev, false); 6015 amdgpu_ras_clear_err_state(tmp_adev); 6016 r = amdgpu_device_asic_init(tmp_adev); 6017 if (r) { 6018 dev_warn(tmp_adev->dev, "asic atom init failed!"); 6019 } else { 6020 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 6021 6022 r = amdgpu_device_ip_resume_phase1(tmp_adev); 6023 if (r) 6024 goto out; 6025 6026 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 6027 6028 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 6029 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job); 6030 6031 if (vram_lost) { 6032 dev_info( 6033 tmp_adev->dev, 6034 "VRAM is lost due to GPU reset!\n"); 6035 amdgpu_inc_vram_lost(tmp_adev); 6036 } 6037 6038 r = amdgpu_device_fw_loading(tmp_adev); 6039 if (r) 6040 return r; 6041 6042 r = amdgpu_xcp_restore_partition_mode( 6043 tmp_adev->xcp_mgr); 6044 if (r) 6045 goto out; 6046 6047 r = amdgpu_device_ip_resume_phase2(tmp_adev); 6048 if (r) 6049 goto out; 6050 6051 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 6052 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 6053 6054 r = amdgpu_device_ip_resume_phase3(tmp_adev); 6055 if (r) 6056 goto out; 6057 6058 if (vram_lost) 6059 amdgpu_device_fill_reset_magic(tmp_adev); 6060 6061 /* 6062 * Add this ASIC as tracked as reset was already 6063 * complete successfully. 6064 */ 6065 amdgpu_register_gpu_instance(tmp_adev); 6066 6067 if (!reset_context->hive && 6068 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 6069 amdgpu_xgmi_add_device(tmp_adev); 6070 6071 r = amdgpu_device_ip_late_init(tmp_adev); 6072 if (r) 6073 goto out; 6074 6075 r = amdgpu_userq_post_reset(tmp_adev, vram_lost); 6076 if (r) 6077 goto out; 6078 6079 drm_client_dev_resume(adev_to_drm(tmp_adev)); 6080 6081 /* 6082 * The GPU enters bad state once faulty pages 6083 * by ECC has reached the threshold, and ras 6084 * recovery is scheduled next. 
So add one check 6085 * here to break recovery if it indeed exceeds 6086 * bad page threshold, and remind user to 6087 * retire this GPU or setting one bigger 6088 * bad_page_threshold value to fix this once 6089 * probing driver again. 6090 */ 6091 if (!amdgpu_ras_is_rma(tmp_adev)) { 6092 /* must succeed. */ 6093 amdgpu_ras_resume(tmp_adev); 6094 } else { 6095 r = -EINVAL; 6096 goto out; 6097 } 6098 6099 /* Update PSP FW topology after reset */ 6100 if (reset_context->hive && 6101 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 6102 r = amdgpu_xgmi_update_topology( 6103 reset_context->hive, tmp_adev); 6104 } 6105 } 6106 6107 out: 6108 if (!r) { 6109 /* IP init is complete now, set level as default */ 6110 amdgpu_set_init_level(tmp_adev, 6111 AMDGPU_INIT_LEVEL_DEFAULT); 6112 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 6113 r = amdgpu_ib_ring_tests(tmp_adev); 6114 if (r) { 6115 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 6116 r = -EAGAIN; 6117 goto end; 6118 } 6119 } 6120 6121 if (r) 6122 tmp_adev->asic_reset_res = r; 6123 } 6124 6125 end: 6126 return r; 6127 } 6128 6129 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 6130 struct amdgpu_reset_context *reset_context) 6131 { 6132 struct amdgpu_device *tmp_adev = NULL; 6133 bool need_full_reset, skip_hw_reset; 6134 int r = 0; 6135 6136 /* Try reset handler method first */ 6137 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 6138 reset_list); 6139 6140 reset_context->reset_device_list = device_list_handle; 6141 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 6142 /* If reset handler not implemented, continue; otherwise return */ 6143 if (r == -EOPNOTSUPP) 6144 r = 0; 6145 else 6146 return r; 6147 6148 /* Reset handler not implemented, use the default method */ 6149 need_full_reset = 6150 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 6151 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 6152 6153 /* 6154 * ASIC reset has to be done on all XGMI hive nodes ASAP 6155 * to allow proper links negotiation in FW (within 1 sec) 6156 */ 6157 if (!skip_hw_reset && need_full_reset) { 6158 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6159 /* For XGMI run all resets in parallel to speed up the process */ 6160 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 6161 if (!queue_work(system_unbound_wq, 6162 &tmp_adev->xgmi_reset_work)) 6163 r = -EALREADY; 6164 } else 6165 r = amdgpu_asic_reset(tmp_adev); 6166 6167 if (r) { 6168 dev_err(tmp_adev->dev, 6169 "ASIC reset failed with error, %d for drm dev, %s", 6170 r, adev_to_drm(tmp_adev)->unique); 6171 goto out; 6172 } 6173 } 6174 6175 /* For XGMI wait for all resets to complete before proceed */ 6176 if (!r) { 6177 list_for_each_entry(tmp_adev, device_list_handle, 6178 reset_list) { 6179 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 6180 flush_work(&tmp_adev->xgmi_reset_work); 6181 r = tmp_adev->asic_reset_res; 6182 if (r) 6183 break; 6184 } 6185 } 6186 } 6187 } 6188 6189 if (!r && amdgpu_ras_intr_triggered()) { 6190 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6191 amdgpu_ras_reset_error_count(tmp_adev, 6192 AMDGPU_RAS_BLOCK__MMHUB); 6193 } 6194 6195 amdgpu_ras_intr_cleared(); 6196 } 6197 6198 r = amdgpu_device_reinit_after_reset(reset_context); 6199 if (r == -EAGAIN) 6200 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 6201 else 6202 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 6203 6204 out: 6205 return r; 6206 } 6207 6208 static void 
amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 6209 { 6210 6211 switch (amdgpu_asic_reset_method(adev)) { 6212 case AMD_RESET_METHOD_MODE1: 6213 case AMD_RESET_METHOD_LINK: 6214 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 6215 break; 6216 case AMD_RESET_METHOD_MODE2: 6217 adev->mp1_state = PP_MP1_STATE_RESET; 6218 break; 6219 default: 6220 adev->mp1_state = PP_MP1_STATE_NONE; 6221 break; 6222 } 6223 } 6224 6225 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 6226 { 6227 amdgpu_vf_error_trans_all(adev); 6228 adev->mp1_state = PP_MP1_STATE_NONE; 6229 } 6230 6231 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 6232 { 6233 struct pci_dev *p = NULL; 6234 6235 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 6236 adev->pdev->bus->number, 1); 6237 if (p) { 6238 pm_runtime_enable(&(p->dev)); 6239 pm_runtime_resume(&(p->dev)); 6240 } 6241 6242 pci_dev_put(p); 6243 } 6244 6245 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 6246 { 6247 enum amd_reset_method reset_method; 6248 struct pci_dev *p = NULL; 6249 u64 expires; 6250 6251 /* 6252 * For now, only BACO and mode1 reset are confirmed 6253 * to suffer the audio issue without proper suspended. 6254 */ 6255 reset_method = amdgpu_asic_reset_method(adev); 6256 if ((reset_method != AMD_RESET_METHOD_BACO) && 6257 (reset_method != AMD_RESET_METHOD_MODE1)) 6258 return -EINVAL; 6259 6260 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 6261 adev->pdev->bus->number, 1); 6262 if (!p) 6263 return -ENODEV; 6264 6265 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 6266 if (!expires) 6267 /* 6268 * If we cannot get the audio device autosuspend delay, 6269 * a fixed 4S interval will be used. Considering 3S is 6270 * the audio controller default autosuspend delay setting. 6271 * 4S used here is guaranteed to cover that. 6272 */ 6273 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 6274 6275 while (!pm_runtime_status_suspended(&(p->dev))) { 6276 if (!pm_runtime_suspend(&(p->dev))) 6277 break; 6278 6279 if (expires < ktime_get_mono_fast_ns()) { 6280 dev_warn(adev->dev, "failed to suspend display audio\n"); 6281 pci_dev_put(p); 6282 /* TODO: abort the succeeding gpu reset? */ 6283 return -ETIMEDOUT; 6284 } 6285 } 6286 6287 pm_runtime_disable(&(p->dev)); 6288 6289 pci_dev_put(p); 6290 return 0; 6291 } 6292 6293 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 6294 { 6295 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 6296 6297 #if defined(CONFIG_DEBUG_FS) 6298 if (!amdgpu_sriov_vf(adev)) 6299 cancel_work(&adev->reset_work); 6300 #endif 6301 cancel_work(&adev->userq_reset_work); 6302 6303 if (adev->kfd.dev) 6304 cancel_work(&adev->kfd.reset_work); 6305 6306 if (amdgpu_sriov_vf(adev)) 6307 cancel_work(&adev->virt.flr_work); 6308 6309 if (con && adev->ras_enabled) 6310 cancel_work(&con->recovery_work); 6311 6312 } 6313 6314 static int amdgpu_device_health_check(struct list_head *device_list_handle) 6315 { 6316 struct amdgpu_device *tmp_adev; 6317 int ret = 0; 6318 6319 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6320 ret |= amdgpu_device_bus_status_check(tmp_adev); 6321 } 6322 6323 return ret; 6324 } 6325 6326 static void amdgpu_device_recovery_prepare(struct amdgpu_device *adev, 6327 struct list_head *device_list, 6328 struct amdgpu_hive_info *hive) 6329 { 6330 struct amdgpu_device *tmp_adev = NULL; 6331 6332 /* 6333 * Build list of devices to reset. 
6334 * In case we are in XGMI hive mode, resort the device list 6335 * to put adev in the 1st position. 6336 */ 6337 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) { 6338 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 6339 list_add_tail(&tmp_adev->reset_list, device_list); 6340 if (adev->shutdown) 6341 tmp_adev->shutdown = true; 6342 if (amdgpu_reset_in_dpc(adev)) 6343 tmp_adev->pcie_reset_ctx.in_link_reset = true; 6344 } 6345 if (!list_is_first(&adev->reset_list, device_list)) 6346 list_rotate_to_front(&adev->reset_list, device_list); 6347 } else { 6348 list_add_tail(&adev->reset_list, device_list); 6349 } 6350 } 6351 6352 static void amdgpu_device_recovery_get_reset_lock(struct amdgpu_device *adev, 6353 struct list_head *device_list) 6354 { 6355 struct amdgpu_device *tmp_adev = NULL; 6356 6357 if (list_empty(device_list)) 6358 return; 6359 tmp_adev = 6360 list_first_entry(device_list, struct amdgpu_device, reset_list); 6361 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 6362 } 6363 6364 static void amdgpu_device_recovery_put_reset_lock(struct amdgpu_device *adev, 6365 struct list_head *device_list) 6366 { 6367 struct amdgpu_device *tmp_adev = NULL; 6368 6369 if (list_empty(device_list)) 6370 return; 6371 tmp_adev = 6372 list_first_entry(device_list, struct amdgpu_device, reset_list); 6373 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 6374 } 6375 6376 static void amdgpu_device_halt_activities(struct amdgpu_device *adev, 6377 struct amdgpu_job *job, 6378 struct amdgpu_reset_context *reset_context, 6379 struct list_head *device_list, 6380 struct amdgpu_hive_info *hive, 6381 bool need_emergency_restart) 6382 { 6383 struct amdgpu_device *tmp_adev = NULL; 6384 int i; 6385 6386 /* block all schedulers and reset given job's ring */ 6387 list_for_each_entry(tmp_adev, device_list, reset_list) { 6388 amdgpu_device_set_mp1_state(tmp_adev); 6389 6390 /* 6391 * Try to put the audio codec into suspend state 6392 * before gpu reset started. 6393 * 6394 * Due to the power domain of the graphics device 6395 * is shared with AZ power domain. Without this, 6396 * we may change the audio hardware from behind 6397 * the audio driver's back. That will trigger 6398 * some audio codec errors. 6399 */ 6400 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 6401 tmp_adev->pcie_reset_ctx.audio_suspended = true; 6402 6403 amdgpu_ras_set_error_query_ready(tmp_adev, false); 6404 6405 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 6406 6407 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context); 6408 6409 /* 6410 * Mark these ASICs to be reset as untracked first 6411 * And add them back after reset completed 6412 */ 6413 amdgpu_unregister_gpu_instance(tmp_adev); 6414 6415 drm_client_dev_suspend(adev_to_drm(tmp_adev)); 6416 6417 /* disable ras on ALL IPs */ 6418 if (!need_emergency_restart && !amdgpu_reset_in_dpc(adev) && 6419 amdgpu_device_ip_need_full_reset(tmp_adev)) 6420 amdgpu_ras_suspend(tmp_adev); 6421 6422 amdgpu_userq_pre_reset(tmp_adev); 6423 6424 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6425 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6426 6427 if (!amdgpu_ring_sched_ready(ring)) 6428 continue; 6429 6430 drm_sched_stop(&ring->sched, job ? 
&job->base : NULL); 6431 6432 if (need_emergency_restart) 6433 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 6434 } 6435 atomic_inc(&tmp_adev->gpu_reset_counter); 6436 } 6437 } 6438 6439 static int amdgpu_device_asic_reset(struct amdgpu_device *adev, 6440 struct list_head *device_list, 6441 struct amdgpu_reset_context *reset_context) 6442 { 6443 struct amdgpu_device *tmp_adev = NULL; 6444 int retry_limit = AMDGPU_MAX_RETRY_LIMIT; 6445 int r = 0; 6446 6447 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 6448 list_for_each_entry(tmp_adev, device_list, reset_list) { 6449 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 6450 /*TODO Should we stop ?*/ 6451 if (r) { 6452 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 6453 r, adev_to_drm(tmp_adev)->unique); 6454 tmp_adev->asic_reset_res = r; 6455 } 6456 } 6457 6458 /* Actual ASIC resets if needed.*/ 6459 /* Host driver will handle XGMI hive reset for SRIOV */ 6460 if (amdgpu_sriov_vf(adev)) { 6461 6462 /* Bail out of reset early */ 6463 if (amdgpu_ras_is_rma(adev)) 6464 return -ENODEV; 6465 6466 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { 6467 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); 6468 amdgpu_ras_set_fed(adev, true); 6469 set_bit(AMDGPU_HOST_FLR, &reset_context->flags); 6470 } 6471 6472 r = amdgpu_device_reset_sriov(adev, reset_context); 6473 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) { 6474 amdgpu_virt_release_full_gpu(adev, true); 6475 goto retry; 6476 } 6477 if (r) 6478 adev->asic_reset_res = r; 6479 } else { 6480 r = amdgpu_do_asic_reset(device_list, reset_context); 6481 if (r && r == -EAGAIN) 6482 goto retry; 6483 } 6484 6485 list_for_each_entry(tmp_adev, device_list, reset_list) { 6486 /* 6487 * Drop any pending non scheduler resets queued before reset is done. 6488 * Any reset scheduled after this point would be valid. Scheduler resets 6489 * were already dropped during drm_sched_stop and no new ones can come 6490 * in before drm_sched_start. 6491 */ 6492 amdgpu_device_stop_pending_resets(tmp_adev); 6493 } 6494 6495 return r; 6496 } 6497 6498 static int amdgpu_device_sched_resume(struct list_head *device_list, 6499 struct amdgpu_reset_context *reset_context, 6500 bool job_signaled) 6501 { 6502 struct amdgpu_device *tmp_adev = NULL; 6503 int i, r = 0; 6504 6505 /* Post ASIC reset for all devs .*/ 6506 list_for_each_entry(tmp_adev, device_list, reset_list) { 6507 6508 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6509 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6510 6511 if (!amdgpu_ring_sched_ready(ring)) 6512 continue; 6513 6514 drm_sched_start(&ring->sched, 0); 6515 } 6516 6517 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 6518 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 6519 6520 if (tmp_adev->asic_reset_res) { 6521 /* bad news, how to tell it to userspace ? 
6522 * for ras error, we should report GPU bad status instead of 6523 * reset failure 6524 */ 6525 if (reset_context->src != AMDGPU_RESET_SRC_RAS || 6526 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) 6527 dev_info( 6528 tmp_adev->dev, 6529 "GPU reset(%d) failed with error %d \n", 6530 atomic_read( 6531 &tmp_adev->gpu_reset_counter), 6532 tmp_adev->asic_reset_res); 6533 amdgpu_vf_error_put(tmp_adev, 6534 AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, 6535 tmp_adev->asic_reset_res); 6536 if (!r) 6537 r = tmp_adev->asic_reset_res; 6538 tmp_adev->asic_reset_res = 0; 6539 } else { 6540 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", 6541 atomic_read(&tmp_adev->gpu_reset_counter)); 6542 if (amdgpu_acpi_smart_shift_update(tmp_adev, 6543 AMDGPU_SS_DEV_D0)) 6544 dev_warn(tmp_adev->dev, 6545 "smart shift update failed\n"); 6546 } 6547 } 6548 6549 return r; 6550 } 6551 6552 static void amdgpu_device_gpu_resume(struct amdgpu_device *adev, 6553 struct list_head *device_list, 6554 bool need_emergency_restart) 6555 { 6556 struct amdgpu_device *tmp_adev = NULL; 6557 6558 list_for_each_entry(tmp_adev, device_list, reset_list) { 6559 /* unlock kfd: SRIOV would do it separately */ 6560 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 6561 amdgpu_amdkfd_post_reset(tmp_adev); 6562 6563 /* kfd_post_reset will do nothing if kfd device is not initialized, 6564 * need to bring up kfd here if it's not be initialized before 6565 */ 6566 if (!adev->kfd.init_complete) 6567 amdgpu_amdkfd_device_init(adev); 6568 6569 if (tmp_adev->pcie_reset_ctx.audio_suspended) 6570 amdgpu_device_resume_display_audio(tmp_adev); 6571 6572 amdgpu_device_unset_mp1_state(tmp_adev); 6573 6574 amdgpu_ras_set_error_query_ready(tmp_adev, true); 6575 6576 } 6577 } 6578 6579 6580 /** 6581 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 6582 * 6583 * @adev: amdgpu_device pointer 6584 * @job: which job trigger hang 6585 * @reset_context: amdgpu reset context pointer 6586 * 6587 * Attempt to reset the GPU if it has hung (all asics). 6588 * Attempt to do soft-reset or full-reset and reinitialize Asic 6589 * Returns 0 for success or an error on failure. 6590 */ 6591 6592 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 6593 struct amdgpu_job *job, 6594 struct amdgpu_reset_context *reset_context) 6595 { 6596 struct list_head device_list; 6597 bool job_signaled = false; 6598 struct amdgpu_hive_info *hive = NULL; 6599 int r = 0; 6600 bool need_emergency_restart = false; 6601 6602 /* 6603 * If it reaches here because of hang/timeout and a RAS error is 6604 * detected at the same time, let RAS recovery take care of it. 6605 */ 6606 if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) && 6607 !amdgpu_sriov_vf(adev) && 6608 reset_context->src != AMDGPU_RESET_SRC_RAS) { 6609 dev_dbg(adev->dev, 6610 "Gpu recovery from source: %d yielding to RAS error recovery handling", 6611 reset_context->src); 6612 return 0; 6613 } 6614 6615 /* 6616 * Special case: RAS triggered and full reset isn't supported 6617 */ 6618 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 6619 6620 /* 6621 * Flush RAM to disk so that after reboot 6622 * the user can read log and see why the system rebooted. 6623 */ 6624 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 6625 amdgpu_ras_get_context(adev)->reboot) { 6626 dev_warn(adev->dev, "Emergency reboot."); 6627 6628 ksys_sync_helper(); 6629 emergency_restart(); 6630 } 6631 6632 dev_info(adev->dev, "GPU %s begin!. Source: %d\n", 6633 need_emergency_restart ? 
"jobs stop" : "reset", 6634 reset_context->src); 6635 6636 if (!amdgpu_sriov_vf(adev)) 6637 hive = amdgpu_get_xgmi_hive(adev); 6638 if (hive) 6639 mutex_lock(&hive->hive_lock); 6640 6641 reset_context->job = job; 6642 reset_context->hive = hive; 6643 INIT_LIST_HEAD(&device_list); 6644 6645 amdgpu_device_recovery_prepare(adev, &device_list, hive); 6646 6647 if (!amdgpu_sriov_vf(adev)) { 6648 r = amdgpu_device_health_check(&device_list); 6649 if (r) 6650 goto end_reset; 6651 } 6652 6653 /* Cannot be called after locking reset domain */ 6654 amdgpu_ras_pre_reset(adev, &device_list); 6655 6656 /* We need to lock reset domain only once both for XGMI and single device */ 6657 amdgpu_device_recovery_get_reset_lock(adev, &device_list); 6658 6659 amdgpu_device_halt_activities(adev, job, reset_context, &device_list, 6660 hive, need_emergency_restart); 6661 if (need_emergency_restart) 6662 goto skip_sched_resume; 6663 /* 6664 * Must check guilty signal here since after this point all old 6665 * HW fences are force signaled. 6666 * 6667 * job->base holds a reference to parent fence 6668 */ 6669 if (job && dma_fence_is_signaled(&job->hw_fence->base)) { 6670 job_signaled = true; 6671 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 6672 goto skip_hw_reset; 6673 } 6674 6675 r = amdgpu_device_asic_reset(adev, &device_list, reset_context); 6676 if (r) 6677 goto reset_unlock; 6678 skip_hw_reset: 6679 r = amdgpu_device_sched_resume(&device_list, reset_context, job_signaled); 6680 if (r) 6681 goto reset_unlock; 6682 skip_sched_resume: 6683 amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart); 6684 reset_unlock: 6685 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 6686 amdgpu_ras_post_reset(adev, &device_list); 6687 end_reset: 6688 if (hive) { 6689 mutex_unlock(&hive->hive_lock); 6690 amdgpu_put_xgmi_hive(hive); 6691 } 6692 6693 if (r) 6694 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 6695 6696 atomic_set(&adev->reset_domain->reset_res, r); 6697 6698 if (!r) { 6699 struct amdgpu_task_info *ti = NULL; 6700 6701 if (job) 6702 ti = amdgpu_vm_get_task_info_pasid(adev, job->pasid); 6703 6704 drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, 6705 ti ? &ti->task : NULL); 6706 6707 amdgpu_vm_put_task_info(ti); 6708 } 6709 6710 return r; 6711 } 6712 6713 /** 6714 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner 6715 * 6716 * @adev: amdgpu_device pointer 6717 * @speed: pointer to the speed of the link 6718 * @width: pointer to the width of the link 6719 * 6720 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6721 * first physical partner to an AMD dGPU. 6722 * This will exclude any virtual switches and links. 
6723 */ 6724 static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev, 6725 enum pci_bus_speed *speed, 6726 enum pcie_link_width *width) 6727 { 6728 struct pci_dev *parent = adev->pdev; 6729 6730 if (!speed || !width) 6731 return; 6732 6733 *speed = PCI_SPEED_UNKNOWN; 6734 *width = PCIE_LNK_WIDTH_UNKNOWN; 6735 6736 if (amdgpu_device_pcie_dynamic_switching_supported(adev)) { 6737 while ((parent = pci_upstream_bridge(parent))) { 6738 /* skip upstream/downstream switches internal to dGPU*/ 6739 if (parent->vendor == PCI_VENDOR_ID_ATI) 6740 continue; 6741 *speed = pcie_get_speed_cap(parent); 6742 *width = pcie_get_width_cap(parent); 6743 break; 6744 } 6745 } else { 6746 /* use the current speeds rather than max if switching is not supported */ 6747 pcie_bandwidth_available(adev->pdev, NULL, speed, width); 6748 } 6749 } 6750 6751 /** 6752 * amdgpu_device_gpu_bandwidth - find the bandwidth of the GPU 6753 * 6754 * @adev: amdgpu_device pointer 6755 * @speed: pointer to the speed of the link 6756 * @width: pointer to the width of the link 6757 * 6758 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6759 * AMD dGPU which may be a virtual upstream bridge. 6760 */ 6761 static void amdgpu_device_gpu_bandwidth(struct amdgpu_device *adev, 6762 enum pci_bus_speed *speed, 6763 enum pcie_link_width *width) 6764 { 6765 struct pci_dev *parent = adev->pdev; 6766 6767 if (!speed || !width) 6768 return; 6769 6770 parent = pci_upstream_bridge(parent); 6771 if (parent && parent->vendor == PCI_VENDOR_ID_ATI) { 6772 /* use the upstream/downstream switches internal to dGPU */ 6773 *speed = pcie_get_speed_cap(parent); 6774 *width = pcie_get_width_cap(parent); 6775 while ((parent = pci_upstream_bridge(parent))) { 6776 if (parent->vendor == PCI_VENDOR_ID_ATI) { 6777 /* use the upstream/downstream switches internal to dGPU */ 6778 *speed = pcie_get_speed_cap(parent); 6779 *width = pcie_get_width_cap(parent); 6780 } 6781 } 6782 } else { 6783 /* use the device itself */ 6784 *speed = pcie_get_speed_cap(adev->pdev); 6785 *width = pcie_get_width_cap(adev->pdev); 6786 } 6787 } 6788 6789 /** 6790 * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot 6791 * 6792 * @adev: amdgpu_device pointer 6793 * 6794 * Fetches and stores in the driver the PCIE capabilities (gen speed 6795 * and lanes) of the slot the device is in. Handles APUs and 6796 * virtualized environments where PCIE config space may not be available. 
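 *
 * The platform-side caps come from amdgpu_device_partner_bandwidth() above,
 * which walks upstream from the dGPU endpoint and skips the ATI-owned
 * SWUS/SWDS ports internal to the board (illustrative topology: GPU endpoint
 * -> SWDS -> SWUS -> root port); the GPU-side caps come from
 * amdgpu_device_gpu_bandwidth(). Note that the amdgpu_pcie_gen_cap and
 * amdgpu_pcie_lane_cap module parameter overrides, when non-zero, take
 * precedence over anything detected here.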
6797 */ 6798 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 6799 { 6800 enum pci_bus_speed speed_cap, platform_speed_cap; 6801 enum pcie_link_width platform_link_width, link_width; 6802 6803 if (amdgpu_pcie_gen_cap) 6804 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 6805 6806 if (amdgpu_pcie_lane_cap) 6807 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 6808 6809 /* covers APUs as well */ 6810 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 6811 if (adev->pm.pcie_gen_mask == 0) 6812 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 6813 if (adev->pm.pcie_mlw_mask == 0) 6814 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 6815 return; 6816 } 6817 6818 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 6819 return; 6820 6821 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap, 6822 &platform_link_width); 6823 amdgpu_device_gpu_bandwidth(adev, &speed_cap, &link_width); 6824 6825 if (adev->pm.pcie_gen_mask == 0) { 6826 /* asic caps */ 6827 if (speed_cap == PCI_SPEED_UNKNOWN) { 6828 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6829 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6830 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6831 } else { 6832 if (speed_cap == PCIE_SPEED_32_0GT) 6833 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6834 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6835 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6836 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6837 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 6838 else if (speed_cap == PCIE_SPEED_16_0GT) 6839 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6840 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6841 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6842 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 6843 else if (speed_cap == PCIE_SPEED_8_0GT) 6844 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6845 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6846 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6847 else if (speed_cap == PCIE_SPEED_5_0GT) 6848 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6849 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 6850 else 6851 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 6852 } 6853 /* platform caps */ 6854 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 6855 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6856 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6857 } else { 6858 if (platform_speed_cap == PCIE_SPEED_32_0GT) 6859 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6860 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6861 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6862 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6863 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 6864 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 6865 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6866 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6867 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6868 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 6869 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 6870 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6871 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6872 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 6873 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 6874 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6875 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6876 else 6877 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 6878 6879 } 6880 } 6881 if (adev->pm.pcie_mlw_mask == 0) { 6882 /* asic caps */ 6883 if (link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6884 
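			/* Lane width could not be read; assume the default ASIC lane-width set. */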
adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_ASIC_PCIE_MLW_MASK; 6885 } else { 6886 switch (link_width) { 6887 case PCIE_LNK_X32: 6888 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X32 | 6889 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6890 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6891 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6892 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6893 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6894 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6895 break; 6896 case PCIE_LNK_X16: 6897 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6898 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6899 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6900 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6901 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6902 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6903 break; 6904 case PCIE_LNK_X12: 6905 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6906 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6907 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6908 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6909 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6910 break; 6911 case PCIE_LNK_X8: 6912 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6913 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6914 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6915 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6916 break; 6917 case PCIE_LNK_X4: 6918 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6919 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6920 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6921 break; 6922 case PCIE_LNK_X2: 6923 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6924 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6925 break; 6926 case PCIE_LNK_X1: 6927 adev->pm.pcie_mlw_mask |= CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1; 6928 break; 6929 default: 6930 break; 6931 } 6932 } 6933 /* platform caps */ 6934 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6935 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 6936 } else { 6937 switch (platform_link_width) { 6938 case PCIE_LNK_X32: 6939 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 6940 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6941 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6942 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6943 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6944 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6945 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6946 break; 6947 case PCIE_LNK_X16: 6948 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6949 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6950 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6951 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6952 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6953 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6954 break; 6955 case PCIE_LNK_X12: 6956 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6957 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6958 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6959 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6960 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6961 break; 6962 case PCIE_LNK_X8: 6963 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6964 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6965 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6966 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6967 break; 6968 case PCIE_LNK_X4: 6969 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6970 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6971 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6972 break; 6973 case PCIE_LNK_X2: 6974 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6975 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6976 break; 6977 case PCIE_LNK_X1: 6978 adev->pm.pcie_mlw_mask |= CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 6979 break; 6980 
default: 6981 break; 6982 } 6983 } 6984 } 6985 } 6986 6987 /** 6988 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 6989 * 6990 * @adev: amdgpu_device pointer 6991 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 6992 * 6993 * Return true if @peer_adev can access (DMA) @adev through the PCIe 6994 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 6995 * @peer_adev. 6996 */ 6997 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 6998 struct amdgpu_device *peer_adev) 6999 { 7000 #ifdef CONFIG_HSA_AMD_P2P 7001 bool p2p_access = 7002 !adev->gmc.xgmi.connected_to_cpu && 7003 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 7004 if (!p2p_access) 7005 dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n", 7006 pci_name(peer_adev->pdev)); 7007 7008 bool is_large_bar = adev->gmc.visible_vram_size && 7009 adev->gmc.real_vram_size == adev->gmc.visible_vram_size; 7010 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev); 7011 7012 if (!p2p_addressable) { 7013 uint64_t address_mask = peer_adev->dev->dma_mask ? 7014 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 7015 resource_size_t aper_limit = 7016 adev->gmc.aper_base + adev->gmc.aper_size - 1; 7017 7018 p2p_addressable = !(adev->gmc.aper_base & address_mask || 7019 aper_limit & address_mask); 7020 } 7021 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable; 7022 #else 7023 return false; 7024 #endif 7025 } 7026 7027 int amdgpu_device_baco_enter(struct amdgpu_device *adev) 7028 { 7029 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 7030 7031 if (!amdgpu_device_supports_baco(adev)) 7032 return -ENOTSUPP; 7033 7034 if (ras && adev->ras_enabled && 7035 adev->nbio.funcs->enable_doorbell_interrupt) 7036 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 7037 7038 return amdgpu_dpm_baco_enter(adev); 7039 } 7040 7041 int amdgpu_device_baco_exit(struct amdgpu_device *adev) 7042 { 7043 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 7044 int ret = 0; 7045 7046 if (!amdgpu_device_supports_baco(adev)) 7047 return -ENOTSUPP; 7048 7049 ret = amdgpu_dpm_baco_exit(adev); 7050 if (ret) 7051 return ret; 7052 7053 if (ras && adev->ras_enabled && 7054 adev->nbio.funcs->enable_doorbell_interrupt) 7055 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 7056 7057 if (amdgpu_passthrough(adev) && adev->nbio.funcs && 7058 adev->nbio.funcs->clear_doorbell_interrupt) 7059 adev->nbio.funcs->clear_doorbell_interrupt(adev); 7060 7061 return 0; 7062 } 7063 7064 /** 7065 * amdgpu_pci_error_detected - Called when a PCI error is detected. 7066 * @pdev: PCI device struct 7067 * @state: PCI channel state 7068 * 7069 * Description: Called when a PCI error is detected. 7070 * 7071 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
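 *
 * These amdgpu_pci_* callbacks are hooked into the PCI core through the
 * driver's struct pci_error_handlers; roughly (a sketch, see amdgpu_drv.c
 * for the actual table):
 *
 *	static struct pci_error_handlers amdgpu_pci_err_handler = {
 *		.error_detected	= amdgpu_pci_error_detected,
 *		.mmio_enabled	= amdgpu_pci_mmio_enabled,
 *		.slot_reset	= amdgpu_pci_slot_reset,
 *		.resume		= amdgpu_pci_resume,
 *	};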
7072 */ 7073 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 7074 { 7075 struct drm_device *dev = pci_get_drvdata(pdev); 7076 struct amdgpu_device *adev = drm_to_adev(dev); 7077 struct amdgpu_hive_info *hive __free(xgmi_put_hive) = 7078 amdgpu_get_xgmi_hive(adev); 7079 struct amdgpu_reset_context reset_context; 7080 struct list_head device_list; 7081 7082 dev_info(adev->dev, "PCI error: detected callback!!\n"); 7083 7084 adev->pci_channel_state = state; 7085 7086 switch (state) { 7087 case pci_channel_io_normal: 7088 dev_info(adev->dev, "pci_channel_io_normal: state(%d)!!\n", state); 7089 return PCI_ERS_RESULT_CAN_RECOVER; 7090 case pci_channel_io_frozen: 7091 /* Fatal error, prepare for slot reset */ 7092 dev_info(adev->dev, "pci_channel_io_frozen: state(%d)!!\n", state); 7093 if (hive) { 7094 /* Hive devices should be able to support FW based 7095 * link reset on other devices, if not return. 7096 */ 7097 if (!amdgpu_dpm_is_link_reset_supported(adev)) { 7098 dev_warn(adev->dev, 7099 "No support for XGMI hive yet...\n"); 7100 return PCI_ERS_RESULT_DISCONNECT; 7101 } 7102 /* Set dpc status only if device is part of hive 7103 * Non-hive devices should be able to recover after 7104 * link reset. 7105 */ 7106 amdgpu_reset_set_dpc_status(adev, true); 7107 7108 mutex_lock(&hive->hive_lock); 7109 } 7110 memset(&reset_context, 0, sizeof(reset_context)); 7111 INIT_LIST_HEAD(&device_list); 7112 7113 amdgpu_device_recovery_prepare(adev, &device_list, hive); 7114 amdgpu_device_recovery_get_reset_lock(adev, &device_list); 7115 amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list, 7116 hive, false); 7117 if (hive) 7118 mutex_unlock(&hive->hive_lock); 7119 return PCI_ERS_RESULT_NEED_RESET; 7120 case pci_channel_io_perm_failure: 7121 /* Permanent error, prepare for device removal */ 7122 dev_info(adev->dev, "pci_channel_io_perm_failure: state(%d)!!\n", state); 7123 return PCI_ERS_RESULT_DISCONNECT; 7124 } 7125 7126 return PCI_ERS_RESULT_NEED_RESET; 7127 } 7128 7129 /** 7130 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 7131 * @pdev: pointer to PCI device 7132 */ 7133 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 7134 { 7135 struct drm_device *dev = pci_get_drvdata(pdev); 7136 struct amdgpu_device *adev = drm_to_adev(dev); 7137 7138 dev_info(adev->dev, "PCI error: mmio enabled callback!!\n"); 7139 7140 /* TODO - dump whatever for debugging purposes */ 7141 7142 /* This called only if amdgpu_pci_error_detected returns 7143 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 7144 * works, no need to reset slot. 7145 */ 7146 7147 return PCI_ERS_RESULT_RECOVERED; 7148 } 7149 7150 /** 7151 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 7152 * @pdev: PCI device struct 7153 * 7154 * Description: This routine is called by the pci error recovery 7155 * code after the PCI slot has been reset, just before we 7156 * should resume normal operations. 
7157 */ 7158 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 7159 { 7160 struct drm_device *dev = pci_get_drvdata(pdev); 7161 struct amdgpu_device *adev = drm_to_adev(dev); 7162 struct amdgpu_reset_context reset_context; 7163 struct amdgpu_device *tmp_adev; 7164 struct amdgpu_hive_info *hive; 7165 struct list_head device_list; 7166 struct pci_dev *link_dev; 7167 int r = 0, i, timeout; 7168 u32 memsize; 7169 u16 status; 7170 7171 dev_info(adev->dev, "PCI error: slot reset callback!!\n"); 7172 7173 memset(&reset_context, 0, sizeof(reset_context)); 7174 7175 if (adev->pcie_reset_ctx.swus) 7176 link_dev = adev->pcie_reset_ctx.swus; 7177 else 7178 link_dev = adev->pdev; 7179 /* wait for asic to come out of reset, timeout = 10s */ 7180 timeout = 10000; 7181 do { 7182 usleep_range(10000, 10500); 7183 r = pci_read_config_word(link_dev, PCI_VENDOR_ID, &status); 7184 timeout -= 10; 7185 } while (timeout > 0 && (status != PCI_VENDOR_ID_ATI) && 7186 (status != PCI_VENDOR_ID_AMD)); 7187 7188 if ((status != PCI_VENDOR_ID_ATI) && (status != PCI_VENDOR_ID_AMD)) { 7189 r = -ETIME; 7190 goto out; 7191 } 7192 7193 amdgpu_device_load_switch_state(adev); 7194 /* Restore PCI confspace */ 7195 amdgpu_device_load_pci_state(pdev); 7196 7197 /* confirm ASIC came out of reset */ 7198 for (i = 0; i < adev->usec_timeout; i++) { 7199 memsize = amdgpu_asic_get_config_memsize(adev); 7200 7201 if (memsize != 0xffffffff) 7202 break; 7203 udelay(1); 7204 } 7205 if (memsize == 0xffffffff) { 7206 r = -ETIME; 7207 goto out; 7208 } 7209 7210 reset_context.method = AMD_RESET_METHOD_NONE; 7211 reset_context.reset_req_dev = adev; 7212 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 7213 set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags); 7214 INIT_LIST_HEAD(&device_list); 7215 7216 hive = amdgpu_get_xgmi_hive(adev); 7217 if (hive) { 7218 mutex_lock(&hive->hive_lock); 7219 reset_context.hive = hive; 7220 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 7221 tmp_adev->pcie_reset_ctx.in_link_reset = true; 7222 list_add_tail(&tmp_adev->reset_list, &device_list); 7223 } 7224 } else { 7225 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 7226 list_add_tail(&adev->reset_list, &device_list); 7227 } 7228 7229 r = amdgpu_device_asic_reset(adev, &device_list, &reset_context); 7230 out: 7231 if (!r) { 7232 if (amdgpu_device_cache_pci_state(adev->pdev)) 7233 pci_restore_state(adev->pdev); 7234 dev_info(adev->dev, "PCIe error recovery succeeded\n"); 7235 } else { 7236 dev_err(adev->dev, "PCIe error recovery failed, err:%d\n", r); 7237 if (hive) { 7238 list_for_each_entry(tmp_adev, &device_list, reset_list) 7239 amdgpu_device_unset_mp1_state(tmp_adev); 7240 } 7241 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 7242 } 7243 7244 if (hive) { 7245 mutex_unlock(&hive->hive_lock); 7246 amdgpu_put_xgmi_hive(hive); 7247 } 7248 7249 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 7250 } 7251 7252 /** 7253 * amdgpu_pci_resume() - resume normal ops after PCI reset 7254 * @pdev: pointer to PCI device 7255 * 7256 * Called when the error recovery driver tells us that its 7257 * OK to resume normal operation. 
7258 */ 7259 void amdgpu_pci_resume(struct pci_dev *pdev) 7260 { 7261 struct drm_device *dev = pci_get_drvdata(pdev); 7262 struct amdgpu_device *adev = drm_to_adev(dev); 7263 struct list_head device_list; 7264 struct amdgpu_hive_info *hive = NULL; 7265 struct amdgpu_device *tmp_adev = NULL; 7266 7267 dev_info(adev->dev, "PCI error: resume callback!!\n"); 7268 7269 /* Only continue execution for the case of pci_channel_io_frozen */ 7270 if (adev->pci_channel_state != pci_channel_io_frozen) 7271 return; 7272 7273 INIT_LIST_HEAD(&device_list); 7274 7275 hive = amdgpu_get_xgmi_hive(adev); 7276 if (hive) { 7277 mutex_lock(&hive->hive_lock); 7278 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 7279 tmp_adev->pcie_reset_ctx.in_link_reset = false; 7280 list_add_tail(&tmp_adev->reset_list, &device_list); 7281 } 7282 } else 7283 list_add_tail(&adev->reset_list, &device_list); 7284 7285 amdgpu_device_sched_resume(&device_list, NULL, NULL); 7286 amdgpu_device_gpu_resume(adev, &device_list, false); 7287 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 7288 7289 if (hive) { 7290 mutex_unlock(&hive->hive_lock); 7291 amdgpu_put_xgmi_hive(hive); 7292 } 7293 } 7294 7295 static void amdgpu_device_cache_switch_state(struct amdgpu_device *adev) 7296 { 7297 struct pci_dev *swus, *swds; 7298 int r; 7299 7300 swds = pci_upstream_bridge(adev->pdev); 7301 if (!swds || swds->vendor != PCI_VENDOR_ID_ATI || 7302 pci_pcie_type(swds) != PCI_EXP_TYPE_DOWNSTREAM) 7303 return; 7304 swus = pci_upstream_bridge(swds); 7305 if (!swus || 7306 (swus->vendor != PCI_VENDOR_ID_ATI && 7307 swus->vendor != PCI_VENDOR_ID_AMD) || 7308 pci_pcie_type(swus) != PCI_EXP_TYPE_UPSTREAM) 7309 return; 7310 7311 /* If already saved, return */ 7312 if (adev->pcie_reset_ctx.swus) 7313 return; 7314 /* Upstream bridge is ATI, assume it's SWUS/DS architecture */ 7315 r = pci_save_state(swds); 7316 if (r) 7317 return; 7318 adev->pcie_reset_ctx.swds_pcistate = pci_store_saved_state(swds); 7319 7320 r = pci_save_state(swus); 7321 if (r) 7322 return; 7323 adev->pcie_reset_ctx.swus_pcistate = pci_store_saved_state(swus); 7324 7325 adev->pcie_reset_ctx.swus = swus; 7326 } 7327 7328 static void amdgpu_device_load_switch_state(struct amdgpu_device *adev) 7329 { 7330 struct pci_dev *pdev; 7331 int r; 7332 7333 if (!adev->pcie_reset_ctx.swds_pcistate || 7334 !adev->pcie_reset_ctx.swus_pcistate) 7335 return; 7336 7337 pdev = adev->pcie_reset_ctx.swus; 7338 r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swus_pcistate); 7339 if (!r) { 7340 pci_restore_state(pdev); 7341 } else { 7342 dev_warn(adev->dev, "Failed to load SWUS state, err:%d\n", r); 7343 return; 7344 } 7345 7346 pdev = pci_upstream_bridge(adev->pdev); 7347 r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swds_pcistate); 7348 if (!r) 7349 pci_restore_state(pdev); 7350 else 7351 dev_warn(adev->dev, "Failed to load SWDS state, err:%d\n", r); 7352 } 7353 7354 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 7355 { 7356 struct drm_device *dev = pci_get_drvdata(pdev); 7357 struct amdgpu_device *adev = drm_to_adev(dev); 7358 int r; 7359 7360 if (amdgpu_sriov_vf(adev)) 7361 return false; 7362 7363 r = pci_save_state(pdev); 7364 if (!r) { 7365 kfree(adev->pci_state); 7366 7367 adev->pci_state = pci_store_saved_state(pdev); 7368 7369 if (!adev->pci_state) { 7370 dev_err(adev->dev, "Failed to store PCI saved state"); 7371 return false; 7372 } 7373 } else { 7374 dev_warn(adev->dev, "Failed to save PCI state, err:%d\n", r); 7375 return false; 7376 } 7377 7378 
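	/*
	 * Also cache the config space of the dGPU's internal SWUS/SWDS
	 * bridges (if present) so they can be restored ahead of the GPU
	 * itself during link/DPC reset recovery, see
	 * amdgpu_device_load_switch_state().
	 */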
amdgpu_device_cache_switch_state(adev); 7379 7380 return true; 7381 } 7382 7383 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 7384 { 7385 struct drm_device *dev = pci_get_drvdata(pdev); 7386 struct amdgpu_device *adev = drm_to_adev(dev); 7387 int r; 7388 7389 if (!adev->pci_state) 7390 return false; 7391 7392 r = pci_load_saved_state(pdev, adev->pci_state); 7393 7394 if (!r) { 7395 pci_restore_state(pdev); 7396 } else { 7397 dev_warn(adev->dev, "Failed to load PCI state, err:%d\n", r); 7398 return false; 7399 } 7400 7401 return true; 7402 } 7403 7404 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 7405 struct amdgpu_ring *ring) 7406 { 7407 #ifdef CONFIG_X86_64 7408 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 7409 return; 7410 #endif 7411 if (adev->gmc.xgmi.connected_to_cpu) 7412 return; 7413 7414 if (ring && ring->funcs->emit_hdp_flush) { 7415 amdgpu_ring_emit_hdp_flush(ring); 7416 return; 7417 } 7418 7419 if (!ring && amdgpu_sriov_runtime(adev)) { 7420 if (!amdgpu_kiq_hdp_flush(adev)) 7421 return; 7422 } 7423 7424 amdgpu_hdp_flush(adev, ring); 7425 } 7426 7427 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 7428 struct amdgpu_ring *ring) 7429 { 7430 #ifdef CONFIG_X86_64 7431 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 7432 return; 7433 #endif 7434 if (adev->gmc.xgmi.connected_to_cpu) 7435 return; 7436 7437 amdgpu_hdp_invalidate(adev, ring); 7438 } 7439 7440 int amdgpu_in_reset(struct amdgpu_device *adev) 7441 { 7442 return atomic_read(&adev->reset_domain->in_gpu_reset); 7443 } 7444 7445 /** 7446 * amdgpu_device_halt() - bring hardware to some kind of halt state 7447 * 7448 * @adev: amdgpu_device pointer 7449 * 7450 * Bring hardware to some kind of halt state so that no one can touch it 7451 * any more. It will help to maintain error context when error occurred. 7452 * Compare to a simple hang, the system will keep stable at least for SSH 7453 * access. Then it should be trivial to inspect the hardware state and 7454 * see what's going on. Implemented as following: 7455 * 7456 * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc), 7457 * clears all CPU mappings to device, disallows remappings through page faults 7458 * 2. amdgpu_irq_disable_all() disables all interrupts 7459 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences 7460 * 4. set adev->no_hw_access to avoid potential crashes after setp 5 7461 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings 7462 * 6. 
pci_disable_device() and pci_wait_for_pending_transaction() 7463 * flush any in flight DMA operations 7464 */ 7465 void amdgpu_device_halt(struct amdgpu_device *adev) 7466 { 7467 struct pci_dev *pdev = adev->pdev; 7468 struct drm_device *ddev = adev_to_drm(adev); 7469 7470 amdgpu_xcp_dev_unplug(adev); 7471 drm_dev_unplug(ddev); 7472 7473 amdgpu_irq_disable_all(adev); 7474 7475 amdgpu_fence_driver_hw_fini(adev); 7476 7477 adev->no_hw_access = true; 7478 7479 amdgpu_device_unmap_mmio(adev); 7480 7481 pci_disable_device(pdev); 7482 pci_wait_for_pending_transaction(pdev); 7483 } 7484 7485 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev, 7486 u32 reg) 7487 { 7488 unsigned long flags, address, data; 7489 u32 r; 7490 7491 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 7492 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 7493 7494 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 7495 WREG32(address, reg * 4); 7496 (void)RREG32(address); 7497 r = RREG32(data); 7498 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 7499 return r; 7500 } 7501 7502 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev, 7503 u32 reg, u32 v) 7504 { 7505 unsigned long flags, address, data; 7506 7507 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 7508 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 7509 7510 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 7511 WREG32(address, reg * 4); 7512 (void)RREG32(address); 7513 WREG32(data, v); 7514 (void)RREG32(data); 7515 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 7516 } 7517 7518 /** 7519 * amdgpu_device_get_gang - return a reference to the current gang 7520 * @adev: amdgpu_device pointer 7521 * 7522 * Returns: A new reference to the current gang leader. 7523 */ 7524 struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev) 7525 { 7526 struct dma_fence *fence; 7527 7528 rcu_read_lock(); 7529 fence = dma_fence_get_rcu_safe(&adev->gang_submit); 7530 rcu_read_unlock(); 7531 return fence; 7532 } 7533 7534 /** 7535 * amdgpu_device_switch_gang - switch to a new gang 7536 * @adev: amdgpu_device pointer 7537 * @gang: the gang to switch to 7538 * 7539 * Try to switch to a new gang. 7540 * Returns: NULL if we switched to the new gang or a reference to the current 7541 * gang leader. 7542 */ 7543 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev, 7544 struct dma_fence *gang) 7545 { 7546 struct dma_fence *old = NULL; 7547 7548 dma_fence_get(gang); 7549 do { 7550 dma_fence_put(old); 7551 old = amdgpu_device_get_gang(adev); 7552 if (old == gang) 7553 break; 7554 7555 if (!dma_fence_is_signaled(old)) { 7556 dma_fence_put(gang); 7557 return old; 7558 } 7559 7560 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit, 7561 old, gang) != old); 7562 7563 /* 7564 * Drop it once for the exchanged reference in adev and once for the 7565 * thread local reference acquired in amdgpu_device_get_gang(). 7566 */ 7567 dma_fence_put(old); 7568 dma_fence_put(old); 7569 return NULL; 7570 } 7571 7572 /** 7573 * amdgpu_device_enforce_isolation - enforce HW isolation 7574 * @adev: the amdgpu device pointer 7575 * @ring: the HW ring the job is supposed to run on 7576 * @job: the job which is about to be pushed to the HW ring 7577 * 7578 * Makes sure that only one client at a time can use the GFX block. 7579 * Returns: The dependency to wait on before the job can be pushed to the HW. 7580 * The function is called multiple times until NULL is returned. 
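 *
 * Illustrative caller pattern (a sketch only; the real caller is the job
 * preparation path in amdgpu_job.c):
 *
 *	dep = amdgpu_device_enforce_isolation(adev, ring, job);
 *	if (dep)
 *		return dep;
 *
 * The scheduler waits on each returned dependency and calls the hook again;
 * only once NULL is returned is the job pushed to the HW ring.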

/**
 * amdgpu_device_enforce_isolation - enforce HW isolation
 * @adev: the amdgpu device pointer
 * @ring: the HW ring the job is supposed to run on
 * @job: the job which is about to be pushed to the HW ring
 *
 * Makes sure that only one client at a time can use the GFX block.
 * Returns: The dependency to wait on before the job can be pushed to the HW.
 * The function is called multiple times until NULL is returned.
 */
struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev,
						  struct amdgpu_ring *ring,
						  struct amdgpu_job *job)
{
	struct amdgpu_isolation *isolation = &adev->isolation[ring->xcp_id];
	struct drm_sched_fence *f = job->base.s_fence;
	struct dma_fence *dep;
	void *owner;
	int r;

	/*
	 * For now enforce isolation only for the GFX block since we only need
	 * the cleaner shader on those rings.
	 */
	if (ring->funcs->type != AMDGPU_RING_TYPE_GFX &&
	    ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE)
		return NULL;

	/*
	 * All submissions where enforce isolation is false are handled as if
	 * they come from a single client. Use ~0l as the owner to distinguish
	 * it from kernel submissions where the owner is NULL.
	 */
	owner = job->enforce_isolation ? f->owner : (void *)~0l;

	mutex_lock(&adev->enforce_isolation_mutex);

	/*
	 * The "spearhead" submission is the first one which changes the
	 * ownership to its client. We always need to wait for it to be
	 * pushed to the HW before proceeding with anything.
	 */
	if (&f->scheduled != isolation->spearhead &&
	    !dma_fence_is_signaled(isolation->spearhead)) {
		dep = isolation->spearhead;
		goto out_grab_ref;
	}

	if (isolation->owner != owner) {

		/*
		 * Wait for any gang to be assembled before switching to a
		 * different owner or otherwise we could deadlock the
		 * submissions.
		 */
		if (!job->gang_submit) {
			dep = amdgpu_device_get_gang(adev);
			if (!dma_fence_is_signaled(dep))
				goto out_return_dep;
			dma_fence_put(dep);
		}

		dma_fence_put(isolation->spearhead);
		isolation->spearhead = dma_fence_get(&f->scheduled);
		amdgpu_sync_move(&isolation->active, &isolation->prev);
		trace_amdgpu_isolation(isolation->owner, owner);
		isolation->owner = owner;
	}

	/*
	 * Specifying the ring here helps to pipeline submissions even when
	 * isolation is enabled. If that is not desired for testing, NULL can be
	 * used instead of the ring to enforce a CPU round trip while switching
	 * between clients.
7646 */ 7647 dep = amdgpu_sync_peek_fence(&isolation->prev, ring); 7648 r = amdgpu_sync_fence(&isolation->active, &f->finished, GFP_NOWAIT); 7649 if (r) 7650 dev_warn(adev->dev, "OOM tracking isolation\n"); 7651 7652 out_grab_ref: 7653 dma_fence_get(dep); 7654 out_return_dep: 7655 mutex_unlock(&adev->enforce_isolation_mutex); 7656 return dep; 7657 } 7658 7659 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev) 7660 { 7661 switch (adev->asic_type) { 7662 #ifdef CONFIG_DRM_AMDGPU_SI 7663 case CHIP_HAINAN: 7664 #endif 7665 case CHIP_TOPAZ: 7666 /* chips with no display hardware */ 7667 return false; 7668 #ifdef CONFIG_DRM_AMDGPU_SI 7669 case CHIP_TAHITI: 7670 case CHIP_PITCAIRN: 7671 case CHIP_VERDE: 7672 case CHIP_OLAND: 7673 #endif 7674 #ifdef CONFIG_DRM_AMDGPU_CIK 7675 case CHIP_BONAIRE: 7676 case CHIP_HAWAII: 7677 case CHIP_KAVERI: 7678 case CHIP_KABINI: 7679 case CHIP_MULLINS: 7680 #endif 7681 case CHIP_TONGA: 7682 case CHIP_FIJI: 7683 case CHIP_POLARIS10: 7684 case CHIP_POLARIS11: 7685 case CHIP_POLARIS12: 7686 case CHIP_VEGAM: 7687 case CHIP_CARRIZO: 7688 case CHIP_STONEY: 7689 /* chips with display hardware */ 7690 return true; 7691 default: 7692 /* IP discovery */ 7693 if (!amdgpu_ip_version(adev, DCE_HWIP, 0) || 7694 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 7695 return false; 7696 return true; 7697 } 7698 } 7699 7700 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev, 7701 uint32_t inst, uint32_t reg_addr, char reg_name[], 7702 uint32_t expected_value, uint32_t mask) 7703 { 7704 uint32_t ret = 0; 7705 uint32_t old_ = 0; 7706 uint32_t tmp_ = RREG32(reg_addr); 7707 uint32_t loop = adev->usec_timeout; 7708 7709 while ((tmp_ & (mask)) != (expected_value)) { 7710 if (old_ != tmp_) { 7711 loop = adev->usec_timeout; 7712 old_ = tmp_; 7713 } else 7714 udelay(1); 7715 tmp_ = RREG32(reg_addr); 7716 loop--; 7717 if (!loop) { 7718 dev_warn( 7719 adev->dev, 7720 "Register(%d) [%s] failed to reach value 0x%08x != 0x%08xn", 7721 inst, reg_name, (uint32_t)expected_value, 7722 (uint32_t)(tmp_ & (mask))); 7723 ret = -ETIMEDOUT; 7724 break; 7725 } 7726 } 7727 return ret; 7728 } 7729 7730 ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring) 7731 { 7732 ssize_t size = 0; 7733 7734 if (!ring || !ring->adev) 7735 return size; 7736 7737 if (amdgpu_device_should_recover_gpu(ring->adev)) 7738 size |= AMDGPU_RESET_TYPE_FULL; 7739 7740 if (unlikely(!ring->adev->debug_disable_soft_recovery) && 7741 !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery) 7742 size |= AMDGPU_RESET_TYPE_SOFT_RESET; 7743 7744 return size; 7745 } 7746 7747 ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset) 7748 { 7749 ssize_t size = 0; 7750 7751 if (supported_reset == 0) { 7752 size += sysfs_emit_at(buf, size, "unsupported"); 7753 size += sysfs_emit_at(buf, size, "\n"); 7754 return size; 7755 7756 } 7757 7758 if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET) 7759 size += sysfs_emit_at(buf, size, "soft "); 7760 7761 if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE) 7762 size += sysfs_emit_at(buf, size, "queue "); 7763 7764 if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE) 7765 size += sysfs_emit_at(buf, size, "pipe "); 7766 7767 if (supported_reset & AMDGPU_RESET_TYPE_FULL) 7768 size += sysfs_emit_at(buf, size, "full "); 7769 7770 size += sysfs_emit_at(buf, size, "\n"); 7771 return size; 7772 } 7773 7774 void amdgpu_device_set_uid(struct amdgpu_uid *uid_info, 7775 enum amdgpu_uid_type type, uint8_t inst, 7776 uint64_t uid) 7777 { 7778 if (!uid_info) 7779 
		return;

	if (type >= AMDGPU_UID_TYPE_MAX) {
		dev_err_once(uid_info->adev->dev, "Invalid UID type %d\n",
			     type);
		return;
	}

	if (inst >= AMDGPU_UID_INST_MAX) {
		dev_err_once(uid_info->adev->dev, "Invalid UID instance %d\n",
			     inst);
		return;
	}

	if (uid_info->uid[type][inst] != 0) {
		dev_warn_once(
			uid_info->adev->dev,
			"Overwriting existing UID %llu for type %d instance %d\n",
			uid_info->uid[type][inst], type, inst);
	}

	uid_info->uid[type][inst] = uid;
}

u64 amdgpu_device_get_uid(struct amdgpu_uid *uid_info,
			  enum amdgpu_uid_type type, uint8_t inst)
{
	if (!uid_info)
		return 0;

	if (type >= AMDGPU_UID_TYPE_MAX) {
		dev_err_once(uid_info->adev->dev, "Invalid UID type %d\n",
			     type);
		return 0;
	}

	if (inst >= AMDGPU_UID_INST_MAX) {
		dev_err_once(uid_info->adev->dev, "Invalid UID instance %d\n",
			     inst);
		return 0;
	}

	return uid_info->uid[type][inst];
}
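
/*
 * The two UID helpers above form a small keyed store: amdgpu_device_set_uid()
 * records a 64-bit unique ID per (type, instance) pair and
 * amdgpu_device_get_uid() reads it back. A return value of 0 doubles as
 * "no UID recorded", since unset entries and out-of-range arguments both
 * yield 0.
 */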