1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 29 #include <linux/aperture.h> 30 #include <linux/power_supply.h> 31 #include <linux/kthread.h> 32 #include <linux/module.h> 33 #include <linux/console.h> 34 #include <linux/slab.h> 35 #include <linux/iommu.h> 36 #include <linux/pci.h> 37 #include <linux/pci-p2pdma.h> 38 #include <linux/apple-gmux.h> 39 40 #include <drm/drm_atomic_helper.h> 41 #include <drm/drm_client_event.h> 42 #include <drm/drm_crtc_helper.h> 43 #include <drm/drm_probe_helper.h> 44 #include <drm/amdgpu_drm.h> 45 #include <linux/device.h> 46 #include <linux/vgaarb.h> 47 #include <linux/vga_switcheroo.h> 48 #include <linux/efi.h> 49 #include "amdgpu.h" 50 #include "amdgpu_trace.h" 51 #include "amdgpu_i2c.h" 52 #include "atom.h" 53 #include "amdgpu_atombios.h" 54 #include "amdgpu_atomfirmware.h" 55 #include "amd_pcie.h" 56 #ifdef CONFIG_DRM_AMDGPU_SI 57 #include "si.h" 58 #endif 59 #ifdef CONFIG_DRM_AMDGPU_CIK 60 #include "cik.h" 61 #endif 62 #include "vi.h" 63 #include "soc15.h" 64 #include "nv.h" 65 #include "bif/bif_4_1_d.h" 66 #include <linux/firmware.h> 67 #include "amdgpu_vf_error.h" 68 69 #include "amdgpu_amdkfd.h" 70 #include "amdgpu_pm.h" 71 72 #include "amdgpu_xgmi.h" 73 #include "amdgpu_ras.h" 74 #include "amdgpu_pmu.h" 75 #include "amdgpu_fru_eeprom.h" 76 #include "amdgpu_reset.h" 77 #include "amdgpu_virt.h" 78 #include "amdgpu_dev_coredump.h" 79 80 #include <linux/suspend.h> 81 #include <drm/task_barrier.h> 82 #include <linux/pm_runtime.h> 83 84 #include <drm/drm_drv.h> 85 86 #if IS_ENABLED(CONFIG_X86) 87 #include <asm/intel-family.h> 88 #include <asm/cpu_device_id.h> 89 #endif 90 91 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 92 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 93 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 94 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 95 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 96 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 97 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 98 99 #define AMDGPU_RESUME_MS 2000 100 #define AMDGPU_MAX_RETRY_LIMIT 2 101 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 102 #define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2) 103 #define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2) 104 #define 
AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2) 105 106 #define AMDGPU_VBIOS_SKIP (1U << 0) 107 #define AMDGPU_VBIOS_OPTIONAL (1U << 1) 108 109 static const struct drm_driver amdgpu_kms_driver; 110 111 const char *amdgpu_asic_name[] = { 112 "TAHITI", 113 "PITCAIRN", 114 "VERDE", 115 "OLAND", 116 "HAINAN", 117 "BONAIRE", 118 "KAVERI", 119 "KABINI", 120 "HAWAII", 121 "MULLINS", 122 "TOPAZ", 123 "TONGA", 124 "FIJI", 125 "CARRIZO", 126 "STONEY", 127 "POLARIS10", 128 "POLARIS11", 129 "POLARIS12", 130 "VEGAM", 131 "VEGA10", 132 "VEGA12", 133 "VEGA20", 134 "RAVEN", 135 "ARCTURUS", 136 "RENOIR", 137 "ALDEBARAN", 138 "NAVI10", 139 "CYAN_SKILLFISH", 140 "NAVI14", 141 "NAVI12", 142 "SIENNA_CICHLID", 143 "NAVY_FLOUNDER", 144 "VANGOGH", 145 "DIMGREY_CAVEFISH", 146 "BEIGE_GOBY", 147 "YELLOW_CARP", 148 "IP DISCOVERY", 149 "LAST", 150 }; 151 152 #define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0) 153 /* 154 * Default init level where all blocks are expected to be initialized. This is 155 * the level of initialization expected by default and also after a full reset 156 * of the device. 157 */ 158 struct amdgpu_init_level amdgpu_init_default = { 159 .level = AMDGPU_INIT_LEVEL_DEFAULT, 160 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL, 161 }; 162 163 struct amdgpu_init_level amdgpu_init_recovery = { 164 .level = AMDGPU_INIT_LEVEL_RESET_RECOVERY, 165 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL, 166 }; 167 168 /* 169 * Minimal blocks needed to be initialized before a XGMI hive can be reset. This 170 * is used for cases like reset on initialization where the entire hive needs to 171 * be reset before first use. 172 */ 173 struct amdgpu_init_level amdgpu_init_minimal_xgmi = { 174 .level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI, 175 .hwini_ip_block_mask = 176 BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) | 177 BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) | 178 BIT(AMD_IP_BLOCK_TYPE_PSP) 179 }; 180 181 static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev, 182 enum amd_ip_block_type block) 183 { 184 return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0; 185 } 186 187 void amdgpu_set_init_level(struct amdgpu_device *adev, 188 enum amdgpu_init_lvl_id lvl) 189 { 190 switch (lvl) { 191 case AMDGPU_INIT_LEVEL_MINIMAL_XGMI: 192 adev->init_lvl = &amdgpu_init_minimal_xgmi; 193 break; 194 case AMDGPU_INIT_LEVEL_RESET_RECOVERY: 195 adev->init_lvl = &amdgpu_init_recovery; 196 break; 197 case AMDGPU_INIT_LEVEL_DEFAULT: 198 fallthrough; 199 default: 200 adev->init_lvl = &amdgpu_init_default; 201 break; 202 } 203 } 204 205 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev); 206 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 207 void *data); 208 209 /** 210 * DOC: pcie_replay_count 211 * 212 * The amdgpu driver provides a sysfs API for reporting the total number 213 * of PCIe replays (NAKs). 214 * The file pcie_replay_count is used for this and returns the total 215 * number of replays as a sum of the NAKs generated and NAKs received. 
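 *
 * For example, the count can be read from user space through the PCI
 * device's sysfs directory (illustrative path, the bus address varies
 * per system):
 *
 *   cat /sys/bus/pci/devices/0000:03:00.0/pcie_replay_count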
216 */ 217 218 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 219 struct device_attribute *attr, char *buf) 220 { 221 struct drm_device *ddev = dev_get_drvdata(dev); 222 struct amdgpu_device *adev = drm_to_adev(ddev); 223 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 224 225 return sysfs_emit(buf, "%llu\n", cnt); 226 } 227 228 static DEVICE_ATTR(pcie_replay_count, 0444, 229 amdgpu_device_get_pcie_replay_count, NULL); 230 231 static int amdgpu_device_attr_sysfs_init(struct amdgpu_device *adev) 232 { 233 int ret = 0; 234 235 if (amdgpu_nbio_is_replay_cnt_supported(adev)) 236 ret = sysfs_create_file(&adev->dev->kobj, 237 &dev_attr_pcie_replay_count.attr); 238 239 return ret; 240 } 241 242 static void amdgpu_device_attr_sysfs_fini(struct amdgpu_device *adev) 243 { 244 if (amdgpu_nbio_is_replay_cnt_supported(adev)) 245 sysfs_remove_file(&adev->dev->kobj, 246 &dev_attr_pcie_replay_count.attr); 247 } 248 249 static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj, 250 const struct bin_attribute *attr, char *buf, 251 loff_t ppos, size_t count) 252 { 253 struct device *dev = kobj_to_dev(kobj); 254 struct drm_device *ddev = dev_get_drvdata(dev); 255 struct amdgpu_device *adev = drm_to_adev(ddev); 256 ssize_t bytes_read; 257 258 switch (ppos) { 259 case AMDGPU_SYS_REG_STATE_XGMI: 260 bytes_read = amdgpu_asic_get_reg_state( 261 adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count); 262 break; 263 case AMDGPU_SYS_REG_STATE_WAFL: 264 bytes_read = amdgpu_asic_get_reg_state( 265 adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count); 266 break; 267 case AMDGPU_SYS_REG_STATE_PCIE: 268 bytes_read = amdgpu_asic_get_reg_state( 269 adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count); 270 break; 271 case AMDGPU_SYS_REG_STATE_USR: 272 bytes_read = amdgpu_asic_get_reg_state( 273 adev, AMDGPU_REG_STATE_TYPE_USR, buf, count); 274 break; 275 case AMDGPU_SYS_REG_STATE_USR_1: 276 bytes_read = amdgpu_asic_get_reg_state( 277 adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count); 278 break; 279 default: 280 return -EINVAL; 281 } 282 283 return bytes_read; 284 } 285 286 static const BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL, 287 AMDGPU_SYS_REG_STATE_END); 288 289 int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev) 290 { 291 int ret; 292 293 if (!amdgpu_asic_get_reg_state_supported(adev)) 294 return 0; 295 296 ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 297 298 return ret; 299 } 300 301 void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev) 302 { 303 if (!amdgpu_asic_get_reg_state_supported(adev)) 304 return; 305 sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 306 } 307 308 int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block) 309 { 310 int r; 311 312 if (ip_block->version->funcs->suspend) { 313 r = ip_block->version->funcs->suspend(ip_block); 314 if (r) { 315 dev_err(ip_block->adev->dev, 316 "suspend of IP block <%s> failed %d\n", 317 ip_block->version->funcs->name, r); 318 return r; 319 } 320 } 321 322 ip_block->status.hw = false; 323 return 0; 324 } 325 326 int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block) 327 { 328 int r; 329 330 if (ip_block->version->funcs->resume) { 331 r = ip_block->version->funcs->resume(ip_block); 332 if (r) { 333 dev_err(ip_block->adev->dev, 334 "resume of IP block <%s> failed %d\n", 335 ip_block->version->funcs->name, r); 336 return r; 337 } 338 } 339 340 ip_block->status.hw = true; 341 return 0; 342 } 343 344 /** 345 * DOC: board_info 346 * 347 * The amdgpu driver provides a sysfs 
API for giving board related information. 348 * It provides the form factor information in the format 349 * 350 * type : form factor 351 * 352 * Possible form factor values 353 * 354 * - "cem" - PCIE CEM card 355 * - "oam" - Open Compute Accelerator Module 356 * - "unknown" - Not known 357 * 358 */ 359 360 static ssize_t amdgpu_device_get_board_info(struct device *dev, 361 struct device_attribute *attr, 362 char *buf) 363 { 364 struct drm_device *ddev = dev_get_drvdata(dev); 365 struct amdgpu_device *adev = drm_to_adev(ddev); 366 enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM; 367 const char *pkg; 368 369 if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type) 370 pkg_type = adev->smuio.funcs->get_pkg_type(adev); 371 372 switch (pkg_type) { 373 case AMDGPU_PKG_TYPE_CEM: 374 pkg = "cem"; 375 break; 376 case AMDGPU_PKG_TYPE_OAM: 377 pkg = "oam"; 378 break; 379 default: 380 pkg = "unknown"; 381 break; 382 } 383 384 return sysfs_emit(buf, "%s : %s\n", "type", pkg); 385 } 386 387 static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL); 388 389 static struct attribute *amdgpu_board_attrs[] = { 390 &dev_attr_board_info.attr, 391 NULL, 392 }; 393 394 static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj, 395 struct attribute *attr, int n) 396 { 397 struct device *dev = kobj_to_dev(kobj); 398 struct drm_device *ddev = dev_get_drvdata(dev); 399 struct amdgpu_device *adev = drm_to_adev(ddev); 400 401 if (adev->flags & AMD_IS_APU) 402 return 0; 403 404 return attr->mode; 405 } 406 407 static const struct attribute_group amdgpu_board_attrs_group = { 408 .attrs = amdgpu_board_attrs, 409 .is_visible = amdgpu_board_attrs_is_visible 410 }; 411 412 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 413 414 /** 415 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 416 * 417 * @adev: amdgpu device pointer 418 * 419 * Returns true if the device is a dGPU with ATPX power control, 420 * otherwise return false. 421 */ 422 bool amdgpu_device_supports_px(struct amdgpu_device *adev) 423 { 424 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 425 return true; 426 return false; 427 } 428 429 /** 430 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 431 * 432 * @adev: amdgpu device pointer 433 * 434 * Returns true if the device is a dGPU with ACPI power control, 435 * otherwise return false. 436 */ 437 bool amdgpu_device_supports_boco(struct amdgpu_device *adev) 438 { 439 if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE)) 440 return false; 441 442 if (adev->has_pr3 || 443 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 444 return true; 445 return false; 446 } 447 448 /** 449 * amdgpu_device_supports_baco - Does the device support BACO 450 * 451 * @adev: amdgpu device pointer 452 * 453 * Return: 454 * 1 if the device supports BACO; 455 * 3 if the device supports MACO (only works if BACO is supported) 456 * otherwise return 0. 
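 *
 * Callers typically treat the return value as a bit mask, as in
 * amdgpu_device_detect_runtime_pm_mode() below (illustrative sketch):
 *
 *   bamaco_support = amdgpu_device_supports_baco(adev);
 *   if (bamaco_support & MACO_SUPPORT)
 *           adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
 *   else if (bamaco_support == BACO_SUPPORT)
 *           adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;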
 */
int amdgpu_device_supports_baco(struct amdgpu_device *adev)
{
	return amdgpu_asic_supports_baco(adev);
}

void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
{
	int bamaco_support;

	adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
	bamaco_support = amdgpu_device_supports_baco(adev);

	switch (amdgpu_runtime_pm) {
	case 2:
		if (bamaco_support & MACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
			dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
		} else if (bamaco_support == BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Requested mode BAMACO not available, falling back to BACO\n");
		}
		break;
	case 1:
		if (bamaco_support & BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Forcing BACO for runtime pm\n");
		}
		break;
	case -1:
	case -2:
		if (amdgpu_device_supports_px(adev)) {
			/* enable PX as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
			dev_info(adev->dev, "Using ATPX for runtime pm\n");
		} else if (amdgpu_device_supports_boco(adev)) {
			/* enable boco as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
			dev_info(adev->dev, "Using BOCO for runtime pm\n");
		} else {
			if (!bamaco_support)
				goto no_runtime_pm;

			switch (adev->asic_type) {
			case CHIP_VEGA20:
			case CHIP_ARCTURUS:
				/* BACO is not supported on vega20 and arcturus */
				break;
			case CHIP_VEGA10:
				/* enable BACO as runpm mode if noretry=0 */
				if (!adev->gmc.noretry && !amdgpu_passthrough(adev))
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			default:
				/* enable BACO as runpm mode on CI+ */
				if (!amdgpu_passthrough(adev))
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			}

			if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
				if (bamaco_support & MACO_SUPPORT) {
					adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
					dev_info(adev->dev, "Using BAMACO for runtime pm\n");
				} else {
					dev_info(adev->dev, "Using BACO for runtime pm\n");
				}
			}
		}
		break;
	case 0:
		dev_info(adev->dev, "runtime pm is manually disabled\n");
		break;
	default:
		break;
	}

no_runtime_pm:
	if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
		dev_info(adev->dev, "Runtime PM not available\n");
}

/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * Smart Shift support
 *
 * @adev: amdgpu device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct amdgpu_device *adev)
{
	return (amdgpu_device_supports_boco(adev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value is the number of bytes actually transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try to use the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
				uint32_t reg, uint32_t acc_flags,
				uint32_t xcc_id)
{
	uint32_t ret, rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	return ret;
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write to a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
822 */ 823 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) 824 { 825 if (amdgpu_device_skip_hw_access(adev)) 826 return; 827 828 if (offset < adev->rmmio_size) 829 writeb(value, adev->rmmio + offset); 830 else 831 BUG(); 832 } 833 834 /** 835 * amdgpu_device_wreg - write to a memory mapped IO or indirect register 836 * 837 * @adev: amdgpu_device pointer 838 * @reg: dword aligned register offset 839 * @v: 32 bit value to write to the register 840 * @acc_flags: access flags which require special behavior 841 * 842 * Writes the value specified to the offset specified. 843 */ 844 void amdgpu_device_wreg(struct amdgpu_device *adev, 845 uint32_t reg, uint32_t v, 846 uint32_t acc_flags) 847 { 848 if (amdgpu_device_skip_hw_access(adev)) 849 return; 850 851 if ((reg * 4) < adev->rmmio_size) { 852 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 853 amdgpu_sriov_runtime(adev) && 854 down_read_trylock(&adev->reset_domain->sem)) { 855 amdgpu_kiq_wreg(adev, reg, v, 0); 856 up_read(&adev->reset_domain->sem); 857 } else { 858 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 859 } 860 } else { 861 adev->pcie_wreg(adev, reg * 4, v); 862 } 863 864 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 865 } 866 867 /** 868 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 869 * 870 * @adev: amdgpu_device pointer 871 * @reg: mmio/rlc register 872 * @v: value to write 873 * @xcc_id: xcc accelerated compute core id 874 * 875 * this function is invoked only for the debugfs register access 876 */ 877 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 878 uint32_t reg, uint32_t v, 879 uint32_t xcc_id) 880 { 881 if (amdgpu_device_skip_hw_access(adev)) 882 return; 883 884 if (amdgpu_sriov_fullaccess(adev) && 885 adev->gfx.rlc.funcs && 886 adev->gfx.rlc.funcs->is_rlcg_access_range) { 887 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 888 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id); 889 } else if ((reg * 4) >= adev->rmmio_size) { 890 adev->pcie_wreg(adev, reg * 4, v); 891 } else { 892 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 893 } 894 } 895 896 /** 897 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC 898 * 899 * @adev: amdgpu_device pointer 900 * @reg: dword aligned register offset 901 * @v: 32 bit value to write to the register 902 * @acc_flags: access flags which require special behavior 903 * @xcc_id: xcc accelerated compute core id 904 * 905 * Writes the value specified to the offset specified. 
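 *
 * Like amdgpu_device_xcc_rreg(), the write is routed through RLCG, KIQ or
 * direct MMIO depending on the SR-IOV state, and through the PCIe indirect
 * interface for register offsets beyond the MMIO BAR.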
906 */ 907 void amdgpu_device_xcc_wreg(struct amdgpu_device *adev, 908 uint32_t reg, uint32_t v, 909 uint32_t acc_flags, uint32_t xcc_id) 910 { 911 uint32_t rlcg_flag; 912 913 if (amdgpu_device_skip_hw_access(adev)) 914 return; 915 916 if ((reg * 4) < adev->rmmio_size) { 917 if (amdgpu_sriov_vf(adev) && 918 !amdgpu_sriov_runtime(adev) && 919 adev->gfx.rlc.rlcg_reg_access_supported && 920 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, 921 GC_HWIP, true, 922 &rlcg_flag)) { 923 amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id)); 924 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 925 amdgpu_sriov_runtime(adev) && 926 down_read_trylock(&adev->reset_domain->sem)) { 927 amdgpu_kiq_wreg(adev, reg, v, xcc_id); 928 up_read(&adev->reset_domain->sem); 929 } else { 930 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 931 } 932 } else { 933 adev->pcie_wreg(adev, reg * 4, v); 934 } 935 } 936 937 /** 938 * amdgpu_device_indirect_rreg - read an indirect register 939 * 940 * @adev: amdgpu_device pointer 941 * @reg_addr: indirect register address to read from 942 * 943 * Returns the value of indirect register @reg_addr 944 */ 945 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 946 u32 reg_addr) 947 { 948 unsigned long flags, pcie_index, pcie_data; 949 void __iomem *pcie_index_offset; 950 void __iomem *pcie_data_offset; 951 u32 r; 952 953 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 954 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 955 956 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 957 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 958 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 959 960 writel(reg_addr, pcie_index_offset); 961 readl(pcie_index_offset); 962 r = readl(pcie_data_offset); 963 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 964 965 return r; 966 } 967 968 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev, 969 u64 reg_addr) 970 { 971 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 972 u32 r; 973 void __iomem *pcie_index_offset; 974 void __iomem *pcie_index_hi_offset; 975 void __iomem *pcie_data_offset; 976 977 if (unlikely(!adev->nbio.funcs)) { 978 pcie_index = AMDGPU_PCIE_INDEX_FALLBACK; 979 pcie_data = AMDGPU_PCIE_DATA_FALLBACK; 980 } else { 981 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 982 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 983 } 984 985 if (reg_addr >> 32) { 986 if (unlikely(!adev->nbio.funcs)) 987 pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK; 988 else 989 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 990 } else { 991 pcie_index_hi = 0; 992 } 993 994 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 995 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 996 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 997 if (pcie_index_hi != 0) 998 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 999 pcie_index_hi * 4; 1000 1001 writel(reg_addr, pcie_index_offset); 1002 readl(pcie_index_offset); 1003 if (pcie_index_hi != 0) { 1004 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1005 readl(pcie_index_hi_offset); 1006 } 1007 r = readl(pcie_data_offset); 1008 1009 /* clear the high bits */ 1010 if (pcie_index_hi != 0) { 1011 writel(0, pcie_index_hi_offset); 1012 readl(pcie_index_hi_offset); 1013 } 1014 1015 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1016 1017 return r; 1018 } 1019 1020 /** 1021 * amdgpu_device_indirect_rreg64 - read a 64bits indirect 
register 1022 * 1023 * @adev: amdgpu_device pointer 1024 * @reg_addr: indirect register address to read from 1025 * 1026 * Returns the value of indirect register @reg_addr 1027 */ 1028 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 1029 u32 reg_addr) 1030 { 1031 unsigned long flags, pcie_index, pcie_data; 1032 void __iomem *pcie_index_offset; 1033 void __iomem *pcie_data_offset; 1034 u64 r; 1035 1036 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1037 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1038 1039 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1040 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1041 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1042 1043 /* read low 32 bits */ 1044 writel(reg_addr, pcie_index_offset); 1045 readl(pcie_index_offset); 1046 r = readl(pcie_data_offset); 1047 /* read high 32 bits */ 1048 writel(reg_addr + 4, pcie_index_offset); 1049 readl(pcie_index_offset); 1050 r |= ((u64)readl(pcie_data_offset) << 32); 1051 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1052 1053 return r; 1054 } 1055 1056 u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev, 1057 u64 reg_addr) 1058 { 1059 unsigned long flags, pcie_index, pcie_data; 1060 unsigned long pcie_index_hi = 0; 1061 void __iomem *pcie_index_offset; 1062 void __iomem *pcie_index_hi_offset; 1063 void __iomem *pcie_data_offset; 1064 u64 r; 1065 1066 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1067 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1068 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1069 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1070 1071 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1072 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1073 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1074 if (pcie_index_hi != 0) 1075 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1076 pcie_index_hi * 4; 1077 1078 /* read low 32 bits */ 1079 writel(reg_addr, pcie_index_offset); 1080 readl(pcie_index_offset); 1081 if (pcie_index_hi != 0) { 1082 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1083 readl(pcie_index_hi_offset); 1084 } 1085 r = readl(pcie_data_offset); 1086 /* read high 32 bits */ 1087 writel(reg_addr + 4, pcie_index_offset); 1088 readl(pcie_index_offset); 1089 if (pcie_index_hi != 0) { 1090 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1091 readl(pcie_index_hi_offset); 1092 } 1093 r |= ((u64)readl(pcie_data_offset) << 32); 1094 1095 /* clear the high bits */ 1096 if (pcie_index_hi != 0) { 1097 writel(0, pcie_index_hi_offset); 1098 readl(pcie_index_hi_offset); 1099 } 1100 1101 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1102 1103 return r; 1104 } 1105 1106 /** 1107 * amdgpu_device_indirect_wreg - write an indirect register address 1108 * 1109 * @adev: amdgpu_device pointer 1110 * @reg_addr: indirect register offset 1111 * @reg_data: indirect register data 1112 * 1113 */ 1114 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 1115 u32 reg_addr, u32 reg_data) 1116 { 1117 unsigned long flags, pcie_index, pcie_data; 1118 void __iomem *pcie_index_offset; 1119 void __iomem *pcie_data_offset; 1120 1121 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1122 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1123 1124 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1125 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1126 
pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1127 1128 writel(reg_addr, pcie_index_offset); 1129 readl(pcie_index_offset); 1130 writel(reg_data, pcie_data_offset); 1131 readl(pcie_data_offset); 1132 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1133 } 1134 1135 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev, 1136 u64 reg_addr, u32 reg_data) 1137 { 1138 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 1139 void __iomem *pcie_index_offset; 1140 void __iomem *pcie_index_hi_offset; 1141 void __iomem *pcie_data_offset; 1142 1143 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1144 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1145 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1146 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1147 else 1148 pcie_index_hi = 0; 1149 1150 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1151 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1152 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1153 if (pcie_index_hi != 0) 1154 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1155 pcie_index_hi * 4; 1156 1157 writel(reg_addr, pcie_index_offset); 1158 readl(pcie_index_offset); 1159 if (pcie_index_hi != 0) { 1160 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1161 readl(pcie_index_hi_offset); 1162 } 1163 writel(reg_data, pcie_data_offset); 1164 readl(pcie_data_offset); 1165 1166 /* clear the high bits */ 1167 if (pcie_index_hi != 0) { 1168 writel(0, pcie_index_hi_offset); 1169 readl(pcie_index_hi_offset); 1170 } 1171 1172 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1173 } 1174 1175 /** 1176 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 1177 * 1178 * @adev: amdgpu_device pointer 1179 * @reg_addr: indirect register offset 1180 * @reg_data: indirect register data 1181 * 1182 */ 1183 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 1184 u32 reg_addr, u64 reg_data) 1185 { 1186 unsigned long flags, pcie_index, pcie_data; 1187 void __iomem *pcie_index_offset; 1188 void __iomem *pcie_data_offset; 1189 1190 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1191 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1192 1193 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1194 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1195 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1196 1197 /* write low 32 bits */ 1198 writel(reg_addr, pcie_index_offset); 1199 readl(pcie_index_offset); 1200 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1201 readl(pcie_data_offset); 1202 /* write high 32 bits */ 1203 writel(reg_addr + 4, pcie_index_offset); 1204 readl(pcie_index_offset); 1205 writel((u32)(reg_data >> 32), pcie_data_offset); 1206 readl(pcie_data_offset); 1207 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1208 } 1209 1210 void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev, 1211 u64 reg_addr, u64 reg_data) 1212 { 1213 unsigned long flags, pcie_index, pcie_data; 1214 unsigned long pcie_index_hi = 0; 1215 void __iomem *pcie_index_offset; 1216 void __iomem *pcie_index_hi_offset; 1217 void __iomem *pcie_data_offset; 1218 1219 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1220 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1221 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1222 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1223 
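	/* For register addresses above 32 bits the upper byte is programmed
	 * through the separate high index register under the same lock and
	 * cleared again once the access is complete.
	 */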
1224 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1225 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1226 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1227 if (pcie_index_hi != 0) 1228 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1229 pcie_index_hi * 4; 1230 1231 /* write low 32 bits */ 1232 writel(reg_addr, pcie_index_offset); 1233 readl(pcie_index_offset); 1234 if (pcie_index_hi != 0) { 1235 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1236 readl(pcie_index_hi_offset); 1237 } 1238 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1239 readl(pcie_data_offset); 1240 /* write high 32 bits */ 1241 writel(reg_addr + 4, pcie_index_offset); 1242 readl(pcie_index_offset); 1243 if (pcie_index_hi != 0) { 1244 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1245 readl(pcie_index_hi_offset); 1246 } 1247 writel((u32)(reg_data >> 32), pcie_data_offset); 1248 readl(pcie_data_offset); 1249 1250 /* clear the high bits */ 1251 if (pcie_index_hi != 0) { 1252 writel(0, pcie_index_hi_offset); 1253 readl(pcie_index_hi_offset); 1254 } 1255 1256 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1257 } 1258 1259 /** 1260 * amdgpu_device_get_rev_id - query device rev_id 1261 * 1262 * @adev: amdgpu_device pointer 1263 * 1264 * Return device rev_id 1265 */ 1266 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev) 1267 { 1268 return adev->nbio.funcs->get_rev_id(adev); 1269 } 1270 1271 /** 1272 * amdgpu_invalid_rreg - dummy reg read function 1273 * 1274 * @adev: amdgpu_device pointer 1275 * @reg: offset of register 1276 * 1277 * Dummy register read function. Used for register blocks 1278 * that certain asics don't have (all asics). 1279 * Returns the value in the register. 1280 */ 1281 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 1282 { 1283 dev_err(adev->dev, "Invalid callback to read register 0x%04X\n", reg); 1284 BUG(); 1285 return 0; 1286 } 1287 1288 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg) 1289 { 1290 dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg); 1291 BUG(); 1292 return 0; 1293 } 1294 1295 /** 1296 * amdgpu_invalid_wreg - dummy reg write function 1297 * 1298 * @adev: amdgpu_device pointer 1299 * @reg: offset of register 1300 * @v: value to write to the register 1301 * 1302 * Dummy register read function. Used for register blocks 1303 * that certain asics don't have (all asics). 1304 */ 1305 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 1306 { 1307 dev_err(adev->dev, 1308 "Invalid callback to write register 0x%04X with 0x%08X\n", reg, 1309 v); 1310 BUG(); 1311 } 1312 1313 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v) 1314 { 1315 dev_err(adev->dev, 1316 "Invalid callback to write register 0x%llX with 0x%08X\n", reg, 1317 v); 1318 BUG(); 1319 } 1320 1321 /** 1322 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 1323 * 1324 * @adev: amdgpu_device pointer 1325 * @reg: offset of register 1326 * 1327 * Dummy register read function. Used for register blocks 1328 * that certain asics don't have (all asics). 1329 * Returns the value in the register. 
1330 */ 1331 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 1332 { 1333 dev_err(adev->dev, "Invalid callback to read 64 bit register 0x%04X\n", 1334 reg); 1335 BUG(); 1336 return 0; 1337 } 1338 1339 static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg) 1340 { 1341 dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg); 1342 BUG(); 1343 return 0; 1344 } 1345 1346 /** 1347 * amdgpu_invalid_wreg64 - dummy reg write function 1348 * 1349 * @adev: amdgpu_device pointer 1350 * @reg: offset of register 1351 * @v: value to write to the register 1352 * 1353 * Dummy register read function. Used for register blocks 1354 * that certain asics don't have (all asics). 1355 */ 1356 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 1357 { 1358 dev_err(adev->dev, 1359 "Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 1360 reg, v); 1361 BUG(); 1362 } 1363 1364 static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v) 1365 { 1366 dev_err(adev->dev, 1367 "Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n", 1368 reg, v); 1369 BUG(); 1370 } 1371 1372 /** 1373 * amdgpu_block_invalid_rreg - dummy reg read function 1374 * 1375 * @adev: amdgpu_device pointer 1376 * @block: offset of instance 1377 * @reg: offset of register 1378 * 1379 * Dummy register read function. Used for register blocks 1380 * that certain asics don't have (all asics). 1381 * Returns the value in the register. 1382 */ 1383 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 1384 uint32_t block, uint32_t reg) 1385 { 1386 dev_err(adev->dev, 1387 "Invalid callback to read register 0x%04X in block 0x%04X\n", 1388 reg, block); 1389 BUG(); 1390 return 0; 1391 } 1392 1393 /** 1394 * amdgpu_block_invalid_wreg - dummy reg write function 1395 * 1396 * @adev: amdgpu_device pointer 1397 * @block: offset of instance 1398 * @reg: offset of register 1399 * @v: value to write to the register 1400 * 1401 * Dummy register read function. Used for register blocks 1402 * that certain asics don't have (all asics). 1403 */ 1404 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 1405 uint32_t block, 1406 uint32_t reg, uint32_t v) 1407 { 1408 dev_err(adev->dev, 1409 "Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 1410 reg, block, v); 1411 BUG(); 1412 } 1413 1414 static uint32_t amdgpu_device_get_vbios_flags(struct amdgpu_device *adev) 1415 { 1416 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU)) 1417 return AMDGPU_VBIOS_SKIP; 1418 1419 if (hweight32(adev->aid_mask) && amdgpu_passthrough(adev)) 1420 return AMDGPU_VBIOS_OPTIONAL; 1421 1422 return 0; 1423 } 1424 1425 /** 1426 * amdgpu_device_asic_init - Wrapper for atom asic_init 1427 * 1428 * @adev: amdgpu_device pointer 1429 * 1430 * Does any asic specific work and then calls atom asic init. 
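 *
 * On newer SOCs (GC 9.4.3/9.4.4/9.5.0 and GC 11+) this waits for the PSP
 * bootloader and initializes through the atomfirmware interface; older
 * parts use the legacy atom interpreter. If the VBIOS is optional and no
 * image was found, the init is skipped.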
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	uint32_t flags;
	bool optional;
	int ret;

	amdgpu_asic_pre_asic_init(adev);
	flags = amdgpu_device_get_vbios_flags(adev);
	optional = !!(flags & (AMDGPU_VBIOS_OPTIONAL | AMDGPU_VBIOS_SKIP));

	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
		amdgpu_psp_wait_for_bootloader(adev);
		if (optional && !adev->bios)
			return 0;

		ret = amdgpu_atomfirmware_asic_init(adev, true);
		return ret;
	} else {
		if (optional && !adev->bios)
			return 0;

		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
	}

	return 0;
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND/OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
1550 */ 1551 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 1552 { 1553 return pci_reset_function(adev->pdev); 1554 } 1555 1556 /* 1557 * amdgpu_device_wb_*() 1558 * Writeback is the method by which the GPU updates special pages in memory 1559 * with the status of certain GPU events (fences, ring pointers,etc.). 1560 */ 1561 1562 /** 1563 * amdgpu_device_wb_fini - Disable Writeback and free memory 1564 * 1565 * @adev: amdgpu_device pointer 1566 * 1567 * Disables Writeback and frees the Writeback memory (all asics). 1568 * Used at driver shutdown. 1569 */ 1570 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1571 { 1572 if (adev->wb.wb_obj) { 1573 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1574 &adev->wb.gpu_addr, 1575 (void **)&adev->wb.wb); 1576 adev->wb.wb_obj = NULL; 1577 } 1578 } 1579 1580 /** 1581 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory 1582 * 1583 * @adev: amdgpu_device pointer 1584 * 1585 * Initializes writeback and allocates writeback memory (all asics). 1586 * Used at driver startup. 1587 * Returns 0 on success or an -error on failure. 1588 */ 1589 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1590 { 1591 int r; 1592 1593 if (adev->wb.wb_obj == NULL) { 1594 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1595 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1596 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1597 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1598 (void **)&adev->wb.wb); 1599 if (r) { 1600 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1601 return r; 1602 } 1603 1604 adev->wb.num_wb = AMDGPU_MAX_WB; 1605 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1606 1607 /* clear wb memory */ 1608 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1609 } 1610 1611 return 0; 1612 } 1613 1614 /** 1615 * amdgpu_device_wb_get - Allocate a wb entry 1616 * 1617 * @adev: amdgpu_device pointer 1618 * @wb: wb index 1619 * 1620 * Allocate a wb slot for use by the driver (all asics). 1621 * Returns 0 on success or -EINVAL on failure. 1622 */ 1623 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1624 { 1625 unsigned long flags, offset; 1626 1627 spin_lock_irqsave(&adev->wb.lock, flags); 1628 offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1629 if (offset < adev->wb.num_wb) { 1630 __set_bit(offset, adev->wb.used); 1631 spin_unlock_irqrestore(&adev->wb.lock, flags); 1632 *wb = offset << 3; /* convert to dw offset */ 1633 return 0; 1634 } else { 1635 spin_unlock_irqrestore(&adev->wb.lock, flags); 1636 return -EINVAL; 1637 } 1638 } 1639 1640 /** 1641 * amdgpu_device_wb_free - Free a wb entry 1642 * 1643 * @adev: amdgpu_device pointer 1644 * @wb: wb index 1645 * 1646 * Free a wb slot allocated for use by the driver (all asics) 1647 */ 1648 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1649 { 1650 unsigned long flags; 1651 1652 wb >>= 3; 1653 spin_lock_irqsave(&adev->wb.lock, flags); 1654 if (wb < adev->wb.num_wb) 1655 __clear_bit(wb, adev->wb.used); 1656 spin_unlock_irqrestore(&adev->wb.lock, flags); 1657 } 1658 1659 /** 1660 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1661 * 1662 * @adev: amdgpu_device pointer 1663 * 1664 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1665 * to fail, but if any of the BARs is not accessible after the size we abort 1666 * driver loading by returning -ENODEV. 
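 *
 * The resize follows the usual PCI flow: memory decoding is disabled, the
 * VRAM and doorbell BARs are released, the BAR is resized and the bus
 * resources reassigned, and the doorbell mapping is re-initialized before
 * decoding is enabled again.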
1667 */ 1668 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1669 { 1670 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1671 struct pci_bus *root; 1672 struct resource *res; 1673 unsigned int i; 1674 u16 cmd; 1675 int r; 1676 1677 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) 1678 return 0; 1679 1680 /* Bypass for VF */ 1681 if (amdgpu_sriov_vf(adev)) 1682 return 0; 1683 1684 if (!amdgpu_rebar) 1685 return 0; 1686 1687 /* resizing on Dell G5 SE platforms causes problems with runtime pm */ 1688 if ((amdgpu_runtime_pm != 0) && 1689 adev->pdev->vendor == PCI_VENDOR_ID_ATI && 1690 adev->pdev->device == 0x731f && 1691 adev->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL) 1692 return 0; 1693 1694 /* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */ 1695 if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR)) 1696 dev_warn( 1697 adev->dev, 1698 "System can't access extended configuration space, please check!!\n"); 1699 1700 /* skip if the bios has already enabled large BAR */ 1701 if (adev->gmc.real_vram_size && 1702 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1703 return 0; 1704 1705 /* Check if the root BUS has 64bit memory resources */ 1706 root = adev->pdev->bus; 1707 while (root->parent) 1708 root = root->parent; 1709 1710 pci_bus_for_each_resource(root, res, i) { 1711 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1712 res->start > 0x100000000ull) 1713 break; 1714 } 1715 1716 /* Trying to resize is pointless without a root hub window above 4GB */ 1717 if (!res) 1718 return 0; 1719 1720 /* Limit the BAR size to what is available */ 1721 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1722 rbar_size); 1723 1724 /* Disable memory decoding while we change the BAR addresses and size */ 1725 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1726 pci_write_config_word(adev->pdev, PCI_COMMAND, 1727 cmd & ~PCI_COMMAND_MEMORY); 1728 1729 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1730 amdgpu_doorbell_fini(adev); 1731 if (adev->asic_type >= CHIP_BONAIRE) 1732 pci_release_resource(adev->pdev, 2); 1733 1734 pci_release_resource(adev->pdev, 0); 1735 1736 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1737 if (r == -ENOSPC) 1738 dev_info(adev->dev, 1739 "Not enough PCI address space for a large BAR."); 1740 else if (r && r != -ENOTSUPP) 1741 dev_err(adev->dev, "Problem resizing BAR0 (%d).", r); 1742 1743 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1744 1745 /* When the doorbell or fb BAR isn't available we have no chance of 1746 * using the device. 1747 */ 1748 r = amdgpu_doorbell_init(adev); 1749 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1750 return -ENODEV; 1751 1752 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1753 1754 return 0; 1755 } 1756 1757 /* 1758 * GPU helpers function. 1759 */ 1760 /** 1761 * amdgpu_device_need_post - check if the hw need post or not 1762 * 1763 * @adev: amdgpu_device pointer 1764 * 1765 * Check if the asic has been initialized (all asics) at driver startup 1766 * or post is needed if hw reset is performed. 1767 * Returns true if need or false if not. 
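 *
 * Among other things this covers the SR-IOV case (never post), a missing
 * or optional VBIOS, a pending hardware reset, the BIOS scratch registers
 * on CIK+ and, on older parts, the MEM_SIZE config register.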
1768 */ 1769 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1770 { 1771 uint32_t reg, flags; 1772 1773 if (amdgpu_sriov_vf(adev)) 1774 return false; 1775 1776 flags = amdgpu_device_get_vbios_flags(adev); 1777 if (flags & AMDGPU_VBIOS_SKIP) 1778 return false; 1779 if ((flags & AMDGPU_VBIOS_OPTIONAL) && !adev->bios) 1780 return false; 1781 1782 if (amdgpu_passthrough(adev)) { 1783 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1784 * some old smc fw still need driver do vPost otherwise gpu hang, while 1785 * those smc fw version above 22.15 doesn't have this flaw, so we force 1786 * vpost executed for smc version below 22.15 1787 */ 1788 if (adev->asic_type == CHIP_FIJI) { 1789 int err; 1790 uint32_t fw_ver; 1791 1792 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1793 /* force vPost if error occurred */ 1794 if (err) 1795 return true; 1796 1797 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1798 release_firmware(adev->pm.fw); 1799 if (fw_ver < 0x00160e00) 1800 return true; 1801 } 1802 } 1803 1804 /* Don't post if we need to reset whole hive on init */ 1805 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 1806 return false; 1807 1808 if (adev->has_hw_reset) { 1809 adev->has_hw_reset = false; 1810 return true; 1811 } 1812 1813 /* bios scratch used on CIK+ */ 1814 if (adev->asic_type >= CHIP_BONAIRE) 1815 return amdgpu_atombios_scratch_need_asic_init(adev); 1816 1817 /* check MEM_SIZE for older asics */ 1818 reg = amdgpu_asic_get_config_memsize(adev); 1819 1820 if ((reg != 0) && (reg != 0xffffffff)) 1821 return false; 1822 1823 return true; 1824 } 1825 1826 /* 1827 * Check whether seamless boot is supported. 1828 * 1829 * So far we only support seamless boot on DCE 3.0 or later. 1830 * If users report that it works on older ASICS as well, we may 1831 * loosen this. 1832 */ 1833 bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev) 1834 { 1835 switch (amdgpu_seamless) { 1836 case -1: 1837 break; 1838 case 1: 1839 return true; 1840 case 0: 1841 return false; 1842 default: 1843 dev_err(adev->dev, "Invalid value for amdgpu.seamless: %d\n", 1844 amdgpu_seamless); 1845 return false; 1846 } 1847 1848 if (!(adev->flags & AMD_IS_APU)) 1849 return false; 1850 1851 if (adev->mman.keep_stolen_vga_memory) 1852 return false; 1853 1854 return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0); 1855 } 1856 1857 /* 1858 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids 1859 * don't support dynamic speed switching. Until we have confirmation from Intel 1860 * that a specific host supports it, it's safer that we keep it disabled for all. 
1861 * 1862 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/ 1863 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663 1864 */ 1865 static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev) 1866 { 1867 #if IS_ENABLED(CONFIG_X86) 1868 struct cpuinfo_x86 *c = &cpu_data(0); 1869 1870 /* eGPU change speeds based on USB4 fabric conditions */ 1871 if (dev_is_removable(adev->dev)) 1872 return true; 1873 1874 if (c->x86_vendor == X86_VENDOR_INTEL) 1875 return false; 1876 #endif 1877 return true; 1878 } 1879 1880 static bool amdgpu_device_aspm_support_quirk(struct amdgpu_device *adev) 1881 { 1882 #if IS_ENABLED(CONFIG_X86) 1883 struct cpuinfo_x86 *c = &cpu_data(0); 1884 1885 if (!(amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 0) || 1886 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 1))) 1887 return false; 1888 1889 if (c->x86 == 6 && 1890 adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5) { 1891 switch (c->x86_model) { 1892 case VFM_MODEL(INTEL_ALDERLAKE): 1893 case VFM_MODEL(INTEL_ALDERLAKE_L): 1894 case VFM_MODEL(INTEL_RAPTORLAKE): 1895 case VFM_MODEL(INTEL_RAPTORLAKE_P): 1896 case VFM_MODEL(INTEL_RAPTORLAKE_S): 1897 return true; 1898 default: 1899 return false; 1900 } 1901 } else { 1902 return false; 1903 } 1904 #else 1905 return false; 1906 #endif 1907 } 1908 1909 /** 1910 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1911 * 1912 * @adev: amdgpu_device pointer 1913 * 1914 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1915 * be set for this device. 1916 * 1917 * Returns true if it should be used or false if not. 1918 */ 1919 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1920 { 1921 switch (amdgpu_aspm) { 1922 case -1: 1923 break; 1924 case 0: 1925 return false; 1926 case 1: 1927 return true; 1928 default: 1929 return false; 1930 } 1931 if (adev->flags & AMD_IS_APU) 1932 return false; 1933 if (amdgpu_device_aspm_support_quirk(adev)) 1934 return false; 1935 return pcie_aspm_enabled(adev->pdev); 1936 } 1937 1938 /* if we get transitioned to only one device, take VGA back */ 1939 /** 1940 * amdgpu_device_vga_set_decode - enable/disable vga decode 1941 * 1942 * @pdev: PCI device pointer 1943 * @state: enable/disable vga decode 1944 * 1945 * Enable/disable vga decode (all asics). 1946 * Returns VGA resource flags. 1947 */ 1948 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1949 bool state) 1950 { 1951 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1952 1953 amdgpu_asic_set_vga_state(adev, state); 1954 if (state) 1955 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1956 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1957 else 1958 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1959 } 1960 1961 /** 1962 * amdgpu_device_check_block_size - validate the vm block size 1963 * 1964 * @adev: amdgpu_device pointer 1965 * 1966 * Validates the vm block size specified via module parameter. 1967 * The vm block size defines number of bits in page table versus page directory, 1968 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1969 * page table and the remaining bits are in the page directory. 
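 *
 * For example, with the minimum block size of 9 bits and 4KB pages, one
 * page table block covers 2^(9+12) bytes = 2MB of address space.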
1970 */ 1971 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1972 { 1973 /* defines number of bits in page table versus page directory, 1974 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1975 * page table and the remaining bits are in the page directory 1976 */ 1977 if (amdgpu_vm_block_size == -1) 1978 return; 1979 1980 if (amdgpu_vm_block_size < 9) { 1981 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1982 amdgpu_vm_block_size); 1983 amdgpu_vm_block_size = -1; 1984 } 1985 } 1986 1987 /** 1988 * amdgpu_device_check_vm_size - validate the vm size 1989 * 1990 * @adev: amdgpu_device pointer 1991 * 1992 * Validates the vm size in GB specified via module parameter. 1993 * The VM size is the size of the GPU virtual memory space in GB. 1994 */ 1995 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1996 { 1997 /* no need to check the default value */ 1998 if (amdgpu_vm_size == -1) 1999 return; 2000 2001 if (amdgpu_vm_size < 1) { 2002 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 2003 amdgpu_vm_size); 2004 amdgpu_vm_size = -1; 2005 } 2006 } 2007 2008 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 2009 { 2010 struct sysinfo si; 2011 bool is_os_64 = (sizeof(void *) == 8); 2012 uint64_t total_memory; 2013 uint64_t dram_size_seven_GB = 0x1B8000000; 2014 uint64_t dram_size_three_GB = 0xB8000000; 2015 2016 if (amdgpu_smu_memory_pool_size == 0) 2017 return; 2018 2019 if (!is_os_64) { 2020 dev_warn(adev->dev, "Not 64-bit OS, feature not supported\n"); 2021 goto def_value; 2022 } 2023 si_meminfo(&si); 2024 total_memory = (uint64_t)si.totalram * si.mem_unit; 2025 2026 if ((amdgpu_smu_memory_pool_size == 1) || 2027 (amdgpu_smu_memory_pool_size == 2)) { 2028 if (total_memory < dram_size_three_GB) 2029 goto def_value1; 2030 } else if ((amdgpu_smu_memory_pool_size == 4) || 2031 (amdgpu_smu_memory_pool_size == 8)) { 2032 if (total_memory < dram_size_seven_GB) 2033 goto def_value1; 2034 } else { 2035 dev_warn(adev->dev, "Smu memory pool size not supported\n"); 2036 goto def_value; 2037 } 2038 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 2039 2040 return; 2041 2042 def_value1: 2043 dev_warn(adev->dev, "No enough system memory\n"); 2044 def_value: 2045 adev->pm.smu_prv_buffer_size = 0; 2046 } 2047 2048 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 2049 { 2050 if (!(adev->flags & AMD_IS_APU) || 2051 adev->asic_type < CHIP_RAVEN) 2052 return 0; 2053 2054 switch (adev->asic_type) { 2055 case CHIP_RAVEN: 2056 if (adev->pdev->device == 0x15dd) 2057 adev->apu_flags |= AMD_APU_IS_RAVEN; 2058 if (adev->pdev->device == 0x15d8) 2059 adev->apu_flags |= AMD_APU_IS_PICASSO; 2060 break; 2061 case CHIP_RENOIR: 2062 if ((adev->pdev->device == 0x1636) || 2063 (adev->pdev->device == 0x164c)) 2064 adev->apu_flags |= AMD_APU_IS_RENOIR; 2065 else 2066 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 2067 break; 2068 case CHIP_VANGOGH: 2069 adev->apu_flags |= AMD_APU_IS_VANGOGH; 2070 break; 2071 case CHIP_YELLOW_CARP: 2072 break; 2073 case CHIP_CYAN_SKILLFISH: 2074 if ((adev->pdev->device == 0x13FE) || 2075 (adev->pdev->device == 0x143F)) 2076 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 2077 break; 2078 default: 2079 break; 2080 } 2081 2082 return 0; 2083 } 2084 2085 /** 2086 * amdgpu_device_check_arguments - validate module params 2087 * 2088 * @adev: amdgpu_device pointer 2089 * 2090 * Validates certain module parameters and updates 2091 * the associated values used by the 
driver (all asics). 2092 */ 2093 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 2094 { 2095 int i; 2096 2097 if (amdgpu_sched_jobs < 4) { 2098 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 2099 amdgpu_sched_jobs); 2100 amdgpu_sched_jobs = 4; 2101 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 2102 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 2103 amdgpu_sched_jobs); 2104 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 2105 } 2106 2107 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 2108 /* gart size must be greater or equal to 32M */ 2109 dev_warn(adev->dev, "gart size (%d) too small\n", 2110 amdgpu_gart_size); 2111 amdgpu_gart_size = -1; 2112 } 2113 2114 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 2115 /* gtt size must be greater or equal to 32M */ 2116 dev_warn(adev->dev, "gtt size (%d) too small\n", 2117 amdgpu_gtt_size); 2118 amdgpu_gtt_size = -1; 2119 } 2120 2121 /* valid range is between 4 and 9 inclusive */ 2122 if (amdgpu_vm_fragment_size != -1 && 2123 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 2124 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 2125 amdgpu_vm_fragment_size = -1; 2126 } 2127 2128 if (amdgpu_sched_hw_submission < 2) { 2129 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 2130 amdgpu_sched_hw_submission); 2131 amdgpu_sched_hw_submission = 2; 2132 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 2133 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 2134 amdgpu_sched_hw_submission); 2135 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 2136 } 2137 2138 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 2139 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 2140 amdgpu_reset_method = -1; 2141 } 2142 2143 amdgpu_device_check_smu_prv_buffer_size(adev); 2144 2145 amdgpu_device_check_vm_size(adev); 2146 2147 amdgpu_device_check_block_size(adev); 2148 2149 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 2150 2151 for (i = 0; i < MAX_XCP; i++) { 2152 switch (amdgpu_enforce_isolation) { 2153 case -1: 2154 case 0: 2155 default: 2156 /* disable */ 2157 adev->enforce_isolation[i] = AMDGPU_ENFORCE_ISOLATION_DISABLE; 2158 break; 2159 case 1: 2160 /* enable */ 2161 adev->enforce_isolation[i] = 2162 AMDGPU_ENFORCE_ISOLATION_ENABLE; 2163 break; 2164 case 2: 2165 /* enable legacy mode */ 2166 adev->enforce_isolation[i] = 2167 AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY; 2168 break; 2169 case 3: 2170 /* enable only process isolation without submitting cleaner shader */ 2171 adev->enforce_isolation[i] = 2172 AMDGPU_ENFORCE_ISOLATION_NO_CLEANER_SHADER; 2173 break; 2174 } 2175 } 2176 2177 return 0; 2178 } 2179 2180 /** 2181 * amdgpu_switcheroo_set_state - set switcheroo state 2182 * 2183 * @pdev: pci dev pointer 2184 * @state: vga_switcheroo state 2185 * 2186 * Callback for the switcheroo driver. Suspends or resumes 2187 * the asics before or after it is powered up using ACPI methods. 
2188 */ 2189 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 2190 enum vga_switcheroo_state state) 2191 { 2192 struct drm_device *dev = pci_get_drvdata(pdev); 2193 int r; 2194 2195 if (amdgpu_device_supports_px(drm_to_adev(dev)) && 2196 state == VGA_SWITCHEROO_OFF) 2197 return; 2198 2199 if (state == VGA_SWITCHEROO_ON) { 2200 pr_info("switched on\n"); 2201 /* don't suspend or resume card normally */ 2202 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2203 2204 pci_set_power_state(pdev, PCI_D0); 2205 amdgpu_device_load_pci_state(pdev); 2206 r = pci_enable_device(pdev); 2207 if (r) 2208 dev_warn(&pdev->dev, "pci_enable_device failed (%d)\n", 2209 r); 2210 amdgpu_device_resume(dev, true); 2211 2212 dev->switch_power_state = DRM_SWITCH_POWER_ON; 2213 } else { 2214 dev_info(&pdev->dev, "switched off\n"); 2215 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2216 amdgpu_device_prepare(dev); 2217 amdgpu_device_suspend(dev, true); 2218 amdgpu_device_cache_pci_state(pdev); 2219 /* Shut down the device */ 2220 pci_disable_device(pdev); 2221 pci_set_power_state(pdev, PCI_D3cold); 2222 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 2223 } 2224 } 2225 2226 /** 2227 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 2228 * 2229 * @pdev: pci dev pointer 2230 * 2231 * Callback for the switcheroo driver. Check of the switcheroo 2232 * state can be changed. 2233 * Returns true if the state can be changed, false if not. 2234 */ 2235 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 2236 { 2237 struct drm_device *dev = pci_get_drvdata(pdev); 2238 2239 /* 2240 * FIXME: open_count is protected by drm_global_mutex but that would lead to 2241 * locking inversion with the driver load path. And the access here is 2242 * completely racy anyway. So don't bother with locking for now. 2243 */ 2244 return atomic_read(&dev->open_count) == 0; 2245 } 2246 2247 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 2248 .set_gpu_state = amdgpu_switcheroo_set_state, 2249 .reprobe = NULL, 2250 .can_switch = amdgpu_switcheroo_can_switch, 2251 }; 2252 2253 /** 2254 * amdgpu_device_ip_set_clockgating_state - set the CG state 2255 * 2256 * @dev: amdgpu_device pointer 2257 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2258 * @state: clockgating state (gate or ungate) 2259 * 2260 * Sets the requested clockgating state for all instances of 2261 * the hardware IP specified. 2262 * Returns the error code from the last instance. 2263 */ 2264 int amdgpu_device_ip_set_clockgating_state(void *dev, 2265 enum amd_ip_block_type block_type, 2266 enum amd_clockgating_state state) 2267 { 2268 struct amdgpu_device *adev = dev; 2269 int i, r = 0; 2270 2271 for (i = 0; i < adev->num_ip_blocks; i++) { 2272 if (!adev->ip_blocks[i].status.valid) 2273 continue; 2274 if (adev->ip_blocks[i].version->type != block_type) 2275 continue; 2276 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 2277 continue; 2278 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 2279 &adev->ip_blocks[i], state); 2280 if (r) 2281 dev_err(adev->dev, 2282 "set_clockgating_state of IP block <%s> failed %d\n", 2283 adev->ip_blocks[i].version->funcs->name, r); 2284 } 2285 return r; 2286 } 2287 2288 /** 2289 * amdgpu_device_ip_set_powergating_state - set the PG state 2290 * 2291 * @dev: amdgpu_device pointer 2292 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 
2293 * @state: powergating state (gate or ungate) 2294 * 2295 * Sets the requested powergating state for all instances of 2296 * the hardware IP specified. 2297 * Returns the error code from the last instance. 2298 */ 2299 int amdgpu_device_ip_set_powergating_state(void *dev, 2300 enum amd_ip_block_type block_type, 2301 enum amd_powergating_state state) 2302 { 2303 struct amdgpu_device *adev = dev; 2304 int i, r = 0; 2305 2306 for (i = 0; i < adev->num_ip_blocks; i++) { 2307 if (!adev->ip_blocks[i].status.valid) 2308 continue; 2309 if (adev->ip_blocks[i].version->type != block_type) 2310 continue; 2311 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 2312 continue; 2313 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 2314 &adev->ip_blocks[i], state); 2315 if (r) 2316 dev_err(adev->dev, 2317 "set_powergating_state of IP block <%s> failed %d\n", 2318 adev->ip_blocks[i].version->funcs->name, r); 2319 } 2320 return r; 2321 } 2322 2323 /** 2324 * amdgpu_device_ip_get_clockgating_state - get the CG state 2325 * 2326 * @adev: amdgpu_device pointer 2327 * @flags: clockgating feature flags 2328 * 2329 * Walks the list of IPs on the device and updates the clockgating 2330 * flags for each IP. 2331 * Updates @flags with the feature flags for each hardware IP where 2332 * clockgating is enabled. 2333 */ 2334 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 2335 u64 *flags) 2336 { 2337 int i; 2338 2339 for (i = 0; i < adev->num_ip_blocks; i++) { 2340 if (!adev->ip_blocks[i].status.valid) 2341 continue; 2342 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 2343 adev->ip_blocks[i].version->funcs->get_clockgating_state( 2344 &adev->ip_blocks[i], flags); 2345 } 2346 } 2347 2348 /** 2349 * amdgpu_device_ip_wait_for_idle - wait for idle 2350 * 2351 * @adev: amdgpu_device pointer 2352 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2353 * 2354 * Waits for the requested hardware IP to be idle. 2355 * Returns 0 for success or a negative error code on failure. 2356 */ 2357 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 2358 enum amd_ip_block_type block_type) 2359 { 2360 int i, r; 2361 2362 for (i = 0; i < adev->num_ip_blocks; i++) { 2363 if (!adev->ip_blocks[i].status.valid) 2364 continue; 2365 if (adev->ip_blocks[i].version->type == block_type) { 2366 if (adev->ip_blocks[i].version->funcs->wait_for_idle) { 2367 r = adev->ip_blocks[i].version->funcs->wait_for_idle( 2368 &adev->ip_blocks[i]); 2369 if (r) 2370 return r; 2371 } 2372 break; 2373 } 2374 } 2375 return 0; 2376 2377 } 2378 2379 /** 2380 * amdgpu_device_ip_is_valid - is the hardware IP enabled 2381 * 2382 * @adev: amdgpu_device pointer 2383 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2384 * 2385 * Check if the hardware IP is enabled or not. 2386 * Returns true if the IP is enabled, false if not. 2387 */ 2388 bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev, 2389 enum amd_ip_block_type block_type) 2390 { 2391 int i; 2392 2393 for (i = 0; i < adev->num_ip_blocks; i++) { 2394 if (adev->ip_blocks[i].version->type == block_type) 2395 return adev->ip_blocks[i].status.valid; 2396 } 2397 return false; 2398 2399 } 2400 2401 /** 2402 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 2403 * 2404 * @adev: amdgpu_device pointer 2405 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 2406 * 2407 * Returns a pointer to the hardware IP block structure 2408 * if it exists for the asic, otherwise NULL.
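 * For example, amdgpu_device_ip_early_init() uses this to look up the GFX
 * block before deciding whether to probe amdkfd.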
2409 */ 2410 struct amdgpu_ip_block * 2411 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 2412 enum amd_ip_block_type type) 2413 { 2414 int i; 2415 2416 for (i = 0; i < adev->num_ip_blocks; i++) 2417 if (adev->ip_blocks[i].version->type == type) 2418 return &adev->ip_blocks[i]; 2419 2420 return NULL; 2421 } 2422 2423 /** 2424 * amdgpu_device_ip_block_version_cmp 2425 * 2426 * @adev: amdgpu_device pointer 2427 * @type: enum amd_ip_block_type 2428 * @major: major version 2429 * @minor: minor version 2430 * 2431 * return 0 if equal or greater 2432 * return 1 if smaller or the ip_block doesn't exist 2433 */ 2434 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 2435 enum amd_ip_block_type type, 2436 u32 major, u32 minor) 2437 { 2438 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 2439 2440 if (ip_block && ((ip_block->version->major > major) || 2441 ((ip_block->version->major == major) && 2442 (ip_block->version->minor >= minor)))) 2443 return 0; 2444 2445 return 1; 2446 } 2447 2448 /** 2449 * amdgpu_device_ip_block_add 2450 * 2451 * @adev: amdgpu_device pointer 2452 * @ip_block_version: pointer to the IP to add 2453 * 2454 * Adds the IP block driver information to the collection of IPs 2455 * on the asic. 2456 */ 2457 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 2458 const struct amdgpu_ip_block_version *ip_block_version) 2459 { 2460 if (!ip_block_version) 2461 return -EINVAL; 2462 2463 switch (ip_block_version->type) { 2464 case AMD_IP_BLOCK_TYPE_VCN: 2465 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2466 return 0; 2467 break; 2468 case AMD_IP_BLOCK_TYPE_JPEG: 2469 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2470 return 0; 2471 break; 2472 default: 2473 break; 2474 } 2475 2476 dev_info(adev->dev, "detected ip block number %d <%s>\n", 2477 adev->num_ip_blocks, ip_block_version->funcs->name); 2478 2479 adev->ip_blocks[adev->num_ip_blocks].adev = adev; 2480 2481 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2482 2483 return 0; 2484 } 2485 2486 /** 2487 * amdgpu_device_enable_virtual_display - enable virtual display feature 2488 * 2489 * @adev: amdgpu_device pointer 2490 * 2491 * Enabled the virtual display feature if the user has enabled it via 2492 * the module parameter virtual_display. This feature provides a virtual 2493 * display hardware on headless boards or in virtualized environments. 2494 * This function parses and validates the configuration string specified by 2495 * the user and configures the virtual display configuration (number of 2496 * virtual connectors, crtcs, etc.) specified. 
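 * The string is a semicolon-separated list of <PCI address>,<num_crtc> entries
 * (e.g. amdgpu.virtual_display=0000:03:00.0,2, where the PCI address is only
 * an illustration); "all" matches every device and num_crtc is clamped to 1-6.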
2497 */ 2498 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2499 { 2500 adev->enable_virtual_display = false; 2501 2502 if (amdgpu_virtual_display) { 2503 const char *pci_address_name = pci_name(adev->pdev); 2504 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2505 2506 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2507 pciaddstr_tmp = pciaddstr; 2508 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2509 pciaddname = strsep(&pciaddname_tmp, ","); 2510 if (!strcmp("all", pciaddname) 2511 || !strcmp(pci_address_name, pciaddname)) { 2512 long num_crtc; 2513 int res = -1; 2514 2515 adev->enable_virtual_display = true; 2516 2517 if (pciaddname_tmp) 2518 res = kstrtol(pciaddname_tmp, 10, 2519 &num_crtc); 2520 2521 if (!res) { 2522 if (num_crtc < 1) 2523 num_crtc = 1; 2524 if (num_crtc > 6) 2525 num_crtc = 6; 2526 adev->mode_info.num_crtc = num_crtc; 2527 } else { 2528 adev->mode_info.num_crtc = 1; 2529 } 2530 break; 2531 } 2532 } 2533 2534 dev_info( 2535 adev->dev, 2536 "virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2537 amdgpu_virtual_display, pci_address_name, 2538 adev->enable_virtual_display, adev->mode_info.num_crtc); 2539 2540 kfree(pciaddstr); 2541 } 2542 } 2543 2544 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2545 { 2546 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2547 adev->mode_info.num_crtc = 1; 2548 adev->enable_virtual_display = true; 2549 dev_info(adev->dev, "virtual_display:%d, num_crtc:%d\n", 2550 adev->enable_virtual_display, 2551 adev->mode_info.num_crtc); 2552 } 2553 } 2554 2555 /** 2556 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2557 * 2558 * @adev: amdgpu_device pointer 2559 * 2560 * Parses the asic configuration parameters specified in the gpu info 2561 * firmware and makes them available to the driver for use in configuring 2562 * the asic. 2563 * Returns 0 on success, -EINVAL on failure. 
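 * The gpu_info firmware is only requested for ASICs without an IP discovery
 * binary (note the early return below when adev->mman.discovery_bin is set).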
2564 */ 2565 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2566 { 2567 const char *chip_name; 2568 int err; 2569 const struct gpu_info_firmware_header_v1_0 *hdr; 2570 2571 adev->firmware.gpu_info_fw = NULL; 2572 2573 if (adev->mman.discovery_bin) 2574 return 0; 2575 2576 switch (adev->asic_type) { 2577 default: 2578 return 0; 2579 case CHIP_VEGA10: 2580 chip_name = "vega10"; 2581 break; 2582 case CHIP_VEGA12: 2583 chip_name = "vega12"; 2584 break; 2585 case CHIP_RAVEN: 2586 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2587 chip_name = "raven2"; 2588 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2589 chip_name = "picasso"; 2590 else 2591 chip_name = "raven"; 2592 break; 2593 case CHIP_ARCTURUS: 2594 chip_name = "arcturus"; 2595 break; 2596 case CHIP_NAVI12: 2597 chip_name = "navi12"; 2598 break; 2599 } 2600 2601 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, 2602 AMDGPU_UCODE_OPTIONAL, 2603 "amdgpu/%s_gpu_info.bin", chip_name); 2604 if (err) { 2605 dev_err(adev->dev, 2606 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n", 2607 chip_name); 2608 goto out; 2609 } 2610 2611 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2612 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2613 2614 switch (hdr->version_major) { 2615 case 1: 2616 { 2617 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2618 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2619 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2620 2621 /* 2622 * Should be dropped when DAL no longer needs it. 2623 */ 2624 if (adev->asic_type == CHIP_NAVI12) 2625 goto parse_soc_bounding_box; 2626 2627 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2628 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2629 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2630 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2631 adev->gfx.config.max_texture_channel_caches = 2632 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2633 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2634 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2635 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2636 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2637 adev->gfx.config.double_offchip_lds_buf = 2638 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2639 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2640 adev->gfx.cu_info.max_waves_per_simd = 2641 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2642 adev->gfx.cu_info.max_scratch_slots_per_cu = 2643 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2644 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2645 if (hdr->version_minor >= 1) { 2646 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2647 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2648 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2649 adev->gfx.config.num_sc_per_sh = 2650 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2651 adev->gfx.config.num_packer_per_sc = 2652 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2653 } 2654 2655 parse_soc_bounding_box: 2656 /* 2657 * soc bounding box info is not integrated in disocovery table, 2658 * we always need to parse it from gpu info firmware if needed. 
2659 */ 2660 if (hdr->version_minor == 2) { 2661 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2662 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2663 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2664 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2665 } 2666 break; 2667 } 2668 default: 2669 dev_err(adev->dev, 2670 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2671 err = -EINVAL; 2672 goto out; 2673 } 2674 out: 2675 return err; 2676 } 2677 2678 /** 2679 * amdgpu_device_ip_early_init - run early init for hardware IPs 2680 * 2681 * @adev: amdgpu_device pointer 2682 * 2683 * Early initialization pass for hardware IPs. The hardware IPs that make 2684 * up each asic are discovered each IP's early_init callback is run. This 2685 * is the first stage in initializing the asic. 2686 * Returns 0 on success, negative error code on failure. 2687 */ 2688 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2689 { 2690 struct amdgpu_ip_block *ip_block; 2691 struct pci_dev *parent; 2692 bool total, skip_bios; 2693 uint32_t bios_flags; 2694 int i, r; 2695 2696 amdgpu_device_enable_virtual_display(adev); 2697 2698 if (amdgpu_sriov_vf(adev)) { 2699 r = amdgpu_virt_request_full_gpu(adev, true); 2700 if (r) 2701 return r; 2702 } 2703 2704 switch (adev->asic_type) { 2705 #ifdef CONFIG_DRM_AMDGPU_SI 2706 case CHIP_VERDE: 2707 case CHIP_TAHITI: 2708 case CHIP_PITCAIRN: 2709 case CHIP_OLAND: 2710 case CHIP_HAINAN: 2711 adev->family = AMDGPU_FAMILY_SI; 2712 r = si_set_ip_blocks(adev); 2713 if (r) 2714 return r; 2715 break; 2716 #endif 2717 #ifdef CONFIG_DRM_AMDGPU_CIK 2718 case CHIP_BONAIRE: 2719 case CHIP_HAWAII: 2720 case CHIP_KAVERI: 2721 case CHIP_KABINI: 2722 case CHIP_MULLINS: 2723 if (adev->flags & AMD_IS_APU) 2724 adev->family = AMDGPU_FAMILY_KV; 2725 else 2726 adev->family = AMDGPU_FAMILY_CI; 2727 2728 r = cik_set_ip_blocks(adev); 2729 if (r) 2730 return r; 2731 break; 2732 #endif 2733 case CHIP_TOPAZ: 2734 case CHIP_TONGA: 2735 case CHIP_FIJI: 2736 case CHIP_POLARIS10: 2737 case CHIP_POLARIS11: 2738 case CHIP_POLARIS12: 2739 case CHIP_VEGAM: 2740 case CHIP_CARRIZO: 2741 case CHIP_STONEY: 2742 if (adev->flags & AMD_IS_APU) 2743 adev->family = AMDGPU_FAMILY_CZ; 2744 else 2745 adev->family = AMDGPU_FAMILY_VI; 2746 2747 r = vi_set_ip_blocks(adev); 2748 if (r) 2749 return r; 2750 break; 2751 default: 2752 r = amdgpu_discovery_set_ip_blocks(adev); 2753 if (r) 2754 return r; 2755 break; 2756 } 2757 2758 /* Check for IP version 9.4.3 with A0 hardware */ 2759 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) && 2760 !amdgpu_device_get_rev_id(adev)) { 2761 dev_err(adev->dev, "Unsupported A0 hardware\n"); 2762 return -ENODEV; /* device unsupported - no device error */ 2763 } 2764 2765 if (amdgpu_has_atpx() && 2766 (amdgpu_is_atpx_hybrid() || 2767 amdgpu_has_atpx_dgpu_power_cntl()) && 2768 ((adev->flags & AMD_IS_APU) == 0) && 2769 !dev_is_removable(&adev->pdev->dev)) 2770 adev->flags |= AMD_IS_PX; 2771 2772 if (!(adev->flags & AMD_IS_APU)) { 2773 parent = pcie_find_root_port(adev->pdev); 2774 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2775 } 2776 2777 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2778 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2779 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2780 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2781 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2782 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2783 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2784 2785 adev->virt.is_xgmi_node_migrate_enabled = false; 2786 if (amdgpu_sriov_vf(adev)) { 2787 adev->virt.is_xgmi_node_migrate_enabled = 2788 amdgpu_ip_version((adev), GC_HWIP, 0) == IP_VERSION(9, 4, 4); 2789 } 2790 2791 total = true; 2792 for (i = 0; i < adev->num_ip_blocks; i++) { 2793 ip_block = &adev->ip_blocks[i]; 2794 2795 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2796 dev_warn(adev->dev, "disabled ip block: %d <%s>\n", i, 2797 adev->ip_blocks[i].version->funcs->name); 2798 adev->ip_blocks[i].status.valid = false; 2799 } else if (ip_block->version->funcs->early_init) { 2800 r = ip_block->version->funcs->early_init(ip_block); 2801 if (r == -ENOENT) { 2802 adev->ip_blocks[i].status.valid = false; 2803 } else if (r) { 2804 dev_err(adev->dev, 2805 "early_init of IP block <%s> failed %d\n", 2806 adev->ip_blocks[i].version->funcs->name, 2807 r); 2808 total = false; 2809 } else { 2810 adev->ip_blocks[i].status.valid = true; 2811 } 2812 } else { 2813 adev->ip_blocks[i].status.valid = true; 2814 } 2815 /* get the vbios after the asic_funcs are set up */ 2816 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2817 r = amdgpu_device_parse_gpu_info_fw(adev); 2818 if (r) 2819 return r; 2820 2821 bios_flags = amdgpu_device_get_vbios_flags(adev); 2822 skip_bios = !!(bios_flags & AMDGPU_VBIOS_SKIP); 2823 /* Read BIOS */ 2824 if (!skip_bios) { 2825 bool optional = 2826 !!(bios_flags & AMDGPU_VBIOS_OPTIONAL); 2827 if (!amdgpu_get_bios(adev) && !optional) 2828 return -EINVAL; 2829 2830 if (optional && !adev->bios) 2831 dev_info( 2832 adev->dev, 2833 "VBIOS image optional, proceeding without VBIOS image"); 2834 2835 if (adev->bios) { 2836 r = amdgpu_atombios_init(adev); 2837 if (r) { 2838 dev_err(adev->dev, 2839 "amdgpu_atombios_init failed\n"); 2840 amdgpu_vf_error_put( 2841 adev, 2842 AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 2843 0, 0); 2844 return r; 2845 } 2846 } 2847 } 2848 2849 /*get pf2vf msg info at it's earliest time*/ 2850 if (amdgpu_sriov_vf(adev)) 2851 amdgpu_virt_init_data_exchange(adev); 2852 2853 } 2854 } 2855 if (!total) 2856 return -ENODEV; 2857 2858 if (adev->gmc.xgmi.supported) 2859 amdgpu_xgmi_early_init(adev); 2860 2861 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 2862 if (ip_block->status.valid != false) 2863 amdgpu_amdkfd_device_probe(adev); 2864 2865 adev->cg_flags &= amdgpu_cg_mask; 2866 adev->pg_flags &= amdgpu_pg_mask; 2867 2868 return 0; 2869 } 2870 2871 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2872 { 2873 int i, r; 2874 2875 for (i = 0; i < adev->num_ip_blocks; i++) { 2876 if (!adev->ip_blocks[i].status.sw) 2877 continue; 2878 if (adev->ip_blocks[i].status.hw) 2879 continue; 2880 if (!amdgpu_ip_member_of_hwini( 2881 adev, adev->ip_blocks[i].version->type)) 2882 continue; 2883 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2884 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2885 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2886 r = 
adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2887 if (r) { 2888 dev_err(adev->dev, 2889 "hw_init of IP block <%s> failed %d\n", 2890 adev->ip_blocks[i].version->funcs->name, 2891 r); 2892 return r; 2893 } 2894 adev->ip_blocks[i].status.hw = true; 2895 } 2896 } 2897 2898 return 0; 2899 } 2900 2901 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2902 { 2903 int i, r; 2904 2905 for (i = 0; i < adev->num_ip_blocks; i++) { 2906 if (!adev->ip_blocks[i].status.sw) 2907 continue; 2908 if (adev->ip_blocks[i].status.hw) 2909 continue; 2910 if (!amdgpu_ip_member_of_hwini( 2911 adev, adev->ip_blocks[i].version->type)) 2912 continue; 2913 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2914 if (r) { 2915 dev_err(adev->dev, 2916 "hw_init of IP block <%s> failed %d\n", 2917 adev->ip_blocks[i].version->funcs->name, r); 2918 return r; 2919 } 2920 adev->ip_blocks[i].status.hw = true; 2921 } 2922 2923 return 0; 2924 } 2925 2926 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2927 { 2928 int r = 0; 2929 int i; 2930 uint32_t smu_version; 2931 2932 if (adev->asic_type >= CHIP_VEGA10) { 2933 for (i = 0; i < adev->num_ip_blocks; i++) { 2934 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2935 continue; 2936 2937 if (!amdgpu_ip_member_of_hwini(adev, 2938 AMD_IP_BLOCK_TYPE_PSP)) 2939 break; 2940 2941 if (!adev->ip_blocks[i].status.sw) 2942 continue; 2943 2944 /* no need to do the fw loading again if already done*/ 2945 if (adev->ip_blocks[i].status.hw == true) 2946 break; 2947 2948 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2949 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 2950 if (r) 2951 return r; 2952 } else { 2953 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2954 if (r) { 2955 dev_err(adev->dev, 2956 "hw_init of IP block <%s> failed %d\n", 2957 adev->ip_blocks[i] 2958 .version->funcs->name, 2959 r); 2960 return r; 2961 } 2962 adev->ip_blocks[i].status.hw = true; 2963 } 2964 break; 2965 } 2966 } 2967 2968 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2969 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2970 2971 return r; 2972 } 2973 2974 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2975 { 2976 struct drm_sched_init_args args = { 2977 .ops = &amdgpu_sched_ops, 2978 .num_rqs = DRM_SCHED_PRIORITY_COUNT, 2979 .timeout_wq = adev->reset_domain->wq, 2980 .dev = adev->dev, 2981 }; 2982 long timeout; 2983 int r, i; 2984 2985 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2986 struct amdgpu_ring *ring = adev->rings[i]; 2987 2988 /* No need to setup the GPU scheduler for rings that don't need it */ 2989 if (!ring || ring->no_scheduler) 2990 continue; 2991 2992 switch (ring->funcs->type) { 2993 case AMDGPU_RING_TYPE_GFX: 2994 timeout = adev->gfx_timeout; 2995 break; 2996 case AMDGPU_RING_TYPE_COMPUTE: 2997 timeout = adev->compute_timeout; 2998 break; 2999 case AMDGPU_RING_TYPE_SDMA: 3000 timeout = adev->sdma_timeout; 3001 break; 3002 default: 3003 timeout = adev->video_timeout; 3004 break; 3005 } 3006 3007 args.timeout = timeout; 3008 args.credit_limit = ring->num_hw_submission; 3009 args.score = ring->sched_score; 3010 args.name = ring->name; 3011 3012 r = drm_sched_init(&ring->sched, &args); 3013 if (r) { 3014 dev_err(adev->dev, 3015 "Failed to create scheduler on ring %s.\n", 3016 ring->name); 3017 return r; 3018 } 3019 r = amdgpu_uvd_entity_init(adev, ring); 3020 if (r) { 3021 dev_err(adev->dev, 3022 "Failed to create UVD scheduling entity on ring %s.\n", 
3023 ring->name); 3024 return r; 3025 } 3026 r = amdgpu_vce_entity_init(adev, ring); 3027 if (r) { 3028 dev_err(adev->dev, 3029 "Failed to create VCE scheduling entity on ring %s.\n", 3030 ring->name); 3031 return r; 3032 } 3033 } 3034 3035 if (adev->xcp_mgr) 3036 amdgpu_xcp_update_partition_sched_list(adev); 3037 3038 return 0; 3039 } 3040 3041 3042 /** 3043 * amdgpu_device_ip_init - run init for hardware IPs 3044 * 3045 * @adev: amdgpu_device pointer 3046 * 3047 * Main initialization pass for hardware IPs. The list of all the hardware 3048 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 3049 * are run. sw_init initializes the software state associated with each IP 3050 * and hw_init initializes the hardware associated with each IP. 3051 * Returns 0 on success, negative error code on failure. 3052 */ 3053 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 3054 { 3055 bool init_badpage; 3056 int i, r; 3057 3058 r = amdgpu_ras_init(adev); 3059 if (r) 3060 return r; 3061 3062 for (i = 0; i < adev->num_ip_blocks; i++) { 3063 if (!adev->ip_blocks[i].status.valid) 3064 continue; 3065 if (adev->ip_blocks[i].version->funcs->sw_init) { 3066 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]); 3067 if (r) { 3068 dev_err(adev->dev, 3069 "sw_init of IP block <%s> failed %d\n", 3070 adev->ip_blocks[i].version->funcs->name, 3071 r); 3072 goto init_failed; 3073 } 3074 } 3075 adev->ip_blocks[i].status.sw = true; 3076 3077 if (!amdgpu_ip_member_of_hwini( 3078 adev, adev->ip_blocks[i].version->type)) 3079 continue; 3080 3081 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 3082 /* need to do common hw init early so everything is set up for gmc */ 3083 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3084 if (r) { 3085 dev_err(adev->dev, "hw_init %d failed %d\n", i, 3086 r); 3087 goto init_failed; 3088 } 3089 adev->ip_blocks[i].status.hw = true; 3090 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3091 /* need to do gmc hw init early so we can allocate gpu mem */ 3092 /* Try to reserve bad pages early */ 3093 if (amdgpu_sriov_vf(adev)) 3094 amdgpu_virt_exchange_data(adev); 3095 3096 r = amdgpu_device_mem_scratch_init(adev); 3097 if (r) { 3098 dev_err(adev->dev, 3099 "amdgpu_mem_scratch_init failed %d\n", 3100 r); 3101 goto init_failed; 3102 } 3103 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3104 if (r) { 3105 dev_err(adev->dev, "hw_init %d failed %d\n", i, 3106 r); 3107 goto init_failed; 3108 } 3109 r = amdgpu_device_wb_init(adev); 3110 if (r) { 3111 dev_err(adev->dev, 3112 "amdgpu_device_wb_init failed %d\n", r); 3113 goto init_failed; 3114 } 3115 adev->ip_blocks[i].status.hw = true; 3116 3117 /* right after GMC hw init, we create CSA */ 3118 if (adev->gfx.mcbp) { 3119 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 3120 AMDGPU_GEM_DOMAIN_VRAM | 3121 AMDGPU_GEM_DOMAIN_GTT, 3122 AMDGPU_CSA_SIZE); 3123 if (r) { 3124 dev_err(adev->dev, 3125 "allocate CSA failed %d\n", r); 3126 goto init_failed; 3127 } 3128 } 3129 3130 r = amdgpu_seq64_init(adev); 3131 if (r) { 3132 dev_err(adev->dev, "allocate seq64 failed %d\n", 3133 r); 3134 goto init_failed; 3135 } 3136 } 3137 } 3138 3139 if (amdgpu_sriov_vf(adev)) 3140 amdgpu_virt_init_data_exchange(adev); 3141 3142 r = amdgpu_ib_pool_init(adev); 3143 if (r) { 3144 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 3145 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 3146 goto init_failed; 3147 } 
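	/* With sw_init done for all blocks, stage the rest of the hardware bring-up:
	 * create the ucode BO, run hw_init phase 1 (COMMON/IH, plus PSP on SR-IOV),
	 * load firmware, then run hw_init phase 2 for the remaining blocks.
	 */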
3148 3149 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init is complete */ 3150 if (r) 3151 goto init_failed; 3152 3153 r = amdgpu_device_ip_hw_init_phase1(adev); 3154 if (r) 3155 goto init_failed; 3156 3157 r = amdgpu_device_fw_loading(adev); 3158 if (r) 3159 goto init_failed; 3160 3161 r = amdgpu_device_ip_hw_init_phase2(adev); 3162 if (r) 3163 goto init_failed; 3164 3165 /* 3166 * Retired pages will be loaded from EEPROM and reserved here. This must be 3167 * called after amdgpu_device_ip_hw_init_phase2, since for some ASICs the 3168 * RAS EEPROM code relies on the SMU being fully functional for I2C 3169 * communication, which is only true at this point. 3170 * 3171 * amdgpu_ras_recovery_init may fail, but the caller only cares about 3172 * failures caused by a bad GPU state and stops the amdgpu init process 3173 * accordingly. For other failures, it still releases all 3174 * the resources and prints an error message rather than returning a 3175 * negative value to the upper level. 3176 * 3177 * Note: theoretically, this should be called before any VRAM allocations 3178 * to protect retired pages from being reused 3179 */ 3180 init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 3181 r = amdgpu_ras_recovery_init(adev, init_badpage); 3182 if (r) 3183 goto init_failed; 3184 3185 /* 3186 * In case of XGMI, grab an extra reference on the reset domain for this device 3187 */ 3188 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3189 if (amdgpu_xgmi_add_device(adev) == 0) { 3190 if (!amdgpu_sriov_vf(adev)) { 3191 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3192 3193 if (WARN_ON(!hive)) { 3194 r = -ENOENT; 3195 goto init_failed; 3196 } 3197 3198 if (!hive->reset_domain || 3199 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 3200 r = -ENOENT; 3201 amdgpu_put_xgmi_hive(hive); 3202 goto init_failed; 3203 } 3204 3205 /* Drop the early temporary reset domain we created for this device */ 3206 amdgpu_reset_put_reset_domain(adev->reset_domain); 3207 adev->reset_domain = hive->reset_domain; 3208 amdgpu_put_xgmi_hive(hive); 3209 } 3210 } 3211 } 3212 3213 r = amdgpu_device_init_schedulers(adev); 3214 if (r) 3215 goto init_failed; 3216 3217 if (adev->mman.buffer_funcs_ring->sched.ready) 3218 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3219 3220 /* Don't init kfd if the whole hive needs to be reset during init */ 3221 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 3222 kgd2kfd_init_zone_device(adev); 3223 amdgpu_amdkfd_device_init(adev); 3224 } 3225 3226 amdgpu_fru_get_product_info(adev); 3227 3228 if (!amdgpu_sriov_vf(adev) || amdgpu_sriov_ras_cper_en(adev)) 3229 r = amdgpu_cper_init(adev); 3230 3231 init_failed: 3232 3233 return r; 3234 } 3235 3236 /** 3237 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 3238 * 3239 * @adev: amdgpu_device pointer 3240 * 3241 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 3242 * this function before a GPU reset. If the value is retained after a 3243 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 3244 */ 3245 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 3246 { 3247 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 3248 } 3249 3250 /** 3251 * amdgpu_device_check_vram_lost - check if vram is valid 3252 * 3253 * @adev: amdgpu_device pointer 3254 * 3255 * Checks the reset magic value written to the gart pointer in VRAM. 3256 * The driver calls this after a GPU reset to see if the contents of 3257 * VRAM are lost or not.
3258 * returns true if vram is lost, false if not. 3259 */ 3260 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 3261 { 3262 if (memcmp(adev->gart.ptr, adev->reset_magic, 3263 AMDGPU_RESET_MAGIC_NUM)) 3264 return true; 3265 3266 if (!amdgpu_in_reset(adev)) 3267 return false; 3268 3269 /* 3270 * For all ASICs with baco/mode1 reset, the VRAM is 3271 * always assumed to be lost. 3272 */ 3273 switch (amdgpu_asic_reset_method(adev)) { 3274 case AMD_RESET_METHOD_LINK: 3275 case AMD_RESET_METHOD_BACO: 3276 case AMD_RESET_METHOD_MODE1: 3277 return true; 3278 default: 3279 return false; 3280 } 3281 } 3282 3283 /** 3284 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 3285 * 3286 * @adev: amdgpu_device pointer 3287 * @state: clockgating state (gate or ungate) 3288 * 3289 * The list of all the hardware IPs that make up the asic is walked and the 3290 * set_clockgating_state callbacks are run. 3291 * Late initialization pass enabling clockgating for hardware IPs. 3292 * Fini or suspend, pass disabling clockgating for hardware IPs. 3293 * Returns 0 on success, negative error code on failure. 3294 */ 3295 3296 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 3297 enum amd_clockgating_state state) 3298 { 3299 int i, j, r; 3300 3301 if (amdgpu_emu_mode == 1) 3302 return 0; 3303 3304 for (j = 0; j < adev->num_ip_blocks; j++) { 3305 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3306 if (!adev->ip_blocks[i].status.late_initialized) 3307 continue; 3308 /* skip CG for GFX, SDMA on S0ix */ 3309 if (adev->in_s0ix && 3310 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3311 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3312 continue; 3313 /* skip CG for VCE/UVD, it's handled specially */ 3314 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3315 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3316 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3317 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3318 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 3319 /* enable clockgating to save power */ 3320 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i], 3321 state); 3322 if (r) { 3323 dev_err(adev->dev, 3324 "set_clockgating_state(gate) of IP block <%s> failed %d\n", 3325 adev->ip_blocks[i].version->funcs->name, 3326 r); 3327 return r; 3328 } 3329 } 3330 } 3331 3332 return 0; 3333 } 3334 3335 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 3336 enum amd_powergating_state state) 3337 { 3338 int i, j, r; 3339 3340 if (amdgpu_emu_mode == 1) 3341 return 0; 3342 3343 for (j = 0; j < adev->num_ip_blocks; j++) { 3344 i = state == AMD_PG_STATE_GATE ? 
j : adev->num_ip_blocks - j - 1; 3345 if (!adev->ip_blocks[i].status.late_initialized) 3346 continue; 3347 /* skip PG for GFX, SDMA on S0ix */ 3348 if (adev->in_s0ix && 3349 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3350 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3351 continue; 3352 /* skip PG for VCE/UVD, it's handled specially */ 3353 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3354 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3355 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3356 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3357 adev->ip_blocks[i].version->funcs->set_powergating_state) { 3358 /* enable powergating to save power */ 3359 r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i], 3360 state); 3361 if (r) { 3362 dev_err(adev->dev, 3363 "set_powergating_state(gate) of IP block <%s> failed %d\n", 3364 adev->ip_blocks[i].version->funcs->name, 3365 r); 3366 return r; 3367 } 3368 } 3369 } 3370 return 0; 3371 } 3372 3373 static int amdgpu_device_enable_mgpu_fan_boost(void) 3374 { 3375 struct amdgpu_gpu_instance *gpu_ins; 3376 struct amdgpu_device *adev; 3377 int i, ret = 0; 3378 3379 mutex_lock(&mgpu_info.mutex); 3380 3381 /* 3382 * MGPU fan boost feature should be enabled 3383 * only when there are two or more dGPUs in 3384 * the system 3385 */ 3386 if (mgpu_info.num_dgpu < 2) 3387 goto out; 3388 3389 for (i = 0; i < mgpu_info.num_dgpu; i++) { 3390 gpu_ins = &(mgpu_info.gpu_ins[i]); 3391 adev = gpu_ins->adev; 3392 if (!(adev->flags & AMD_IS_APU) && 3393 !gpu_ins->mgpu_fan_enabled) { 3394 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 3395 if (ret) 3396 break; 3397 3398 gpu_ins->mgpu_fan_enabled = 1; 3399 } 3400 } 3401 3402 out: 3403 mutex_unlock(&mgpu_info.mutex); 3404 3405 return ret; 3406 } 3407 3408 /** 3409 * amdgpu_device_ip_late_init - run late init for hardware IPs 3410 * 3411 * @adev: amdgpu_device pointer 3412 * 3413 * Late initialization pass for hardware IPs. The list of all the hardware 3414 * IPs that make up the asic is walked and the late_init callbacks are run. 3415 * late_init covers any special initialization that an IP requires 3416 * after all of the IP blocks have been initialized or something that needs to happen 3417 * late in the init process. 3418 * Returns 0 on success, negative error code on failure.
3419 */ 3420 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 3421 { 3422 struct amdgpu_gpu_instance *gpu_instance; 3423 int i = 0, r; 3424 3425 for (i = 0; i < adev->num_ip_blocks; i++) { 3426 if (!adev->ip_blocks[i].status.hw) 3427 continue; 3428 if (adev->ip_blocks[i].version->funcs->late_init) { 3429 r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]); 3430 if (r) { 3431 dev_err(adev->dev, 3432 "late_init of IP block <%s> failed %d\n", 3433 adev->ip_blocks[i].version->funcs->name, 3434 r); 3435 return r; 3436 } 3437 } 3438 adev->ip_blocks[i].status.late_initialized = true; 3439 } 3440 3441 r = amdgpu_ras_late_init(adev); 3442 if (r) { 3443 dev_err(adev->dev, "amdgpu_ras_late_init failed %d", r); 3444 return r; 3445 } 3446 3447 if (!amdgpu_reset_in_recovery(adev)) 3448 amdgpu_ras_set_error_query_ready(adev, true); 3449 3450 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3451 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3452 3453 amdgpu_device_fill_reset_magic(adev); 3454 3455 r = amdgpu_device_enable_mgpu_fan_boost(); 3456 if (r) 3457 dev_err(adev->dev, "enable mgpu fan boost failed (%d).\n", r); 3458 3459 /* For passthrough configurations on Arcturus and Aldebaran, enable special SBR handling */ 3460 if (amdgpu_passthrough(adev) && 3461 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 3462 adev->asic_type == CHIP_ALDEBARAN)) 3463 amdgpu_dpm_handle_passthrough_sbr(adev, true); 3464 3465 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3466 mutex_lock(&mgpu_info.mutex); 3467 3468 /* 3469 * Reset the device p-state to low, as it was booted with high. 3470 * 3471 * This should be performed only after all devices from the same 3472 * hive get initialized. 3473 * 3474 * However, the number of devices in the hive is not known in advance, 3475 * as they are counted one by one during device initialization. 3476 * 3477 * So, we wait for all XGMI interlinked devices to be initialized. 3478 * This may introduce some delay as those devices may come from 3479 * different hives. But that should be OK.
3480 */ 3481 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3482 for (i = 0; i < mgpu_info.num_gpu; i++) { 3483 gpu_instance = &(mgpu_info.gpu_ins[i]); 3484 if (gpu_instance->adev->flags & AMD_IS_APU) 3485 continue; 3486 3487 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3488 AMDGPU_XGMI_PSTATE_MIN); 3489 if (r) { 3490 dev_err(adev->dev, 3491 "pstate setting failed (%d).\n", 3492 r); 3493 break; 3494 } 3495 } 3496 } 3497 3498 mutex_unlock(&mgpu_info.mutex); 3499 } 3500 3501 return 0; 3502 } 3503 3504 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block) 3505 { 3506 struct amdgpu_device *adev = ip_block->adev; 3507 int r; 3508 3509 if (!ip_block->version->funcs->hw_fini) { 3510 dev_err(adev->dev, "hw_fini of IP block <%s> not defined\n", 3511 ip_block->version->funcs->name); 3512 } else { 3513 r = ip_block->version->funcs->hw_fini(ip_block); 3514 /* XXX handle errors */ 3515 if (r) { 3516 dev_dbg(adev->dev, 3517 "hw_fini of IP block <%s> failed %d\n", 3518 ip_block->version->funcs->name, r); 3519 } 3520 } 3521 3522 ip_block->status.hw = false; 3523 } 3524 3525 /** 3526 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3527 * 3528 * @adev: amdgpu_device pointer 3529 * 3530 * For ASICs need to disable SMC first 3531 */ 3532 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3533 { 3534 int i; 3535 3536 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3537 return; 3538 3539 for (i = 0; i < adev->num_ip_blocks; i++) { 3540 if (!adev->ip_blocks[i].status.hw) 3541 continue; 3542 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3543 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3544 break; 3545 } 3546 } 3547 } 3548 3549 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3550 { 3551 int i, r; 3552 3553 for (i = 0; i < adev->num_ip_blocks; i++) { 3554 if (!adev->ip_blocks[i].version->funcs->early_fini) 3555 continue; 3556 3557 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]); 3558 if (r) { 3559 dev_dbg(adev->dev, 3560 "early_fini of IP block <%s> failed %d\n", 3561 adev->ip_blocks[i].version->funcs->name, r); 3562 } 3563 } 3564 3565 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3566 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3567 3568 amdgpu_amdkfd_suspend(adev, true); 3569 amdgpu_userq_suspend(adev); 3570 3571 /* Workaround for ASICs need to disable SMC first */ 3572 amdgpu_device_smu_fini_early(adev); 3573 3574 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3575 if (!adev->ip_blocks[i].status.hw) 3576 continue; 3577 3578 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3579 } 3580 3581 if (amdgpu_sriov_vf(adev)) { 3582 if (amdgpu_virt_release_full_gpu(adev, false)) 3583 dev_err(adev->dev, 3584 "failed to release exclusive mode on fini\n"); 3585 } 3586 3587 return 0; 3588 } 3589 3590 /** 3591 * amdgpu_device_ip_fini - run fini for hardware IPs 3592 * 3593 * @adev: amdgpu_device pointer 3594 * 3595 * Main teardown pass for hardware IPs. The list of all the hardware 3596 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3597 * are run. hw_fini tears down the hardware associated with each IP 3598 * and sw_fini tears down any software state associated with each IP. 3599 * Returns 0 on success, negative error code on failure. 
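 * Blocks are torn down in the reverse order of their initialization.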
3600 */ 3601 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3602 { 3603 int i, r; 3604 3605 amdgpu_cper_fini(adev); 3606 3607 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3608 amdgpu_virt_release_ras_err_handler_data(adev); 3609 3610 if (adev->gmc.xgmi.num_physical_nodes > 1) 3611 amdgpu_xgmi_remove_device(adev); 3612 3613 amdgpu_amdkfd_device_fini_sw(adev); 3614 3615 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3616 if (!adev->ip_blocks[i].status.sw) 3617 continue; 3618 3619 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3620 amdgpu_ucode_free_bo(adev); 3621 amdgpu_free_static_csa(&adev->virt.csa_obj); 3622 amdgpu_device_wb_fini(adev); 3623 amdgpu_device_mem_scratch_fini(adev); 3624 amdgpu_ib_pool_fini(adev); 3625 amdgpu_seq64_fini(adev); 3626 amdgpu_doorbell_fini(adev); 3627 } 3628 if (adev->ip_blocks[i].version->funcs->sw_fini) { 3629 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]); 3630 /* XXX handle errors */ 3631 if (r) { 3632 dev_dbg(adev->dev, 3633 "sw_fini of IP block <%s> failed %d\n", 3634 adev->ip_blocks[i].version->funcs->name, 3635 r); 3636 } 3637 } 3638 adev->ip_blocks[i].status.sw = false; 3639 adev->ip_blocks[i].status.valid = false; 3640 } 3641 3642 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3643 if (!adev->ip_blocks[i].status.late_initialized) 3644 continue; 3645 if (adev->ip_blocks[i].version->funcs->late_fini) 3646 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]); 3647 adev->ip_blocks[i].status.late_initialized = false; 3648 } 3649 3650 amdgpu_ras_fini(adev); 3651 3652 return 0; 3653 } 3654 3655 /** 3656 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3657 * 3658 * @work: work_struct. 3659 */ 3660 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3661 { 3662 struct amdgpu_device *adev = 3663 container_of(work, struct amdgpu_device, delayed_init_work.work); 3664 int r; 3665 3666 r = amdgpu_ib_ring_tests(adev); 3667 if (r) 3668 dev_err(adev->dev, "ib ring test failed (%d).\n", r); 3669 } 3670 3671 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3672 { 3673 struct amdgpu_device *adev = 3674 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3675 3676 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3677 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3678 3679 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true, 0)) 3680 adev->gfx.gfx_off_state = true; 3681 } 3682 3683 /** 3684 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3685 * 3686 * @adev: amdgpu_device pointer 3687 * 3688 * Main suspend function for hardware IPs. The list of all the hardware 3689 * IPs that make up the asic is walked, clockgating is disabled and the 3690 * suspend callbacks are run. suspend puts the hardware and software state 3691 * in each IP into a state suitable for suspend. 3692 * Returns 0 on success, negative error code on failure. 3693 */ 3694 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3695 { 3696 int i, r; 3697 3698 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3699 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3700 3701 /* 3702 * Per PMFW team's suggestion, driver needs to handle gfxoff 3703 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3704 * scenario. Add the missing df cstate disablement here. 
3705 */ 3706 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3707 dev_warn(adev->dev, "Failed to disallow df cstate"); 3708 3709 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3710 if (!adev->ip_blocks[i].status.valid) 3711 continue; 3712 3713 /* displays are handled separately */ 3714 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3715 continue; 3716 3717 /* XXX handle errors */ 3718 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3719 if (r) 3720 return r; 3721 } 3722 3723 return 0; 3724 } 3725 3726 /** 3727 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3728 * 3729 * @adev: amdgpu_device pointer 3730 * 3731 * Main suspend function for hardware IPs. The list of all the hardware 3732 * IPs that make up the asic is walked, clockgating is disabled and the 3733 * suspend callbacks are run. suspend puts the hardware and software state 3734 * in each IP into a state suitable for suspend. 3735 * Returns 0 on success, negative error code on failure. 3736 */ 3737 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3738 { 3739 int i, r; 3740 3741 if (adev->in_s0ix) 3742 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3743 3744 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3745 if (!adev->ip_blocks[i].status.valid) 3746 continue; 3747 /* displays are handled in phase1 */ 3748 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3749 continue; 3750 /* PSP lost connection when err_event_athub occurs */ 3751 if (amdgpu_ras_intr_triggered() && 3752 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3753 adev->ip_blocks[i].status.hw = false; 3754 continue; 3755 } 3756 3757 /* skip unnecessary suspend if we do not initialize them yet */ 3758 if (!amdgpu_ip_member_of_hwini( 3759 adev, adev->ip_blocks[i].version->type)) 3760 continue; 3761 3762 /* Since we skip suspend for S0i3, we need to cancel the delayed 3763 * idle work here as the suspend callback never gets called. 3764 */ 3765 if (adev->in_s0ix && 3766 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX && 3767 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 0, 0)) 3768 cancel_delayed_work_sync(&adev->gfx.idle_work); 3769 /* skip suspend of gfx/mes and psp for S0ix 3770 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3771 * like at runtime. PSP is also part of the always on hardware 3772 * so no need to suspend it. 3773 */ 3774 if (adev->in_s0ix && 3775 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3776 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3777 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3778 continue; 3779 3780 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3781 if (adev->in_s0ix && 3782 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3783 IP_VERSION(5, 0, 0)) && 3784 (adev->ip_blocks[i].version->type == 3785 AMD_IP_BLOCK_TYPE_SDMA)) 3786 continue; 3787 3788 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3789 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3790 * from this location and RLC Autoload automatically also gets loaded 3791 * from here based on PMFW -> PSP message during re-init sequence. 3792 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3793 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3794 */ 3795 if (amdgpu_in_reset(adev) && 3796 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3797 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3798 continue; 3799 3800 /* XXX handle errors */ 3801 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3802 adev->ip_blocks[i].status.hw = false; 3803 3804 /* handle putting the SMC in the appropriate state */ 3805 if (!amdgpu_sriov_vf(adev)) { 3806 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3807 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3808 if (r) { 3809 dev_err(adev->dev, 3810 "SMC failed to set mp1 state %d, %d\n", 3811 adev->mp1_state, r); 3812 return r; 3813 } 3814 } 3815 } 3816 } 3817 3818 return 0; 3819 } 3820 3821 /** 3822 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3823 * 3824 * @adev: amdgpu_device pointer 3825 * 3826 * Main suspend function for hardware IPs. The list of all the hardware 3827 * IPs that make up the asic is walked, clockgating is disabled and the 3828 * suspend callbacks are run. suspend puts the hardware and software state 3829 * in each IP into a state suitable for suspend. 3830 * Returns 0 on success, negative error code on failure. 3831 */ 3832 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3833 { 3834 int r; 3835 3836 if (amdgpu_sriov_vf(adev)) { 3837 amdgpu_virt_fini_data_exchange(adev); 3838 amdgpu_virt_request_full_gpu(adev, false); 3839 } 3840 3841 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3842 3843 r = amdgpu_device_ip_suspend_phase1(adev); 3844 if (r) 3845 return r; 3846 r = amdgpu_device_ip_suspend_phase2(adev); 3847 3848 if (amdgpu_sriov_vf(adev)) 3849 amdgpu_virt_release_full_gpu(adev, false); 3850 3851 return r; 3852 } 3853 3854 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3855 { 3856 int i, r; 3857 3858 static enum amd_ip_block_type ip_order[] = { 3859 AMD_IP_BLOCK_TYPE_COMMON, 3860 AMD_IP_BLOCK_TYPE_GMC, 3861 AMD_IP_BLOCK_TYPE_PSP, 3862 AMD_IP_BLOCK_TYPE_IH, 3863 }; 3864 3865 for (i = 0; i < adev->num_ip_blocks; i++) { 3866 int j; 3867 struct amdgpu_ip_block *block; 3868 3869 block = &adev->ip_blocks[i]; 3870 block->status.hw = false; 3871 3872 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3873 3874 if (block->version->type != ip_order[j] || 3875 !block->status.valid) 3876 continue; 3877 3878 r = block->version->funcs->hw_init(&adev->ip_blocks[i]); 3879 if (r) { 3880 dev_err(adev->dev, "RE-INIT-early: %s failed\n", 3881 block->version->funcs->name); 3882 return r; 3883 } 3884 block->status.hw = true; 3885 } 3886 } 3887 3888 return 0; 3889 } 3890 3891 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3892 { 3893 struct amdgpu_ip_block *block; 3894 int i, r = 0; 3895 3896 static enum amd_ip_block_type ip_order[] = { 3897 AMD_IP_BLOCK_TYPE_SMC, 3898 AMD_IP_BLOCK_TYPE_DCE, 3899 AMD_IP_BLOCK_TYPE_GFX, 3900 AMD_IP_BLOCK_TYPE_SDMA, 3901 AMD_IP_BLOCK_TYPE_MES, 3902 AMD_IP_BLOCK_TYPE_UVD, 3903 AMD_IP_BLOCK_TYPE_VCE, 3904 AMD_IP_BLOCK_TYPE_VCN, 3905 AMD_IP_BLOCK_TYPE_JPEG 3906 }; 3907 3908 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3909 block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]); 3910 3911 if (!block) 3912 continue; 3913 3914 if (block->status.valid && !block->status.hw) { 3915 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) { 3916 r = amdgpu_ip_block_resume(block); 3917 } else { 3918 r = block->version->funcs->hw_init(block); 3919 } 3920 3921 if (r) { 3922 dev_err(adev->dev, "RE-INIT-late: %s failed\n", 3923 block->version->funcs->name); 3924 break; 3925 } 
3926 block->status.hw = true; 3927 } 3928 } 3929 3930 return r; 3931 } 3932 3933 /** 3934 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3935 * 3936 * @adev: amdgpu_device pointer 3937 * 3938 * First resume function for hardware IPs. The list of all the hardware 3939 * IPs that make up the asic is walked and the resume callbacks are run for 3940 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3941 * after a suspend and updates the software state as necessary. This 3942 * function is also used for restoring the GPU after a GPU reset. 3943 * Returns 0 on success, negative error code on failure. 3944 */ 3945 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3946 { 3947 int i, r; 3948 3949 for (i = 0; i < adev->num_ip_blocks; i++) { 3950 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3951 continue; 3952 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3953 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3954 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3955 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3956 3957 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3958 if (r) 3959 return r; 3960 } 3961 } 3962 3963 return 0; 3964 } 3965 3966 /** 3967 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3968 * 3969 * @adev: amdgpu_device pointer 3970 * 3971 * Second resume function for hardware IPs. The list of all the hardware 3972 * IPs that make up the asic is walked and the resume callbacks are run for 3973 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3974 * functional state after a suspend and updates the software state as 3975 * necessary. This function is also used for restoring the GPU after a GPU 3976 * reset. 3977 * Returns 0 on success, negative error code on failure. 3978 */ 3979 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3980 { 3981 int i, r; 3982 3983 for (i = 0; i < adev->num_ip_blocks; i++) { 3984 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3985 continue; 3986 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3987 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3988 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3989 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE || 3990 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3991 continue; 3992 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3993 if (r) 3994 return r; 3995 } 3996 3997 return 0; 3998 } 3999 4000 /** 4001 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs 4002 * 4003 * @adev: amdgpu_device pointer 4004 * 4005 * Third resume function for hardware IPs. The list of all the hardware 4006 * IPs that make up the asic is walked and the resume callbacks are run for 4007 * all DCE. resume puts the hardware into a functional state after a suspend 4008 * and updates the software state as necessary. This function is also used 4009 * for restoring the GPU after a GPU reset. 4010 * 4011 * Returns 0 on success, negative error code on failure. 
4012 */
4013 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev)
4014 {
4015 int i, r;
4016
4017 for (i = 0; i < adev->num_ip_blocks; i++) {
4018 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
4019 continue;
4020 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) {
4021 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
4022 if (r)
4023 return r;
4024 }
4025 }
4026
4027 return 0;
4028 }
4029
4030 /**
4031 * amdgpu_device_ip_resume - run resume for hardware IPs
4032 *
4033 * @adev: amdgpu_device pointer
4034 *
4035 * Main resume function for hardware IPs. The hardware IPs
4036 * are split into multiple resume phases because they are
4037 * also used in recovering from a GPU reset and some additional
4038 * steps need to be taken between them. In this case (S3/S4) they are
4039 * run sequentially.
4040 * Returns 0 on success, negative error code on failure.
4041 */
4042 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
4043 {
4044 int r;
4045
4046 r = amdgpu_device_ip_resume_phase1(adev);
4047 if (r)
4048 return r;
4049
4050 r = amdgpu_device_fw_loading(adev);
4051 if (r)
4052 return r;
4053
4054 r = amdgpu_device_ip_resume_phase2(adev);
4055
4056 if (adev->mman.buffer_funcs_ring->sched.ready)
4057 amdgpu_ttm_set_buffer_funcs_status(adev, true);
4058
4059 if (r)
4060 return r;
4061
4062 amdgpu_fence_driver_hw_init(adev);
4063
4064 r = amdgpu_device_ip_resume_phase3(adev);
4065
4066 return r;
4067 }
4068
4069 /**
4070 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
4071 *
4072 * @adev: amdgpu_device pointer
4073 *
4074 * Query the VBIOS data tables to determine if the board supports SR-IOV.
4075 */
4076 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
4077 {
4078 if (amdgpu_sriov_vf(adev)) {
4079 if (adev->is_atom_fw) {
4080 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
4081 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
4082 } else {
4083 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
4084 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
4085 }
4086
4087 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
4088 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
4089 }
4090 }
4091
4092 /**
4093 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
4094 *
4095 * @pdev: pci device context
4096 * @asic_type: AMD asic type
4097 *
4098 * Check if there is DC (new modesetting infrastructure) support for an asic.
4099 * Returns true if DC has support, false if not.
4100 */
4101 bool amdgpu_device_asic_has_dc_support(struct pci_dev *pdev,
4102 enum amd_asic_type asic_type)
4103 {
4104 switch (asic_type) {
4105 #ifdef CONFIG_DRM_AMDGPU_SI
4106 case CHIP_HAINAN:
4107 #endif
4108 case CHIP_TOPAZ:
4109 /* chips with no display hardware */
4110 return false;
4111 #if defined(CONFIG_DRM_AMD_DC)
4112 case CHIP_TAHITI:
4113 case CHIP_PITCAIRN:
4114 case CHIP_VERDE:
4115 case CHIP_OLAND:
4116 /*
4117 * We have systems in the wild with these ASICs that require
4118 * LVDS and VGA support which is not supported with DC.
4119 *
4120 * Fall back to the non-DC driver here by default so as not to
4121 * cause regressions.
4122 */
4123 #if defined(CONFIG_DRM_AMD_DC_SI)
4124 return amdgpu_dc > 0;
4125 #else
4126 return false;
4127 #endif
4128 case CHIP_BONAIRE:
4129 case CHIP_KAVERI:
4130 case CHIP_KABINI:
4131 case CHIP_MULLINS:
4132 /*
4133 * We have systems in the wild with these ASICs that require
4134 * VGA support which is not supported with DC.
4135 * 4136 * Fallback to the non-DC driver here by default so as not to 4137 * cause regressions. 4138 */ 4139 return amdgpu_dc > 0; 4140 default: 4141 return amdgpu_dc != 0; 4142 #else 4143 default: 4144 if (amdgpu_dc > 0) 4145 dev_info_once( 4146 &pdev->dev, 4147 "Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 4148 return false; 4149 #endif 4150 } 4151 } 4152 4153 /** 4154 * amdgpu_device_has_dc_support - check if dc is supported 4155 * 4156 * @adev: amdgpu_device pointer 4157 * 4158 * Returns true for supported, false for not supported 4159 */ 4160 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 4161 { 4162 if (adev->enable_virtual_display || 4163 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 4164 return false; 4165 4166 return amdgpu_device_asic_has_dc_support(adev->pdev, adev->asic_type); 4167 } 4168 4169 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 4170 { 4171 struct amdgpu_device *adev = 4172 container_of(__work, struct amdgpu_device, xgmi_reset_work); 4173 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 4174 4175 /* It's a bug to not have a hive within this function */ 4176 if (WARN_ON(!hive)) 4177 return; 4178 4179 /* 4180 * Use task barrier to synchronize all xgmi reset works across the 4181 * hive. task_barrier_enter and task_barrier_exit will block 4182 * until all the threads running the xgmi reset works reach 4183 * those points. task_barrier_full will do both blocks. 4184 */ 4185 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 4186 4187 task_barrier_enter(&hive->tb); 4188 adev->asic_reset_res = amdgpu_device_baco_enter(adev); 4189 4190 if (adev->asic_reset_res) 4191 goto fail; 4192 4193 task_barrier_exit(&hive->tb); 4194 adev->asic_reset_res = amdgpu_device_baco_exit(adev); 4195 4196 if (adev->asic_reset_res) 4197 goto fail; 4198 4199 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 4200 } else { 4201 4202 task_barrier_full(&hive->tb); 4203 adev->asic_reset_res = amdgpu_asic_reset(adev); 4204 } 4205 4206 fail: 4207 if (adev->asic_reset_res) 4208 dev_warn(adev->dev, 4209 "ASIC reset failed with error, %d for drm dev, %s", 4210 adev->asic_reset_res, adev_to_drm(adev)->unique); 4211 amdgpu_put_xgmi_hive(hive); 4212 } 4213 4214 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 4215 { 4216 char *input = amdgpu_lockup_timeout; 4217 char *timeout_setting = NULL; 4218 int index = 0; 4219 long timeout; 4220 int ret = 0; 4221 4222 /* 4223 * By default timeout for non compute jobs is 10000 4224 * and 60000 for compute jobs. 4225 * In SR-IOV or passthrough mode, timeout for compute 4226 * jobs are 60000 by default. 4227 */ 4228 adev->gfx_timeout = msecs_to_jiffies(10000); 4229 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4230 if (amdgpu_sriov_vf(adev)) 4231 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
4232 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 4233 else 4234 adev->compute_timeout = msecs_to_jiffies(60000); 4235 4236 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4237 while ((timeout_setting = strsep(&input, ",")) && 4238 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4239 ret = kstrtol(timeout_setting, 0, &timeout); 4240 if (ret) 4241 return ret; 4242 4243 if (timeout == 0) { 4244 index++; 4245 continue; 4246 } else if (timeout < 0) { 4247 timeout = MAX_SCHEDULE_TIMEOUT; 4248 dev_warn(adev->dev, "lockup timeout disabled"); 4249 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 4250 } else { 4251 timeout = msecs_to_jiffies(timeout); 4252 } 4253 4254 switch (index++) { 4255 case 0: 4256 adev->gfx_timeout = timeout; 4257 break; 4258 case 1: 4259 adev->compute_timeout = timeout; 4260 break; 4261 case 2: 4262 adev->sdma_timeout = timeout; 4263 break; 4264 case 3: 4265 adev->video_timeout = timeout; 4266 break; 4267 default: 4268 break; 4269 } 4270 } 4271 /* 4272 * There is only one value specified and 4273 * it should apply to all non-compute jobs. 4274 */ 4275 if (index == 1) { 4276 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4277 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 4278 adev->compute_timeout = adev->gfx_timeout; 4279 } 4280 } 4281 4282 return ret; 4283 } 4284 4285 /** 4286 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 4287 * 4288 * @adev: amdgpu_device pointer 4289 * 4290 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 4291 */ 4292 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 4293 { 4294 struct iommu_domain *domain; 4295 4296 domain = iommu_get_domain_for_dev(adev->dev); 4297 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 4298 adev->ram_is_direct_mapped = true; 4299 } 4300 4301 #if defined(CONFIG_HSA_AMD_P2P) 4302 /** 4303 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled. 4304 * 4305 * @adev: amdgpu_device pointer 4306 * 4307 * return if IOMMU remapping bar address 4308 */ 4309 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev) 4310 { 4311 struct iommu_domain *domain; 4312 4313 domain = iommu_get_domain_for_dev(adev->dev); 4314 if (domain && (domain->type == IOMMU_DOMAIN_DMA || 4315 domain->type == IOMMU_DOMAIN_DMA_FQ)) 4316 return true; 4317 4318 return false; 4319 } 4320 #endif 4321 4322 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 4323 { 4324 if (amdgpu_mcbp == 1) 4325 adev->gfx.mcbp = true; 4326 else if (amdgpu_mcbp == 0) 4327 adev->gfx.mcbp = false; 4328 4329 if (amdgpu_sriov_vf(adev)) 4330 adev->gfx.mcbp = true; 4331 4332 if (adev->gfx.mcbp) 4333 dev_info(adev->dev, "MCBP is enabled\n"); 4334 } 4335 4336 /** 4337 * amdgpu_device_init - initialize the driver 4338 * 4339 * @adev: amdgpu_device pointer 4340 * @flags: driver flags 4341 * 4342 * Initializes the driver info and hw (all asics). 4343 * Returns 0 for success or an error on failure. 4344 * Called at driver startup. 
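 *
 * Rough ordering sketch, for orientation only (the helpers named here are the
 * ones called in the body below):
 *   amdgpu_device_ip_early_init()   - early per-IP setup
 *   amdgpu_fence_driver_sw_init()   - fence driver software state
 *   amdgpu_device_ip_init()         - hardware init of the IP blocks
 *   amdgpu_device_ip_late_init()    - clockgating etc., skipped at the
 *                                     minimal XGMI init level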
4345 */ 4346 int amdgpu_device_init(struct amdgpu_device *adev, 4347 uint32_t flags) 4348 { 4349 struct pci_dev *pdev = adev->pdev; 4350 int r, i; 4351 bool px = false; 4352 u32 max_MBps; 4353 int tmp; 4354 4355 adev->shutdown = false; 4356 adev->flags = flags; 4357 4358 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 4359 adev->asic_type = amdgpu_force_asic_type; 4360 else 4361 adev->asic_type = flags & AMD_ASIC_MASK; 4362 4363 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 4364 if (amdgpu_emu_mode == 1) 4365 adev->usec_timeout *= 10; 4366 adev->gmc.gart_size = 512 * 1024 * 1024; 4367 adev->accel_working = false; 4368 adev->num_rings = 0; 4369 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 4370 adev->mman.buffer_funcs = NULL; 4371 adev->mman.buffer_funcs_ring = NULL; 4372 adev->vm_manager.vm_pte_funcs = NULL; 4373 adev->vm_manager.vm_pte_num_scheds = 0; 4374 adev->gmc.gmc_funcs = NULL; 4375 adev->harvest_ip_mask = 0x0; 4376 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 4377 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 4378 4379 adev->smc_rreg = &amdgpu_invalid_rreg; 4380 adev->smc_wreg = &amdgpu_invalid_wreg; 4381 adev->pcie_rreg = &amdgpu_invalid_rreg; 4382 adev->pcie_wreg = &amdgpu_invalid_wreg; 4383 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 4384 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 4385 adev->pciep_rreg = &amdgpu_invalid_rreg; 4386 adev->pciep_wreg = &amdgpu_invalid_wreg; 4387 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 4388 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 4389 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 4390 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 4391 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 4392 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 4393 adev->didt_rreg = &amdgpu_invalid_rreg; 4394 adev->didt_wreg = &amdgpu_invalid_wreg; 4395 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 4396 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 4397 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 4398 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 4399 4400 dev_info( 4401 adev->dev, 4402 "initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 4403 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 4404 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 4405 4406 /* mutex initialization are all done here so we 4407 * can recall function without having locking issues 4408 */ 4409 mutex_init(&adev->firmware.mutex); 4410 mutex_init(&adev->pm.mutex); 4411 mutex_init(&adev->gfx.gpu_clock_mutex); 4412 mutex_init(&adev->srbm_mutex); 4413 mutex_init(&adev->gfx.pipe_reserve_mutex); 4414 mutex_init(&adev->gfx.gfx_off_mutex); 4415 mutex_init(&adev->gfx.partition_mutex); 4416 mutex_init(&adev->grbm_idx_mutex); 4417 mutex_init(&adev->mn_lock); 4418 mutex_init(&adev->virt.vf_errors.lock); 4419 hash_init(adev->mn_hash); 4420 mutex_init(&adev->psp.mutex); 4421 mutex_init(&adev->notifier_lock); 4422 mutex_init(&adev->pm.stable_pstate_ctx_lock); 4423 mutex_init(&adev->benchmark_mutex); 4424 mutex_init(&adev->gfx.reset_sem_mutex); 4425 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */ 4426 mutex_init(&adev->enforce_isolation_mutex); 4427 for (i = 0; i < MAX_XCP; ++i) { 4428 adev->isolation[i].spearhead = dma_fence_get_stub(); 4429 amdgpu_sync_create(&adev->isolation[i].active); 4430 amdgpu_sync_create(&adev->isolation[i].prev); 4431 } 4432 mutex_init(&adev->gfx.userq_sch_mutex); 4433 
mutex_init(&adev->gfx.workload_profile_mutex);
4434 mutex_init(&adev->vcn.workload_profile_mutex);
4435 mutex_init(&adev->userq_mutex);
4436
4437 amdgpu_device_init_apu_flags(adev);
4438
4439 r = amdgpu_device_check_arguments(adev);
4440 if (r)
4441 return r;
4442
4443 spin_lock_init(&adev->mmio_idx_lock);
4444 spin_lock_init(&adev->smc_idx_lock);
4445 spin_lock_init(&adev->pcie_idx_lock);
4446 spin_lock_init(&adev->uvd_ctx_idx_lock);
4447 spin_lock_init(&adev->didt_idx_lock);
4448 spin_lock_init(&adev->gc_cac_idx_lock);
4449 spin_lock_init(&adev->se_cac_idx_lock);
4450 spin_lock_init(&adev->audio_endpt_idx_lock);
4451 spin_lock_init(&adev->mm_stats.lock);
4452 spin_lock_init(&adev->virt.rlcg_reg_lock);
4453 spin_lock_init(&adev->wb.lock);
4454
4455 xa_init_flags(&adev->userq_xa, XA_FLAGS_LOCK_IRQ);
4456
4457 INIT_LIST_HEAD(&adev->reset_list);
4458
4459 INIT_LIST_HEAD(&adev->ras_list);
4460
4461 INIT_LIST_HEAD(&adev->pm.od_kobj_list);
4462
4463 INIT_LIST_HEAD(&adev->userq_mgr_list);
4464
4465 INIT_DELAYED_WORK(&adev->delayed_init_work,
4466 amdgpu_device_delayed_init_work_handler);
4467 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
4468 amdgpu_device_delay_enable_gfx_off);
4469 /*
4470 * Initialize the enforce_isolation work structures for each XCP
4471 * partition. This work handler is responsible for enforcing shader
4472 * isolation on AMD GPUs. It counts the number of emitted fences for
4473 * each GFX and compute ring. If there are any fences, it schedules
4474 * the `enforce_isolation_work` to be run after a delay. If there are
4475 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the
4476 * runqueue.
4477 */
4478 for (i = 0; i < MAX_XCP; i++) {
4479 INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work,
4480 amdgpu_gfx_enforce_isolation_handler);
4481 adev->gfx.enforce_isolation[i].adev = adev;
4482 adev->gfx.enforce_isolation[i].xcp_id = i;
4483 }
4484
4485 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
4486
4487 adev->gfx.gfx_off_req_count = 1;
4488 adev->gfx.gfx_off_residency = 0;
4489 adev->gfx.gfx_off_entrycount = 0;
4490 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
4491
4492 atomic_set(&adev->throttling_logging_enabled, 1);
4493 /*
4494 * If throttling continues, logging will be performed every minute
4495 * to avoid log flooding. "-1" is subtracted since the thermal
4496 * throttling interrupt comes every second. Thus, the total logging
4497 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
4498 * for throttling interrupt) = 60 seconds.
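 *
 * Note that the interval argument below is in jiffies, i.e. (60 - 1) * HZ is
 * a 59 second window with a burst of one message.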
4499 */ 4500 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 4501 4502 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 4503 4504 /* Registers mapping */ 4505 /* TODO: block userspace mapping of io register */ 4506 if (adev->asic_type >= CHIP_BONAIRE) { 4507 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 4508 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 4509 } else { 4510 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 4511 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 4512 } 4513 4514 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 4515 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 4516 4517 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 4518 if (!adev->rmmio) 4519 return -ENOMEM; 4520 4521 dev_info(adev->dev, "register mmio base: 0x%08X\n", 4522 (uint32_t)adev->rmmio_base); 4523 dev_info(adev->dev, "register mmio size: %u\n", 4524 (unsigned int)adev->rmmio_size); 4525 4526 /* 4527 * Reset domain needs to be present early, before XGMI hive discovered 4528 * (if any) and initialized to use reset sem and in_gpu reset flag 4529 * early on during init and before calling to RREG32. 4530 */ 4531 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 4532 if (!adev->reset_domain) 4533 return -ENOMEM; 4534 4535 /* detect hw virtualization here */ 4536 amdgpu_virt_init(adev); 4537 4538 amdgpu_device_get_pcie_info(adev); 4539 4540 r = amdgpu_device_get_job_timeout_settings(adev); 4541 if (r) { 4542 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 4543 return r; 4544 } 4545 4546 amdgpu_device_set_mcbp(adev); 4547 4548 /* 4549 * By default, use default mode where all blocks are expected to be 4550 * initialized. At present a 'swinit' of blocks is required to be 4551 * completed before the need for a different level is detected. 4552 */ 4553 amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT); 4554 /* early init functions */ 4555 r = amdgpu_device_ip_early_init(adev); 4556 if (r) 4557 return r; 4558 4559 /* 4560 * No need to remove conflicting FBs for non-display class devices. 4561 * This prevents the sysfb from being freed accidently. 4562 */ 4563 if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA || 4564 (pdev->class >> 8) == PCI_CLASS_DISPLAY_OTHER) { 4565 /* Get rid of things like offb */ 4566 r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name); 4567 if (r) 4568 return r; 4569 } 4570 4571 /* Enable TMZ based on IP_VERSION */ 4572 amdgpu_gmc_tmz_set(adev); 4573 4574 if (amdgpu_sriov_vf(adev) && 4575 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0)) 4576 /* VF MMIO access (except mailbox range) from CPU 4577 * will be blocked during sriov runtime 4578 */ 4579 adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT; 4580 4581 amdgpu_gmc_noretry_set(adev); 4582 /* Need to get xgmi info early to decide the reset behavior*/ 4583 if (adev->gmc.xgmi.supported) { 4584 r = adev->gfxhub.funcs->get_xgmi_info(adev); 4585 if (r) 4586 return r; 4587 } 4588 4589 /* enable PCIE atomic ops */ 4590 if (amdgpu_sriov_vf(adev)) { 4591 if (adev->virt.fw_reserve.p_pf2vf) 4592 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 4593 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 4594 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4595 /* APUs w/ gfx9 onwards doesn't reply on PCIe atomics, rather it is a 4596 * internal path natively support atomics, set have_atomics_support to true. 
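 * Other devices fall through to the final else branch, where the capability
 * is probed via pci_enable_atomic_ops_to_root().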
4597 */ 4598 } else if ((adev->flags & AMD_IS_APU) && 4599 (amdgpu_ip_version(adev, GC_HWIP, 0) > 4600 IP_VERSION(9, 0, 0))) { 4601 adev->have_atomics_support = true; 4602 } else { 4603 adev->have_atomics_support = 4604 !pci_enable_atomic_ops_to_root(adev->pdev, 4605 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 4606 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4607 } 4608 4609 if (!adev->have_atomics_support) 4610 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 4611 4612 /* doorbell bar mapping and doorbell index init*/ 4613 amdgpu_doorbell_init(adev); 4614 4615 if (amdgpu_emu_mode == 1) { 4616 /* post the asic on emulation mode */ 4617 emu_soc_asic_init(adev); 4618 goto fence_driver_init; 4619 } 4620 4621 amdgpu_reset_init(adev); 4622 4623 /* detect if we are with an SRIOV vbios */ 4624 if (adev->bios) 4625 amdgpu_device_detect_sriov_bios(adev); 4626 4627 /* check if we need to reset the asic 4628 * E.g., driver was not cleanly unloaded previously, etc. 4629 */ 4630 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4631 if (adev->gmc.xgmi.num_physical_nodes) { 4632 dev_info(adev->dev, "Pending hive reset.\n"); 4633 amdgpu_set_init_level(adev, 4634 AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 4635 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && 4636 !amdgpu_device_has_display_hardware(adev)) { 4637 r = psp_gpu_reset(adev); 4638 } else { 4639 tmp = amdgpu_reset_method; 4640 /* It should do a default reset when loading or reloading the driver, 4641 * regardless of the module parameter reset_method. 4642 */ 4643 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4644 r = amdgpu_asic_reset(adev); 4645 amdgpu_reset_method = tmp; 4646 } 4647 4648 if (r) { 4649 dev_err(adev->dev, "asic reset on init failed\n"); 4650 goto failed; 4651 } 4652 } 4653 4654 /* Post card if necessary */ 4655 if (amdgpu_device_need_post(adev)) { 4656 if (!adev->bios) { 4657 dev_err(adev->dev, "no vBIOS found\n"); 4658 r = -EINVAL; 4659 goto failed; 4660 } 4661 dev_info(adev->dev, "GPU posting now...\n"); 4662 r = amdgpu_device_asic_init(adev); 4663 if (r) { 4664 dev_err(adev->dev, "gpu post error!\n"); 4665 goto failed; 4666 } 4667 } 4668 4669 if (adev->bios) { 4670 if (adev->is_atom_fw) { 4671 /* Initialize clocks */ 4672 r = amdgpu_atomfirmware_get_clock_info(adev); 4673 if (r) { 4674 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4675 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4676 goto failed; 4677 } 4678 } else { 4679 /* Initialize clocks */ 4680 r = amdgpu_atombios_get_clock_info(adev); 4681 if (r) { 4682 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4683 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4684 goto failed; 4685 } 4686 /* init i2c buses */ 4687 amdgpu_i2c_init(adev); 4688 } 4689 } 4690 4691 fence_driver_init: 4692 /* Fence driver */ 4693 r = amdgpu_fence_driver_sw_init(adev); 4694 if (r) { 4695 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4696 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4697 goto failed; 4698 } 4699 4700 /* init the mode config */ 4701 drm_mode_config_init(adev_to_drm(adev)); 4702 4703 r = amdgpu_device_ip_init(adev); 4704 if (r) { 4705 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4706 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4707 goto release_ras_con; 4708 } 4709 4710 amdgpu_fence_driver_hw_init(adev); 4711 4712 dev_info(adev->dev, 4713 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4714 
adev->gfx.config.max_shader_engines, 4715 adev->gfx.config.max_sh_per_se, 4716 adev->gfx.config.max_cu_per_sh, 4717 adev->gfx.cu_info.number); 4718 4719 adev->accel_working = true; 4720 4721 amdgpu_vm_check_compute_bug(adev); 4722 4723 /* Initialize the buffer migration limit. */ 4724 if (amdgpu_moverate >= 0) 4725 max_MBps = amdgpu_moverate; 4726 else 4727 max_MBps = 8; /* Allow 8 MB/s. */ 4728 /* Get a log2 for easy divisions. */ 4729 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4730 4731 /* 4732 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4733 * Otherwise the mgpu fan boost feature will be skipped due to the 4734 * gpu instance is counted less. 4735 */ 4736 amdgpu_register_gpu_instance(adev); 4737 4738 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4739 * explicit gating rather than handling it automatically. 4740 */ 4741 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 4742 r = amdgpu_device_ip_late_init(adev); 4743 if (r) { 4744 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4745 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4746 goto release_ras_con; 4747 } 4748 /* must succeed. */ 4749 amdgpu_ras_resume(adev); 4750 queue_delayed_work(system_wq, &adev->delayed_init_work, 4751 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4752 } 4753 4754 if (amdgpu_sriov_vf(adev)) { 4755 amdgpu_virt_release_full_gpu(adev, true); 4756 flush_delayed_work(&adev->delayed_init_work); 4757 } 4758 4759 /* 4760 * Place those sysfs registering after `late_init`. As some of those 4761 * operations performed in `late_init` might affect the sysfs 4762 * interfaces creating. 4763 */ 4764 r = amdgpu_atombios_sysfs_init(adev); 4765 if (r) 4766 drm_err(&adev->ddev, 4767 "registering atombios sysfs failed (%d).\n", r); 4768 4769 r = amdgpu_pm_sysfs_init(adev); 4770 if (r) 4771 dev_err(adev->dev, "registering pm sysfs failed (%d).\n", r); 4772 4773 r = amdgpu_ucode_sysfs_init(adev); 4774 if (r) { 4775 adev->ucode_sysfs_en = false; 4776 dev_err(adev->dev, "Creating firmware sysfs failed (%d).\n", r); 4777 } else 4778 adev->ucode_sysfs_en = true; 4779 4780 r = amdgpu_device_attr_sysfs_init(adev); 4781 if (r) 4782 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4783 4784 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4785 if (r) 4786 dev_err(adev->dev, 4787 "Could not create amdgpu board attributes\n"); 4788 4789 amdgpu_fru_sysfs_init(adev); 4790 amdgpu_reg_state_sysfs_init(adev); 4791 amdgpu_xcp_sysfs_init(adev); 4792 4793 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4794 r = amdgpu_pmu_init(adev); 4795 if (r) 4796 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4797 4798 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4799 if (amdgpu_device_cache_pci_state(adev->pdev)) 4800 pci_restore_state(pdev); 4801 4802 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4803 /* this will fail for cards that aren't VGA class devices, just 4804 * ignore it 4805 */ 4806 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4807 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4808 4809 px = amdgpu_device_supports_px(adev); 4810 4811 if (px || (!dev_is_removable(&adev->pdev->dev) && 4812 apple_gmux_detect(NULL, NULL))) 4813 vga_switcheroo_register_client(adev->pdev, 4814 &amdgpu_switcheroo_ops, px); 4815 4816 if (px) 4817 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4818 4819 if (adev->init_lvl->level == 
AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 4820 amdgpu_xgmi_reset_on_init(adev); 4821 4822 amdgpu_device_check_iommu_direct_map(adev); 4823 4824 adev->pm_nb.notifier_call = amdgpu_device_pm_notifier; 4825 r = register_pm_notifier(&adev->pm_nb); 4826 if (r) 4827 goto failed; 4828 4829 return 0; 4830 4831 release_ras_con: 4832 if (amdgpu_sriov_vf(adev)) 4833 amdgpu_virt_release_full_gpu(adev, true); 4834 4835 /* failed in exclusive mode due to timeout */ 4836 if (amdgpu_sriov_vf(adev) && 4837 !amdgpu_sriov_runtime(adev) && 4838 amdgpu_virt_mmio_blocked(adev) && 4839 !amdgpu_virt_wait_reset(adev)) { 4840 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4841 /* Don't send request since VF is inactive. */ 4842 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4843 adev->virt.ops = NULL; 4844 r = -EAGAIN; 4845 } 4846 amdgpu_release_ras_context(adev); 4847 4848 failed: 4849 amdgpu_vf_error_trans_all(adev); 4850 4851 return r; 4852 } 4853 4854 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4855 { 4856 4857 /* Clear all CPU mappings pointing to this device */ 4858 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4859 4860 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4861 amdgpu_doorbell_fini(adev); 4862 4863 iounmap(adev->rmmio); 4864 adev->rmmio = NULL; 4865 if (adev->mman.aper_base_kaddr) 4866 iounmap(adev->mman.aper_base_kaddr); 4867 adev->mman.aper_base_kaddr = NULL; 4868 4869 /* Memory manager related */ 4870 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4871 arch_phys_wc_del(adev->gmc.vram_mtrr); 4872 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4873 } 4874 } 4875 4876 /** 4877 * amdgpu_device_fini_hw - tear down the driver 4878 * 4879 * @adev: amdgpu_device pointer 4880 * 4881 * Tear down the driver info (all asics). 4882 * Called at driver shutdown. 
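 *
 * Only the hardware-facing teardown is done here; the remaining software
 * state is released later by amdgpu_device_fini_sw().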
4883 */ 4884 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4885 { 4886 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4887 flush_delayed_work(&adev->delayed_init_work); 4888 4889 if (adev->mman.initialized) 4890 drain_workqueue(adev->mman.bdev.wq); 4891 adev->shutdown = true; 4892 4893 unregister_pm_notifier(&adev->pm_nb); 4894 4895 /* make sure IB test finished before entering exclusive mode 4896 * to avoid preemption on IB test 4897 */ 4898 if (amdgpu_sriov_vf(adev)) { 4899 amdgpu_virt_request_full_gpu(adev, false); 4900 amdgpu_virt_fini_data_exchange(adev); 4901 } 4902 4903 /* disable all interrupts */ 4904 amdgpu_irq_disable_all(adev); 4905 if (adev->mode_info.mode_config_initialized) { 4906 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4907 drm_helper_force_disable_all(adev_to_drm(adev)); 4908 else 4909 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4910 } 4911 amdgpu_fence_driver_hw_fini(adev); 4912 4913 if (adev->pm.sysfs_initialized) 4914 amdgpu_pm_sysfs_fini(adev); 4915 if (adev->ucode_sysfs_en) 4916 amdgpu_ucode_sysfs_fini(adev); 4917 amdgpu_device_attr_sysfs_fini(adev); 4918 amdgpu_fru_sysfs_fini(adev); 4919 4920 amdgpu_reg_state_sysfs_fini(adev); 4921 amdgpu_xcp_sysfs_fini(adev); 4922 4923 /* disable ras feature must before hw fini */ 4924 amdgpu_ras_pre_fini(adev); 4925 4926 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4927 4928 amdgpu_device_ip_fini_early(adev); 4929 4930 amdgpu_irq_fini_hw(adev); 4931 4932 if (adev->mman.initialized) 4933 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4934 4935 amdgpu_gart_dummy_page_fini(adev); 4936 4937 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4938 amdgpu_device_unmap_mmio(adev); 4939 4940 } 4941 4942 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4943 { 4944 int i, idx; 4945 bool px; 4946 4947 amdgpu_device_ip_fini(adev); 4948 amdgpu_fence_driver_sw_fini(adev); 4949 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4950 adev->accel_working = false; 4951 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4952 for (i = 0; i < MAX_XCP; ++i) { 4953 dma_fence_put(adev->isolation[i].spearhead); 4954 amdgpu_sync_free(&adev->isolation[i].active); 4955 amdgpu_sync_free(&adev->isolation[i].prev); 4956 } 4957 4958 amdgpu_reset_fini(adev); 4959 4960 /* free i2c buses */ 4961 amdgpu_i2c_fini(adev); 4962 4963 if (adev->bios) { 4964 if (amdgpu_emu_mode != 1) 4965 amdgpu_atombios_fini(adev); 4966 amdgpu_bios_release(adev); 4967 } 4968 4969 kfree(adev->fru_info); 4970 adev->fru_info = NULL; 4971 4972 kfree(adev->xcp_mgr); 4973 adev->xcp_mgr = NULL; 4974 4975 px = amdgpu_device_supports_px(adev); 4976 4977 if (px || (!dev_is_removable(&adev->pdev->dev) && 4978 apple_gmux_detect(NULL, NULL))) 4979 vga_switcheroo_unregister_client(adev->pdev); 4980 4981 if (px) 4982 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4983 4984 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4985 vga_client_unregister(adev->pdev); 4986 4987 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4988 4989 iounmap(adev->rmmio); 4990 adev->rmmio = NULL; 4991 drm_dev_exit(idx); 4992 } 4993 4994 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4995 amdgpu_pmu_fini(adev); 4996 if (adev->mman.discovery_bin) 4997 amdgpu_discovery_fini(adev); 4998 4999 amdgpu_reset_put_reset_domain(adev->reset_domain); 5000 adev->reset_domain = NULL; 5001 5002 kfree(adev->pci_state); 5003 5004 } 5005 5006 /** 5007 * amdgpu_device_evict_resources - evict device resources 5008 * @adev: amdgpu device object 5009 * 5010 * Evicts all ttm device resources(vram BOs, gart 
table) from the lru list 5011 * of the vram memory type. Mainly used for evicting device resources 5012 * at suspend time. 5013 * 5014 */ 5015 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 5016 { 5017 int ret; 5018 5019 /* No need to evict vram on APUs unless going to S4 */ 5020 if (!adev->in_s4 && (adev->flags & AMD_IS_APU)) 5021 return 0; 5022 5023 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 5024 if (ret) 5025 dev_warn(adev->dev, "evicting device resources failed\n"); 5026 return ret; 5027 } 5028 5029 /* 5030 * Suspend & resume. 5031 */ 5032 /** 5033 * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events 5034 * @nb: notifier block 5035 * @mode: suspend mode 5036 * @data: data 5037 * 5038 * This function is called when the system is about to suspend or hibernate. 5039 * It is used to set the appropriate flags so that eviction can be optimized 5040 * in the pm prepare callback. 5041 */ 5042 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 5043 void *data) 5044 { 5045 struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb); 5046 5047 switch (mode) { 5048 case PM_HIBERNATION_PREPARE: 5049 adev->in_s4 = true; 5050 break; 5051 case PM_POST_HIBERNATION: 5052 adev->in_s4 = false; 5053 break; 5054 } 5055 5056 return NOTIFY_DONE; 5057 } 5058 5059 /** 5060 * amdgpu_device_prepare - prepare for device suspend 5061 * 5062 * @dev: drm dev pointer 5063 * 5064 * Prepare to put the hw in the suspend state (all asics). 5065 * Returns 0 for success or an error on failure. 5066 * Called at driver suspend. 5067 */ 5068 int amdgpu_device_prepare(struct drm_device *dev) 5069 { 5070 struct amdgpu_device *adev = drm_to_adev(dev); 5071 int i, r; 5072 5073 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5074 return 0; 5075 5076 /* Evict the majority of BOs before starting suspend sequence */ 5077 r = amdgpu_device_evict_resources(adev); 5078 if (r) 5079 return r; 5080 5081 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 5082 5083 for (i = 0; i < adev->num_ip_blocks; i++) { 5084 if (!adev->ip_blocks[i].status.valid) 5085 continue; 5086 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 5087 continue; 5088 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]); 5089 if (r) 5090 return r; 5091 } 5092 5093 return 0; 5094 } 5095 5096 /** 5097 * amdgpu_device_complete - complete power state transition 5098 * 5099 * @dev: drm dev pointer 5100 * 5101 * Undo the changes from amdgpu_device_prepare. This will be 5102 * called on all resume transitions, including those that failed. 5103 */ 5104 void amdgpu_device_complete(struct drm_device *dev) 5105 { 5106 struct amdgpu_device *adev = drm_to_adev(dev); 5107 int i; 5108 5109 for (i = 0; i < adev->num_ip_blocks; i++) { 5110 if (!adev->ip_blocks[i].status.valid) 5111 continue; 5112 if (!adev->ip_blocks[i].version->funcs->complete) 5113 continue; 5114 adev->ip_blocks[i].version->funcs->complete(&adev->ip_blocks[i]); 5115 } 5116 } 5117 5118 /** 5119 * amdgpu_device_suspend - initiate device suspend 5120 * 5121 * @dev: drm dev pointer 5122 * @notify_clients: notify in-kernel DRM clients 5123 * 5124 * Puts the hw in the suspend state (all asics). 5125 * Returns 0 for success or an error on failure. 5126 * Called at driver suspend. 
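 *
 * Ordering sketch, mirroring the body below: display IPs are suspended first
 * (phase 1), buffers are evicted from VRAM, the fence driver hardware state
 * is torn down, and the remaining IPs are suspended in phase 2.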
5127 */ 5128 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) 5129 { 5130 struct amdgpu_device *adev = drm_to_adev(dev); 5131 int r = 0; 5132 5133 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5134 return 0; 5135 5136 adev->in_suspend = true; 5137 5138 if (amdgpu_sriov_vf(adev)) { 5139 if (!adev->in_s0ix && !adev->in_runpm) 5140 amdgpu_amdkfd_suspend_process(adev); 5141 amdgpu_virt_fini_data_exchange(adev); 5142 r = amdgpu_virt_request_full_gpu(adev, false); 5143 if (r) 5144 return r; 5145 } 5146 5147 if (amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D3)) 5148 dev_warn(adev->dev, "smart shift update failed\n"); 5149 5150 if (notify_clients) 5151 drm_client_dev_suspend(adev_to_drm(adev), false); 5152 5153 cancel_delayed_work_sync(&adev->delayed_init_work); 5154 5155 amdgpu_ras_suspend(adev); 5156 5157 amdgpu_device_ip_suspend_phase1(adev); 5158 5159 if (!adev->in_s0ix) { 5160 amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); 5161 amdgpu_userq_suspend(adev); 5162 } 5163 5164 r = amdgpu_device_evict_resources(adev); 5165 if (r) 5166 return r; 5167 5168 amdgpu_ttm_set_buffer_funcs_status(adev, false); 5169 5170 amdgpu_fence_driver_hw_fini(adev); 5171 5172 amdgpu_device_ip_suspend_phase2(adev); 5173 5174 if (amdgpu_sriov_vf(adev)) 5175 amdgpu_virt_release_full_gpu(adev, false); 5176 5177 r = amdgpu_dpm_notify_rlc_state(adev, false); 5178 if (r) 5179 return r; 5180 5181 return 0; 5182 } 5183 5184 static inline int amdgpu_virt_resume(struct amdgpu_device *adev) 5185 { 5186 int r; 5187 unsigned int prev_physical_node_id = adev->gmc.xgmi.physical_node_id; 5188 5189 /* During VM resume, QEMU programming of VF MSIX table (register GFXMSIX_VECT0_ADDR_LO) 5190 * may not work. The access could be blocked by nBIF protection as VF isn't in 5191 * exclusive access mode. Exclusive access is enabled now, disable/enable MSIX 5192 * so that QEMU reprograms MSIX table. 5193 */ 5194 amdgpu_restore_msix(adev); 5195 5196 r = adev->gfxhub.funcs->get_xgmi_info(adev); 5197 if (r) 5198 return r; 5199 5200 dev_info(adev->dev, "xgmi node, old id %d, new id %d\n", 5201 prev_physical_node_id, adev->gmc.xgmi.physical_node_id); 5202 5203 adev->vm_manager.vram_base_offset = adev->gfxhub.funcs->get_mc_fb_offset(adev); 5204 adev->vm_manager.vram_base_offset += 5205 adev->gmc.xgmi.physical_node_id * adev->gmc.xgmi.node_segment_size; 5206 5207 return 0; 5208 } 5209 5210 /** 5211 * amdgpu_device_resume - initiate device resume 5212 * 5213 * @dev: drm dev pointer 5214 * @notify_clients: notify in-kernel DRM clients 5215 * 5216 * Bring the hw back to operating state (all asics). 5217 * Returns 0 for success or an error on failure. 5218 * Called at driver resume. 
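 *
 * Ordering sketch, mirroring the body below: the card is re-posted if needed,
 * amdgpu_device_ip_resume() brings the IP blocks back up (firmware loading
 * between phase 1 and phase 2, displays last in phase 3), then KFD and user
 * queues are resumed and late init runs.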
5219 */ 5220 int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) 5221 { 5222 struct amdgpu_device *adev = drm_to_adev(dev); 5223 int r = 0; 5224 5225 if (amdgpu_sriov_vf(adev)) { 5226 r = amdgpu_virt_request_full_gpu(adev, true); 5227 if (r) 5228 return r; 5229 } 5230 5231 if (amdgpu_virt_xgmi_migrate_enabled(adev)) { 5232 r = amdgpu_virt_resume(adev); 5233 if (r) 5234 goto exit; 5235 } 5236 5237 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5238 return 0; 5239 5240 if (adev->in_s0ix) 5241 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 5242 5243 /* post card */ 5244 if (amdgpu_device_need_post(adev)) { 5245 r = amdgpu_device_asic_init(adev); 5246 if (r) 5247 dev_err(adev->dev, "amdgpu asic init failed\n"); 5248 } 5249 5250 r = amdgpu_device_ip_resume(adev); 5251 5252 if (r) { 5253 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 5254 goto exit; 5255 } 5256 5257 if (!adev->in_s0ix) { 5258 r = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); 5259 if (r) 5260 goto exit; 5261 5262 r = amdgpu_userq_resume(adev); 5263 if (r) 5264 goto exit; 5265 } 5266 5267 r = amdgpu_device_ip_late_init(adev); 5268 if (r) 5269 goto exit; 5270 5271 queue_delayed_work(system_wq, &adev->delayed_init_work, 5272 msecs_to_jiffies(AMDGPU_RESUME_MS)); 5273 exit: 5274 if (amdgpu_sriov_vf(adev)) { 5275 amdgpu_virt_init_data_exchange(adev); 5276 amdgpu_virt_release_full_gpu(adev, true); 5277 5278 if (!adev->in_s0ix && !r && !adev->in_runpm) 5279 r = amdgpu_amdkfd_resume_process(adev); 5280 } 5281 5282 if (r) 5283 return r; 5284 5285 /* Make sure IB tests flushed */ 5286 flush_delayed_work(&adev->delayed_init_work); 5287 5288 if (notify_clients) 5289 drm_client_dev_resume(adev_to_drm(adev), false); 5290 5291 amdgpu_ras_resume(adev); 5292 5293 if (adev->mode_info.num_crtc) { 5294 /* 5295 * Most of the connector probing functions try to acquire runtime pm 5296 * refs to ensure that the GPU is powered on when connector polling is 5297 * performed. Since we're calling this from a runtime PM callback, 5298 * trying to acquire rpm refs will cause us to deadlock. 5299 * 5300 * Since we're guaranteed to be holding the rpm lock, it's safe to 5301 * temporarily disable the rpm helpers so this doesn't deadlock us. 5302 */ 5303 #ifdef CONFIG_PM 5304 dev->dev->power.disable_depth++; 5305 #endif 5306 if (!adev->dc_enabled) 5307 drm_helper_hpd_irq_event(dev); 5308 else 5309 drm_kms_helper_hotplug_event(dev); 5310 #ifdef CONFIG_PM 5311 dev->dev->power.disable_depth--; 5312 #endif 5313 } 5314 adev->in_suspend = false; 5315 5316 if (amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0)) 5317 dev_warn(adev->dev, "smart shift update failed\n"); 5318 5319 return 0; 5320 } 5321 5322 /** 5323 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 5324 * 5325 * @adev: amdgpu_device pointer 5326 * 5327 * The list of all the hardware IPs that make up the asic is walked and 5328 * the check_soft_reset callbacks are run. check_soft_reset determines 5329 * if the asic is still hung or not. 5330 * Returns true if any of the IPs are still in a hung state, false if not. 
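 *
 * Together with the pre/soft/post soft-reset helpers below, this implements
 * the soft reset path used by amdgpu_device_pre_asic_reset():
 *   check_soft_reset -> pre_soft_reset -> soft_reset -> post_soft_reset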
5331 */ 5332 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 5333 { 5334 int i; 5335 bool asic_hang = false; 5336 5337 if (amdgpu_sriov_vf(adev)) 5338 return true; 5339 5340 if (amdgpu_asic_need_full_reset(adev)) 5341 return true; 5342 5343 for (i = 0; i < adev->num_ip_blocks; i++) { 5344 if (!adev->ip_blocks[i].status.valid) 5345 continue; 5346 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 5347 adev->ip_blocks[i].status.hang = 5348 adev->ip_blocks[i].version->funcs->check_soft_reset( 5349 &adev->ip_blocks[i]); 5350 if (adev->ip_blocks[i].status.hang) { 5351 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 5352 asic_hang = true; 5353 } 5354 } 5355 return asic_hang; 5356 } 5357 5358 /** 5359 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 5360 * 5361 * @adev: amdgpu_device pointer 5362 * 5363 * The list of all the hardware IPs that make up the asic is walked and the 5364 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 5365 * handles any IP specific hardware or software state changes that are 5366 * necessary for a soft reset to succeed. 5367 * Returns 0 on success, negative error code on failure. 5368 */ 5369 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 5370 { 5371 int i, r = 0; 5372 5373 for (i = 0; i < adev->num_ip_blocks; i++) { 5374 if (!adev->ip_blocks[i].status.valid) 5375 continue; 5376 if (adev->ip_blocks[i].status.hang && 5377 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 5378 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]); 5379 if (r) 5380 return r; 5381 } 5382 } 5383 5384 return 0; 5385 } 5386 5387 /** 5388 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 5389 * 5390 * @adev: amdgpu_device pointer 5391 * 5392 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 5393 * reset is necessary to recover. 5394 * Returns true if a full asic reset is required, false if not. 5395 */ 5396 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 5397 { 5398 int i; 5399 5400 if (amdgpu_asic_need_full_reset(adev)) 5401 return true; 5402 5403 for (i = 0; i < adev->num_ip_blocks; i++) { 5404 if (!adev->ip_blocks[i].status.valid) 5405 continue; 5406 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 5407 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 5408 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 5409 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 5410 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 5411 if (adev->ip_blocks[i].status.hang) { 5412 dev_info(adev->dev, "Some block need full reset!\n"); 5413 return true; 5414 } 5415 } 5416 } 5417 return false; 5418 } 5419 5420 /** 5421 * amdgpu_device_ip_soft_reset - do a soft reset 5422 * 5423 * @adev: amdgpu_device pointer 5424 * 5425 * The list of all the hardware IPs that make up the asic is walked and the 5426 * soft_reset callbacks are run if the block is hung. soft_reset handles any 5427 * IP specific hardware or software state changes that are necessary to soft 5428 * reset the IP. 5429 * Returns 0 on success, negative error code on failure. 
5430 */ 5431 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 5432 { 5433 int i, r = 0; 5434 5435 for (i = 0; i < adev->num_ip_blocks; i++) { 5436 if (!adev->ip_blocks[i].status.valid) 5437 continue; 5438 if (adev->ip_blocks[i].status.hang && 5439 adev->ip_blocks[i].version->funcs->soft_reset) { 5440 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]); 5441 if (r) 5442 return r; 5443 } 5444 } 5445 5446 return 0; 5447 } 5448 5449 /** 5450 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 5451 * 5452 * @adev: amdgpu_device pointer 5453 * 5454 * The list of all the hardware IPs that make up the asic is walked and the 5455 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 5456 * handles any IP specific hardware or software state changes that are 5457 * necessary after the IP has been soft reset. 5458 * Returns 0 on success, negative error code on failure. 5459 */ 5460 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 5461 { 5462 int i, r = 0; 5463 5464 for (i = 0; i < adev->num_ip_blocks; i++) { 5465 if (!adev->ip_blocks[i].status.valid) 5466 continue; 5467 if (adev->ip_blocks[i].status.hang && 5468 adev->ip_blocks[i].version->funcs->post_soft_reset) 5469 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]); 5470 if (r) 5471 return r; 5472 } 5473 5474 return 0; 5475 } 5476 5477 /** 5478 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 5479 * 5480 * @adev: amdgpu_device pointer 5481 * @reset_context: amdgpu reset context pointer 5482 * 5483 * do VF FLR and reinitialize Asic 5484 * return 0 means succeeded otherwise failed 5485 */ 5486 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 5487 struct amdgpu_reset_context *reset_context) 5488 { 5489 int r; 5490 struct amdgpu_hive_info *hive = NULL; 5491 5492 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { 5493 if (!amdgpu_ras_get_fed_status(adev)) 5494 amdgpu_virt_ready_to_reset(adev); 5495 amdgpu_virt_wait_reset(adev); 5496 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5497 r = amdgpu_virt_request_full_gpu(adev, true); 5498 } else { 5499 r = amdgpu_virt_reset_gpu(adev); 5500 } 5501 if (r) 5502 return r; 5503 5504 amdgpu_ras_clear_err_state(adev); 5505 amdgpu_irq_gpu_reset_resume_helper(adev); 5506 5507 /* some sw clean up VF needs to do before recover */ 5508 amdgpu_virt_post_reset(adev); 5509 5510 /* Resume IP prior to SMC */ 5511 r = amdgpu_device_ip_reinit_early_sriov(adev); 5512 if (r) 5513 return r; 5514 5515 amdgpu_virt_init_data_exchange(adev); 5516 5517 r = amdgpu_device_fw_loading(adev); 5518 if (r) 5519 return r; 5520 5521 /* now we are okay to resume SMC/CP/SDMA */ 5522 r = amdgpu_device_ip_reinit_late_sriov(adev); 5523 if (r) 5524 return r; 5525 5526 hive = amdgpu_get_xgmi_hive(adev); 5527 /* Update PSP FW topology after reset */ 5528 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5529 r = amdgpu_xgmi_update_topology(hive, adev); 5530 if (hive) 5531 amdgpu_put_xgmi_hive(hive); 5532 if (r) 5533 return r; 5534 5535 r = amdgpu_ib_ring_tests(adev); 5536 if (r) 5537 return r; 5538 5539 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) 5540 amdgpu_inc_vram_lost(adev); 5541 5542 /* need to be called during full access so we can't do it later like 5543 * bare-metal does. 
5544 */ 5545 amdgpu_amdkfd_post_reset(adev); 5546 amdgpu_virt_release_full_gpu(adev, true); 5547 5548 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5549 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) || 5550 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 5551 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 5552 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) || 5553 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5554 amdgpu_ras_resume(adev); 5555 5556 amdgpu_virt_ras_telemetry_post_reset(adev); 5557 5558 return 0; 5559 } 5560 5561 /** 5562 * amdgpu_device_has_job_running - check if there is any unfinished job 5563 * 5564 * @adev: amdgpu_device pointer 5565 * 5566 * check if there is any job running on the device when guest driver receives 5567 * FLR notification from host driver. If there are still jobs running, then 5568 * the guest driver will not respond the FLR reset. Instead, let the job hit 5569 * the timeout and guest driver then issue the reset request. 5570 */ 5571 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5572 { 5573 int i; 5574 5575 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5576 struct amdgpu_ring *ring = adev->rings[i]; 5577 5578 if (!amdgpu_ring_sched_ready(ring)) 5579 continue; 5580 5581 if (amdgpu_fence_count_emitted(ring)) 5582 return true; 5583 } 5584 return false; 5585 } 5586 5587 /** 5588 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5589 * 5590 * @adev: amdgpu_device pointer 5591 * 5592 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5593 * a hung GPU. 5594 */ 5595 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5596 { 5597 5598 if (amdgpu_gpu_recovery == 0) 5599 goto disabled; 5600 5601 /* Skip soft reset check in fatal error mode */ 5602 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5603 return true; 5604 5605 if (amdgpu_sriov_vf(adev)) 5606 return true; 5607 5608 if (amdgpu_gpu_recovery == -1) { 5609 switch (adev->asic_type) { 5610 #ifdef CONFIG_DRM_AMDGPU_SI 5611 case CHIP_VERDE: 5612 case CHIP_TAHITI: 5613 case CHIP_PITCAIRN: 5614 case CHIP_OLAND: 5615 case CHIP_HAINAN: 5616 #endif 5617 #ifdef CONFIG_DRM_AMDGPU_CIK 5618 case CHIP_KAVERI: 5619 case CHIP_KABINI: 5620 case CHIP_MULLINS: 5621 #endif 5622 case CHIP_CARRIZO: 5623 case CHIP_STONEY: 5624 case CHIP_CYAN_SKILLFISH: 5625 goto disabled; 5626 default: 5627 break; 5628 } 5629 } 5630 5631 return true; 5632 5633 disabled: 5634 dev_info(adev->dev, "GPU recovery disabled.\n"); 5635 return false; 5636 } 5637 5638 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5639 { 5640 u32 i; 5641 int ret = 0; 5642 5643 if (adev->bios) 5644 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5645 5646 dev_info(adev->dev, "GPU mode1 reset\n"); 5647 5648 /* Cache the state before bus master disable. The saved config space 5649 * values are used in other cases like restore after mode-2 reset. 
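 * Within this function the cached state is restored via
 * amdgpu_device_load_pci_state() once the reset has been issued.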
5650 */ 5651 amdgpu_device_cache_pci_state(adev->pdev); 5652 5653 /* disable BM */ 5654 pci_clear_master(adev->pdev); 5655 5656 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5657 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5658 ret = amdgpu_dpm_mode1_reset(adev); 5659 } else { 5660 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5661 ret = psp_gpu_reset(adev); 5662 } 5663 5664 if (ret) 5665 goto mode1_reset_failed; 5666 5667 amdgpu_device_load_pci_state(adev->pdev); 5668 ret = amdgpu_psp_wait_for_bootloader(adev); 5669 if (ret) 5670 goto mode1_reset_failed; 5671 5672 /* wait for asic to come out of reset */ 5673 for (i = 0; i < adev->usec_timeout; i++) { 5674 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5675 5676 if (memsize != 0xffffffff) 5677 break; 5678 udelay(1); 5679 } 5680 5681 if (i >= adev->usec_timeout) { 5682 ret = -ETIMEDOUT; 5683 goto mode1_reset_failed; 5684 } 5685 5686 if (adev->bios) 5687 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5688 5689 return 0; 5690 5691 mode1_reset_failed: 5692 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5693 return ret; 5694 } 5695 5696 int amdgpu_device_link_reset(struct amdgpu_device *adev) 5697 { 5698 int ret = 0; 5699 5700 dev_info(adev->dev, "GPU link reset\n"); 5701 5702 if (!adev->pcie_reset_ctx.occurs_dpc) 5703 ret = amdgpu_dpm_link_reset(adev); 5704 5705 if (ret) 5706 goto link_reset_failed; 5707 5708 ret = amdgpu_psp_wait_for_bootloader(adev); 5709 if (ret) 5710 goto link_reset_failed; 5711 5712 return 0; 5713 5714 link_reset_failed: 5715 dev_err(adev->dev, "GPU link reset failed\n"); 5716 return ret; 5717 } 5718 5719 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5720 struct amdgpu_reset_context *reset_context) 5721 { 5722 int i, r = 0; 5723 struct amdgpu_job *job = NULL; 5724 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; 5725 bool need_full_reset = 5726 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5727 5728 if (reset_context->reset_req_dev == adev) 5729 job = reset_context->job; 5730 5731 if (amdgpu_sriov_vf(adev)) 5732 amdgpu_virt_pre_reset(adev); 5733 5734 amdgpu_fence_driver_isr_toggle(adev, true); 5735 5736 /* block all schedulers and reset given job's ring */ 5737 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5738 struct amdgpu_ring *ring = adev->rings[i]; 5739 5740 if (!amdgpu_ring_sched_ready(ring)) 5741 continue; 5742 5743 /* Clear job fence from fence drv to avoid force_completion 5744 * leave NULL and vm flush fence in fence drv 5745 */ 5746 amdgpu_fence_driver_clear_job_fences(ring); 5747 5748 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5749 amdgpu_fence_driver_force_completion(ring); 5750 } 5751 5752 amdgpu_fence_driver_isr_toggle(adev, false); 5753 5754 if (job && job->vm) 5755 drm_sched_increase_karma(&job->base); 5756 5757 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5758 /* If reset handler not implemented, continue; otherwise return */ 5759 if (r == -EOPNOTSUPP) 5760 r = 0; 5761 else 5762 return r; 5763 5764 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5765 if (!amdgpu_sriov_vf(adev)) { 5766 5767 if (!need_full_reset) 5768 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5769 5770 if (!need_full_reset && amdgpu_gpu_recovery && 5771 amdgpu_device_ip_check_soft_reset(adev)) { 5772 amdgpu_device_ip_pre_soft_reset(adev); 5773 r = amdgpu_device_ip_soft_reset(adev); 5774 amdgpu_device_ip_post_soft_reset(adev); 5775 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5776 
dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5777 need_full_reset = true; 5778 } 5779 } 5780 5781 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { 5782 dev_info(tmp_adev->dev, "Dumping IP State\n"); 5783 /* Trigger ip dump before we reset the asic */ 5784 for (i = 0; i < tmp_adev->num_ip_blocks; i++) 5785 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) 5786 tmp_adev->ip_blocks[i].version->funcs 5787 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]); 5788 dev_info(tmp_adev->dev, "Dumping IP State Completed\n"); 5789 } 5790 5791 if (need_full_reset) 5792 r = amdgpu_device_ip_suspend(adev); 5793 if (need_full_reset) 5794 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5795 else 5796 clear_bit(AMDGPU_NEED_FULL_RESET, 5797 &reset_context->flags); 5798 } 5799 5800 return r; 5801 } 5802 5803 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) 5804 { 5805 struct list_head *device_list_handle; 5806 bool full_reset, vram_lost = false; 5807 struct amdgpu_device *tmp_adev; 5808 int r, init_level; 5809 5810 device_list_handle = reset_context->reset_device_list; 5811 5812 if (!device_list_handle) 5813 return -EINVAL; 5814 5815 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5816 5817 /** 5818 * If it's reset on init, it's default init level, otherwise keep level 5819 * as recovery level. 5820 */ 5821 if (reset_context->method == AMD_RESET_METHOD_ON_INIT) 5822 init_level = AMDGPU_INIT_LEVEL_DEFAULT; 5823 else 5824 init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY; 5825 5826 r = 0; 5827 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5828 amdgpu_set_init_level(tmp_adev, init_level); 5829 if (full_reset) { 5830 /* post card */ 5831 amdgpu_ras_clear_err_state(tmp_adev); 5832 r = amdgpu_device_asic_init(tmp_adev); 5833 if (r) { 5834 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5835 } else { 5836 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5837 5838 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5839 if (r) 5840 goto out; 5841 5842 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5843 5844 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 5845 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job); 5846 5847 if (vram_lost) { 5848 dev_info( 5849 tmp_adev->dev, 5850 "VRAM is lost due to GPU reset!\n"); 5851 amdgpu_inc_vram_lost(tmp_adev); 5852 } 5853 5854 r = amdgpu_device_fw_loading(tmp_adev); 5855 if (r) 5856 return r; 5857 5858 r = amdgpu_xcp_restore_partition_mode( 5859 tmp_adev->xcp_mgr); 5860 if (r) 5861 goto out; 5862 5863 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5864 if (r) 5865 goto out; 5866 5867 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 5868 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5869 5870 r = amdgpu_device_ip_resume_phase3(tmp_adev); 5871 if (r) 5872 goto out; 5873 5874 if (vram_lost) 5875 amdgpu_device_fill_reset_magic(tmp_adev); 5876 5877 /* 5878 * Add this ASIC as tracked as reset was already 5879 * complete successfully. 5880 */ 5881 amdgpu_register_gpu_instance(tmp_adev); 5882 5883 if (!reset_context->hive && 5884 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5885 amdgpu_xgmi_add_device(tmp_adev); 5886 5887 r = amdgpu_device_ip_late_init(tmp_adev); 5888 if (r) 5889 goto out; 5890 5891 drm_client_dev_resume(adev_to_drm(tmp_adev), false); 5892 5893 /* 5894 * The GPU enters bad state once faulty pages 5895 * by ECC has reached the threshold, and ras 5896 * recovery is scheduled next. 
So add one check 5897 * here to break recovery if it indeed exceeds 5898 * bad page threshold, and remind user to 5899 * retire this GPU or setting one bigger 5900 * bad_page_threshold value to fix this once 5901 * probing driver again. 5902 */ 5903 if (!amdgpu_ras_is_rma(tmp_adev)) { 5904 /* must succeed. */ 5905 amdgpu_ras_resume(tmp_adev); 5906 } else { 5907 r = -EINVAL; 5908 goto out; 5909 } 5910 5911 /* Update PSP FW topology after reset */ 5912 if (reset_context->hive && 5913 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5914 r = amdgpu_xgmi_update_topology( 5915 reset_context->hive, tmp_adev); 5916 } 5917 } 5918 5919 out: 5920 if (!r) { 5921 /* IP init is complete now, set level as default */ 5922 amdgpu_set_init_level(tmp_adev, 5923 AMDGPU_INIT_LEVEL_DEFAULT); 5924 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5925 r = amdgpu_ib_ring_tests(tmp_adev); 5926 if (r) { 5927 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5928 r = -EAGAIN; 5929 goto end; 5930 } 5931 } 5932 5933 if (r) 5934 tmp_adev->asic_reset_res = r; 5935 } 5936 5937 end: 5938 return r; 5939 } 5940 5941 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5942 struct amdgpu_reset_context *reset_context) 5943 { 5944 struct amdgpu_device *tmp_adev = NULL; 5945 bool need_full_reset, skip_hw_reset; 5946 int r = 0; 5947 5948 /* Try reset handler method first */ 5949 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5950 reset_list); 5951 5952 reset_context->reset_device_list = device_list_handle; 5953 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5954 /* If reset handler not implemented, continue; otherwise return */ 5955 if (r == -EOPNOTSUPP) 5956 r = 0; 5957 else 5958 return r; 5959 5960 /* Reset handler not implemented, use the default method */ 5961 need_full_reset = 5962 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5963 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5964 5965 /* 5966 * ASIC reset has to be done on all XGMI hive nodes ASAP 5967 * to allow proper links negotiation in FW (within 1 sec) 5968 */ 5969 if (!skip_hw_reset && need_full_reset) { 5970 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5971 /* For XGMI run all resets in parallel to speed up the process */ 5972 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5973 if (!queue_work(system_unbound_wq, 5974 &tmp_adev->xgmi_reset_work)) 5975 r = -EALREADY; 5976 } else 5977 r = amdgpu_asic_reset(tmp_adev); 5978 5979 if (r) { 5980 dev_err(tmp_adev->dev, 5981 "ASIC reset failed with error, %d for drm dev, %s", 5982 r, adev_to_drm(tmp_adev)->unique); 5983 goto out; 5984 } 5985 } 5986 5987 /* For XGMI wait for all resets to complete before proceed */ 5988 if (!r) { 5989 list_for_each_entry(tmp_adev, device_list_handle, 5990 reset_list) { 5991 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5992 flush_work(&tmp_adev->xgmi_reset_work); 5993 r = tmp_adev->asic_reset_res; 5994 if (r) 5995 break; 5996 } 5997 } 5998 } 5999 } 6000 6001 if (!r && amdgpu_ras_intr_triggered()) { 6002 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6003 amdgpu_ras_reset_error_count(tmp_adev, 6004 AMDGPU_RAS_BLOCK__MMHUB); 6005 } 6006 6007 amdgpu_ras_intr_cleared(); 6008 } 6009 6010 r = amdgpu_device_reinit_after_reset(reset_context); 6011 if (r == -EAGAIN) 6012 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 6013 else 6014 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 6015 6016 out: 6017 return r; 6018 } 6019 6020 static void 
amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 6021 { 6022 6023 switch (amdgpu_asic_reset_method(adev)) { 6024 case AMD_RESET_METHOD_MODE1: 6025 case AMD_RESET_METHOD_LINK: 6026 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 6027 break; 6028 case AMD_RESET_METHOD_MODE2: 6029 adev->mp1_state = PP_MP1_STATE_RESET; 6030 break; 6031 default: 6032 adev->mp1_state = PP_MP1_STATE_NONE; 6033 break; 6034 } 6035 } 6036 6037 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 6038 { 6039 amdgpu_vf_error_trans_all(adev); 6040 adev->mp1_state = PP_MP1_STATE_NONE; 6041 } 6042 6043 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 6044 { 6045 struct pci_dev *p = NULL; 6046 6047 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 6048 adev->pdev->bus->number, 1); 6049 if (p) { 6050 pm_runtime_enable(&(p->dev)); 6051 pm_runtime_resume(&(p->dev)); 6052 } 6053 6054 pci_dev_put(p); 6055 } 6056 6057 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 6058 { 6059 enum amd_reset_method reset_method; 6060 struct pci_dev *p = NULL; 6061 u64 expires; 6062 6063 /* 6064 * For now, only BACO and mode1 reset are confirmed 6065 * to suffer the audio issue without proper suspended. 6066 */ 6067 reset_method = amdgpu_asic_reset_method(adev); 6068 if ((reset_method != AMD_RESET_METHOD_BACO) && 6069 (reset_method != AMD_RESET_METHOD_MODE1)) 6070 return -EINVAL; 6071 6072 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 6073 adev->pdev->bus->number, 1); 6074 if (!p) 6075 return -ENODEV; 6076 6077 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 6078 if (!expires) 6079 /* 6080 * If we cannot get the audio device autosuspend delay, 6081 * a fixed 4S interval will be used. Considering 3S is 6082 * the audio controller default autosuspend delay setting. 6083 * 4S used here is guaranteed to cover that. 6084 */ 6085 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 6086 6087 while (!pm_runtime_status_suspended(&(p->dev))) { 6088 if (!pm_runtime_suspend(&(p->dev))) 6089 break; 6090 6091 if (expires < ktime_get_mono_fast_ns()) { 6092 dev_warn(adev->dev, "failed to suspend display audio\n"); 6093 pci_dev_put(p); 6094 /* TODO: abort the succeeding gpu reset? */ 6095 return -ETIMEDOUT; 6096 } 6097 } 6098 6099 pm_runtime_disable(&(p->dev)); 6100 6101 pci_dev_put(p); 6102 return 0; 6103 } 6104 6105 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 6106 { 6107 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 6108 6109 #if defined(CONFIG_DEBUG_FS) 6110 if (!amdgpu_sriov_vf(adev)) 6111 cancel_work(&adev->reset_work); 6112 #endif 6113 6114 if (adev->kfd.dev) 6115 cancel_work(&adev->kfd.reset_work); 6116 6117 if (amdgpu_sriov_vf(adev)) 6118 cancel_work(&adev->virt.flr_work); 6119 6120 if (con && adev->ras_enabled) 6121 cancel_work(&con->recovery_work); 6122 6123 } 6124 6125 static int amdgpu_device_health_check(struct list_head *device_list_handle) 6126 { 6127 struct amdgpu_device *tmp_adev; 6128 int ret = 0; 6129 6130 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6131 ret |= amdgpu_device_bus_status_check(tmp_adev); 6132 } 6133 6134 return ret; 6135 } 6136 6137 static int amdgpu_device_recovery_prepare(struct amdgpu_device *adev, 6138 struct list_head *device_list, 6139 struct amdgpu_hive_info *hive) 6140 { 6141 struct amdgpu_device *tmp_adev = NULL; 6142 int r; 6143 6144 /* 6145 * Build list of devices to reset. 
6146 * In case we are in XGMI hive mode, resort the device list 6147 * to put adev in the 1st position. 6148 */ 6149 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) { 6150 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 6151 list_add_tail(&tmp_adev->reset_list, device_list); 6152 if (adev->shutdown) 6153 tmp_adev->shutdown = true; 6154 if (adev->pcie_reset_ctx.occurs_dpc) 6155 tmp_adev->pcie_reset_ctx.in_link_reset = true; 6156 } 6157 if (!list_is_first(&adev->reset_list, device_list)) 6158 list_rotate_to_front(&adev->reset_list, device_list); 6159 } else { 6160 list_add_tail(&adev->reset_list, device_list); 6161 } 6162 6163 if (!amdgpu_sriov_vf(adev) && (!adev->pcie_reset_ctx.occurs_dpc)) { 6164 r = amdgpu_device_health_check(device_list); 6165 if (r) 6166 return r; 6167 } 6168 6169 return 0; 6170 } 6171 6172 static void amdgpu_device_recovery_get_reset_lock(struct amdgpu_device *adev, 6173 struct list_head *device_list) 6174 { 6175 struct amdgpu_device *tmp_adev = NULL; 6176 6177 if (list_empty(device_list)) 6178 return; 6179 tmp_adev = 6180 list_first_entry(device_list, struct amdgpu_device, reset_list); 6181 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 6182 } 6183 6184 static void amdgpu_device_recovery_put_reset_lock(struct amdgpu_device *adev, 6185 struct list_head *device_list) 6186 { 6187 struct amdgpu_device *tmp_adev = NULL; 6188 6189 if (list_empty(device_list)) 6190 return; 6191 tmp_adev = 6192 list_first_entry(device_list, struct amdgpu_device, reset_list); 6193 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 6194 } 6195 6196 static void amdgpu_device_halt_activities(struct amdgpu_device *adev, 6197 struct amdgpu_job *job, 6198 struct amdgpu_reset_context *reset_context, 6199 struct list_head *device_list, 6200 struct amdgpu_hive_info *hive, 6201 bool need_emergency_restart) 6202 { 6203 struct amdgpu_device *tmp_adev = NULL; 6204 int i; 6205 6206 /* block all schedulers and reset given job's ring */ 6207 list_for_each_entry(tmp_adev, device_list, reset_list) { 6208 amdgpu_device_set_mp1_state(tmp_adev); 6209 6210 /* 6211 * Try to put the audio codec into suspend state 6212 * before gpu reset started. 6213 * 6214 * Due to the power domain of the graphics device 6215 * is shared with AZ power domain. Without this, 6216 * we may change the audio hardware from behind 6217 * the audio driver's back. That will trigger 6218 * some audio codec errors. 6219 */ 6220 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 6221 tmp_adev->pcie_reset_ctx.audio_suspended = true; 6222 6223 amdgpu_ras_set_error_query_ready(tmp_adev, false); 6224 6225 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 6226 6227 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context); 6228 6229 /* 6230 * Mark these ASICs to be reset as untracked first 6231 * And add them back after reset completed 6232 */ 6233 amdgpu_unregister_gpu_instance(tmp_adev); 6234 6235 drm_client_dev_suspend(adev_to_drm(tmp_adev), false); 6236 6237 /* disable ras on ALL IPs */ 6238 if (!need_emergency_restart && 6239 (!adev->pcie_reset_ctx.occurs_dpc) && 6240 amdgpu_device_ip_need_full_reset(tmp_adev)) 6241 amdgpu_ras_suspend(tmp_adev); 6242 6243 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6244 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6245 6246 if (!amdgpu_ring_sched_ready(ring)) 6247 continue; 6248 6249 drm_sched_stop(&ring->sched, job ? 
&job->base : NULL); 6250 6251 if (need_emergency_restart) 6252 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 6253 } 6254 atomic_inc(&tmp_adev->gpu_reset_counter); 6255 } 6256 } 6257 6258 static int amdgpu_device_asic_reset(struct amdgpu_device *adev, 6259 struct list_head *device_list, 6260 struct amdgpu_reset_context *reset_context) 6261 { 6262 struct amdgpu_device *tmp_adev = NULL; 6263 int retry_limit = AMDGPU_MAX_RETRY_LIMIT; 6264 int r = 0; 6265 6266 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 6267 list_for_each_entry(tmp_adev, device_list, reset_list) { 6268 if (adev->pcie_reset_ctx.occurs_dpc) 6269 tmp_adev->no_hw_access = true; 6270 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 6271 if (adev->pcie_reset_ctx.occurs_dpc) 6272 tmp_adev->no_hw_access = false; 6273 /*TODO Should we stop ?*/ 6274 if (r) { 6275 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 6276 r, adev_to_drm(tmp_adev)->unique); 6277 tmp_adev->asic_reset_res = r; 6278 } 6279 } 6280 6281 /* Actual ASIC resets if needed.*/ 6282 /* Host driver will handle XGMI hive reset for SRIOV */ 6283 if (amdgpu_sriov_vf(adev)) { 6284 6285 /* Bail out of reset early */ 6286 if (amdgpu_ras_is_rma(adev)) 6287 return -ENODEV; 6288 6289 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { 6290 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); 6291 amdgpu_ras_set_fed(adev, true); 6292 set_bit(AMDGPU_HOST_FLR, &reset_context->flags); 6293 } 6294 6295 r = amdgpu_device_reset_sriov(adev, reset_context); 6296 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) { 6297 amdgpu_virt_release_full_gpu(adev, true); 6298 goto retry; 6299 } 6300 if (r) 6301 adev->asic_reset_res = r; 6302 } else { 6303 r = amdgpu_do_asic_reset(device_list, reset_context); 6304 if (r && r == -EAGAIN) 6305 goto retry; 6306 } 6307 6308 list_for_each_entry(tmp_adev, device_list, reset_list) { 6309 /* 6310 * Drop any pending non scheduler resets queued before reset is done. 6311 * Any reset scheduled after this point would be valid. Scheduler resets 6312 * were already dropped during drm_sched_stop and no new ones can come 6313 * in before drm_sched_start. 6314 */ 6315 amdgpu_device_stop_pending_resets(tmp_adev); 6316 } 6317 6318 return r; 6319 } 6320 6321 static int amdgpu_device_sched_resume(struct list_head *device_list, 6322 struct amdgpu_reset_context *reset_context, 6323 bool job_signaled) 6324 { 6325 struct amdgpu_device *tmp_adev = NULL; 6326 int i, r = 0; 6327 6328 /* Post ASIC reset for all devs .*/ 6329 list_for_each_entry(tmp_adev, device_list, reset_list) { 6330 6331 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6332 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6333 6334 if (!amdgpu_ring_sched_ready(ring)) 6335 continue; 6336 6337 drm_sched_start(&ring->sched, 0); 6338 } 6339 6340 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 6341 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 6342 6343 if (tmp_adev->asic_reset_res) 6344 r = tmp_adev->asic_reset_res; 6345 6346 tmp_adev->asic_reset_res = 0; 6347 6348 if (r) { 6349 /* bad news, how to tell it to userspace ? 
6350 * for ras error, we should report GPU bad status instead of 6351 * reset failure 6352 */ 6353 if (reset_context->src != AMDGPU_RESET_SRC_RAS || 6354 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) 6355 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", 6356 atomic_read(&tmp_adev->gpu_reset_counter)); 6357 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 6358 } else { 6359 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 6360 if (amdgpu_acpi_smart_shift_update(tmp_adev, 6361 AMDGPU_SS_DEV_D0)) 6362 dev_warn(tmp_adev->dev, 6363 "smart shift update failed\n"); 6364 } 6365 } 6366 6367 return r; 6368 } 6369 6370 static void amdgpu_device_gpu_resume(struct amdgpu_device *adev, 6371 struct list_head *device_list, 6372 bool need_emergency_restart) 6373 { 6374 struct amdgpu_device *tmp_adev = NULL; 6375 6376 list_for_each_entry(tmp_adev, device_list, reset_list) { 6377 /* unlock kfd: SRIOV would do it separately */ 6378 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 6379 amdgpu_amdkfd_post_reset(tmp_adev); 6380 6381 /* kfd_post_reset will do nothing if kfd device is not initialized, 6382 * need to bring up kfd here if it's not be initialized before 6383 */ 6384 if (!adev->kfd.init_complete) 6385 amdgpu_amdkfd_device_init(adev); 6386 6387 if (tmp_adev->pcie_reset_ctx.audio_suspended) 6388 amdgpu_device_resume_display_audio(tmp_adev); 6389 6390 amdgpu_device_unset_mp1_state(tmp_adev); 6391 6392 amdgpu_ras_set_error_query_ready(tmp_adev, true); 6393 6394 } 6395 } 6396 6397 6398 /** 6399 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 6400 * 6401 * @adev: amdgpu_device pointer 6402 * @job: which job trigger hang 6403 * @reset_context: amdgpu reset context pointer 6404 * 6405 * Attempt to reset the GPU if it has hung (all asics). 6406 * Attempt to do soft-reset or full-reset and reinitialize Asic 6407 * Returns 0 for success or an error on failure. 6408 */ 6409 6410 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 6411 struct amdgpu_job *job, 6412 struct amdgpu_reset_context *reset_context) 6413 { 6414 struct list_head device_list; 6415 bool job_signaled = false; 6416 struct amdgpu_hive_info *hive = NULL; 6417 int r = 0; 6418 bool need_emergency_restart = false; 6419 6420 /* 6421 * If it reaches here because of hang/timeout and a RAS error is 6422 * detected at the same time, let RAS recovery take care of it. 6423 */ 6424 if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) && 6425 !amdgpu_sriov_vf(adev) && 6426 reset_context->src != AMDGPU_RESET_SRC_RAS) { 6427 dev_dbg(adev->dev, 6428 "Gpu recovery from source: %d yielding to RAS error recovery handling", 6429 reset_context->src); 6430 return 0; 6431 } 6432 6433 /* 6434 * Special case: RAS triggered and full reset isn't supported 6435 */ 6436 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 6437 6438 /* 6439 * Flush RAM to disk so that after reboot 6440 * the user can read log and see why the system rebooted. 6441 */ 6442 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 6443 amdgpu_ras_get_context(adev)->reboot) { 6444 dev_warn(adev->dev, "Emergency reboot."); 6445 6446 ksys_sync_helper(); 6447 emergency_restart(); 6448 } 6449 6450 dev_info(adev->dev, "GPU %s begin!\n", 6451 need_emergency_restart ? 
"jobs stop":"reset"); 6452 6453 if (!amdgpu_sriov_vf(adev)) 6454 hive = amdgpu_get_xgmi_hive(adev); 6455 if (hive) 6456 mutex_lock(&hive->hive_lock); 6457 6458 reset_context->job = job; 6459 reset_context->hive = hive; 6460 INIT_LIST_HEAD(&device_list); 6461 6462 if (amdgpu_device_recovery_prepare(adev, &device_list, hive)) 6463 goto end_reset; 6464 6465 /* We need to lock reset domain only once both for XGMI and single device */ 6466 amdgpu_device_recovery_get_reset_lock(adev, &device_list); 6467 6468 amdgpu_device_halt_activities(adev, job, reset_context, &device_list, 6469 hive, need_emergency_restart); 6470 if (need_emergency_restart) 6471 goto skip_sched_resume; 6472 /* 6473 * Must check guilty signal here since after this point all old 6474 * HW fences are force signaled. 6475 * 6476 * job->base holds a reference to parent fence 6477 */ 6478 if (job && dma_fence_is_signaled(&job->hw_fence.base)) { 6479 job_signaled = true; 6480 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 6481 goto skip_hw_reset; 6482 } 6483 6484 r = amdgpu_device_asic_reset(adev, &device_list, reset_context); 6485 if (r) 6486 goto reset_unlock; 6487 skip_hw_reset: 6488 r = amdgpu_device_sched_resume(&device_list, reset_context, job_signaled); 6489 if (r) 6490 goto reset_unlock; 6491 skip_sched_resume: 6492 amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart); 6493 reset_unlock: 6494 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 6495 end_reset: 6496 if (hive) { 6497 mutex_unlock(&hive->hive_lock); 6498 amdgpu_put_xgmi_hive(hive); 6499 } 6500 6501 if (r) 6502 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 6503 6504 atomic_set(&adev->reset_domain->reset_res, r); 6505 6506 if (!r) { 6507 struct amdgpu_task_info *ti = NULL; 6508 6509 if (job) 6510 ti = amdgpu_vm_get_task_info_pasid(adev, job->pasid); 6511 6512 drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, 6513 ti ? &ti->task : NULL); 6514 6515 amdgpu_vm_put_task_info(ti); 6516 } 6517 6518 return r; 6519 } 6520 6521 /** 6522 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner 6523 * 6524 * @adev: amdgpu_device pointer 6525 * @speed: pointer to the speed of the link 6526 * @width: pointer to the width of the link 6527 * 6528 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6529 * first physical partner to an AMD dGPU. 6530 * This will exclude any virtual switches and links. 
6531 */
6532 static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
6533 enum pci_bus_speed *speed,
6534 enum pcie_link_width *width)
6535 {
6536 struct pci_dev *parent = adev->pdev;
6537
6538 if (!speed || !width)
6539 return;
6540
6541 *speed = PCI_SPEED_UNKNOWN;
6542 *width = PCIE_LNK_WIDTH_UNKNOWN;
6543
6544 if (amdgpu_device_pcie_dynamic_switching_supported(adev)) {
6545 while ((parent = pci_upstream_bridge(parent))) {
6546 /* skip upstream/downstream switches internal to dGPU */
6547 if (parent->vendor == PCI_VENDOR_ID_ATI)
6548 continue;
6549 *speed = pcie_get_speed_cap(parent);
6550 *width = pcie_get_width_cap(parent);
6551 break;
6552 }
6553 } else {
6554 /* use the current speeds rather than max if switching is not supported */
6555 pcie_bandwidth_available(adev->pdev, NULL, speed, width);
6556 }
6557 }
6558
6559 /**
6560 * amdgpu_device_gpu_bandwidth - find the bandwidth of the GPU
6561 *
6562 * @adev: amdgpu_device pointer
6563 * @speed: pointer to the speed of the link
6564 * @width: pointer to the width of the link
6565 *
6566 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
6567 * AMD dGPU, which may sit behind a virtual upstream bridge.
6568 */
6569 static void amdgpu_device_gpu_bandwidth(struct amdgpu_device *adev,
6570 enum pci_bus_speed *speed,
6571 enum pcie_link_width *width)
6572 {
6573 struct pci_dev *parent = adev->pdev;
6574
6575 if (!speed || !width)
6576 return;
6577
6578 parent = pci_upstream_bridge(parent);
6579 if (parent && parent->vendor == PCI_VENDOR_ID_ATI) {
6580 /* use the upstream/downstream switches internal to dGPU */
6581 *speed = pcie_get_speed_cap(parent);
6582 *width = pcie_get_width_cap(parent);
6583 while ((parent = pci_upstream_bridge(parent))) {
6584 if (parent->vendor == PCI_VENDOR_ID_ATI) {
6585 /* use the upstream/downstream switches internal to dGPU */
6586 *speed = pcie_get_speed_cap(parent);
6587 *width = pcie_get_width_cap(parent);
6588 }
6589 }
6590 } else {
6591 /* use the device itself */
6592 *speed = pcie_get_speed_cap(adev->pdev);
6593 *width = pcie_get_width_cap(adev->pdev);
6594 }
6595 }
6596
6597 /**
6598 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
6599 *
6600 * @adev: amdgpu_device pointer
6601 *
6602 * Fetches and stores in the driver the PCIe capabilities (gen speed
6603 * and lanes) of the slot the device is in. Handles APUs and
6604 * virtualized environments where PCIe config space may not be available.
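 *
 * A hedged illustration of the resulting encoding for a GEN3-capable ASIC in a
 * GEN3-capable slot (composed from the CAIL_* flags ORed together below):
 *
 *	adev->pm.pcie_gen_mask = CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
 *				 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
 *				 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
 *				 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
 *				 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
 *				 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3;
 *
 * Non-zero amdgpu_pcie_gen_cap / amdgpu_pcie_lane_cap overrides (see the first
 * two assignments below) take precedence over the probed values.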
6605 */ 6606 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 6607 { 6608 enum pci_bus_speed speed_cap, platform_speed_cap; 6609 enum pcie_link_width platform_link_width, link_width; 6610 6611 if (amdgpu_pcie_gen_cap) 6612 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 6613 6614 if (amdgpu_pcie_lane_cap) 6615 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 6616 6617 /* covers APUs as well */ 6618 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 6619 if (adev->pm.pcie_gen_mask == 0) 6620 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 6621 if (adev->pm.pcie_mlw_mask == 0) 6622 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 6623 return; 6624 } 6625 6626 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 6627 return; 6628 6629 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap, 6630 &platform_link_width); 6631 amdgpu_device_gpu_bandwidth(adev, &speed_cap, &link_width); 6632 6633 if (adev->pm.pcie_gen_mask == 0) { 6634 /* asic caps */ 6635 if (speed_cap == PCI_SPEED_UNKNOWN) { 6636 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6637 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6638 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6639 } else { 6640 if (speed_cap == PCIE_SPEED_32_0GT) 6641 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6642 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6643 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6644 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6645 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 6646 else if (speed_cap == PCIE_SPEED_16_0GT) 6647 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6648 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6649 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6650 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 6651 else if (speed_cap == PCIE_SPEED_8_0GT) 6652 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6653 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6654 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6655 else if (speed_cap == PCIE_SPEED_5_0GT) 6656 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6657 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 6658 else 6659 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 6660 } 6661 /* platform caps */ 6662 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 6663 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6664 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6665 } else { 6666 if (platform_speed_cap == PCIE_SPEED_32_0GT) 6667 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6668 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6669 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6670 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6671 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 6672 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 6673 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6674 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6675 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6676 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 6677 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 6678 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6679 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6680 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 6681 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 6682 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6683 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6684 else 6685 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 6686 6687 } 6688 } 6689 if (adev->pm.pcie_mlw_mask == 0) { 6690 /* asic caps */ 6691 if (link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6692 
adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_ASIC_PCIE_MLW_MASK; 6693 } else { 6694 switch (link_width) { 6695 case PCIE_LNK_X32: 6696 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X32 | 6697 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6698 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6699 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6700 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6701 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6702 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6703 break; 6704 case PCIE_LNK_X16: 6705 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6706 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6707 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6708 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6709 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6710 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6711 break; 6712 case PCIE_LNK_X12: 6713 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6714 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6715 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6716 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6717 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6718 break; 6719 case PCIE_LNK_X8: 6720 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6721 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6722 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6723 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6724 break; 6725 case PCIE_LNK_X4: 6726 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6727 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6728 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6729 break; 6730 case PCIE_LNK_X2: 6731 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6732 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6733 break; 6734 case PCIE_LNK_X1: 6735 adev->pm.pcie_mlw_mask |= CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1; 6736 break; 6737 default: 6738 break; 6739 } 6740 } 6741 /* platform caps */ 6742 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6743 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 6744 } else { 6745 switch (platform_link_width) { 6746 case PCIE_LNK_X32: 6747 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 6748 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6749 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6750 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6751 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6752 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6753 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6754 break; 6755 case PCIE_LNK_X16: 6756 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6757 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6758 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6759 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6760 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6761 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6762 break; 6763 case PCIE_LNK_X12: 6764 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6765 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6766 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6767 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6768 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6769 break; 6770 case PCIE_LNK_X8: 6771 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6772 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6773 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6774 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6775 break; 6776 case PCIE_LNK_X4: 6777 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6778 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6779 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6780 break; 6781 case PCIE_LNK_X2: 6782 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6783 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6784 break; 6785 case PCIE_LNK_X1: 6786 adev->pm.pcie_mlw_mask |= CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 6787 break; 6788 
default: 6789 break; 6790 } 6791 } 6792 } 6793 } 6794 6795 /** 6796 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 6797 * 6798 * @adev: amdgpu_device pointer 6799 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 6800 * 6801 * Return true if @peer_adev can access (DMA) @adev through the PCIe 6802 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 6803 * @peer_adev. 6804 */ 6805 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 6806 struct amdgpu_device *peer_adev) 6807 { 6808 #ifdef CONFIG_HSA_AMD_P2P 6809 bool p2p_access = 6810 !adev->gmc.xgmi.connected_to_cpu && 6811 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 6812 if (!p2p_access) 6813 dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n", 6814 pci_name(peer_adev->pdev)); 6815 6816 bool is_large_bar = adev->gmc.visible_vram_size && 6817 adev->gmc.real_vram_size == adev->gmc.visible_vram_size; 6818 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev); 6819 6820 if (!p2p_addressable) { 6821 uint64_t address_mask = peer_adev->dev->dma_mask ? 6822 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 6823 resource_size_t aper_limit = 6824 adev->gmc.aper_base + adev->gmc.aper_size - 1; 6825 6826 p2p_addressable = !(adev->gmc.aper_base & address_mask || 6827 aper_limit & address_mask); 6828 } 6829 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable; 6830 #else 6831 return false; 6832 #endif 6833 } 6834 6835 int amdgpu_device_baco_enter(struct amdgpu_device *adev) 6836 { 6837 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6838 6839 if (!amdgpu_device_supports_baco(adev)) 6840 return -ENOTSUPP; 6841 6842 if (ras && adev->ras_enabled && 6843 adev->nbio.funcs->enable_doorbell_interrupt) 6844 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 6845 6846 return amdgpu_dpm_baco_enter(adev); 6847 } 6848 6849 int amdgpu_device_baco_exit(struct amdgpu_device *adev) 6850 { 6851 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6852 int ret = 0; 6853 6854 if (!amdgpu_device_supports_baco(adev)) 6855 return -ENOTSUPP; 6856 6857 ret = amdgpu_dpm_baco_exit(adev); 6858 if (ret) 6859 return ret; 6860 6861 if (ras && adev->ras_enabled && 6862 adev->nbio.funcs->enable_doorbell_interrupt) 6863 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 6864 6865 if (amdgpu_passthrough(adev) && adev->nbio.funcs && 6866 adev->nbio.funcs->clear_doorbell_interrupt) 6867 adev->nbio.funcs->clear_doorbell_interrupt(adev); 6868 6869 return 0; 6870 } 6871 6872 /** 6873 * amdgpu_pci_error_detected - Called when a PCI error is detected. 6874 * @pdev: PCI device struct 6875 * @state: PCI channel state 6876 * 6877 * Description: Called when a PCI error is detected. 6878 * 6879 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
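 *
 * A hedged sketch of how the four callbacks in this file are typically wired
 * into the PCI core (the actual handler table lives with the driver
 * registration, e.g. in amdgpu_drv.c):
 *
 *	static const struct pci_error_handlers amdgpu_pci_err_handler = {
 *		.error_detected	= amdgpu_pci_error_detected,
 *		.mmio_enabled	= amdgpu_pci_mmio_enabled,
 *		.slot_reset	= amdgpu_pci_slot_reset,
 *		.resume		= amdgpu_pci_resume,
 *	};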
6880 */ 6881 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 6882 { 6883 struct drm_device *dev = pci_get_drvdata(pdev); 6884 struct amdgpu_device *adev = drm_to_adev(dev); 6885 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 6886 struct amdgpu_reset_context reset_context; 6887 struct list_head device_list; 6888 6889 dev_info(adev->dev, "PCI error: detected callback!!\n"); 6890 6891 if (!amdgpu_dpm_is_link_reset_supported(adev)) { 6892 dev_warn(adev->dev, "No support for XGMI hive yet...\n"); 6893 return PCI_ERS_RESULT_DISCONNECT; 6894 } 6895 6896 adev->pci_channel_state = state; 6897 6898 switch (state) { 6899 case pci_channel_io_normal: 6900 dev_info(adev->dev, "pci_channel_io_normal: state(%d)!!\n", state); 6901 return PCI_ERS_RESULT_CAN_RECOVER; 6902 case pci_channel_io_frozen: 6903 /* Fatal error, prepare for slot reset */ 6904 dev_info(adev->dev, "pci_channel_io_frozen: state(%d)!!\n", state); 6905 6906 if (hive) 6907 mutex_lock(&hive->hive_lock); 6908 adev->pcie_reset_ctx.occurs_dpc = true; 6909 memset(&reset_context, 0, sizeof(reset_context)); 6910 INIT_LIST_HEAD(&device_list); 6911 6912 amdgpu_device_recovery_prepare(adev, &device_list, hive); 6913 amdgpu_device_recovery_get_reset_lock(adev, &device_list); 6914 amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list, 6915 hive, false); 6916 if (hive) { 6917 mutex_unlock(&hive->hive_lock); 6918 amdgpu_put_xgmi_hive(hive); 6919 } 6920 return PCI_ERS_RESULT_NEED_RESET; 6921 case pci_channel_io_perm_failure: 6922 /* Permanent error, prepare for device removal */ 6923 dev_info(adev->dev, "pci_channel_io_perm_failure: state(%d)!!\n", state); 6924 return PCI_ERS_RESULT_DISCONNECT; 6925 } 6926 6927 return PCI_ERS_RESULT_NEED_RESET; 6928 } 6929 6930 /** 6931 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 6932 * @pdev: pointer to PCI device 6933 */ 6934 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 6935 { 6936 struct drm_device *dev = pci_get_drvdata(pdev); 6937 struct amdgpu_device *adev = drm_to_adev(dev); 6938 6939 dev_info(adev->dev, "PCI error: mmio enabled callback!!\n"); 6940 6941 /* TODO - dump whatever for debugging purposes */ 6942 6943 /* This called only if amdgpu_pci_error_detected returns 6944 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 6945 * works, no need to reset slot. 6946 */ 6947 6948 return PCI_ERS_RESULT_RECOVERED; 6949 } 6950 6951 /** 6952 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 6953 * @pdev: PCI device struct 6954 * 6955 * Description: This routine is called by the pci error recovery 6956 * code after the PCI slot has been reset, just before we 6957 * should resume normal operations. 
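 *
 * A hedged outline of the recovery sequence performed below:
 *
 *	amdgpu_device_load_pci_state(pdev);		/~ restore config space ~/
 *	/~ poll amdgpu_asic_get_config_memsize() until it is no longer 0xffffffff ~/
 *	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
 *	set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);
 *	amdgpu_device_asic_reset(adev, &device_list, &reset_context);
 *
 * Return: PCI_ERS_RESULT_RECOVERED on successful recovery (or when the reset
 * is skipped because RAS recovery is already in progress), otherwise
 * PCI_ERS_RESULT_DISCONNECT.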
6958 */ 6959 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 6960 { 6961 struct drm_device *dev = pci_get_drvdata(pdev); 6962 struct amdgpu_device *adev = drm_to_adev(dev); 6963 struct amdgpu_reset_context reset_context; 6964 struct amdgpu_device *tmp_adev; 6965 struct amdgpu_hive_info *hive; 6966 struct list_head device_list; 6967 int r = 0, i; 6968 u32 memsize; 6969 6970 /* PCI error slot reset should be skipped During RAS recovery */ 6971 if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 6972 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && 6973 amdgpu_ras_in_recovery(adev)) 6974 return PCI_ERS_RESULT_RECOVERED; 6975 6976 dev_info(adev->dev, "PCI error: slot reset callback!!\n"); 6977 6978 memset(&reset_context, 0, sizeof(reset_context)); 6979 6980 /* wait for asic to come out of reset */ 6981 msleep(700); 6982 6983 /* Restore PCI confspace */ 6984 amdgpu_device_load_pci_state(pdev); 6985 6986 /* confirm ASIC came out of reset */ 6987 for (i = 0; i < adev->usec_timeout; i++) { 6988 memsize = amdgpu_asic_get_config_memsize(adev); 6989 6990 if (memsize != 0xffffffff) 6991 break; 6992 udelay(1); 6993 } 6994 if (memsize == 0xffffffff) { 6995 r = -ETIME; 6996 goto out; 6997 } 6998 6999 reset_context.method = AMD_RESET_METHOD_NONE; 7000 reset_context.reset_req_dev = adev; 7001 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 7002 set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags); 7003 INIT_LIST_HEAD(&device_list); 7004 7005 hive = amdgpu_get_xgmi_hive(adev); 7006 if (hive) { 7007 mutex_lock(&hive->hive_lock); 7008 reset_context.hive = hive; 7009 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 7010 tmp_adev->pcie_reset_ctx.in_link_reset = true; 7011 list_add_tail(&tmp_adev->reset_list, &device_list); 7012 } 7013 } else { 7014 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 7015 list_add_tail(&adev->reset_list, &device_list); 7016 } 7017 7018 r = amdgpu_device_asic_reset(adev, &device_list, &reset_context); 7019 out: 7020 if (!r) { 7021 if (amdgpu_device_cache_pci_state(adev->pdev)) 7022 pci_restore_state(adev->pdev); 7023 dev_info(adev->dev, "PCIe error recovery succeeded\n"); 7024 } else { 7025 dev_err(adev->dev, "PCIe error recovery failed, err:%d\n", r); 7026 if (hive) { 7027 list_for_each_entry(tmp_adev, &device_list, reset_list) 7028 amdgpu_device_unset_mp1_state(tmp_adev); 7029 } 7030 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 7031 } 7032 7033 if (hive) { 7034 mutex_unlock(&hive->hive_lock); 7035 amdgpu_put_xgmi_hive(hive); 7036 } 7037 7038 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 7039 } 7040 7041 /** 7042 * amdgpu_pci_resume() - resume normal ops after PCI reset 7043 * @pdev: pointer to PCI device 7044 * 7045 * Called when the error recovery driver tells us that its 7046 * OK to resume normal operation. 
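 *
 * A hedged outline of what the callback does (only when the recorded channel
 * state was pci_channel_io_frozen; otherwise it returns immediately):
 *
 *	amdgpu_device_sched_resume(&device_list, NULL, NULL);
 *	amdgpu_device_gpu_resume(adev, &device_list, false);
 *	amdgpu_device_recovery_put_reset_lock(adev, &device_list);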
7047 */ 7048 void amdgpu_pci_resume(struct pci_dev *pdev) 7049 { 7050 struct drm_device *dev = pci_get_drvdata(pdev); 7051 struct amdgpu_device *adev = drm_to_adev(dev); 7052 struct list_head device_list; 7053 struct amdgpu_hive_info *hive = NULL; 7054 struct amdgpu_device *tmp_adev = NULL; 7055 7056 dev_info(adev->dev, "PCI error: resume callback!!\n"); 7057 7058 /* Only continue execution for the case of pci_channel_io_frozen */ 7059 if (adev->pci_channel_state != pci_channel_io_frozen) 7060 return; 7061 7062 INIT_LIST_HEAD(&device_list); 7063 7064 hive = amdgpu_get_xgmi_hive(adev); 7065 if (hive) { 7066 mutex_lock(&hive->hive_lock); 7067 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 7068 tmp_adev->pcie_reset_ctx.in_link_reset = false; 7069 list_add_tail(&tmp_adev->reset_list, &device_list); 7070 } 7071 } else 7072 list_add_tail(&adev->reset_list, &device_list); 7073 7074 amdgpu_device_sched_resume(&device_list, NULL, NULL); 7075 amdgpu_device_gpu_resume(adev, &device_list, false); 7076 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 7077 adev->pcie_reset_ctx.occurs_dpc = false; 7078 7079 if (hive) { 7080 mutex_unlock(&hive->hive_lock); 7081 amdgpu_put_xgmi_hive(hive); 7082 } 7083 } 7084 7085 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 7086 { 7087 struct drm_device *dev = pci_get_drvdata(pdev); 7088 struct amdgpu_device *adev = drm_to_adev(dev); 7089 int r; 7090 7091 if (amdgpu_sriov_vf(adev)) 7092 return false; 7093 7094 r = pci_save_state(pdev); 7095 if (!r) { 7096 kfree(adev->pci_state); 7097 7098 adev->pci_state = pci_store_saved_state(pdev); 7099 7100 if (!adev->pci_state) { 7101 dev_err(adev->dev, "Failed to store PCI saved state"); 7102 return false; 7103 } 7104 } else { 7105 dev_warn(adev->dev, "Failed to save PCI state, err:%d\n", r); 7106 return false; 7107 } 7108 7109 return true; 7110 } 7111 7112 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 7113 { 7114 struct drm_device *dev = pci_get_drvdata(pdev); 7115 struct amdgpu_device *adev = drm_to_adev(dev); 7116 int r; 7117 7118 if (!adev->pci_state) 7119 return false; 7120 7121 r = pci_load_saved_state(pdev, adev->pci_state); 7122 7123 if (!r) { 7124 pci_restore_state(pdev); 7125 } else { 7126 dev_warn(adev->dev, "Failed to load PCI state, err:%d\n", r); 7127 return false; 7128 } 7129 7130 return true; 7131 } 7132 7133 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 7134 struct amdgpu_ring *ring) 7135 { 7136 #ifdef CONFIG_X86_64 7137 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 7138 return; 7139 #endif 7140 if (adev->gmc.xgmi.connected_to_cpu) 7141 return; 7142 7143 if (ring && ring->funcs->emit_hdp_flush) 7144 amdgpu_ring_emit_hdp_flush(ring); 7145 else 7146 amdgpu_asic_flush_hdp(adev, ring); 7147 } 7148 7149 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 7150 struct amdgpu_ring *ring) 7151 { 7152 #ifdef CONFIG_X86_64 7153 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 7154 return; 7155 #endif 7156 if (adev->gmc.xgmi.connected_to_cpu) 7157 return; 7158 7159 amdgpu_asic_invalidate_hdp(adev, ring); 7160 } 7161 7162 int amdgpu_in_reset(struct amdgpu_device *adev) 7163 { 7164 return atomic_read(&adev->reset_domain->in_gpu_reset); 7165 } 7166 7167 /** 7168 * amdgpu_device_halt() - bring hardware to some kind of halt state 7169 * 7170 * @adev: amdgpu_device pointer 7171 * 7172 * Bring hardware to some kind of halt state so that no one can touch it 7173 * any more. It will help to maintain error context when error occurred. 
7174 * Compared to a simple hang, the system will stay stable at least for SSH
7175 * access. Then it should be trivial to inspect the hardware state and
7176 * see what's going on. Implemented as follows:
7177 *
7178 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc.),
7179 * clears all CPU mappings to the device, disallows remappings through page faults
7180 * 2. amdgpu_irq_disable_all() disables all interrupts
7181 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
7182 * 4. set adev->no_hw_access to avoid potential crashes after step 5
7183 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
7184 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
7185 * flush any in-flight DMA operations
7186 */
7187 void amdgpu_device_halt(struct amdgpu_device *adev)
7188 {
7189 struct pci_dev *pdev = adev->pdev;
7190 struct drm_device *ddev = adev_to_drm(adev);
7191
7192 amdgpu_xcp_dev_unplug(adev);
7193 drm_dev_unplug(ddev);
7194
7195 amdgpu_irq_disable_all(adev);
7196
7197 amdgpu_fence_driver_hw_fini(adev);
7198
7199 adev->no_hw_access = true;
7200
7201 amdgpu_device_unmap_mmio(adev);
7202
7203 pci_disable_device(pdev);
7204 pci_wait_for_pending_transaction(pdev);
7205 }
7206
7207 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
7208 u32 reg)
7209 {
7210 unsigned long flags, address, data;
7211 u32 r;
7212
7213 address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
7214 data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
7215
7216 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
7217 WREG32(address, reg * 4);
7218 (void)RREG32(address);
7219 r = RREG32(data);
7220 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
7221 return r;
7222 }
7223
7224 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
7225 u32 reg, u32 v)
7226 {
7227 unsigned long flags, address, data;
7228
7229 address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
7230 data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
7231
7232 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
7233 WREG32(address, reg * 4);
7234 (void)RREG32(address);
7235 WREG32(data, v);
7236 (void)RREG32(data);
7237 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
7238 }
7239
7240 /**
7241 * amdgpu_device_get_gang - return a reference to the current gang
7242 * @adev: amdgpu_device pointer
7243 *
7244 * Returns: A new reference to the current gang leader.
7245 */
7246 struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev)
7247 {
7248 struct dma_fence *fence;
7249
7250 rcu_read_lock();
7251 fence = dma_fence_get_rcu_safe(&adev->gang_submit);
7252 rcu_read_unlock();
7253 return fence;
7254 }
7255
7256 /**
7257 * amdgpu_device_switch_gang - switch to a new gang
7258 * @adev: amdgpu_device pointer
7259 * @gang: the gang to switch to
7260 *
7261 * Try to switch to a new gang.
7262 * Returns: NULL if we switched to the new gang or a reference to the current
7263 * gang leader.
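 *
 * A hedged usage sketch (one simple way a caller could handle the result; the
 * real submission path may instead treat the returned fence as a dependency):
 *
 *	struct dma_fence *old = amdgpu_device_switch_gang(adev, gang);
 *
 *	if (old) {
 *		/~ previous gang leader still running: wait for it, drop the
 *		 ~ reference and retry the switch later
 *		 ~/
 *		dma_fence_wait(old, false);
 *		dma_fence_put(old);
 *	}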
7264 */ 7265 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev, 7266 struct dma_fence *gang) 7267 { 7268 struct dma_fence *old = NULL; 7269 7270 dma_fence_get(gang); 7271 do { 7272 dma_fence_put(old); 7273 old = amdgpu_device_get_gang(adev); 7274 if (old == gang) 7275 break; 7276 7277 if (!dma_fence_is_signaled(old)) { 7278 dma_fence_put(gang); 7279 return old; 7280 } 7281 7282 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit, 7283 old, gang) != old); 7284 7285 /* 7286 * Drop it once for the exchanged reference in adev and once for the 7287 * thread local reference acquired in amdgpu_device_get_gang(). 7288 */ 7289 dma_fence_put(old); 7290 dma_fence_put(old); 7291 return NULL; 7292 } 7293 7294 /** 7295 * amdgpu_device_enforce_isolation - enforce HW isolation 7296 * @adev: the amdgpu device pointer 7297 * @ring: the HW ring the job is supposed to run on 7298 * @job: the job which is about to be pushed to the HW ring 7299 * 7300 * Makes sure that only one client at a time can use the GFX block. 7301 * Returns: The dependency to wait on before the job can be pushed to the HW. 7302 * The function is called multiple times until NULL is returned. 7303 */ 7304 struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev, 7305 struct amdgpu_ring *ring, 7306 struct amdgpu_job *job) 7307 { 7308 struct amdgpu_isolation *isolation = &adev->isolation[ring->xcp_id]; 7309 struct drm_sched_fence *f = job->base.s_fence; 7310 struct dma_fence *dep; 7311 void *owner; 7312 int r; 7313 7314 /* 7315 * For now enforce isolation only for the GFX block since we only need 7316 * the cleaner shader on those rings. 7317 */ 7318 if (ring->funcs->type != AMDGPU_RING_TYPE_GFX && 7319 ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE) 7320 return NULL; 7321 7322 /* 7323 * All submissions where enforce isolation is false are handled as if 7324 * they come from a single client. Use ~0l as the owner to distinct it 7325 * from kernel submissions where the owner is NULL. 7326 */ 7327 owner = job->enforce_isolation ? f->owner : (void *)~0l; 7328 7329 mutex_lock(&adev->enforce_isolation_mutex); 7330 7331 /* 7332 * The "spearhead" submission is the first one which changes the 7333 * ownership to its client. We always need to wait for it to be 7334 * pushed to the HW before proceeding with anything. 7335 */ 7336 if (&f->scheduled != isolation->spearhead && 7337 !dma_fence_is_signaled(isolation->spearhead)) { 7338 dep = isolation->spearhead; 7339 goto out_grab_ref; 7340 } 7341 7342 if (isolation->owner != owner) { 7343 7344 /* 7345 * Wait for any gang to be assembled before switching to a 7346 * different owner or otherwise we could deadlock the 7347 * submissions. 7348 */ 7349 if (!job->gang_submit) { 7350 dep = amdgpu_device_get_gang(adev); 7351 if (!dma_fence_is_signaled(dep)) 7352 goto out_return_dep; 7353 dma_fence_put(dep); 7354 } 7355 7356 dma_fence_put(isolation->spearhead); 7357 isolation->spearhead = dma_fence_get(&f->scheduled); 7358 amdgpu_sync_move(&isolation->active, &isolation->prev); 7359 trace_amdgpu_isolation(isolation->owner, owner); 7360 isolation->owner = owner; 7361 } 7362 7363 /* 7364 * Specifying the ring here helps to pipeline submissions even when 7365 * isolation is enabled. If that is not desired for testing NULL can be 7366 * used instead of the ring to enforce a CPU round trip while switching 7367 * between clients. 
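 * For example (hedged), a debugging variant that forces the CPU round trip
 * mentioned above would peek with a NULL ring instead:
 *
 *	dep = amdgpu_sync_peek_fence(&isolation->prev, NULL);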
7368 */
7369 dep = amdgpu_sync_peek_fence(&isolation->prev, ring);
7370 r = amdgpu_sync_fence(&isolation->active, &f->finished, GFP_NOWAIT);
7371 if (r)
7372 dev_warn(adev->dev, "OOM tracking isolation\n");
7373
7374 out_grab_ref:
7375 dma_fence_get(dep);
7376 out_return_dep:
7377 mutex_unlock(&adev->enforce_isolation_mutex);
7378 return dep;
7379 }
7380
7381 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
7382 {
7383 switch (adev->asic_type) {
7384 #ifdef CONFIG_DRM_AMDGPU_SI
7385 case CHIP_HAINAN:
7386 #endif
7387 case CHIP_TOPAZ:
7388 /* chips with no display hardware */
7389 return false;
7390 #ifdef CONFIG_DRM_AMDGPU_SI
7391 case CHIP_TAHITI:
7392 case CHIP_PITCAIRN:
7393 case CHIP_VERDE:
7394 case CHIP_OLAND:
7395 #endif
7396 #ifdef CONFIG_DRM_AMDGPU_CIK
7397 case CHIP_BONAIRE:
7398 case CHIP_HAWAII:
7399 case CHIP_KAVERI:
7400 case CHIP_KABINI:
7401 case CHIP_MULLINS:
7402 #endif
7403 case CHIP_TONGA:
7404 case CHIP_FIJI:
7405 case CHIP_POLARIS10:
7406 case CHIP_POLARIS11:
7407 case CHIP_POLARIS12:
7408 case CHIP_VEGAM:
7409 case CHIP_CARRIZO:
7410 case CHIP_STONEY:
7411 /* chips with display hardware */
7412 return true;
7413 default:
7414 /* IP discovery */
7415 if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
7416 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
7417 return false;
7418 return true;
7419 }
7420 }
7421
7422 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
7423 uint32_t inst, uint32_t reg_addr, char reg_name[],
7424 uint32_t expected_value, uint32_t mask)
7425 {
7426 uint32_t ret = 0;
7427 uint32_t old_ = 0;
7428 uint32_t tmp_ = RREG32(reg_addr);
7429 uint32_t loop = adev->usec_timeout;
7430
7431 while ((tmp_ & (mask)) != (expected_value)) {
7432 if (old_ != tmp_) {
7433 loop = adev->usec_timeout;
7434 old_ = tmp_;
7435 } else
7436 udelay(1);
7437 tmp_ = RREG32(reg_addr);
7438 loop--;
7439 if (!loop) {
7440 dev_warn(
7441 adev->dev,
7442 "Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
7443 inst, reg_name, (uint32_t)expected_value,
7444 (uint32_t)(tmp_ & (mask)));
7445 ret = -ETIMEDOUT;
7446 break;
7447 }
7448 }
7449 return ret;
7450 }
7451
7452 ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
7453 {
7454 ssize_t size = 0;
7455
7456 if (!ring || !ring->adev)
7457 return size;
7458
7459 if (amdgpu_device_should_recover_gpu(ring->adev))
7460 size |= AMDGPU_RESET_TYPE_FULL;
7461
7462 if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
7463 !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery)
7464 size |= AMDGPU_RESET_TYPE_SOFT_RESET;
7465
7466 return size;
7467 }
7468
7469 ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
7470 {
7471 ssize_t size = 0;
7472
7473 if (supported_reset == 0) {
7474 size += sysfs_emit_at(buf, size, "unsupported");
7475 size += sysfs_emit_at(buf, size, "\n");
7476 return size;
7477
7478 }
7479
7480 if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
7481 size += sysfs_emit_at(buf, size, "soft ");
7482
7483 if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
7484 size += sysfs_emit_at(buf, size, "queue ");
7485
7486 if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)
7487 size += sysfs_emit_at(buf, size, "pipe ");
7488
7489 if (supported_reset & AMDGPU_RESET_TYPE_FULL)
7490 size += sysfs_emit_at(buf, size, "full ");
7491
7492 size += sysfs_emit_at(buf, size, "\n");
7493 return size;
7494 }
7495
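/*
 * Example (editorial, hedged sketch): a sysfs "show" callback for a ring's
 * supported reset types would typically combine the two helpers above. The
 * attribute name and ring choice here are illustrative only:
 *
 *	static ssize_t gfx_reset_mask_show(struct device *dev,
 *					   struct device_attribute *attr,
 *					   char *buf)
 *	{
 *		struct drm_device *ddev = dev_get_drvdata(dev);
 *		struct amdgpu_device *adev = drm_to_adev(ddev);
 *		struct amdgpu_ring *ring = &adev->gfx.gfx_ring[0];
 *
 *		return amdgpu_show_reset_mask(buf,
 *				amdgpu_get_soft_full_reset_mask(ring));
 *	}
 */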