/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */

#include <linux/aperture.h>
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/pci-p2pdma.h>
#include <linux/apple-gmux.h>

#include <drm/drm_atomic_helper.h>
#include <drm/drm_client_event.h>
#include <drm/drm_crtc_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/device.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"
#include "amdgpu_virt.h"
#include "amdgpu_dev_coredump.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

#if IS_ENABLED(CONFIG_X86)
#include <asm/intel-family.h>
#include <asm/cpu_device_id.h>
#endif

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
#define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
#define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
#define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)

#define AMDGPU_VBIOS_SKIP (1U << 0)
#define AMDGPU_VBIOS_OPTIONAL (1U << 1)

static const struct drm_driver amdgpu_kms_driver;

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"ALDEBARAN",
	"NAVI10",
	"CYAN_SKILLFISH",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"VANGOGH",
	"DIMGREY_CAVEFISH",
	"BEIGE_GOBY",
	"YELLOW_CARP",
	"IP DISCOVERY",
	"LAST",
};

#define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0)
/*
 * Default init level where all blocks are expected to be initialized. This is
 * the level of initialization expected by default and also after a full reset
 * of the device.
 */
struct amdgpu_init_level amdgpu_init_default = {
	.level = AMDGPU_INIT_LEVEL_DEFAULT,
	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

struct amdgpu_init_level amdgpu_init_recovery = {
	.level = AMDGPU_INIT_LEVEL_RESET_RECOVERY,
	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

/*
 * Minimal blocks needed to be initialized before a XGMI hive can be reset. This
 * is used for cases like reset on initialization where the entire hive needs to
 * be reset before first use.
 */
struct amdgpu_init_level amdgpu_init_minimal_xgmi = {
	.level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI,
	.hwini_ip_block_mask =
		BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) |
		BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) |
		BIT(AMD_IP_BLOCK_TYPE_PSP)
};

static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev,
					     enum amd_ip_block_type block)
{
	return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0;
}

void amdgpu_set_init_level(struct amdgpu_device *adev,
			   enum amdgpu_init_lvl_id lvl)
{
	switch (lvl) {
	case AMDGPU_INIT_LEVEL_MINIMAL_XGMI:
		adev->init_lvl = &amdgpu_init_minimal_xgmi;
		break;
	case AMDGPU_INIT_LEVEL_RESET_RECOVERY:
		adev->init_lvl = &amdgpu_init_recovery;
		break;
	case AMDGPU_INIT_LEVEL_DEFAULT:
		fallthrough;
	default:
		adev->init_lvl = &amdgpu_init_default;
		break;
	}
}

static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
				     void *data);

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
 */
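
/*
 * Illustrative usage of the attribute above (not part of the driver; the PCI
 * address is made up for the example): once registered, userspace can read
 * the counter from the device's sysfs directory, e.g.
 *
 *   $ cat /sys/bus/pci/devices/0000:03:00.0/pcie_replay_count
 *   0
 *
 * The file sits next to the other amdgpu attributes of the device.
 */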

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return sysfs_emit(buf, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, 0444,
		   amdgpu_device_get_pcie_replay_count, NULL);

static int amdgpu_device_attr_sysfs_init(struct amdgpu_device *adev)
{
	int ret = 0;

	if (amdgpu_nbio_is_replay_cnt_supported(adev))
		ret = sysfs_create_file(&adev->dev->kobj,
					&dev_attr_pcie_replay_count.attr);

	return ret;
}

static void amdgpu_device_attr_sysfs_fini(struct amdgpu_device *adev)
{
	if (amdgpu_nbio_is_replay_cnt_supported(adev))
		sysfs_remove_file(&adev->dev->kobj,
				  &dev_attr_pcie_replay_count.attr);
}

static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
					  const struct bin_attribute *attr, char *buf,
					  loff_t ppos, size_t count)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	ssize_t bytes_read;

	switch (ppos) {
	case AMDGPU_SYS_REG_STATE_XGMI:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_WAFL:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_PCIE:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR_1:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count);
		break;
	default:
		return -EINVAL;
	}

	return bytes_read;
}

static const BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL,
		      AMDGPU_SYS_REG_STATE_END);

int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev)
{
	int ret;

	if (!amdgpu_asic_get_reg_state_supported(adev))
		return 0;

	ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state);

	return ret;
}

void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
{
	if (!amdgpu_asic_get_reg_state_supported(adev))
		return;
	sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
}

int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block)
{
	int r;

	if (ip_block->version->funcs->suspend) {
		r = ip_block->version->funcs->suspend(ip_block);
		if (r) {
			dev_err(ip_block->adev->dev,
				"suspend of IP block <%s> failed %d\n",
				ip_block->version->funcs->name, r);
			return r;
		}
	}

	ip_block->status.hw = false;
	return 0;
}

int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block)
{
	int r;

	if (ip_block->version->funcs->resume) {
		r = ip_block->version->funcs->resume(ip_block);
		if (r) {
			dev_err(ip_block->adev->dev,
				"resume of IP block <%s> failed %d\n",
				ip_block->version->funcs->name, r);
			return r;
		}
	}

	ip_block->status.hw = true;
	return 0;
}

/**
 * DOC: board_info
 *
 * The amdgpu driver provides a sysfs API for giving board related information.
 * It provides the form factor information in the format
 *
 *   type : form factor
 *
 * Possible form factor values
 *
 * - "cem"     - PCIE CEM card
 * - "oam"     - Open Compute Accelerator Module
 * - "unknown" - Not known
 *
 */

static ssize_t amdgpu_device_get_board_info(struct device *dev,
					    struct device_attribute *attr,
					    char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
	const char *pkg;

	if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
		pkg_type = adev->smuio.funcs->get_pkg_type(adev);

	switch (pkg_type) {
	case AMDGPU_PKG_TYPE_CEM:
		pkg = "cem";
		break;
	case AMDGPU_PKG_TYPE_OAM:
		pkg = "oam";
		break;
	default:
		pkg = "unknown";
		break;
	}

	return sysfs_emit(buf, "%s : %s\n", "type", pkg);
}

static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);

static struct attribute *amdgpu_board_attrs[] = {
	&dev_attr_board_info.attr,
	NULL,
};

static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
					     struct attribute *attr, int n)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	if (adev->flags & AMD_IS_APU)
		return 0;

	return attr->mode;
}

static const struct attribute_group amdgpu_board_attrs_group = {
	.attrs = amdgpu_board_attrs,
	.is_visible = amdgpu_board_attrs_is_visible
};

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);


/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE))
		return false;

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Return:
 * 1 if the device supports BACO;
 * 3 if the device supports MACO (only works if BACO is supported)
 * otherwise return 0.
 */
int amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
{
	struct drm_device *dev;
	int bamaco_support;

	dev = adev_to_drm(adev);

	adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
	bamaco_support = amdgpu_device_supports_baco(dev);

	switch (amdgpu_runtime_pm) {
	case 2:
		if (bamaco_support & MACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
			dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
		} else if (bamaco_support == BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Requested mode BAMACO not available, falling back to BACO\n");
		}
		break;
	case 1:
		if (bamaco_support & BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Forcing BACO for runtime pm\n");
		}
		break;
	case -1:
	case -2:
		if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
			dev_info(adev->dev, "Using ATPX for runtime pm\n");
		} else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
			dev_info(adev->dev, "Using BOCO for runtime pm\n");
		} else {
			if (!bamaco_support)
				goto no_runtime_pm;

			switch (adev->asic_type) {
			case CHIP_VEGA20:
			case CHIP_ARCTURUS:
				/* BACO is not supported on vega20 and arcturus */
				break;
			case CHIP_VEGA10:
				/* enable BACO as runpm mode if noretry=0 */
				if (!adev->gmc.noretry && !amdgpu_passthrough(adev))
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			default:
				/* enable BACO as runpm mode on CI+ */
				if (!amdgpu_passthrough(adev))
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			}

			if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
				if (bamaco_support & MACO_SUPPORT) {
					adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
					dev_info(adev->dev, "Using BAMACO for runtime pm\n");
				} else {
					dev_info(adev->dev, "Using BACO for runtime pm\n");
				}
			}
		}
		break;
	case 0:
		dev_info(adev->dev, "runtime pm is manually disabled\n");
		break;
	default:
		break;
	}

no_runtime_pm:
	if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
		dev_info(adev->dev, "Runtime PM not available\n");
}
/**
 * amdgpu_device_supports_smart_shift - Is the device dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}
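
/*
 * Illustrative sketch (not part of the driver): reading one dword from VRAM
 * through the MM_INDEX/MM_DATA window using the helper above. The offset and
 * variable names are made up for the example; @pos and @size must stay dword
 * aligned as the helper asserts.
 *
 *	u32 val;
 *
 *	amdgpu_device_mm_access(adev, 0x1000, &val, sizeof(val), false);
 *	dev_info(adev->dev, "VRAM dword at 0x1000 = 0x%08x\n", val);
 */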

/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 *
 * Returns the number of bytes that have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, sizeof(@buf) must be >= @size
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try to use the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: bytes offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}


/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
				uint32_t reg, uint32_t acc_flags,
				uint32_t xcc_id)
{
	uint32_t ret, rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	return ret;
}
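
/*
 * Illustrative sketch (not part of the driver): a read-modify-write through
 * amdgpu_device_rreg() above and its write counterpart amdgpu_device_wreg()
 * below. The register offset and bit are made up for the example; acc_flags
 * of 0 takes the default (KIQ-aware) path.
 *
 *	u32 tmp;
 *
 *	tmp = amdgpu_device_rreg(adev, 0x1234, 0);
 *	tmp |= BIT(0);
 *	amdgpu_device_wreg(adev, 0x1234, tmp, 0);
 */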

/*
 * MMIO register write with bytes helper functions
 * @offset: bytes offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/**
 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 * @xcc_id: xcc accelerated compute core id
 *
 * This function is invoked only for the debugfs register access.
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v,
			     uint32_t xcc_id)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_xcc_wreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t v,
			    uint32_t acc_flags, uint32_t xcc_id)
{
	uint32_t rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, true,
							 &rlcg_flag)) {
			amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u32 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
				    u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	if (unlikely(!adev->nbio.funcs)) {
		pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
		pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
	} else {
		pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
		pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	}

	if (reg_addr >> 32) {
		if (unlikely(!adev->nbio.funcs))
			pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
		else
			pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	} else {
		pcie_index_hi = 0;
	}

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
				      u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
			pcie_index_hi * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r |= ((u64)readl(pcie_data_offset) << 32);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
				     u64 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
				       u64 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_rev_id - query device rev_id
 *
 * @adev: amdgpu_device pointer
 *
 * Return device rev_id
 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
{
	return adev->nbio.funcs->get_rev_id(adev);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	dev_err(adev->dev, "Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
{
	dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write register 0x%04X with 0x%08X\n", reg,
		v);
	BUG();
}

static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write register 0x%llX with 0x%08X\n", reg,
		v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	dev_err(adev->dev, "Invalid callback to read 64 bit register 0x%04X\n",
		reg);
	BUG();
	return 0;
}

static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
{
	dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		reg, v);
	BUG();
}

static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
		reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	dev_err(adev->dev,
		"Invalid callback to read register 0x%04X in block 0x%04X\n",
		reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	dev_err(adev->dev,
		"Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		reg, block, v);
	BUG();
}

static uint32_t amdgpu_device_get_vbios_flags(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
		return AMDGPU_VBIOS_SKIP;

	if (hweight32(adev->aid_mask) && amdgpu_passthrough(adev))
		return AMDGPU_VBIOS_OPTIONAL;

	return 0;
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	uint32_t flags;
	bool optional;
	int ret;

	amdgpu_asic_pre_asic_init(adev);
	flags = amdgpu_device_get_vbios_flags(adev);
	optional = !!(flags & (AMDGPU_VBIOS_OPTIONAL | AMDGPU_VBIOS_SKIP));

	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
		amdgpu_psp_wait_for_bootloader(adev);
		if (optional && !adev->bios)
			return 0;

		ret = amdgpu_atomfirmware_asic_init(adev, true);
		return ret;
	} else {
		if (optional && !adev->bios)
			return 0;

		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
	}

	return 0;
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with and/or masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}
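
/*
 * Illustrative golden-register sequence for
 * amdgpu_device_program_register_sequence() above (not part of the driver):
 * entries come in (reg, and_mask, or_mask) triplets, so the made-up array
 * below would clear the masked bits of one register before OR-ing in new
 * values, while an and_mask of 0xffffffff means "write or_mask as-is".
 *
 *	static const u32 example_golden_regs[] = {
 *		0x1234, 0x0000000f, 0x00000002,
 *		0x5678, 0xffffffff, 0xdeadbeef,
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev, example_golden_regs,
 *						ARRAY_SIZE(example_golden_regs));
 */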

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or an -error on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long flags, offset;

	spin_lock_irqsave(&adev->wb.lock, flags);
	offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	unsigned long flags;

	wb >>= 3;
	spin_lock_irqsave(&adev->wb.lock, flags);
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
	spin_unlock_irqrestore(&adev->wb.lock, flags);
}
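
/*
 * Illustrative lifecycle of a writeback slot using the helpers above (not
 * part of the driver; names are made up for the example). The returned index
 * is a dword offset into adev->wb.wb, and adev->wb.gpu_addr + wb_idx * 4 is
 * the address the GPU writes to.
 *
 *	u32 wb_idx;
 *	int r;
 *
 *	r = amdgpu_device_wb_get(adev, &wb_idx);
 *	if (r)
 *		return r;
 *	... let the GPU write status into adev->wb.wb[wb_idx] ...
 *	amdgpu_device_wb_free(adev, wb_idx);
 */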

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned int i;
	u16 cmd;
	int r;

	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
		return 0;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	if (!amdgpu_rebar)
		return 0;

	/* resizing on Dell G5 SE platforms causes problems with runtime pm */
	if ((amdgpu_runtime_pm != 0) &&
	    adev->pdev->vendor == PCI_VENDOR_ID_ATI &&
	    adev->pdev->device == 0x731f &&
	    adev->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL)
		return 0;

	/* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
	if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
		dev_warn(
			adev->dev,
			"System can't access extended configuration space, please check!!\n");

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		dev_info(adev->dev,
			 "Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		dev_err(adev->dev, "Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

/*
 * GPU helper functions.
 */
/**
 * amdgpu_device_need_post - check if the hw need post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if post is needed or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg, flags;

	if (amdgpu_sriov_vf(adev))
		return false;

	flags = amdgpu_device_get_vbios_flags(adev);
	if (flags & AMDGPU_VBIOS_SKIP)
		return false;
	if ((flags & AMDGPU_VBIOS_OPTIONAL) && !adev->bios)
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: in the whole GPU pass-through virtualization case, after a VM
		 * reboot some old smc fw still needs the driver to do vPost, otherwise the
		 * gpu hangs. smc fw versions above 22.15 don't have this flaw, so we force
		 * vPost to be executed for smc versions below 22.15
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			release_firmware(adev->pm.fw);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/*
 * Check whether seamless boot is supported.
 *
 * So far we only support seamless boot on DCE 3.0 or later.
 * If users report that it works on older ASICS as well, we may
 * loosen this.
 */
bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
{
	switch (amdgpu_seamless) {
	case -1:
		break;
	case 1:
		return true;
	case 0:
		return false;
	default:
		dev_err(adev->dev, "Invalid value for amdgpu.seamless: %d\n",
			amdgpu_seamless);
		return false;
	}

	if (!(adev->flags & AMD_IS_APU))
		return false;

	if (adev->mman.keep_stolen_vga_memory)
		return false;

	return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
}

/*
 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
 * don't support dynamic speed switching. Until we have confirmation from Intel
 * that a specific host supports it, it's safer that we keep it disabled for all.
 *
 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
 */
static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	/* eGPU change speeds based on USB4 fabric conditions */
	if (dev_is_removable(adev->dev))
		return true;

	if (c->x86_vendor == X86_VENDOR_INTEL)
		return false;
#endif
	return true;
}

static bool amdgpu_device_aspm_support_quirk(struct amdgpu_device *adev)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	if (!(amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 0) ||
	      amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 1)))
		return false;

	if (c->x86 == 6 &&
	    adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5) {
		switch (c->x86_model) {
		case VFM_MODEL(INTEL_ALDERLAKE):
		case VFM_MODEL(INTEL_ALDERLAKE_L):
		case VFM_MODEL(INTEL_RAPTORLAKE):
		case VFM_MODEL(INTEL_RAPTORLAKE_P):
		case VFM_MODEL(INTEL_RAPTORLAKE_S):
			return true;
		default:
			return false;
		}
	} else {
		return false;
	}
#else
	return false;
#endif
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
	switch (amdgpu_aspm) {
	case -1:
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	if (adev->flags & AMD_IS_APU)
		return false;
	if (amdgpu_device_aspm_support_quirk(adev))
		return false;
	return pcie_aspm_enabled(adev->pdev);
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
		bool state)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));

	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines number of bits in page table versus page directory,
 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
 * page table and the remaining bits are in the page directory.
1978 */ 1979 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1980 { 1981 /* defines number of bits in page table versus page directory, 1982 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1983 * page table and the remaining bits are in the page directory 1984 */ 1985 if (amdgpu_vm_block_size == -1) 1986 return; 1987 1988 if (amdgpu_vm_block_size < 9) { 1989 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1990 amdgpu_vm_block_size); 1991 amdgpu_vm_block_size = -1; 1992 } 1993 } 1994 1995 /** 1996 * amdgpu_device_check_vm_size - validate the vm size 1997 * 1998 * @adev: amdgpu_device pointer 1999 * 2000 * Validates the vm size in GB specified via module parameter. 2001 * The VM size is the size of the GPU virtual memory space in GB. 2002 */ 2003 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 2004 { 2005 /* no need to check the default value */ 2006 if (amdgpu_vm_size == -1) 2007 return; 2008 2009 if (amdgpu_vm_size < 1) { 2010 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 2011 amdgpu_vm_size); 2012 amdgpu_vm_size = -1; 2013 } 2014 } 2015 2016 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 2017 { 2018 struct sysinfo si; 2019 bool is_os_64 = (sizeof(void *) == 8); 2020 uint64_t total_memory; 2021 uint64_t dram_size_seven_GB = 0x1B8000000; 2022 uint64_t dram_size_three_GB = 0xB8000000; 2023 2024 if (amdgpu_smu_memory_pool_size == 0) 2025 return; 2026 2027 if (!is_os_64) { 2028 dev_warn(adev->dev, "Not 64-bit OS, feature not supported\n"); 2029 goto def_value; 2030 } 2031 si_meminfo(&si); 2032 total_memory = (uint64_t)si.totalram * si.mem_unit; 2033 2034 if ((amdgpu_smu_memory_pool_size == 1) || 2035 (amdgpu_smu_memory_pool_size == 2)) { 2036 if (total_memory < dram_size_three_GB) 2037 goto def_value1; 2038 } else if ((amdgpu_smu_memory_pool_size == 4) || 2039 (amdgpu_smu_memory_pool_size == 8)) { 2040 if (total_memory < dram_size_seven_GB) 2041 goto def_value1; 2042 } else { 2043 dev_warn(adev->dev, "Smu memory pool size not supported\n"); 2044 goto def_value; 2045 } 2046 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 2047 2048 return; 2049 2050 def_value1: 2051 dev_warn(adev->dev, "No enough system memory\n"); 2052 def_value: 2053 adev->pm.smu_prv_buffer_size = 0; 2054 } 2055 2056 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 2057 { 2058 if (!(adev->flags & AMD_IS_APU) || 2059 adev->asic_type < CHIP_RAVEN) 2060 return 0; 2061 2062 switch (adev->asic_type) { 2063 case CHIP_RAVEN: 2064 if (adev->pdev->device == 0x15dd) 2065 adev->apu_flags |= AMD_APU_IS_RAVEN; 2066 if (adev->pdev->device == 0x15d8) 2067 adev->apu_flags |= AMD_APU_IS_PICASSO; 2068 break; 2069 case CHIP_RENOIR: 2070 if ((adev->pdev->device == 0x1636) || 2071 (adev->pdev->device == 0x164c)) 2072 adev->apu_flags |= AMD_APU_IS_RENOIR; 2073 else 2074 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 2075 break; 2076 case CHIP_VANGOGH: 2077 adev->apu_flags |= AMD_APU_IS_VANGOGH; 2078 break; 2079 case CHIP_YELLOW_CARP: 2080 break; 2081 case CHIP_CYAN_SKILLFISH: 2082 if ((adev->pdev->device == 0x13FE) || 2083 (adev->pdev->device == 0x143F)) 2084 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 2085 break; 2086 default: 2087 break; 2088 } 2089 2090 return 0; 2091 } 2092 2093 /** 2094 * amdgpu_device_check_arguments - validate module params 2095 * 2096 * @adev: amdgpu_device pointer 2097 * 2098 * Validates certain module parameters and updates 2099 * the associated values used by the 
driver (all asics). 2100 */ 2101 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 2102 { 2103 int i; 2104 2105 if (amdgpu_sched_jobs < 4) { 2106 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 2107 amdgpu_sched_jobs); 2108 amdgpu_sched_jobs = 4; 2109 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 2110 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 2111 amdgpu_sched_jobs); 2112 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 2113 } 2114 2115 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 2116 /* gart size must be greater or equal to 32M */ 2117 dev_warn(adev->dev, "gart size (%d) too small\n", 2118 amdgpu_gart_size); 2119 amdgpu_gart_size = -1; 2120 } 2121 2122 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 2123 /* gtt size must be greater or equal to 32M */ 2124 dev_warn(adev->dev, "gtt size (%d) too small\n", 2125 amdgpu_gtt_size); 2126 amdgpu_gtt_size = -1; 2127 } 2128 2129 /* valid range is between 4 and 9 inclusive */ 2130 if (amdgpu_vm_fragment_size != -1 && 2131 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 2132 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 2133 amdgpu_vm_fragment_size = -1; 2134 } 2135 2136 if (amdgpu_sched_hw_submission < 2) { 2137 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 2138 amdgpu_sched_hw_submission); 2139 amdgpu_sched_hw_submission = 2; 2140 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 2141 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 2142 amdgpu_sched_hw_submission); 2143 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 2144 } 2145 2146 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 2147 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 2148 amdgpu_reset_method = -1; 2149 } 2150 2151 amdgpu_device_check_smu_prv_buffer_size(adev); 2152 2153 amdgpu_device_check_vm_size(adev); 2154 2155 amdgpu_device_check_block_size(adev); 2156 2157 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 2158 2159 for (i = 0; i < MAX_XCP; i++) { 2160 switch (amdgpu_enforce_isolation) { 2161 case -1: 2162 case 0: 2163 default: 2164 /* disable */ 2165 adev->enforce_isolation[i] = AMDGPU_ENFORCE_ISOLATION_DISABLE; 2166 break; 2167 case 1: 2168 /* enable */ 2169 adev->enforce_isolation[i] = 2170 AMDGPU_ENFORCE_ISOLATION_ENABLE; 2171 break; 2172 case 2: 2173 /* enable legacy mode */ 2174 adev->enforce_isolation[i] = 2175 AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY; 2176 break; 2177 case 3: 2178 /* enable only process isolation without submitting cleaner shader */ 2179 adev->enforce_isolation[i] = 2180 AMDGPU_ENFORCE_ISOLATION_NO_CLEANER_SHADER; 2181 break; 2182 } 2183 } 2184 2185 return 0; 2186 } 2187 2188 /** 2189 * amdgpu_switcheroo_set_state - set switcheroo state 2190 * 2191 * @pdev: pci dev pointer 2192 * @state: vga_switcheroo state 2193 * 2194 * Callback for the switcheroo driver. Suspends or resumes 2195 * the asics before or after it is powered up using ACPI methods. 
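 * Note that on PX (hybrid graphics) platforms a request to switch the dGPU off is ignored here (see the amdgpu_device_supports_px() check below); power-down on those systems is handled elsewhere, typically via runtime PM.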
2196 */ 2197 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 2198 enum vga_switcheroo_state state) 2199 { 2200 struct drm_device *dev = pci_get_drvdata(pdev); 2201 int r; 2202 2203 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 2204 return; 2205 2206 if (state == VGA_SWITCHEROO_ON) { 2207 pr_info("switched on\n"); 2208 /* don't suspend or resume card normally */ 2209 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2210 2211 pci_set_power_state(pdev, PCI_D0); 2212 amdgpu_device_load_pci_state(pdev); 2213 r = pci_enable_device(pdev); 2214 if (r) 2215 dev_warn(&pdev->dev, "pci_enable_device failed (%d)\n", 2216 r); 2217 amdgpu_device_resume(dev, true); 2218 2219 dev->switch_power_state = DRM_SWITCH_POWER_ON; 2220 } else { 2221 dev_info(&pdev->dev, "switched off\n"); 2222 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2223 amdgpu_device_prepare(dev); 2224 amdgpu_device_suspend(dev, true); 2225 amdgpu_device_cache_pci_state(pdev); 2226 /* Shut down the device */ 2227 pci_disable_device(pdev); 2228 pci_set_power_state(pdev, PCI_D3cold); 2229 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 2230 } 2231 } 2232 2233 /** 2234 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 2235 * 2236 * @pdev: pci dev pointer 2237 * 2238 * Callback for the switcheroo driver. Check of the switcheroo 2239 * state can be changed. 2240 * Returns true if the state can be changed, false if not. 2241 */ 2242 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 2243 { 2244 struct drm_device *dev = pci_get_drvdata(pdev); 2245 2246 /* 2247 * FIXME: open_count is protected by drm_global_mutex but that would lead to 2248 * locking inversion with the driver load path. And the access here is 2249 * completely racy anyway. So don't bother with locking for now. 2250 */ 2251 return atomic_read(&dev->open_count) == 0; 2252 } 2253 2254 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 2255 .set_gpu_state = amdgpu_switcheroo_set_state, 2256 .reprobe = NULL, 2257 .can_switch = amdgpu_switcheroo_can_switch, 2258 }; 2259 2260 /** 2261 * amdgpu_device_ip_set_clockgating_state - set the CG state 2262 * 2263 * @dev: amdgpu_device pointer 2264 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2265 * @state: clockgating state (gate or ungate) 2266 * 2267 * Sets the requested clockgating state for all instances of 2268 * the hardware IP specified. 2269 * Returns the error code from the last instance. 2270 */ 2271 int amdgpu_device_ip_set_clockgating_state(void *dev, 2272 enum amd_ip_block_type block_type, 2273 enum amd_clockgating_state state) 2274 { 2275 struct amdgpu_device *adev = dev; 2276 int i, r = 0; 2277 2278 for (i = 0; i < adev->num_ip_blocks; i++) { 2279 if (!adev->ip_blocks[i].status.valid) 2280 continue; 2281 if (adev->ip_blocks[i].version->type != block_type) 2282 continue; 2283 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 2284 continue; 2285 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 2286 &adev->ip_blocks[i], state); 2287 if (r) 2288 dev_err(adev->dev, 2289 "set_clockgating_state of IP block <%s> failed %d\n", 2290 adev->ip_blocks[i].version->funcs->name, r); 2291 } 2292 return r; 2293 } 2294 2295 /** 2296 * amdgpu_device_ip_set_powergating_state - set the PG state 2297 * 2298 * @dev: amdgpu_device pointer 2299 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 
2300 * @state: powergating state (gate or ungate) 2301 * 2302 * Sets the requested powergating state for all instances of 2303 * the hardware IP specified. 2304 * Returns the error code from the last instance. 2305 */ 2306 int amdgpu_device_ip_set_powergating_state(void *dev, 2307 enum amd_ip_block_type block_type, 2308 enum amd_powergating_state state) 2309 { 2310 struct amdgpu_device *adev = dev; 2311 int i, r = 0; 2312 2313 for (i = 0; i < adev->num_ip_blocks; i++) { 2314 if (!adev->ip_blocks[i].status.valid) 2315 continue; 2316 if (adev->ip_blocks[i].version->type != block_type) 2317 continue; 2318 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 2319 continue; 2320 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 2321 &adev->ip_blocks[i], state); 2322 if (r) 2323 dev_err(adev->dev, 2324 "set_powergating_state of IP block <%s> failed %d\n", 2325 adev->ip_blocks[i].version->funcs->name, r); 2326 } 2327 return r; 2328 } 2329 2330 /** 2331 * amdgpu_device_ip_get_clockgating_state - get the CG state 2332 * 2333 * @adev: amdgpu_device pointer 2334 * @flags: clockgating feature flags 2335 * 2336 * Walks the list of IPs on the device and updates the clockgating 2337 * flags for each IP. 2338 * Updates @flags with the feature flags for each hardware IP where 2339 * clockgating is enabled. 2340 */ 2341 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 2342 u64 *flags) 2343 { 2344 int i; 2345 2346 for (i = 0; i < adev->num_ip_blocks; i++) { 2347 if (!adev->ip_blocks[i].status.valid) 2348 continue; 2349 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 2350 adev->ip_blocks[i].version->funcs->get_clockgating_state( 2351 &adev->ip_blocks[i], flags); 2352 } 2353 } 2354 2355 /** 2356 * amdgpu_device_ip_wait_for_idle - wait for idle 2357 * 2358 * @adev: amdgpu_device pointer 2359 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2360 * 2361 * Waits for the requested hardware IP to be idle. 2362 * Returns 0 for success or a negative error code on failure. 2363 */ 2364 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 2365 enum amd_ip_block_type block_type) 2366 { 2367 int i, r; 2368 2369 for (i = 0; i < adev->num_ip_blocks; i++) { 2370 if (!adev->ip_blocks[i].status.valid) 2371 continue; 2372 if (adev->ip_blocks[i].version->type == block_type) { 2373 if (adev->ip_blocks[i].version->funcs->wait_for_idle) { 2374 r = adev->ip_blocks[i].version->funcs->wait_for_idle( 2375 &adev->ip_blocks[i]); 2376 if (r) 2377 return r; 2378 } 2379 break; 2380 } 2381 } 2382 return 0; 2383 2384 } 2385 2386 /** 2387 * amdgpu_device_ip_is_valid - is the hardware IP enabled 2388 * 2389 * @adev: amdgpu_device pointer 2390 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2391 * 2392 * Check if the hardware IP is enabled or not. 2393 * Returns true if the IP is enabled, false if not. 2394 */ 2395 bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev, 2396 enum amd_ip_block_type block_type) 2397 { 2398 int i; 2399 2400 for (i = 0; i < adev->num_ip_blocks; i++) { 2401 if (adev->ip_blocks[i].version->type == block_type) 2402 return adev->ip_blocks[i].status.valid; 2403 } 2404 return false; 2405 2406 } 2407 2408 /** 2409 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 2410 * 2411 * @adev: amdgpu_device pointer 2412 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 2413 * 2414 * Returns a pointer to the hardware IP block structure 2415 * if it exists for the asic, otherwise NULL.
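 * This can also be used to check whether an optional IP block is present at all; for example, amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_VCN) returns NULL when the VCN block was never added (e.g. because it was harvested).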
2416 */ 2417 struct amdgpu_ip_block * 2418 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 2419 enum amd_ip_block_type type) 2420 { 2421 int i; 2422 2423 for (i = 0; i < adev->num_ip_blocks; i++) 2424 if (adev->ip_blocks[i].version->type == type) 2425 return &adev->ip_blocks[i]; 2426 2427 return NULL; 2428 } 2429 2430 /** 2431 * amdgpu_device_ip_block_version_cmp 2432 * 2433 * @adev: amdgpu_device pointer 2434 * @type: enum amd_ip_block_type 2435 * @major: major version 2436 * @minor: minor version 2437 * 2438 * return 0 if equal or greater 2439 * return 1 if smaller or the ip_block doesn't exist 2440 */ 2441 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 2442 enum amd_ip_block_type type, 2443 u32 major, u32 minor) 2444 { 2445 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 2446 2447 if (ip_block && ((ip_block->version->major > major) || 2448 ((ip_block->version->major == major) && 2449 (ip_block->version->minor >= minor)))) 2450 return 0; 2451 2452 return 1; 2453 } 2454 2455 /** 2456 * amdgpu_device_ip_block_add 2457 * 2458 * @adev: amdgpu_device pointer 2459 * @ip_block_version: pointer to the IP to add 2460 * 2461 * Adds the IP block driver information to the collection of IPs 2462 * on the asic. 2463 */ 2464 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 2465 const struct amdgpu_ip_block_version *ip_block_version) 2466 { 2467 if (!ip_block_version) 2468 return -EINVAL; 2469 2470 switch (ip_block_version->type) { 2471 case AMD_IP_BLOCK_TYPE_VCN: 2472 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2473 return 0; 2474 break; 2475 case AMD_IP_BLOCK_TYPE_JPEG: 2476 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2477 return 0; 2478 break; 2479 default: 2480 break; 2481 } 2482 2483 dev_info(adev->dev, "detected ip block number %d <%s>\n", 2484 adev->num_ip_blocks, ip_block_version->funcs->name); 2485 2486 adev->ip_blocks[adev->num_ip_blocks].adev = adev; 2487 2488 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2489 2490 return 0; 2491 } 2492 2493 /** 2494 * amdgpu_device_enable_virtual_display - enable virtual display feature 2495 * 2496 * @adev: amdgpu_device pointer 2497 * 2498 * Enabled the virtual display feature if the user has enabled it via 2499 * the module parameter virtual_display. This feature provides a virtual 2500 * display hardware on headless boards or in virtualized environments. 2501 * This function parses and validates the configuration string specified by 2502 * the user and configures the virtual display configuration (number of 2503 * virtual connectors, crtcs, etc.) specified. 
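 * The string is a semicolon-separated list of entries of the form <PCI address>[,<num_crtc>], e.g. "0000:01:00.0,2;0000:02:00.0", or the literal "all" to match every device; num_crtc is clamped to the range 1-6 and defaults to 1.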
2504 */ 2505 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2506 { 2507 adev->enable_virtual_display = false; 2508 2509 if (amdgpu_virtual_display) { 2510 const char *pci_address_name = pci_name(adev->pdev); 2511 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2512 2513 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2514 pciaddstr_tmp = pciaddstr; 2515 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2516 pciaddname = strsep(&pciaddname_tmp, ","); 2517 if (!strcmp("all", pciaddname) 2518 || !strcmp(pci_address_name, pciaddname)) { 2519 long num_crtc; 2520 int res = -1; 2521 2522 adev->enable_virtual_display = true; 2523 2524 if (pciaddname_tmp) 2525 res = kstrtol(pciaddname_tmp, 10, 2526 &num_crtc); 2527 2528 if (!res) { 2529 if (num_crtc < 1) 2530 num_crtc = 1; 2531 if (num_crtc > 6) 2532 num_crtc = 6; 2533 adev->mode_info.num_crtc = num_crtc; 2534 } else { 2535 adev->mode_info.num_crtc = 1; 2536 } 2537 break; 2538 } 2539 } 2540 2541 dev_info( 2542 adev->dev, 2543 "virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2544 amdgpu_virtual_display, pci_address_name, 2545 adev->enable_virtual_display, adev->mode_info.num_crtc); 2546 2547 kfree(pciaddstr); 2548 } 2549 } 2550 2551 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2552 { 2553 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2554 adev->mode_info.num_crtc = 1; 2555 adev->enable_virtual_display = true; 2556 dev_info(adev->dev, "virtual_display:%d, num_crtc:%d\n", 2557 adev->enable_virtual_display, 2558 adev->mode_info.num_crtc); 2559 } 2560 } 2561 2562 /** 2563 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2564 * 2565 * @adev: amdgpu_device pointer 2566 * 2567 * Parses the asic configuration parameters specified in the gpu info 2568 * firmware and makes them available to the driver for use in configuring 2569 * the asic. 2570 * Returns 0 on success, -EINVAL on failure. 
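 * ASICs that provide an IP discovery table (adev->mman.discovery_bin) do not need this firmware and the function returns early for them; the gpu_info firmware is only requested for the legacy ASICs listed below.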
2571 */ 2572 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2573 { 2574 const char *chip_name; 2575 int err; 2576 const struct gpu_info_firmware_header_v1_0 *hdr; 2577 2578 adev->firmware.gpu_info_fw = NULL; 2579 2580 if (adev->mman.discovery_bin) 2581 return 0; 2582 2583 switch (adev->asic_type) { 2584 default: 2585 return 0; 2586 case CHIP_VEGA10: 2587 chip_name = "vega10"; 2588 break; 2589 case CHIP_VEGA12: 2590 chip_name = "vega12"; 2591 break; 2592 case CHIP_RAVEN: 2593 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2594 chip_name = "raven2"; 2595 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2596 chip_name = "picasso"; 2597 else 2598 chip_name = "raven"; 2599 break; 2600 case CHIP_ARCTURUS: 2601 chip_name = "arcturus"; 2602 break; 2603 case CHIP_NAVI12: 2604 chip_name = "navi12"; 2605 break; 2606 } 2607 2608 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, 2609 AMDGPU_UCODE_OPTIONAL, 2610 "amdgpu/%s_gpu_info.bin", chip_name); 2611 if (err) { 2612 dev_err(adev->dev, 2613 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n", 2614 chip_name); 2615 goto out; 2616 } 2617 2618 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2619 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2620 2621 switch (hdr->version_major) { 2622 case 1: 2623 { 2624 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2625 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2626 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2627 2628 /* 2629 * Should be dropped when DAL no longer needs it. 2630 */ 2631 if (adev->asic_type == CHIP_NAVI12) 2632 goto parse_soc_bounding_box; 2633 2634 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2635 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2636 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2637 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2638 adev->gfx.config.max_texture_channel_caches = 2639 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2640 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2641 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2642 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2643 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2644 adev->gfx.config.double_offchip_lds_buf = 2645 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2646 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2647 adev->gfx.cu_info.max_waves_per_simd = 2648 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2649 adev->gfx.cu_info.max_scratch_slots_per_cu = 2650 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2651 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2652 if (hdr->version_minor >= 1) { 2653 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2654 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2655 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2656 adev->gfx.config.num_sc_per_sh = 2657 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2658 adev->gfx.config.num_packer_per_sc = 2659 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2660 } 2661 2662 parse_soc_bounding_box: 2663 /* 2664 * soc bounding box info is not integrated in disocovery table, 2665 * we always need to parse it from gpu info firmware if needed. 
2666 */ 2667 if (hdr->version_minor == 2) { 2668 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2669 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2670 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2671 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2672 } 2673 break; 2674 } 2675 default: 2676 dev_err(adev->dev, 2677 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2678 err = -EINVAL; 2679 goto out; 2680 } 2681 out: 2682 return err; 2683 } 2684 2685 /** 2686 * amdgpu_device_ip_early_init - run early init for hardware IPs 2687 * 2688 * @adev: amdgpu_device pointer 2689 * 2690 * Early initialization pass for hardware IPs. The hardware IPs that make 2691 * up each asic are discovered each IP's early_init callback is run. This 2692 * is the first stage in initializing the asic. 2693 * Returns 0 on success, negative error code on failure. 2694 */ 2695 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2696 { 2697 struct amdgpu_ip_block *ip_block; 2698 struct pci_dev *parent; 2699 bool total, skip_bios; 2700 uint32_t bios_flags; 2701 int i, r; 2702 2703 amdgpu_device_enable_virtual_display(adev); 2704 2705 if (amdgpu_sriov_vf(adev)) { 2706 r = amdgpu_virt_request_full_gpu(adev, true); 2707 if (r) 2708 return r; 2709 } 2710 2711 switch (adev->asic_type) { 2712 #ifdef CONFIG_DRM_AMDGPU_SI 2713 case CHIP_VERDE: 2714 case CHIP_TAHITI: 2715 case CHIP_PITCAIRN: 2716 case CHIP_OLAND: 2717 case CHIP_HAINAN: 2718 adev->family = AMDGPU_FAMILY_SI; 2719 r = si_set_ip_blocks(adev); 2720 if (r) 2721 return r; 2722 break; 2723 #endif 2724 #ifdef CONFIG_DRM_AMDGPU_CIK 2725 case CHIP_BONAIRE: 2726 case CHIP_HAWAII: 2727 case CHIP_KAVERI: 2728 case CHIP_KABINI: 2729 case CHIP_MULLINS: 2730 if (adev->flags & AMD_IS_APU) 2731 adev->family = AMDGPU_FAMILY_KV; 2732 else 2733 adev->family = AMDGPU_FAMILY_CI; 2734 2735 r = cik_set_ip_blocks(adev); 2736 if (r) 2737 return r; 2738 break; 2739 #endif 2740 case CHIP_TOPAZ: 2741 case CHIP_TONGA: 2742 case CHIP_FIJI: 2743 case CHIP_POLARIS10: 2744 case CHIP_POLARIS11: 2745 case CHIP_POLARIS12: 2746 case CHIP_VEGAM: 2747 case CHIP_CARRIZO: 2748 case CHIP_STONEY: 2749 if (adev->flags & AMD_IS_APU) 2750 adev->family = AMDGPU_FAMILY_CZ; 2751 else 2752 adev->family = AMDGPU_FAMILY_VI; 2753 2754 r = vi_set_ip_blocks(adev); 2755 if (r) 2756 return r; 2757 break; 2758 default: 2759 r = amdgpu_discovery_set_ip_blocks(adev); 2760 if (r) 2761 return r; 2762 break; 2763 } 2764 2765 /* Check for IP version 9.4.3 with A0 hardware */ 2766 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) && 2767 !amdgpu_device_get_rev_id(adev)) { 2768 dev_err(adev->dev, "Unsupported A0 hardware\n"); 2769 return -ENODEV; /* device unsupported - no device error */ 2770 } 2771 2772 if (amdgpu_has_atpx() && 2773 (amdgpu_is_atpx_hybrid() || 2774 amdgpu_has_atpx_dgpu_power_cntl()) && 2775 ((adev->flags & AMD_IS_APU) == 0) && 2776 !dev_is_removable(&adev->pdev->dev)) 2777 adev->flags |= AMD_IS_PX; 2778 2779 if (!(adev->flags & AMD_IS_APU)) { 2780 parent = pcie_find_root_port(adev->pdev); 2781 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2782 } 2783 2784 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2785 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2786 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2787 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2788 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2789 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2790 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2791 2792 adev->virt.is_xgmi_node_migrate_enabled = false; 2793 if (amdgpu_sriov_vf(adev)) { 2794 adev->virt.is_xgmi_node_migrate_enabled = 2795 amdgpu_ip_version((adev), GC_HWIP, 0) == IP_VERSION(9, 4, 4); 2796 } 2797 2798 total = true; 2799 for (i = 0; i < adev->num_ip_blocks; i++) { 2800 ip_block = &adev->ip_blocks[i]; 2801 2802 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2803 dev_warn(adev->dev, "disabled ip block: %d <%s>\n", i, 2804 adev->ip_blocks[i].version->funcs->name); 2805 adev->ip_blocks[i].status.valid = false; 2806 } else if (ip_block->version->funcs->early_init) { 2807 r = ip_block->version->funcs->early_init(ip_block); 2808 if (r == -ENOENT) { 2809 adev->ip_blocks[i].status.valid = false; 2810 } else if (r) { 2811 dev_err(adev->dev, 2812 "early_init of IP block <%s> failed %d\n", 2813 adev->ip_blocks[i].version->funcs->name, 2814 r); 2815 total = false; 2816 } else { 2817 adev->ip_blocks[i].status.valid = true; 2818 } 2819 } else { 2820 adev->ip_blocks[i].status.valid = true; 2821 } 2822 /* get the vbios after the asic_funcs are set up */ 2823 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2824 r = amdgpu_device_parse_gpu_info_fw(adev); 2825 if (r) 2826 return r; 2827 2828 bios_flags = amdgpu_device_get_vbios_flags(adev); 2829 skip_bios = !!(bios_flags & AMDGPU_VBIOS_SKIP); 2830 /* Read BIOS */ 2831 if (!skip_bios) { 2832 bool optional = 2833 !!(bios_flags & AMDGPU_VBIOS_OPTIONAL); 2834 if (!amdgpu_get_bios(adev) && !optional) 2835 return -EINVAL; 2836 2837 if (optional && !adev->bios) 2838 dev_info( 2839 adev->dev, 2840 "VBIOS image optional, proceeding without VBIOS image"); 2841 2842 if (adev->bios) { 2843 r = amdgpu_atombios_init(adev); 2844 if (r) { 2845 dev_err(adev->dev, 2846 "amdgpu_atombios_init failed\n"); 2847 amdgpu_vf_error_put( 2848 adev, 2849 AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 2850 0, 0); 2851 return r; 2852 } 2853 } 2854 } 2855 2856 /*get pf2vf msg info at it's earliest time*/ 2857 if (amdgpu_sriov_vf(adev)) 2858 amdgpu_virt_init_data_exchange(adev); 2859 2860 } 2861 } 2862 if (!total) 2863 return -ENODEV; 2864 2865 if (adev->gmc.xgmi.supported) 2866 amdgpu_xgmi_early_init(adev); 2867 2868 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 2869 if (ip_block->status.valid != false) 2870 amdgpu_amdkfd_device_probe(adev); 2871 2872 adev->cg_flags &= amdgpu_cg_mask; 2873 adev->pg_flags &= amdgpu_pg_mask; 2874 2875 return 0; 2876 } 2877 2878 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2879 { 2880 int i, r; 2881 2882 for (i = 0; i < adev->num_ip_blocks; i++) { 2883 if (!adev->ip_blocks[i].status.sw) 2884 continue; 2885 if (adev->ip_blocks[i].status.hw) 2886 continue; 2887 if (!amdgpu_ip_member_of_hwini( 2888 adev, adev->ip_blocks[i].version->type)) 2889 continue; 2890 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2891 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2892 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2893 r = 
adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2894 if (r) { 2895 dev_err(adev->dev, 2896 "hw_init of IP block <%s> failed %d\n", 2897 adev->ip_blocks[i].version->funcs->name, 2898 r); 2899 return r; 2900 } 2901 adev->ip_blocks[i].status.hw = true; 2902 } 2903 } 2904 2905 return 0; 2906 } 2907 2908 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2909 { 2910 int i, r; 2911 2912 for (i = 0; i < adev->num_ip_blocks; i++) { 2913 if (!adev->ip_blocks[i].status.sw) 2914 continue; 2915 if (adev->ip_blocks[i].status.hw) 2916 continue; 2917 if (!amdgpu_ip_member_of_hwini( 2918 adev, adev->ip_blocks[i].version->type)) 2919 continue; 2920 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2921 if (r) { 2922 dev_err(adev->dev, 2923 "hw_init of IP block <%s> failed %d\n", 2924 adev->ip_blocks[i].version->funcs->name, r); 2925 return r; 2926 } 2927 adev->ip_blocks[i].status.hw = true; 2928 } 2929 2930 return 0; 2931 } 2932 2933 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2934 { 2935 int r = 0; 2936 int i; 2937 uint32_t smu_version; 2938 2939 if (adev->asic_type >= CHIP_VEGA10) { 2940 for (i = 0; i < adev->num_ip_blocks; i++) { 2941 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2942 continue; 2943 2944 if (!amdgpu_ip_member_of_hwini(adev, 2945 AMD_IP_BLOCK_TYPE_PSP)) 2946 break; 2947 2948 if (!adev->ip_blocks[i].status.sw) 2949 continue; 2950 2951 /* no need to do the fw loading again if already done*/ 2952 if (adev->ip_blocks[i].status.hw == true) 2953 break; 2954 2955 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2956 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 2957 if (r) 2958 return r; 2959 } else { 2960 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2961 if (r) { 2962 dev_err(adev->dev, 2963 "hw_init of IP block <%s> failed %d\n", 2964 adev->ip_blocks[i] 2965 .version->funcs->name, 2966 r); 2967 return r; 2968 } 2969 adev->ip_blocks[i].status.hw = true; 2970 } 2971 break; 2972 } 2973 } 2974 2975 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2976 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2977 2978 return r; 2979 } 2980 2981 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2982 { 2983 struct drm_sched_init_args args = { 2984 .ops = &amdgpu_sched_ops, 2985 .num_rqs = DRM_SCHED_PRIORITY_COUNT, 2986 .timeout_wq = adev->reset_domain->wq, 2987 .dev = adev->dev, 2988 }; 2989 long timeout; 2990 int r, i; 2991 2992 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2993 struct amdgpu_ring *ring = adev->rings[i]; 2994 2995 /* No need to setup the GPU scheduler for rings that don't need it */ 2996 if (!ring || ring->no_scheduler) 2997 continue; 2998 2999 switch (ring->funcs->type) { 3000 case AMDGPU_RING_TYPE_GFX: 3001 timeout = adev->gfx_timeout; 3002 break; 3003 case AMDGPU_RING_TYPE_COMPUTE: 3004 timeout = adev->compute_timeout; 3005 break; 3006 case AMDGPU_RING_TYPE_SDMA: 3007 timeout = adev->sdma_timeout; 3008 break; 3009 default: 3010 timeout = adev->video_timeout; 3011 break; 3012 } 3013 3014 args.timeout = timeout; 3015 args.credit_limit = ring->num_hw_submission; 3016 args.score = ring->sched_score; 3017 args.name = ring->name; 3018 3019 r = drm_sched_init(&ring->sched, &args); 3020 if (r) { 3021 dev_err(adev->dev, 3022 "Failed to create scheduler on ring %s.\n", 3023 ring->name); 3024 return r; 3025 } 3026 r = amdgpu_uvd_entity_init(adev, ring); 3027 if (r) { 3028 dev_err(adev->dev, 3029 "Failed to create UVD scheduling entity on ring %s.\n", 
3030 ring->name); 3031 return r; 3032 } 3033 r = amdgpu_vce_entity_init(adev, ring); 3034 if (r) { 3035 dev_err(adev->dev, 3036 "Failed to create VCE scheduling entity on ring %s.\n", 3037 ring->name); 3038 return r; 3039 } 3040 } 3041 3042 if (adev->xcp_mgr) 3043 amdgpu_xcp_update_partition_sched_list(adev); 3044 3045 return 0; 3046 } 3047 3048 3049 /** 3050 * amdgpu_device_ip_init - run init for hardware IPs 3051 * 3052 * @adev: amdgpu_device pointer 3053 * 3054 * Main initialization pass for hardware IPs. The list of all the hardware 3055 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 3056 * are run. sw_init initializes the software state associated with each IP 3057 * and hw_init initializes the hardware associated with each IP. 3058 * Returns 0 on success, negative error code on failure. 3059 */ 3060 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 3061 { 3062 bool init_badpage; 3063 int i, r; 3064 3065 r = amdgpu_ras_init(adev); 3066 if (r) 3067 return r; 3068 3069 for (i = 0; i < adev->num_ip_blocks; i++) { 3070 if (!adev->ip_blocks[i].status.valid) 3071 continue; 3072 if (adev->ip_blocks[i].version->funcs->sw_init) { 3073 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]); 3074 if (r) { 3075 dev_err(adev->dev, 3076 "sw_init of IP block <%s> failed %d\n", 3077 adev->ip_blocks[i].version->funcs->name, 3078 r); 3079 goto init_failed; 3080 } 3081 } 3082 adev->ip_blocks[i].status.sw = true; 3083 3084 if (!amdgpu_ip_member_of_hwini( 3085 adev, adev->ip_blocks[i].version->type)) 3086 continue; 3087 3088 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 3089 /* need to do common hw init early so everything is set up for gmc */ 3090 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3091 if (r) { 3092 dev_err(adev->dev, "hw_init %d failed %d\n", i, 3093 r); 3094 goto init_failed; 3095 } 3096 adev->ip_blocks[i].status.hw = true; 3097 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3098 /* need to do gmc hw init early so we can allocate gpu mem */ 3099 /* Try to reserve bad pages early */ 3100 if (amdgpu_sriov_vf(adev)) 3101 amdgpu_virt_exchange_data(adev); 3102 3103 r = amdgpu_device_mem_scratch_init(adev); 3104 if (r) { 3105 dev_err(adev->dev, 3106 "amdgpu_mem_scratch_init failed %d\n", 3107 r); 3108 goto init_failed; 3109 } 3110 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3111 if (r) { 3112 dev_err(adev->dev, "hw_init %d failed %d\n", i, 3113 r); 3114 goto init_failed; 3115 } 3116 r = amdgpu_device_wb_init(adev); 3117 if (r) { 3118 dev_err(adev->dev, 3119 "amdgpu_device_wb_init failed %d\n", r); 3120 goto init_failed; 3121 } 3122 adev->ip_blocks[i].status.hw = true; 3123 3124 /* right after GMC hw init, we create CSA */ 3125 if (adev->gfx.mcbp) { 3126 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 3127 AMDGPU_GEM_DOMAIN_VRAM | 3128 AMDGPU_GEM_DOMAIN_GTT, 3129 AMDGPU_CSA_SIZE); 3130 if (r) { 3131 dev_err(adev->dev, 3132 "allocate CSA failed %d\n", r); 3133 goto init_failed; 3134 } 3135 } 3136 3137 r = amdgpu_seq64_init(adev); 3138 if (r) { 3139 dev_err(adev->dev, "allocate seq64 failed %d\n", 3140 r); 3141 goto init_failed; 3142 } 3143 } 3144 } 3145 3146 if (amdgpu_sriov_vf(adev)) 3147 amdgpu_virt_init_data_exchange(adev); 3148 3149 r = amdgpu_ib_pool_init(adev); 3150 if (r) { 3151 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 3152 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 3153 goto init_failed; 3154 } 
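/* With the software state in place, create the ucode BO and bring the hardware up in stages: phase 1 initializes COMMON and IH (plus PSP when running on SR-IOV), the required firmware is then loaded, and phase 2 initializes the remaining blocks. */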
3155 3156 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete */ 3157 if (r) 3158 goto init_failed; 3159 3160 r = amdgpu_device_ip_hw_init_phase1(adev); 3161 if (r) 3162 goto init_failed; 3163 3164 r = amdgpu_device_fw_loading(adev); 3165 if (r) 3166 goto init_failed; 3167 3168 r = amdgpu_device_ip_hw_init_phase2(adev); 3169 if (r) 3170 goto init_failed; 3171 3172 /* 3173 * Retired pages will be loaded from eeprom and reserved here; 3174 * this must be called after amdgpu_device_ip_hw_init_phase2 since 3175 * for some ASICs the RAS EEPROM code relies on the SMU being fully 3176 * functional for I2C communication, which is only true at this point. 3177 * 3178 * amdgpu_ras_recovery_init may fail, but the caller only cares about 3179 * failures caused by a bad gpu situation and stops the amdgpu init process 3180 * accordingly. For other failures, it still releases all 3181 * the resources and prints an error message, rather than returning a 3182 * negative value to the upper level. 3183 * 3184 * Note: theoretically, this should be called before all vram allocations 3185 * to protect retired pages from being reused. 3186 */ 3187 init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 3188 r = amdgpu_ras_recovery_init(adev, init_badpage); 3189 if (r) 3190 goto init_failed; 3191 3192 /* 3193 * In case of XGMI, grab an extra reference on the reset domain for this device 3194 */ 3195 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3196 if (amdgpu_xgmi_add_device(adev) == 0) { 3197 if (!amdgpu_sriov_vf(adev)) { 3198 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3199 3200 if (WARN_ON(!hive)) { 3201 r = -ENOENT; 3202 goto init_failed; 3203 } 3204 3205 if (!hive->reset_domain || 3206 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 3207 r = -ENOENT; 3208 amdgpu_put_xgmi_hive(hive); 3209 goto init_failed; 3210 } 3211 3212 /* Drop the early temporary reset domain we created for device */ 3213 amdgpu_reset_put_reset_domain(adev->reset_domain); 3214 adev->reset_domain = hive->reset_domain; 3215 amdgpu_put_xgmi_hive(hive); 3216 } 3217 } 3218 } 3219 3220 r = amdgpu_device_init_schedulers(adev); 3221 if (r) 3222 goto init_failed; 3223 3224 if (adev->mman.buffer_funcs_ring->sched.ready) 3225 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3226 3227 /* Don't init kfd if the whole hive needs to be reset during init */ 3228 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 3229 kgd2kfd_init_zone_device(adev); 3230 amdgpu_amdkfd_device_init(adev); 3231 } 3232 3233 amdgpu_fru_get_product_info(adev); 3234 3235 if (!amdgpu_sriov_vf(adev) || amdgpu_sriov_ras_cper_en(adev)) 3236 r = amdgpu_cper_init(adev); 3237 3238 init_failed: 3239 3240 return r; 3241 } 3242 3243 /** 3244 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 3245 * 3246 * @adev: amdgpu_device pointer 3247 * 3248 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 3249 * this function before a GPU reset. If the value is retained after a 3250 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 3251 */ 3252 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 3253 { 3254 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 3255 } 3256 3257 /** 3258 * amdgpu_device_check_vram_lost - check if vram is valid 3259 * 3260 * @adev: amdgpu_device pointer 3261 * 3262 * Checks the reset magic value written to the gart pointer in VRAM. 3263 * The driver calls this after a GPU reset to see if the contents of 3264 * VRAM are lost or not.
3265 * returns true if vram is lost, false if not. 3266 */ 3267 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 3268 { 3269 if (memcmp(adev->gart.ptr, adev->reset_magic, 3270 AMDGPU_RESET_MAGIC_NUM)) 3271 return true; 3272 3273 if (!amdgpu_in_reset(adev)) 3274 return false; 3275 3276 /* 3277 * For all ASICs with baco/mode1 reset, the VRAM is 3278 * always assumed to be lost. 3279 */ 3280 switch (amdgpu_asic_reset_method(adev)) { 3281 case AMD_RESET_METHOD_LINK: 3282 case AMD_RESET_METHOD_BACO: 3283 case AMD_RESET_METHOD_MODE1: 3284 return true; 3285 default: 3286 return false; 3287 } 3288 } 3289 3290 /** 3291 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 3292 * 3293 * @adev: amdgpu_device pointer 3294 * @state: clockgating state (gate or ungate) 3295 * 3296 * The list of all the hardware IPs that make up the asic is walked and the 3297 * set_clockgating_state callbacks are run. 3298 * Late initialization pass enabling clockgating for hardware IPs. 3299 * Fini or suspend, pass disabling clockgating for hardware IPs. 3300 * Returns 0 on success, negative error code on failure. 3301 */ 3302 3303 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 3304 enum amd_clockgating_state state) 3305 { 3306 int i, j, r; 3307 3308 if (amdgpu_emu_mode == 1) 3309 return 0; 3310 3311 for (j = 0; j < adev->num_ip_blocks; j++) { 3312 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3313 if (!adev->ip_blocks[i].status.late_initialized) 3314 continue; 3315 /* skip CG for GFX, SDMA on S0ix */ 3316 if (adev->in_s0ix && 3317 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3318 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3319 continue; 3320 /* skip CG for VCE/UVD, it's handled specially */ 3321 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3322 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3323 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3324 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3325 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 3326 /* enable clockgating to save power */ 3327 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i], 3328 state); 3329 if (r) { 3330 dev_err(adev->dev, 3331 "set_clockgating_state(gate) of IP block <%s> failed %d\n", 3332 adev->ip_blocks[i].version->funcs->name, 3333 r); 3334 return r; 3335 } 3336 } 3337 } 3338 3339 return 0; 3340 } 3341 3342 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 3343 enum amd_powergating_state state) 3344 { 3345 int i, j, r; 3346 3347 if (amdgpu_emu_mode == 1) 3348 return 0; 3349 3350 for (j = 0; j < adev->num_ip_blocks; j++) { 3351 i = state == AMD_PG_STATE_GATE ? 
j : adev->num_ip_blocks - j - 1; 3352 if (!adev->ip_blocks[i].status.late_initialized) 3353 continue; 3354 /* skip PG for GFX, SDMA on S0ix */ 3355 if (adev->in_s0ix && 3356 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3357 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3358 continue; 3359 /* skip PG for VCE/UVD, it's handled specially */ 3360 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3361 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3362 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3363 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3364 adev->ip_blocks[i].version->funcs->set_powergating_state) { 3365 /* enable powergating to save power */ 3366 r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i], 3367 state); 3368 if (r) { 3369 dev_err(adev->dev, 3370 "set_powergating_state(gate) of IP block <%s> failed %d\n", 3371 adev->ip_blocks[i].version->funcs->name, 3372 r); 3373 return r; 3374 } 3375 } 3376 } 3377 return 0; 3378 } 3379 3380 static int amdgpu_device_enable_mgpu_fan_boost(void) 3381 { 3382 struct amdgpu_gpu_instance *gpu_ins; 3383 struct amdgpu_device *adev; 3384 int i, ret = 0; 3385 3386 mutex_lock(&mgpu_info.mutex); 3387 3388 /* 3389 * MGPU fan boost feature should be enabled 3390 * only when there are two or more dGPUs in 3391 * the system 3392 */ 3393 if (mgpu_info.num_dgpu < 2) 3394 goto out; 3395 3396 for (i = 0; i < mgpu_info.num_dgpu; i++) { 3397 gpu_ins = &(mgpu_info.gpu_ins[i]); 3398 adev = gpu_ins->adev; 3399 if (!(adev->flags & AMD_IS_APU) && 3400 !gpu_ins->mgpu_fan_enabled) { 3401 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 3402 if (ret) 3403 break; 3404 3405 gpu_ins->mgpu_fan_enabled = 1; 3406 } 3407 } 3408 3409 out: 3410 mutex_unlock(&mgpu_info.mutex); 3411 3412 return ret; 3413 } 3414 3415 /** 3416 * amdgpu_device_ip_late_init - run late init for hardware IPs 3417 * 3418 * @adev: amdgpu_device pointer 3419 * 3420 * Late initialization pass for hardware IPs. The list of all the hardware 3421 * IPs that make up the asic is walked and the late_init callbacks are run. 3422 * late_init covers any special initialization that an IP requires 3423 * after all of them have been initialized or something that needs to happen 3424 * late in the init process. 3425 * Returns 0 on success, negative error code on failure.
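 * This is also the point where clockgating and powergating are enabled, the reset magic is written, and MGPU fan boost and XGMI p-state handling are set up.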
3426 */ 3427 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 3428 { 3429 struct amdgpu_gpu_instance *gpu_instance; 3430 int i = 0, r; 3431 3432 for (i = 0; i < adev->num_ip_blocks; i++) { 3433 if (!adev->ip_blocks[i].status.hw) 3434 continue; 3435 if (adev->ip_blocks[i].version->funcs->late_init) { 3436 r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]); 3437 if (r) { 3438 dev_err(adev->dev, 3439 "late_init of IP block <%s> failed %d\n", 3440 adev->ip_blocks[i].version->funcs->name, 3441 r); 3442 return r; 3443 } 3444 } 3445 adev->ip_blocks[i].status.late_initialized = true; 3446 } 3447 3448 r = amdgpu_ras_late_init(adev); 3449 if (r) { 3450 dev_err(adev->dev, "amdgpu_ras_late_init failed %d", r); 3451 return r; 3452 } 3453 3454 if (!amdgpu_reset_in_recovery(adev)) 3455 amdgpu_ras_set_error_query_ready(adev, true); 3456 3457 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3458 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3459 3460 amdgpu_device_fill_reset_magic(adev); 3461 3462 r = amdgpu_device_enable_mgpu_fan_boost(); 3463 if (r) 3464 dev_err(adev->dev, "enable mgpu fan boost failed (%d).\n", r); 3465 3466 /* For passthrough configuration on arcturus and aldebaran, enable special handling for SBR */ 3467 if (amdgpu_passthrough(adev) && 3468 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 3469 adev->asic_type == CHIP_ALDEBARAN)) 3470 amdgpu_dpm_handle_passthrough_sbr(adev, true); 3471 3472 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3473 mutex_lock(&mgpu_info.mutex); 3474 3475 /* 3476 * Reset the device p-state to low, as it was booted with high. 3477 * 3478 * This should be performed only after all devices from the same 3479 * hive get initialized. 3480 * 3481 * However, we don't know in advance how many devices are in the hive, 3482 * as they are counted one by one as each device initializes. 3483 * 3484 * So we wait until all XGMI-interlinked devices are initialized. 3485 * This may add some delay, as those devices may come from 3486 * different hives. But that should be OK.
3487 */ 3488 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3489 for (i = 0; i < mgpu_info.num_gpu; i++) { 3490 gpu_instance = &(mgpu_info.gpu_ins[i]); 3491 if (gpu_instance->adev->flags & AMD_IS_APU) 3492 continue; 3493 3494 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3495 AMDGPU_XGMI_PSTATE_MIN); 3496 if (r) { 3497 dev_err(adev->dev, 3498 "pstate setting failed (%d).\n", 3499 r); 3500 break; 3501 } 3502 } 3503 } 3504 3505 mutex_unlock(&mgpu_info.mutex); 3506 } 3507 3508 return 0; 3509 } 3510 3511 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block) 3512 { 3513 struct amdgpu_device *adev = ip_block->adev; 3514 int r; 3515 3516 if (!ip_block->version->funcs->hw_fini) { 3517 dev_err(adev->dev, "hw_fini of IP block <%s> not defined\n", 3518 ip_block->version->funcs->name); 3519 } else { 3520 r = ip_block->version->funcs->hw_fini(ip_block); 3521 /* XXX handle errors */ 3522 if (r) { 3523 dev_dbg(adev->dev, 3524 "hw_fini of IP block <%s> failed %d\n", 3525 ip_block->version->funcs->name, r); 3526 } 3527 } 3528 3529 ip_block->status.hw = false; 3530 } 3531 3532 /** 3533 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3534 * 3535 * @adev: amdgpu_device pointer 3536 * 3537 * For ASICs need to disable SMC first 3538 */ 3539 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3540 { 3541 int i; 3542 3543 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3544 return; 3545 3546 for (i = 0; i < adev->num_ip_blocks; i++) { 3547 if (!adev->ip_blocks[i].status.hw) 3548 continue; 3549 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3550 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3551 break; 3552 } 3553 } 3554 } 3555 3556 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3557 { 3558 int i, r; 3559 3560 for (i = 0; i < adev->num_ip_blocks; i++) { 3561 if (!adev->ip_blocks[i].version->funcs->early_fini) 3562 continue; 3563 3564 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]); 3565 if (r) { 3566 dev_dbg(adev->dev, 3567 "early_fini of IP block <%s> failed %d\n", 3568 adev->ip_blocks[i].version->funcs->name, r); 3569 } 3570 } 3571 3572 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3573 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3574 3575 amdgpu_amdkfd_suspend(adev, true); 3576 amdgpu_userq_suspend(adev); 3577 3578 /* Workaround for ASICs need to disable SMC first */ 3579 amdgpu_device_smu_fini_early(adev); 3580 3581 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3582 if (!adev->ip_blocks[i].status.hw) 3583 continue; 3584 3585 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3586 } 3587 3588 if (amdgpu_sriov_vf(adev)) { 3589 if (amdgpu_virt_release_full_gpu(adev, false)) 3590 dev_err(adev->dev, 3591 "failed to release exclusive mode on fini\n"); 3592 } 3593 3594 return 0; 3595 } 3596 3597 /** 3598 * amdgpu_device_ip_fini - run fini for hardware IPs 3599 * 3600 * @adev: amdgpu_device pointer 3601 * 3602 * Main teardown pass for hardware IPs. The list of all the hardware 3603 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3604 * are run. hw_fini tears down the hardware associated with each IP 3605 * and sw_fini tears down any software state associated with each IP. 3606 * Returns 0 on success, negative error code on failure. 
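 * Blocks are torn down in the reverse order of their initialization.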
3607 */ 3608 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3609 { 3610 int i, r; 3611 3612 amdgpu_cper_fini(adev); 3613 3614 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3615 amdgpu_virt_release_ras_err_handler_data(adev); 3616 3617 if (adev->gmc.xgmi.num_physical_nodes > 1) 3618 amdgpu_xgmi_remove_device(adev); 3619 3620 amdgpu_amdkfd_device_fini_sw(adev); 3621 3622 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3623 if (!adev->ip_blocks[i].status.sw) 3624 continue; 3625 3626 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3627 amdgpu_ucode_free_bo(adev); 3628 amdgpu_free_static_csa(&adev->virt.csa_obj); 3629 amdgpu_device_wb_fini(adev); 3630 amdgpu_device_mem_scratch_fini(adev); 3631 amdgpu_ib_pool_fini(adev); 3632 amdgpu_seq64_fini(adev); 3633 amdgpu_doorbell_fini(adev); 3634 } 3635 if (adev->ip_blocks[i].version->funcs->sw_fini) { 3636 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]); 3637 /* XXX handle errors */ 3638 if (r) { 3639 dev_dbg(adev->dev, 3640 "sw_fini of IP block <%s> failed %d\n", 3641 adev->ip_blocks[i].version->funcs->name, 3642 r); 3643 } 3644 } 3645 adev->ip_blocks[i].status.sw = false; 3646 adev->ip_blocks[i].status.valid = false; 3647 } 3648 3649 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3650 if (!adev->ip_blocks[i].status.late_initialized) 3651 continue; 3652 if (adev->ip_blocks[i].version->funcs->late_fini) 3653 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]); 3654 adev->ip_blocks[i].status.late_initialized = false; 3655 } 3656 3657 amdgpu_ras_fini(adev); 3658 3659 return 0; 3660 } 3661 3662 /** 3663 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3664 * 3665 * @work: work_struct. 3666 */ 3667 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3668 { 3669 struct amdgpu_device *adev = 3670 container_of(work, struct amdgpu_device, delayed_init_work.work); 3671 int r; 3672 3673 r = amdgpu_ib_ring_tests(adev); 3674 if (r) 3675 dev_err(adev->dev, "ib ring test failed (%d).\n", r); 3676 } 3677 3678 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3679 { 3680 struct amdgpu_device *adev = 3681 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3682 3683 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3684 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3685 3686 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true, 0)) 3687 adev->gfx.gfx_off_state = true; 3688 } 3689 3690 /** 3691 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3692 * 3693 * @adev: amdgpu_device pointer 3694 * 3695 * Main suspend function for hardware IPs. The list of all the hardware 3696 * IPs that make up the asic is walked, clockgating is disabled and the 3697 * suspend callbacks are run. suspend puts the hardware and software state 3698 * in each IP into a state suitable for suspend. 3699 * Returns 0 on success, negative error code on failure. 3700 */ 3701 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3702 { 3703 int i, r; 3704 3705 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3706 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3707 3708 /* 3709 * Per PMFW team's suggestion, driver needs to handle gfxoff 3710 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3711 * scenario. Add the missing df cstate disablement here. 
3712 */ 3713 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3714 dev_warn(adev->dev, "Failed to disallow df cstate"); 3715 3716 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3717 if (!adev->ip_blocks[i].status.valid) 3718 continue; 3719 3720 /* displays are handled separately */ 3721 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3722 continue; 3723 3724 /* XXX handle errors */ 3725 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3726 if (r) 3727 return r; 3728 } 3729 3730 return 0; 3731 } 3732 3733 /** 3734 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3735 * 3736 * @adev: amdgpu_device pointer 3737 * 3738 * Main suspend function for hardware IPs. The list of all the hardware 3739 * IPs that make up the asic is walked, clockgating is disabled and the 3740 * suspend callbacks are run. suspend puts the hardware and software state 3741 * in each IP into a state suitable for suspend. 3742 * Returns 0 on success, negative error code on failure. 3743 */ 3744 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3745 { 3746 int i, r; 3747 3748 if (adev->in_s0ix) 3749 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3750 3751 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3752 if (!adev->ip_blocks[i].status.valid) 3753 continue; 3754 /* displays are handled in phase1 */ 3755 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3756 continue; 3757 /* PSP lost connection when err_event_athub occurs */ 3758 if (amdgpu_ras_intr_triggered() && 3759 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3760 adev->ip_blocks[i].status.hw = false; 3761 continue; 3762 } 3763 3764 /* skip unnecessary suspend if we do not initialize them yet */ 3765 if (!amdgpu_ip_member_of_hwini( 3766 adev, adev->ip_blocks[i].version->type)) 3767 continue; 3768 3769 /* Since we skip suspend for S0i3, we need to cancel the delayed 3770 * idle work here as the suspend callback never gets called. 3771 */ 3772 if (adev->in_s0ix && 3773 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX && 3774 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 0, 0)) 3775 cancel_delayed_work_sync(&adev->gfx.idle_work); 3776 /* skip suspend of gfx/mes and psp for S0ix 3777 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3778 * like at runtime. PSP is also part of the always on hardware 3779 * so no need to suspend it. 3780 */ 3781 if (adev->in_s0ix && 3782 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3783 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3784 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3785 continue; 3786 3787 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3788 if (adev->in_s0ix && 3789 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3790 IP_VERSION(5, 0, 0)) && 3791 (adev->ip_blocks[i].version->type == 3792 AMD_IP_BLOCK_TYPE_SDMA)) 3793 continue; 3794 3795 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3796 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3797 * from this location and RLC Autoload automatically also gets loaded 3798 * from here based on PMFW -> PSP message during re-init sequence. 3799 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3800 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3801 */ 3802 if (amdgpu_in_reset(adev) && 3803 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3804 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3805 continue; 3806 3807 /* XXX handle errors */ 3808 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3809 adev->ip_blocks[i].status.hw = false; 3810 3811 /* handle putting the SMC in the appropriate state */ 3812 if (!amdgpu_sriov_vf(adev)) { 3813 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3814 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3815 if (r) { 3816 dev_err(adev->dev, 3817 "SMC failed to set mp1 state %d, %d\n", 3818 adev->mp1_state, r); 3819 return r; 3820 } 3821 } 3822 } 3823 } 3824 3825 return 0; 3826 } 3827 3828 /** 3829 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3830 * 3831 * @adev: amdgpu_device pointer 3832 * 3833 * Main suspend function for hardware IPs. The list of all the hardware 3834 * IPs that make up the asic is walked, clockgating is disabled and the 3835 * suspend callbacks are run. suspend puts the hardware and software state 3836 * in each IP into a state suitable for suspend. 3837 * Returns 0 on success, negative error code on failure. 3838 */ 3839 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3840 { 3841 int r; 3842 3843 if (amdgpu_sriov_vf(adev)) { 3844 amdgpu_virt_fini_data_exchange(adev); 3845 amdgpu_virt_request_full_gpu(adev, false); 3846 } 3847 3848 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3849 3850 r = amdgpu_device_ip_suspend_phase1(adev); 3851 if (r) 3852 return r; 3853 r = amdgpu_device_ip_suspend_phase2(adev); 3854 3855 if (amdgpu_sriov_vf(adev)) 3856 amdgpu_virt_release_full_gpu(adev, false); 3857 3858 return r; 3859 } 3860 3861 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3862 { 3863 int i, r; 3864 3865 static enum amd_ip_block_type ip_order[] = { 3866 AMD_IP_BLOCK_TYPE_COMMON, 3867 AMD_IP_BLOCK_TYPE_GMC, 3868 AMD_IP_BLOCK_TYPE_PSP, 3869 AMD_IP_BLOCK_TYPE_IH, 3870 }; 3871 3872 for (i = 0; i < adev->num_ip_blocks; i++) { 3873 int j; 3874 struct amdgpu_ip_block *block; 3875 3876 block = &adev->ip_blocks[i]; 3877 block->status.hw = false; 3878 3879 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3880 3881 if (block->version->type != ip_order[j] || 3882 !block->status.valid) 3883 continue; 3884 3885 r = block->version->funcs->hw_init(&adev->ip_blocks[i]); 3886 if (r) { 3887 dev_err(adev->dev, "RE-INIT-early: %s failed\n", 3888 block->version->funcs->name); 3889 return r; 3890 } 3891 block->status.hw = true; 3892 } 3893 } 3894 3895 return 0; 3896 } 3897 3898 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3899 { 3900 struct amdgpu_ip_block *block; 3901 int i, r = 0; 3902 3903 static enum amd_ip_block_type ip_order[] = { 3904 AMD_IP_BLOCK_TYPE_SMC, 3905 AMD_IP_BLOCK_TYPE_DCE, 3906 AMD_IP_BLOCK_TYPE_GFX, 3907 AMD_IP_BLOCK_TYPE_SDMA, 3908 AMD_IP_BLOCK_TYPE_MES, 3909 AMD_IP_BLOCK_TYPE_UVD, 3910 AMD_IP_BLOCK_TYPE_VCE, 3911 AMD_IP_BLOCK_TYPE_VCN, 3912 AMD_IP_BLOCK_TYPE_JPEG 3913 }; 3914 3915 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3916 block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]); 3917 3918 if (!block) 3919 continue; 3920 3921 if (block->status.valid && !block->status.hw) { 3922 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) { 3923 r = amdgpu_ip_block_resume(block); 3924 } else { 3925 r = block->version->funcs->hw_init(block); 3926 } 3927 3928 if (r) { 3929 dev_err(adev->dev, "RE-INIT-late: %s failed\n", 3930 block->version->funcs->name); 3931 break; 3932 } 
3933 block->status.hw = true; 3934 } 3935 } 3936 3937 return r; 3938 } 3939 3940 /** 3941 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3942 * 3943 * @adev: amdgpu_device pointer 3944 * 3945 * First resume function for hardware IPs. The list of all the hardware 3946 * IPs that make up the asic is walked and the resume callbacks are run for 3947 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3948 * after a suspend and updates the software state as necessary. This 3949 * function is also used for restoring the GPU after a GPU reset. 3950 * Returns 0 on success, negative error code on failure. 3951 */ 3952 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3953 { 3954 int i, r; 3955 3956 for (i = 0; i < adev->num_ip_blocks; i++) { 3957 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3958 continue; 3959 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3960 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3961 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3962 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3963 3964 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3965 if (r) 3966 return r; 3967 } 3968 } 3969 3970 return 0; 3971 } 3972 3973 /** 3974 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3975 * 3976 * @adev: amdgpu_device pointer 3977 * 3978 * Second resume function for hardware IPs. The list of all the hardware 3979 * IPs that make up the asic is walked and the resume callbacks are run for 3980 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3981 * functional state after a suspend and updates the software state as 3982 * necessary. This function is also used for restoring the GPU after a GPU 3983 * reset. 3984 * Returns 0 on success, negative error code on failure. 3985 */ 3986 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3987 { 3988 int i, r; 3989 3990 for (i = 0; i < adev->num_ip_blocks; i++) { 3991 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3992 continue; 3993 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3994 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3995 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3996 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE || 3997 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3998 continue; 3999 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 4000 if (r) 4001 return r; 4002 } 4003 4004 return 0; 4005 } 4006 4007 /** 4008 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs 4009 * 4010 * @adev: amdgpu_device pointer 4011 * 4012 * Third resume function for hardware IPs. The list of all the hardware 4013 * IPs that make up the asic is walked and the resume callbacks are run for 4014 * all DCE. resume puts the hardware into a functional state after a suspend 4015 * and updates the software state as necessary. This function is also used 4016 * for restoring the GPU after a GPU reset. 4017 * 4018 * Returns 0 on success, negative error code on failure. 
4019 */ 4020 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev) 4021 { 4022 int i, r; 4023 4024 for (i = 0; i < adev->num_ip_blocks; i++) { 4025 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 4026 continue; 4027 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) { 4028 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 4029 if (r) 4030 return r; 4031 } 4032 } 4033 4034 return 0; 4035 } 4036 4037 /** 4038 * amdgpu_device_ip_resume - run resume for hardware IPs 4039 * 4040 * @adev: amdgpu_device pointer 4041 * 4042 * Main resume function for hardware IPs. The hardware IPs 4043 * are split into multiple resume functions because they are 4044 * also used in recovering from a GPU reset and some additional 4045 * steps need to be taken between them. In this case (S3/S4) they are 4046 * run sequentially. 4047 * Returns 0 on success, negative error code on failure. 4048 */ 4049 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 4050 { 4051 int r; 4052 4053 r = amdgpu_device_ip_resume_phase1(adev); 4054 if (r) 4055 return r; 4056 4057 r = amdgpu_device_fw_loading(adev); 4058 if (r) 4059 return r; 4060 4061 r = amdgpu_device_ip_resume_phase2(adev); 4062 4063 if (adev->mman.buffer_funcs_ring->sched.ready) 4064 amdgpu_ttm_set_buffer_funcs_status(adev, true); 4065 4066 if (r) 4067 return r; 4068 4069 amdgpu_fence_driver_hw_init(adev); 4070 4071 r = amdgpu_device_ip_resume_phase3(adev); 4072 4073 return r; 4074 } 4075 4076 /** 4077 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 4078 * 4079 * @adev: amdgpu_device pointer 4080 * 4081 * Query the VBIOS data tables to determine if the board supports SR-IOV. 4082 */ 4083 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 4084 { 4085 if (amdgpu_sriov_vf(adev)) { 4086 if (adev->is_atom_fw) { 4087 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 4088 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 4089 } else { 4090 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 4091 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 4092 } 4093 4094 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 4095 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 4096 } 4097 } 4098 4099 /** 4100 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 4101 * 4102 * @pdev : pci device context 4103 * @asic_type: AMD asic type 4104 * 4105 * Check if there is DC (new modesetting infrastructure) support for an asic. 4106 * Returns true if DC has support, false if not. 4107 */ 4108 bool amdgpu_device_asic_has_dc_support(struct pci_dev *pdev, 4109 enum amd_asic_type asic_type) 4110 { 4111 switch (asic_type) { 4112 #ifdef CONFIG_DRM_AMDGPU_SI 4113 case CHIP_HAINAN: 4114 #endif 4115 case CHIP_TOPAZ: 4116 /* chips with no display hardware */ 4117 return false; 4118 #if defined(CONFIG_DRM_AMD_DC) 4119 case CHIP_TAHITI: 4120 case CHIP_PITCAIRN: 4121 case CHIP_VERDE: 4122 case CHIP_OLAND: 4123 /* 4124 * We have systems in the wild with these ASICs that require 4125 * LVDS and VGA support which is not supported with DC. 4126 * 4127 * Fallback to the non-DC driver here by default so as not to 4128 * cause regressions. 4129 */ 4130 #if defined(CONFIG_DRM_AMD_DC_SI) 4131 return amdgpu_dc > 0; 4132 #else 4133 return false; 4134 #endif 4135 case CHIP_BONAIRE: 4136 case CHIP_KAVERI: 4137 case CHIP_KABINI: 4138 case CHIP_MULLINS: 4139 /* 4140 * We have systems in the wild with these ASICs that require 4141 * VGA support which is not supported with DC.
4142 * 4143 * Fallback to the non-DC driver here by default so as not to 4144 * cause regressions. 4145 */ 4146 return amdgpu_dc > 0; 4147 default: 4148 return amdgpu_dc != 0; 4149 #else 4150 default: 4151 if (amdgpu_dc > 0) 4152 dev_info_once( 4153 &pdev->dev, 4154 "Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 4155 return false; 4156 #endif 4157 } 4158 } 4159 4160 /** 4161 * amdgpu_device_has_dc_support - check if dc is supported 4162 * 4163 * @adev: amdgpu_device pointer 4164 * 4165 * Returns true for supported, false for not supported 4166 */ 4167 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 4168 { 4169 if (adev->enable_virtual_display || 4170 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 4171 return false; 4172 4173 return amdgpu_device_asic_has_dc_support(adev->pdev, adev->asic_type); 4174 } 4175 4176 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 4177 { 4178 struct amdgpu_device *adev = 4179 container_of(__work, struct amdgpu_device, xgmi_reset_work); 4180 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 4181 4182 /* It's a bug to not have a hive within this function */ 4183 if (WARN_ON(!hive)) 4184 return; 4185 4186 /* 4187 * Use task barrier to synchronize all xgmi reset works across the 4188 * hive. task_barrier_enter and task_barrier_exit will block 4189 * until all the threads running the xgmi reset works reach 4190 * those points. task_barrier_full will do both blocks. 4191 */ 4192 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 4193 4194 task_barrier_enter(&hive->tb); 4195 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 4196 4197 if (adev->asic_reset_res) 4198 goto fail; 4199 4200 task_barrier_exit(&hive->tb); 4201 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 4202 4203 if (adev->asic_reset_res) 4204 goto fail; 4205 4206 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 4207 } else { 4208 4209 task_barrier_full(&hive->tb); 4210 adev->asic_reset_res = amdgpu_asic_reset(adev); 4211 } 4212 4213 fail: 4214 if (adev->asic_reset_res) 4215 dev_warn(adev->dev, 4216 "ASIC reset failed with error, %d for drm dev, %s", 4217 adev->asic_reset_res, adev_to_drm(adev)->unique); 4218 amdgpu_put_xgmi_hive(hive); 4219 } 4220 4221 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 4222 { 4223 char *input = amdgpu_lockup_timeout; 4224 char *timeout_setting = NULL; 4225 int index = 0; 4226 long timeout; 4227 int ret = 0; 4228 4229 /* 4230 * By default timeout for non compute jobs is 10000 4231 * and 60000 for compute jobs. 4232 * In SR-IOV or passthrough mode, timeout for compute 4233 * jobs are 60000 by default. 4234 */ 4235 adev->gfx_timeout = msecs_to_jiffies(10000); 4236 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4237 if (amdgpu_sriov_vf(adev)) 4238 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
4239 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 4240 else 4241 adev->compute_timeout = msecs_to_jiffies(60000); 4242 4243 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4244 while ((timeout_setting = strsep(&input, ",")) && 4245 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4246 ret = kstrtol(timeout_setting, 0, &timeout); 4247 if (ret) 4248 return ret; 4249 4250 if (timeout == 0) { 4251 index++; 4252 continue; 4253 } else if (timeout < 0) { 4254 timeout = MAX_SCHEDULE_TIMEOUT; 4255 dev_warn(adev->dev, "lockup timeout disabled"); 4256 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 4257 } else { 4258 timeout = msecs_to_jiffies(timeout); 4259 } 4260 4261 switch (index++) { 4262 case 0: 4263 adev->gfx_timeout = timeout; 4264 break; 4265 case 1: 4266 adev->compute_timeout = timeout; 4267 break; 4268 case 2: 4269 adev->sdma_timeout = timeout; 4270 break; 4271 case 3: 4272 adev->video_timeout = timeout; 4273 break; 4274 default: 4275 break; 4276 } 4277 } 4278 /* 4279 * There is only one value specified and 4280 * it should apply to all non-compute jobs. 4281 */ 4282 if (index == 1) { 4283 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4284 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 4285 adev->compute_timeout = adev->gfx_timeout; 4286 } 4287 } 4288 4289 return ret; 4290 } 4291 4292 /** 4293 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 4294 * 4295 * @adev: amdgpu_device pointer 4296 * 4297 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 4298 */ 4299 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 4300 { 4301 struct iommu_domain *domain; 4302 4303 domain = iommu_get_domain_for_dev(adev->dev); 4304 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 4305 adev->ram_is_direct_mapped = true; 4306 } 4307 4308 #if defined(CONFIG_HSA_AMD_P2P) 4309 /** 4310 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled. 4311 * 4312 * @adev: amdgpu_device pointer 4313 * 4314 * return if IOMMU remapping bar address 4315 */ 4316 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev) 4317 { 4318 struct iommu_domain *domain; 4319 4320 domain = iommu_get_domain_for_dev(adev->dev); 4321 if (domain && (domain->type == IOMMU_DOMAIN_DMA || 4322 domain->type == IOMMU_DOMAIN_DMA_FQ)) 4323 return true; 4324 4325 return false; 4326 } 4327 #endif 4328 4329 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 4330 { 4331 if (amdgpu_mcbp == 1) 4332 adev->gfx.mcbp = true; 4333 else if (amdgpu_mcbp == 0) 4334 adev->gfx.mcbp = false; 4335 4336 if (amdgpu_sriov_vf(adev)) 4337 adev->gfx.mcbp = true; 4338 4339 if (adev->gfx.mcbp) 4340 dev_info(adev->dev, "MCBP is enabled\n"); 4341 } 4342 4343 /** 4344 * amdgpu_device_init - initialize the driver 4345 * 4346 * @adev: amdgpu_device pointer 4347 * @flags: driver flags 4348 * 4349 * Initializes the driver info and hw (all asics). 4350 * Returns 0 for success or an error on failure. 4351 * Called at driver startup. 
4352 */ 4353 int amdgpu_device_init(struct amdgpu_device *adev, 4354 uint32_t flags) 4355 { 4356 struct drm_device *ddev = adev_to_drm(adev); 4357 struct pci_dev *pdev = adev->pdev; 4358 int r, i; 4359 bool px = false; 4360 u32 max_MBps; 4361 int tmp; 4362 4363 adev->shutdown = false; 4364 adev->flags = flags; 4365 4366 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 4367 adev->asic_type = amdgpu_force_asic_type; 4368 else 4369 adev->asic_type = flags & AMD_ASIC_MASK; 4370 4371 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 4372 if (amdgpu_emu_mode == 1) 4373 adev->usec_timeout *= 10; 4374 adev->gmc.gart_size = 512 * 1024 * 1024; 4375 adev->accel_working = false; 4376 adev->num_rings = 0; 4377 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 4378 adev->mman.buffer_funcs = NULL; 4379 adev->mman.buffer_funcs_ring = NULL; 4380 adev->vm_manager.vm_pte_funcs = NULL; 4381 adev->vm_manager.vm_pte_num_scheds = 0; 4382 adev->gmc.gmc_funcs = NULL; 4383 adev->harvest_ip_mask = 0x0; 4384 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 4385 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 4386 4387 adev->smc_rreg = &amdgpu_invalid_rreg; 4388 adev->smc_wreg = &amdgpu_invalid_wreg; 4389 adev->pcie_rreg = &amdgpu_invalid_rreg; 4390 adev->pcie_wreg = &amdgpu_invalid_wreg; 4391 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 4392 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 4393 adev->pciep_rreg = &amdgpu_invalid_rreg; 4394 adev->pciep_wreg = &amdgpu_invalid_wreg; 4395 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 4396 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 4397 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 4398 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 4399 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 4400 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 4401 adev->didt_rreg = &amdgpu_invalid_rreg; 4402 adev->didt_wreg = &amdgpu_invalid_wreg; 4403 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 4404 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 4405 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 4406 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 4407 4408 dev_info( 4409 adev->dev, 4410 "initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 4411 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 4412 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 4413 4414 /* mutex initialization are all done here so we 4415 * can recall function without having locking issues 4416 */ 4417 mutex_init(&adev->firmware.mutex); 4418 mutex_init(&adev->pm.mutex); 4419 mutex_init(&adev->gfx.gpu_clock_mutex); 4420 mutex_init(&adev->srbm_mutex); 4421 mutex_init(&adev->gfx.pipe_reserve_mutex); 4422 mutex_init(&adev->gfx.gfx_off_mutex); 4423 mutex_init(&adev->gfx.partition_mutex); 4424 mutex_init(&adev->grbm_idx_mutex); 4425 mutex_init(&adev->mn_lock); 4426 mutex_init(&adev->virt.vf_errors.lock); 4427 hash_init(adev->mn_hash); 4428 mutex_init(&adev->psp.mutex); 4429 mutex_init(&adev->notifier_lock); 4430 mutex_init(&adev->pm.stable_pstate_ctx_lock); 4431 mutex_init(&adev->benchmark_mutex); 4432 mutex_init(&adev->gfx.reset_sem_mutex); 4433 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */ 4434 mutex_init(&adev->enforce_isolation_mutex); 4435 for (i = 0; i < MAX_XCP; ++i) { 4436 adev->isolation[i].spearhead = dma_fence_get_stub(); 4437 amdgpu_sync_create(&adev->isolation[i].active); 4438 amdgpu_sync_create(&adev->isolation[i].prev); 4439 } 4440 
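/* Each XCP partition gets its own isolation bookkeeping above: a stub spearhead fence plus amdgpu_sync containers (active/prev) that are used by the GFX/compute isolation enforcement configured later in this function. */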
mutex_init(&adev->gfx.userq_sch_mutex); 4441 mutex_init(&adev->gfx.workload_profile_mutex); 4442 mutex_init(&adev->vcn.workload_profile_mutex); 4443 mutex_init(&adev->userq_mutex); 4444 4445 amdgpu_device_init_apu_flags(adev); 4446 4447 r = amdgpu_device_check_arguments(adev); 4448 if (r) 4449 return r; 4450 4451 spin_lock_init(&adev->mmio_idx_lock); 4452 spin_lock_init(&adev->smc_idx_lock); 4453 spin_lock_init(&adev->pcie_idx_lock); 4454 spin_lock_init(&adev->uvd_ctx_idx_lock); 4455 spin_lock_init(&adev->didt_idx_lock); 4456 spin_lock_init(&adev->gc_cac_idx_lock); 4457 spin_lock_init(&adev->se_cac_idx_lock); 4458 spin_lock_init(&adev->audio_endpt_idx_lock); 4459 spin_lock_init(&adev->mm_stats.lock); 4460 spin_lock_init(&adev->virt.rlcg_reg_lock); 4461 spin_lock_init(&adev->wb.lock); 4462 4463 xa_init_flags(&adev->userq_xa, XA_FLAGS_LOCK_IRQ); 4464 4465 INIT_LIST_HEAD(&adev->reset_list); 4466 4467 INIT_LIST_HEAD(&adev->ras_list); 4468 4469 INIT_LIST_HEAD(&adev->pm.od_kobj_list); 4470 4471 INIT_LIST_HEAD(&adev->userq_mgr_list); 4472 4473 INIT_DELAYED_WORK(&adev->delayed_init_work, 4474 amdgpu_device_delayed_init_work_handler); 4475 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 4476 amdgpu_device_delay_enable_gfx_off); 4477 /* 4478 * Initialize the enforce_isolation work structures for each XCP 4479 * partition. This work handler is responsible for enforcing shader 4480 * isolation on AMD GPUs. It counts the number of emitted fences for 4481 * each GFX and compute ring. If there are any fences, it schedules 4482 * the `enforce_isolation_work` to be run after a delay. If there are 4483 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the 4484 * runqueue. 4485 */ 4486 for (i = 0; i < MAX_XCP; i++) { 4487 INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work, 4488 amdgpu_gfx_enforce_isolation_handler); 4489 adev->gfx.enforce_isolation[i].adev = adev; 4490 adev->gfx.enforce_isolation[i].xcp_id = i; 4491 } 4492 4493 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 4494 4495 adev->gfx.gfx_off_req_count = 1; 4496 adev->gfx.gfx_off_residency = 0; 4497 adev->gfx.gfx_off_entrycount = 0; 4498 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 4499 4500 atomic_set(&adev->throttling_logging_enabled, 1); 4501 /* 4502 * If throttling continues, logging will be performed every minute 4503 * to avoid log flooding. "-1" is subtracted since the thermal 4504 * throttling interrupt comes every second. Thus, the total logging 4505 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting 4506 * for throttling interrupt) = 60 seconds.
4507 */ 4508 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 4509 4510 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 4511 4512 /* Register mapping */ 4513 /* TODO: block userspace mapping of io register */ 4514 if (adev->asic_type >= CHIP_BONAIRE) { 4515 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 4516 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 4517 } else { 4518 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 4519 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 4520 } 4521 4522 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 4523 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 4524 4525 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 4526 if (!adev->rmmio) 4527 return -ENOMEM; 4528 4529 dev_info(adev->dev, "register mmio base: 0x%08X\n", 4530 (uint32_t)adev->rmmio_base); 4531 dev_info(adev->dev, "register mmio size: %u\n", 4532 (unsigned int)adev->rmmio_size); 4533 4534 /* 4535 * Reset domain needs to be present early, before the XGMI hive (if any) 4536 * is discovered and initialized, so that the reset sem and in_gpu reset 4537 * flag can be used early on during init and before calling RREG32. 4538 */ 4539 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 4540 if (!adev->reset_domain) 4541 return -ENOMEM; 4542 4543 /* detect hw virtualization here */ 4544 amdgpu_virt_init(adev); 4545 4546 amdgpu_device_get_pcie_info(adev); 4547 4548 r = amdgpu_device_get_job_timeout_settings(adev); 4549 if (r) { 4550 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 4551 return r; 4552 } 4553 4554 amdgpu_device_set_mcbp(adev); 4555 4556 /* 4557 * By default, use default mode where all blocks are expected to be 4558 * initialized. At present a 'swinit' of blocks is required to be 4559 * completed before the need for a different level is detected. 4560 */ 4561 amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT); 4562 /* early init functions */ 4563 r = amdgpu_device_ip_early_init(adev); 4564 if (r) 4565 return r; 4566 4567 /* 4568 * No need to remove conflicting FBs for non-display class devices. 4569 * This prevents the sysfb from being freed accidentally. 4570 */ 4571 if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA || 4572 (pdev->class >> 8) == PCI_CLASS_DISPLAY_OTHER) { 4573 /* Get rid of things like offb */ 4574 r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name); 4575 if (r) 4576 return r; 4577 } 4578 4579 /* Enable TMZ based on IP_VERSION */ 4580 amdgpu_gmc_tmz_set(adev); 4581 4582 if (amdgpu_sriov_vf(adev) && 4583 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0)) 4584 /* VF MMIO access (except mailbox range) from CPU 4585 * will be blocked during sriov runtime 4586 */ 4587 adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT; 4588 4589 amdgpu_gmc_noretry_set(adev); 4590 /* Need to get xgmi info early to decide the reset behavior */ 4591 if (adev->gmc.xgmi.supported) { 4592 r = adev->gfxhub.funcs->get_xgmi_info(adev); 4593 if (r) 4594 return r; 4595 } 4596 4597 /* enable PCIE atomic ops */ 4598 if (amdgpu_sriov_vf(adev)) { 4599 if (adev->virt.fw_reserve.p_pf2vf) 4600 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 4601 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 4602 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4603 /* APUs with gfx9 and onward don't rely on PCIe atomics; their 4604 * internal path natively supports atomics, so set have_atomics_support to true.
4605 */ 4606 } else if ((adev->flags & AMD_IS_APU) && 4607 (amdgpu_ip_version(adev, GC_HWIP, 0) > 4608 IP_VERSION(9, 0, 0))) { 4609 adev->have_atomics_support = true; 4610 } else { 4611 adev->have_atomics_support = 4612 !pci_enable_atomic_ops_to_root(adev->pdev, 4613 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 4614 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4615 } 4616 4617 if (!adev->have_atomics_support) 4618 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 4619 4620 /* doorbell bar mapping and doorbell index init*/ 4621 amdgpu_doorbell_init(adev); 4622 4623 if (amdgpu_emu_mode == 1) { 4624 /* post the asic on emulation mode */ 4625 emu_soc_asic_init(adev); 4626 goto fence_driver_init; 4627 } 4628 4629 amdgpu_reset_init(adev); 4630 4631 /* detect if we are with an SRIOV vbios */ 4632 if (adev->bios) 4633 amdgpu_device_detect_sriov_bios(adev); 4634 4635 /* check if we need to reset the asic 4636 * E.g., driver was not cleanly unloaded previously, etc. 4637 */ 4638 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4639 if (adev->gmc.xgmi.num_physical_nodes) { 4640 dev_info(adev->dev, "Pending hive reset.\n"); 4641 amdgpu_set_init_level(adev, 4642 AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 4643 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && 4644 !amdgpu_device_has_display_hardware(adev)) { 4645 r = psp_gpu_reset(adev); 4646 } else { 4647 tmp = amdgpu_reset_method; 4648 /* It should do a default reset when loading or reloading the driver, 4649 * regardless of the module parameter reset_method. 4650 */ 4651 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4652 r = amdgpu_asic_reset(adev); 4653 amdgpu_reset_method = tmp; 4654 } 4655 4656 if (r) { 4657 dev_err(adev->dev, "asic reset on init failed\n"); 4658 goto failed; 4659 } 4660 } 4661 4662 /* Post card if necessary */ 4663 if (amdgpu_device_need_post(adev)) { 4664 if (!adev->bios) { 4665 dev_err(adev->dev, "no vBIOS found\n"); 4666 r = -EINVAL; 4667 goto failed; 4668 } 4669 dev_info(adev->dev, "GPU posting now...\n"); 4670 r = amdgpu_device_asic_init(adev); 4671 if (r) { 4672 dev_err(adev->dev, "gpu post error!\n"); 4673 goto failed; 4674 } 4675 } 4676 4677 if (adev->bios) { 4678 if (adev->is_atom_fw) { 4679 /* Initialize clocks */ 4680 r = amdgpu_atomfirmware_get_clock_info(adev); 4681 if (r) { 4682 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4683 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4684 goto failed; 4685 } 4686 } else { 4687 /* Initialize clocks */ 4688 r = amdgpu_atombios_get_clock_info(adev); 4689 if (r) { 4690 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4691 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4692 goto failed; 4693 } 4694 /* init i2c buses */ 4695 amdgpu_i2c_init(adev); 4696 } 4697 } 4698 4699 fence_driver_init: 4700 /* Fence driver */ 4701 r = amdgpu_fence_driver_sw_init(adev); 4702 if (r) { 4703 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4704 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4705 goto failed; 4706 } 4707 4708 /* init the mode config */ 4709 drm_mode_config_init(adev_to_drm(adev)); 4710 4711 r = amdgpu_device_ip_init(adev); 4712 if (r) { 4713 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4714 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4715 goto release_ras_con; 4716 } 4717 4718 amdgpu_fence_driver_hw_init(adev); 4719 4720 dev_info(adev->dev, 4721 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4722 
adev->gfx.config.max_shader_engines, 4723 adev->gfx.config.max_sh_per_se, 4724 adev->gfx.config.max_cu_per_sh, 4725 adev->gfx.cu_info.number); 4726 4727 adev->accel_working = true; 4728 4729 amdgpu_vm_check_compute_bug(adev); 4730 4731 /* Initialize the buffer migration limit. */ 4732 if (amdgpu_moverate >= 0) 4733 max_MBps = amdgpu_moverate; 4734 else 4735 max_MBps = 8; /* Allow 8 MB/s. */ 4736 /* Get a log2 for easy divisions. */ 4737 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4738 4739 /* 4740 * Register the gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4741 * Otherwise the mgpu fan boost feature will be skipped because the 4742 * gpu instance count would be too low. 4743 */ 4744 amdgpu_register_gpu_instance(adev); 4745 4746 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4747 * explicit gating rather than handling it automatically. 4748 */ 4749 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 4750 r = amdgpu_device_ip_late_init(adev); 4751 if (r) { 4752 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4753 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4754 goto release_ras_con; 4755 } 4756 /* must succeed. */ 4757 amdgpu_ras_resume(adev); 4758 queue_delayed_work(system_wq, &adev->delayed_init_work, 4759 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4760 } 4761 4762 if (amdgpu_sriov_vf(adev)) { 4763 amdgpu_virt_release_full_gpu(adev, true); 4764 flush_delayed_work(&adev->delayed_init_work); 4765 } 4766 4767 /* 4768 * Register these sysfs interfaces after `late_init`, as some of the 4769 * operations performed in `late_init` might affect how the sysfs 4770 * interfaces are created. 4771 */ 4772 r = amdgpu_atombios_sysfs_init(adev); 4773 if (r) 4774 drm_err(&adev->ddev, 4775 "registering atombios sysfs failed (%d).\n", r); 4776 4777 r = amdgpu_pm_sysfs_init(adev); 4778 if (r) 4779 dev_err(adev->dev, "registering pm sysfs failed (%d).\n", r); 4780 4781 r = amdgpu_ucode_sysfs_init(adev); 4782 if (r) { 4783 adev->ucode_sysfs_en = false; 4784 dev_err(adev->dev, "Creating firmware sysfs failed (%d).\n", r); 4785 } else 4786 adev->ucode_sysfs_en = true; 4787 4788 r = amdgpu_device_attr_sysfs_init(adev); 4789 if (r) 4790 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4791 4792 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4793 if (r) 4794 dev_err(adev->dev, 4795 "Could not create amdgpu board attributes\n"); 4796 4797 amdgpu_fru_sysfs_init(adev); 4798 amdgpu_reg_state_sysfs_init(adev); 4799 amdgpu_xcp_sysfs_init(adev); 4800 4801 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4802 r = amdgpu_pmu_init(adev); 4803 if (r) 4804 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4805 4806 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4807 if (amdgpu_device_cache_pci_state(adev->pdev)) 4808 pci_restore_state(pdev); 4809 4810 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4811 /* this will fail for cards that aren't VGA class devices, just 4812 * ignore it 4813 */ 4814 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4815 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4816 4817 px = amdgpu_device_supports_px(ddev); 4818 4819 if (px || (!dev_is_removable(&adev->pdev->dev) && 4820 apple_gmux_detect(NULL, NULL))) 4821 vga_switcheroo_register_client(adev->pdev, 4822 &amdgpu_switcheroo_ops, px); 4823 4824 if (px) 4825 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4826 4827 if (adev->init_lvl->level ==
AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 4828 amdgpu_xgmi_reset_on_init(adev); 4829 4830 amdgpu_device_check_iommu_direct_map(adev); 4831 4832 adev->pm_nb.notifier_call = amdgpu_device_pm_notifier; 4833 r = register_pm_notifier(&adev->pm_nb); 4834 if (r) 4835 goto failed; 4836 4837 return 0; 4838 4839 release_ras_con: 4840 if (amdgpu_sriov_vf(adev)) 4841 amdgpu_virt_release_full_gpu(adev, true); 4842 4843 /* failed in exclusive mode due to timeout */ 4844 if (amdgpu_sriov_vf(adev) && 4845 !amdgpu_sriov_runtime(adev) && 4846 amdgpu_virt_mmio_blocked(adev) && 4847 !amdgpu_virt_wait_reset(adev)) { 4848 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4849 /* Don't send request since VF is inactive. */ 4850 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4851 adev->virt.ops = NULL; 4852 r = -EAGAIN; 4853 } 4854 amdgpu_release_ras_context(adev); 4855 4856 failed: 4857 amdgpu_vf_error_trans_all(adev); 4858 4859 return r; 4860 } 4861 4862 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4863 { 4864 4865 /* Clear all CPU mappings pointing to this device */ 4866 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4867 4868 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4869 amdgpu_doorbell_fini(adev); 4870 4871 iounmap(adev->rmmio); 4872 adev->rmmio = NULL; 4873 if (adev->mman.aper_base_kaddr) 4874 iounmap(adev->mman.aper_base_kaddr); 4875 adev->mman.aper_base_kaddr = NULL; 4876 4877 /* Memory manager related */ 4878 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4879 arch_phys_wc_del(adev->gmc.vram_mtrr); 4880 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4881 } 4882 } 4883 4884 /** 4885 * amdgpu_device_fini_hw - tear down the driver 4886 * 4887 * @adev: amdgpu_device pointer 4888 * 4889 * Tear down the driver info (all asics). 4890 * Called at driver shutdown. 
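* Note: this handles hardware teardown only (interrupts, display, fence processing and early IP hw fini); the remaining software state is released later by amdgpu_device_fini_sw().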
4891 */ 4892 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4893 { 4894 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4895 flush_delayed_work(&adev->delayed_init_work); 4896 4897 if (adev->mman.initialized) 4898 drain_workqueue(adev->mman.bdev.wq); 4899 adev->shutdown = true; 4900 4901 unregister_pm_notifier(&adev->pm_nb); 4902 4903 /* make sure IB test finished before entering exclusive mode 4904 * to avoid preemption on IB test 4905 */ 4906 if (amdgpu_sriov_vf(adev)) { 4907 amdgpu_virt_request_full_gpu(adev, false); 4908 amdgpu_virt_fini_data_exchange(adev); 4909 } 4910 4911 /* disable all interrupts */ 4912 amdgpu_irq_disable_all(adev); 4913 if (adev->mode_info.mode_config_initialized) { 4914 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4915 drm_helper_force_disable_all(adev_to_drm(adev)); 4916 else 4917 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4918 } 4919 amdgpu_fence_driver_hw_fini(adev); 4920 4921 if (adev->pm.sysfs_initialized) 4922 amdgpu_pm_sysfs_fini(adev); 4923 if (adev->ucode_sysfs_en) 4924 amdgpu_ucode_sysfs_fini(adev); 4925 amdgpu_device_attr_sysfs_fini(adev); 4926 amdgpu_fru_sysfs_fini(adev); 4927 4928 amdgpu_reg_state_sysfs_fini(adev); 4929 amdgpu_xcp_sysfs_fini(adev); 4930 4931 /* disable ras feature must before hw fini */ 4932 amdgpu_ras_pre_fini(adev); 4933 4934 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4935 4936 amdgpu_device_ip_fini_early(adev); 4937 4938 amdgpu_irq_fini_hw(adev); 4939 4940 if (adev->mman.initialized) 4941 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4942 4943 amdgpu_gart_dummy_page_fini(adev); 4944 4945 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4946 amdgpu_device_unmap_mmio(adev); 4947 4948 } 4949 4950 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4951 { 4952 int i, idx; 4953 bool px; 4954 4955 amdgpu_device_ip_fini(adev); 4956 amdgpu_fence_driver_sw_fini(adev); 4957 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4958 adev->accel_working = false; 4959 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4960 for (i = 0; i < MAX_XCP; ++i) { 4961 dma_fence_put(adev->isolation[i].spearhead); 4962 amdgpu_sync_free(&adev->isolation[i].active); 4963 amdgpu_sync_free(&adev->isolation[i].prev); 4964 } 4965 4966 amdgpu_reset_fini(adev); 4967 4968 /* free i2c buses */ 4969 amdgpu_i2c_fini(adev); 4970 4971 if (adev->bios) { 4972 if (amdgpu_emu_mode != 1) 4973 amdgpu_atombios_fini(adev); 4974 amdgpu_bios_release(adev); 4975 } 4976 4977 kfree(adev->fru_info); 4978 adev->fru_info = NULL; 4979 4980 kfree(adev->xcp_mgr); 4981 adev->xcp_mgr = NULL; 4982 4983 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4984 4985 if (px || (!dev_is_removable(&adev->pdev->dev) && 4986 apple_gmux_detect(NULL, NULL))) 4987 vga_switcheroo_unregister_client(adev->pdev); 4988 4989 if (px) 4990 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4991 4992 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4993 vga_client_unregister(adev->pdev); 4994 4995 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4996 4997 iounmap(adev->rmmio); 4998 adev->rmmio = NULL; 4999 drm_dev_exit(idx); 5000 } 5001 5002 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 5003 amdgpu_pmu_fini(adev); 5004 if (adev->mman.discovery_bin) 5005 amdgpu_discovery_fini(adev); 5006 5007 amdgpu_reset_put_reset_domain(adev->reset_domain); 5008 adev->reset_domain = NULL; 5009 5010 kfree(adev->pci_state); 5011 5012 } 5013 5014 /** 5015 * amdgpu_device_evict_resources - evict device resources 5016 * @adev: amdgpu device object 5017 * 5018 * Evicts all ttm device resources(vram 
BOs, gart table) from the lru list 5019 * of the vram memory type. Mainly used for evicting device resources 5020 * at suspend time. 5021 * 5022 */ 5023 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 5024 { 5025 int ret; 5026 5027 /* No need to evict vram on APUs unless going to S4 */ 5028 if (!adev->in_s4 && (adev->flags & AMD_IS_APU)) 5029 return 0; 5030 5031 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 5032 if (ret) { 5033 dev_warn(adev->dev, "evicting device resources failed\n"); 5034 return ret; 5035 } 5036 5037 if (adev->in_s4) { 5038 ret = ttm_device_prepare_hibernation(&adev->mman.bdev); 5039 if (ret) 5040 dev_err(adev->dev, "prepare hibernation failed, %d\n", ret); 5041 } 5042 return ret; 5043 } 5044 5045 /* 5046 * Suspend & resume. 5047 */ 5048 /** 5049 * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events 5050 * @nb: notifier block 5051 * @mode: suspend mode 5052 * @data: data 5053 * 5054 * This function is called when the system is about to suspend or hibernate. 5055 * It is used to set the appropriate flags so that eviction can be optimized 5056 * in the pm prepare callback. 5057 */ 5058 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 5059 void *data) 5060 { 5061 struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb); 5062 5063 switch (mode) { 5064 case PM_HIBERNATION_PREPARE: 5065 adev->in_s4 = true; 5066 break; 5067 case PM_POST_HIBERNATION: 5068 adev->in_s4 = false; 5069 break; 5070 } 5071 5072 return NOTIFY_DONE; 5073 } 5074 5075 /** 5076 * amdgpu_device_prepare - prepare for device suspend 5077 * 5078 * @dev: drm dev pointer 5079 * 5080 * Prepare to put the hw in the suspend state (all asics). 5081 * Returns 0 for success or an error on failure. 5082 * Called at driver suspend. 5083 */ 5084 int amdgpu_device_prepare(struct drm_device *dev) 5085 { 5086 struct amdgpu_device *adev = drm_to_adev(dev); 5087 int i, r; 5088 5089 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5090 return 0; 5091 5092 /* Evict the majority of BOs before starting suspend sequence */ 5093 r = amdgpu_device_evict_resources(adev); 5094 if (r) 5095 return r; 5096 5097 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 5098 5099 for (i = 0; i < adev->num_ip_blocks; i++) { 5100 if (!adev->ip_blocks[i].status.valid) 5101 continue; 5102 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 5103 continue; 5104 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]); 5105 if (r) 5106 return r; 5107 } 5108 5109 return 0; 5110 } 5111 5112 /** 5113 * amdgpu_device_complete - complete power state transition 5114 * 5115 * @dev: drm dev pointer 5116 * 5117 * Undo the changes from amdgpu_device_prepare. This will be 5118 * called on all resume transitions, including those that failed. 5119 */ 5120 void amdgpu_device_complete(struct drm_device *dev) 5121 { 5122 struct amdgpu_device *adev = drm_to_adev(dev); 5123 int i; 5124 5125 for (i = 0; i < adev->num_ip_blocks; i++) { 5126 if (!adev->ip_blocks[i].status.valid) 5127 continue; 5128 if (!adev->ip_blocks[i].version->funcs->complete) 5129 continue; 5130 adev->ip_blocks[i].version->funcs->complete(&adev->ip_blocks[i]); 5131 } 5132 } 5133 5134 /** 5135 * amdgpu_device_suspend - initiate device suspend 5136 * 5137 * @dev: drm dev pointer 5138 * @notify_clients: notify in-kernel DRM clients 5139 * 5140 * Puts the hw in the suspend state (all asics). 5141 * Returns 0 for success or an error on failure. 5142 * Called at driver suspend. 
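* Display IPs are suspended first (phase 1), buffers are evicted and fence processing is stopped, then the remaining IPs are suspended (phase 2).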
5143 */ 5144 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) 5145 { 5146 struct amdgpu_device *adev = drm_to_adev(dev); 5147 int r = 0; 5148 5149 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5150 return 0; 5151 5152 adev->in_suspend = true; 5153 5154 if (amdgpu_sriov_vf(adev)) { 5155 if (!adev->in_s0ix && !adev->in_runpm) 5156 amdgpu_amdkfd_suspend_process(adev); 5157 amdgpu_virt_fini_data_exchange(adev); 5158 r = amdgpu_virt_request_full_gpu(adev, false); 5159 if (r) 5160 return r; 5161 } 5162 5163 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 5164 dev_warn(adev->dev, "smart shift update failed\n"); 5165 5166 if (notify_clients) 5167 drm_client_dev_suspend(adev_to_drm(adev), false); 5168 5169 cancel_delayed_work_sync(&adev->delayed_init_work); 5170 5171 amdgpu_ras_suspend(adev); 5172 5173 amdgpu_device_ip_suspend_phase1(adev); 5174 5175 if (!adev->in_s0ix) { 5176 amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); 5177 amdgpu_userq_suspend(adev); 5178 } 5179 5180 r = amdgpu_device_evict_resources(adev); 5181 if (r) 5182 return r; 5183 5184 amdgpu_ttm_set_buffer_funcs_status(adev, false); 5185 5186 amdgpu_fence_driver_hw_fini(adev); 5187 5188 amdgpu_device_ip_suspend_phase2(adev); 5189 5190 if (amdgpu_sriov_vf(adev)) 5191 amdgpu_virt_release_full_gpu(adev, false); 5192 5193 r = amdgpu_dpm_notify_rlc_state(adev, false); 5194 if (r) 5195 return r; 5196 5197 return 0; 5198 } 5199 5200 static inline int amdgpu_virt_resume(struct amdgpu_device *adev) 5201 { 5202 int r; 5203 unsigned int prev_physical_node_id = adev->gmc.xgmi.physical_node_id; 5204 5205 /* During VM resume, QEMU programming of VF MSIX table (register GFXMSIX_VECT0_ADDR_LO) 5206 * may not work. The access could be blocked by nBIF protection as VF isn't in 5207 * exclusive access mode. Exclusive access is enabled now, disable/enable MSIX 5208 * so that QEMU reprograms MSIX table. 5209 */ 5210 amdgpu_restore_msix(adev); 5211 5212 r = adev->gfxhub.funcs->get_xgmi_info(adev); 5213 if (r) 5214 return r; 5215 5216 dev_info(adev->dev, "xgmi node, old id %d, new id %d\n", 5217 prev_physical_node_id, adev->gmc.xgmi.physical_node_id); 5218 5219 adev->vm_manager.vram_base_offset = adev->gfxhub.funcs->get_mc_fb_offset(adev); 5220 adev->vm_manager.vram_base_offset += 5221 adev->gmc.xgmi.physical_node_id * adev->gmc.xgmi.node_segment_size; 5222 5223 return 0; 5224 } 5225 5226 /** 5227 * amdgpu_device_resume - initiate device resume 5228 * 5229 * @dev: drm dev pointer 5230 * @notify_clients: notify in-kernel DRM clients 5231 * 5232 * Bring the hw back to operating state (all asics). 5233 * Returns 0 for success or an error on failure. 5234 * Called at driver resume. 
5235 */ 5236 int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) 5237 { 5238 struct amdgpu_device *adev = drm_to_adev(dev); 5239 int r = 0; 5240 5241 if (amdgpu_sriov_vf(adev)) { 5242 r = amdgpu_virt_request_full_gpu(adev, true); 5243 if (r) 5244 return r; 5245 } 5246 5247 if (amdgpu_virt_xgmi_migrate_enabled(adev)) { 5248 r = amdgpu_virt_resume(adev); 5249 if (r) 5250 goto exit; 5251 } 5252 5253 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5254 return 0; 5255 5256 if (adev->in_s0ix) 5257 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 5258 5259 /* post card */ 5260 if (amdgpu_device_need_post(adev)) { 5261 r = amdgpu_device_asic_init(adev); 5262 if (r) 5263 dev_err(adev->dev, "amdgpu asic init failed\n"); 5264 } 5265 5266 r = amdgpu_device_ip_resume(adev); 5267 5268 if (r) { 5269 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 5270 goto exit; 5271 } 5272 5273 if (!adev->in_s0ix) { 5274 r = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); 5275 if (r) 5276 goto exit; 5277 5278 r = amdgpu_userq_resume(adev); 5279 if (r) 5280 goto exit; 5281 } 5282 5283 r = amdgpu_device_ip_late_init(adev); 5284 if (r) 5285 goto exit; 5286 5287 queue_delayed_work(system_wq, &adev->delayed_init_work, 5288 msecs_to_jiffies(AMDGPU_RESUME_MS)); 5289 exit: 5290 if (amdgpu_sriov_vf(adev)) { 5291 amdgpu_virt_init_data_exchange(adev); 5292 amdgpu_virt_release_full_gpu(adev, true); 5293 5294 if (!adev->in_s0ix && !r && !adev->in_runpm) 5295 r = amdgpu_amdkfd_resume_process(adev); 5296 } 5297 5298 if (r) 5299 return r; 5300 5301 /* Make sure IB tests flushed */ 5302 flush_delayed_work(&adev->delayed_init_work); 5303 5304 if (notify_clients) 5305 drm_client_dev_resume(adev_to_drm(adev), false); 5306 5307 amdgpu_ras_resume(adev); 5308 5309 if (adev->mode_info.num_crtc) { 5310 /* 5311 * Most of the connector probing functions try to acquire runtime pm 5312 * refs to ensure that the GPU is powered on when connector polling is 5313 * performed. Since we're calling this from a runtime PM callback, 5314 * trying to acquire rpm refs will cause us to deadlock. 5315 * 5316 * Since we're guaranteed to be holding the rpm lock, it's safe to 5317 * temporarily disable the rpm helpers so this doesn't deadlock us. 5318 */ 5319 #ifdef CONFIG_PM 5320 dev->dev->power.disable_depth++; 5321 #endif 5322 if (!adev->dc_enabled) 5323 drm_helper_hpd_irq_event(dev); 5324 else 5325 drm_kms_helper_hotplug_event(dev); 5326 #ifdef CONFIG_PM 5327 dev->dev->power.disable_depth--; 5328 #endif 5329 } 5330 adev->in_suspend = false; 5331 5332 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 5333 dev_warn(adev->dev, "smart shift update failed\n"); 5334 5335 return 0; 5336 } 5337 5338 /** 5339 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 5340 * 5341 * @adev: amdgpu_device pointer 5342 * 5343 * The list of all the hardware IPs that make up the asic is walked and 5344 * the check_soft_reset callbacks are run. check_soft_reset determines 5345 * if the asic is still hung or not. 5346 * Returns true if any of the IPs are still in a hung state, false if not. 
5347 */ 5348 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 5349 { 5350 int i; 5351 bool asic_hang = false; 5352 5353 if (amdgpu_sriov_vf(adev)) 5354 return true; 5355 5356 if (amdgpu_asic_need_full_reset(adev)) 5357 return true; 5358 5359 for (i = 0; i < adev->num_ip_blocks; i++) { 5360 if (!adev->ip_blocks[i].status.valid) 5361 continue; 5362 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 5363 adev->ip_blocks[i].status.hang = 5364 adev->ip_blocks[i].version->funcs->check_soft_reset( 5365 &adev->ip_blocks[i]); 5366 if (adev->ip_blocks[i].status.hang) { 5367 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 5368 asic_hang = true; 5369 } 5370 } 5371 return asic_hang; 5372 } 5373 5374 /** 5375 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 5376 * 5377 * @adev: amdgpu_device pointer 5378 * 5379 * The list of all the hardware IPs that make up the asic is walked and the 5380 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 5381 * handles any IP specific hardware or software state changes that are 5382 * necessary for a soft reset to succeed. 5383 * Returns 0 on success, negative error code on failure. 5384 */ 5385 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 5386 { 5387 int i, r = 0; 5388 5389 for (i = 0; i < adev->num_ip_blocks; i++) { 5390 if (!adev->ip_blocks[i].status.valid) 5391 continue; 5392 if (adev->ip_blocks[i].status.hang && 5393 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 5394 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]); 5395 if (r) 5396 return r; 5397 } 5398 } 5399 5400 return 0; 5401 } 5402 5403 /** 5404 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 5405 * 5406 * @adev: amdgpu_device pointer 5407 * 5408 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 5409 * reset is necessary to recover. 5410 * Returns true if a full asic reset is required, false if not. 5411 */ 5412 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 5413 { 5414 int i; 5415 5416 if (amdgpu_asic_need_full_reset(adev)) 5417 return true; 5418 5419 for (i = 0; i < adev->num_ip_blocks; i++) { 5420 if (!adev->ip_blocks[i].status.valid) 5421 continue; 5422 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 5423 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 5424 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 5425 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 5426 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 5427 if (adev->ip_blocks[i].status.hang) { 5428 dev_info(adev->dev, "Some block need full reset!\n"); 5429 return true; 5430 } 5431 } 5432 } 5433 return false; 5434 } 5435 5436 /** 5437 * amdgpu_device_ip_soft_reset - do a soft reset 5438 * 5439 * @adev: amdgpu_device pointer 5440 * 5441 * The list of all the hardware IPs that make up the asic is walked and the 5442 * soft_reset callbacks are run if the block is hung. soft_reset handles any 5443 * IP specific hardware or software state changes that are necessary to soft 5444 * reset the IP. 5445 * Returns 0 on success, negative error code on failure. 
5446 */ 5447 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 5448 { 5449 int i, r = 0; 5450 5451 for (i = 0; i < adev->num_ip_blocks; i++) { 5452 if (!adev->ip_blocks[i].status.valid) 5453 continue; 5454 if (adev->ip_blocks[i].status.hang && 5455 adev->ip_blocks[i].version->funcs->soft_reset) { 5456 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]); 5457 if (r) 5458 return r; 5459 } 5460 } 5461 5462 return 0; 5463 } 5464 5465 /** 5466 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 5467 * 5468 * @adev: amdgpu_device pointer 5469 * 5470 * The list of all the hardware IPs that make up the asic is walked and the 5471 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 5472 * handles any IP specific hardware or software state changes that are 5473 * necessary after the IP has been soft reset. 5474 * Returns 0 on success, negative error code on failure. 5475 */ 5476 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 5477 { 5478 int i, r = 0; 5479 5480 for (i = 0; i < adev->num_ip_blocks; i++) { 5481 if (!adev->ip_blocks[i].status.valid) 5482 continue; 5483 if (adev->ip_blocks[i].status.hang && 5484 adev->ip_blocks[i].version->funcs->post_soft_reset) 5485 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]); 5486 if (r) 5487 return r; 5488 } 5489 5490 return 0; 5491 } 5492 5493 /** 5494 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 5495 * 5496 * @adev: amdgpu_device pointer 5497 * @reset_context: amdgpu reset context pointer 5498 * 5499 * do VF FLR and reinitialize Asic 5500 * return 0 means succeeded otherwise failed 5501 */ 5502 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 5503 struct amdgpu_reset_context *reset_context) 5504 { 5505 int r; 5506 struct amdgpu_hive_info *hive = NULL; 5507 5508 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { 5509 if (!amdgpu_ras_get_fed_status(adev)) 5510 amdgpu_virt_ready_to_reset(adev); 5511 amdgpu_virt_wait_reset(adev); 5512 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5513 r = amdgpu_virt_request_full_gpu(adev, true); 5514 } else { 5515 r = amdgpu_virt_reset_gpu(adev); 5516 } 5517 if (r) 5518 return r; 5519 5520 amdgpu_ras_clear_err_state(adev); 5521 amdgpu_irq_gpu_reset_resume_helper(adev); 5522 5523 /* some sw clean up VF needs to do before recover */ 5524 amdgpu_virt_post_reset(adev); 5525 5526 /* Resume IP prior to SMC */ 5527 r = amdgpu_device_ip_reinit_early_sriov(adev); 5528 if (r) 5529 return r; 5530 5531 amdgpu_virt_init_data_exchange(adev); 5532 5533 r = amdgpu_device_fw_loading(adev); 5534 if (r) 5535 return r; 5536 5537 /* now we are okay to resume SMC/CP/SDMA */ 5538 r = amdgpu_device_ip_reinit_late_sriov(adev); 5539 if (r) 5540 return r; 5541 5542 hive = amdgpu_get_xgmi_hive(adev); 5543 /* Update PSP FW topology after reset */ 5544 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5545 r = amdgpu_xgmi_update_topology(hive, adev); 5546 if (hive) 5547 amdgpu_put_xgmi_hive(hive); 5548 if (r) 5549 return r; 5550 5551 r = amdgpu_ib_ring_tests(adev); 5552 if (r) 5553 return r; 5554 5555 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) 5556 amdgpu_inc_vram_lost(adev); 5557 5558 /* need to be called during full access so we can't do it later like 5559 * bare-metal does. 
5560 */ 5561 amdgpu_amdkfd_post_reset(adev); 5562 amdgpu_virt_release_full_gpu(adev, true); 5563 5564 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5565 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) || 5566 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 5567 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 5568 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) || 5569 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5570 amdgpu_ras_resume(adev); 5571 5572 amdgpu_virt_ras_telemetry_post_reset(adev); 5573 5574 return 0; 5575 } 5576 5577 /** 5578 * amdgpu_device_has_job_running - check if there is any unfinished job 5579 * 5580 * @adev: amdgpu_device pointer 5581 * 5582 * check if there is any job running on the device when guest driver receives 5583 * FLR notification from host driver. If there are still jobs running, then 5584 * the guest driver will not respond the FLR reset. Instead, let the job hit 5585 * the timeout and guest driver then issue the reset request. 5586 */ 5587 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5588 { 5589 int i; 5590 5591 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5592 struct amdgpu_ring *ring = adev->rings[i]; 5593 5594 if (!amdgpu_ring_sched_ready(ring)) 5595 continue; 5596 5597 if (amdgpu_fence_count_emitted(ring)) 5598 return true; 5599 } 5600 return false; 5601 } 5602 5603 /** 5604 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5605 * 5606 * @adev: amdgpu_device pointer 5607 * 5608 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5609 * a hung GPU. 5610 */ 5611 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5612 { 5613 5614 if (amdgpu_gpu_recovery == 0) 5615 goto disabled; 5616 5617 /* Skip soft reset check in fatal error mode */ 5618 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5619 return true; 5620 5621 if (amdgpu_sriov_vf(adev)) 5622 return true; 5623 5624 if (amdgpu_gpu_recovery == -1) { 5625 switch (adev->asic_type) { 5626 #ifdef CONFIG_DRM_AMDGPU_SI 5627 case CHIP_VERDE: 5628 case CHIP_TAHITI: 5629 case CHIP_PITCAIRN: 5630 case CHIP_OLAND: 5631 case CHIP_HAINAN: 5632 #endif 5633 #ifdef CONFIG_DRM_AMDGPU_CIK 5634 case CHIP_KAVERI: 5635 case CHIP_KABINI: 5636 case CHIP_MULLINS: 5637 #endif 5638 case CHIP_CARRIZO: 5639 case CHIP_STONEY: 5640 case CHIP_CYAN_SKILLFISH: 5641 goto disabled; 5642 default: 5643 break; 5644 } 5645 } 5646 5647 return true; 5648 5649 disabled: 5650 dev_info(adev->dev, "GPU recovery disabled.\n"); 5651 return false; 5652 } 5653 5654 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5655 { 5656 u32 i; 5657 int ret = 0; 5658 5659 if (adev->bios) 5660 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5661 5662 dev_info(adev->dev, "GPU mode1 reset\n"); 5663 5664 /* Cache the state before bus master disable. The saved config space 5665 * values are used in other cases like restore after mode-2 reset. 
5666 */ 5667 amdgpu_device_cache_pci_state(adev->pdev); 5668 5669 /* disable BM */ 5670 pci_clear_master(adev->pdev); 5671 5672 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5673 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5674 ret = amdgpu_dpm_mode1_reset(adev); 5675 } else { 5676 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5677 ret = psp_gpu_reset(adev); 5678 } 5679 5680 if (ret) 5681 goto mode1_reset_failed; 5682 5683 amdgpu_device_load_pci_state(adev->pdev); 5684 ret = amdgpu_psp_wait_for_bootloader(adev); 5685 if (ret) 5686 goto mode1_reset_failed; 5687 5688 /* wait for asic to come out of reset */ 5689 for (i = 0; i < adev->usec_timeout; i++) { 5690 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5691 5692 if (memsize != 0xffffffff) 5693 break; 5694 udelay(1); 5695 } 5696 5697 if (i >= adev->usec_timeout) { 5698 ret = -ETIMEDOUT; 5699 goto mode1_reset_failed; 5700 } 5701 5702 if (adev->bios) 5703 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5704 5705 return 0; 5706 5707 mode1_reset_failed: 5708 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5709 return ret; 5710 } 5711 5712 int amdgpu_device_link_reset(struct amdgpu_device *adev) 5713 { 5714 int ret = 0; 5715 5716 dev_info(adev->dev, "GPU link reset\n"); 5717 5718 if (!adev->pcie_reset_ctx.occurs_dpc) 5719 ret = amdgpu_dpm_link_reset(adev); 5720 5721 if (ret) 5722 goto link_reset_failed; 5723 5724 ret = amdgpu_psp_wait_for_bootloader(adev); 5725 if (ret) 5726 goto link_reset_failed; 5727 5728 return 0; 5729 5730 link_reset_failed: 5731 dev_err(adev->dev, "GPU link reset failed\n"); 5732 return ret; 5733 } 5734 5735 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5736 struct amdgpu_reset_context *reset_context) 5737 { 5738 int i, r = 0; 5739 struct amdgpu_job *job = NULL; 5740 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; 5741 bool need_full_reset = 5742 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5743 5744 if (reset_context->reset_req_dev == adev) 5745 job = reset_context->job; 5746 5747 if (amdgpu_sriov_vf(adev)) 5748 amdgpu_virt_pre_reset(adev); 5749 5750 amdgpu_fence_driver_isr_toggle(adev, true); 5751 5752 /* block all schedulers and reset given job's ring */ 5753 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5754 struct amdgpu_ring *ring = adev->rings[i]; 5755 5756 if (!amdgpu_ring_sched_ready(ring)) 5757 continue; 5758 5759 /* Clear job fence from fence drv to avoid force_completion 5760 * leave NULL and vm flush fence in fence drv 5761 */ 5762 amdgpu_fence_driver_clear_job_fences(ring); 5763 5764 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5765 amdgpu_fence_driver_force_completion(ring); 5766 } 5767 5768 amdgpu_fence_driver_isr_toggle(adev, false); 5769 5770 if (job && job->vm) 5771 drm_sched_increase_karma(&job->base); 5772 5773 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5774 /* If reset handler not implemented, continue; otherwise return */ 5775 if (r == -EOPNOTSUPP) 5776 r = 0; 5777 else 5778 return r; 5779 5780 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5781 if (!amdgpu_sriov_vf(adev)) { 5782 5783 if (!need_full_reset) 5784 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5785 5786 if (!need_full_reset && amdgpu_gpu_recovery && 5787 amdgpu_device_ip_check_soft_reset(adev)) { 5788 amdgpu_device_ip_pre_soft_reset(adev); 5789 r = amdgpu_device_ip_soft_reset(adev); 5790 amdgpu_device_ip_post_soft_reset(adev); 5791 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5792 
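/* Either the soft reset itself failed or some IP block is still hung afterwards; fall back to a full ASIC reset. */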
dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5793 need_full_reset = true; 5794 } 5795 } 5796 5797 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { 5798 dev_info(tmp_adev->dev, "Dumping IP State\n"); 5799 /* Trigger ip dump before we reset the asic */ 5800 for (i = 0; i < tmp_adev->num_ip_blocks; i++) 5801 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) 5802 tmp_adev->ip_blocks[i].version->funcs 5803 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]); 5804 dev_info(tmp_adev->dev, "Dumping IP State Completed\n"); 5805 } 5806 5807 if (need_full_reset) 5808 r = amdgpu_device_ip_suspend(adev); 5809 if (need_full_reset) 5810 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5811 else 5812 clear_bit(AMDGPU_NEED_FULL_RESET, 5813 &reset_context->flags); 5814 } 5815 5816 return r; 5817 } 5818 5819 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) 5820 { 5821 struct list_head *device_list_handle; 5822 bool full_reset, vram_lost = false; 5823 struct amdgpu_device *tmp_adev; 5824 int r, init_level; 5825 5826 device_list_handle = reset_context->reset_device_list; 5827 5828 if (!device_list_handle) 5829 return -EINVAL; 5830 5831 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5832 5833 /** 5834 * If it's reset on init, it's default init level, otherwise keep level 5835 * as recovery level. 5836 */ 5837 if (reset_context->method == AMD_RESET_METHOD_ON_INIT) 5838 init_level = AMDGPU_INIT_LEVEL_DEFAULT; 5839 else 5840 init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY; 5841 5842 r = 0; 5843 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5844 amdgpu_set_init_level(tmp_adev, init_level); 5845 if (full_reset) { 5846 /* post card */ 5847 amdgpu_ras_clear_err_state(tmp_adev); 5848 r = amdgpu_device_asic_init(tmp_adev); 5849 if (r) { 5850 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5851 } else { 5852 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5853 5854 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5855 if (r) 5856 goto out; 5857 5858 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5859 5860 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 5861 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job); 5862 5863 if (vram_lost) { 5864 dev_info( 5865 tmp_adev->dev, 5866 "VRAM is lost due to GPU reset!\n"); 5867 amdgpu_inc_vram_lost(tmp_adev); 5868 } 5869 5870 r = amdgpu_device_fw_loading(tmp_adev); 5871 if (r) 5872 return r; 5873 5874 r = amdgpu_xcp_restore_partition_mode( 5875 tmp_adev->xcp_mgr); 5876 if (r) 5877 goto out; 5878 5879 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5880 if (r) 5881 goto out; 5882 5883 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 5884 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5885 5886 r = amdgpu_device_ip_resume_phase3(tmp_adev); 5887 if (r) 5888 goto out; 5889 5890 if (vram_lost) 5891 amdgpu_device_fill_reset_magic(tmp_adev); 5892 5893 /* 5894 * Add this ASIC as tracked as reset was already 5895 * complete successfully. 5896 */ 5897 amdgpu_register_gpu_instance(tmp_adev); 5898 5899 if (!reset_context->hive && 5900 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5901 amdgpu_xgmi_add_device(tmp_adev); 5902 5903 r = amdgpu_device_ip_late_init(tmp_adev); 5904 if (r) 5905 goto out; 5906 5907 drm_client_dev_resume(adev_to_drm(tmp_adev), false); 5908 5909 /* 5910 * The GPU enters bad state once faulty pages 5911 * by ECC has reached the threshold, and ras 5912 * recovery is scheduled next. 
So add one check 5913 * here to break recovery if it indeed exceeds 5914 * bad page threshold, and remind user to 5915 * retire this GPU or setting one bigger 5916 * bad_page_threshold value to fix this once 5917 * probing driver again. 5918 */ 5919 if (!amdgpu_ras_is_rma(tmp_adev)) { 5920 /* must succeed. */ 5921 amdgpu_ras_resume(tmp_adev); 5922 } else { 5923 r = -EINVAL; 5924 goto out; 5925 } 5926 5927 /* Update PSP FW topology after reset */ 5928 if (reset_context->hive && 5929 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5930 r = amdgpu_xgmi_update_topology( 5931 reset_context->hive, tmp_adev); 5932 } 5933 } 5934 5935 out: 5936 if (!r) { 5937 /* IP init is complete now, set level as default */ 5938 amdgpu_set_init_level(tmp_adev, 5939 AMDGPU_INIT_LEVEL_DEFAULT); 5940 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5941 r = amdgpu_ib_ring_tests(tmp_adev); 5942 if (r) { 5943 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5944 r = -EAGAIN; 5945 goto end; 5946 } 5947 } 5948 5949 if (r) 5950 tmp_adev->asic_reset_res = r; 5951 } 5952 5953 end: 5954 return r; 5955 } 5956 5957 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5958 struct amdgpu_reset_context *reset_context) 5959 { 5960 struct amdgpu_device *tmp_adev = NULL; 5961 bool need_full_reset, skip_hw_reset; 5962 int r = 0; 5963 5964 /* Try reset handler method first */ 5965 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5966 reset_list); 5967 5968 reset_context->reset_device_list = device_list_handle; 5969 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5970 /* If reset handler not implemented, continue; otherwise return */ 5971 if (r == -EOPNOTSUPP) 5972 r = 0; 5973 else 5974 return r; 5975 5976 /* Reset handler not implemented, use the default method */ 5977 need_full_reset = 5978 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5979 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5980 5981 /* 5982 * ASIC reset has to be done on all XGMI hive nodes ASAP 5983 * to allow proper links negotiation in FW (within 1 sec) 5984 */ 5985 if (!skip_hw_reset && need_full_reset) { 5986 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5987 /* For XGMI run all resets in parallel to speed up the process */ 5988 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5989 if (!queue_work(system_unbound_wq, 5990 &tmp_adev->xgmi_reset_work)) 5991 r = -EALREADY; 5992 } else 5993 r = amdgpu_asic_reset(tmp_adev); 5994 5995 if (r) { 5996 dev_err(tmp_adev->dev, 5997 "ASIC reset failed with error, %d for drm dev, %s", 5998 r, adev_to_drm(tmp_adev)->unique); 5999 goto out; 6000 } 6001 } 6002 6003 /* For XGMI wait for all resets to complete before proceed */ 6004 if (!r) { 6005 list_for_each_entry(tmp_adev, device_list_handle, 6006 reset_list) { 6007 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 6008 flush_work(&tmp_adev->xgmi_reset_work); 6009 r = tmp_adev->asic_reset_res; 6010 if (r) 6011 break; 6012 } 6013 } 6014 } 6015 } 6016 6017 if (!r && amdgpu_ras_intr_triggered()) { 6018 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6019 amdgpu_ras_reset_error_count(tmp_adev, 6020 AMDGPU_RAS_BLOCK__MMHUB); 6021 } 6022 6023 amdgpu_ras_intr_cleared(); 6024 } 6025 6026 r = amdgpu_device_reinit_after_reset(reset_context); 6027 if (r == -EAGAIN) 6028 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 6029 else 6030 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 6031 6032 out: 6033 return r; 6034 } 6035 6036 static void 
amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 6037 { 6038 6039 switch (amdgpu_asic_reset_method(adev)) { 6040 case AMD_RESET_METHOD_MODE1: 6041 case AMD_RESET_METHOD_LINK: 6042 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 6043 break; 6044 case AMD_RESET_METHOD_MODE2: 6045 adev->mp1_state = PP_MP1_STATE_RESET; 6046 break; 6047 default: 6048 adev->mp1_state = PP_MP1_STATE_NONE; 6049 break; 6050 } 6051 } 6052 6053 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 6054 { 6055 amdgpu_vf_error_trans_all(adev); 6056 adev->mp1_state = PP_MP1_STATE_NONE; 6057 } 6058 6059 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 6060 { 6061 struct pci_dev *p = NULL; 6062 6063 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 6064 adev->pdev->bus->number, 1); 6065 if (p) { 6066 pm_runtime_enable(&(p->dev)); 6067 pm_runtime_resume(&(p->dev)); 6068 } 6069 6070 pci_dev_put(p); 6071 } 6072 6073 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 6074 { 6075 enum amd_reset_method reset_method; 6076 struct pci_dev *p = NULL; 6077 u64 expires; 6078 6079 /* 6080 * For now, only BACO and mode1 reset are confirmed 6081 * to suffer the audio issue without proper suspended. 6082 */ 6083 reset_method = amdgpu_asic_reset_method(adev); 6084 if ((reset_method != AMD_RESET_METHOD_BACO) && 6085 (reset_method != AMD_RESET_METHOD_MODE1)) 6086 return -EINVAL; 6087 6088 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 6089 adev->pdev->bus->number, 1); 6090 if (!p) 6091 return -ENODEV; 6092 6093 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 6094 if (!expires) 6095 /* 6096 * If we cannot get the audio device autosuspend delay, 6097 * a fixed 4S interval will be used. Considering 3S is 6098 * the audio controller default autosuspend delay setting. 6099 * 4S used here is guaranteed to cover that. 6100 */ 6101 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 6102 6103 while (!pm_runtime_status_suspended(&(p->dev))) { 6104 if (!pm_runtime_suspend(&(p->dev))) 6105 break; 6106 6107 if (expires < ktime_get_mono_fast_ns()) { 6108 dev_warn(adev->dev, "failed to suspend display audio\n"); 6109 pci_dev_put(p); 6110 /* TODO: abort the succeeding gpu reset? */ 6111 return -ETIMEDOUT; 6112 } 6113 } 6114 6115 pm_runtime_disable(&(p->dev)); 6116 6117 pci_dev_put(p); 6118 return 0; 6119 } 6120 6121 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 6122 { 6123 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 6124 6125 #if defined(CONFIG_DEBUG_FS) 6126 if (!amdgpu_sriov_vf(adev)) 6127 cancel_work(&adev->reset_work); 6128 #endif 6129 6130 if (adev->kfd.dev) 6131 cancel_work(&adev->kfd.reset_work); 6132 6133 if (amdgpu_sriov_vf(adev)) 6134 cancel_work(&adev->virt.flr_work); 6135 6136 if (con && adev->ras_enabled) 6137 cancel_work(&con->recovery_work); 6138 6139 } 6140 6141 static int amdgpu_device_health_check(struct list_head *device_list_handle) 6142 { 6143 struct amdgpu_device *tmp_adev; 6144 int ret = 0; 6145 6146 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6147 ret |= amdgpu_device_bus_status_check(tmp_adev); 6148 } 6149 6150 return ret; 6151 } 6152 6153 static int amdgpu_device_recovery_prepare(struct amdgpu_device *adev, 6154 struct list_head *device_list, 6155 struct amdgpu_hive_info *hive) 6156 { 6157 struct amdgpu_device *tmp_adev = NULL; 6158 int r; 6159 6160 /* 6161 * Build list of devices to reset. 
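	 * On bare metal, all devices in an XGMI hive have to be reset together,
	 * so the whole hive is gathered into the list.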
6162 * In case we are in XGMI hive mode, resort the device list 6163 * to put adev in the 1st position. 6164 */ 6165 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) { 6166 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 6167 list_add_tail(&tmp_adev->reset_list, device_list); 6168 if (adev->shutdown) 6169 tmp_adev->shutdown = true; 6170 if (adev->pcie_reset_ctx.occurs_dpc) 6171 tmp_adev->pcie_reset_ctx.in_link_reset = true; 6172 } 6173 if (!list_is_first(&adev->reset_list, device_list)) 6174 list_rotate_to_front(&adev->reset_list, device_list); 6175 } else { 6176 list_add_tail(&adev->reset_list, device_list); 6177 } 6178 6179 if (!amdgpu_sriov_vf(adev) && (!adev->pcie_reset_ctx.occurs_dpc)) { 6180 r = amdgpu_device_health_check(device_list); 6181 if (r) 6182 return r; 6183 } 6184 6185 return 0; 6186 } 6187 6188 static void amdgpu_device_recovery_get_reset_lock(struct amdgpu_device *adev, 6189 struct list_head *device_list) 6190 { 6191 struct amdgpu_device *tmp_adev = NULL; 6192 6193 if (list_empty(device_list)) 6194 return; 6195 tmp_adev = 6196 list_first_entry(device_list, struct amdgpu_device, reset_list); 6197 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 6198 } 6199 6200 static void amdgpu_device_recovery_put_reset_lock(struct amdgpu_device *adev, 6201 struct list_head *device_list) 6202 { 6203 struct amdgpu_device *tmp_adev = NULL; 6204 6205 if (list_empty(device_list)) 6206 return; 6207 tmp_adev = 6208 list_first_entry(device_list, struct amdgpu_device, reset_list); 6209 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 6210 } 6211 6212 static void amdgpu_device_halt_activities(struct amdgpu_device *adev, 6213 struct amdgpu_job *job, 6214 struct amdgpu_reset_context *reset_context, 6215 struct list_head *device_list, 6216 struct amdgpu_hive_info *hive, 6217 bool need_emergency_restart) 6218 { 6219 struct amdgpu_device *tmp_adev = NULL; 6220 int i; 6221 6222 /* block all schedulers and reset given job's ring */ 6223 list_for_each_entry(tmp_adev, device_list, reset_list) { 6224 amdgpu_device_set_mp1_state(tmp_adev); 6225 6226 /* 6227 * Try to put the audio codec into suspend state 6228 * before gpu reset started. 6229 * 6230 * Due to the power domain of the graphics device 6231 * is shared with AZ power domain. Without this, 6232 * we may change the audio hardware from behind 6233 * the audio driver's back. That will trigger 6234 * some audio codec errors. 6235 */ 6236 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 6237 tmp_adev->pcie_reset_ctx.audio_suspended = true; 6238 6239 amdgpu_ras_set_error_query_ready(tmp_adev, false); 6240 6241 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 6242 6243 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context); 6244 6245 /* 6246 * Mark these ASICs to be reset as untracked first 6247 * And add them back after reset completed 6248 */ 6249 amdgpu_unregister_gpu_instance(tmp_adev); 6250 6251 drm_client_dev_suspend(adev_to_drm(tmp_adev), false); 6252 6253 /* disable ras on ALL IPs */ 6254 if (!need_emergency_restart && 6255 (!adev->pcie_reset_ctx.occurs_dpc) && 6256 amdgpu_device_ip_need_full_reset(tmp_adev)) 6257 amdgpu_ras_suspend(tmp_adev); 6258 6259 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6260 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6261 6262 if (!amdgpu_ring_sched_ready(ring)) 6263 continue; 6264 6265 drm_sched_stop(&ring->sched, job ? 
&job->base : NULL); 6266 6267 if (need_emergency_restart) 6268 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 6269 } 6270 atomic_inc(&tmp_adev->gpu_reset_counter); 6271 } 6272 } 6273 6274 static int amdgpu_device_asic_reset(struct amdgpu_device *adev, 6275 struct list_head *device_list, 6276 struct amdgpu_reset_context *reset_context) 6277 { 6278 struct amdgpu_device *tmp_adev = NULL; 6279 int retry_limit = AMDGPU_MAX_RETRY_LIMIT; 6280 int r = 0; 6281 6282 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 6283 list_for_each_entry(tmp_adev, device_list, reset_list) { 6284 if (adev->pcie_reset_ctx.occurs_dpc) 6285 tmp_adev->no_hw_access = true; 6286 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 6287 if (adev->pcie_reset_ctx.occurs_dpc) 6288 tmp_adev->no_hw_access = false; 6289 /*TODO Should we stop ?*/ 6290 if (r) { 6291 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 6292 r, adev_to_drm(tmp_adev)->unique); 6293 tmp_adev->asic_reset_res = r; 6294 } 6295 } 6296 6297 /* Actual ASIC resets if needed.*/ 6298 /* Host driver will handle XGMI hive reset for SRIOV */ 6299 if (amdgpu_sriov_vf(adev)) { 6300 6301 /* Bail out of reset early */ 6302 if (amdgpu_ras_is_rma(adev)) 6303 return -ENODEV; 6304 6305 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { 6306 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); 6307 amdgpu_ras_set_fed(adev, true); 6308 set_bit(AMDGPU_HOST_FLR, &reset_context->flags); 6309 } 6310 6311 r = amdgpu_device_reset_sriov(adev, reset_context); 6312 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) { 6313 amdgpu_virt_release_full_gpu(adev, true); 6314 goto retry; 6315 } 6316 if (r) 6317 adev->asic_reset_res = r; 6318 } else { 6319 r = amdgpu_do_asic_reset(device_list, reset_context); 6320 if (r && r == -EAGAIN) 6321 goto retry; 6322 } 6323 6324 list_for_each_entry(tmp_adev, device_list, reset_list) { 6325 /* 6326 * Drop any pending non scheduler resets queued before reset is done. 6327 * Any reset scheduled after this point would be valid. Scheduler resets 6328 * were already dropped during drm_sched_stop and no new ones can come 6329 * in before drm_sched_start. 6330 */ 6331 amdgpu_device_stop_pending_resets(tmp_adev); 6332 } 6333 6334 return r; 6335 } 6336 6337 static int amdgpu_device_sched_resume(struct list_head *device_list, 6338 struct amdgpu_reset_context *reset_context, 6339 bool job_signaled) 6340 { 6341 struct amdgpu_device *tmp_adev = NULL; 6342 int i, r = 0; 6343 6344 /* Post ASIC reset for all devs .*/ 6345 list_for_each_entry(tmp_adev, device_list, reset_list) { 6346 6347 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6348 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6349 6350 if (!amdgpu_ring_sched_ready(ring)) 6351 continue; 6352 6353 drm_sched_start(&ring->sched, 0); 6354 } 6355 6356 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 6357 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 6358 6359 if (tmp_adev->asic_reset_res) 6360 r = tmp_adev->asic_reset_res; 6361 6362 tmp_adev->asic_reset_res = 0; 6363 6364 if (r) { 6365 /* bad news, how to tell it to userspace ? 
6366 * for ras error, we should report GPU bad status instead of 6367 * reset failure 6368 */ 6369 if (reset_context->src != AMDGPU_RESET_SRC_RAS || 6370 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) 6371 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", 6372 atomic_read(&tmp_adev->gpu_reset_counter)); 6373 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 6374 } else { 6375 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 6376 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 6377 dev_warn(tmp_adev->dev, 6378 "smart shift update failed\n"); 6379 } 6380 } 6381 6382 return r; 6383 } 6384 6385 static void amdgpu_device_gpu_resume(struct amdgpu_device *adev, 6386 struct list_head *device_list, 6387 bool need_emergency_restart) 6388 { 6389 struct amdgpu_device *tmp_adev = NULL; 6390 6391 list_for_each_entry(tmp_adev, device_list, reset_list) { 6392 /* unlock kfd: SRIOV would do it separately */ 6393 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 6394 amdgpu_amdkfd_post_reset(tmp_adev); 6395 6396 /* kfd_post_reset will do nothing if kfd device is not initialized, 6397 * need to bring up kfd here if it's not be initialized before 6398 */ 6399 if (!adev->kfd.init_complete) 6400 amdgpu_amdkfd_device_init(adev); 6401 6402 if (tmp_adev->pcie_reset_ctx.audio_suspended) 6403 amdgpu_device_resume_display_audio(tmp_adev); 6404 6405 amdgpu_device_unset_mp1_state(tmp_adev); 6406 6407 amdgpu_ras_set_error_query_ready(tmp_adev, true); 6408 6409 } 6410 } 6411 6412 6413 /** 6414 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 6415 * 6416 * @adev: amdgpu_device pointer 6417 * @job: which job trigger hang 6418 * @reset_context: amdgpu reset context pointer 6419 * 6420 * Attempt to reset the GPU if it has hung (all asics). 6421 * Attempt to do soft-reset or full-reset and reinitialize Asic 6422 * Returns 0 for success or an error on failure. 6423 */ 6424 6425 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 6426 struct amdgpu_job *job, 6427 struct amdgpu_reset_context *reset_context) 6428 { 6429 struct list_head device_list; 6430 bool job_signaled = false; 6431 struct amdgpu_hive_info *hive = NULL; 6432 int r = 0; 6433 bool need_emergency_restart = false; 6434 6435 /* 6436 * If it reaches here because of hang/timeout and a RAS error is 6437 * detected at the same time, let RAS recovery take care of it. 6438 */ 6439 if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) && 6440 !amdgpu_sriov_vf(adev) && 6441 reset_context->src != AMDGPU_RESET_SRC_RAS) { 6442 dev_dbg(adev->dev, 6443 "Gpu recovery from source: %d yielding to RAS error recovery handling", 6444 reset_context->src); 6445 return 0; 6446 } 6447 6448 /* 6449 * Special case: RAS triggered and full reset isn't supported 6450 */ 6451 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 6452 6453 /* 6454 * Flush RAM to disk so that after reboot 6455 * the user can read log and see why the system rebooted. 6456 */ 6457 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 6458 amdgpu_ras_get_context(adev)->reboot) { 6459 dev_warn(adev->dev, "Emergency reboot."); 6460 6461 ksys_sync_helper(); 6462 emergency_restart(); 6463 } 6464 6465 dev_info(adev->dev, "GPU %s begin!\n", 6466 need_emergency_restart ? 
"jobs stop":"reset"); 6467 6468 if (!amdgpu_sriov_vf(adev)) 6469 hive = amdgpu_get_xgmi_hive(adev); 6470 if (hive) 6471 mutex_lock(&hive->hive_lock); 6472 6473 reset_context->job = job; 6474 reset_context->hive = hive; 6475 INIT_LIST_HEAD(&device_list); 6476 6477 if (amdgpu_device_recovery_prepare(adev, &device_list, hive)) 6478 goto end_reset; 6479 6480 /* We need to lock reset domain only once both for XGMI and single device */ 6481 amdgpu_device_recovery_get_reset_lock(adev, &device_list); 6482 6483 amdgpu_device_halt_activities(adev, job, reset_context, &device_list, 6484 hive, need_emergency_restart); 6485 if (need_emergency_restart) 6486 goto skip_sched_resume; 6487 /* 6488 * Must check guilty signal here since after this point all old 6489 * HW fences are force signaled. 6490 * 6491 * job->base holds a reference to parent fence 6492 */ 6493 if (job && dma_fence_is_signaled(&job->hw_fence.base)) { 6494 job_signaled = true; 6495 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 6496 goto skip_hw_reset; 6497 } 6498 6499 r = amdgpu_device_asic_reset(adev, &device_list, reset_context); 6500 if (r) 6501 goto reset_unlock; 6502 skip_hw_reset: 6503 r = amdgpu_device_sched_resume(&device_list, reset_context, job_signaled); 6504 if (r) 6505 goto reset_unlock; 6506 skip_sched_resume: 6507 amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart); 6508 reset_unlock: 6509 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 6510 end_reset: 6511 if (hive) { 6512 mutex_unlock(&hive->hive_lock); 6513 amdgpu_put_xgmi_hive(hive); 6514 } 6515 6516 if (r) 6517 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 6518 6519 atomic_set(&adev->reset_domain->reset_res, r); 6520 6521 if (!r) { 6522 struct amdgpu_task_info *ti = NULL; 6523 6524 if (job) 6525 ti = amdgpu_vm_get_task_info_pasid(adev, job->pasid); 6526 6527 drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, 6528 ti ? &ti->task : NULL); 6529 6530 amdgpu_vm_put_task_info(ti); 6531 } 6532 6533 return r; 6534 } 6535 6536 /** 6537 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner 6538 * 6539 * @adev: amdgpu_device pointer 6540 * @speed: pointer to the speed of the link 6541 * @width: pointer to the width of the link 6542 * 6543 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6544 * first physical partner to an AMD dGPU. 6545 * This will exclude any virtual switches and links. 
 */
static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
					    enum pci_bus_speed *speed,
					    enum pcie_link_width *width)
{
	struct pci_dev *parent = adev->pdev;

	if (!speed || !width)
		return;

	*speed = PCI_SPEED_UNKNOWN;
	*width = PCIE_LNK_WIDTH_UNKNOWN;

	if (amdgpu_device_pcie_dynamic_switching_supported(adev)) {
		while ((parent = pci_upstream_bridge(parent))) {
			/* skip upstream/downstream switches internal to dGPU */
			if (parent->vendor == PCI_VENDOR_ID_ATI)
				continue;
			*speed = pcie_get_speed_cap(parent);
			*width = pcie_get_width_cap(parent);
			break;
		}
	} else {
		/* use the current speeds rather than max if switching is not supported */
		pcie_bandwidth_available(adev->pdev, NULL, speed, width);
	}
}

/**
 * amdgpu_device_gpu_bandwidth - find the bandwidth of the GPU
 *
 * @adev: amdgpu_device pointer
 * @speed: pointer to the speed of the link
 * @width: pointer to the width of the link
 *
 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
 * AMD dGPU which may be a virtual upstream bridge.
 */
static void amdgpu_device_gpu_bandwidth(struct amdgpu_device *adev,
					enum pci_bus_speed *speed,
					enum pcie_link_width *width)
{
	struct pci_dev *parent = adev->pdev;

	if (!speed || !width)
		return;

	parent = pci_upstream_bridge(parent);
	if (parent && parent->vendor == PCI_VENDOR_ID_ATI) {
		/* use the upstream/downstream switches internal to dGPU */
		*speed = pcie_get_speed_cap(parent);
		*width = pcie_get_width_cap(parent);
		while ((parent = pci_upstream_bridge(parent))) {
			if (parent->vendor == PCI_VENDOR_ID_ATI) {
				/* use the upstream/downstream switches internal to dGPU */
				*speed = pcie_get_speed_cap(parent);
				*width = pcie_get_width_cap(parent);
			}
		}
	} else {
		/* use the device itself */
		*speed = pcie_get_speed_cap(adev->pdev);
		*width = pcie_get_width_cap(adev->pdev);
	}
}

/**
 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIE capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIE config space may not be available.
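 * The results are cached in adev->pm.pcie_gen_mask and adev->pm.pcie_mlw_mask;
 * if the amdgpu_pcie_gen_cap or amdgpu_pcie_lane_cap overrides are set, they
 * are used instead of the probed values.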
6620 */ 6621 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 6622 { 6623 enum pci_bus_speed speed_cap, platform_speed_cap; 6624 enum pcie_link_width platform_link_width, link_width; 6625 6626 if (amdgpu_pcie_gen_cap) 6627 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 6628 6629 if (amdgpu_pcie_lane_cap) 6630 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 6631 6632 /* covers APUs as well */ 6633 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 6634 if (adev->pm.pcie_gen_mask == 0) 6635 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 6636 if (adev->pm.pcie_mlw_mask == 0) 6637 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 6638 return; 6639 } 6640 6641 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 6642 return; 6643 6644 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap, 6645 &platform_link_width); 6646 amdgpu_device_gpu_bandwidth(adev, &speed_cap, &link_width); 6647 6648 if (adev->pm.pcie_gen_mask == 0) { 6649 /* asic caps */ 6650 if (speed_cap == PCI_SPEED_UNKNOWN) { 6651 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6652 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6653 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6654 } else { 6655 if (speed_cap == PCIE_SPEED_32_0GT) 6656 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6657 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6658 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6659 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6660 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 6661 else if (speed_cap == PCIE_SPEED_16_0GT) 6662 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6663 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6664 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6665 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 6666 else if (speed_cap == PCIE_SPEED_8_0GT) 6667 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6668 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6669 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6670 else if (speed_cap == PCIE_SPEED_5_0GT) 6671 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6672 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 6673 else 6674 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 6675 } 6676 /* platform caps */ 6677 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 6678 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6679 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6680 } else { 6681 if (platform_speed_cap == PCIE_SPEED_32_0GT) 6682 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6683 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6684 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6685 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6686 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 6687 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 6688 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6689 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6690 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6691 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 6692 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 6693 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6694 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6695 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 6696 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 6697 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6698 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6699 else 6700 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 6701 6702 } 6703 } 6704 if (adev->pm.pcie_mlw_mask == 0) { 6705 /* asic caps */ 6706 if (link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6707 
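			/* Link width is unknown, advertise the default set of ASIC lane widths */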
adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_ASIC_PCIE_MLW_MASK; 6708 } else { 6709 switch (link_width) { 6710 case PCIE_LNK_X32: 6711 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X32 | 6712 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6713 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6714 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6715 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6716 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6717 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6718 break; 6719 case PCIE_LNK_X16: 6720 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6721 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6722 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6723 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6724 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6725 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6726 break; 6727 case PCIE_LNK_X12: 6728 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6729 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6730 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6731 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6732 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6733 break; 6734 case PCIE_LNK_X8: 6735 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6736 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6737 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6738 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6739 break; 6740 case PCIE_LNK_X4: 6741 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6742 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6743 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6744 break; 6745 case PCIE_LNK_X2: 6746 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6747 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6748 break; 6749 case PCIE_LNK_X1: 6750 adev->pm.pcie_mlw_mask |= CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1; 6751 break; 6752 default: 6753 break; 6754 } 6755 } 6756 /* platform caps */ 6757 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6758 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 6759 } else { 6760 switch (platform_link_width) { 6761 case PCIE_LNK_X32: 6762 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 6763 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6764 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6765 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6766 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6767 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6768 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6769 break; 6770 case PCIE_LNK_X16: 6771 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6772 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6773 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6774 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6775 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6776 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6777 break; 6778 case PCIE_LNK_X12: 6779 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6780 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6781 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6782 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6783 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6784 break; 6785 case PCIE_LNK_X8: 6786 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6787 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6788 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6789 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6790 break; 6791 case PCIE_LNK_X4: 6792 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6793 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6794 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6795 break; 6796 case PCIE_LNK_X2: 6797 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6798 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6799 break; 6800 case PCIE_LNK_X1: 6801 adev->pm.pcie_mlw_mask |= CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 6802 break; 6803 
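			/* An unrecognized link width adds no platform width bits */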
default: 6804 break; 6805 } 6806 } 6807 } 6808 } 6809 6810 /** 6811 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 6812 * 6813 * @adev: amdgpu_device pointer 6814 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 6815 * 6816 * Return true if @peer_adev can access (DMA) @adev through the PCIe 6817 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 6818 * @peer_adev. 6819 */ 6820 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 6821 struct amdgpu_device *peer_adev) 6822 { 6823 #ifdef CONFIG_HSA_AMD_P2P 6824 bool p2p_access = 6825 !adev->gmc.xgmi.connected_to_cpu && 6826 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 6827 if (!p2p_access) 6828 dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n", 6829 pci_name(peer_adev->pdev)); 6830 6831 bool is_large_bar = adev->gmc.visible_vram_size && 6832 adev->gmc.real_vram_size == adev->gmc.visible_vram_size; 6833 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev); 6834 6835 if (!p2p_addressable) { 6836 uint64_t address_mask = peer_adev->dev->dma_mask ? 6837 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 6838 resource_size_t aper_limit = 6839 adev->gmc.aper_base + adev->gmc.aper_size - 1; 6840 6841 p2p_addressable = !(adev->gmc.aper_base & address_mask || 6842 aper_limit & address_mask); 6843 } 6844 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable; 6845 #else 6846 return false; 6847 #endif 6848 } 6849 6850 int amdgpu_device_baco_enter(struct drm_device *dev) 6851 { 6852 struct amdgpu_device *adev = drm_to_adev(dev); 6853 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6854 6855 if (!amdgpu_device_supports_baco(dev)) 6856 return -ENOTSUPP; 6857 6858 if (ras && adev->ras_enabled && 6859 adev->nbio.funcs->enable_doorbell_interrupt) 6860 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 6861 6862 return amdgpu_dpm_baco_enter(adev); 6863 } 6864 6865 int amdgpu_device_baco_exit(struct drm_device *dev) 6866 { 6867 struct amdgpu_device *adev = drm_to_adev(dev); 6868 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6869 int ret = 0; 6870 6871 if (!amdgpu_device_supports_baco(dev)) 6872 return -ENOTSUPP; 6873 6874 ret = amdgpu_dpm_baco_exit(adev); 6875 if (ret) 6876 return ret; 6877 6878 if (ras && adev->ras_enabled && 6879 adev->nbio.funcs->enable_doorbell_interrupt) 6880 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 6881 6882 if (amdgpu_passthrough(adev) && adev->nbio.funcs && 6883 adev->nbio.funcs->clear_doorbell_interrupt) 6884 adev->nbio.funcs->clear_doorbell_interrupt(adev); 6885 6886 return 0; 6887 } 6888 6889 /** 6890 * amdgpu_pci_error_detected - Called when a PCI error is detected. 6891 * @pdev: PCI device struct 6892 * @state: PCI channel state 6893 * 6894 * Description: Called when a PCI error is detected. 6895 * 6896 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
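 * PCI_ERS_RESULT_CAN_RECOVER is returned for pci_channel_io_normal, where
 * device access still works and no slot reset is required.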
6897 */ 6898 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 6899 { 6900 struct drm_device *dev = pci_get_drvdata(pdev); 6901 struct amdgpu_device *adev = drm_to_adev(dev); 6902 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 6903 struct amdgpu_reset_context reset_context; 6904 struct list_head device_list; 6905 6906 dev_info(adev->dev, "PCI error: detected callback!!\n"); 6907 6908 if (!amdgpu_dpm_is_link_reset_supported(adev)) { 6909 dev_warn(adev->dev, "No support for XGMI hive yet...\n"); 6910 return PCI_ERS_RESULT_DISCONNECT; 6911 } 6912 6913 adev->pci_channel_state = state; 6914 6915 switch (state) { 6916 case pci_channel_io_normal: 6917 dev_info(adev->dev, "pci_channel_io_normal: state(%d)!!\n", state); 6918 return PCI_ERS_RESULT_CAN_RECOVER; 6919 case pci_channel_io_frozen: 6920 /* Fatal error, prepare for slot reset */ 6921 dev_info(adev->dev, "pci_channel_io_frozen: state(%d)!!\n", state); 6922 6923 if (hive) 6924 mutex_lock(&hive->hive_lock); 6925 adev->pcie_reset_ctx.occurs_dpc = true; 6926 memset(&reset_context, 0, sizeof(reset_context)); 6927 INIT_LIST_HEAD(&device_list); 6928 6929 amdgpu_device_recovery_prepare(adev, &device_list, hive); 6930 amdgpu_device_recovery_get_reset_lock(adev, &device_list); 6931 amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list, 6932 hive, false); 6933 if (hive) { 6934 mutex_unlock(&hive->hive_lock); 6935 amdgpu_put_xgmi_hive(hive); 6936 } 6937 return PCI_ERS_RESULT_NEED_RESET; 6938 case pci_channel_io_perm_failure: 6939 /* Permanent error, prepare for device removal */ 6940 dev_info(adev->dev, "pci_channel_io_perm_failure: state(%d)!!\n", state); 6941 return PCI_ERS_RESULT_DISCONNECT; 6942 } 6943 6944 return PCI_ERS_RESULT_NEED_RESET; 6945 } 6946 6947 /** 6948 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 6949 * @pdev: pointer to PCI device 6950 */ 6951 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 6952 { 6953 struct drm_device *dev = pci_get_drvdata(pdev); 6954 struct amdgpu_device *adev = drm_to_adev(dev); 6955 6956 dev_info(adev->dev, "PCI error: mmio enabled callback!!\n"); 6957 6958 /* TODO - dump whatever for debugging purposes */ 6959 6960 /* This called only if amdgpu_pci_error_detected returns 6961 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 6962 * works, no need to reset slot. 6963 */ 6964 6965 return PCI_ERS_RESULT_RECOVERED; 6966 } 6967 6968 /** 6969 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 6970 * @pdev: PCI device struct 6971 * 6972 * Description: This routine is called by the pci error recovery 6973 * code after the PCI slot has been reset, just before we 6974 * should resume normal operations. 
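 * Restores the saved PCI config space, waits for the ASIC to come back out of
 * reset and then re-initializes the device(s).
 *
 * Return: PCI_ERS_RESULT_RECOVERED on success, PCI_ERS_RESULT_DISCONNECT otherwise.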
6975 */ 6976 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 6977 { 6978 struct drm_device *dev = pci_get_drvdata(pdev); 6979 struct amdgpu_device *adev = drm_to_adev(dev); 6980 struct amdgpu_reset_context reset_context; 6981 struct amdgpu_device *tmp_adev; 6982 struct amdgpu_hive_info *hive; 6983 struct list_head device_list; 6984 int r = 0, i; 6985 u32 memsize; 6986 6987 /* PCI error slot reset should be skipped During RAS recovery */ 6988 if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 6989 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && 6990 amdgpu_ras_in_recovery(adev)) 6991 return PCI_ERS_RESULT_RECOVERED; 6992 6993 dev_info(adev->dev, "PCI error: slot reset callback!!\n"); 6994 6995 memset(&reset_context, 0, sizeof(reset_context)); 6996 6997 /* wait for asic to come out of reset */ 6998 msleep(700); 6999 7000 /* Restore PCI confspace */ 7001 amdgpu_device_load_pci_state(pdev); 7002 7003 /* confirm ASIC came out of reset */ 7004 for (i = 0; i < adev->usec_timeout; i++) { 7005 memsize = amdgpu_asic_get_config_memsize(adev); 7006 7007 if (memsize != 0xffffffff) 7008 break; 7009 udelay(1); 7010 } 7011 if (memsize == 0xffffffff) { 7012 r = -ETIME; 7013 goto out; 7014 } 7015 7016 reset_context.method = AMD_RESET_METHOD_NONE; 7017 reset_context.reset_req_dev = adev; 7018 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 7019 set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags); 7020 INIT_LIST_HEAD(&device_list); 7021 7022 hive = amdgpu_get_xgmi_hive(adev); 7023 if (hive) { 7024 mutex_lock(&hive->hive_lock); 7025 reset_context.hive = hive; 7026 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 7027 tmp_adev->pcie_reset_ctx.in_link_reset = true; 7028 list_add_tail(&tmp_adev->reset_list, &device_list); 7029 } 7030 } else { 7031 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 7032 list_add_tail(&adev->reset_list, &device_list); 7033 } 7034 7035 r = amdgpu_device_asic_reset(adev, &device_list, &reset_context); 7036 out: 7037 if (!r) { 7038 if (amdgpu_device_cache_pci_state(adev->pdev)) 7039 pci_restore_state(adev->pdev); 7040 dev_info(adev->dev, "PCIe error recovery succeeded\n"); 7041 } else { 7042 dev_err(adev->dev, "PCIe error recovery failed, err:%d\n", r); 7043 if (hive) { 7044 list_for_each_entry(tmp_adev, &device_list, reset_list) 7045 amdgpu_device_unset_mp1_state(tmp_adev); 7046 } 7047 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 7048 } 7049 7050 if (hive) { 7051 mutex_unlock(&hive->hive_lock); 7052 amdgpu_put_xgmi_hive(hive); 7053 } 7054 7055 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 7056 } 7057 7058 /** 7059 * amdgpu_pci_resume() - resume normal ops after PCI reset 7060 * @pdev: pointer to PCI device 7061 * 7062 * Called when the error recovery driver tells us that its 7063 * OK to resume normal operation. 
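 * Restarts the schedulers and completes the recovery sequence; this is only
 * done when the preceding error state was pci_channel_io_frozen.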
7064 */ 7065 void amdgpu_pci_resume(struct pci_dev *pdev) 7066 { 7067 struct drm_device *dev = pci_get_drvdata(pdev); 7068 struct amdgpu_device *adev = drm_to_adev(dev); 7069 struct list_head device_list; 7070 struct amdgpu_hive_info *hive = NULL; 7071 struct amdgpu_device *tmp_adev = NULL; 7072 7073 dev_info(adev->dev, "PCI error: resume callback!!\n"); 7074 7075 /* Only continue execution for the case of pci_channel_io_frozen */ 7076 if (adev->pci_channel_state != pci_channel_io_frozen) 7077 return; 7078 7079 INIT_LIST_HEAD(&device_list); 7080 7081 hive = amdgpu_get_xgmi_hive(adev); 7082 if (hive) { 7083 mutex_lock(&hive->hive_lock); 7084 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 7085 tmp_adev->pcie_reset_ctx.in_link_reset = false; 7086 list_add_tail(&tmp_adev->reset_list, &device_list); 7087 } 7088 } else 7089 list_add_tail(&adev->reset_list, &device_list); 7090 7091 amdgpu_device_sched_resume(&device_list, NULL, NULL); 7092 amdgpu_device_gpu_resume(adev, &device_list, false); 7093 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 7094 adev->pcie_reset_ctx.occurs_dpc = false; 7095 7096 if (hive) { 7097 mutex_unlock(&hive->hive_lock); 7098 amdgpu_put_xgmi_hive(hive); 7099 } 7100 } 7101 7102 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 7103 { 7104 struct drm_device *dev = pci_get_drvdata(pdev); 7105 struct amdgpu_device *adev = drm_to_adev(dev); 7106 int r; 7107 7108 if (amdgpu_sriov_vf(adev)) 7109 return false; 7110 7111 r = pci_save_state(pdev); 7112 if (!r) { 7113 kfree(adev->pci_state); 7114 7115 adev->pci_state = pci_store_saved_state(pdev); 7116 7117 if (!adev->pci_state) { 7118 dev_err(adev->dev, "Failed to store PCI saved state"); 7119 return false; 7120 } 7121 } else { 7122 dev_warn(adev->dev, "Failed to save PCI state, err:%d\n", r); 7123 return false; 7124 } 7125 7126 return true; 7127 } 7128 7129 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 7130 { 7131 struct drm_device *dev = pci_get_drvdata(pdev); 7132 struct amdgpu_device *adev = drm_to_adev(dev); 7133 int r; 7134 7135 if (!adev->pci_state) 7136 return false; 7137 7138 r = pci_load_saved_state(pdev, adev->pci_state); 7139 7140 if (!r) { 7141 pci_restore_state(pdev); 7142 } else { 7143 dev_warn(adev->dev, "Failed to load PCI state, err:%d\n", r); 7144 return false; 7145 } 7146 7147 return true; 7148 } 7149 7150 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 7151 struct amdgpu_ring *ring) 7152 { 7153 #ifdef CONFIG_X86_64 7154 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 7155 return; 7156 #endif 7157 if (adev->gmc.xgmi.connected_to_cpu) 7158 return; 7159 7160 if (ring && ring->funcs->emit_hdp_flush) 7161 amdgpu_ring_emit_hdp_flush(ring); 7162 else 7163 amdgpu_asic_flush_hdp(adev, ring); 7164 } 7165 7166 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 7167 struct amdgpu_ring *ring) 7168 { 7169 #ifdef CONFIG_X86_64 7170 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 7171 return; 7172 #endif 7173 if (adev->gmc.xgmi.connected_to_cpu) 7174 return; 7175 7176 amdgpu_asic_invalidate_hdp(adev, ring); 7177 } 7178 7179 int amdgpu_in_reset(struct amdgpu_device *adev) 7180 { 7181 return atomic_read(&adev->reset_domain->in_gpu_reset); 7182 } 7183 7184 /** 7185 * amdgpu_device_halt() - bring hardware to some kind of halt state 7186 * 7187 * @adev: amdgpu_device pointer 7188 * 7189 * Bring hardware to some kind of halt state so that no one can touch it 7190 * any more. It will help to maintain error context when error occurred. 
 * Compared to a simple hang, the system will stay stable at least for SSH
 * access. Then it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc),
 * clears all CPU mappings to the device and disallows remappings through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 * flush any in flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	amdgpu_xcp_dev_unplug(adev);
	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				 u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				  u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_gang - return a reference to the current gang
 * @adev: amdgpu_device pointer
 *
 * Returns: A new reference to the current gang leader.
 */
struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev)
{
	struct dma_fence *fence;

	rcu_read_lock();
	fence = dma_fence_get_rcu_safe(&adev->gang_submit);
	rcu_read_unlock();
	return fence;
}

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
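 *
 * If a fence is returned, the caller is expected to wait for it and then retry
 * the switch. Illustrative sketch only (the returned fence is normally handled
 * as a scheduler dependency instead):
 *
 *	fence = amdgpu_device_switch_gang(adev, gang);
 *	if (fence) {
 *		dma_fence_wait(fence, false);
 *		dma_fence_put(fence);
 *		// retry amdgpu_device_switch_gang()
 *	}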
7281 */ 7282 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev, 7283 struct dma_fence *gang) 7284 { 7285 struct dma_fence *old = NULL; 7286 7287 dma_fence_get(gang); 7288 do { 7289 dma_fence_put(old); 7290 old = amdgpu_device_get_gang(adev); 7291 if (old == gang) 7292 break; 7293 7294 if (!dma_fence_is_signaled(old)) { 7295 dma_fence_put(gang); 7296 return old; 7297 } 7298 7299 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit, 7300 old, gang) != old); 7301 7302 /* 7303 * Drop it once for the exchanged reference in adev and once for the 7304 * thread local reference acquired in amdgpu_device_get_gang(). 7305 */ 7306 dma_fence_put(old); 7307 dma_fence_put(old); 7308 return NULL; 7309 } 7310 7311 /** 7312 * amdgpu_device_enforce_isolation - enforce HW isolation 7313 * @adev: the amdgpu device pointer 7314 * @ring: the HW ring the job is supposed to run on 7315 * @job: the job which is about to be pushed to the HW ring 7316 * 7317 * Makes sure that only one client at a time can use the GFX block. 7318 * Returns: The dependency to wait on before the job can be pushed to the HW. 7319 * The function is called multiple times until NULL is returned. 7320 */ 7321 struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev, 7322 struct amdgpu_ring *ring, 7323 struct amdgpu_job *job) 7324 { 7325 struct amdgpu_isolation *isolation = &adev->isolation[ring->xcp_id]; 7326 struct drm_sched_fence *f = job->base.s_fence; 7327 struct dma_fence *dep; 7328 void *owner; 7329 int r; 7330 7331 /* 7332 * For now enforce isolation only for the GFX block since we only need 7333 * the cleaner shader on those rings. 7334 */ 7335 if (ring->funcs->type != AMDGPU_RING_TYPE_GFX && 7336 ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE) 7337 return NULL; 7338 7339 /* 7340 * All submissions where enforce isolation is false are handled as if 7341 * they come from a single client. Use ~0l as the owner to distinct it 7342 * from kernel submissions where the owner is NULL. 7343 */ 7344 owner = job->enforce_isolation ? f->owner : (void *)~0l; 7345 7346 mutex_lock(&adev->enforce_isolation_mutex); 7347 7348 /* 7349 * The "spearhead" submission is the first one which changes the 7350 * ownership to its client. We always need to wait for it to be 7351 * pushed to the HW before proceeding with anything. 7352 */ 7353 if (&f->scheduled != isolation->spearhead && 7354 !dma_fence_is_signaled(isolation->spearhead)) { 7355 dep = isolation->spearhead; 7356 goto out_grab_ref; 7357 } 7358 7359 if (isolation->owner != owner) { 7360 7361 /* 7362 * Wait for any gang to be assembled before switching to a 7363 * different owner or otherwise we could deadlock the 7364 * submissions. 7365 */ 7366 if (!job->gang_submit) { 7367 dep = amdgpu_device_get_gang(adev); 7368 if (!dma_fence_is_signaled(dep)) 7369 goto out_return_dep; 7370 dma_fence_put(dep); 7371 } 7372 7373 dma_fence_put(isolation->spearhead); 7374 isolation->spearhead = dma_fence_get(&f->scheduled); 7375 amdgpu_sync_move(&isolation->active, &isolation->prev); 7376 trace_amdgpu_isolation(isolation->owner, owner); 7377 isolation->owner = owner; 7378 } 7379 7380 /* 7381 * Specifying the ring here helps to pipeline submissions even when 7382 * isolation is enabled. If that is not desired for testing NULL can be 7383 * used instead of the ring to enforce a CPU round trip while switching 7384 * between clients. 
 */
	dep = amdgpu_sync_peek_fence(&isolation->prev, ring);
	r = amdgpu_sync_fence(&isolation->active, &f->finished, GFP_NOWAIT);
	if (r)
		dev_warn(adev->dev, "OOM tracking isolation\n");

out_grab_ref:
	dma_fence_get(dep);
out_return_dep:
	mutex_unlock(&adev->enforce_isolation_mutex);
	return dep;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
			uint32_t inst, uint32_t reg_addr, char reg_name[],
			uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			dev_warn(
				adev->dev,
				"Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				inst, reg_name, (uint32_t)expected_value,
				(uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}

ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
{
	ssize_t size = 0;

	if (!ring || !ring->adev)
		return size;

	if (amdgpu_device_should_recover_gpu(ring->adev))
		size |= AMDGPU_RESET_TYPE_FULL;

	if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
	    !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery)
		size |= AMDGPU_RESET_TYPE_SOFT_RESET;

	return size;
}

ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
{
	ssize_t size = 0;

	if (supported_reset == 0) {
		size += sysfs_emit_at(buf, size, "unsupported");
		size += sysfs_emit_at(buf, size, "\n");
		return size;
	}

	if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
		size += sysfs_emit_at(buf, size, "soft ");

	if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
		size += sysfs_emit_at(buf, size, "queue ");

	if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)
		size += sysfs_emit_at(buf, size, "pipe ");

	if (supported_reset & AMDGPU_RESET_TYPE_FULL)
		size += sysfs_emit_at(buf, size, "full ");

	size += sysfs_emit_at(buf, size, "\n");
	return size;
}