1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 29 #include <linux/aperture.h> 30 #include <linux/power_supply.h> 31 #include <linux/kthread.h> 32 #include <linux/module.h> 33 #include <linux/console.h> 34 #include <linux/slab.h> 35 #include <linux/iommu.h> 36 #include <linux/pci.h> 37 #include <linux/pci-p2pdma.h> 38 #include <linux/apple-gmux.h> 39 40 #include <drm/drm_atomic_helper.h> 41 #include <drm/drm_client_event.h> 42 #include <drm/drm_crtc_helper.h> 43 #include <drm/drm_probe_helper.h> 44 #include <drm/amdgpu_drm.h> 45 #include <linux/device.h> 46 #include <linux/vgaarb.h> 47 #include <linux/vga_switcheroo.h> 48 #include <linux/efi.h> 49 #include "amdgpu.h" 50 #include "amdgpu_trace.h" 51 #include "amdgpu_i2c.h" 52 #include "atom.h" 53 #include "amdgpu_atombios.h" 54 #include "amdgpu_atomfirmware.h" 55 #include "amd_pcie.h" 56 #ifdef CONFIG_DRM_AMDGPU_SI 57 #include "si.h" 58 #endif 59 #ifdef CONFIG_DRM_AMDGPU_CIK 60 #include "cik.h" 61 #endif 62 #include "vi.h" 63 #include "soc15.h" 64 #include "nv.h" 65 #include "bif/bif_4_1_d.h" 66 #include <linux/firmware.h> 67 #include "amdgpu_vf_error.h" 68 69 #include "amdgpu_amdkfd.h" 70 #include "amdgpu_pm.h" 71 72 #include "amdgpu_xgmi.h" 73 #include "amdgpu_ras.h" 74 #include "amdgpu_pmu.h" 75 #include "amdgpu_fru_eeprom.h" 76 #include "amdgpu_reset.h" 77 #include "amdgpu_virt.h" 78 #include "amdgpu_dev_coredump.h" 79 80 #include <linux/suspend.h> 81 #include <drm/task_barrier.h> 82 #include <linux/pm_runtime.h> 83 84 #include <drm/drm_drv.h> 85 86 #if IS_ENABLED(CONFIG_X86) 87 #include <asm/intel-family.h> 88 #include <asm/cpu_device_id.h> 89 #endif 90 91 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 92 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 93 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 94 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 95 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 96 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 97 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 98 99 #define AMDGPU_RESUME_MS 2000 100 #define AMDGPU_MAX_RETRY_LIMIT 2 101 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 102 #define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2) 103 #define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2) 104 #define 
AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2) 105 106 #define AMDGPU_VBIOS_SKIP (1U << 0) 107 #define AMDGPU_VBIOS_OPTIONAL (1U << 1) 108 109 static const struct drm_driver amdgpu_kms_driver; 110 111 const char *amdgpu_asic_name[] = { 112 "TAHITI", 113 "PITCAIRN", 114 "VERDE", 115 "OLAND", 116 "HAINAN", 117 "BONAIRE", 118 "KAVERI", 119 "KABINI", 120 "HAWAII", 121 "MULLINS", 122 "TOPAZ", 123 "TONGA", 124 "FIJI", 125 "CARRIZO", 126 "STONEY", 127 "POLARIS10", 128 "POLARIS11", 129 "POLARIS12", 130 "VEGAM", 131 "VEGA10", 132 "VEGA12", 133 "VEGA20", 134 "RAVEN", 135 "ARCTURUS", 136 "RENOIR", 137 "ALDEBARAN", 138 "NAVI10", 139 "CYAN_SKILLFISH", 140 "NAVI14", 141 "NAVI12", 142 "SIENNA_CICHLID", 143 "NAVY_FLOUNDER", 144 "VANGOGH", 145 "DIMGREY_CAVEFISH", 146 "BEIGE_GOBY", 147 "YELLOW_CARP", 148 "IP DISCOVERY", 149 "LAST", 150 }; 151 152 #define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0) 153 /* 154 * Default init level where all blocks are expected to be initialized. This is 155 * the level of initialization expected by default and also after a full reset 156 * of the device. 157 */ 158 struct amdgpu_init_level amdgpu_init_default = { 159 .level = AMDGPU_INIT_LEVEL_DEFAULT, 160 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL, 161 }; 162 163 struct amdgpu_init_level amdgpu_init_recovery = { 164 .level = AMDGPU_INIT_LEVEL_RESET_RECOVERY, 165 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL, 166 }; 167 168 /* 169 * Minimal blocks needed to be initialized before a XGMI hive can be reset. This 170 * is used for cases like reset on initialization where the entire hive needs to 171 * be reset before first use. 172 */ 173 struct amdgpu_init_level amdgpu_init_minimal_xgmi = { 174 .level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI, 175 .hwini_ip_block_mask = 176 BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) | 177 BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) | 178 BIT(AMD_IP_BLOCK_TYPE_PSP) 179 }; 180 181 static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev, 182 enum amd_ip_block_type block) 183 { 184 return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0; 185 } 186 187 void amdgpu_set_init_level(struct amdgpu_device *adev, 188 enum amdgpu_init_lvl_id lvl) 189 { 190 switch (lvl) { 191 case AMDGPU_INIT_LEVEL_MINIMAL_XGMI: 192 adev->init_lvl = &amdgpu_init_minimal_xgmi; 193 break; 194 case AMDGPU_INIT_LEVEL_RESET_RECOVERY: 195 adev->init_lvl = &amdgpu_init_recovery; 196 break; 197 case AMDGPU_INIT_LEVEL_DEFAULT: 198 fallthrough; 199 default: 200 adev->init_lvl = &amdgpu_init_default; 201 break; 202 } 203 } 204 205 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev); 206 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 207 void *data); 208 209 /** 210 * DOC: pcie_replay_count 211 * 212 * The amdgpu driver provides a sysfs API for reporting the total number 213 * of PCIe replays (NAKs). 214 * The file pcie_replay_count is used for this and returns the total 215 * number of replays as a sum of the NAKs generated and NAKs received. 
216 */ 217 218 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 219 struct device_attribute *attr, char *buf) 220 { 221 struct drm_device *ddev = dev_get_drvdata(dev); 222 struct amdgpu_device *adev = drm_to_adev(ddev); 223 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 224 225 return sysfs_emit(buf, "%llu\n", cnt); 226 } 227 228 static DEVICE_ATTR(pcie_replay_count, 0444, 229 amdgpu_device_get_pcie_replay_count, NULL); 230 231 static int amdgpu_device_attr_sysfs_init(struct amdgpu_device *adev) 232 { 233 int ret = 0; 234 235 if (amdgpu_nbio_is_replay_cnt_supported(adev)) 236 ret = sysfs_create_file(&adev->dev->kobj, 237 &dev_attr_pcie_replay_count.attr); 238 239 return ret; 240 } 241 242 static void amdgpu_device_attr_sysfs_fini(struct amdgpu_device *adev) 243 { 244 if (amdgpu_nbio_is_replay_cnt_supported(adev)) 245 sysfs_remove_file(&adev->dev->kobj, 246 &dev_attr_pcie_replay_count.attr); 247 } 248 249 static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj, 250 const struct bin_attribute *attr, char *buf, 251 loff_t ppos, size_t count) 252 { 253 struct device *dev = kobj_to_dev(kobj); 254 struct drm_device *ddev = dev_get_drvdata(dev); 255 struct amdgpu_device *adev = drm_to_adev(ddev); 256 ssize_t bytes_read; 257 258 switch (ppos) { 259 case AMDGPU_SYS_REG_STATE_XGMI: 260 bytes_read = amdgpu_asic_get_reg_state( 261 adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count); 262 break; 263 case AMDGPU_SYS_REG_STATE_WAFL: 264 bytes_read = amdgpu_asic_get_reg_state( 265 adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count); 266 break; 267 case AMDGPU_SYS_REG_STATE_PCIE: 268 bytes_read = amdgpu_asic_get_reg_state( 269 adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count); 270 break; 271 case AMDGPU_SYS_REG_STATE_USR: 272 bytes_read = amdgpu_asic_get_reg_state( 273 adev, AMDGPU_REG_STATE_TYPE_USR, buf, count); 274 break; 275 case AMDGPU_SYS_REG_STATE_USR_1: 276 bytes_read = amdgpu_asic_get_reg_state( 277 adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count); 278 break; 279 default: 280 return -EINVAL; 281 } 282 283 return bytes_read; 284 } 285 286 static const BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL, 287 AMDGPU_SYS_REG_STATE_END); 288 289 int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev) 290 { 291 int ret; 292 293 if (!amdgpu_asic_get_reg_state_supported(adev)) 294 return 0; 295 296 ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 297 298 return ret; 299 } 300 301 void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev) 302 { 303 if (!amdgpu_asic_get_reg_state_supported(adev)) 304 return; 305 sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 306 } 307 308 int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block) 309 { 310 int r; 311 312 if (ip_block->version->funcs->suspend) { 313 r = ip_block->version->funcs->suspend(ip_block); 314 if (r) { 315 dev_err(ip_block->adev->dev, 316 "suspend of IP block <%s> failed %d\n", 317 ip_block->version->funcs->name, r); 318 return r; 319 } 320 } 321 322 ip_block->status.hw = false; 323 return 0; 324 } 325 326 int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block) 327 { 328 int r; 329 330 if (ip_block->version->funcs->resume) { 331 r = ip_block->version->funcs->resume(ip_block); 332 if (r) { 333 dev_err(ip_block->adev->dev, 334 "resume of IP block <%s> failed %d\n", 335 ip_block->version->funcs->name, r); 336 return r; 337 } 338 } 339 340 ip_block->status.hw = true; 341 return 0; 342 } 343 344 /** 345 * DOC: board_info 346 * 347 * The amdgpu driver provides a sysfs 
API for giving board related information. 348 * It provides the form factor information in the format 349 * 350 * type : form factor 351 * 352 * Possible form factor values 353 * 354 * - "cem" - PCIE CEM card 355 * - "oam" - Open Compute Accelerator Module 356 * - "unknown" - Not known 357 * 358 */ 359 360 static ssize_t amdgpu_device_get_board_info(struct device *dev, 361 struct device_attribute *attr, 362 char *buf) 363 { 364 struct drm_device *ddev = dev_get_drvdata(dev); 365 struct amdgpu_device *adev = drm_to_adev(ddev); 366 enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM; 367 const char *pkg; 368 369 if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type) 370 pkg_type = adev->smuio.funcs->get_pkg_type(adev); 371 372 switch (pkg_type) { 373 case AMDGPU_PKG_TYPE_CEM: 374 pkg = "cem"; 375 break; 376 case AMDGPU_PKG_TYPE_OAM: 377 pkg = "oam"; 378 break; 379 default: 380 pkg = "unknown"; 381 break; 382 } 383 384 return sysfs_emit(buf, "%s : %s\n", "type", pkg); 385 } 386 387 static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL); 388 389 static struct attribute *amdgpu_board_attrs[] = { 390 &dev_attr_board_info.attr, 391 NULL, 392 }; 393 394 static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj, 395 struct attribute *attr, int n) 396 { 397 struct device *dev = kobj_to_dev(kobj); 398 struct drm_device *ddev = dev_get_drvdata(dev); 399 struct amdgpu_device *adev = drm_to_adev(ddev); 400 401 if (adev->flags & AMD_IS_APU) 402 return 0; 403 404 return attr->mode; 405 } 406 407 static const struct attribute_group amdgpu_board_attrs_group = { 408 .attrs = amdgpu_board_attrs, 409 .is_visible = amdgpu_board_attrs_is_visible 410 }; 411 412 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 413 414 415 /** 416 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 417 * 418 * @dev: drm_device pointer 419 * 420 * Returns true if the device is a dGPU with ATPX power control, 421 * otherwise return false. 422 */ 423 bool amdgpu_device_supports_px(struct drm_device *dev) 424 { 425 struct amdgpu_device *adev = drm_to_adev(dev); 426 427 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 428 return true; 429 return false; 430 } 431 432 /** 433 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 434 * 435 * @dev: drm_device pointer 436 * 437 * Returns true if the device is a dGPU with ACPI power control, 438 * otherwise return false. 439 */ 440 bool amdgpu_device_supports_boco(struct drm_device *dev) 441 { 442 struct amdgpu_device *adev = drm_to_adev(dev); 443 444 if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE)) 445 return false; 446 447 if (adev->has_pr3 || 448 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 449 return true; 450 return false; 451 } 452 453 /** 454 * amdgpu_device_supports_baco - Does the device support BACO 455 * 456 * @dev: drm_device pointer 457 * 458 * Return: 459 * 1 if the device supports BACO; 460 * 3 if the device supports MACO (only works if BACO is supported) 461 * otherwise return 0. 
 */
int amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
{
	struct drm_device *dev;
	int bamaco_support;

	dev = adev_to_drm(adev);

	adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
	bamaco_support = amdgpu_device_supports_baco(dev);

	switch (amdgpu_runtime_pm) {
	case 2:
		if (bamaco_support & MACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
			dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
		} else if (bamaco_support == BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Requested mode BAMACO not available, falling back to BACO\n");
		}
		break;
	case 1:
		if (bamaco_support & BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Forcing BACO for runtime pm\n");
		}
		break;
	case -1:
	case -2:
		if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
			dev_info(adev->dev, "Using ATPX for runtime pm\n");
		} else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
			dev_info(adev->dev, "Using BOCO for runtime pm\n");
		} else {
			if (!bamaco_support)
				goto no_runtime_pm;

			switch (adev->asic_type) {
			case CHIP_VEGA20:
			case CHIP_ARCTURUS:
				/* BACO is not supported on vega20 and arcturus */
				break;
			case CHIP_VEGA10:
				/* enable BACO as runpm mode if noretry=0 */
				if (!adev->gmc.noretry && !amdgpu_passthrough(adev))
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			default:
				/* enable BACO as runpm mode on CI+ */
				if (!amdgpu_passthrough(adev))
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			}

			if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
				if (bamaco_support & MACO_SUPPORT) {
					adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
					dev_info(adev->dev, "Using BAMACO for runtime pm\n");
				} else {
					dev_info(adev->dev, "Using BACO for runtime pm\n");
				}
			}
		}
		break;
	case 0:
		dev_info(adev->dev, "runtime pm is manually disabled\n");
		break;
	default:
		break;
	}

no_runtime_pm:
	if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
		dev_info(adev->dev, "Runtime PM not available\n");
}
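/*
 * Quick reference for the amdgpu.runtime_pm module parameter handled above
 * (an editorial summary of the switch statement, not an authoritative table):
 *
 *    2   -> force BAMACO (falls back to BACO if MACO is not supported)
 *    1   -> force BACO (if supported)
 *  -1/-2 -> auto: prefer PX, then BOCO, then BACO/BAMACO where sensible
 *    0   -> runtime PM disabled
 *
 * Code elsewhere in the driver typically just branches on the selected mode,
 * e.g. (illustrative sketch only, not a quote of the driver):
 *
 *	if (adev->pm.rpm_mode != AMDGPU_RUNPM_NONE)
 *		pm_runtime_use_autosuspend(adev->dev);
 */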
/**
 * amdgpu_device_supports_smart_shift - Is the device dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size in bytes; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}
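/*
 * Usage sketch (editorial example, not part of the driver): reading a single
 * dword from VRAM through the MM_INDEX/MM_DATA window. Offsets and sizes must
 * be dword aligned, as enforced by the BUG_ON() above. "vram_offset" is a
 * hypothetical, dword-aligned byte offset into VRAM.
 *
 *	u64 vram_offset = 0x1000;
 *	uint32_t val;
 *
 *	amdgpu_device_mm_access(adev, vram_offset, &val, sizeof(val), false);
 *	dev_dbg(adev->dev, "VRAM[0x%llx] = 0x%08x\n", vram_offset, val);
 */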
/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size in bytes; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * Returns the number of bytes that have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size in bytes; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the VRAM aperture to access VRAM first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of VRAM */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}
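/*
 * Usage sketch (editorial example, not part of the driver): copying a buffer
 * out of VRAM. amdgpu_device_vram_access() transparently uses the CPU-visible
 * aperture when possible and falls back to MM_INDEX/MM_DATA for the rest, so
 * the caller only needs dword-aligned offset and size. "len" and "vram_offset"
 * are hypothetical values.
 *
 *	size_t len = 4096;
 *	u8 *tmp = kvmalloc(len, GFP_KERNEL);
 *
 *	if (tmp) {
 *		amdgpu_device_vram_access(adev, vram_offset, tmp, len, false);
 *		// ... inspect or dump the copied bytes ...
 *		kvfree(tmp);
 *	}
 */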
/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
				uint32_t reg, uint32_t acc_flags,
				uint32_t xcc_id)
{
	uint32_t ret, rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	return ret;
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
830 */ 831 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) 832 { 833 if (amdgpu_device_skip_hw_access(adev)) 834 return; 835 836 if (offset < adev->rmmio_size) 837 writeb(value, adev->rmmio + offset); 838 else 839 BUG(); 840 } 841 842 /** 843 * amdgpu_device_wreg - write to a memory mapped IO or indirect register 844 * 845 * @adev: amdgpu_device pointer 846 * @reg: dword aligned register offset 847 * @v: 32 bit value to write to the register 848 * @acc_flags: access flags which require special behavior 849 * 850 * Writes the value specified to the offset specified. 851 */ 852 void amdgpu_device_wreg(struct amdgpu_device *adev, 853 uint32_t reg, uint32_t v, 854 uint32_t acc_flags) 855 { 856 if (amdgpu_device_skip_hw_access(adev)) 857 return; 858 859 if ((reg * 4) < adev->rmmio_size) { 860 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 861 amdgpu_sriov_runtime(adev) && 862 down_read_trylock(&adev->reset_domain->sem)) { 863 amdgpu_kiq_wreg(adev, reg, v, 0); 864 up_read(&adev->reset_domain->sem); 865 } else { 866 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 867 } 868 } else { 869 adev->pcie_wreg(adev, reg * 4, v); 870 } 871 872 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 873 } 874 875 /** 876 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 877 * 878 * @adev: amdgpu_device pointer 879 * @reg: mmio/rlc register 880 * @v: value to write 881 * @xcc_id: xcc accelerated compute core id 882 * 883 * this function is invoked only for the debugfs register access 884 */ 885 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 886 uint32_t reg, uint32_t v, 887 uint32_t xcc_id) 888 { 889 if (amdgpu_device_skip_hw_access(adev)) 890 return; 891 892 if (amdgpu_sriov_fullaccess(adev) && 893 adev->gfx.rlc.funcs && 894 adev->gfx.rlc.funcs->is_rlcg_access_range) { 895 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 896 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id); 897 } else if ((reg * 4) >= adev->rmmio_size) { 898 adev->pcie_wreg(adev, reg * 4, v); 899 } else { 900 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 901 } 902 } 903 904 /** 905 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC 906 * 907 * @adev: amdgpu_device pointer 908 * @reg: dword aligned register offset 909 * @v: 32 bit value to write to the register 910 * @acc_flags: access flags which require special behavior 911 * @xcc_id: xcc accelerated compute core id 912 * 913 * Writes the value specified to the offset specified. 
914 */ 915 void amdgpu_device_xcc_wreg(struct amdgpu_device *adev, 916 uint32_t reg, uint32_t v, 917 uint32_t acc_flags, uint32_t xcc_id) 918 { 919 uint32_t rlcg_flag; 920 921 if (amdgpu_device_skip_hw_access(adev)) 922 return; 923 924 if ((reg * 4) < adev->rmmio_size) { 925 if (amdgpu_sriov_vf(adev) && 926 !amdgpu_sriov_runtime(adev) && 927 adev->gfx.rlc.rlcg_reg_access_supported && 928 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, 929 GC_HWIP, true, 930 &rlcg_flag)) { 931 amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id)); 932 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 933 amdgpu_sriov_runtime(adev) && 934 down_read_trylock(&adev->reset_domain->sem)) { 935 amdgpu_kiq_wreg(adev, reg, v, xcc_id); 936 up_read(&adev->reset_domain->sem); 937 } else { 938 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 939 } 940 } else { 941 adev->pcie_wreg(adev, reg * 4, v); 942 } 943 } 944 945 /** 946 * amdgpu_device_indirect_rreg - read an indirect register 947 * 948 * @adev: amdgpu_device pointer 949 * @reg_addr: indirect register address to read from 950 * 951 * Returns the value of indirect register @reg_addr 952 */ 953 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 954 u32 reg_addr) 955 { 956 unsigned long flags, pcie_index, pcie_data; 957 void __iomem *pcie_index_offset; 958 void __iomem *pcie_data_offset; 959 u32 r; 960 961 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 962 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 963 964 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 965 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 966 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 967 968 writel(reg_addr, pcie_index_offset); 969 readl(pcie_index_offset); 970 r = readl(pcie_data_offset); 971 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 972 973 return r; 974 } 975 976 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev, 977 u64 reg_addr) 978 { 979 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 980 u32 r; 981 void __iomem *pcie_index_offset; 982 void __iomem *pcie_index_hi_offset; 983 void __iomem *pcie_data_offset; 984 985 if (unlikely(!adev->nbio.funcs)) { 986 pcie_index = AMDGPU_PCIE_INDEX_FALLBACK; 987 pcie_data = AMDGPU_PCIE_DATA_FALLBACK; 988 } else { 989 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 990 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 991 } 992 993 if (reg_addr >> 32) { 994 if (unlikely(!adev->nbio.funcs)) 995 pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK; 996 else 997 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 998 } else { 999 pcie_index_hi = 0; 1000 } 1001 1002 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1003 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1004 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1005 if (pcie_index_hi != 0) 1006 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1007 pcie_index_hi * 4; 1008 1009 writel(reg_addr, pcie_index_offset); 1010 readl(pcie_index_offset); 1011 if (pcie_index_hi != 0) { 1012 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1013 readl(pcie_index_hi_offset); 1014 } 1015 r = readl(pcie_data_offset); 1016 1017 /* clear the high bits */ 1018 if (pcie_index_hi != 0) { 1019 writel(0, pcie_index_hi_offset); 1020 readl(pcie_index_hi_offset); 1021 } 1022 1023 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1024 1025 return r; 1026 } 1027 1028 /** 1029 * amdgpu_device_indirect_rreg64 - read a 64bits indirect 
register 1030 * 1031 * @adev: amdgpu_device pointer 1032 * @reg_addr: indirect register address to read from 1033 * 1034 * Returns the value of indirect register @reg_addr 1035 */ 1036 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 1037 u32 reg_addr) 1038 { 1039 unsigned long flags, pcie_index, pcie_data; 1040 void __iomem *pcie_index_offset; 1041 void __iomem *pcie_data_offset; 1042 u64 r; 1043 1044 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1045 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1046 1047 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1048 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1049 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1050 1051 /* read low 32 bits */ 1052 writel(reg_addr, pcie_index_offset); 1053 readl(pcie_index_offset); 1054 r = readl(pcie_data_offset); 1055 /* read high 32 bits */ 1056 writel(reg_addr + 4, pcie_index_offset); 1057 readl(pcie_index_offset); 1058 r |= ((u64)readl(pcie_data_offset) << 32); 1059 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1060 1061 return r; 1062 } 1063 1064 u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev, 1065 u64 reg_addr) 1066 { 1067 unsigned long flags, pcie_index, pcie_data; 1068 unsigned long pcie_index_hi = 0; 1069 void __iomem *pcie_index_offset; 1070 void __iomem *pcie_index_hi_offset; 1071 void __iomem *pcie_data_offset; 1072 u64 r; 1073 1074 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1075 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1076 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1077 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1078 1079 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1080 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1081 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1082 if (pcie_index_hi != 0) 1083 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1084 pcie_index_hi * 4; 1085 1086 /* read low 32 bits */ 1087 writel(reg_addr, pcie_index_offset); 1088 readl(pcie_index_offset); 1089 if (pcie_index_hi != 0) { 1090 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1091 readl(pcie_index_hi_offset); 1092 } 1093 r = readl(pcie_data_offset); 1094 /* read high 32 bits */ 1095 writel(reg_addr + 4, pcie_index_offset); 1096 readl(pcie_index_offset); 1097 if (pcie_index_hi != 0) { 1098 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1099 readl(pcie_index_hi_offset); 1100 } 1101 r |= ((u64)readl(pcie_data_offset) << 32); 1102 1103 /* clear the high bits */ 1104 if (pcie_index_hi != 0) { 1105 writel(0, pcie_index_hi_offset); 1106 readl(pcie_index_hi_offset); 1107 } 1108 1109 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1110 1111 return r; 1112 } 1113 1114 /** 1115 * amdgpu_device_indirect_wreg - write an indirect register address 1116 * 1117 * @adev: amdgpu_device pointer 1118 * @reg_addr: indirect register offset 1119 * @reg_data: indirect register data 1120 * 1121 */ 1122 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 1123 u32 reg_addr, u32 reg_data) 1124 { 1125 unsigned long flags, pcie_index, pcie_data; 1126 void __iomem *pcie_index_offset; 1127 void __iomem *pcie_data_offset; 1128 1129 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1130 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1131 1132 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1133 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1134 
pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1135 1136 writel(reg_addr, pcie_index_offset); 1137 readl(pcie_index_offset); 1138 writel(reg_data, pcie_data_offset); 1139 readl(pcie_data_offset); 1140 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1141 } 1142 1143 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev, 1144 u64 reg_addr, u32 reg_data) 1145 { 1146 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 1147 void __iomem *pcie_index_offset; 1148 void __iomem *pcie_index_hi_offset; 1149 void __iomem *pcie_data_offset; 1150 1151 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1152 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1153 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1154 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1155 else 1156 pcie_index_hi = 0; 1157 1158 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1159 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1160 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1161 if (pcie_index_hi != 0) 1162 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1163 pcie_index_hi * 4; 1164 1165 writel(reg_addr, pcie_index_offset); 1166 readl(pcie_index_offset); 1167 if (pcie_index_hi != 0) { 1168 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1169 readl(pcie_index_hi_offset); 1170 } 1171 writel(reg_data, pcie_data_offset); 1172 readl(pcie_data_offset); 1173 1174 /* clear the high bits */ 1175 if (pcie_index_hi != 0) { 1176 writel(0, pcie_index_hi_offset); 1177 readl(pcie_index_hi_offset); 1178 } 1179 1180 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1181 } 1182 1183 /** 1184 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 1185 * 1186 * @adev: amdgpu_device pointer 1187 * @reg_addr: indirect register offset 1188 * @reg_data: indirect register data 1189 * 1190 */ 1191 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 1192 u32 reg_addr, u64 reg_data) 1193 { 1194 unsigned long flags, pcie_index, pcie_data; 1195 void __iomem *pcie_index_offset; 1196 void __iomem *pcie_data_offset; 1197 1198 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1199 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1200 1201 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1202 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1203 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1204 1205 /* write low 32 bits */ 1206 writel(reg_addr, pcie_index_offset); 1207 readl(pcie_index_offset); 1208 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1209 readl(pcie_data_offset); 1210 /* write high 32 bits */ 1211 writel(reg_addr + 4, pcie_index_offset); 1212 readl(pcie_index_offset); 1213 writel((u32)(reg_data >> 32), pcie_data_offset); 1214 readl(pcie_data_offset); 1215 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1216 } 1217 1218 void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev, 1219 u64 reg_addr, u64 reg_data) 1220 { 1221 unsigned long flags, pcie_index, pcie_data; 1222 unsigned long pcie_index_hi = 0; 1223 void __iomem *pcie_index_offset; 1224 void __iomem *pcie_index_hi_offset; 1225 void __iomem *pcie_data_offset; 1226 1227 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1228 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1229 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1230 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1231 
1232 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1233 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1234 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1235 if (pcie_index_hi != 0) 1236 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1237 pcie_index_hi * 4; 1238 1239 /* write low 32 bits */ 1240 writel(reg_addr, pcie_index_offset); 1241 readl(pcie_index_offset); 1242 if (pcie_index_hi != 0) { 1243 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1244 readl(pcie_index_hi_offset); 1245 } 1246 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1247 readl(pcie_data_offset); 1248 /* write high 32 bits */ 1249 writel(reg_addr + 4, pcie_index_offset); 1250 readl(pcie_index_offset); 1251 if (pcie_index_hi != 0) { 1252 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1253 readl(pcie_index_hi_offset); 1254 } 1255 writel((u32)(reg_data >> 32), pcie_data_offset); 1256 readl(pcie_data_offset); 1257 1258 /* clear the high bits */ 1259 if (pcie_index_hi != 0) { 1260 writel(0, pcie_index_hi_offset); 1261 readl(pcie_index_hi_offset); 1262 } 1263 1264 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1265 } 1266 1267 /** 1268 * amdgpu_device_get_rev_id - query device rev_id 1269 * 1270 * @adev: amdgpu_device pointer 1271 * 1272 * Return device rev_id 1273 */ 1274 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev) 1275 { 1276 return adev->nbio.funcs->get_rev_id(adev); 1277 } 1278 1279 /** 1280 * amdgpu_invalid_rreg - dummy reg read function 1281 * 1282 * @adev: amdgpu_device pointer 1283 * @reg: offset of register 1284 * 1285 * Dummy register read function. Used for register blocks 1286 * that certain asics don't have (all asics). 1287 * Returns the value in the register. 1288 */ 1289 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 1290 { 1291 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg); 1292 BUG(); 1293 return 0; 1294 } 1295 1296 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg) 1297 { 1298 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); 1299 BUG(); 1300 return 0; 1301 } 1302 1303 /** 1304 * amdgpu_invalid_wreg - dummy reg write function 1305 * 1306 * @adev: amdgpu_device pointer 1307 * @reg: offset of register 1308 * @v: value to write to the register 1309 * 1310 * Dummy register read function. Used for register blocks 1311 * that certain asics don't have (all asics). 1312 */ 1313 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 1314 { 1315 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n", 1316 reg, v); 1317 BUG(); 1318 } 1319 1320 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v) 1321 { 1322 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n", 1323 reg, v); 1324 BUG(); 1325 } 1326 1327 /** 1328 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 1329 * 1330 * @adev: amdgpu_device pointer 1331 * @reg: offset of register 1332 * 1333 * Dummy register read function. Used for register blocks 1334 * that certain asics don't have (all asics). 1335 * Returns the value in the register. 
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

static uint32_t amdgpu_device_get_vbios_flags(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
		return AMDGPU_VBIOS_SKIP;

	if (hweight32(adev->aid_mask) && amdgpu_passthrough(adev))
		return AMDGPU_VBIOS_OPTIONAL;

	return 0;
}
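/*
 * Editorial note: callers such as amdgpu_device_asic_init() and
 * amdgpu_device_need_post() use these flags to decide whether a VBIOS is
 * required at all. The typical gating pattern looks like this (sketch only):
 *
 *	uint32_t flags = amdgpu_device_get_vbios_flags(adev);
 *
 *	if (flags & AMDGPU_VBIOS_SKIP)
 *		return false;			// no VBIOS handling at all
 *	if ((flags & AMDGPU_VBIOS_OPTIONAL) && !adev->bios)
 *		return false;			// VBIOS absent but optional
 */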
/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	uint32_t flags;
	bool optional;
	int ret;

	amdgpu_asic_pre_asic_init(adev);
	flags = amdgpu_device_get_vbios_flags(adev);
	optional = !!(flags & (AMDGPU_VBIOS_OPTIONAL | AMDGPU_VBIOS_SKIP));

	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
		amdgpu_psp_wait_for_bootloader(adev);
		if (optional && !adev->bios)
			return 0;

		ret = amdgpu_atomfirmware_asic_init(adev, true);
		return ret;
	} else {
		if (optional && !adev->bios)
			return 0;

		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
	}

	return 0;
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
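 *
 * Returns 0 on success or a negative error code from pci_reset_function().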
1551 */ 1552 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 1553 { 1554 return pci_reset_function(adev->pdev); 1555 } 1556 1557 /* 1558 * amdgpu_device_wb_*() 1559 * Writeback is the method by which the GPU updates special pages in memory 1560 * with the status of certain GPU events (fences, ring pointers,etc.). 1561 */ 1562 1563 /** 1564 * amdgpu_device_wb_fini - Disable Writeback and free memory 1565 * 1566 * @adev: amdgpu_device pointer 1567 * 1568 * Disables Writeback and frees the Writeback memory (all asics). 1569 * Used at driver shutdown. 1570 */ 1571 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1572 { 1573 if (adev->wb.wb_obj) { 1574 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1575 &adev->wb.gpu_addr, 1576 (void **)&adev->wb.wb); 1577 adev->wb.wb_obj = NULL; 1578 } 1579 } 1580 1581 /** 1582 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory 1583 * 1584 * @adev: amdgpu_device pointer 1585 * 1586 * Initializes writeback and allocates writeback memory (all asics). 1587 * Used at driver startup. 1588 * Returns 0 on success or an -error on failure. 1589 */ 1590 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1591 { 1592 int r; 1593 1594 if (adev->wb.wb_obj == NULL) { 1595 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1596 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1597 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1598 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1599 (void **)&adev->wb.wb); 1600 if (r) { 1601 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1602 return r; 1603 } 1604 1605 adev->wb.num_wb = AMDGPU_MAX_WB; 1606 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1607 1608 /* clear wb memory */ 1609 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1610 } 1611 1612 return 0; 1613 } 1614 1615 /** 1616 * amdgpu_device_wb_get - Allocate a wb entry 1617 * 1618 * @adev: amdgpu_device pointer 1619 * @wb: wb index 1620 * 1621 * Allocate a wb slot for use by the driver (all asics). 1622 * Returns 0 on success or -EINVAL on failure. 1623 */ 1624 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1625 { 1626 unsigned long flags, offset; 1627 1628 spin_lock_irqsave(&adev->wb.lock, flags); 1629 offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1630 if (offset < adev->wb.num_wb) { 1631 __set_bit(offset, adev->wb.used); 1632 spin_unlock_irqrestore(&adev->wb.lock, flags); 1633 *wb = offset << 3; /* convert to dw offset */ 1634 return 0; 1635 } else { 1636 spin_unlock_irqrestore(&adev->wb.lock, flags); 1637 return -EINVAL; 1638 } 1639 } 1640 1641 /** 1642 * amdgpu_device_wb_free - Free a wb entry 1643 * 1644 * @adev: amdgpu_device pointer 1645 * @wb: wb index 1646 * 1647 * Free a wb slot allocated for use by the driver (all asics) 1648 */ 1649 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1650 { 1651 unsigned long flags; 1652 1653 wb >>= 3; 1654 spin_lock_irqsave(&adev->wb.lock, flags); 1655 if (wb < adev->wb.num_wb) 1656 __clear_bit(wb, adev->wb.used); 1657 spin_unlock_irqrestore(&adev->wb.lock, flags); 1658 } 1659 1660 /** 1661 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1662 * 1663 * @adev: amdgpu_device pointer 1664 * 1665 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1666 * to fail, but if any of the BARs is not accessible after the size we abort 1667 * driver loading by returning -ENODEV. 
1668 */ 1669 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1670 { 1671 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1672 struct pci_bus *root; 1673 struct resource *res; 1674 unsigned int i; 1675 u16 cmd; 1676 int r; 1677 1678 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) 1679 return 0; 1680 1681 /* Bypass for VF */ 1682 if (amdgpu_sriov_vf(adev)) 1683 return 0; 1684 1685 if (!amdgpu_rebar) 1686 return 0; 1687 1688 /* resizing on Dell G5 SE platforms causes problems with runtime pm */ 1689 if ((amdgpu_runtime_pm != 0) && 1690 adev->pdev->vendor == PCI_VENDOR_ID_ATI && 1691 adev->pdev->device == 0x731f && 1692 adev->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL) 1693 return 0; 1694 1695 /* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */ 1696 if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR)) 1697 DRM_WARN("System can't access extended configuration space, please check!!\n"); 1698 1699 /* skip if the bios has already enabled large BAR */ 1700 if (adev->gmc.real_vram_size && 1701 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1702 return 0; 1703 1704 /* Check if the root BUS has 64bit memory resources */ 1705 root = adev->pdev->bus; 1706 while (root->parent) 1707 root = root->parent; 1708 1709 pci_bus_for_each_resource(root, res, i) { 1710 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1711 res->start > 0x100000000ull) 1712 break; 1713 } 1714 1715 /* Trying to resize is pointless without a root hub window above 4GB */ 1716 if (!res) 1717 return 0; 1718 1719 /* Limit the BAR size to what is available */ 1720 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1721 rbar_size); 1722 1723 /* Disable memory decoding while we change the BAR addresses and size */ 1724 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1725 pci_write_config_word(adev->pdev, PCI_COMMAND, 1726 cmd & ~PCI_COMMAND_MEMORY); 1727 1728 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1729 amdgpu_doorbell_fini(adev); 1730 if (adev->asic_type >= CHIP_BONAIRE) 1731 pci_release_resource(adev->pdev, 2); 1732 1733 pci_release_resource(adev->pdev, 0); 1734 1735 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1736 if (r == -ENOSPC) 1737 DRM_INFO("Not enough PCI address space for a large BAR."); 1738 else if (r && r != -ENOTSUPP) 1739 DRM_ERROR("Problem resizing BAR0 (%d).", r); 1740 1741 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1742 1743 /* When the doorbell or fb BAR isn't available we have no chance of 1744 * using the device. 1745 */ 1746 r = amdgpu_doorbell_init(adev); 1747 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1748 return -ENODEV; 1749 1750 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1751 1752 return 0; 1753 } 1754 1755 /* 1756 * GPU helpers function. 1757 */ 1758 /** 1759 * amdgpu_device_need_post - check if the hw need post or not 1760 * 1761 * @adev: amdgpu_device pointer 1762 * 1763 * Check if the asic has been initialized (all asics) at driver startup 1764 * or post is needed if hw reset is performed. 1765 * Returns true if need or false if not. 
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg, flags;

	if (amdgpu_sriov_vf(adev))
		return false;

	flags = amdgpu_device_get_vbios_flags(adev);
	if (flags & AMDGPU_VBIOS_SKIP)
		return false;
	if ((flags & AMDGPU_VBIOS_OPTIONAL) && !adev->bios)
		return false;

	if (amdgpu_passthrough(adev)) {
		/* For FIJI: in the whole-GPU pass-through virtualization case, after a VM
		 * reboot some old smc fw still needs the driver to do a vPost, otherwise
		 * the gpu hangs. smc fw versions above 22.15 don't have this flaw, so we
		 * force vPost to be executed for smc versions below 22.15.
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			release_firmware(adev->pm.fw);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset the whole hive on init */
	if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/*
 * Check whether seamless boot is supported.
 *
 * So far we only support seamless boot on DCE 3.0 or later.
 * If users report that it works on older ASICS as well, we may
 * loosen this.
 */
bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
{
	switch (amdgpu_seamless) {
	case -1:
		break;
	case 1:
		return true;
	case 0:
		return false;
	default:
		DRM_ERROR("Invalid value for amdgpu.seamless: %d\n",
			  amdgpu_seamless);
		return false;
	}

	if (!(adev->flags & AMD_IS_APU))
		return false;

	if (adev->mman.keep_stolen_vga_memory)
		return false;

	return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
}

/*
 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
 * don't support dynamic speed switching. Until we have confirmation from Intel
 * that a specific host supports it, it's safer that we keep it disabled for all.
1859 * 1860 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/ 1861 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663 1862 */ 1863 static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev) 1864 { 1865 #if IS_ENABLED(CONFIG_X86) 1866 struct cpuinfo_x86 *c = &cpu_data(0); 1867 1868 /* eGPU change speeds based on USB4 fabric conditions */ 1869 if (dev_is_removable(adev->dev)) 1870 return true; 1871 1872 if (c->x86_vendor == X86_VENDOR_INTEL) 1873 return false; 1874 #endif 1875 return true; 1876 } 1877 1878 static bool amdgpu_device_aspm_support_quirk(struct amdgpu_device *adev) 1879 { 1880 #if IS_ENABLED(CONFIG_X86) 1881 struct cpuinfo_x86 *c = &cpu_data(0); 1882 1883 if (!(amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 0) || 1884 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 1))) 1885 return false; 1886 1887 if (c->x86 == 6 && 1888 adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5) { 1889 switch (c->x86_model) { 1890 case VFM_MODEL(INTEL_ALDERLAKE): 1891 case VFM_MODEL(INTEL_ALDERLAKE_L): 1892 case VFM_MODEL(INTEL_RAPTORLAKE): 1893 case VFM_MODEL(INTEL_RAPTORLAKE_P): 1894 case VFM_MODEL(INTEL_RAPTORLAKE_S): 1895 return true; 1896 default: 1897 return false; 1898 } 1899 } else { 1900 return false; 1901 } 1902 #else 1903 return false; 1904 #endif 1905 } 1906 1907 /** 1908 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1909 * 1910 * @adev: amdgpu_device pointer 1911 * 1912 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1913 * be set for this device. 1914 * 1915 * Returns true if it should be used or false if not. 1916 */ 1917 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1918 { 1919 switch (amdgpu_aspm) { 1920 case -1: 1921 break; 1922 case 0: 1923 return false; 1924 case 1: 1925 return true; 1926 default: 1927 return false; 1928 } 1929 if (adev->flags & AMD_IS_APU) 1930 return false; 1931 if (amdgpu_device_aspm_support_quirk(adev)) 1932 return false; 1933 return pcie_aspm_enabled(adev->pdev); 1934 } 1935 1936 /* if we get transitioned to only one device, take VGA back */ 1937 /** 1938 * amdgpu_device_vga_set_decode - enable/disable vga decode 1939 * 1940 * @pdev: PCI device pointer 1941 * @state: enable/disable vga decode 1942 * 1943 * Enable/disable vga decode (all asics). 1944 * Returns VGA resource flags. 1945 */ 1946 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1947 bool state) 1948 { 1949 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1950 1951 amdgpu_asic_set_vga_state(adev, state); 1952 if (state) 1953 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1954 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1955 else 1956 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1957 } 1958 1959 /** 1960 * amdgpu_device_check_block_size - validate the vm block size 1961 * 1962 * @adev: amdgpu_device pointer 1963 * 1964 * Validates the vm block size specified via module parameter. 1965 * The vm block size defines number of bits in page table versus page directory, 1966 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1967 * page table and the remaining bits are in the page directory. 
1968 */ 1969 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1970 { 1971 /* defines number of bits in page table versus page directory, 1972 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1973 * page table and the remaining bits are in the page directory 1974 */ 1975 if (amdgpu_vm_block_size == -1) 1976 return; 1977 1978 if (amdgpu_vm_block_size < 9) { 1979 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1980 amdgpu_vm_block_size); 1981 amdgpu_vm_block_size = -1; 1982 } 1983 } 1984 1985 /** 1986 * amdgpu_device_check_vm_size - validate the vm size 1987 * 1988 * @adev: amdgpu_device pointer 1989 * 1990 * Validates the vm size in GB specified via module parameter. 1991 * The VM size is the size of the GPU virtual memory space in GB. 1992 */ 1993 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1994 { 1995 /* no need to check the default value */ 1996 if (amdgpu_vm_size == -1) 1997 return; 1998 1999 if (amdgpu_vm_size < 1) { 2000 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 2001 amdgpu_vm_size); 2002 amdgpu_vm_size = -1; 2003 } 2004 } 2005 2006 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 2007 { 2008 struct sysinfo si; 2009 bool is_os_64 = (sizeof(void *) == 8); 2010 uint64_t total_memory; 2011 uint64_t dram_size_seven_GB = 0x1B8000000; 2012 uint64_t dram_size_three_GB = 0xB8000000; 2013 2014 if (amdgpu_smu_memory_pool_size == 0) 2015 return; 2016 2017 if (!is_os_64) { 2018 DRM_WARN("Not 64-bit OS, feature not supported\n"); 2019 goto def_value; 2020 } 2021 si_meminfo(&si); 2022 total_memory = (uint64_t)si.totalram * si.mem_unit; 2023 2024 if ((amdgpu_smu_memory_pool_size == 1) || 2025 (amdgpu_smu_memory_pool_size == 2)) { 2026 if (total_memory < dram_size_three_GB) 2027 goto def_value1; 2028 } else if ((amdgpu_smu_memory_pool_size == 4) || 2029 (amdgpu_smu_memory_pool_size == 8)) { 2030 if (total_memory < dram_size_seven_GB) 2031 goto def_value1; 2032 } else { 2033 DRM_WARN("Smu memory pool size not supported\n"); 2034 goto def_value; 2035 } 2036 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 2037 2038 return; 2039 2040 def_value1: 2041 DRM_WARN("No enough system memory\n"); 2042 def_value: 2043 adev->pm.smu_prv_buffer_size = 0; 2044 } 2045 2046 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 2047 { 2048 if (!(adev->flags & AMD_IS_APU) || 2049 adev->asic_type < CHIP_RAVEN) 2050 return 0; 2051 2052 switch (adev->asic_type) { 2053 case CHIP_RAVEN: 2054 if (adev->pdev->device == 0x15dd) 2055 adev->apu_flags |= AMD_APU_IS_RAVEN; 2056 if (adev->pdev->device == 0x15d8) 2057 adev->apu_flags |= AMD_APU_IS_PICASSO; 2058 break; 2059 case CHIP_RENOIR: 2060 if ((adev->pdev->device == 0x1636) || 2061 (adev->pdev->device == 0x164c)) 2062 adev->apu_flags |= AMD_APU_IS_RENOIR; 2063 else 2064 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 2065 break; 2066 case CHIP_VANGOGH: 2067 adev->apu_flags |= AMD_APU_IS_VANGOGH; 2068 break; 2069 case CHIP_YELLOW_CARP: 2070 break; 2071 case CHIP_CYAN_SKILLFISH: 2072 if ((adev->pdev->device == 0x13FE) || 2073 (adev->pdev->device == 0x143F)) 2074 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 2075 break; 2076 default: 2077 break; 2078 } 2079 2080 return 0; 2081 } 2082 2083 /** 2084 * amdgpu_device_check_arguments - validate module params 2085 * 2086 * @adev: amdgpu_device pointer 2087 * 2088 * Validates certain module parameters and updates 2089 * the associated values used by the driver (all asics). 
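 *
 * For example (hypothetical values): amdgpu.sched_jobs=3 is raised to the
 * minimum of 4, and amdgpu.sched_jobs=24 is rounded up to the next power of
 * two (32), with a warning printed in both cases.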
2090 */ 2091 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 2092 { 2093 int i; 2094 2095 if (amdgpu_sched_jobs < 4) { 2096 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 2097 amdgpu_sched_jobs); 2098 amdgpu_sched_jobs = 4; 2099 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 2100 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 2101 amdgpu_sched_jobs); 2102 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 2103 } 2104 2105 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 2106 /* gart size must be greater or equal to 32M */ 2107 dev_warn(adev->dev, "gart size (%d) too small\n", 2108 amdgpu_gart_size); 2109 amdgpu_gart_size = -1; 2110 } 2111 2112 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 2113 /* gtt size must be greater or equal to 32M */ 2114 dev_warn(adev->dev, "gtt size (%d) too small\n", 2115 amdgpu_gtt_size); 2116 amdgpu_gtt_size = -1; 2117 } 2118 2119 /* valid range is between 4 and 9 inclusive */ 2120 if (amdgpu_vm_fragment_size != -1 && 2121 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 2122 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 2123 amdgpu_vm_fragment_size = -1; 2124 } 2125 2126 if (amdgpu_sched_hw_submission < 2) { 2127 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 2128 amdgpu_sched_hw_submission); 2129 amdgpu_sched_hw_submission = 2; 2130 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 2131 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 2132 amdgpu_sched_hw_submission); 2133 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 2134 } 2135 2136 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 2137 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 2138 amdgpu_reset_method = -1; 2139 } 2140 2141 amdgpu_device_check_smu_prv_buffer_size(adev); 2142 2143 amdgpu_device_check_vm_size(adev); 2144 2145 amdgpu_device_check_block_size(adev); 2146 2147 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 2148 2149 for (i = 0; i < MAX_XCP; i++) { 2150 switch (amdgpu_enforce_isolation) { 2151 case -1: 2152 case 0: 2153 default: 2154 /* disable */ 2155 adev->enforce_isolation[i] = AMDGPU_ENFORCE_ISOLATION_DISABLE; 2156 break; 2157 case 1: 2158 /* enable */ 2159 adev->enforce_isolation[i] = 2160 AMDGPU_ENFORCE_ISOLATION_ENABLE; 2161 break; 2162 case 2: 2163 /* enable legacy mode */ 2164 adev->enforce_isolation[i] = 2165 AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY; 2166 break; 2167 case 3: 2168 /* enable only process isolation without submitting cleaner shader */ 2169 adev->enforce_isolation[i] = 2170 AMDGPU_ENFORCE_ISOLATION_NO_CLEANER_SHADER; 2171 break; 2172 } 2173 } 2174 2175 return 0; 2176 } 2177 2178 /** 2179 * amdgpu_switcheroo_set_state - set switcheroo state 2180 * 2181 * @pdev: pci dev pointer 2182 * @state: vga_switcheroo state 2183 * 2184 * Callback for the switcheroo driver. Suspends or resumes 2185 * the asics before or after it is powered up using ACPI methods. 
2186 */ 2187 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 2188 enum vga_switcheroo_state state) 2189 { 2190 struct drm_device *dev = pci_get_drvdata(pdev); 2191 int r; 2192 2193 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 2194 return; 2195 2196 if (state == VGA_SWITCHEROO_ON) { 2197 pr_info("switched on\n"); 2198 /* don't suspend or resume card normally */ 2199 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2200 2201 pci_set_power_state(pdev, PCI_D0); 2202 amdgpu_device_load_pci_state(pdev); 2203 r = pci_enable_device(pdev); 2204 if (r) 2205 DRM_WARN("pci_enable_device failed (%d)\n", r); 2206 amdgpu_device_resume(dev, true); 2207 2208 dev->switch_power_state = DRM_SWITCH_POWER_ON; 2209 } else { 2210 pr_info("switched off\n"); 2211 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2212 amdgpu_device_prepare(dev); 2213 amdgpu_device_suspend(dev, true); 2214 amdgpu_device_cache_pci_state(pdev); 2215 /* Shut down the device */ 2216 pci_disable_device(pdev); 2217 pci_set_power_state(pdev, PCI_D3cold); 2218 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 2219 } 2220 } 2221 2222 /** 2223 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 2224 * 2225 * @pdev: pci dev pointer 2226 * 2227 * Callback for the switcheroo driver. Check if the switcheroo 2228 * state can be changed. 2229 * Returns true if the state can be changed, false if not. 2230 */ 2231 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 2232 { 2233 struct drm_device *dev = pci_get_drvdata(pdev); 2234 2235 /* 2236 * FIXME: open_count is protected by drm_global_mutex but that would lead to 2237 * locking inversion with the driver load path. And the access here is 2238 * completely racy anyway. So don't bother with locking for now. 2239 */ 2240 return atomic_read(&dev->open_count) == 0; 2241 } 2242 2243 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 2244 .set_gpu_state = amdgpu_switcheroo_set_state, 2245 .reprobe = NULL, 2246 .can_switch = amdgpu_switcheroo_can_switch, 2247 }; 2248 2249 /** 2250 * amdgpu_device_ip_set_clockgating_state - set the CG state 2251 * 2252 * @dev: amdgpu_device pointer 2253 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2254 * @state: clockgating state (gate or ungate) 2255 * 2256 * Sets the requested clockgating state for all instances of 2257 * the hardware IP specified. 2258 * Returns the error code from the last instance. 2259 */ 2260 int amdgpu_device_ip_set_clockgating_state(void *dev, 2261 enum amd_ip_block_type block_type, 2262 enum amd_clockgating_state state) 2263 { 2264 struct amdgpu_device *adev = dev; 2265 int i, r = 0; 2266 2267 for (i = 0; i < adev->num_ip_blocks; i++) { 2268 if (!adev->ip_blocks[i].status.valid) 2269 continue; 2270 if (adev->ip_blocks[i].version->type != block_type) 2271 continue; 2272 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 2273 continue; 2274 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 2275 &adev->ip_blocks[i], state); 2276 if (r) 2277 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 2278 adev->ip_blocks[i].version->funcs->name, r); 2279 } 2280 return r; 2281 } 2282 2283 /** 2284 * amdgpu_device_ip_set_powergating_state - set the PG state 2285 * 2286 * @dev: amdgpu_device pointer 2287 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2288 * @state: powergating state (gate or ungate) 2289 * 2290 * Sets the requested powergating state for all instances of 2291 * the hardware IP specified.
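 *
 * A minimal caller sketch (illustrative, not taken from this file):
 *
 *   r = amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCN,
 *                                              AMD_PG_STATE_GATE);
 *   if (r)
 *           dev_warn(adev->dev, "gating VCN failed (%d)\n", r);
 *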
2292 * Returns the error code from the last instance. 2293 */ 2294 int amdgpu_device_ip_set_powergating_state(void *dev, 2295 enum amd_ip_block_type block_type, 2296 enum amd_powergating_state state) 2297 { 2298 struct amdgpu_device *adev = dev; 2299 int i, r = 0; 2300 2301 for (i = 0; i < adev->num_ip_blocks; i++) { 2302 if (!adev->ip_blocks[i].status.valid) 2303 continue; 2304 if (adev->ip_blocks[i].version->type != block_type) 2305 continue; 2306 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 2307 continue; 2308 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 2309 &adev->ip_blocks[i], state); 2310 if (r) 2311 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 2312 adev->ip_blocks[i].version->funcs->name, r); 2313 } 2314 return r; 2315 } 2316 2317 /** 2318 * amdgpu_device_ip_get_clockgating_state - get the CG state 2319 * 2320 * @adev: amdgpu_device pointer 2321 * @flags: clockgating feature flags 2322 * 2323 * Walks the list of IPs on the device and updates the clockgating 2324 * flags for each IP. 2325 * Updates @flags with the feature flags for each hardware IP where 2326 * clockgating is enabled. 2327 */ 2328 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 2329 u64 *flags) 2330 { 2331 int i; 2332 2333 for (i = 0; i < adev->num_ip_blocks; i++) { 2334 if (!adev->ip_blocks[i].status.valid) 2335 continue; 2336 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 2337 adev->ip_blocks[i].version->funcs->get_clockgating_state( 2338 &adev->ip_blocks[i], flags); 2339 } 2340 } 2341 2342 /** 2343 * amdgpu_device_ip_wait_for_idle - wait for idle 2344 * 2345 * @adev: amdgpu_device pointer 2346 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2347 * 2348 * Waits for the requested hardware IP to be idle. 2349 * Returns 0 for success or a negative error code on failure. 2350 */ 2351 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 2352 enum amd_ip_block_type block_type) 2353 { 2354 int i, r; 2355 2356 for (i = 0; i < adev->num_ip_blocks; i++) { 2357 if (!adev->ip_blocks[i].status.valid) 2358 continue; 2359 if (adev->ip_blocks[i].version->type == block_type) { 2360 if (adev->ip_blocks[i].version->funcs->wait_for_idle) { 2361 r = adev->ip_blocks[i].version->funcs->wait_for_idle( 2362 &adev->ip_blocks[i]); 2363 if (r) 2364 return r; 2365 } 2366 break; 2367 } 2368 } 2369 return 0; 2370 2371 } 2372 2373 /** 2374 * amdgpu_device_ip_is_valid - is the hardware IP enabled 2375 * 2376 * @adev: amdgpu_device pointer 2377 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2378 * 2379 * Check if the hardware IP is enabled or not. 2380 * Returns true if the IP is enabled, false if not. 2381 */ 2382 bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev, 2383 enum amd_ip_block_type block_type) 2384 { 2385 int i; 2386 2387 for (i = 0; i < adev->num_ip_blocks; i++) { 2388 if (adev->ip_blocks[i].version->type == block_type) 2389 return adev->ip_blocks[i].status.valid; 2390 } 2391 return false; 2392 2393 } 2394 2395 /** 2396 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 2397 * 2398 * @adev: amdgpu_device pointer 2399 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 2400 * 2401 * Returns a pointer to the hardware IP block structure 2402 * if it exists for the asic, otherwise NULL.
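 *
 * Usage sketch (illustrative), mirroring how this file looks up the GFX
 * block:
 *
 *   struct amdgpu_ip_block *ip_block =
 *           amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *   if (ip_block && ip_block->status.valid)
 *           ...the GFX IP is present and enabled...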
2403 */ 2404 struct amdgpu_ip_block * 2405 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 2406 enum amd_ip_block_type type) 2407 { 2408 int i; 2409 2410 for (i = 0; i < adev->num_ip_blocks; i++) 2411 if (adev->ip_blocks[i].version->type == type) 2412 return &adev->ip_blocks[i]; 2413 2414 return NULL; 2415 } 2416 2417 /** 2418 * amdgpu_device_ip_block_version_cmp 2419 * 2420 * @adev: amdgpu_device pointer 2421 * @type: enum amd_ip_block_type 2422 * @major: major version 2423 * @minor: minor version 2424 * 2425 * return 0 if equal or greater 2426 * return 1 if smaller or the ip_block doesn't exist 2427 */ 2428 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 2429 enum amd_ip_block_type type, 2430 u32 major, u32 minor) 2431 { 2432 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 2433 2434 if (ip_block && ((ip_block->version->major > major) || 2435 ((ip_block->version->major == major) && 2436 (ip_block->version->minor >= minor)))) 2437 return 0; 2438 2439 return 1; 2440 } 2441 2442 /** 2443 * amdgpu_device_ip_block_add 2444 * 2445 * @adev: amdgpu_device pointer 2446 * @ip_block_version: pointer to the IP to add 2447 * 2448 * Adds the IP block driver information to the collection of IPs 2449 * on the asic. 2450 */ 2451 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 2452 const struct amdgpu_ip_block_version *ip_block_version) 2453 { 2454 if (!ip_block_version) 2455 return -EINVAL; 2456 2457 switch (ip_block_version->type) { 2458 case AMD_IP_BLOCK_TYPE_VCN: 2459 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2460 return 0; 2461 break; 2462 case AMD_IP_BLOCK_TYPE_JPEG: 2463 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2464 return 0; 2465 break; 2466 default: 2467 break; 2468 } 2469 2470 dev_info(adev->dev, "detected ip block number %d <%s>\n", 2471 adev->num_ip_blocks, ip_block_version->funcs->name); 2472 2473 adev->ip_blocks[adev->num_ip_blocks].adev = adev; 2474 2475 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2476 2477 return 0; 2478 } 2479 2480 /** 2481 * amdgpu_device_enable_virtual_display - enable virtual display feature 2482 * 2483 * @adev: amdgpu_device pointer 2484 * 2485 * Enables the virtual display feature if the user has enabled it via 2486 * the module parameter virtual_display. This feature provides virtual 2487 * display hardware on headless boards or in virtualized environments. 2488 * This function parses and validates the configuration string specified by 2489 * the user and configures the virtual display configuration (number of 2490 * virtual connectors, crtcs, etc.) specified.
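 *
 * Example (hypothetical PCI address): amdgpu.virtual_display=0000:04:00.0,2
 * enables two virtual CRTCs on that device only, while
 * amdgpu.virtual_display=all,1 enables one virtual CRTC on every device;
 * entries are separated by ';' and the CRTC count is clamped to 1-6.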
2491 */ 2492 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2493 { 2494 adev->enable_virtual_display = false; 2495 2496 if (amdgpu_virtual_display) { 2497 const char *pci_address_name = pci_name(adev->pdev); 2498 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2499 2500 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2501 pciaddstr_tmp = pciaddstr; 2502 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2503 pciaddname = strsep(&pciaddname_tmp, ","); 2504 if (!strcmp("all", pciaddname) 2505 || !strcmp(pci_address_name, pciaddname)) { 2506 long num_crtc; 2507 int res = -1; 2508 2509 adev->enable_virtual_display = true; 2510 2511 if (pciaddname_tmp) 2512 res = kstrtol(pciaddname_tmp, 10, 2513 &num_crtc); 2514 2515 if (!res) { 2516 if (num_crtc < 1) 2517 num_crtc = 1; 2518 if (num_crtc > 6) 2519 num_crtc = 6; 2520 adev->mode_info.num_crtc = num_crtc; 2521 } else { 2522 adev->mode_info.num_crtc = 1; 2523 } 2524 break; 2525 } 2526 } 2527 2528 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2529 amdgpu_virtual_display, pci_address_name, 2530 adev->enable_virtual_display, adev->mode_info.num_crtc); 2531 2532 kfree(pciaddstr); 2533 } 2534 } 2535 2536 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2537 { 2538 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2539 adev->mode_info.num_crtc = 1; 2540 adev->enable_virtual_display = true; 2541 DRM_INFO("virtual_display:%d, num_crtc:%d\n", 2542 adev->enable_virtual_display, adev->mode_info.num_crtc); 2543 } 2544 } 2545 2546 /** 2547 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2548 * 2549 * @adev: amdgpu_device pointer 2550 * 2551 * Parses the asic configuration parameters specified in the gpu info 2552 * firmware and makes them available to the driver for use in configuring 2553 * the asic. 2554 * Returns 0 on success, -EINVAL on failure. 2555 */ 2556 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2557 { 2558 const char *chip_name; 2559 int err; 2560 const struct gpu_info_firmware_header_v1_0 *hdr; 2561 2562 adev->firmware.gpu_info_fw = NULL; 2563 2564 if (adev->mman.discovery_bin) 2565 return 0; 2566 2567 switch (adev->asic_type) { 2568 default: 2569 return 0; 2570 case CHIP_VEGA10: 2571 chip_name = "vega10"; 2572 break; 2573 case CHIP_VEGA12: 2574 chip_name = "vega12"; 2575 break; 2576 case CHIP_RAVEN: 2577 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2578 chip_name = "raven2"; 2579 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2580 chip_name = "picasso"; 2581 else 2582 chip_name = "raven"; 2583 break; 2584 case CHIP_ARCTURUS: 2585 chip_name = "arcturus"; 2586 break; 2587 case CHIP_NAVI12: 2588 chip_name = "navi12"; 2589 break; 2590 } 2591 2592 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, 2593 AMDGPU_UCODE_OPTIONAL, 2594 "amdgpu/%s_gpu_info.bin", chip_name); 2595 if (err) { 2596 dev_err(adev->dev, 2597 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n", 2598 chip_name); 2599 goto out; 2600 } 2601 2602 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2603 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2604 2605 switch (hdr->version_major) { 2606 case 1: 2607 { 2608 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2609 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2610 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2611 2612 /* 2613 * Should be dropped when DAL no longer needs it. 
2614 */ 2615 if (adev->asic_type == CHIP_NAVI12) 2616 goto parse_soc_bounding_box; 2617 2618 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2619 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2620 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2621 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2622 adev->gfx.config.max_texture_channel_caches = 2623 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2624 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2625 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2626 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2627 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2628 adev->gfx.config.double_offchip_lds_buf = 2629 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2630 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2631 adev->gfx.cu_info.max_waves_per_simd = 2632 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2633 adev->gfx.cu_info.max_scratch_slots_per_cu = 2634 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2635 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2636 if (hdr->version_minor >= 1) { 2637 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2638 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2639 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2640 adev->gfx.config.num_sc_per_sh = 2641 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2642 adev->gfx.config.num_packer_per_sc = 2643 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2644 } 2645 2646 parse_soc_bounding_box: 2647 /* 2648 * soc bounding box info is not integrated in the discovery table, 2649 * so we always need to parse it from the gpu info firmware if needed. 2650 */ 2651 if (hdr->version_minor == 2) { 2652 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2653 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2654 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2655 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2656 } 2657 break; 2658 } 2659 default: 2660 dev_err(adev->dev, 2661 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2662 err = -EINVAL; 2663 goto out; 2664 } 2665 out: 2666 return err; 2667 } 2668 2669 /** 2670 * amdgpu_device_ip_early_init - run early init for hardware IPs 2671 * 2672 * @adev: amdgpu_device pointer 2673 * 2674 * Early initialization pass for hardware IPs. The hardware IPs that make 2675 * up each asic are discovered and each IP's early_init callback is run. This 2676 * is the first stage in initializing the asic. 2677 * Returns 0 on success, negative error code on failure.
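 *
 * A hypothetical early_init callback sketch (foo_early_init() and
 * foo_hw_present() are made-up names); returning -ENOENT marks the block
 * as not present instead of failing the whole pass:
 *
 *   static int foo_early_init(struct amdgpu_ip_block *ip_block)
 *   {
 *           if (!foo_hw_present(ip_block->adev))
 *                   return -ENOENT;
 *           return 0;
 *   }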
2678 */ 2679 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2680 { 2681 struct amdgpu_ip_block *ip_block; 2682 struct pci_dev *parent; 2683 bool total, skip_bios; 2684 uint32_t bios_flags; 2685 int i, r; 2686 2687 amdgpu_device_enable_virtual_display(adev); 2688 2689 if (amdgpu_sriov_vf(adev)) { 2690 r = amdgpu_virt_request_full_gpu(adev, true); 2691 if (r) 2692 return r; 2693 } 2694 2695 switch (adev->asic_type) { 2696 #ifdef CONFIG_DRM_AMDGPU_SI 2697 case CHIP_VERDE: 2698 case CHIP_TAHITI: 2699 case CHIP_PITCAIRN: 2700 case CHIP_OLAND: 2701 case CHIP_HAINAN: 2702 adev->family = AMDGPU_FAMILY_SI; 2703 r = si_set_ip_blocks(adev); 2704 if (r) 2705 return r; 2706 break; 2707 #endif 2708 #ifdef CONFIG_DRM_AMDGPU_CIK 2709 case CHIP_BONAIRE: 2710 case CHIP_HAWAII: 2711 case CHIP_KAVERI: 2712 case CHIP_KABINI: 2713 case CHIP_MULLINS: 2714 if (adev->flags & AMD_IS_APU) 2715 adev->family = AMDGPU_FAMILY_KV; 2716 else 2717 adev->family = AMDGPU_FAMILY_CI; 2718 2719 r = cik_set_ip_blocks(adev); 2720 if (r) 2721 return r; 2722 break; 2723 #endif 2724 case CHIP_TOPAZ: 2725 case CHIP_TONGA: 2726 case CHIP_FIJI: 2727 case CHIP_POLARIS10: 2728 case CHIP_POLARIS11: 2729 case CHIP_POLARIS12: 2730 case CHIP_VEGAM: 2731 case CHIP_CARRIZO: 2732 case CHIP_STONEY: 2733 if (adev->flags & AMD_IS_APU) 2734 adev->family = AMDGPU_FAMILY_CZ; 2735 else 2736 adev->family = AMDGPU_FAMILY_VI; 2737 2738 r = vi_set_ip_blocks(adev); 2739 if (r) 2740 return r; 2741 break; 2742 default: 2743 r = amdgpu_discovery_set_ip_blocks(adev); 2744 if (r) 2745 return r; 2746 break; 2747 } 2748 2749 /* Check for IP version 9.4.3 with A0 hardware */ 2750 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) && 2751 !amdgpu_device_get_rev_id(adev)) { 2752 dev_err(adev->dev, "Unsupported A0 hardware\n"); 2753 return -ENODEV; /* device unsupported - no device error */ 2754 } 2755 2756 if (amdgpu_has_atpx() && 2757 (amdgpu_is_atpx_hybrid() || 2758 amdgpu_has_atpx_dgpu_power_cntl()) && 2759 ((adev->flags & AMD_IS_APU) == 0) && 2760 !dev_is_removable(&adev->pdev->dev)) 2761 adev->flags |= AMD_IS_PX; 2762 2763 if (!(adev->flags & AMD_IS_APU)) { 2764 parent = pcie_find_root_port(adev->pdev); 2765 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2766 } 2767 2768 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2769 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2770 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2771 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2772 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2773 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2774 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2775 2776 adev->virt.is_xgmi_node_migrate_enabled = false; 2777 if (amdgpu_sriov_vf(adev)) { 2778 adev->virt.is_xgmi_node_migrate_enabled = 2779 amdgpu_ip_version((adev), GC_HWIP, 0) == IP_VERSION(9, 4, 4); 2780 } 2781 2782 total = true; 2783 for (i = 0; i < adev->num_ip_blocks; i++) { 2784 ip_block = &adev->ip_blocks[i]; 2785 2786 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2787 DRM_WARN("disabled ip block: %d <%s>\n", 2788 i, adev->ip_blocks[i].version->funcs->name); 2789 adev->ip_blocks[i].status.valid = false; 2790 } else if (ip_block->version->funcs->early_init) { 2791 r = ip_block->version->funcs->early_init(ip_block); 2792 if (r == -ENOENT) { 2793 adev->ip_blocks[i].status.valid = false; 2794 } else if (r) { 2795 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2796 adev->ip_blocks[i].version->funcs->name, r); 2797 total = false; 2798 } else { 2799 adev->ip_blocks[i].status.valid = true; 2800 } 2801 } else { 2802 adev->ip_blocks[i].status.valid = true; 2803 } 2804 /* get the vbios after the asic_funcs are set up */ 2805 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2806 r = amdgpu_device_parse_gpu_info_fw(adev); 2807 if (r) 2808 return r; 2809 2810 bios_flags = amdgpu_device_get_vbios_flags(adev); 2811 skip_bios = !!(bios_flags & AMDGPU_VBIOS_SKIP); 2812 /* Read BIOS */ 2813 if (!skip_bios) { 2814 bool optional = 2815 !!(bios_flags & AMDGPU_VBIOS_OPTIONAL); 2816 if (!amdgpu_get_bios(adev) && !optional) 2817 return -EINVAL; 2818 2819 if (optional && !adev->bios) 2820 dev_info( 2821 adev->dev, 2822 "VBIOS image optional, proceeding without VBIOS image"); 2823 2824 if (adev->bios) { 2825 r = amdgpu_atombios_init(adev); 2826 if (r) { 2827 dev_err(adev->dev, 2828 "amdgpu_atombios_init failed\n"); 2829 amdgpu_vf_error_put( 2830 adev, 2831 AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 2832 0, 0); 2833 return r; 2834 } 2835 } 2836 } 2837 2838 /*get pf2vf msg info at it's earliest time*/ 2839 if (amdgpu_sriov_vf(adev)) 2840 amdgpu_virt_init_data_exchange(adev); 2841 2842 } 2843 } 2844 if (!total) 2845 return -ENODEV; 2846 2847 if (adev->gmc.xgmi.supported) 2848 amdgpu_xgmi_early_init(adev); 2849 2850 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 2851 if (ip_block->status.valid != false) 2852 amdgpu_amdkfd_device_probe(adev); 2853 2854 adev->cg_flags &= amdgpu_cg_mask; 2855 adev->pg_flags &= amdgpu_pg_mask; 2856 2857 return 0; 2858 } 2859 2860 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2861 { 2862 int i, r; 2863 2864 for (i = 0; i < adev->num_ip_blocks; i++) { 2865 if (!adev->ip_blocks[i].status.sw) 2866 continue; 2867 if (adev->ip_blocks[i].status.hw) 2868 continue; 2869 if (!amdgpu_ip_member_of_hwini( 2870 adev, adev->ip_blocks[i].version->type)) 2871 continue; 2872 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2873 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2874 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2875 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2876 if (r) { 2877 
DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2878 adev->ip_blocks[i].version->funcs->name, r); 2879 return r; 2880 } 2881 adev->ip_blocks[i].status.hw = true; 2882 } 2883 } 2884 2885 return 0; 2886 } 2887 2888 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2889 { 2890 int i, r; 2891 2892 for (i = 0; i < adev->num_ip_blocks; i++) { 2893 if (!adev->ip_blocks[i].status.sw) 2894 continue; 2895 if (adev->ip_blocks[i].status.hw) 2896 continue; 2897 if (!amdgpu_ip_member_of_hwini( 2898 adev, adev->ip_blocks[i].version->type)) 2899 continue; 2900 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2901 if (r) { 2902 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2903 adev->ip_blocks[i].version->funcs->name, r); 2904 return r; 2905 } 2906 adev->ip_blocks[i].status.hw = true; 2907 } 2908 2909 return 0; 2910 } 2911 2912 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2913 { 2914 int r = 0; 2915 int i; 2916 uint32_t smu_version; 2917 2918 if (adev->asic_type >= CHIP_VEGA10) { 2919 for (i = 0; i < adev->num_ip_blocks; i++) { 2920 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2921 continue; 2922 2923 if (!amdgpu_ip_member_of_hwini(adev, 2924 AMD_IP_BLOCK_TYPE_PSP)) 2925 break; 2926 2927 if (!adev->ip_blocks[i].status.sw) 2928 continue; 2929 2930 /* no need to do the fw loading again if already done*/ 2931 if (adev->ip_blocks[i].status.hw == true) 2932 break; 2933 2934 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2935 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 2936 if (r) 2937 return r; 2938 } else { 2939 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2940 if (r) { 2941 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2942 adev->ip_blocks[i].version->funcs->name, r); 2943 return r; 2944 } 2945 adev->ip_blocks[i].status.hw = true; 2946 } 2947 break; 2948 } 2949 } 2950 2951 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2952 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2953 2954 return r; 2955 } 2956 2957 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2958 { 2959 struct drm_sched_init_args args = { 2960 .ops = &amdgpu_sched_ops, 2961 .num_rqs = DRM_SCHED_PRIORITY_COUNT, 2962 .timeout_wq = adev->reset_domain->wq, 2963 .dev = adev->dev, 2964 }; 2965 long timeout; 2966 int r, i; 2967 2968 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2969 struct amdgpu_ring *ring = adev->rings[i]; 2970 2971 /* No need to setup the GPU scheduler for rings that don't need it */ 2972 if (!ring || ring->no_scheduler) 2973 continue; 2974 2975 switch (ring->funcs->type) { 2976 case AMDGPU_RING_TYPE_GFX: 2977 timeout = adev->gfx_timeout; 2978 break; 2979 case AMDGPU_RING_TYPE_COMPUTE: 2980 timeout = adev->compute_timeout; 2981 break; 2982 case AMDGPU_RING_TYPE_SDMA: 2983 timeout = adev->sdma_timeout; 2984 break; 2985 default: 2986 timeout = adev->video_timeout; 2987 break; 2988 } 2989 2990 args.timeout = timeout; 2991 args.credit_limit = ring->num_hw_submission; 2992 args.score = ring->sched_score; 2993 args.name = ring->name; 2994 2995 r = drm_sched_init(&ring->sched, &args); 2996 if (r) { 2997 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2998 ring->name); 2999 return r; 3000 } 3001 r = amdgpu_uvd_entity_init(adev, ring); 3002 if (r) { 3003 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n", 3004 ring->name); 3005 return r; 3006 } 3007 r = amdgpu_vce_entity_init(adev, ring); 3008 if (r) { 3009 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n", 
3010 ring->name); 3011 return r; 3012 } 3013 } 3014 3015 if (adev->xcp_mgr) 3016 amdgpu_xcp_update_partition_sched_list(adev); 3017 3018 return 0; 3019 } 3020 3021 3022 /** 3023 * amdgpu_device_ip_init - run init for hardware IPs 3024 * 3025 * @adev: amdgpu_device pointer 3026 * 3027 * Main initialization pass for hardware IPs. The list of all the hardware 3028 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 3029 * are run. sw_init initializes the software state associated with each IP 3030 * and hw_init initializes the hardware associated with each IP. 3031 * Returns 0 on success, negative error code on failure. 3032 */ 3033 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 3034 { 3035 bool init_badpage; 3036 int i, r; 3037 3038 r = amdgpu_ras_init(adev); 3039 if (r) 3040 return r; 3041 3042 for (i = 0; i < adev->num_ip_blocks; i++) { 3043 if (!adev->ip_blocks[i].status.valid) 3044 continue; 3045 if (adev->ip_blocks[i].version->funcs->sw_init) { 3046 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]); 3047 if (r) { 3048 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 3049 adev->ip_blocks[i].version->funcs->name, r); 3050 goto init_failed; 3051 } 3052 } 3053 adev->ip_blocks[i].status.sw = true; 3054 3055 if (!amdgpu_ip_member_of_hwini( 3056 adev, adev->ip_blocks[i].version->type)) 3057 continue; 3058 3059 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 3060 /* need to do common hw init early so everything is set up for gmc */ 3061 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3062 if (r) { 3063 DRM_ERROR("hw_init %d failed %d\n", i, r); 3064 goto init_failed; 3065 } 3066 adev->ip_blocks[i].status.hw = true; 3067 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3068 /* need to do gmc hw init early so we can allocate gpu mem */ 3069 /* Try to reserve bad pages early */ 3070 if (amdgpu_sriov_vf(adev)) 3071 amdgpu_virt_exchange_data(adev); 3072 3073 r = amdgpu_device_mem_scratch_init(adev); 3074 if (r) { 3075 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 3076 goto init_failed; 3077 } 3078 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3079 if (r) { 3080 DRM_ERROR("hw_init %d failed %d\n", i, r); 3081 goto init_failed; 3082 } 3083 r = amdgpu_device_wb_init(adev); 3084 if (r) { 3085 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 3086 goto init_failed; 3087 } 3088 adev->ip_blocks[i].status.hw = true; 3089 3090 /* right after GMC hw init, we create CSA */ 3091 if (adev->gfx.mcbp) { 3092 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 3093 AMDGPU_GEM_DOMAIN_VRAM | 3094 AMDGPU_GEM_DOMAIN_GTT, 3095 AMDGPU_CSA_SIZE); 3096 if (r) { 3097 DRM_ERROR("allocate CSA failed %d\n", r); 3098 goto init_failed; 3099 } 3100 } 3101 3102 r = amdgpu_seq64_init(adev); 3103 if (r) { 3104 DRM_ERROR("allocate seq64 failed %d\n", r); 3105 goto init_failed; 3106 } 3107 } 3108 } 3109 3110 if (amdgpu_sriov_vf(adev)) 3111 amdgpu_virt_init_data_exchange(adev); 3112 3113 r = amdgpu_ib_pool_init(adev); 3114 if (r) { 3115 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 3116 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 3117 goto init_failed; 3118 } 3119 3120 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 3121 if (r) 3122 goto init_failed; 3123 3124 r = amdgpu_device_ip_hw_init_phase1(adev); 3125 if (r) 3126 goto init_failed; 3127 3128 r = amdgpu_device_fw_loading(adev); 3129 if (r) 3130 goto init_failed; 3131 
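/*
 * At this point COMMON, GMC and IH are up and amdgpu_device_fw_loading()
 * above has initialized PSP and loaded the SMU firmware where applicable,
 * so the remaining IP blocks initialized in phase 2 below can rely on them.
 */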
3132 r = amdgpu_device_ip_hw_init_phase2(adev); 3133 if (r) 3134 goto init_failed; 3135 3136 /* 3137 * retired pages will be loaded from eeprom and reserved here, 3138 * it should be called after amdgpu_device_ip_hw_init_phase2 since 3139 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 3140 * for I2C communication, which is only true at this point. 3141 * 3142 * amdgpu_ras_recovery_init may fail, but the upper layers only care 3143 * about failures caused by a bad gpu situation and stop the amdgpu init 3144 * process accordingly. For other failure cases, it will still release 3145 * all the resources and print an error message, rather than returning a 3146 * negative value to the upper level. 3147 * 3148 * Note: theoretically, this should be called before all vram allocations 3149 * to protect retired pages from being reused 3150 */ 3151 init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 3152 r = amdgpu_ras_recovery_init(adev, init_badpage); 3153 if (r) 3154 goto init_failed; 3155 3156 /* 3157 * In case of XGMI, grab an extra reference on the reset domain for this device 3158 */ 3159 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3160 if (amdgpu_xgmi_add_device(adev) == 0) { 3161 if (!amdgpu_sriov_vf(adev)) { 3162 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3163 3164 if (WARN_ON(!hive)) { 3165 r = -ENOENT; 3166 goto init_failed; 3167 } 3168 3169 if (!hive->reset_domain || 3170 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 3171 r = -ENOENT; 3172 amdgpu_put_xgmi_hive(hive); 3173 goto init_failed; 3174 } 3175 3176 /* Drop the early temporary reset domain we created for device */ 3177 amdgpu_reset_put_reset_domain(adev->reset_domain); 3178 adev->reset_domain = hive->reset_domain; 3179 amdgpu_put_xgmi_hive(hive); 3180 } 3181 } 3182 } 3183 3184 r = amdgpu_device_init_schedulers(adev); 3185 if (r) 3186 goto init_failed; 3187 3188 if (adev->mman.buffer_funcs_ring->sched.ready) 3189 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3190 3191 /* Don't init kfd if the whole hive needs to be reset during init */ 3192 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 3193 kgd2kfd_init_zone_device(adev); 3194 amdgpu_amdkfd_device_init(adev); 3195 } 3196 3197 amdgpu_fru_get_product_info(adev); 3198 3199 if (!amdgpu_sriov_vf(adev) || amdgpu_sriov_ras_cper_en(adev)) 3200 r = amdgpu_cper_init(adev); 3201 3202 init_failed: 3203 3204 return r; 3205 } 3206 3207 /** 3208 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 3209 * 3210 * @adev: amdgpu_device pointer 3211 * 3212 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 3213 * this function before a GPU reset. If the value is retained after a 3214 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 3215 */ 3216 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 3217 { 3218 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 3219 } 3220 3221 /** 3222 * amdgpu_device_check_vram_lost - check if vram is valid 3223 * 3224 * @adev: amdgpu_device pointer 3225 * 3226 * Checks the reset magic value written to the gart pointer in VRAM. 3227 * The driver calls this after a GPU reset to see if the contents of 3228 * VRAM is lost or not.
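 *
 * Illustrative reset-path sketch (not a verbatim copy of the reset code):
 *
 *   amdgpu_device_fill_reset_magic(adev);
 *   ...ASIC reset...
 *   if (amdgpu_device_check_vram_lost(adev))
 *           ...VRAM contents were lost and must be restored...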
3230 */ 3231 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 3232 { 3233 if (memcmp(adev->gart.ptr, adev->reset_magic, 3234 AMDGPU_RESET_MAGIC_NUM)) 3235 return true; 3236 3237 if (!amdgpu_in_reset(adev)) 3238 return false; 3239 3240 /* 3241 * For all ASICs with baco/mode1 reset, the VRAM is 3242 * always assumed to be lost. 3243 */ 3244 switch (amdgpu_asic_reset_method(adev)) { 3245 case AMD_RESET_METHOD_LINK: 3246 case AMD_RESET_METHOD_BACO: 3247 case AMD_RESET_METHOD_MODE1: 3248 return true; 3249 default: 3250 return false; 3251 } 3252 } 3253 3254 /** 3255 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 3256 * 3257 * @adev: amdgpu_device pointer 3258 * @state: clockgating state (gate or ungate) 3259 * 3260 * The list of all the hardware IPs that make up the asic is walked and the 3261 * set_clockgating_state callbacks are run. 3262 * Late initialization pass enabling clockgating for hardware IPs. 3263 * Fini or suspend, pass disabling clockgating for hardware IPs. 3264 * Returns 0 on success, negative error code on failure. 3265 */ 3266 3267 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 3268 enum amd_clockgating_state state) 3269 { 3270 int i, j, r; 3271 3272 if (amdgpu_emu_mode == 1) 3273 return 0; 3274 3275 for (j = 0; j < adev->num_ip_blocks; j++) { 3276 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3277 if (!adev->ip_blocks[i].status.late_initialized) 3278 continue; 3279 /* skip CG for GFX, SDMA on S0ix */ 3280 if (adev->in_s0ix && 3281 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3282 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3283 continue; 3284 /* skip CG for VCE/UVD, it's handled specially */ 3285 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3286 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3287 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3288 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3289 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 3290 /* enable clockgating to save power */ 3291 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i], 3292 state); 3293 if (r) { 3294 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 3295 adev->ip_blocks[i].version->funcs->name, r); 3296 return r; 3297 } 3298 } 3299 } 3300 3301 return 0; 3302 } 3303 3304 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 3305 enum amd_powergating_state state) 3306 { 3307 int i, j, r; 3308 3309 if (amdgpu_emu_mode == 1) 3310 return 0; 3311 3312 for (j = 0; j < adev->num_ip_blocks; j++) { 3313 i = state == AMD_PG_STATE_GATE ? 
j : adev->num_ip_blocks - j - 1; 3314 if (!adev->ip_blocks[i].status.late_initialized) 3315 continue; 3316 /* skip PG for GFX, SDMA on S0ix */ 3317 if (adev->in_s0ix && 3318 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3319 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3320 continue; 3321 /* skip PG for VCE/UVD, it's handled specially */ 3322 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3323 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3324 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3325 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3326 adev->ip_blocks[i].version->funcs->set_powergating_state) { 3327 /* enable powergating to save power */ 3328 r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i], 3329 state); 3330 if (r) { 3331 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 3332 adev->ip_blocks[i].version->funcs->name, r); 3333 return r; 3334 } 3335 } 3336 } 3337 return 0; 3338 } 3339 3340 static int amdgpu_device_enable_mgpu_fan_boost(void) 3341 { 3342 struct amdgpu_gpu_instance *gpu_ins; 3343 struct amdgpu_device *adev; 3344 int i, ret = 0; 3345 3346 mutex_lock(&mgpu_info.mutex); 3347 3348 /* 3349 * MGPU fan boost feature should be enabled 3350 * only when there are two or more dGPUs in 3351 * the system 3352 */ 3353 if (mgpu_info.num_dgpu < 2) 3354 goto out; 3355 3356 for (i = 0; i < mgpu_info.num_dgpu; i++) { 3357 gpu_ins = &(mgpu_info.gpu_ins[i]); 3358 adev = gpu_ins->adev; 3359 if (!(adev->flags & AMD_IS_APU) && 3360 !gpu_ins->mgpu_fan_enabled) { 3361 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 3362 if (ret) 3363 break; 3364 3365 gpu_ins->mgpu_fan_enabled = 1; 3366 } 3367 } 3368 3369 out: 3370 mutex_unlock(&mgpu_info.mutex); 3371 3372 return ret; 3373 } 3374 3375 /** 3376 * amdgpu_device_ip_late_init - run late init for hardware IPs 3377 * 3378 * @adev: amdgpu_device pointer 3379 * 3380 * Late initialization pass for hardware IPs. The list of all the hardware 3381 * IPs that make up the asic is walked and the late_init callbacks are run. 3382 * late_init covers any special initialization that an IP requires 3383 * after all of the IPs have been initialized or something that needs to happen 3384 * late in the init process. 3385 * Returns 0 on success, negative error code on failure.
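 *
 * As implemented below, this pass also enables clockgating and powergating
 * via amdgpu_device_set_cg_state()/amdgpu_device_set_pg_state() and records
 * the reset magic via amdgpu_device_fill_reset_magic().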
3386 */ 3387 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 3388 { 3389 struct amdgpu_gpu_instance *gpu_instance; 3390 int i = 0, r; 3391 3392 for (i = 0; i < adev->num_ip_blocks; i++) { 3393 if (!adev->ip_blocks[i].status.hw) 3394 continue; 3395 if (adev->ip_blocks[i].version->funcs->late_init) { 3396 r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]); 3397 if (r) { 3398 DRM_ERROR("late_init of IP block <%s> failed %d\n", 3399 adev->ip_blocks[i].version->funcs->name, r); 3400 return r; 3401 } 3402 } 3403 adev->ip_blocks[i].status.late_initialized = true; 3404 } 3405 3406 r = amdgpu_ras_late_init(adev); 3407 if (r) { 3408 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 3409 return r; 3410 } 3411 3412 if (!amdgpu_reset_in_recovery(adev)) 3413 amdgpu_ras_set_error_query_ready(adev, true); 3414 3415 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3416 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3417 3418 amdgpu_device_fill_reset_magic(adev); 3419 3420 r = amdgpu_device_enable_mgpu_fan_boost(); 3421 if (r) 3422 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 3423 3424 /* For passthrough configuration on arcturus and aldebaran, enable special handling for SBR */ 3425 if (amdgpu_passthrough(adev) && 3426 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 3427 adev->asic_type == CHIP_ALDEBARAN)) 3428 amdgpu_dpm_handle_passthrough_sbr(adev, true); 3429 3430 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3431 mutex_lock(&mgpu_info.mutex); 3432 3433 /* 3434 * Reset the device p-state to low as it was booted with high. 3435 * 3436 * This should be performed only after all devices from the same 3437 * hive get initialized. 3438 * 3439 * However, the number of devices in the hive is not known in advance, 3440 * as it is counted one by one during device initialization. 3441 * 3442 * So, we wait for all XGMI interlinked devices to be initialized. 3443 * This may bring some delay as those devices may come from 3444 * different hives, but that should be OK.
3445 */ 3446 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3447 for (i = 0; i < mgpu_info.num_gpu; i++) { 3448 gpu_instance = &(mgpu_info.gpu_ins[i]); 3449 if (gpu_instance->adev->flags & AMD_IS_APU) 3450 continue; 3451 3452 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3453 AMDGPU_XGMI_PSTATE_MIN); 3454 if (r) { 3455 DRM_ERROR("pstate setting failed (%d).\n", r); 3456 break; 3457 } 3458 } 3459 } 3460 3461 mutex_unlock(&mgpu_info.mutex); 3462 } 3463 3464 return 0; 3465 } 3466 3467 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block) 3468 { 3469 int r; 3470 3471 if (!ip_block->version->funcs->hw_fini) { 3472 DRM_ERROR("hw_fini of IP block <%s> not defined\n", 3473 ip_block->version->funcs->name); 3474 } else { 3475 r = ip_block->version->funcs->hw_fini(ip_block); 3476 /* XXX handle errors */ 3477 if (r) { 3478 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3479 ip_block->version->funcs->name, r); 3480 } 3481 } 3482 3483 ip_block->status.hw = false; 3484 } 3485 3486 /** 3487 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3488 * 3489 * @adev: amdgpu_device pointer 3490 * 3491 * For ASICs need to disable SMC first 3492 */ 3493 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3494 { 3495 int i; 3496 3497 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3498 return; 3499 3500 for (i = 0; i < adev->num_ip_blocks; i++) { 3501 if (!adev->ip_blocks[i].status.hw) 3502 continue; 3503 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3504 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3505 break; 3506 } 3507 } 3508 } 3509 3510 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3511 { 3512 int i, r; 3513 3514 for (i = 0; i < adev->num_ip_blocks; i++) { 3515 if (!adev->ip_blocks[i].version->funcs->early_fini) 3516 continue; 3517 3518 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]); 3519 if (r) { 3520 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 3521 adev->ip_blocks[i].version->funcs->name, r); 3522 } 3523 } 3524 3525 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3526 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3527 3528 amdgpu_amdkfd_suspend(adev, true); 3529 amdgpu_userq_suspend(adev); 3530 3531 /* Workaround for ASICs need to disable SMC first */ 3532 amdgpu_device_smu_fini_early(adev); 3533 3534 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3535 if (!adev->ip_blocks[i].status.hw) 3536 continue; 3537 3538 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3539 } 3540 3541 if (amdgpu_sriov_vf(adev)) { 3542 if (amdgpu_virt_release_full_gpu(adev, false)) 3543 DRM_ERROR("failed to release exclusive mode on fini\n"); 3544 } 3545 3546 return 0; 3547 } 3548 3549 /** 3550 * amdgpu_device_ip_fini - run fini for hardware IPs 3551 * 3552 * @adev: amdgpu_device pointer 3553 * 3554 * Main teardown pass for hardware IPs. The list of all the hardware 3555 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3556 * are run. hw_fini tears down the hardware associated with each IP 3557 * and sw_fini tears down any software state associated with each IP. 3558 * Returns 0 on success, negative error code on failure. 
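 *
 * Teardown walks the IP list in reverse order, so the blocks registered
 * last via amdgpu_device_ip_block_add() are torn down first.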
3559 */ 3560 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3561 { 3562 int i, r; 3563 3564 amdgpu_cper_fini(adev); 3565 3566 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3567 amdgpu_virt_release_ras_err_handler_data(adev); 3568 3569 if (adev->gmc.xgmi.num_physical_nodes > 1) 3570 amdgpu_xgmi_remove_device(adev); 3571 3572 amdgpu_amdkfd_device_fini_sw(adev); 3573 3574 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3575 if (!adev->ip_blocks[i].status.sw) 3576 continue; 3577 3578 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3579 amdgpu_ucode_free_bo(adev); 3580 amdgpu_free_static_csa(&adev->virt.csa_obj); 3581 amdgpu_device_wb_fini(adev); 3582 amdgpu_device_mem_scratch_fini(adev); 3583 amdgpu_ib_pool_fini(adev); 3584 amdgpu_seq64_fini(adev); 3585 amdgpu_doorbell_fini(adev); 3586 } 3587 if (adev->ip_blocks[i].version->funcs->sw_fini) { 3588 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]); 3589 /* XXX handle errors */ 3590 if (r) { 3591 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 3592 adev->ip_blocks[i].version->funcs->name, r); 3593 } 3594 } 3595 adev->ip_blocks[i].status.sw = false; 3596 adev->ip_blocks[i].status.valid = false; 3597 } 3598 3599 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3600 if (!adev->ip_blocks[i].status.late_initialized) 3601 continue; 3602 if (adev->ip_blocks[i].version->funcs->late_fini) 3603 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]); 3604 adev->ip_blocks[i].status.late_initialized = false; 3605 } 3606 3607 amdgpu_ras_fini(adev); 3608 3609 return 0; 3610 } 3611 3612 /** 3613 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3614 * 3615 * @work: work_struct. 3616 */ 3617 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3618 { 3619 struct amdgpu_device *adev = 3620 container_of(work, struct amdgpu_device, delayed_init_work.work); 3621 int r; 3622 3623 r = amdgpu_ib_ring_tests(adev); 3624 if (r) 3625 DRM_ERROR("ib ring test failed (%d).\n", r); 3626 } 3627 3628 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3629 { 3630 struct amdgpu_device *adev = 3631 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3632 3633 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3634 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3635 3636 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true, 0)) 3637 adev->gfx.gfx_off_state = true; 3638 } 3639 3640 /** 3641 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3642 * 3643 * @adev: amdgpu_device pointer 3644 * 3645 * Main suspend function for hardware IPs. The list of all the hardware 3646 * IPs that make up the asic is walked, clockgating is disabled and the 3647 * suspend callbacks are run. suspend puts the hardware and software state 3648 * in each IP into a state suitable for suspend. 3649 * Returns 0 on success, negative error code on failure. 3650 */ 3651 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3652 { 3653 int i, r; 3654 3655 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3656 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3657 3658 /* 3659 * Per PMFW team's suggestion, driver needs to handle gfxoff 3660 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3661 * scenario. Add the missing df cstate disablement here. 
3662 */ 3663 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3664 dev_warn(adev->dev, "Failed to disallow df cstate"); 3665 3666 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3667 if (!adev->ip_blocks[i].status.valid) 3668 continue; 3669 3670 /* displays are handled separately */ 3671 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3672 continue; 3673 3674 /* XXX handle errors */ 3675 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3676 if (r) 3677 return r; 3678 } 3679 3680 return 0; 3681 } 3682 3683 /** 3684 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3685 * 3686 * @adev: amdgpu_device pointer 3687 * 3688 * Main suspend function for hardware IPs. The list of all the hardware 3689 * IPs that make up the asic is walked, clockgating is disabled and the 3690 * suspend callbacks are run. suspend puts the hardware and software state 3691 * in each IP into a state suitable for suspend. 3692 * Returns 0 on success, negative error code on failure. 3693 */ 3694 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3695 { 3696 int i, r; 3697 3698 if (adev->in_s0ix) 3699 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3700 3701 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3702 if (!adev->ip_blocks[i].status.valid) 3703 continue; 3704 /* displays are handled in phase1 */ 3705 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3706 continue; 3707 /* PSP lost connection when err_event_athub occurs */ 3708 if (amdgpu_ras_intr_triggered() && 3709 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3710 adev->ip_blocks[i].status.hw = false; 3711 continue; 3712 } 3713 3714 /* skip unnecessary suspend if we do not initialize them yet */ 3715 if (!amdgpu_ip_member_of_hwini( 3716 adev, adev->ip_blocks[i].version->type)) 3717 continue; 3718 3719 /* Since we skip suspend for S0i3, we need to cancel the delayed 3720 * idle work here as the suspend callback never gets called. 3721 */ 3722 if (adev->in_s0ix && 3723 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX && 3724 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 0, 0)) 3725 cancel_delayed_work_sync(&adev->gfx.idle_work); 3726 /* skip suspend of gfx/mes and psp for S0ix 3727 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3728 * like at runtime. PSP is also part of the always on hardware 3729 * so no need to suspend it. 3730 */ 3731 if (adev->in_s0ix && 3732 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3733 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3734 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3735 continue; 3736 3737 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3738 if (adev->in_s0ix && 3739 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3740 IP_VERSION(5, 0, 0)) && 3741 (adev->ip_blocks[i].version->type == 3742 AMD_IP_BLOCK_TYPE_SDMA)) 3743 continue; 3744 3745 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3746 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3747 * from this location and RLC Autoload automatically also gets loaded 3748 * from here based on PMFW -> PSP message during re-init sequence. 3749 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3750 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3751 */ 3752 if (amdgpu_in_reset(adev) && 3753 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3754 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3755 continue; 3756 3757 /* XXX handle errors */ 3758 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3759 adev->ip_blocks[i].status.hw = false; 3760 3761 /* handle putting the SMC in the appropriate state */ 3762 if (!amdgpu_sriov_vf(adev)) { 3763 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3764 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3765 if (r) { 3766 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3767 adev->mp1_state, r); 3768 return r; 3769 } 3770 } 3771 } 3772 } 3773 3774 return 0; 3775 } 3776 3777 /** 3778 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3779 * 3780 * @adev: amdgpu_device pointer 3781 * 3782 * Main suspend function for hardware IPs. The list of all the hardware 3783 * IPs that make up the asic is walked, clockgating is disabled and the 3784 * suspend callbacks are run. suspend puts the hardware and software state 3785 * in each IP into a state suitable for suspend. 3786 * Returns 0 on success, negative error code on failure. 3787 */ 3788 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3789 { 3790 int r; 3791 3792 if (amdgpu_sriov_vf(adev)) { 3793 amdgpu_virt_fini_data_exchange(adev); 3794 amdgpu_virt_request_full_gpu(adev, false); 3795 } 3796 3797 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3798 3799 r = amdgpu_device_ip_suspend_phase1(adev); 3800 if (r) 3801 return r; 3802 r = amdgpu_device_ip_suspend_phase2(adev); 3803 3804 if (amdgpu_sriov_vf(adev)) 3805 amdgpu_virt_release_full_gpu(adev, false); 3806 3807 return r; 3808 } 3809 3810 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3811 { 3812 int i, r; 3813 3814 static enum amd_ip_block_type ip_order[] = { 3815 AMD_IP_BLOCK_TYPE_COMMON, 3816 AMD_IP_BLOCK_TYPE_GMC, 3817 AMD_IP_BLOCK_TYPE_PSP, 3818 AMD_IP_BLOCK_TYPE_IH, 3819 }; 3820 3821 for (i = 0; i < adev->num_ip_blocks; i++) { 3822 int j; 3823 struct amdgpu_ip_block *block; 3824 3825 block = &adev->ip_blocks[i]; 3826 block->status.hw = false; 3827 3828 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3829 3830 if (block->version->type != ip_order[j] || 3831 !block->status.valid) 3832 continue; 3833 3834 r = block->version->funcs->hw_init(&adev->ip_blocks[i]); 3835 if (r) { 3836 dev_err(adev->dev, "RE-INIT-early: %s failed\n", 3837 block->version->funcs->name); 3838 return r; 3839 } 3840 block->status.hw = true; 3841 } 3842 } 3843 3844 return 0; 3845 } 3846 3847 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3848 { 3849 struct amdgpu_ip_block *block; 3850 int i, r = 0; 3851 3852 static enum amd_ip_block_type ip_order[] = { 3853 AMD_IP_BLOCK_TYPE_SMC, 3854 AMD_IP_BLOCK_TYPE_DCE, 3855 AMD_IP_BLOCK_TYPE_GFX, 3856 AMD_IP_BLOCK_TYPE_SDMA, 3857 AMD_IP_BLOCK_TYPE_MES, 3858 AMD_IP_BLOCK_TYPE_UVD, 3859 AMD_IP_BLOCK_TYPE_VCE, 3860 AMD_IP_BLOCK_TYPE_VCN, 3861 AMD_IP_BLOCK_TYPE_JPEG 3862 }; 3863 3864 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3865 block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]); 3866 3867 if (!block) 3868 continue; 3869 3870 if (block->status.valid && !block->status.hw) { 3871 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) { 3872 r = amdgpu_ip_block_resume(block); 3873 } else { 3874 r = block->version->funcs->hw_init(block); 3875 } 3876 3877 if (r) { 3878 dev_err(adev->dev, "RE-INIT-late: %s failed\n", 3879 block->version->funcs->name); 3880 break; 3881 } 3882 
block->status.hw = true; 3883 } 3884 } 3885 3886 return r; 3887 } 3888 3889 /** 3890 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3891 * 3892 * @adev: amdgpu_device pointer 3893 * 3894 * First resume function for hardware IPs. The list of all the hardware 3895 * IPs that make up the asic is walked and the resume callbacks are run for 3896 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3897 * after a suspend and updates the software state as necessary. This 3898 * function is also used for restoring the GPU after a GPU reset. 3899 * Returns 0 on success, negative error code on failure. 3900 */ 3901 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3902 { 3903 int i, r; 3904 3905 for (i = 0; i < adev->num_ip_blocks; i++) { 3906 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3907 continue; 3908 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3909 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3910 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3911 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3912 3913 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3914 if (r) 3915 return r; 3916 } 3917 } 3918 3919 return 0; 3920 } 3921 3922 /** 3923 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3924 * 3925 * @adev: amdgpu_device pointer 3926 * 3927 * Second resume function for hardware IPs. The list of all the hardware 3928 * IPs that make up the asic is walked and the resume callbacks are run for 3929 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3930 * functional state after a suspend and updates the software state as 3931 * necessary. This function is also used for restoring the GPU after a GPU 3932 * reset. 3933 * Returns 0 on success, negative error code on failure. 3934 */ 3935 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3936 { 3937 int i, r; 3938 3939 for (i = 0; i < adev->num_ip_blocks; i++) { 3940 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3941 continue; 3942 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3943 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3944 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3945 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE || 3946 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3947 continue; 3948 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3949 if (r) 3950 return r; 3951 } 3952 3953 return 0; 3954 } 3955 3956 /** 3957 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs 3958 * 3959 * @adev: amdgpu_device pointer 3960 * 3961 * Third resume function for hardware IPs. The list of all the hardware 3962 * IPs that make up the asic is walked and the resume callbacks are run for 3963 * all DCE. resume puts the hardware into a functional state after a suspend 3964 * and updates the software state as necessary. This function is also used 3965 * for restoring the GPU after a GPU reset. 3966 * 3967 * Returns 0 on success, negative error code on failure. 
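 *
 * Note: in the overall resume flow (see amdgpu_device_ip_resume() below),
 * DCE is brought back in this last phase, after firmware loading and after
 * the buffer functions and fence driver hardware have been re-enabled.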
3968 */
3969 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev)
3970 {
3971 int i, r;
3972
3973 for (i = 0; i < adev->num_ip_blocks; i++) {
3974 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3975 continue;
3976 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) {
3977 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
3978 if (r)
3979 return r;
3980 }
3981 }
3982
3983 return 0;
3984 }
3985
3986 /**
3987 * amdgpu_device_ip_resume - run resume for hardware IPs
3988 *
3989 * @adev: amdgpu_device pointer
3990 *
3991 * Main resume function for hardware IPs. The hardware IPs
3992 * are split into multiple resume functions because they are
3993 * also used in recovering from a GPU reset and some additional
3994 * steps need to be taken between them. In this case (S3/S4) they are
3995 * run sequentially.
3996 * Returns 0 on success, negative error code on failure.
3997 */
3998 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
3999 {
4000 int r;
4001
4002 r = amdgpu_device_ip_resume_phase1(adev);
4003 if (r)
4004 return r;
4005
4006 r = amdgpu_device_fw_loading(adev);
4007 if (r)
4008 return r;
4009
4010 r = amdgpu_device_ip_resume_phase2(adev);
4011
4012 if (adev->mman.buffer_funcs_ring->sched.ready)
4013 amdgpu_ttm_set_buffer_funcs_status(adev, true);
4014
4015 if (r)
4016 return r;
4017
4018 amdgpu_fence_driver_hw_init(adev);
4019
4020 r = amdgpu_device_ip_resume_phase3(adev);
4021
4022 return r;
4023 }
4024
4025 /**
4026 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
4027 *
4028 * @adev: amdgpu_device pointer
4029 *
4030 * Query the VBIOS data tables to determine if the board supports SR-IOV.
4031 */
4032 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
4033 {
4034 if (amdgpu_sriov_vf(adev)) {
4035 if (adev->is_atom_fw) {
4036 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
4037 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
4038 } else {
4039 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
4040 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
4041 }
4042
4043 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
4044 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
4045 }
4046 }
4047
4048 /**
4049 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
4050 *
4051 * @asic_type: AMD asic type
4052 *
4053 * Check if there is DC (new modesetting infrastructure) support for an asic.
4054 * Returns true if DC has support, false if not.
4055 */
4056 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
4057 {
4058 switch (asic_type) {
4059 #ifdef CONFIG_DRM_AMDGPU_SI
4060 case CHIP_HAINAN:
4061 #endif
4062 case CHIP_TOPAZ:
4063 /* chips with no display hardware */
4064 return false;
4065 #if defined(CONFIG_DRM_AMD_DC)
4066 case CHIP_TAHITI:
4067 case CHIP_PITCAIRN:
4068 case CHIP_VERDE:
4069 case CHIP_OLAND:
4070 /*
4071 * We have systems in the wild with these ASICs that require
4072 * LVDS and VGA support which is not supported with DC.
4073 *
4074 * Fallback to the non-DC driver here by default so as not to
4075 * cause regressions.
4076 */
4077 #if defined(CONFIG_DRM_AMD_DC_SI)
4078 return amdgpu_dc > 0;
4079 #else
4080 return false;
4081 #endif
4082 case CHIP_BONAIRE:
4083 case CHIP_KAVERI:
4084 case CHIP_KABINI:
4085 case CHIP_MULLINS:
4086 /*
4087 * We have systems in the wild with these ASICs that require
4088 * VGA support which is not supported with DC.
4089 * 4090 * Fallback to the non-DC driver here by default so as not to 4091 * cause regressions. 4092 */ 4093 return amdgpu_dc > 0; 4094 default: 4095 return amdgpu_dc != 0; 4096 #else 4097 default: 4098 if (amdgpu_dc > 0) 4099 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 4100 return false; 4101 #endif 4102 } 4103 } 4104 4105 /** 4106 * amdgpu_device_has_dc_support - check if dc is supported 4107 * 4108 * @adev: amdgpu_device pointer 4109 * 4110 * Returns true for supported, false for not supported 4111 */ 4112 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 4113 { 4114 if (adev->enable_virtual_display || 4115 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 4116 return false; 4117 4118 return amdgpu_device_asic_has_dc_support(adev->asic_type); 4119 } 4120 4121 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 4122 { 4123 struct amdgpu_device *adev = 4124 container_of(__work, struct amdgpu_device, xgmi_reset_work); 4125 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 4126 4127 /* It's a bug to not have a hive within this function */ 4128 if (WARN_ON(!hive)) 4129 return; 4130 4131 /* 4132 * Use task barrier to synchronize all xgmi reset works across the 4133 * hive. task_barrier_enter and task_barrier_exit will block 4134 * until all the threads running the xgmi reset works reach 4135 * those points. task_barrier_full will do both blocks. 4136 */ 4137 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 4138 4139 task_barrier_enter(&hive->tb); 4140 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 4141 4142 if (adev->asic_reset_res) 4143 goto fail; 4144 4145 task_barrier_exit(&hive->tb); 4146 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 4147 4148 if (adev->asic_reset_res) 4149 goto fail; 4150 4151 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 4152 } else { 4153 4154 task_barrier_full(&hive->tb); 4155 adev->asic_reset_res = amdgpu_asic_reset(adev); 4156 } 4157 4158 fail: 4159 if (adev->asic_reset_res) 4160 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 4161 adev->asic_reset_res, adev_to_drm(adev)->unique); 4162 amdgpu_put_xgmi_hive(hive); 4163 } 4164 4165 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 4166 { 4167 char *input = amdgpu_lockup_timeout; 4168 char *timeout_setting = NULL; 4169 int index = 0; 4170 long timeout; 4171 int ret = 0; 4172 4173 /* 4174 * By default timeout for non compute jobs is 10000 4175 * and 60000 for compute jobs. 4176 * In SR-IOV or passthrough mode, timeout for compute 4177 * jobs are 60000 by default. 4178 */ 4179 adev->gfx_timeout = msecs_to_jiffies(10000); 4180 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4181 if (amdgpu_sriov_vf(adev)) 4182 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
4183 msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
4184 else
4185 adev->compute_timeout = msecs_to_jiffies(60000);
4186
4187 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
4188 while ((timeout_setting = strsep(&input, ",")) &&
4189 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
4190 ret = kstrtol(timeout_setting, 0, &timeout);
4191 if (ret)
4192 return ret;
4193
4194 if (timeout == 0) {
4195 index++;
4196 continue;
4197 } else if (timeout < 0) {
4198 timeout = MAX_SCHEDULE_TIMEOUT;
4199 dev_warn(adev->dev, "lockup timeout disabled");
4200 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
4201 } else {
4202 timeout = msecs_to_jiffies(timeout);
4203 }
4204
4205 switch (index++) {
4206 case 0:
4207 adev->gfx_timeout = timeout;
4208 break;
4209 case 1:
4210 adev->compute_timeout = timeout;
4211 break;
4212 case 2:
4213 adev->sdma_timeout = timeout;
4214 break;
4215 case 3:
4216 adev->video_timeout = timeout;
4217 break;
4218 default:
4219 break;
4220 }
4221 }
4222 /*
4223 * Only one value was specified, so it should
4224 * apply to all non-compute jobs.
4225 */
4226 if (index == 1) {
4227 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
4228 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
4229 adev->compute_timeout = adev->gfx_timeout;
4230 }
4231 }
4232
4233 return ret;
4234 }
4235
4236 /**
4237 * amdgpu_device_check_iommu_direct_map - check if RAM is direct mapped to the GPU
4238 *
4239 * @adev: amdgpu_device pointer
4240 *
4241 * RAM is direct mapped to the GPU if the IOMMU is not enabled or is in passthrough mode
4242 */
4243 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
4244 {
4245 struct iommu_domain *domain;
4246
4247 domain = iommu_get_domain_for_dev(adev->dev);
4248 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
4249 adev->ram_is_direct_mapped = true;
4250 }
4251
4252 #if defined(CONFIG_HSA_AMD_P2P)
4253 /**
4254 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled.
4255 *
4256 * @adev: amdgpu_device pointer
4257 *
4258 * Returns true if the IOMMU is remapping BAR addresses, false otherwise.
4259 */
4260 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev)
4261 {
4262 struct iommu_domain *domain;
4263
4264 domain = iommu_get_domain_for_dev(adev->dev);
4265 if (domain && (domain->type == IOMMU_DOMAIN_DMA ||
4266 domain->type == IOMMU_DOMAIN_DMA_FQ))
4267 return true;
4268
4269 return false;
4270 }
4271 #endif
4272
4273 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
4274 {
4275 if (amdgpu_mcbp == 1)
4276 adev->gfx.mcbp = true;
4277 else if (amdgpu_mcbp == 0)
4278 adev->gfx.mcbp = false;
4279
4280 if (amdgpu_sriov_vf(adev))
4281 adev->gfx.mcbp = true;
4282
4283 if (adev->gfx.mcbp)
4284 DRM_INFO("MCBP is enabled\n");
4285 }
4286
4287 /**
4288 * amdgpu_device_init - initialize the driver
4289 *
4290 * @adev: amdgpu_device pointer
4291 * @flags: driver flags
4292 *
4293 * Initializes the driver info and hw (all asics).
4294 * Returns 0 for success or an error on failure.
4295 * Called at driver startup.
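 *
 * Very roughly, the init sequence below is: set up register accessors and
 * locks, run early IP init, post the card via the vBIOS and read clock info,
 * bring up the fence driver and IP blocks, then finish with late init, sysfs
 * registration and VGA/switcheroo handling.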
4296 */ 4297 int amdgpu_device_init(struct amdgpu_device *adev, 4298 uint32_t flags) 4299 { 4300 struct drm_device *ddev = adev_to_drm(adev); 4301 struct pci_dev *pdev = adev->pdev; 4302 int r, i; 4303 bool px = false; 4304 u32 max_MBps; 4305 int tmp; 4306 4307 adev->shutdown = false; 4308 adev->flags = flags; 4309 4310 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 4311 adev->asic_type = amdgpu_force_asic_type; 4312 else 4313 adev->asic_type = flags & AMD_ASIC_MASK; 4314 4315 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 4316 if (amdgpu_emu_mode == 1) 4317 adev->usec_timeout *= 10; 4318 adev->gmc.gart_size = 512 * 1024 * 1024; 4319 adev->accel_working = false; 4320 adev->num_rings = 0; 4321 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 4322 adev->mman.buffer_funcs = NULL; 4323 adev->mman.buffer_funcs_ring = NULL; 4324 adev->vm_manager.vm_pte_funcs = NULL; 4325 adev->vm_manager.vm_pte_num_scheds = 0; 4326 adev->gmc.gmc_funcs = NULL; 4327 adev->harvest_ip_mask = 0x0; 4328 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 4329 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 4330 4331 adev->smc_rreg = &amdgpu_invalid_rreg; 4332 adev->smc_wreg = &amdgpu_invalid_wreg; 4333 adev->pcie_rreg = &amdgpu_invalid_rreg; 4334 adev->pcie_wreg = &amdgpu_invalid_wreg; 4335 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 4336 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 4337 adev->pciep_rreg = &amdgpu_invalid_rreg; 4338 adev->pciep_wreg = &amdgpu_invalid_wreg; 4339 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 4340 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 4341 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 4342 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 4343 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 4344 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 4345 adev->didt_rreg = &amdgpu_invalid_rreg; 4346 adev->didt_wreg = &amdgpu_invalid_wreg; 4347 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 4348 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 4349 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 4350 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 4351 4352 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 4353 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 4354 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 4355 4356 /* mutex initialization are all done here so we 4357 * can recall function without having locking issues 4358 */ 4359 mutex_init(&adev->firmware.mutex); 4360 mutex_init(&adev->pm.mutex); 4361 mutex_init(&adev->gfx.gpu_clock_mutex); 4362 mutex_init(&adev->srbm_mutex); 4363 mutex_init(&adev->gfx.pipe_reserve_mutex); 4364 mutex_init(&adev->gfx.gfx_off_mutex); 4365 mutex_init(&adev->gfx.partition_mutex); 4366 mutex_init(&adev->grbm_idx_mutex); 4367 mutex_init(&adev->mn_lock); 4368 mutex_init(&adev->virt.vf_errors.lock); 4369 hash_init(adev->mn_hash); 4370 mutex_init(&adev->psp.mutex); 4371 mutex_init(&adev->notifier_lock); 4372 mutex_init(&adev->pm.stable_pstate_ctx_lock); 4373 mutex_init(&adev->benchmark_mutex); 4374 mutex_init(&adev->gfx.reset_sem_mutex); 4375 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */ 4376 mutex_init(&adev->enforce_isolation_mutex); 4377 for (i = 0; i < MAX_XCP; ++i) { 4378 adev->isolation[i].spearhead = dma_fence_get_stub(); 4379 amdgpu_sync_create(&adev->isolation[i].active); 4380 amdgpu_sync_create(&adev->isolation[i].prev); 4381 } 4382 
mutex_init(&adev->gfx.userq_sch_mutex);
4383 mutex_init(&adev->gfx.workload_profile_mutex);
4384 mutex_init(&adev->vcn.workload_profile_mutex);
4385 mutex_init(&adev->userq_mutex);
4386
4387 amdgpu_device_init_apu_flags(adev);
4388
4389 r = amdgpu_device_check_arguments(adev);
4390 if (r)
4391 return r;
4392
4393 spin_lock_init(&adev->mmio_idx_lock);
4394 spin_lock_init(&adev->smc_idx_lock);
4395 spin_lock_init(&adev->pcie_idx_lock);
4396 spin_lock_init(&adev->uvd_ctx_idx_lock);
4397 spin_lock_init(&adev->didt_idx_lock);
4398 spin_lock_init(&adev->gc_cac_idx_lock);
4399 spin_lock_init(&adev->se_cac_idx_lock);
4400 spin_lock_init(&adev->audio_endpt_idx_lock);
4401 spin_lock_init(&adev->mm_stats.lock);
4402 spin_lock_init(&adev->virt.rlcg_reg_lock);
4403 spin_lock_init(&adev->wb.lock);
4404
4405 xa_init_flags(&adev->userq_xa, XA_FLAGS_LOCK_IRQ);
4406
4407 INIT_LIST_HEAD(&adev->reset_list);
4408
4409 INIT_LIST_HEAD(&adev->ras_list);
4410
4411 INIT_LIST_HEAD(&adev->pm.od_kobj_list);
4412
4413 INIT_LIST_HEAD(&adev->userq_mgr_list);
4414
4415 INIT_DELAYED_WORK(&adev->delayed_init_work,
4416 amdgpu_device_delayed_init_work_handler);
4417 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
4418 amdgpu_device_delay_enable_gfx_off);
4419 /*
4420 * Initialize the enforce_isolation work structures for each XCP
4421 * partition. This work handler is responsible for enforcing shader
4422 * isolation on AMD GPUs. It counts the number of emitted fences for
4423 * each GFX and compute ring. If there are any fences, it schedules
4424 * the `enforce_isolation_work` to be run after a delay. If there are
4425 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the
4426 * runqueue.
4427 */
4428 for (i = 0; i < MAX_XCP; i++) {
4429 INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work,
4430 amdgpu_gfx_enforce_isolation_handler);
4431 adev->gfx.enforce_isolation[i].adev = adev;
4432 adev->gfx.enforce_isolation[i].xcp_id = i;
4433 }
4434
4435 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
4436
4437 adev->gfx.gfx_off_req_count = 1;
4438 adev->gfx.gfx_off_residency = 0;
4439 adev->gfx.gfx_off_entrycount = 0;
4440 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
4441
4442 atomic_set(&adev->throttling_logging_enabled, 1);
4443 /*
4444 * If throttling continues, logging will be performed every minute
4445 * to avoid log flooding. "-1" is subtracted since the thermal
4446 * throttling interrupt comes every second. Thus, the total logging
4447 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
4448 * for throttling interrupt) = 60 seconds.
4449 */
4450 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
4451
4452 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
4453
4454 /* Registers mapping */
4455 /* TODO: block userspace mapping of io register */
4456 if (adev->asic_type >= CHIP_BONAIRE) {
4457 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
4458 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
4459 } else {
4460 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
4461 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
4462 }
4463
4464 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
4465 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
4466
4467 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
4468 if (!adev->rmmio)
4469 return -ENOMEM;
4470
4471 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
4472 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);
4473
4474 /*
4475 * The reset domain needs to be present early, before the XGMI hive is
4476 * discovered (if any) and initialized, so the reset sem and in_gpu reset
4477 * flag can be used early during init and before any RREG32 call.
4478 */
4479 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
4480 if (!adev->reset_domain)
4481 return -ENOMEM;
4482
4483 /* detect hw virtualization here */
4484 amdgpu_virt_init(adev);
4485
4486 amdgpu_device_get_pcie_info(adev);
4487
4488 r = amdgpu_device_get_job_timeout_settings(adev);
4489 if (r) {
4490 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
4491 return r;
4492 }
4493
4494 amdgpu_device_set_mcbp(adev);
4495
4496 /*
4497 * By default, use default mode where all blocks are expected to be
4498 * initialized. At present a 'swinit' of blocks is required to be
4499 * completed before the need for a different level is detected.
4500 */
4501 amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT);
4502 /* early init functions */
4503 r = amdgpu_device_ip_early_init(adev);
4504 if (r)
4505 return r;
4506
4507 /*
4508 * No need to remove conflicting FBs for non-display class devices.
4509 * This prevents the sysfb from being freed accidentally.
4510 */
4511 if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA ||
4512 (pdev->class >> 8) == PCI_CLASS_DISPLAY_OTHER) {
4513 /* Get rid of things like offb */
4514 r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name);
4515 if (r)
4516 return r;
4517 }
4518
4519 /* Enable TMZ based on IP_VERSION */
4520 amdgpu_gmc_tmz_set(adev);
4521
4522 if (amdgpu_sriov_vf(adev) &&
4523 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
4524 /* VF MMIO access (except mailbox range) from CPU
4525 * will be blocked during sriov runtime
4526 */
4527 adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;
4528
4529 amdgpu_gmc_noretry_set(adev);
4530 /* Need to get xgmi info early to decide the reset behavior */
4531 if (adev->gmc.xgmi.supported) {
4532 r = adev->gfxhub.funcs->get_xgmi_info(adev);
4533 if (r)
4534 return r;
4535 }
4536
4537 /* enable PCIE atomic ops */
4538 if (amdgpu_sriov_vf(adev)) {
4539 if (adev->virt.fw_reserve.p_pf2vf)
4540 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
4541 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
4542 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
4543 /* APUs with gfx9 onwards don't rely on PCIe atomics; the internal path
4544 * natively supports atomics, so set have_atomics_support to true.
4545 */ 4546 } else if ((adev->flags & AMD_IS_APU) && 4547 (amdgpu_ip_version(adev, GC_HWIP, 0) > 4548 IP_VERSION(9, 0, 0))) { 4549 adev->have_atomics_support = true; 4550 } else { 4551 adev->have_atomics_support = 4552 !pci_enable_atomic_ops_to_root(adev->pdev, 4553 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 4554 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4555 } 4556 4557 if (!adev->have_atomics_support) 4558 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 4559 4560 /* doorbell bar mapping and doorbell index init*/ 4561 amdgpu_doorbell_init(adev); 4562 4563 if (amdgpu_emu_mode == 1) { 4564 /* post the asic on emulation mode */ 4565 emu_soc_asic_init(adev); 4566 goto fence_driver_init; 4567 } 4568 4569 amdgpu_reset_init(adev); 4570 4571 /* detect if we are with an SRIOV vbios */ 4572 if (adev->bios) 4573 amdgpu_device_detect_sriov_bios(adev); 4574 4575 /* check if we need to reset the asic 4576 * E.g., driver was not cleanly unloaded previously, etc. 4577 */ 4578 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4579 if (adev->gmc.xgmi.num_physical_nodes) { 4580 dev_info(adev->dev, "Pending hive reset.\n"); 4581 amdgpu_set_init_level(adev, 4582 AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 4583 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && 4584 !amdgpu_device_has_display_hardware(adev)) { 4585 r = psp_gpu_reset(adev); 4586 } else { 4587 tmp = amdgpu_reset_method; 4588 /* It should do a default reset when loading or reloading the driver, 4589 * regardless of the module parameter reset_method. 4590 */ 4591 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4592 r = amdgpu_asic_reset(adev); 4593 amdgpu_reset_method = tmp; 4594 } 4595 4596 if (r) { 4597 dev_err(adev->dev, "asic reset on init failed\n"); 4598 goto failed; 4599 } 4600 } 4601 4602 /* Post card if necessary */ 4603 if (amdgpu_device_need_post(adev)) { 4604 if (!adev->bios) { 4605 dev_err(adev->dev, "no vBIOS found\n"); 4606 r = -EINVAL; 4607 goto failed; 4608 } 4609 DRM_INFO("GPU posting now...\n"); 4610 r = amdgpu_device_asic_init(adev); 4611 if (r) { 4612 dev_err(adev->dev, "gpu post error!\n"); 4613 goto failed; 4614 } 4615 } 4616 4617 if (adev->bios) { 4618 if (adev->is_atom_fw) { 4619 /* Initialize clocks */ 4620 r = amdgpu_atomfirmware_get_clock_info(adev); 4621 if (r) { 4622 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4623 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4624 goto failed; 4625 } 4626 } else { 4627 /* Initialize clocks */ 4628 r = amdgpu_atombios_get_clock_info(adev); 4629 if (r) { 4630 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4631 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4632 goto failed; 4633 } 4634 /* init i2c buses */ 4635 amdgpu_i2c_init(adev); 4636 } 4637 } 4638 4639 fence_driver_init: 4640 /* Fence driver */ 4641 r = amdgpu_fence_driver_sw_init(adev); 4642 if (r) { 4643 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4644 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4645 goto failed; 4646 } 4647 4648 /* init the mode config */ 4649 drm_mode_config_init(adev_to_drm(adev)); 4650 4651 r = amdgpu_device_ip_init(adev); 4652 if (r) { 4653 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4654 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4655 goto release_ras_con; 4656 } 4657 4658 amdgpu_fence_driver_hw_init(adev); 4659 4660 dev_info(adev->dev, 4661 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4662 
adev->gfx.config.max_shader_engines,
4663 adev->gfx.config.max_sh_per_se,
4664 adev->gfx.config.max_cu_per_sh,
4665 adev->gfx.cu_info.number);
4666
4667 adev->accel_working = true;
4668
4669 amdgpu_vm_check_compute_bug(adev);
4670
4671 /* Initialize the buffer migration limit. */
4672 if (amdgpu_moverate >= 0)
4673 max_MBps = amdgpu_moverate;
4674 else
4675 max_MBps = 8; /* Allow 8 MB/s. */
4676 /* Get a log2 for easy divisions. */
4677 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
4678
4679 /*
4680 * Register the gpu instance before amdgpu_device_enable_mgpu_fan_boost.
4681 * Otherwise the mgpu fan boost feature will be skipped because the
4682 * gpu instance count would be too low.
4683 */
4684 amdgpu_register_gpu_instance(adev);
4685
4686 /* enable clockgating, etc. after ib tests, etc. since some blocks require
4687 * explicit gating rather than handling it automatically.
4688 */
4689 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
4690 r = amdgpu_device_ip_late_init(adev);
4691 if (r) {
4692 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
4693 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
4694 goto release_ras_con;
4695 }
4696 /* must succeed. */
4697 amdgpu_ras_resume(adev);
4698 queue_delayed_work(system_wq, &adev->delayed_init_work,
4699 msecs_to_jiffies(AMDGPU_RESUME_MS));
4700 }
4701
4702 if (amdgpu_sriov_vf(adev)) {
4703 amdgpu_virt_release_full_gpu(adev, true);
4704 flush_delayed_work(&adev->delayed_init_work);
4705 }
4706
4707 /*
4708 * Register these sysfs interfaces after `late_init`, as some of the
4709 * operations performed in `late_init` might affect their
4710 * creation.
4711 */
4712 r = amdgpu_atombios_sysfs_init(adev);
4713 if (r)
4714 drm_err(&adev->ddev,
4715 "registering atombios sysfs failed (%d).\n", r);
4716
4717 r = amdgpu_pm_sysfs_init(adev);
4718 if (r)
4719 DRM_ERROR("registering pm sysfs failed (%d).\n", r);
4720
4721 r = amdgpu_ucode_sysfs_init(adev);
4722 if (r) {
4723 adev->ucode_sysfs_en = false;
4724 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
4725 } else
4726 adev->ucode_sysfs_en = true;
4727
4728 r = amdgpu_device_attr_sysfs_init(adev);
4729 if (r)
4730 dev_err(adev->dev, "Could not create amdgpu device attr\n");
4731
4732 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group);
4733 if (r)
4734 dev_err(adev->dev,
4735 "Could not create amdgpu board attributes\n");
4736
4737 amdgpu_fru_sysfs_init(adev);
4738 amdgpu_reg_state_sysfs_init(adev);
4739 amdgpu_xcp_sysfs_init(adev);
4740
4741 if (IS_ENABLED(CONFIG_PERF_EVENTS))
4742 r = amdgpu_pmu_init(adev);
4743 if (r)
4744 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
4745
4746 /* Keep the stored PCI config space at hand for restore after a sudden PCI error */
4747 if (amdgpu_device_cache_pci_state(adev->pdev))
4748 pci_restore_state(pdev);
4749
4750 /* if we have more than one VGA card, then disable the amdgpu VGA resources */
4751 /* this will fail for cards that aren't VGA class devices, just
4752 * ignore it
4753 */
4754 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4755 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
4756
4757 px = amdgpu_device_supports_px(ddev);
4758
4759 if (px || (!dev_is_removable(&adev->pdev->dev) &&
4760 apple_gmux_detect(NULL, NULL)))
4761 vga_switcheroo_register_client(adev->pdev,
4762 &amdgpu_switcheroo_ops, px);
4763
4764 if (px)
4765 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
4766
4767 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
4768
amdgpu_xgmi_reset_on_init(adev); 4769 4770 amdgpu_device_check_iommu_direct_map(adev); 4771 4772 adev->pm_nb.notifier_call = amdgpu_device_pm_notifier; 4773 r = register_pm_notifier(&adev->pm_nb); 4774 if (r) 4775 goto failed; 4776 4777 return 0; 4778 4779 release_ras_con: 4780 if (amdgpu_sriov_vf(adev)) 4781 amdgpu_virt_release_full_gpu(adev, true); 4782 4783 /* failed in exclusive mode due to timeout */ 4784 if (amdgpu_sriov_vf(adev) && 4785 !amdgpu_sriov_runtime(adev) && 4786 amdgpu_virt_mmio_blocked(adev) && 4787 !amdgpu_virt_wait_reset(adev)) { 4788 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4789 /* Don't send request since VF is inactive. */ 4790 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4791 adev->virt.ops = NULL; 4792 r = -EAGAIN; 4793 } 4794 amdgpu_release_ras_context(adev); 4795 4796 failed: 4797 amdgpu_vf_error_trans_all(adev); 4798 4799 return r; 4800 } 4801 4802 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4803 { 4804 4805 /* Clear all CPU mappings pointing to this device */ 4806 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4807 4808 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4809 amdgpu_doorbell_fini(adev); 4810 4811 iounmap(adev->rmmio); 4812 adev->rmmio = NULL; 4813 if (adev->mman.aper_base_kaddr) 4814 iounmap(adev->mman.aper_base_kaddr); 4815 adev->mman.aper_base_kaddr = NULL; 4816 4817 /* Memory manager related */ 4818 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4819 arch_phys_wc_del(adev->gmc.vram_mtrr); 4820 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4821 } 4822 } 4823 4824 /** 4825 * amdgpu_device_fini_hw - tear down the driver 4826 * 4827 * @adev: amdgpu_device pointer 4828 * 4829 * Tear down the driver info (all asics). 4830 * Called at driver shutdown. 
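 *
 * This handles only the hardware side of teardown (interrupts, display,
 * fences, MMIO unmap on unplug); the software state is released later in
 * amdgpu_device_fini_sw().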
4831 */ 4832 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4833 { 4834 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4835 flush_delayed_work(&adev->delayed_init_work); 4836 4837 if (adev->mman.initialized) 4838 drain_workqueue(adev->mman.bdev.wq); 4839 adev->shutdown = true; 4840 4841 unregister_pm_notifier(&adev->pm_nb); 4842 4843 /* make sure IB test finished before entering exclusive mode 4844 * to avoid preemption on IB test 4845 */ 4846 if (amdgpu_sriov_vf(adev)) { 4847 amdgpu_virt_request_full_gpu(adev, false); 4848 amdgpu_virt_fini_data_exchange(adev); 4849 } 4850 4851 /* disable all interrupts */ 4852 amdgpu_irq_disable_all(adev); 4853 if (adev->mode_info.mode_config_initialized) { 4854 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4855 drm_helper_force_disable_all(adev_to_drm(adev)); 4856 else 4857 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4858 } 4859 amdgpu_fence_driver_hw_fini(adev); 4860 4861 if (adev->pm.sysfs_initialized) 4862 amdgpu_pm_sysfs_fini(adev); 4863 if (adev->ucode_sysfs_en) 4864 amdgpu_ucode_sysfs_fini(adev); 4865 amdgpu_device_attr_sysfs_fini(adev); 4866 amdgpu_fru_sysfs_fini(adev); 4867 4868 amdgpu_reg_state_sysfs_fini(adev); 4869 amdgpu_xcp_sysfs_fini(adev); 4870 4871 /* disable ras feature must before hw fini */ 4872 amdgpu_ras_pre_fini(adev); 4873 4874 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4875 4876 amdgpu_device_ip_fini_early(adev); 4877 4878 amdgpu_irq_fini_hw(adev); 4879 4880 if (adev->mman.initialized) 4881 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4882 4883 amdgpu_gart_dummy_page_fini(adev); 4884 4885 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4886 amdgpu_device_unmap_mmio(adev); 4887 4888 } 4889 4890 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4891 { 4892 int i, idx; 4893 bool px; 4894 4895 amdgpu_device_ip_fini(adev); 4896 amdgpu_fence_driver_sw_fini(adev); 4897 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4898 adev->accel_working = false; 4899 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4900 for (i = 0; i < MAX_XCP; ++i) { 4901 dma_fence_put(adev->isolation[i].spearhead); 4902 amdgpu_sync_free(&adev->isolation[i].active); 4903 amdgpu_sync_free(&adev->isolation[i].prev); 4904 } 4905 4906 amdgpu_reset_fini(adev); 4907 4908 /* free i2c buses */ 4909 amdgpu_i2c_fini(adev); 4910 4911 if (adev->bios) { 4912 if (amdgpu_emu_mode != 1) 4913 amdgpu_atombios_fini(adev); 4914 amdgpu_bios_release(adev); 4915 } 4916 4917 kfree(adev->fru_info); 4918 adev->fru_info = NULL; 4919 4920 kfree(adev->xcp_mgr); 4921 adev->xcp_mgr = NULL; 4922 4923 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4924 4925 if (px || (!dev_is_removable(&adev->pdev->dev) && 4926 apple_gmux_detect(NULL, NULL))) 4927 vga_switcheroo_unregister_client(adev->pdev); 4928 4929 if (px) 4930 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4931 4932 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4933 vga_client_unregister(adev->pdev); 4934 4935 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4936 4937 iounmap(adev->rmmio); 4938 adev->rmmio = NULL; 4939 drm_dev_exit(idx); 4940 } 4941 4942 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4943 amdgpu_pmu_fini(adev); 4944 if (adev->mman.discovery_bin) 4945 amdgpu_discovery_fini(adev); 4946 4947 amdgpu_reset_put_reset_domain(adev->reset_domain); 4948 adev->reset_domain = NULL; 4949 4950 kfree(adev->pci_state); 4951 4952 } 4953 4954 /** 4955 * amdgpu_device_evict_resources - evict device resources 4956 * @adev: amdgpu device object 4957 * 4958 * Evicts all ttm device resources(vram 
BOs, gart table) from the lru list 4959 * of the vram memory type. Mainly used for evicting device resources 4960 * at suspend time. 4961 * 4962 */ 4963 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4964 { 4965 int ret; 4966 4967 /* No need to evict vram on APUs unless going to S4 */ 4968 if (!adev->in_s4 && (adev->flags & AMD_IS_APU)) 4969 return 0; 4970 4971 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4972 if (ret) 4973 DRM_WARN("evicting device resources failed\n"); 4974 return ret; 4975 } 4976 4977 /* 4978 * Suspend & resume. 4979 */ 4980 /** 4981 * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events 4982 * @nb: notifier block 4983 * @mode: suspend mode 4984 * @data: data 4985 * 4986 * This function is called when the system is about to suspend or hibernate. 4987 * It is used to set the appropriate flags so that eviction can be optimized 4988 * in the pm prepare callback. 4989 */ 4990 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 4991 void *data) 4992 { 4993 struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb); 4994 4995 switch (mode) { 4996 case PM_HIBERNATION_PREPARE: 4997 adev->in_s4 = true; 4998 break; 4999 case PM_POST_HIBERNATION: 5000 adev->in_s4 = false; 5001 break; 5002 } 5003 5004 return NOTIFY_DONE; 5005 } 5006 5007 /** 5008 * amdgpu_device_prepare - prepare for device suspend 5009 * 5010 * @dev: drm dev pointer 5011 * 5012 * Prepare to put the hw in the suspend state (all asics). 5013 * Returns 0 for success or an error on failure. 5014 * Called at driver suspend. 5015 */ 5016 int amdgpu_device_prepare(struct drm_device *dev) 5017 { 5018 struct amdgpu_device *adev = drm_to_adev(dev); 5019 int i, r; 5020 5021 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5022 return 0; 5023 5024 /* Evict the majority of BOs before starting suspend sequence */ 5025 r = amdgpu_device_evict_resources(adev); 5026 if (r) 5027 return r; 5028 5029 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 5030 5031 for (i = 0; i < adev->num_ip_blocks; i++) { 5032 if (!adev->ip_blocks[i].status.valid) 5033 continue; 5034 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 5035 continue; 5036 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]); 5037 if (r) 5038 return r; 5039 } 5040 5041 return 0; 5042 } 5043 5044 /** 5045 * amdgpu_device_complete - complete power state transition 5046 * 5047 * @dev: drm dev pointer 5048 * 5049 * Undo the changes from amdgpu_device_prepare. This will be 5050 * called on all resume transitions, including those that failed. 5051 */ 5052 void amdgpu_device_complete(struct drm_device *dev) 5053 { 5054 struct amdgpu_device *adev = drm_to_adev(dev); 5055 int i; 5056 5057 for (i = 0; i < adev->num_ip_blocks; i++) { 5058 if (!adev->ip_blocks[i].status.valid) 5059 continue; 5060 if (!adev->ip_blocks[i].version->funcs->complete) 5061 continue; 5062 adev->ip_blocks[i].version->funcs->complete(&adev->ip_blocks[i]); 5063 } 5064 } 5065 5066 /** 5067 * amdgpu_device_suspend - initiate device suspend 5068 * 5069 * @dev: drm dev pointer 5070 * @notify_clients: notify in-kernel DRM clients 5071 * 5072 * Puts the hw in the suspend state (all asics). 5073 * Returns 0 for success or an error on failure. 5074 * Called at driver suspend. 
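 *
 * Roughly: display IPs are suspended first (phase 1), buffers are evicted,
 * then the remaining IPs are suspended (phase 2); under SR-IOV, full GPU
 * access is requested and released around the sequence.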
5075 */ 5076 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) 5077 { 5078 struct amdgpu_device *adev = drm_to_adev(dev); 5079 int r = 0; 5080 5081 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5082 return 0; 5083 5084 adev->in_suspend = true; 5085 5086 if (amdgpu_sriov_vf(adev)) { 5087 if (!adev->in_s0ix && !adev->in_runpm) 5088 amdgpu_amdkfd_suspend_process(adev); 5089 amdgpu_virt_fini_data_exchange(adev); 5090 r = amdgpu_virt_request_full_gpu(adev, false); 5091 if (r) 5092 return r; 5093 } 5094 5095 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 5096 DRM_WARN("smart shift update failed\n"); 5097 5098 if (notify_clients) 5099 drm_client_dev_suspend(adev_to_drm(adev), false); 5100 5101 cancel_delayed_work_sync(&adev->delayed_init_work); 5102 5103 amdgpu_ras_suspend(adev); 5104 5105 amdgpu_device_ip_suspend_phase1(adev); 5106 5107 if (!adev->in_s0ix) { 5108 amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); 5109 amdgpu_userq_suspend(adev); 5110 } 5111 5112 r = amdgpu_device_evict_resources(adev); 5113 if (r) 5114 return r; 5115 5116 amdgpu_ttm_set_buffer_funcs_status(adev, false); 5117 5118 amdgpu_fence_driver_hw_fini(adev); 5119 5120 amdgpu_device_ip_suspend_phase2(adev); 5121 5122 if (amdgpu_sriov_vf(adev)) 5123 amdgpu_virt_release_full_gpu(adev, false); 5124 5125 r = amdgpu_dpm_notify_rlc_state(adev, false); 5126 if (r) 5127 return r; 5128 5129 return 0; 5130 } 5131 5132 static inline int amdgpu_virt_resume(struct amdgpu_device *adev) 5133 { 5134 int r; 5135 unsigned int prev_physical_node_id = adev->gmc.xgmi.physical_node_id; 5136 5137 /* During VM resume, QEMU programming of VF MSIX table (register GFXMSIX_VECT0_ADDR_LO) 5138 * may not work. The access could be blocked by nBIF protection as VF isn't in 5139 * exclusive access mode. Exclusive access is enabled now, disable/enable MSIX 5140 * so that QEMU reprograms MSIX table. 5141 */ 5142 amdgpu_restore_msix(adev); 5143 5144 r = adev->gfxhub.funcs->get_xgmi_info(adev); 5145 if (r) 5146 return r; 5147 5148 dev_info(adev->dev, "xgmi node, old id %d, new id %d\n", 5149 prev_physical_node_id, adev->gmc.xgmi.physical_node_id); 5150 5151 adev->vm_manager.vram_base_offset = adev->gfxhub.funcs->get_mc_fb_offset(adev); 5152 adev->vm_manager.vram_base_offset += 5153 adev->gmc.xgmi.physical_node_id * adev->gmc.xgmi.node_segment_size; 5154 5155 return 0; 5156 } 5157 5158 /** 5159 * amdgpu_device_resume - initiate device resume 5160 * 5161 * @dev: drm dev pointer 5162 * @notify_clients: notify in-kernel DRM clients 5163 * 5164 * Bring the hw back to operating state (all asics). 5165 * Returns 0 for success or an error on failure. 5166 * Called at driver resume. 
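 *
 * Roughly: the card is re-posted if needed, amdgpu_device_ip_resume() brings
 * the IP blocks back, KFD and user queues are resumed, and late init plus the
 * delayed init work (IB tests) finish the sequence.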
5167 */ 5168 int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) 5169 { 5170 struct amdgpu_device *adev = drm_to_adev(dev); 5171 int r = 0; 5172 5173 if (amdgpu_sriov_vf(adev)) { 5174 r = amdgpu_virt_request_full_gpu(adev, true); 5175 if (r) 5176 return r; 5177 } 5178 5179 if (amdgpu_virt_xgmi_migrate_enabled(adev)) { 5180 r = amdgpu_virt_resume(adev); 5181 if (r) 5182 goto exit; 5183 } 5184 5185 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5186 return 0; 5187 5188 if (adev->in_s0ix) 5189 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 5190 5191 /* post card */ 5192 if (amdgpu_device_need_post(adev)) { 5193 r = amdgpu_device_asic_init(adev); 5194 if (r) 5195 dev_err(adev->dev, "amdgpu asic init failed\n"); 5196 } 5197 5198 r = amdgpu_device_ip_resume(adev); 5199 5200 if (r) { 5201 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 5202 goto exit; 5203 } 5204 5205 if (!adev->in_s0ix) { 5206 r = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); 5207 if (r) 5208 goto exit; 5209 5210 r = amdgpu_userq_resume(adev); 5211 if (r) 5212 goto exit; 5213 } 5214 5215 r = amdgpu_device_ip_late_init(adev); 5216 if (r) 5217 goto exit; 5218 5219 queue_delayed_work(system_wq, &adev->delayed_init_work, 5220 msecs_to_jiffies(AMDGPU_RESUME_MS)); 5221 exit: 5222 if (amdgpu_sriov_vf(adev)) { 5223 amdgpu_virt_init_data_exchange(adev); 5224 amdgpu_virt_release_full_gpu(adev, true); 5225 5226 if (!adev->in_s0ix && !r && !adev->in_runpm) 5227 r = amdgpu_amdkfd_resume_process(adev); 5228 } 5229 5230 if (r) 5231 return r; 5232 5233 /* Make sure IB tests flushed */ 5234 flush_delayed_work(&adev->delayed_init_work); 5235 5236 if (notify_clients) 5237 drm_client_dev_resume(adev_to_drm(adev), false); 5238 5239 amdgpu_ras_resume(adev); 5240 5241 if (adev->mode_info.num_crtc) { 5242 /* 5243 * Most of the connector probing functions try to acquire runtime pm 5244 * refs to ensure that the GPU is powered on when connector polling is 5245 * performed. Since we're calling this from a runtime PM callback, 5246 * trying to acquire rpm refs will cause us to deadlock. 5247 * 5248 * Since we're guaranteed to be holding the rpm lock, it's safe to 5249 * temporarily disable the rpm helpers so this doesn't deadlock us. 5250 */ 5251 #ifdef CONFIG_PM 5252 dev->dev->power.disable_depth++; 5253 #endif 5254 if (!adev->dc_enabled) 5255 drm_helper_hpd_irq_event(dev); 5256 else 5257 drm_kms_helper_hotplug_event(dev); 5258 #ifdef CONFIG_PM 5259 dev->dev->power.disable_depth--; 5260 #endif 5261 } 5262 adev->in_suspend = false; 5263 5264 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 5265 DRM_WARN("smart shift update failed\n"); 5266 5267 return 0; 5268 } 5269 5270 /** 5271 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 5272 * 5273 * @adev: amdgpu_device pointer 5274 * 5275 * The list of all the hardware IPs that make up the asic is walked and 5276 * the check_soft_reset callbacks are run. check_soft_reset determines 5277 * if the asic is still hung or not. 5278 * Returns true if any of the IPs are still in a hung state, false if not. 
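 *
 * Note that SR-IOV VFs and ASICs which report amdgpu_asic_need_full_reset()
 * are treated as hung unconditionally.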
5279 */ 5280 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 5281 { 5282 int i; 5283 bool asic_hang = false; 5284 5285 if (amdgpu_sriov_vf(adev)) 5286 return true; 5287 5288 if (amdgpu_asic_need_full_reset(adev)) 5289 return true; 5290 5291 for (i = 0; i < adev->num_ip_blocks; i++) { 5292 if (!adev->ip_blocks[i].status.valid) 5293 continue; 5294 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 5295 adev->ip_blocks[i].status.hang = 5296 adev->ip_blocks[i].version->funcs->check_soft_reset( 5297 &adev->ip_blocks[i]); 5298 if (adev->ip_blocks[i].status.hang) { 5299 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 5300 asic_hang = true; 5301 } 5302 } 5303 return asic_hang; 5304 } 5305 5306 /** 5307 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 5308 * 5309 * @adev: amdgpu_device pointer 5310 * 5311 * The list of all the hardware IPs that make up the asic is walked and the 5312 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 5313 * handles any IP specific hardware or software state changes that are 5314 * necessary for a soft reset to succeed. 5315 * Returns 0 on success, negative error code on failure. 5316 */ 5317 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 5318 { 5319 int i, r = 0; 5320 5321 for (i = 0; i < adev->num_ip_blocks; i++) { 5322 if (!adev->ip_blocks[i].status.valid) 5323 continue; 5324 if (adev->ip_blocks[i].status.hang && 5325 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 5326 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]); 5327 if (r) 5328 return r; 5329 } 5330 } 5331 5332 return 0; 5333 } 5334 5335 /** 5336 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 5337 * 5338 * @adev: amdgpu_device pointer 5339 * 5340 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 5341 * reset is necessary to recover. 5342 * Returns true if a full asic reset is required, false if not. 5343 */ 5344 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 5345 { 5346 int i; 5347 5348 if (amdgpu_asic_need_full_reset(adev)) 5349 return true; 5350 5351 for (i = 0; i < adev->num_ip_blocks; i++) { 5352 if (!adev->ip_blocks[i].status.valid) 5353 continue; 5354 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 5355 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 5356 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 5357 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 5358 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 5359 if (adev->ip_blocks[i].status.hang) { 5360 dev_info(adev->dev, "Some block need full reset!\n"); 5361 return true; 5362 } 5363 } 5364 } 5365 return false; 5366 } 5367 5368 /** 5369 * amdgpu_device_ip_soft_reset - do a soft reset 5370 * 5371 * @adev: amdgpu_device pointer 5372 * 5373 * The list of all the hardware IPs that make up the asic is walked and the 5374 * soft_reset callbacks are run if the block is hung. soft_reset handles any 5375 * IP specific hardware or software state changes that are necessary to soft 5376 * reset the IP. 5377 * Returns 0 on success, negative error code on failure. 
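 *
 * Only blocks that were flagged as hung by
 * amdgpu_device_ip_check_soft_reset() have their soft_reset callback invoked.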
5378 */
5379 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
5380 {
5381 int i, r = 0;
5382
5383 for (i = 0; i < adev->num_ip_blocks; i++) {
5384 if (!adev->ip_blocks[i].status.valid)
5385 continue;
5386 if (adev->ip_blocks[i].status.hang &&
5387 adev->ip_blocks[i].version->funcs->soft_reset) {
5388 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]);
5389 if (r)
5390 return r;
5391 }
5392 }
5393
5394 return 0;
5395 }
5396
5397 /**
5398 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
5399 *
5400 * @adev: amdgpu_device pointer
5401 *
5402 * The list of all the hardware IPs that make up the asic is walked and the
5403 * post_soft_reset callbacks are run if the block was hung. post_soft_reset
5404 * handles any IP specific hardware or software state changes that are
5405 * necessary after the IP has been soft reset.
5406 * Returns 0 on success, negative error code on failure.
5407 */
5408 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
5409 {
5410 int i, r = 0;
5411
5412 for (i = 0; i < adev->num_ip_blocks; i++) {
5413 if (!adev->ip_blocks[i].status.valid)
5414 continue;
5415 if (adev->ip_blocks[i].status.hang &&
5416 adev->ip_blocks[i].version->funcs->post_soft_reset)
5417 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]);
5418 if (r)
5419 return r;
5420 }
5421
5422 return 0;
5423 }
5424
5425 /**
5426 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
5427 *
5428 * @adev: amdgpu_device pointer
5429 * @reset_context: amdgpu reset context pointer
5430 *
5431 * Do a VF FLR and reinitialize the ASIC.
5432 * Returns 0 on success, negative error code on failure.
5433 */
5434 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
5435 struct amdgpu_reset_context *reset_context)
5436 {
5437 int r;
5438 struct amdgpu_hive_info *hive = NULL;
5439
5440 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) {
5441 if (!amdgpu_ras_get_fed_status(adev))
5442 amdgpu_virt_ready_to_reset(adev);
5443 amdgpu_virt_wait_reset(adev);
5444 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags);
5445 r = amdgpu_virt_request_full_gpu(adev, true);
5446 } else {
5447 r = amdgpu_virt_reset_gpu(adev);
5448 }
5449 if (r)
5450 return r;
5451
5452 amdgpu_ras_clear_err_state(adev);
5453 amdgpu_irq_gpu_reset_resume_helper(adev);
5454
5455 /* some SW cleanup the VF needs to do before recovery */
5456 amdgpu_virt_post_reset(adev);
5457
5458 /* Resume IP prior to SMC */
5459 r = amdgpu_device_ip_reinit_early_sriov(adev);
5460 if (r)
5461 return r;
5462
5463 amdgpu_virt_init_data_exchange(adev);
5464
5465 r = amdgpu_device_fw_loading(adev);
5466 if (r)
5467 return r;
5468
5469 /* now we are okay to resume SMC/CP/SDMA */
5470 r = amdgpu_device_ip_reinit_late_sriov(adev);
5471 if (r)
5472 return r;
5473
5474 hive = amdgpu_get_xgmi_hive(adev);
5475 /* Update PSP FW topology after reset */
5476 if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
5477 r = amdgpu_xgmi_update_topology(hive, adev);
5478 if (hive)
5479 amdgpu_put_xgmi_hive(hive);
5480 if (r)
5481 return r;
5482
5483 r = amdgpu_ib_ring_tests(adev);
5484 if (r)
5485 return r;
5486
5487 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST)
5488 amdgpu_inc_vram_lost(adev);
5489
5490 /* needs to be called during full access, so we can't do it later like
5491 * bare metal does.
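 * Full GPU access is released right below via amdgpu_virt_release_full_gpu().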
5492 */ 5493 amdgpu_amdkfd_post_reset(adev); 5494 amdgpu_virt_release_full_gpu(adev, true); 5495 5496 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5497 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) || 5498 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 5499 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 5500 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) || 5501 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5502 amdgpu_ras_resume(adev); 5503 5504 amdgpu_virt_ras_telemetry_post_reset(adev); 5505 5506 return 0; 5507 } 5508 5509 /** 5510 * amdgpu_device_has_job_running - check if there is any unfinished job 5511 * 5512 * @adev: amdgpu_device pointer 5513 * 5514 * check if there is any job running on the device when guest driver receives 5515 * FLR notification from host driver. If there are still jobs running, then 5516 * the guest driver will not respond the FLR reset. Instead, let the job hit 5517 * the timeout and guest driver then issue the reset request. 5518 */ 5519 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5520 { 5521 int i; 5522 5523 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5524 struct amdgpu_ring *ring = adev->rings[i]; 5525 5526 if (!amdgpu_ring_sched_ready(ring)) 5527 continue; 5528 5529 if (amdgpu_fence_count_emitted(ring)) 5530 return true; 5531 } 5532 return false; 5533 } 5534 5535 /** 5536 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5537 * 5538 * @adev: amdgpu_device pointer 5539 * 5540 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5541 * a hung GPU. 5542 */ 5543 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5544 { 5545 5546 if (amdgpu_gpu_recovery == 0) 5547 goto disabled; 5548 5549 /* Skip soft reset check in fatal error mode */ 5550 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5551 return true; 5552 5553 if (amdgpu_sriov_vf(adev)) 5554 return true; 5555 5556 if (amdgpu_gpu_recovery == -1) { 5557 switch (adev->asic_type) { 5558 #ifdef CONFIG_DRM_AMDGPU_SI 5559 case CHIP_VERDE: 5560 case CHIP_TAHITI: 5561 case CHIP_PITCAIRN: 5562 case CHIP_OLAND: 5563 case CHIP_HAINAN: 5564 #endif 5565 #ifdef CONFIG_DRM_AMDGPU_CIK 5566 case CHIP_KAVERI: 5567 case CHIP_KABINI: 5568 case CHIP_MULLINS: 5569 #endif 5570 case CHIP_CARRIZO: 5571 case CHIP_STONEY: 5572 case CHIP_CYAN_SKILLFISH: 5573 goto disabled; 5574 default: 5575 break; 5576 } 5577 } 5578 5579 return true; 5580 5581 disabled: 5582 dev_info(adev->dev, "GPU recovery disabled.\n"); 5583 return false; 5584 } 5585 5586 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5587 { 5588 u32 i; 5589 int ret = 0; 5590 5591 if (adev->bios) 5592 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5593 5594 dev_info(adev->dev, "GPU mode1 reset\n"); 5595 5596 /* Cache the state before bus master disable. The saved config space 5597 * values are used in other cases like restore after mode-2 reset. 
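 * The cached state is restored below via amdgpu_device_load_pci_state()
 * once the reset has been issued.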
5598 */ 5599 amdgpu_device_cache_pci_state(adev->pdev); 5600 5601 /* disable BM */ 5602 pci_clear_master(adev->pdev); 5603 5604 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5605 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5606 ret = amdgpu_dpm_mode1_reset(adev); 5607 } else { 5608 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5609 ret = psp_gpu_reset(adev); 5610 } 5611 5612 if (ret) 5613 goto mode1_reset_failed; 5614 5615 amdgpu_device_load_pci_state(adev->pdev); 5616 ret = amdgpu_psp_wait_for_bootloader(adev); 5617 if (ret) 5618 goto mode1_reset_failed; 5619 5620 /* wait for asic to come out of reset */ 5621 for (i = 0; i < adev->usec_timeout; i++) { 5622 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5623 5624 if (memsize != 0xffffffff) 5625 break; 5626 udelay(1); 5627 } 5628 5629 if (i >= adev->usec_timeout) { 5630 ret = -ETIMEDOUT; 5631 goto mode1_reset_failed; 5632 } 5633 5634 if (adev->bios) 5635 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5636 5637 return 0; 5638 5639 mode1_reset_failed: 5640 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5641 return ret; 5642 } 5643 5644 int amdgpu_device_link_reset(struct amdgpu_device *adev) 5645 { 5646 int ret = 0; 5647 5648 dev_info(adev->dev, "GPU link reset\n"); 5649 5650 if (!adev->pcie_reset_ctx.occurs_dpc) 5651 ret = amdgpu_dpm_link_reset(adev); 5652 5653 if (ret) 5654 goto link_reset_failed; 5655 5656 ret = amdgpu_psp_wait_for_bootloader(adev); 5657 if (ret) 5658 goto link_reset_failed; 5659 5660 return 0; 5661 5662 link_reset_failed: 5663 dev_err(adev->dev, "GPU link reset failed\n"); 5664 return ret; 5665 } 5666 5667 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5668 struct amdgpu_reset_context *reset_context) 5669 { 5670 int i, r = 0; 5671 struct amdgpu_job *job = NULL; 5672 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; 5673 bool need_full_reset = 5674 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5675 5676 if (reset_context->reset_req_dev == adev) 5677 job = reset_context->job; 5678 5679 if (amdgpu_sriov_vf(adev)) 5680 amdgpu_virt_pre_reset(adev); 5681 5682 amdgpu_fence_driver_isr_toggle(adev, true); 5683 5684 /* block all schedulers and reset given job's ring */ 5685 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5686 struct amdgpu_ring *ring = adev->rings[i]; 5687 5688 if (!amdgpu_ring_sched_ready(ring)) 5689 continue; 5690 5691 /* Clear job fence from fence drv to avoid force_completion 5692 * leave NULL and vm flush fence in fence drv 5693 */ 5694 amdgpu_fence_driver_clear_job_fences(ring); 5695 5696 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5697 amdgpu_fence_driver_force_completion(ring); 5698 } 5699 5700 amdgpu_fence_driver_isr_toggle(adev, false); 5701 5702 if (job && job->vm) 5703 drm_sched_increase_karma(&job->base); 5704 5705 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5706 /* If reset handler not implemented, continue; otherwise return */ 5707 if (r == -EOPNOTSUPP) 5708 r = 0; 5709 else 5710 return r; 5711 5712 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5713 if (!amdgpu_sriov_vf(adev)) { 5714 5715 if (!need_full_reset) 5716 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5717 5718 if (!need_full_reset && amdgpu_gpu_recovery && 5719 amdgpu_device_ip_check_soft_reset(adev)) { 5720 amdgpu_device_ip_pre_soft_reset(adev); 5721 r = amdgpu_device_ip_soft_reset(adev); 5722 amdgpu_device_ip_post_soft_reset(adev); 5723 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5724 
dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5725 need_full_reset = true; 5726 } 5727 } 5728 5729 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { 5730 dev_info(tmp_adev->dev, "Dumping IP State\n"); 5731 /* Trigger ip dump before we reset the asic */ 5732 for (i = 0; i < tmp_adev->num_ip_blocks; i++) 5733 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) 5734 tmp_adev->ip_blocks[i].version->funcs 5735 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]); 5736 dev_info(tmp_adev->dev, "Dumping IP State Completed\n"); 5737 } 5738 5739 if (need_full_reset) 5740 r = amdgpu_device_ip_suspend(adev); 5741 if (need_full_reset) 5742 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5743 else 5744 clear_bit(AMDGPU_NEED_FULL_RESET, 5745 &reset_context->flags); 5746 } 5747 5748 return r; 5749 } 5750 5751 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) 5752 { 5753 struct list_head *device_list_handle; 5754 bool full_reset, vram_lost = false; 5755 struct amdgpu_device *tmp_adev; 5756 int r, init_level; 5757 5758 device_list_handle = reset_context->reset_device_list; 5759 5760 if (!device_list_handle) 5761 return -EINVAL; 5762 5763 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5764 5765 /** 5766 * If it's reset on init, it's default init level, otherwise keep level 5767 * as recovery level. 5768 */ 5769 if (reset_context->method == AMD_RESET_METHOD_ON_INIT) 5770 init_level = AMDGPU_INIT_LEVEL_DEFAULT; 5771 else 5772 init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY; 5773 5774 r = 0; 5775 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5776 amdgpu_set_init_level(tmp_adev, init_level); 5777 if (full_reset) { 5778 /* post card */ 5779 amdgpu_ras_clear_err_state(tmp_adev); 5780 r = amdgpu_device_asic_init(tmp_adev); 5781 if (r) { 5782 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5783 } else { 5784 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5785 5786 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5787 if (r) 5788 goto out; 5789 5790 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5791 5792 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 5793 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job); 5794 5795 if (vram_lost) { 5796 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5797 amdgpu_inc_vram_lost(tmp_adev); 5798 } 5799 5800 r = amdgpu_device_fw_loading(tmp_adev); 5801 if (r) 5802 return r; 5803 5804 r = amdgpu_xcp_restore_partition_mode( 5805 tmp_adev->xcp_mgr); 5806 if (r) 5807 goto out; 5808 5809 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5810 if (r) 5811 goto out; 5812 5813 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 5814 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5815 5816 r = amdgpu_device_ip_resume_phase3(tmp_adev); 5817 if (r) 5818 goto out; 5819 5820 if (vram_lost) 5821 amdgpu_device_fill_reset_magic(tmp_adev); 5822 5823 /* 5824 * Add this ASIC as tracked as reset was already 5825 * complete successfully. 5826 */ 5827 amdgpu_register_gpu_instance(tmp_adev); 5828 5829 if (!reset_context->hive && 5830 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5831 amdgpu_xgmi_add_device(tmp_adev); 5832 5833 r = amdgpu_device_ip_late_init(tmp_adev); 5834 if (r) 5835 goto out; 5836 5837 drm_client_dev_resume(adev_to_drm(tmp_adev), false); 5838 5839 /* 5840 * The GPU enters bad state once faulty pages 5841 * by ECC has reached the threshold, and ras 5842 * recovery is scheduled next. 
So add one check 5843 * here to break recovery if it indeed exceeds 5844 * bad page threshold, and remind user to 5845 * retire this GPU or setting one bigger 5846 * bad_page_threshold value to fix this once 5847 * probing driver again. 5848 */ 5849 if (!amdgpu_ras_is_rma(tmp_adev)) { 5850 /* must succeed. */ 5851 amdgpu_ras_resume(tmp_adev); 5852 } else { 5853 r = -EINVAL; 5854 goto out; 5855 } 5856 5857 /* Update PSP FW topology after reset */ 5858 if (reset_context->hive && 5859 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5860 r = amdgpu_xgmi_update_topology( 5861 reset_context->hive, tmp_adev); 5862 } 5863 } 5864 5865 out: 5866 if (!r) { 5867 /* IP init is complete now, set level as default */ 5868 amdgpu_set_init_level(tmp_adev, 5869 AMDGPU_INIT_LEVEL_DEFAULT); 5870 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5871 r = amdgpu_ib_ring_tests(tmp_adev); 5872 if (r) { 5873 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5874 r = -EAGAIN; 5875 goto end; 5876 } 5877 } 5878 5879 if (r) 5880 tmp_adev->asic_reset_res = r; 5881 } 5882 5883 end: 5884 return r; 5885 } 5886 5887 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5888 struct amdgpu_reset_context *reset_context) 5889 { 5890 struct amdgpu_device *tmp_adev = NULL; 5891 bool need_full_reset, skip_hw_reset; 5892 int r = 0; 5893 5894 /* Try reset handler method first */ 5895 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5896 reset_list); 5897 5898 reset_context->reset_device_list = device_list_handle; 5899 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5900 /* If reset handler not implemented, continue; otherwise return */ 5901 if (r == -EOPNOTSUPP) 5902 r = 0; 5903 else 5904 return r; 5905 5906 /* Reset handler not implemented, use the default method */ 5907 need_full_reset = 5908 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5909 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5910 5911 /* 5912 * ASIC reset has to be done on all XGMI hive nodes ASAP 5913 * to allow proper links negotiation in FW (within 1 sec) 5914 */ 5915 if (!skip_hw_reset && need_full_reset) { 5916 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5917 /* For XGMI run all resets in parallel to speed up the process */ 5918 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5919 if (!queue_work(system_unbound_wq, 5920 &tmp_adev->xgmi_reset_work)) 5921 r = -EALREADY; 5922 } else 5923 r = amdgpu_asic_reset(tmp_adev); 5924 5925 if (r) { 5926 dev_err(tmp_adev->dev, 5927 "ASIC reset failed with error, %d for drm dev, %s", 5928 r, adev_to_drm(tmp_adev)->unique); 5929 goto out; 5930 } 5931 } 5932 5933 /* For XGMI wait for all resets to complete before proceed */ 5934 if (!r) { 5935 list_for_each_entry(tmp_adev, device_list_handle, 5936 reset_list) { 5937 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5938 flush_work(&tmp_adev->xgmi_reset_work); 5939 r = tmp_adev->asic_reset_res; 5940 if (r) 5941 break; 5942 } 5943 } 5944 } 5945 } 5946 5947 if (!r && amdgpu_ras_intr_triggered()) { 5948 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5949 amdgpu_ras_reset_error_count(tmp_adev, 5950 AMDGPU_RAS_BLOCK__MMHUB); 5951 } 5952 5953 amdgpu_ras_intr_cleared(); 5954 } 5955 5956 r = amdgpu_device_reinit_after_reset(reset_context); 5957 if (r == -EAGAIN) 5958 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5959 else 5960 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5961 5962 out: 5963 return r; 5964 } 5965 5966 static void 
amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5967 { 5968 5969 switch (amdgpu_asic_reset_method(adev)) { 5970 case AMD_RESET_METHOD_MODE1: 5971 case AMD_RESET_METHOD_LINK: 5972 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5973 break; 5974 case AMD_RESET_METHOD_MODE2: 5975 adev->mp1_state = PP_MP1_STATE_RESET; 5976 break; 5977 default: 5978 adev->mp1_state = PP_MP1_STATE_NONE; 5979 break; 5980 } 5981 } 5982 5983 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5984 { 5985 amdgpu_vf_error_trans_all(adev); 5986 adev->mp1_state = PP_MP1_STATE_NONE; 5987 } 5988 5989 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5990 { 5991 struct pci_dev *p = NULL; 5992 5993 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5994 adev->pdev->bus->number, 1); 5995 if (p) { 5996 pm_runtime_enable(&(p->dev)); 5997 pm_runtime_resume(&(p->dev)); 5998 } 5999 6000 pci_dev_put(p); 6001 } 6002 6003 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 6004 { 6005 enum amd_reset_method reset_method; 6006 struct pci_dev *p = NULL; 6007 u64 expires; 6008 6009 /* 6010 * For now, only BACO and mode1 reset are confirmed 6011 * to suffer the audio issue without proper suspended. 6012 */ 6013 reset_method = amdgpu_asic_reset_method(adev); 6014 if ((reset_method != AMD_RESET_METHOD_BACO) && 6015 (reset_method != AMD_RESET_METHOD_MODE1)) 6016 return -EINVAL; 6017 6018 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 6019 adev->pdev->bus->number, 1); 6020 if (!p) 6021 return -ENODEV; 6022 6023 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 6024 if (!expires) 6025 /* 6026 * If we cannot get the audio device autosuspend delay, 6027 * a fixed 4S interval will be used. Considering 3S is 6028 * the audio controller default autosuspend delay setting. 6029 * 4S used here is guaranteed to cover that. 6030 */ 6031 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 6032 6033 while (!pm_runtime_status_suspended(&(p->dev))) { 6034 if (!pm_runtime_suspend(&(p->dev))) 6035 break; 6036 6037 if (expires < ktime_get_mono_fast_ns()) { 6038 dev_warn(adev->dev, "failed to suspend display audio\n"); 6039 pci_dev_put(p); 6040 /* TODO: abort the succeeding gpu reset? */ 6041 return -ETIMEDOUT; 6042 } 6043 } 6044 6045 pm_runtime_disable(&(p->dev)); 6046 6047 pci_dev_put(p); 6048 return 0; 6049 } 6050 6051 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 6052 { 6053 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 6054 6055 #if defined(CONFIG_DEBUG_FS) 6056 if (!amdgpu_sriov_vf(adev)) 6057 cancel_work(&adev->reset_work); 6058 #endif 6059 6060 if (adev->kfd.dev) 6061 cancel_work(&adev->kfd.reset_work); 6062 6063 if (amdgpu_sriov_vf(adev)) 6064 cancel_work(&adev->virt.flr_work); 6065 6066 if (con && adev->ras_enabled) 6067 cancel_work(&con->recovery_work); 6068 6069 } 6070 6071 static int amdgpu_device_health_check(struct list_head *device_list_handle) 6072 { 6073 struct amdgpu_device *tmp_adev; 6074 int ret = 0; 6075 6076 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6077 ret |= amdgpu_device_bus_status_check(tmp_adev); 6078 } 6079 6080 return ret; 6081 } 6082 6083 static int amdgpu_device_recovery_prepare(struct amdgpu_device *adev, 6084 struct list_head *device_list, 6085 struct amdgpu_hive_info *hive) 6086 { 6087 struct amdgpu_device *tmp_adev = NULL; 6088 int r; 6089 6090 /* 6091 * Build list of devices to reset. 
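 * On bare metal every node of the XGMI hive (hive->device_list) is added,
 * while a single dGPU or an SR-IOV VF only adds itself.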
6092 * In case we are in XGMI hive mode, resort the device list 6093 * to put adev in the 1st position. 6094 */ 6095 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) { 6096 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 6097 list_add_tail(&tmp_adev->reset_list, device_list); 6098 if (adev->shutdown) 6099 tmp_adev->shutdown = true; 6100 if (adev->pcie_reset_ctx.occurs_dpc) 6101 tmp_adev->pcie_reset_ctx.in_link_reset = true; 6102 } 6103 if (!list_is_first(&adev->reset_list, device_list)) 6104 list_rotate_to_front(&adev->reset_list, device_list); 6105 } else { 6106 list_add_tail(&adev->reset_list, device_list); 6107 } 6108 6109 if (!amdgpu_sriov_vf(adev) && (!adev->pcie_reset_ctx.occurs_dpc)) { 6110 r = amdgpu_device_health_check(device_list); 6111 if (r) 6112 return r; 6113 } 6114 6115 return 0; 6116 } 6117 6118 static void amdgpu_device_recovery_get_reset_lock(struct amdgpu_device *adev, 6119 struct list_head *device_list) 6120 { 6121 struct amdgpu_device *tmp_adev = NULL; 6122 6123 if (list_empty(device_list)) 6124 return; 6125 tmp_adev = 6126 list_first_entry(device_list, struct amdgpu_device, reset_list); 6127 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 6128 } 6129 6130 static void amdgpu_device_recovery_put_reset_lock(struct amdgpu_device *adev, 6131 struct list_head *device_list) 6132 { 6133 struct amdgpu_device *tmp_adev = NULL; 6134 6135 if (list_empty(device_list)) 6136 return; 6137 tmp_adev = 6138 list_first_entry(device_list, struct amdgpu_device, reset_list); 6139 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 6140 } 6141 6142 static int amdgpu_device_halt_activities( 6143 struct amdgpu_device *adev, struct amdgpu_job *job, 6144 struct amdgpu_reset_context *reset_context, 6145 struct list_head *device_list, struct amdgpu_hive_info *hive, 6146 bool need_emergency_restart) 6147 { 6148 struct amdgpu_device *tmp_adev = NULL; 6149 int i, r = 0; 6150 6151 /* block all schedulers and reset given job's ring */ 6152 list_for_each_entry(tmp_adev, device_list, reset_list) { 6153 amdgpu_device_set_mp1_state(tmp_adev); 6154 6155 /* 6156 * Try to put the audio codec into suspend state 6157 * before gpu reset started. 6158 * 6159 * Due to the power domain of the graphics device 6160 * is shared with AZ power domain. Without this, 6161 * we may change the audio hardware from behind 6162 * the audio driver's back. That will trigger 6163 * some audio codec errors. 6164 */ 6165 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 6166 tmp_adev->pcie_reset_ctx.audio_suspended = true; 6167 6168 amdgpu_ras_set_error_query_ready(tmp_adev, false); 6169 6170 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 6171 6172 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context); 6173 6174 /* 6175 * Mark these ASICs to be reset as untracked first 6176 * And add them back after reset completed 6177 */ 6178 amdgpu_unregister_gpu_instance(tmp_adev); 6179 6180 drm_client_dev_suspend(adev_to_drm(tmp_adev), false); 6181 6182 /* disable ras on ALL IPs */ 6183 if (!need_emergency_restart && 6184 (!adev->pcie_reset_ctx.occurs_dpc) && 6185 amdgpu_device_ip_need_full_reset(tmp_adev)) 6186 amdgpu_ras_suspend(tmp_adev); 6187 6188 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6189 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6190 6191 if (!amdgpu_ring_sched_ready(ring)) 6192 continue; 6193 6194 drm_sched_stop(&ring->sched, job ? 
&job->base : NULL); 6195 6196 if (need_emergency_restart) 6197 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 6198 } 6199 atomic_inc(&tmp_adev->gpu_reset_counter); 6200 } 6201 6202 return r; 6203 } 6204 6205 static int amdgpu_device_asic_reset(struct amdgpu_device *adev, 6206 struct list_head *device_list, 6207 struct amdgpu_reset_context *reset_context) 6208 { 6209 struct amdgpu_device *tmp_adev = NULL; 6210 int retry_limit = AMDGPU_MAX_RETRY_LIMIT; 6211 int r = 0; 6212 6213 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 6214 list_for_each_entry(tmp_adev, device_list, reset_list) { 6215 if (adev->pcie_reset_ctx.occurs_dpc) 6216 tmp_adev->no_hw_access = true; 6217 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 6218 if (adev->pcie_reset_ctx.occurs_dpc) 6219 tmp_adev->no_hw_access = false; 6220 /*TODO Should we stop ?*/ 6221 if (r) { 6222 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 6223 r, adev_to_drm(tmp_adev)->unique); 6224 tmp_adev->asic_reset_res = r; 6225 } 6226 } 6227 6228 /* Actual ASIC resets if needed.*/ 6229 /* Host driver will handle XGMI hive reset for SRIOV */ 6230 if (amdgpu_sriov_vf(adev)) { 6231 6232 /* Bail out of reset early */ 6233 if (amdgpu_ras_is_rma(adev)) 6234 return -ENODEV; 6235 6236 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { 6237 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); 6238 amdgpu_ras_set_fed(adev, true); 6239 set_bit(AMDGPU_HOST_FLR, &reset_context->flags); 6240 } 6241 6242 r = amdgpu_device_reset_sriov(adev, reset_context); 6243 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) { 6244 amdgpu_virt_release_full_gpu(adev, true); 6245 goto retry; 6246 } 6247 if (r) 6248 adev->asic_reset_res = r; 6249 } else { 6250 r = amdgpu_do_asic_reset(device_list, reset_context); 6251 if (r && r == -EAGAIN) 6252 goto retry; 6253 } 6254 6255 list_for_each_entry(tmp_adev, device_list, reset_list) { 6256 /* 6257 * Drop any pending non scheduler resets queued before reset is done. 6258 * Any reset scheduled after this point would be valid. Scheduler resets 6259 * were already dropped during drm_sched_stop and no new ones can come 6260 * in before drm_sched_start. 6261 */ 6262 amdgpu_device_stop_pending_resets(tmp_adev); 6263 } 6264 6265 return r; 6266 } 6267 6268 static int amdgpu_device_sched_resume(struct list_head *device_list, 6269 struct amdgpu_reset_context *reset_context, 6270 bool job_signaled) 6271 { 6272 struct amdgpu_device *tmp_adev = NULL; 6273 int i, r = 0; 6274 6275 /* Post ASIC reset for all devs .*/ 6276 list_for_each_entry(tmp_adev, device_list, reset_list) { 6277 6278 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6279 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6280 6281 if (!amdgpu_ring_sched_ready(ring)) 6282 continue; 6283 6284 drm_sched_start(&ring->sched, 0); 6285 } 6286 6287 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 6288 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 6289 6290 if (tmp_adev->asic_reset_res) 6291 r = tmp_adev->asic_reset_res; 6292 6293 tmp_adev->asic_reset_res = 0; 6294 6295 if (r) { 6296 /* bad news, how to tell it to userspace ? 
6297 * for ras error, we should report GPU bad status instead of 6298 * reset failure 6299 */ 6300 if (reset_context->src != AMDGPU_RESET_SRC_RAS || 6301 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) 6302 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", 6303 atomic_read(&tmp_adev->gpu_reset_counter)); 6304 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 6305 } else { 6306 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 6307 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 6308 DRM_WARN("smart shift update failed\n"); 6309 } 6310 } 6311 6312 return r; 6313 } 6314 6315 static void amdgpu_device_gpu_resume(struct amdgpu_device *adev, 6316 struct list_head *device_list, 6317 bool need_emergency_restart) 6318 { 6319 struct amdgpu_device *tmp_adev = NULL; 6320 6321 list_for_each_entry(tmp_adev, device_list, reset_list) { 6322 /* unlock kfd: SRIOV would do it separately */ 6323 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 6324 amdgpu_amdkfd_post_reset(tmp_adev); 6325 6326 /* kfd_post_reset will do nothing if kfd device is not initialized, 6327 * need to bring up kfd here if it's not be initialized before 6328 */ 6329 if (!adev->kfd.init_complete) 6330 amdgpu_amdkfd_device_init(adev); 6331 6332 if (tmp_adev->pcie_reset_ctx.audio_suspended) 6333 amdgpu_device_resume_display_audio(tmp_adev); 6334 6335 amdgpu_device_unset_mp1_state(tmp_adev); 6336 6337 amdgpu_ras_set_error_query_ready(tmp_adev, true); 6338 6339 } 6340 } 6341 6342 6343 /** 6344 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 6345 * 6346 * @adev: amdgpu_device pointer 6347 * @job: which job trigger hang 6348 * @reset_context: amdgpu reset context pointer 6349 * 6350 * Attempt to reset the GPU if it has hung (all asics). 6351 * Attempt to do soft-reset or full-reset and reinitialize Asic 6352 * Returns 0 for success or an error on failure. 6353 */ 6354 6355 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 6356 struct amdgpu_job *job, 6357 struct amdgpu_reset_context *reset_context) 6358 { 6359 struct list_head device_list; 6360 bool job_signaled = false; 6361 struct amdgpu_hive_info *hive = NULL; 6362 int r = 0; 6363 bool need_emergency_restart = false; 6364 6365 /* 6366 * If it reaches here because of hang/timeout and a RAS error is 6367 * detected at the same time, let RAS recovery take care of it. 6368 */ 6369 if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) && 6370 !amdgpu_sriov_vf(adev) && 6371 reset_context->src != AMDGPU_RESET_SRC_RAS) { 6372 dev_dbg(adev->dev, 6373 "Gpu recovery from source: %d yielding to RAS error recovery handling", 6374 reset_context->src); 6375 return 0; 6376 } 6377 6378 /* 6379 * Special case: RAS triggered and full reset isn't supported 6380 */ 6381 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 6382 6383 /* 6384 * Flush RAM to disk so that after reboot 6385 * the user can read log and see why the system rebooted. 6386 */ 6387 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 6388 amdgpu_ras_get_context(adev)->reboot) { 6389 DRM_WARN("Emergency reboot."); 6390 6391 ksys_sync_helper(); 6392 emergency_restart(); 6393 } 6394 6395 dev_info(adev->dev, "GPU %s begin!\n", 6396 need_emergency_restart ? 
"jobs stop":"reset"); 6397 6398 if (!amdgpu_sriov_vf(adev)) 6399 hive = amdgpu_get_xgmi_hive(adev); 6400 if (hive) 6401 mutex_lock(&hive->hive_lock); 6402 6403 reset_context->job = job; 6404 reset_context->hive = hive; 6405 INIT_LIST_HEAD(&device_list); 6406 6407 if (amdgpu_device_recovery_prepare(adev, &device_list, hive)) 6408 goto end_reset; 6409 6410 /* We need to lock reset domain only once both for XGMI and single device */ 6411 amdgpu_device_recovery_get_reset_lock(adev, &device_list); 6412 6413 r = amdgpu_device_halt_activities(adev, job, reset_context, &device_list, 6414 hive, need_emergency_restart); 6415 if (r) 6416 goto reset_unlock; 6417 6418 if (need_emergency_restart) 6419 goto skip_sched_resume; 6420 /* 6421 * Must check guilty signal here since after this point all old 6422 * HW fences are force signaled. 6423 * 6424 * job->base holds a reference to parent fence 6425 */ 6426 if (job && dma_fence_is_signaled(&job->hw_fence.base)) { 6427 job_signaled = true; 6428 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 6429 goto skip_hw_reset; 6430 } 6431 6432 r = amdgpu_device_asic_reset(adev, &device_list, reset_context); 6433 if (r) 6434 goto reset_unlock; 6435 skip_hw_reset: 6436 r = amdgpu_device_sched_resume(&device_list, reset_context, job_signaled); 6437 if (r) 6438 goto reset_unlock; 6439 skip_sched_resume: 6440 amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart); 6441 reset_unlock: 6442 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 6443 end_reset: 6444 if (hive) { 6445 mutex_unlock(&hive->hive_lock); 6446 amdgpu_put_xgmi_hive(hive); 6447 } 6448 6449 if (r) 6450 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 6451 6452 atomic_set(&adev->reset_domain->reset_res, r); 6453 6454 if (!r) 6455 drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE); 6456 6457 return r; 6458 } 6459 6460 /** 6461 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner 6462 * 6463 * @adev: amdgpu_device pointer 6464 * @speed: pointer to the speed of the link 6465 * @width: pointer to the width of the link 6466 * 6467 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6468 * first physical partner to an AMD dGPU. 6469 * This will exclude any virtual switches and links. 6470 */ 6471 static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev, 6472 enum pci_bus_speed *speed, 6473 enum pcie_link_width *width) 6474 { 6475 struct pci_dev *parent = adev->pdev; 6476 6477 if (!speed || !width) 6478 return; 6479 6480 *speed = PCI_SPEED_UNKNOWN; 6481 *width = PCIE_LNK_WIDTH_UNKNOWN; 6482 6483 if (amdgpu_device_pcie_dynamic_switching_supported(adev)) { 6484 while ((parent = pci_upstream_bridge(parent))) { 6485 /* skip upstream/downstream switches internal to dGPU*/ 6486 if (parent->vendor == PCI_VENDOR_ID_ATI) 6487 continue; 6488 *speed = pcie_get_speed_cap(parent); 6489 *width = pcie_get_width_cap(parent); 6490 break; 6491 } 6492 } else { 6493 /* use the current speeds rather than max if switching is not supported */ 6494 pcie_bandwidth_available(adev->pdev, NULL, speed, width); 6495 } 6496 } 6497 6498 /** 6499 * amdgpu_device_gpu_bandwidth - find the bandwidth of the GPU 6500 * 6501 * @adev: amdgpu_device pointer 6502 * @speed: pointer to the speed of the link 6503 * @width: pointer to the width of the link 6504 * 6505 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6506 * AMD dGPU which may be a virtual upstream bridge. 
6507 */ 6508 static void amdgpu_device_gpu_bandwidth(struct amdgpu_device *adev, 6509 enum pci_bus_speed *speed, 6510 enum pcie_link_width *width) 6511 { 6512 struct pci_dev *parent = adev->pdev; 6513 6514 if (!speed || !width) 6515 return; 6516 6517 parent = pci_upstream_bridge(parent); 6518 if (parent && parent->vendor == PCI_VENDOR_ID_ATI) { 6519 /* use the upstream/downstream switches internal to dGPU */ 6520 *speed = pcie_get_speed_cap(parent); 6521 *width = pcie_get_width_cap(parent); 6522 while ((parent = pci_upstream_bridge(parent))) { 6523 if (parent->vendor == PCI_VENDOR_ID_ATI) { 6524 /* use the upstream/downstream switches internal to dGPU */ 6525 *speed = pcie_get_speed_cap(parent); 6526 *width = pcie_get_width_cap(parent); 6527 } 6528 } 6529 } else { 6530 /* use the device itself */ 6531 *speed = pcie_get_speed_cap(adev->pdev); 6532 *width = pcie_get_width_cap(adev->pdev); 6533 } 6534 } 6535 6536 /** 6537 * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot 6538 * 6539 * @adev: amdgpu_device pointer 6540 * 6541 * Fetches and stores in the driver the PCIE capabilities (gen speed 6542 * and lanes) of the slot the device is in. Handles APUs and 6543 * virtualized environments where PCIE config space may not be available. 6544 */ 6545 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 6546 { 6547 enum pci_bus_speed speed_cap, platform_speed_cap; 6548 enum pcie_link_width platform_link_width, link_width; 6549 6550 if (amdgpu_pcie_gen_cap) 6551 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 6552 6553 if (amdgpu_pcie_lane_cap) 6554 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 6555 6556 /* covers APUs as well */ 6557 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 6558 if (adev->pm.pcie_gen_mask == 0) 6559 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 6560 if (adev->pm.pcie_mlw_mask == 0) 6561 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 6562 return; 6563 } 6564 6565 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 6566 return; 6567 6568 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap, 6569 &platform_link_width); 6570 amdgpu_device_gpu_bandwidth(adev, &speed_cap, &link_width); 6571 6572 if (adev->pm.pcie_gen_mask == 0) { 6573 /* asic caps */ 6574 if (speed_cap == PCI_SPEED_UNKNOWN) { 6575 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6576 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6577 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6578 } else { 6579 if (speed_cap == PCIE_SPEED_32_0GT) 6580 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6581 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6582 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6583 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6584 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 6585 else if (speed_cap == PCIE_SPEED_16_0GT) 6586 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6587 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6588 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6589 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 6590 else if (speed_cap == PCIE_SPEED_8_0GT) 6591 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6592 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6593 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6594 else if (speed_cap == PCIE_SPEED_5_0GT) 6595 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6596 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 6597 else 6598 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 6599 } 6600 /* platform caps */ 6601 if 
(platform_speed_cap == PCI_SPEED_UNKNOWN) { 6602 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6603 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6604 } else { 6605 if (platform_speed_cap == PCIE_SPEED_32_0GT) 6606 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6607 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6608 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6609 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6610 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 6611 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 6612 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6613 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6614 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6615 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 6616 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 6617 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6618 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6619 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 6620 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 6621 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6622 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6623 else 6624 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 6625 6626 } 6627 } 6628 if (adev->pm.pcie_mlw_mask == 0) { 6629 /* asic caps */ 6630 if (link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6631 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_ASIC_PCIE_MLW_MASK; 6632 } else { 6633 switch (link_width) { 6634 case PCIE_LNK_X32: 6635 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X32 | 6636 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6637 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6638 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6639 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6640 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6641 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6642 break; 6643 case PCIE_LNK_X16: 6644 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6645 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6646 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6647 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6648 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6649 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6650 break; 6651 case PCIE_LNK_X12: 6652 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6653 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6654 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6655 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6656 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6657 break; 6658 case PCIE_LNK_X8: 6659 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6660 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6661 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6662 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6663 break; 6664 case PCIE_LNK_X4: 6665 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6666 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6667 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6668 break; 6669 case PCIE_LNK_X2: 6670 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6671 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6672 break; 6673 case PCIE_LNK_X1: 6674 adev->pm.pcie_mlw_mask |= CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1; 6675 break; 6676 default: 6677 break; 6678 } 6679 } 6680 /* platform caps */ 6681 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6682 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 6683 } else { 6684 switch (platform_link_width) { 6685 case PCIE_LNK_X32: 6686 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 6687 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6688 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6689 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6690 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6691 
CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6692 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6693 break; 6694 case PCIE_LNK_X16: 6695 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6696 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6697 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6698 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6699 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6700 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6701 break; 6702 case PCIE_LNK_X12: 6703 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6704 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6705 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6706 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6707 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6708 break; 6709 case PCIE_LNK_X8: 6710 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6711 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6712 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6713 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6714 break; 6715 case PCIE_LNK_X4: 6716 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6717 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6718 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6719 break; 6720 case PCIE_LNK_X2: 6721 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6722 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6723 break; 6724 case PCIE_LNK_X1: 6725 adev->pm.pcie_mlw_mask |= CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 6726 break; 6727 default: 6728 break; 6729 } 6730 } 6731 } 6732 } 6733 6734 /** 6735 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 6736 * 6737 * @adev: amdgpu_device pointer 6738 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 6739 * 6740 * Return true if @peer_adev can access (DMA) @adev through the PCIe 6741 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 6742 * @peer_adev. 6743 */ 6744 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 6745 struct amdgpu_device *peer_adev) 6746 { 6747 #ifdef CONFIG_HSA_AMD_P2P 6748 bool p2p_access = 6749 !adev->gmc.xgmi.connected_to_cpu && 6750 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 6751 if (!p2p_access) 6752 dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n", 6753 pci_name(peer_adev->pdev)); 6754 6755 bool is_large_bar = adev->gmc.visible_vram_size && 6756 adev->gmc.real_vram_size == adev->gmc.visible_vram_size; 6757 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev); 6758 6759 if (!p2p_addressable) { 6760 uint64_t address_mask = peer_adev->dev->dma_mask ? 
6761 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 6762 resource_size_t aper_limit = 6763 adev->gmc.aper_base + adev->gmc.aper_size - 1; 6764 6765 p2p_addressable = !(adev->gmc.aper_base & address_mask || 6766 aper_limit & address_mask); 6767 } 6768 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable; 6769 #else 6770 return false; 6771 #endif 6772 } 6773 6774 int amdgpu_device_baco_enter(struct drm_device *dev) 6775 { 6776 struct amdgpu_device *adev = drm_to_adev(dev); 6777 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6778 6779 if (!amdgpu_device_supports_baco(dev)) 6780 return -ENOTSUPP; 6781 6782 if (ras && adev->ras_enabled && 6783 adev->nbio.funcs->enable_doorbell_interrupt) 6784 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 6785 6786 return amdgpu_dpm_baco_enter(adev); 6787 } 6788 6789 int amdgpu_device_baco_exit(struct drm_device *dev) 6790 { 6791 struct amdgpu_device *adev = drm_to_adev(dev); 6792 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6793 int ret = 0; 6794 6795 if (!amdgpu_device_supports_baco(dev)) 6796 return -ENOTSUPP; 6797 6798 ret = amdgpu_dpm_baco_exit(adev); 6799 if (ret) 6800 return ret; 6801 6802 if (ras && adev->ras_enabled && 6803 adev->nbio.funcs->enable_doorbell_interrupt) 6804 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 6805 6806 if (amdgpu_passthrough(adev) && adev->nbio.funcs && 6807 adev->nbio.funcs->clear_doorbell_interrupt) 6808 adev->nbio.funcs->clear_doorbell_interrupt(adev); 6809 6810 return 0; 6811 } 6812 6813 /** 6814 * amdgpu_pci_error_detected - Called when a PCI error is detected. 6815 * @pdev: PCI device struct 6816 * @state: PCI channel state 6817 * 6818 * Description: Called when a PCI error is detected. 6819 * 6820 * Return: PCI_ERS_RESULT_CAN_RECOVER, PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
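 *
 * A minimal sketch of how these PCI error callbacks are typically wired up
 * (the actual table lives in amdgpu_drv.c and may differ in detail):
 *
 *   static const struct pci_error_handlers amdgpu_pci_err_handler = {
 *           .error_detected = amdgpu_pci_error_detected,
 *           .mmio_enabled   = amdgpu_pci_mmio_enabled,
 *           .slot_reset     = amdgpu_pci_slot_reset,
 *           .resume         = amdgpu_pci_resume,
 *   };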
6821 */ 6822 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 6823 { 6824 struct drm_device *dev = pci_get_drvdata(pdev); 6825 struct amdgpu_device *adev = drm_to_adev(dev); 6826 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 6827 struct amdgpu_reset_context reset_context; 6828 struct list_head device_list; 6829 int r = 0; 6830 6831 dev_info(adev->dev, "PCI error: detected callback!!\n"); 6832 6833 if (!amdgpu_dpm_is_link_reset_supported(adev)) { 6834 dev_warn(adev->dev, "No support for XGMI hive yet...\n"); 6835 return PCI_ERS_RESULT_DISCONNECT; 6836 } 6837 6838 adev->pci_channel_state = state; 6839 6840 switch (state) { 6841 case pci_channel_io_normal: 6842 dev_info(adev->dev, "pci_channel_io_normal: state(%d)!!\n", state); 6843 return PCI_ERS_RESULT_CAN_RECOVER; 6844 case pci_channel_io_frozen: 6845 /* Fatal error, prepare for slot reset */ 6846 dev_info(adev->dev, "pci_channel_io_frozen: state(%d)!!\n", state); 6847 6848 if (hive) 6849 mutex_lock(&hive->hive_lock); 6850 adev->pcie_reset_ctx.occurs_dpc = true; 6851 memset(&reset_context, 0, sizeof(reset_context)); 6852 INIT_LIST_HEAD(&device_list); 6853 6854 amdgpu_device_recovery_prepare(adev, &device_list, hive); 6855 amdgpu_device_recovery_get_reset_lock(adev, &device_list); 6856 r = amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list, 6857 hive, false); 6858 if (hive) { 6859 mutex_unlock(&hive->hive_lock); 6860 amdgpu_put_xgmi_hive(hive); 6861 } 6862 if (r) 6863 return PCI_ERS_RESULT_DISCONNECT; 6864 return PCI_ERS_RESULT_NEED_RESET; 6865 case pci_channel_io_perm_failure: 6866 /* Permanent error, prepare for device removal */ 6867 dev_info(adev->dev, "pci_channel_io_perm_failure: state(%d)!!\n", state); 6868 return PCI_ERS_RESULT_DISCONNECT; 6869 } 6870 6871 return PCI_ERS_RESULT_NEED_RESET; 6872 } 6873 6874 /** 6875 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 6876 * @pdev: pointer to PCI device 6877 */ 6878 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 6879 { 6880 struct drm_device *dev = pci_get_drvdata(pdev); 6881 struct amdgpu_device *adev = drm_to_adev(dev); 6882 6883 dev_info(adev->dev, "PCI error: mmio enabled callback!!\n"); 6884 6885 /* TODO - dump whatever for debugging purposes */ 6886 6887 /* This called only if amdgpu_pci_error_detected returns 6888 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 6889 * works, no need to reset slot. 6890 */ 6891 6892 return PCI_ERS_RESULT_RECOVERED; 6893 } 6894 6895 /** 6896 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 6897 * @pdev: PCI device struct 6898 * 6899 * Description: This routine is called by the pci error recovery 6900 * code after the PCI slot has been reset, just before we 6901 * should resume normal operations. 
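 *
 * Return: PCI_ERS_RESULT_RECOVERED if the ASIC came out of reset and was
 * reinitialized, PCI_ERS_RESULT_DISCONNECT otherwise.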
6902 */ 6903 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 6904 { 6905 struct drm_device *dev = pci_get_drvdata(pdev); 6906 struct amdgpu_device *adev = drm_to_adev(dev); 6907 struct amdgpu_reset_context reset_context; 6908 struct amdgpu_device *tmp_adev; 6909 struct amdgpu_hive_info *hive; 6910 struct list_head device_list; 6911 int r = 0, i; 6912 u32 memsize; 6913 6914 /* PCI error slot reset should be skipped During RAS recovery */ 6915 if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 6916 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && 6917 amdgpu_ras_in_recovery(adev)) 6918 return PCI_ERS_RESULT_RECOVERED; 6919 6920 dev_info(adev->dev, "PCI error: slot reset callback!!\n"); 6921 6922 memset(&reset_context, 0, sizeof(reset_context)); 6923 6924 /* wait for asic to come out of reset */ 6925 msleep(700); 6926 6927 /* Restore PCI confspace */ 6928 amdgpu_device_load_pci_state(pdev); 6929 6930 /* confirm ASIC came out of reset */ 6931 for (i = 0; i < adev->usec_timeout; i++) { 6932 memsize = amdgpu_asic_get_config_memsize(adev); 6933 6934 if (memsize != 0xffffffff) 6935 break; 6936 udelay(1); 6937 } 6938 if (memsize == 0xffffffff) { 6939 r = -ETIME; 6940 goto out; 6941 } 6942 6943 reset_context.method = AMD_RESET_METHOD_NONE; 6944 reset_context.reset_req_dev = adev; 6945 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 6946 set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags); 6947 INIT_LIST_HEAD(&device_list); 6948 6949 hive = amdgpu_get_xgmi_hive(adev); 6950 if (hive) { 6951 mutex_lock(&hive->hive_lock); 6952 reset_context.hive = hive; 6953 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 6954 tmp_adev->pcie_reset_ctx.in_link_reset = true; 6955 list_add_tail(&tmp_adev->reset_list, &device_list); 6956 } 6957 } else { 6958 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 6959 list_add_tail(&adev->reset_list, &device_list); 6960 } 6961 6962 r = amdgpu_device_asic_reset(adev, &device_list, &reset_context); 6963 out: 6964 if (!r) { 6965 if (amdgpu_device_cache_pci_state(adev->pdev)) 6966 pci_restore_state(adev->pdev); 6967 dev_info(adev->dev, "PCIe error recovery succeeded\n"); 6968 } else { 6969 dev_err(adev->dev, "PCIe error recovery failed, err:%d\n", r); 6970 if (hive) { 6971 list_for_each_entry(tmp_adev, &device_list, reset_list) 6972 amdgpu_device_unset_mp1_state(tmp_adev); 6973 } 6974 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 6975 } 6976 6977 if (hive) { 6978 mutex_unlock(&hive->hive_lock); 6979 amdgpu_put_xgmi_hive(hive); 6980 } 6981 6982 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 6983 } 6984 6985 /** 6986 * amdgpu_pci_resume() - resume normal ops after PCI reset 6987 * @pdev: pointer to PCI device 6988 * 6989 * Called when the error recovery driver tells us that its 6990 * OK to resume normal operation. 
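 *
 * Note that this only restarts the schedulers if the earlier error_detected
 * callback saw pci_channel_io_frozen; for any other channel state it returns
 * without doing anything.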
6991 */ 6992 void amdgpu_pci_resume(struct pci_dev *pdev) 6993 { 6994 struct drm_device *dev = pci_get_drvdata(pdev); 6995 struct amdgpu_device *adev = drm_to_adev(dev); 6996 struct list_head device_list; 6997 struct amdgpu_hive_info *hive = NULL; 6998 struct amdgpu_device *tmp_adev = NULL; 6999 7000 dev_info(adev->dev, "PCI error: resume callback!!\n"); 7001 7002 /* Only continue execution for the case of pci_channel_io_frozen */ 7003 if (adev->pci_channel_state != pci_channel_io_frozen) 7004 return; 7005 7006 INIT_LIST_HEAD(&device_list); 7007 7008 hive = amdgpu_get_xgmi_hive(adev); 7009 if (hive) { 7010 mutex_lock(&hive->hive_lock); 7011 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 7012 tmp_adev->pcie_reset_ctx.in_link_reset = false; 7013 list_add_tail(&tmp_adev->reset_list, &device_list); 7014 } 7015 } else 7016 list_add_tail(&adev->reset_list, &device_list); 7017 7018 amdgpu_device_sched_resume(&device_list, NULL, NULL); 7019 amdgpu_device_gpu_resume(adev, &device_list, false); 7020 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 7021 adev->pcie_reset_ctx.occurs_dpc = false; 7022 7023 if (hive) { 7024 mutex_unlock(&hive->hive_lock); 7025 amdgpu_put_xgmi_hive(hive); 7026 } 7027 } 7028 7029 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 7030 { 7031 struct drm_device *dev = pci_get_drvdata(pdev); 7032 struct amdgpu_device *adev = drm_to_adev(dev); 7033 int r; 7034 7035 if (amdgpu_sriov_vf(adev)) 7036 return false; 7037 7038 r = pci_save_state(pdev); 7039 if (!r) { 7040 kfree(adev->pci_state); 7041 7042 adev->pci_state = pci_store_saved_state(pdev); 7043 7044 if (!adev->pci_state) { 7045 DRM_ERROR("Failed to store PCI saved state"); 7046 return false; 7047 } 7048 } else { 7049 DRM_WARN("Failed to save PCI state, err:%d\n", r); 7050 return false; 7051 } 7052 7053 return true; 7054 } 7055 7056 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 7057 { 7058 struct drm_device *dev = pci_get_drvdata(pdev); 7059 struct amdgpu_device *adev = drm_to_adev(dev); 7060 int r; 7061 7062 if (!adev->pci_state) 7063 return false; 7064 7065 r = pci_load_saved_state(pdev, adev->pci_state); 7066 7067 if (!r) { 7068 pci_restore_state(pdev); 7069 } else { 7070 DRM_WARN("Failed to load PCI state, err:%d\n", r); 7071 return false; 7072 } 7073 7074 return true; 7075 } 7076 7077 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 7078 struct amdgpu_ring *ring) 7079 { 7080 #ifdef CONFIG_X86_64 7081 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 7082 return; 7083 #endif 7084 if (adev->gmc.xgmi.connected_to_cpu) 7085 return; 7086 7087 if (ring && ring->funcs->emit_hdp_flush) 7088 amdgpu_ring_emit_hdp_flush(ring); 7089 else 7090 amdgpu_asic_flush_hdp(adev, ring); 7091 } 7092 7093 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 7094 struct amdgpu_ring *ring) 7095 { 7096 #ifdef CONFIG_X86_64 7097 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 7098 return; 7099 #endif 7100 if (adev->gmc.xgmi.connected_to_cpu) 7101 return; 7102 7103 amdgpu_asic_invalidate_hdp(adev, ring); 7104 } 7105 7106 int amdgpu_in_reset(struct amdgpu_device *adev) 7107 { 7108 return atomic_read(&adev->reset_domain->in_gpu_reset); 7109 } 7110 7111 /** 7112 * amdgpu_device_halt() - bring hardware to some kind of halt state 7113 * 7114 * @adev: amdgpu_device pointer 7115 * 7116 * Bring hardware to some kind of halt state so that no one can touch it 7117 * any more. It will help to maintain error context when error occurred. 
7118 * Compared to a simple hang, the system will keep stable at least for SSH 7119 * access. Then it should be trivial to inspect the hardware state and 7120 * see what's going on. Implemented as follows: 7121 * 7122 * 1. drm_dev_unplug() makes device inaccessible to user space (IOCTLs, etc.), 7123 * clears all CPU mappings to device, disallows remappings through page faults 7124 * 2. amdgpu_irq_disable_all() disables all interrupts 7125 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences 7126 * 4. set adev->no_hw_access to avoid potential crashes after step 5 7127 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings 7128 * 6. pci_disable_device() and pci_wait_for_pending_transaction() 7129 * flush any in flight DMA operations 7130 */ 7131 void amdgpu_device_halt(struct amdgpu_device *adev) 7132 { 7133 struct pci_dev *pdev = adev->pdev; 7134 struct drm_device *ddev = adev_to_drm(adev); 7135 7136 amdgpu_xcp_dev_unplug(adev); 7137 drm_dev_unplug(ddev); 7138 7139 amdgpu_irq_disable_all(adev); 7140 7141 amdgpu_fence_driver_hw_fini(adev); 7142 7143 adev->no_hw_access = true; 7144 7145 amdgpu_device_unmap_mmio(adev); 7146 7147 pci_disable_device(pdev); 7148 pci_wait_for_pending_transaction(pdev); 7149 } 7150 7151 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev, 7152 u32 reg) 7153 { 7154 unsigned long flags, address, data; 7155 u32 r; 7156 7157 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 7158 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 7159 7160 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 7161 WREG32(address, reg * 4); 7162 (void)RREG32(address); 7163 r = RREG32(data); 7164 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 7165 return r; 7166 } 7167 7168 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev, 7169 u32 reg, u32 v) 7170 { 7171 unsigned long flags, address, data; 7172 7173 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 7174 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 7175 7176 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 7177 WREG32(address, reg * 4); 7178 (void)RREG32(address); 7179 WREG32(data, v); 7180 (void)RREG32(data); 7181 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 7182 } 7183 7184 /** 7185 * amdgpu_device_get_gang - return a reference to the current gang 7186 * @adev: amdgpu_device pointer 7187 * 7188 * Returns: A new reference to the current gang leader. 7189 */ 7190 struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev) 7191 { 7192 struct dma_fence *fence; 7193 7194 rcu_read_lock(); 7195 fence = dma_fence_get_rcu_safe(&adev->gang_submit); 7196 rcu_read_unlock(); 7197 return fence; 7198 } 7199 7200 /** 7201 * amdgpu_device_switch_gang - switch to a new gang 7202 * @adev: amdgpu_device pointer 7203 * @gang: the gang to switch to 7204 * 7205 * Try to switch to a new gang. 7206 * Returns: NULL if we switched to the new gang or a reference to the current 7207 * gang leader.
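 *
 * Usage sketch (illustrative, simplified from the submission path): a caller
 * typically treats a non-NULL return value as a dependency that must signal
 * before the switch is retried, e.g.
 *
 *   fence = amdgpu_device_switch_gang(adev, gang);
 *   if (fence)
 *           return fence;    // wait on the old gang leader first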
7208 */ 7209 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev, 7210 struct dma_fence *gang) 7211 { 7212 struct dma_fence *old = NULL; 7213 7214 dma_fence_get(gang); 7215 do { 7216 dma_fence_put(old); 7217 old = amdgpu_device_get_gang(adev); 7218 if (old == gang) 7219 break; 7220 7221 if (!dma_fence_is_signaled(old)) { 7222 dma_fence_put(gang); 7223 return old; 7224 } 7225 7226 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit, 7227 old, gang) != old); 7228 7229 /* 7230 * Drop it once for the exchanged reference in adev and once for the 7231 * thread local reference acquired in amdgpu_device_get_gang(). 7232 */ 7233 dma_fence_put(old); 7234 dma_fence_put(old); 7235 return NULL; 7236 } 7237 7238 /** 7239 * amdgpu_device_enforce_isolation - enforce HW isolation 7240 * @adev: the amdgpu device pointer 7241 * @ring: the HW ring the job is supposed to run on 7242 * @job: the job which is about to be pushed to the HW ring 7243 * 7244 * Makes sure that only one client at a time can use the GFX block. 7245 * Returns: The dependency to wait on before the job can be pushed to the HW. 7246 * The function is called multiple times until NULL is returned. 7247 */ 7248 struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev, 7249 struct amdgpu_ring *ring, 7250 struct amdgpu_job *job) 7251 { 7252 struct amdgpu_isolation *isolation = &adev->isolation[ring->xcp_id]; 7253 struct drm_sched_fence *f = job->base.s_fence; 7254 struct dma_fence *dep; 7255 void *owner; 7256 int r; 7257 7258 /* 7259 * For now enforce isolation only for the GFX block since we only need 7260 * the cleaner shader on those rings. 7261 */ 7262 if (ring->funcs->type != AMDGPU_RING_TYPE_GFX && 7263 ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE) 7264 return NULL; 7265 7266 /* 7267 * All submissions where enforce isolation is false are handled as if 7268 * they come from a single client. Use ~0l as the owner to distinct it 7269 * from kernel submissions where the owner is NULL. 7270 */ 7271 owner = job->enforce_isolation ? f->owner : (void *)~0l; 7272 7273 mutex_lock(&adev->enforce_isolation_mutex); 7274 7275 /* 7276 * The "spearhead" submission is the first one which changes the 7277 * ownership to its client. We always need to wait for it to be 7278 * pushed to the HW before proceeding with anything. 7279 */ 7280 if (&f->scheduled != isolation->spearhead && 7281 !dma_fence_is_signaled(isolation->spearhead)) { 7282 dep = isolation->spearhead; 7283 goto out_grab_ref; 7284 } 7285 7286 if (isolation->owner != owner) { 7287 7288 /* 7289 * Wait for any gang to be assembled before switching to a 7290 * different owner or otherwise we could deadlock the 7291 * submissions. 7292 */ 7293 if (!job->gang_submit) { 7294 dep = amdgpu_device_get_gang(adev); 7295 if (!dma_fence_is_signaled(dep)) 7296 goto out_return_dep; 7297 dma_fence_put(dep); 7298 } 7299 7300 dma_fence_put(isolation->spearhead); 7301 isolation->spearhead = dma_fence_get(&f->scheduled); 7302 amdgpu_sync_move(&isolation->active, &isolation->prev); 7303 trace_amdgpu_isolation(isolation->owner, owner); 7304 isolation->owner = owner; 7305 } 7306 7307 /* 7308 * Specifying the ring here helps to pipeline submissions even when 7309 * isolation is enabled. If that is not desired for testing NULL can be 7310 * used instead of the ring to enforce a CPU round trip while switching 7311 * between clients. 
7312 */ 7313 dep = amdgpu_sync_peek_fence(&isolation->prev, ring); 7314 r = amdgpu_sync_fence(&isolation->active, &f->finished, GFP_NOWAIT); 7315 if (r) 7316 DRM_WARN("OOM tracking isolation\n"); 7317 7318 out_grab_ref: 7319 dma_fence_get(dep); 7320 out_return_dep: 7321 mutex_unlock(&adev->enforce_isolation_mutex); 7322 return dep; 7323 } 7324 7325 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev) 7326 { 7327 switch (adev->asic_type) { 7328 #ifdef CONFIG_DRM_AMDGPU_SI 7329 case CHIP_HAINAN: 7330 #endif 7331 case CHIP_TOPAZ: 7332 /* chips with no display hardware */ 7333 return false; 7334 #ifdef CONFIG_DRM_AMDGPU_SI 7335 case CHIP_TAHITI: 7336 case CHIP_PITCAIRN: 7337 case CHIP_VERDE: 7338 case CHIP_OLAND: 7339 #endif 7340 #ifdef CONFIG_DRM_AMDGPU_CIK 7341 case CHIP_BONAIRE: 7342 case CHIP_HAWAII: 7343 case CHIP_KAVERI: 7344 case CHIP_KABINI: 7345 case CHIP_MULLINS: 7346 #endif 7347 case CHIP_TONGA: 7348 case CHIP_FIJI: 7349 case CHIP_POLARIS10: 7350 case CHIP_POLARIS11: 7351 case CHIP_POLARIS12: 7352 case CHIP_VEGAM: 7353 case CHIP_CARRIZO: 7354 case CHIP_STONEY: 7355 /* chips with display hardware */ 7356 return true; 7357 default: 7358 /* IP discovery */ 7359 if (!amdgpu_ip_version(adev, DCE_HWIP, 0) || 7360 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 7361 return false; 7362 return true; 7363 } 7364 } 7365 7366 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev, 7367 uint32_t inst, uint32_t reg_addr, char reg_name[], 7368 uint32_t expected_value, uint32_t mask) 7369 { 7370 uint32_t ret = 0; 7371 uint32_t old_ = 0; 7372 uint32_t tmp_ = RREG32(reg_addr); 7373 uint32_t loop = adev->usec_timeout; 7374 7375 while ((tmp_ & (mask)) != (expected_value)) { 7376 if (old_ != tmp_) { 7377 loop = adev->usec_timeout; 7378 old_ = tmp_; 7379 } else 7380 udelay(1); 7381 tmp_ = RREG32(reg_addr); 7382 loop--; 7383 if (!loop) { 7384 DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n", 7385 inst, reg_name, (uint32_t)expected_value, 7386 (uint32_t)(tmp_ & (mask))); 7387 ret = -ETIMEDOUT; 7388 break; 7389 } 7390 } 7391 return ret; 7392 } 7393 7394 ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring) 7395 { 7396 ssize_t size = 0; 7397 7398 if (!ring || !ring->adev) 7399 return size; 7400 7401 if (amdgpu_device_should_recover_gpu(ring->adev)) 7402 size |= AMDGPU_RESET_TYPE_FULL; 7403 7404 if (unlikely(!ring->adev->debug_disable_soft_recovery) && 7405 !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery) 7406 size |= AMDGPU_RESET_TYPE_SOFT_RESET; 7407 7408 return size; 7409 } 7410 7411 ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset) 7412 { 7413 ssize_t size = 0; 7414 7415 if (supported_reset == 0) { 7416 size += sysfs_emit_at(buf, size, "unsupported"); 7417 size += sysfs_emit_at(buf, size, "\n"); 7418 return size; 7419 7420 } 7421 7422 if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET) 7423 size += sysfs_emit_at(buf, size, "soft "); 7424 7425 if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE) 7426 size += sysfs_emit_at(buf, size, "queue "); 7427 7428 if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE) 7429 size += sysfs_emit_at(buf, size, "pipe "); 7430 7431 if (supported_reset & AMDGPU_RESET_TYPE_FULL) 7432 size += sysfs_emit_at(buf, size, "full "); 7433 7434 size += sysfs_emit_at(buf, size, "\n"); 7435 return size; 7436 } 7437
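
/*
 * Usage sketch for the reset-mask helpers above (illustrative only; the
 * attribute name and the gfx_supported_reset field are stand-ins for whatever
 * an IP block actually wires into its sysfs show callback):
 *
 *   static ssize_t example_reset_mask_show(struct device *dev,
 *                                          struct device_attribute *attr,
 *                                          char *buf)
 *   {
 *           struct drm_device *ddev = dev_get_drvdata(dev);
 *           struct amdgpu_device *adev = drm_to_adev(ddev);
 *
 *           return amdgpu_show_reset_mask(buf, adev->gfx.gfx_supported_reset);
 *   }
 */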