/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */

#include <linux/aperture.h>
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/pci-p2pdma.h>
#include <linux/apple-gmux.h>

#include <drm/drm_atomic_helper.h>
#include <drm/drm_client_event.h>
#include <drm/drm_crtc_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/device.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"
#include "amdgpu_virt.h"
#include "amdgpu_dev_coredump.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

#if IS_ENABLED(CONFIG_X86)
#include <asm/intel-family.h>
#include <asm/cpu_device_id.h>
#endif

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
#define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
#define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
#define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)

#define AMDGPU_VBIOS_SKIP (1U << 0)
#define AMDGPU_VBIOS_OPTIONAL (1U << 1)

static const struct drm_driver amdgpu_kms_driver;

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"ALDEBARAN",
	"NAVI10",
	"CYAN_SKILLFISH",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"VANGOGH",
	"DIMGREY_CAVEFISH",
	"BEIGE_GOBY",
	"YELLOW_CARP",
	"IP DISCOVERY",
	"LAST",
};

#define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0)
/*
 * Default init level where all blocks are expected to be initialized. This is
 * the level of initialization expected by default and also after a full reset
 * of the device.
 */
struct amdgpu_init_level amdgpu_init_default = {
	.level = AMDGPU_INIT_LEVEL_DEFAULT,
	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

struct amdgpu_init_level amdgpu_init_recovery = {
	.level = AMDGPU_INIT_LEVEL_RESET_RECOVERY,
	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

/*
 * Minimal blocks needed to be initialized before a XGMI hive can be reset. This
 * is used for cases like reset on initialization where the entire hive needs to
 * be reset before first use.
 */
struct amdgpu_init_level amdgpu_init_minimal_xgmi = {
	.level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI,
	.hwini_ip_block_mask =
		BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) |
		BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) |
		BIT(AMD_IP_BLOCK_TYPE_PSP)
};

static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev,
					     enum amd_ip_block_type block)
{
	return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0;
}

void amdgpu_set_init_level(struct amdgpu_device *adev,
			   enum amdgpu_init_lvl_id lvl)
{
	switch (lvl) {
	case AMDGPU_INIT_LEVEL_MINIMAL_XGMI:
		adev->init_lvl = &amdgpu_init_minimal_xgmi;
		break;
	case AMDGPU_INIT_LEVEL_RESET_RECOVERY:
		adev->init_lvl = &amdgpu_init_recovery;
		break;
	case AMDGPU_INIT_LEVEL_DEFAULT:
		fallthrough;
	default:
		adev->init_lvl = &amdgpu_init_default;
		break;
	}
}

static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
				     void *data);

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
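 *
 * Hedged usage sketch (the PCI address in the path below is illustrative
 * only and depends on where the card is enumerated):
 *
 *   cat /sys/bus/pci/devices/0000:03:00.0/pcie_replay_count
 *
 * which prints a single decimal counter.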
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return sysfs_emit(buf, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, 0444,
		   amdgpu_device_get_pcie_replay_count, NULL);

static int amdgpu_device_attr_sysfs_init(struct amdgpu_device *adev)
{
	int ret = 0;

	if (amdgpu_nbio_is_replay_cnt_supported(adev))
		ret = sysfs_create_file(&adev->dev->kobj,
					&dev_attr_pcie_replay_count.attr);

	return ret;
}

static void amdgpu_device_attr_sysfs_fini(struct amdgpu_device *adev)
{
	if (amdgpu_nbio_is_replay_cnt_supported(adev))
		sysfs_remove_file(&adev->dev->kobj,
				  &dev_attr_pcie_replay_count.attr);
}

static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
					  const struct bin_attribute *attr, char *buf,
					  loff_t ppos, size_t count)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	ssize_t bytes_read;

	switch (ppos) {
	case AMDGPU_SYS_REG_STATE_XGMI:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_WAFL:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_PCIE:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR_1:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count);
		break;
	default:
		return -EINVAL;
	}

	return bytes_read;
}

static const BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL,
		      AMDGPU_SYS_REG_STATE_END);

int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev)
{
	int ret;

	if (!amdgpu_asic_get_reg_state_supported(adev))
		return 0;

	ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state);

	return ret;
}

void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
{
	if (!amdgpu_asic_get_reg_state_supported(adev))
		return;
	sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
}

int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block)
{
	int r;

	if (ip_block->version->funcs->suspend) {
		r = ip_block->version->funcs->suspend(ip_block);
		if (r) {
			dev_err(ip_block->adev->dev,
				"suspend of IP block <%s> failed %d\n",
				ip_block->version->funcs->name, r);
			return r;
		}
	}

	ip_block->status.hw = false;
	return 0;
}

int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block)
{
	int r;

	if (ip_block->version->funcs->resume) {
		r = ip_block->version->funcs->resume(ip_block);
		if (r) {
			dev_err(ip_block->adev->dev,
				"resume of IP block <%s> failed %d\n",
				ip_block->version->funcs->name, r);
			return r;
		}
	}

	ip_block->status.hw = true;
	return 0;
}

/**
 * DOC: board_info
 *
 * The amdgpu driver provides a sysfs API for giving board related information.
 * It provides the form factor information in the format
 *
 *   type : form factor
 *
 * Possible form factor values
 *
 * - "cem"     - PCIE CEM card
 * - "oam"     - Open Compute Accelerator Module
 * - "unknown" - Not known
 *
 */

static ssize_t amdgpu_device_get_board_info(struct device *dev,
					    struct device_attribute *attr,
					    char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
	const char *pkg;

	if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
		pkg_type = adev->smuio.funcs->get_pkg_type(adev);

	switch (pkg_type) {
	case AMDGPU_PKG_TYPE_CEM:
		pkg = "cem";
		break;
	case AMDGPU_PKG_TYPE_OAM:
		pkg = "oam";
		break;
	default:
		pkg = "unknown";
		break;
	}

	return sysfs_emit(buf, "%s : %s\n", "type", pkg);
}

static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);

static struct attribute *amdgpu_board_attrs[] = {
	&dev_attr_board_info.attr,
	NULL,
};

static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
					     struct attribute *attr, int n)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	if (adev->flags & AMD_IS_APU)
		return 0;

	return attr->mode;
}

static const struct attribute_group amdgpu_board_attrs_group = {
	.attrs = amdgpu_board_attrs,
	.is_visible = amdgpu_board_attrs_is_visible
};

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);


/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE))
		return false;

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Return:
 * 1 if the device supports BACO;
 * 3 if the device supports MACO (only works if BACO is supported)
 * otherwise return 0.
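 *
 * Illustrative interpretation of the return value (a sketch mirroring the
 * flag meaning above, not a prescribed call site):
 *
 *   int caps = amdgpu_device_supports_baco(dev);
 *   if (caps & MACO_SUPPORT)
 *           ; // BACO and MACO ("BAMACO") are both usable
 *   else if (caps & BACO_SUPPORT)
 *           ; // plain BACO only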
 */
int amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
{
	struct drm_device *dev;
	int bamaco_support;

	dev = adev_to_drm(adev);

	adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
	bamaco_support = amdgpu_device_supports_baco(dev);

	switch (amdgpu_runtime_pm) {
	case 2:
		if (bamaco_support & MACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
			dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
		} else if (bamaco_support == BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Requested mode BAMACO not available, falling back to BACO\n");
		}
		break;
	case 1:
		if (bamaco_support & BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Forcing BACO for runtime pm\n");
		}
		break;
	case -1:
	case -2:
		if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
			dev_info(adev->dev, "Using ATPX for runtime pm\n");
		} else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
			dev_info(adev->dev, "Using BOCO for runtime pm\n");
		} else {
			if (!bamaco_support)
				goto no_runtime_pm;

			switch (adev->asic_type) {
			case CHIP_VEGA20:
			case CHIP_ARCTURUS:
				/* BACO is not supported on vega20 and arcturus */
				break;
			case CHIP_VEGA10:
				/* enable BACO as runpm mode if noretry=0 */
				if (!adev->gmc.noretry && !amdgpu_passthrough(adev))
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			default:
				/* enable BACO as runpm mode on CI+ */
				if (!amdgpu_passthrough(adev))
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			}

			if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
				if (bamaco_support & MACO_SUPPORT) {
					adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
					dev_info(adev->dev, "Using BAMACO for runtime pm\n");
				} else {
					dev_info(adev->dev, "Using BACO for runtime pm\n");
				}
			}
		}
		break;
	case 0:
		dev_info(adev->dev, "runtime pm is manually disabled\n");
		break;
	default:
		break;
	}

no_runtime_pm:
	if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
		dev_info(adev->dev, "Runtime PM not available\n");
}
/**
 * amdgpu_device_supports_smart_shift - Is the device a dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value means how many bytes have been transferred.
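 *
 * Minimal usage sketch (illustrative only; amdgpu_device_vram_access()
 * below is the real combined helper that handles partial transfers):
 *
 *   uint32_t val;
 *   if (amdgpu_device_aper_access(adev, 0, &val, sizeof(val), false) !=
 *       sizeof(val))
 *           amdgpu_device_mm_access(adev, 0, &val, sizeof(val), false);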
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
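 *
 * Illustrative use: most callers reach this helper through the RREG32()/
 * WREG32() style wrappers rather than calling it directly, e.g. (sketch
 * only, MY_REG is a placeholder offset):
 *
 *   uint32_t v = amdgpu_device_rreg(adev, MY_REG, 0);
 *   amdgpu_device_wreg(adev, MY_REG, v | 0x1, 0);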
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}


/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
				uint32_t reg, uint32_t acc_flags,
				uint32_t xcc_id)
{
	uint32_t ret, rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	return ret;
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to write to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/**
 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 * @xcc_id: xcc accelerated compute core id
 *
 * This function is invoked only for debugfs register access.
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v,
			     uint32_t xcc_id)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_xcc_wreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t v,
			    uint32_t acc_flags, uint32_t xcc_id)
{
	uint32_t rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, true,
							 &rlcg_flag)) {
			amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u32 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
				    u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	if (unlikely(!adev->nbio.funcs)) {
		pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
		pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
	} else {
		pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
		pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	}

	if (reg_addr >> 32) {
		if (unlikely(!adev->nbio.funcs))
			pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
		else
			pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	} else {
		pcie_index_hi = 0;
	}

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
				      u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r |= ((u64)readl(pcie_data_offset) << 32);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
				     u64 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
				       u64 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_rev_id - query device rev_id
 *
 * @adev: amdgpu_device pointer
 *
 * Return device rev_id
 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
{
	return adev->nbio.funcs->get_rev_id(adev);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	dev_err(adev->dev, "Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
{
	dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write register 0x%04X with 0x%08X\n", reg,
		v);
	BUG();
}

static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write register 0x%llX with 0x%08X\n", reg,
		v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	dev_err(adev->dev, "Invalid callback to read 64 bit register 0x%04X\n",
		reg);
	BUG();
	return 0;
}

static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
{
	dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		reg, v);
	BUG();
}

static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
		reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	dev_err(adev->dev,
		"Invalid callback to read register 0x%04X in block 0x%04X\n",
		reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	dev_err(adev->dev,
		"Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		reg, block, v);
	BUG();
}

static uint32_t amdgpu_device_get_vbios_flags(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
		return AMDGPU_VBIOS_SKIP;

	if (hweight32(adev->aid_mask) && amdgpu_passthrough(adev))
		return AMDGPU_VBIOS_OPTIONAL;

	return 0;
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	uint32_t flags;
	bool optional;
	int ret;

	amdgpu_asic_pre_asic_init(adev);
	flags = amdgpu_device_get_vbios_flags(adev);
	optional = !!(flags & (AMDGPU_VBIOS_OPTIONAL | AMDGPU_VBIOS_SKIP));

	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
		amdgpu_psp_wait_for_bootloader(adev);
		if (optional && !adev->bios)
			return 0;

		ret = amdgpu_atomfirmware_asic_init(adev, true);
		return ret;
	} else {
		if (optional && !adev->bios)
			return 0;

		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
	}

	return 0;
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with and/or masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long flags, offset;

	spin_lock_irqsave(&adev->wb.lock, flags);
	offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	unsigned long flags;

	wb >>= 3;
	spin_lock_irqsave(&adev->wb.lock, flags);
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
	spin_unlock_irqrestore(&adev->wb.lock, flags);
}

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the size we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned int i;
	u16 cmd;
	int r;

	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
		return 0;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	if (!amdgpu_rebar)
		return 0;

	/* resizing on Dell G5 SE platforms causes problems with runtime pm */
	if ((amdgpu_runtime_pm != 0) &&
	    adev->pdev->vendor == PCI_VENDOR_ID_ATI &&
	    adev->pdev->device == 0x731f &&
	    adev->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL)
		return 0;

	/* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
	if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
		dev_warn(
			adev->dev,
			"System can't access extended configuration space, please check!!\n");

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		dev_info(adev->dev,
			 "Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		dev_err(adev->dev, "Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

/*
 * GPU helper functions.
 */
/**
 * amdgpu_device_need_post - check if the hw needs post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if need or false if not.
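 *
 * Sketch of the typical call pattern at init time (illustrative only; the
 * real sequencing lives in the device init path):
 *
 *   if (amdgpu_device_need_post(adev))
 *           r = amdgpu_device_asic_init(adev); // re-post via atom/atomfirmware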
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg, flags;

	if (amdgpu_sriov_vf(adev))
		return false;

	flags = amdgpu_device_get_vbios_flags(adev);
	if (flags & AMDGPU_VBIOS_SKIP)
		return false;
	if ((flags & AMDGPU_VBIOS_OPTIONAL) && !adev->bios)
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
		 * some old smc fw still needs the driver to do a vPost, otherwise the gpu hangs.
		 * smc fw versions above 22.15 don't have this flaw, so we force
		 * vpost to be executed for smc versions below 22.15
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			release_firmware(adev->pm.fw);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/*
 * Check whether seamless boot is supported.
 *
 * So far we only support seamless boot on DCE 3.0 or later.
 * If users report that it works on older ASICS as well, we may
 * loosen this.
 */
bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
{
	switch (amdgpu_seamless) {
	case -1:
		break;
	case 1:
		return true;
	case 0:
		return false;
	default:
		dev_err(adev->dev, "Invalid value for amdgpu.seamless: %d\n",
			amdgpu_seamless);
		return false;
	}

	if (!(adev->flags & AMD_IS_APU))
		return false;

	if (adev->mman.keep_stolen_vga_memory)
		return false;

	return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
}

/*
 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
 * don't support dynamic speed switching. Until we have confirmation from Intel
 * that a specific host supports it, it's safer that we keep it disabled for all.
 *
 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
 */
static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	/* eGPUs change speeds based on USB4 fabric conditions */
	if (dev_is_removable(adev->dev))
		return true;

	if (c->x86_vendor == X86_VENDOR_INTEL)
		return false;
#endif
	return true;
}

static bool amdgpu_device_aspm_support_quirk(struct amdgpu_device *adev)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	if (!(amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 0) ||
	      amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 1)))
		return false;

	if (c->x86 == 6 &&
	    adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5) {
		switch (c->x86_model) {
		case VFM_MODEL(INTEL_ALDERLAKE):
		case VFM_MODEL(INTEL_ALDERLAKE_L):
		case VFM_MODEL(INTEL_RAPTORLAKE):
		case VFM_MODEL(INTEL_RAPTORLAKE_P):
		case VFM_MODEL(INTEL_RAPTORLAKE_S):
			return true;
		default:
			return false;
		}
	} else {
		return false;
	}
#else
	return false;
#endif
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
	switch (amdgpu_aspm) {
	case -1:
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	if (adev->flags & AMD_IS_APU)
		return false;
	if (amdgpu_device_aspm_support_quirk(adev))
		return false;
	return pcie_aspm_enabled(adev->pdev);
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
						 bool state)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));

	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines the number of bits in a page table versus the
 * page directory; a page is 4KB, so we have a 12 bit offset, a minimum of
 * 9 bits in the page table and the remaining bits in the page directory.
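 *
 * Worked example (simple arithmetic, not driver policy): with
 * amdgpu_vm_block_size=9 a page table covers 2^9 entries * 4KB = 2MB of
 * address space, and the bits above (12 + 9) select page directory entries.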
1978 */ 1979 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1980 { 1981 /* defines number of bits in page table versus page directory, 1982 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1983 * page table and the remaining bits are in the page directory 1984 */ 1985 if (amdgpu_vm_block_size == -1) 1986 return; 1987 1988 if (amdgpu_vm_block_size < 9) { 1989 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1990 amdgpu_vm_block_size); 1991 amdgpu_vm_block_size = -1; 1992 } 1993 } 1994 1995 /** 1996 * amdgpu_device_check_vm_size - validate the vm size 1997 * 1998 * @adev: amdgpu_device pointer 1999 * 2000 * Validates the vm size in GB specified via module parameter. 2001 * The VM size is the size of the GPU virtual memory space in GB. 2002 */ 2003 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 2004 { 2005 /* no need to check the default value */ 2006 if (amdgpu_vm_size == -1) 2007 return; 2008 2009 if (amdgpu_vm_size < 1) { 2010 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 2011 amdgpu_vm_size); 2012 amdgpu_vm_size = -1; 2013 } 2014 } 2015 2016 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 2017 { 2018 struct sysinfo si; 2019 bool is_os_64 = (sizeof(void *) == 8); 2020 uint64_t total_memory; 2021 uint64_t dram_size_seven_GB = 0x1B8000000; 2022 uint64_t dram_size_three_GB = 0xB8000000; 2023 2024 if (amdgpu_smu_memory_pool_size == 0) 2025 return; 2026 2027 if (!is_os_64) { 2028 dev_warn(adev->dev, "Not 64-bit OS, feature not supported\n"); 2029 goto def_value; 2030 } 2031 si_meminfo(&si); 2032 total_memory = (uint64_t)si.totalram * si.mem_unit; 2033 2034 if ((amdgpu_smu_memory_pool_size == 1) || 2035 (amdgpu_smu_memory_pool_size == 2)) { 2036 if (total_memory < dram_size_three_GB) 2037 goto def_value1; 2038 } else if ((amdgpu_smu_memory_pool_size == 4) || 2039 (amdgpu_smu_memory_pool_size == 8)) { 2040 if (total_memory < dram_size_seven_GB) 2041 goto def_value1; 2042 } else { 2043 dev_warn(adev->dev, "Smu memory pool size not supported\n"); 2044 goto def_value; 2045 } 2046 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 2047 2048 return; 2049 2050 def_value1: 2051 dev_warn(adev->dev, "No enough system memory\n"); 2052 def_value: 2053 adev->pm.smu_prv_buffer_size = 0; 2054 } 2055 2056 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 2057 { 2058 if (!(adev->flags & AMD_IS_APU) || 2059 adev->asic_type < CHIP_RAVEN) 2060 return 0; 2061 2062 switch (adev->asic_type) { 2063 case CHIP_RAVEN: 2064 if (adev->pdev->device == 0x15dd) 2065 adev->apu_flags |= AMD_APU_IS_RAVEN; 2066 if (adev->pdev->device == 0x15d8) 2067 adev->apu_flags |= AMD_APU_IS_PICASSO; 2068 break; 2069 case CHIP_RENOIR: 2070 if ((adev->pdev->device == 0x1636) || 2071 (adev->pdev->device == 0x164c)) 2072 adev->apu_flags |= AMD_APU_IS_RENOIR; 2073 else 2074 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 2075 break; 2076 case CHIP_VANGOGH: 2077 adev->apu_flags |= AMD_APU_IS_VANGOGH; 2078 break; 2079 case CHIP_YELLOW_CARP: 2080 break; 2081 case CHIP_CYAN_SKILLFISH: 2082 if ((adev->pdev->device == 0x13FE) || 2083 (adev->pdev->device == 0x143F)) 2084 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 2085 break; 2086 default: 2087 break; 2088 } 2089 2090 return 0; 2091 } 2092 2093 /** 2094 * amdgpu_device_check_arguments - validate module params 2095 * 2096 * @adev: amdgpu_device pointer 2097 * 2098 * Validates certain module parameters and updates 2099 * the associated values used by the 
driver (all asics). 2100 */ 2101 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 2102 { 2103 int i; 2104 2105 if (amdgpu_sched_jobs < 4) { 2106 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 2107 amdgpu_sched_jobs); 2108 amdgpu_sched_jobs = 4; 2109 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 2110 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 2111 amdgpu_sched_jobs); 2112 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 2113 } 2114 2115 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 2116 /* gart size must be greater or equal to 32M */ 2117 dev_warn(adev->dev, "gart size (%d) too small\n", 2118 amdgpu_gart_size); 2119 amdgpu_gart_size = -1; 2120 } 2121 2122 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 2123 /* gtt size must be greater or equal to 32M */ 2124 dev_warn(adev->dev, "gtt size (%d) too small\n", 2125 amdgpu_gtt_size); 2126 amdgpu_gtt_size = -1; 2127 } 2128 2129 /* valid range is between 4 and 9 inclusive */ 2130 if (amdgpu_vm_fragment_size != -1 && 2131 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 2132 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 2133 amdgpu_vm_fragment_size = -1; 2134 } 2135 2136 if (amdgpu_sched_hw_submission < 2) { 2137 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 2138 amdgpu_sched_hw_submission); 2139 amdgpu_sched_hw_submission = 2; 2140 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 2141 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 2142 amdgpu_sched_hw_submission); 2143 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 2144 } 2145 2146 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 2147 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 2148 amdgpu_reset_method = -1; 2149 } 2150 2151 amdgpu_device_check_smu_prv_buffer_size(adev); 2152 2153 amdgpu_device_check_vm_size(adev); 2154 2155 amdgpu_device_check_block_size(adev); 2156 2157 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 2158 2159 for (i = 0; i < MAX_XCP; i++) { 2160 switch (amdgpu_enforce_isolation) { 2161 case -1: 2162 case 0: 2163 default: 2164 /* disable */ 2165 adev->enforce_isolation[i] = AMDGPU_ENFORCE_ISOLATION_DISABLE; 2166 break; 2167 case 1: 2168 /* enable */ 2169 adev->enforce_isolation[i] = 2170 AMDGPU_ENFORCE_ISOLATION_ENABLE; 2171 break; 2172 case 2: 2173 /* enable legacy mode */ 2174 adev->enforce_isolation[i] = 2175 AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY; 2176 break; 2177 case 3: 2178 /* enable only process isolation without submitting cleaner shader */ 2179 adev->enforce_isolation[i] = 2180 AMDGPU_ENFORCE_ISOLATION_NO_CLEANER_SHADER; 2181 break; 2182 } 2183 } 2184 2185 return 0; 2186 } 2187 2188 /** 2189 * amdgpu_switcheroo_set_state - set switcheroo state 2190 * 2191 * @pdev: pci dev pointer 2192 * @state: vga_switcheroo state 2193 * 2194 * Callback for the switcheroo driver. Suspends or resumes 2195 * the asics before or after it is powered up using ACPI methods. 
2196 */ 2197 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 2198 enum vga_switcheroo_state state) 2199 { 2200 struct drm_device *dev = pci_get_drvdata(pdev); 2201 int r; 2202 2203 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 2204 return; 2205 2206 if (state == VGA_SWITCHEROO_ON) { 2207 pr_info("switched on\n"); 2208 /* don't suspend or resume card normally */ 2209 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2210 2211 pci_set_power_state(pdev, PCI_D0); 2212 amdgpu_device_load_pci_state(pdev); 2213 r = pci_enable_device(pdev); 2214 if (r) 2215 dev_warn(&pdev->dev, "pci_enable_device failed (%d)\n", 2216 r); 2217 amdgpu_device_resume(dev, true); 2218 2219 dev->switch_power_state = DRM_SWITCH_POWER_ON; 2220 } else { 2221 dev_info(&pdev->dev, "switched off\n"); 2222 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2223 amdgpu_device_prepare(dev); 2224 amdgpu_device_suspend(dev, true); 2225 amdgpu_device_cache_pci_state(pdev); 2226 /* Shut down the device */ 2227 pci_disable_device(pdev); 2228 pci_set_power_state(pdev, PCI_D3cold); 2229 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 2230 } 2231 } 2232 2233 /** 2234 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 2235 * 2236 * @pdev: pci dev pointer 2237 * 2238 * Callback for the switcheroo driver. Check of the switcheroo 2239 * state can be changed. 2240 * Returns true if the state can be changed, false if not. 2241 */ 2242 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 2243 { 2244 struct drm_device *dev = pci_get_drvdata(pdev); 2245 2246 /* 2247 * FIXME: open_count is protected by drm_global_mutex but that would lead to 2248 * locking inversion with the driver load path. And the access here is 2249 * completely racy anyway. So don't bother with locking for now. 2250 */ 2251 return atomic_read(&dev->open_count) == 0; 2252 } 2253 2254 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 2255 .set_gpu_state = amdgpu_switcheroo_set_state, 2256 .reprobe = NULL, 2257 .can_switch = amdgpu_switcheroo_can_switch, 2258 }; 2259 2260 /** 2261 * amdgpu_device_ip_set_clockgating_state - set the CG state 2262 * 2263 * @dev: amdgpu_device pointer 2264 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2265 * @state: clockgating state (gate or ungate) 2266 * 2267 * Sets the requested clockgating state for all instances of 2268 * the hardware IP specified. 2269 * Returns the error code from the last instance. 2270 */ 2271 int amdgpu_device_ip_set_clockgating_state(void *dev, 2272 enum amd_ip_block_type block_type, 2273 enum amd_clockgating_state state) 2274 { 2275 struct amdgpu_device *adev = dev; 2276 int i, r = 0; 2277 2278 for (i = 0; i < adev->num_ip_blocks; i++) { 2279 if (!adev->ip_blocks[i].status.valid) 2280 continue; 2281 if (adev->ip_blocks[i].version->type != block_type) 2282 continue; 2283 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 2284 continue; 2285 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 2286 &adev->ip_blocks[i], state); 2287 if (r) 2288 dev_err(adev->dev, 2289 "set_clockgating_state of IP block <%s> failed %d\n", 2290 adev->ip_blocks[i].version->funcs->name, r); 2291 } 2292 return r; 2293 } 2294 2295 /** 2296 * amdgpu_device_ip_set_powergating_state - set the PG state 2297 * 2298 * @dev: amdgpu_device pointer 2299 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			&adev->ip_blocks[i], state);
		if (r)
			dev_err(adev->dev,
				"set_powergating_state of IP block <%s> failed %d\n",
				adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u64 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state(
				&adev->ip_blocks[i], flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			if (adev->ip_blocks[i].version->funcs->wait_for_idle) {
				r = adev->ip_blocks[i].version->funcs->wait_for_idle(
					&adev->ip_blocks[i]);
				if (r)
					return r;
			}
			break;
		}
	}
	return 0;
}

/**
 * amdgpu_device_ip_is_valid - is the hardware IP enabled
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is enabled or not.
 * Returns true if the IP is enabled, false if not.
 */
bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev,
			       enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].status.valid;
	}
	return false;
}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
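 *
 * Illustrative usage (a sketch; use_gfx_ip_block() is a hypothetical
 * helper, not part of this file):
 *
 *   struct amdgpu_ip_block *ip =
 *           amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *
 *   if (ip && ip->status.valid)
 *           use_gfx_ip_block(ip);
 *
 * Callers must check for NULL before dereferencing the returned pointer.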
2416 */ 2417 struct amdgpu_ip_block * 2418 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 2419 enum amd_ip_block_type type) 2420 { 2421 int i; 2422 2423 for (i = 0; i < adev->num_ip_blocks; i++) 2424 if (adev->ip_blocks[i].version->type == type) 2425 return &adev->ip_blocks[i]; 2426 2427 return NULL; 2428 } 2429 2430 /** 2431 * amdgpu_device_ip_block_version_cmp 2432 * 2433 * @adev: amdgpu_device pointer 2434 * @type: enum amd_ip_block_type 2435 * @major: major version 2436 * @minor: minor version 2437 * 2438 * return 0 if equal or greater 2439 * return 1 if smaller or the ip_block doesn't exist 2440 */ 2441 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 2442 enum amd_ip_block_type type, 2443 u32 major, u32 minor) 2444 { 2445 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 2446 2447 if (ip_block && ((ip_block->version->major > major) || 2448 ((ip_block->version->major == major) && 2449 (ip_block->version->minor >= minor)))) 2450 return 0; 2451 2452 return 1; 2453 } 2454 2455 /** 2456 * amdgpu_device_ip_block_add 2457 * 2458 * @adev: amdgpu_device pointer 2459 * @ip_block_version: pointer to the IP to add 2460 * 2461 * Adds the IP block driver information to the collection of IPs 2462 * on the asic. 2463 */ 2464 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 2465 const struct amdgpu_ip_block_version *ip_block_version) 2466 { 2467 if (!ip_block_version) 2468 return -EINVAL; 2469 2470 switch (ip_block_version->type) { 2471 case AMD_IP_BLOCK_TYPE_VCN: 2472 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2473 return 0; 2474 break; 2475 case AMD_IP_BLOCK_TYPE_JPEG: 2476 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2477 return 0; 2478 break; 2479 default: 2480 break; 2481 } 2482 2483 dev_info(adev->dev, "detected ip block number %d <%s>\n", 2484 adev->num_ip_blocks, ip_block_version->funcs->name); 2485 2486 adev->ip_blocks[adev->num_ip_blocks].adev = adev; 2487 2488 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2489 2490 return 0; 2491 } 2492 2493 /** 2494 * amdgpu_device_enable_virtual_display - enable virtual display feature 2495 * 2496 * @adev: amdgpu_device pointer 2497 * 2498 * Enabled the virtual display feature if the user has enabled it via 2499 * the module parameter virtual_display. This feature provides a virtual 2500 * display hardware on headless boards or in virtualized environments. 2501 * This function parses and validates the configuration string specified by 2502 * the user and configures the virtual display configuration (number of 2503 * virtual connectors, crtcs, etc.) specified. 
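 *
 * The string is a semicolon-separated list of entries of the form
 * <pci address>[,<number of crtcs>]; "all" matches every device and the
 * crtc count is clamped to the range 1-6 (default 1). For example:
 *
 *   amdgpu.virtual_display=0000:03:00.0,2
 *
 * enables a virtual display with two crtcs on the device at that address
 * (the PCI address above is only an example).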
2504 */ 2505 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2506 { 2507 adev->enable_virtual_display = false; 2508 2509 if (amdgpu_virtual_display) { 2510 const char *pci_address_name = pci_name(adev->pdev); 2511 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2512 2513 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2514 pciaddstr_tmp = pciaddstr; 2515 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2516 pciaddname = strsep(&pciaddname_tmp, ","); 2517 if (!strcmp("all", pciaddname) 2518 || !strcmp(pci_address_name, pciaddname)) { 2519 long num_crtc; 2520 int res = -1; 2521 2522 adev->enable_virtual_display = true; 2523 2524 if (pciaddname_tmp) 2525 res = kstrtol(pciaddname_tmp, 10, 2526 &num_crtc); 2527 2528 if (!res) { 2529 if (num_crtc < 1) 2530 num_crtc = 1; 2531 if (num_crtc > 6) 2532 num_crtc = 6; 2533 adev->mode_info.num_crtc = num_crtc; 2534 } else { 2535 adev->mode_info.num_crtc = 1; 2536 } 2537 break; 2538 } 2539 } 2540 2541 dev_info( 2542 adev->dev, 2543 "virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2544 amdgpu_virtual_display, pci_address_name, 2545 adev->enable_virtual_display, adev->mode_info.num_crtc); 2546 2547 kfree(pciaddstr); 2548 } 2549 } 2550 2551 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2552 { 2553 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2554 adev->mode_info.num_crtc = 1; 2555 adev->enable_virtual_display = true; 2556 dev_info(adev->dev, "virtual_display:%d, num_crtc:%d\n", 2557 adev->enable_virtual_display, 2558 adev->mode_info.num_crtc); 2559 } 2560 } 2561 2562 /** 2563 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2564 * 2565 * @adev: amdgpu_device pointer 2566 * 2567 * Parses the asic configuration parameters specified in the gpu info 2568 * firmware and makes them available to the driver for use in configuring 2569 * the asic. 2570 * Returns 0 on success, -EINVAL on failure. 
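 *
 * Note that the gpu_info firmware is only consulted on ASICs without an
 * IP discovery table; when adev->mman.discovery_bin is present this
 * function returns early and the discovery data is used instead.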
2571 */ 2572 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2573 { 2574 const char *chip_name; 2575 int err; 2576 const struct gpu_info_firmware_header_v1_0 *hdr; 2577 2578 adev->firmware.gpu_info_fw = NULL; 2579 2580 if (adev->mman.discovery_bin) 2581 return 0; 2582 2583 switch (adev->asic_type) { 2584 default: 2585 return 0; 2586 case CHIP_VEGA10: 2587 chip_name = "vega10"; 2588 break; 2589 case CHIP_VEGA12: 2590 chip_name = "vega12"; 2591 break; 2592 case CHIP_RAVEN: 2593 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2594 chip_name = "raven2"; 2595 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2596 chip_name = "picasso"; 2597 else 2598 chip_name = "raven"; 2599 break; 2600 case CHIP_ARCTURUS: 2601 chip_name = "arcturus"; 2602 break; 2603 case CHIP_NAVI12: 2604 chip_name = "navi12"; 2605 break; 2606 } 2607 2608 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, 2609 AMDGPU_UCODE_OPTIONAL, 2610 "amdgpu/%s_gpu_info.bin", chip_name); 2611 if (err) { 2612 dev_err(adev->dev, 2613 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n", 2614 chip_name); 2615 goto out; 2616 } 2617 2618 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2619 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2620 2621 switch (hdr->version_major) { 2622 case 1: 2623 { 2624 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2625 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2626 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2627 2628 /* 2629 * Should be dropped when DAL no longer needs it. 2630 */ 2631 if (adev->asic_type == CHIP_NAVI12) 2632 goto parse_soc_bounding_box; 2633 2634 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2635 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2636 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2637 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2638 adev->gfx.config.max_texture_channel_caches = 2639 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2640 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2641 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2642 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2643 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2644 adev->gfx.config.double_offchip_lds_buf = 2645 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2646 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2647 adev->gfx.cu_info.max_waves_per_simd = 2648 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2649 adev->gfx.cu_info.max_scratch_slots_per_cu = 2650 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2651 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2652 if (hdr->version_minor >= 1) { 2653 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2654 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2655 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2656 adev->gfx.config.num_sc_per_sh = 2657 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2658 adev->gfx.config.num_packer_per_sc = 2659 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2660 } 2661 2662 parse_soc_bounding_box: 2663 /* 2664 * soc bounding box info is not integrated in disocovery table, 2665 * we always need to parse it from gpu info firmware if needed. 
2666 */ 2667 if (hdr->version_minor == 2) { 2668 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2669 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2670 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2671 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2672 } 2673 break; 2674 } 2675 default: 2676 dev_err(adev->dev, 2677 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2678 err = -EINVAL; 2679 goto out; 2680 } 2681 out: 2682 return err; 2683 } 2684 2685 /** 2686 * amdgpu_device_ip_early_init - run early init for hardware IPs 2687 * 2688 * @adev: amdgpu_device pointer 2689 * 2690 * Early initialization pass for hardware IPs. The hardware IPs that make 2691 * up each asic are discovered each IP's early_init callback is run. This 2692 * is the first stage in initializing the asic. 2693 * Returns 0 on success, negative error code on failure. 2694 */ 2695 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2696 { 2697 struct amdgpu_ip_block *ip_block; 2698 struct pci_dev *parent; 2699 bool total, skip_bios; 2700 uint32_t bios_flags; 2701 int i, r; 2702 2703 amdgpu_device_enable_virtual_display(adev); 2704 2705 if (amdgpu_sriov_vf(adev)) { 2706 r = amdgpu_virt_request_full_gpu(adev, true); 2707 if (r) 2708 return r; 2709 } 2710 2711 switch (adev->asic_type) { 2712 #ifdef CONFIG_DRM_AMDGPU_SI 2713 case CHIP_VERDE: 2714 case CHIP_TAHITI: 2715 case CHIP_PITCAIRN: 2716 case CHIP_OLAND: 2717 case CHIP_HAINAN: 2718 adev->family = AMDGPU_FAMILY_SI; 2719 r = si_set_ip_blocks(adev); 2720 if (r) 2721 return r; 2722 break; 2723 #endif 2724 #ifdef CONFIG_DRM_AMDGPU_CIK 2725 case CHIP_BONAIRE: 2726 case CHIP_HAWAII: 2727 case CHIP_KAVERI: 2728 case CHIP_KABINI: 2729 case CHIP_MULLINS: 2730 if (adev->flags & AMD_IS_APU) 2731 adev->family = AMDGPU_FAMILY_KV; 2732 else 2733 adev->family = AMDGPU_FAMILY_CI; 2734 2735 r = cik_set_ip_blocks(adev); 2736 if (r) 2737 return r; 2738 break; 2739 #endif 2740 case CHIP_TOPAZ: 2741 case CHIP_TONGA: 2742 case CHIP_FIJI: 2743 case CHIP_POLARIS10: 2744 case CHIP_POLARIS11: 2745 case CHIP_POLARIS12: 2746 case CHIP_VEGAM: 2747 case CHIP_CARRIZO: 2748 case CHIP_STONEY: 2749 if (adev->flags & AMD_IS_APU) 2750 adev->family = AMDGPU_FAMILY_CZ; 2751 else 2752 adev->family = AMDGPU_FAMILY_VI; 2753 2754 r = vi_set_ip_blocks(adev); 2755 if (r) 2756 return r; 2757 break; 2758 default: 2759 r = amdgpu_discovery_set_ip_blocks(adev); 2760 if (r) 2761 return r; 2762 break; 2763 } 2764 2765 /* Check for IP version 9.4.3 with A0 hardware */ 2766 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) && 2767 !amdgpu_device_get_rev_id(adev)) { 2768 dev_err(adev->dev, "Unsupported A0 hardware\n"); 2769 return -ENODEV; /* device unsupported - no device error */ 2770 } 2771 2772 if (amdgpu_has_atpx() && 2773 (amdgpu_is_atpx_hybrid() || 2774 amdgpu_has_atpx_dgpu_power_cntl()) && 2775 ((adev->flags & AMD_IS_APU) == 0) && 2776 !dev_is_removable(&adev->pdev->dev)) 2777 adev->flags |= AMD_IS_PX; 2778 2779 if (!(adev->flags & AMD_IS_APU)) { 2780 parent = pcie_find_root_port(adev->pdev); 2781 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2782 } 2783 2784 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2785 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2786 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2787 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2788 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2789 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2790 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2791 2792 adev->virt.is_xgmi_node_migrate_enabled = false; 2793 if (amdgpu_sriov_vf(adev)) { 2794 adev->virt.is_xgmi_node_migrate_enabled = 2795 amdgpu_ip_version((adev), GC_HWIP, 0) == IP_VERSION(9, 4, 4); 2796 } 2797 2798 total = true; 2799 for (i = 0; i < adev->num_ip_blocks; i++) { 2800 ip_block = &adev->ip_blocks[i]; 2801 2802 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2803 dev_warn(adev->dev, "disabled ip block: %d <%s>\n", i, 2804 adev->ip_blocks[i].version->funcs->name); 2805 adev->ip_blocks[i].status.valid = false; 2806 } else if (ip_block->version->funcs->early_init) { 2807 r = ip_block->version->funcs->early_init(ip_block); 2808 if (r == -ENOENT) { 2809 adev->ip_blocks[i].status.valid = false; 2810 } else if (r) { 2811 dev_err(adev->dev, 2812 "early_init of IP block <%s> failed %d\n", 2813 adev->ip_blocks[i].version->funcs->name, 2814 r); 2815 total = false; 2816 } else { 2817 adev->ip_blocks[i].status.valid = true; 2818 } 2819 } else { 2820 adev->ip_blocks[i].status.valid = true; 2821 } 2822 /* get the vbios after the asic_funcs are set up */ 2823 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2824 r = amdgpu_device_parse_gpu_info_fw(adev); 2825 if (r) 2826 return r; 2827 2828 bios_flags = amdgpu_device_get_vbios_flags(adev); 2829 skip_bios = !!(bios_flags & AMDGPU_VBIOS_SKIP); 2830 /* Read BIOS */ 2831 if (!skip_bios) { 2832 bool optional = 2833 !!(bios_flags & AMDGPU_VBIOS_OPTIONAL); 2834 if (!amdgpu_get_bios(adev) && !optional) 2835 return -EINVAL; 2836 2837 if (optional && !adev->bios) 2838 dev_info( 2839 adev->dev, 2840 "VBIOS image optional, proceeding without VBIOS image"); 2841 2842 if (adev->bios) { 2843 r = amdgpu_atombios_init(adev); 2844 if (r) { 2845 dev_err(adev->dev, 2846 "amdgpu_atombios_init failed\n"); 2847 amdgpu_vf_error_put( 2848 adev, 2849 AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 2850 0, 0); 2851 return r; 2852 } 2853 } 2854 } 2855 2856 /*get pf2vf msg info at it's earliest time*/ 2857 if (amdgpu_sriov_vf(adev)) 2858 amdgpu_virt_init_data_exchange(adev); 2859 2860 } 2861 } 2862 if (!total) 2863 return -ENODEV; 2864 2865 if (adev->gmc.xgmi.supported) 2866 amdgpu_xgmi_early_init(adev); 2867 2868 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 2869 if (ip_block->status.valid != false) 2870 amdgpu_amdkfd_device_probe(adev); 2871 2872 adev->cg_flags &= amdgpu_cg_mask; 2873 adev->pg_flags &= amdgpu_pg_mask; 2874 2875 return 0; 2876 } 2877 2878 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2879 { 2880 int i, r; 2881 2882 for (i = 0; i < adev->num_ip_blocks; i++) { 2883 if (!adev->ip_blocks[i].status.sw) 2884 continue; 2885 if (adev->ip_blocks[i].status.hw) 2886 continue; 2887 if (!amdgpu_ip_member_of_hwini( 2888 adev, adev->ip_blocks[i].version->type)) 2889 continue; 2890 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2891 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2892 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2893 r = 
adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2894 if (r) { 2895 dev_err(adev->dev, 2896 "hw_init of IP block <%s> failed %d\n", 2897 adev->ip_blocks[i].version->funcs->name, 2898 r); 2899 return r; 2900 } 2901 adev->ip_blocks[i].status.hw = true; 2902 } 2903 } 2904 2905 return 0; 2906 } 2907 2908 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2909 { 2910 int i, r; 2911 2912 for (i = 0; i < adev->num_ip_blocks; i++) { 2913 if (!adev->ip_blocks[i].status.sw) 2914 continue; 2915 if (adev->ip_blocks[i].status.hw) 2916 continue; 2917 if (!amdgpu_ip_member_of_hwini( 2918 adev, adev->ip_blocks[i].version->type)) 2919 continue; 2920 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2921 if (r) { 2922 dev_err(adev->dev, 2923 "hw_init of IP block <%s> failed %d\n", 2924 adev->ip_blocks[i].version->funcs->name, r); 2925 return r; 2926 } 2927 adev->ip_blocks[i].status.hw = true; 2928 } 2929 2930 return 0; 2931 } 2932 2933 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2934 { 2935 int r = 0; 2936 int i; 2937 uint32_t smu_version; 2938 2939 if (adev->asic_type >= CHIP_VEGA10) { 2940 for (i = 0; i < adev->num_ip_blocks; i++) { 2941 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2942 continue; 2943 2944 if (!amdgpu_ip_member_of_hwini(adev, 2945 AMD_IP_BLOCK_TYPE_PSP)) 2946 break; 2947 2948 if (!adev->ip_blocks[i].status.sw) 2949 continue; 2950 2951 /* no need to do the fw loading again if already done*/ 2952 if (adev->ip_blocks[i].status.hw == true) 2953 break; 2954 2955 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2956 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 2957 if (r) 2958 return r; 2959 } else { 2960 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2961 if (r) { 2962 dev_err(adev->dev, 2963 "hw_init of IP block <%s> failed %d\n", 2964 adev->ip_blocks[i] 2965 .version->funcs->name, 2966 r); 2967 return r; 2968 } 2969 adev->ip_blocks[i].status.hw = true; 2970 } 2971 break; 2972 } 2973 } 2974 2975 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2976 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2977 2978 return r; 2979 } 2980 2981 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2982 { 2983 struct drm_sched_init_args args = { 2984 .ops = &amdgpu_sched_ops, 2985 .num_rqs = DRM_SCHED_PRIORITY_COUNT, 2986 .timeout_wq = adev->reset_domain->wq, 2987 .dev = adev->dev, 2988 }; 2989 long timeout; 2990 int r, i; 2991 2992 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2993 struct amdgpu_ring *ring = adev->rings[i]; 2994 2995 /* No need to setup the GPU scheduler for rings that don't need it */ 2996 if (!ring || ring->no_scheduler) 2997 continue; 2998 2999 switch (ring->funcs->type) { 3000 case AMDGPU_RING_TYPE_GFX: 3001 timeout = adev->gfx_timeout; 3002 break; 3003 case AMDGPU_RING_TYPE_COMPUTE: 3004 timeout = adev->compute_timeout; 3005 break; 3006 case AMDGPU_RING_TYPE_SDMA: 3007 timeout = adev->sdma_timeout; 3008 break; 3009 default: 3010 timeout = adev->video_timeout; 3011 break; 3012 } 3013 3014 args.timeout = timeout; 3015 args.credit_limit = ring->num_hw_submission; 3016 args.score = ring->sched_score; 3017 args.name = ring->name; 3018 3019 r = drm_sched_init(&ring->sched, &args); 3020 if (r) { 3021 dev_err(adev->dev, 3022 "Failed to create scheduler on ring %s.\n", 3023 ring->name); 3024 return r; 3025 } 3026 r = amdgpu_uvd_entity_init(adev, ring); 3027 if (r) { 3028 dev_err(adev->dev, 3029 "Failed to create UVD scheduling entity on ring %s.\n", 
3030 ring->name); 3031 return r; 3032 } 3033 r = amdgpu_vce_entity_init(adev, ring); 3034 if (r) { 3035 dev_err(adev->dev, 3036 "Failed to create VCE scheduling entity on ring %s.\n", 3037 ring->name); 3038 return r; 3039 } 3040 } 3041 3042 if (adev->xcp_mgr) 3043 amdgpu_xcp_update_partition_sched_list(adev); 3044 3045 return 0; 3046 } 3047 3048 3049 /** 3050 * amdgpu_device_ip_init - run init for hardware IPs 3051 * 3052 * @adev: amdgpu_device pointer 3053 * 3054 * Main initialization pass for hardware IPs. The list of all the hardware 3055 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 3056 * are run. sw_init initializes the software state associated with each IP 3057 * and hw_init initializes the hardware associated with each IP. 3058 * Returns 0 on success, negative error code on failure. 3059 */ 3060 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 3061 { 3062 bool init_badpage; 3063 int i, r; 3064 3065 r = amdgpu_ras_init(adev); 3066 if (r) 3067 return r; 3068 3069 for (i = 0; i < adev->num_ip_blocks; i++) { 3070 if (!adev->ip_blocks[i].status.valid) 3071 continue; 3072 if (adev->ip_blocks[i].version->funcs->sw_init) { 3073 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]); 3074 if (r) { 3075 dev_err(adev->dev, 3076 "sw_init of IP block <%s> failed %d\n", 3077 adev->ip_blocks[i].version->funcs->name, 3078 r); 3079 goto init_failed; 3080 } 3081 } 3082 adev->ip_blocks[i].status.sw = true; 3083 3084 if (!amdgpu_ip_member_of_hwini( 3085 adev, adev->ip_blocks[i].version->type)) 3086 continue; 3087 3088 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 3089 /* need to do common hw init early so everything is set up for gmc */ 3090 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3091 if (r) { 3092 dev_err(adev->dev, "hw_init %d failed %d\n", i, 3093 r); 3094 goto init_failed; 3095 } 3096 adev->ip_blocks[i].status.hw = true; 3097 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3098 /* need to do gmc hw init early so we can allocate gpu mem */ 3099 /* Try to reserve bad pages early */ 3100 if (amdgpu_sriov_vf(adev)) 3101 amdgpu_virt_exchange_data(adev); 3102 3103 r = amdgpu_device_mem_scratch_init(adev); 3104 if (r) { 3105 dev_err(adev->dev, 3106 "amdgpu_mem_scratch_init failed %d\n", 3107 r); 3108 goto init_failed; 3109 } 3110 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3111 if (r) { 3112 dev_err(adev->dev, "hw_init %d failed %d\n", i, 3113 r); 3114 goto init_failed; 3115 } 3116 r = amdgpu_device_wb_init(adev); 3117 if (r) { 3118 dev_err(adev->dev, 3119 "amdgpu_device_wb_init failed %d\n", r); 3120 goto init_failed; 3121 } 3122 adev->ip_blocks[i].status.hw = true; 3123 3124 /* right after GMC hw init, we create CSA */ 3125 if (adev->gfx.mcbp) { 3126 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 3127 AMDGPU_GEM_DOMAIN_VRAM | 3128 AMDGPU_GEM_DOMAIN_GTT, 3129 AMDGPU_CSA_SIZE); 3130 if (r) { 3131 dev_err(adev->dev, 3132 "allocate CSA failed %d\n", r); 3133 goto init_failed; 3134 } 3135 } 3136 3137 r = amdgpu_seq64_init(adev); 3138 if (r) { 3139 dev_err(adev->dev, "allocate seq64 failed %d\n", 3140 r); 3141 goto init_failed; 3142 } 3143 } 3144 } 3145 3146 if (amdgpu_sriov_vf(adev)) 3147 amdgpu_virt_init_data_exchange(adev); 3148 3149 r = amdgpu_ib_pool_init(adev); 3150 if (r) { 3151 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 3152 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 3153 goto init_failed; 3154 } 
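	/*
	 * Hardware bring-up from here on is ordered: the ucode BO is created
	 * first, phase 1 initializes COMMON and IH (plus PSP under SR-IOV),
	 * the required firmware is then loaded, and phase 2 brings up the
	 * remaining blocks.
	 */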

	r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init completes */
	if (r)
		goto init_failed;

	r = amdgpu_device_ip_hw_init_phase1(adev);
	if (r)
		goto init_failed;

	r = amdgpu_device_fw_loading(adev);
	if (r)
		goto init_failed;

	r = amdgpu_device_ip_hw_init_phase2(adev);
	if (r)
		goto init_failed;

	/*
	 * Retired pages will be loaded from eeprom and reserved here.
	 * This should be called after amdgpu_device_ip_hw_init_phase2, since
	 * for some ASICs the RAS EEPROM code relies on the SMU being fully
	 * functional for I2C communication, which is only true at this point.
	 *
	 * amdgpu_ras_recovery_init may fail, but the upper layers only care
	 * about failures caused by a bad GPU situation and stop the amdgpu
	 * init process accordingly. For other failure cases it still releases
	 * all resources and prints an error message, rather than returning a
	 * negative value to the upper level.
	 *
	 * Note: theoretically, this should be called before all VRAM
	 * allocations to protect retired pages from being reused.
	 */
	init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI);
	r = amdgpu_ras_recovery_init(adev, init_badpage);
	if (r)
		goto init_failed;

	/*
	 * In the case of XGMI, grab an extra reference on the reset domain
	 * for this device.
	 */
	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		if (amdgpu_xgmi_add_device(adev) == 0) {
			if (!amdgpu_sriov_vf(adev)) {
				struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);

				if (WARN_ON(!hive)) {
					r = -ENOENT;
					goto init_failed;
				}

				if (!hive->reset_domain ||
				    !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
					r = -ENOENT;
					amdgpu_put_xgmi_hive(hive);
					goto init_failed;
				}

				/* Drop the early temporary reset domain we created for the device */
				amdgpu_reset_put_reset_domain(adev->reset_domain);
				adev->reset_domain = hive->reset_domain;
				amdgpu_put_xgmi_hive(hive);
			}
		}
	}

	r = amdgpu_device_init_schedulers(adev);
	if (r)
		goto init_failed;

	if (adev->mman.buffer_funcs_ring->sched.ready)
		amdgpu_ttm_set_buffer_funcs_status(adev, true);

	/* Don't init kfd if the whole hive needs to be reset during init */
	if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
		kgd2kfd_init_zone_device(adev);
		amdgpu_amdkfd_device_init(adev);
	}

	amdgpu_fru_get_product_info(adev);

	if (!amdgpu_sriov_vf(adev) || amdgpu_sriov_ras_cper_en(adev))
		r = amdgpu_cper_init(adev);

init_failed:

	return r;
}

/**
 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
 *
 * @adev: amdgpu_device pointer
 *
 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
 * this function before a GPU reset. If the value is retained after a
 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
 */
static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
{
	memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
}

/**
 * amdgpu_device_check_vram_lost - check if vram is valid
 *
 * @adev: amdgpu_device pointer
 *
 * Checks the reset magic value written to the gart pointer in VRAM.
 * The driver calls this after a GPU reset to see if the contents of
 * VRAM are lost or not.
3265 * returns true if vram is lost, false if not. 3266 */ 3267 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 3268 { 3269 if (memcmp(adev->gart.ptr, adev->reset_magic, 3270 AMDGPU_RESET_MAGIC_NUM)) 3271 return true; 3272 3273 if (!amdgpu_in_reset(adev)) 3274 return false; 3275 3276 /* 3277 * For all ASICs with baco/mode1 reset, the VRAM is 3278 * always assumed to be lost. 3279 */ 3280 switch (amdgpu_asic_reset_method(adev)) { 3281 case AMD_RESET_METHOD_LINK: 3282 case AMD_RESET_METHOD_BACO: 3283 case AMD_RESET_METHOD_MODE1: 3284 return true; 3285 default: 3286 return false; 3287 } 3288 } 3289 3290 /** 3291 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 3292 * 3293 * @adev: amdgpu_device pointer 3294 * @state: clockgating state (gate or ungate) 3295 * 3296 * The list of all the hardware IPs that make up the asic is walked and the 3297 * set_clockgating_state callbacks are run. 3298 * Late initialization pass enabling clockgating for hardware IPs. 3299 * Fini or suspend, pass disabling clockgating for hardware IPs. 3300 * Returns 0 on success, negative error code on failure. 3301 */ 3302 3303 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 3304 enum amd_clockgating_state state) 3305 { 3306 int i, j, r; 3307 3308 if (amdgpu_emu_mode == 1) 3309 return 0; 3310 3311 for (j = 0; j < adev->num_ip_blocks; j++) { 3312 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3313 if (!adev->ip_blocks[i].status.late_initialized) 3314 continue; 3315 /* skip CG for GFX, SDMA on S0ix */ 3316 if (adev->in_s0ix && 3317 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3318 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3319 continue; 3320 /* skip CG for VCE/UVD, it's handled specially */ 3321 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3322 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3323 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3324 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3325 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 3326 /* enable clockgating to save power */ 3327 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i], 3328 state); 3329 if (r) { 3330 dev_err(adev->dev, 3331 "set_clockgating_state(gate) of IP block <%s> failed %d\n", 3332 adev->ip_blocks[i].version->funcs->name, 3333 r); 3334 return r; 3335 } 3336 } 3337 } 3338 3339 return 0; 3340 } 3341 3342 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 3343 enum amd_powergating_state state) 3344 { 3345 int i, j, r; 3346 3347 if (amdgpu_emu_mode == 1) 3348 return 0; 3349 3350 for (j = 0; j < adev->num_ip_blocks; j++) { 3351 i = state == AMD_PG_STATE_GATE ? 
j : adev->num_ip_blocks - j - 1;
		if (!adev->ip_blocks[i].status.late_initialized)
			continue;
		/* skip PG for GFX, SDMA on S0ix */
		if (adev->in_s0ix &&
		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
			continue;
		/* skip PG for VCE/UVD, it's handled specially */
		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
		    adev->ip_blocks[i].version->funcs->set_powergating_state) {
			/* enable powergating to save power */
			r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i],
										      state);
			if (r) {
				dev_err(adev->dev,
					"set_powergating_state(gate) of IP block <%s> failed %d\n",
					adev->ip_blocks[i].version->funcs->name,
					r);
				return r;
			}
		}
	}
	return 0;
}

static int amdgpu_device_enable_mgpu_fan_boost(void)
{
	struct amdgpu_gpu_instance *gpu_ins;
	struct amdgpu_device *adev;
	int i, ret = 0;

	mutex_lock(&mgpu_info.mutex);

	/*
	 * The MGPU fan boost feature should be enabled
	 * only when there are two or more dGPUs in
	 * the system.
	 */
	if (mgpu_info.num_dgpu < 2)
		goto out;

	for (i = 0; i < mgpu_info.num_dgpu; i++) {
		gpu_ins = &(mgpu_info.gpu_ins[i]);
		adev = gpu_ins->adev;
		if (!(adev->flags & AMD_IS_APU) &&
		    !gpu_ins->mgpu_fan_enabled) {
			ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
			if (ret)
				break;

			gpu_ins->mgpu_fan_enabled = 1;
		}
	}

out:
	mutex_unlock(&mgpu_info.mutex);

	return ret;
}

/**
 * amdgpu_device_ip_late_init - run late init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Late initialization pass for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the late_init callbacks are run.
 * late_init covers any special initialization that an IP requires
 * after all of the IPs have been initialized or something that needs to
 * happen late in the init process.
 * Returns 0 on success, negative error code on failure.
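 *
 * Besides the per-IP late_init callbacks, this pass also runs RAS late
 * init, enables clock- and powergating, records the VRAM reset magic and,
 * on XGMI hives, lowers the p-state once every device in the hive is up.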
3426 */ 3427 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 3428 { 3429 struct amdgpu_gpu_instance *gpu_instance; 3430 int i = 0, r; 3431 3432 for (i = 0; i < adev->num_ip_blocks; i++) { 3433 if (!adev->ip_blocks[i].status.hw) 3434 continue; 3435 if (adev->ip_blocks[i].version->funcs->late_init) { 3436 r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]); 3437 if (r) { 3438 dev_err(adev->dev, 3439 "late_init of IP block <%s> failed %d\n", 3440 adev->ip_blocks[i].version->funcs->name, 3441 r); 3442 return r; 3443 } 3444 } 3445 adev->ip_blocks[i].status.late_initialized = true; 3446 } 3447 3448 r = amdgpu_ras_late_init(adev); 3449 if (r) { 3450 dev_err(adev->dev, "amdgpu_ras_late_init failed %d", r); 3451 return r; 3452 } 3453 3454 if (!amdgpu_reset_in_recovery(adev)) 3455 amdgpu_ras_set_error_query_ready(adev, true); 3456 3457 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3458 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3459 3460 amdgpu_device_fill_reset_magic(adev); 3461 3462 r = amdgpu_device_enable_mgpu_fan_boost(); 3463 if (r) 3464 dev_err(adev->dev, "enable mgpu fan boost failed (%d).\n", r); 3465 3466 /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */ 3467 if (amdgpu_passthrough(adev) && 3468 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 3469 adev->asic_type == CHIP_ALDEBARAN)) 3470 amdgpu_dpm_handle_passthrough_sbr(adev, true); 3471 3472 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3473 mutex_lock(&mgpu_info.mutex); 3474 3475 /* 3476 * Reset device p-state to low as this was booted with high. 3477 * 3478 * This should be performed only after all devices from the same 3479 * hive get initialized. 3480 * 3481 * However, it's unknown how many device in the hive in advance. 3482 * As this is counted one by one during devices initializations. 3483 * 3484 * So, we wait for all XGMI interlinked devices initialized. 3485 * This may bring some delays as those devices may come from 3486 * different hives. But that should be OK. 
3487 */ 3488 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3489 for (i = 0; i < mgpu_info.num_gpu; i++) { 3490 gpu_instance = &(mgpu_info.gpu_ins[i]); 3491 if (gpu_instance->adev->flags & AMD_IS_APU) 3492 continue; 3493 3494 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3495 AMDGPU_XGMI_PSTATE_MIN); 3496 if (r) { 3497 dev_err(adev->dev, 3498 "pstate setting failed (%d).\n", 3499 r); 3500 break; 3501 } 3502 } 3503 } 3504 3505 mutex_unlock(&mgpu_info.mutex); 3506 } 3507 3508 return 0; 3509 } 3510 3511 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block) 3512 { 3513 struct amdgpu_device *adev = ip_block->adev; 3514 int r; 3515 3516 if (!ip_block->version->funcs->hw_fini) { 3517 dev_err(adev->dev, "hw_fini of IP block <%s> not defined\n", 3518 ip_block->version->funcs->name); 3519 } else { 3520 r = ip_block->version->funcs->hw_fini(ip_block); 3521 /* XXX handle errors */ 3522 if (r) { 3523 dev_dbg(adev->dev, 3524 "hw_fini of IP block <%s> failed %d\n", 3525 ip_block->version->funcs->name, r); 3526 } 3527 } 3528 3529 ip_block->status.hw = false; 3530 } 3531 3532 /** 3533 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3534 * 3535 * @adev: amdgpu_device pointer 3536 * 3537 * For ASICs need to disable SMC first 3538 */ 3539 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3540 { 3541 int i; 3542 3543 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3544 return; 3545 3546 for (i = 0; i < adev->num_ip_blocks; i++) { 3547 if (!adev->ip_blocks[i].status.hw) 3548 continue; 3549 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3550 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3551 break; 3552 } 3553 } 3554 } 3555 3556 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3557 { 3558 int i, r; 3559 3560 for (i = 0; i < adev->num_ip_blocks; i++) { 3561 if (!adev->ip_blocks[i].version->funcs->early_fini) 3562 continue; 3563 3564 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]); 3565 if (r) { 3566 dev_dbg(adev->dev, 3567 "early_fini of IP block <%s> failed %d\n", 3568 adev->ip_blocks[i].version->funcs->name, r); 3569 } 3570 } 3571 3572 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3573 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3574 3575 amdgpu_amdkfd_suspend(adev, true); 3576 amdgpu_userq_suspend(adev); 3577 3578 /* Workaround for ASICs need to disable SMC first */ 3579 amdgpu_device_smu_fini_early(adev); 3580 3581 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3582 if (!adev->ip_blocks[i].status.hw) 3583 continue; 3584 3585 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3586 } 3587 3588 if (amdgpu_sriov_vf(adev)) { 3589 if (amdgpu_virt_release_full_gpu(adev, false)) 3590 dev_err(adev->dev, 3591 "failed to release exclusive mode on fini\n"); 3592 } 3593 3594 return 0; 3595 } 3596 3597 /** 3598 * amdgpu_device_ip_fini - run fini for hardware IPs 3599 * 3600 * @adev: amdgpu_device pointer 3601 * 3602 * Main teardown pass for hardware IPs. The list of all the hardware 3603 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3604 * are run. hw_fini tears down the hardware associated with each IP 3605 * and sw_fini tears down any software state associated with each IP. 3606 * Returns 0 on success, negative error code on failure. 
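 *
 * Teardown runs in reverse IP order: CPER and KFD state are torn down
 * first, sw_fini then walks the IP list backwards (releasing the ucode BO,
 * CSA, writeback, scratch, IB pool, seq64 and doorbell resources when the
 * GMC block is reached), and late_fini and RAS cleanup run last.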
3607 */ 3608 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3609 { 3610 int i, r; 3611 3612 amdgpu_cper_fini(adev); 3613 3614 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3615 amdgpu_virt_release_ras_err_handler_data(adev); 3616 3617 if (adev->gmc.xgmi.num_physical_nodes > 1) 3618 amdgpu_xgmi_remove_device(adev); 3619 3620 amdgpu_amdkfd_device_fini_sw(adev); 3621 3622 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3623 if (!adev->ip_blocks[i].status.sw) 3624 continue; 3625 3626 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3627 amdgpu_ucode_free_bo(adev); 3628 amdgpu_free_static_csa(&adev->virt.csa_obj); 3629 amdgpu_device_wb_fini(adev); 3630 amdgpu_device_mem_scratch_fini(adev); 3631 amdgpu_ib_pool_fini(adev); 3632 amdgpu_seq64_fini(adev); 3633 amdgpu_doorbell_fini(adev); 3634 } 3635 if (adev->ip_blocks[i].version->funcs->sw_fini) { 3636 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]); 3637 /* XXX handle errors */ 3638 if (r) { 3639 dev_dbg(adev->dev, 3640 "sw_fini of IP block <%s> failed %d\n", 3641 adev->ip_blocks[i].version->funcs->name, 3642 r); 3643 } 3644 } 3645 adev->ip_blocks[i].status.sw = false; 3646 adev->ip_blocks[i].status.valid = false; 3647 } 3648 3649 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3650 if (!adev->ip_blocks[i].status.late_initialized) 3651 continue; 3652 if (adev->ip_blocks[i].version->funcs->late_fini) 3653 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]); 3654 adev->ip_blocks[i].status.late_initialized = false; 3655 } 3656 3657 amdgpu_ras_fini(adev); 3658 3659 return 0; 3660 } 3661 3662 /** 3663 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3664 * 3665 * @work: work_struct. 3666 */ 3667 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3668 { 3669 struct amdgpu_device *adev = 3670 container_of(work, struct amdgpu_device, delayed_init_work.work); 3671 int r; 3672 3673 r = amdgpu_ib_ring_tests(adev); 3674 if (r) 3675 dev_err(adev->dev, "ib ring test failed (%d).\n", r); 3676 } 3677 3678 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3679 { 3680 struct amdgpu_device *adev = 3681 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3682 3683 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3684 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3685 3686 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true, 0)) 3687 adev->gfx.gfx_off_state = true; 3688 } 3689 3690 /** 3691 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3692 * 3693 * @adev: amdgpu_device pointer 3694 * 3695 * Main suspend function for hardware IPs. The list of all the hardware 3696 * IPs that make up the asic is walked, clockgating is disabled and the 3697 * suspend callbacks are run. suspend puts the hardware and software state 3698 * in each IP into a state suitable for suspend. 3699 * Returns 0 on success, negative error code on failure. 3700 */ 3701 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3702 { 3703 int i, r; 3704 3705 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3706 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3707 3708 /* 3709 * Per PMFW team's suggestion, driver needs to handle gfxoff 3710 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3711 * scenario. Add the missing df cstate disablement here. 
3712 */ 3713 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3714 dev_warn(adev->dev, "Failed to disallow df cstate"); 3715 3716 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3717 if (!adev->ip_blocks[i].status.valid) 3718 continue; 3719 3720 /* displays are handled separately */ 3721 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3722 continue; 3723 3724 /* XXX handle errors */ 3725 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3726 if (r) 3727 return r; 3728 } 3729 3730 return 0; 3731 } 3732 3733 /** 3734 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3735 * 3736 * @adev: amdgpu_device pointer 3737 * 3738 * Main suspend function for hardware IPs. The list of all the hardware 3739 * IPs that make up the asic is walked, clockgating is disabled and the 3740 * suspend callbacks are run. suspend puts the hardware and software state 3741 * in each IP into a state suitable for suspend. 3742 * Returns 0 on success, negative error code on failure. 3743 */ 3744 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3745 { 3746 int i, r; 3747 3748 if (adev->in_s0ix) 3749 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3750 3751 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3752 if (!adev->ip_blocks[i].status.valid) 3753 continue; 3754 /* displays are handled in phase1 */ 3755 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3756 continue; 3757 /* PSP lost connection when err_event_athub occurs */ 3758 if (amdgpu_ras_intr_triggered() && 3759 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3760 adev->ip_blocks[i].status.hw = false; 3761 continue; 3762 } 3763 3764 /* skip unnecessary suspend if we do not initialize them yet */ 3765 if (!amdgpu_ip_member_of_hwini( 3766 adev, adev->ip_blocks[i].version->type)) 3767 continue; 3768 3769 /* Since we skip suspend for S0i3, we need to cancel the delayed 3770 * idle work here as the suspend callback never gets called. 3771 */ 3772 if (adev->in_s0ix && 3773 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX && 3774 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 0, 0)) 3775 cancel_delayed_work_sync(&adev->gfx.idle_work); 3776 /* skip suspend of gfx/mes and psp for S0ix 3777 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3778 * like at runtime. PSP is also part of the always on hardware 3779 * so no need to suspend it. 3780 */ 3781 if (adev->in_s0ix && 3782 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3783 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3784 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3785 continue; 3786 3787 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3788 if (adev->in_s0ix && 3789 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3790 IP_VERSION(5, 0, 0)) && 3791 (adev->ip_blocks[i].version->type == 3792 AMD_IP_BLOCK_TYPE_SDMA)) 3793 continue; 3794 3795 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3796 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3797 * from this location and RLC Autoload automatically also gets loaded 3798 * from here based on PMFW -> PSP message during re-init sequence. 3799 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3800 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3801 */ 3802 if (amdgpu_in_reset(adev) && 3803 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3804 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3805 continue; 3806 3807 /* XXX handle errors */ 3808 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3809 adev->ip_blocks[i].status.hw = false; 3810 3811 /* handle putting the SMC in the appropriate state */ 3812 if (!amdgpu_sriov_vf(adev)) { 3813 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3814 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3815 if (r) { 3816 dev_err(adev->dev, 3817 "SMC failed to set mp1 state %d, %d\n", 3818 adev->mp1_state, r); 3819 return r; 3820 } 3821 } 3822 } 3823 } 3824 3825 return 0; 3826 } 3827 3828 /** 3829 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3830 * 3831 * @adev: amdgpu_device pointer 3832 * 3833 * Main suspend function for hardware IPs. The list of all the hardware 3834 * IPs that make up the asic is walked, clockgating is disabled and the 3835 * suspend callbacks are run. suspend puts the hardware and software state 3836 * in each IP into a state suitable for suspend. 3837 * Returns 0 on success, negative error code on failure. 3838 */ 3839 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3840 { 3841 int r; 3842 3843 if (amdgpu_sriov_vf(adev)) { 3844 amdgpu_virt_fini_data_exchange(adev); 3845 amdgpu_virt_request_full_gpu(adev, false); 3846 } 3847 3848 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3849 3850 r = amdgpu_device_ip_suspend_phase1(adev); 3851 if (r) 3852 return r; 3853 r = amdgpu_device_ip_suspend_phase2(adev); 3854 3855 if (amdgpu_sriov_vf(adev)) 3856 amdgpu_virt_release_full_gpu(adev, false); 3857 3858 return r; 3859 } 3860 3861 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3862 { 3863 int i, r; 3864 3865 static enum amd_ip_block_type ip_order[] = { 3866 AMD_IP_BLOCK_TYPE_COMMON, 3867 AMD_IP_BLOCK_TYPE_GMC, 3868 AMD_IP_BLOCK_TYPE_PSP, 3869 AMD_IP_BLOCK_TYPE_IH, 3870 }; 3871 3872 for (i = 0; i < adev->num_ip_blocks; i++) { 3873 int j; 3874 struct amdgpu_ip_block *block; 3875 3876 block = &adev->ip_blocks[i]; 3877 block->status.hw = false; 3878 3879 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3880 3881 if (block->version->type != ip_order[j] || 3882 !block->status.valid) 3883 continue; 3884 3885 r = block->version->funcs->hw_init(&adev->ip_blocks[i]); 3886 if (r) { 3887 dev_err(adev->dev, "RE-INIT-early: %s failed\n", 3888 block->version->funcs->name); 3889 return r; 3890 } 3891 block->status.hw = true; 3892 } 3893 } 3894 3895 return 0; 3896 } 3897 3898 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3899 { 3900 struct amdgpu_ip_block *block; 3901 int i, r = 0; 3902 3903 static enum amd_ip_block_type ip_order[] = { 3904 AMD_IP_BLOCK_TYPE_SMC, 3905 AMD_IP_BLOCK_TYPE_DCE, 3906 AMD_IP_BLOCK_TYPE_GFX, 3907 AMD_IP_BLOCK_TYPE_SDMA, 3908 AMD_IP_BLOCK_TYPE_MES, 3909 AMD_IP_BLOCK_TYPE_UVD, 3910 AMD_IP_BLOCK_TYPE_VCE, 3911 AMD_IP_BLOCK_TYPE_VCN, 3912 AMD_IP_BLOCK_TYPE_JPEG 3913 }; 3914 3915 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3916 block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]); 3917 3918 if (!block) 3919 continue; 3920 3921 if (block->status.valid && !block->status.hw) { 3922 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) { 3923 r = amdgpu_ip_block_resume(block); 3924 } else { 3925 r = block->version->funcs->hw_init(block); 3926 } 3927 3928 if (r) { 3929 dev_err(adev->dev, "RE-INIT-late: %s failed\n", 3930 block->version->funcs->name); 3931 break; 3932 } 
3933 block->status.hw = true; 3934 } 3935 } 3936 3937 return r; 3938 } 3939 3940 /** 3941 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3942 * 3943 * @adev: amdgpu_device pointer 3944 * 3945 * First resume function for hardware IPs. The list of all the hardware 3946 * IPs that make up the asic is walked and the resume callbacks are run for 3947 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3948 * after a suspend and updates the software state as necessary. This 3949 * function is also used for restoring the GPU after a GPU reset. 3950 * Returns 0 on success, negative error code on failure. 3951 */ 3952 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3953 { 3954 int i, r; 3955 3956 for (i = 0; i < adev->num_ip_blocks; i++) { 3957 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3958 continue; 3959 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3960 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3961 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3962 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3963 3964 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3965 if (r) 3966 return r; 3967 } 3968 } 3969 3970 return 0; 3971 } 3972 3973 /** 3974 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3975 * 3976 * @adev: amdgpu_device pointer 3977 * 3978 * Second resume function for hardware IPs. The list of all the hardware 3979 * IPs that make up the asic is walked and the resume callbacks are run for 3980 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3981 * functional state after a suspend and updates the software state as 3982 * necessary. This function is also used for restoring the GPU after a GPU 3983 * reset. 3984 * Returns 0 on success, negative error code on failure. 3985 */ 3986 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3987 { 3988 int i, r; 3989 3990 for (i = 0; i < adev->num_ip_blocks; i++) { 3991 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3992 continue; 3993 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3994 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3995 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3996 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE || 3997 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3998 continue; 3999 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 4000 if (r) 4001 return r; 4002 } 4003 4004 return 0; 4005 } 4006 4007 /** 4008 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs 4009 * 4010 * @adev: amdgpu_device pointer 4011 * 4012 * Third resume function for hardware IPs. The list of all the hardware 4013 * IPs that make up the asic is walked and the resume callbacks are run for 4014 * all DCE. resume puts the hardware into a functional state after a suspend 4015 * and updates the software state as necessary. This function is also used 4016 * for restoring the GPU after a GPU reset. 4017 * 4018 * Returns 0 on success, negative error code on failure. 
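 *
 * For illustration only: the three resume phases are driven in order by
 * amdgpu_device_ip_resume() further below, roughly as
 *
 *   amdgpu_device_ip_resume_phase1(adev);   // COMMON, GMC, IH (and PSP on SR-IOV)
 *   amdgpu_device_fw_loading(adev);
 *   amdgpu_device_ip_resume_phase2(adev);   // most remaining blocks
 *   amdgpu_device_ip_resume_phase3(adev);   // DCE
 *
 * with error handling and the fence driver bring-up between the later
 * phases omitted here for brevity.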
 */
static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) {
			r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
			if (r)
				return r;
		}
	}

	return 0;
}

/**
 * amdgpu_device_ip_resume - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main resume function for hardware IPs. The hardware IPs
 * are split into multiple resume phases because they are
 * also used in recovering from a GPU reset and some additional
 * steps need to be taken between them. In this case (S3/S4) the
 * phases are run sequentially.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
{
	int r;

	r = amdgpu_device_ip_resume_phase1(adev);
	if (r)
		return r;

	r = amdgpu_device_fw_loading(adev);
	if (r)
		return r;

	r = amdgpu_device_ip_resume_phase2(adev);

	if (adev->mman.buffer_funcs_ring->sched.ready)
		amdgpu_ttm_set_buffer_funcs_status(adev, true);

	if (r)
		return r;

	amdgpu_fence_driver_hw_init(adev);

	r = amdgpu_device_ip_resume_phase3(adev);

	return r;
}

/**
 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
 *
 * @adev: amdgpu_device pointer
 *
 * Query the VBIOS data tables to determine if the board supports SR-IOV.
 */
static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
{
	if (amdgpu_sriov_vf(adev)) {
		if (adev->is_atom_fw) {
			if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
		} else {
			if (amdgpu_atombios_has_gpu_virtualization_table(adev))
				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
		}

		if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
	}
}

/**
 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
 *
 * @pdev: pci device context
 * @asic_type: AMD asic type
 *
 * Check if there is DC (new modesetting infrastructure) support for an asic.
 * Returns true if DC has support, false if not.
 */
bool amdgpu_device_asic_has_dc_support(struct pci_dev *pdev,
				       enum amd_asic_type asic_type)
{
	switch (asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#if defined(CONFIG_DRM_AMD_DC)
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
		/*
		 * We have systems in the wild with these ASICs that require
		 * LVDS and VGA support which is not supported with DC.
		 *
		 * Fall back to the non-DC driver here by default so as not to
		 * cause regressions.
		 */
#if defined(CONFIG_DRM_AMD_DC_SI)
		return amdgpu_dc > 0;
#else
		return false;
#endif
	case CHIP_BONAIRE:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
		/*
		 * We have systems in the wild with these ASICs that require
		 * VGA support which is not supported with DC.
4142 * 4143 * Fallback to the non-DC driver here by default so as not to 4144 * cause regressions. 4145 */ 4146 return amdgpu_dc > 0; 4147 default: 4148 return amdgpu_dc != 0; 4149 #else 4150 default: 4151 if (amdgpu_dc > 0) 4152 dev_info_once( 4153 &pdev->dev, 4154 "Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 4155 return false; 4156 #endif 4157 } 4158 } 4159 4160 /** 4161 * amdgpu_device_has_dc_support - check if dc is supported 4162 * 4163 * @adev: amdgpu_device pointer 4164 * 4165 * Returns true for supported, false for not supported 4166 */ 4167 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 4168 { 4169 if (adev->enable_virtual_display || 4170 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 4171 return false; 4172 4173 return amdgpu_device_asic_has_dc_support(adev->pdev, adev->asic_type); 4174 } 4175 4176 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 4177 { 4178 struct amdgpu_device *adev = 4179 container_of(__work, struct amdgpu_device, xgmi_reset_work); 4180 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 4181 4182 /* It's a bug to not have a hive within this function */ 4183 if (WARN_ON(!hive)) 4184 return; 4185 4186 /* 4187 * Use task barrier to synchronize all xgmi reset works across the 4188 * hive. task_barrier_enter and task_barrier_exit will block 4189 * until all the threads running the xgmi reset works reach 4190 * those points. task_barrier_full will do both blocks. 4191 */ 4192 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 4193 4194 task_barrier_enter(&hive->tb); 4195 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 4196 4197 if (adev->asic_reset_res) 4198 goto fail; 4199 4200 task_barrier_exit(&hive->tb); 4201 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 4202 4203 if (adev->asic_reset_res) 4204 goto fail; 4205 4206 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 4207 } else { 4208 4209 task_barrier_full(&hive->tb); 4210 adev->asic_reset_res = amdgpu_asic_reset(adev); 4211 } 4212 4213 fail: 4214 if (adev->asic_reset_res) 4215 dev_warn(adev->dev, 4216 "ASIC reset failed with error, %d for drm dev, %s", 4217 adev->asic_reset_res, adev_to_drm(adev)->unique); 4218 amdgpu_put_xgmi_hive(hive); 4219 } 4220 4221 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 4222 { 4223 char *input = amdgpu_lockup_timeout; 4224 char *timeout_setting = NULL; 4225 int index = 0; 4226 long timeout; 4227 int ret = 0; 4228 4229 /* 4230 * By default timeout for non compute jobs is 10000 4231 * and 60000 for compute jobs. 4232 * In SR-IOV or passthrough mode, timeout for compute 4233 * jobs are 60000 by default. 4234 */ 4235 adev->gfx_timeout = msecs_to_jiffies(10000); 4236 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4237 if (amdgpu_sriov_vf(adev)) 4238 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
4239 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 4240 else 4241 adev->compute_timeout = msecs_to_jiffies(60000); 4242 4243 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4244 while ((timeout_setting = strsep(&input, ",")) && 4245 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4246 ret = kstrtol(timeout_setting, 0, &timeout); 4247 if (ret) 4248 return ret; 4249 4250 if (timeout == 0) { 4251 index++; 4252 continue; 4253 } else if (timeout < 0) { 4254 timeout = MAX_SCHEDULE_TIMEOUT; 4255 dev_warn(adev->dev, "lockup timeout disabled"); 4256 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 4257 } else { 4258 timeout = msecs_to_jiffies(timeout); 4259 } 4260 4261 switch (index++) { 4262 case 0: 4263 adev->gfx_timeout = timeout; 4264 break; 4265 case 1: 4266 adev->compute_timeout = timeout; 4267 break; 4268 case 2: 4269 adev->sdma_timeout = timeout; 4270 break; 4271 case 3: 4272 adev->video_timeout = timeout; 4273 break; 4274 default: 4275 break; 4276 } 4277 } 4278 /* 4279 * There is only one value specified and 4280 * it should apply to all non-compute jobs. 4281 */ 4282 if (index == 1) { 4283 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4284 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 4285 adev->compute_timeout = adev->gfx_timeout; 4286 } 4287 } 4288 4289 return ret; 4290 } 4291 4292 /** 4293 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 4294 * 4295 * @adev: amdgpu_device pointer 4296 * 4297 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 4298 */ 4299 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 4300 { 4301 struct iommu_domain *domain; 4302 4303 domain = iommu_get_domain_for_dev(adev->dev); 4304 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 4305 adev->ram_is_direct_mapped = true; 4306 } 4307 4308 #if defined(CONFIG_HSA_AMD_P2P) 4309 /** 4310 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled. 4311 * 4312 * @adev: amdgpu_device pointer 4313 * 4314 * return if IOMMU remapping bar address 4315 */ 4316 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev) 4317 { 4318 struct iommu_domain *domain; 4319 4320 domain = iommu_get_domain_for_dev(adev->dev); 4321 if (domain && (domain->type == IOMMU_DOMAIN_DMA || 4322 domain->type == IOMMU_DOMAIN_DMA_FQ)) 4323 return true; 4324 4325 return false; 4326 } 4327 #endif 4328 4329 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 4330 { 4331 if (amdgpu_mcbp == 1) 4332 adev->gfx.mcbp = true; 4333 else if (amdgpu_mcbp == 0) 4334 adev->gfx.mcbp = false; 4335 4336 if (amdgpu_sriov_vf(adev)) 4337 adev->gfx.mcbp = true; 4338 4339 if (adev->gfx.mcbp) 4340 dev_info(adev->dev, "MCBP is enabled\n"); 4341 } 4342 4343 /** 4344 * amdgpu_device_init - initialize the driver 4345 * 4346 * @adev: amdgpu_device pointer 4347 * @flags: driver flags 4348 * 4349 * Initializes the driver info and hw (all asics). 4350 * Returns 0 for success or an error on failure. 4351 * Called at driver startup. 
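 *
 * Illustrative sketch only (the exact call site lives outside this file):
 * the PCI probe/KMS load path is expected to invoke this roughly as
 *
 *   r = amdgpu_device_init(adev, flags);
 *   if (r)
 *           return r;
 *
 * where @flags carries the ASIC type and feature bits taken from the
 * matching PCI ID table entry.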
4352 */ 4353 int amdgpu_device_init(struct amdgpu_device *adev, 4354 uint32_t flags) 4355 { 4356 struct drm_device *ddev = adev_to_drm(adev); 4357 struct pci_dev *pdev = adev->pdev; 4358 int r, i; 4359 bool px = false; 4360 u32 max_MBps; 4361 int tmp; 4362 4363 adev->shutdown = false; 4364 adev->flags = flags; 4365 4366 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 4367 adev->asic_type = amdgpu_force_asic_type; 4368 else 4369 adev->asic_type = flags & AMD_ASIC_MASK; 4370 4371 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 4372 if (amdgpu_emu_mode == 1) 4373 adev->usec_timeout *= 10; 4374 adev->gmc.gart_size = 512 * 1024 * 1024; 4375 adev->accel_working = false; 4376 adev->num_rings = 0; 4377 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 4378 adev->mman.buffer_funcs = NULL; 4379 adev->mman.buffer_funcs_ring = NULL; 4380 adev->vm_manager.vm_pte_funcs = NULL; 4381 adev->vm_manager.vm_pte_num_scheds = 0; 4382 adev->gmc.gmc_funcs = NULL; 4383 adev->harvest_ip_mask = 0x0; 4384 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 4385 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 4386 4387 adev->smc_rreg = &amdgpu_invalid_rreg; 4388 adev->smc_wreg = &amdgpu_invalid_wreg; 4389 adev->pcie_rreg = &amdgpu_invalid_rreg; 4390 adev->pcie_wreg = &amdgpu_invalid_wreg; 4391 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 4392 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 4393 adev->pciep_rreg = &amdgpu_invalid_rreg; 4394 adev->pciep_wreg = &amdgpu_invalid_wreg; 4395 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 4396 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 4397 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 4398 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 4399 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 4400 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 4401 adev->didt_rreg = &amdgpu_invalid_rreg; 4402 adev->didt_wreg = &amdgpu_invalid_wreg; 4403 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 4404 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 4405 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 4406 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 4407 4408 dev_info( 4409 adev->dev, 4410 "initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 4411 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 4412 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 4413 4414 /* mutex initialization are all done here so we 4415 * can recall function without having locking issues 4416 */ 4417 mutex_init(&adev->firmware.mutex); 4418 mutex_init(&adev->pm.mutex); 4419 mutex_init(&adev->gfx.gpu_clock_mutex); 4420 mutex_init(&adev->srbm_mutex); 4421 mutex_init(&adev->gfx.pipe_reserve_mutex); 4422 mutex_init(&adev->gfx.gfx_off_mutex); 4423 mutex_init(&adev->gfx.partition_mutex); 4424 mutex_init(&adev->grbm_idx_mutex); 4425 mutex_init(&adev->mn_lock); 4426 mutex_init(&adev->virt.vf_errors.lock); 4427 hash_init(adev->mn_hash); 4428 mutex_init(&adev->psp.mutex); 4429 mutex_init(&adev->notifier_lock); 4430 mutex_init(&adev->pm.stable_pstate_ctx_lock); 4431 mutex_init(&adev->benchmark_mutex); 4432 mutex_init(&adev->gfx.reset_sem_mutex); 4433 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */ 4434 mutex_init(&adev->enforce_isolation_mutex); 4435 for (i = 0; i < MAX_XCP; ++i) { 4436 adev->isolation[i].spearhead = dma_fence_get_stub(); 4437 amdgpu_sync_create(&adev->isolation[i].active); 4438 amdgpu_sync_create(&adev->isolation[i].prev); 4439 } 4440 
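	/* Locks covering user queue scheduling and workload profile switches */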
	mutex_init(&adev->gfx.userq_sch_mutex);
	mutex_init(&adev->gfx.workload_profile_mutex);
	mutex_init(&adev->vcn.workload_profile_mutex);
	mutex_init(&adev->userq_mutex);

	amdgpu_device_init_apu_flags(adev);

	r = amdgpu_device_check_arguments(adev);
	if (r)
		return r;

	spin_lock_init(&adev->mmio_idx_lock);
	spin_lock_init(&adev->smc_idx_lock);
	spin_lock_init(&adev->pcie_idx_lock);
	spin_lock_init(&adev->uvd_ctx_idx_lock);
	spin_lock_init(&adev->didt_idx_lock);
	spin_lock_init(&adev->gc_cac_idx_lock);
	spin_lock_init(&adev->se_cac_idx_lock);
	spin_lock_init(&adev->audio_endpt_idx_lock);
	spin_lock_init(&adev->mm_stats.lock);
	spin_lock_init(&adev->virt.rlcg_reg_lock);
	spin_lock_init(&adev->wb.lock);

	xa_init_flags(&adev->userq_xa, XA_FLAGS_LOCK_IRQ);

	INIT_LIST_HEAD(&adev->reset_list);

	INIT_LIST_HEAD(&adev->ras_list);

	INIT_LIST_HEAD(&adev->pm.od_kobj_list);

	INIT_LIST_HEAD(&adev->userq_mgr_list);

	INIT_DELAYED_WORK(&adev->delayed_init_work,
			  amdgpu_device_delayed_init_work_handler);
	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
			  amdgpu_device_delay_enable_gfx_off);
	/*
	 * Initialize the enforce_isolation work structures for each XCP
	 * partition. This work handler is responsible for enforcing shader
	 * isolation on AMD GPUs. It counts the number of emitted fences for
	 * each GFX and compute ring. If there are any fences, it schedules
	 * the `enforce_isolation_work` to be run after a delay. If there are
	 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the
	 * runqueue.
	 */
	for (i = 0; i < MAX_XCP; i++) {
		INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work,
				  amdgpu_gfx_enforce_isolation_handler);
		adev->gfx.enforce_isolation[i].adev = adev;
		adev->gfx.enforce_isolation[i].xcp_id = i;
	}

	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);

	adev->gfx.gfx_off_req_count = 1;
	adev->gfx.gfx_off_residency = 0;
	adev->gfx.gfx_off_entrycount = 0;
	adev->pm.ac_power = power_supply_is_system_supplied() > 0;

	atomic_set(&adev->throttling_logging_enabled, 1);
	/*
	 * If throttling continues, logging will be performed every minute
	 * to avoid log flooding. "-1" is subtracted since the thermal
	 * throttling interrupt comes every second. Thus, the total logging
	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
	 * for the throttling interrupt) = 60 seconds.
	 */
	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);

	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);

	/* Registers mapping */
	/* TODO: block userspace mapping of io register */
	if (adev->asic_type >= CHIP_BONAIRE) {
		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
	} else {
		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
	}

	for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
		atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);

	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
	if (!adev->rmmio)
		return -ENOMEM;

	dev_info(adev->dev, "register mmio base: 0x%08X\n",
		 (uint32_t)adev->rmmio_base);
	dev_info(adev->dev, "register mmio size: %u\n",
		 (unsigned int)adev->rmmio_size);

	/*
	 * The reset domain needs to be present early, before the XGMI hive
	 * (if any) is discovered and initialized, so that the reset semaphore
	 * and in_gpu_reset flag can be used early during init and before the
	 * first call to RREG32.
	 */
	adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
	if (!adev->reset_domain)
		return -ENOMEM;

	/* detect hw virtualization here */
	amdgpu_virt_init(adev);

	amdgpu_device_get_pcie_info(adev);

	r = amdgpu_device_get_job_timeout_settings(adev);
	if (r) {
		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
		return r;
	}

	amdgpu_device_set_mcbp(adev);

	/*
	 * By default, use the default init level where all blocks are expected
	 * to be initialized. At present a 'swinit' of blocks is required to be
	 * completed before the need for a different level is detected.
	 */
	amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT);
	/* early init functions */
	r = amdgpu_device_ip_early_init(adev);
	if (r)
		return r;

	/*
	 * No need to remove conflicting FBs for non-display class devices.
	 * This prevents the sysfb from being freed accidentally.
	 */
	if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA ||
	    (pdev->class >> 8) == PCI_CLASS_DISPLAY_OTHER) {
		/* Get rid of things like offb */
		r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name);
		if (r)
			return r;
	}

	/* Enable TMZ based on IP_VERSION */
	amdgpu_gmc_tmz_set(adev);

	if (amdgpu_sriov_vf(adev) &&
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
		/* VF MMIO access (except mailbox range) from CPU
		 * will be blocked during sriov runtime
		 */
		adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;

	amdgpu_gmc_noretry_set(adev);
	/* Need to get xgmi info early to decide the reset behavior */
	if (adev->gmc.xgmi.supported) {
		r = adev->gfxhub.funcs->get_xgmi_info(adev);
		if (r)
			return r;
	}

	/* enable PCIE atomic ops */
	if (amdgpu_sriov_vf(adev)) {
		if (adev->virt.fw_reserve.p_pf2vf)
			adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
						      adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
				(PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
		/* APUs with gfx9 and newer don't rely on PCIe atomics; their internal
		 * path natively supports atomics, so set have_atomics_support to true.
4605 */ 4606 } else if ((adev->flags & AMD_IS_APU) && 4607 (amdgpu_ip_version(adev, GC_HWIP, 0) > 4608 IP_VERSION(9, 0, 0))) { 4609 adev->have_atomics_support = true; 4610 } else { 4611 adev->have_atomics_support = 4612 !pci_enable_atomic_ops_to_root(adev->pdev, 4613 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 4614 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4615 } 4616 4617 if (!adev->have_atomics_support) 4618 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 4619 4620 /* doorbell bar mapping and doorbell index init*/ 4621 amdgpu_doorbell_init(adev); 4622 4623 if (amdgpu_emu_mode == 1) { 4624 /* post the asic on emulation mode */ 4625 emu_soc_asic_init(adev); 4626 goto fence_driver_init; 4627 } 4628 4629 amdgpu_reset_init(adev); 4630 4631 /* detect if we are with an SRIOV vbios */ 4632 if (adev->bios) 4633 amdgpu_device_detect_sriov_bios(adev); 4634 4635 /* check if we need to reset the asic 4636 * E.g., driver was not cleanly unloaded previously, etc. 4637 */ 4638 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4639 if (adev->gmc.xgmi.num_physical_nodes) { 4640 dev_info(adev->dev, "Pending hive reset.\n"); 4641 amdgpu_set_init_level(adev, 4642 AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 4643 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && 4644 !amdgpu_device_has_display_hardware(adev)) { 4645 r = psp_gpu_reset(adev); 4646 } else { 4647 tmp = amdgpu_reset_method; 4648 /* It should do a default reset when loading or reloading the driver, 4649 * regardless of the module parameter reset_method. 4650 */ 4651 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4652 r = amdgpu_asic_reset(adev); 4653 amdgpu_reset_method = tmp; 4654 } 4655 4656 if (r) { 4657 dev_err(adev->dev, "asic reset on init failed\n"); 4658 goto failed; 4659 } 4660 } 4661 4662 /* Post card if necessary */ 4663 if (amdgpu_device_need_post(adev)) { 4664 if (!adev->bios) { 4665 dev_err(adev->dev, "no vBIOS found\n"); 4666 r = -EINVAL; 4667 goto failed; 4668 } 4669 dev_info(adev->dev, "GPU posting now...\n"); 4670 r = amdgpu_device_asic_init(adev); 4671 if (r) { 4672 dev_err(adev->dev, "gpu post error!\n"); 4673 goto failed; 4674 } 4675 } 4676 4677 if (adev->bios) { 4678 if (adev->is_atom_fw) { 4679 /* Initialize clocks */ 4680 r = amdgpu_atomfirmware_get_clock_info(adev); 4681 if (r) { 4682 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4683 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4684 goto failed; 4685 } 4686 } else { 4687 /* Initialize clocks */ 4688 r = amdgpu_atombios_get_clock_info(adev); 4689 if (r) { 4690 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4691 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4692 goto failed; 4693 } 4694 /* init i2c buses */ 4695 amdgpu_i2c_init(adev); 4696 } 4697 } 4698 4699 fence_driver_init: 4700 /* Fence driver */ 4701 r = amdgpu_fence_driver_sw_init(adev); 4702 if (r) { 4703 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4704 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4705 goto failed; 4706 } 4707 4708 /* init the mode config */ 4709 drm_mode_config_init(adev_to_drm(adev)); 4710 4711 r = amdgpu_device_ip_init(adev); 4712 if (r) { 4713 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4714 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4715 goto release_ras_con; 4716 } 4717 4718 amdgpu_fence_driver_hw_init(adev); 4719 4720 dev_info(adev->dev, 4721 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4722 
adev->gfx.config.max_shader_engines, 4723 adev->gfx.config.max_sh_per_se, 4724 adev->gfx.config.max_cu_per_sh, 4725 adev->gfx.cu_info.number); 4726 4727 adev->accel_working = true; 4728 4729 amdgpu_vm_check_compute_bug(adev); 4730 4731 /* Initialize the buffer migration limit. */ 4732 if (amdgpu_moverate >= 0) 4733 max_MBps = amdgpu_moverate; 4734 else 4735 max_MBps = 8; /* Allow 8 MB/s. */ 4736 /* Get a log2 for easy divisions. */ 4737 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4738 4739 /* 4740 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4741 * Otherwise the mgpu fan boost feature will be skipped due to the 4742 * gpu instance is counted less. 4743 */ 4744 amdgpu_register_gpu_instance(adev); 4745 4746 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4747 * explicit gating rather than handling it automatically. 4748 */ 4749 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 4750 r = amdgpu_device_ip_late_init(adev); 4751 if (r) { 4752 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4753 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4754 goto release_ras_con; 4755 } 4756 /* must succeed. */ 4757 amdgpu_ras_resume(adev); 4758 queue_delayed_work(system_wq, &adev->delayed_init_work, 4759 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4760 } 4761 4762 if (amdgpu_sriov_vf(adev)) { 4763 amdgpu_virt_release_full_gpu(adev, true); 4764 flush_delayed_work(&adev->delayed_init_work); 4765 } 4766 4767 /* 4768 * Place those sysfs registering after `late_init`. As some of those 4769 * operations performed in `late_init` might affect the sysfs 4770 * interfaces creating. 4771 */ 4772 r = amdgpu_atombios_sysfs_init(adev); 4773 if (r) 4774 drm_err(&adev->ddev, 4775 "registering atombios sysfs failed (%d).\n", r); 4776 4777 r = amdgpu_pm_sysfs_init(adev); 4778 if (r) 4779 dev_err(adev->dev, "registering pm sysfs failed (%d).\n", r); 4780 4781 r = amdgpu_ucode_sysfs_init(adev); 4782 if (r) { 4783 adev->ucode_sysfs_en = false; 4784 dev_err(adev->dev, "Creating firmware sysfs failed (%d).\n", r); 4785 } else 4786 adev->ucode_sysfs_en = true; 4787 4788 r = amdgpu_device_attr_sysfs_init(adev); 4789 if (r) 4790 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4791 4792 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4793 if (r) 4794 dev_err(adev->dev, 4795 "Could not create amdgpu board attributes\n"); 4796 4797 amdgpu_fru_sysfs_init(adev); 4798 amdgpu_reg_state_sysfs_init(adev); 4799 amdgpu_xcp_sysfs_init(adev); 4800 4801 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4802 r = amdgpu_pmu_init(adev); 4803 if (r) 4804 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4805 4806 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4807 if (amdgpu_device_cache_pci_state(adev->pdev)) 4808 pci_restore_state(pdev); 4809 4810 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4811 /* this will fail for cards that aren't VGA class devices, just 4812 * ignore it 4813 */ 4814 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4815 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4816 4817 px = amdgpu_device_supports_px(ddev); 4818 4819 if (px || (!dev_is_removable(&adev->pdev->dev) && 4820 apple_gmux_detect(NULL, NULL))) 4821 vga_switcheroo_register_client(adev->pdev, 4822 &amdgpu_switcheroo_ops, px); 4823 4824 if (px) 4825 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4826 4827 if (adev->init_lvl->level == 
AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 4828 amdgpu_xgmi_reset_on_init(adev); 4829 4830 amdgpu_device_check_iommu_direct_map(adev); 4831 4832 adev->pm_nb.notifier_call = amdgpu_device_pm_notifier; 4833 r = register_pm_notifier(&adev->pm_nb); 4834 if (r) 4835 goto failed; 4836 4837 return 0; 4838 4839 release_ras_con: 4840 if (amdgpu_sriov_vf(adev)) 4841 amdgpu_virt_release_full_gpu(adev, true); 4842 4843 /* failed in exclusive mode due to timeout */ 4844 if (amdgpu_sriov_vf(adev) && 4845 !amdgpu_sriov_runtime(adev) && 4846 amdgpu_virt_mmio_blocked(adev) && 4847 !amdgpu_virt_wait_reset(adev)) { 4848 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4849 /* Don't send request since VF is inactive. */ 4850 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4851 adev->virt.ops = NULL; 4852 r = -EAGAIN; 4853 } 4854 amdgpu_release_ras_context(adev); 4855 4856 failed: 4857 amdgpu_vf_error_trans_all(adev); 4858 4859 return r; 4860 } 4861 4862 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4863 { 4864 4865 /* Clear all CPU mappings pointing to this device */ 4866 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4867 4868 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4869 amdgpu_doorbell_fini(adev); 4870 4871 iounmap(adev->rmmio); 4872 adev->rmmio = NULL; 4873 if (adev->mman.aper_base_kaddr) 4874 iounmap(adev->mman.aper_base_kaddr); 4875 adev->mman.aper_base_kaddr = NULL; 4876 4877 /* Memory manager related */ 4878 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4879 arch_phys_wc_del(adev->gmc.vram_mtrr); 4880 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4881 } 4882 } 4883 4884 /** 4885 * amdgpu_device_fini_hw - tear down the driver 4886 * 4887 * @adev: amdgpu_device pointer 4888 * 4889 * Tear down the driver info (all asics). 4890 * Called at driver shutdown. 
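 *
 * Illustrative note: teardown is split between this function and
 * amdgpu_device_fini_sw() below, and callers are expected to run them in
 * that order, e.g.
 *
 *   amdgpu_device_fini_hw(adev);
 *   ...
 *   amdgpu_device_fini_sw(adev);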
4891 */ 4892 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4893 { 4894 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4895 flush_delayed_work(&adev->delayed_init_work); 4896 4897 if (adev->mman.initialized) 4898 drain_workqueue(adev->mman.bdev.wq); 4899 adev->shutdown = true; 4900 4901 unregister_pm_notifier(&adev->pm_nb); 4902 4903 /* make sure IB test finished before entering exclusive mode 4904 * to avoid preemption on IB test 4905 */ 4906 if (amdgpu_sriov_vf(adev)) { 4907 amdgpu_virt_request_full_gpu(adev, false); 4908 amdgpu_virt_fini_data_exchange(adev); 4909 } 4910 4911 /* disable all interrupts */ 4912 amdgpu_irq_disable_all(adev); 4913 if (adev->mode_info.mode_config_initialized) { 4914 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4915 drm_helper_force_disable_all(adev_to_drm(adev)); 4916 else 4917 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4918 } 4919 amdgpu_fence_driver_hw_fini(adev); 4920 4921 if (adev->pm.sysfs_initialized) 4922 amdgpu_pm_sysfs_fini(adev); 4923 if (adev->ucode_sysfs_en) 4924 amdgpu_ucode_sysfs_fini(adev); 4925 amdgpu_device_attr_sysfs_fini(adev); 4926 amdgpu_fru_sysfs_fini(adev); 4927 4928 amdgpu_reg_state_sysfs_fini(adev); 4929 amdgpu_xcp_sysfs_fini(adev); 4930 4931 /* disable ras feature must before hw fini */ 4932 amdgpu_ras_pre_fini(adev); 4933 4934 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4935 4936 amdgpu_device_ip_fini_early(adev); 4937 4938 amdgpu_irq_fini_hw(adev); 4939 4940 if (adev->mman.initialized) 4941 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4942 4943 amdgpu_gart_dummy_page_fini(adev); 4944 4945 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4946 amdgpu_device_unmap_mmio(adev); 4947 4948 } 4949 4950 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4951 { 4952 int i, idx; 4953 bool px; 4954 4955 amdgpu_device_ip_fini(adev); 4956 amdgpu_fence_driver_sw_fini(adev); 4957 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4958 adev->accel_working = false; 4959 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4960 for (i = 0; i < MAX_XCP; ++i) { 4961 dma_fence_put(adev->isolation[i].spearhead); 4962 amdgpu_sync_free(&adev->isolation[i].active); 4963 amdgpu_sync_free(&adev->isolation[i].prev); 4964 } 4965 4966 amdgpu_reset_fini(adev); 4967 4968 /* free i2c buses */ 4969 amdgpu_i2c_fini(adev); 4970 4971 if (adev->bios) { 4972 if (amdgpu_emu_mode != 1) 4973 amdgpu_atombios_fini(adev); 4974 amdgpu_bios_release(adev); 4975 } 4976 4977 kfree(adev->fru_info); 4978 adev->fru_info = NULL; 4979 4980 kfree(adev->xcp_mgr); 4981 adev->xcp_mgr = NULL; 4982 4983 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4984 4985 if (px || (!dev_is_removable(&adev->pdev->dev) && 4986 apple_gmux_detect(NULL, NULL))) 4987 vga_switcheroo_unregister_client(adev->pdev); 4988 4989 if (px) 4990 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4991 4992 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4993 vga_client_unregister(adev->pdev); 4994 4995 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4996 4997 iounmap(adev->rmmio); 4998 adev->rmmio = NULL; 4999 drm_dev_exit(idx); 5000 } 5001 5002 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 5003 amdgpu_pmu_fini(adev); 5004 if (adev->mman.discovery_bin) 5005 amdgpu_discovery_fini(adev); 5006 5007 amdgpu_reset_put_reset_domain(adev->reset_domain); 5008 adev->reset_domain = NULL; 5009 5010 kfree(adev->pci_state); 5011 5012 } 5013 5014 /** 5015 * amdgpu_device_evict_resources - evict device resources 5016 * @adev: amdgpu device object 5017 * 5018 * Evicts all ttm device resources(vram 
BOs, gart table) from the lru list 5019 * of the vram memory type. Mainly used for evicting device resources 5020 * at suspend time. 5021 * 5022 */ 5023 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 5024 { 5025 int ret; 5026 5027 /* No need to evict vram on APUs unless going to S4 */ 5028 if (!adev->in_s4 && (adev->flags & AMD_IS_APU)) 5029 return 0; 5030 5031 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 5032 if (ret) 5033 dev_warn(adev->dev, "evicting device resources failed\n"); 5034 return ret; 5035 } 5036 5037 /* 5038 * Suspend & resume. 5039 */ 5040 /** 5041 * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events 5042 * @nb: notifier block 5043 * @mode: suspend mode 5044 * @data: data 5045 * 5046 * This function is called when the system is about to suspend or hibernate. 5047 * It is used to set the appropriate flags so that eviction can be optimized 5048 * in the pm prepare callback. 5049 */ 5050 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 5051 void *data) 5052 { 5053 struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb); 5054 5055 switch (mode) { 5056 case PM_HIBERNATION_PREPARE: 5057 adev->in_s4 = true; 5058 break; 5059 case PM_POST_HIBERNATION: 5060 adev->in_s4 = false; 5061 break; 5062 } 5063 5064 return NOTIFY_DONE; 5065 } 5066 5067 /** 5068 * amdgpu_device_prepare - prepare for device suspend 5069 * 5070 * @dev: drm dev pointer 5071 * 5072 * Prepare to put the hw in the suspend state (all asics). 5073 * Returns 0 for success or an error on failure. 5074 * Called at driver suspend. 5075 */ 5076 int amdgpu_device_prepare(struct drm_device *dev) 5077 { 5078 struct amdgpu_device *adev = drm_to_adev(dev); 5079 int i, r; 5080 5081 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5082 return 0; 5083 5084 /* Evict the majority of BOs before starting suspend sequence */ 5085 r = amdgpu_device_evict_resources(adev); 5086 if (r) 5087 return r; 5088 5089 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 5090 5091 for (i = 0; i < adev->num_ip_blocks; i++) { 5092 if (!adev->ip_blocks[i].status.valid) 5093 continue; 5094 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 5095 continue; 5096 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]); 5097 if (r) 5098 return r; 5099 } 5100 5101 return 0; 5102 } 5103 5104 /** 5105 * amdgpu_device_complete - complete power state transition 5106 * 5107 * @dev: drm dev pointer 5108 * 5109 * Undo the changes from amdgpu_device_prepare. This will be 5110 * called on all resume transitions, including those that failed. 5111 */ 5112 void amdgpu_device_complete(struct drm_device *dev) 5113 { 5114 struct amdgpu_device *adev = drm_to_adev(dev); 5115 int i; 5116 5117 for (i = 0; i < adev->num_ip_blocks; i++) { 5118 if (!adev->ip_blocks[i].status.valid) 5119 continue; 5120 if (!adev->ip_blocks[i].version->funcs->complete) 5121 continue; 5122 adev->ip_blocks[i].version->funcs->complete(&adev->ip_blocks[i]); 5123 } 5124 } 5125 5126 /** 5127 * amdgpu_device_suspend - initiate device suspend 5128 * 5129 * @dev: drm dev pointer 5130 * @notify_clients: notify in-kernel DRM clients 5131 * 5132 * Puts the hw in the suspend state (all asics). 5133 * Returns 0 for success or an error on failure. 5134 * Called at driver suspend. 
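 *
 * Illustrative sketch only (the PM callbacks live outside this file): a
 * suspend transition is expected to pair amdgpu_device_prepare() with this
 * function, roughly as
 *
 *   r = amdgpu_device_prepare(drm_dev);
 *   if (!r)
 *           r = amdgpu_device_suspend(drm_dev, true);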
5135 */ 5136 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) 5137 { 5138 struct amdgpu_device *adev = drm_to_adev(dev); 5139 int r = 0; 5140 5141 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5142 return 0; 5143 5144 adev->in_suspend = true; 5145 5146 if (amdgpu_sriov_vf(adev)) { 5147 if (!adev->in_s0ix && !adev->in_runpm) 5148 amdgpu_amdkfd_suspend_process(adev); 5149 amdgpu_virt_fini_data_exchange(adev); 5150 r = amdgpu_virt_request_full_gpu(adev, false); 5151 if (r) 5152 return r; 5153 } 5154 5155 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 5156 dev_warn(adev->dev, "smart shift update failed\n"); 5157 5158 if (notify_clients) 5159 drm_client_dev_suspend(adev_to_drm(adev), false); 5160 5161 cancel_delayed_work_sync(&adev->delayed_init_work); 5162 5163 amdgpu_ras_suspend(adev); 5164 5165 amdgpu_device_ip_suspend_phase1(adev); 5166 5167 if (!adev->in_s0ix) { 5168 amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); 5169 amdgpu_userq_suspend(adev); 5170 } 5171 5172 r = amdgpu_device_evict_resources(adev); 5173 if (r) 5174 return r; 5175 5176 amdgpu_ttm_set_buffer_funcs_status(adev, false); 5177 5178 amdgpu_fence_driver_hw_fini(adev); 5179 5180 amdgpu_device_ip_suspend_phase2(adev); 5181 5182 if (amdgpu_sriov_vf(adev)) 5183 amdgpu_virt_release_full_gpu(adev, false); 5184 5185 r = amdgpu_dpm_notify_rlc_state(adev, false); 5186 if (r) 5187 return r; 5188 5189 return 0; 5190 } 5191 5192 static inline int amdgpu_virt_resume(struct amdgpu_device *adev) 5193 { 5194 int r; 5195 unsigned int prev_physical_node_id = adev->gmc.xgmi.physical_node_id; 5196 5197 /* During VM resume, QEMU programming of VF MSIX table (register GFXMSIX_VECT0_ADDR_LO) 5198 * may not work. The access could be blocked by nBIF protection as VF isn't in 5199 * exclusive access mode. Exclusive access is enabled now, disable/enable MSIX 5200 * so that QEMU reprograms MSIX table. 5201 */ 5202 amdgpu_restore_msix(adev); 5203 5204 r = adev->gfxhub.funcs->get_xgmi_info(adev); 5205 if (r) 5206 return r; 5207 5208 dev_info(adev->dev, "xgmi node, old id %d, new id %d\n", 5209 prev_physical_node_id, adev->gmc.xgmi.physical_node_id); 5210 5211 adev->vm_manager.vram_base_offset = adev->gfxhub.funcs->get_mc_fb_offset(adev); 5212 adev->vm_manager.vram_base_offset += 5213 adev->gmc.xgmi.physical_node_id * adev->gmc.xgmi.node_segment_size; 5214 5215 return 0; 5216 } 5217 5218 /** 5219 * amdgpu_device_resume - initiate device resume 5220 * 5221 * @dev: drm dev pointer 5222 * @notify_clients: notify in-kernel DRM clients 5223 * 5224 * Bring the hw back to operating state (all asics). 5225 * Returns 0 for success or an error on failure. 5226 * Called at driver resume. 
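 *
 * For illustration only: @notify_clients selects whether in-kernel DRM
 * clients (e.g. the fbdev emulation) are resumed as well, so a typical
 * system resume path would call
 *
 *   r = amdgpu_device_resume(drm_dev, true);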
5227 */ 5228 int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) 5229 { 5230 struct amdgpu_device *adev = drm_to_adev(dev); 5231 int r = 0; 5232 5233 if (amdgpu_sriov_vf(adev)) { 5234 r = amdgpu_virt_request_full_gpu(adev, true); 5235 if (r) 5236 return r; 5237 } 5238 5239 if (amdgpu_virt_xgmi_migrate_enabled(adev)) { 5240 r = amdgpu_virt_resume(adev); 5241 if (r) 5242 goto exit; 5243 } 5244 5245 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5246 return 0; 5247 5248 if (adev->in_s0ix) 5249 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 5250 5251 /* post card */ 5252 if (amdgpu_device_need_post(adev)) { 5253 r = amdgpu_device_asic_init(adev); 5254 if (r) 5255 dev_err(adev->dev, "amdgpu asic init failed\n"); 5256 } 5257 5258 r = amdgpu_device_ip_resume(adev); 5259 5260 if (r) { 5261 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 5262 goto exit; 5263 } 5264 5265 if (!adev->in_s0ix) { 5266 r = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); 5267 if (r) 5268 goto exit; 5269 5270 r = amdgpu_userq_resume(adev); 5271 if (r) 5272 goto exit; 5273 } 5274 5275 r = amdgpu_device_ip_late_init(adev); 5276 if (r) 5277 goto exit; 5278 5279 queue_delayed_work(system_wq, &adev->delayed_init_work, 5280 msecs_to_jiffies(AMDGPU_RESUME_MS)); 5281 exit: 5282 if (amdgpu_sriov_vf(adev)) { 5283 amdgpu_virt_init_data_exchange(adev); 5284 amdgpu_virt_release_full_gpu(adev, true); 5285 5286 if (!adev->in_s0ix && !r && !adev->in_runpm) 5287 r = amdgpu_amdkfd_resume_process(adev); 5288 } 5289 5290 if (r) 5291 return r; 5292 5293 /* Make sure IB tests flushed */ 5294 flush_delayed_work(&adev->delayed_init_work); 5295 5296 if (notify_clients) 5297 drm_client_dev_resume(adev_to_drm(adev), false); 5298 5299 amdgpu_ras_resume(adev); 5300 5301 if (adev->mode_info.num_crtc) { 5302 /* 5303 * Most of the connector probing functions try to acquire runtime pm 5304 * refs to ensure that the GPU is powered on when connector polling is 5305 * performed. Since we're calling this from a runtime PM callback, 5306 * trying to acquire rpm refs will cause us to deadlock. 5307 * 5308 * Since we're guaranteed to be holding the rpm lock, it's safe to 5309 * temporarily disable the rpm helpers so this doesn't deadlock us. 5310 */ 5311 #ifdef CONFIG_PM 5312 dev->dev->power.disable_depth++; 5313 #endif 5314 if (!adev->dc_enabled) 5315 drm_helper_hpd_irq_event(dev); 5316 else 5317 drm_kms_helper_hotplug_event(dev); 5318 #ifdef CONFIG_PM 5319 dev->dev->power.disable_depth--; 5320 #endif 5321 } 5322 adev->in_suspend = false; 5323 5324 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 5325 dev_warn(adev->dev, "smart shift update failed\n"); 5326 5327 return 0; 5328 } 5329 5330 /** 5331 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 5332 * 5333 * @adev: amdgpu_device pointer 5334 * 5335 * The list of all the hardware IPs that make up the asic is walked and 5336 * the check_soft_reset callbacks are run. check_soft_reset determines 5337 * if the asic is still hung or not. 5338 * Returns true if any of the IPs are still in a hung state, false if not. 
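 *
 * Illustrative sketch only: the soft-reset helpers below are sequenced by
 * amdgpu_device_pre_asic_reset() roughly as
 *
 *   if (amdgpu_device_ip_check_soft_reset(adev)) {
 *           amdgpu_device_ip_pre_soft_reset(adev);
 *           amdgpu_device_ip_soft_reset(adev);
 *           amdgpu_device_ip_post_soft_reset(adev);
 *   }
 *
 * falling back to a full reset if the ASIC is still hung afterwards.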
5339 */ 5340 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 5341 { 5342 int i; 5343 bool asic_hang = false; 5344 5345 if (amdgpu_sriov_vf(adev)) 5346 return true; 5347 5348 if (amdgpu_asic_need_full_reset(adev)) 5349 return true; 5350 5351 for (i = 0; i < adev->num_ip_blocks; i++) { 5352 if (!adev->ip_blocks[i].status.valid) 5353 continue; 5354 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 5355 adev->ip_blocks[i].status.hang = 5356 adev->ip_blocks[i].version->funcs->check_soft_reset( 5357 &adev->ip_blocks[i]); 5358 if (adev->ip_blocks[i].status.hang) { 5359 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 5360 asic_hang = true; 5361 } 5362 } 5363 return asic_hang; 5364 } 5365 5366 /** 5367 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 5368 * 5369 * @adev: amdgpu_device pointer 5370 * 5371 * The list of all the hardware IPs that make up the asic is walked and the 5372 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 5373 * handles any IP specific hardware or software state changes that are 5374 * necessary for a soft reset to succeed. 5375 * Returns 0 on success, negative error code on failure. 5376 */ 5377 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 5378 { 5379 int i, r = 0; 5380 5381 for (i = 0; i < adev->num_ip_blocks; i++) { 5382 if (!adev->ip_blocks[i].status.valid) 5383 continue; 5384 if (adev->ip_blocks[i].status.hang && 5385 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 5386 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]); 5387 if (r) 5388 return r; 5389 } 5390 } 5391 5392 return 0; 5393 } 5394 5395 /** 5396 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 5397 * 5398 * @adev: amdgpu_device pointer 5399 * 5400 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 5401 * reset is necessary to recover. 5402 * Returns true if a full asic reset is required, false if not. 5403 */ 5404 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 5405 { 5406 int i; 5407 5408 if (amdgpu_asic_need_full_reset(adev)) 5409 return true; 5410 5411 for (i = 0; i < adev->num_ip_blocks; i++) { 5412 if (!adev->ip_blocks[i].status.valid) 5413 continue; 5414 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 5415 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 5416 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 5417 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 5418 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 5419 if (adev->ip_blocks[i].status.hang) { 5420 dev_info(adev->dev, "Some block need full reset!\n"); 5421 return true; 5422 } 5423 } 5424 } 5425 return false; 5426 } 5427 5428 /** 5429 * amdgpu_device_ip_soft_reset - do a soft reset 5430 * 5431 * @adev: amdgpu_device pointer 5432 * 5433 * The list of all the hardware IPs that make up the asic is walked and the 5434 * soft_reset callbacks are run if the block is hung. soft_reset handles any 5435 * IP specific hardware or software state changes that are necessary to soft 5436 * reset the IP. 5437 * Returns 0 on success, negative error code on failure. 
5438 */ 5439 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 5440 { 5441 int i, r = 0; 5442 5443 for (i = 0; i < adev->num_ip_blocks; i++) { 5444 if (!adev->ip_blocks[i].status.valid) 5445 continue; 5446 if (adev->ip_blocks[i].status.hang && 5447 adev->ip_blocks[i].version->funcs->soft_reset) { 5448 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]); 5449 if (r) 5450 return r; 5451 } 5452 } 5453 5454 return 0; 5455 } 5456 5457 /** 5458 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 5459 * 5460 * @adev: amdgpu_device pointer 5461 * 5462 * The list of all the hardware IPs that make up the asic is walked and the 5463 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 5464 * handles any IP specific hardware or software state changes that are 5465 * necessary after the IP has been soft reset. 5466 * Returns 0 on success, negative error code on failure. 5467 */ 5468 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 5469 { 5470 int i, r = 0; 5471 5472 for (i = 0; i < adev->num_ip_blocks; i++) { 5473 if (!adev->ip_blocks[i].status.valid) 5474 continue; 5475 if (adev->ip_blocks[i].status.hang && 5476 adev->ip_blocks[i].version->funcs->post_soft_reset) 5477 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]); 5478 if (r) 5479 return r; 5480 } 5481 5482 return 0; 5483 } 5484 5485 /** 5486 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 5487 * 5488 * @adev: amdgpu_device pointer 5489 * @reset_context: amdgpu reset context pointer 5490 * 5491 * do VF FLR and reinitialize Asic 5492 * return 0 means succeeded otherwise failed 5493 */ 5494 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 5495 struct amdgpu_reset_context *reset_context) 5496 { 5497 int r; 5498 struct amdgpu_hive_info *hive = NULL; 5499 5500 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { 5501 if (!amdgpu_ras_get_fed_status(adev)) 5502 amdgpu_virt_ready_to_reset(adev); 5503 amdgpu_virt_wait_reset(adev); 5504 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5505 r = amdgpu_virt_request_full_gpu(adev, true); 5506 } else { 5507 r = amdgpu_virt_reset_gpu(adev); 5508 } 5509 if (r) 5510 return r; 5511 5512 amdgpu_ras_clear_err_state(adev); 5513 amdgpu_irq_gpu_reset_resume_helper(adev); 5514 5515 /* some sw clean up VF needs to do before recover */ 5516 amdgpu_virt_post_reset(adev); 5517 5518 /* Resume IP prior to SMC */ 5519 r = amdgpu_device_ip_reinit_early_sriov(adev); 5520 if (r) 5521 return r; 5522 5523 amdgpu_virt_init_data_exchange(adev); 5524 5525 r = amdgpu_device_fw_loading(adev); 5526 if (r) 5527 return r; 5528 5529 /* now we are okay to resume SMC/CP/SDMA */ 5530 r = amdgpu_device_ip_reinit_late_sriov(adev); 5531 if (r) 5532 return r; 5533 5534 hive = amdgpu_get_xgmi_hive(adev); 5535 /* Update PSP FW topology after reset */ 5536 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5537 r = amdgpu_xgmi_update_topology(hive, adev); 5538 if (hive) 5539 amdgpu_put_xgmi_hive(hive); 5540 if (r) 5541 return r; 5542 5543 r = amdgpu_ib_ring_tests(adev); 5544 if (r) 5545 return r; 5546 5547 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) 5548 amdgpu_inc_vram_lost(adev); 5549 5550 /* need to be called during full access so we can't do it later like 5551 * bare-metal does. 
5552 */ 5553 amdgpu_amdkfd_post_reset(adev); 5554 amdgpu_virt_release_full_gpu(adev, true); 5555 5556 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5557 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) || 5558 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 5559 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 5560 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) || 5561 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5562 amdgpu_ras_resume(adev); 5563 5564 amdgpu_virt_ras_telemetry_post_reset(adev); 5565 5566 return 0; 5567 } 5568 5569 /** 5570 * amdgpu_device_has_job_running - check if there is any unfinished job 5571 * 5572 * @adev: amdgpu_device pointer 5573 * 5574 * check if there is any job running on the device when guest driver receives 5575 * FLR notification from host driver. If there are still jobs running, then 5576 * the guest driver will not respond the FLR reset. Instead, let the job hit 5577 * the timeout and guest driver then issue the reset request. 5578 */ 5579 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5580 { 5581 int i; 5582 5583 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5584 struct amdgpu_ring *ring = adev->rings[i]; 5585 5586 if (!amdgpu_ring_sched_ready(ring)) 5587 continue; 5588 5589 if (amdgpu_fence_count_emitted(ring)) 5590 return true; 5591 } 5592 return false; 5593 } 5594 5595 /** 5596 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5597 * 5598 * @adev: amdgpu_device pointer 5599 * 5600 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5601 * a hung GPU. 5602 */ 5603 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5604 { 5605 5606 if (amdgpu_gpu_recovery == 0) 5607 goto disabled; 5608 5609 /* Skip soft reset check in fatal error mode */ 5610 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5611 return true; 5612 5613 if (amdgpu_sriov_vf(adev)) 5614 return true; 5615 5616 if (amdgpu_gpu_recovery == -1) { 5617 switch (adev->asic_type) { 5618 #ifdef CONFIG_DRM_AMDGPU_SI 5619 case CHIP_VERDE: 5620 case CHIP_TAHITI: 5621 case CHIP_PITCAIRN: 5622 case CHIP_OLAND: 5623 case CHIP_HAINAN: 5624 #endif 5625 #ifdef CONFIG_DRM_AMDGPU_CIK 5626 case CHIP_KAVERI: 5627 case CHIP_KABINI: 5628 case CHIP_MULLINS: 5629 #endif 5630 case CHIP_CARRIZO: 5631 case CHIP_STONEY: 5632 case CHIP_CYAN_SKILLFISH: 5633 goto disabled; 5634 default: 5635 break; 5636 } 5637 } 5638 5639 return true; 5640 5641 disabled: 5642 dev_info(adev->dev, "GPU recovery disabled.\n"); 5643 return false; 5644 } 5645 5646 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5647 { 5648 u32 i; 5649 int ret = 0; 5650 5651 if (adev->bios) 5652 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5653 5654 dev_info(adev->dev, "GPU mode1 reset\n"); 5655 5656 /* Cache the state before bus master disable. The saved config space 5657 * values are used in other cases like restore after mode-2 reset. 
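	 * The cached state is restored below via amdgpu_device_load_pci_state()
	 * once the reset has been issued.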
5658 */ 5659 amdgpu_device_cache_pci_state(adev->pdev); 5660 5661 /* disable BM */ 5662 pci_clear_master(adev->pdev); 5663 5664 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5665 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5666 ret = amdgpu_dpm_mode1_reset(adev); 5667 } else { 5668 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5669 ret = psp_gpu_reset(adev); 5670 } 5671 5672 if (ret) 5673 goto mode1_reset_failed; 5674 5675 amdgpu_device_load_pci_state(adev->pdev); 5676 ret = amdgpu_psp_wait_for_bootloader(adev); 5677 if (ret) 5678 goto mode1_reset_failed; 5679 5680 /* wait for asic to come out of reset */ 5681 for (i = 0; i < adev->usec_timeout; i++) { 5682 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5683 5684 if (memsize != 0xffffffff) 5685 break; 5686 udelay(1); 5687 } 5688 5689 if (i >= adev->usec_timeout) { 5690 ret = -ETIMEDOUT; 5691 goto mode1_reset_failed; 5692 } 5693 5694 if (adev->bios) 5695 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5696 5697 return 0; 5698 5699 mode1_reset_failed: 5700 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5701 return ret; 5702 } 5703 5704 int amdgpu_device_link_reset(struct amdgpu_device *adev) 5705 { 5706 int ret = 0; 5707 5708 dev_info(adev->dev, "GPU link reset\n"); 5709 5710 if (!adev->pcie_reset_ctx.occurs_dpc) 5711 ret = amdgpu_dpm_link_reset(adev); 5712 5713 if (ret) 5714 goto link_reset_failed; 5715 5716 ret = amdgpu_psp_wait_for_bootloader(adev); 5717 if (ret) 5718 goto link_reset_failed; 5719 5720 return 0; 5721 5722 link_reset_failed: 5723 dev_err(adev->dev, "GPU link reset failed\n"); 5724 return ret; 5725 } 5726 5727 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5728 struct amdgpu_reset_context *reset_context) 5729 { 5730 int i, r = 0; 5731 struct amdgpu_job *job = NULL; 5732 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; 5733 bool need_full_reset = 5734 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5735 5736 if (reset_context->reset_req_dev == adev) 5737 job = reset_context->job; 5738 5739 if (amdgpu_sriov_vf(adev)) 5740 amdgpu_virt_pre_reset(adev); 5741 5742 amdgpu_fence_driver_isr_toggle(adev, true); 5743 5744 /* block all schedulers and reset given job's ring */ 5745 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5746 struct amdgpu_ring *ring = adev->rings[i]; 5747 5748 if (!amdgpu_ring_sched_ready(ring)) 5749 continue; 5750 5751 /* Clear job fence from fence drv to avoid force_completion 5752 * leave NULL and vm flush fence in fence drv 5753 */ 5754 amdgpu_fence_driver_clear_job_fences(ring); 5755 5756 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5757 amdgpu_fence_driver_force_completion(ring); 5758 } 5759 5760 amdgpu_fence_driver_isr_toggle(adev, false); 5761 5762 if (job && job->vm) 5763 drm_sched_increase_karma(&job->base); 5764 5765 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5766 /* If reset handler not implemented, continue; otherwise return */ 5767 if (r == -EOPNOTSUPP) 5768 r = 0; 5769 else 5770 return r; 5771 5772 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5773 if (!amdgpu_sriov_vf(adev)) { 5774 5775 if (!need_full_reset) 5776 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5777 5778 if (!need_full_reset && amdgpu_gpu_recovery && 5779 amdgpu_device_ip_check_soft_reset(adev)) { 5780 amdgpu_device_ip_pre_soft_reset(adev); 5781 r = amdgpu_device_ip_soft_reset(adev); 5782 amdgpu_device_ip_post_soft_reset(adev); 5783 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5784 
dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5785 need_full_reset = true; 5786 } 5787 } 5788 5789 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { 5790 dev_info(tmp_adev->dev, "Dumping IP State\n"); 5791 /* Trigger ip dump before we reset the asic */ 5792 for (i = 0; i < tmp_adev->num_ip_blocks; i++) 5793 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) 5794 tmp_adev->ip_blocks[i].version->funcs 5795 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]); 5796 dev_info(tmp_adev->dev, "Dumping IP State Completed\n"); 5797 } 5798 5799 if (need_full_reset) 5800 r = amdgpu_device_ip_suspend(adev); 5801 if (need_full_reset) 5802 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5803 else 5804 clear_bit(AMDGPU_NEED_FULL_RESET, 5805 &reset_context->flags); 5806 } 5807 5808 return r; 5809 } 5810 5811 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) 5812 { 5813 struct list_head *device_list_handle; 5814 bool full_reset, vram_lost = false; 5815 struct amdgpu_device *tmp_adev; 5816 int r, init_level; 5817 5818 device_list_handle = reset_context->reset_device_list; 5819 5820 if (!device_list_handle) 5821 return -EINVAL; 5822 5823 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5824 5825 /** 5826 * If it's reset on init, it's default init level, otherwise keep level 5827 * as recovery level. 5828 */ 5829 if (reset_context->method == AMD_RESET_METHOD_ON_INIT) 5830 init_level = AMDGPU_INIT_LEVEL_DEFAULT; 5831 else 5832 init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY; 5833 5834 r = 0; 5835 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5836 amdgpu_set_init_level(tmp_adev, init_level); 5837 if (full_reset) { 5838 /* post card */ 5839 amdgpu_ras_clear_err_state(tmp_adev); 5840 r = amdgpu_device_asic_init(tmp_adev); 5841 if (r) { 5842 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5843 } else { 5844 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5845 5846 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5847 if (r) 5848 goto out; 5849 5850 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5851 5852 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 5853 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job); 5854 5855 if (vram_lost) { 5856 dev_info( 5857 tmp_adev->dev, 5858 "VRAM is lost due to GPU reset!\n"); 5859 amdgpu_inc_vram_lost(tmp_adev); 5860 } 5861 5862 r = amdgpu_device_fw_loading(tmp_adev); 5863 if (r) 5864 return r; 5865 5866 r = amdgpu_xcp_restore_partition_mode( 5867 tmp_adev->xcp_mgr); 5868 if (r) 5869 goto out; 5870 5871 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5872 if (r) 5873 goto out; 5874 5875 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 5876 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5877 5878 r = amdgpu_device_ip_resume_phase3(tmp_adev); 5879 if (r) 5880 goto out; 5881 5882 if (vram_lost) 5883 amdgpu_device_fill_reset_magic(tmp_adev); 5884 5885 /* 5886 * Add this ASIC as tracked as reset was already 5887 * complete successfully. 5888 */ 5889 amdgpu_register_gpu_instance(tmp_adev); 5890 5891 if (!reset_context->hive && 5892 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5893 amdgpu_xgmi_add_device(tmp_adev); 5894 5895 r = amdgpu_device_ip_late_init(tmp_adev); 5896 if (r) 5897 goto out; 5898 5899 drm_client_dev_resume(adev_to_drm(tmp_adev), false); 5900 5901 /* 5902 * The GPU enters bad state once faulty pages 5903 * by ECC has reached the threshold, and ras 5904 * recovery is scheduled next. 
So add one check 5905 * here to break recovery if it indeed exceeds 5906 * bad page threshold, and remind user to 5907 * retire this GPU or setting one bigger 5908 * bad_page_threshold value to fix this once 5909 * probing driver again. 5910 */ 5911 if (!amdgpu_ras_is_rma(tmp_adev)) { 5912 /* must succeed. */ 5913 amdgpu_ras_resume(tmp_adev); 5914 } else { 5915 r = -EINVAL; 5916 goto out; 5917 } 5918 5919 /* Update PSP FW topology after reset */ 5920 if (reset_context->hive && 5921 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5922 r = amdgpu_xgmi_update_topology( 5923 reset_context->hive, tmp_adev); 5924 } 5925 } 5926 5927 out: 5928 if (!r) { 5929 /* IP init is complete now, set level as default */ 5930 amdgpu_set_init_level(tmp_adev, 5931 AMDGPU_INIT_LEVEL_DEFAULT); 5932 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5933 r = amdgpu_ib_ring_tests(tmp_adev); 5934 if (r) { 5935 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5936 r = -EAGAIN; 5937 goto end; 5938 } 5939 } 5940 5941 if (r) 5942 tmp_adev->asic_reset_res = r; 5943 } 5944 5945 end: 5946 return r; 5947 } 5948 5949 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5950 struct amdgpu_reset_context *reset_context) 5951 { 5952 struct amdgpu_device *tmp_adev = NULL; 5953 bool need_full_reset, skip_hw_reset; 5954 int r = 0; 5955 5956 /* Try reset handler method first */ 5957 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5958 reset_list); 5959 5960 reset_context->reset_device_list = device_list_handle; 5961 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5962 /* If reset handler not implemented, continue; otherwise return */ 5963 if (r == -EOPNOTSUPP) 5964 r = 0; 5965 else 5966 return r; 5967 5968 /* Reset handler not implemented, use the default method */ 5969 need_full_reset = 5970 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5971 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5972 5973 /* 5974 * ASIC reset has to be done on all XGMI hive nodes ASAP 5975 * to allow proper links negotiation in FW (within 1 sec) 5976 */ 5977 if (!skip_hw_reset && need_full_reset) { 5978 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5979 /* For XGMI run all resets in parallel to speed up the process */ 5980 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5981 if (!queue_work(system_unbound_wq, 5982 &tmp_adev->xgmi_reset_work)) 5983 r = -EALREADY; 5984 } else 5985 r = amdgpu_asic_reset(tmp_adev); 5986 5987 if (r) { 5988 dev_err(tmp_adev->dev, 5989 "ASIC reset failed with error, %d for drm dev, %s", 5990 r, adev_to_drm(tmp_adev)->unique); 5991 goto out; 5992 } 5993 } 5994 5995 /* For XGMI wait for all resets to complete before proceed */ 5996 if (!r) { 5997 list_for_each_entry(tmp_adev, device_list_handle, 5998 reset_list) { 5999 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 6000 flush_work(&tmp_adev->xgmi_reset_work); 6001 r = tmp_adev->asic_reset_res; 6002 if (r) 6003 break; 6004 } 6005 } 6006 } 6007 } 6008 6009 if (!r && amdgpu_ras_intr_triggered()) { 6010 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6011 amdgpu_ras_reset_error_count(tmp_adev, 6012 AMDGPU_RAS_BLOCK__MMHUB); 6013 } 6014 6015 amdgpu_ras_intr_cleared(); 6016 } 6017 6018 r = amdgpu_device_reinit_after_reset(reset_context); 6019 if (r == -EAGAIN) 6020 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 6021 else 6022 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 6023 6024 out: 6025 return r; 6026 } 6027 6028 static void 
amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 6029 { 6030 6031 switch (amdgpu_asic_reset_method(adev)) { 6032 case AMD_RESET_METHOD_MODE1: 6033 case AMD_RESET_METHOD_LINK: 6034 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 6035 break; 6036 case AMD_RESET_METHOD_MODE2: 6037 adev->mp1_state = PP_MP1_STATE_RESET; 6038 break; 6039 default: 6040 adev->mp1_state = PP_MP1_STATE_NONE; 6041 break; 6042 } 6043 } 6044 6045 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 6046 { 6047 amdgpu_vf_error_trans_all(adev); 6048 adev->mp1_state = PP_MP1_STATE_NONE; 6049 } 6050 6051 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 6052 { 6053 struct pci_dev *p = NULL; 6054 6055 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 6056 adev->pdev->bus->number, 1); 6057 if (p) { 6058 pm_runtime_enable(&(p->dev)); 6059 pm_runtime_resume(&(p->dev)); 6060 } 6061 6062 pci_dev_put(p); 6063 } 6064 6065 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 6066 { 6067 enum amd_reset_method reset_method; 6068 struct pci_dev *p = NULL; 6069 u64 expires; 6070 6071 /* 6072 * For now, only BACO and mode1 reset are confirmed 6073 * to suffer the audio issue without proper suspended. 6074 */ 6075 reset_method = amdgpu_asic_reset_method(adev); 6076 if ((reset_method != AMD_RESET_METHOD_BACO) && 6077 (reset_method != AMD_RESET_METHOD_MODE1)) 6078 return -EINVAL; 6079 6080 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 6081 adev->pdev->bus->number, 1); 6082 if (!p) 6083 return -ENODEV; 6084 6085 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 6086 if (!expires) 6087 /* 6088 * If we cannot get the audio device autosuspend delay, 6089 * a fixed 4S interval will be used. Considering 3S is 6090 * the audio controller default autosuspend delay setting. 6091 * 4S used here is guaranteed to cover that. 6092 */ 6093 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 6094 6095 while (!pm_runtime_status_suspended(&(p->dev))) { 6096 if (!pm_runtime_suspend(&(p->dev))) 6097 break; 6098 6099 if (expires < ktime_get_mono_fast_ns()) { 6100 dev_warn(adev->dev, "failed to suspend display audio\n"); 6101 pci_dev_put(p); 6102 /* TODO: abort the succeeding gpu reset? */ 6103 return -ETIMEDOUT; 6104 } 6105 } 6106 6107 pm_runtime_disable(&(p->dev)); 6108 6109 pci_dev_put(p); 6110 return 0; 6111 } 6112 6113 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 6114 { 6115 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 6116 6117 #if defined(CONFIG_DEBUG_FS) 6118 if (!amdgpu_sriov_vf(adev)) 6119 cancel_work(&adev->reset_work); 6120 #endif 6121 6122 if (adev->kfd.dev) 6123 cancel_work(&adev->kfd.reset_work); 6124 6125 if (amdgpu_sriov_vf(adev)) 6126 cancel_work(&adev->virt.flr_work); 6127 6128 if (con && adev->ras_enabled) 6129 cancel_work(&con->recovery_work); 6130 6131 } 6132 6133 static int amdgpu_device_health_check(struct list_head *device_list_handle) 6134 { 6135 struct amdgpu_device *tmp_adev; 6136 int ret = 0; 6137 6138 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6139 ret |= amdgpu_device_bus_status_check(tmp_adev); 6140 } 6141 6142 return ret; 6143 } 6144 6145 static int amdgpu_device_recovery_prepare(struct amdgpu_device *adev, 6146 struct list_head *device_list, 6147 struct amdgpu_hive_info *hive) 6148 { 6149 struct amdgpu_device *tmp_adev = NULL; 6150 int r; 6151 6152 /* 6153 * Build list of devices to reset. 
6154 * In case we are in XGMI hive mode, resort the device list 6155 * to put adev in the 1st position. 6156 */ 6157 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) { 6158 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 6159 list_add_tail(&tmp_adev->reset_list, device_list); 6160 if (adev->shutdown) 6161 tmp_adev->shutdown = true; 6162 if (adev->pcie_reset_ctx.occurs_dpc) 6163 tmp_adev->pcie_reset_ctx.in_link_reset = true; 6164 } 6165 if (!list_is_first(&adev->reset_list, device_list)) 6166 list_rotate_to_front(&adev->reset_list, device_list); 6167 } else { 6168 list_add_tail(&adev->reset_list, device_list); 6169 } 6170 6171 if (!amdgpu_sriov_vf(adev) && (!adev->pcie_reset_ctx.occurs_dpc)) { 6172 r = amdgpu_device_health_check(device_list); 6173 if (r) 6174 return r; 6175 } 6176 6177 return 0; 6178 } 6179 6180 static void amdgpu_device_recovery_get_reset_lock(struct amdgpu_device *adev, 6181 struct list_head *device_list) 6182 { 6183 struct amdgpu_device *tmp_adev = NULL; 6184 6185 if (list_empty(device_list)) 6186 return; 6187 tmp_adev = 6188 list_first_entry(device_list, struct amdgpu_device, reset_list); 6189 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 6190 } 6191 6192 static void amdgpu_device_recovery_put_reset_lock(struct amdgpu_device *adev, 6193 struct list_head *device_list) 6194 { 6195 struct amdgpu_device *tmp_adev = NULL; 6196 6197 if (list_empty(device_list)) 6198 return; 6199 tmp_adev = 6200 list_first_entry(device_list, struct amdgpu_device, reset_list); 6201 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 6202 } 6203 6204 static void amdgpu_device_halt_activities(struct amdgpu_device *adev, 6205 struct amdgpu_job *job, 6206 struct amdgpu_reset_context *reset_context, 6207 struct list_head *device_list, 6208 struct amdgpu_hive_info *hive, 6209 bool need_emergency_restart) 6210 { 6211 struct amdgpu_device *tmp_adev = NULL; 6212 int i; 6213 6214 /* block all schedulers and reset given job's ring */ 6215 list_for_each_entry(tmp_adev, device_list, reset_list) { 6216 amdgpu_device_set_mp1_state(tmp_adev); 6217 6218 /* 6219 * Try to put the audio codec into suspend state 6220 * before gpu reset started. 6221 * 6222 * Due to the power domain of the graphics device 6223 * is shared with AZ power domain. Without this, 6224 * we may change the audio hardware from behind 6225 * the audio driver's back. That will trigger 6226 * some audio codec errors. 6227 */ 6228 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 6229 tmp_adev->pcie_reset_ctx.audio_suspended = true; 6230 6231 amdgpu_ras_set_error_query_ready(tmp_adev, false); 6232 6233 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 6234 6235 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context); 6236 6237 /* 6238 * Mark these ASICs to be reset as untracked first 6239 * And add them back after reset completed 6240 */ 6241 amdgpu_unregister_gpu_instance(tmp_adev); 6242 6243 drm_client_dev_suspend(adev_to_drm(tmp_adev), false); 6244 6245 /* disable ras on ALL IPs */ 6246 if (!need_emergency_restart && 6247 (!adev->pcie_reset_ctx.occurs_dpc) && 6248 amdgpu_device_ip_need_full_reset(tmp_adev)) 6249 amdgpu_ras_suspend(tmp_adev); 6250 6251 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6252 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6253 6254 if (!amdgpu_ring_sched_ready(ring)) 6255 continue; 6256 6257 drm_sched_stop(&ring->sched, job ? 
&job->base : NULL); 6258 6259 if (need_emergency_restart) 6260 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 6261 } 6262 atomic_inc(&tmp_adev->gpu_reset_counter); 6263 } 6264 } 6265 6266 static int amdgpu_device_asic_reset(struct amdgpu_device *adev, 6267 struct list_head *device_list, 6268 struct amdgpu_reset_context *reset_context) 6269 { 6270 struct amdgpu_device *tmp_adev = NULL; 6271 int retry_limit = AMDGPU_MAX_RETRY_LIMIT; 6272 int r = 0; 6273 6274 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 6275 list_for_each_entry(tmp_adev, device_list, reset_list) { 6276 if (adev->pcie_reset_ctx.occurs_dpc) 6277 tmp_adev->no_hw_access = true; 6278 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 6279 if (adev->pcie_reset_ctx.occurs_dpc) 6280 tmp_adev->no_hw_access = false; 6281 /*TODO Should we stop ?*/ 6282 if (r) { 6283 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 6284 r, adev_to_drm(tmp_adev)->unique); 6285 tmp_adev->asic_reset_res = r; 6286 } 6287 } 6288 6289 /* Actual ASIC resets if needed.*/ 6290 /* Host driver will handle XGMI hive reset for SRIOV */ 6291 if (amdgpu_sriov_vf(adev)) { 6292 6293 /* Bail out of reset early */ 6294 if (amdgpu_ras_is_rma(adev)) 6295 return -ENODEV; 6296 6297 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { 6298 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); 6299 amdgpu_ras_set_fed(adev, true); 6300 set_bit(AMDGPU_HOST_FLR, &reset_context->flags); 6301 } 6302 6303 r = amdgpu_device_reset_sriov(adev, reset_context); 6304 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) { 6305 amdgpu_virt_release_full_gpu(adev, true); 6306 goto retry; 6307 } 6308 if (r) 6309 adev->asic_reset_res = r; 6310 } else { 6311 r = amdgpu_do_asic_reset(device_list, reset_context); 6312 if (r && r == -EAGAIN) 6313 goto retry; 6314 } 6315 6316 list_for_each_entry(tmp_adev, device_list, reset_list) { 6317 /* 6318 * Drop any pending non scheduler resets queued before reset is done. 6319 * Any reset scheduled after this point would be valid. Scheduler resets 6320 * were already dropped during drm_sched_stop and no new ones can come 6321 * in before drm_sched_start. 6322 */ 6323 amdgpu_device_stop_pending_resets(tmp_adev); 6324 } 6325 6326 return r; 6327 } 6328 6329 static int amdgpu_device_sched_resume(struct list_head *device_list, 6330 struct amdgpu_reset_context *reset_context, 6331 bool job_signaled) 6332 { 6333 struct amdgpu_device *tmp_adev = NULL; 6334 int i, r = 0; 6335 6336 /* Post ASIC reset for all devs .*/ 6337 list_for_each_entry(tmp_adev, device_list, reset_list) { 6338 6339 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6340 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6341 6342 if (!amdgpu_ring_sched_ready(ring)) 6343 continue; 6344 6345 drm_sched_start(&ring->sched, 0); 6346 } 6347 6348 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 6349 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 6350 6351 if (tmp_adev->asic_reset_res) 6352 r = tmp_adev->asic_reset_res; 6353 6354 tmp_adev->asic_reset_res = 0; 6355 6356 if (r) { 6357 /* bad news, how to tell it to userspace ? 
6358 * for ras error, we should report GPU bad status instead of 6359 * reset failure 6360 */ 6361 if (reset_context->src != AMDGPU_RESET_SRC_RAS || 6362 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) 6363 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", 6364 atomic_read(&tmp_adev->gpu_reset_counter)); 6365 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 6366 } else { 6367 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 6368 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 6369 dev_warn(tmp_adev->dev, 6370 "smart shift update failed\n"); 6371 } 6372 } 6373 6374 return r; 6375 } 6376 6377 static void amdgpu_device_gpu_resume(struct amdgpu_device *adev, 6378 struct list_head *device_list, 6379 bool need_emergency_restart) 6380 { 6381 struct amdgpu_device *tmp_adev = NULL; 6382 6383 list_for_each_entry(tmp_adev, device_list, reset_list) { 6384 /* unlock kfd: SRIOV would do it separately */ 6385 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 6386 amdgpu_amdkfd_post_reset(tmp_adev); 6387 6388 /* kfd_post_reset will do nothing if kfd device is not initialized, 6389 * need to bring up kfd here if it's not be initialized before 6390 */ 6391 if (!adev->kfd.init_complete) 6392 amdgpu_amdkfd_device_init(adev); 6393 6394 if (tmp_adev->pcie_reset_ctx.audio_suspended) 6395 amdgpu_device_resume_display_audio(tmp_adev); 6396 6397 amdgpu_device_unset_mp1_state(tmp_adev); 6398 6399 amdgpu_ras_set_error_query_ready(tmp_adev, true); 6400 6401 } 6402 } 6403 6404 6405 /** 6406 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 6407 * 6408 * @adev: amdgpu_device pointer 6409 * @job: which job trigger hang 6410 * @reset_context: amdgpu reset context pointer 6411 * 6412 * Attempt to reset the GPU if it has hung (all asics). 6413 * Attempt to do soft-reset or full-reset and reinitialize Asic 6414 * Returns 0 for success or an error on failure. 6415 */ 6416 6417 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 6418 struct amdgpu_job *job, 6419 struct amdgpu_reset_context *reset_context) 6420 { 6421 struct list_head device_list; 6422 bool job_signaled = false; 6423 struct amdgpu_hive_info *hive = NULL; 6424 int r = 0; 6425 bool need_emergency_restart = false; 6426 6427 /* 6428 * If it reaches here because of hang/timeout and a RAS error is 6429 * detected at the same time, let RAS recovery take care of it. 6430 */ 6431 if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) && 6432 !amdgpu_sriov_vf(adev) && 6433 reset_context->src != AMDGPU_RESET_SRC_RAS) { 6434 dev_dbg(adev->dev, 6435 "Gpu recovery from source: %d yielding to RAS error recovery handling", 6436 reset_context->src); 6437 return 0; 6438 } 6439 6440 /* 6441 * Special case: RAS triggered and full reset isn't supported 6442 */ 6443 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 6444 6445 /* 6446 * Flush RAM to disk so that after reboot 6447 * the user can read log and see why the system rebooted. 6448 */ 6449 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 6450 amdgpu_ras_get_context(adev)->reboot) { 6451 dev_warn(adev->dev, "Emergency reboot."); 6452 6453 ksys_sync_helper(); 6454 emergency_restart(); 6455 } 6456 6457 dev_info(adev->dev, "GPU %s begin!\n", 6458 need_emergency_restart ? 
"jobs stop":"reset"); 6459 6460 if (!amdgpu_sriov_vf(adev)) 6461 hive = amdgpu_get_xgmi_hive(adev); 6462 if (hive) 6463 mutex_lock(&hive->hive_lock); 6464 6465 reset_context->job = job; 6466 reset_context->hive = hive; 6467 INIT_LIST_HEAD(&device_list); 6468 6469 if (amdgpu_device_recovery_prepare(adev, &device_list, hive)) 6470 goto end_reset; 6471 6472 /* We need to lock reset domain only once both for XGMI and single device */ 6473 amdgpu_device_recovery_get_reset_lock(adev, &device_list); 6474 6475 amdgpu_device_halt_activities(adev, job, reset_context, &device_list, 6476 hive, need_emergency_restart); 6477 if (need_emergency_restart) 6478 goto skip_sched_resume; 6479 /* 6480 * Must check guilty signal here since after this point all old 6481 * HW fences are force signaled. 6482 * 6483 * job->base holds a reference to parent fence 6484 */ 6485 if (job && dma_fence_is_signaled(&job->hw_fence.base)) { 6486 job_signaled = true; 6487 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 6488 goto skip_hw_reset; 6489 } 6490 6491 r = amdgpu_device_asic_reset(adev, &device_list, reset_context); 6492 if (r) 6493 goto reset_unlock; 6494 skip_hw_reset: 6495 r = amdgpu_device_sched_resume(&device_list, reset_context, job_signaled); 6496 if (r) 6497 goto reset_unlock; 6498 skip_sched_resume: 6499 amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart); 6500 reset_unlock: 6501 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 6502 end_reset: 6503 if (hive) { 6504 mutex_unlock(&hive->hive_lock); 6505 amdgpu_put_xgmi_hive(hive); 6506 } 6507 6508 if (r) 6509 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 6510 6511 atomic_set(&adev->reset_domain->reset_res, r); 6512 6513 if (!r) { 6514 struct amdgpu_task_info *ti = NULL; 6515 6516 if (job) 6517 ti = amdgpu_vm_get_task_info_pasid(adev, job->pasid); 6518 6519 drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, 6520 ti ? &ti->task : NULL); 6521 6522 amdgpu_vm_put_task_info(ti); 6523 } 6524 6525 return r; 6526 } 6527 6528 /** 6529 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner 6530 * 6531 * @adev: amdgpu_device pointer 6532 * @speed: pointer to the speed of the link 6533 * @width: pointer to the width of the link 6534 * 6535 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6536 * first physical partner to an AMD dGPU. 6537 * This will exclude any virtual switches and links. 
 */
static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
					    enum pci_bus_speed *speed,
					    enum pcie_link_width *width)
{
	struct pci_dev *parent = adev->pdev;

	if (!speed || !width)
		return;

	*speed = PCI_SPEED_UNKNOWN;
	*width = PCIE_LNK_WIDTH_UNKNOWN;

	if (amdgpu_device_pcie_dynamic_switching_supported(adev)) {
		while ((parent = pci_upstream_bridge(parent))) {
			/* skip upstream/downstream switches internal to dGPU */
			if (parent->vendor == PCI_VENDOR_ID_ATI)
				continue;
			*speed = pcie_get_speed_cap(parent);
			*width = pcie_get_width_cap(parent);
			break;
		}
	} else {
		/* use the current speeds rather than max if switching is not supported */
		pcie_bandwidth_available(adev->pdev, NULL, speed, width);
	}
}

/**
 * amdgpu_device_gpu_bandwidth - find the bandwidth of the GPU
 *
 * @adev: amdgpu_device pointer
 * @speed: pointer to the speed of the link
 * @width: pointer to the width of the link
 *
 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
 * AMD dGPU which may be a virtual upstream bridge.
 */
static void amdgpu_device_gpu_bandwidth(struct amdgpu_device *adev,
					enum pci_bus_speed *speed,
					enum pcie_link_width *width)
{
	struct pci_dev *parent = adev->pdev;

	if (!speed || !width)
		return;

	parent = pci_upstream_bridge(parent);
	if (parent && parent->vendor == PCI_VENDOR_ID_ATI) {
		/* use the upstream/downstream switches internal to dGPU */
		*speed = pcie_get_speed_cap(parent);
		*width = pcie_get_width_cap(parent);
		while ((parent = pci_upstream_bridge(parent))) {
			if (parent->vendor == PCI_VENDOR_ID_ATI) {
				/* use the upstream/downstream switches internal to dGPU */
				*speed = pcie_get_speed_cap(parent);
				*width = pcie_get_width_cap(parent);
			}
		}
	} else {
		/* use the device itself */
		*speed = pcie_get_speed_cap(adev->pdev);
		*width = pcie_get_width_cap(adev->pdev);
	}
}

/**
 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIE capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIE config space may not be available.
6612 */ 6613 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 6614 { 6615 enum pci_bus_speed speed_cap, platform_speed_cap; 6616 enum pcie_link_width platform_link_width, link_width; 6617 6618 if (amdgpu_pcie_gen_cap) 6619 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 6620 6621 if (amdgpu_pcie_lane_cap) 6622 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 6623 6624 /* covers APUs as well */ 6625 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 6626 if (adev->pm.pcie_gen_mask == 0) 6627 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 6628 if (adev->pm.pcie_mlw_mask == 0) 6629 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 6630 return; 6631 } 6632 6633 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 6634 return; 6635 6636 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap, 6637 &platform_link_width); 6638 amdgpu_device_gpu_bandwidth(adev, &speed_cap, &link_width); 6639 6640 if (adev->pm.pcie_gen_mask == 0) { 6641 /* asic caps */ 6642 if (speed_cap == PCI_SPEED_UNKNOWN) { 6643 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6644 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6645 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6646 } else { 6647 if (speed_cap == PCIE_SPEED_32_0GT) 6648 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6649 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6650 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6651 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6652 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 6653 else if (speed_cap == PCIE_SPEED_16_0GT) 6654 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6655 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6656 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6657 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 6658 else if (speed_cap == PCIE_SPEED_8_0GT) 6659 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6660 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6661 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6662 else if (speed_cap == PCIE_SPEED_5_0GT) 6663 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6664 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 6665 else 6666 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 6667 } 6668 /* platform caps */ 6669 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 6670 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6671 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6672 } else { 6673 if (platform_speed_cap == PCIE_SPEED_32_0GT) 6674 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6675 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6676 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6677 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6678 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 6679 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 6680 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6681 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6682 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6683 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 6684 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 6685 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6686 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6687 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 6688 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 6689 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6690 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6691 else 6692 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 6693 6694 } 6695 } 6696 if (adev->pm.pcie_mlw_mask == 0) { 6697 /* asic caps */ 6698 if (link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6699 
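			/* ASIC link width could not be determined; fall back to
			 * the driver's default ASIC link-width mask.
			 */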
adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_ASIC_PCIE_MLW_MASK; 6700 } else { 6701 switch (link_width) { 6702 case PCIE_LNK_X32: 6703 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X32 | 6704 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6705 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6706 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6707 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6708 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6709 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6710 break; 6711 case PCIE_LNK_X16: 6712 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6713 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6714 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6715 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6716 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6717 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6718 break; 6719 case PCIE_LNK_X12: 6720 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6721 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6722 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6723 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6724 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6725 break; 6726 case PCIE_LNK_X8: 6727 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6728 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6729 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6730 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6731 break; 6732 case PCIE_LNK_X4: 6733 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6734 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6735 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6736 break; 6737 case PCIE_LNK_X2: 6738 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6739 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6740 break; 6741 case PCIE_LNK_X1: 6742 adev->pm.pcie_mlw_mask |= CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1; 6743 break; 6744 default: 6745 break; 6746 } 6747 } 6748 /* platform caps */ 6749 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6750 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 6751 } else { 6752 switch (platform_link_width) { 6753 case PCIE_LNK_X32: 6754 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 6755 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6756 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6757 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6758 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6759 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6760 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6761 break; 6762 case PCIE_LNK_X16: 6763 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6764 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6765 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6766 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6767 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6768 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6769 break; 6770 case PCIE_LNK_X12: 6771 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6772 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6773 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6774 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6775 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6776 break; 6777 case PCIE_LNK_X8: 6778 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6779 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6780 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6781 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6782 break; 6783 case PCIE_LNK_X4: 6784 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6785 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6786 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6787 break; 6788 case PCIE_LNK_X2: 6789 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6790 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6791 break; 6792 case PCIE_LNK_X1: 6793 adev->pm.pcie_mlw_mask |= CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 6794 break; 6795 
default: 6796 break; 6797 } 6798 } 6799 } 6800 } 6801 6802 /** 6803 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 6804 * 6805 * @adev: amdgpu_device pointer 6806 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 6807 * 6808 * Return true if @peer_adev can access (DMA) @adev through the PCIe 6809 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 6810 * @peer_adev. 6811 */ 6812 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 6813 struct amdgpu_device *peer_adev) 6814 { 6815 #ifdef CONFIG_HSA_AMD_P2P 6816 bool p2p_access = 6817 !adev->gmc.xgmi.connected_to_cpu && 6818 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 6819 if (!p2p_access) 6820 dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n", 6821 pci_name(peer_adev->pdev)); 6822 6823 bool is_large_bar = adev->gmc.visible_vram_size && 6824 adev->gmc.real_vram_size == adev->gmc.visible_vram_size; 6825 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev); 6826 6827 if (!p2p_addressable) { 6828 uint64_t address_mask = peer_adev->dev->dma_mask ? 6829 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 6830 resource_size_t aper_limit = 6831 adev->gmc.aper_base + adev->gmc.aper_size - 1; 6832 6833 p2p_addressable = !(adev->gmc.aper_base & address_mask || 6834 aper_limit & address_mask); 6835 } 6836 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable; 6837 #else 6838 return false; 6839 #endif 6840 } 6841 6842 int amdgpu_device_baco_enter(struct drm_device *dev) 6843 { 6844 struct amdgpu_device *adev = drm_to_adev(dev); 6845 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6846 6847 if (!amdgpu_device_supports_baco(dev)) 6848 return -ENOTSUPP; 6849 6850 if (ras && adev->ras_enabled && 6851 adev->nbio.funcs->enable_doorbell_interrupt) 6852 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 6853 6854 return amdgpu_dpm_baco_enter(adev); 6855 } 6856 6857 int amdgpu_device_baco_exit(struct drm_device *dev) 6858 { 6859 struct amdgpu_device *adev = drm_to_adev(dev); 6860 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6861 int ret = 0; 6862 6863 if (!amdgpu_device_supports_baco(dev)) 6864 return -ENOTSUPP; 6865 6866 ret = amdgpu_dpm_baco_exit(adev); 6867 if (ret) 6868 return ret; 6869 6870 if (ras && adev->ras_enabled && 6871 adev->nbio.funcs->enable_doorbell_interrupt) 6872 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 6873 6874 if (amdgpu_passthrough(adev) && adev->nbio.funcs && 6875 adev->nbio.funcs->clear_doorbell_interrupt) 6876 adev->nbio.funcs->clear_doorbell_interrupt(adev); 6877 6878 return 0; 6879 } 6880 6881 /** 6882 * amdgpu_pci_error_detected - Called when a PCI error is detected. 6883 * @pdev: PCI device struct 6884 * @state: PCI channel state 6885 * 6886 * Description: Called when a PCI error is detected. 6887 * 6888 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
6889 */ 6890 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 6891 { 6892 struct drm_device *dev = pci_get_drvdata(pdev); 6893 struct amdgpu_device *adev = drm_to_adev(dev); 6894 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 6895 struct amdgpu_reset_context reset_context; 6896 struct list_head device_list; 6897 6898 dev_info(adev->dev, "PCI error: detected callback!!\n"); 6899 6900 if (!amdgpu_dpm_is_link_reset_supported(adev)) { 6901 dev_warn(adev->dev, "No support for XGMI hive yet...\n"); 6902 return PCI_ERS_RESULT_DISCONNECT; 6903 } 6904 6905 adev->pci_channel_state = state; 6906 6907 switch (state) { 6908 case pci_channel_io_normal: 6909 dev_info(adev->dev, "pci_channel_io_normal: state(%d)!!\n", state); 6910 return PCI_ERS_RESULT_CAN_RECOVER; 6911 case pci_channel_io_frozen: 6912 /* Fatal error, prepare for slot reset */ 6913 dev_info(adev->dev, "pci_channel_io_frozen: state(%d)!!\n", state); 6914 6915 if (hive) 6916 mutex_lock(&hive->hive_lock); 6917 adev->pcie_reset_ctx.occurs_dpc = true; 6918 memset(&reset_context, 0, sizeof(reset_context)); 6919 INIT_LIST_HEAD(&device_list); 6920 6921 amdgpu_device_recovery_prepare(adev, &device_list, hive); 6922 amdgpu_device_recovery_get_reset_lock(adev, &device_list); 6923 amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list, 6924 hive, false); 6925 if (hive) { 6926 mutex_unlock(&hive->hive_lock); 6927 amdgpu_put_xgmi_hive(hive); 6928 } 6929 return PCI_ERS_RESULT_NEED_RESET; 6930 case pci_channel_io_perm_failure: 6931 /* Permanent error, prepare for device removal */ 6932 dev_info(adev->dev, "pci_channel_io_perm_failure: state(%d)!!\n", state); 6933 return PCI_ERS_RESULT_DISCONNECT; 6934 } 6935 6936 return PCI_ERS_RESULT_NEED_RESET; 6937 } 6938 6939 /** 6940 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 6941 * @pdev: pointer to PCI device 6942 */ 6943 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 6944 { 6945 struct drm_device *dev = pci_get_drvdata(pdev); 6946 struct amdgpu_device *adev = drm_to_adev(dev); 6947 6948 dev_info(adev->dev, "PCI error: mmio enabled callback!!\n"); 6949 6950 /* TODO - dump whatever for debugging purposes */ 6951 6952 /* This called only if amdgpu_pci_error_detected returns 6953 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 6954 * works, no need to reset slot. 6955 */ 6956 6957 return PCI_ERS_RESULT_RECOVERED; 6958 } 6959 6960 /** 6961 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 6962 * @pdev: PCI device struct 6963 * 6964 * Description: This routine is called by the pci error recovery 6965 * code after the PCI slot has been reset, just before we 6966 * should resume normal operations. 
6967 */ 6968 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 6969 { 6970 struct drm_device *dev = pci_get_drvdata(pdev); 6971 struct amdgpu_device *adev = drm_to_adev(dev); 6972 struct amdgpu_reset_context reset_context; 6973 struct amdgpu_device *tmp_adev; 6974 struct amdgpu_hive_info *hive; 6975 struct list_head device_list; 6976 int r = 0, i; 6977 u32 memsize; 6978 6979 /* PCI error slot reset should be skipped During RAS recovery */ 6980 if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 6981 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && 6982 amdgpu_ras_in_recovery(adev)) 6983 return PCI_ERS_RESULT_RECOVERED; 6984 6985 dev_info(adev->dev, "PCI error: slot reset callback!!\n"); 6986 6987 memset(&reset_context, 0, sizeof(reset_context)); 6988 6989 /* wait for asic to come out of reset */ 6990 msleep(700); 6991 6992 /* Restore PCI confspace */ 6993 amdgpu_device_load_pci_state(pdev); 6994 6995 /* confirm ASIC came out of reset */ 6996 for (i = 0; i < adev->usec_timeout; i++) { 6997 memsize = amdgpu_asic_get_config_memsize(adev); 6998 6999 if (memsize != 0xffffffff) 7000 break; 7001 udelay(1); 7002 } 7003 if (memsize == 0xffffffff) { 7004 r = -ETIME; 7005 goto out; 7006 } 7007 7008 reset_context.method = AMD_RESET_METHOD_NONE; 7009 reset_context.reset_req_dev = adev; 7010 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 7011 set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags); 7012 INIT_LIST_HEAD(&device_list); 7013 7014 hive = amdgpu_get_xgmi_hive(adev); 7015 if (hive) { 7016 mutex_lock(&hive->hive_lock); 7017 reset_context.hive = hive; 7018 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 7019 tmp_adev->pcie_reset_ctx.in_link_reset = true; 7020 list_add_tail(&tmp_adev->reset_list, &device_list); 7021 } 7022 } else { 7023 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 7024 list_add_tail(&adev->reset_list, &device_list); 7025 } 7026 7027 r = amdgpu_device_asic_reset(adev, &device_list, &reset_context); 7028 out: 7029 if (!r) { 7030 if (amdgpu_device_cache_pci_state(adev->pdev)) 7031 pci_restore_state(adev->pdev); 7032 dev_info(adev->dev, "PCIe error recovery succeeded\n"); 7033 } else { 7034 dev_err(adev->dev, "PCIe error recovery failed, err:%d\n", r); 7035 if (hive) { 7036 list_for_each_entry(tmp_adev, &device_list, reset_list) 7037 amdgpu_device_unset_mp1_state(tmp_adev); 7038 } 7039 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 7040 } 7041 7042 if (hive) { 7043 mutex_unlock(&hive->hive_lock); 7044 amdgpu_put_xgmi_hive(hive); 7045 } 7046 7047 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 7048 } 7049 7050 /** 7051 * amdgpu_pci_resume() - resume normal ops after PCI reset 7052 * @pdev: pointer to PCI device 7053 * 7054 * Called when the error recovery driver tells us that its 7055 * OK to resume normal operation. 
7056 */ 7057 void amdgpu_pci_resume(struct pci_dev *pdev) 7058 { 7059 struct drm_device *dev = pci_get_drvdata(pdev); 7060 struct amdgpu_device *adev = drm_to_adev(dev); 7061 struct list_head device_list; 7062 struct amdgpu_hive_info *hive = NULL; 7063 struct amdgpu_device *tmp_adev = NULL; 7064 7065 dev_info(adev->dev, "PCI error: resume callback!!\n"); 7066 7067 /* Only continue execution for the case of pci_channel_io_frozen */ 7068 if (adev->pci_channel_state != pci_channel_io_frozen) 7069 return; 7070 7071 INIT_LIST_HEAD(&device_list); 7072 7073 hive = amdgpu_get_xgmi_hive(adev); 7074 if (hive) { 7075 mutex_lock(&hive->hive_lock); 7076 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 7077 tmp_adev->pcie_reset_ctx.in_link_reset = false; 7078 list_add_tail(&tmp_adev->reset_list, &device_list); 7079 } 7080 } else 7081 list_add_tail(&adev->reset_list, &device_list); 7082 7083 amdgpu_device_sched_resume(&device_list, NULL, NULL); 7084 amdgpu_device_gpu_resume(adev, &device_list, false); 7085 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 7086 adev->pcie_reset_ctx.occurs_dpc = false; 7087 7088 if (hive) { 7089 mutex_unlock(&hive->hive_lock); 7090 amdgpu_put_xgmi_hive(hive); 7091 } 7092 } 7093 7094 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 7095 { 7096 struct drm_device *dev = pci_get_drvdata(pdev); 7097 struct amdgpu_device *adev = drm_to_adev(dev); 7098 int r; 7099 7100 if (amdgpu_sriov_vf(adev)) 7101 return false; 7102 7103 r = pci_save_state(pdev); 7104 if (!r) { 7105 kfree(adev->pci_state); 7106 7107 adev->pci_state = pci_store_saved_state(pdev); 7108 7109 if (!adev->pci_state) { 7110 dev_err(adev->dev, "Failed to store PCI saved state"); 7111 return false; 7112 } 7113 } else { 7114 dev_warn(adev->dev, "Failed to save PCI state, err:%d\n", r); 7115 return false; 7116 } 7117 7118 return true; 7119 } 7120 7121 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 7122 { 7123 struct drm_device *dev = pci_get_drvdata(pdev); 7124 struct amdgpu_device *adev = drm_to_adev(dev); 7125 int r; 7126 7127 if (!adev->pci_state) 7128 return false; 7129 7130 r = pci_load_saved_state(pdev, adev->pci_state); 7131 7132 if (!r) { 7133 pci_restore_state(pdev); 7134 } else { 7135 dev_warn(adev->dev, "Failed to load PCI state, err:%d\n", r); 7136 return false; 7137 } 7138 7139 return true; 7140 } 7141 7142 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 7143 struct amdgpu_ring *ring) 7144 { 7145 #ifdef CONFIG_X86_64 7146 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 7147 return; 7148 #endif 7149 if (adev->gmc.xgmi.connected_to_cpu) 7150 return; 7151 7152 if (ring && ring->funcs->emit_hdp_flush) 7153 amdgpu_ring_emit_hdp_flush(ring); 7154 else 7155 amdgpu_asic_flush_hdp(adev, ring); 7156 } 7157 7158 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 7159 struct amdgpu_ring *ring) 7160 { 7161 #ifdef CONFIG_X86_64 7162 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 7163 return; 7164 #endif 7165 if (adev->gmc.xgmi.connected_to_cpu) 7166 return; 7167 7168 amdgpu_asic_invalidate_hdp(adev, ring); 7169 } 7170 7171 int amdgpu_in_reset(struct amdgpu_device *adev) 7172 { 7173 return atomic_read(&adev->reset_domain->in_gpu_reset); 7174 } 7175 7176 /** 7177 * amdgpu_device_halt() - bring hardware to some kind of halt state 7178 * 7179 * @adev: amdgpu_device pointer 7180 * 7181 * Bring hardware to some kind of halt state so that no one can touch it 7182 * any more. It will help to maintain error context when error occurred. 
 * Compared to a simple hang, the system will stay stable at least for SSH
 * access. Then it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as following:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc),
 * clears all CPU mappings to the device and disallows remappings through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
 * flush any in flight DMA operations
 */
void amdgpu_device_halt(struct amdgpu_device *adev)
{
	struct pci_dev *pdev = adev->pdev;
	struct drm_device *ddev = adev_to_drm(adev);

	amdgpu_xcp_dev_unplug(adev);
	drm_dev_unplug(ddev);

	amdgpu_irq_disable_all(adev);

	amdgpu_fence_driver_hw_fini(adev);

	adev->no_hw_access = true;

	amdgpu_device_unmap_mmio(adev);

	pci_disable_device(pdev);
	pci_wait_for_pending_transaction(pdev);
}

u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
				 u32 reg)
{
	unsigned long flags, address, data;
	u32 r;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	r = RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
	return r;
}

void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
				  u32 reg, u32 v)
{
	unsigned long flags, address, data;

	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	WREG32(address, reg * 4);
	(void)RREG32(address);
	WREG32(data, v);
	(void)RREG32(data);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_gang - return a reference to the current gang
 * @adev: amdgpu_device pointer
 *
 * Returns: A new reference to the current gang leader.
 */
struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev)
{
	struct dma_fence *fence;

	rcu_read_lock();
	fence = dma_fence_get_rcu_safe(&adev->gang_submit);
	rcu_read_unlock();
	return fence;
}

/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
 *
 * Try to switch to a new gang.
 * Returns: NULL if we switched to the new gang or a reference to the current
 * gang leader.
 */
struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
					    struct dma_fence *gang)
{
	struct dma_fence *old = NULL;

	dma_fence_get(gang);
	do {
		dma_fence_put(old);
		old = amdgpu_device_get_gang(adev);
		if (old == gang)
			break;

		if (!dma_fence_is_signaled(old)) {
			dma_fence_put(gang);
			return old;
		}

	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
			 old, gang) != old);

	/*
	 * Drop it once for the exchanged reference in adev and once for the
	 * thread local reference acquired in amdgpu_device_get_gang().
	 */
	dma_fence_put(old);
	dma_fence_put(old);
	return NULL;
}

/**
 * amdgpu_device_enforce_isolation - enforce HW isolation
 * @adev: the amdgpu device pointer
 * @ring: the HW ring the job is supposed to run on
 * @job: the job which is about to be pushed to the HW ring
 *
 * Makes sure that only one client at a time can use the GFX block.
 * Returns: The dependency to wait on before the job can be pushed to the HW.
 * The function is called multiple times until NULL is returned.
 */
struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev,
						  struct amdgpu_ring *ring,
						  struct amdgpu_job *job)
{
	struct amdgpu_isolation *isolation = &adev->isolation[ring->xcp_id];
	struct drm_sched_fence *f = job->base.s_fence;
	struct dma_fence *dep;
	void *owner;
	int r;

	/*
	 * For now enforce isolation only for the GFX block since we only need
	 * the cleaner shader on those rings.
	 */
	if (ring->funcs->type != AMDGPU_RING_TYPE_GFX &&
	    ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE)
		return NULL;

	/*
	 * All submissions where enforce isolation is false are handled as if
	 * they come from a single client. Use ~0l as the owner to distinguish
	 * them from kernel submissions where the owner is NULL.
	 */
	owner = job->enforce_isolation ? f->owner : (void *)~0l;

	mutex_lock(&adev->enforce_isolation_mutex);

	/*
	 * The "spearhead" submission is the first one which changes the
	 * ownership to its client. We always need to wait for it to be
	 * pushed to the HW before proceeding with anything.
	 */
	if (&f->scheduled != isolation->spearhead &&
	    !dma_fence_is_signaled(isolation->spearhead)) {
		dep = isolation->spearhead;
		goto out_grab_ref;
	}

	if (isolation->owner != owner) {

		/*
		 * Wait for any gang to be assembled before switching to a
		 * different owner or otherwise we could deadlock the
		 * submissions.
		 */
		if (!job->gang_submit) {
			dep = amdgpu_device_get_gang(adev);
			if (!dma_fence_is_signaled(dep))
				goto out_return_dep;
			dma_fence_put(dep);
		}

		dma_fence_put(isolation->spearhead);
		isolation->spearhead = dma_fence_get(&f->scheduled);
		amdgpu_sync_move(&isolation->active, &isolation->prev);
		trace_amdgpu_isolation(isolation->owner, owner);
		isolation->owner = owner;
	}

	/*
	 * Specifying the ring here helps to pipeline submissions even when
	 * isolation is enabled. If that is not desired for testing NULL can be
	 * used instead of the ring to enforce a CPU round trip while switching
	 * between clients.
 */
	dep = amdgpu_sync_peek_fence(&isolation->prev, ring);
	r = amdgpu_sync_fence(&isolation->active, &f->finished, GFP_NOWAIT);
	if (r)
		dev_warn(adev->dev, "OOM tracking isolation\n");

out_grab_ref:
	dma_fence_get(dep);
out_return_dep:
	mutex_unlock(&adev->enforce_isolation_mutex);
	return dep;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
		uint32_t inst, uint32_t reg_addr, char reg_name[],
		uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			dev_warn(
				adev->dev,
				"Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				inst, reg_name, (uint32_t)expected_value,
				(uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}

ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
{
	ssize_t size = 0;

	if (!ring || !ring->adev)
		return size;

	if (amdgpu_device_should_recover_gpu(ring->adev))
		size |= AMDGPU_RESET_TYPE_FULL;

	if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
	    !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery)
		size |= AMDGPU_RESET_TYPE_SOFT_RESET;

	return size;
}

ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
{
	ssize_t size = 0;

	if (supported_reset == 0) {
		size += sysfs_emit_at(buf, size, "unsupported");
		size += sysfs_emit_at(buf, size, "\n");
		return size;
	}

	if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
		size += sysfs_emit_at(buf, size, "soft ");

	if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
		size += sysfs_emit_at(buf, size, "queue ");

	if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)
		size += sysfs_emit_at(buf, size, "pipe ");

	if (supported_reset & AMDGPU_RESET_TYPE_FULL)
		size += sysfs_emit_at(buf, size, "full ");

	size += sysfs_emit_at(buf, size, "\n");
	return size;
}
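
/*
 * Illustrative sketch only, not part of the driver: one way an IP block's
 * sysfs "supported reset" show callback could combine the two helpers above.
 * The function name, the use of the first GFX ring and the attribute wiring
 * are assumptions made for this example, not the driver's actual code, so the
 * block is kept out of the build with #if 0.
 */
#if 0
static ssize_t example_supported_reset_show(struct device *dev,
					    struct device_attribute *attr,
					    char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint32_t supported_reset;

	/* Derive soft/full reset support from a representative ring. */
	supported_reset = amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);

	/* Render the mask as a space separated list for sysfs. */
	return amdgpu_show_reset_mask(buf, supported_reset);
}
#endif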