1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 29 #include <linux/aperture.h> 30 #include <linux/power_supply.h> 31 #include <linux/kthread.h> 32 #include <linux/module.h> 33 #include <linux/console.h> 34 #include <linux/slab.h> 35 #include <linux/iommu.h> 36 #include <linux/pci.h> 37 #include <linux/pci-p2pdma.h> 38 #include <linux/apple-gmux.h> 39 40 #include <drm/drm_atomic_helper.h> 41 #include <drm/drm_client_event.h> 42 #include <drm/drm_crtc_helper.h> 43 #include <drm/drm_probe_helper.h> 44 #include <drm/amdgpu_drm.h> 45 #include <linux/device.h> 46 #include <linux/vgaarb.h> 47 #include <linux/vga_switcheroo.h> 48 #include <linux/efi.h> 49 #include "amdgpu.h" 50 #include "amdgpu_trace.h" 51 #include "amdgpu_i2c.h" 52 #include "atom.h" 53 #include "amdgpu_atombios.h" 54 #include "amdgpu_atomfirmware.h" 55 #include "amd_pcie.h" 56 #ifdef CONFIG_DRM_AMDGPU_SI 57 #include "si.h" 58 #endif 59 #ifdef CONFIG_DRM_AMDGPU_CIK 60 #include "cik.h" 61 #endif 62 #include "vi.h" 63 #include "soc15.h" 64 #include "nv.h" 65 #include "bif/bif_4_1_d.h" 66 #include <linux/firmware.h> 67 #include "amdgpu_vf_error.h" 68 69 #include "amdgpu_amdkfd.h" 70 #include "amdgpu_pm.h" 71 72 #include "amdgpu_xgmi.h" 73 #include "amdgpu_ras.h" 74 #include "amdgpu_pmu.h" 75 #include "amdgpu_fru_eeprom.h" 76 #include "amdgpu_reset.h" 77 #include "amdgpu_virt.h" 78 #include "amdgpu_dev_coredump.h" 79 80 #include <linux/suspend.h> 81 #include <drm/task_barrier.h> 82 #include <linux/pm_runtime.h> 83 84 #include <drm/drm_drv.h> 85 86 #if IS_ENABLED(CONFIG_X86) 87 #include <asm/intel-family.h> 88 #include <asm/cpu_device_id.h> 89 #endif 90 91 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 92 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 93 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 94 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 95 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 96 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 97 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 98 99 #define AMDGPU_RESUME_MS 2000 100 #define AMDGPU_MAX_RETRY_LIMIT 2 101 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 102 #define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2) 103 #define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2) 104 #define 
AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2) 105 106 #define AMDGPU_VBIOS_SKIP (1U << 0) 107 #define AMDGPU_VBIOS_OPTIONAL (1U << 1) 108 109 static const struct drm_driver amdgpu_kms_driver; 110 111 const char *amdgpu_asic_name[] = { 112 "TAHITI", 113 "PITCAIRN", 114 "VERDE", 115 "OLAND", 116 "HAINAN", 117 "BONAIRE", 118 "KAVERI", 119 "KABINI", 120 "HAWAII", 121 "MULLINS", 122 "TOPAZ", 123 "TONGA", 124 "FIJI", 125 "CARRIZO", 126 "STONEY", 127 "POLARIS10", 128 "POLARIS11", 129 "POLARIS12", 130 "VEGAM", 131 "VEGA10", 132 "VEGA12", 133 "VEGA20", 134 "RAVEN", 135 "ARCTURUS", 136 "RENOIR", 137 "ALDEBARAN", 138 "NAVI10", 139 "CYAN_SKILLFISH", 140 "NAVI14", 141 "NAVI12", 142 "SIENNA_CICHLID", 143 "NAVY_FLOUNDER", 144 "VANGOGH", 145 "DIMGREY_CAVEFISH", 146 "BEIGE_GOBY", 147 "YELLOW_CARP", 148 "IP DISCOVERY", 149 "LAST", 150 }; 151 152 #define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0) 153 /* 154 * Default init level where all blocks are expected to be initialized. This is 155 * the level of initialization expected by default and also after a full reset 156 * of the device. 157 */ 158 struct amdgpu_init_level amdgpu_init_default = { 159 .level = AMDGPU_INIT_LEVEL_DEFAULT, 160 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL, 161 }; 162 163 struct amdgpu_init_level amdgpu_init_recovery = { 164 .level = AMDGPU_INIT_LEVEL_RESET_RECOVERY, 165 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL, 166 }; 167 168 /* 169 * Minimal blocks needed to be initialized before a XGMI hive can be reset. This 170 * is used for cases like reset on initialization where the entire hive needs to 171 * be reset before first use. 172 */ 173 struct amdgpu_init_level amdgpu_init_minimal_xgmi = { 174 .level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI, 175 .hwini_ip_block_mask = 176 BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) | 177 BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) | 178 BIT(AMD_IP_BLOCK_TYPE_PSP) 179 }; 180 181 static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev, 182 enum amd_ip_block_type block) 183 { 184 return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0; 185 } 186 187 void amdgpu_set_init_level(struct amdgpu_device *adev, 188 enum amdgpu_init_lvl_id lvl) 189 { 190 switch (lvl) { 191 case AMDGPU_INIT_LEVEL_MINIMAL_XGMI: 192 adev->init_lvl = &amdgpu_init_minimal_xgmi; 193 break; 194 case AMDGPU_INIT_LEVEL_RESET_RECOVERY: 195 adev->init_lvl = &amdgpu_init_recovery; 196 break; 197 case AMDGPU_INIT_LEVEL_DEFAULT: 198 fallthrough; 199 default: 200 adev->init_lvl = &amdgpu_init_default; 201 break; 202 } 203 } 204 205 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev); 206 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 207 void *data); 208 209 /** 210 * DOC: pcie_replay_count 211 * 212 * The amdgpu driver provides a sysfs API for reporting the total number 213 * of PCIe replays (NAKs). 214 * The file pcie_replay_count is used for this and returns the total 215 * number of replays as a sum of the NAKs generated and NAKs received. 
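 *
 * For example, the attribute appears in the GPU's PCI sysfs directory,
 * e.g. /sys/bus/pci/devices/<domain:bus:dev.fn>/pcie_replay_count
 * (path shown for illustration only).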
216 */ 217 218 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 219 struct device_attribute *attr, char *buf) 220 { 221 struct drm_device *ddev = dev_get_drvdata(dev); 222 struct amdgpu_device *adev = drm_to_adev(ddev); 223 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 224 225 return sysfs_emit(buf, "%llu\n", cnt); 226 } 227 228 static DEVICE_ATTR(pcie_replay_count, 0444, 229 amdgpu_device_get_pcie_replay_count, NULL); 230 231 static int amdgpu_device_attr_sysfs_init(struct amdgpu_device *adev) 232 { 233 int ret = 0; 234 235 if (amdgpu_nbio_is_replay_cnt_supported(adev)) 236 ret = sysfs_create_file(&adev->dev->kobj, 237 &dev_attr_pcie_replay_count.attr); 238 239 return ret; 240 } 241 242 static void amdgpu_device_attr_sysfs_fini(struct amdgpu_device *adev) 243 { 244 if (amdgpu_nbio_is_replay_cnt_supported(adev)) 245 sysfs_remove_file(&adev->dev->kobj, 246 &dev_attr_pcie_replay_count.attr); 247 } 248 249 static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj, 250 const struct bin_attribute *attr, char *buf, 251 loff_t ppos, size_t count) 252 { 253 struct device *dev = kobj_to_dev(kobj); 254 struct drm_device *ddev = dev_get_drvdata(dev); 255 struct amdgpu_device *adev = drm_to_adev(ddev); 256 ssize_t bytes_read; 257 258 switch (ppos) { 259 case AMDGPU_SYS_REG_STATE_XGMI: 260 bytes_read = amdgpu_asic_get_reg_state( 261 adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count); 262 break; 263 case AMDGPU_SYS_REG_STATE_WAFL: 264 bytes_read = amdgpu_asic_get_reg_state( 265 adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count); 266 break; 267 case AMDGPU_SYS_REG_STATE_PCIE: 268 bytes_read = amdgpu_asic_get_reg_state( 269 adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count); 270 break; 271 case AMDGPU_SYS_REG_STATE_USR: 272 bytes_read = amdgpu_asic_get_reg_state( 273 adev, AMDGPU_REG_STATE_TYPE_USR, buf, count); 274 break; 275 case AMDGPU_SYS_REG_STATE_USR_1: 276 bytes_read = amdgpu_asic_get_reg_state( 277 adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count); 278 break; 279 default: 280 return -EINVAL; 281 } 282 283 return bytes_read; 284 } 285 286 static const BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL, 287 AMDGPU_SYS_REG_STATE_END); 288 289 int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev) 290 { 291 int ret; 292 293 if (!amdgpu_asic_get_reg_state_supported(adev)) 294 return 0; 295 296 ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 297 298 return ret; 299 } 300 301 void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev) 302 { 303 if (!amdgpu_asic_get_reg_state_supported(adev)) 304 return; 305 sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 306 } 307 308 int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block) 309 { 310 int r; 311 312 if (ip_block->version->funcs->suspend) { 313 r = ip_block->version->funcs->suspend(ip_block); 314 if (r) { 315 dev_err(ip_block->adev->dev, 316 "suspend of IP block <%s> failed %d\n", 317 ip_block->version->funcs->name, r); 318 return r; 319 } 320 } 321 322 ip_block->status.hw = false; 323 return 0; 324 } 325 326 int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block) 327 { 328 int r; 329 330 if (ip_block->version->funcs->resume) { 331 r = ip_block->version->funcs->resume(ip_block); 332 if (r) { 333 dev_err(ip_block->adev->dev, 334 "resume of IP block <%s> failed %d\n", 335 ip_block->version->funcs->name, r); 336 return r; 337 } 338 } 339 340 ip_block->status.hw = true; 341 return 0; 342 } 343 344 /** 345 * DOC: board_info 346 * 347 * The amdgpu driver provides a sysfs 
API for giving board related information. 348 * It provides the form factor information in the format 349 * 350 * type : form factor 351 * 352 * Possible form factor values 353 * 354 * - "cem" - PCIE CEM card 355 * - "oam" - Open Compute Accelerator Module 356 * - "unknown" - Not known 357 * 358 */ 359 360 static ssize_t amdgpu_device_get_board_info(struct device *dev, 361 struct device_attribute *attr, 362 char *buf) 363 { 364 struct drm_device *ddev = dev_get_drvdata(dev); 365 struct amdgpu_device *adev = drm_to_adev(ddev); 366 enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM; 367 const char *pkg; 368 369 if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type) 370 pkg_type = adev->smuio.funcs->get_pkg_type(adev); 371 372 switch (pkg_type) { 373 case AMDGPU_PKG_TYPE_CEM: 374 pkg = "cem"; 375 break; 376 case AMDGPU_PKG_TYPE_OAM: 377 pkg = "oam"; 378 break; 379 default: 380 pkg = "unknown"; 381 break; 382 } 383 384 return sysfs_emit(buf, "%s : %s\n", "type", pkg); 385 } 386 387 static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL); 388 389 static struct attribute *amdgpu_board_attrs[] = { 390 &dev_attr_board_info.attr, 391 NULL, 392 }; 393 394 static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj, 395 struct attribute *attr, int n) 396 { 397 struct device *dev = kobj_to_dev(kobj); 398 struct drm_device *ddev = dev_get_drvdata(dev); 399 struct amdgpu_device *adev = drm_to_adev(ddev); 400 401 if (adev->flags & AMD_IS_APU) 402 return 0; 403 404 return attr->mode; 405 } 406 407 static const struct attribute_group amdgpu_board_attrs_group = { 408 .attrs = amdgpu_board_attrs, 409 .is_visible = amdgpu_board_attrs_is_visible 410 }; 411 412 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 413 414 415 /** 416 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 417 * 418 * @dev: drm_device pointer 419 * 420 * Returns true if the device is a dGPU with ATPX power control, 421 * otherwise return false. 422 */ 423 bool amdgpu_device_supports_px(struct drm_device *dev) 424 { 425 struct amdgpu_device *adev = drm_to_adev(dev); 426 427 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 428 return true; 429 return false; 430 } 431 432 /** 433 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 434 * 435 * @dev: drm_device pointer 436 * 437 * Returns true if the device is a dGPU with ACPI power control, 438 * otherwise return false. 439 */ 440 bool amdgpu_device_supports_boco(struct drm_device *dev) 441 { 442 struct amdgpu_device *adev = drm_to_adev(dev); 443 444 if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE)) 445 return false; 446 447 if (adev->has_pr3 || 448 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 449 return true; 450 return false; 451 } 452 453 /** 454 * amdgpu_device_supports_baco - Does the device support BACO 455 * 456 * @dev: drm_device pointer 457 * 458 * Return: 459 * 1 if the device supports BACO; 460 * 3 if the device supports MACO (only works if BACO is supported) 461 * otherwise return 0. 
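 *
 * Example: amdgpu_device_detect_runtime_pm_mode() below treats the returned
 * value as a bit mask, e.g.
 *
 *	int bamaco_support = amdgpu_device_supports_baco(dev);
 *
 *	if (bamaco_support & MACO_SUPPORT)
 *		adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
 *	else if (bamaco_support & BACO_SUPPORT)
 *		adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;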
462 */ 463 int amdgpu_device_supports_baco(struct drm_device *dev) 464 { 465 struct amdgpu_device *adev = drm_to_adev(dev); 466 467 return amdgpu_asic_supports_baco(adev); 468 } 469 470 void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev) 471 { 472 struct drm_device *dev; 473 int bamaco_support; 474 475 dev = adev_to_drm(adev); 476 477 adev->pm.rpm_mode = AMDGPU_RUNPM_NONE; 478 bamaco_support = amdgpu_device_supports_baco(dev); 479 480 switch (amdgpu_runtime_pm) { 481 case 2: 482 if (bamaco_support & MACO_SUPPORT) { 483 adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO; 484 dev_info(adev->dev, "Forcing BAMACO for runtime pm\n"); 485 } else if (bamaco_support == BACO_SUPPORT) { 486 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO; 487 dev_info(adev->dev, "Requested mode BAMACO not available,fallback to use BACO\n"); 488 } 489 break; 490 case 1: 491 if (bamaco_support & BACO_SUPPORT) { 492 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO; 493 dev_info(adev->dev, "Forcing BACO for runtime pm\n"); 494 } 495 break; 496 case -1: 497 case -2: 498 if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */ 499 adev->pm.rpm_mode = AMDGPU_RUNPM_PX; 500 dev_info(adev->dev, "Using ATPX for runtime pm\n"); 501 } else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */ 502 adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO; 503 dev_info(adev->dev, "Using BOCO for runtime pm\n"); 504 } else { 505 if (!bamaco_support) 506 goto no_runtime_pm; 507 508 switch (adev->asic_type) { 509 case CHIP_VEGA20: 510 case CHIP_ARCTURUS: 511 /* BACO are not supported on vega20 and arctrus */ 512 break; 513 case CHIP_VEGA10: 514 /* enable BACO as runpm mode if noretry=0 */ 515 if (!adev->gmc.noretry && !amdgpu_passthrough(adev)) 516 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO; 517 break; 518 default: 519 /* enable BACO as runpm mode on CI+ */ 520 if (!amdgpu_passthrough(adev)) 521 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO; 522 break; 523 } 524 525 if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) { 526 if (bamaco_support & MACO_SUPPORT) { 527 adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO; 528 dev_info(adev->dev, "Using BAMACO for runtime pm\n"); 529 } else { 530 dev_info(adev->dev, "Using BACO for runtime pm\n"); 531 } 532 } 533 } 534 break; 535 case 0: 536 dev_info(adev->dev, "runtime pm is manually disabled\n"); 537 break; 538 default: 539 break; 540 } 541 542 no_runtime_pm: 543 if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE) 544 dev_info(adev->dev, "Runtime PM not available\n"); 545 } 546 /** 547 * amdgpu_device_supports_smart_shift - Is the device dGPU with 548 * smart shift support 549 * 550 * @dev: drm_device pointer 551 * 552 * Returns true if the device is a dGPU with Smart Shift support, 553 * otherwise returns false. 
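 *
 * Example (illustrative sketch):
 *
 *	if (amdgpu_device_supports_smart_shift(adev_to_drm(adev)))
 *		dev_info(adev->dev, "Smart Shift is supported\n");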
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; @buf must point to at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; @buf must point to at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * Returns the number of bytes that have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe to the device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; @buf must point to at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the VRAM aperture to access VRAM first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of VRAM */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
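 *
 * Example (illustrative): the RREG32()/WREG32() style macros in amdgpu.h are
 * built on top of these helpers; a direct call looks like
 *
 *	u32 val = amdgpu_device_rreg(adev, reg, 0);
 *	u32 raw = amdgpu_device_rreg(adev, reg, AMDGPU_REGS_NO_KIQ);
 *
 * where @reg is a dword offset and AMDGPU_REGS_NO_KIQ skips the KIQ path
 * under SR-IOV.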
723 */ 724 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, 725 uint32_t reg, uint32_t acc_flags) 726 { 727 uint32_t ret; 728 729 if (amdgpu_device_skip_hw_access(adev)) 730 return 0; 731 732 if ((reg * 4) < adev->rmmio_size) { 733 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 734 amdgpu_sriov_runtime(adev) && 735 down_read_trylock(&adev->reset_domain->sem)) { 736 ret = amdgpu_kiq_rreg(adev, reg, 0); 737 up_read(&adev->reset_domain->sem); 738 } else { 739 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); 740 } 741 } else { 742 ret = adev->pcie_rreg(adev, reg * 4); 743 } 744 745 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret); 746 747 return ret; 748 } 749 750 /* 751 * MMIO register read with bytes helper functions 752 * @offset:bytes offset from MMIO start 753 */ 754 755 /** 756 * amdgpu_mm_rreg8 - read a memory mapped IO register 757 * 758 * @adev: amdgpu_device pointer 759 * @offset: byte aligned register offset 760 * 761 * Returns the 8 bit value from the offset specified. 762 */ 763 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) 764 { 765 if (amdgpu_device_skip_hw_access(adev)) 766 return 0; 767 768 if (offset < adev->rmmio_size) 769 return (readb(adev->rmmio + offset)); 770 BUG(); 771 } 772 773 774 /** 775 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC 776 * 777 * @adev: amdgpu_device pointer 778 * @reg: dword aligned register offset 779 * @acc_flags: access flags which require special behavior 780 * @xcc_id: xcc accelerated compute core id 781 * 782 * Returns the 32 bit value from the offset specified. 783 */ 784 uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev, 785 uint32_t reg, uint32_t acc_flags, 786 uint32_t xcc_id) 787 { 788 uint32_t ret, rlcg_flag; 789 790 if (amdgpu_device_skip_hw_access(adev)) 791 return 0; 792 793 if ((reg * 4) < adev->rmmio_size) { 794 if (amdgpu_sriov_vf(adev) && 795 !amdgpu_sriov_runtime(adev) && 796 adev->gfx.rlc.rlcg_reg_access_supported && 797 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, 798 GC_HWIP, false, 799 &rlcg_flag)) { 800 ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id)); 801 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 802 amdgpu_sriov_runtime(adev) && 803 down_read_trylock(&adev->reset_domain->sem)) { 804 ret = amdgpu_kiq_rreg(adev, reg, xcc_id); 805 up_read(&adev->reset_domain->sem); 806 } else { 807 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); 808 } 809 } else { 810 ret = adev->pcie_rreg(adev, reg * 4); 811 } 812 813 return ret; 814 } 815 816 /* 817 * MMIO register write with bytes helper functions 818 * @offset:bytes offset from MMIO start 819 * @value: the value want to be written to the register 820 */ 821 822 /** 823 * amdgpu_mm_wreg8 - read a memory mapped IO register 824 * 825 * @adev: amdgpu_device pointer 826 * @offset: byte aligned register offset 827 * @value: 8 bit value to write 828 * 829 * Writes the value specified to the offset specified. 
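 *
 * Example (illustrative):
 *
 *	amdgpu_mm_wreg8(adev, byte_offset, 0x01);
 *
 * @offset is a byte offset from the start of the MMIO aperture.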
830 */ 831 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) 832 { 833 if (amdgpu_device_skip_hw_access(adev)) 834 return; 835 836 if (offset < adev->rmmio_size) 837 writeb(value, adev->rmmio + offset); 838 else 839 BUG(); 840 } 841 842 /** 843 * amdgpu_device_wreg - write to a memory mapped IO or indirect register 844 * 845 * @adev: amdgpu_device pointer 846 * @reg: dword aligned register offset 847 * @v: 32 bit value to write to the register 848 * @acc_flags: access flags which require special behavior 849 * 850 * Writes the value specified to the offset specified. 851 */ 852 void amdgpu_device_wreg(struct amdgpu_device *adev, 853 uint32_t reg, uint32_t v, 854 uint32_t acc_flags) 855 { 856 if (amdgpu_device_skip_hw_access(adev)) 857 return; 858 859 if ((reg * 4) < adev->rmmio_size) { 860 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 861 amdgpu_sriov_runtime(adev) && 862 down_read_trylock(&adev->reset_domain->sem)) { 863 amdgpu_kiq_wreg(adev, reg, v, 0); 864 up_read(&adev->reset_domain->sem); 865 } else { 866 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 867 } 868 } else { 869 adev->pcie_wreg(adev, reg * 4, v); 870 } 871 872 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 873 } 874 875 /** 876 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 877 * 878 * @adev: amdgpu_device pointer 879 * @reg: mmio/rlc register 880 * @v: value to write 881 * @xcc_id: xcc accelerated compute core id 882 * 883 * this function is invoked only for the debugfs register access 884 */ 885 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 886 uint32_t reg, uint32_t v, 887 uint32_t xcc_id) 888 { 889 if (amdgpu_device_skip_hw_access(adev)) 890 return; 891 892 if (amdgpu_sriov_fullaccess(adev) && 893 adev->gfx.rlc.funcs && 894 adev->gfx.rlc.funcs->is_rlcg_access_range) { 895 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 896 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id); 897 } else if ((reg * 4) >= adev->rmmio_size) { 898 adev->pcie_wreg(adev, reg * 4, v); 899 } else { 900 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 901 } 902 } 903 904 /** 905 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC 906 * 907 * @adev: amdgpu_device pointer 908 * @reg: dword aligned register offset 909 * @v: 32 bit value to write to the register 910 * @acc_flags: access flags which require special behavior 911 * @xcc_id: xcc accelerated compute core id 912 * 913 * Writes the value specified to the offset specified. 
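 *
 * Example (illustrative): writing a GC register on a particular partition:
 *
 *	amdgpu_device_xcc_wreg(adev, reg, value, 0, xcc_id);
 *
 * The @xcc_id is translated to the physical GC instance internally via
 * GET_INST(GC, xcc_id).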
914 */ 915 void amdgpu_device_xcc_wreg(struct amdgpu_device *adev, 916 uint32_t reg, uint32_t v, 917 uint32_t acc_flags, uint32_t xcc_id) 918 { 919 uint32_t rlcg_flag; 920 921 if (amdgpu_device_skip_hw_access(adev)) 922 return; 923 924 if ((reg * 4) < adev->rmmio_size) { 925 if (amdgpu_sriov_vf(adev) && 926 !amdgpu_sriov_runtime(adev) && 927 adev->gfx.rlc.rlcg_reg_access_supported && 928 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, 929 GC_HWIP, true, 930 &rlcg_flag)) { 931 amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id)); 932 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 933 amdgpu_sriov_runtime(adev) && 934 down_read_trylock(&adev->reset_domain->sem)) { 935 amdgpu_kiq_wreg(adev, reg, v, xcc_id); 936 up_read(&adev->reset_domain->sem); 937 } else { 938 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 939 } 940 } else { 941 adev->pcie_wreg(adev, reg * 4, v); 942 } 943 } 944 945 /** 946 * amdgpu_device_indirect_rreg - read an indirect register 947 * 948 * @adev: amdgpu_device pointer 949 * @reg_addr: indirect register address to read from 950 * 951 * Returns the value of indirect register @reg_addr 952 */ 953 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 954 u32 reg_addr) 955 { 956 unsigned long flags, pcie_index, pcie_data; 957 void __iomem *pcie_index_offset; 958 void __iomem *pcie_data_offset; 959 u32 r; 960 961 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 962 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 963 964 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 965 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 966 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 967 968 writel(reg_addr, pcie_index_offset); 969 readl(pcie_index_offset); 970 r = readl(pcie_data_offset); 971 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 972 973 return r; 974 } 975 976 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev, 977 u64 reg_addr) 978 { 979 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 980 u32 r; 981 void __iomem *pcie_index_offset; 982 void __iomem *pcie_index_hi_offset; 983 void __iomem *pcie_data_offset; 984 985 if (unlikely(!adev->nbio.funcs)) { 986 pcie_index = AMDGPU_PCIE_INDEX_FALLBACK; 987 pcie_data = AMDGPU_PCIE_DATA_FALLBACK; 988 } else { 989 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 990 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 991 } 992 993 if (reg_addr >> 32) { 994 if (unlikely(!adev->nbio.funcs)) 995 pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK; 996 else 997 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 998 } else { 999 pcie_index_hi = 0; 1000 } 1001 1002 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1003 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1004 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1005 if (pcie_index_hi != 0) 1006 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1007 pcie_index_hi * 4; 1008 1009 writel(reg_addr, pcie_index_offset); 1010 readl(pcie_index_offset); 1011 if (pcie_index_hi != 0) { 1012 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1013 readl(pcie_index_hi_offset); 1014 } 1015 r = readl(pcie_data_offset); 1016 1017 /* clear the high bits */ 1018 if (pcie_index_hi != 0) { 1019 writel(0, pcie_index_hi_offset); 1020 readl(pcie_index_hi_offset); 1021 } 1022 1023 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1024 1025 return r; 1026 } 1027 1028 /** 1029 * amdgpu_device_indirect_rreg64 - read a 64bits indirect 
register 1030 * 1031 * @adev: amdgpu_device pointer 1032 * @reg_addr: indirect register address to read from 1033 * 1034 * Returns the value of indirect register @reg_addr 1035 */ 1036 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 1037 u32 reg_addr) 1038 { 1039 unsigned long flags, pcie_index, pcie_data; 1040 void __iomem *pcie_index_offset; 1041 void __iomem *pcie_data_offset; 1042 u64 r; 1043 1044 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1045 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1046 1047 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1048 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1049 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1050 1051 /* read low 32 bits */ 1052 writel(reg_addr, pcie_index_offset); 1053 readl(pcie_index_offset); 1054 r = readl(pcie_data_offset); 1055 /* read high 32 bits */ 1056 writel(reg_addr + 4, pcie_index_offset); 1057 readl(pcie_index_offset); 1058 r |= ((u64)readl(pcie_data_offset) << 32); 1059 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1060 1061 return r; 1062 } 1063 1064 u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev, 1065 u64 reg_addr) 1066 { 1067 unsigned long flags, pcie_index, pcie_data; 1068 unsigned long pcie_index_hi = 0; 1069 void __iomem *pcie_index_offset; 1070 void __iomem *pcie_index_hi_offset; 1071 void __iomem *pcie_data_offset; 1072 u64 r; 1073 1074 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1075 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1076 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1077 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1078 1079 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1080 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1081 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1082 if (pcie_index_hi != 0) 1083 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1084 pcie_index_hi * 4; 1085 1086 /* read low 32 bits */ 1087 writel(reg_addr, pcie_index_offset); 1088 readl(pcie_index_offset); 1089 if (pcie_index_hi != 0) { 1090 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1091 readl(pcie_index_hi_offset); 1092 } 1093 r = readl(pcie_data_offset); 1094 /* read high 32 bits */ 1095 writel(reg_addr + 4, pcie_index_offset); 1096 readl(pcie_index_offset); 1097 if (pcie_index_hi != 0) { 1098 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1099 readl(pcie_index_hi_offset); 1100 } 1101 r |= ((u64)readl(pcie_data_offset) << 32); 1102 1103 /* clear the high bits */ 1104 if (pcie_index_hi != 0) { 1105 writel(0, pcie_index_hi_offset); 1106 readl(pcie_index_hi_offset); 1107 } 1108 1109 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1110 1111 return r; 1112 } 1113 1114 /** 1115 * amdgpu_device_indirect_wreg - write an indirect register address 1116 * 1117 * @adev: amdgpu_device pointer 1118 * @reg_addr: indirect register offset 1119 * @reg_data: indirect register data 1120 * 1121 */ 1122 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, 1123 u32 reg_addr, u32 reg_data) 1124 { 1125 unsigned long flags, pcie_index, pcie_data; 1126 void __iomem *pcie_index_offset; 1127 void __iomem *pcie_data_offset; 1128 1129 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1130 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1131 1132 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1133 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1134 
pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1135 1136 writel(reg_addr, pcie_index_offset); 1137 readl(pcie_index_offset); 1138 writel(reg_data, pcie_data_offset); 1139 readl(pcie_data_offset); 1140 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1141 } 1142 1143 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev, 1144 u64 reg_addr, u32 reg_data) 1145 { 1146 unsigned long flags, pcie_index, pcie_index_hi, pcie_data; 1147 void __iomem *pcie_index_offset; 1148 void __iomem *pcie_index_hi_offset; 1149 void __iomem *pcie_data_offset; 1150 1151 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1152 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1153 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1154 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1155 else 1156 pcie_index_hi = 0; 1157 1158 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1159 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1160 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1161 if (pcie_index_hi != 0) 1162 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1163 pcie_index_hi * 4; 1164 1165 writel(reg_addr, pcie_index_offset); 1166 readl(pcie_index_offset); 1167 if (pcie_index_hi != 0) { 1168 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1169 readl(pcie_index_hi_offset); 1170 } 1171 writel(reg_data, pcie_data_offset); 1172 readl(pcie_data_offset); 1173 1174 /* clear the high bits */ 1175 if (pcie_index_hi != 0) { 1176 writel(0, pcie_index_hi_offset); 1177 readl(pcie_index_hi_offset); 1178 } 1179 1180 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1181 } 1182 1183 /** 1184 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address 1185 * 1186 * @adev: amdgpu_device pointer 1187 * @reg_addr: indirect register offset 1188 * @reg_data: indirect register data 1189 * 1190 */ 1191 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, 1192 u32 reg_addr, u64 reg_data) 1193 { 1194 unsigned long flags, pcie_index, pcie_data; 1195 void __iomem *pcie_index_offset; 1196 void __iomem *pcie_data_offset; 1197 1198 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1199 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1200 1201 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1202 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1203 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1204 1205 /* write low 32 bits */ 1206 writel(reg_addr, pcie_index_offset); 1207 readl(pcie_index_offset); 1208 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1209 readl(pcie_data_offset); 1210 /* write high 32 bits */ 1211 writel(reg_addr + 4, pcie_index_offset); 1212 readl(pcie_index_offset); 1213 writel((u32)(reg_data >> 32), pcie_data_offset); 1214 readl(pcie_data_offset); 1215 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1216 } 1217 1218 void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev, 1219 u64 reg_addr, u64 reg_data) 1220 { 1221 unsigned long flags, pcie_index, pcie_data; 1222 unsigned long pcie_index_hi = 0; 1223 void __iomem *pcie_index_offset; 1224 void __iomem *pcie_index_hi_offset; 1225 void __iomem *pcie_data_offset; 1226 1227 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 1228 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 1229 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) 1230 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 1231 
1232 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 1233 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 1234 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 1235 if (pcie_index_hi != 0) 1236 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 1237 pcie_index_hi * 4; 1238 1239 /* write low 32 bits */ 1240 writel(reg_addr, pcie_index_offset); 1241 readl(pcie_index_offset); 1242 if (pcie_index_hi != 0) { 1243 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1244 readl(pcie_index_hi_offset); 1245 } 1246 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); 1247 readl(pcie_data_offset); 1248 /* write high 32 bits */ 1249 writel(reg_addr + 4, pcie_index_offset); 1250 readl(pcie_index_offset); 1251 if (pcie_index_hi != 0) { 1252 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 1253 readl(pcie_index_hi_offset); 1254 } 1255 writel((u32)(reg_data >> 32), pcie_data_offset); 1256 readl(pcie_data_offset); 1257 1258 /* clear the high bits */ 1259 if (pcie_index_hi != 0) { 1260 writel(0, pcie_index_hi_offset); 1261 readl(pcie_index_hi_offset); 1262 } 1263 1264 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 1265 } 1266 1267 /** 1268 * amdgpu_device_get_rev_id - query device rev_id 1269 * 1270 * @adev: amdgpu_device pointer 1271 * 1272 * Return device rev_id 1273 */ 1274 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev) 1275 { 1276 return adev->nbio.funcs->get_rev_id(adev); 1277 } 1278 1279 /** 1280 * amdgpu_invalid_rreg - dummy reg read function 1281 * 1282 * @adev: amdgpu_device pointer 1283 * @reg: offset of register 1284 * 1285 * Dummy register read function. Used for register blocks 1286 * that certain asics don't have (all asics). 1287 * Returns the value in the register. 1288 */ 1289 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) 1290 { 1291 dev_err(adev->dev, "Invalid callback to read register 0x%04X\n", reg); 1292 BUG(); 1293 return 0; 1294 } 1295 1296 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg) 1297 { 1298 dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg); 1299 BUG(); 1300 return 0; 1301 } 1302 1303 /** 1304 * amdgpu_invalid_wreg - dummy reg write function 1305 * 1306 * @adev: amdgpu_device pointer 1307 * @reg: offset of register 1308 * @v: value to write to the register 1309 * 1310 * Dummy register read function. Used for register blocks 1311 * that certain asics don't have (all asics). 1312 */ 1313 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) 1314 { 1315 dev_err(adev->dev, 1316 "Invalid callback to write register 0x%04X with 0x%08X\n", reg, 1317 v); 1318 BUG(); 1319 } 1320 1321 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v) 1322 { 1323 dev_err(adev->dev, 1324 "Invalid callback to write register 0x%llX with 0x%08X\n", reg, 1325 v); 1326 BUG(); 1327 } 1328 1329 /** 1330 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function 1331 * 1332 * @adev: amdgpu_device pointer 1333 * @reg: offset of register 1334 * 1335 * Dummy register read function. Used for register blocks 1336 * that certain asics don't have (all asics). 1337 * Returns the value in the register. 
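 *
 * These dummy handlers are typically installed as safe defaults during early
 * device init, before the ASIC specific callbacks are hooked up, e.g.
 * (sketch):
 *
 *	adev->pcie_rreg64 = &amdgpu_invalid_rreg64;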
1338 */ 1339 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) 1340 { 1341 dev_err(adev->dev, "Invalid callback to read 64 bit register 0x%04X\n", 1342 reg); 1343 BUG(); 1344 return 0; 1345 } 1346 1347 static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg) 1348 { 1349 dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg); 1350 BUG(); 1351 return 0; 1352 } 1353 1354 /** 1355 * amdgpu_invalid_wreg64 - dummy reg write function 1356 * 1357 * @adev: amdgpu_device pointer 1358 * @reg: offset of register 1359 * @v: value to write to the register 1360 * 1361 * Dummy register read function. Used for register blocks 1362 * that certain asics don't have (all asics). 1363 */ 1364 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v) 1365 { 1366 dev_err(adev->dev, 1367 "Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n", 1368 reg, v); 1369 BUG(); 1370 } 1371 1372 static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v) 1373 { 1374 dev_err(adev->dev, 1375 "Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n", 1376 reg, v); 1377 BUG(); 1378 } 1379 1380 /** 1381 * amdgpu_block_invalid_rreg - dummy reg read function 1382 * 1383 * @adev: amdgpu_device pointer 1384 * @block: offset of instance 1385 * @reg: offset of register 1386 * 1387 * Dummy register read function. Used for register blocks 1388 * that certain asics don't have (all asics). 1389 * Returns the value in the register. 1390 */ 1391 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 1392 uint32_t block, uint32_t reg) 1393 { 1394 dev_err(adev->dev, 1395 "Invalid callback to read register 0x%04X in block 0x%04X\n", 1396 reg, block); 1397 BUG(); 1398 return 0; 1399 } 1400 1401 /** 1402 * amdgpu_block_invalid_wreg - dummy reg write function 1403 * 1404 * @adev: amdgpu_device pointer 1405 * @block: offset of instance 1406 * @reg: offset of register 1407 * @v: value to write to the register 1408 * 1409 * Dummy register read function. Used for register blocks 1410 * that certain asics don't have (all asics). 1411 */ 1412 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 1413 uint32_t block, 1414 uint32_t reg, uint32_t v) 1415 { 1416 dev_err(adev->dev, 1417 "Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 1418 reg, block, v); 1419 BUG(); 1420 } 1421 1422 static uint32_t amdgpu_device_get_vbios_flags(struct amdgpu_device *adev) 1423 { 1424 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU)) 1425 return AMDGPU_VBIOS_SKIP; 1426 1427 if (hweight32(adev->aid_mask) && amdgpu_passthrough(adev)) 1428 return AMDGPU_VBIOS_OPTIONAL; 1429 1430 return 0; 1431 } 1432 1433 /** 1434 * amdgpu_device_asic_init - Wrapper for atom asic_init 1435 * 1436 * @adev: amdgpu_device pointer 1437 * 1438 * Does any asic specific work and then calls atom asic init. 
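 *
 * If the VBIOS is marked AMDGPU_VBIOS_SKIP or AMDGPU_VBIOS_OPTIONAL and no
 * image was found (adev->bios is NULL), initialization is skipped and 0 is
 * returned.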
1439 */ 1440 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 1441 { 1442 uint32_t flags; 1443 bool optional; 1444 int ret; 1445 1446 amdgpu_asic_pre_asic_init(adev); 1447 flags = amdgpu_device_get_vbios_flags(adev); 1448 optional = !!(flags & (AMDGPU_VBIOS_OPTIONAL | AMDGPU_VBIOS_SKIP)); 1449 1450 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 1451 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 1452 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) || 1453 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) { 1454 amdgpu_psp_wait_for_bootloader(adev); 1455 if (optional && !adev->bios) 1456 return 0; 1457 1458 ret = amdgpu_atomfirmware_asic_init(adev, true); 1459 return ret; 1460 } else { 1461 if (optional && !adev->bios) 1462 return 0; 1463 1464 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 1465 } 1466 1467 return 0; 1468 } 1469 1470 /** 1471 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page 1472 * 1473 * @adev: amdgpu_device pointer 1474 * 1475 * Allocates a scratch page of VRAM for use by various things in the 1476 * driver. 1477 */ 1478 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev) 1479 { 1480 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE, 1481 AMDGPU_GEM_DOMAIN_VRAM | 1482 AMDGPU_GEM_DOMAIN_GTT, 1483 &adev->mem_scratch.robj, 1484 &adev->mem_scratch.gpu_addr, 1485 (void **)&adev->mem_scratch.ptr); 1486 } 1487 1488 /** 1489 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page 1490 * 1491 * @adev: amdgpu_device pointer 1492 * 1493 * Frees the VRAM scratch page. 1494 */ 1495 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev) 1496 { 1497 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL); 1498 } 1499 1500 /** 1501 * amdgpu_device_program_register_sequence - program an array of registers. 1502 * 1503 * @adev: amdgpu_device pointer 1504 * @registers: pointer to the register array 1505 * @array_size: size of the register array 1506 * 1507 * Programs an array or registers with and or masks. 1508 * This is a helper for setting golden registers. 1509 */ 1510 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 1511 const u32 *registers, 1512 const u32 array_size) 1513 { 1514 u32 tmp, reg, and_mask, or_mask; 1515 int i; 1516 1517 if (array_size % 3) 1518 return; 1519 1520 for (i = 0; i < array_size; i += 3) { 1521 reg = registers[i + 0]; 1522 and_mask = registers[i + 1]; 1523 or_mask = registers[i + 2]; 1524 1525 if (and_mask == 0xffffffff) { 1526 tmp = or_mask; 1527 } else { 1528 tmp = RREG32(reg); 1529 tmp &= ~and_mask; 1530 if (adev->family >= AMDGPU_FAMILY_AI) 1531 tmp |= (or_mask & and_mask); 1532 else 1533 tmp |= or_mask; 1534 } 1535 WREG32(reg, tmp); 1536 } 1537 } 1538 1539 /** 1540 * amdgpu_device_pci_config_reset - reset the GPU 1541 * 1542 * @adev: amdgpu_device pointer 1543 * 1544 * Resets the GPU using the pci config reset sequence. 1545 * Only applicable to asics prior to vega10. 1546 */ 1547 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 1548 { 1549 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 1550 } 1551 1552 /** 1553 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 1554 * 1555 * @adev: amdgpu_device pointer 1556 * 1557 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 
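 *
 * Return: 0 on success or a negative error code from pci_reset_function().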
1558 */ 1559 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 1560 { 1561 return pci_reset_function(adev->pdev); 1562 } 1563 1564 /* 1565 * amdgpu_device_wb_*() 1566 * Writeback is the method by which the GPU updates special pages in memory 1567 * with the status of certain GPU events (fences, ring pointers,etc.). 1568 */ 1569 1570 /** 1571 * amdgpu_device_wb_fini - Disable Writeback and free memory 1572 * 1573 * @adev: amdgpu_device pointer 1574 * 1575 * Disables Writeback and frees the Writeback memory (all asics). 1576 * Used at driver shutdown. 1577 */ 1578 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1579 { 1580 if (adev->wb.wb_obj) { 1581 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1582 &adev->wb.gpu_addr, 1583 (void **)&adev->wb.wb); 1584 adev->wb.wb_obj = NULL; 1585 } 1586 } 1587 1588 /** 1589 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory 1590 * 1591 * @adev: amdgpu_device pointer 1592 * 1593 * Initializes writeback and allocates writeback memory (all asics). 1594 * Used at driver startup. 1595 * Returns 0 on success or an -error on failure. 1596 */ 1597 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1598 { 1599 int r; 1600 1601 if (adev->wb.wb_obj == NULL) { 1602 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1603 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1604 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1605 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1606 (void **)&adev->wb.wb); 1607 if (r) { 1608 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1609 return r; 1610 } 1611 1612 adev->wb.num_wb = AMDGPU_MAX_WB; 1613 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1614 1615 /* clear wb memory */ 1616 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1617 } 1618 1619 return 0; 1620 } 1621 1622 /** 1623 * amdgpu_device_wb_get - Allocate a wb entry 1624 * 1625 * @adev: amdgpu_device pointer 1626 * @wb: wb index 1627 * 1628 * Allocate a wb slot for use by the driver (all asics). 1629 * Returns 0 on success or -EINVAL on failure. 1630 */ 1631 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1632 { 1633 unsigned long flags, offset; 1634 1635 spin_lock_irqsave(&adev->wb.lock, flags); 1636 offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1637 if (offset < adev->wb.num_wb) { 1638 __set_bit(offset, adev->wb.used); 1639 spin_unlock_irqrestore(&adev->wb.lock, flags); 1640 *wb = offset << 3; /* convert to dw offset */ 1641 return 0; 1642 } else { 1643 spin_unlock_irqrestore(&adev->wb.lock, flags); 1644 return -EINVAL; 1645 } 1646 } 1647 1648 /** 1649 * amdgpu_device_wb_free - Free a wb entry 1650 * 1651 * @adev: amdgpu_device pointer 1652 * @wb: wb index 1653 * 1654 * Free a wb slot allocated for use by the driver (all asics) 1655 */ 1656 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1657 { 1658 unsigned long flags; 1659 1660 wb >>= 3; 1661 spin_lock_irqsave(&adev->wb.lock, flags); 1662 if (wb < adev->wb.num_wb) 1663 __clear_bit(wb, adev->wb.used); 1664 spin_unlock_irqrestore(&adev->wb.lock, flags); 1665 } 1666 1667 /** 1668 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1669 * 1670 * @adev: amdgpu_device pointer 1671 * 1672 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1673 * to fail, but if any of the BARs is not accessible after the size we abort 1674 * driver loading by returning -ENODEV. 
1675 */ 1676 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) 1677 { 1678 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); 1679 struct pci_bus *root; 1680 struct resource *res; 1681 unsigned int i; 1682 u16 cmd; 1683 int r; 1684 1685 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) 1686 return 0; 1687 1688 /* Bypass for VF */ 1689 if (amdgpu_sriov_vf(adev)) 1690 return 0; 1691 1692 if (!amdgpu_rebar) 1693 return 0; 1694 1695 /* resizing on Dell G5 SE platforms causes problems with runtime pm */ 1696 if ((amdgpu_runtime_pm != 0) && 1697 adev->pdev->vendor == PCI_VENDOR_ID_ATI && 1698 adev->pdev->device == 0x731f && 1699 adev->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL) 1700 return 0; 1701 1702 /* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */ 1703 if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR)) 1704 dev_warn( 1705 adev->dev, 1706 "System can't access extended configuration space, please check!!\n"); 1707 1708 /* skip if the bios has already enabled large BAR */ 1709 if (adev->gmc.real_vram_size && 1710 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) 1711 return 0; 1712 1713 /* Check if the root BUS has 64bit memory resources */ 1714 root = adev->pdev->bus; 1715 while (root->parent) 1716 root = root->parent; 1717 1718 pci_bus_for_each_resource(root, res, i) { 1719 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) && 1720 res->start > 0x100000000ull) 1721 break; 1722 } 1723 1724 /* Trying to resize is pointless without a root hub window above 4GB */ 1725 if (!res) 1726 return 0; 1727 1728 /* Limit the BAR size to what is available */ 1729 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1, 1730 rbar_size); 1731 1732 /* Disable memory decoding while we change the BAR addresses and size */ 1733 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd); 1734 pci_write_config_word(adev->pdev, PCI_COMMAND, 1735 cmd & ~PCI_COMMAND_MEMORY); 1736 1737 /* Free the VRAM and doorbell BAR, we most likely need to move both. */ 1738 amdgpu_doorbell_fini(adev); 1739 if (adev->asic_type >= CHIP_BONAIRE) 1740 pci_release_resource(adev->pdev, 2); 1741 1742 pci_release_resource(adev->pdev, 0); 1743 1744 r = pci_resize_resource(adev->pdev, 0, rbar_size); 1745 if (r == -ENOSPC) 1746 dev_info(adev->dev, 1747 "Not enough PCI address space for a large BAR."); 1748 else if (r && r != -ENOTSUPP) 1749 dev_err(adev->dev, "Problem resizing BAR0 (%d).", r); 1750 1751 pci_assign_unassigned_bus_resources(adev->pdev->bus); 1752 1753 /* When the doorbell or fb BAR isn't available we have no chance of 1754 * using the device. 1755 */ 1756 r = amdgpu_doorbell_init(adev); 1757 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) 1758 return -ENODEV; 1759 1760 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd); 1761 1762 return 0; 1763 } 1764 1765 /* 1766 * GPU helpers function. 1767 */ 1768 /** 1769 * amdgpu_device_need_post - check if the hw need post or not 1770 * 1771 * @adev: amdgpu_device pointer 1772 * 1773 * Check if the asic has been initialized (all asics) at driver startup 1774 * or post is needed if hw reset is performed. 1775 * Returns true if need or false if not. 
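 *
 * Example (illustrative): the init path uses this roughly as
 *
 *	if (amdgpu_device_need_post(adev))
 *		amdgpu_device_asic_init(adev);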
1776 */ 1777 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1778 { 1779 uint32_t reg, flags; 1780 1781 if (amdgpu_sriov_vf(adev)) 1782 return false; 1783 1784 flags = amdgpu_device_get_vbios_flags(adev); 1785 if (flags & AMDGPU_VBIOS_SKIP) 1786 return false; 1787 if ((flags & AMDGPU_VBIOS_OPTIONAL) && !adev->bios) 1788 return false; 1789 1790 if (amdgpu_passthrough(adev)) { 1791 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1792 * some old smc fw still need driver do vPost otherwise gpu hang, while 1793 * those smc fw version above 22.15 doesn't have this flaw, so we force 1794 * vpost executed for smc version below 22.15 1795 */ 1796 if (adev->asic_type == CHIP_FIJI) { 1797 int err; 1798 uint32_t fw_ver; 1799 1800 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1801 /* force vPost if error occurred */ 1802 if (err) 1803 return true; 1804 1805 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1806 release_firmware(adev->pm.fw); 1807 if (fw_ver < 0x00160e00) 1808 return true; 1809 } 1810 } 1811 1812 /* Don't post if we need to reset whole hive on init */ 1813 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 1814 return false; 1815 1816 if (adev->has_hw_reset) { 1817 adev->has_hw_reset = false; 1818 return true; 1819 } 1820 1821 /* bios scratch used on CIK+ */ 1822 if (adev->asic_type >= CHIP_BONAIRE) 1823 return amdgpu_atombios_scratch_need_asic_init(adev); 1824 1825 /* check MEM_SIZE for older asics */ 1826 reg = amdgpu_asic_get_config_memsize(adev); 1827 1828 if ((reg != 0) && (reg != 0xffffffff)) 1829 return false; 1830 1831 return true; 1832 } 1833 1834 /* 1835 * Check whether seamless boot is supported. 1836 * 1837 * So far we only support seamless boot on DCE 3.0 or later. 1838 * If users report that it works on older ASICS as well, we may 1839 * loosen this. 1840 */ 1841 bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev) 1842 { 1843 switch (amdgpu_seamless) { 1844 case -1: 1845 break; 1846 case 1: 1847 return true; 1848 case 0: 1849 return false; 1850 default: 1851 dev_err(adev->dev, "Invalid value for amdgpu.seamless: %d\n", 1852 amdgpu_seamless); 1853 return false; 1854 } 1855 1856 if (!(adev->flags & AMD_IS_APU)) 1857 return false; 1858 1859 if (adev->mman.keep_stolen_vga_memory) 1860 return false; 1861 1862 return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0); 1863 } 1864 1865 /* 1866 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids 1867 * don't support dynamic speed switching. Until we have confirmation from Intel 1868 * that a specific host supports it, it's safer that we keep it disabled for all. 
1869 * 1870 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/ 1871 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663 1872 */ 1873 static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev) 1874 { 1875 #if IS_ENABLED(CONFIG_X86) 1876 struct cpuinfo_x86 *c = &cpu_data(0); 1877 1878 /* eGPU change speeds based on USB4 fabric conditions */ 1879 if (dev_is_removable(adev->dev)) 1880 return true; 1881 1882 if (c->x86_vendor == X86_VENDOR_INTEL) 1883 return false; 1884 #endif 1885 return true; 1886 } 1887 1888 static bool amdgpu_device_aspm_support_quirk(struct amdgpu_device *adev) 1889 { 1890 #if IS_ENABLED(CONFIG_X86) 1891 struct cpuinfo_x86 *c = &cpu_data(0); 1892 1893 if (!(amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 0) || 1894 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 1))) 1895 return false; 1896 1897 if (c->x86 == 6 && 1898 adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5) { 1899 switch (c->x86_model) { 1900 case VFM_MODEL(INTEL_ALDERLAKE): 1901 case VFM_MODEL(INTEL_ALDERLAKE_L): 1902 case VFM_MODEL(INTEL_RAPTORLAKE): 1903 case VFM_MODEL(INTEL_RAPTORLAKE_P): 1904 case VFM_MODEL(INTEL_RAPTORLAKE_S): 1905 return true; 1906 default: 1907 return false; 1908 } 1909 } else { 1910 return false; 1911 } 1912 #else 1913 return false; 1914 #endif 1915 } 1916 1917 /** 1918 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1919 * 1920 * @adev: amdgpu_device pointer 1921 * 1922 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1923 * be set for this device. 1924 * 1925 * Returns true if it should be used or false if not. 1926 */ 1927 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1928 { 1929 switch (amdgpu_aspm) { 1930 case -1: 1931 break; 1932 case 0: 1933 return false; 1934 case 1: 1935 return true; 1936 default: 1937 return false; 1938 } 1939 if (adev->flags & AMD_IS_APU) 1940 return false; 1941 if (amdgpu_device_aspm_support_quirk(adev)) 1942 return false; 1943 return pcie_aspm_enabled(adev->pdev); 1944 } 1945 1946 /* if we get transitioned to only one device, take VGA back */ 1947 /** 1948 * amdgpu_device_vga_set_decode - enable/disable vga decode 1949 * 1950 * @pdev: PCI device pointer 1951 * @state: enable/disable vga decode 1952 * 1953 * Enable/disable vga decode (all asics). 1954 * Returns VGA resource flags. 1955 */ 1956 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1957 bool state) 1958 { 1959 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1960 1961 amdgpu_asic_set_vga_state(adev, state); 1962 if (state) 1963 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1964 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1965 else 1966 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1967 } 1968 1969 /** 1970 * amdgpu_device_check_block_size - validate the vm block size 1971 * 1972 * @adev: amdgpu_device pointer 1973 * 1974 * Validates the vm block size specified via module parameter. 1975 * The vm block size defines number of bits in page table versus page directory, 1976 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1977 * page table and the remaining bits are in the page directory. 
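 *
 * For example, amdgpu_vm_block_size=9 means each page table covers
 * 2^9 pages x 4KB = 2MB of address space.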
1978 */ 1979 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1980 { 1981 /* defines number of bits in page table versus page directory, 1982 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1983 * page table and the remaining bits are in the page directory 1984 */ 1985 if (amdgpu_vm_block_size == -1) 1986 return; 1987 1988 if (amdgpu_vm_block_size < 9) { 1989 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1990 amdgpu_vm_block_size); 1991 amdgpu_vm_block_size = -1; 1992 } 1993 } 1994 1995 /** 1996 * amdgpu_device_check_vm_size - validate the vm size 1997 * 1998 * @adev: amdgpu_device pointer 1999 * 2000 * Validates the vm size in GB specified via module parameter. 2001 * The VM size is the size of the GPU virtual memory space in GB. 2002 */ 2003 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 2004 { 2005 /* no need to check the default value */ 2006 if (amdgpu_vm_size == -1) 2007 return; 2008 2009 if (amdgpu_vm_size < 1) { 2010 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 2011 amdgpu_vm_size); 2012 amdgpu_vm_size = -1; 2013 } 2014 } 2015 2016 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 2017 { 2018 struct sysinfo si; 2019 bool is_os_64 = (sizeof(void *) == 8); 2020 uint64_t total_memory; 2021 uint64_t dram_size_seven_GB = 0x1B8000000; 2022 uint64_t dram_size_three_GB = 0xB8000000; 2023 2024 if (amdgpu_smu_memory_pool_size == 0) 2025 return; 2026 2027 if (!is_os_64) { 2028 dev_warn(adev->dev, "Not 64-bit OS, feature not supported\n"); 2029 goto def_value; 2030 } 2031 si_meminfo(&si); 2032 total_memory = (uint64_t)si.totalram * si.mem_unit; 2033 2034 if ((amdgpu_smu_memory_pool_size == 1) || 2035 (amdgpu_smu_memory_pool_size == 2)) { 2036 if (total_memory < dram_size_three_GB) 2037 goto def_value1; 2038 } else if ((amdgpu_smu_memory_pool_size == 4) || 2039 (amdgpu_smu_memory_pool_size == 8)) { 2040 if (total_memory < dram_size_seven_GB) 2041 goto def_value1; 2042 } else { 2043 dev_warn(adev->dev, "Smu memory pool size not supported\n"); 2044 goto def_value; 2045 } 2046 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 2047 2048 return; 2049 2050 def_value1: 2051 dev_warn(adev->dev, "No enough system memory\n"); 2052 def_value: 2053 adev->pm.smu_prv_buffer_size = 0; 2054 } 2055 2056 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 2057 { 2058 if (!(adev->flags & AMD_IS_APU) || 2059 adev->asic_type < CHIP_RAVEN) 2060 return 0; 2061 2062 switch (adev->asic_type) { 2063 case CHIP_RAVEN: 2064 if (adev->pdev->device == 0x15dd) 2065 adev->apu_flags |= AMD_APU_IS_RAVEN; 2066 if (adev->pdev->device == 0x15d8) 2067 adev->apu_flags |= AMD_APU_IS_PICASSO; 2068 break; 2069 case CHIP_RENOIR: 2070 if ((adev->pdev->device == 0x1636) || 2071 (adev->pdev->device == 0x164c)) 2072 adev->apu_flags |= AMD_APU_IS_RENOIR; 2073 else 2074 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 2075 break; 2076 case CHIP_VANGOGH: 2077 adev->apu_flags |= AMD_APU_IS_VANGOGH; 2078 break; 2079 case CHIP_YELLOW_CARP: 2080 break; 2081 case CHIP_CYAN_SKILLFISH: 2082 if ((adev->pdev->device == 0x13FE) || 2083 (adev->pdev->device == 0x143F)) 2084 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 2085 break; 2086 default: 2087 break; 2088 } 2089 2090 return 0; 2091 } 2092 2093 /** 2094 * amdgpu_device_check_arguments - validate module params 2095 * 2096 * @adev: amdgpu_device pointer 2097 * 2098 * Validates certain module parameters and updates 2099 * the associated values used by the 
driver (all asics). 2100 */ 2101 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 2102 { 2103 int i; 2104 2105 if (amdgpu_sched_jobs < 4) { 2106 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 2107 amdgpu_sched_jobs); 2108 amdgpu_sched_jobs = 4; 2109 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 2110 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 2111 amdgpu_sched_jobs); 2112 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 2113 } 2114 2115 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 2116 /* gart size must be greater or equal to 32M */ 2117 dev_warn(adev->dev, "gart size (%d) too small\n", 2118 amdgpu_gart_size); 2119 amdgpu_gart_size = -1; 2120 } 2121 2122 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 2123 /* gtt size must be greater or equal to 32M */ 2124 dev_warn(adev->dev, "gtt size (%d) too small\n", 2125 amdgpu_gtt_size); 2126 amdgpu_gtt_size = -1; 2127 } 2128 2129 /* valid range is between 4 and 9 inclusive */ 2130 if (amdgpu_vm_fragment_size != -1 && 2131 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 2132 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 2133 amdgpu_vm_fragment_size = -1; 2134 } 2135 2136 if (amdgpu_sched_hw_submission < 2) { 2137 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 2138 amdgpu_sched_hw_submission); 2139 amdgpu_sched_hw_submission = 2; 2140 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 2141 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 2142 amdgpu_sched_hw_submission); 2143 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 2144 } 2145 2146 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 2147 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 2148 amdgpu_reset_method = -1; 2149 } 2150 2151 amdgpu_device_check_smu_prv_buffer_size(adev); 2152 2153 amdgpu_device_check_vm_size(adev); 2154 2155 amdgpu_device_check_block_size(adev); 2156 2157 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 2158 2159 for (i = 0; i < MAX_XCP; i++) { 2160 switch (amdgpu_enforce_isolation) { 2161 case -1: 2162 case 0: 2163 default: 2164 /* disable */ 2165 adev->enforce_isolation[i] = AMDGPU_ENFORCE_ISOLATION_DISABLE; 2166 break; 2167 case 1: 2168 /* enable */ 2169 adev->enforce_isolation[i] = 2170 AMDGPU_ENFORCE_ISOLATION_ENABLE; 2171 break; 2172 case 2: 2173 /* enable legacy mode */ 2174 adev->enforce_isolation[i] = 2175 AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY; 2176 break; 2177 case 3: 2178 /* enable only process isolation without submitting cleaner shader */ 2179 adev->enforce_isolation[i] = 2180 AMDGPU_ENFORCE_ISOLATION_NO_CLEANER_SHADER; 2181 break; 2182 } 2183 } 2184 2185 return 0; 2186 } 2187 2188 /** 2189 * amdgpu_switcheroo_set_state - set switcheroo state 2190 * 2191 * @pdev: pci dev pointer 2192 * @state: vga_switcheroo state 2193 * 2194 * Callback for the switcheroo driver. Suspends or resumes 2195 * the asics before or after it is powered up using ACPI methods. 
2196 */
2197 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
2198 enum vga_switcheroo_state state)
2199 {
2200 struct drm_device *dev = pci_get_drvdata(pdev);
2201 int r;
2202
2203 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
2204 return;
2205
2206 if (state == VGA_SWITCHEROO_ON) {
2207 pr_info("switched on\n");
2208 /* don't suspend or resume card normally */
2209 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
2210
2211 pci_set_power_state(pdev, PCI_D0);
2212 amdgpu_device_load_pci_state(pdev);
2213 r = pci_enable_device(pdev);
2214 if (r)
2215 dev_warn(&pdev->dev, "pci_enable_device failed (%d)\n",
2216 r);
2217 amdgpu_device_resume(dev, true);
2218
2219 dev->switch_power_state = DRM_SWITCH_POWER_ON;
2220 } else {
2221 dev_info(&pdev->dev, "switched off\n");
2222 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
2223 amdgpu_device_prepare(dev);
2224 amdgpu_device_suspend(dev, true);
2225 amdgpu_device_cache_pci_state(pdev);
2226 /* Shut down the device */
2227 pci_disable_device(pdev);
2228 pci_set_power_state(pdev, PCI_D3cold);
2229 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
2230 }
2231 }
2232
2233 /**
2234 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
2235 *
2236 * @pdev: pci dev pointer
2237 *
2238 * Callback for the switcheroo driver. Check if the switcheroo
2239 * state can be changed.
2240 * Returns true if the state can be changed, false if not.
2241 */
2242 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
2243 {
2244 struct drm_device *dev = pci_get_drvdata(pdev);
2245
2246 /*
2247 * FIXME: open_count is protected by drm_global_mutex but that would lead to
2248 * locking inversion with the driver load path. And the access here is
2249 * completely racy anyway. So don't bother with locking for now.
2250 */
2251 return atomic_read(&dev->open_count) == 0;
2252 }
2253
2254 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
2255 .set_gpu_state = amdgpu_switcheroo_set_state,
2256 .reprobe = NULL,
2257 .can_switch = amdgpu_switcheroo_can_switch,
2258 };
2259
2260 /**
2261 * amdgpu_device_ip_set_clockgating_state - set the CG state
2262 *
2263 * @dev: amdgpu_device pointer
2264 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2265 * @state: clockgating state (gate or ungate)
2266 *
2267 * Sets the requested clockgating state for all instances of
2268 * the hardware IP specified.
2269 * Returns the error code from the last instance.
2270 */
2271 int amdgpu_device_ip_set_clockgating_state(void *dev,
2272 enum amd_ip_block_type block_type,
2273 enum amd_clockgating_state state)
2274 {
2275 struct amdgpu_device *adev = dev;
2276 int i, r = 0;
2277
2278 for (i = 0; i < adev->num_ip_blocks; i++) {
2279 if (!adev->ip_blocks[i].status.valid)
2280 continue;
2281 if (adev->ip_blocks[i].version->type != block_type)
2282 continue;
2283 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
2284 continue;
2285 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
2286 &adev->ip_blocks[i], state);
2287 if (r)
2288 dev_err(adev->dev,
2289 "set_clockgating_state of IP block <%s> failed %d\n",
2290 adev->ip_blocks[i].version->funcs->name, r);
2291 }
2292 return r;
2293 }
2294
2295 /**
2296 * amdgpu_device_ip_set_powergating_state - set the PG state
2297 *
2298 * @dev: amdgpu_device pointer
2299 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2300 * @state: powergating state (gate or ungate)
2301 *
2302 * Sets the requested powergating state for all instances of
2303 * the hardware IP specified.
2304 * Returns the error code from the last instance.
2305 */
2306 int amdgpu_device_ip_set_powergating_state(void *dev,
2307 enum amd_ip_block_type block_type,
2308 enum amd_powergating_state state)
2309 {
2310 struct amdgpu_device *adev = dev;
2311 int i, r = 0;
2312
2313 for (i = 0; i < adev->num_ip_blocks; i++) {
2314 if (!adev->ip_blocks[i].status.valid)
2315 continue;
2316 if (adev->ip_blocks[i].version->type != block_type)
2317 continue;
2318 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
2319 continue;
2320 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
2321 &adev->ip_blocks[i], state);
2322 if (r)
2323 dev_err(adev->dev,
2324 "set_powergating_state of IP block <%s> failed %d\n",
2325 adev->ip_blocks[i].version->funcs->name, r);
2326 }
2327 return r;
2328 }
2329
2330 /**
2331 * amdgpu_device_ip_get_clockgating_state - get the CG state
2332 *
2333 * @adev: amdgpu_device pointer
2334 * @flags: clockgating feature flags
2335 *
2336 * Walks the list of IPs on the device and updates the clockgating
2337 * flags for each IP.
2338 * Updates @flags with the feature flags for each hardware IP where
2339 * clockgating is enabled.
2340 */
2341 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
2342 u64 *flags)
2343 {
2344 int i;
2345
2346 for (i = 0; i < adev->num_ip_blocks; i++) {
2347 if (!adev->ip_blocks[i].status.valid)
2348 continue;
2349 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
2350 adev->ip_blocks[i].version->funcs->get_clockgating_state(
2351 &adev->ip_blocks[i], flags);
2352 }
2353 }
2354
2355 /**
2356 * amdgpu_device_ip_wait_for_idle - wait for idle
2357 *
2358 * @adev: amdgpu_device pointer
2359 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2360 *
2361 * Waits for the requested hardware IP to be idle.
2362 * Returns 0 for success or a negative error code on failure.
2363 */
2364 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
2365 enum amd_ip_block_type block_type)
2366 {
2367 int i, r;
2368
2369 for (i = 0; i < adev->num_ip_blocks; i++) {
2370 if (!adev->ip_blocks[i].status.valid)
2371 continue;
2372 if (adev->ip_blocks[i].version->type == block_type) {
2373 if (adev->ip_blocks[i].version->funcs->wait_for_idle) {
2374 r = adev->ip_blocks[i].version->funcs->wait_for_idle(
2375 &adev->ip_blocks[i]);
2376 if (r)
2377 return r;
2378 }
2379 break;
2380 }
2381 }
2382 return 0;
2383
2384 }
2385
2386 /**
2387 * amdgpu_device_ip_is_valid - is the hardware IP enabled
2388 *
2389 * @adev: amdgpu_device pointer
2390 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2391 *
2392 * Check if the hardware IP is enabled or not.
2393 * Returns true if the IP is enabled, false if not.
2394 */
2395 bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev,
2396 enum amd_ip_block_type block_type)
2397 {
2398 int i;
2399
2400 for (i = 0; i < adev->num_ip_blocks; i++) {
2401 if (adev->ip_blocks[i].version->type == block_type)
2402 return adev->ip_blocks[i].status.valid;
2403 }
2404 return false;
2405
2406 }
2407
2408 /**
2409 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
2410 *
2411 * @adev: amdgpu_device pointer
2412 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
2413 *
2414 * Returns a pointer to the hardware IP block structure
2415 * if it exists for the asic, otherwise NULL.
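 *
 * Illustrative usage sketch (hypothetical caller, not taken from this file):
 *
 *   struct amdgpu_ip_block *ip_block =
 *           amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *
 *   if (ip_block && ip_block->status.valid)
 *           dev_info(adev->dev, "GFX block: %s\n",
 *                    ip_block->version->funcs->name);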
2416 */ 2417 struct amdgpu_ip_block * 2418 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 2419 enum amd_ip_block_type type) 2420 { 2421 int i; 2422 2423 for (i = 0; i < adev->num_ip_blocks; i++) 2424 if (adev->ip_blocks[i].version->type == type) 2425 return &adev->ip_blocks[i]; 2426 2427 return NULL; 2428 } 2429 2430 /** 2431 * amdgpu_device_ip_block_version_cmp 2432 * 2433 * @adev: amdgpu_device pointer 2434 * @type: enum amd_ip_block_type 2435 * @major: major version 2436 * @minor: minor version 2437 * 2438 * return 0 if equal or greater 2439 * return 1 if smaller or the ip_block doesn't exist 2440 */ 2441 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 2442 enum amd_ip_block_type type, 2443 u32 major, u32 minor) 2444 { 2445 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 2446 2447 if (ip_block && ((ip_block->version->major > major) || 2448 ((ip_block->version->major == major) && 2449 (ip_block->version->minor >= minor)))) 2450 return 0; 2451 2452 return 1; 2453 } 2454 2455 /** 2456 * amdgpu_device_ip_block_add 2457 * 2458 * @adev: amdgpu_device pointer 2459 * @ip_block_version: pointer to the IP to add 2460 * 2461 * Adds the IP block driver information to the collection of IPs 2462 * on the asic. 2463 */ 2464 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 2465 const struct amdgpu_ip_block_version *ip_block_version) 2466 { 2467 if (!ip_block_version) 2468 return -EINVAL; 2469 2470 switch (ip_block_version->type) { 2471 case AMD_IP_BLOCK_TYPE_VCN: 2472 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2473 return 0; 2474 break; 2475 case AMD_IP_BLOCK_TYPE_JPEG: 2476 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2477 return 0; 2478 break; 2479 default: 2480 break; 2481 } 2482 2483 dev_info(adev->dev, "detected ip block number %d <%s>\n", 2484 adev->num_ip_blocks, ip_block_version->funcs->name); 2485 2486 adev->ip_blocks[adev->num_ip_blocks].adev = adev; 2487 2488 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2489 2490 return 0; 2491 } 2492 2493 /** 2494 * amdgpu_device_enable_virtual_display - enable virtual display feature 2495 * 2496 * @adev: amdgpu_device pointer 2497 * 2498 * Enabled the virtual display feature if the user has enabled it via 2499 * the module parameter virtual_display. This feature provides a virtual 2500 * display hardware on headless boards or in virtualized environments. 2501 * This function parses and validates the configuration string specified by 2502 * the user and configures the virtual display configuration (number of 2503 * virtual connectors, crtcs, etc.) specified. 
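 *
 * The string parsed below is a semicolon-separated list of entries of the
 * form <pci address>[,<number of crtcs>], where the address may be "all".
 * Example (illustrative values): amdgpu.virtual_display=0000:03:00.0,2
 * enables a virtual display with two CRTCs on that device only, while
 * amdgpu.virtual_display=all,1 enables one virtual CRTC on every device.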
2504 */ 2505 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2506 { 2507 adev->enable_virtual_display = false; 2508 2509 if (amdgpu_virtual_display) { 2510 const char *pci_address_name = pci_name(adev->pdev); 2511 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2512 2513 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2514 pciaddstr_tmp = pciaddstr; 2515 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2516 pciaddname = strsep(&pciaddname_tmp, ","); 2517 if (!strcmp("all", pciaddname) 2518 || !strcmp(pci_address_name, pciaddname)) { 2519 long num_crtc; 2520 int res = -1; 2521 2522 adev->enable_virtual_display = true; 2523 2524 if (pciaddname_tmp) 2525 res = kstrtol(pciaddname_tmp, 10, 2526 &num_crtc); 2527 2528 if (!res) { 2529 if (num_crtc < 1) 2530 num_crtc = 1; 2531 if (num_crtc > 6) 2532 num_crtc = 6; 2533 adev->mode_info.num_crtc = num_crtc; 2534 } else { 2535 adev->mode_info.num_crtc = 1; 2536 } 2537 break; 2538 } 2539 } 2540 2541 dev_info( 2542 adev->dev, 2543 "virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2544 amdgpu_virtual_display, pci_address_name, 2545 adev->enable_virtual_display, adev->mode_info.num_crtc); 2546 2547 kfree(pciaddstr); 2548 } 2549 } 2550 2551 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2552 { 2553 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2554 adev->mode_info.num_crtc = 1; 2555 adev->enable_virtual_display = true; 2556 dev_info(adev->dev, "virtual_display:%d, num_crtc:%d\n", 2557 adev->enable_virtual_display, 2558 adev->mode_info.num_crtc); 2559 } 2560 } 2561 2562 /** 2563 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2564 * 2565 * @adev: amdgpu_device pointer 2566 * 2567 * Parses the asic configuration parameters specified in the gpu info 2568 * firmware and makes them available to the driver for use in configuring 2569 * the asic. 2570 * Returns 0 on success, -EINVAL on failure. 
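 *
 * The firmware file is selected per ASIC as "amdgpu/<chip>_gpu_info.bin"
 * (for example "amdgpu/vega10_gpu_info.bin" for VEGA10, per the switch
 * below); ASICs that carry an IP discovery binary skip this parsing
 * entirely.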
2571 */ 2572 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2573 { 2574 const char *chip_name; 2575 int err; 2576 const struct gpu_info_firmware_header_v1_0 *hdr; 2577 2578 adev->firmware.gpu_info_fw = NULL; 2579 2580 if (adev->mman.discovery_bin) 2581 return 0; 2582 2583 switch (adev->asic_type) { 2584 default: 2585 return 0; 2586 case CHIP_VEGA10: 2587 chip_name = "vega10"; 2588 break; 2589 case CHIP_VEGA12: 2590 chip_name = "vega12"; 2591 break; 2592 case CHIP_RAVEN: 2593 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2594 chip_name = "raven2"; 2595 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2596 chip_name = "picasso"; 2597 else 2598 chip_name = "raven"; 2599 break; 2600 case CHIP_ARCTURUS: 2601 chip_name = "arcturus"; 2602 break; 2603 case CHIP_NAVI12: 2604 chip_name = "navi12"; 2605 break; 2606 } 2607 2608 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, 2609 AMDGPU_UCODE_OPTIONAL, 2610 "amdgpu/%s_gpu_info.bin", chip_name); 2611 if (err) { 2612 dev_err(adev->dev, 2613 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n", 2614 chip_name); 2615 goto out; 2616 } 2617 2618 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2619 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2620 2621 switch (hdr->version_major) { 2622 case 1: 2623 { 2624 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2625 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2626 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2627 2628 /* 2629 * Should be dropped when DAL no longer needs it. 2630 */ 2631 if (adev->asic_type == CHIP_NAVI12) 2632 goto parse_soc_bounding_box; 2633 2634 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2635 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2636 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2637 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2638 adev->gfx.config.max_texture_channel_caches = 2639 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2640 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2641 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2642 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2643 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2644 adev->gfx.config.double_offchip_lds_buf = 2645 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2646 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2647 adev->gfx.cu_info.max_waves_per_simd = 2648 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2649 adev->gfx.cu_info.max_scratch_slots_per_cu = 2650 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2651 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2652 if (hdr->version_minor >= 1) { 2653 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2654 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2655 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2656 adev->gfx.config.num_sc_per_sh = 2657 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2658 adev->gfx.config.num_packer_per_sc = 2659 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2660 } 2661 2662 parse_soc_bounding_box: 2663 /* 2664 * soc bounding box info is not integrated in disocovery table, 2665 * we always need to parse it from gpu info firmware if needed. 
2666 */ 2667 if (hdr->version_minor == 2) { 2668 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2669 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2670 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2671 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2672 } 2673 break; 2674 } 2675 default: 2676 dev_err(adev->dev, 2677 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2678 err = -EINVAL; 2679 goto out; 2680 } 2681 out: 2682 return err; 2683 } 2684 2685 /** 2686 * amdgpu_device_ip_early_init - run early init for hardware IPs 2687 * 2688 * @adev: amdgpu_device pointer 2689 * 2690 * Early initialization pass for hardware IPs. The hardware IPs that make 2691 * up each asic are discovered each IP's early_init callback is run. This 2692 * is the first stage in initializing the asic. 2693 * Returns 0 on success, negative error code on failure. 2694 */ 2695 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2696 { 2697 struct amdgpu_ip_block *ip_block; 2698 struct pci_dev *parent; 2699 bool total, skip_bios; 2700 uint32_t bios_flags; 2701 int i, r; 2702 2703 amdgpu_device_enable_virtual_display(adev); 2704 2705 if (amdgpu_sriov_vf(adev)) { 2706 r = amdgpu_virt_request_full_gpu(adev, true); 2707 if (r) 2708 return r; 2709 } 2710 2711 switch (adev->asic_type) { 2712 #ifdef CONFIG_DRM_AMDGPU_SI 2713 case CHIP_VERDE: 2714 case CHIP_TAHITI: 2715 case CHIP_PITCAIRN: 2716 case CHIP_OLAND: 2717 case CHIP_HAINAN: 2718 adev->family = AMDGPU_FAMILY_SI; 2719 r = si_set_ip_blocks(adev); 2720 if (r) 2721 return r; 2722 break; 2723 #endif 2724 #ifdef CONFIG_DRM_AMDGPU_CIK 2725 case CHIP_BONAIRE: 2726 case CHIP_HAWAII: 2727 case CHIP_KAVERI: 2728 case CHIP_KABINI: 2729 case CHIP_MULLINS: 2730 if (adev->flags & AMD_IS_APU) 2731 adev->family = AMDGPU_FAMILY_KV; 2732 else 2733 adev->family = AMDGPU_FAMILY_CI; 2734 2735 r = cik_set_ip_blocks(adev); 2736 if (r) 2737 return r; 2738 break; 2739 #endif 2740 case CHIP_TOPAZ: 2741 case CHIP_TONGA: 2742 case CHIP_FIJI: 2743 case CHIP_POLARIS10: 2744 case CHIP_POLARIS11: 2745 case CHIP_POLARIS12: 2746 case CHIP_VEGAM: 2747 case CHIP_CARRIZO: 2748 case CHIP_STONEY: 2749 if (adev->flags & AMD_IS_APU) 2750 adev->family = AMDGPU_FAMILY_CZ; 2751 else 2752 adev->family = AMDGPU_FAMILY_VI; 2753 2754 r = vi_set_ip_blocks(adev); 2755 if (r) 2756 return r; 2757 break; 2758 default: 2759 r = amdgpu_discovery_set_ip_blocks(adev); 2760 if (r) 2761 return r; 2762 break; 2763 } 2764 2765 /* Check for IP version 9.4.3 with A0 hardware */ 2766 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) && 2767 !amdgpu_device_get_rev_id(adev)) { 2768 dev_err(adev->dev, "Unsupported A0 hardware\n"); 2769 return -ENODEV; /* device unsupported - no device error */ 2770 } 2771 2772 if (amdgpu_has_atpx() && 2773 (amdgpu_is_atpx_hybrid() || 2774 amdgpu_has_atpx_dgpu_power_cntl()) && 2775 ((adev->flags & AMD_IS_APU) == 0) && 2776 !dev_is_removable(&adev->pdev->dev)) 2777 adev->flags |= AMD_IS_PX; 2778 2779 if (!(adev->flags & AMD_IS_APU)) { 2780 parent = pcie_find_root_port(adev->pdev); 2781 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2782 } 2783 2784 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2785 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2786 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2787 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2788 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2789 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2790 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2791 2792 adev->virt.is_xgmi_node_migrate_enabled = false; 2793 if (amdgpu_sriov_vf(adev)) { 2794 adev->virt.is_xgmi_node_migrate_enabled = 2795 amdgpu_ip_version((adev), GC_HWIP, 0) == IP_VERSION(9, 4, 4); 2796 } 2797 2798 total = true; 2799 for (i = 0; i < adev->num_ip_blocks; i++) { 2800 ip_block = &adev->ip_blocks[i]; 2801 2802 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2803 dev_warn(adev->dev, "disabled ip block: %d <%s>\n", i, 2804 adev->ip_blocks[i].version->funcs->name); 2805 adev->ip_blocks[i].status.valid = false; 2806 } else if (ip_block->version->funcs->early_init) { 2807 r = ip_block->version->funcs->early_init(ip_block); 2808 if (r == -ENOENT) { 2809 adev->ip_blocks[i].status.valid = false; 2810 } else if (r) { 2811 dev_err(adev->dev, 2812 "early_init of IP block <%s> failed %d\n", 2813 adev->ip_blocks[i].version->funcs->name, 2814 r); 2815 total = false; 2816 } else { 2817 adev->ip_blocks[i].status.valid = true; 2818 } 2819 } else { 2820 adev->ip_blocks[i].status.valid = true; 2821 } 2822 /* get the vbios after the asic_funcs are set up */ 2823 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2824 r = amdgpu_device_parse_gpu_info_fw(adev); 2825 if (r) 2826 return r; 2827 2828 bios_flags = amdgpu_device_get_vbios_flags(adev); 2829 skip_bios = !!(bios_flags & AMDGPU_VBIOS_SKIP); 2830 /* Read BIOS */ 2831 if (!skip_bios) { 2832 bool optional = 2833 !!(bios_flags & AMDGPU_VBIOS_OPTIONAL); 2834 if (!amdgpu_get_bios(adev) && !optional) 2835 return -EINVAL; 2836 2837 if (optional && !adev->bios) 2838 dev_info( 2839 adev->dev, 2840 "VBIOS image optional, proceeding without VBIOS image"); 2841 2842 if (adev->bios) { 2843 r = amdgpu_atombios_init(adev); 2844 if (r) { 2845 dev_err(adev->dev, 2846 "amdgpu_atombios_init failed\n"); 2847 amdgpu_vf_error_put( 2848 adev, 2849 AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 2850 0, 0); 2851 return r; 2852 } 2853 } 2854 } 2855 2856 /*get pf2vf msg info at it's earliest time*/ 2857 if (amdgpu_sriov_vf(adev)) 2858 amdgpu_virt_init_data_exchange(adev); 2859 2860 } 2861 } 2862 if (!total) 2863 return -ENODEV; 2864 2865 if (adev->gmc.xgmi.supported) 2866 amdgpu_xgmi_early_init(adev); 2867 2868 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 2869 if (ip_block->status.valid != false) 2870 amdgpu_amdkfd_device_probe(adev); 2871 2872 adev->cg_flags &= amdgpu_cg_mask; 2873 adev->pg_flags &= amdgpu_pg_mask; 2874 2875 return 0; 2876 } 2877 2878 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2879 { 2880 int i, r; 2881 2882 for (i = 0; i < adev->num_ip_blocks; i++) { 2883 if (!adev->ip_blocks[i].status.sw) 2884 continue; 2885 if (adev->ip_blocks[i].status.hw) 2886 continue; 2887 if (!amdgpu_ip_member_of_hwini( 2888 adev, adev->ip_blocks[i].version->type)) 2889 continue; 2890 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2891 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2892 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2893 r = 
adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2894 if (r) { 2895 dev_err(adev->dev, 2896 "hw_init of IP block <%s> failed %d\n", 2897 adev->ip_blocks[i].version->funcs->name, 2898 r); 2899 return r; 2900 } 2901 adev->ip_blocks[i].status.hw = true; 2902 } 2903 } 2904 2905 return 0; 2906 } 2907 2908 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2909 { 2910 int i, r; 2911 2912 for (i = 0; i < adev->num_ip_blocks; i++) { 2913 if (!adev->ip_blocks[i].status.sw) 2914 continue; 2915 if (adev->ip_blocks[i].status.hw) 2916 continue; 2917 if (!amdgpu_ip_member_of_hwini( 2918 adev, adev->ip_blocks[i].version->type)) 2919 continue; 2920 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2921 if (r) { 2922 dev_err(adev->dev, 2923 "hw_init of IP block <%s> failed %d\n", 2924 adev->ip_blocks[i].version->funcs->name, r); 2925 return r; 2926 } 2927 adev->ip_blocks[i].status.hw = true; 2928 } 2929 2930 return 0; 2931 } 2932 2933 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2934 { 2935 int r = 0; 2936 int i; 2937 uint32_t smu_version; 2938 2939 if (adev->asic_type >= CHIP_VEGA10) { 2940 for (i = 0; i < adev->num_ip_blocks; i++) { 2941 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2942 continue; 2943 2944 if (!amdgpu_ip_member_of_hwini(adev, 2945 AMD_IP_BLOCK_TYPE_PSP)) 2946 break; 2947 2948 if (!adev->ip_blocks[i].status.sw) 2949 continue; 2950 2951 /* no need to do the fw loading again if already done*/ 2952 if (adev->ip_blocks[i].status.hw == true) 2953 break; 2954 2955 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2956 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 2957 if (r) 2958 return r; 2959 } else { 2960 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2961 if (r) { 2962 dev_err(adev->dev, 2963 "hw_init of IP block <%s> failed %d\n", 2964 adev->ip_blocks[i] 2965 .version->funcs->name, 2966 r); 2967 return r; 2968 } 2969 adev->ip_blocks[i].status.hw = true; 2970 } 2971 break; 2972 } 2973 } 2974 2975 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2976 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2977 2978 return r; 2979 } 2980 2981 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2982 { 2983 struct drm_sched_init_args args = { 2984 .ops = &amdgpu_sched_ops, 2985 .num_rqs = DRM_SCHED_PRIORITY_COUNT, 2986 .timeout_wq = adev->reset_domain->wq, 2987 .dev = adev->dev, 2988 }; 2989 long timeout; 2990 int r, i; 2991 2992 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2993 struct amdgpu_ring *ring = adev->rings[i]; 2994 2995 /* No need to setup the GPU scheduler for rings that don't need it */ 2996 if (!ring || ring->no_scheduler) 2997 continue; 2998 2999 switch (ring->funcs->type) { 3000 case AMDGPU_RING_TYPE_GFX: 3001 timeout = adev->gfx_timeout; 3002 break; 3003 case AMDGPU_RING_TYPE_COMPUTE: 3004 timeout = adev->compute_timeout; 3005 break; 3006 case AMDGPU_RING_TYPE_SDMA: 3007 timeout = adev->sdma_timeout; 3008 break; 3009 default: 3010 timeout = adev->video_timeout; 3011 break; 3012 } 3013 3014 args.timeout = timeout; 3015 args.credit_limit = ring->num_hw_submission; 3016 args.score = ring->sched_score; 3017 args.name = ring->name; 3018 3019 r = drm_sched_init(&ring->sched, &args); 3020 if (r) { 3021 dev_err(adev->dev, 3022 "Failed to create scheduler on ring %s.\n", 3023 ring->name); 3024 return r; 3025 } 3026 r = amdgpu_uvd_entity_init(adev, ring); 3027 if (r) { 3028 dev_err(adev->dev, 3029 "Failed to create UVD scheduling entity on ring %s.\n", 
3030 ring->name); 3031 return r; 3032 } 3033 r = amdgpu_vce_entity_init(adev, ring); 3034 if (r) { 3035 dev_err(adev->dev, 3036 "Failed to create VCE scheduling entity on ring %s.\n", 3037 ring->name); 3038 return r; 3039 } 3040 } 3041 3042 if (adev->xcp_mgr) 3043 amdgpu_xcp_update_partition_sched_list(adev); 3044 3045 return 0; 3046 } 3047 3048 3049 /** 3050 * amdgpu_device_ip_init - run init for hardware IPs 3051 * 3052 * @adev: amdgpu_device pointer 3053 * 3054 * Main initialization pass for hardware IPs. The list of all the hardware 3055 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 3056 * are run. sw_init initializes the software state associated with each IP 3057 * and hw_init initializes the hardware associated with each IP. 3058 * Returns 0 on success, negative error code on failure. 3059 */ 3060 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 3061 { 3062 bool init_badpage; 3063 int i, r; 3064 3065 r = amdgpu_ras_init(adev); 3066 if (r) 3067 return r; 3068 3069 for (i = 0; i < adev->num_ip_blocks; i++) { 3070 if (!adev->ip_blocks[i].status.valid) 3071 continue; 3072 if (adev->ip_blocks[i].version->funcs->sw_init) { 3073 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]); 3074 if (r) { 3075 dev_err(adev->dev, 3076 "sw_init of IP block <%s> failed %d\n", 3077 adev->ip_blocks[i].version->funcs->name, 3078 r); 3079 goto init_failed; 3080 } 3081 } 3082 adev->ip_blocks[i].status.sw = true; 3083 3084 if (!amdgpu_ip_member_of_hwini( 3085 adev, adev->ip_blocks[i].version->type)) 3086 continue; 3087 3088 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 3089 /* need to do common hw init early so everything is set up for gmc */ 3090 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3091 if (r) { 3092 dev_err(adev->dev, "hw_init %d failed %d\n", i, 3093 r); 3094 goto init_failed; 3095 } 3096 adev->ip_blocks[i].status.hw = true; 3097 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3098 /* need to do gmc hw init early so we can allocate gpu mem */ 3099 /* Try to reserve bad pages early */ 3100 if (amdgpu_sriov_vf(adev)) 3101 amdgpu_virt_exchange_data(adev); 3102 3103 r = amdgpu_device_mem_scratch_init(adev); 3104 if (r) { 3105 dev_err(adev->dev, 3106 "amdgpu_mem_scratch_init failed %d\n", 3107 r); 3108 goto init_failed; 3109 } 3110 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3111 if (r) { 3112 dev_err(adev->dev, "hw_init %d failed %d\n", i, 3113 r); 3114 goto init_failed; 3115 } 3116 r = amdgpu_device_wb_init(adev); 3117 if (r) { 3118 dev_err(adev->dev, 3119 "amdgpu_device_wb_init failed %d\n", r); 3120 goto init_failed; 3121 } 3122 adev->ip_blocks[i].status.hw = true; 3123 3124 /* right after GMC hw init, we create CSA */ 3125 if (adev->gfx.mcbp) { 3126 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 3127 AMDGPU_GEM_DOMAIN_VRAM | 3128 AMDGPU_GEM_DOMAIN_GTT, 3129 AMDGPU_CSA_SIZE); 3130 if (r) { 3131 dev_err(adev->dev, 3132 "allocate CSA failed %d\n", r); 3133 goto init_failed; 3134 } 3135 } 3136 3137 r = amdgpu_seq64_init(adev); 3138 if (r) { 3139 dev_err(adev->dev, "allocate seq64 failed %d\n", 3140 r); 3141 goto init_failed; 3142 } 3143 } 3144 } 3145 3146 if (amdgpu_sriov_vf(adev)) 3147 amdgpu_virt_init_data_exchange(adev); 3148 3149 r = amdgpu_ib_pool_init(adev); 3150 if (r) { 3151 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 3152 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 3153 goto init_failed; 3154 } 
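/*
 * From here the remaining bring-up is staged: create the ucode BO, run
 * hw_init phase 1 (COMMON, IH, and PSP under SR-IOV), load firmware via
 * PSP/SMU, then hw_init phase 2 for the remaining blocks (see the helpers
 * defined above).
 */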
3155
3156 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
3157 if (r)
3158 goto init_failed;
3159
3160 r = amdgpu_device_ip_hw_init_phase1(adev);
3161 if (r)
3162 goto init_failed;
3163
3164 r = amdgpu_device_fw_loading(adev);
3165 if (r)
3166 goto init_failed;
3167
3168 r = amdgpu_device_ip_hw_init_phase2(adev);
3169 if (r)
3170 goto init_failed;
3171
3172 /*
3173 * Retired pages will be loaded from eeprom and reserved here; this must
3174 * be called after amdgpu_device_ip_hw_init_phase2 since for some ASICs
3175 * the RAS EEPROM code relies on the SMU being fully functional for I2C
3176 * communication, which is only true at this point.
3177 *
3178 * amdgpu_ras_recovery_init may fail, but the caller only treats a bad
3179 * gpu situation as fatal and stops the amdgpu init process accordingly.
3180 * For other failures it still releases all the resources and prints an
3181 * error message, rather than returning a negative value to the upper
3182 * level.
3183 *
3184 * Note: theoretically, this should be called before any vram allocations
3185 * to protect retired pages from being reused.
3186 */
3187 init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI);
3188 r = amdgpu_ras_recovery_init(adev, init_badpage);
3189 if (r)
3190 goto init_failed;
3191
3192 /*
3193 * In the case of XGMI, grab an extra reference on the reset domain for this device.
3194 */
3195 if (adev->gmc.xgmi.num_physical_nodes > 1) {
3196 if (amdgpu_xgmi_add_device(adev) == 0) {
3197 if (!amdgpu_sriov_vf(adev)) {
3198 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3199
3200 if (WARN_ON(!hive)) {
3201 r = -ENOENT;
3202 goto init_failed;
3203 }
3204
3205 if (!hive->reset_domain ||
3206 !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
3207 r = -ENOENT;
3208 amdgpu_put_xgmi_hive(hive);
3209 goto init_failed;
3210 }
3211
3212 /* Drop the early temporary reset domain we created for device */
3213 amdgpu_reset_put_reset_domain(adev->reset_domain);
3214 adev->reset_domain = hive->reset_domain;
3215 amdgpu_put_xgmi_hive(hive);
3216 }
3217 }
3218 }
3219
3220 r = amdgpu_device_init_schedulers(adev);
3221 if (r)
3222 goto init_failed;
3223
3224 if (adev->mman.buffer_funcs_ring->sched.ready)
3225 amdgpu_ttm_set_buffer_funcs_status(adev, true);
3226
3227 /* Don't init kfd if the whole hive needs to be reset during init */
3228 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
3229 kgd2kfd_init_zone_device(adev);
3230 amdgpu_amdkfd_device_init(adev);
3231 }
3232
3233 amdgpu_fru_get_product_info(adev);
3234
3235 if (!amdgpu_sriov_vf(adev) || amdgpu_sriov_ras_cper_en(adev))
3236 r = amdgpu_cper_init(adev);
3237
3238 init_failed:
3239
3240 return r;
3241 }
3242
3243 /**
3244 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
3245 *
3246 * @adev: amdgpu_device pointer
3247 *
3248 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
3249 * this function before a GPU reset. If the value is retained after a
3250 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
3251 */
3252 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
3253 {
3254 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
3255 }
3256
3257 /**
3258 * amdgpu_device_check_vram_lost - check if vram is valid
3259 *
3260 * @adev: amdgpu_device pointer
3261 *
3262 * Checks the reset magic value written to the gart pointer in VRAM.
3263 * The driver calls this after a GPU reset to see if the contents of
3264 * VRAM have been lost or not.
3265 * returns true if vram is lost, false if not. 3266 */ 3267 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 3268 { 3269 if (memcmp(adev->gart.ptr, adev->reset_magic, 3270 AMDGPU_RESET_MAGIC_NUM)) 3271 return true; 3272 3273 if (!amdgpu_in_reset(adev)) 3274 return false; 3275 3276 /* 3277 * For all ASICs with baco/mode1 reset, the VRAM is 3278 * always assumed to be lost. 3279 */ 3280 switch (amdgpu_asic_reset_method(adev)) { 3281 case AMD_RESET_METHOD_LINK: 3282 case AMD_RESET_METHOD_BACO: 3283 case AMD_RESET_METHOD_MODE1: 3284 return true; 3285 default: 3286 return false; 3287 } 3288 } 3289 3290 /** 3291 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 3292 * 3293 * @adev: amdgpu_device pointer 3294 * @state: clockgating state (gate or ungate) 3295 * 3296 * The list of all the hardware IPs that make up the asic is walked and the 3297 * set_clockgating_state callbacks are run. 3298 * Late initialization pass enabling clockgating for hardware IPs. 3299 * Fini or suspend, pass disabling clockgating for hardware IPs. 3300 * Returns 0 on success, negative error code on failure. 3301 */ 3302 3303 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 3304 enum amd_clockgating_state state) 3305 { 3306 int i, j, r; 3307 3308 if (amdgpu_emu_mode == 1) 3309 return 0; 3310 3311 for (j = 0; j < adev->num_ip_blocks; j++) { 3312 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3313 if (!adev->ip_blocks[i].status.late_initialized) 3314 continue; 3315 /* skip CG for GFX, SDMA on S0ix */ 3316 if (adev->in_s0ix && 3317 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3318 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3319 continue; 3320 /* skip CG for VCE/UVD, it's handled specially */ 3321 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3322 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3323 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3324 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3325 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 3326 /* enable clockgating to save power */ 3327 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i], 3328 state); 3329 if (r) { 3330 dev_err(adev->dev, 3331 "set_clockgating_state(gate) of IP block <%s> failed %d\n", 3332 adev->ip_blocks[i].version->funcs->name, 3333 r); 3334 return r; 3335 } 3336 } 3337 } 3338 3339 return 0; 3340 } 3341 3342 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 3343 enum amd_powergating_state state) 3344 { 3345 int i, j, r; 3346 3347 if (amdgpu_emu_mode == 1) 3348 return 0; 3349 3350 for (j = 0; j < adev->num_ip_blocks; j++) { 3351 i = state == AMD_PG_STATE_GATE ? 
j : adev->num_ip_blocks - j - 1;
3352 if (!adev->ip_blocks[i].status.late_initialized)
3353 continue;
3354 /* skip PG for GFX, SDMA on S0ix */
3355 if (adev->in_s0ix &&
3356 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3357 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3358 continue;
3359 /* skip PG for VCE/UVD, it's handled specially */
3360 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
3361 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
3362 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
3363 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
3364 adev->ip_blocks[i].version->funcs->set_powergating_state) {
3365 /* enable powergating to save power */
3366 r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i],
3367 state);
3368 if (r) {
3369 dev_err(adev->dev,
3370 "set_powergating_state(gate) of IP block <%s> failed %d\n",
3371 adev->ip_blocks[i].version->funcs->name,
3372 r);
3373 return r;
3374 }
3375 }
3376 }
3377 return 0;
3378 }
3379
3380 static int amdgpu_device_enable_mgpu_fan_boost(void)
3381 {
3382 struct amdgpu_gpu_instance *gpu_ins;
3383 struct amdgpu_device *adev;
3384 int i, ret = 0;
3385
3386 mutex_lock(&mgpu_info.mutex);
3387
3388 /*
3389 * MGPU fan boost feature should be enabled
3390 * only when there are two or more dGPUs in
3391 * the system
3392 */
3393 if (mgpu_info.num_dgpu < 2)
3394 goto out;
3395
3396 for (i = 0; i < mgpu_info.num_dgpu; i++) {
3397 gpu_ins = &(mgpu_info.gpu_ins[i]);
3398 adev = gpu_ins->adev;
3399 if (!(adev->flags & AMD_IS_APU) &&
3400 !gpu_ins->mgpu_fan_enabled) {
3401 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
3402 if (ret)
3403 break;
3404
3405 gpu_ins->mgpu_fan_enabled = 1;
3406 }
3407 }
3408
3409 out:
3410 mutex_unlock(&mgpu_info.mutex);
3411
3412 return ret;
3413 }
3414
3415 /**
3416 * amdgpu_device_ip_late_init - run late init for hardware IPs
3417 *
3418 * @adev: amdgpu_device pointer
3419 *
3420 * Late initialization pass for hardware IPs. The list of all the hardware
3421 * IPs that make up the asic is walked and the late_init callbacks are run.
3422 * late_init covers any special initialization that an IP requires
3423 * after all of them have been initialized or something that needs to happen
3424 * late in the init process.
3425 * Returns 0 on success, negative error code on failure.
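 *
 * In this driver, for instance, this is also the pass that enables
 * clockgating and powergating, records the VRAM "reset magic" and
 * requests the multi-GPU fan boost (see the function body below).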
3426 */
3427 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
3428 {
3429 struct amdgpu_gpu_instance *gpu_instance;
3430 int i = 0, r;
3431
3432 for (i = 0; i < adev->num_ip_blocks; i++) {
3433 if (!adev->ip_blocks[i].status.hw)
3434 continue;
3435 if (adev->ip_blocks[i].version->funcs->late_init) {
3436 r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]);
3437 if (r) {
3438 dev_err(adev->dev,
3439 "late_init of IP block <%s> failed %d\n",
3440 adev->ip_blocks[i].version->funcs->name,
3441 r);
3442 return r;
3443 }
3444 }
3445 adev->ip_blocks[i].status.late_initialized = true;
3446 }
3447
3448 r = amdgpu_ras_late_init(adev);
3449 if (r) {
3450 dev_err(adev->dev, "amdgpu_ras_late_init failed %d", r);
3451 return r;
3452 }
3453
3454 if (!amdgpu_reset_in_recovery(adev))
3455 amdgpu_ras_set_error_query_ready(adev, true);
3456
3457 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
3458 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
3459
3460 amdgpu_device_fill_reset_magic(adev);
3461
3462 r = amdgpu_device_enable_mgpu_fan_boost();
3463 if (r)
3464 dev_err(adev->dev, "enable mgpu fan boost failed (%d).\n", r);
3465
3466 /* For passthrough configuration on arcturus and aldebaran, enable special SBR handling */
3467 if (amdgpu_passthrough(adev) &&
3468 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
3469 adev->asic_type == CHIP_ALDEBARAN))
3470 amdgpu_dpm_handle_passthrough_sbr(adev, true);
3471
3472 if (adev->gmc.xgmi.num_physical_nodes > 1) {
3473 mutex_lock(&mgpu_info.mutex);
3474
3475 /*
3476 * Reset the device p-state to low, as it was booted with high.
3477 *
3478 * This should be performed only after all devices from the same
3479 * hive have been initialized.
3480 *
3481 * However, the number of devices in the hive is not known in
3482 * advance; it is counted one by one as the devices initialize.
3483 *
3484 * So we wait until all XGMI interlinked devices have initialized.
3485 * This may introduce some delay, as those devices may come from
3486 * different hives. But that should be OK.
3487 */ 3488 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3489 for (i = 0; i < mgpu_info.num_gpu; i++) { 3490 gpu_instance = &(mgpu_info.gpu_ins[i]); 3491 if (gpu_instance->adev->flags & AMD_IS_APU) 3492 continue; 3493 3494 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3495 AMDGPU_XGMI_PSTATE_MIN); 3496 if (r) { 3497 dev_err(adev->dev, 3498 "pstate setting failed (%d).\n", 3499 r); 3500 break; 3501 } 3502 } 3503 } 3504 3505 mutex_unlock(&mgpu_info.mutex); 3506 } 3507 3508 return 0; 3509 } 3510 3511 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block) 3512 { 3513 struct amdgpu_device *adev = ip_block->adev; 3514 int r; 3515 3516 if (!ip_block->version->funcs->hw_fini) { 3517 dev_err(adev->dev, "hw_fini of IP block <%s> not defined\n", 3518 ip_block->version->funcs->name); 3519 } else { 3520 r = ip_block->version->funcs->hw_fini(ip_block); 3521 /* XXX handle errors */ 3522 if (r) { 3523 dev_dbg(adev->dev, 3524 "hw_fini of IP block <%s> failed %d\n", 3525 ip_block->version->funcs->name, r); 3526 } 3527 } 3528 3529 ip_block->status.hw = false; 3530 } 3531 3532 /** 3533 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3534 * 3535 * @adev: amdgpu_device pointer 3536 * 3537 * For ASICs need to disable SMC first 3538 */ 3539 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3540 { 3541 int i; 3542 3543 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3544 return; 3545 3546 for (i = 0; i < adev->num_ip_blocks; i++) { 3547 if (!adev->ip_blocks[i].status.hw) 3548 continue; 3549 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3550 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3551 break; 3552 } 3553 } 3554 } 3555 3556 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3557 { 3558 int i, r; 3559 3560 for (i = 0; i < adev->num_ip_blocks; i++) { 3561 if (!adev->ip_blocks[i].version->funcs->early_fini) 3562 continue; 3563 3564 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]); 3565 if (r) { 3566 dev_dbg(adev->dev, 3567 "early_fini of IP block <%s> failed %d\n", 3568 adev->ip_blocks[i].version->funcs->name, r); 3569 } 3570 } 3571 3572 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3573 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3574 3575 amdgpu_amdkfd_suspend(adev, true); 3576 amdgpu_userq_suspend(adev); 3577 3578 /* Workaround for ASICs need to disable SMC first */ 3579 amdgpu_device_smu_fini_early(adev); 3580 3581 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3582 if (!adev->ip_blocks[i].status.hw) 3583 continue; 3584 3585 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3586 } 3587 3588 if (amdgpu_sriov_vf(adev)) { 3589 if (amdgpu_virt_release_full_gpu(adev, false)) 3590 dev_err(adev->dev, 3591 "failed to release exclusive mode on fini\n"); 3592 } 3593 3594 return 0; 3595 } 3596 3597 /** 3598 * amdgpu_device_ip_fini - run fini for hardware IPs 3599 * 3600 * @adev: amdgpu_device pointer 3601 * 3602 * Main teardown pass for hardware IPs. The list of all the hardware 3603 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3604 * are run. hw_fini tears down the hardware associated with each IP 3605 * and sw_fini tears down any software state associated with each IP. 3606 * Returns 0 on success, negative error code on failure. 
3607 */ 3608 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3609 { 3610 int i, r; 3611 3612 amdgpu_cper_fini(adev); 3613 3614 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3615 amdgpu_virt_release_ras_err_handler_data(adev); 3616 3617 if (adev->gmc.xgmi.num_physical_nodes > 1) 3618 amdgpu_xgmi_remove_device(adev); 3619 3620 amdgpu_amdkfd_device_fini_sw(adev); 3621 3622 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3623 if (!adev->ip_blocks[i].status.sw) 3624 continue; 3625 3626 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3627 amdgpu_ucode_free_bo(adev); 3628 amdgpu_free_static_csa(&adev->virt.csa_obj); 3629 amdgpu_device_wb_fini(adev); 3630 amdgpu_device_mem_scratch_fini(adev); 3631 amdgpu_ib_pool_fini(adev); 3632 amdgpu_seq64_fini(adev); 3633 amdgpu_doorbell_fini(adev); 3634 } 3635 if (adev->ip_blocks[i].version->funcs->sw_fini) { 3636 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]); 3637 /* XXX handle errors */ 3638 if (r) { 3639 dev_dbg(adev->dev, 3640 "sw_fini of IP block <%s> failed %d\n", 3641 adev->ip_blocks[i].version->funcs->name, 3642 r); 3643 } 3644 } 3645 adev->ip_blocks[i].status.sw = false; 3646 adev->ip_blocks[i].status.valid = false; 3647 } 3648 3649 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3650 if (!adev->ip_blocks[i].status.late_initialized) 3651 continue; 3652 if (adev->ip_blocks[i].version->funcs->late_fini) 3653 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]); 3654 adev->ip_blocks[i].status.late_initialized = false; 3655 } 3656 3657 amdgpu_ras_fini(adev); 3658 3659 return 0; 3660 } 3661 3662 /** 3663 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3664 * 3665 * @work: work_struct. 3666 */ 3667 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3668 { 3669 struct amdgpu_device *adev = 3670 container_of(work, struct amdgpu_device, delayed_init_work.work); 3671 int r; 3672 3673 r = amdgpu_ib_ring_tests(adev); 3674 if (r) 3675 dev_err(adev->dev, "ib ring test failed (%d).\n", r); 3676 } 3677 3678 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3679 { 3680 struct amdgpu_device *adev = 3681 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3682 3683 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3684 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3685 3686 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true, 0)) 3687 adev->gfx.gfx_off_state = true; 3688 } 3689 3690 /** 3691 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3692 * 3693 * @adev: amdgpu_device pointer 3694 * 3695 * Main suspend function for hardware IPs. The list of all the hardware 3696 * IPs that make up the asic is walked, clockgating is disabled and the 3697 * suspend callbacks are run. suspend puts the hardware and software state 3698 * in each IP into a state suitable for suspend. 3699 * Returns 0 on success, negative error code on failure. 3700 */ 3701 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3702 { 3703 int i, r; 3704 3705 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3706 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3707 3708 /* 3709 * Per PMFW team's suggestion, driver needs to handle gfxoff 3710 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3711 * scenario. Add the missing df cstate disablement here. 
3712 */ 3713 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3714 dev_warn(adev->dev, "Failed to disallow df cstate"); 3715 3716 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3717 if (!adev->ip_blocks[i].status.valid) 3718 continue; 3719 3720 /* displays are handled separately */ 3721 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3722 continue; 3723 3724 /* XXX handle errors */ 3725 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3726 if (r) 3727 return r; 3728 } 3729 3730 return 0; 3731 } 3732 3733 /** 3734 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3735 * 3736 * @adev: amdgpu_device pointer 3737 * 3738 * Main suspend function for hardware IPs. The list of all the hardware 3739 * IPs that make up the asic is walked, clockgating is disabled and the 3740 * suspend callbacks are run. suspend puts the hardware and software state 3741 * in each IP into a state suitable for suspend. 3742 * Returns 0 on success, negative error code on failure. 3743 */ 3744 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3745 { 3746 int i, r; 3747 3748 if (adev->in_s0ix) 3749 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3750 3751 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3752 if (!adev->ip_blocks[i].status.valid) 3753 continue; 3754 /* displays are handled in phase1 */ 3755 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3756 continue; 3757 /* PSP lost connection when err_event_athub occurs */ 3758 if (amdgpu_ras_intr_triggered() && 3759 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3760 adev->ip_blocks[i].status.hw = false; 3761 continue; 3762 } 3763 3764 /* skip unnecessary suspend if we do not initialize them yet */ 3765 if (!amdgpu_ip_member_of_hwini( 3766 adev, adev->ip_blocks[i].version->type)) 3767 continue; 3768 3769 /* Since we skip suspend for S0i3, we need to cancel the delayed 3770 * idle work here as the suspend callback never gets called. 3771 */ 3772 if (adev->in_s0ix && 3773 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX && 3774 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 0, 0)) 3775 cancel_delayed_work_sync(&adev->gfx.idle_work); 3776 /* skip suspend of gfx/mes and psp for S0ix 3777 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3778 * like at runtime. PSP is also part of the always on hardware 3779 * so no need to suspend it. 3780 */ 3781 if (adev->in_s0ix && 3782 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3783 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3784 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3785 continue; 3786 3787 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3788 if (adev->in_s0ix && 3789 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3790 IP_VERSION(5, 0, 0)) && 3791 (adev->ip_blocks[i].version->type == 3792 AMD_IP_BLOCK_TYPE_SDMA)) 3793 continue; 3794 3795 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3796 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3797 * from this location and RLC Autoload automatically also gets loaded 3798 * from here based on PMFW -> PSP message during re-init sequence. 3799 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3800 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3801 */ 3802 if (amdgpu_in_reset(adev) && 3803 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3804 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3805 continue; 3806 3807 /* XXX handle errors */ 3808 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3809 adev->ip_blocks[i].status.hw = false; 3810 3811 /* handle putting the SMC in the appropriate state */ 3812 if (!amdgpu_sriov_vf(adev)) { 3813 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3814 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3815 if (r) { 3816 dev_err(adev->dev, 3817 "SMC failed to set mp1 state %d, %d\n", 3818 adev->mp1_state, r); 3819 return r; 3820 } 3821 } 3822 } 3823 } 3824 3825 return 0; 3826 } 3827 3828 /** 3829 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3830 * 3831 * @adev: amdgpu_device pointer 3832 * 3833 * Main suspend function for hardware IPs. The list of all the hardware 3834 * IPs that make up the asic is walked, clockgating is disabled and the 3835 * suspend callbacks are run. suspend puts the hardware and software state 3836 * in each IP into a state suitable for suspend. 3837 * Returns 0 on success, negative error code on failure. 3838 */ 3839 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3840 { 3841 int r; 3842 3843 if (amdgpu_sriov_vf(adev)) { 3844 amdgpu_virt_fini_data_exchange(adev); 3845 amdgpu_virt_request_full_gpu(adev, false); 3846 } 3847 3848 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3849 3850 r = amdgpu_device_ip_suspend_phase1(adev); 3851 if (r) 3852 return r; 3853 r = amdgpu_device_ip_suspend_phase2(adev); 3854 3855 if (amdgpu_sriov_vf(adev)) 3856 amdgpu_virt_release_full_gpu(adev, false); 3857 3858 return r; 3859 } 3860 3861 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3862 { 3863 int i, r; 3864 3865 static enum amd_ip_block_type ip_order[] = { 3866 AMD_IP_BLOCK_TYPE_COMMON, 3867 AMD_IP_BLOCK_TYPE_GMC, 3868 AMD_IP_BLOCK_TYPE_PSP, 3869 AMD_IP_BLOCK_TYPE_IH, 3870 }; 3871 3872 for (i = 0; i < adev->num_ip_blocks; i++) { 3873 int j; 3874 struct amdgpu_ip_block *block; 3875 3876 block = &adev->ip_blocks[i]; 3877 block->status.hw = false; 3878 3879 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3880 3881 if (block->version->type != ip_order[j] || 3882 !block->status.valid) 3883 continue; 3884 3885 r = block->version->funcs->hw_init(&adev->ip_blocks[i]); 3886 if (r) { 3887 dev_err(adev->dev, "RE-INIT-early: %s failed\n", 3888 block->version->funcs->name); 3889 return r; 3890 } 3891 block->status.hw = true; 3892 } 3893 } 3894 3895 return 0; 3896 } 3897 3898 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3899 { 3900 struct amdgpu_ip_block *block; 3901 int i, r = 0; 3902 3903 static enum amd_ip_block_type ip_order[] = { 3904 AMD_IP_BLOCK_TYPE_SMC, 3905 AMD_IP_BLOCK_TYPE_DCE, 3906 AMD_IP_BLOCK_TYPE_GFX, 3907 AMD_IP_BLOCK_TYPE_SDMA, 3908 AMD_IP_BLOCK_TYPE_MES, 3909 AMD_IP_BLOCK_TYPE_UVD, 3910 AMD_IP_BLOCK_TYPE_VCE, 3911 AMD_IP_BLOCK_TYPE_VCN, 3912 AMD_IP_BLOCK_TYPE_JPEG 3913 }; 3914 3915 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3916 block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]); 3917 3918 if (!block) 3919 continue; 3920 3921 if (block->status.valid && !block->status.hw) { 3922 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) { 3923 r = amdgpu_ip_block_resume(block); 3924 } else { 3925 r = block->version->funcs->hw_init(block); 3926 } 3927 3928 if (r) { 3929 dev_err(adev->dev, "RE-INIT-late: %s failed\n", 3930 block->version->funcs->name); 3931 break; 3932 } 
3933 block->status.hw = true; 3934 } 3935 } 3936 3937 return r; 3938 } 3939 3940 /** 3941 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3942 * 3943 * @adev: amdgpu_device pointer 3944 * 3945 * First resume function for hardware IPs. The list of all the hardware 3946 * IPs that make up the asic is walked and the resume callbacks are run for 3947 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3948 * after a suspend and updates the software state as necessary. This 3949 * function is also used for restoring the GPU after a GPU reset. 3950 * Returns 0 on success, negative error code on failure. 3951 */ 3952 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3953 { 3954 int i, r; 3955 3956 for (i = 0; i < adev->num_ip_blocks; i++) { 3957 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3958 continue; 3959 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3960 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3961 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3962 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3963 3964 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3965 if (r) 3966 return r; 3967 } 3968 } 3969 3970 return 0; 3971 } 3972 3973 /** 3974 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3975 * 3976 * @adev: amdgpu_device pointer 3977 * 3978 * Second resume function for hardware IPs. The list of all the hardware 3979 * IPs that make up the asic is walked and the resume callbacks are run for 3980 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3981 * functional state after a suspend and updates the software state as 3982 * necessary. This function is also used for restoring the GPU after a GPU 3983 * reset. 3984 * Returns 0 on success, negative error code on failure. 3985 */ 3986 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3987 { 3988 int i, r; 3989 3990 for (i = 0; i < adev->num_ip_blocks; i++) { 3991 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3992 continue; 3993 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3994 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3995 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3996 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE || 3997 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3998 continue; 3999 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 4000 if (r) 4001 return r; 4002 } 4003 4004 return 0; 4005 } 4006 4007 /** 4008 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs 4009 * 4010 * @adev: amdgpu_device pointer 4011 * 4012 * Third resume function for hardware IPs. The list of all the hardware 4013 * IPs that make up the asic is walked and the resume callbacks are run for 4014 * all DCE. resume puts the hardware into a functional state after a suspend 4015 * and updates the software state as necessary. This function is also used 4016 * for restoring the GPU after a GPU reset. 4017 * 4018 * Returns 0 on success, negative error code on failure. 
4019 */ 4020 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev) 4021 { 4022 int i, r; 4023 4024 for (i = 0; i < adev->num_ip_blocks; i++) { 4025 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 4026 continue; 4027 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) { 4028 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 4029 if (r) 4030 return r; 4031 } 4032 } 4033 4034 return 0; 4035 } 4036 4037 /** 4038 * amdgpu_device_ip_resume - run resume for hardware IPs 4039 * 4040 * @adev: amdgpu_device pointer 4041 * 4042 * Main resume function for hardware IPs. The hardware IPs 4043 * are split into multiple resume functions because they are 4044 * also used in recovering from a GPU reset and some additional 4045 * steps need to be taken between them. In this case (S3/S4) they are 4046 * run sequentially. 4047 * Returns 0 on success, negative error code on failure. 4048 */ 4049 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 4050 { 4051 int r; 4052 4053 r = amdgpu_device_ip_resume_phase1(adev); 4054 if (r) 4055 return r; 4056 4057 r = amdgpu_device_fw_loading(adev); 4058 if (r) 4059 return r; 4060 4061 r = amdgpu_device_ip_resume_phase2(adev); 4062 4063 if (adev->mman.buffer_funcs_ring->sched.ready) 4064 amdgpu_ttm_set_buffer_funcs_status(adev, true); 4065 4066 if (r) 4067 return r; 4068 4069 amdgpu_fence_driver_hw_init(adev); 4070 4071 r = amdgpu_device_ip_resume_phase3(adev); 4072 4073 return r; 4074 } 4075 4076 /** 4077 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 4078 * 4079 * @adev: amdgpu_device pointer 4080 * 4081 * Query the VBIOS data tables to determine if the board supports SR-IOV. 4082 */ 4083 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 4084 { 4085 if (amdgpu_sriov_vf(adev)) { 4086 if (adev->is_atom_fw) { 4087 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 4088 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 4089 } else { 4090 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 4091 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 4092 } 4093 4094 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 4095 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 4096 } 4097 } 4098 4099 /** 4100 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 4101 * 4102 * @asic_type: AMD asic type 4103 * 4104 * Check if there is DC (new modesetting infrastructure) support for an asic. 4105 * Returns true if DC has support, false if not. 4106 */ 4107 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 4108 { 4109 switch (asic_type) { 4110 #ifdef CONFIG_DRM_AMDGPU_SI 4111 case CHIP_HAINAN: 4112 #endif 4113 case CHIP_TOPAZ: 4114 /* chips with no display hardware */ 4115 return false; 4116 #if defined(CONFIG_DRM_AMD_DC) 4117 case CHIP_TAHITI: 4118 case CHIP_PITCAIRN: 4119 case CHIP_VERDE: 4120 case CHIP_OLAND: 4121 /* 4122 * We have systems in the wild with these ASICs that require 4123 * LVDS and VGA support which is not supported with DC. 4124 * 4125 * Fallback to the non-DC driver here by default so as not to 4126 * cause regressions. 4127 */ 4128 #if defined(CONFIG_DRM_AMD_DC_SI) 4129 return amdgpu_dc > 0; 4130 #else 4131 return false; 4132 #endif 4133 case CHIP_BONAIRE: 4134 case CHIP_KAVERI: 4135 case CHIP_KABINI: 4136 case CHIP_MULLINS: 4137 /* 4138 * We have systems in the wild with these ASICs that require 4139 * VGA support which is not supported with DC.
4140 * 4141 * Fallback to the non-DC driver here by default so as not to 4142 * cause regressions. 4143 */ 4144 return amdgpu_dc > 0; 4145 default: 4146 return amdgpu_dc != 0; 4147 #else 4148 default: 4149 if (amdgpu_dc > 0) 4150 dev_info_once( 4151 adev->dev, 4152 "Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 4153 return false; 4154 #endif 4155 } 4156 } 4157 4158 /** 4159 * amdgpu_device_has_dc_support - check if dc is supported 4160 * 4161 * @adev: amdgpu_device pointer 4162 * 4163 * Returns true for supported, false for not supported 4164 */ 4165 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 4166 { 4167 if (adev->enable_virtual_display || 4168 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 4169 return false; 4170 4171 return amdgpu_device_asic_has_dc_support(adev->asic_type); 4172 } 4173 4174 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 4175 { 4176 struct amdgpu_device *adev = 4177 container_of(__work, struct amdgpu_device, xgmi_reset_work); 4178 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 4179 4180 /* It's a bug to not have a hive within this function */ 4181 if (WARN_ON(!hive)) 4182 return; 4183 4184 /* 4185 * Use task barrier to synchronize all xgmi reset works across the 4186 * hive. task_barrier_enter and task_barrier_exit will block 4187 * until all the threads running the xgmi reset works reach 4188 * those points. task_barrier_full will do both blocks. 4189 */ 4190 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 4191 4192 task_barrier_enter(&hive->tb); 4193 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 4194 4195 if (adev->asic_reset_res) 4196 goto fail; 4197 4198 task_barrier_exit(&hive->tb); 4199 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 4200 4201 if (adev->asic_reset_res) 4202 goto fail; 4203 4204 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 4205 } else { 4206 4207 task_barrier_full(&hive->tb); 4208 adev->asic_reset_res = amdgpu_asic_reset(adev); 4209 } 4210 4211 fail: 4212 if (adev->asic_reset_res) 4213 dev_warn(adev->dev, 4214 "ASIC reset failed with error, %d for drm dev, %s", 4215 adev->asic_reset_res, adev_to_drm(adev)->unique); 4216 amdgpu_put_xgmi_hive(hive); 4217 } 4218 4219 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 4220 { 4221 char *input = amdgpu_lockup_timeout; 4222 char *timeout_setting = NULL; 4223 int index = 0; 4224 long timeout; 4225 int ret = 0; 4226 4227 /* 4228 * By default timeout for non compute jobs is 10000 4229 * and 60000 for compute jobs. 4230 * In SR-IOV or passthrough mode, timeout for compute 4231 * jobs are 60000 by default. 4232 */ 4233 adev->gfx_timeout = msecs_to_jiffies(10000); 4234 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4235 if (amdgpu_sriov_vf(adev)) 4236 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
4237 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 4238 else 4239 adev->compute_timeout = msecs_to_jiffies(60000); 4240 4241 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4242 while ((timeout_setting = strsep(&input, ",")) && 4243 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4244 ret = kstrtol(timeout_setting, 0, &timeout); 4245 if (ret) 4246 return ret; 4247 4248 if (timeout == 0) { 4249 index++; 4250 continue; 4251 } else if (timeout < 0) { 4252 timeout = MAX_SCHEDULE_TIMEOUT; 4253 dev_warn(adev->dev, "lockup timeout disabled"); 4254 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 4255 } else { 4256 timeout = msecs_to_jiffies(timeout); 4257 } 4258 4259 switch (index++) { 4260 case 0: 4261 adev->gfx_timeout = timeout; 4262 break; 4263 case 1: 4264 adev->compute_timeout = timeout; 4265 break; 4266 case 2: 4267 adev->sdma_timeout = timeout; 4268 break; 4269 case 3: 4270 adev->video_timeout = timeout; 4271 break; 4272 default: 4273 break; 4274 } 4275 } 4276 /* 4277 * There is only one value specified and 4278 * it should apply to all non-compute jobs. 4279 */ 4280 if (index == 1) { 4281 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4282 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 4283 adev->compute_timeout = adev->gfx_timeout; 4284 } 4285 } 4286 4287 return ret; 4288 } 4289 4290 /** 4291 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 4292 * 4293 * @adev: amdgpu_device pointer 4294 * 4295 * RAM is direct mapped to the GPU if the IOMMU is not enabled or is in passthrough mode 4296 */ 4297 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 4298 { 4299 struct iommu_domain *domain; 4300 4301 domain = iommu_get_domain_for_dev(adev->dev); 4302 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 4303 adev->ram_is_direct_mapped = true; 4304 } 4305 4306 #if defined(CONFIG_HSA_AMD_P2P) 4307 /** 4308 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled. 4309 * 4310 * @adev: amdgpu_device pointer 4311 * 4312 * Returns true if the IOMMU is remapping the BAR address, false otherwise. 4313 */ 4314 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev) 4315 { 4316 struct iommu_domain *domain; 4317 4318 domain = iommu_get_domain_for_dev(adev->dev); 4319 if (domain && (domain->type == IOMMU_DOMAIN_DMA || 4320 domain->type == IOMMU_DOMAIN_DMA_FQ)) 4321 return true; 4322 4323 return false; 4324 } 4325 #endif 4326 4327 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 4328 { 4329 if (amdgpu_mcbp == 1) 4330 adev->gfx.mcbp = true; 4331 else if (amdgpu_mcbp == 0) 4332 adev->gfx.mcbp = false; 4333 4334 if (amdgpu_sriov_vf(adev)) 4335 adev->gfx.mcbp = true; 4336 4337 if (adev->gfx.mcbp) 4338 dev_info(adev->dev, "MCBP is enabled\n"); 4339 } 4340 4341 /** 4342 * amdgpu_device_init - initialize the driver 4343 * 4344 * @adev: amdgpu_device pointer 4345 * @flags: driver flags 4346 * 4347 * Initializes the driver info and hw (all asics). 4348 * Returns 0 for success or an error on failure. 4349 * Called at driver startup.
4350 */ 4351 int amdgpu_device_init(struct amdgpu_device *adev, 4352 uint32_t flags) 4353 { 4354 struct drm_device *ddev = adev_to_drm(adev); 4355 struct pci_dev *pdev = adev->pdev; 4356 int r, i; 4357 bool px = false; 4358 u32 max_MBps; 4359 int tmp; 4360 4361 adev->shutdown = false; 4362 adev->flags = flags; 4363 4364 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 4365 adev->asic_type = amdgpu_force_asic_type; 4366 else 4367 adev->asic_type = flags & AMD_ASIC_MASK; 4368 4369 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 4370 if (amdgpu_emu_mode == 1) 4371 adev->usec_timeout *= 10; 4372 adev->gmc.gart_size = 512 * 1024 * 1024; 4373 adev->accel_working = false; 4374 adev->num_rings = 0; 4375 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 4376 adev->mman.buffer_funcs = NULL; 4377 adev->mman.buffer_funcs_ring = NULL; 4378 adev->vm_manager.vm_pte_funcs = NULL; 4379 adev->vm_manager.vm_pte_num_scheds = 0; 4380 adev->gmc.gmc_funcs = NULL; 4381 adev->harvest_ip_mask = 0x0; 4382 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 4383 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 4384 4385 adev->smc_rreg = &amdgpu_invalid_rreg; 4386 adev->smc_wreg = &amdgpu_invalid_wreg; 4387 adev->pcie_rreg = &amdgpu_invalid_rreg; 4388 adev->pcie_wreg = &amdgpu_invalid_wreg; 4389 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 4390 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 4391 adev->pciep_rreg = &amdgpu_invalid_rreg; 4392 adev->pciep_wreg = &amdgpu_invalid_wreg; 4393 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 4394 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 4395 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 4396 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 4397 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 4398 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 4399 adev->didt_rreg = &amdgpu_invalid_rreg; 4400 adev->didt_wreg = &amdgpu_invalid_wreg; 4401 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 4402 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 4403 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 4404 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 4405 4406 dev_info( 4407 adev->dev, 4408 "initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 4409 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 4410 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 4411 4412 /* mutex initialization are all done here so we 4413 * can recall function without having locking issues 4414 */ 4415 mutex_init(&adev->firmware.mutex); 4416 mutex_init(&adev->pm.mutex); 4417 mutex_init(&adev->gfx.gpu_clock_mutex); 4418 mutex_init(&adev->srbm_mutex); 4419 mutex_init(&adev->gfx.pipe_reserve_mutex); 4420 mutex_init(&adev->gfx.gfx_off_mutex); 4421 mutex_init(&adev->gfx.partition_mutex); 4422 mutex_init(&adev->grbm_idx_mutex); 4423 mutex_init(&adev->mn_lock); 4424 mutex_init(&adev->virt.vf_errors.lock); 4425 hash_init(adev->mn_hash); 4426 mutex_init(&adev->psp.mutex); 4427 mutex_init(&adev->notifier_lock); 4428 mutex_init(&adev->pm.stable_pstate_ctx_lock); 4429 mutex_init(&adev->benchmark_mutex); 4430 mutex_init(&adev->gfx.reset_sem_mutex); 4431 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */ 4432 mutex_init(&adev->enforce_isolation_mutex); 4433 for (i = 0; i < MAX_XCP; ++i) { 4434 adev->isolation[i].spearhead = dma_fence_get_stub(); 4435 amdgpu_sync_create(&adev->isolation[i].active); 4436 amdgpu_sync_create(&adev->isolation[i].prev); 4437 } 4438 
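/* Locks for user queue scheduling, GFX/VCN workload profile switching and user queue management */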
mutex_init(&adev->gfx.userq_sch_mutex); 4439 mutex_init(&adev->gfx.workload_profile_mutex); 4440 mutex_init(&adev->vcn.workload_profile_mutex); 4441 mutex_init(&adev->userq_mutex); 4442 4443 amdgpu_device_init_apu_flags(adev); 4444 4445 r = amdgpu_device_check_arguments(adev); 4446 if (r) 4447 return r; 4448 4449 spin_lock_init(&adev->mmio_idx_lock); 4450 spin_lock_init(&adev->smc_idx_lock); 4451 spin_lock_init(&adev->pcie_idx_lock); 4452 spin_lock_init(&adev->uvd_ctx_idx_lock); 4453 spin_lock_init(&adev->didt_idx_lock); 4454 spin_lock_init(&adev->gc_cac_idx_lock); 4455 spin_lock_init(&adev->se_cac_idx_lock); 4456 spin_lock_init(&adev->audio_endpt_idx_lock); 4457 spin_lock_init(&adev->mm_stats.lock); 4458 spin_lock_init(&adev->virt.rlcg_reg_lock); 4459 spin_lock_init(&adev->wb.lock); 4460 4461 xa_init_flags(&adev->userq_xa, XA_FLAGS_LOCK_IRQ); 4462 4463 INIT_LIST_HEAD(&adev->reset_list); 4464 4465 INIT_LIST_HEAD(&adev->ras_list); 4466 4467 INIT_LIST_HEAD(&adev->pm.od_kobj_list); 4468 4469 INIT_LIST_HEAD(&adev->userq_mgr_list); 4470 4471 INIT_DELAYED_WORK(&adev->delayed_init_work, 4472 amdgpu_device_delayed_init_work_handler); 4473 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 4474 amdgpu_device_delay_enable_gfx_off); 4475 /* 4476 * Initialize the enforce_isolation work structures for each XCP 4477 * partition. This work handler is responsible for enforcing shader 4478 * isolation on AMD GPUs. It counts the number of emitted fences for 4479 * each GFX and compute ring. If there are any fences, it schedules 4480 * the `enforce_isolation_work` to be run after a delay. If there are 4481 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the 4482 * runqueue. 4483 */ 4484 for (i = 0; i < MAX_XCP; i++) { 4485 INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work, 4486 amdgpu_gfx_enforce_isolation_handler); 4487 adev->gfx.enforce_isolation[i].adev = adev; 4488 adev->gfx.enforce_isolation[i].xcp_id = i; 4489 } 4490 4491 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 4492 4493 adev->gfx.gfx_off_req_count = 1; 4494 adev->gfx.gfx_off_residency = 0; 4495 adev->gfx.gfx_off_entrycount = 0; 4496 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 4497 4498 atomic_set(&adev->throttling_logging_enabled, 1); 4499 /* 4500 * If throttling continues, logging will be performed every minute 4501 * to avoid log flooding. "-1" is subtracted since the thermal 4502 * throttling interrupt comes every second. Thus, the total logging 4503 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting 4504 * for the throttling interrupt) = 60 seconds.
4505 */ 4506 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 4507 4508 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 4509 4510 /* Registers mapping */ 4511 /* TODO: block userspace mapping of io register */ 4512 if (adev->asic_type >= CHIP_BONAIRE) { 4513 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 4514 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 4515 } else { 4516 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 4517 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 4518 } 4519 4520 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 4521 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 4522 4523 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 4524 if (!adev->rmmio) 4525 return -ENOMEM; 4526 4527 dev_info(adev->dev, "register mmio base: 0x%08X\n", 4528 (uint32_t)adev->rmmio_base); 4529 dev_info(adev->dev, "register mmio size: %u\n", 4530 (unsigned int)adev->rmmio_size); 4531 4532 /* 4533 * The reset domain needs to be present early, before the XGMI hive is 4534 * discovered (if any) and initialized, so the reset sem and in_gpu reset 4535 * flag can be used early on during init and before calling RREG32. 4536 */ 4537 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 4538 if (!adev->reset_domain) 4539 return -ENOMEM; 4540 4541 /* detect hw virtualization here */ 4542 amdgpu_virt_init(adev); 4543 4544 amdgpu_device_get_pcie_info(adev); 4545 4546 r = amdgpu_device_get_job_timeout_settings(adev); 4547 if (r) { 4548 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 4549 return r; 4550 } 4551 4552 amdgpu_device_set_mcbp(adev); 4553 4554 /* 4555 * By default, use the default level where all blocks are expected to be 4556 * initialized. At present, a 'swinit' of the blocks is required to be 4557 * completed before the need for a different level is detected. 4558 */ 4559 amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT); 4560 /* early init functions */ 4561 r = amdgpu_device_ip_early_init(adev); 4562 if (r) 4563 return r; 4564 4565 /* 4566 * No need to remove conflicting FBs for non-display class devices. 4567 * This prevents the sysfb from being freed accidentally. 4568 */ 4569 if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA || 4570 (pdev->class >> 8) == PCI_CLASS_DISPLAY_OTHER) { 4571 /* Get rid of things like offb */ 4572 r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name); 4573 if (r) 4574 return r; 4575 } 4576 4577 /* Enable TMZ based on IP_VERSION */ 4578 amdgpu_gmc_tmz_set(adev); 4579 4580 if (amdgpu_sriov_vf(adev) && 4581 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0)) 4582 /* VF MMIO access (except mailbox range) from CPU 4583 * will be blocked during sriov runtime 4584 */ 4585 adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT; 4586 4587 amdgpu_gmc_noretry_set(adev); 4588 /* Need to get xgmi info early to decide the reset behavior */ 4589 if (adev->gmc.xgmi.supported) { 4590 r = adev->gfxhub.funcs->get_xgmi_info(adev); 4591 if (r) 4592 return r; 4593 } 4594 4595 /* enable PCIE atomic ops */ 4596 if (amdgpu_sriov_vf(adev)) { 4597 if (adev->virt.fw_reserve.p_pf2vf) 4598 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 4599 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 4600 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4601 /* APUs with gfx9 onwards don't rely on PCIe atomics; rather, an 4602 * internal path natively supports atomics, so set have_atomics_support to true.
4603 */ 4604 } else if ((adev->flags & AMD_IS_APU) && 4605 (amdgpu_ip_version(adev, GC_HWIP, 0) > 4606 IP_VERSION(9, 0, 0))) { 4607 adev->have_atomics_support = true; 4608 } else { 4609 adev->have_atomics_support = 4610 !pci_enable_atomic_ops_to_root(adev->pdev, 4611 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 4612 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4613 } 4614 4615 if (!adev->have_atomics_support) 4616 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 4617 4618 /* doorbell bar mapping and doorbell index init*/ 4619 amdgpu_doorbell_init(adev); 4620 4621 if (amdgpu_emu_mode == 1) { 4622 /* post the asic on emulation mode */ 4623 emu_soc_asic_init(adev); 4624 goto fence_driver_init; 4625 } 4626 4627 amdgpu_reset_init(adev); 4628 4629 /* detect if we are with an SRIOV vbios */ 4630 if (adev->bios) 4631 amdgpu_device_detect_sriov_bios(adev); 4632 4633 /* check if we need to reset the asic 4634 * E.g., driver was not cleanly unloaded previously, etc. 4635 */ 4636 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4637 if (adev->gmc.xgmi.num_physical_nodes) { 4638 dev_info(adev->dev, "Pending hive reset.\n"); 4639 amdgpu_set_init_level(adev, 4640 AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 4641 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && 4642 !amdgpu_device_has_display_hardware(adev)) { 4643 r = psp_gpu_reset(adev); 4644 } else { 4645 tmp = amdgpu_reset_method; 4646 /* It should do a default reset when loading or reloading the driver, 4647 * regardless of the module parameter reset_method. 4648 */ 4649 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4650 r = amdgpu_asic_reset(adev); 4651 amdgpu_reset_method = tmp; 4652 } 4653 4654 if (r) { 4655 dev_err(adev->dev, "asic reset on init failed\n"); 4656 goto failed; 4657 } 4658 } 4659 4660 /* Post card if necessary */ 4661 if (amdgpu_device_need_post(adev)) { 4662 if (!adev->bios) { 4663 dev_err(adev->dev, "no vBIOS found\n"); 4664 r = -EINVAL; 4665 goto failed; 4666 } 4667 dev_info(adev->dev, "GPU posting now...\n"); 4668 r = amdgpu_device_asic_init(adev); 4669 if (r) { 4670 dev_err(adev->dev, "gpu post error!\n"); 4671 goto failed; 4672 } 4673 } 4674 4675 if (adev->bios) { 4676 if (adev->is_atom_fw) { 4677 /* Initialize clocks */ 4678 r = amdgpu_atomfirmware_get_clock_info(adev); 4679 if (r) { 4680 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4681 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4682 goto failed; 4683 } 4684 } else { 4685 /* Initialize clocks */ 4686 r = amdgpu_atombios_get_clock_info(adev); 4687 if (r) { 4688 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4689 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4690 goto failed; 4691 } 4692 /* init i2c buses */ 4693 amdgpu_i2c_init(adev); 4694 } 4695 } 4696 4697 fence_driver_init: 4698 /* Fence driver */ 4699 r = amdgpu_fence_driver_sw_init(adev); 4700 if (r) { 4701 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4702 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4703 goto failed; 4704 } 4705 4706 /* init the mode config */ 4707 drm_mode_config_init(adev_to_drm(adev)); 4708 4709 r = amdgpu_device_ip_init(adev); 4710 if (r) { 4711 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4712 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4713 goto release_ras_con; 4714 } 4715 4716 amdgpu_fence_driver_hw_init(adev); 4717 4718 dev_info(adev->dev, 4719 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4720 
adev->gfx.config.max_shader_engines, 4721 adev->gfx.config.max_sh_per_se, 4722 adev->gfx.config.max_cu_per_sh, 4723 adev->gfx.cu_info.number); 4724 4725 adev->accel_working = true; 4726 4727 amdgpu_vm_check_compute_bug(adev); 4728 4729 /* Initialize the buffer migration limit. */ 4730 if (amdgpu_moverate >= 0) 4731 max_MBps = amdgpu_moverate; 4732 else 4733 max_MBps = 8; /* Allow 8 MB/s. */ 4734 /* Get a log2 for easy divisions. */ 4735 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4736 4737 /* 4738 * Register the gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4739 * Otherwise the mgpu fan boost feature will be skipped because the 4740 * gpu instance count would come up short. 4741 */ 4742 amdgpu_register_gpu_instance(adev); 4743 4744 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4745 * explicit gating rather than handling it automatically. 4746 */ 4747 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 4748 r = amdgpu_device_ip_late_init(adev); 4749 if (r) { 4750 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4751 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4752 goto release_ras_con; 4753 } 4754 /* must succeed. */ 4755 amdgpu_ras_resume(adev); 4756 queue_delayed_work(system_wq, &adev->delayed_init_work, 4757 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4758 } 4759 4760 if (amdgpu_sriov_vf(adev)) { 4761 amdgpu_virt_release_full_gpu(adev, true); 4762 flush_delayed_work(&adev->delayed_init_work); 4763 } 4764 4765 /* 4766 * Place the sysfs registration after `late_init`, as some of the 4767 * operations performed in `late_init` might affect the creation of 4768 * the sysfs interfaces. 4769 */ 4770 r = amdgpu_atombios_sysfs_init(adev); 4771 if (r) 4772 drm_err(&adev->ddev, 4773 "registering atombios sysfs failed (%d).\n", r); 4774 4775 r = amdgpu_pm_sysfs_init(adev); 4776 if (r) 4777 dev_err(adev->dev, "registering pm sysfs failed (%d).\n", r); 4778 4779 r = amdgpu_ucode_sysfs_init(adev); 4780 if (r) { 4781 adev->ucode_sysfs_en = false; 4782 dev_err(adev->dev, "Creating firmware sysfs failed (%d).\n", r); 4783 } else 4784 adev->ucode_sysfs_en = true; 4785 4786 r = amdgpu_device_attr_sysfs_init(adev); 4787 if (r) 4788 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4789 4790 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4791 if (r) 4792 dev_err(adev->dev, 4793 "Could not create amdgpu board attributes\n"); 4794 4795 amdgpu_fru_sysfs_init(adev); 4796 amdgpu_reg_state_sysfs_init(adev); 4797 amdgpu_xcp_sysfs_init(adev); 4798 4799 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4800 r = amdgpu_pmu_init(adev); 4801 if (r) 4802 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4803 4804 /* Keep the stored pci confspace at hand for restore after a sudden PCI error */ 4805 if (amdgpu_device_cache_pci_state(adev->pdev)) 4806 pci_restore_state(pdev); 4807 4808 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4809 /* this will fail for cards that aren't VGA class devices, just 4810 * ignore it 4811 */ 4812 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4813 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4814 4815 px = amdgpu_device_supports_px(ddev); 4816 4817 if (px || (!dev_is_removable(&adev->pdev->dev) && 4818 apple_gmux_detect(NULL, NULL))) 4819 vga_switcheroo_register_client(adev->pdev, 4820 &amdgpu_switcheroo_ops, px); 4821 4822 if (px) 4823 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4824 4825 if (adev->init_lvl->level ==
AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 4826 amdgpu_xgmi_reset_on_init(adev); 4827 4828 amdgpu_device_check_iommu_direct_map(adev); 4829 4830 adev->pm_nb.notifier_call = amdgpu_device_pm_notifier; 4831 r = register_pm_notifier(&adev->pm_nb); 4832 if (r) 4833 goto failed; 4834 4835 return 0; 4836 4837 release_ras_con: 4838 if (amdgpu_sriov_vf(adev)) 4839 amdgpu_virt_release_full_gpu(adev, true); 4840 4841 /* failed in exclusive mode due to timeout */ 4842 if (amdgpu_sriov_vf(adev) && 4843 !amdgpu_sriov_runtime(adev) && 4844 amdgpu_virt_mmio_blocked(adev) && 4845 !amdgpu_virt_wait_reset(adev)) { 4846 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4847 /* Don't send request since VF is inactive. */ 4848 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4849 adev->virt.ops = NULL; 4850 r = -EAGAIN; 4851 } 4852 amdgpu_release_ras_context(adev); 4853 4854 failed: 4855 amdgpu_vf_error_trans_all(adev); 4856 4857 return r; 4858 } 4859 4860 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4861 { 4862 4863 /* Clear all CPU mappings pointing to this device */ 4864 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4865 4866 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4867 amdgpu_doorbell_fini(adev); 4868 4869 iounmap(adev->rmmio); 4870 adev->rmmio = NULL; 4871 if (adev->mman.aper_base_kaddr) 4872 iounmap(adev->mman.aper_base_kaddr); 4873 adev->mman.aper_base_kaddr = NULL; 4874 4875 /* Memory manager related */ 4876 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4877 arch_phys_wc_del(adev->gmc.vram_mtrr); 4878 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4879 } 4880 } 4881 4882 /** 4883 * amdgpu_device_fini_hw - tear down the driver 4884 * 4885 * @adev: amdgpu_device pointer 4886 * 4887 * Tear down the driver info (all asics). 4888 * Called at driver shutdown. 
4889 */ 4890 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4891 { 4892 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4893 flush_delayed_work(&adev->delayed_init_work); 4894 4895 if (adev->mman.initialized) 4896 drain_workqueue(adev->mman.bdev.wq); 4897 adev->shutdown = true; 4898 4899 unregister_pm_notifier(&adev->pm_nb); 4900 4901 /* make sure IB test finished before entering exclusive mode 4902 * to avoid preemption on IB test 4903 */ 4904 if (amdgpu_sriov_vf(adev)) { 4905 amdgpu_virt_request_full_gpu(adev, false); 4906 amdgpu_virt_fini_data_exchange(adev); 4907 } 4908 4909 /* disable all interrupts */ 4910 amdgpu_irq_disable_all(adev); 4911 if (adev->mode_info.mode_config_initialized) { 4912 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4913 drm_helper_force_disable_all(adev_to_drm(adev)); 4914 else 4915 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4916 } 4917 amdgpu_fence_driver_hw_fini(adev); 4918 4919 if (adev->pm.sysfs_initialized) 4920 amdgpu_pm_sysfs_fini(adev); 4921 if (adev->ucode_sysfs_en) 4922 amdgpu_ucode_sysfs_fini(adev); 4923 amdgpu_device_attr_sysfs_fini(adev); 4924 amdgpu_fru_sysfs_fini(adev); 4925 4926 amdgpu_reg_state_sysfs_fini(adev); 4927 amdgpu_xcp_sysfs_fini(adev); 4928 4929 /* disable ras feature must before hw fini */ 4930 amdgpu_ras_pre_fini(adev); 4931 4932 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4933 4934 amdgpu_device_ip_fini_early(adev); 4935 4936 amdgpu_irq_fini_hw(adev); 4937 4938 if (adev->mman.initialized) 4939 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4940 4941 amdgpu_gart_dummy_page_fini(adev); 4942 4943 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4944 amdgpu_device_unmap_mmio(adev); 4945 4946 } 4947 4948 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4949 { 4950 int i, idx; 4951 bool px; 4952 4953 amdgpu_device_ip_fini(adev); 4954 amdgpu_fence_driver_sw_fini(adev); 4955 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4956 adev->accel_working = false; 4957 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4958 for (i = 0; i < MAX_XCP; ++i) { 4959 dma_fence_put(adev->isolation[i].spearhead); 4960 amdgpu_sync_free(&adev->isolation[i].active); 4961 amdgpu_sync_free(&adev->isolation[i].prev); 4962 } 4963 4964 amdgpu_reset_fini(adev); 4965 4966 /* free i2c buses */ 4967 amdgpu_i2c_fini(adev); 4968 4969 if (adev->bios) { 4970 if (amdgpu_emu_mode != 1) 4971 amdgpu_atombios_fini(adev); 4972 amdgpu_bios_release(adev); 4973 } 4974 4975 kfree(adev->fru_info); 4976 adev->fru_info = NULL; 4977 4978 kfree(adev->xcp_mgr); 4979 adev->xcp_mgr = NULL; 4980 4981 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4982 4983 if (px || (!dev_is_removable(&adev->pdev->dev) && 4984 apple_gmux_detect(NULL, NULL))) 4985 vga_switcheroo_unregister_client(adev->pdev); 4986 4987 if (px) 4988 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4989 4990 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4991 vga_client_unregister(adev->pdev); 4992 4993 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4994 4995 iounmap(adev->rmmio); 4996 adev->rmmio = NULL; 4997 drm_dev_exit(idx); 4998 } 4999 5000 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 5001 amdgpu_pmu_fini(adev); 5002 if (adev->mman.discovery_bin) 5003 amdgpu_discovery_fini(adev); 5004 5005 amdgpu_reset_put_reset_domain(adev->reset_domain); 5006 adev->reset_domain = NULL; 5007 5008 kfree(adev->pci_state); 5009 5010 } 5011 5012 /** 5013 * amdgpu_device_evict_resources - evict device resources 5014 * @adev: amdgpu device object 5015 * 5016 * Evicts all ttm device resources(vram 
BOs, gart table) from the lru list 5017 * of the vram memory type. Mainly used for evicting device resources 5018 * at suspend time. 5019 * 5020 */ 5021 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 5022 { 5023 int ret; 5024 5025 /* No need to evict vram on APUs unless going to S4 */ 5026 if (!adev->in_s4 && (adev->flags & AMD_IS_APU)) 5027 return 0; 5028 5029 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 5030 if (ret) 5031 dev_warn(adev->dev, "evicting device resources failed\n"); 5032 return ret; 5033 } 5034 5035 /* 5036 * Suspend & resume. 5037 */ 5038 /** 5039 * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events 5040 * @nb: notifier block 5041 * @mode: suspend mode 5042 * @data: data 5043 * 5044 * This function is called when the system is about to suspend or hibernate. 5045 * It is used to set the appropriate flags so that eviction can be optimized 5046 * in the pm prepare callback. 5047 */ 5048 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 5049 void *data) 5050 { 5051 struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb); 5052 5053 switch (mode) { 5054 case PM_HIBERNATION_PREPARE: 5055 adev->in_s4 = true; 5056 break; 5057 case PM_POST_HIBERNATION: 5058 adev->in_s4 = false; 5059 break; 5060 } 5061 5062 return NOTIFY_DONE; 5063 } 5064 5065 /** 5066 * amdgpu_device_prepare - prepare for device suspend 5067 * 5068 * @dev: drm dev pointer 5069 * 5070 * Prepare to put the hw in the suspend state (all asics). 5071 * Returns 0 for success or an error on failure. 5072 * Called at driver suspend. 5073 */ 5074 int amdgpu_device_prepare(struct drm_device *dev) 5075 { 5076 struct amdgpu_device *adev = drm_to_adev(dev); 5077 int i, r; 5078 5079 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5080 return 0; 5081 5082 /* Evict the majority of BOs before starting suspend sequence */ 5083 r = amdgpu_device_evict_resources(adev); 5084 if (r) 5085 return r; 5086 5087 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 5088 5089 for (i = 0; i < adev->num_ip_blocks; i++) { 5090 if (!adev->ip_blocks[i].status.valid) 5091 continue; 5092 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 5093 continue; 5094 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]); 5095 if (r) 5096 return r; 5097 } 5098 5099 return 0; 5100 } 5101 5102 /** 5103 * amdgpu_device_complete - complete power state transition 5104 * 5105 * @dev: drm dev pointer 5106 * 5107 * Undo the changes from amdgpu_device_prepare. This will be 5108 * called on all resume transitions, including those that failed. 5109 */ 5110 void amdgpu_device_complete(struct drm_device *dev) 5111 { 5112 struct amdgpu_device *adev = drm_to_adev(dev); 5113 int i; 5114 5115 for (i = 0; i < adev->num_ip_blocks; i++) { 5116 if (!adev->ip_blocks[i].status.valid) 5117 continue; 5118 if (!adev->ip_blocks[i].version->funcs->complete) 5119 continue; 5120 adev->ip_blocks[i].version->funcs->complete(&adev->ip_blocks[i]); 5121 } 5122 } 5123 5124 /** 5125 * amdgpu_device_suspend - initiate device suspend 5126 * 5127 * @dev: drm dev pointer 5128 * @notify_clients: notify in-kernel DRM clients 5129 * 5130 * Puts the hw in the suspend state (all asics). 5131 * Returns 0 for success or an error on failure. 5132 * Called at driver suspend. 
5133 */ 5134 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) 5135 { 5136 struct amdgpu_device *adev = drm_to_adev(dev); 5137 int r = 0; 5138 5139 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5140 return 0; 5141 5142 adev->in_suspend = true; 5143 5144 if (amdgpu_sriov_vf(adev)) { 5145 if (!adev->in_s0ix && !adev->in_runpm) 5146 amdgpu_amdkfd_suspend_process(adev); 5147 amdgpu_virt_fini_data_exchange(adev); 5148 r = amdgpu_virt_request_full_gpu(adev, false); 5149 if (r) 5150 return r; 5151 } 5152 5153 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 5154 dev_warn(adev->dev, "smart shift update failed\n"); 5155 5156 if (notify_clients) 5157 drm_client_dev_suspend(adev_to_drm(adev), false); 5158 5159 cancel_delayed_work_sync(&adev->delayed_init_work); 5160 5161 amdgpu_ras_suspend(adev); 5162 5163 amdgpu_device_ip_suspend_phase1(adev); 5164 5165 if (!adev->in_s0ix) { 5166 amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); 5167 amdgpu_userq_suspend(adev); 5168 } 5169 5170 r = amdgpu_device_evict_resources(adev); 5171 if (r) 5172 return r; 5173 5174 amdgpu_ttm_set_buffer_funcs_status(adev, false); 5175 5176 amdgpu_fence_driver_hw_fini(adev); 5177 5178 amdgpu_device_ip_suspend_phase2(adev); 5179 5180 if (amdgpu_sriov_vf(adev)) 5181 amdgpu_virt_release_full_gpu(adev, false); 5182 5183 r = amdgpu_dpm_notify_rlc_state(adev, false); 5184 if (r) 5185 return r; 5186 5187 return 0; 5188 } 5189 5190 static inline int amdgpu_virt_resume(struct amdgpu_device *adev) 5191 { 5192 int r; 5193 unsigned int prev_physical_node_id = adev->gmc.xgmi.physical_node_id; 5194 5195 /* During VM resume, QEMU programming of VF MSIX table (register GFXMSIX_VECT0_ADDR_LO) 5196 * may not work. The access could be blocked by nBIF protection as VF isn't in 5197 * exclusive access mode. Exclusive access is enabled now, disable/enable MSIX 5198 * so that QEMU reprograms MSIX table. 5199 */ 5200 amdgpu_restore_msix(adev); 5201 5202 r = adev->gfxhub.funcs->get_xgmi_info(adev); 5203 if (r) 5204 return r; 5205 5206 dev_info(adev->dev, "xgmi node, old id %d, new id %d\n", 5207 prev_physical_node_id, adev->gmc.xgmi.physical_node_id); 5208 5209 adev->vm_manager.vram_base_offset = adev->gfxhub.funcs->get_mc_fb_offset(adev); 5210 adev->vm_manager.vram_base_offset += 5211 adev->gmc.xgmi.physical_node_id * adev->gmc.xgmi.node_segment_size; 5212 5213 return 0; 5214 } 5215 5216 /** 5217 * amdgpu_device_resume - initiate device resume 5218 * 5219 * @dev: drm dev pointer 5220 * @notify_clients: notify in-kernel DRM clients 5221 * 5222 * Bring the hw back to operating state (all asics). 5223 * Returns 0 for success or an error on failure. 5224 * Called at driver resume. 
5225 */ 5226 int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) 5227 { 5228 struct amdgpu_device *adev = drm_to_adev(dev); 5229 int r = 0; 5230 5231 if (amdgpu_sriov_vf(adev)) { 5232 r = amdgpu_virt_request_full_gpu(adev, true); 5233 if (r) 5234 return r; 5235 } 5236 5237 if (amdgpu_virt_xgmi_migrate_enabled(adev)) { 5238 r = amdgpu_virt_resume(adev); 5239 if (r) 5240 goto exit; 5241 } 5242 5243 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5244 return 0; 5245 5246 if (adev->in_s0ix) 5247 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 5248 5249 /* post card */ 5250 if (amdgpu_device_need_post(adev)) { 5251 r = amdgpu_device_asic_init(adev); 5252 if (r) 5253 dev_err(adev->dev, "amdgpu asic init failed\n"); 5254 } 5255 5256 r = amdgpu_device_ip_resume(adev); 5257 5258 if (r) { 5259 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 5260 goto exit; 5261 } 5262 5263 if (!adev->in_s0ix) { 5264 r = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); 5265 if (r) 5266 goto exit; 5267 5268 r = amdgpu_userq_resume(adev); 5269 if (r) 5270 goto exit; 5271 } 5272 5273 r = amdgpu_device_ip_late_init(adev); 5274 if (r) 5275 goto exit; 5276 5277 queue_delayed_work(system_wq, &adev->delayed_init_work, 5278 msecs_to_jiffies(AMDGPU_RESUME_MS)); 5279 exit: 5280 if (amdgpu_sriov_vf(adev)) { 5281 amdgpu_virt_init_data_exchange(adev); 5282 amdgpu_virt_release_full_gpu(adev, true); 5283 5284 if (!adev->in_s0ix && !r && !adev->in_runpm) 5285 r = amdgpu_amdkfd_resume_process(adev); 5286 } 5287 5288 if (r) 5289 return r; 5290 5291 /* Make sure IB tests flushed */ 5292 flush_delayed_work(&adev->delayed_init_work); 5293 5294 if (notify_clients) 5295 drm_client_dev_resume(adev_to_drm(adev), false); 5296 5297 amdgpu_ras_resume(adev); 5298 5299 if (adev->mode_info.num_crtc) { 5300 /* 5301 * Most of the connector probing functions try to acquire runtime pm 5302 * refs to ensure that the GPU is powered on when connector polling is 5303 * performed. Since we're calling this from a runtime PM callback, 5304 * trying to acquire rpm refs will cause us to deadlock. 5305 * 5306 * Since we're guaranteed to be holding the rpm lock, it's safe to 5307 * temporarily disable the rpm helpers so this doesn't deadlock us. 5308 */ 5309 #ifdef CONFIG_PM 5310 dev->dev->power.disable_depth++; 5311 #endif 5312 if (!adev->dc_enabled) 5313 drm_helper_hpd_irq_event(dev); 5314 else 5315 drm_kms_helper_hotplug_event(dev); 5316 #ifdef CONFIG_PM 5317 dev->dev->power.disable_depth--; 5318 #endif 5319 } 5320 adev->in_suspend = false; 5321 5322 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 5323 dev_warn(adev->dev, "smart shift update failed\n"); 5324 5325 return 0; 5326 } 5327 5328 /** 5329 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 5330 * 5331 * @adev: amdgpu_device pointer 5332 * 5333 * The list of all the hardware IPs that make up the asic is walked and 5334 * the check_soft_reset callbacks are run. check_soft_reset determines 5335 * if the asic is still hung or not. 5336 * Returns true if any of the IPs are still in a hung state, false if not. 
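 * Always returns true on SR-IOV VFs or when the ASIC needs a full reset.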
5337 */ 5338 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 5339 { 5340 int i; 5341 bool asic_hang = false; 5342 5343 if (amdgpu_sriov_vf(adev)) 5344 return true; 5345 5346 if (amdgpu_asic_need_full_reset(adev)) 5347 return true; 5348 5349 for (i = 0; i < adev->num_ip_blocks; i++) { 5350 if (!adev->ip_blocks[i].status.valid) 5351 continue; 5352 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 5353 adev->ip_blocks[i].status.hang = 5354 adev->ip_blocks[i].version->funcs->check_soft_reset( 5355 &adev->ip_blocks[i]); 5356 if (adev->ip_blocks[i].status.hang) { 5357 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 5358 asic_hang = true; 5359 } 5360 } 5361 return asic_hang; 5362 } 5363 5364 /** 5365 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 5366 * 5367 * @adev: amdgpu_device pointer 5368 * 5369 * The list of all the hardware IPs that make up the asic is walked and the 5370 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 5371 * handles any IP specific hardware or software state changes that are 5372 * necessary for a soft reset to succeed. 5373 * Returns 0 on success, negative error code on failure. 5374 */ 5375 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 5376 { 5377 int i, r = 0; 5378 5379 for (i = 0; i < adev->num_ip_blocks; i++) { 5380 if (!adev->ip_blocks[i].status.valid) 5381 continue; 5382 if (adev->ip_blocks[i].status.hang && 5383 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 5384 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]); 5385 if (r) 5386 return r; 5387 } 5388 } 5389 5390 return 0; 5391 } 5392 5393 /** 5394 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 5395 * 5396 * @adev: amdgpu_device pointer 5397 * 5398 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 5399 * reset is necessary to recover. 5400 * Returns true if a full asic reset is required, false if not. 5401 */ 5402 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 5403 { 5404 int i; 5405 5406 if (amdgpu_asic_need_full_reset(adev)) 5407 return true; 5408 5409 for (i = 0; i < adev->num_ip_blocks; i++) { 5410 if (!adev->ip_blocks[i].status.valid) 5411 continue; 5412 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 5413 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 5414 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 5415 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 5416 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 5417 if (adev->ip_blocks[i].status.hang) { 5418 dev_info(adev->dev, "Some block need full reset!\n"); 5419 return true; 5420 } 5421 } 5422 } 5423 return false; 5424 } 5425 5426 /** 5427 * amdgpu_device_ip_soft_reset - do a soft reset 5428 * 5429 * @adev: amdgpu_device pointer 5430 * 5431 * The list of all the hardware IPs that make up the asic is walked and the 5432 * soft_reset callbacks are run if the block is hung. soft_reset handles any 5433 * IP specific hardware or software state changes that are necessary to soft 5434 * reset the IP. 5435 * Returns 0 on success, negative error code on failure. 
5436 */ 5437 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 5438 { 5439 int i, r = 0; 5440 5441 for (i = 0; i < adev->num_ip_blocks; i++) { 5442 if (!adev->ip_blocks[i].status.valid) 5443 continue; 5444 if (adev->ip_blocks[i].status.hang && 5445 adev->ip_blocks[i].version->funcs->soft_reset) { 5446 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]); 5447 if (r) 5448 return r; 5449 } 5450 } 5451 5452 return 0; 5453 } 5454 5455 /** 5456 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 5457 * 5458 * @adev: amdgpu_device pointer 5459 * 5460 * The list of all the hardware IPs that make up the asic is walked and the 5461 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 5462 * handles any IP specific hardware or software state changes that are 5463 * necessary after the IP has been soft reset. 5464 * Returns 0 on success, negative error code on failure. 5465 */ 5466 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 5467 { 5468 int i, r = 0; 5469 5470 for (i = 0; i < adev->num_ip_blocks; i++) { 5471 if (!adev->ip_blocks[i].status.valid) 5472 continue; 5473 if (adev->ip_blocks[i].status.hang && 5474 adev->ip_blocks[i].version->funcs->post_soft_reset) 5475 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]); 5476 if (r) 5477 return r; 5478 } 5479 5480 return 0; 5481 } 5482 5483 /** 5484 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 5485 * 5486 * @adev: amdgpu_device pointer 5487 * @reset_context: amdgpu reset context pointer 5488 * 5489 * do VF FLR and reinitialize Asic 5490 * return 0 means succeeded otherwise failed 5491 */ 5492 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 5493 struct amdgpu_reset_context *reset_context) 5494 { 5495 int r; 5496 struct amdgpu_hive_info *hive = NULL; 5497 5498 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { 5499 if (!amdgpu_ras_get_fed_status(adev)) 5500 amdgpu_virt_ready_to_reset(adev); 5501 amdgpu_virt_wait_reset(adev); 5502 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5503 r = amdgpu_virt_request_full_gpu(adev, true); 5504 } else { 5505 r = amdgpu_virt_reset_gpu(adev); 5506 } 5507 if (r) 5508 return r; 5509 5510 amdgpu_ras_clear_err_state(adev); 5511 amdgpu_irq_gpu_reset_resume_helper(adev); 5512 5513 /* some sw clean up VF needs to do before recover */ 5514 amdgpu_virt_post_reset(adev); 5515 5516 /* Resume IP prior to SMC */ 5517 r = amdgpu_device_ip_reinit_early_sriov(adev); 5518 if (r) 5519 return r; 5520 5521 amdgpu_virt_init_data_exchange(adev); 5522 5523 r = amdgpu_device_fw_loading(adev); 5524 if (r) 5525 return r; 5526 5527 /* now we are okay to resume SMC/CP/SDMA */ 5528 r = amdgpu_device_ip_reinit_late_sriov(adev); 5529 if (r) 5530 return r; 5531 5532 hive = amdgpu_get_xgmi_hive(adev); 5533 /* Update PSP FW topology after reset */ 5534 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5535 r = amdgpu_xgmi_update_topology(hive, adev); 5536 if (hive) 5537 amdgpu_put_xgmi_hive(hive); 5538 if (r) 5539 return r; 5540 5541 r = amdgpu_ib_ring_tests(adev); 5542 if (r) 5543 return r; 5544 5545 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) 5546 amdgpu_inc_vram_lost(adev); 5547 5548 /* need to be called during full access so we can't do it later like 5549 * bare-metal does. 
5550 */ 5551 amdgpu_amdkfd_post_reset(adev); 5552 amdgpu_virt_release_full_gpu(adev, true); 5553 5554 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5555 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) || 5556 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 5557 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 5558 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) || 5559 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5560 amdgpu_ras_resume(adev); 5561 5562 amdgpu_virt_ras_telemetry_post_reset(adev); 5563 5564 return 0; 5565 } 5566 5567 /** 5568 * amdgpu_device_has_job_running - check if there is any unfinished job 5569 * 5570 * @adev: amdgpu_device pointer 5571 * 5572 * check if there is any job running on the device when guest driver receives 5573 * FLR notification from host driver. If there are still jobs running, then 5574 * the guest driver will not respond the FLR reset. Instead, let the job hit 5575 * the timeout and guest driver then issue the reset request. 5576 */ 5577 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5578 { 5579 int i; 5580 5581 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5582 struct amdgpu_ring *ring = adev->rings[i]; 5583 5584 if (!amdgpu_ring_sched_ready(ring)) 5585 continue; 5586 5587 if (amdgpu_fence_count_emitted(ring)) 5588 return true; 5589 } 5590 return false; 5591 } 5592 5593 /** 5594 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5595 * 5596 * @adev: amdgpu_device pointer 5597 * 5598 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5599 * a hung GPU. 5600 */ 5601 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5602 { 5603 5604 if (amdgpu_gpu_recovery == 0) 5605 goto disabled; 5606 5607 /* Skip soft reset check in fatal error mode */ 5608 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5609 return true; 5610 5611 if (amdgpu_sriov_vf(adev)) 5612 return true; 5613 5614 if (amdgpu_gpu_recovery == -1) { 5615 switch (adev->asic_type) { 5616 #ifdef CONFIG_DRM_AMDGPU_SI 5617 case CHIP_VERDE: 5618 case CHIP_TAHITI: 5619 case CHIP_PITCAIRN: 5620 case CHIP_OLAND: 5621 case CHIP_HAINAN: 5622 #endif 5623 #ifdef CONFIG_DRM_AMDGPU_CIK 5624 case CHIP_KAVERI: 5625 case CHIP_KABINI: 5626 case CHIP_MULLINS: 5627 #endif 5628 case CHIP_CARRIZO: 5629 case CHIP_STONEY: 5630 case CHIP_CYAN_SKILLFISH: 5631 goto disabled; 5632 default: 5633 break; 5634 } 5635 } 5636 5637 return true; 5638 5639 disabled: 5640 dev_info(adev->dev, "GPU recovery disabled.\n"); 5641 return false; 5642 } 5643 5644 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5645 { 5646 u32 i; 5647 int ret = 0; 5648 5649 if (adev->bios) 5650 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5651 5652 dev_info(adev->dev, "GPU mode1 reset\n"); 5653 5654 /* Cache the state before bus master disable. The saved config space 5655 * values are used in other cases like restore after mode-2 reset. 
5656 */ 5657 amdgpu_device_cache_pci_state(adev->pdev); 5658 5659 /* disable BM */ 5660 pci_clear_master(adev->pdev); 5661 5662 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5663 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5664 ret = amdgpu_dpm_mode1_reset(adev); 5665 } else { 5666 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5667 ret = psp_gpu_reset(adev); 5668 } 5669 5670 if (ret) 5671 goto mode1_reset_failed; 5672 5673 amdgpu_device_load_pci_state(adev->pdev); 5674 ret = amdgpu_psp_wait_for_bootloader(adev); 5675 if (ret) 5676 goto mode1_reset_failed; 5677 5678 /* wait for asic to come out of reset */ 5679 for (i = 0; i < adev->usec_timeout; i++) { 5680 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5681 5682 if (memsize != 0xffffffff) 5683 break; 5684 udelay(1); 5685 } 5686 5687 if (i >= adev->usec_timeout) { 5688 ret = -ETIMEDOUT; 5689 goto mode1_reset_failed; 5690 } 5691 5692 if (adev->bios) 5693 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5694 5695 return 0; 5696 5697 mode1_reset_failed: 5698 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5699 return ret; 5700 } 5701 5702 int amdgpu_device_link_reset(struct amdgpu_device *adev) 5703 { 5704 int ret = 0; 5705 5706 dev_info(adev->dev, "GPU link reset\n"); 5707 5708 if (!adev->pcie_reset_ctx.occurs_dpc) 5709 ret = amdgpu_dpm_link_reset(adev); 5710 5711 if (ret) 5712 goto link_reset_failed; 5713 5714 ret = amdgpu_psp_wait_for_bootloader(adev); 5715 if (ret) 5716 goto link_reset_failed; 5717 5718 return 0; 5719 5720 link_reset_failed: 5721 dev_err(adev->dev, "GPU link reset failed\n"); 5722 return ret; 5723 } 5724 5725 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5726 struct amdgpu_reset_context *reset_context) 5727 { 5728 int i, r = 0; 5729 struct amdgpu_job *job = NULL; 5730 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; 5731 bool need_full_reset = 5732 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5733 5734 if (reset_context->reset_req_dev == adev) 5735 job = reset_context->job; 5736 5737 if (amdgpu_sriov_vf(adev)) 5738 amdgpu_virt_pre_reset(adev); 5739 5740 amdgpu_fence_driver_isr_toggle(adev, true); 5741 5742 /* block all schedulers and reset given job's ring */ 5743 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5744 struct amdgpu_ring *ring = adev->rings[i]; 5745 5746 if (!amdgpu_ring_sched_ready(ring)) 5747 continue; 5748 5749 /* Clear job fence from fence drv to avoid force_completion 5750 * leave NULL and vm flush fence in fence drv 5751 */ 5752 amdgpu_fence_driver_clear_job_fences(ring); 5753 5754 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5755 amdgpu_fence_driver_force_completion(ring); 5756 } 5757 5758 amdgpu_fence_driver_isr_toggle(adev, false); 5759 5760 if (job && job->vm) 5761 drm_sched_increase_karma(&job->base); 5762 5763 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5764 /* If reset handler not implemented, continue; otherwise return */ 5765 if (r == -EOPNOTSUPP) 5766 r = 0; 5767 else 5768 return r; 5769 5770 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5771 if (!amdgpu_sriov_vf(adev)) { 5772 5773 if (!need_full_reset) 5774 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5775 5776 if (!need_full_reset && amdgpu_gpu_recovery && 5777 amdgpu_device_ip_check_soft_reset(adev)) { 5778 amdgpu_device_ip_pre_soft_reset(adev); 5779 r = amdgpu_device_ip_soft_reset(adev); 5780 amdgpu_device_ip_post_soft_reset(adev); 5781 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5782 
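/* Either the soft reset failed or some IPs are still hung afterwards, so escalate to a full ASIC reset */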
dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5783 need_full_reset = true; 5784 } 5785 } 5786 5787 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { 5788 dev_info(tmp_adev->dev, "Dumping IP State\n"); 5789 /* Trigger ip dump before we reset the asic */ 5790 for (i = 0; i < tmp_adev->num_ip_blocks; i++) 5791 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) 5792 tmp_adev->ip_blocks[i].version->funcs 5793 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]); 5794 dev_info(tmp_adev->dev, "Dumping IP State Completed\n"); 5795 } 5796 5797 if (need_full_reset) 5798 r = amdgpu_device_ip_suspend(adev); 5799 if (need_full_reset) 5800 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5801 else 5802 clear_bit(AMDGPU_NEED_FULL_RESET, 5803 &reset_context->flags); 5804 } 5805 5806 return r; 5807 } 5808 5809 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) 5810 { 5811 struct list_head *device_list_handle; 5812 bool full_reset, vram_lost = false; 5813 struct amdgpu_device *tmp_adev; 5814 int r, init_level; 5815 5816 device_list_handle = reset_context->reset_device_list; 5817 5818 if (!device_list_handle) 5819 return -EINVAL; 5820 5821 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5822 5823 /** 5824 * If it's reset on init, it's default init level, otherwise keep level 5825 * as recovery level. 5826 */ 5827 if (reset_context->method == AMD_RESET_METHOD_ON_INIT) 5828 init_level = AMDGPU_INIT_LEVEL_DEFAULT; 5829 else 5830 init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY; 5831 5832 r = 0; 5833 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5834 amdgpu_set_init_level(tmp_adev, init_level); 5835 if (full_reset) { 5836 /* post card */ 5837 amdgpu_ras_clear_err_state(tmp_adev); 5838 r = amdgpu_device_asic_init(tmp_adev); 5839 if (r) { 5840 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5841 } else { 5842 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5843 5844 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5845 if (r) 5846 goto out; 5847 5848 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5849 5850 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 5851 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job); 5852 5853 if (vram_lost) { 5854 dev_info( 5855 tmp_adev->dev, 5856 "VRAM is lost due to GPU reset!\n"); 5857 amdgpu_inc_vram_lost(tmp_adev); 5858 } 5859 5860 r = amdgpu_device_fw_loading(tmp_adev); 5861 if (r) 5862 return r; 5863 5864 r = amdgpu_xcp_restore_partition_mode( 5865 tmp_adev->xcp_mgr); 5866 if (r) 5867 goto out; 5868 5869 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5870 if (r) 5871 goto out; 5872 5873 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 5874 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5875 5876 r = amdgpu_device_ip_resume_phase3(tmp_adev); 5877 if (r) 5878 goto out; 5879 5880 if (vram_lost) 5881 amdgpu_device_fill_reset_magic(tmp_adev); 5882 5883 /* 5884 * Add this ASIC as tracked as reset was already 5885 * complete successfully. 5886 */ 5887 amdgpu_register_gpu_instance(tmp_adev); 5888 5889 if (!reset_context->hive && 5890 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5891 amdgpu_xgmi_add_device(tmp_adev); 5892 5893 r = amdgpu_device_ip_late_init(tmp_adev); 5894 if (r) 5895 goto out; 5896 5897 drm_client_dev_resume(adev_to_drm(tmp_adev), false); 5898 5899 /* 5900 * The GPU enters bad state once faulty pages 5901 * by ECC has reached the threshold, and ras 5902 * recovery is scheduled next. 
So add one check 5903 * here to break recovery if it indeed exceeds 5904 * bad page threshold, and remind user to 5905 * retire this GPU or setting one bigger 5906 * bad_page_threshold value to fix this once 5907 * probing driver again. 5908 */ 5909 if (!amdgpu_ras_is_rma(tmp_adev)) { 5910 /* must succeed. */ 5911 amdgpu_ras_resume(tmp_adev); 5912 } else { 5913 r = -EINVAL; 5914 goto out; 5915 } 5916 5917 /* Update PSP FW topology after reset */ 5918 if (reset_context->hive && 5919 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5920 r = amdgpu_xgmi_update_topology( 5921 reset_context->hive, tmp_adev); 5922 } 5923 } 5924 5925 out: 5926 if (!r) { 5927 /* IP init is complete now, set level as default */ 5928 amdgpu_set_init_level(tmp_adev, 5929 AMDGPU_INIT_LEVEL_DEFAULT); 5930 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5931 r = amdgpu_ib_ring_tests(tmp_adev); 5932 if (r) { 5933 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5934 r = -EAGAIN; 5935 goto end; 5936 } 5937 } 5938 5939 if (r) 5940 tmp_adev->asic_reset_res = r; 5941 } 5942 5943 end: 5944 return r; 5945 } 5946 5947 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5948 struct amdgpu_reset_context *reset_context) 5949 { 5950 struct amdgpu_device *tmp_adev = NULL; 5951 bool need_full_reset, skip_hw_reset; 5952 int r = 0; 5953 5954 /* Try reset handler method first */ 5955 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5956 reset_list); 5957 5958 reset_context->reset_device_list = device_list_handle; 5959 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5960 /* If reset handler not implemented, continue; otherwise return */ 5961 if (r == -EOPNOTSUPP) 5962 r = 0; 5963 else 5964 return r; 5965 5966 /* Reset handler not implemented, use the default method */ 5967 need_full_reset = 5968 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5969 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5970 5971 /* 5972 * ASIC reset has to be done on all XGMI hive nodes ASAP 5973 * to allow proper links negotiation in FW (within 1 sec) 5974 */ 5975 if (!skip_hw_reset && need_full_reset) { 5976 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5977 /* For XGMI run all resets in parallel to speed up the process */ 5978 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5979 if (!queue_work(system_unbound_wq, 5980 &tmp_adev->xgmi_reset_work)) 5981 r = -EALREADY; 5982 } else 5983 r = amdgpu_asic_reset(tmp_adev); 5984 5985 if (r) { 5986 dev_err(tmp_adev->dev, 5987 "ASIC reset failed with error, %d for drm dev, %s", 5988 r, adev_to_drm(tmp_adev)->unique); 5989 goto out; 5990 } 5991 } 5992 5993 /* For XGMI wait for all resets to complete before proceed */ 5994 if (!r) { 5995 list_for_each_entry(tmp_adev, device_list_handle, 5996 reset_list) { 5997 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5998 flush_work(&tmp_adev->xgmi_reset_work); 5999 r = tmp_adev->asic_reset_res; 6000 if (r) 6001 break; 6002 } 6003 } 6004 } 6005 } 6006 6007 if (!r && amdgpu_ras_intr_triggered()) { 6008 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6009 amdgpu_ras_reset_error_count(tmp_adev, 6010 AMDGPU_RAS_BLOCK__MMHUB); 6011 } 6012 6013 amdgpu_ras_intr_cleared(); 6014 } 6015 6016 r = amdgpu_device_reinit_after_reset(reset_context); 6017 if (r == -EAGAIN) 6018 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 6019 else 6020 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 6021 6022 out: 6023 return r; 6024 } 6025 6026 static void 
amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 6027 { 6028 6029 switch (amdgpu_asic_reset_method(adev)) { 6030 case AMD_RESET_METHOD_MODE1: 6031 case AMD_RESET_METHOD_LINK: 6032 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 6033 break; 6034 case AMD_RESET_METHOD_MODE2: 6035 adev->mp1_state = PP_MP1_STATE_RESET; 6036 break; 6037 default: 6038 adev->mp1_state = PP_MP1_STATE_NONE; 6039 break; 6040 } 6041 } 6042 6043 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 6044 { 6045 amdgpu_vf_error_trans_all(adev); 6046 adev->mp1_state = PP_MP1_STATE_NONE; 6047 } 6048 6049 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 6050 { 6051 struct pci_dev *p = NULL; 6052 6053 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 6054 adev->pdev->bus->number, 1); 6055 if (p) { 6056 pm_runtime_enable(&(p->dev)); 6057 pm_runtime_resume(&(p->dev)); 6058 } 6059 6060 pci_dev_put(p); 6061 } 6062 6063 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 6064 { 6065 enum amd_reset_method reset_method; 6066 struct pci_dev *p = NULL; 6067 u64 expires; 6068 6069 /* 6070 * For now, only BACO and mode1 reset are confirmed 6071 * to suffer the audio issue without proper suspended. 6072 */ 6073 reset_method = amdgpu_asic_reset_method(adev); 6074 if ((reset_method != AMD_RESET_METHOD_BACO) && 6075 (reset_method != AMD_RESET_METHOD_MODE1)) 6076 return -EINVAL; 6077 6078 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 6079 adev->pdev->bus->number, 1); 6080 if (!p) 6081 return -ENODEV; 6082 6083 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 6084 if (!expires) 6085 /* 6086 * If we cannot get the audio device autosuspend delay, 6087 * a fixed 4S interval will be used. Considering 3S is 6088 * the audio controller default autosuspend delay setting. 6089 * 4S used here is guaranteed to cover that. 6090 */ 6091 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 6092 6093 while (!pm_runtime_status_suspended(&(p->dev))) { 6094 if (!pm_runtime_suspend(&(p->dev))) 6095 break; 6096 6097 if (expires < ktime_get_mono_fast_ns()) { 6098 dev_warn(adev->dev, "failed to suspend display audio\n"); 6099 pci_dev_put(p); 6100 /* TODO: abort the succeeding gpu reset? */ 6101 return -ETIMEDOUT; 6102 } 6103 } 6104 6105 pm_runtime_disable(&(p->dev)); 6106 6107 pci_dev_put(p); 6108 return 0; 6109 } 6110 6111 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 6112 { 6113 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 6114 6115 #if defined(CONFIG_DEBUG_FS) 6116 if (!amdgpu_sriov_vf(adev)) 6117 cancel_work(&adev->reset_work); 6118 #endif 6119 6120 if (adev->kfd.dev) 6121 cancel_work(&adev->kfd.reset_work); 6122 6123 if (amdgpu_sriov_vf(adev)) 6124 cancel_work(&adev->virt.flr_work); 6125 6126 if (con && adev->ras_enabled) 6127 cancel_work(&con->recovery_work); 6128 6129 } 6130 6131 static int amdgpu_device_health_check(struct list_head *device_list_handle) 6132 { 6133 struct amdgpu_device *tmp_adev; 6134 int ret = 0; 6135 6136 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6137 ret |= amdgpu_device_bus_status_check(tmp_adev); 6138 } 6139 6140 return ret; 6141 } 6142 6143 static int amdgpu_device_recovery_prepare(struct amdgpu_device *adev, 6144 struct list_head *device_list, 6145 struct amdgpu_hive_info *hive) 6146 { 6147 struct amdgpu_device *tmp_adev = NULL; 6148 int r; 6149 6150 /* 6151 * Build list of devices to reset. 
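 * Each device is linked into the list through its reset_list member.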
6152 * In case we are in XGMI hive mode, resort the device list 6153 * to put adev in the 1st position. 6154 */ 6155 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) { 6156 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 6157 list_add_tail(&tmp_adev->reset_list, device_list); 6158 if (adev->shutdown) 6159 tmp_adev->shutdown = true; 6160 if (adev->pcie_reset_ctx.occurs_dpc) 6161 tmp_adev->pcie_reset_ctx.in_link_reset = true; 6162 } 6163 if (!list_is_first(&adev->reset_list, device_list)) 6164 list_rotate_to_front(&adev->reset_list, device_list); 6165 } else { 6166 list_add_tail(&adev->reset_list, device_list); 6167 } 6168 6169 if (!amdgpu_sriov_vf(adev) && (!adev->pcie_reset_ctx.occurs_dpc)) { 6170 r = amdgpu_device_health_check(device_list); 6171 if (r) 6172 return r; 6173 } 6174 6175 return 0; 6176 } 6177 6178 static void amdgpu_device_recovery_get_reset_lock(struct amdgpu_device *adev, 6179 struct list_head *device_list) 6180 { 6181 struct amdgpu_device *tmp_adev = NULL; 6182 6183 if (list_empty(device_list)) 6184 return; 6185 tmp_adev = 6186 list_first_entry(device_list, struct amdgpu_device, reset_list); 6187 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 6188 } 6189 6190 static void amdgpu_device_recovery_put_reset_lock(struct amdgpu_device *adev, 6191 struct list_head *device_list) 6192 { 6193 struct amdgpu_device *tmp_adev = NULL; 6194 6195 if (list_empty(device_list)) 6196 return; 6197 tmp_adev = 6198 list_first_entry(device_list, struct amdgpu_device, reset_list); 6199 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 6200 } 6201 6202 static int amdgpu_device_halt_activities( 6203 struct amdgpu_device *adev, struct amdgpu_job *job, 6204 struct amdgpu_reset_context *reset_context, 6205 struct list_head *device_list, struct amdgpu_hive_info *hive, 6206 bool need_emergency_restart) 6207 { 6208 struct amdgpu_device *tmp_adev = NULL; 6209 int i, r = 0; 6210 6211 /* block all schedulers and reset given job's ring */ 6212 list_for_each_entry(tmp_adev, device_list, reset_list) { 6213 amdgpu_device_set_mp1_state(tmp_adev); 6214 6215 /* 6216 * Try to put the audio codec into suspend state 6217 * before gpu reset started. 6218 * 6219 * Due to the power domain of the graphics device 6220 * is shared with AZ power domain. Without this, 6221 * we may change the audio hardware from behind 6222 * the audio driver's back. That will trigger 6223 * some audio codec errors. 6224 */ 6225 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 6226 tmp_adev->pcie_reset_ctx.audio_suspended = true; 6227 6228 amdgpu_ras_set_error_query_ready(tmp_adev, false); 6229 6230 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 6231 6232 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context); 6233 6234 /* 6235 * Mark these ASICs to be reset as untracked first 6236 * And add them back after reset completed 6237 */ 6238 amdgpu_unregister_gpu_instance(tmp_adev); 6239 6240 drm_client_dev_suspend(adev_to_drm(tmp_adev), false); 6241 6242 /* disable ras on ALL IPs */ 6243 if (!need_emergency_restart && 6244 (!adev->pcie_reset_ctx.occurs_dpc) && 6245 amdgpu_device_ip_need_full_reset(tmp_adev)) 6246 amdgpu_ras_suspend(tmp_adev); 6247 6248 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6249 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6250 6251 if (!amdgpu_ring_sched_ready(ring)) 6252 continue; 6253 6254 drm_sched_stop(&ring->sched, job ? 
&job->base : NULL); 6255 6256 if (need_emergency_restart) 6257 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 6258 } 6259 atomic_inc(&tmp_adev->gpu_reset_counter); 6260 } 6261 6262 return r; 6263 } 6264 6265 static int amdgpu_device_asic_reset(struct amdgpu_device *adev, 6266 struct list_head *device_list, 6267 struct amdgpu_reset_context *reset_context) 6268 { 6269 struct amdgpu_device *tmp_adev = NULL; 6270 int retry_limit = AMDGPU_MAX_RETRY_LIMIT; 6271 int r = 0; 6272 6273 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 6274 list_for_each_entry(tmp_adev, device_list, reset_list) { 6275 if (adev->pcie_reset_ctx.occurs_dpc) 6276 tmp_adev->no_hw_access = true; 6277 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 6278 if (adev->pcie_reset_ctx.occurs_dpc) 6279 tmp_adev->no_hw_access = false; 6280 /*TODO Should we stop ?*/ 6281 if (r) { 6282 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 6283 r, adev_to_drm(tmp_adev)->unique); 6284 tmp_adev->asic_reset_res = r; 6285 } 6286 } 6287 6288 /* Actual ASIC resets if needed.*/ 6289 /* Host driver will handle XGMI hive reset for SRIOV */ 6290 if (amdgpu_sriov_vf(adev)) { 6291 6292 /* Bail out of reset early */ 6293 if (amdgpu_ras_is_rma(adev)) 6294 return -ENODEV; 6295 6296 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { 6297 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); 6298 amdgpu_ras_set_fed(adev, true); 6299 set_bit(AMDGPU_HOST_FLR, &reset_context->flags); 6300 } 6301 6302 r = amdgpu_device_reset_sriov(adev, reset_context); 6303 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) { 6304 amdgpu_virt_release_full_gpu(adev, true); 6305 goto retry; 6306 } 6307 if (r) 6308 adev->asic_reset_res = r; 6309 } else { 6310 r = amdgpu_do_asic_reset(device_list, reset_context); 6311 if (r && r == -EAGAIN) 6312 goto retry; 6313 } 6314 6315 list_for_each_entry(tmp_adev, device_list, reset_list) { 6316 /* 6317 * Drop any pending non scheduler resets queued before reset is done. 6318 * Any reset scheduled after this point would be valid. Scheduler resets 6319 * were already dropped during drm_sched_stop and no new ones can come 6320 * in before drm_sched_start. 6321 */ 6322 amdgpu_device_stop_pending_resets(tmp_adev); 6323 } 6324 6325 return r; 6326 } 6327 6328 static int amdgpu_device_sched_resume(struct list_head *device_list, 6329 struct amdgpu_reset_context *reset_context, 6330 bool job_signaled) 6331 { 6332 struct amdgpu_device *tmp_adev = NULL; 6333 int i, r = 0; 6334 6335 /* Post ASIC reset for all devs .*/ 6336 list_for_each_entry(tmp_adev, device_list, reset_list) { 6337 6338 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6339 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6340 6341 if (!amdgpu_ring_sched_ready(ring)) 6342 continue; 6343 6344 drm_sched_start(&ring->sched, 0); 6345 } 6346 6347 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 6348 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 6349 6350 if (tmp_adev->asic_reset_res) 6351 r = tmp_adev->asic_reset_res; 6352 6353 tmp_adev->asic_reset_res = 0; 6354 6355 if (r) { 6356 /* bad news, how to tell it to userspace ? 
6357 * for ras error, we should report GPU bad status instead of 6358 * reset failure 6359 */ 6360 if (reset_context->src != AMDGPU_RESET_SRC_RAS || 6361 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) 6362 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", 6363 atomic_read(&tmp_adev->gpu_reset_counter)); 6364 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 6365 } else { 6366 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 6367 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 6368 dev_warn(tmp_adev->dev, 6369 "smart shift update failed\n"); 6370 } 6371 } 6372 6373 return r; 6374 } 6375 6376 static void amdgpu_device_gpu_resume(struct amdgpu_device *adev, 6377 struct list_head *device_list, 6378 bool need_emergency_restart) 6379 { 6380 struct amdgpu_device *tmp_adev = NULL; 6381 6382 list_for_each_entry(tmp_adev, device_list, reset_list) { 6383 /* unlock kfd: SRIOV would do it separately */ 6384 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 6385 amdgpu_amdkfd_post_reset(tmp_adev); 6386 6387 /* kfd_post_reset will do nothing if kfd device is not initialized, 6388 * need to bring up kfd here if it's not be initialized before 6389 */ 6390 if (!adev->kfd.init_complete) 6391 amdgpu_amdkfd_device_init(adev); 6392 6393 if (tmp_adev->pcie_reset_ctx.audio_suspended) 6394 amdgpu_device_resume_display_audio(tmp_adev); 6395 6396 amdgpu_device_unset_mp1_state(tmp_adev); 6397 6398 amdgpu_ras_set_error_query_ready(tmp_adev, true); 6399 6400 } 6401 } 6402 6403 6404 /** 6405 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 6406 * 6407 * @adev: amdgpu_device pointer 6408 * @job: which job trigger hang 6409 * @reset_context: amdgpu reset context pointer 6410 * 6411 * Attempt to reset the GPU if it has hung (all asics). 6412 * Attempt to do soft-reset or full-reset and reinitialize Asic 6413 * Returns 0 for success or an error on failure. 6414 */ 6415 6416 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 6417 struct amdgpu_job *job, 6418 struct amdgpu_reset_context *reset_context) 6419 { 6420 struct list_head device_list; 6421 bool job_signaled = false; 6422 struct amdgpu_hive_info *hive = NULL; 6423 int r = 0; 6424 bool need_emergency_restart = false; 6425 6426 /* 6427 * If it reaches here because of hang/timeout and a RAS error is 6428 * detected at the same time, let RAS recovery take care of it. 6429 */ 6430 if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) && 6431 !amdgpu_sriov_vf(adev) && 6432 reset_context->src != AMDGPU_RESET_SRC_RAS) { 6433 dev_dbg(adev->dev, 6434 "Gpu recovery from source: %d yielding to RAS error recovery handling", 6435 reset_context->src); 6436 return 0; 6437 } 6438 6439 /* 6440 * Special case: RAS triggered and full reset isn't supported 6441 */ 6442 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 6443 6444 /* 6445 * Flush RAM to disk so that after reboot 6446 * the user can read log and see why the system rebooted. 6447 */ 6448 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 6449 amdgpu_ras_get_context(adev)->reboot) { 6450 dev_warn(adev->dev, "Emergency reboot."); 6451 6452 ksys_sync_helper(); 6453 emergency_restart(); 6454 } 6455 6456 dev_info(adev->dev, "GPU %s begin!\n", 6457 need_emergency_restart ? 
"jobs stop":"reset"); 6458 6459 if (!amdgpu_sriov_vf(adev)) 6460 hive = amdgpu_get_xgmi_hive(adev); 6461 if (hive) 6462 mutex_lock(&hive->hive_lock); 6463 6464 reset_context->job = job; 6465 reset_context->hive = hive; 6466 INIT_LIST_HEAD(&device_list); 6467 6468 if (amdgpu_device_recovery_prepare(adev, &device_list, hive)) 6469 goto end_reset; 6470 6471 /* We need to lock reset domain only once both for XGMI and single device */ 6472 amdgpu_device_recovery_get_reset_lock(adev, &device_list); 6473 6474 r = amdgpu_device_halt_activities(adev, job, reset_context, &device_list, 6475 hive, need_emergency_restart); 6476 if (r) 6477 goto reset_unlock; 6478 6479 if (need_emergency_restart) 6480 goto skip_sched_resume; 6481 /* 6482 * Must check guilty signal here since after this point all old 6483 * HW fences are force signaled. 6484 * 6485 * job->base holds a reference to parent fence 6486 */ 6487 if (job && dma_fence_is_signaled(&job->hw_fence.base)) { 6488 job_signaled = true; 6489 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 6490 goto skip_hw_reset; 6491 } 6492 6493 r = amdgpu_device_asic_reset(adev, &device_list, reset_context); 6494 if (r) 6495 goto reset_unlock; 6496 skip_hw_reset: 6497 r = amdgpu_device_sched_resume(&device_list, reset_context, job_signaled); 6498 if (r) 6499 goto reset_unlock; 6500 skip_sched_resume: 6501 amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart); 6502 reset_unlock: 6503 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 6504 end_reset: 6505 if (hive) { 6506 mutex_unlock(&hive->hive_lock); 6507 amdgpu_put_xgmi_hive(hive); 6508 } 6509 6510 if (r) 6511 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 6512 6513 atomic_set(&adev->reset_domain->reset_res, r); 6514 6515 if (!r) 6516 drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE); 6517 6518 return r; 6519 } 6520 6521 /** 6522 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner 6523 * 6524 * @adev: amdgpu_device pointer 6525 * @speed: pointer to the speed of the link 6526 * @width: pointer to the width of the link 6527 * 6528 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6529 * first physical partner to an AMD dGPU. 6530 * This will exclude any virtual switches and links. 6531 */ 6532 static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev, 6533 enum pci_bus_speed *speed, 6534 enum pcie_link_width *width) 6535 { 6536 struct pci_dev *parent = adev->pdev; 6537 6538 if (!speed || !width) 6539 return; 6540 6541 *speed = PCI_SPEED_UNKNOWN; 6542 *width = PCIE_LNK_WIDTH_UNKNOWN; 6543 6544 if (amdgpu_device_pcie_dynamic_switching_supported(adev)) { 6545 while ((parent = pci_upstream_bridge(parent))) { 6546 /* skip upstream/downstream switches internal to dGPU*/ 6547 if (parent->vendor == PCI_VENDOR_ID_ATI) 6548 continue; 6549 *speed = pcie_get_speed_cap(parent); 6550 *width = pcie_get_width_cap(parent); 6551 break; 6552 } 6553 } else { 6554 /* use the current speeds rather than max if switching is not supported */ 6555 pcie_bandwidth_available(adev->pdev, NULL, speed, width); 6556 } 6557 } 6558 6559 /** 6560 * amdgpu_device_gpu_bandwidth - find the bandwidth of the GPU 6561 * 6562 * @adev: amdgpu_device pointer 6563 * @speed: pointer to the speed of the link 6564 * @width: pointer to the width of the link 6565 * 6566 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6567 * AMD dGPU which may be a virtual upstream bridge. 
6568 */ 6569 static void amdgpu_device_gpu_bandwidth(struct amdgpu_device *adev, 6570 enum pci_bus_speed *speed, 6571 enum pcie_link_width *width) 6572 { 6573 struct pci_dev *parent = adev->pdev; 6574 6575 if (!speed || !width) 6576 return; 6577 6578 parent = pci_upstream_bridge(parent); 6579 if (parent && parent->vendor == PCI_VENDOR_ID_ATI) { 6580 /* use the upstream/downstream switches internal to dGPU */ 6581 *speed = pcie_get_speed_cap(parent); 6582 *width = pcie_get_width_cap(parent); 6583 while ((parent = pci_upstream_bridge(parent))) { 6584 if (parent->vendor == PCI_VENDOR_ID_ATI) { 6585 /* use the upstream/downstream switches internal to dGPU */ 6586 *speed = pcie_get_speed_cap(parent); 6587 *width = pcie_get_width_cap(parent); 6588 } 6589 } 6590 } else { 6591 /* use the device itself */ 6592 *speed = pcie_get_speed_cap(adev->pdev); 6593 *width = pcie_get_width_cap(adev->pdev); 6594 } 6595 } 6596 6597 /** 6598 * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot 6599 * 6600 * @adev: amdgpu_device pointer 6601 * 6602 * Fetches and stores in the driver the PCIE capabilities (gen speed 6603 * and lanes) of the slot the device is in. Handles APUs and 6604 * virtualized environments where PCIE config space may not be available. 6605 */ 6606 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 6607 { 6608 enum pci_bus_speed speed_cap, platform_speed_cap; 6609 enum pcie_link_width platform_link_width, link_width; 6610 6611 if (amdgpu_pcie_gen_cap) 6612 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 6613 6614 if (amdgpu_pcie_lane_cap) 6615 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 6616 6617 /* covers APUs as well */ 6618 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 6619 if (adev->pm.pcie_gen_mask == 0) 6620 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 6621 if (adev->pm.pcie_mlw_mask == 0) 6622 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 6623 return; 6624 } 6625 6626 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 6627 return; 6628 6629 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap, 6630 &platform_link_width); 6631 amdgpu_device_gpu_bandwidth(adev, &speed_cap, &link_width); 6632 6633 if (adev->pm.pcie_gen_mask == 0) { 6634 /* asic caps */ 6635 if (speed_cap == PCI_SPEED_UNKNOWN) { 6636 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6637 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6638 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6639 } else { 6640 if (speed_cap == PCIE_SPEED_32_0GT) 6641 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6642 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6643 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6644 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6645 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 6646 else if (speed_cap == PCIE_SPEED_16_0GT) 6647 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6648 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6649 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6650 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 6651 else if (speed_cap == PCIE_SPEED_8_0GT) 6652 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6653 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6654 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6655 else if (speed_cap == PCIE_SPEED_5_0GT) 6656 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6657 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 6658 else 6659 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 6660 } 6661 /* platform caps */ 6662 if 
(platform_speed_cap == PCI_SPEED_UNKNOWN) { 6663 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6664 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6665 } else { 6666 if (platform_speed_cap == PCIE_SPEED_32_0GT) 6667 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6668 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6669 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6670 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6671 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 6672 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 6673 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6674 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6675 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6676 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 6677 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 6678 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6679 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6680 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 6681 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 6682 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6683 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6684 else 6685 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 6686 6687 } 6688 } 6689 if (adev->pm.pcie_mlw_mask == 0) { 6690 /* asic caps */ 6691 if (link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6692 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_ASIC_PCIE_MLW_MASK; 6693 } else { 6694 switch (link_width) { 6695 case PCIE_LNK_X32: 6696 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X32 | 6697 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6698 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6699 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6700 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6701 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6702 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6703 break; 6704 case PCIE_LNK_X16: 6705 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6706 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6707 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6708 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6709 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6710 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6711 break; 6712 case PCIE_LNK_X12: 6713 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6714 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6715 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6716 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6717 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6718 break; 6719 case PCIE_LNK_X8: 6720 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6721 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6722 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6723 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6724 break; 6725 case PCIE_LNK_X4: 6726 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6727 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6728 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6729 break; 6730 case PCIE_LNK_X2: 6731 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6732 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6733 break; 6734 case PCIE_LNK_X1: 6735 adev->pm.pcie_mlw_mask |= CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1; 6736 break; 6737 default: 6738 break; 6739 } 6740 } 6741 /* platform caps */ 6742 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6743 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 6744 } else { 6745 switch (platform_link_width) { 6746 case PCIE_LNK_X32: 6747 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 6748 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6749 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6750 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6751 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6752 
CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6753 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6754 break; 6755 case PCIE_LNK_X16: 6756 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6757 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6758 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6759 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6760 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6761 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6762 break; 6763 case PCIE_LNK_X12: 6764 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6765 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6766 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6767 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6768 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6769 break; 6770 case PCIE_LNK_X8: 6771 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6772 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6773 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6774 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6775 break; 6776 case PCIE_LNK_X4: 6777 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6778 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6779 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6780 break; 6781 case PCIE_LNK_X2: 6782 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6783 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6784 break; 6785 case PCIE_LNK_X1: 6786 adev->pm.pcie_mlw_mask |= CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 6787 break; 6788 default: 6789 break; 6790 } 6791 } 6792 } 6793 } 6794 6795 /** 6796 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 6797 * 6798 * @adev: amdgpu_device pointer 6799 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 6800 * 6801 * Return true if @peer_adev can access (DMA) @adev through the PCIe 6802 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 6803 * @peer_adev. 6804 */ 6805 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 6806 struct amdgpu_device *peer_adev) 6807 { 6808 #ifdef CONFIG_HSA_AMD_P2P 6809 bool p2p_access = 6810 !adev->gmc.xgmi.connected_to_cpu && 6811 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 6812 if (!p2p_access) 6813 dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n", 6814 pci_name(peer_adev->pdev)); 6815 6816 bool is_large_bar = adev->gmc.visible_vram_size && 6817 adev->gmc.real_vram_size == adev->gmc.visible_vram_size; 6818 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev); 6819 6820 if (!p2p_addressable) { 6821 uint64_t address_mask = peer_adev->dev->dma_mask ? 
6822 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 6823 resource_size_t aper_limit = 6824 adev->gmc.aper_base + adev->gmc.aper_size - 1; 6825 6826 p2p_addressable = !(adev->gmc.aper_base & address_mask || 6827 aper_limit & address_mask); 6828 } 6829 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable; 6830 #else 6831 return false; 6832 #endif 6833 } 6834 6835 int amdgpu_device_baco_enter(struct drm_device *dev) 6836 { 6837 struct amdgpu_device *adev = drm_to_adev(dev); 6838 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6839 6840 if (!amdgpu_device_supports_baco(dev)) 6841 return -ENOTSUPP; 6842 6843 if (ras && adev->ras_enabled && 6844 adev->nbio.funcs->enable_doorbell_interrupt) 6845 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 6846 6847 return amdgpu_dpm_baco_enter(adev); 6848 } 6849 6850 int amdgpu_device_baco_exit(struct drm_device *dev) 6851 { 6852 struct amdgpu_device *adev = drm_to_adev(dev); 6853 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6854 int ret = 0; 6855 6856 if (!amdgpu_device_supports_baco(dev)) 6857 return -ENOTSUPP; 6858 6859 ret = amdgpu_dpm_baco_exit(adev); 6860 if (ret) 6861 return ret; 6862 6863 if (ras && adev->ras_enabled && 6864 adev->nbio.funcs->enable_doorbell_interrupt) 6865 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 6866 6867 if (amdgpu_passthrough(adev) && adev->nbio.funcs && 6868 adev->nbio.funcs->clear_doorbell_interrupt) 6869 adev->nbio.funcs->clear_doorbell_interrupt(adev); 6870 6871 return 0; 6872 } 6873 6874 /** 6875 * amdgpu_pci_error_detected - Called when a PCI error is detected. 6876 * @pdev: PCI device struct 6877 * @state: PCI channel state 6878 * 6879 * Description: Called when a PCI error is detected. 6880 * 6881 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
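 * PCI_ERS_RESULT_CAN_RECOVER is returned for the pci_channel_io_normal state,
 * where register access to the device still works.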
6882 */ 6883 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 6884 { 6885 struct drm_device *dev = pci_get_drvdata(pdev); 6886 struct amdgpu_device *adev = drm_to_adev(dev); 6887 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 6888 struct amdgpu_reset_context reset_context; 6889 struct list_head device_list; 6890 int r = 0; 6891 6892 dev_info(adev->dev, "PCI error: detected callback!!\n"); 6893 6894 if (!amdgpu_dpm_is_link_reset_supported(adev)) { 6895 dev_warn(adev->dev, "No support for XGMI hive yet...\n"); 6896 return PCI_ERS_RESULT_DISCONNECT; 6897 } 6898 6899 adev->pci_channel_state = state; 6900 6901 switch (state) { 6902 case pci_channel_io_normal: 6903 dev_info(adev->dev, "pci_channel_io_normal: state(%d)!!\n", state); 6904 return PCI_ERS_RESULT_CAN_RECOVER; 6905 case pci_channel_io_frozen: 6906 /* Fatal error, prepare for slot reset */ 6907 dev_info(adev->dev, "pci_channel_io_frozen: state(%d)!!\n", state); 6908 6909 if (hive) 6910 mutex_lock(&hive->hive_lock); 6911 adev->pcie_reset_ctx.occurs_dpc = true; 6912 memset(&reset_context, 0, sizeof(reset_context)); 6913 INIT_LIST_HEAD(&device_list); 6914 6915 amdgpu_device_recovery_prepare(adev, &device_list, hive); 6916 amdgpu_device_recovery_get_reset_lock(adev, &device_list); 6917 r = amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list, 6918 hive, false); 6919 if (hive) { 6920 mutex_unlock(&hive->hive_lock); 6921 amdgpu_put_xgmi_hive(hive); 6922 } 6923 if (r) 6924 return PCI_ERS_RESULT_DISCONNECT; 6925 return PCI_ERS_RESULT_NEED_RESET; 6926 case pci_channel_io_perm_failure: 6927 /* Permanent error, prepare for device removal */ 6928 dev_info(adev->dev, "pci_channel_io_perm_failure: state(%d)!!\n", state); 6929 return PCI_ERS_RESULT_DISCONNECT; 6930 } 6931 6932 return PCI_ERS_RESULT_NEED_RESET; 6933 } 6934 6935 /** 6936 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 6937 * @pdev: pointer to PCI device 6938 */ 6939 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 6940 { 6941 struct drm_device *dev = pci_get_drvdata(pdev); 6942 struct amdgpu_device *adev = drm_to_adev(dev); 6943 6944 dev_info(adev->dev, "PCI error: mmio enabled callback!!\n"); 6945 6946 /* TODO - dump whatever for debugging purposes */ 6947 6948 /* This called only if amdgpu_pci_error_detected returns 6949 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 6950 * works, no need to reset slot. 6951 */ 6952 6953 return PCI_ERS_RESULT_RECOVERED; 6954 } 6955 6956 /** 6957 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 6958 * @pdev: PCI device struct 6959 * 6960 * Description: This routine is called by the pci error recovery 6961 * code after the PCI slot has been reset, just before we 6962 * should resume normal operations. 
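 * Restores the saved PCI config space, waits for the ASIC to become
 * accessible again and then performs a full ASIC reset on the device
 * (or on every node of its XGMI hive).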
6963 */ 6964 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 6965 { 6966 struct drm_device *dev = pci_get_drvdata(pdev); 6967 struct amdgpu_device *adev = drm_to_adev(dev); 6968 struct amdgpu_reset_context reset_context; 6969 struct amdgpu_device *tmp_adev; 6970 struct amdgpu_hive_info *hive; 6971 struct list_head device_list; 6972 int r = 0, i; 6973 u32 memsize; 6974 6975 /* PCI error slot reset should be skipped During RAS recovery */ 6976 if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 6977 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && 6978 amdgpu_ras_in_recovery(adev)) 6979 return PCI_ERS_RESULT_RECOVERED; 6980 6981 dev_info(adev->dev, "PCI error: slot reset callback!!\n"); 6982 6983 memset(&reset_context, 0, sizeof(reset_context)); 6984 6985 /* wait for asic to come out of reset */ 6986 msleep(700); 6987 6988 /* Restore PCI confspace */ 6989 amdgpu_device_load_pci_state(pdev); 6990 6991 /* confirm ASIC came out of reset */ 6992 for (i = 0; i < adev->usec_timeout; i++) { 6993 memsize = amdgpu_asic_get_config_memsize(adev); 6994 6995 if (memsize != 0xffffffff) 6996 break; 6997 udelay(1); 6998 } 6999 if (memsize == 0xffffffff) { 7000 r = -ETIME; 7001 goto out; 7002 } 7003 7004 reset_context.method = AMD_RESET_METHOD_NONE; 7005 reset_context.reset_req_dev = adev; 7006 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 7007 set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags); 7008 INIT_LIST_HEAD(&device_list); 7009 7010 hive = amdgpu_get_xgmi_hive(adev); 7011 if (hive) { 7012 mutex_lock(&hive->hive_lock); 7013 reset_context.hive = hive; 7014 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 7015 tmp_adev->pcie_reset_ctx.in_link_reset = true; 7016 list_add_tail(&tmp_adev->reset_list, &device_list); 7017 } 7018 } else { 7019 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 7020 list_add_tail(&adev->reset_list, &device_list); 7021 } 7022 7023 r = amdgpu_device_asic_reset(adev, &device_list, &reset_context); 7024 out: 7025 if (!r) { 7026 if (amdgpu_device_cache_pci_state(adev->pdev)) 7027 pci_restore_state(adev->pdev); 7028 dev_info(adev->dev, "PCIe error recovery succeeded\n"); 7029 } else { 7030 dev_err(adev->dev, "PCIe error recovery failed, err:%d\n", r); 7031 if (hive) { 7032 list_for_each_entry(tmp_adev, &device_list, reset_list) 7033 amdgpu_device_unset_mp1_state(tmp_adev); 7034 } 7035 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 7036 } 7037 7038 if (hive) { 7039 mutex_unlock(&hive->hive_lock); 7040 amdgpu_put_xgmi_hive(hive); 7041 } 7042 7043 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 7044 } 7045 7046 /** 7047 * amdgpu_pci_resume() - resume normal ops after PCI reset 7048 * @pdev: pointer to PCI device 7049 * 7050 * Called when the error recovery driver tells us that its 7051 * OK to resume normal operation. 
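 * Restarts the schedulers, resumes KFD and display audio, and drops the reset
 * lock taken in amdgpu_pci_error_detected(). Only acts when the recorded
 * channel state is pci_channel_io_frozen.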
7052 */ 7053 void amdgpu_pci_resume(struct pci_dev *pdev) 7054 { 7055 struct drm_device *dev = pci_get_drvdata(pdev); 7056 struct amdgpu_device *adev = drm_to_adev(dev); 7057 struct list_head device_list; 7058 struct amdgpu_hive_info *hive = NULL; 7059 struct amdgpu_device *tmp_adev = NULL; 7060 7061 dev_info(adev->dev, "PCI error: resume callback!!\n"); 7062 7063 /* Only continue execution for the case of pci_channel_io_frozen */ 7064 if (adev->pci_channel_state != pci_channel_io_frozen) 7065 return; 7066 7067 INIT_LIST_HEAD(&device_list); 7068 7069 hive = amdgpu_get_xgmi_hive(adev); 7070 if (hive) { 7071 mutex_lock(&hive->hive_lock); 7072 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 7073 tmp_adev->pcie_reset_ctx.in_link_reset = false; 7074 list_add_tail(&tmp_adev->reset_list, &device_list); 7075 } 7076 } else 7077 list_add_tail(&adev->reset_list, &device_list); 7078 7079 amdgpu_device_sched_resume(&device_list, NULL, NULL); 7080 amdgpu_device_gpu_resume(adev, &device_list, false); 7081 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 7082 adev->pcie_reset_ctx.occurs_dpc = false; 7083 7084 if (hive) { 7085 mutex_unlock(&hive->hive_lock); 7086 amdgpu_put_xgmi_hive(hive); 7087 } 7088 } 7089 7090 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 7091 { 7092 struct drm_device *dev = pci_get_drvdata(pdev); 7093 struct amdgpu_device *adev = drm_to_adev(dev); 7094 int r; 7095 7096 if (amdgpu_sriov_vf(adev)) 7097 return false; 7098 7099 r = pci_save_state(pdev); 7100 if (!r) { 7101 kfree(adev->pci_state); 7102 7103 adev->pci_state = pci_store_saved_state(pdev); 7104 7105 if (!adev->pci_state) { 7106 dev_err(adev->dev, "Failed to store PCI saved state"); 7107 return false; 7108 } 7109 } else { 7110 dev_warn(adev->dev, "Failed to save PCI state, err:%d\n", r); 7111 return false; 7112 } 7113 7114 return true; 7115 } 7116 7117 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 7118 { 7119 struct drm_device *dev = pci_get_drvdata(pdev); 7120 struct amdgpu_device *adev = drm_to_adev(dev); 7121 int r; 7122 7123 if (!adev->pci_state) 7124 return false; 7125 7126 r = pci_load_saved_state(pdev, adev->pci_state); 7127 7128 if (!r) { 7129 pci_restore_state(pdev); 7130 } else { 7131 dev_warn(adev->dev, "Failed to load PCI state, err:%d\n", r); 7132 return false; 7133 } 7134 7135 return true; 7136 } 7137 7138 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 7139 struct amdgpu_ring *ring) 7140 { 7141 #ifdef CONFIG_X86_64 7142 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 7143 return; 7144 #endif 7145 if (adev->gmc.xgmi.connected_to_cpu) 7146 return; 7147 7148 if (ring && ring->funcs->emit_hdp_flush) 7149 amdgpu_ring_emit_hdp_flush(ring); 7150 else 7151 amdgpu_asic_flush_hdp(adev, ring); 7152 } 7153 7154 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 7155 struct amdgpu_ring *ring) 7156 { 7157 #ifdef CONFIG_X86_64 7158 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 7159 return; 7160 #endif 7161 if (adev->gmc.xgmi.connected_to_cpu) 7162 return; 7163 7164 amdgpu_asic_invalidate_hdp(adev, ring); 7165 } 7166 7167 int amdgpu_in_reset(struct amdgpu_device *adev) 7168 { 7169 return atomic_read(&adev->reset_domain->in_gpu_reset); 7170 } 7171 7172 /** 7173 * amdgpu_device_halt() - bring hardware to some kind of halt state 7174 * 7175 * @adev: amdgpu_device pointer 7176 * 7177 * Bring hardware to some kind of halt state so that no one can touch it 7178 * any more. It will help to maintain error context when error occurred. 
7179 * Compared to a simple hang, the system will stay stable at least for SSH 7180 * access. Then it should be trivial to inspect the hardware state and 7181 * see what's going on. Implemented as follows: 7182 * 7183 * 1. drm_dev_unplug() makes device inaccessible to user space (IOCTLs, etc.), 7184 * clears all CPU mappings to device, disallows remappings through page faults 7185 * 2. amdgpu_irq_disable_all() disables all interrupts 7186 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences 7187 * 4. set adev->no_hw_access to avoid potential crashes after step 5 7188 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings 7189 * 6. pci_disable_device() and pci_wait_for_pending_transaction() 7190 * flush any in-flight DMA operations 7191 */ 7192 void amdgpu_device_halt(struct amdgpu_device *adev) 7193 { 7194 struct pci_dev *pdev = adev->pdev; 7195 struct drm_device *ddev = adev_to_drm(adev); 7196 7197 amdgpu_xcp_dev_unplug(adev); 7198 drm_dev_unplug(ddev); 7199 7200 amdgpu_irq_disable_all(adev); 7201 7202 amdgpu_fence_driver_hw_fini(adev); 7203 7204 adev->no_hw_access = true; 7205 7206 amdgpu_device_unmap_mmio(adev); 7207 7208 pci_disable_device(pdev); 7209 pci_wait_for_pending_transaction(pdev); 7210 } 7211 7212 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev, 7213 u32 reg) 7214 { 7215 unsigned long flags, address, data; 7216 u32 r; 7217 7218 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 7219 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 7220 7221 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 7222 WREG32(address, reg * 4); 7223 (void)RREG32(address); 7224 r = RREG32(data); 7225 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 7226 return r; 7227 } 7228 7229 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev, 7230 u32 reg, u32 v) 7231 { 7232 unsigned long flags, address, data; 7233 7234 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 7235 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 7236 7237 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 7238 WREG32(address, reg * 4); 7239 (void)RREG32(address); 7240 WREG32(data, v); 7241 (void)RREG32(data); 7242 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 7243 } 7244 7245 /** 7246 * amdgpu_device_get_gang - return a reference to the current gang 7247 * @adev: amdgpu_device pointer 7248 * 7249 * Returns: A new reference to the current gang leader. 7250 */ 7251 struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev) 7252 { 7253 struct dma_fence *fence; 7254 7255 rcu_read_lock(); 7256 fence = dma_fence_get_rcu_safe(&adev->gang_submit); 7257 rcu_read_unlock(); 7258 return fence; 7259 } 7260 7261 /** 7262 * amdgpu_device_switch_gang - switch to a new gang 7263 * @adev: amdgpu_device pointer 7264 * @gang: the gang to switch to 7265 * 7266 * Try to switch to a new gang. 7267 * Returns: NULL if we switched to the new gang or a reference to the current 7268 * gang leader.
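 *
 * Hypothetical usage sketch (illustrative only, not taken from an in-tree
 * caller): when a reference to the still running gang leader is returned,
 * the submitter can wait for it (or track it as a dependency) and retry:
 *
 *	old = amdgpu_device_switch_gang(adev, gang);
 *	if (old) {
 *		dma_fence_wait(old, false);
 *		dma_fence_put(old);
 *	}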
7269 */ 7270 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev, 7271 struct dma_fence *gang) 7272 { 7273 struct dma_fence *old = NULL; 7274 7275 dma_fence_get(gang); 7276 do { 7277 dma_fence_put(old); 7278 old = amdgpu_device_get_gang(adev); 7279 if (old == gang) 7280 break; 7281 7282 if (!dma_fence_is_signaled(old)) { 7283 dma_fence_put(gang); 7284 return old; 7285 } 7286 7287 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit, 7288 old, gang) != old); 7289 7290 /* 7291 * Drop it once for the exchanged reference in adev and once for the 7292 * thread local reference acquired in amdgpu_device_get_gang(). 7293 */ 7294 dma_fence_put(old); 7295 dma_fence_put(old); 7296 return NULL; 7297 } 7298 7299 /** 7300 * amdgpu_device_enforce_isolation - enforce HW isolation 7301 * @adev: the amdgpu device pointer 7302 * @ring: the HW ring the job is supposed to run on 7303 * @job: the job which is about to be pushed to the HW ring 7304 * 7305 * Makes sure that only one client at a time can use the GFX block. 7306 * Returns: The dependency to wait on before the job can be pushed to the HW. 7307 * The function is called multiple times until NULL is returned. 7308 */ 7309 struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev, 7310 struct amdgpu_ring *ring, 7311 struct amdgpu_job *job) 7312 { 7313 struct amdgpu_isolation *isolation = &adev->isolation[ring->xcp_id]; 7314 struct drm_sched_fence *f = job->base.s_fence; 7315 struct dma_fence *dep; 7316 void *owner; 7317 int r; 7318 7319 /* 7320 * For now enforce isolation only for the GFX block since we only need 7321 * the cleaner shader on those rings. 7322 */ 7323 if (ring->funcs->type != AMDGPU_RING_TYPE_GFX && 7324 ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE) 7325 return NULL; 7326 7327 /* 7328 * All submissions where enforce isolation is false are handled as if 7329 * they come from a single client. Use ~0l as the owner to distinct it 7330 * from kernel submissions where the owner is NULL. 7331 */ 7332 owner = job->enforce_isolation ? f->owner : (void *)~0l; 7333 7334 mutex_lock(&adev->enforce_isolation_mutex); 7335 7336 /* 7337 * The "spearhead" submission is the first one which changes the 7338 * ownership to its client. We always need to wait for it to be 7339 * pushed to the HW before proceeding with anything. 7340 */ 7341 if (&f->scheduled != isolation->spearhead && 7342 !dma_fence_is_signaled(isolation->spearhead)) { 7343 dep = isolation->spearhead; 7344 goto out_grab_ref; 7345 } 7346 7347 if (isolation->owner != owner) { 7348 7349 /* 7350 * Wait for any gang to be assembled before switching to a 7351 * different owner or otherwise we could deadlock the 7352 * submissions. 7353 */ 7354 if (!job->gang_submit) { 7355 dep = amdgpu_device_get_gang(adev); 7356 if (!dma_fence_is_signaled(dep)) 7357 goto out_return_dep; 7358 dma_fence_put(dep); 7359 } 7360 7361 dma_fence_put(isolation->spearhead); 7362 isolation->spearhead = dma_fence_get(&f->scheduled); 7363 amdgpu_sync_move(&isolation->active, &isolation->prev); 7364 trace_amdgpu_isolation(isolation->owner, owner); 7365 isolation->owner = owner; 7366 } 7367 7368 /* 7369 * Specifying the ring here helps to pipeline submissions even when 7370 * isolation is enabled. If that is not desired for testing NULL can be 7371 * used instead of the ring to enforce a CPU round trip while switching 7372 * between clients. 
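 *
 * The fence picked from isolation->prev below is what gets returned to the
 * scheduler as the dependency, while the job's finished fence is recorded in
 * isolation->active so that the next ownership switch can wait for it.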
7373 */ 7374 dep = amdgpu_sync_peek_fence(&isolation->prev, ring); 7375 r = amdgpu_sync_fence(&isolation->active, &f->finished, GFP_NOWAIT); 7376 if (r) 7377 dev_warn(adev->dev, "OOM tracking isolation\n"); 7378 7379 out_grab_ref: 7380 dma_fence_get(dep); 7381 out_return_dep: 7382 mutex_unlock(&adev->enforce_isolation_mutex); 7383 return dep; 7384 } 7385 7386 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev) 7387 { 7388 switch (adev->asic_type) { 7389 #ifdef CONFIG_DRM_AMDGPU_SI 7390 case CHIP_HAINAN: 7391 #endif 7392 case CHIP_TOPAZ: 7393 /* chips with no display hardware */ 7394 return false; 7395 #ifdef CONFIG_DRM_AMDGPU_SI 7396 case CHIP_TAHITI: 7397 case CHIP_PITCAIRN: 7398 case CHIP_VERDE: 7399 case CHIP_OLAND: 7400 #endif 7401 #ifdef CONFIG_DRM_AMDGPU_CIK 7402 case CHIP_BONAIRE: 7403 case CHIP_HAWAII: 7404 case CHIP_KAVERI: 7405 case CHIP_KABINI: 7406 case CHIP_MULLINS: 7407 #endif 7408 case CHIP_TONGA: 7409 case CHIP_FIJI: 7410 case CHIP_POLARIS10: 7411 case CHIP_POLARIS11: 7412 case CHIP_POLARIS12: 7413 case CHIP_VEGAM: 7414 case CHIP_CARRIZO: 7415 case CHIP_STONEY: 7416 /* chips with display hardware */ 7417 return true; 7418 default: 7419 /* IP discovery */ 7420 if (!amdgpu_ip_version(adev, DCE_HWIP, 0) || 7421 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 7422 return false; 7423 return true; 7424 } 7425 } 7426 7427 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev, 7428 uint32_t inst, uint32_t reg_addr, char reg_name[], 7429 uint32_t expected_value, uint32_t mask) 7430 { 7431 uint32_t ret = 0; 7432 uint32_t old_ = 0; 7433 uint32_t tmp_ = RREG32(reg_addr); 7434 uint32_t loop = adev->usec_timeout; 7435 7436 while ((tmp_ & (mask)) != (expected_value)) { 7437 if (old_ != tmp_) { 7438 loop = adev->usec_timeout; 7439 old_ = tmp_; 7440 } else 7441 udelay(1); 7442 tmp_ = RREG32(reg_addr); 7443 loop--; 7444 if (!loop) { 7445 dev_warn( 7446 adev->dev, 7447 "Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n", 7448 inst, reg_name, (uint32_t)expected_value, 7449 (uint32_t)(tmp_ & (mask))); 7450 ret = -ETIMEDOUT; 7451 break; 7452 } 7453 } 7454 return ret; 7455 } 7456 7457 ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring) 7458 { 7459 ssize_t size = 0; 7460 7461 if (!ring || !ring->adev) 7462 return size; 7463 7464 if (amdgpu_device_should_recover_gpu(ring->adev)) 7465 size |= AMDGPU_RESET_TYPE_FULL; 7466 7467 if (unlikely(!ring->adev->debug_disable_soft_recovery) && 7468 !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery) 7469 size |= AMDGPU_RESET_TYPE_SOFT_RESET; 7470 7471 return size; 7472 } 7473 7474 ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset) 7475 { 7476 ssize_t size = 0; 7477 7478 if (supported_reset == 0) { 7479 size += sysfs_emit_at(buf, size, "unsupported"); 7480 size += sysfs_emit_at(buf, size, "\n"); 7481 return size; 7482 7483 } 7484 7485 if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET) 7486 size += sysfs_emit_at(buf, size, "soft "); 7487 7488 if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE) 7489 size += sysfs_emit_at(buf, size, "queue "); 7490 7491 if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE) 7492 size += sysfs_emit_at(buf, size, "pipe "); 7493 7494 if (supported_reset & AMDGPU_RESET_TYPE_FULL) 7495 size += sysfs_emit_at(buf, size, "full "); 7496 7497 size += sysfs_emit_at(buf, size, "\n"); 7498 return size; 7499 } 7500