1 /* 2 * Copyright 2008 Advanced Micro Devices, Inc. 3 * Copyright 2008 Red Hat Inc. 4 * Copyright 2009 Jerome Glisse. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 * OTHER DEALINGS IN THE SOFTWARE. 
23 * 24 * Authors: Dave Airlie 25 * Alex Deucher 26 * Jerome Glisse 27 */ 28 29 #include <linux/aperture.h> 30 #include <linux/power_supply.h> 31 #include <linux/kthread.h> 32 #include <linux/module.h> 33 #include <linux/console.h> 34 #include <linux/slab.h> 35 #include <linux/iommu.h> 36 #include <linux/pci.h> 37 #include <linux/pci-p2pdma.h> 38 #include <linux/apple-gmux.h> 39 40 #include <drm/drm_atomic_helper.h> 41 #include <drm/drm_client_event.h> 42 #include <drm/drm_crtc_helper.h> 43 #include <drm/drm_probe_helper.h> 44 #include <drm/amdgpu_drm.h> 45 #include <linux/device.h> 46 #include <linux/vgaarb.h> 47 #include <linux/vga_switcheroo.h> 48 #include <linux/efi.h> 49 #include "amdgpu.h" 50 #include "amdgpu_trace.h" 51 #include "amdgpu_i2c.h" 52 #include "atom.h" 53 #include "amdgpu_atombios.h" 54 #include "amdgpu_atomfirmware.h" 55 #include "amd_pcie.h" 56 #ifdef CONFIG_DRM_AMDGPU_SI 57 #include "si.h" 58 #endif 59 #ifdef CONFIG_DRM_AMDGPU_CIK 60 #include "cik.h" 61 #endif 62 #include "vi.h" 63 #include "soc15.h" 64 #include "nv.h" 65 #include "bif/bif_4_1_d.h" 66 #include <linux/firmware.h> 67 #include "amdgpu_vf_error.h" 68 69 #include "amdgpu_amdkfd.h" 70 #include "amdgpu_pm.h" 71 72 #include "amdgpu_xgmi.h" 73 #include "amdgpu_ras.h" 74 #include "amdgpu_ras_mgr.h" 75 #include "amdgpu_pmu.h" 76 #include "amdgpu_fru_eeprom.h" 77 #include "amdgpu_reset.h" 78 #include "amdgpu_virt.h" 79 #include "amdgpu_dev_coredump.h" 80 81 #include <linux/suspend.h> 82 #include <drm/task_barrier.h> 83 #include <linux/pm_runtime.h> 84 85 #include <drm/drm_drv.h> 86 87 #if IS_ENABLED(CONFIG_X86) 88 #include <asm/intel-family.h> 89 #include <asm/cpu_device_id.h> 90 #endif 91 92 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); 93 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); 94 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); 95 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); 96 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); 97 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); 98 
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); 99 MODULE_FIRMWARE("amdgpu/cyan_skillfish_gpu_info.bin"); 100 101 #define AMDGPU_RESUME_MS 2000 102 #define AMDGPU_MAX_RETRY_LIMIT 2 103 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) 104 #define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2) 105 #define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2) 106 #define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2) 107 108 #define AMDGPU_VBIOS_SKIP (1U << 0) 109 #define AMDGPU_VBIOS_OPTIONAL (1U << 1) 110 111 static const struct drm_driver amdgpu_kms_driver; 112 113 const char *amdgpu_asic_name[] = { 114 "TAHITI", 115 "PITCAIRN", 116 "VERDE", 117 "OLAND", 118 "HAINAN", 119 "BONAIRE", 120 "KAVERI", 121 "KABINI", 122 "HAWAII", 123 "MULLINS", 124 "TOPAZ", 125 "TONGA", 126 "FIJI", 127 "CARRIZO", 128 "STONEY", 129 "POLARIS10", 130 "POLARIS11", 131 "POLARIS12", 132 "VEGAM", 133 "VEGA10", 134 "VEGA12", 135 "VEGA20", 136 "RAVEN", 137 "ARCTURUS", 138 "RENOIR", 139 "ALDEBARAN", 140 "NAVI10", 141 "CYAN_SKILLFISH", 142 "NAVI14", 143 "NAVI12", 144 "SIENNA_CICHLID", 145 "NAVY_FLOUNDER", 146 "VANGOGH", 147 "DIMGREY_CAVEFISH", 148 "BEIGE_GOBY", 149 "YELLOW_CARP", 150 "IP DISCOVERY", 151 "LAST", 152 }; 153 154 #define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0) 155 /* 156 * Default init level where all blocks are expected to be initialized. This is 157 * the level of initialization expected by default and also after a full reset 158 * of the device. 159 */ 160 struct amdgpu_init_level amdgpu_init_default = { 161 .level = AMDGPU_INIT_LEVEL_DEFAULT, 162 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL, 163 }; 164 165 struct amdgpu_init_level amdgpu_init_recovery = { 166 .level = AMDGPU_INIT_LEVEL_RESET_RECOVERY, 167 .hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL, 168 }; 169 170 /* 171 * Minimal blocks needed to be initialized before a XGMI hive can be reset. 
This 172 * is used for cases like reset on initialization where the entire hive needs to 173 * be reset before first use. 174 */ 175 struct amdgpu_init_level amdgpu_init_minimal_xgmi = { 176 .level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI, 177 .hwini_ip_block_mask = 178 BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) | 179 BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) | 180 BIT(AMD_IP_BLOCK_TYPE_PSP) 181 }; 182 183 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev); 184 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev); 185 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev); 186 187 static void amdgpu_device_load_switch_state(struct amdgpu_device *adev); 188 189 static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev, 190 enum amd_ip_block_type block) 191 { 192 return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0; 193 } 194 195 void amdgpu_set_init_level(struct amdgpu_device *adev, 196 enum amdgpu_init_lvl_id lvl) 197 { 198 switch (lvl) { 199 case AMDGPU_INIT_LEVEL_MINIMAL_XGMI: 200 adev->init_lvl = &amdgpu_init_minimal_xgmi; 201 break; 202 case AMDGPU_INIT_LEVEL_RESET_RECOVERY: 203 adev->init_lvl = &amdgpu_init_recovery; 204 break; 205 case AMDGPU_INIT_LEVEL_DEFAULT: 206 fallthrough; 207 default: 208 adev->init_lvl = &amdgpu_init_default; 209 break; 210 } 211 } 212 213 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev); 214 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 215 void *data); 216 217 /** 218 * DOC: pcie_replay_count 219 * 220 * The amdgpu driver provides a sysfs API for reporting the total number 221 * of PCIe replays (NAKs). 222 * The file pcie_replay_count is used for this and returns the total 223 * number of replays as a sum of the NAKs generated and NAKs received. 
224 */ 225 226 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, 227 struct device_attribute *attr, char *buf) 228 { 229 struct drm_device *ddev = dev_get_drvdata(dev); 230 struct amdgpu_device *adev = drm_to_adev(ddev); 231 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); 232 233 return sysfs_emit(buf, "%llu\n", cnt); 234 } 235 236 static DEVICE_ATTR(pcie_replay_count, 0444, 237 amdgpu_device_get_pcie_replay_count, NULL); 238 239 static int amdgpu_device_attr_sysfs_init(struct amdgpu_device *adev) 240 { 241 int ret = 0; 242 243 if (amdgpu_nbio_is_replay_cnt_supported(adev)) 244 ret = sysfs_create_file(&adev->dev->kobj, 245 &dev_attr_pcie_replay_count.attr); 246 247 return ret; 248 } 249 250 static void amdgpu_device_attr_sysfs_fini(struct amdgpu_device *adev) 251 { 252 if (amdgpu_nbio_is_replay_cnt_supported(adev)) 253 sysfs_remove_file(&adev->dev->kobj, 254 &dev_attr_pcie_replay_count.attr); 255 } 256 257 static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj, 258 const struct bin_attribute *attr, char *buf, 259 loff_t ppos, size_t count) 260 { 261 struct device *dev = kobj_to_dev(kobj); 262 struct drm_device *ddev = dev_get_drvdata(dev); 263 struct amdgpu_device *adev = drm_to_adev(ddev); 264 ssize_t bytes_read; 265 266 switch (ppos) { 267 case AMDGPU_SYS_REG_STATE_XGMI: 268 bytes_read = amdgpu_asic_get_reg_state( 269 adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count); 270 break; 271 case AMDGPU_SYS_REG_STATE_WAFL: 272 bytes_read = amdgpu_asic_get_reg_state( 273 adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count); 274 break; 275 case AMDGPU_SYS_REG_STATE_PCIE: 276 bytes_read = amdgpu_asic_get_reg_state( 277 adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count); 278 break; 279 case AMDGPU_SYS_REG_STATE_USR: 280 bytes_read = amdgpu_asic_get_reg_state( 281 adev, AMDGPU_REG_STATE_TYPE_USR, buf, count); 282 break; 283 case AMDGPU_SYS_REG_STATE_USR_1: 284 bytes_read = amdgpu_asic_get_reg_state( 285 adev, AMDGPU_REG_STATE_TYPE_USR_1, 
buf, count); 286 break; 287 default: 288 return -EINVAL; 289 } 290 291 return bytes_read; 292 } 293 294 static const BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL, 295 AMDGPU_SYS_REG_STATE_END); 296 297 int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev) 298 { 299 int ret; 300 301 if (!amdgpu_asic_get_reg_state_supported(adev)) 302 return 0; 303 304 ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 305 306 return ret; 307 } 308 309 void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev) 310 { 311 if (!amdgpu_asic_get_reg_state_supported(adev)) 312 return; 313 sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state); 314 } 315 316 /** 317 * DOC: board_info 318 * 319 * The amdgpu driver provides a sysfs API for giving board related information. 320 * It provides the form factor information in the format 321 * 322 * type : form factor 323 * 324 * Possible form factor values 325 * 326 * - "cem" - PCIE CEM card 327 * - "oam" - Open Compute Accelerator Module 328 * - "unknown" - Not known 329 * 330 */ 331 332 static ssize_t amdgpu_device_get_board_info(struct device *dev, 333 struct device_attribute *attr, 334 char *buf) 335 { 336 struct drm_device *ddev = dev_get_drvdata(dev); 337 struct amdgpu_device *adev = drm_to_adev(ddev); 338 enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM; 339 const char *pkg; 340 341 if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type) 342 pkg_type = adev->smuio.funcs->get_pkg_type(adev); 343 344 switch (pkg_type) { 345 case AMDGPU_PKG_TYPE_CEM: 346 pkg = "cem"; 347 break; 348 case AMDGPU_PKG_TYPE_OAM: 349 pkg = "oam"; 350 break; 351 default: 352 pkg = "unknown"; 353 break; 354 } 355 356 return sysfs_emit(buf, "%s : %s\n", "type", pkg); 357 } 358 359 static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL); 360 361 static struct attribute *amdgpu_board_attrs[] = { 362 &dev_attr_board_info.attr, 363 NULL, 364 }; 365 366 static umode_t amdgpu_board_attrs_is_visible(struct 
kobject *kobj, 367 struct attribute *attr, int n) 368 { 369 struct device *dev = kobj_to_dev(kobj); 370 struct drm_device *ddev = dev_get_drvdata(dev); 371 struct amdgpu_device *adev = drm_to_adev(ddev); 372 373 if (adev->flags & AMD_IS_APU) 374 return 0; 375 376 return attr->mode; 377 } 378 379 static const struct attribute_group amdgpu_board_attrs_group = { 380 .attrs = amdgpu_board_attrs, 381 .is_visible = amdgpu_board_attrs_is_visible 382 }; 383 384 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); 385 386 /** 387 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control 388 * 389 * @adev: amdgpu device pointer 390 * 391 * Returns true if the device is a dGPU with ATPX power control, 392 * otherwise return false. 393 */ 394 bool amdgpu_device_supports_px(struct amdgpu_device *adev) 395 { 396 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid()) 397 return true; 398 return false; 399 } 400 401 /** 402 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources 403 * 404 * @adev: amdgpu device pointer 405 * 406 * Returns true if the device is a dGPU with ACPI power control, 407 * otherwise return false. 408 */ 409 bool amdgpu_device_supports_boco(struct amdgpu_device *adev) 410 { 411 if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE)) 412 return false; 413 414 if (adev->has_pr3 || 415 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid())) 416 return true; 417 return false; 418 } 419 420 /** 421 * amdgpu_device_supports_baco - Does the device support BACO 422 * 423 * @adev: amdgpu device pointer 424 * 425 * Return: 426 * 1 if the device supports BACO; 427 * 3 if the device supports MACO (only works if BACO is supported) 428 * otherwise return 0. 
429 */ 430 int amdgpu_device_supports_baco(struct amdgpu_device *adev) 431 { 432 return amdgpu_asic_supports_baco(adev); 433 } 434 435 void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev) 436 { 437 int bamaco_support; 438 439 adev->pm.rpm_mode = AMDGPU_RUNPM_NONE; 440 bamaco_support = amdgpu_device_supports_baco(adev); 441 442 switch (amdgpu_runtime_pm) { 443 case 2: 444 if (bamaco_support & MACO_SUPPORT) { 445 adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO; 446 dev_info(adev->dev, "Forcing BAMACO for runtime pm\n"); 447 } else if (bamaco_support == BACO_SUPPORT) { 448 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO; 449 dev_info(adev->dev, "Requested mode BAMACO not available,fallback to use BACO\n"); 450 } 451 break; 452 case 1: 453 if (bamaco_support & BACO_SUPPORT) { 454 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO; 455 dev_info(adev->dev, "Forcing BACO for runtime pm\n"); 456 } 457 break; 458 case -1: 459 case -2: 460 if (amdgpu_device_supports_px(adev)) { 461 /* enable PX as runtime mode */ 462 adev->pm.rpm_mode = AMDGPU_RUNPM_PX; 463 dev_info(adev->dev, "Using ATPX for runtime pm\n"); 464 } else if (amdgpu_device_supports_boco(adev)) { 465 /* enable boco as runtime mode */ 466 adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO; 467 dev_info(adev->dev, "Using BOCO for runtime pm\n"); 468 } else { 469 if (!bamaco_support) 470 goto no_runtime_pm; 471 472 switch (adev->asic_type) { 473 case CHIP_VEGA20: 474 case CHIP_ARCTURUS: 475 /* BACO are not supported on vega20 and arctrus */ 476 break; 477 case CHIP_VEGA10: 478 /* enable BACO as runpm mode if noretry=0 */ 479 if (!adev->gmc.noretry && !amdgpu_passthrough(adev)) 480 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO; 481 break; 482 default: 483 /* enable BACO as runpm mode on CI+ */ 484 if (!amdgpu_passthrough(adev)) 485 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO; 486 break; 487 } 488 489 if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) { 490 if (bamaco_support & MACO_SUPPORT) { 491 adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO; 492 dev_info(adev->dev, 
"Using BAMACO for runtime pm\n"); 493 } else { 494 dev_info(adev->dev, "Using BACO for runtime pm\n"); 495 } 496 } 497 } 498 break; 499 case 0: 500 dev_info(adev->dev, "runtime pm is manually disabled\n"); 501 break; 502 default: 503 break; 504 } 505 506 no_runtime_pm: 507 if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE) 508 dev_info(adev->dev, "Runtime PM not available\n"); 509 } 510 /** 511 * amdgpu_device_supports_smart_shift - Is the device dGPU with 512 * smart shift support 513 * 514 * @adev: amdgpu device pointer 515 * 516 * Returns true if the device is a dGPU with Smart Shift support, 517 * otherwise returns false. 518 */ 519 bool amdgpu_device_supports_smart_shift(struct amdgpu_device *adev) 520 { 521 return (amdgpu_device_supports_boco(adev) && 522 amdgpu_acpi_is_power_shift_control_supported()); 523 } 524 525 /* 526 * VRAM access helper functions 527 */ 528 529 /** 530 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA 531 * 532 * @adev: amdgpu_device pointer 533 * @pos: offset of the buffer in vram 534 * @buf: virtual address of the buffer in system memory 535 * @size: read/write size, sizeof(@buf) must > @size 536 * @write: true - write to vram, otherwise - read from vram 537 */ 538 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos, 539 void *buf, size_t size, bool write) 540 { 541 unsigned long flags; 542 uint32_t hi = ~0, tmp = 0; 543 uint32_t *data = buf; 544 uint64_t last; 545 int idx; 546 547 if (!drm_dev_enter(adev_to_drm(adev), &idx)) 548 return; 549 550 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4)); 551 552 spin_lock_irqsave(&adev->mmio_idx_lock, flags); 553 for (last = pos + size; pos < last; pos += 4) { 554 tmp = pos >> 31; 555 556 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000); 557 if (tmp != hi) { 558 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp); 559 hi = tmp; 560 } 561 if (write) 562 WREG32_NO_KIQ(mmMM_DATA, *data++); 563 else 564 *data++ = RREG32_NO_KIQ(mmMM_DATA); 565 } 566 567 
spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); 568 drm_dev_exit(idx); 569 } 570 571 /** 572 * amdgpu_device_aper_access - access vram by vram aperture 573 * 574 * @adev: amdgpu_device pointer 575 * @pos: offset of the buffer in vram 576 * @buf: virtual address of the buffer in system memory 577 * @size: read/write size, sizeof(@buf) must > @size 578 * @write: true - write to vram, otherwise - read from vram 579 * 580 * The return value means how many bytes have been transferred. 581 */ 582 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos, 583 void *buf, size_t size, bool write) 584 { 585 #ifdef CONFIG_64BIT 586 void __iomem *addr; 587 size_t count = 0; 588 uint64_t last; 589 590 if (!adev->mman.aper_base_kaddr) 591 return 0; 592 593 last = min(pos + size, adev->gmc.visible_vram_size); 594 if (last > pos) { 595 addr = adev->mman.aper_base_kaddr + pos; 596 count = last - pos; 597 598 if (write) { 599 memcpy_toio(addr, buf, count); 600 /* Make sure HDP write cache flush happens without any reordering 601 * after the system memory contents are sent over PCIe device 602 */ 603 mb(); 604 amdgpu_device_flush_hdp(adev, NULL); 605 } else { 606 amdgpu_device_invalidate_hdp(adev, NULL); 607 /* Make sure HDP read cache is invalidated before issuing a read 608 * to the PCIe device 609 */ 610 mb(); 611 memcpy_fromio(buf, addr, count); 612 } 613 614 } 615 616 return count; 617 #else 618 return 0; 619 #endif 620 } 621 622 /** 623 * amdgpu_device_vram_access - read/write a buffer in vram 624 * 625 * @adev: amdgpu_device pointer 626 * @pos: offset of the buffer in vram 627 * @buf: virtual address of the buffer in system memory 628 * @size: read/write size, sizeof(@buf) must > @size 629 * @write: true - write to vram, otherwise - read from vram 630 */ 631 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos, 632 void *buf, size_t size, bool write) 633 { 634 size_t count; 635 636 /* try to using vram apreature to access vram first */ 637 
count = amdgpu_device_aper_access(adev, pos, buf, size, write); 638 size -= count; 639 if (size) { 640 /* using MM to access rest vram */ 641 pos += count; 642 buf += count; 643 amdgpu_device_mm_access(adev, pos, buf, size, write); 644 } 645 } 646 647 /* 648 * register access helper functions. 649 */ 650 651 /* Check if hw access should be skipped because of hotplug or device error */ 652 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev) 653 { 654 if (adev->no_hw_access) 655 return true; 656 657 #ifdef CONFIG_LOCKDEP 658 /* 659 * This is a bit complicated to understand, so worth a comment. What we assert 660 * here is that the GPU reset is not running on another thread in parallel. 661 * 662 * For this we trylock the read side of the reset semaphore, if that succeeds 663 * we know that the reset is not running in parallel. 664 * 665 * If the trylock fails we assert that we are either already holding the read 666 * side of the lock or are the reset thread itself and hold the write side of 667 * the lock. 668 */ 669 if (in_task()) { 670 if (down_read_trylock(&adev->reset_domain->sem)) 671 up_read(&adev->reset_domain->sem); 672 else 673 lockdep_assert_held(&adev->reset_domain->sem); 674 } 675 #endif 676 return false; 677 } 678 679 /** 680 * amdgpu_device_rreg - read a memory mapped IO or indirect register 681 * 682 * @adev: amdgpu_device pointer 683 * @reg: dword aligned register offset 684 * @acc_flags: access flags which require special behavior 685 * 686 * Returns the 32 bit value from the offset specified. 
687 */ 688 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, 689 uint32_t reg, uint32_t acc_flags) 690 { 691 uint32_t ret; 692 693 if (amdgpu_device_skip_hw_access(adev)) 694 return 0; 695 696 if ((reg * 4) < adev->rmmio_size) { 697 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 698 amdgpu_sriov_runtime(adev) && 699 down_read_trylock(&adev->reset_domain->sem)) { 700 ret = amdgpu_kiq_rreg(adev, reg, 0); 701 up_read(&adev->reset_domain->sem); 702 } else { 703 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); 704 } 705 } else { 706 ret = adev->pcie_rreg(adev, reg * 4); 707 } 708 709 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret); 710 711 return ret; 712 } 713 714 /* 715 * MMIO register read with bytes helper functions 716 * @offset:bytes offset from MMIO start 717 */ 718 719 /** 720 * amdgpu_mm_rreg8 - read a memory mapped IO register 721 * 722 * @adev: amdgpu_device pointer 723 * @offset: byte aligned register offset 724 * 725 * Returns the 8 bit value from the offset specified. 726 */ 727 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) 728 { 729 if (amdgpu_device_skip_hw_access(adev)) 730 return 0; 731 732 if (offset < adev->rmmio_size) 733 return (readb(adev->rmmio + offset)); 734 BUG(); 735 } 736 737 738 /** 739 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC 740 * 741 * @adev: amdgpu_device pointer 742 * @reg: dword aligned register offset 743 * @acc_flags: access flags which require special behavior 744 * @xcc_id: xcc accelerated compute core id 745 * 746 * Returns the 32 bit value from the offset specified. 
747 */ 748 uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev, 749 uint32_t reg, uint32_t acc_flags, 750 uint32_t xcc_id) 751 { 752 uint32_t ret, rlcg_flag; 753 754 if (amdgpu_device_skip_hw_access(adev)) 755 return 0; 756 757 if ((reg * 4) < adev->rmmio_size) { 758 if (amdgpu_sriov_vf(adev) && 759 !amdgpu_sriov_runtime(adev) && 760 adev->gfx.rlc.rlcg_reg_access_supported && 761 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, 762 GC_HWIP, false, 763 &rlcg_flag)) { 764 ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id)); 765 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 766 amdgpu_sriov_runtime(adev) && 767 down_read_trylock(&adev->reset_domain->sem)) { 768 ret = amdgpu_kiq_rreg(adev, reg, xcc_id); 769 up_read(&adev->reset_domain->sem); 770 } else { 771 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); 772 } 773 } else { 774 ret = adev->pcie_rreg(adev, reg * 4); 775 } 776 777 return ret; 778 } 779 780 /* 781 * MMIO register write with bytes helper functions 782 * @offset:bytes offset from MMIO start 783 * @value: the value want to be written to the register 784 */ 785 786 /** 787 * amdgpu_mm_wreg8 - read a memory mapped IO register 788 * 789 * @adev: amdgpu_device pointer 790 * @offset: byte aligned register offset 791 * @value: 8 bit value to write 792 * 793 * Writes the value specified to the offset specified. 794 */ 795 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) 796 { 797 if (amdgpu_device_skip_hw_access(adev)) 798 return; 799 800 if (offset < adev->rmmio_size) 801 writeb(value, adev->rmmio + offset); 802 else 803 BUG(); 804 } 805 806 /** 807 * amdgpu_device_wreg - write to a memory mapped IO or indirect register 808 * 809 * @adev: amdgpu_device pointer 810 * @reg: dword aligned register offset 811 * @v: 32 bit value to write to the register 812 * @acc_flags: access flags which require special behavior 813 * 814 * Writes the value specified to the offset specified. 
815 */ 816 void amdgpu_device_wreg(struct amdgpu_device *adev, 817 uint32_t reg, uint32_t v, 818 uint32_t acc_flags) 819 { 820 if (amdgpu_device_skip_hw_access(adev)) 821 return; 822 823 if ((reg * 4) < adev->rmmio_size) { 824 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 825 amdgpu_sriov_runtime(adev) && 826 down_read_trylock(&adev->reset_domain->sem)) { 827 amdgpu_kiq_wreg(adev, reg, v, 0); 828 up_read(&adev->reset_domain->sem); 829 } else { 830 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 831 } 832 } else { 833 adev->pcie_wreg(adev, reg * 4, v); 834 } 835 836 trace_amdgpu_device_wreg(adev->pdev->device, reg, v); 837 } 838 839 /** 840 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range 841 * 842 * @adev: amdgpu_device pointer 843 * @reg: mmio/rlc register 844 * @v: value to write 845 * @xcc_id: xcc accelerated compute core id 846 * 847 * this function is invoked only for the debugfs register access 848 */ 849 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, 850 uint32_t reg, uint32_t v, 851 uint32_t xcc_id) 852 { 853 if (amdgpu_device_skip_hw_access(adev)) 854 return; 855 856 if (amdgpu_sriov_fullaccess(adev) && 857 adev->gfx.rlc.funcs && 858 adev->gfx.rlc.funcs->is_rlcg_access_range) { 859 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) 860 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id); 861 } else if ((reg * 4) >= adev->rmmio_size) { 862 adev->pcie_wreg(adev, reg * 4, v); 863 } else { 864 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 865 } 866 } 867 868 /** 869 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC 870 * 871 * @adev: amdgpu_device pointer 872 * @reg: dword aligned register offset 873 * @v: 32 bit value to write to the register 874 * @acc_flags: access flags which require special behavior 875 * @xcc_id: xcc accelerated compute core id 876 * 877 * Writes the value specified to the offset specified. 
878 */ 879 void amdgpu_device_xcc_wreg(struct amdgpu_device *adev, 880 uint32_t reg, uint32_t v, 881 uint32_t acc_flags, uint32_t xcc_id) 882 { 883 uint32_t rlcg_flag; 884 885 if (amdgpu_device_skip_hw_access(adev)) 886 return; 887 888 if ((reg * 4) < adev->rmmio_size) { 889 if (amdgpu_sriov_vf(adev) && 890 !amdgpu_sriov_runtime(adev) && 891 adev->gfx.rlc.rlcg_reg_access_supported && 892 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, 893 GC_HWIP, true, 894 &rlcg_flag)) { 895 amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id)); 896 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && 897 amdgpu_sriov_runtime(adev) && 898 down_read_trylock(&adev->reset_domain->sem)) { 899 amdgpu_kiq_wreg(adev, reg, v, xcc_id); 900 up_read(&adev->reset_domain->sem); 901 } else { 902 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); 903 } 904 } else { 905 adev->pcie_wreg(adev, reg * 4, v); 906 } 907 } 908 909 /** 910 * amdgpu_device_indirect_rreg - read an indirect register 911 * 912 * @adev: amdgpu_device pointer 913 * @reg_addr: indirect register address to read from 914 * 915 * Returns the value of indirect register @reg_addr 916 */ 917 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, 918 u32 reg_addr) 919 { 920 unsigned long flags, pcie_index, pcie_data; 921 void __iomem *pcie_index_offset; 922 void __iomem *pcie_data_offset; 923 u32 r; 924 925 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 926 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 927 928 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 929 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 930 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 931 932 writel(reg_addr, pcie_index_offset); 933 readl(pcie_index_offset); 934 r = readl(pcie_data_offset); 935 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 936 937 return r; 938 } 939 940 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev, 941 u64 reg_addr) 942 { 943 unsigned 
long flags, pcie_index, pcie_index_hi, pcie_data; 944 u32 r; 945 void __iomem *pcie_index_offset; 946 void __iomem *pcie_index_hi_offset; 947 void __iomem *pcie_data_offset; 948 949 if (unlikely(!adev->nbio.funcs)) { 950 pcie_index = AMDGPU_PCIE_INDEX_FALLBACK; 951 pcie_data = AMDGPU_PCIE_DATA_FALLBACK; 952 } else { 953 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 954 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); 955 } 956 957 if (reg_addr >> 32) { 958 if (unlikely(!adev->nbio.funcs)) 959 pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK; 960 else 961 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); 962 } else { 963 pcie_index_hi = 0; 964 } 965 966 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 967 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; 968 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; 969 if (pcie_index_hi != 0) 970 pcie_index_hi_offset = (void __iomem *)adev->rmmio + 971 pcie_index_hi * 4; 972 973 writel(reg_addr, pcie_index_offset); 974 readl(pcie_index_offset); 975 if (pcie_index_hi != 0) { 976 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); 977 readl(pcie_index_hi_offset); 978 } 979 r = readl(pcie_data_offset); 980 981 /* clear the high bits */ 982 if (pcie_index_hi != 0) { 983 writel(0, pcie_index_hi_offset); 984 readl(pcie_index_hi_offset); 985 } 986 987 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 988 989 return r; 990 } 991 992 /** 993 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register 994 * 995 * @adev: amdgpu_device pointer 996 * @reg_addr: indirect register address to read from 997 * 998 * Returns the value of indirect register @reg_addr 999 */ 1000 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, 1001 u32 reg_addr) 1002 { 1003 unsigned long flags, pcie_index, pcie_data; 1004 void __iomem *pcie_index_offset; 1005 void __iomem *pcie_data_offset; 1006 u64 r; 1007 1008 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); 
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64_ext - read a 64bits indirect register
 * through a 64-bit register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address
 *
 * The low 32 bits of @reg_addr are written to the PCIE index register and
 * the value is read back through the PCIE data register, once for
 * @reg_addr and once for @reg_addr + 4. When @reg_addr has bits set above
 * bit 31 and the nbio callbacks provide get_pcie_index_hi_offset (returning
 * a non-zero offset), bits 39:32 of @reg_addr are also written to the high
 * index register before each data access, and the high index register is
 * written back to 0 before the lock is dropped.
 *
 * The whole sequence runs under adev->pcie_idx_lock with interrupts saved.
 *
 * Returns the 64-bit value assembled from the two 32-bit reads.
 */
u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
				  u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	/* pcie_index_hi stays 0 (= "not used") unless both conditions hold */
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	/* offsets returned by the callbacks are dword indices into rmmio */
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	/* pcie_index_hi_offset is only assigned, and only used, when non-zero */
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	/* each index write is read back (presumably to flush the posted write) */
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	/* NOTE(review): a carry out of the low 32 bits of reg_addr + 4 is not
	 * propagated into the high index value written below.
	 */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r |= ((u64)readl(pcie_data_offset) << 32);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 * Writes @reg_addr to the PCIE index register and @reg_data to the PCIE
 * data register, reading each register back after the write. The sequence
 * runs under adev->pcie_idx_lock with interrupts saved.
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg_ext - write an indirect register through a
 * 64-bit register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address
 * @reg_data: indirect register data
 *
 * Same index/data sequence as amdgpu_device_indirect_wreg(). When @reg_addr
 * has bits set above bit 31 and the nbio callbacks provide
 * get_pcie_index_hi_offset (returning a non-zero offset), bits 39:32 of
 * @reg_addr are written to the high index register before the data write
 * and the high index register is written back to 0 afterwards.
 */
void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
				     u64 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	/* 0 means "high index register not used" for the rest of the function */
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 * Writes the low 32 bits of @reg_data at @reg_addr and the high 32 bits at
 * @reg_addr + 4, each through the PCIE index/data register pair, under
 * adev->pcie_idx_lock with interrupts saved.
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64_ext - write a 64bits indirect register
 * through a 64-bit register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address
 * @reg_data: indirect register data
 *
 * Same two-step sequence as amdgpu_device_indirect_wreg64(). When @reg_addr
 * has bits set above bit 31 and the nbio callbacks provide
 * get_pcie_index_hi_offset (returning a non-zero offset), bits 39:32 of
 * @reg_addr are written to the high index register before each data write
 * and the high index register is written back to 0 afterwards.
 */
void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
				   u64 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_rev_id - query device rev_id
 *
 * @adev: amdgpu_device pointer
 *
 * Return device rev_id as reported by the nbio get_rev_id callback.
 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
{
	return adev->nbio.funcs->get_rev_id(adev);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Logs the offending register offset and calls BUG(); the return
 * statement that follows BUG() returns 0.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	dev_err(adev->dev, "Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

/*
 * Same as amdgpu_invalid_rreg(), for the read callback that takes a
 * 64-bit register offset.
 */
static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
{
	dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 * Logs the offending register offset and value and calls BUG().
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write register 0x%04X with 0x%08X\n", reg,
		v);
	BUG();
}

/*
 * Same as amdgpu_invalid_wreg(), for the write callback that takes a
 * 64-bit register offset.
 */
static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write register 0x%llX with 0x%08X\n", reg,
		v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy 64 bit register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Logs the offending register offset and calls BUG(); the return
 * statement that follows BUG() returns 0.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	dev_err(adev->dev, "Invalid callback to read 64 bit register 0x%04X\n",
		reg);
	BUG();
	return 0;
}

/*
 * Same as amdgpu_invalid_rreg64(), for the read callback that takes a
 * 64-bit register offset.
 */
static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
{
	dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy 64 bit reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy 64 bit register write function. Used for register blocks
 * that certain asics don't have (all asics).
 * Logs the offending register offset and value and calls BUG().
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		reg, v);
	BUG();
}

/*
 * Same as amdgpu_invalid_wreg64(), for the write callback that takes a
 * 64-bit register offset.
 */
static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
		reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy block reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Logs the register and block offsets and calls BUG().
1354 */ 1355 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, 1356 uint32_t block, uint32_t reg) 1357 { 1358 dev_err(adev->dev, 1359 "Invalid callback to read register 0x%04X in block 0x%04X\n", 1360 reg, block); 1361 BUG(); 1362 return 0; 1363 } 1364 1365 /** 1366 * amdgpu_block_invalid_wreg - dummy reg write function 1367 * 1368 * @adev: amdgpu_device pointer 1369 * @block: offset of instance 1370 * @reg: offset of register 1371 * @v: value to write to the register 1372 * 1373 * Dummy register read function. Used for register blocks 1374 * that certain asics don't have (all asics). 1375 */ 1376 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, 1377 uint32_t block, 1378 uint32_t reg, uint32_t v) 1379 { 1380 dev_err(adev->dev, 1381 "Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n", 1382 reg, block, v); 1383 BUG(); 1384 } 1385 1386 static uint32_t amdgpu_device_get_vbios_flags(struct amdgpu_device *adev) 1387 { 1388 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU)) 1389 return AMDGPU_VBIOS_SKIP; 1390 1391 if (hweight32(adev->aid_mask) && amdgpu_passthrough(adev)) 1392 return AMDGPU_VBIOS_OPTIONAL; 1393 1394 return 0; 1395 } 1396 1397 /** 1398 * amdgpu_device_asic_init - Wrapper for atom asic_init 1399 * 1400 * @adev: amdgpu_device pointer 1401 * 1402 * Does any asic specific work and then calls atom asic init. 
1403 */ 1404 static int amdgpu_device_asic_init(struct amdgpu_device *adev) 1405 { 1406 uint32_t flags; 1407 bool optional; 1408 int ret; 1409 1410 amdgpu_asic_pre_asic_init(adev); 1411 flags = amdgpu_device_get_vbios_flags(adev); 1412 optional = !!(flags & (AMDGPU_VBIOS_OPTIONAL | AMDGPU_VBIOS_SKIP)); 1413 1414 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 1415 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 1416 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) || 1417 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) { 1418 amdgpu_psp_wait_for_bootloader(adev); 1419 if (optional && !adev->bios) 1420 return 0; 1421 1422 ret = amdgpu_atomfirmware_asic_init(adev, true); 1423 return ret; 1424 } else { 1425 if (optional && !adev->bios) 1426 return 0; 1427 1428 return amdgpu_atom_asic_init(adev->mode_info.atom_context); 1429 } 1430 1431 return 0; 1432 } 1433 1434 /** 1435 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page 1436 * 1437 * @adev: amdgpu_device pointer 1438 * 1439 * Allocates a scratch page of VRAM for use by various things in the 1440 * driver. 1441 */ 1442 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev) 1443 { 1444 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE, 1445 AMDGPU_GEM_DOMAIN_VRAM | 1446 AMDGPU_GEM_DOMAIN_GTT, 1447 &adev->mem_scratch.robj, 1448 &adev->mem_scratch.gpu_addr, 1449 (void **)&adev->mem_scratch.ptr); 1450 } 1451 1452 /** 1453 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page 1454 * 1455 * @adev: amdgpu_device pointer 1456 * 1457 * Frees the VRAM scratch page. 1458 */ 1459 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev) 1460 { 1461 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL); 1462 } 1463 1464 /** 1465 * amdgpu_device_program_register_sequence - program an array of registers. 
1466 * 1467 * @adev: amdgpu_device pointer 1468 * @registers: pointer to the register array 1469 * @array_size: size of the register array 1470 * 1471 * Programs an array or registers with and or masks. 1472 * This is a helper for setting golden registers. 1473 */ 1474 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, 1475 const u32 *registers, 1476 const u32 array_size) 1477 { 1478 u32 tmp, reg, and_mask, or_mask; 1479 int i; 1480 1481 if (array_size % 3) 1482 return; 1483 1484 for (i = 0; i < array_size; i += 3) { 1485 reg = registers[i + 0]; 1486 and_mask = registers[i + 1]; 1487 or_mask = registers[i + 2]; 1488 1489 if (and_mask == 0xffffffff) { 1490 tmp = or_mask; 1491 } else { 1492 tmp = RREG32(reg); 1493 tmp &= ~and_mask; 1494 if (adev->family >= AMDGPU_FAMILY_AI) 1495 tmp |= (or_mask & and_mask); 1496 else 1497 tmp |= or_mask; 1498 } 1499 WREG32(reg, tmp); 1500 } 1501 } 1502 1503 /** 1504 * amdgpu_device_pci_config_reset - reset the GPU 1505 * 1506 * @adev: amdgpu_device pointer 1507 * 1508 * Resets the GPU using the pci config reset sequence. 1509 * Only applicable to asics prior to vega10. 1510 */ 1511 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) 1512 { 1513 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA); 1514 } 1515 1516 /** 1517 * amdgpu_device_pci_reset - reset the GPU using generic PCI means 1518 * 1519 * @adev: amdgpu_device pointer 1520 * 1521 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.). 1522 */ 1523 int amdgpu_device_pci_reset(struct amdgpu_device *adev) 1524 { 1525 return pci_reset_function(adev->pdev); 1526 } 1527 1528 /* 1529 * amdgpu_device_wb_*() 1530 * Writeback is the method by which the GPU updates special pages in memory 1531 * with the status of certain GPU events (fences, ring pointers,etc.). 
1532 */ 1533 1534 /** 1535 * amdgpu_device_wb_fini - Disable Writeback and free memory 1536 * 1537 * @adev: amdgpu_device pointer 1538 * 1539 * Disables Writeback and frees the Writeback memory (all asics). 1540 * Used at driver shutdown. 1541 */ 1542 static void amdgpu_device_wb_fini(struct amdgpu_device *adev) 1543 { 1544 if (adev->wb.wb_obj) { 1545 amdgpu_bo_free_kernel(&adev->wb.wb_obj, 1546 &adev->wb.gpu_addr, 1547 (void **)&adev->wb.wb); 1548 adev->wb.wb_obj = NULL; 1549 } 1550 } 1551 1552 /** 1553 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory 1554 * 1555 * @adev: amdgpu_device pointer 1556 * 1557 * Initializes writeback and allocates writeback memory (all asics). 1558 * Used at driver startup. 1559 * Returns 0 on success or an -error on failure. 1560 */ 1561 static int amdgpu_device_wb_init(struct amdgpu_device *adev) 1562 { 1563 int r; 1564 1565 if (adev->wb.wb_obj == NULL) { 1566 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */ 1567 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8, 1568 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT, 1569 &adev->wb.wb_obj, &adev->wb.gpu_addr, 1570 (void **)&adev->wb.wb); 1571 if (r) { 1572 dev_warn(adev->dev, "(%d) create WB bo failed\n", r); 1573 return r; 1574 } 1575 1576 adev->wb.num_wb = AMDGPU_MAX_WB; 1577 memset(&adev->wb.used, 0, sizeof(adev->wb.used)); 1578 1579 /* clear wb memory */ 1580 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); 1581 } 1582 1583 return 0; 1584 } 1585 1586 /** 1587 * amdgpu_device_wb_get - Allocate a wb entry 1588 * 1589 * @adev: amdgpu_device pointer 1590 * @wb: wb index 1591 * 1592 * Allocate a wb slot for use by the driver (all asics). 1593 * Returns 0 on success or -EINVAL on failure. 
1594 */ 1595 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb) 1596 { 1597 unsigned long flags, offset; 1598 1599 spin_lock_irqsave(&adev->wb.lock, flags); 1600 offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb); 1601 if (offset < adev->wb.num_wb) { 1602 __set_bit(offset, adev->wb.used); 1603 spin_unlock_irqrestore(&adev->wb.lock, flags); 1604 *wb = offset << 3; /* convert to dw offset */ 1605 return 0; 1606 } else { 1607 spin_unlock_irqrestore(&adev->wb.lock, flags); 1608 return -EINVAL; 1609 } 1610 } 1611 1612 /** 1613 * amdgpu_device_wb_free - Free a wb entry 1614 * 1615 * @adev: amdgpu_device pointer 1616 * @wb: wb index 1617 * 1618 * Free a wb slot allocated for use by the driver (all asics) 1619 */ 1620 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb) 1621 { 1622 unsigned long flags; 1623 1624 wb >>= 3; 1625 spin_lock_irqsave(&adev->wb.lock, flags); 1626 if (wb < adev->wb.num_wb) 1627 __clear_bit(wb, adev->wb.used); 1628 spin_unlock_irqrestore(&adev->wb.lock, flags); 1629 } 1630 1631 /** 1632 * amdgpu_device_resize_fb_bar - try to resize FB BAR 1633 * 1634 * @adev: amdgpu_device pointer 1635 * 1636 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not 1637 * to fail, but if any of the BARs is not accessible after the size we abort 1638 * driver loading by returning -ENODEV. 
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	/* requested size: the rebar size code covering all of VRAM */
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned int i;
	u16 cmd;
	int r;

	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
		return 0;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* resizing disabled through the amdgpu_rebar module parameter */
	if (!amdgpu_rebar)
		return 0;

	/* resizing on Dell G5 SE platforms causes problems with runtime pm */
	if ((amdgpu_runtime_pm != 0) &&
	    adev->pdev->vendor == PCI_VENDOR_ID_ATI &&
	    adev->pdev->device == 0x731f &&
	    adev->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL)
		return 0;

	/* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
	/* only a warning: the resize attempt continues regardless */
	if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
		dev_warn(
			adev->dev,
			"System can't access extended configuration space, please check!!\n");

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	/* NOTE(review): the flag test below is true when either IORESOURCE_MEM
	 * or IORESOURCE_MEM_64 is set, so the effective filter is "a memory
	 * resource starting above 4GB".
	 */
	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	/* a failed resize is only logged; the BARs are reassigned either way */
	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		dev_info(adev->dev,
			 "Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		dev_err(adev->dev, "Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	/* NOTE(review): on this -ENODEV path the saved PCI_COMMAND value is
	 * not written back, so memory decoding stays disabled.
	 */
	r = amdgpu_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	/* restore the original command word, re-enabling memory decoding */
	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

/*
 * GPU helper functions.
 */
/**
 * amdgpu_device_need_post - check if the hw needs post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or if post is needed because a hw reset was performed.
 * Returns true if post is needed or false if not.
1740 */ 1741 bool amdgpu_device_need_post(struct amdgpu_device *adev) 1742 { 1743 uint32_t reg, flags; 1744 1745 if (amdgpu_sriov_vf(adev)) 1746 return false; 1747 1748 flags = amdgpu_device_get_vbios_flags(adev); 1749 if (flags & AMDGPU_VBIOS_SKIP) 1750 return false; 1751 if ((flags & AMDGPU_VBIOS_OPTIONAL) && !adev->bios) 1752 return false; 1753 1754 if (amdgpu_passthrough(adev)) { 1755 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot 1756 * some old smc fw still need driver do vPost otherwise gpu hang, while 1757 * those smc fw version above 22.15 doesn't have this flaw, so we force 1758 * vpost executed for smc version below 22.15 1759 */ 1760 if (adev->asic_type == CHIP_FIJI) { 1761 int err; 1762 uint32_t fw_ver; 1763 1764 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); 1765 /* force vPost if error occurred */ 1766 if (err) 1767 return true; 1768 1769 fw_ver = *((uint32_t *)adev->pm.fw->data + 69); 1770 release_firmware(adev->pm.fw); 1771 if (fw_ver < 0x00160e00) 1772 return true; 1773 } 1774 } 1775 1776 /* Don't post if we need to reset whole hive on init */ 1777 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 1778 return false; 1779 1780 if (adev->has_hw_reset) { 1781 adev->has_hw_reset = false; 1782 return true; 1783 } 1784 1785 /* bios scratch used on CIK+ */ 1786 if (adev->asic_type >= CHIP_BONAIRE) 1787 return amdgpu_atombios_scratch_need_asic_init(adev); 1788 1789 /* check MEM_SIZE for older asics */ 1790 reg = amdgpu_asic_get_config_memsize(adev); 1791 1792 if ((reg != 0) && (reg != 0xffffffff)) 1793 return false; 1794 1795 return true; 1796 } 1797 1798 /* 1799 * Check whether seamless boot is supported. 1800 * 1801 * So far we only support seamless boot on DCE 3.0 or later. 1802 * If users report that it works on older ASICS as well, we may 1803 * loosen this. 
1804 */ 1805 bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev) 1806 { 1807 switch (amdgpu_seamless) { 1808 case -1: 1809 break; 1810 case 1: 1811 return true; 1812 case 0: 1813 return false; 1814 default: 1815 dev_err(adev->dev, "Invalid value for amdgpu.seamless: %d\n", 1816 amdgpu_seamless); 1817 return false; 1818 } 1819 1820 if (!(adev->flags & AMD_IS_APU)) 1821 return false; 1822 1823 if (adev->mman.keep_stolen_vga_memory) 1824 return false; 1825 1826 return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0); 1827 } 1828 1829 /* 1830 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids 1831 * don't support dynamic speed switching. Until we have confirmation from Intel 1832 * that a specific host supports it, it's safer that we keep it disabled for all. 1833 * 1834 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/ 1835 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663 1836 */ 1837 static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev) 1838 { 1839 #if IS_ENABLED(CONFIG_X86) 1840 struct cpuinfo_x86 *c = &cpu_data(0); 1841 1842 /* eGPU change speeds based on USB4 fabric conditions */ 1843 if (dev_is_removable(adev->dev)) 1844 return true; 1845 1846 if (c->x86_vendor == X86_VENDOR_INTEL) 1847 return false; 1848 #endif 1849 return true; 1850 } 1851 1852 static bool amdgpu_device_aspm_support_quirk(struct amdgpu_device *adev) 1853 { 1854 /* Enabling ASPM causes randoms hangs on Tahiti and Oland on Zen4. 1855 * It's unclear if this is a platform-specific or GPU-specific issue. 1856 * Disable ASPM on SI for the time being. 
1857 */ 1858 if (adev->family == AMDGPU_FAMILY_SI) 1859 return true; 1860 1861 #if IS_ENABLED(CONFIG_X86) 1862 struct cpuinfo_x86 *c = &cpu_data(0); 1863 1864 if (!(amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 0) || 1865 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 1))) 1866 return false; 1867 1868 if (c->x86 == 6 && 1869 adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5) { 1870 switch (c->x86_model) { 1871 case VFM_MODEL(INTEL_ALDERLAKE): 1872 case VFM_MODEL(INTEL_ALDERLAKE_L): 1873 case VFM_MODEL(INTEL_RAPTORLAKE): 1874 case VFM_MODEL(INTEL_RAPTORLAKE_P): 1875 case VFM_MODEL(INTEL_RAPTORLAKE_S): 1876 return true; 1877 default: 1878 return false; 1879 } 1880 } else { 1881 return false; 1882 } 1883 #else 1884 return false; 1885 #endif 1886 } 1887 1888 /** 1889 * amdgpu_device_should_use_aspm - check if the device should program ASPM 1890 * 1891 * @adev: amdgpu_device pointer 1892 * 1893 * Confirm whether the module parameter and pcie bridge agree that ASPM should 1894 * be set for this device. 1895 * 1896 * Returns true if it should be used or false if not. 1897 */ 1898 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) 1899 { 1900 switch (amdgpu_aspm) { 1901 case -1: 1902 break; 1903 case 0: 1904 return false; 1905 case 1: 1906 return true; 1907 default: 1908 return false; 1909 } 1910 if (adev->flags & AMD_IS_APU) 1911 return false; 1912 if (amdgpu_device_aspm_support_quirk(adev)) 1913 return false; 1914 return pcie_aspm_enabled(adev->pdev); 1915 } 1916 1917 /* if we get transitioned to only one device, take VGA back */ 1918 /** 1919 * amdgpu_device_vga_set_decode - enable/disable vga decode 1920 * 1921 * @pdev: PCI device pointer 1922 * @state: enable/disable vga decode 1923 * 1924 * Enable/disable vga decode (all asics). 1925 * Returns VGA resource flags. 
1926 */ 1927 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, 1928 bool state) 1929 { 1930 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); 1931 1932 amdgpu_asic_set_vga_state(adev, state); 1933 if (state) 1934 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | 1935 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1936 else 1937 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; 1938 } 1939 1940 /** 1941 * amdgpu_device_check_block_size - validate the vm block size 1942 * 1943 * @adev: amdgpu_device pointer 1944 * 1945 * Validates the vm block size specified via module parameter. 1946 * The vm block size defines number of bits in page table versus page directory, 1947 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1948 * page table and the remaining bits are in the page directory. 1949 */ 1950 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1951 { 1952 /* defines number of bits in page table versus page directory, 1953 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1954 * page table and the remaining bits are in the page directory 1955 */ 1956 if (amdgpu_vm_block_size == -1) 1957 return; 1958 1959 if (amdgpu_vm_block_size < 9) { 1960 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1961 amdgpu_vm_block_size); 1962 amdgpu_vm_block_size = -1; 1963 } 1964 } 1965 1966 /** 1967 * amdgpu_device_check_vm_size - validate the vm size 1968 * 1969 * @adev: amdgpu_device pointer 1970 * 1971 * Validates the vm size in GB specified via module parameter. 1972 * The VM size is the size of the GPU virtual memory space in GB. 
1973 */ 1974 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1975 { 1976 /* no need to check the default value */ 1977 if (amdgpu_vm_size == -1) 1978 return; 1979 1980 if (amdgpu_vm_size < 1) { 1981 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 1982 amdgpu_vm_size); 1983 amdgpu_vm_size = -1; 1984 } 1985 } 1986 1987 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 1988 { 1989 struct sysinfo si; 1990 bool is_os_64 = (sizeof(void *) == 8); 1991 uint64_t total_memory; 1992 uint64_t dram_size_seven_GB = 0x1B8000000; 1993 uint64_t dram_size_three_GB = 0xB8000000; 1994 1995 if (amdgpu_smu_memory_pool_size == 0) 1996 return; 1997 1998 if (!is_os_64) { 1999 dev_warn(adev->dev, "Not 64-bit OS, feature not supported\n"); 2000 goto def_value; 2001 } 2002 si_meminfo(&si); 2003 total_memory = (uint64_t)si.totalram * si.mem_unit; 2004 2005 if ((amdgpu_smu_memory_pool_size == 1) || 2006 (amdgpu_smu_memory_pool_size == 2)) { 2007 if (total_memory < dram_size_three_GB) 2008 goto def_value1; 2009 } else if ((amdgpu_smu_memory_pool_size == 4) || 2010 (amdgpu_smu_memory_pool_size == 8)) { 2011 if (total_memory < dram_size_seven_GB) 2012 goto def_value1; 2013 } else { 2014 dev_warn(adev->dev, "Smu memory pool size not supported\n"); 2015 goto def_value; 2016 } 2017 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 2018 2019 return; 2020 2021 def_value1: 2022 dev_warn(adev->dev, "No enough system memory\n"); 2023 def_value: 2024 adev->pm.smu_prv_buffer_size = 0; 2025 } 2026 2027 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 2028 { 2029 if (!(adev->flags & AMD_IS_APU) || 2030 adev->asic_type < CHIP_RAVEN) 2031 return 0; 2032 2033 switch (adev->asic_type) { 2034 case CHIP_RAVEN: 2035 if (adev->pdev->device == 0x15dd) 2036 adev->apu_flags |= AMD_APU_IS_RAVEN; 2037 if (adev->pdev->device == 0x15d8) 2038 adev->apu_flags |= AMD_APU_IS_PICASSO; 2039 break; 2040 case CHIP_RENOIR: 2041 if 
((adev->pdev->device == 0x1636) || 2042 (adev->pdev->device == 0x164c)) 2043 adev->apu_flags |= AMD_APU_IS_RENOIR; 2044 else 2045 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 2046 break; 2047 case CHIP_VANGOGH: 2048 adev->apu_flags |= AMD_APU_IS_VANGOGH; 2049 break; 2050 case CHIP_YELLOW_CARP: 2051 break; 2052 case CHIP_CYAN_SKILLFISH: 2053 if ((adev->pdev->device == 0x13FE) || 2054 (adev->pdev->device == 0x143F)) 2055 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 2056 break; 2057 default: 2058 break; 2059 } 2060 2061 return 0; 2062 } 2063 2064 /** 2065 * amdgpu_device_check_arguments - validate module params 2066 * 2067 * @adev: amdgpu_device pointer 2068 * 2069 * Validates certain module parameters and updates 2070 * the associated values used by the driver (all asics). 2071 */ 2072 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 2073 { 2074 int i; 2075 2076 if (amdgpu_sched_jobs < 4) { 2077 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 2078 amdgpu_sched_jobs); 2079 amdgpu_sched_jobs = 4; 2080 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 2081 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 2082 amdgpu_sched_jobs); 2083 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 2084 } 2085 2086 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 2087 /* gart size must be greater or equal to 32M */ 2088 dev_warn(adev->dev, "gart size (%d) too small\n", 2089 amdgpu_gart_size); 2090 amdgpu_gart_size = -1; 2091 } 2092 2093 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 2094 /* gtt size must be greater or equal to 32M */ 2095 dev_warn(adev->dev, "gtt size (%d) too small\n", 2096 amdgpu_gtt_size); 2097 amdgpu_gtt_size = -1; 2098 } 2099 2100 /* valid range is between 4 and 9 inclusive */ 2101 if (amdgpu_vm_fragment_size != -1 && 2102 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 2103 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 2104 amdgpu_vm_fragment_size = -1; 2105 } 2106 2107 
if (amdgpu_sched_hw_submission < 2) { 2108 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 2109 amdgpu_sched_hw_submission); 2110 amdgpu_sched_hw_submission = 2; 2111 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 2112 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 2113 amdgpu_sched_hw_submission); 2114 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 2115 } 2116 2117 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 2118 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 2119 amdgpu_reset_method = -1; 2120 } 2121 2122 amdgpu_device_check_smu_prv_buffer_size(adev); 2123 2124 amdgpu_device_check_vm_size(adev); 2125 2126 amdgpu_device_check_block_size(adev); 2127 2128 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 2129 2130 for (i = 0; i < MAX_XCP; i++) { 2131 switch (amdgpu_enforce_isolation) { 2132 case -1: 2133 case 0: 2134 default: 2135 /* disable */ 2136 adev->enforce_isolation[i] = AMDGPU_ENFORCE_ISOLATION_DISABLE; 2137 break; 2138 case 1: 2139 /* enable */ 2140 adev->enforce_isolation[i] = 2141 AMDGPU_ENFORCE_ISOLATION_ENABLE; 2142 break; 2143 case 2: 2144 /* enable legacy mode */ 2145 adev->enforce_isolation[i] = 2146 AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY; 2147 break; 2148 case 3: 2149 /* enable only process isolation without submitting cleaner shader */ 2150 adev->enforce_isolation[i] = 2151 AMDGPU_ENFORCE_ISOLATION_NO_CLEANER_SHADER; 2152 break; 2153 } 2154 } 2155 2156 return 0; 2157 } 2158 2159 /** 2160 * amdgpu_switcheroo_set_state - set switcheroo state 2161 * 2162 * @pdev: pci dev pointer 2163 * @state: vga_switcheroo state 2164 * 2165 * Callback for the switcheroo driver. Suspends or resumes 2166 * the asics before or after it is powered up using ACPI methods. 
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
					enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	/* an OFF request is ignored when the device supports PX */
	if (amdgpu_device_supports_px(drm_to_adev(dev)) &&
	    state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		/* NOTE(review): uses pr_info() while the "switched off" branch
		 * uses dev_info() - confirm whether that is intentional.
		 */
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		/* power up, restore the cached PCI state, then resume */
		pci_set_power_state(pdev, PCI_D0);
		amdgpu_device_load_pci_state(pdev);
		r = pci_enable_device(pdev);
		/* a failure is only logged; the resume is attempted anyway */
		if (r)
			dev_warn(&pdev->dev, "pci_enable_device failed (%d)\n",
				 r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
	} else {
		dev_info(&pdev->dev, "switched off\n");
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		/* suspend, cache the PCI state, then power down */
		amdgpu_device_prepare(dev);
		amdgpu_device_suspend(dev, true);
		amdgpu_device_cache_pci_state(pdev);
		/* Shut down the device */
		pci_disable_device(pdev);
		pci_set_power_state(pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver.  Check if the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

       /*
	* FIXME: open_count is protected by drm_global_mutex but that would lead to
	* locking inversion with the driver load path. And the access here is
	* completely racy anyway. So don't bother with locking for now.
	*/
	/* switching is allowed only while nobody has the DRM device open */
	return atomic_read(&dev->open_count) == 0;
}

/* vga_switcheroo client callbacks; no reprobe handler is provided */
static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
};

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display.  This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
 *
 * The string is a ';' separated list of entries, each a PCI address (or
 * "all") optionally followed by ',' and a crtc count. The first entry that
 * matches this device wins; its crtc count is clamped to 1..6 and defaults
 * to 1 when missing or not a valid decimal number.
 */
static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
{
	adev->enable_virtual_display = false;

	if (amdgpu_virtual_display) {
		const char *pci_address_name = pci_name(adev->pdev);
		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;

		/* strsep() modifies its input, so work on a private copy */
		/* NOTE(review): the kstrdup() result is not checked; with a NULL
		 * copy strsep() should return NULL right away so the loop is
		 * skipped - confirm this is the intended fallback.
		 */
		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
		pciaddstr_tmp = pciaddstr;
		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
			/* split "address,num_crtc"; pciaddname_tmp is left
			 * pointing at the count, or NULL if there is none
			 */
			pciaddname = strsep(&pciaddname_tmp, ",");
			if (!strcmp("all", pciaddname)
			    || !strcmp(pci_address_name, pciaddname)) {
				long num_crtc;
				int res = -1;

				adev->enable_virtual_display = true;

				if (pciaddname_tmp)
					res = kstrtol(pciaddname_tmp, 10,
						      &num_crtc);

				if (!res) {
					/* clamp the requested count to 1..6 */
					if (num_crtc < 1)
						num_crtc = 1;
					if (num_crtc > 6)
						num_crtc = 6;
					adev->mode_info.num_crtc = num_crtc;
				} else {
					/* no count given, or it failed to parse */
					adev->mode_info.num_crtc = 1;
				}
				break;
			}
		}

		dev_info(
			adev->dev,
			"virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
			amdgpu_virtual_display, pci_address_name,
			adev->enable_virtual_display, adev->mode_info.num_crtc);

		kfree(pciaddstr);
	}
}

/*
 * Force a single-crtc virtual display on SR-IOV virtual functions that do
 * not already have virtual display enabled.
 */
void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
{
	if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
		adev->mode_info.num_crtc = 1;
		adev->enable_virtual_display = true;
		dev_info(adev->dev, "virtual_display:%d, num_crtc:%d\n",
			 adev->enable_virtual_display,
			 adev->mode_info.num_crtc);
	}
}

/**
 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
 * the asic.
 * Returns 0 on success, -EINVAL on failure.
 */
static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
{
	const char *chip_name;
	int err;
	const struct gpu_info_firmware_header_v1_0 *hdr;

	adev->firmware.gpu_info_fw = NULL;

	switch (adev->asic_type) {
	default:
		return 0;
	case CHIP_VEGA10:
		chip_name = "vega10";
		break;
	case CHIP_VEGA12:
		chip_name = "vega12";
		break;
	case CHIP_RAVEN:
		if (adev->apu_flags & AMD_APU_IS_RAVEN2)
			chip_name = "raven2";
		else if (adev->apu_flags & AMD_APU_IS_PICASSO)
			chip_name = "picasso";
		else
			chip_name = "raven";
		break;
	case CHIP_ARCTURUS:
		chip_name = "arcturus";
		break;
	case CHIP_NAVI12:
		if (adev->discovery.bin)
			return 0;
		chip_name = "navi12";
		break;
	case CHIP_CYAN_SKILLFISH:
		if (adev->discovery.bin)
			return 0;
		chip_name = "cyan_skillfish";
		break;
	}

	err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw,
				   AMDGPU_UCODE_OPTIONAL,
				   "amdgpu/%s_gpu_info.bin", chip_name);
	if (err) {
		dev_err(adev->dev,
			"Failed to get 
gpu_info firmware \"%s_gpu_info.bin\"\n", 2357 chip_name); 2358 goto out; 2359 } 2360 2361 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2362 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2363 2364 switch (hdr->version_major) { 2365 case 1: 2366 { 2367 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2368 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2369 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2370 2371 /* 2372 * Should be dropped when DAL no longer needs it. 2373 */ 2374 if (adev->asic_type == CHIP_NAVI12) 2375 goto parse_soc_bounding_box; 2376 2377 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2378 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2379 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2380 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2381 adev->gfx.config.max_texture_channel_caches = 2382 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2383 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2384 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2385 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2386 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2387 adev->gfx.config.double_offchip_lds_buf = 2388 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2389 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2390 adev->gfx.cu_info.max_waves_per_simd = 2391 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2392 adev->gfx.cu_info.max_scratch_slots_per_cu = 2393 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2394 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2395 if (hdr->version_minor >= 1) { 2396 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2397 (const struct 
gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2398 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2399 adev->gfx.config.num_sc_per_sh = 2400 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2401 adev->gfx.config.num_packer_per_sc = 2402 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2403 } 2404 2405 parse_soc_bounding_box: 2406 /* 2407 * soc bounding box info is not integrated in disocovery table, 2408 * we always need to parse it from gpu info firmware if needed. 2409 */ 2410 if (hdr->version_minor == 2) { 2411 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2412 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2413 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2414 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2415 } 2416 break; 2417 } 2418 default: 2419 dev_err(adev->dev, 2420 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2421 err = -EINVAL; 2422 goto out; 2423 } 2424 out: 2425 return err; 2426 } 2427 2428 static void amdgpu_uid_init(struct amdgpu_device *adev) 2429 { 2430 /* Initialize the UID for the device */ 2431 adev->uid_info = kzalloc(sizeof(struct amdgpu_uid), GFP_KERNEL); 2432 if (!adev->uid_info) { 2433 dev_warn(adev->dev, "Failed to allocate memory for UID\n"); 2434 return; 2435 } 2436 adev->uid_info->adev = adev; 2437 } 2438 2439 static void amdgpu_uid_fini(struct amdgpu_device *adev) 2440 { 2441 /* Free the UID memory */ 2442 kfree(adev->uid_info); 2443 adev->uid_info = NULL; 2444 } 2445 2446 /** 2447 * amdgpu_device_ip_early_init - run early init for hardware IPs 2448 * 2449 * @adev: amdgpu_device pointer 2450 * 2451 * Early initialization pass for hardware IPs. The hardware IPs that make 2452 * up each asic are discovered each IP's early_init callback is run. This 2453 * is the first stage in initializing the asic. 2454 * Returns 0 on success, negative error code on failure. 
2455 */ 2456 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2457 { 2458 struct amdgpu_ip_block *ip_block; 2459 struct pci_dev *parent; 2460 bool total, skip_bios; 2461 uint32_t bios_flags; 2462 int i, r; 2463 2464 amdgpu_device_enable_virtual_display(adev); 2465 2466 if (amdgpu_sriov_vf(adev)) { 2467 r = amdgpu_virt_request_full_gpu(adev, true); 2468 if (r) 2469 return r; 2470 2471 r = amdgpu_virt_init_critical_region(adev); 2472 if (r) 2473 return r; 2474 } 2475 2476 switch (adev->asic_type) { 2477 #ifdef CONFIG_DRM_AMDGPU_SI 2478 case CHIP_VERDE: 2479 case CHIP_TAHITI: 2480 case CHIP_PITCAIRN: 2481 case CHIP_OLAND: 2482 case CHIP_HAINAN: 2483 adev->family = AMDGPU_FAMILY_SI; 2484 r = si_set_ip_blocks(adev); 2485 if (r) 2486 return r; 2487 break; 2488 #endif 2489 #ifdef CONFIG_DRM_AMDGPU_CIK 2490 case CHIP_BONAIRE: 2491 case CHIP_HAWAII: 2492 case CHIP_KAVERI: 2493 case CHIP_KABINI: 2494 case CHIP_MULLINS: 2495 if (adev->flags & AMD_IS_APU) 2496 adev->family = AMDGPU_FAMILY_KV; 2497 else 2498 adev->family = AMDGPU_FAMILY_CI; 2499 2500 r = cik_set_ip_blocks(adev); 2501 if (r) 2502 return r; 2503 break; 2504 #endif 2505 case CHIP_TOPAZ: 2506 case CHIP_TONGA: 2507 case CHIP_FIJI: 2508 case CHIP_POLARIS10: 2509 case CHIP_POLARIS11: 2510 case CHIP_POLARIS12: 2511 case CHIP_VEGAM: 2512 case CHIP_CARRIZO: 2513 case CHIP_STONEY: 2514 if (adev->flags & AMD_IS_APU) 2515 adev->family = AMDGPU_FAMILY_CZ; 2516 else 2517 adev->family = AMDGPU_FAMILY_VI; 2518 2519 r = vi_set_ip_blocks(adev); 2520 if (r) 2521 return r; 2522 break; 2523 default: 2524 r = amdgpu_discovery_set_ip_blocks(adev); 2525 if (r) 2526 return r; 2527 break; 2528 } 2529 2530 /* Check for IP version 9.4.3 with A0 hardware */ 2531 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) && 2532 !amdgpu_device_get_rev_id(adev)) { 2533 dev_err(adev->dev, "Unsupported A0 hardware\n"); 2534 return -ENODEV; /* device unsupported - no device error */ 2535 } 2536 2537 if (amdgpu_has_atpx() && 
2538 (amdgpu_is_atpx_hybrid() || 2539 amdgpu_has_atpx_dgpu_power_cntl()) && 2540 ((adev->flags & AMD_IS_APU) == 0) && 2541 !dev_is_removable(&adev->pdev->dev)) 2542 adev->flags |= AMD_IS_PX; 2543 2544 if (!(adev->flags & AMD_IS_APU)) { 2545 parent = pcie_find_root_port(adev->pdev); 2546 adev->has_pr3 = parent ? pci_pr3_present(parent) : false; 2547 } 2548 2549 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2550 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2551 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2552 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2553 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2554 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2555 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2556 2557 adev->virt.is_xgmi_node_migrate_enabled = false; 2558 if (amdgpu_sriov_vf(adev)) { 2559 adev->virt.is_xgmi_node_migrate_enabled = 2560 amdgpu_ip_version((adev), GC_HWIP, 0) == IP_VERSION(9, 4, 4); 2561 } 2562 2563 total = true; 2564 for (i = 0; i < adev->num_ip_blocks; i++) { 2565 ip_block = &adev->ip_blocks[i]; 2566 2567 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2568 dev_warn(adev->dev, "disabled ip block: %d <%s>\n", i, 2569 adev->ip_blocks[i].version->funcs->name); 2570 adev->ip_blocks[i].status.valid = false; 2571 } else if (ip_block->version->funcs->early_init) { 2572 r = ip_block->version->funcs->early_init(ip_block); 2573 if (r == -ENOENT) { 2574 adev->ip_blocks[i].status.valid = false; 2575 } else if (r) { 2576 dev_err(adev->dev, 2577 "early_init of IP block <%s> failed %d\n", 2578 adev->ip_blocks[i].version->funcs->name, 2579 r); 2580 total = false; 2581 } else { 2582 adev->ip_blocks[i].status.valid = true; 2583 } 2584 } else { 2585 adev->ip_blocks[i].status.valid = true; 2586 } 2587 /* get the vbios after the asic_funcs are set up */ 2588 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2589 r = amdgpu_device_parse_gpu_info_fw(adev); 2590 if (r) 2591 return r; 2592 2593 
bios_flags = amdgpu_device_get_vbios_flags(adev); 2594 skip_bios = !!(bios_flags & AMDGPU_VBIOS_SKIP); 2595 /* Read BIOS */ 2596 if (!skip_bios) { 2597 bool optional = 2598 !!(bios_flags & AMDGPU_VBIOS_OPTIONAL); 2599 if (!amdgpu_get_bios(adev) && !optional) 2600 return -EINVAL; 2601 2602 if (optional && !adev->bios) 2603 dev_info( 2604 adev->dev, 2605 "VBIOS image optional, proceeding without VBIOS image"); 2606 2607 if (adev->bios) { 2608 r = amdgpu_atombios_init(adev); 2609 if (r) { 2610 dev_err(adev->dev, 2611 "amdgpu_atombios_init failed\n"); 2612 amdgpu_vf_error_put( 2613 adev, 2614 AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 2615 0, 0); 2616 return r; 2617 } 2618 } 2619 } 2620 2621 /*get pf2vf msg info at it's earliest time*/ 2622 if (amdgpu_sriov_vf(adev)) 2623 amdgpu_virt_init_data_exchange(adev); 2624 2625 } 2626 } 2627 if (!total) 2628 return -ENODEV; 2629 2630 if (adev->gmc.xgmi.supported) 2631 amdgpu_xgmi_early_init(adev); 2632 2633 if (amdgpu_is_multi_aid(adev)) 2634 amdgpu_uid_init(adev); 2635 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 2636 if (ip_block->status.valid != false) 2637 amdgpu_amdkfd_device_probe(adev); 2638 2639 adev->cg_flags &= amdgpu_cg_mask; 2640 adev->pg_flags &= amdgpu_pg_mask; 2641 2642 return 0; 2643 } 2644 2645 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2646 { 2647 int i, r; 2648 2649 for (i = 0; i < adev->num_ip_blocks; i++) { 2650 if (!adev->ip_blocks[i].status.sw) 2651 continue; 2652 if (adev->ip_blocks[i].status.hw) 2653 continue; 2654 if (!amdgpu_ip_member_of_hwini( 2655 adev, adev->ip_blocks[i].version->type)) 2656 continue; 2657 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2658 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2659 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2660 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2661 if (r) { 2662 dev_err(adev->dev, 2663 
"hw_init of IP block <%s> failed %d\n", 2664 adev->ip_blocks[i].version->funcs->name, 2665 r); 2666 return r; 2667 } 2668 adev->ip_blocks[i].status.hw = true; 2669 } 2670 } 2671 2672 return 0; 2673 } 2674 2675 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2676 { 2677 int i, r; 2678 2679 for (i = 0; i < adev->num_ip_blocks; i++) { 2680 if (!adev->ip_blocks[i].status.sw) 2681 continue; 2682 if (adev->ip_blocks[i].status.hw) 2683 continue; 2684 if (!amdgpu_ip_member_of_hwini( 2685 adev, adev->ip_blocks[i].version->type)) 2686 continue; 2687 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2688 if (r) { 2689 dev_err(adev->dev, 2690 "hw_init of IP block <%s> failed %d\n", 2691 adev->ip_blocks[i].version->funcs->name, r); 2692 return r; 2693 } 2694 adev->ip_blocks[i].status.hw = true; 2695 } 2696 2697 return 0; 2698 } 2699 2700 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2701 { 2702 int r = 0; 2703 int i; 2704 uint32_t smu_version; 2705 2706 if (adev->asic_type >= CHIP_VEGA10) { 2707 for (i = 0; i < adev->num_ip_blocks; i++) { 2708 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2709 continue; 2710 2711 if (!amdgpu_ip_member_of_hwini(adev, 2712 AMD_IP_BLOCK_TYPE_PSP)) 2713 break; 2714 2715 if (!adev->ip_blocks[i].status.sw) 2716 continue; 2717 2718 /* no need to do the fw loading again if already done*/ 2719 if (adev->ip_blocks[i].status.hw == true) 2720 break; 2721 2722 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2723 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 2724 if (r) 2725 return r; 2726 } else { 2727 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2728 if (r) { 2729 dev_err(adev->dev, 2730 "hw_init of IP block <%s> failed %d\n", 2731 adev->ip_blocks[i] 2732 .version->funcs->name, 2733 r); 2734 return r; 2735 } 2736 adev->ip_blocks[i].status.hw = true; 2737 } 2738 break; 2739 } 2740 } 2741 2742 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 
2743 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2744 2745 return r; 2746 } 2747 2748 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2749 { 2750 struct drm_sched_init_args args = { 2751 .ops = &amdgpu_sched_ops, 2752 .num_rqs = DRM_SCHED_PRIORITY_COUNT, 2753 .timeout_wq = adev->reset_domain->wq, 2754 .dev = adev->dev, 2755 }; 2756 long timeout; 2757 int r, i; 2758 2759 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2760 struct amdgpu_ring *ring = adev->rings[i]; 2761 2762 /* No need to setup the GPU scheduler for rings that don't need it */ 2763 if (!ring || ring->no_scheduler) 2764 continue; 2765 2766 switch (ring->funcs->type) { 2767 case AMDGPU_RING_TYPE_GFX: 2768 timeout = adev->gfx_timeout; 2769 break; 2770 case AMDGPU_RING_TYPE_COMPUTE: 2771 timeout = adev->compute_timeout; 2772 break; 2773 case AMDGPU_RING_TYPE_SDMA: 2774 timeout = adev->sdma_timeout; 2775 break; 2776 default: 2777 timeout = adev->video_timeout; 2778 break; 2779 } 2780 2781 args.timeout = timeout; 2782 args.credit_limit = ring->num_hw_submission; 2783 args.score = ring->sched_score; 2784 args.name = ring->name; 2785 2786 r = drm_sched_init(&ring->sched, &args); 2787 if (r) { 2788 dev_err(adev->dev, 2789 "Failed to create scheduler on ring %s.\n", 2790 ring->name); 2791 return r; 2792 } 2793 r = amdgpu_uvd_entity_init(adev, ring); 2794 if (r) { 2795 dev_err(adev->dev, 2796 "Failed to create UVD scheduling entity on ring %s.\n", 2797 ring->name); 2798 return r; 2799 } 2800 r = amdgpu_vce_entity_init(adev, ring); 2801 if (r) { 2802 dev_err(adev->dev, 2803 "Failed to create VCE scheduling entity on ring %s.\n", 2804 ring->name); 2805 return r; 2806 } 2807 } 2808 2809 if (adev->xcp_mgr) 2810 amdgpu_xcp_update_partition_sched_list(adev); 2811 2812 return 0; 2813 } 2814 2815 2816 /** 2817 * amdgpu_device_ip_init - run init for hardware IPs 2818 * 2819 * @adev: amdgpu_device pointer 2820 * 2821 * Main initialization pass for hardware IPs. 
 * The list of all the hardware
 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
 * are run.  sw_init initializes the software state associated with each IP
 * and hw_init initializes the hardware associated with each IP.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_init(struct amdgpu_device *adev)
{
	bool init_badpage;
	int i, r;

	r = amdgpu_ras_init(adev);
	if (r)
		return r;

	/*
	 * sw_init pass over all valid blocks.  COMMON and GMC additionally get
	 * their hw_init right away because later blocks depend on them.
	 */
	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->sw_init) {
			r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]);
			if (r) {
				dev_err(adev->dev,
					"sw_init of IP block <%s> failed %d\n",
					adev->ip_blocks[i].version->funcs->name,
					r);
				goto init_failed;
			}
		}
		adev->ip_blocks[i].status.sw = true;

		if (!amdgpu_ip_member_of_hwini(
			    adev, adev->ip_blocks[i].version->type))
			continue;

		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
			/* need to do common hw init early so everything is set up for gmc */
			r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
			if (r) {
				dev_err(adev->dev, "hw_init %d failed %d\n", i,
					r);
				goto init_failed;
			}
			adev->ip_blocks[i].status.hw = true;
		} else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
			/* need to do gmc hw init early so we can allocate gpu mem */
			/* Try to reserve bad pages early */
			if (amdgpu_sriov_vf(adev))
				amdgpu_virt_exchange_data(adev);

			r = amdgpu_device_mem_scratch_init(adev);
			if (r) {
				dev_err(adev->dev,
					"amdgpu_mem_scratch_init failed %d\n",
					r);
				goto init_failed;
			}
			r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
			if (r) {
				dev_err(adev->dev, "hw_init %d failed %d\n", i,
					r);
				goto init_failed;
			}
			r = amdgpu_device_wb_init(adev);
			if (r) {
				dev_err(adev->dev,
					"amdgpu_device_wb_init failed %d\n", r);
				goto init_failed;
			}
			/* The GMC block is only marked up once the writeback setup succeeded too. */
			adev->ip_blocks[i].status.hw = true;

			/* right after GMC hw init, we create CSA */
			if (adev->gfx.mcbp) {
				r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
							       AMDGPU_GEM_DOMAIN_VRAM |
							       AMDGPU_GEM_DOMAIN_GTT,
							       AMDGPU_CSA_SIZE);
				if (r) {
					dev_err(adev->dev,
						"allocate CSA failed %d\n", r);
					goto init_failed;
				}
			}

			r = amdgpu_seq64_init(adev);
			if (r) {
				dev_err(adev->dev, "allocate seq64 failed %d\n",
					r);
				goto init_failed;
			}
		}
	}

	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_init_data_exchange(adev);

	r = amdgpu_ib_pool_init(adev);
	if (r) {
		dev_err(adev->dev, "IB initialization failed (%d).\n", r);
		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
		goto init_failed;
	}

	r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
	if (r)
		goto init_failed;

	/* Remaining hardware bring-up: phase 1, firmware loading, then phase 2. */
	r = amdgpu_device_ip_hw_init_phase1(adev);
	if (r)
		goto init_failed;

	r = amdgpu_device_fw_loading(adev);
	if (r)
		goto init_failed;

	r = amdgpu_device_ip_hw_init_phase2(adev);
	if (r)
		goto init_failed;

	/*
	 * retired pages will be loaded from eeprom and reserved here,
	 * it should be called after amdgpu_device_ip_hw_init_phase2  since
	 * for some ASICs the RAS EEPROM code relies on SMU fully functioning
	 * for I2C communication which only true at this point.
	 *
	 * amdgpu_ras_recovery_init may fail, but the upper only cares the
	 * failure from bad gpu situation and stop amdgpu init process
	 * accordingly. For other failed cases, it will still release all
	 * the resource and print error message, rather than returning one
	 * negative value to upper level.
	 *
	 * Note: theoretically, this should be called before all vram allocations
	 * to protect retired page from abusing
	 */
	init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI);
	r = amdgpu_ras_recovery_init(adev, init_badpage);
	if (r)
		goto init_failed;

	/*
	 * In case of XGMI grab extra reference for reset domain for this device
	 */
	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		if (amdgpu_xgmi_add_device(adev) == 0) {
			if (!amdgpu_sriov_vf(adev)) {
				struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);

				if (WARN_ON(!hive)) {
					r = -ENOENT;
					goto init_failed;
				}

				if (!hive->reset_domain ||
				    !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
					r = -ENOENT;
					amdgpu_put_xgmi_hive(hive);
					goto init_failed;
				}

				/* Drop the early temporary reset domain we created for device */
				amdgpu_reset_put_reset_domain(adev->reset_domain);
				adev->reset_domain = hive->reset_domain;
				amdgpu_put_xgmi_hive(hive);
			}
		}
	}

	r = amdgpu_device_init_schedulers(adev);
	if (r)
		goto init_failed;

	if (adev->mman.buffer_funcs_ring &&
	    adev->mman.buffer_funcs_ring->sched.ready)
		amdgpu_ttm_set_buffer_funcs_status(adev, true);

	/* Don't init kfd if whole hive need to be reset during init */
	if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
		amdgpu_amdkfd_device_init(adev);
	}

	amdgpu_fru_get_product_info(adev);

	/* The result of the CPER setup is the return value of the whole function. */
	r = amdgpu_cper_init(adev);

init_failed:

	return r;
}

/**
 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
 *
 * @adev: amdgpu_device pointer
 *
 * Writes a reset magic value to the gart pointer in
VRAM. The driver calls 3015 * this function before a GPU reset. If the value is retained after a 3016 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 3017 */ 3018 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 3019 { 3020 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 3021 } 3022 3023 /** 3024 * amdgpu_device_check_vram_lost - check if vram is valid 3025 * 3026 * @adev: amdgpu_device pointer 3027 * 3028 * Checks the reset magic value written to the gart pointer in VRAM. 3029 * The driver calls this after a GPU reset to see if the contents of 3030 * VRAM is lost or now. 3031 * returns true if vram is lost, false if not. 3032 */ 3033 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 3034 { 3035 if (memcmp(adev->gart.ptr, adev->reset_magic, 3036 AMDGPU_RESET_MAGIC_NUM)) 3037 return true; 3038 3039 if (!amdgpu_in_reset(adev)) 3040 return false; 3041 3042 /* 3043 * For all ASICs with baco/mode1 reset, the VRAM is 3044 * always assumed to be lost. 3045 */ 3046 switch (amdgpu_asic_reset_method(adev)) { 3047 case AMD_RESET_METHOD_LEGACY: 3048 case AMD_RESET_METHOD_LINK: 3049 case AMD_RESET_METHOD_BACO: 3050 case AMD_RESET_METHOD_MODE1: 3051 return true; 3052 default: 3053 return false; 3054 } 3055 } 3056 3057 /** 3058 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 3059 * 3060 * @adev: amdgpu_device pointer 3061 * @state: clockgating state (gate or ungate) 3062 * 3063 * The list of all the hardware IPs that make up the asic is walked and the 3064 * set_clockgating_state callbacks are run. 3065 * Late initialization pass enabling clockgating for hardware IPs. 3066 * Fini or suspend, pass disabling clockgating for hardware IPs. 3067 * Returns 0 on success, negative error code on failure. 
3068 */ 3069 3070 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 3071 enum amd_clockgating_state state) 3072 { 3073 int i, j, r; 3074 3075 if (amdgpu_emu_mode == 1) 3076 return 0; 3077 3078 for (j = 0; j < adev->num_ip_blocks; j++) { 3079 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3080 if (!adev->ip_blocks[i].status.late_initialized) 3081 continue; 3082 /* skip CG for GFX, SDMA on S0ix */ 3083 if (adev->in_s0ix && 3084 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3085 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3086 continue; 3087 /* skip CG for VCE/UVD, it's handled specially */ 3088 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3089 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3090 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3091 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3092 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 3093 /* enable clockgating to save power */ 3094 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i], 3095 state); 3096 if (r) { 3097 dev_err(adev->dev, 3098 "set_clockgating_state(gate) of IP block <%s> failed %d\n", 3099 adev->ip_blocks[i].version->funcs->name, 3100 r); 3101 return r; 3102 } 3103 } 3104 } 3105 3106 return 0; 3107 } 3108 3109 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 3110 enum amd_powergating_state state) 3111 { 3112 int i, j, r; 3113 3114 if (amdgpu_emu_mode == 1) 3115 return 0; 3116 3117 for (j = 0; j < adev->num_ip_blocks; j++) { 3118 i = state == AMD_PG_STATE_GATE ? 
j : adev->num_ip_blocks - j - 1; 3119 if (!adev->ip_blocks[i].status.late_initialized) 3120 continue; 3121 /* skip PG for GFX, SDMA on S0ix */ 3122 if (adev->in_s0ix && 3123 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3124 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3125 continue; 3126 /* skip CG for VCE/UVD, it's handled specially */ 3127 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3128 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3129 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3130 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3131 adev->ip_blocks[i].version->funcs->set_powergating_state) { 3132 /* enable powergating to save power */ 3133 r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i], 3134 state); 3135 if (r) { 3136 dev_err(adev->dev, 3137 "set_powergating_state(gate) of IP block <%s> failed %d\n", 3138 adev->ip_blocks[i].version->funcs->name, 3139 r); 3140 return r; 3141 } 3142 } 3143 } 3144 return 0; 3145 } 3146 3147 static int amdgpu_device_enable_mgpu_fan_boost(void) 3148 { 3149 struct amdgpu_gpu_instance *gpu_ins; 3150 struct amdgpu_device *adev; 3151 int i, ret = 0; 3152 3153 mutex_lock(&mgpu_info.mutex); 3154 3155 /* 3156 * MGPU fan boost feature should be enabled 3157 * only when there are two or more dGPUs in 3158 * the system 3159 */ 3160 if (mgpu_info.num_dgpu < 2) 3161 goto out; 3162 3163 for (i = 0; i < mgpu_info.num_dgpu; i++) { 3164 gpu_ins = &(mgpu_info.gpu_ins[i]); 3165 adev = gpu_ins->adev; 3166 if (!(adev->flags & AMD_IS_APU || amdgpu_sriov_multi_vf_mode(adev)) && 3167 !gpu_ins->mgpu_fan_enabled) { 3168 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 3169 if (ret) 3170 break; 3171 3172 gpu_ins->mgpu_fan_enabled = 1; 3173 } 3174 } 3175 3176 out: 3177 mutex_unlock(&mgpu_info.mutex); 3178 3179 return ret; 3180 } 3181 3182 /** 3183 * amdgpu_device_ip_late_init - run late init for hardware IPs 3184 * 3185 * 
@adev: amdgpu_device pointer 3186 * 3187 * Late initialization pass for hardware IPs. The list of all the hardware 3188 * IPs that make up the asic is walked and the late_init callbacks are run. 3189 * late_init covers any special initialization that an IP requires 3190 * after all of the have been initialized or something that needs to happen 3191 * late in the init process. 3192 * Returns 0 on success, negative error code on failure. 3193 */ 3194 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 3195 { 3196 struct amdgpu_gpu_instance *gpu_instance; 3197 int i = 0, r; 3198 3199 for (i = 0; i < adev->num_ip_blocks; i++) { 3200 if (!adev->ip_blocks[i].status.hw) 3201 continue; 3202 if (adev->ip_blocks[i].version->funcs->late_init) { 3203 r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]); 3204 if (r) { 3205 dev_err(adev->dev, 3206 "late_init of IP block <%s> failed %d\n", 3207 adev->ip_blocks[i].version->funcs->name, 3208 r); 3209 return r; 3210 } 3211 } 3212 adev->ip_blocks[i].status.late_initialized = true; 3213 } 3214 3215 r = amdgpu_ras_late_init(adev); 3216 if (r) { 3217 dev_err(adev->dev, "amdgpu_ras_late_init failed %d", r); 3218 return r; 3219 } 3220 3221 if (!amdgpu_reset_in_recovery(adev)) 3222 amdgpu_ras_set_error_query_ready(adev, true); 3223 3224 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3225 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3226 3227 amdgpu_device_fill_reset_magic(adev); 3228 3229 r = amdgpu_device_enable_mgpu_fan_boost(); 3230 if (r) 3231 dev_err(adev->dev, "enable mgpu fan boost failed (%d).\n", r); 3232 3233 /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */ 3234 if (amdgpu_passthrough(adev) && 3235 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 3236 adev->asic_type == CHIP_ALDEBARAN)) 3237 amdgpu_dpm_handle_passthrough_sbr(adev, true); 3238 3239 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3240 
mutex_lock(&mgpu_info.mutex); 3241 3242 /* 3243 * Reset device p-state to low as this was booted with high. 3244 * 3245 * This should be performed only after all devices from the same 3246 * hive get initialized. 3247 * 3248 * However, it's unknown how many device in the hive in advance. 3249 * As this is counted one by one during devices initializations. 3250 * 3251 * So, we wait for all XGMI interlinked devices initialized. 3252 * This may bring some delays as those devices may come from 3253 * different hives. But that should be OK. 3254 */ 3255 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3256 for (i = 0; i < mgpu_info.num_gpu; i++) { 3257 gpu_instance = &(mgpu_info.gpu_ins[i]); 3258 if (gpu_instance->adev->flags & AMD_IS_APU) 3259 continue; 3260 3261 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3262 AMDGPU_XGMI_PSTATE_MIN); 3263 if (r) { 3264 dev_err(adev->dev, 3265 "pstate setting failed (%d).\n", 3266 r); 3267 break; 3268 } 3269 } 3270 } 3271 3272 mutex_unlock(&mgpu_info.mutex); 3273 } 3274 3275 return 0; 3276 } 3277 3278 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block) 3279 { 3280 struct amdgpu_device *adev = ip_block->adev; 3281 int r; 3282 3283 if (!ip_block->version->funcs->hw_fini) { 3284 dev_err(adev->dev, "hw_fini of IP block <%s> not defined\n", 3285 ip_block->version->funcs->name); 3286 } else { 3287 r = ip_block->version->funcs->hw_fini(ip_block); 3288 /* XXX handle errors */ 3289 if (r) { 3290 dev_dbg(adev->dev, 3291 "hw_fini of IP block <%s> failed %d\n", 3292 ip_block->version->funcs->name, r); 3293 } 3294 } 3295 3296 ip_block->status.hw = false; 3297 } 3298 3299 /** 3300 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3301 * 3302 * @adev: amdgpu_device pointer 3303 * 3304 * For ASICs need to disable SMC first 3305 */ 3306 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3307 { 3308 int i; 3309 3310 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3311 return; 3312 
3313 for (i = 0; i < adev->num_ip_blocks; i++) { 3314 if (!adev->ip_blocks[i].status.hw) 3315 continue; 3316 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3317 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3318 break; 3319 } 3320 } 3321 } 3322 3323 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3324 { 3325 int i, r; 3326 3327 for (i = 0; i < adev->num_ip_blocks; i++) { 3328 if (!adev->ip_blocks[i].version->funcs->early_fini) 3329 continue; 3330 3331 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]); 3332 if (r) { 3333 dev_dbg(adev->dev, 3334 "early_fini of IP block <%s> failed %d\n", 3335 adev->ip_blocks[i].version->funcs->name, r); 3336 } 3337 } 3338 3339 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3340 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3341 3342 amdgpu_amdkfd_suspend(adev, true); 3343 amdgpu_userq_suspend(adev); 3344 3345 /* Workaround for ASICs need to disable SMC first */ 3346 amdgpu_device_smu_fini_early(adev); 3347 3348 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3349 if (!adev->ip_blocks[i].status.hw) 3350 continue; 3351 3352 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3353 } 3354 3355 if (amdgpu_sriov_vf(adev)) { 3356 if (amdgpu_virt_release_full_gpu(adev, false)) 3357 dev_err(adev->dev, 3358 "failed to release exclusive mode on fini\n"); 3359 } 3360 3361 /* 3362 * Driver reload on the APU can fail due to firmware validation because 3363 * the PSP is always running, as it is shared across the whole SoC. 3364 * This same issue does not occur on dGPU because it has a mechanism 3365 * that checks whether the PSP is running. A solution for those issues 3366 * in the APU is to trigger a GPU reset, but this should be done during 3367 * the unload phase to avoid adding boot latency and screen flicker. 
3368 */ 3369 if ((adev->flags & AMD_IS_APU) && !adev->gmc.is_app_apu) { 3370 r = amdgpu_asic_reset(adev); 3371 if (r) 3372 dev_err(adev->dev, "asic reset on %s failed\n", __func__); 3373 } 3374 3375 return 0; 3376 } 3377 3378 /** 3379 * amdgpu_device_ip_fini - run fini for hardware IPs 3380 * 3381 * @adev: amdgpu_device pointer 3382 * 3383 * Main teardown pass for hardware IPs. The list of all the hardware 3384 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3385 * are run. hw_fini tears down the hardware associated with each IP 3386 * and sw_fini tears down any software state associated with each IP. 3387 * Returns 0 on success, negative error code on failure. 3388 */ 3389 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3390 { 3391 int i, r; 3392 3393 amdgpu_cper_fini(adev); 3394 3395 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3396 amdgpu_virt_release_ras_err_handler_data(adev); 3397 3398 if (adev->gmc.xgmi.num_physical_nodes > 1) 3399 amdgpu_xgmi_remove_device(adev); 3400 3401 amdgpu_amdkfd_device_fini_sw(adev); 3402 3403 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3404 if (!adev->ip_blocks[i].status.sw) 3405 continue; 3406 3407 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3408 amdgpu_ucode_free_bo(adev); 3409 amdgpu_free_static_csa(&adev->virt.csa_obj); 3410 amdgpu_device_wb_fini(adev); 3411 amdgpu_device_mem_scratch_fini(adev); 3412 amdgpu_ib_pool_fini(adev); 3413 amdgpu_seq64_fini(adev); 3414 amdgpu_doorbell_fini(adev); 3415 } 3416 if (adev->ip_blocks[i].version->funcs->sw_fini) { 3417 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]); 3418 /* XXX handle errors */ 3419 if (r) { 3420 dev_dbg(adev->dev, 3421 "sw_fini of IP block <%s> failed %d\n", 3422 adev->ip_blocks[i].version->funcs->name, 3423 r); 3424 } 3425 } 3426 adev->ip_blocks[i].status.sw = false; 3427 adev->ip_blocks[i].status.valid = false; 3428 } 3429 3430 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 
3431 if (!adev->ip_blocks[i].status.late_initialized) 3432 continue; 3433 if (adev->ip_blocks[i].version->funcs->late_fini) 3434 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]); 3435 adev->ip_blocks[i].status.late_initialized = false; 3436 } 3437 3438 amdgpu_ras_fini(adev); 3439 amdgpu_uid_fini(adev); 3440 3441 return 0; 3442 } 3443 3444 /** 3445 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3446 * 3447 * @work: work_struct. 3448 */ 3449 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3450 { 3451 struct amdgpu_device *adev = 3452 container_of(work, struct amdgpu_device, delayed_init_work.work); 3453 int r; 3454 3455 r = amdgpu_ib_ring_tests(adev); 3456 if (r) 3457 dev_err(adev->dev, "ib ring test failed (%d).\n", r); 3458 } 3459 3460 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3461 { 3462 struct amdgpu_device *adev = 3463 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3464 3465 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3466 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3467 3468 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true, 0)) 3469 adev->gfx.gfx_off_state = true; 3470 } 3471 3472 /** 3473 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3474 * 3475 * @adev: amdgpu_device pointer 3476 * 3477 * Main suspend function for hardware IPs. The list of all the hardware 3478 * IPs that make up the asic is walked, clockgating is disabled and the 3479 * suspend callbacks are run. suspend puts the hardware and software state 3480 * in each IP into a state suitable for suspend. 3481 * Returns 0 on success, negative error code on failure. 
3482 */ 3483 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3484 { 3485 int i, r, rec; 3486 3487 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3488 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3489 3490 /* 3491 * Per PMFW team's suggestion, driver needs to handle gfxoff 3492 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3493 * scenario. Add the missing df cstate disablement here. 3494 */ 3495 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3496 dev_warn(adev->dev, "Failed to disallow df cstate"); 3497 3498 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3499 if (!adev->ip_blocks[i].status.valid) 3500 continue; 3501 3502 /* displays are handled separately */ 3503 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3504 continue; 3505 3506 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3507 if (r) 3508 goto unwind; 3509 } 3510 3511 return 0; 3512 unwind: 3513 rec = amdgpu_device_ip_resume_phase3(adev); 3514 if (rec) 3515 dev_err(adev->dev, 3516 "amdgpu_device_ip_resume_phase3 failed during unwind: %d\n", 3517 rec); 3518 3519 amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW); 3520 3521 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3522 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3523 3524 return r; 3525 } 3526 3527 /** 3528 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3529 * 3530 * @adev: amdgpu_device pointer 3531 * 3532 * Main suspend function for hardware IPs. The list of all the hardware 3533 * IPs that make up the asic is walked, clockgating is disabled and the 3534 * suspend callbacks are run. suspend puts the hardware and software state 3535 * in each IP into a state suitable for suspend. 3536 * Returns 0 on success, negative error code on failure. 
3537 */ 3538 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3539 { 3540 int i, r, rec; 3541 3542 if (adev->in_s0ix) 3543 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3544 3545 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3546 if (!adev->ip_blocks[i].status.valid) 3547 continue; 3548 /* displays are handled in phase1 */ 3549 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3550 continue; 3551 /* PSP lost connection when err_event_athub occurs */ 3552 if (amdgpu_ras_intr_triggered() && 3553 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3554 adev->ip_blocks[i].status.hw = false; 3555 continue; 3556 } 3557 3558 /* skip unnecessary suspend if we do not initialize them yet */ 3559 if (!amdgpu_ip_member_of_hwini( 3560 adev, adev->ip_blocks[i].version->type)) 3561 continue; 3562 3563 /* Since we skip suspend for S0i3, we need to cancel the delayed 3564 * idle work here as the suspend callback never gets called. 3565 */ 3566 if (adev->in_s0ix && 3567 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX && 3568 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 0, 0)) 3569 cancel_delayed_work_sync(&adev->gfx.idle_work); 3570 /* skip suspend of gfx/mes and psp for S0ix 3571 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3572 * like at runtime. PSP is also part of the always on hardware 3573 * so no need to suspend it. 
3574 */ 3575 if (adev->in_s0ix && 3576 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3577 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3578 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3579 continue; 3580 3581 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3582 if (adev->in_s0ix && 3583 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3584 IP_VERSION(5, 0, 0)) && 3585 (adev->ip_blocks[i].version->type == 3586 AMD_IP_BLOCK_TYPE_SDMA)) 3587 continue; 3588 3589 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3590 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3591 * from this location and RLC Autoload automatically also gets loaded 3592 * from here based on PMFW -> PSP message during re-init sequence. 3593 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3594 * the TMR and reload FWs again for IMU enabled APU ASICs. 3595 */ 3596 if (amdgpu_in_reset(adev) && 3597 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3598 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3599 continue; 3600 3601 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3602 if (r) 3603 goto unwind; 3604 3605 /* handle putting the SMC in the appropriate state */ 3606 if (!amdgpu_sriov_vf(adev)) { 3607 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3608 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3609 if (r) { 3610 dev_err(adev->dev, 3611 "SMC failed to set mp1 state %d, %d\n", 3612 adev->mp1_state, r); 3613 goto unwind; 3614 } 3615 } 3616 } 3617 } 3618 3619 return 0; 3620 unwind: 3621 /* suspend phase 2 = resume phase 1 + resume phase 2 */ 3622 rec = amdgpu_device_ip_resume_phase1(adev); 3623 if (rec) { 3624 dev_err(adev->dev, 3625 "amdgpu_device_ip_resume_phase1 failed during unwind: %d\n", 3626 rec); 3627 return r; 3628 } 3629 3630 rec = amdgpu_device_fw_loading(adev); 3631 if (rec) { 3632 dev_err(adev->dev, 3633 
"amdgpu_device_fw_loading failed during unwind: %d\n", 3634 rec); 3635 return r; 3636 } 3637 3638 rec = amdgpu_device_ip_resume_phase2(adev); 3639 if (rec) { 3640 dev_err(adev->dev, 3641 "amdgpu_device_ip_resume_phase2 failed during unwind: %d\n", 3642 rec); 3643 return r; 3644 } 3645 3646 return r; 3647 } 3648 3649 /** 3650 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3651 * 3652 * @adev: amdgpu_device pointer 3653 * 3654 * Main suspend function for hardware IPs. The list of all the hardware 3655 * IPs that make up the asic is walked, clockgating is disabled and the 3656 * suspend callbacks are run. suspend puts the hardware and software state 3657 * in each IP into a state suitable for suspend. 3658 * Returns 0 on success, negative error code on failure. 3659 */ 3660 static int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3661 { 3662 int r; 3663 3664 if (amdgpu_sriov_vf(adev)) { 3665 amdgpu_virt_fini_data_exchange(adev); 3666 amdgpu_virt_request_full_gpu(adev, false); 3667 } 3668 3669 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3670 3671 r = amdgpu_device_ip_suspend_phase1(adev); 3672 if (r) 3673 return r; 3674 r = amdgpu_device_ip_suspend_phase2(adev); 3675 3676 if (amdgpu_sriov_vf(adev)) 3677 amdgpu_virt_release_full_gpu(adev, false); 3678 3679 return r; 3680 } 3681 3682 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3683 { 3684 int i, r; 3685 3686 static enum amd_ip_block_type ip_order[] = { 3687 AMD_IP_BLOCK_TYPE_COMMON, 3688 AMD_IP_BLOCK_TYPE_GMC, 3689 AMD_IP_BLOCK_TYPE_PSP, 3690 AMD_IP_BLOCK_TYPE_IH, 3691 }; 3692 3693 for (i = 0; i < adev->num_ip_blocks; i++) { 3694 int j; 3695 struct amdgpu_ip_block *block; 3696 3697 block = &adev->ip_blocks[i]; 3698 block->status.hw = false; 3699 3700 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3701 3702 if (block->version->type != ip_order[j] || 3703 !block->status.valid) 3704 continue; 3705 3706 r = block->version->funcs->hw_init(&adev->ip_blocks[i]); 3707 if (r) { 
3708 dev_err(adev->dev, "RE-INIT-early: %s failed\n", 3709 block->version->funcs->name); 3710 return r; 3711 } 3712 block->status.hw = true; 3713 } 3714 } 3715 3716 return 0; 3717 } 3718 3719 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3720 { 3721 struct amdgpu_ip_block *block; 3722 int i, r = 0; 3723 3724 static enum amd_ip_block_type ip_order[] = { 3725 AMD_IP_BLOCK_TYPE_SMC, 3726 AMD_IP_BLOCK_TYPE_DCE, 3727 AMD_IP_BLOCK_TYPE_GFX, 3728 AMD_IP_BLOCK_TYPE_SDMA, 3729 AMD_IP_BLOCK_TYPE_MES, 3730 AMD_IP_BLOCK_TYPE_UVD, 3731 AMD_IP_BLOCK_TYPE_VCE, 3732 AMD_IP_BLOCK_TYPE_VCN, 3733 AMD_IP_BLOCK_TYPE_JPEG 3734 }; 3735 3736 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3737 block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]); 3738 3739 if (!block) 3740 continue; 3741 3742 if (block->status.valid && !block->status.hw) { 3743 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) { 3744 r = amdgpu_ip_block_resume(block); 3745 } else { 3746 r = block->version->funcs->hw_init(block); 3747 } 3748 3749 if (r) { 3750 dev_err(adev->dev, "RE-INIT-late: %s failed\n", 3751 block->version->funcs->name); 3752 break; 3753 } 3754 block->status.hw = true; 3755 } 3756 } 3757 3758 return r; 3759 } 3760 3761 /** 3762 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3763 * 3764 * @adev: amdgpu_device pointer 3765 * 3766 * First resume function for hardware IPs. The list of all the hardware 3767 * IPs that make up the asic is walked and the resume callbacks are run for 3768 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3769 * after a suspend and updates the software state as necessary. This 3770 * function is also used for restoring the GPU after a GPU reset. 3771 * Returns 0 on success, negative error code on failure. 
3772 */ 3773 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3774 { 3775 int i, r; 3776 3777 for (i = 0; i < adev->num_ip_blocks; i++) { 3778 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3779 continue; 3780 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3781 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3782 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3783 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3784 3785 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3786 if (r) 3787 return r; 3788 } 3789 } 3790 3791 return 0; 3792 } 3793 3794 /** 3795 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3796 * 3797 * @adev: amdgpu_device pointer 3798 * 3799 * Second resume function for hardware IPs. The list of all the hardware 3800 * IPs that make up the asic is walked and the resume callbacks are run for 3801 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3802 * functional state after a suspend and updates the software state as 3803 * necessary. This function is also used for restoring the GPU after a GPU 3804 * reset. 3805 * Returns 0 on success, negative error code on failure. 
3806 */ 3807 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3808 { 3809 int i, r; 3810 3811 for (i = 0; i < adev->num_ip_blocks; i++) { 3812 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3813 continue; 3814 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3815 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3816 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3817 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE || 3818 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3819 continue; 3820 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3821 if (r) 3822 return r; 3823 } 3824 3825 return 0; 3826 } 3827 3828 /** 3829 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs 3830 * 3831 * @adev: amdgpu_device pointer 3832 * 3833 * Third resume function for hardware IPs. The list of all the hardware 3834 * IPs that make up the asic is walked and the resume callbacks are run for 3835 * all DCE. resume puts the hardware into a functional state after a suspend 3836 * and updates the software state as necessary. This function is also used 3837 * for restoring the GPU after a GPU reset. 3838 * 3839 * Returns 0 on success, negative error code on failure. 3840 */ 3841 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev) 3842 { 3843 int i, r; 3844 3845 for (i = 0; i < adev->num_ip_blocks; i++) { 3846 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3847 continue; 3848 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) { 3849 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3850 if (r) 3851 return r; 3852 } 3853 } 3854 3855 return 0; 3856 } 3857 3858 /** 3859 * amdgpu_device_ip_resume - run resume for hardware IPs 3860 * 3861 * @adev: amdgpu_device pointer 3862 * 3863 * Main resume function for hardware IPs. 
The hardware IPs 3864 * are split into two resume functions because they are 3865 * also used in recovering from a GPU reset and some additional 3866 * steps need to be take between them. In this case (S3/S4) they are 3867 * run sequentially. 3868 * Returns 0 on success, negative error code on failure. 3869 */ 3870 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3871 { 3872 int r; 3873 3874 r = amdgpu_device_ip_resume_phase1(adev); 3875 if (r) 3876 return r; 3877 3878 r = amdgpu_device_fw_loading(adev); 3879 if (r) 3880 return r; 3881 3882 r = amdgpu_device_ip_resume_phase2(adev); 3883 3884 if (adev->mman.buffer_funcs_ring->sched.ready) 3885 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3886 3887 if (r) 3888 return r; 3889 3890 amdgpu_fence_driver_hw_init(adev); 3891 3892 r = amdgpu_device_ip_resume_phase3(adev); 3893 3894 return r; 3895 } 3896 3897 /** 3898 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 3899 * 3900 * @adev: amdgpu_device pointer 3901 * 3902 * Query the VBIOS data tables to determine if the board supports SR-IOV. 3903 */ 3904 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 3905 { 3906 if (amdgpu_sriov_vf(adev)) { 3907 if (adev->is_atom_fw) { 3908 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 3909 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3910 } else { 3911 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 3912 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 3913 } 3914 3915 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 3916 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 3917 } 3918 } 3919 3920 /** 3921 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 3922 * 3923 * @pdev : pci device context 3924 * @asic_type: AMD asic type 3925 * 3926 * Check if there is DC (new modesetting infrastructre) support for an asic. 3927 * returns true if DC has support, false if not. 
3928 */ 3929 bool amdgpu_device_asic_has_dc_support(struct pci_dev *pdev, 3930 enum amd_asic_type asic_type) 3931 { 3932 switch (asic_type) { 3933 #ifdef CONFIG_DRM_AMDGPU_SI 3934 case CHIP_HAINAN: 3935 #endif 3936 case CHIP_TOPAZ: 3937 /* chips with no display hardware */ 3938 return false; 3939 #if defined(CONFIG_DRM_AMD_DC) 3940 case CHIP_TAHITI: 3941 case CHIP_PITCAIRN: 3942 case CHIP_VERDE: 3943 case CHIP_OLAND: 3944 return amdgpu_dc != 0 && IS_ENABLED(CONFIG_DRM_AMD_DC_SI); 3945 case CHIP_KAVERI: 3946 case CHIP_KABINI: 3947 case CHIP_MULLINS: 3948 /* 3949 * We have systems in the wild with these ASICs that require 3950 * TRAVIS and NUTMEG support which is not supported with DC. 3951 * 3952 * Fallback to the non-DC driver here by default so as not to 3953 * cause regressions. 3954 */ 3955 return amdgpu_dc > 0; 3956 default: 3957 return amdgpu_dc != 0; 3958 #else 3959 default: 3960 if (amdgpu_dc > 0) 3961 dev_info_once( 3962 &pdev->dev, 3963 "Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 3964 return false; 3965 #endif 3966 } 3967 } 3968 3969 /** 3970 * amdgpu_device_has_dc_support - check if dc is supported 3971 * 3972 * @adev: amdgpu_device pointer 3973 * 3974 * Returns true for supported, false for not supported 3975 */ 3976 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 3977 { 3978 if (adev->enable_virtual_display || 3979 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 3980 return false; 3981 3982 return amdgpu_device_asic_has_dc_support(adev->pdev, adev->asic_type); 3983 } 3984 3985 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 3986 { 3987 struct amdgpu_device *adev = 3988 container_of(__work, struct amdgpu_device, xgmi_reset_work); 3989 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3990 3991 /* It's a bug to not have a hive within this function */ 3992 if (WARN_ON(!hive)) 3993 return; 3994 3995 /* 3996 * Use task barrier to synchronize all xgmi reset 
works across the 3997 * hive. task_barrier_enter and task_barrier_exit will block 3998 * until all the threads running the xgmi reset works reach 3999 * those points. task_barrier_full will do both blocks. 4000 */ 4001 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 4002 4003 task_barrier_enter(&hive->tb); 4004 adev->asic_reset_res = amdgpu_device_baco_enter(adev); 4005 4006 if (adev->asic_reset_res) 4007 goto fail; 4008 4009 task_barrier_exit(&hive->tb); 4010 adev->asic_reset_res = amdgpu_device_baco_exit(adev); 4011 4012 if (adev->asic_reset_res) 4013 goto fail; 4014 4015 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 4016 } else { 4017 4018 task_barrier_full(&hive->tb); 4019 adev->asic_reset_res = amdgpu_asic_reset(adev); 4020 } 4021 4022 fail: 4023 if (adev->asic_reset_res) 4024 dev_warn(adev->dev, 4025 "ASIC reset failed with error, %d for drm dev, %s", 4026 adev->asic_reset_res, adev_to_drm(adev)->unique); 4027 amdgpu_put_xgmi_hive(hive); 4028 } 4029 4030 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 4031 { 4032 char *input = amdgpu_lockup_timeout; 4033 char *timeout_setting = NULL; 4034 int index = 0; 4035 long timeout; 4036 int ret = 0; 4037 4038 /* By default timeout for all queues is 2 sec */ 4039 adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout = 4040 adev->video_timeout = msecs_to_jiffies(2000); 4041 4042 if (!strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) 4043 return 0; 4044 4045 while ((timeout_setting = strsep(&input, ",")) && 4046 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4047 ret = kstrtol(timeout_setting, 0, &timeout); 4048 if (ret) 4049 return ret; 4050 4051 if (timeout == 0) { 4052 index++; 4053 continue; 4054 } else if (timeout < 0) { 4055 timeout = MAX_SCHEDULE_TIMEOUT; 4056 dev_warn(adev->dev, "lockup timeout disabled"); 4057 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 4058 } else { 4059 timeout = msecs_to_jiffies(timeout); 4060 } 4061 4062 
switch (index++) { 4063 case 0: 4064 adev->gfx_timeout = timeout; 4065 break; 4066 case 1: 4067 adev->compute_timeout = timeout; 4068 break; 4069 case 2: 4070 adev->sdma_timeout = timeout; 4071 break; 4072 case 3: 4073 adev->video_timeout = timeout; 4074 break; 4075 default: 4076 break; 4077 } 4078 } 4079 4080 /* When only one value specified apply it to all queues. */ 4081 if (index == 1) 4082 adev->gfx_timeout = adev->compute_timeout = adev->sdma_timeout = 4083 adev->video_timeout = timeout; 4084 4085 return ret; 4086 } 4087 4088 /** 4089 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 4090 * 4091 * @adev: amdgpu_device pointer 4092 * 4093 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 4094 */ 4095 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 4096 { 4097 struct iommu_domain *domain; 4098 4099 domain = iommu_get_domain_for_dev(adev->dev); 4100 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 4101 adev->ram_is_direct_mapped = true; 4102 } 4103 4104 #if defined(CONFIG_HSA_AMD_P2P) 4105 /** 4106 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled. 
 *
 * @adev: amdgpu_device pointer
 *
 * Returns true when the device is attached to an IOMMU domain of type
 * IOMMU_DOMAIN_DMA or IOMMU_DOMAIN_DMA_FQ (i.e. the IOMMU is remapping
 * BAR addresses), false otherwise, including when no domain is attached.
 */
static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev)
{
	struct iommu_domain *domain;

	domain = iommu_get_domain_for_dev(adev->dev);
	if (domain && (domain->type == IOMMU_DOMAIN_DMA ||
		domain->type == IOMMU_DOMAIN_DMA_FQ))
		return true;

	return false;
}
#endif

/**
 * amdgpu_device_set_mcbp - resolve the MCBP setting for this device
 *
 * @adev: amdgpu_device pointer
 *
 * amdgpu_mcbp == 1 forces adev->gfx.mcbp on and amdgpu_mcbp == 0 forces it
 * off; any other value leaves the current adev->gfx.mcbp untouched.
 * SR-IOV VFs always end up with MCBP enabled, overriding the above.
 * A message is logged when MCBP ends up enabled.
 */
static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
{
	if (amdgpu_mcbp == 1)
		adev->gfx.mcbp = true;
	else if (amdgpu_mcbp == 0)
		adev->gfx.mcbp = false;

	/* SR-IOV overrides whatever was selected above */
	if (amdgpu_sriov_vf(adev))
		adev->gfx.mcbp = true;

	if (adev->gfx.mcbp)
		dev_info(adev->dev, "MCBP is enabled\n");
}

/**
 * amdgpu_device_sys_interface_init - register the device sysfs interfaces
 *
 * @adev: amdgpu_device pointer
 *
 * Registers the atombios, pm, ucode, device-attribute, board-attribute, FRU,
 * register-state and XCP sysfs interfaces. A failure of any individual step
 * is logged and the remaining steps are still attempted.
 *
 * Returns the result of devm_device_add_group() only: errors from the earlier
 * steps are overwritten, and the results of the FRU, register-state and XCP
 * initializers are not checked.
 */
static int amdgpu_device_sys_interface_init(struct amdgpu_device *adev)
{
	int r;

	r = amdgpu_atombios_sysfs_init(adev);
	if (r)
		drm_err(&adev->ddev,
			"registering atombios sysfs failed (%d).\n", r);

	r = amdgpu_pm_sysfs_init(adev);
	if (r)
		dev_err(adev->dev, "registering pm sysfs failed (%d).\n", r);

	/* remember whether the ucode sysfs exists so that fini can skip it */
	r = amdgpu_ucode_sysfs_init(adev);
	if (r) {
		adev->ucode_sysfs_en = false;
		dev_err(adev->dev, "Creating firmware sysfs failed (%d).\n", r);
	} else
		adev->ucode_sysfs_en = true;

	r = amdgpu_device_attr_sysfs_init(adev);
	if (r)
		dev_err(adev->dev, "Could not create amdgpu device attr\n");

	r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group);
	if (r)
		dev_err(adev->dev,
			"Could not create amdgpu board attributes\n");

	amdgpu_fru_sysfs_init(adev);
	amdgpu_reg_state_sysfs_init(adev);
	amdgpu_xcp_sysfs_init(adev);

	return r;
}

/**
 * amdgpu_device_sys_interface_fini - remove the device sysfs interfaces
 *
 * @adev: amdgpu_device pointer
 *
 * Counterpart of amdgpu_device_sys_interface_init(). The pm and ucode
 * interfaces are only torn down when adev->pm.sysfs_initialized and
 * adev->ucode_sysfs_en, respectively, are set.
 */
static void amdgpu_device_sys_interface_fini(struct amdgpu_device *adev)
{
	if (adev->pm.sysfs_initialized)
		amdgpu_pm_sysfs_fini(adev);
	if (adev->ucode_sysfs_en)
		amdgpu_ucode_sysfs_fini(adev);
	amdgpu_device_attr_sysfs_fini(adev);
	amdgpu_fru_sysfs_fini(adev);

	amdgpu_reg_state_sysfs_fini(adev);
	amdgpu_xcp_sysfs_fini(adev);
}

/**
 * amdgpu_device_init - initialize the driver
 *
 * @adev: amdgpu_device pointer
 * @flags: driver flags
 *
 * Initializes the driver info and hw (all asics).
 * Returns 0 for success or an error on failure.
 * Called at driver startup.
 *
 * The function first resets the software state in @adev (default register
 * accessors, locks, lists, work items), maps the register BAR, creates the
 * reset domain, and then runs the IP early-init, optional ASIC reset/post,
 * fence driver, IP init and IP late-init stages before registering the
 * sysfs, VGA, vga_switcheroo and PM notifier interfaces.
 *
 * Failures from amdgpu_device_ip_init() and amdgpu_device_ip_late_init() go
 * through the release_ras_con label; other failures after
 * amdgpu_reset_init() go through the failed label, which only calls
 * amdgpu_vf_error_trans_all() before returning the error.
 *
 * NOTE(review): the plain "return r"/"return -ENOMEM" paths that follow the
 * ioremap() of adev->rmmio do not undo anything here; presumably the
 * caller's teardown path (amdgpu_device_fini_sw()) covers this - confirm.
 */
int amdgpu_device_init(struct amdgpu_device *adev,
		       uint32_t flags)
{
	struct pci_dev *pdev = adev->pdev;
	int r, i;
	bool px = false;
	u32 max_MBps;
	int tmp;

	adev->shutdown = false;
	adev->flags = flags;

	/* an in-range amdgpu_force_asic_type overrides the type from @flags */
	if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
		adev->asic_type = amdgpu_force_asic_type;
	else
		adev->asic_type = flags & AMD_ASIC_MASK;

	adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
	/* emulation mode gets a 10x longer register timeout */
	if (amdgpu_emu_mode == 1)
		adev->usec_timeout *= 10;
	adev->gmc.gart_size = 512 * 1024 * 1024;
	adev->accel_working = false;
	adev->num_rings = 0;
	RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
	adev->mman.buffer_funcs = NULL;
	adev->mman.buffer_funcs_ring = NULL;
	adev->vm_manager.vm_pte_funcs = NULL;
	adev->vm_manager.vm_pte_num_scheds = 0;
	adev->gmc.gmc_funcs = NULL;
	adev->harvest_ip_mask = 0x0;
	adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
	bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);

	/*
	 * Point every indirect register accessor at the "invalid" handlers
	 * until something installs real ones.
	 */
	adev->smc_rreg = &amdgpu_invalid_rreg;
	adev->smc_wreg = &amdgpu_invalid_wreg;
	adev->pcie_rreg = &amdgpu_invalid_rreg;
	adev->pcie_wreg = &amdgpu_invalid_wreg;
	adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext;
	adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext;
	adev->pciep_rreg = &amdgpu_invalid_rreg;
	adev->pciep_wreg = &amdgpu_invalid_wreg;
	adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
	adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
	adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext;
	adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext;
	adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
	adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
	adev->didt_rreg = &amdgpu_invalid_rreg;
	adev->didt_wreg = &amdgpu_invalid_wreg;
	adev->gc_cac_rreg = &amdgpu_invalid_rreg;
	adev->gc_cac_wreg = &amdgpu_invalid_wreg;
	adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
	adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;

	dev_info(
		adev->dev,
		"initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
		amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
		pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);

	/* mutex initialization are all done here so we
	 * can recall function without having locking issues
	 */
	mutex_init(&adev->firmware.mutex);
	mutex_init(&adev->pm.mutex);
	mutex_init(&adev->gfx.gpu_clock_mutex);
	mutex_init(&adev->srbm_mutex);
	mutex_init(&adev->gfx.pipe_reserve_mutex);
	mutex_init(&adev->gfx.gfx_off_mutex);
	mutex_init(&adev->gfx.partition_mutex);
	mutex_init(&adev->grbm_idx_mutex);
	mutex_init(&adev->mn_lock);
	mutex_init(&adev->virt.vf_errors.lock);
	hash_init(adev->mn_hash);
	mutex_init(&adev->psp.mutex);
	mutex_init(&adev->notifier_lock);
	mutex_init(&adev->pm.stable_pstate_ctx_lock);
	mutex_init(&adev->benchmark_mutex);
	mutex_init(&adev->gfx.reset_sem_mutex);
	/* Initialize the mutex for cleaner shader isolation between GFX and compute processes */
	mutex_init(&adev->enforce_isolation_mutex);
	/* each XCP starts with a stub fence as spearhead and two empty syncs */
	for (i = 0; i < MAX_XCP; ++i) {
		adev->isolation[i].spearhead = dma_fence_get_stub();
		amdgpu_sync_create(&adev->isolation[i].active);
		amdgpu_sync_create(&adev->isolation[i].prev);
	}
	mutex_init(&adev->gfx.userq_sch_mutex);
	mutex_init(&adev->gfx.workload_profile_mutex);
	mutex_init(&adev->vcn.workload_profile_mutex);

	amdgpu_device_init_apu_flags(adev);

	r = amdgpu_device_check_arguments(adev);
	if (r)
		return r;

	spin_lock_init(&adev->mmio_idx_lock);
	spin_lock_init(&adev->smc_idx_lock);
	spin_lock_init(&adev->pcie_idx_lock);
	spin_lock_init(&adev->uvd_ctx_idx_lock);
	spin_lock_init(&adev->didt_idx_lock);
	spin_lock_init(&adev->gc_cac_idx_lock);
	spin_lock_init(&adev->se_cac_idx_lock);
	spin_lock_init(&adev->audio_endpt_idx_lock);
	spin_lock_init(&adev->mm_stats.lock);
	spin_lock_init(&adev->virt.rlcg_reg_lock);
	spin_lock_init(&adev->wb.lock);

	xa_init_flags(&adev->userq_xa, XA_FLAGS_LOCK_IRQ);

	INIT_LIST_HEAD(&adev->reset_list);

	INIT_LIST_HEAD(&adev->ras_list);

	INIT_LIST_HEAD(&adev->pm.od_kobj_list);

	xa_init(&adev->userq_doorbell_xa);

	INIT_DELAYED_WORK(&adev->delayed_init_work,
			  amdgpu_device_delayed_init_work_handler);
	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
			  amdgpu_device_delay_enable_gfx_off);
	/*
	 * Initialize the enforce_isolation work structures for each XCP
	 * partition.  This work handler is responsible for enforcing shader
	 * isolation on AMD GPUs.  It counts the number of emitted fences for
	 * each GFX and compute ring.  If there are any fences, it schedules
	 * the `enforce_isolation_work` to be run after a delay.  If there are
	 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the
	 * runqueue.
	 */
	for (i = 0; i < MAX_XCP; i++) {
		INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work,
				  amdgpu_gfx_enforce_isolation_handler);
		adev->gfx.enforce_isolation[i].adev = adev;
		adev->gfx.enforce_isolation[i].xcp_id = i;
	}

	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
	INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work);

	adev->gfx.gfx_off_req_count = 1;
	adev->gfx.gfx_off_residency = 0;
	adev->gfx.gfx_off_entrycount = 0;
	adev->pm.ac_power = power_supply_is_system_supplied() > 0;

	atomic_set(&adev->throttling_logging_enabled, 1);
	/*
	 * If throttling continues, logging will be performed every minute
	 * to avoid log flooding. "-1" is subtracted since the thermal
	 * throttling interrupt comes every second. Thus, the total logging
	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
	 * for throttling interrupt) = 60 seconds.
	 */
	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);

	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);

	/* Registers mapping */
	/* TODO: block userspace mapping of io register */
	/* register BAR is PCI resource 5 for asic_type >= CHIP_BONAIRE, else 2 */
	if (adev->asic_type >= CHIP_BONAIRE) {
		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
	} else {
		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
	}

	for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
		atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);

	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
	if (!adev->rmmio)
		return -ENOMEM;

	dev_info(adev->dev, "register mmio base: 0x%08X\n",
		 (uint32_t)adev->rmmio_base);
	dev_info(adev->dev, "register mmio size: %u\n",
		 (unsigned int)adev->rmmio_size);

	/*
	 * Reset domain needs to be present early, before XGMI hive discovered
	 * (if any) and initialized to use reset sem and in_gpu reset flag
	 * early on during init and before calling to RREG32.
	 */
	adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
	if (!adev->reset_domain)
		return -ENOMEM;

	/* detect hw virtualization here */
	amdgpu_virt_init(adev);

	amdgpu_device_get_pcie_info(adev);

	r = amdgpu_device_get_job_timeout_settings(adev);
	if (r) {
		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
		return r;
	}

	amdgpu_device_set_mcbp(adev);

	/*
	 * By default, use default mode where all blocks are expected to be
	 * initialized. At present a 'swinit' of blocks is required to be
	 * completed before the need for a different level is detected.
	 */
	amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT);
	/* early init functions */
	r = amdgpu_device_ip_early_init(adev);
	if (r)
		return r;

	/*
	 * No need to remove conflicting FBs for non-display class devices.
	 * This prevents the sysfb from being freed accidentally.
	 */
	if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA ||
	    (pdev->class >> 8) == PCI_CLASS_DISPLAY_OTHER) {
		/* Get rid of things like offb */
		r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name);
		if (r)
			return r;
	}

	/* Enable TMZ based on IP_VERSION */
	amdgpu_gmc_tmz_set(adev);

	if (amdgpu_sriov_vf(adev) &&
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
		/* VF MMIO access (except mailbox range) from CPU
		 * will be blocked during sriov runtime
		 */
		adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;

	amdgpu_gmc_noretry_set(adev);
	/* Need to get xgmi info early to decide the reset behavior */
	if (adev->gmc.xgmi.supported) {
		r = adev->gfxhub.funcs->get_xgmi_info(adev);
		if (r)
			return r;
	}

	/* enable PCIE atomic ops */
	if (amdgpu_sriov_vf(adev)) {
		/* a VF takes the answer from the pf2vf message, when present */
		if (adev->virt.fw_reserve.p_pf2vf)
			adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
						      adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
				(PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
	/* APUs w/ gfx9 onwards don't rely on PCIe atomics, rather it is an
	 * internal path that natively supports atomics, set have_atomics_support to true.
	 */
	} else if ((adev->flags & AMD_IS_APU) &&
		   (amdgpu_ip_version(adev, GC_HWIP, 0) >
		    IP_VERSION(9, 0, 0))) {
		adev->have_atomics_support = true;
	} else {
		/* pci_enable_atomic_ops_to_root() returning 0 means supported */
		adev->have_atomics_support =
			!pci_enable_atomic_ops_to_root(adev->pdev,
					  PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
					  PCI_EXP_DEVCAP2_ATOMIC_COMP64);
	}

	if (!adev->have_atomics_support)
		dev_info(adev->dev, "PCIE atomic ops is not supported\n");

	/* doorbell bar mapping and doorbell index init*/
	amdgpu_doorbell_init(adev);

	/* emulation mode skips reset init, the reset/post checks and clock info */
	if (amdgpu_emu_mode == 1) {
		/* post the asic on emulation mode */
		emu_soc_asic_init(adev);
		goto fence_driver_init;
	}

	amdgpu_reset_init(adev);

	/* detect if we are with an SRIOV vbios */
	if (adev->bios)
		amdgpu_device_detect_sriov_bios(adev);

	/* check if we need to reset the asic
	 *  E.g., driver was not cleanly unloaded previously, etc.
	 */
	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
		if (adev->gmc.xgmi.num_physical_nodes) {
			/*
			 * No reset is issued here; only the init level is
			 * lowered. r keeps its previous value (0, since every
			 * earlier failure returned).
			 */
			dev_info(adev->dev, "Pending hive reset.\n");
			amdgpu_set_init_level(adev,
					      AMDGPU_INIT_LEVEL_MINIMAL_XGMI);
		} else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) &&
			   !amdgpu_device_has_display_hardware(adev)) {
			r = psp_gpu_reset(adev);
		} else {
			tmp = amdgpu_reset_method;
			/* It should do a default reset when loading or reloading the driver,
			 * regardless of the module parameter reset_method.
			 */
			amdgpu_reset_method = AMD_RESET_METHOD_NONE;
			r = amdgpu_asic_reset(adev);
			amdgpu_reset_method = tmp;
		}

		if (r) {
			dev_err(adev->dev, "asic reset on init failed\n");
			goto failed;
		}
	}

	/* Post card if necessary */
	if (amdgpu_device_need_post(adev)) {
		if (!adev->bios) {
			dev_err(adev->dev, "no vBIOS found\n");
			r = -EINVAL;
			goto failed;
		}
		dev_info(adev->dev, "GPU posting now...\n");
		r = amdgpu_device_asic_init(adev);
		if (r) {
			dev_err(adev->dev, "gpu post error!\n");
			goto failed;
		}
	}

	if (adev->bios) {
		if (adev->is_atom_fw) {
			/* Initialize clocks */
			r = amdgpu_atomfirmware_get_clock_info(adev);
			if (r) {
				dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
				goto failed;
			}
		} else {
			/* Initialize clocks */
			r = amdgpu_atombios_get_clock_info(adev);
			if (r) {
				dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
				goto failed;
			}
			/* init i2c buses */
			amdgpu_i2c_init(adev);
		}
	}

fence_driver_init:
	/* Fence driver */
	r = amdgpu_fence_driver_sw_init(adev);
	if (r) {
		dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
		goto failed;
	}

	/* init the mode config */
	drm_mode_config_init(adev_to_drm(adev));

	r = amdgpu_device_ip_init(adev);
	if (r) {
		dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
		goto release_ras_con;
	}

	amdgpu_fence_driver_hw_init(adev);

	dev_info(adev->dev,
		"SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
			adev->gfx.config.max_shader_engines,
			adev->gfx.config.max_sh_per_se,
			adev->gfx.config.max_cu_per_sh,
			adev->gfx.cu_info.number);

	adev->accel_working = true;

	amdgpu_vm_check_compute_bug(adev);

	/* Initialize the buffer migration limit. */
	if (amdgpu_moverate >= 0)
		max_MBps = amdgpu_moverate;
	else
		max_MBps = 8; /* Allow 8 MB/s. */
	/* Get a log2 for easy divisions. */
	adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));

	/*
	 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
	 * Otherwise the mgpu fan boost feature will be skipped due to the
	 * gpu instance is counted less.
	 */
	amdgpu_register_gpu_instance(adev);

	/* enable clockgating, etc. after ib tests, etc. since some blocks require
	 * explicit gating rather than handling it automatically.
	 */
	if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
		r = amdgpu_device_ip_late_init(adev);
		if (r) {
			dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
			goto release_ras_con;
		}
		/* must succeed. */
		amdgpu_ras_resume(adev);
		queue_delayed_work(system_wq, &adev->delayed_init_work,
				   msecs_to_jiffies(AMDGPU_RESUME_MS));
	}

	if (amdgpu_sriov_vf(adev)) {
		amdgpu_virt_release_full_gpu(adev, true);
		flush_delayed_work(&adev->delayed_init_work);
	}

	/* Don't init kfd if whole hive need to be reset during init */
	if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
		kgd2kfd_init_zone_device(adev);
		kfd_update_svm_support_properties(adev);
	}

	if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
		amdgpu_xgmi_reset_on_init(adev);

	/*
	 * Place those sysfs registering after `late_init`. As some of those
	 * operations performed in `late_init` might affect the sysfs
	 * interfaces creating.
	 */
	r = amdgpu_device_sys_interface_init(adev);

	/*
	 * NOTE(review): when CONFIG_PERF_EVENTS is disabled, r still holds the
	 * result of amdgpu_device_sys_interface_init(), so a sysfs failure is
	 * reported with the pmu message below. Either way the error is only
	 * logged; init continues.
	 */
	if (IS_ENABLED(CONFIG_PERF_EVENTS))
		r = amdgpu_pmu_init(adev);
	if (r)
		dev_err(adev->dev, "amdgpu_pmu_init failed\n");

	/* Have stored pci confspace at hand for restore in sudden PCI error */
	if (amdgpu_device_cache_pci_state(adev->pdev))
		pci_restore_state(pdev);

	/* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
	/* this will fail for cards that aren't VGA class devices, just
	 * ignore it
	 */
	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
		vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);

	px = amdgpu_device_supports_px(adev);

	/*
	 * Register with vga_switcheroo for PX devices, or for non-removable
	 * devices when apple_gmux_detect() reports a gmux.
	 */
	if (px || (!dev_is_removable(&adev->pdev->dev) &&
				apple_gmux_detect(NULL, NULL)))
		vga_switcheroo_register_client(adev->pdev,
					       &amdgpu_switcheroo_ops, px);

	if (px)
		vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);

	amdgpu_device_check_iommu_direct_map(adev);

	adev->pm_nb.notifier_call = amdgpu_device_pm_notifier;
	r = register_pm_notifier(&adev->pm_nb);
	if (r)
		goto failed;

	return 0;

release_ras_con:
	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_release_full_gpu(adev, true);

	/* failed in exclusive mode due to timeout */
	if (amdgpu_sriov_vf(adev) &&
		!amdgpu_sriov_runtime(adev) &&
		amdgpu_virt_mmio_blocked(adev) &&
		!amdgpu_virt_wait_reset(adev)) {
		dev_err(adev->dev, "VF exclusive mode timeout\n");
		/* Don't send request since VF is inactive. */
		adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
		adev->virt.ops = NULL;
		/* the original error is replaced by -EAGAIN on this path */
		r = -EAGAIN;
	}
	amdgpu_release_ras_context(adev);

failed:
	amdgpu_vf_error_trans_all(adev);

	return r;
}

/**
 * amdgpu_device_unmap_mmio - drop all CPU mappings of the device
 *
 * @adev: amdgpu_device pointer
 *
 * Zaps the userspace mappings of the DRM anon inode, tears down the doorbell
 * mapping, unmaps the register BAR and (if mapped) the VRAM aperture, and,
 * unless the GPU is connected to the CPU over XGMI or is an APP APU,
 * releases the write-combining MTRR/memtype for the VRAM aperture.
 */
static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
{

	/* Clear all CPU mappings pointing to this device */
	unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);

	/* Unmap all mapped bars - Doorbell, registers and VRAM */
	amdgpu_doorbell_fini(adev);

	iounmap(adev->rmmio);
	adev->rmmio = NULL;
	if (adev->mman.aper_base_kaddr)
		iounmap(adev->mman.aper_base_kaddr);
	adev->mman.aper_base_kaddr = NULL;

	/* Memory manager related */
	if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
		arch_phys_wc_del(adev->gmc.vram_mtrr);
		arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
	}
}

/**
 * amdgpu_device_fini_hw - tear down the driver
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down the driver info (all asics).
 * Called at driver shutdown.
4714 */ 4715 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4716 { 4717 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4718 flush_delayed_work(&adev->delayed_init_work); 4719 4720 if (adev->mman.initialized) 4721 drain_workqueue(adev->mman.bdev.wq); 4722 adev->shutdown = true; 4723 4724 unregister_pm_notifier(&adev->pm_nb); 4725 4726 /* make sure IB test finished before entering exclusive mode 4727 * to avoid preemption on IB test 4728 */ 4729 if (amdgpu_sriov_vf(adev)) { 4730 amdgpu_virt_request_full_gpu(adev, false); 4731 amdgpu_virt_fini_data_exchange(adev); 4732 } 4733 4734 /* disable all interrupts */ 4735 amdgpu_irq_disable_all(adev); 4736 if (adev->mode_info.mode_config_initialized) { 4737 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4738 drm_helper_force_disable_all(adev_to_drm(adev)); 4739 else 4740 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4741 } 4742 amdgpu_fence_driver_hw_fini(adev); 4743 4744 amdgpu_device_sys_interface_fini(adev); 4745 4746 /* disable ras feature must before hw fini */ 4747 amdgpu_ras_pre_fini(adev); 4748 4749 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4750 4751 amdgpu_device_ip_fini_early(adev); 4752 4753 amdgpu_irq_fini_hw(adev); 4754 4755 if (adev->mman.initialized) 4756 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4757 4758 amdgpu_gart_dummy_page_fini(adev); 4759 4760 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4761 amdgpu_device_unmap_mmio(adev); 4762 4763 } 4764 4765 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4766 { 4767 int i, idx; 4768 bool px; 4769 4770 amdgpu_device_ip_fini(adev); 4771 amdgpu_fence_driver_sw_fini(adev); 4772 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4773 adev->accel_working = false; 4774 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4775 for (i = 0; i < MAX_XCP; ++i) { 4776 dma_fence_put(adev->isolation[i].spearhead); 4777 amdgpu_sync_free(&adev->isolation[i].active); 4778 amdgpu_sync_free(&adev->isolation[i].prev); 4779 } 4780 
4781 amdgpu_reset_fini(adev); 4782 4783 /* free i2c buses */ 4784 amdgpu_i2c_fini(adev); 4785 4786 if (adev->bios) { 4787 if (amdgpu_emu_mode != 1) 4788 amdgpu_atombios_fini(adev); 4789 amdgpu_bios_release(adev); 4790 } 4791 4792 kfree(adev->fru_info); 4793 adev->fru_info = NULL; 4794 4795 kfree(adev->xcp_mgr); 4796 adev->xcp_mgr = NULL; 4797 4798 px = amdgpu_device_supports_px(adev); 4799 4800 if (px || (!dev_is_removable(&adev->pdev->dev) && 4801 apple_gmux_detect(NULL, NULL))) 4802 vga_switcheroo_unregister_client(adev->pdev); 4803 4804 if (px) 4805 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4806 4807 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4808 vga_client_unregister(adev->pdev); 4809 4810 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4811 4812 iounmap(adev->rmmio); 4813 adev->rmmio = NULL; 4814 drm_dev_exit(idx); 4815 } 4816 4817 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4818 amdgpu_pmu_fini(adev); 4819 if (adev->discovery.bin) 4820 amdgpu_discovery_fini(adev); 4821 4822 amdgpu_reset_put_reset_domain(adev->reset_domain); 4823 adev->reset_domain = NULL; 4824 4825 kfree(adev->pci_state); 4826 kfree(adev->pcie_reset_ctx.swds_pcistate); 4827 kfree(adev->pcie_reset_ctx.swus_pcistate); 4828 } 4829 4830 /** 4831 * amdgpu_device_evict_resources - evict device resources 4832 * @adev: amdgpu device object 4833 * 4834 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 4835 * of the vram memory type. Mainly used for evicting device resources 4836 * at suspend time. 
4837 * 4838 */ 4839 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4840 { 4841 int ret; 4842 4843 /* No need to evict vram on APUs unless going to S4 */ 4844 if (!adev->in_s4 && (adev->flags & AMD_IS_APU)) 4845 return 0; 4846 4847 /* No need to evict when going to S5 through S4 callbacks */ 4848 if (system_state == SYSTEM_POWER_OFF) 4849 return 0; 4850 4851 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4852 if (ret) { 4853 dev_warn(adev->dev, "evicting device resources failed\n"); 4854 return ret; 4855 } 4856 4857 if (adev->in_s4) { 4858 ret = ttm_device_prepare_hibernation(&adev->mman.bdev); 4859 if (ret) 4860 dev_err(adev->dev, "prepare hibernation failed, %d\n", ret); 4861 } 4862 return ret; 4863 } 4864 4865 /* 4866 * Suspend & resume. 4867 */ 4868 /** 4869 * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events 4870 * @nb: notifier block 4871 * @mode: suspend mode 4872 * @data: data 4873 * 4874 * This function is called when the system is about to suspend or hibernate. 4875 * It is used to set the appropriate flags so that eviction can be optimized 4876 * in the pm prepare callback. 4877 */ 4878 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 4879 void *data) 4880 { 4881 struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb); 4882 4883 switch (mode) { 4884 case PM_HIBERNATION_PREPARE: 4885 adev->in_s4 = true; 4886 break; 4887 case PM_POST_HIBERNATION: 4888 adev->in_s4 = false; 4889 break; 4890 } 4891 4892 return NOTIFY_DONE; 4893 } 4894 4895 /** 4896 * amdgpu_device_prepare - prepare for device suspend 4897 * 4898 * @dev: drm dev pointer 4899 * 4900 * Prepare to put the hw in the suspend state (all asics). 4901 * Returns 0 for success or an error on failure. 4902 * Called at driver suspend. 
4903 */ 4904 int amdgpu_device_prepare(struct drm_device *dev) 4905 { 4906 struct amdgpu_device *adev = drm_to_adev(dev); 4907 int i, r; 4908 4909 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 4910 return 0; 4911 4912 /* Evict the majority of BOs before starting suspend sequence */ 4913 r = amdgpu_device_evict_resources(adev); 4914 if (r) 4915 return r; 4916 4917 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 4918 4919 for (i = 0; i < adev->num_ip_blocks; i++) { 4920 if (!adev->ip_blocks[i].status.valid) 4921 continue; 4922 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 4923 continue; 4924 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]); 4925 if (r) 4926 return r; 4927 } 4928 4929 return 0; 4930 } 4931 4932 /** 4933 * amdgpu_device_complete - complete power state transition 4934 * 4935 * @dev: drm dev pointer 4936 * 4937 * Undo the changes from amdgpu_device_prepare. This will be 4938 * called on all resume transitions, including those that failed. 4939 */ 4940 void amdgpu_device_complete(struct drm_device *dev) 4941 { 4942 struct amdgpu_device *adev = drm_to_adev(dev); 4943 int i; 4944 4945 for (i = 0; i < adev->num_ip_blocks; i++) { 4946 if (!adev->ip_blocks[i].status.valid) 4947 continue; 4948 if (!adev->ip_blocks[i].version->funcs->complete) 4949 continue; 4950 adev->ip_blocks[i].version->funcs->complete(&adev->ip_blocks[i]); 4951 } 4952 } 4953 4954 /** 4955 * amdgpu_device_suspend - initiate device suspend 4956 * 4957 * @dev: drm dev pointer 4958 * @notify_clients: notify in-kernel DRM clients 4959 * 4960 * Puts the hw in the suspend state (all asics). 4961 * Returns 0 for success or an error on failure. 4962 * Called at driver suspend. 
 */
int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	/* r: error being returned; rec: result of the recovery steps below */
	int r, rec;

	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
		return 0;

	adev->in_suspend = true;

	if (amdgpu_sriov_vf(adev)) {
		if (!adev->in_runpm)
			amdgpu_amdkfd_suspend_process(adev);
		amdgpu_virt_fini_data_exchange(adev);
		r = amdgpu_virt_request_full_gpu(adev, false);
		/*
		 * NOTE(review): this return leaves adev->in_suspend set;
		 * confirm that callers expect that.
		 */
		if (r)
			return r;
	}

	r = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D3);
	if (r)
		goto unwind_sriov;

	if (notify_clients)
		drm_client_dev_suspend(adev_to_drm(adev));

	cancel_delayed_work_sync(&adev->delayed_init_work);

	amdgpu_ras_suspend(adev);

	r = amdgpu_device_ip_suspend_phase1(adev);
	if (r)
		goto unwind_smartshift;

	amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
	r = amdgpu_userq_suspend(adev);
	/*
	 * NOTE(review): unwind_ip_phase1 sits below the amdgpu_amdkfd_resume()
	 * call, so KFD is not resumed on this path although it was suspended
	 * just above - confirm this is intended.
	 */
	if (r)
		goto unwind_ip_phase1;

	r = amdgpu_device_evict_resources(adev);
	if (r)
		goto unwind_userq;

	amdgpu_ttm_set_buffer_funcs_status(adev, false);

	amdgpu_fence_driver_hw_fini(adev);

	r = amdgpu_device_ip_suspend_phase2(adev);
	if (r)
		goto unwind_evict;

	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_release_full_gpu(adev, false);

	return 0;

	/*
	 * Error unwinding: each label undoes one suspend step and falls
	 * through to the next. If a recovery step itself fails, a warning is
	 * logged and the original error r is returned immediately, without
	 * running the remaining recovery steps.
	 */
unwind_evict:
	if (adev->mman.buffer_funcs_ring->sched.ready)
		amdgpu_ttm_set_buffer_funcs_status(adev, true);
	amdgpu_fence_driver_hw_init(adev);

unwind_userq:
	rec = amdgpu_userq_resume(adev);
	if (rec) {
		dev_warn(adev->dev, "failed to re-initialize user queues: %d\n", rec);
		return r;
	}
	rec = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
	if (rec) {
		dev_warn(adev->dev, "failed to re-initialize kfd: %d\n", rec);
		return r;
	}

unwind_ip_phase1:
	/* suspend phase 1 = resume phase 3 */
	rec = amdgpu_device_ip_resume_phase3(adev);
	if (rec) {
		dev_warn(adev->dev, "failed to re-initialize IPs phase1: %d\n", rec);
		return r;
	}

unwind_smartshift:
	rec = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0);
	if (rec) {
		dev_warn(adev->dev, "failed to re-update smart shift: %d\n", rec);
		return r;
	}

	if (notify_clients)
		drm_client_dev_resume(adev_to_drm(adev));

	amdgpu_ras_resume(adev);

unwind_sriov:
	if (amdgpu_sriov_vf(adev)) {
		rec = amdgpu_virt_request_full_gpu(adev, true);
		if (rec) {
			dev_warn(adev->dev, "failed to reinitialize sriov: %d\n", rec);
			return r;
		}
	}

	/* the suspend was fully rolled back: clear all suspend state flags */
	adev->in_suspend = adev->in_s0ix = adev->in_s3 = false;

	return r;
}

/*
 * amdgpu_virt_resume - refresh the XGMI placement of a VF on resume
 *
 * Re-triggers MSI-X programming, re-reads the XGMI info through
 * gfxhub.funcs->get_xgmi_info(), logs the old and new physical node id and
 * recomputes vm_manager.vram_base_offset as the MC framebuffer offset plus
 * physical_node_id * node_segment_size.
 *
 * Returns 0 on success or the error from get_xgmi_info().
 */
static inline int amdgpu_virt_resume(struct amdgpu_device *adev)
{
	int r;
	unsigned int prev_physical_node_id = adev->gmc.xgmi.physical_node_id;

	/* During VM resume, QEMU programming of VF MSIX table (register GFXMSIX_VECT0_ADDR_LO)
	 * may not work. The access could be blocked by nBIF protection as VF isn't in
	 * exclusive access mode. Exclusive access is enabled now, disable/enable MSIX
	 * so that QEMU reprograms MSIX table.
	 */
	amdgpu_restore_msix(adev);

	r = adev->gfxhub.funcs->get_xgmi_info(adev);
	if (r)
		return r;

	dev_info(adev->dev, "xgmi node, old id %d, new id %d\n",
		prev_physical_node_id, adev->gmc.xgmi.physical_node_id);

	adev->vm_manager.vram_base_offset = adev->gfxhub.funcs->get_mc_fb_offset(adev);
	adev->vm_manager.vram_base_offset +=
		adev->gmc.xgmi.physical_node_id * adev->gmc.xgmi.node_segment_size;

	return 0;
}

/**
 * amdgpu_device_resume - initiate device resume
 *
 * @dev: drm dev pointer
 * @notify_clients: notify in-kernel DRM clients
 *
 * Bring the hw back to operating state (all asics).
 * Returns 0 for success or an error on failure.
 * Called at driver resume.
 *
 * For SR-IOV VFs full GPU access is requested first and released again at
 * the exit label, on success as well as on failure of the resume steps.
 */
int amdgpu_device_resume(struct drm_device *dev, bool notify_clients)
{
	struct amdgpu_device *adev = drm_to_adev(dev);
	int r = 0;

	if (amdgpu_sriov_vf(adev)) {
		r = amdgpu_virt_request_full_gpu(adev, true);
		if (r)
			return r;
	}

	if (amdgpu_virt_xgmi_migrate_enabled(adev)) {
		r = amdgpu_virt_resume(adev);
		if (r)
			goto exit;
	}

	/*
	 * NOTE(review): for an SR-IOV VF this return bypasses the exit label,
	 * so the full GPU access requested above is not released here -
	 * confirm this combination cannot occur or is handled elsewhere.
	 */
	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
		return 0;

	if (adev->in_s0ix)
		amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);

	/* post card */
	if (amdgpu_device_need_post(adev)) {
		/* a post failure is only logged; r is overwritten right below */
		r = amdgpu_device_asic_init(adev);
		if (r)
			dev_err(adev->dev, "amdgpu asic init failed\n");
	}

	r = amdgpu_device_ip_resume(adev);

	if (r) {
		dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
		goto exit;
	}

	r = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
	if (r)
		goto exit;

	r = amdgpu_userq_resume(adev);
	if (r)
		goto exit;

	r = amdgpu_device_ip_late_init(adev);
	if (r)
		goto exit;

	queue_delayed_work(system_wq, &adev->delayed_init_work,
			   msecs_to_jiffies(AMDGPU_RESUME_MS));
exit:
	/* reached on success and on failure of the steps above */
	if (amdgpu_sriov_vf(adev)) {
		amdgpu_virt_init_data_exchange(adev);
		amdgpu_virt_release_full_gpu(adev, true);

		if (!r && !adev->in_runpm)
			r = amdgpu_amdkfd_resume_process(adev);
	}

	if (r)
		return r;

	/* Make sure IB tests flushed */
	flush_delayed_work(&adev->delayed_init_work);

	if (notify_clients)
		drm_client_dev_resume(adev_to_drm(adev));

	amdgpu_ras_resume(adev);

	if (adev->mode_info.num_crtc) {
		/*
		 * Most of the connector probing functions try to acquire runtime pm
		 * refs to ensure that the GPU is powered on when connector polling is
		 * performed. Since we're calling this from a runtime PM callback,
		 * trying to acquire rpm refs will cause us to deadlock.
		 *
		 * Since we're guaranteed to be holding the rpm lock, it's safe to
		 * temporarily disable the rpm helpers so this doesn't deadlock us.
		 */
#ifdef CONFIG_PM
		dev->dev->power.disable_depth++;
#endif
		if (!adev->dc_enabled)
			drm_helper_hpd_irq_event(dev);
		else
			drm_kms_helper_hotplug_event(dev);
#ifdef CONFIG_PM
		dev->dev->power.disable_depth--;
#endif
	}

	amdgpu_vram_mgr_clear_reset_blocks(adev);
	adev->in_suspend = false;

	/* a failing smart shift update is not fatal for the resume */
	if (amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0))
		dev_warn(adev->dev, "smart shift update failed\n");

	return 0;
}

/**
 * amdgpu_device_ip_check_soft_reset - did soft reset succeed
 *
 * @adev: amdgpu_device pointer
 *
 * The list of all the hardware IPs that make up the asic is walked and
 * the check_soft_reset callbacks are run.  check_soft_reset determines
 * if the asic is still hung or not.
 * Returns true if any of the IPs are still in a hung state, false if not.
5218 */ 5219 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 5220 { 5221 int i; 5222 bool asic_hang = false; 5223 5224 if (amdgpu_sriov_vf(adev)) 5225 return true; 5226 5227 if (amdgpu_asic_need_full_reset(adev)) 5228 return true; 5229 5230 for (i = 0; i < adev->num_ip_blocks; i++) { 5231 if (!adev->ip_blocks[i].status.valid) 5232 continue; 5233 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 5234 adev->ip_blocks[i].status.hang = 5235 adev->ip_blocks[i].version->funcs->check_soft_reset( 5236 &adev->ip_blocks[i]); 5237 if (adev->ip_blocks[i].status.hang) { 5238 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 5239 asic_hang = true; 5240 } 5241 } 5242 return asic_hang; 5243 } 5244 5245 /** 5246 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 5247 * 5248 * @adev: amdgpu_device pointer 5249 * 5250 * The list of all the hardware IPs that make up the asic is walked and the 5251 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 5252 * handles any IP specific hardware or software state changes that are 5253 * necessary for a soft reset to succeed. 5254 * Returns 0 on success, negative error code on failure. 5255 */ 5256 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 5257 { 5258 int i, r = 0; 5259 5260 for (i = 0; i < adev->num_ip_blocks; i++) { 5261 if (!adev->ip_blocks[i].status.valid) 5262 continue; 5263 if (adev->ip_blocks[i].status.hang && 5264 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 5265 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]); 5266 if (r) 5267 return r; 5268 } 5269 } 5270 5271 return 0; 5272 } 5273 5274 /** 5275 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 5276 * 5277 * @adev: amdgpu_device pointer 5278 * 5279 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 5280 * reset is necessary to recover. 
5281 * Returns true if a full asic reset is required, false if not. 5282 */ 5283 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 5284 { 5285 int i; 5286 5287 if (amdgpu_asic_need_full_reset(adev)) 5288 return true; 5289 5290 for (i = 0; i < adev->num_ip_blocks; i++) { 5291 if (!adev->ip_blocks[i].status.valid) 5292 continue; 5293 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 5294 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 5295 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 5296 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 5297 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 5298 if (adev->ip_blocks[i].status.hang) { 5299 dev_info(adev->dev, "Some block need full reset!\n"); 5300 return true; 5301 } 5302 } 5303 } 5304 return false; 5305 } 5306 5307 /** 5308 * amdgpu_device_ip_soft_reset - do a soft reset 5309 * 5310 * @adev: amdgpu_device pointer 5311 * 5312 * The list of all the hardware IPs that make up the asic is walked and the 5313 * soft_reset callbacks are run if the block is hung. soft_reset handles any 5314 * IP specific hardware or software state changes that are necessary to soft 5315 * reset the IP. 5316 * Returns 0 on success, negative error code on failure. 
5317 */ 5318 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 5319 { 5320 int i, r = 0; 5321 5322 for (i = 0; i < adev->num_ip_blocks; i++) { 5323 if (!adev->ip_blocks[i].status.valid) 5324 continue; 5325 if (adev->ip_blocks[i].status.hang && 5326 adev->ip_blocks[i].version->funcs->soft_reset) { 5327 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]); 5328 if (r) 5329 return r; 5330 } 5331 } 5332 5333 return 0; 5334 } 5335 5336 /** 5337 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 5338 * 5339 * @adev: amdgpu_device pointer 5340 * 5341 * The list of all the hardware IPs that make up the asic is walked and the 5342 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 5343 * handles any IP specific hardware or software state changes that are 5344 * necessary after the IP has been soft reset. 5345 * Returns 0 on success, negative error code on failure. 5346 */ 5347 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 5348 { 5349 int i, r = 0; 5350 5351 for (i = 0; i < adev->num_ip_blocks; i++) { 5352 if (!adev->ip_blocks[i].status.valid) 5353 continue; 5354 if (adev->ip_blocks[i].status.hang && 5355 adev->ip_blocks[i].version->funcs->post_soft_reset) 5356 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]); 5357 if (r) 5358 return r; 5359 } 5360 5361 return 0; 5362 } 5363 5364 /** 5365 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 5366 * 5367 * @adev: amdgpu_device pointer 5368 * @reset_context: amdgpu reset context pointer 5369 * 5370 * do VF FLR and reinitialize Asic 5371 * return 0 means succeeded otherwise failed 5372 */ 5373 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 5374 struct amdgpu_reset_context *reset_context) 5375 { 5376 int r; 5377 struct amdgpu_hive_info *hive = NULL; 5378 5379 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { 5380 if (!amdgpu_ras_get_fed_status(adev)) 5381 
amdgpu_virt_ready_to_reset(adev); 5382 amdgpu_virt_wait_reset(adev); 5383 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5384 r = amdgpu_virt_request_full_gpu(adev, true); 5385 } else { 5386 r = amdgpu_virt_reset_gpu(adev); 5387 } 5388 if (r) 5389 return r; 5390 5391 amdgpu_ras_clear_err_state(adev); 5392 amdgpu_irq_gpu_reset_resume_helper(adev); 5393 5394 /* some sw clean up VF needs to do before recover */ 5395 amdgpu_virt_post_reset(adev); 5396 5397 /* Resume IP prior to SMC */ 5398 r = amdgpu_device_ip_reinit_early_sriov(adev); 5399 if (r) 5400 return r; 5401 5402 amdgpu_virt_init_data_exchange(adev); 5403 5404 r = amdgpu_device_fw_loading(adev); 5405 if (r) 5406 return r; 5407 5408 /* now we are okay to resume SMC/CP/SDMA */ 5409 r = amdgpu_device_ip_reinit_late_sriov(adev); 5410 if (r) 5411 return r; 5412 5413 hive = amdgpu_get_xgmi_hive(adev); 5414 /* Update PSP FW topology after reset */ 5415 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5416 r = amdgpu_xgmi_update_topology(hive, adev); 5417 if (hive) 5418 amdgpu_put_xgmi_hive(hive); 5419 if (r) 5420 return r; 5421 5422 r = amdgpu_ib_ring_tests(adev); 5423 if (r) 5424 return r; 5425 5426 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) 5427 amdgpu_inc_vram_lost(adev); 5428 5429 /* need to be called during full access so we can't do it later like 5430 * bare-metal does. 
5431 */ 5432 amdgpu_amdkfd_post_reset(adev); 5433 amdgpu_virt_release_full_gpu(adev, true); 5434 5435 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5436 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) || 5437 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 5438 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 5439 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) || 5440 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5441 amdgpu_ras_resume(adev); 5442 5443 amdgpu_virt_ras_telemetry_post_reset(adev); 5444 5445 return 0; 5446 } 5447 5448 /** 5449 * amdgpu_device_has_job_running - check if there is any unfinished job 5450 * 5451 * @adev: amdgpu_device pointer 5452 * 5453 * check if there is any job running on the device when guest driver receives 5454 * FLR notification from host driver. If there are still jobs running, then 5455 * the guest driver will not respond the FLR reset. Instead, let the job hit 5456 * the timeout and guest driver then issue the reset request. 5457 */ 5458 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5459 { 5460 int i; 5461 5462 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5463 struct amdgpu_ring *ring = adev->rings[i]; 5464 5465 if (!amdgpu_ring_sched_ready(ring)) 5466 continue; 5467 5468 if (amdgpu_fence_count_emitted(ring)) 5469 return true; 5470 } 5471 return false; 5472 } 5473 5474 /** 5475 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5476 * 5477 * @adev: amdgpu_device pointer 5478 * 5479 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5480 * a hung GPU. 
5481 */ 5482 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5483 { 5484 5485 if (amdgpu_gpu_recovery == 0) 5486 goto disabled; 5487 5488 /* Skip soft reset check in fatal error mode */ 5489 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5490 return true; 5491 5492 if (amdgpu_sriov_vf(adev)) 5493 return true; 5494 5495 if (amdgpu_gpu_recovery == -1) { 5496 switch (adev->asic_type) { 5497 #ifdef CONFIG_DRM_AMDGPU_SI 5498 case CHIP_VERDE: 5499 case CHIP_TAHITI: 5500 case CHIP_PITCAIRN: 5501 case CHIP_OLAND: 5502 case CHIP_HAINAN: 5503 #endif 5504 #ifdef CONFIG_DRM_AMDGPU_CIK 5505 case CHIP_KAVERI: 5506 case CHIP_KABINI: 5507 case CHIP_MULLINS: 5508 #endif 5509 case CHIP_CARRIZO: 5510 case CHIP_STONEY: 5511 case CHIP_CYAN_SKILLFISH: 5512 goto disabled; 5513 default: 5514 break; 5515 } 5516 } 5517 5518 return true; 5519 5520 disabled: 5521 dev_info(adev->dev, "GPU recovery disabled.\n"); 5522 return false; 5523 } 5524 5525 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5526 { 5527 u32 i; 5528 int ret = 0; 5529 5530 if (adev->bios) 5531 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5532 5533 dev_info(adev->dev, "GPU mode1 reset\n"); 5534 5535 /* Cache the state before bus master disable. The saved config space 5536 * values are used in other cases like restore after mode-2 reset. 
5537 */ 5538 amdgpu_device_cache_pci_state(adev->pdev); 5539 5540 /* disable BM */ 5541 pci_clear_master(adev->pdev); 5542 5543 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5544 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5545 ret = amdgpu_dpm_mode1_reset(adev); 5546 } else { 5547 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5548 ret = psp_gpu_reset(adev); 5549 } 5550 5551 if (ret) 5552 goto mode1_reset_failed; 5553 5554 amdgpu_device_load_pci_state(adev->pdev); 5555 ret = amdgpu_psp_wait_for_bootloader(adev); 5556 if (ret) 5557 goto mode1_reset_failed; 5558 5559 /* wait for asic to come out of reset */ 5560 for (i = 0; i < adev->usec_timeout; i++) { 5561 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5562 5563 if (memsize != 0xffffffff) 5564 break; 5565 udelay(1); 5566 } 5567 5568 if (i >= adev->usec_timeout) { 5569 ret = -ETIMEDOUT; 5570 goto mode1_reset_failed; 5571 } 5572 5573 if (adev->bios) 5574 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5575 5576 return 0; 5577 5578 mode1_reset_failed: 5579 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5580 return ret; 5581 } 5582 5583 int amdgpu_device_link_reset(struct amdgpu_device *adev) 5584 { 5585 int ret = 0; 5586 5587 dev_info(adev->dev, "GPU link reset\n"); 5588 5589 if (!amdgpu_reset_in_dpc(adev)) 5590 ret = amdgpu_dpm_link_reset(adev); 5591 5592 if (ret) 5593 goto link_reset_failed; 5594 5595 ret = amdgpu_psp_wait_for_bootloader(adev); 5596 if (ret) 5597 goto link_reset_failed; 5598 5599 return 0; 5600 5601 link_reset_failed: 5602 dev_err(adev->dev, "GPU link reset failed\n"); 5603 return ret; 5604 } 5605 5606 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5607 struct amdgpu_reset_context *reset_context) 5608 { 5609 int i, r = 0; 5610 struct amdgpu_job *job = NULL; 5611 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; 5612 bool need_full_reset = 5613 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5614 5615 if (reset_context->reset_req_dev == adev) 5616 
job = reset_context->job; 5617 5618 if (amdgpu_sriov_vf(adev)) 5619 amdgpu_virt_pre_reset(adev); 5620 5621 amdgpu_fence_driver_isr_toggle(adev, true); 5622 5623 /* block all schedulers and reset given job's ring */ 5624 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5625 struct amdgpu_ring *ring = adev->rings[i]; 5626 5627 if (!amdgpu_ring_sched_ready(ring)) 5628 continue; 5629 5630 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5631 amdgpu_fence_driver_force_completion(ring); 5632 } 5633 5634 amdgpu_fence_driver_isr_toggle(adev, false); 5635 5636 if (job && job->vm) 5637 drm_sched_increase_karma(&job->base); 5638 5639 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5640 /* If reset handler not implemented, continue; otherwise return */ 5641 if (r == -EOPNOTSUPP) 5642 r = 0; 5643 else 5644 return r; 5645 5646 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5647 if (!amdgpu_sriov_vf(adev)) { 5648 5649 if (!need_full_reset) 5650 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5651 5652 if (!need_full_reset && amdgpu_gpu_recovery && 5653 amdgpu_device_ip_check_soft_reset(adev)) { 5654 amdgpu_device_ip_pre_soft_reset(adev); 5655 r = amdgpu_device_ip_soft_reset(adev); 5656 amdgpu_device_ip_post_soft_reset(adev); 5657 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5658 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5659 need_full_reset = true; 5660 } 5661 } 5662 5663 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { 5664 dev_info(tmp_adev->dev, "Dumping IP State\n"); 5665 /* Trigger ip dump before we reset the asic */ 5666 for (i = 0; i < tmp_adev->num_ip_blocks; i++) 5667 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) 5668 tmp_adev->ip_blocks[i].version->funcs 5669 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]); 5670 dev_info(tmp_adev->dev, "Dumping IP State Completed\n"); 5671 } 5672 5673 if (need_full_reset) 5674 r = 
amdgpu_device_ip_suspend(adev); 5675 if (need_full_reset) 5676 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5677 else 5678 clear_bit(AMDGPU_NEED_FULL_RESET, 5679 &reset_context->flags); 5680 } 5681 5682 return r; 5683 } 5684 5685 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) 5686 { 5687 struct list_head *device_list_handle; 5688 bool full_reset, vram_lost = false; 5689 struct amdgpu_device *tmp_adev; 5690 int r, init_level; 5691 5692 device_list_handle = reset_context->reset_device_list; 5693 5694 if (!device_list_handle) 5695 return -EINVAL; 5696 5697 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5698 5699 /** 5700 * If it's reset on init, it's default init level, otherwise keep level 5701 * as recovery level. 5702 */ 5703 if (reset_context->method == AMD_RESET_METHOD_ON_INIT) 5704 init_level = AMDGPU_INIT_LEVEL_DEFAULT; 5705 else 5706 init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY; 5707 5708 r = 0; 5709 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5710 amdgpu_set_init_level(tmp_adev, init_level); 5711 if (full_reset) { 5712 /* post card */ 5713 amdgpu_reset_set_dpc_status(tmp_adev, false); 5714 amdgpu_ras_clear_err_state(tmp_adev); 5715 r = amdgpu_device_asic_init(tmp_adev); 5716 if (r) { 5717 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5718 } else { 5719 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5720 5721 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5722 if (r) 5723 goto out; 5724 5725 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5726 5727 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 5728 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job); 5729 5730 if (vram_lost) { 5731 dev_info( 5732 tmp_adev->dev, 5733 "VRAM is lost due to GPU reset!\n"); 5734 amdgpu_inc_vram_lost(tmp_adev); 5735 } 5736 5737 r = amdgpu_device_fw_loading(tmp_adev); 5738 if (r) 5739 return r; 5740 5741 r = 
amdgpu_xcp_restore_partition_mode( 5742 tmp_adev->xcp_mgr); 5743 if (r) 5744 goto out; 5745 5746 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5747 if (r) 5748 goto out; 5749 5750 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 5751 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5752 5753 r = amdgpu_device_ip_resume_phase3(tmp_adev); 5754 if (r) 5755 goto out; 5756 5757 if (vram_lost) 5758 amdgpu_device_fill_reset_magic(tmp_adev); 5759 5760 /* 5761 * Add this ASIC as tracked as reset was already 5762 * complete successfully. 5763 */ 5764 amdgpu_register_gpu_instance(tmp_adev); 5765 5766 if (!reset_context->hive && 5767 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5768 amdgpu_xgmi_add_device(tmp_adev); 5769 5770 r = amdgpu_device_ip_late_init(tmp_adev); 5771 if (r) 5772 goto out; 5773 5774 r = amdgpu_userq_post_reset(tmp_adev, vram_lost); 5775 if (r) 5776 goto out; 5777 5778 drm_client_dev_resume(adev_to_drm(tmp_adev)); 5779 5780 /* 5781 * The GPU enters bad state once faulty pages 5782 * by ECC has reached the threshold, and ras 5783 * recovery is scheduled next. So add one check 5784 * here to break recovery if it indeed exceeds 5785 * bad page threshold, and remind user to 5786 * retire this GPU or setting one bigger 5787 * bad_page_threshold value to fix this once 5788 * probing driver again. 5789 */ 5790 if (!amdgpu_ras_is_rma(tmp_adev)) { 5791 /* must succeed. 
*/ 5792 amdgpu_ras_resume(tmp_adev); 5793 } else { 5794 r = -EINVAL; 5795 goto out; 5796 } 5797 5798 /* Update PSP FW topology after reset */ 5799 if (reset_context->hive && 5800 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5801 r = amdgpu_xgmi_update_topology( 5802 reset_context->hive, tmp_adev); 5803 } 5804 } 5805 5806 out: 5807 if (!r) { 5808 /* IP init is complete now, set level as default */ 5809 amdgpu_set_init_level(tmp_adev, 5810 AMDGPU_INIT_LEVEL_DEFAULT); 5811 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5812 r = amdgpu_ib_ring_tests(tmp_adev); 5813 if (r) { 5814 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5815 r = -EAGAIN; 5816 goto end; 5817 } 5818 } 5819 5820 if (r) 5821 tmp_adev->asic_reset_res = r; 5822 } 5823 5824 end: 5825 return r; 5826 } 5827 5828 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5829 struct amdgpu_reset_context *reset_context) 5830 { 5831 struct amdgpu_device *tmp_adev = NULL; 5832 bool need_full_reset, skip_hw_reset; 5833 int r = 0; 5834 5835 /* Try reset handler method first */ 5836 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5837 reset_list); 5838 5839 reset_context->reset_device_list = device_list_handle; 5840 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5841 /* If reset handler not implemented, continue; otherwise return */ 5842 if (r == -EOPNOTSUPP) 5843 r = 0; 5844 else 5845 return r; 5846 5847 /* Reset handler not implemented, use the default method */ 5848 need_full_reset = 5849 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5850 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5851 5852 /* 5853 * ASIC reset has to be done on all XGMI hive nodes ASAP 5854 * to allow proper links negotiation in FW (within 1 sec) 5855 */ 5856 if (!skip_hw_reset && need_full_reset) { 5857 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5858 /* For XGMI run all resets in parallel to speed up the process */ 5859 if 
(tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5860 if (!queue_work(system_unbound_wq, 5861 &tmp_adev->xgmi_reset_work)) 5862 r = -EALREADY; 5863 } else 5864 r = amdgpu_asic_reset(tmp_adev); 5865 5866 if (r) { 5867 dev_err(tmp_adev->dev, 5868 "ASIC reset failed with error, %d for drm dev, %s", 5869 r, adev_to_drm(tmp_adev)->unique); 5870 goto out; 5871 } 5872 } 5873 5874 /* For XGMI wait for all resets to complete before proceed */ 5875 if (!r) { 5876 list_for_each_entry(tmp_adev, device_list_handle, 5877 reset_list) { 5878 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5879 flush_work(&tmp_adev->xgmi_reset_work); 5880 r = tmp_adev->asic_reset_res; 5881 if (r) 5882 break; 5883 } 5884 } 5885 } 5886 } 5887 5888 if (!r && amdgpu_ras_intr_triggered()) { 5889 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5890 amdgpu_ras_reset_error_count(tmp_adev, 5891 AMDGPU_RAS_BLOCK__MMHUB); 5892 } 5893 5894 amdgpu_ras_intr_cleared(); 5895 } 5896 5897 r = amdgpu_device_reinit_after_reset(reset_context); 5898 if (r == -EAGAIN) 5899 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5900 else 5901 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5902 5903 out: 5904 return r; 5905 } 5906 5907 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5908 { 5909 5910 switch (amdgpu_asic_reset_method(adev)) { 5911 case AMD_RESET_METHOD_MODE1: 5912 case AMD_RESET_METHOD_LINK: 5913 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5914 break; 5915 case AMD_RESET_METHOD_MODE2: 5916 adev->mp1_state = PP_MP1_STATE_RESET; 5917 break; 5918 default: 5919 adev->mp1_state = PP_MP1_STATE_NONE; 5920 break; 5921 } 5922 } 5923 5924 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5925 { 5926 amdgpu_vf_error_trans_all(adev); 5927 adev->mp1_state = PP_MP1_STATE_NONE; 5928 } 5929 5930 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5931 { 5932 struct pci_dev *p = NULL; 5933 5934 p = 
pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5935 adev->pdev->bus->number, 1); 5936 if (p) { 5937 pm_runtime_enable(&(p->dev)); 5938 pm_runtime_resume(&(p->dev)); 5939 } 5940 5941 pci_dev_put(p); 5942 } 5943 5944 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 5945 { 5946 enum amd_reset_method reset_method; 5947 struct pci_dev *p = NULL; 5948 u64 expires; 5949 5950 /* 5951 * For now, only BACO and mode1 reset are confirmed 5952 * to suffer the audio issue without proper suspended. 5953 */ 5954 reset_method = amdgpu_asic_reset_method(adev); 5955 if ((reset_method != AMD_RESET_METHOD_BACO) && 5956 (reset_method != AMD_RESET_METHOD_MODE1)) 5957 return -EINVAL; 5958 5959 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5960 adev->pdev->bus->number, 1); 5961 if (!p) 5962 return -ENODEV; 5963 5964 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 5965 if (!expires) 5966 /* 5967 * If we cannot get the audio device autosuspend delay, 5968 * a fixed 4S interval will be used. Considering 3S is 5969 * the audio controller default autosuspend delay setting. 5970 * 4S used here is guaranteed to cover that. 5971 */ 5972 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 5973 5974 while (!pm_runtime_status_suspended(&(p->dev))) { 5975 if (!pm_runtime_suspend(&(p->dev))) 5976 break; 5977 5978 if (expires < ktime_get_mono_fast_ns()) { 5979 dev_warn(adev->dev, "failed to suspend display audio\n"); 5980 pci_dev_put(p); 5981 /* TODO: abort the succeeding gpu reset? 
*/ 5982 return -ETIMEDOUT; 5983 } 5984 } 5985 5986 pm_runtime_disable(&(p->dev)); 5987 5988 pci_dev_put(p); 5989 return 0; 5990 } 5991 5992 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 5993 { 5994 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 5995 5996 #if defined(CONFIG_DEBUG_FS) 5997 if (!amdgpu_sriov_vf(adev)) 5998 cancel_work(&adev->reset_work); 5999 #endif 6000 cancel_work(&adev->userq_reset_work); 6001 6002 if (adev->kfd.dev) 6003 cancel_work(&adev->kfd.reset_work); 6004 6005 if (amdgpu_sriov_vf(adev)) 6006 cancel_work(&adev->virt.flr_work); 6007 6008 if (con && adev->ras_enabled) 6009 cancel_work(&con->recovery_work); 6010 6011 } 6012 6013 static int amdgpu_device_health_check(struct list_head *device_list_handle) 6014 { 6015 struct amdgpu_device *tmp_adev; 6016 int ret = 0; 6017 6018 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6019 ret |= amdgpu_device_bus_status_check(tmp_adev); 6020 } 6021 6022 return ret; 6023 } 6024 6025 static void amdgpu_device_recovery_prepare(struct amdgpu_device *adev, 6026 struct list_head *device_list, 6027 struct amdgpu_hive_info *hive) 6028 { 6029 struct amdgpu_device *tmp_adev = NULL; 6030 6031 /* 6032 * Build list of devices to reset. 6033 * In case we are in XGMI hive mode, resort the device list 6034 * to put adev in the 1st position. 
6035 */ 6036 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) { 6037 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 6038 list_add_tail(&tmp_adev->reset_list, device_list); 6039 if (adev->shutdown) 6040 tmp_adev->shutdown = true; 6041 if (amdgpu_reset_in_dpc(adev)) 6042 tmp_adev->pcie_reset_ctx.in_link_reset = true; 6043 } 6044 if (!list_is_first(&adev->reset_list, device_list)) 6045 list_rotate_to_front(&adev->reset_list, device_list); 6046 } else { 6047 list_add_tail(&adev->reset_list, device_list); 6048 } 6049 } 6050 6051 static void amdgpu_device_recovery_get_reset_lock(struct amdgpu_device *adev, 6052 struct list_head *device_list) 6053 { 6054 struct amdgpu_device *tmp_adev = NULL; 6055 6056 if (list_empty(device_list)) 6057 return; 6058 tmp_adev = 6059 list_first_entry(device_list, struct amdgpu_device, reset_list); 6060 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 6061 } 6062 6063 static void amdgpu_device_recovery_put_reset_lock(struct amdgpu_device *adev, 6064 struct list_head *device_list) 6065 { 6066 struct amdgpu_device *tmp_adev = NULL; 6067 6068 if (list_empty(device_list)) 6069 return; 6070 tmp_adev = 6071 list_first_entry(device_list, struct amdgpu_device, reset_list); 6072 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 6073 } 6074 6075 static void amdgpu_device_halt_activities(struct amdgpu_device *adev, 6076 struct amdgpu_job *job, 6077 struct amdgpu_reset_context *reset_context, 6078 struct list_head *device_list, 6079 struct amdgpu_hive_info *hive, 6080 bool need_emergency_restart) 6081 { 6082 struct amdgpu_device *tmp_adev = NULL; 6083 int i; 6084 6085 /* block all schedulers and reset given job's ring */ 6086 list_for_each_entry(tmp_adev, device_list, reset_list) { 6087 amdgpu_device_set_mp1_state(tmp_adev); 6088 6089 /* 6090 * Try to put the audio codec into suspend state 6091 * before gpu reset started. 
6092 * 6093 * Due to the power domain of the graphics device 6094 * is shared with AZ power domain. Without this, 6095 * we may change the audio hardware from behind 6096 * the audio driver's back. That will trigger 6097 * some audio codec errors. 6098 */ 6099 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 6100 tmp_adev->pcie_reset_ctx.audio_suspended = true; 6101 6102 amdgpu_ras_set_error_query_ready(tmp_adev, false); 6103 6104 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 6105 6106 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context); 6107 6108 /* 6109 * Mark these ASICs to be reset as untracked first 6110 * And add them back after reset completed 6111 */ 6112 amdgpu_unregister_gpu_instance(tmp_adev); 6113 6114 drm_client_dev_suspend(adev_to_drm(tmp_adev)); 6115 6116 /* disable ras on ALL IPs */ 6117 if (!need_emergency_restart && !amdgpu_reset_in_dpc(adev) && 6118 amdgpu_device_ip_need_full_reset(tmp_adev)) 6119 amdgpu_ras_suspend(tmp_adev); 6120 6121 amdgpu_userq_pre_reset(tmp_adev); 6122 6123 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6124 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6125 6126 if (!amdgpu_ring_sched_ready(ring)) 6127 continue; 6128 6129 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 6130 6131 if (need_emergency_restart) 6132 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 6133 } 6134 atomic_inc(&tmp_adev->gpu_reset_counter); 6135 } 6136 } 6137 6138 static int amdgpu_device_asic_reset(struct amdgpu_device *adev, 6139 struct list_head *device_list, 6140 struct amdgpu_reset_context *reset_context) 6141 { 6142 struct amdgpu_device *tmp_adev = NULL; 6143 int retry_limit = AMDGPU_MAX_RETRY_LIMIT; 6144 int r = 0; 6145 6146 retry: /* Rest of adevs pre asic reset from XGMI hive. 
*/ 6147 list_for_each_entry(tmp_adev, device_list, reset_list) { 6148 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 6149 /*TODO Should we stop ?*/ 6150 if (r) { 6151 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 6152 r, adev_to_drm(tmp_adev)->unique); 6153 tmp_adev->asic_reset_res = r; 6154 } 6155 } 6156 6157 /* Actual ASIC resets if needed.*/ 6158 /* Host driver will handle XGMI hive reset for SRIOV */ 6159 if (amdgpu_sriov_vf(adev)) { 6160 6161 /* Bail out of reset early */ 6162 if (amdgpu_ras_is_rma(adev)) 6163 return -ENODEV; 6164 6165 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { 6166 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); 6167 amdgpu_ras_set_fed(adev, true); 6168 set_bit(AMDGPU_HOST_FLR, &reset_context->flags); 6169 } 6170 6171 r = amdgpu_device_reset_sriov(adev, reset_context); 6172 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) { 6173 amdgpu_virt_release_full_gpu(adev, true); 6174 goto retry; 6175 } 6176 if (r) 6177 adev->asic_reset_res = r; 6178 } else { 6179 r = amdgpu_do_asic_reset(device_list, reset_context); 6180 if (r && r == -EAGAIN) 6181 goto retry; 6182 } 6183 6184 list_for_each_entry(tmp_adev, device_list, reset_list) { 6185 /* 6186 * Drop any pending non scheduler resets queued before reset is done. 6187 * Any reset scheduled after this point would be valid. Scheduler resets 6188 * were already dropped during drm_sched_stop and no new ones can come 6189 * in before drm_sched_start. 
6190 */ 6191 amdgpu_device_stop_pending_resets(tmp_adev); 6192 } 6193 6194 return r; 6195 } 6196 6197 static int amdgpu_device_sched_resume(struct list_head *device_list, 6198 struct amdgpu_reset_context *reset_context, 6199 bool job_signaled) 6200 { 6201 struct amdgpu_device *tmp_adev = NULL; 6202 int i, r = 0; 6203 6204 /* Post ASIC reset for all devs .*/ 6205 list_for_each_entry(tmp_adev, device_list, reset_list) { 6206 6207 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6208 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6209 6210 if (!amdgpu_ring_sched_ready(ring)) 6211 continue; 6212 6213 drm_sched_start(&ring->sched, 0); 6214 } 6215 6216 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 6217 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 6218 6219 if (tmp_adev->asic_reset_res) { 6220 /* bad news, how to tell it to userspace ? 6221 * for ras error, we should report GPU bad status instead of 6222 * reset failure 6223 */ 6224 if (reset_context->src != AMDGPU_RESET_SRC_RAS || 6225 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) 6226 dev_info( 6227 tmp_adev->dev, 6228 "GPU reset(%d) failed with error %d \n", 6229 atomic_read( 6230 &tmp_adev->gpu_reset_counter), 6231 tmp_adev->asic_reset_res); 6232 amdgpu_vf_error_put(tmp_adev, 6233 AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, 6234 tmp_adev->asic_reset_res); 6235 if (!r) 6236 r = tmp_adev->asic_reset_res; 6237 tmp_adev->asic_reset_res = 0; 6238 } else { 6239 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", 6240 atomic_read(&tmp_adev->gpu_reset_counter)); 6241 if (amdgpu_acpi_smart_shift_update(tmp_adev, 6242 AMDGPU_SS_DEV_D0)) 6243 dev_warn(tmp_adev->dev, 6244 "smart shift update failed\n"); 6245 } 6246 } 6247 6248 return r; 6249 } 6250 6251 static void amdgpu_device_gpu_resume(struct amdgpu_device *adev, 6252 struct list_head *device_list, 6253 bool need_emergency_restart) 6254 { 6255 struct amdgpu_device *tmp_adev = NULL; 6256 6257 list_for_each_entry(tmp_adev, device_list, reset_list) { 
6258 /* unlock kfd: SRIOV would do it separately */ 6259 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 6260 amdgpu_amdkfd_post_reset(tmp_adev); 6261 6262 /* kfd_post_reset will do nothing if kfd device is not initialized, 6263 * need to bring up kfd here if it's not be initialized before 6264 */ 6265 if (!adev->kfd.init_complete) 6266 amdgpu_amdkfd_device_init(adev); 6267 6268 if (tmp_adev->pcie_reset_ctx.audio_suspended) 6269 amdgpu_device_resume_display_audio(tmp_adev); 6270 6271 amdgpu_device_unset_mp1_state(tmp_adev); 6272 6273 amdgpu_ras_set_error_query_ready(tmp_adev, true); 6274 6275 } 6276 } 6277 6278 6279 /** 6280 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 6281 * 6282 * @adev: amdgpu_device pointer 6283 * @job: which job trigger hang 6284 * @reset_context: amdgpu reset context pointer 6285 * 6286 * Attempt to reset the GPU if it has hung (all asics). 6287 * Attempt to do soft-reset or full-reset and reinitialize Asic 6288 * Returns 0 for success or an error on failure. 6289 */ 6290 6291 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 6292 struct amdgpu_job *job, 6293 struct amdgpu_reset_context *reset_context) 6294 { 6295 struct list_head device_list; 6296 bool job_signaled = false; 6297 struct amdgpu_hive_info *hive = NULL; 6298 int r = 0; 6299 bool need_emergency_restart = false; 6300 /* save the pasid here as the job may be freed before the end of the reset */ 6301 int pasid = job ? job->pasid : -EINVAL; 6302 6303 /* 6304 * If it reaches here because of hang/timeout and a RAS error is 6305 * detected at the same time, let RAS recovery take care of it. 
6306 */ 6307 if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) && 6308 !amdgpu_sriov_vf(adev) && 6309 reset_context->src != AMDGPU_RESET_SRC_RAS) { 6310 dev_dbg(adev->dev, 6311 "Gpu recovery from source: %d yielding to RAS error recovery handling", 6312 reset_context->src); 6313 return 0; 6314 } 6315 6316 /* 6317 * Special case: RAS triggered and full reset isn't supported 6318 */ 6319 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 6320 6321 /* 6322 * Flush RAM to disk so that after reboot 6323 * the user can read log and see why the system rebooted. 6324 */ 6325 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 6326 amdgpu_ras_get_context(adev)->reboot) { 6327 dev_warn(adev->dev, "Emergency reboot."); 6328 6329 ksys_sync_helper(); 6330 emergency_restart(); 6331 } 6332 6333 dev_info(adev->dev, "GPU %s begin!. Source: %d\n", 6334 need_emergency_restart ? "jobs stop" : "reset", 6335 reset_context->src); 6336 6337 if (!amdgpu_sriov_vf(adev)) 6338 hive = amdgpu_get_xgmi_hive(adev); 6339 if (hive) 6340 mutex_lock(&hive->hive_lock); 6341 6342 reset_context->job = job; 6343 reset_context->hive = hive; 6344 INIT_LIST_HEAD(&device_list); 6345 6346 amdgpu_device_recovery_prepare(adev, &device_list, hive); 6347 6348 if (!amdgpu_sriov_vf(adev)) { 6349 r = amdgpu_device_health_check(&device_list); 6350 if (r) 6351 goto end_reset; 6352 } 6353 6354 /* Cannot be called after locking reset domain */ 6355 amdgpu_ras_pre_reset(adev, &device_list); 6356 6357 /* We need to lock reset domain only once both for XGMI and single device */ 6358 amdgpu_device_recovery_get_reset_lock(adev, &device_list); 6359 6360 amdgpu_device_halt_activities(adev, job, reset_context, &device_list, 6361 hive, need_emergency_restart); 6362 if (need_emergency_restart) 6363 goto skip_sched_resume; 6364 /* 6365 * Must check guilty signal here since after this point all old 6366 * HW fences are force signaled. 
6367 * 6368 * job->base holds a reference to parent fence 6369 */ 6370 if (job && dma_fence_is_signaled(&job->hw_fence->base)) { 6371 job_signaled = true; 6372 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 6373 goto skip_hw_reset; 6374 } 6375 6376 r = amdgpu_device_asic_reset(adev, &device_list, reset_context); 6377 if (r) 6378 goto reset_unlock; 6379 skip_hw_reset: 6380 r = amdgpu_device_sched_resume(&device_list, reset_context, job_signaled); 6381 if (r) 6382 goto reset_unlock; 6383 skip_sched_resume: 6384 amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart); 6385 reset_unlock: 6386 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 6387 amdgpu_ras_post_reset(adev, &device_list); 6388 end_reset: 6389 if (hive) { 6390 mutex_unlock(&hive->hive_lock); 6391 amdgpu_put_xgmi_hive(hive); 6392 } 6393 6394 if (r) 6395 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 6396 6397 atomic_set(&adev->reset_domain->reset_res, r); 6398 6399 if (!r) { 6400 struct amdgpu_task_info *ti = NULL; 6401 6402 /* 6403 * The job may already be freed at this point via the sched tdr workqueue so 6404 * use the cached pasid. 6405 */ 6406 if (pasid >= 0) 6407 ti = amdgpu_vm_get_task_info_pasid(adev, pasid); 6408 6409 drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, 6410 ti ? &ti->task : NULL); 6411 6412 amdgpu_vm_put_task_info(ti); 6413 } 6414 6415 return r; 6416 } 6417 6418 /** 6419 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner 6420 * 6421 * @adev: amdgpu_device pointer 6422 * @speed: pointer to the speed of the link 6423 * @width: pointer to the width of the link 6424 * 6425 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6426 * first physical partner to an AMD dGPU. 6427 * This will exclude any virtual switches and links. 
6428 */ 6429 static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev, 6430 enum pci_bus_speed *speed, 6431 enum pcie_link_width *width) 6432 { 6433 struct pci_dev *parent = adev->pdev; 6434 6435 if (!speed || !width) 6436 return; 6437 6438 *speed = PCI_SPEED_UNKNOWN; 6439 *width = PCIE_LNK_WIDTH_UNKNOWN; 6440 6441 if (amdgpu_device_pcie_dynamic_switching_supported(adev)) { 6442 while ((parent = pci_upstream_bridge(parent))) { 6443 /* skip upstream/downstream switches internal to dGPU*/ 6444 if (parent->vendor == PCI_VENDOR_ID_ATI) 6445 continue; 6446 *speed = pcie_get_speed_cap(parent); 6447 *width = pcie_get_width_cap(parent); 6448 break; 6449 } 6450 } else { 6451 /* use the current speeds rather than max if switching is not supported */ 6452 pcie_bandwidth_available(adev->pdev, NULL, speed, width); 6453 } 6454 } 6455 6456 /** 6457 * amdgpu_device_gpu_bandwidth - find the bandwidth of the GPU 6458 * 6459 * @adev: amdgpu_device pointer 6460 * @speed: pointer to the speed of the link 6461 * @width: pointer to the width of the link 6462 * 6463 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6464 * AMD dGPU which may be a virtual upstream bridge. 
6465 */ 6466 static void amdgpu_device_gpu_bandwidth(struct amdgpu_device *adev, 6467 enum pci_bus_speed *speed, 6468 enum pcie_link_width *width) 6469 { 6470 struct pci_dev *parent = adev->pdev; 6471 6472 if (!speed || !width) 6473 return; 6474 6475 parent = pci_upstream_bridge(parent); 6476 if (parent && parent->vendor == PCI_VENDOR_ID_ATI) { 6477 /* use the upstream/downstream switches internal to dGPU */ 6478 *speed = pcie_get_speed_cap(parent); 6479 *width = pcie_get_width_cap(parent); 6480 while ((parent = pci_upstream_bridge(parent))) { 6481 if (parent->vendor == PCI_VENDOR_ID_ATI) { 6482 /* use the upstream/downstream switches internal to dGPU */ 6483 *speed = pcie_get_speed_cap(parent); 6484 *width = pcie_get_width_cap(parent); 6485 } 6486 } 6487 } else { 6488 /* use the device itself */ 6489 *speed = pcie_get_speed_cap(adev->pdev); 6490 *width = pcie_get_width_cap(adev->pdev); 6491 } 6492 } 6493 6494 /** 6495 * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot 6496 * 6497 * @adev: amdgpu_device pointer 6498 * 6499 * Fetches and stores in the driver the PCIE capabilities (gen speed 6500 * and lanes) of the slot the device is in. Handles APUs and 6501 * virtualized environments where PCIE config space may not be available. 
6502 */ 6503 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 6504 { 6505 enum pci_bus_speed speed_cap, platform_speed_cap; 6506 enum pcie_link_width platform_link_width, link_width; 6507 6508 if (amdgpu_pcie_gen_cap) 6509 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 6510 6511 if (amdgpu_pcie_lane_cap) 6512 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 6513 6514 /* covers APUs as well */ 6515 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 6516 if (adev->pm.pcie_gen_mask == 0) 6517 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 6518 if (adev->pm.pcie_mlw_mask == 0) 6519 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 6520 return; 6521 } 6522 6523 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 6524 return; 6525 6526 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap, 6527 &platform_link_width); 6528 amdgpu_device_gpu_bandwidth(adev, &speed_cap, &link_width); 6529 6530 if (adev->pm.pcie_gen_mask == 0) { 6531 /* asic caps */ 6532 if (speed_cap == PCI_SPEED_UNKNOWN) { 6533 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6534 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6535 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6536 } else { 6537 if (speed_cap == PCIE_SPEED_32_0GT) 6538 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6539 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6540 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6541 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6542 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 6543 else if (speed_cap == PCIE_SPEED_16_0GT) 6544 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6545 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6546 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6547 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 6548 else if (speed_cap == PCIE_SPEED_8_0GT) 6549 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6550 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6551 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6552 
else if (speed_cap == PCIE_SPEED_5_0GT) 6553 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6554 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 6555 else 6556 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 6557 } 6558 /* platform caps */ 6559 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 6560 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6561 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6562 } else { 6563 if (platform_speed_cap == PCIE_SPEED_32_0GT) 6564 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6565 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6566 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6567 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6568 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 6569 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 6570 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6571 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6572 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6573 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 6574 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 6575 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6576 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6577 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 6578 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 6579 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6580 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6581 else 6582 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 6583 6584 } 6585 } 6586 if (adev->pm.pcie_mlw_mask == 0) { 6587 /* asic caps */ 6588 if (link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6589 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_ASIC_PCIE_MLW_MASK; 6590 } else { 6591 switch (link_width) { 6592 case PCIE_LNK_X32: 6593 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X32 | 6594 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6595 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6596 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6597 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6598 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6599 
CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6600 break; 6601 case PCIE_LNK_X16: 6602 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6603 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6604 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6605 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6606 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6607 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6608 break; 6609 case PCIE_LNK_X12: 6610 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6611 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6612 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6613 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6614 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6615 break; 6616 case PCIE_LNK_X8: 6617 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6618 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6619 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6620 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6621 break; 6622 case PCIE_LNK_X4: 6623 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6624 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6625 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6626 break; 6627 case PCIE_LNK_X2: 6628 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6629 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6630 break; 6631 case PCIE_LNK_X1: 6632 adev->pm.pcie_mlw_mask |= CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1; 6633 break; 6634 default: 6635 break; 6636 } 6637 } 6638 /* platform caps */ 6639 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6640 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 6641 } else { 6642 switch (platform_link_width) { 6643 case PCIE_LNK_X32: 6644 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 6645 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6646 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6647 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6648 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6649 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6650 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6651 break; 6652 case PCIE_LNK_X16: 6653 adev->pm.pcie_mlw_mask |= 
(CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6654 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6655 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6656 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6657 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6658 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6659 break; 6660 case PCIE_LNK_X12: 6661 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6662 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6663 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6664 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6665 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6666 break; 6667 case PCIE_LNK_X8: 6668 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6669 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6670 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6671 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6672 break; 6673 case PCIE_LNK_X4: 6674 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6675 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6676 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6677 break; 6678 case PCIE_LNK_X2: 6679 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6680 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6681 break; 6682 case PCIE_LNK_X1: 6683 adev->pm.pcie_mlw_mask |= CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 6684 break; 6685 default: 6686 break; 6687 } 6688 } 6689 } 6690 } 6691 6692 /** 6693 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 6694 * 6695 * @adev: amdgpu_device pointer 6696 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 6697 * 6698 * Return true if @peer_adev can access (DMA) @adev through the PCIe 6699 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 6700 * @peer_adev. 
6701 */ 6702 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 6703 struct amdgpu_device *peer_adev) 6704 { 6705 #ifdef CONFIG_HSA_AMD_P2P 6706 bool p2p_access = 6707 !adev->gmc.xgmi.connected_to_cpu && 6708 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 6709 if (!p2p_access) 6710 dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n", 6711 pci_name(peer_adev->pdev)); 6712 6713 bool is_large_bar = adev->gmc.visible_vram_size && 6714 adev->gmc.real_vram_size == adev->gmc.visible_vram_size; 6715 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev); 6716 6717 if (!p2p_addressable) { 6718 uint64_t address_mask = peer_adev->dev->dma_mask ? 6719 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 6720 resource_size_t aper_limit = 6721 adev->gmc.aper_base + adev->gmc.aper_size - 1; 6722 6723 p2p_addressable = !(adev->gmc.aper_base & address_mask || 6724 aper_limit & address_mask); 6725 } 6726 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable; 6727 #else 6728 return false; 6729 #endif 6730 } 6731 6732 int amdgpu_device_baco_enter(struct amdgpu_device *adev) 6733 { 6734 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6735 6736 if (!amdgpu_device_supports_baco(adev)) 6737 return -ENOTSUPP; 6738 6739 if (ras && adev->ras_enabled && 6740 adev->nbio.funcs->enable_doorbell_interrupt) 6741 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 6742 6743 return amdgpu_dpm_baco_enter(adev); 6744 } 6745 6746 int amdgpu_device_baco_exit(struct amdgpu_device *adev) 6747 { 6748 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6749 int ret = 0; 6750 6751 if (!amdgpu_device_supports_baco(adev)) 6752 return -ENOTSUPP; 6753 6754 ret = amdgpu_dpm_baco_exit(adev); 6755 if (ret) 6756 return ret; 6757 6758 if (ras && adev->ras_enabled && 6759 adev->nbio.funcs->enable_doorbell_interrupt) 6760 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 6761 6762 if 
(amdgpu_passthrough(adev) && adev->nbio.funcs && 6763 adev->nbio.funcs->clear_doorbell_interrupt) 6764 adev->nbio.funcs->clear_doorbell_interrupt(adev); 6765 6766 return 0; 6767 } 6768 6769 /** 6770 * amdgpu_pci_error_detected - Called when a PCI error is detected. 6771 * @pdev: PCI device struct 6772 * @state: PCI channel state 6773 * 6774 * Description: Called when a PCI error is detected. 6775 * 6776 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 6777 */ 6778 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 6779 { 6780 struct drm_device *dev = pci_get_drvdata(pdev); 6781 struct amdgpu_device *adev = drm_to_adev(dev); 6782 struct amdgpu_hive_info *hive __free(xgmi_put_hive) = 6783 amdgpu_get_xgmi_hive(adev); 6784 struct amdgpu_reset_context reset_context; 6785 struct list_head device_list; 6786 6787 dev_info(adev->dev, "PCI error: detected callback!!\n"); 6788 6789 adev->pci_channel_state = state; 6790 6791 switch (state) { 6792 case pci_channel_io_normal: 6793 dev_info(adev->dev, "pci_channel_io_normal: state(%d)!!\n", state); 6794 return PCI_ERS_RESULT_CAN_RECOVER; 6795 case pci_channel_io_frozen: 6796 /* Fatal error, prepare for slot reset */ 6797 dev_info(adev->dev, "pci_channel_io_frozen: state(%d)!!\n", state); 6798 if (hive) { 6799 /* Hive devices should be able to support FW based 6800 * link reset on other devices, if not return. 6801 */ 6802 if (!amdgpu_dpm_is_link_reset_supported(adev)) { 6803 dev_warn(adev->dev, 6804 "No support for XGMI hive yet...\n"); 6805 return PCI_ERS_RESULT_DISCONNECT; 6806 } 6807 /* Set dpc status only if device is part of hive 6808 * Non-hive devices should be able to recover after 6809 * link reset. 
6810 */ 6811 amdgpu_reset_set_dpc_status(adev, true); 6812 6813 mutex_lock(&hive->hive_lock); 6814 } 6815 memset(&reset_context, 0, sizeof(reset_context)); 6816 INIT_LIST_HEAD(&device_list); 6817 6818 amdgpu_device_recovery_prepare(adev, &device_list, hive); 6819 amdgpu_device_recovery_get_reset_lock(adev, &device_list); 6820 amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list, 6821 hive, false); 6822 if (hive) 6823 mutex_unlock(&hive->hive_lock); 6824 return PCI_ERS_RESULT_NEED_RESET; 6825 case pci_channel_io_perm_failure: 6826 /* Permanent error, prepare for device removal */ 6827 dev_info(adev->dev, "pci_channel_io_perm_failure: state(%d)!!\n", state); 6828 return PCI_ERS_RESULT_DISCONNECT; 6829 } 6830 6831 return PCI_ERS_RESULT_NEED_RESET; 6832 } 6833 6834 /** 6835 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 6836 * @pdev: pointer to PCI device 6837 */ 6838 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 6839 { 6840 struct drm_device *dev = pci_get_drvdata(pdev); 6841 struct amdgpu_device *adev = drm_to_adev(dev); 6842 6843 dev_info(adev->dev, "PCI error: mmio enabled callback!!\n"); 6844 6845 /* TODO - dump whatever for debugging purposes */ 6846 6847 /* This called only if amdgpu_pci_error_detected returns 6848 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 6849 * works, no need to reset slot. 6850 */ 6851 6852 return PCI_ERS_RESULT_RECOVERED; 6853 } 6854 6855 /** 6856 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 6857 * @pdev: PCI device struct 6858 * 6859 * Description: This routine is called by the pci error recovery 6860 * code after the PCI slot has been reset, just before we 6861 * should resume normal operations. 
6862 */ 6863 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 6864 { 6865 struct drm_device *dev = pci_get_drvdata(pdev); 6866 struct amdgpu_device *adev = drm_to_adev(dev); 6867 struct amdgpu_reset_context reset_context; 6868 struct amdgpu_device *tmp_adev; 6869 struct amdgpu_hive_info *hive; 6870 struct list_head device_list; 6871 struct pci_dev *link_dev; 6872 int r = 0, i, timeout; 6873 u32 memsize; 6874 u16 status; 6875 6876 dev_info(adev->dev, "PCI error: slot reset callback!!\n"); 6877 6878 memset(&reset_context, 0, sizeof(reset_context)); 6879 6880 if (adev->pcie_reset_ctx.swus) 6881 link_dev = adev->pcie_reset_ctx.swus; 6882 else 6883 link_dev = adev->pdev; 6884 /* wait for asic to come out of reset, timeout = 10s */ 6885 timeout = 10000; 6886 do { 6887 usleep_range(10000, 10500); 6888 r = pci_read_config_word(link_dev, PCI_VENDOR_ID, &status); 6889 timeout -= 10; 6890 } while (timeout > 0 && (status != PCI_VENDOR_ID_ATI) && 6891 (status != PCI_VENDOR_ID_AMD)); 6892 6893 if ((status != PCI_VENDOR_ID_ATI) && (status != PCI_VENDOR_ID_AMD)) { 6894 r = -ETIME; 6895 goto out; 6896 } 6897 6898 amdgpu_device_load_switch_state(adev); 6899 /* Restore PCI confspace */ 6900 amdgpu_device_load_pci_state(pdev); 6901 6902 /* confirm ASIC came out of reset */ 6903 for (i = 0; i < adev->usec_timeout; i++) { 6904 memsize = amdgpu_asic_get_config_memsize(adev); 6905 6906 if (memsize != 0xffffffff) 6907 break; 6908 udelay(1); 6909 } 6910 if (memsize == 0xffffffff) { 6911 r = -ETIME; 6912 goto out; 6913 } 6914 6915 reset_context.method = AMD_RESET_METHOD_NONE; 6916 reset_context.reset_req_dev = adev; 6917 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 6918 set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags); 6919 INIT_LIST_HEAD(&device_list); 6920 6921 hive = amdgpu_get_xgmi_hive(adev); 6922 if (hive) { 6923 mutex_lock(&hive->hive_lock); 6924 reset_context.hive = hive; 6925 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 6926 
tmp_adev->pcie_reset_ctx.in_link_reset = true; 6927 list_add_tail(&tmp_adev->reset_list, &device_list); 6928 } 6929 } else { 6930 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 6931 list_add_tail(&adev->reset_list, &device_list); 6932 } 6933 6934 r = amdgpu_device_asic_reset(adev, &device_list, &reset_context); 6935 out: 6936 if (!r) { 6937 if (amdgpu_device_cache_pci_state(adev->pdev)) 6938 pci_restore_state(adev->pdev); 6939 dev_info(adev->dev, "PCIe error recovery succeeded\n"); 6940 } else { 6941 dev_err(adev->dev, "PCIe error recovery failed, err:%d\n", r); 6942 if (hive) { 6943 list_for_each_entry(tmp_adev, &device_list, reset_list) 6944 amdgpu_device_unset_mp1_state(tmp_adev); 6945 } 6946 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 6947 } 6948 6949 if (hive) { 6950 mutex_unlock(&hive->hive_lock); 6951 amdgpu_put_xgmi_hive(hive); 6952 } 6953 6954 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 6955 } 6956 6957 /** 6958 * amdgpu_pci_resume() - resume normal ops after PCI reset 6959 * @pdev: pointer to PCI device 6960 * 6961 * Called when the error recovery driver tells us that its 6962 * OK to resume normal operation. 
6963 */ 6964 void amdgpu_pci_resume(struct pci_dev *pdev) 6965 { 6966 struct drm_device *dev = pci_get_drvdata(pdev); 6967 struct amdgpu_device *adev = drm_to_adev(dev); 6968 struct list_head device_list; 6969 struct amdgpu_hive_info *hive = NULL; 6970 struct amdgpu_device *tmp_adev = NULL; 6971 6972 dev_info(adev->dev, "PCI error: resume callback!!\n"); 6973 6974 /* Only continue execution for the case of pci_channel_io_frozen */ 6975 if (adev->pci_channel_state != pci_channel_io_frozen) 6976 return; 6977 6978 INIT_LIST_HEAD(&device_list); 6979 6980 hive = amdgpu_get_xgmi_hive(adev); 6981 if (hive) { 6982 mutex_lock(&hive->hive_lock); 6983 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 6984 tmp_adev->pcie_reset_ctx.in_link_reset = false; 6985 list_add_tail(&tmp_adev->reset_list, &device_list); 6986 } 6987 } else 6988 list_add_tail(&adev->reset_list, &device_list); 6989 6990 amdgpu_device_sched_resume(&device_list, NULL, NULL); 6991 amdgpu_device_gpu_resume(adev, &device_list, false); 6992 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 6993 6994 if (hive) { 6995 mutex_unlock(&hive->hive_lock); 6996 amdgpu_put_xgmi_hive(hive); 6997 } 6998 } 6999 7000 static void amdgpu_device_cache_switch_state(struct amdgpu_device *adev) 7001 { 7002 struct pci_dev *swus, *swds; 7003 int r; 7004 7005 swds = pci_upstream_bridge(adev->pdev); 7006 if (!swds || swds->vendor != PCI_VENDOR_ID_ATI || 7007 pci_pcie_type(swds) != PCI_EXP_TYPE_DOWNSTREAM) 7008 return; 7009 swus = pci_upstream_bridge(swds); 7010 if (!swus || 7011 (swus->vendor != PCI_VENDOR_ID_ATI && 7012 swus->vendor != PCI_VENDOR_ID_AMD) || 7013 pci_pcie_type(swus) != PCI_EXP_TYPE_UPSTREAM) 7014 return; 7015 7016 /* If already saved, return */ 7017 if (adev->pcie_reset_ctx.swus) 7018 return; 7019 /* Upstream bridge is ATI, assume it's SWUS/DS architecture */ 7020 r = pci_save_state(swds); 7021 if (r) 7022 return; 7023 adev->pcie_reset_ctx.swds_pcistate = pci_store_saved_state(swds); 7024 7025 
r = pci_save_state(swus); 7026 if (r) 7027 return; 7028 adev->pcie_reset_ctx.swus_pcistate = pci_store_saved_state(swus); 7029 7030 adev->pcie_reset_ctx.swus = swus; 7031 } 7032 7033 static void amdgpu_device_load_switch_state(struct amdgpu_device *adev) 7034 { 7035 struct pci_dev *pdev; 7036 int r; 7037 7038 if (!adev->pcie_reset_ctx.swds_pcistate || 7039 !adev->pcie_reset_ctx.swus_pcistate) 7040 return; 7041 7042 pdev = adev->pcie_reset_ctx.swus; 7043 r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swus_pcistate); 7044 if (!r) { 7045 pci_restore_state(pdev); 7046 } else { 7047 dev_warn(adev->dev, "Failed to load SWUS state, err:%d\n", r); 7048 return; 7049 } 7050 7051 pdev = pci_upstream_bridge(adev->pdev); 7052 r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swds_pcistate); 7053 if (!r) 7054 pci_restore_state(pdev); 7055 else 7056 dev_warn(adev->dev, "Failed to load SWDS state, err:%d\n", r); 7057 } 7058 7059 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 7060 { 7061 struct drm_device *dev = pci_get_drvdata(pdev); 7062 struct amdgpu_device *adev = drm_to_adev(dev); 7063 int r; 7064 7065 if (amdgpu_sriov_vf(adev)) 7066 return false; 7067 7068 r = pci_save_state(pdev); 7069 if (!r) { 7070 kfree(adev->pci_state); 7071 7072 adev->pci_state = pci_store_saved_state(pdev); 7073 7074 if (!adev->pci_state) { 7075 dev_err(adev->dev, "Failed to store PCI saved state"); 7076 return false; 7077 } 7078 } else { 7079 dev_warn(adev->dev, "Failed to save PCI state, err:%d\n", r); 7080 return false; 7081 } 7082 7083 amdgpu_device_cache_switch_state(adev); 7084 7085 return true; 7086 } 7087 7088 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 7089 { 7090 struct drm_device *dev = pci_get_drvdata(pdev); 7091 struct amdgpu_device *adev = drm_to_adev(dev); 7092 int r; 7093 7094 if (!adev->pci_state) 7095 return false; 7096 7097 r = pci_load_saved_state(pdev, adev->pci_state); 7098 7099 if (!r) { 7100 pci_restore_state(pdev); 7101 } else { 7102 
dev_warn(adev->dev, "Failed to load PCI state, err:%d\n", r); 7103 return false; 7104 } 7105 7106 return true; 7107 } 7108 7109 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 7110 struct amdgpu_ring *ring) 7111 { 7112 #ifdef CONFIG_X86_64 7113 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 7114 return; 7115 #endif 7116 if (adev->gmc.xgmi.connected_to_cpu) 7117 return; 7118 7119 if (ring && ring->funcs->emit_hdp_flush) { 7120 amdgpu_ring_emit_hdp_flush(ring); 7121 return; 7122 } 7123 7124 if (!ring && amdgpu_sriov_runtime(adev)) { 7125 if (!amdgpu_kiq_hdp_flush(adev)) 7126 return; 7127 } 7128 7129 amdgpu_hdp_flush(adev, ring); 7130 } 7131 7132 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 7133 struct amdgpu_ring *ring) 7134 { 7135 #ifdef CONFIG_X86_64 7136 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 7137 return; 7138 #endif 7139 if (adev->gmc.xgmi.connected_to_cpu) 7140 return; 7141 7142 amdgpu_hdp_invalidate(adev, ring); 7143 } 7144 7145 int amdgpu_in_reset(struct amdgpu_device *adev) 7146 { 7147 return atomic_read(&adev->reset_domain->in_gpu_reset); 7148 } 7149 7150 /** 7151 * amdgpu_device_halt() - bring hardware to some kind of halt state 7152 * 7153 * @adev: amdgpu_device pointer 7154 * 7155 * Bring hardware to some kind of halt state so that no one can touch it 7156 * any more. It will help to maintain error context when error occurred. 7157 * Compare to a simple hang, the system will keep stable at least for SSH 7158 * access. Then it should be trivial to inspect the hardware state and 7159 * see what's going on. Implemented as following: 7160 * 7161 * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc), 7162 * clears all CPU mappings to device, disallows remappings through page faults 7163 * 2. amdgpu_irq_disable_all() disables all interrupts 7164 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences 7165 * 4. 
set adev->no_hw_access to avoid potential crashes after setp 5 7166 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings 7167 * 6. pci_disable_device() and pci_wait_for_pending_transaction() 7168 * flush any in flight DMA operations 7169 */ 7170 void amdgpu_device_halt(struct amdgpu_device *adev) 7171 { 7172 struct pci_dev *pdev = adev->pdev; 7173 struct drm_device *ddev = adev_to_drm(adev); 7174 7175 amdgpu_xcp_dev_unplug(adev); 7176 drm_dev_unplug(ddev); 7177 7178 amdgpu_irq_disable_all(adev); 7179 7180 amdgpu_fence_driver_hw_fini(adev); 7181 7182 adev->no_hw_access = true; 7183 7184 amdgpu_device_unmap_mmio(adev); 7185 7186 pci_disable_device(pdev); 7187 pci_wait_for_pending_transaction(pdev); 7188 } 7189 7190 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev, 7191 u32 reg) 7192 { 7193 unsigned long flags, address, data; 7194 u32 r; 7195 7196 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 7197 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 7198 7199 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 7200 WREG32(address, reg * 4); 7201 (void)RREG32(address); 7202 r = RREG32(data); 7203 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 7204 return r; 7205 } 7206 7207 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev, 7208 u32 reg, u32 v) 7209 { 7210 unsigned long flags, address, data; 7211 7212 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 7213 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 7214 7215 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 7216 WREG32(address, reg * 4); 7217 (void)RREG32(address); 7218 WREG32(data, v); 7219 (void)RREG32(data); 7220 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 7221 } 7222 7223 /** 7224 * amdgpu_device_get_gang - return a reference to the current gang 7225 * @adev: amdgpu_device pointer 7226 * 7227 * Returns: A new reference to the current gang leader. 
7228 */ 7229 struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev) 7230 { 7231 struct dma_fence *fence; 7232 7233 rcu_read_lock(); 7234 fence = dma_fence_get_rcu_safe(&adev->gang_submit); 7235 rcu_read_unlock(); 7236 return fence; 7237 } 7238 7239 /** 7240 * amdgpu_device_switch_gang - switch to a new gang 7241 * @adev: amdgpu_device pointer 7242 * @gang: the gang to switch to 7243 * 7244 * Try to switch to a new gang. 7245 * Returns: NULL if we switched to the new gang or a reference to the current 7246 * gang leader. 7247 */ 7248 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev, 7249 struct dma_fence *gang) 7250 { 7251 struct dma_fence *old = NULL; 7252 7253 dma_fence_get(gang); 7254 do { 7255 dma_fence_put(old); 7256 old = amdgpu_device_get_gang(adev); 7257 if (old == gang) 7258 break; 7259 7260 if (!dma_fence_is_signaled(old)) { 7261 dma_fence_put(gang); 7262 return old; 7263 } 7264 7265 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit, 7266 old, gang) != old); 7267 7268 /* 7269 * Drop it once for the exchanged reference in adev and once for the 7270 * thread local reference acquired in amdgpu_device_get_gang(). 7271 */ 7272 dma_fence_put(old); 7273 dma_fence_put(old); 7274 return NULL; 7275 } 7276 7277 /** 7278 * amdgpu_device_enforce_isolation - enforce HW isolation 7279 * @adev: the amdgpu device pointer 7280 * @ring: the HW ring the job is supposed to run on 7281 * @job: the job which is about to be pushed to the HW ring 7282 * 7283 * Makes sure that only one client at a time can use the GFX block. 7284 * Returns: The dependency to wait on before the job can be pushed to the HW. 7285 * The function is called multiple times until NULL is returned. 
7286 */ 7287 struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev, 7288 struct amdgpu_ring *ring, 7289 struct amdgpu_job *job) 7290 { 7291 struct amdgpu_isolation *isolation = &adev->isolation[ring->xcp_id]; 7292 struct drm_sched_fence *f = job->base.s_fence; 7293 struct dma_fence *dep; 7294 void *owner; 7295 int r; 7296 7297 /* 7298 * For now enforce isolation only for the GFX block since we only need 7299 * the cleaner shader on those rings. 7300 */ 7301 if (ring->funcs->type != AMDGPU_RING_TYPE_GFX && 7302 ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE) 7303 return NULL; 7304 7305 /* 7306 * All submissions where enforce isolation is false are handled as if 7307 * they come from a single client. Use ~0l as the owner to distinct it 7308 * from kernel submissions where the owner is NULL. 7309 */ 7310 owner = job->enforce_isolation ? f->owner : (void *)~0l; 7311 7312 mutex_lock(&adev->enforce_isolation_mutex); 7313 7314 /* 7315 * The "spearhead" submission is the first one which changes the 7316 * ownership to its client. We always need to wait for it to be 7317 * pushed to the HW before proceeding with anything. 7318 */ 7319 if (&f->scheduled != isolation->spearhead && 7320 !dma_fence_is_signaled(isolation->spearhead)) { 7321 dep = isolation->spearhead; 7322 goto out_grab_ref; 7323 } 7324 7325 if (isolation->owner != owner) { 7326 7327 /* 7328 * Wait for any gang to be assembled before switching to a 7329 * different owner or otherwise we could deadlock the 7330 * submissions. 
7331 */ 7332 if (!job->gang_submit) { 7333 dep = amdgpu_device_get_gang(adev); 7334 if (!dma_fence_is_signaled(dep)) 7335 goto out_return_dep; 7336 dma_fence_put(dep); 7337 } 7338 7339 dma_fence_put(isolation->spearhead); 7340 isolation->spearhead = dma_fence_get(&f->scheduled); 7341 amdgpu_sync_move(&isolation->active, &isolation->prev); 7342 trace_amdgpu_isolation(isolation->owner, owner); 7343 isolation->owner = owner; 7344 } 7345 7346 /* 7347 * Specifying the ring here helps to pipeline submissions even when 7348 * isolation is enabled. If that is not desired for testing NULL can be 7349 * used instead of the ring to enforce a CPU round trip while switching 7350 * between clients. 7351 */ 7352 dep = amdgpu_sync_peek_fence(&isolation->prev, ring); 7353 r = amdgpu_sync_fence(&isolation->active, &f->finished, GFP_NOWAIT); 7354 if (r) 7355 dev_warn(adev->dev, "OOM tracking isolation\n"); 7356 7357 out_grab_ref: 7358 dma_fence_get(dep); 7359 out_return_dep: 7360 mutex_unlock(&adev->enforce_isolation_mutex); 7361 return dep; 7362 } 7363 7364 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev) 7365 { 7366 switch (adev->asic_type) { 7367 #ifdef CONFIG_DRM_AMDGPU_SI 7368 case CHIP_HAINAN: 7369 #endif 7370 case CHIP_TOPAZ: 7371 /* chips with no display hardware */ 7372 return false; 7373 #ifdef CONFIG_DRM_AMDGPU_SI 7374 case CHIP_TAHITI: 7375 case CHIP_PITCAIRN: 7376 case CHIP_VERDE: 7377 case CHIP_OLAND: 7378 #endif 7379 #ifdef CONFIG_DRM_AMDGPU_CIK 7380 case CHIP_BONAIRE: 7381 case CHIP_HAWAII: 7382 case CHIP_KAVERI: 7383 case CHIP_KABINI: 7384 case CHIP_MULLINS: 7385 #endif 7386 case CHIP_TONGA: 7387 case CHIP_FIJI: 7388 case CHIP_POLARIS10: 7389 case CHIP_POLARIS11: 7390 case CHIP_POLARIS12: 7391 case CHIP_VEGAM: 7392 case CHIP_CARRIZO: 7393 case CHIP_STONEY: 7394 /* chips with display hardware */ 7395 return true; 7396 default: 7397 /* IP discovery */ 7398 if (!amdgpu_ip_version(adev, DCE_HWIP, 0) || 7399 (adev->harvest_ip_mask & 
AMD_HARVEST_IP_DMU_MASK)) 7400 return false; 7401 return true; 7402 } 7403 } 7404 7405 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev, 7406 uint32_t inst, uint32_t reg_addr, char reg_name[], 7407 uint32_t expected_value, uint32_t mask) 7408 { 7409 uint32_t ret = 0; 7410 uint32_t old_ = 0; 7411 uint32_t tmp_ = RREG32(reg_addr); 7412 uint32_t loop = adev->usec_timeout; 7413 7414 while ((tmp_ & (mask)) != (expected_value)) { 7415 if (old_ != tmp_) { 7416 loop = adev->usec_timeout; 7417 old_ = tmp_; 7418 } else 7419 udelay(1); 7420 tmp_ = RREG32(reg_addr); 7421 loop--; 7422 if (!loop) { 7423 dev_warn( 7424 adev->dev, 7425 "Register(%d) [%s] failed to reach value 0x%08x != 0x%08xn", 7426 inst, reg_name, (uint32_t)expected_value, 7427 (uint32_t)(tmp_ & (mask))); 7428 ret = -ETIMEDOUT; 7429 break; 7430 } 7431 } 7432 return ret; 7433 } 7434 7435 ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring) 7436 { 7437 ssize_t size = 0; 7438 7439 if (!ring || !ring->adev) 7440 return size; 7441 7442 if (amdgpu_device_should_recover_gpu(ring->adev)) 7443 size |= AMDGPU_RESET_TYPE_FULL; 7444 7445 if (unlikely(!ring->adev->debug_disable_soft_recovery) && 7446 !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery) 7447 size |= AMDGPU_RESET_TYPE_SOFT_RESET; 7448 7449 return size; 7450 } 7451 7452 ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset) 7453 { 7454 ssize_t size = 0; 7455 7456 if (supported_reset == 0) { 7457 size += sysfs_emit_at(buf, size, "unsupported"); 7458 size += sysfs_emit_at(buf, size, "\n"); 7459 return size; 7460 7461 } 7462 7463 if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET) 7464 size += sysfs_emit_at(buf, size, "soft "); 7465 7466 if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE) 7467 size += sysfs_emit_at(buf, size, "queue "); 7468 7469 if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE) 7470 size += sysfs_emit_at(buf, size, "pipe "); 7471 7472 if (supported_reset & AMDGPU_RESET_TYPE_FULL) 7473 size += 
sysfs_emit_at(buf, size, "full "); 7474 7475 size += sysfs_emit_at(buf, size, "\n"); 7476 return size; 7477 } 7478 7479 void amdgpu_device_set_uid(struct amdgpu_uid *uid_info, 7480 enum amdgpu_uid_type type, uint8_t inst, 7481 uint64_t uid) 7482 { 7483 if (!uid_info) 7484 return; 7485 7486 if (type >= AMDGPU_UID_TYPE_MAX) { 7487 dev_err_once(uid_info->adev->dev, "Invalid UID type %d\n", 7488 type); 7489 return; 7490 } 7491 7492 if (inst >= AMDGPU_UID_INST_MAX) { 7493 dev_err_once(uid_info->adev->dev, "Invalid UID instance %d\n", 7494 inst); 7495 return; 7496 } 7497 7498 if (uid_info->uid[type][inst] != 0) { 7499 dev_warn_once( 7500 uid_info->adev->dev, 7501 "Overwriting existing UID %llu for type %d instance %d\n", 7502 uid_info->uid[type][inst], type, inst); 7503 } 7504 7505 uid_info->uid[type][inst] = uid; 7506 } 7507 7508 u64 amdgpu_device_get_uid(struct amdgpu_uid *uid_info, 7509 enum amdgpu_uid_type type, uint8_t inst) 7510 { 7511 if (!uid_info) 7512 return 0; 7513 7514 if (type >= AMDGPU_UID_TYPE_MAX) { 7515 dev_err_once(uid_info->adev->dev, "Invalid UID type %d\n", 7516 type); 7517 return 0; 7518 } 7519 7520 if (inst >= AMDGPU_UID_INST_MAX) { 7521 dev_err_once(uid_info->adev->dev, "Invalid UID instance %d\n", 7522 inst); 7523 return 0; 7524 } 7525 7526 return uid_info->uid[type][inst]; 7527 } 7528