/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */

#include <linux/aperture.h>
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/pci-p2pdma.h>
#include <linux/apple-gmux.h>

#include <drm/drm_atomic_helper.h>
#include <drm/drm_client_event.h>
#include <drm/drm_crtc_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/device.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"
#include "amdgpu_virt.h"
#include "amdgpu_dev_coredump.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

#if IS_ENABLED(CONFIG_X86)
#include <asm/intel-family.h>
#include <asm/cpu_device_id.h>
#endif

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
#define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
#define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
#define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)

#define AMDGPU_VBIOS_SKIP (1U << 0)
#define AMDGPU_VBIOS_OPTIONAL (1U << 1)

static const struct drm_driver amdgpu_kms_driver;

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"ALDEBARAN",
	"NAVI10",
	"CYAN_SKILLFISH",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"VANGOGH",
	"DIMGREY_CAVEFISH",
	"BEIGE_GOBY",
	"YELLOW_CARP",
	"IP DISCOVERY",
	"LAST",
};

#define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0)
/*
 * Default init level where all blocks are expected to be initialized. This is
 * the level of initialization expected by default and also after a full reset
 * of the device.
 */
struct amdgpu_init_level amdgpu_init_default = {
	.level = AMDGPU_INIT_LEVEL_DEFAULT,
	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

struct amdgpu_init_level amdgpu_init_recovery = {
	.level = AMDGPU_INIT_LEVEL_RESET_RECOVERY,
	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

/*
 * Minimal blocks needed to be initialized before an XGMI hive can be reset.
 * This is used for cases like reset on initialization where the entire hive
 * needs to be reset before first use.
 */
struct amdgpu_init_level amdgpu_init_minimal_xgmi = {
	.level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI,
	.hwini_ip_block_mask =
		BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) |
		BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) |
		BIT(AMD_IP_BLOCK_TYPE_PSP)
};

static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev,
					     enum amd_ip_block_type block)
{
	return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0;
}

void amdgpu_set_init_level(struct amdgpu_device *adev,
			   enum amdgpu_init_lvl_id lvl)
{
	switch (lvl) {
	case AMDGPU_INIT_LEVEL_MINIMAL_XGMI:
		adev->init_lvl = &amdgpu_init_minimal_xgmi;
		break;
	case AMDGPU_INIT_LEVEL_RESET_RECOVERY:
		adev->init_lvl = &amdgpu_init_recovery;
		break;
	case AMDGPU_INIT_LEVEL_DEFAULT:
		fallthrough;
	default:
		adev->init_lvl = &amdgpu_init_default;
		break;
	}
}

static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
				     void *data);

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
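 *
 * For example, reading /sys/class/drm/card0/device/pcie_replay_count (the
 * card index is illustrative and varies per system) returns the current
 * count as a plain decimal number.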
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return sysfs_emit(buf, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, 0444,
		   amdgpu_device_get_pcie_replay_count, NULL);

static int amdgpu_device_attr_sysfs_init(struct amdgpu_device *adev)
{
	int ret = 0;

	if (amdgpu_nbio_is_replay_cnt_supported(adev))
		ret = sysfs_create_file(&adev->dev->kobj,
					&dev_attr_pcie_replay_count.attr);

	return ret;
}

static void amdgpu_device_attr_sysfs_fini(struct amdgpu_device *adev)
{
	if (amdgpu_nbio_is_replay_cnt_supported(adev))
		sysfs_remove_file(&adev->dev->kobj,
				  &dev_attr_pcie_replay_count.attr);
}

static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
					  const struct bin_attribute *attr, char *buf,
					  loff_t ppos, size_t count)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	ssize_t bytes_read;

	switch (ppos) {
	case AMDGPU_SYS_REG_STATE_XGMI:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_WAFL:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_PCIE:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR_1:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count);
		break;
	default:
		return -EINVAL;
	}

	return bytes_read;
}

static const BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL,
		      AMDGPU_SYS_REG_STATE_END);

int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev)
{
	int ret;

	if (!amdgpu_asic_get_reg_state_supported(adev))
		return 0;

	ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state);

	return ret;
}

void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
{
	if (!amdgpu_asic_get_reg_state_supported(adev))
		return;
	sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
}

int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block)
{
	int r;

	if (ip_block->version->funcs->suspend) {
		r = ip_block->version->funcs->suspend(ip_block);
		if (r) {
			dev_err(ip_block->adev->dev,
				"suspend of IP block <%s> failed %d\n",
				ip_block->version->funcs->name, r);
			return r;
		}
	}

	ip_block->status.hw = false;
	return 0;
}

int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block)
{
	int r;

	if (ip_block->version->funcs->resume) {
		r = ip_block->version->funcs->resume(ip_block);
		if (r) {
			dev_err(ip_block->adev->dev,
				"resume of IP block <%s> failed %d\n",
				ip_block->version->funcs->name, r);
			return r;
		}
	}

	ip_block->status.hw = true;
	return 0;
}

/**
 * DOC: board_info
 *
 * The amdgpu driver provides a sysfs API for reporting board related information.
 * It provides the form factor information in the format
 *
 *   type : form factor
 *
 * Possible form factor values
 *
 * - "cem"     - PCIE CEM card
 * - "oam"     - Open Compute Accelerator Module
 * - "unknown" - Not known
 *
 */

static ssize_t amdgpu_device_get_board_info(struct device *dev,
					    struct device_attribute *attr,
					    char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
	const char *pkg;

	if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
		pkg_type = adev->smuio.funcs->get_pkg_type(adev);

	switch (pkg_type) {
	case AMDGPU_PKG_TYPE_CEM:
		pkg = "cem";
		break;
	case AMDGPU_PKG_TYPE_OAM:
		pkg = "oam";
		break;
	default:
		pkg = "unknown";
		break;
	}

	return sysfs_emit(buf, "%s : %s\n", "type", pkg);
}

static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);

static struct attribute *amdgpu_board_attrs[] = {
	&dev_attr_board_info.attr,
	NULL,
};

static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
					     struct attribute *attr, int n)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	if (adev->flags & AMD_IS_APU)
		return 0;

	return attr->mode;
}

static const struct attribute_group amdgpu_board_attrs_group = {
	.attrs = amdgpu_board_attrs,
	.is_visible = amdgpu_board_attrs_is_visible
};

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);


/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_px(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise returns false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE))
		return false;

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Return:
 * 1 if the device supports BACO;
 * 3 if the device supports MACO (only works if BACO is supported);
 * otherwise return 0.
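 *
 * Illustrative interpretation of the return value (a sketch; MACO_SUPPORT
 * and BACO_SUPPORT are existing flags, the helpers below are hypothetical):
 *
 *   int baco_support = amdgpu_device_supports_baco(dev);
 *
 *   if (baco_support & MACO_SUPPORT)
 *           use_bamaco();   // hypothetical: BACO and MACO both usable
 *   else if (baco_support == BACO_SUPPORT)
 *           use_baco();     // hypothetical: plain BACO only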
 */
int amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
{
	struct drm_device *dev;
	int bamaco_support;

	dev = adev_to_drm(adev);

	adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
	bamaco_support = amdgpu_device_supports_baco(dev);

	switch (amdgpu_runtime_pm) {
	case 2:
		if (bamaco_support & MACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
			dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
		} else if (bamaco_support == BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Requested mode BAMACO not available, falling back to BACO\n");
		}
		break;
	case 1:
		if (bamaco_support & BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Forcing BACO for runtime pm\n");
		}
		break;
	case -1:
	case -2:
		if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
			dev_info(adev->dev, "Using ATPX for runtime pm\n");
		} else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
			dev_info(adev->dev, "Using BOCO for runtime pm\n");
		} else {
			if (!bamaco_support)
				goto no_runtime_pm;

			switch (adev->asic_type) {
			case CHIP_VEGA20:
			case CHIP_ARCTURUS:
				/* BACO is not supported on vega20 and arcturus */
				break;
			case CHIP_VEGA10:
				/* enable BACO as runpm mode if noretry=0 */
				if (!adev->gmc.noretry && !amdgpu_passthrough(adev))
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			default:
				/* enable BACO as runpm mode on CI+ */
				if (!amdgpu_passthrough(adev))
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			}

			if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
				if (bamaco_support & MACO_SUPPORT) {
					adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
					dev_info(adev->dev, "Using BAMACO for runtime pm\n");
				} else {
					dev_info(adev->dev, "Using BACO for runtime pm\n");
				}
			}
		}
		break;
	case 0:
		dev_info(adev->dev, "runtime pm is manually disabled\n");
		break;
	default:
		break;
	}

no_runtime_pm:
	if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
		dev_info(adev->dev, "Runtime PM not available\n");
}

/**
 * amdgpu_device_supports_smart_shift - Is the device dGPU with
 * smart shift support
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
{
	return (amdgpu_device_supports_boco(dev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, must not exceed sizeof(@buf)
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, must not exceed sizeof(@buf)
 * @write: true - write to vram, otherwise - read from vram
 *
 * Returns the number of bytes that have been transferred.
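 *
 * Only the part of the buffer that falls within the CPU-visible VRAM
 * aperture is copied; callers are expected to fall back to
 * amdgpu_device_mm_access() for the remainder, as amdgpu_device_vram_access()
 * below does.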
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, must not exceed sizeof(@buf)
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
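 *
 * Minimal usage sketch (illustrative; @reg is a dword offset, and passing
 * AMDGPU_REGS_NO_KIQ bypasses the KIQ path under SR-IOV):
 *
 *   u32 val = amdgpu_device_rreg(adev, reg, AMDGPU_REGS_NO_KIQ);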
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}


/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
				uint32_t reg, uint32_t acc_flags,
				uint32_t xcc_id)
{
	uint32_t ret, rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	return ret;
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/**
 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 * @xcc_id: xcc accelerated compute core id
 *
 * This function is invoked only for debugfs register access.
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v,
			     uint32_t xcc_id)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_xcc_wreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t v,
			    uint32_t acc_flags, uint32_t xcc_id)
{
	uint32_t rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, true,
							 &rlcg_flag)) {
			amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u32 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
				    u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	if (unlikely(!adev->nbio.funcs)) {
		pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
		pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
	} else {
		pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
		pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	}

	if (reg_addr >> 32) {
		if (unlikely(!adev->nbio.funcs))
			pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
		else
			pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	} else {
		pcie_index_hi = 0;
	}

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64 bit indirect
 * register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
				      u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
			pcie_index_hi * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r |= ((u64)readl(pcie_data_offset) << 32);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
				     u64 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64 bit indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
				       u64 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_rev_id - query device rev_id
 *
 * @adev: amdgpu_device pointer
 *
 * Return device rev_id
 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
{
	return adev->nbio.funcs->get_rev_id(adev);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

static uint32_t amdgpu_device_get_vbios_flags(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
		return AMDGPU_VBIOS_SKIP;

	if (hweight32(adev->aid_mask) && amdgpu_passthrough(adev))
		return AMDGPU_VBIOS_OPTIONAL;

	return 0;
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	uint32_t flags;
	bool optional;
	int ret;

	amdgpu_asic_pre_asic_init(adev);
	flags = amdgpu_device_get_vbios_flags(adev);
	optional = !!(flags & (AMDGPU_VBIOS_OPTIONAL | AMDGPU_VBIOS_SKIP));

	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
		amdgpu_psp_wait_for_bootloader(adev);
		if (optional && !adev->bios)
			return 0;

		ret = amdgpu_atomfirmware_asic_init(adev, true);
		return ret;
	} else {
		if (optional && !adev->bios)
			return 0;

		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
	}

	return 0;
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with and/or masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long flags, offset;

	spin_lock_irqsave(&adev->wb.lock, flags);
	offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	unsigned long flags;

	wb >>= 3;
	spin_lock_irqsave(&adev->wb.lock, flags);
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
	spin_unlock_irqrestore(&adev->wb.lock, flags);
}

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
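 *
 * Illustrative caller-side handling (a sketch, not taken verbatim from the
 * probe path):
 *
 *   r = amdgpu_device_resize_fb_bar(adev);
 *   if (r)
 *           return r;   // BARs unusable, abort driver load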
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned int i;
	u16 cmd;
	int r;

	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
		return 0;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	if (!amdgpu_rebar)
		return 0;

	/* resizing on Dell G5 SE platforms causes problems with runtime pm */
	if ((amdgpu_runtime_pm != 0) &&
	    adev->pdev->vendor == PCI_VENDOR_ID_ATI &&
	    adev->pdev->device == 0x731f &&
	    adev->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL)
		return 0;

	/* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
	if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
		DRM_WARN("System can't access extended configuration space, please check!!\n");

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

/*
 * GPU helpers function.
 */
/**
 * amdgpu_device_need_post - check if the hw need post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or if post is needed because a hw reset was performed.
 * Returns true if post is needed, false if not.
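 *
 * Typical usage sketch (illustrative, modeled on the init path):
 *
 *   if (amdgpu_device_need_post(adev))
 *           r = amdgpu_device_asic_init(adev);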
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg, flags;

	if (amdgpu_sriov_vf(adev))
		return false;

	flags = amdgpu_device_get_vbios_flags(adev);
	if (flags & AMDGPU_VBIOS_SKIP)
		return false;
	if ((flags & AMDGPU_VBIOS_OPTIONAL) && !adev->bios)
		return false;

	if (amdgpu_passthrough(adev)) {
		/* For FIJI: In the whole-GPU pass-through virtualization case, after a VM
		 * reboot some old SMC firmware still needs the driver to do vPost, otherwise
		 * the GPU hangs, while SMC firmware versions above 22.15 don't have this
		 * flaw, so force vPost for SMC versions below 22.15.
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			release_firmware(adev->pm.fw);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/*
 * Check whether seamless boot is supported.
 *
 * So far we only support seamless boot on DCE 3.0 or later.
 * If users report that it works on older ASICS as well, we may
 * loosen this.
 */
bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
{
	switch (amdgpu_seamless) {
	case -1:
		break;
	case 1:
		return true;
	case 0:
		return false;
	default:
		DRM_ERROR("Invalid value for amdgpu.seamless: %d\n",
			  amdgpu_seamless);
		return false;
	}

	if (!(adev->flags & AMD_IS_APU))
		return false;

	if (adev->mman.keep_stolen_vga_memory)
		return false;

	return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
}

/*
 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
 * don't support dynamic speed switching. Until we have confirmation from Intel
 * that a specific host supports it, it's safer that we keep it disabled for all.
 *
 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
 */
static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	/* eGPUs change speeds based on USB4 fabric conditions */
	if (dev_is_removable(adev->dev))
		return true;

	if (c->x86_vendor == X86_VENDOR_INTEL)
		return false;
#endif
	return true;
}

static bool amdgpu_device_aspm_support_quirk(struct amdgpu_device *adev)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	if (!(amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 0) ||
	      amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 1)))
		return false;

	if (c->x86 == 6 &&
	    adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5) {
		switch (c->x86_model) {
		case VFM_MODEL(INTEL_ALDERLAKE):
		case VFM_MODEL(INTEL_ALDERLAKE_L):
		case VFM_MODEL(INTEL_RAPTORLAKE):
		case VFM_MODEL(INTEL_RAPTORLAKE_P):
		case VFM_MODEL(INTEL_RAPTORLAKE_S):
			return true;
		default:
			return false;
		}
	} else {
		return false;
	}
#else
	return false;
#endif
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
	switch (amdgpu_aspm) {
	case -1:
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	if (adev->flags & AMD_IS_APU)
		return false;
	if (amdgpu_device_aspm_support_quirk(adev))
		return false;
	return pcie_aspm_enabled(adev->pdev);
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
						 bool state)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));

	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines the number of bits in the page table versus the
 * page directory; a page is 4KB, so we have 12 bits of offset, a minimum of
 * 9 bits in the page table and the remaining bits in the page directory.
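 *
 * For example, with amdgpu_vm_block_size=9 a last-level page table covers
 * 2^9 entries * 4KB = 2MB of address space.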
1968 */ 1969 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1970 { 1971 /* defines number of bits in page table versus page directory, 1972 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1973 * page table and the remaining bits are in the page directory 1974 */ 1975 if (amdgpu_vm_block_size == -1) 1976 return; 1977 1978 if (amdgpu_vm_block_size < 9) { 1979 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1980 amdgpu_vm_block_size); 1981 amdgpu_vm_block_size = -1; 1982 } 1983 } 1984 1985 /** 1986 * amdgpu_device_check_vm_size - validate the vm size 1987 * 1988 * @adev: amdgpu_device pointer 1989 * 1990 * Validates the vm size in GB specified via module parameter. 1991 * The VM size is the size of the GPU virtual memory space in GB. 1992 */ 1993 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1994 { 1995 /* no need to check the default value */ 1996 if (amdgpu_vm_size == -1) 1997 return; 1998 1999 if (amdgpu_vm_size < 1) { 2000 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 2001 amdgpu_vm_size); 2002 amdgpu_vm_size = -1; 2003 } 2004 } 2005 2006 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 2007 { 2008 struct sysinfo si; 2009 bool is_os_64 = (sizeof(void *) == 8); 2010 uint64_t total_memory; 2011 uint64_t dram_size_seven_GB = 0x1B8000000; 2012 uint64_t dram_size_three_GB = 0xB8000000; 2013 2014 if (amdgpu_smu_memory_pool_size == 0) 2015 return; 2016 2017 if (!is_os_64) { 2018 DRM_WARN("Not 64-bit OS, feature not supported\n"); 2019 goto def_value; 2020 } 2021 si_meminfo(&si); 2022 total_memory = (uint64_t)si.totalram * si.mem_unit; 2023 2024 if ((amdgpu_smu_memory_pool_size == 1) || 2025 (amdgpu_smu_memory_pool_size == 2)) { 2026 if (total_memory < dram_size_three_GB) 2027 goto def_value1; 2028 } else if ((amdgpu_smu_memory_pool_size == 4) || 2029 (amdgpu_smu_memory_pool_size == 8)) { 2030 if (total_memory < dram_size_seven_GB) 2031 goto def_value1; 2032 } else { 2033 DRM_WARN("Smu memory pool size not supported\n"); 2034 goto def_value; 2035 } 2036 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 2037 2038 return; 2039 2040 def_value1: 2041 DRM_WARN("No enough system memory\n"); 2042 def_value: 2043 adev->pm.smu_prv_buffer_size = 0; 2044 } 2045 2046 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 2047 { 2048 if (!(adev->flags & AMD_IS_APU) || 2049 adev->asic_type < CHIP_RAVEN) 2050 return 0; 2051 2052 switch (adev->asic_type) { 2053 case CHIP_RAVEN: 2054 if (adev->pdev->device == 0x15dd) 2055 adev->apu_flags |= AMD_APU_IS_RAVEN; 2056 if (adev->pdev->device == 0x15d8) 2057 adev->apu_flags |= AMD_APU_IS_PICASSO; 2058 break; 2059 case CHIP_RENOIR: 2060 if ((adev->pdev->device == 0x1636) || 2061 (adev->pdev->device == 0x164c)) 2062 adev->apu_flags |= AMD_APU_IS_RENOIR; 2063 else 2064 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 2065 break; 2066 case CHIP_VANGOGH: 2067 adev->apu_flags |= AMD_APU_IS_VANGOGH; 2068 break; 2069 case CHIP_YELLOW_CARP: 2070 break; 2071 case CHIP_CYAN_SKILLFISH: 2072 if ((adev->pdev->device == 0x13FE) || 2073 (adev->pdev->device == 0x143F)) 2074 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 2075 break; 2076 default: 2077 break; 2078 } 2079 2080 return 0; 2081 } 2082 2083 /** 2084 * amdgpu_device_check_arguments - validate module params 2085 * 2086 * @adev: amdgpu_device pointer 2087 * 2088 * Validates certain module parameters and updates 2089 * the associated values used by the driver (all asics). 
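 * For example, an amdgpu.sched_jobs value below 4 is raised to 4, and a
 * value that is not a power of two is rounded up to the next power of two
 * instead of failing device init; see the checks below.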
2090 */ 2091 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 2092 { 2093 int i; 2094 2095 if (amdgpu_sched_jobs < 4) { 2096 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 2097 amdgpu_sched_jobs); 2098 amdgpu_sched_jobs = 4; 2099 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 2100 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 2101 amdgpu_sched_jobs); 2102 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 2103 } 2104 2105 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 2106 /* gart size must be greater or equal to 32M */ 2107 dev_warn(adev->dev, "gart size (%d) too small\n", 2108 amdgpu_gart_size); 2109 amdgpu_gart_size = -1; 2110 } 2111 2112 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 2113 /* gtt size must be greater or equal to 32M */ 2114 dev_warn(adev->dev, "gtt size (%d) too small\n", 2115 amdgpu_gtt_size); 2116 amdgpu_gtt_size = -1; 2117 } 2118 2119 /* valid range is between 4 and 9 inclusive */ 2120 if (amdgpu_vm_fragment_size != -1 && 2121 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 2122 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 2123 amdgpu_vm_fragment_size = -1; 2124 } 2125 2126 if (amdgpu_sched_hw_submission < 2) { 2127 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 2128 amdgpu_sched_hw_submission); 2129 amdgpu_sched_hw_submission = 2; 2130 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 2131 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 2132 amdgpu_sched_hw_submission); 2133 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 2134 } 2135 2136 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 2137 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 2138 amdgpu_reset_method = -1; 2139 } 2140 2141 amdgpu_device_check_smu_prv_buffer_size(adev); 2142 2143 amdgpu_device_check_vm_size(adev); 2144 2145 amdgpu_device_check_block_size(adev); 2146 2147 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 2148 2149 for (i = 0; i < MAX_XCP; i++) { 2150 switch (amdgpu_enforce_isolation) { 2151 case -1: 2152 case 0: 2153 default: 2154 /* disable */ 2155 adev->enforce_isolation[i] = AMDGPU_ENFORCE_ISOLATION_DISABLE; 2156 break; 2157 case 1: 2158 /* enable */ 2159 adev->enforce_isolation[i] = 2160 AMDGPU_ENFORCE_ISOLATION_ENABLE; 2161 break; 2162 case 2: 2163 /* enable legacy mode */ 2164 adev->enforce_isolation[i] = 2165 AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY; 2166 break; 2167 case 3: 2168 /* enable only process isolation without submitting cleaner shader */ 2169 adev->enforce_isolation[i] = 2170 AMDGPU_ENFORCE_ISOLATION_NO_CLEANER_SHADER; 2171 break; 2172 } 2173 } 2174 2175 return 0; 2176 } 2177 2178 /** 2179 * amdgpu_switcheroo_set_state - set switcheroo state 2180 * 2181 * @pdev: pci dev pointer 2182 * @state: vga_switcheroo state 2183 * 2184 * Callback for the switcheroo driver. Suspends or resumes 2185 * the asics before or after it is powered up using ACPI methods. 
2186 */ 2187 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 2188 enum vga_switcheroo_state state) 2189 { 2190 struct drm_device *dev = pci_get_drvdata(pdev); 2191 int r; 2192 2193 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF) 2194 return; 2195 2196 if (state == VGA_SWITCHEROO_ON) { 2197 pr_info("switched on\n"); 2198 /* don't suspend or resume card normally */ 2199 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2200 2201 pci_set_power_state(pdev, PCI_D0); 2202 amdgpu_device_load_pci_state(pdev); 2203 r = pci_enable_device(pdev); 2204 if (r) 2205 DRM_WARN("pci_enable_device failed (%d)\n", r); 2206 amdgpu_device_resume(dev, true); 2207 2208 dev->switch_power_state = DRM_SWITCH_POWER_ON; 2209 } else { 2210 pr_info("switched off\n"); 2211 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2212 amdgpu_device_prepare(dev); 2213 amdgpu_device_suspend(dev, true); 2214 amdgpu_device_cache_pci_state(pdev); 2215 /* Shut down the device */ 2216 pci_disable_device(pdev); 2217 pci_set_power_state(pdev, PCI_D3cold); 2218 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 2219 } 2220 } 2221 2222 /** 2223 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 2224 * 2225 * @pdev: pci dev pointer 2226 * 2227 * Callback for the switcheroo driver. Check of the switcheroo 2228 * state can be changed. 2229 * Returns true if the state can be changed, false if not. 2230 */ 2231 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 2232 { 2233 struct drm_device *dev = pci_get_drvdata(pdev); 2234 2235 /* 2236 * FIXME: open_count is protected by drm_global_mutex but that would lead to 2237 * locking inversion with the driver load path. And the access here is 2238 * completely racy anyway. So don't bother with locking for now. 2239 */ 2240 return atomic_read(&dev->open_count) == 0; 2241 } 2242 2243 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 2244 .set_gpu_state = amdgpu_switcheroo_set_state, 2245 .reprobe = NULL, 2246 .can_switch = amdgpu_switcheroo_can_switch, 2247 }; 2248 2249 /** 2250 * amdgpu_device_ip_set_clockgating_state - set the CG state 2251 * 2252 * @dev: amdgpu_device pointer 2253 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2254 * @state: clockgating state (gate or ungate) 2255 * 2256 * Sets the requested clockgating state for all instances of 2257 * the hardware IP specified. 2258 * Returns the error code from the last instance. 2259 */ 2260 int amdgpu_device_ip_set_clockgating_state(void *dev, 2261 enum amd_ip_block_type block_type, 2262 enum amd_clockgating_state state) 2263 { 2264 struct amdgpu_device *adev = dev; 2265 int i, r = 0; 2266 2267 for (i = 0; i < adev->num_ip_blocks; i++) { 2268 if (!adev->ip_blocks[i].status.valid) 2269 continue; 2270 if (adev->ip_blocks[i].version->type != block_type) 2271 continue; 2272 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 2273 continue; 2274 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 2275 &adev->ip_blocks[i], state); 2276 if (r) 2277 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n", 2278 adev->ip_blocks[i].version->funcs->name, r); 2279 } 2280 return r; 2281 } 2282 2283 /** 2284 * amdgpu_device_ip_set_powergating_state - set the PG state 2285 * 2286 * @dev: amdgpu_device pointer 2287 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2288 * @state: powergating state (gate or ungate) 2289 * 2290 * Sets the requested powergating state for all instances of 2291 * the hardware IP specified. 
2292 * Returns the error code from the last instance. 2293 */ 2294 int amdgpu_device_ip_set_powergating_state(void *dev, 2295 enum amd_ip_block_type block_type, 2296 enum amd_powergating_state state) 2297 { 2298 struct amdgpu_device *adev = dev; 2299 int i, r = 0; 2300 2301 for (i = 0; i < adev->num_ip_blocks; i++) { 2302 if (!adev->ip_blocks[i].status.valid) 2303 continue; 2304 if (adev->ip_blocks[i].version->type != block_type) 2305 continue; 2306 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 2307 continue; 2308 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 2309 &adev->ip_blocks[i], state); 2310 if (r) 2311 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n", 2312 adev->ip_blocks[i].version->funcs->name, r); 2313 } 2314 return r; 2315 } 2316 2317 /** 2318 * amdgpu_device_ip_get_clockgating_state - get the CG state 2319 * 2320 * @adev: amdgpu_device pointer 2321 * @flags: clockgating feature flags 2322 * 2323 * Walks the list of IPs on the device and updates the clockgating 2324 * flags for each IP. 2325 * Updates @flags with the feature flags for each hardware IP where 2326 * clockgating is enabled. 2327 */ 2328 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 2329 u64 *flags) 2330 { 2331 int i; 2332 2333 for (i = 0; i < adev->num_ip_blocks; i++) { 2334 if (!adev->ip_blocks[i].status.valid) 2335 continue; 2336 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 2337 adev->ip_blocks[i].version->funcs->get_clockgating_state( 2338 &adev->ip_blocks[i], flags); 2339 } 2340 } 2341 2342 /** 2343 * amdgpu_device_ip_wait_for_idle - wait for idle 2344 * 2345 * @adev: amdgpu_device pointer 2346 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2347 * 2348 * Waits for the requested hardware IP to be idle. 2349 * Returns 0 for success or a negative error code on failure. 2350 */ 2351 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 2352 enum amd_ip_block_type block_type) 2353 { 2354 int i, r; 2355 2356 for (i = 0; i < adev->num_ip_blocks; i++) { 2357 if (!adev->ip_blocks[i].status.valid) 2358 continue; 2359 if (adev->ip_blocks[i].version->type == block_type) { 2360 if (adev->ip_blocks[i].version->funcs->wait_for_idle) { 2361 r = adev->ip_blocks[i].version->funcs->wait_for_idle( 2362 &adev->ip_blocks[i]); 2363 if (r) 2364 return r; 2365 } 2366 break; 2367 } 2368 } 2369 return 0; 2370 2371 } 2372 2373 /** 2374 * amdgpu_device_ip_is_valid - is the hardware IP enabled 2375 * 2376 * @adev: amdgpu_device pointer 2377 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2378 * 2379 * Checks whether the hardware IP is enabled or not. 2380 * Returns true if the IP is enabled, false if not. 2381 */ 2382 bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev, 2383 enum amd_ip_block_type block_type) 2384 { 2385 int i; 2386 2387 for (i = 0; i < adev->num_ip_blocks; i++) { 2388 if (adev->ip_blocks[i].version->type == block_type) 2389 return adev->ip_blocks[i].status.valid; 2390 } 2391 return false; 2392 2393 } 2394 2395 /** 2396 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 2397 * 2398 * @adev: amdgpu_device pointer 2399 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 2400 * 2401 * Returns a pointer to the hardware IP block structure 2402 * if it exists for the asic, otherwise NULL.
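 * Illustrative usage (the error handling here is only an example):
 *
 *   ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *   if (!ip_block)
 *           return -EINVAL;
 *
 * Callers must always be prepared for a NULL return.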
2403 */ 2404 struct amdgpu_ip_block * 2405 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 2406 enum amd_ip_block_type type) 2407 { 2408 int i; 2409 2410 for (i = 0; i < adev->num_ip_blocks; i++) 2411 if (adev->ip_blocks[i].version->type == type) 2412 return &adev->ip_blocks[i]; 2413 2414 return NULL; 2415 } 2416 2417 /** 2418 * amdgpu_device_ip_block_version_cmp 2419 * 2420 * @adev: amdgpu_device pointer 2421 * @type: enum amd_ip_block_type 2422 * @major: major version 2423 * @minor: minor version 2424 * 2425 * return 0 if equal or greater 2426 * return 1 if smaller or the ip_block doesn't exist 2427 */ 2428 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 2429 enum amd_ip_block_type type, 2430 u32 major, u32 minor) 2431 { 2432 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 2433 2434 if (ip_block && ((ip_block->version->major > major) || 2435 ((ip_block->version->major == major) && 2436 (ip_block->version->minor >= minor)))) 2437 return 0; 2438 2439 return 1; 2440 } 2441 2442 /** 2443 * amdgpu_device_ip_block_add 2444 * 2445 * @adev: amdgpu_device pointer 2446 * @ip_block_version: pointer to the IP to add 2447 * 2448 * Adds the IP block driver information to the collection of IPs 2449 * on the asic. 2450 */ 2451 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 2452 const struct amdgpu_ip_block_version *ip_block_version) 2453 { 2454 if (!ip_block_version) 2455 return -EINVAL; 2456 2457 switch (ip_block_version->type) { 2458 case AMD_IP_BLOCK_TYPE_VCN: 2459 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2460 return 0; 2461 break; 2462 case AMD_IP_BLOCK_TYPE_JPEG: 2463 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2464 return 0; 2465 break; 2466 default: 2467 break; 2468 } 2469 2470 dev_info(adev->dev, "detected ip block number %d <%s>\n", 2471 adev->num_ip_blocks, ip_block_version->funcs->name); 2472 2473 adev->ip_blocks[adev->num_ip_blocks].adev = adev; 2474 2475 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2476 2477 return 0; 2478 } 2479 2480 /** 2481 * amdgpu_device_enable_virtual_display - enable virtual display feature 2482 * 2483 * @adev: amdgpu_device pointer 2484 * 2485 * Enabled the virtual display feature if the user has enabled it via 2486 * the module parameter virtual_display. This feature provides a virtual 2487 * display hardware on headless boards or in virtualized environments. 2488 * This function parses and validates the configuration string specified by 2489 * the user and configures the virtual display configuration (number of 2490 * virtual connectors, crtcs, etc.) specified. 
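 * The option string is a ';'-separated list of entries of the form
 * <pci address>[,<number of crtcs>], where the keyword "all" matches every
 * device. E.g. amdgpu.virtual_display=0000:03:00.0,2 requests two virtual
 * CRTCs on that device (the PCI address here is only an example).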
2491 */ 2492 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2493 { 2494 adev->enable_virtual_display = false; 2495 2496 if (amdgpu_virtual_display) { 2497 const char *pci_address_name = pci_name(adev->pdev); 2498 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2499 2500 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2501 pciaddstr_tmp = pciaddstr; 2502 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2503 pciaddname = strsep(&pciaddname_tmp, ","); 2504 if (!strcmp("all", pciaddname) 2505 || !strcmp(pci_address_name, pciaddname)) { 2506 long num_crtc; 2507 int res = -1; 2508 2509 adev->enable_virtual_display = true; 2510 2511 if (pciaddname_tmp) 2512 res = kstrtol(pciaddname_tmp, 10, 2513 &num_crtc); 2514 2515 if (!res) { 2516 if (num_crtc < 1) 2517 num_crtc = 1; 2518 if (num_crtc > 6) 2519 num_crtc = 6; 2520 adev->mode_info.num_crtc = num_crtc; 2521 } else { 2522 adev->mode_info.num_crtc = 1; 2523 } 2524 break; 2525 } 2526 } 2527 2528 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2529 amdgpu_virtual_display, pci_address_name, 2530 adev->enable_virtual_display, adev->mode_info.num_crtc); 2531 2532 kfree(pciaddstr); 2533 } 2534 } 2535 2536 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2537 { 2538 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2539 adev->mode_info.num_crtc = 1; 2540 adev->enable_virtual_display = true; 2541 DRM_INFO("virtual_display:%d, num_crtc:%d\n", 2542 adev->enable_virtual_display, adev->mode_info.num_crtc); 2543 } 2544 } 2545 2546 /** 2547 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2548 * 2549 * @adev: amdgpu_device pointer 2550 * 2551 * Parses the asic configuration parameters specified in the gpu info 2552 * firmware and makes them available to the driver for use in configuring 2553 * the asic. 2554 * Returns 0 on success, -EINVAL on failure. 2555 */ 2556 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2557 { 2558 const char *chip_name; 2559 int err; 2560 const struct gpu_info_firmware_header_v1_0 *hdr; 2561 2562 adev->firmware.gpu_info_fw = NULL; 2563 2564 if (adev->mman.discovery_bin) 2565 return 0; 2566 2567 switch (adev->asic_type) { 2568 default: 2569 return 0; 2570 case CHIP_VEGA10: 2571 chip_name = "vega10"; 2572 break; 2573 case CHIP_VEGA12: 2574 chip_name = "vega12"; 2575 break; 2576 case CHIP_RAVEN: 2577 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2578 chip_name = "raven2"; 2579 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2580 chip_name = "picasso"; 2581 else 2582 chip_name = "raven"; 2583 break; 2584 case CHIP_ARCTURUS: 2585 chip_name = "arcturus"; 2586 break; 2587 case CHIP_NAVI12: 2588 chip_name = "navi12"; 2589 break; 2590 } 2591 2592 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, 2593 AMDGPU_UCODE_OPTIONAL, 2594 "amdgpu/%s_gpu_info.bin", chip_name); 2595 if (err) { 2596 dev_err(adev->dev, 2597 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n", 2598 chip_name); 2599 goto out; 2600 } 2601 2602 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2603 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2604 2605 switch (hdr->version_major) { 2606 case 1: 2607 { 2608 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2609 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2610 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2611 2612 /* 2613 * Should be dropped when DAL no longer needs it. 
2614 */ 2615 if (adev->asic_type == CHIP_NAVI12) 2616 goto parse_soc_bounding_box; 2617 2618 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2619 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2620 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2621 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2622 adev->gfx.config.max_texture_channel_caches = 2623 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2624 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2625 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2626 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2627 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2628 adev->gfx.config.double_offchip_lds_buf = 2629 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2630 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2631 adev->gfx.cu_info.max_waves_per_simd = 2632 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2633 adev->gfx.cu_info.max_scratch_slots_per_cu = 2634 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2635 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2636 if (hdr->version_minor >= 1) { 2637 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2638 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2639 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2640 adev->gfx.config.num_sc_per_sh = 2641 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2642 adev->gfx.config.num_packer_per_sc = 2643 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2644 } 2645 2646 parse_soc_bounding_box: 2647 /* 2648 * soc bounding box info is not integrated in the discovery table, 2649 * so when it is needed we always have to parse it from the gpu info firmware. 2650 */ 2651 if (hdr->version_minor == 2) { 2652 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2653 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2654 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2655 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2656 } 2657 break; 2658 } 2659 default: 2660 dev_err(adev->dev, 2661 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2662 err = -EINVAL; 2663 goto out; 2664 } 2665 out: 2666 return err; 2667 } 2668 2669 /** 2670 * amdgpu_device_ip_early_init - run early init for hardware IPs 2671 * 2672 * @adev: amdgpu_device pointer 2673 * 2674 * Early initialization pass for hardware IPs. The hardware IPs that make 2675 * up each asic are discovered and each IP's early_init callback is run. This 2676 * is the first stage in initializing the asic. 2677 * Returns 0 on success, negative error code on failure.
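 * Note: this is only the first of several init passes; sw_init/hw_init are
 * run later from amdgpu_device_ip_init() and late_init is run from
 * amdgpu_device_ip_late_init().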
2678 */ 2679 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2680 { 2681 struct amdgpu_ip_block *ip_block; 2682 struct pci_dev *parent; 2683 bool total, skip_bios; 2684 uint32_t bios_flags; 2685 int i, r; 2686 2687 amdgpu_device_enable_virtual_display(adev); 2688 2689 if (amdgpu_sriov_vf(adev)) { 2690 r = amdgpu_virt_request_full_gpu(adev, true); 2691 if (r) 2692 return r; 2693 } 2694 2695 switch (adev->asic_type) { 2696 #ifdef CONFIG_DRM_AMDGPU_SI 2697 case CHIP_VERDE: 2698 case CHIP_TAHITI: 2699 case CHIP_PITCAIRN: 2700 case CHIP_OLAND: 2701 case CHIP_HAINAN: 2702 adev->family = AMDGPU_FAMILY_SI; 2703 r = si_set_ip_blocks(adev); 2704 if (r) 2705 return r; 2706 break; 2707 #endif 2708 #ifdef CONFIG_DRM_AMDGPU_CIK 2709 case CHIP_BONAIRE: 2710 case CHIP_HAWAII: 2711 case CHIP_KAVERI: 2712 case CHIP_KABINI: 2713 case CHIP_MULLINS: 2714 if (adev->flags & AMD_IS_APU) 2715 adev->family = AMDGPU_FAMILY_KV; 2716 else 2717 adev->family = AMDGPU_FAMILY_CI; 2718 2719 r = cik_set_ip_blocks(adev); 2720 if (r) 2721 return r; 2722 break; 2723 #endif 2724 case CHIP_TOPAZ: 2725 case CHIP_TONGA: 2726 case CHIP_FIJI: 2727 case CHIP_POLARIS10: 2728 case CHIP_POLARIS11: 2729 case CHIP_POLARIS12: 2730 case CHIP_VEGAM: 2731 case CHIP_CARRIZO: 2732 case CHIP_STONEY: 2733 if (adev->flags & AMD_IS_APU) 2734 adev->family = AMDGPU_FAMILY_CZ; 2735 else 2736 adev->family = AMDGPU_FAMILY_VI; 2737 2738 r = vi_set_ip_blocks(adev); 2739 if (r) 2740 return r; 2741 break; 2742 default: 2743 r = amdgpu_discovery_set_ip_blocks(adev); 2744 if (r) 2745 return r; 2746 break; 2747 } 2748 2749 /* Check for IP version 9.4.3 with A0 hardware */ 2750 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) && 2751 !amdgpu_device_get_rev_id(adev)) { 2752 dev_err(adev->dev, "Unsupported A0 hardware\n"); 2753 return -ENODEV; /* device unsupported - no device error */ 2754 } 2755 2756 if (amdgpu_has_atpx() && 2757 (amdgpu_is_atpx_hybrid() || 2758 amdgpu_has_atpx_dgpu_power_cntl()) && 2759 ((adev->flags & AMD_IS_APU) == 0) && 2760 !dev_is_removable(&adev->pdev->dev)) 2761 adev->flags |= AMD_IS_PX; 2762 2763 if (!(adev->flags & AMD_IS_APU)) { 2764 parent = pcie_find_root_port(adev->pdev); 2765 adev->has_pr3 = parent ? 
pci_pr3_present(parent) : false; 2766 } 2767 2768 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2769 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2770 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2771 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2772 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2773 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2774 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2775 2776 adev->virt.is_xgmi_node_migrate_enabled = false; 2777 if (amdgpu_sriov_vf(adev)) { 2778 adev->virt.is_xgmi_node_migrate_enabled = 2779 amdgpu_ip_version((adev), GC_HWIP, 0) == IP_VERSION(9, 4, 4); 2780 } 2781 2782 total = true; 2783 for (i = 0; i < adev->num_ip_blocks; i++) { 2784 ip_block = &adev->ip_blocks[i]; 2785 2786 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2787 DRM_WARN("disabled ip block: %d <%s>\n", 2788 i, adev->ip_blocks[i].version->funcs->name); 2789 adev->ip_blocks[i].status.valid = false; 2790 } else if (ip_block->version->funcs->early_init) { 2791 r = ip_block->version->funcs->early_init(ip_block); 2792 if (r == -ENOENT) { 2793 adev->ip_blocks[i].status.valid = false; 2794 } else if (r) { 2795 DRM_ERROR("early_init of IP block <%s> failed %d\n", 2796 adev->ip_blocks[i].version->funcs->name, r); 2797 total = false; 2798 } else { 2799 adev->ip_blocks[i].status.valid = true; 2800 } 2801 } else { 2802 adev->ip_blocks[i].status.valid = true; 2803 } 2804 /* get the vbios after the asic_funcs are set up */ 2805 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2806 r = amdgpu_device_parse_gpu_info_fw(adev); 2807 if (r) 2808 return r; 2809 2810 bios_flags = amdgpu_device_get_vbios_flags(adev); 2811 skip_bios = !!(bios_flags & AMDGPU_VBIOS_SKIP); 2812 /* Read BIOS */ 2813 if (!skip_bios) { 2814 bool optional = 2815 !!(bios_flags & AMDGPU_VBIOS_OPTIONAL); 2816 if (!amdgpu_get_bios(adev) && !optional) 2817 return -EINVAL; 2818 2819 if (optional && !adev->bios) 2820 dev_info( 2821 adev->dev, 2822 "VBIOS image optional, proceeding without VBIOS image"); 2823 2824 if (adev->bios) { 2825 r = amdgpu_atombios_init(adev); 2826 if (r) { 2827 dev_err(adev->dev, 2828 "amdgpu_atombios_init failed\n"); 2829 amdgpu_vf_error_put( 2830 adev, 2831 AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 2832 0, 0); 2833 return r; 2834 } 2835 } 2836 } 2837 2838 /*get pf2vf msg info at it's earliest time*/ 2839 if (amdgpu_sriov_vf(adev)) 2840 amdgpu_virt_init_data_exchange(adev); 2841 2842 } 2843 } 2844 if (!total) 2845 return -ENODEV; 2846 2847 if (adev->gmc.xgmi.supported) 2848 amdgpu_xgmi_early_init(adev); 2849 2850 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 2851 if (ip_block->status.valid != false) 2852 amdgpu_amdkfd_device_probe(adev); 2853 2854 adev->cg_flags &= amdgpu_cg_mask; 2855 adev->pg_flags &= amdgpu_pg_mask; 2856 2857 return 0; 2858 } 2859 2860 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2861 { 2862 int i, r; 2863 2864 for (i = 0; i < adev->num_ip_blocks; i++) { 2865 if (!adev->ip_blocks[i].status.sw) 2866 continue; 2867 if (adev->ip_blocks[i].status.hw) 2868 continue; 2869 if (!amdgpu_ip_member_of_hwini( 2870 adev, adev->ip_blocks[i].version->type)) 2871 continue; 2872 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2873 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2874 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2875 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2876 if (r) { 2877 
DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2878 adev->ip_blocks[i].version->funcs->name, r); 2879 return r; 2880 } 2881 adev->ip_blocks[i].status.hw = true; 2882 } 2883 } 2884 2885 return 0; 2886 } 2887 2888 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2889 { 2890 int i, r; 2891 2892 for (i = 0; i < adev->num_ip_blocks; i++) { 2893 if (!adev->ip_blocks[i].status.sw) 2894 continue; 2895 if (adev->ip_blocks[i].status.hw) 2896 continue; 2897 if (!amdgpu_ip_member_of_hwini( 2898 adev, adev->ip_blocks[i].version->type)) 2899 continue; 2900 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2901 if (r) { 2902 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2903 adev->ip_blocks[i].version->funcs->name, r); 2904 return r; 2905 } 2906 adev->ip_blocks[i].status.hw = true; 2907 } 2908 2909 return 0; 2910 } 2911 2912 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2913 { 2914 int r = 0; 2915 int i; 2916 uint32_t smu_version; 2917 2918 if (adev->asic_type >= CHIP_VEGA10) { 2919 for (i = 0; i < adev->num_ip_blocks; i++) { 2920 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2921 continue; 2922 2923 if (!amdgpu_ip_member_of_hwini(adev, 2924 AMD_IP_BLOCK_TYPE_PSP)) 2925 break; 2926 2927 if (!adev->ip_blocks[i].status.sw) 2928 continue; 2929 2930 /* no need to do the fw loading again if already done*/ 2931 if (adev->ip_blocks[i].status.hw == true) 2932 break; 2933 2934 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2935 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 2936 if (r) 2937 return r; 2938 } else { 2939 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2940 if (r) { 2941 DRM_ERROR("hw_init of IP block <%s> failed %d\n", 2942 adev->ip_blocks[i].version->funcs->name, r); 2943 return r; 2944 } 2945 adev->ip_blocks[i].status.hw = true; 2946 } 2947 break; 2948 } 2949 } 2950 2951 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2952 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2953 2954 return r; 2955 } 2956 2957 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2958 { 2959 struct drm_sched_init_args args = { 2960 .ops = &amdgpu_sched_ops, 2961 .num_rqs = DRM_SCHED_PRIORITY_COUNT, 2962 .timeout_wq = adev->reset_domain->wq, 2963 .dev = adev->dev, 2964 }; 2965 long timeout; 2966 int r, i; 2967 2968 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 2969 struct amdgpu_ring *ring = adev->rings[i]; 2970 2971 /* No need to setup the GPU scheduler for rings that don't need it */ 2972 if (!ring || ring->no_scheduler) 2973 continue; 2974 2975 switch (ring->funcs->type) { 2976 case AMDGPU_RING_TYPE_GFX: 2977 timeout = adev->gfx_timeout; 2978 break; 2979 case AMDGPU_RING_TYPE_COMPUTE: 2980 timeout = adev->compute_timeout; 2981 break; 2982 case AMDGPU_RING_TYPE_SDMA: 2983 timeout = adev->sdma_timeout; 2984 break; 2985 default: 2986 timeout = adev->video_timeout; 2987 break; 2988 } 2989 2990 args.timeout = timeout; 2991 args.credit_limit = ring->num_hw_submission; 2992 args.score = ring->sched_score; 2993 args.name = ring->name; 2994 2995 r = drm_sched_init(&ring->sched, &args); 2996 if (r) { 2997 DRM_ERROR("Failed to create scheduler on ring %s.\n", 2998 ring->name); 2999 return r; 3000 } 3001 r = amdgpu_uvd_entity_init(adev, ring); 3002 if (r) { 3003 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n", 3004 ring->name); 3005 return r; 3006 } 3007 r = amdgpu_vce_entity_init(adev, ring); 3008 if (r) { 3009 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n", 
3010 ring->name); 3011 return r; 3012 } 3013 } 3014 3015 amdgpu_xcp_update_partition_sched_list(adev); 3016 3017 return 0; 3018 } 3019 3020 3021 /** 3022 * amdgpu_device_ip_init - run init for hardware IPs 3023 * 3024 * @adev: amdgpu_device pointer 3025 * 3026 * Main initialization pass for hardware IPs. The list of all the hardware 3027 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 3028 * are run. sw_init initializes the software state associated with each IP 3029 * and hw_init initializes the hardware associated with each IP. 3030 * Returns 0 on success, negative error code on failure. 3031 */ 3032 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 3033 { 3034 bool init_badpage; 3035 int i, r; 3036 3037 r = amdgpu_ras_init(adev); 3038 if (r) 3039 return r; 3040 3041 for (i = 0; i < adev->num_ip_blocks; i++) { 3042 if (!adev->ip_blocks[i].status.valid) 3043 continue; 3044 if (adev->ip_blocks[i].version->funcs->sw_init) { 3045 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]); 3046 if (r) { 3047 DRM_ERROR("sw_init of IP block <%s> failed %d\n", 3048 adev->ip_blocks[i].version->funcs->name, r); 3049 goto init_failed; 3050 } 3051 } 3052 adev->ip_blocks[i].status.sw = true; 3053 3054 if (!amdgpu_ip_member_of_hwini( 3055 adev, adev->ip_blocks[i].version->type)) 3056 continue; 3057 3058 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 3059 /* need to do common hw init early so everything is set up for gmc */ 3060 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3061 if (r) { 3062 DRM_ERROR("hw_init %d failed %d\n", i, r); 3063 goto init_failed; 3064 } 3065 adev->ip_blocks[i].status.hw = true; 3066 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3067 /* need to do gmc hw init early so we can allocate gpu mem */ 3068 /* Try to reserve bad pages early */ 3069 if (amdgpu_sriov_vf(adev)) 3070 amdgpu_virt_exchange_data(adev); 3071 3072 r = amdgpu_device_mem_scratch_init(adev); 3073 if (r) { 3074 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r); 3075 goto init_failed; 3076 } 3077 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3078 if (r) { 3079 DRM_ERROR("hw_init %d failed %d\n", i, r); 3080 goto init_failed; 3081 } 3082 r = amdgpu_device_wb_init(adev); 3083 if (r) { 3084 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r); 3085 goto init_failed; 3086 } 3087 adev->ip_blocks[i].status.hw = true; 3088 3089 /* right after GMC hw init, we create CSA */ 3090 if (adev->gfx.mcbp) { 3091 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 3092 AMDGPU_GEM_DOMAIN_VRAM | 3093 AMDGPU_GEM_DOMAIN_GTT, 3094 AMDGPU_CSA_SIZE); 3095 if (r) { 3096 DRM_ERROR("allocate CSA failed %d\n", r); 3097 goto init_failed; 3098 } 3099 } 3100 3101 r = amdgpu_seq64_init(adev); 3102 if (r) { 3103 DRM_ERROR("allocate seq64 failed %d\n", r); 3104 goto init_failed; 3105 } 3106 } 3107 } 3108 3109 if (amdgpu_sriov_vf(adev)) 3110 amdgpu_virt_init_data_exchange(adev); 3111 3112 r = amdgpu_ib_pool_init(adev); 3113 if (r) { 3114 dev_err(adev->dev, "IB initialization failed (%d).\n", r); 3115 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 3116 goto init_failed; 3117 } 3118 3119 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 3120 if (r) 3121 goto init_failed; 3122 3123 r = amdgpu_device_ip_hw_init_phase1(adev); 3124 if (r) 3125 goto init_failed; 3126 3127 r = amdgpu_device_fw_loading(adev); 3128 if (r) 3129 goto init_failed; 3130 3131 r = 
amdgpu_device_ip_hw_init_phase2(adev); 3132 if (r) 3133 goto init_failed; 3134 3135 /* 3136 * Retired pages are loaded from eeprom and reserved here; this must be 3137 * called after amdgpu_device_ip_hw_init_phase2 since 3138 * for some ASICs the RAS EEPROM code relies on the SMU being fully functional 3139 * for I2C communication, which is only true at this point. 3140 * 3141 * amdgpu_ras_recovery_init may fail, but the caller only cares about a 3142 * failure caused by a bad gpu state and stops the amdgpu init process 3143 * accordingly. For other failures, it still releases all 3144 * the resources and prints an error message, rather than returning a 3145 * negative value to the upper level. 3146 * 3147 * Note: theoretically, this should be called before all vram allocations 3148 * to protect retired pages from being abused. 3149 */ 3150 init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 3151 r = amdgpu_ras_recovery_init(adev, init_badpage); 3152 if (r) 3153 goto init_failed; 3154 3155 /* 3156 * In case of XGMI, grab an extra reference on the reset domain for this device 3157 */ 3158 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3159 if (amdgpu_xgmi_add_device(adev) == 0) { 3160 if (!amdgpu_sriov_vf(adev)) { 3161 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3162 3163 if (WARN_ON(!hive)) { 3164 r = -ENOENT; 3165 goto init_failed; 3166 } 3167 3168 if (!hive->reset_domain || 3169 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 3170 r = -ENOENT; 3171 amdgpu_put_xgmi_hive(hive); 3172 goto init_failed; 3173 } 3174 3175 /* Drop the early temporary reset domain we created for device */ 3176 amdgpu_reset_put_reset_domain(adev->reset_domain); 3177 adev->reset_domain = hive->reset_domain; 3178 amdgpu_put_xgmi_hive(hive); 3179 } 3180 } 3181 } 3182 3183 r = amdgpu_device_init_schedulers(adev); 3184 if (r) 3185 goto init_failed; 3186 3187 if (adev->mman.buffer_funcs_ring->sched.ready) 3188 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3189 3190 /* Don't init kfd if the whole hive needs to be reset during init */ 3191 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 3192 kgd2kfd_init_zone_device(adev); 3193 amdgpu_amdkfd_device_init(adev); 3194 } 3195 3196 amdgpu_fru_get_product_info(adev); 3197 3198 if (!amdgpu_sriov_vf(adev) || amdgpu_sriov_ras_cper_en(adev)) 3199 r = amdgpu_cper_init(adev); 3200 3201 init_failed: 3202 3203 return r; 3204 } 3205 3206 /** 3207 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 3208 * 3209 * @adev: amdgpu_device pointer 3210 * 3211 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 3212 * this function before a GPU reset. If the value is retained after a 3213 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 3214 */ 3215 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 3216 { 3217 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 3218 } 3219 3220 /** 3221 * amdgpu_device_check_vram_lost - check if vram is valid 3222 * 3223 * @adev: amdgpu_device pointer 3224 * 3225 * Checks the reset magic value written to the gart pointer in VRAM. 3226 * The driver calls this after a GPU reset to see if the contents of 3227 * VRAM have been lost or not. 3228 * Returns true if vram is lost, false if not.
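 * Note: the magic is simply the first AMDGPU_RESET_MAGIC_NUM bytes at
 * adev->gart.ptr, saved by amdgpu_device_fill_reset_magic() before a reset
 * and compared with memcmp() afterwards.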
3229 */ 3230 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 3231 { 3232 if (memcmp(adev->gart.ptr, adev->reset_magic, 3233 AMDGPU_RESET_MAGIC_NUM)) 3234 return true; 3235 3236 if (!amdgpu_in_reset(adev)) 3237 return false; 3238 3239 /* 3240 * For all ASICs with baco/mode1 reset, the VRAM is 3241 * always assumed to be lost. 3242 */ 3243 switch (amdgpu_asic_reset_method(adev)) { 3244 case AMD_RESET_METHOD_LINK: 3245 case AMD_RESET_METHOD_BACO: 3246 case AMD_RESET_METHOD_MODE1: 3247 return true; 3248 default: 3249 return false; 3250 } 3251 } 3252 3253 /** 3254 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 3255 * 3256 * @adev: amdgpu_device pointer 3257 * @state: clockgating state (gate or ungate) 3258 * 3259 * The list of all the hardware IPs that make up the asic is walked and the 3260 * set_clockgating_state callbacks are run. 3261 * Late initialization pass enabling clockgating for hardware IPs. 3262 * Fini or suspend, pass disabling clockgating for hardware IPs. 3263 * Returns 0 on success, negative error code on failure. 3264 */ 3265 3266 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 3267 enum amd_clockgating_state state) 3268 { 3269 int i, j, r; 3270 3271 if (amdgpu_emu_mode == 1) 3272 return 0; 3273 3274 for (j = 0; j < adev->num_ip_blocks; j++) { 3275 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3276 if (!adev->ip_blocks[i].status.late_initialized) 3277 continue; 3278 /* skip CG for GFX, SDMA on S0ix */ 3279 if (adev->in_s0ix && 3280 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3281 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3282 continue; 3283 /* skip CG for VCE/UVD, it's handled specially */ 3284 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3285 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3286 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3287 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3288 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 3289 /* enable clockgating to save power */ 3290 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i], 3291 state); 3292 if (r) { 3293 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n", 3294 adev->ip_blocks[i].version->funcs->name, r); 3295 return r; 3296 } 3297 } 3298 } 3299 3300 return 0; 3301 } 3302 3303 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 3304 enum amd_powergating_state state) 3305 { 3306 int i, j, r; 3307 3308 if (amdgpu_emu_mode == 1) 3309 return 0; 3310 3311 for (j = 0; j < adev->num_ip_blocks; j++) { 3312 i = state == AMD_PG_STATE_GATE ? 
j : adev->num_ip_blocks - j - 1; 3313 if (!adev->ip_blocks[i].status.late_initialized) 3314 continue; 3315 /* skip PG for GFX, SDMA on S0ix */ 3316 if (adev->in_s0ix && 3317 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3318 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3319 continue; 3320 /* skip PG for VCE/UVD, it's handled specially */ 3321 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3322 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3323 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3324 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3325 adev->ip_blocks[i].version->funcs->set_powergating_state) { 3326 /* enable powergating to save power */ 3327 r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i], 3328 state); 3329 if (r) { 3330 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n", 3331 adev->ip_blocks[i].version->funcs->name, r); 3332 return r; 3333 } 3334 } 3335 } 3336 return 0; 3337 } 3338 3339 static int amdgpu_device_enable_mgpu_fan_boost(void) 3340 { 3341 struct amdgpu_gpu_instance *gpu_ins; 3342 struct amdgpu_device *adev; 3343 int i, ret = 0; 3344 3345 mutex_lock(&mgpu_info.mutex); 3346 3347 /* 3348 * MGPU fan boost feature should be enabled 3349 * only when there are two or more dGPUs in 3350 * the system 3351 */ 3352 if (mgpu_info.num_dgpu < 2) 3353 goto out; 3354 3355 for (i = 0; i < mgpu_info.num_dgpu; i++) { 3356 gpu_ins = &(mgpu_info.gpu_ins[i]); 3357 adev = gpu_ins->adev; 3358 if (!(adev->flags & AMD_IS_APU) && 3359 !gpu_ins->mgpu_fan_enabled) { 3360 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 3361 if (ret) 3362 break; 3363 3364 gpu_ins->mgpu_fan_enabled = 1; 3365 } 3366 } 3367 3368 out: 3369 mutex_unlock(&mgpu_info.mutex); 3370 3371 return ret; 3372 } 3373 3374 /** 3375 * amdgpu_device_ip_late_init - run late init for hardware IPs 3376 * 3377 * @adev: amdgpu_device pointer 3378 * 3379 * Late initialization pass for hardware IPs. The list of all the hardware 3380 * IPs that make up the asic is walked and the late_init callbacks are run. 3381 * late_init covers any special initialization that an IP requires 3382 * after all of them have been initialized or something that needs to happen 3383 * late in the init process. 3384 * Returns 0 on success, negative error code on failure.
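 * Note: this is also the point where clockgating/powergating are enabled
 * (amdgpu_device_set_cg_state()/amdgpu_device_set_pg_state()) and the reset
 * magic is recorded via amdgpu_device_fill_reset_magic().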
3385 */ 3386 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 3387 { 3388 struct amdgpu_gpu_instance *gpu_instance; 3389 int i = 0, r; 3390 3391 for (i = 0; i < adev->num_ip_blocks; i++) { 3392 if (!adev->ip_blocks[i].status.hw) 3393 continue; 3394 if (adev->ip_blocks[i].version->funcs->late_init) { 3395 r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]); 3396 if (r) { 3397 DRM_ERROR("late_init of IP block <%s> failed %d\n", 3398 adev->ip_blocks[i].version->funcs->name, r); 3399 return r; 3400 } 3401 } 3402 adev->ip_blocks[i].status.late_initialized = true; 3403 } 3404 3405 r = amdgpu_ras_late_init(adev); 3406 if (r) { 3407 DRM_ERROR("amdgpu_ras_late_init failed %d", r); 3408 return r; 3409 } 3410 3411 if (!amdgpu_reset_in_recovery(adev)) 3412 amdgpu_ras_set_error_query_ready(adev, true); 3413 3414 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3415 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3416 3417 amdgpu_device_fill_reset_magic(adev); 3418 3419 r = amdgpu_device_enable_mgpu_fan_boost(); 3420 if (r) 3421 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); 3422 3423 /* For passthrough configurations on arcturus and aldebaran, enable special SBR handling */ 3424 if (amdgpu_passthrough(adev) && 3425 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 3426 adev->asic_type == CHIP_ALDEBARAN)) 3427 amdgpu_dpm_handle_passthrough_sbr(adev, true); 3428 3429 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3430 mutex_lock(&mgpu_info.mutex); 3431 3432 /* 3433 * Reset the device p-state to low, as it was booted with it high. 3434 * 3435 * This should be performed only after all devices from the same 3436 * hive have been initialized. 3437 * 3438 * However, the number of devices in the hive is not known in advance; 3439 * it is counted one by one as the devices initialize. 3440 * 3441 * So, we wait until all XGMI interlinked devices are initialized. 3442 * This may add some delay as those devices may come from 3443 * different hives. But that should be OK.
3444 */ 3445 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3446 for (i = 0; i < mgpu_info.num_gpu; i++) { 3447 gpu_instance = &(mgpu_info.gpu_ins[i]); 3448 if (gpu_instance->adev->flags & AMD_IS_APU) 3449 continue; 3450 3451 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3452 AMDGPU_XGMI_PSTATE_MIN); 3453 if (r) { 3454 DRM_ERROR("pstate setting failed (%d).\n", r); 3455 break; 3456 } 3457 } 3458 } 3459 3460 mutex_unlock(&mgpu_info.mutex); 3461 } 3462 3463 return 0; 3464 } 3465 3466 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block) 3467 { 3468 int r; 3469 3470 if (!ip_block->version->funcs->hw_fini) { 3471 DRM_ERROR("hw_fini of IP block <%s> not defined\n", 3472 ip_block->version->funcs->name); 3473 } else { 3474 r = ip_block->version->funcs->hw_fini(ip_block); 3475 /* XXX handle errors */ 3476 if (r) { 3477 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", 3478 ip_block->version->funcs->name, r); 3479 } 3480 } 3481 3482 ip_block->status.hw = false; 3483 } 3484 3485 /** 3486 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3487 * 3488 * @adev: amdgpu_device pointer 3489 * 3490 * For ASICs need to disable SMC first 3491 */ 3492 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3493 { 3494 int i; 3495 3496 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3497 return; 3498 3499 for (i = 0; i < adev->num_ip_blocks; i++) { 3500 if (!adev->ip_blocks[i].status.hw) 3501 continue; 3502 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3503 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3504 break; 3505 } 3506 } 3507 } 3508 3509 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3510 { 3511 int i, r; 3512 3513 for (i = 0; i < adev->num_ip_blocks; i++) { 3514 if (!adev->ip_blocks[i].version->funcs->early_fini) 3515 continue; 3516 3517 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]); 3518 if (r) { 3519 DRM_DEBUG("early_fini of IP block <%s> failed %d\n", 3520 adev->ip_blocks[i].version->funcs->name, r); 3521 } 3522 } 3523 3524 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3525 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3526 3527 amdgpu_amdkfd_suspend(adev, true); 3528 amdgpu_userq_suspend(adev); 3529 3530 /* Workaround for ASICs need to disable SMC first */ 3531 amdgpu_device_smu_fini_early(adev); 3532 3533 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3534 if (!adev->ip_blocks[i].status.hw) 3535 continue; 3536 3537 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3538 } 3539 3540 if (amdgpu_sriov_vf(adev)) { 3541 if (amdgpu_virt_release_full_gpu(adev, false)) 3542 DRM_ERROR("failed to release exclusive mode on fini\n"); 3543 } 3544 3545 return 0; 3546 } 3547 3548 /** 3549 * amdgpu_device_ip_fini - run fini for hardware IPs 3550 * 3551 * @adev: amdgpu_device pointer 3552 * 3553 * Main teardown pass for hardware IPs. The list of all the hardware 3554 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3555 * are run. hw_fini tears down the hardware associated with each IP 3556 * and sw_fini tears down any software state associated with each IP. 3557 * Returns 0 on success, negative error code on failure. 
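 * Note: the IP blocks are torn down in reverse order with respect to init,
 * i.e. the loops below walk from the last IP block to the first.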
3558 */ 3559 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3560 { 3561 int i, r; 3562 3563 amdgpu_cper_fini(adev); 3564 3565 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3566 amdgpu_virt_release_ras_err_handler_data(adev); 3567 3568 if (adev->gmc.xgmi.num_physical_nodes > 1) 3569 amdgpu_xgmi_remove_device(adev); 3570 3571 amdgpu_amdkfd_device_fini_sw(adev); 3572 3573 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3574 if (!adev->ip_blocks[i].status.sw) 3575 continue; 3576 3577 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3578 amdgpu_ucode_free_bo(adev); 3579 amdgpu_free_static_csa(&adev->virt.csa_obj); 3580 amdgpu_device_wb_fini(adev); 3581 amdgpu_device_mem_scratch_fini(adev); 3582 amdgpu_ib_pool_fini(adev); 3583 amdgpu_seq64_fini(adev); 3584 amdgpu_doorbell_fini(adev); 3585 } 3586 if (adev->ip_blocks[i].version->funcs->sw_fini) { 3587 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]); 3588 /* XXX handle errors */ 3589 if (r) { 3590 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n", 3591 adev->ip_blocks[i].version->funcs->name, r); 3592 } 3593 } 3594 adev->ip_blocks[i].status.sw = false; 3595 adev->ip_blocks[i].status.valid = false; 3596 } 3597 3598 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3599 if (!adev->ip_blocks[i].status.late_initialized) 3600 continue; 3601 if (adev->ip_blocks[i].version->funcs->late_fini) 3602 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]); 3603 adev->ip_blocks[i].status.late_initialized = false; 3604 } 3605 3606 amdgpu_ras_fini(adev); 3607 3608 return 0; 3609 } 3610 3611 /** 3612 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3613 * 3614 * @work: work_struct. 3615 */ 3616 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3617 { 3618 struct amdgpu_device *adev = 3619 container_of(work, struct amdgpu_device, delayed_init_work.work); 3620 int r; 3621 3622 r = amdgpu_ib_ring_tests(adev); 3623 if (r) 3624 DRM_ERROR("ib ring test failed (%d).\n", r); 3625 } 3626 3627 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3628 { 3629 struct amdgpu_device *adev = 3630 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3631 3632 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3633 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3634 3635 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true, 0)) 3636 adev->gfx.gfx_off_state = true; 3637 } 3638 3639 /** 3640 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3641 * 3642 * @adev: amdgpu_device pointer 3643 * 3644 * Main suspend function for hardware IPs. The list of all the hardware 3645 * IPs that make up the asic is walked, clockgating is disabled and the 3646 * suspend callbacks are run. suspend puts the hardware and software state 3647 * in each IP into a state suitable for suspend. 3648 * Returns 0 on success, negative error code on failure. 3649 */ 3650 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3651 { 3652 int i, r; 3653 3654 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3655 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3656 3657 /* 3658 * Per PMFW team's suggestion, driver needs to handle gfxoff 3659 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3660 * scenario. Add the missing df cstate disablement here. 
3661 */ 3662 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3663 dev_warn(adev->dev, "Failed to disallow df cstate"); 3664 3665 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3666 if (!adev->ip_blocks[i].status.valid) 3667 continue; 3668 3669 /* displays are handled separately */ 3670 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3671 continue; 3672 3673 /* XXX handle errors */ 3674 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3675 if (r) 3676 return r; 3677 } 3678 3679 return 0; 3680 } 3681 3682 /** 3683 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3684 * 3685 * @adev: amdgpu_device pointer 3686 * 3687 * Main suspend function for hardware IPs. The list of all the hardware 3688 * IPs that make up the asic is walked, clockgating is disabled and the 3689 * suspend callbacks are run. suspend puts the hardware and software state 3690 * in each IP into a state suitable for suspend. 3691 * Returns 0 on success, negative error code on failure. 3692 */ 3693 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3694 { 3695 int i, r; 3696 3697 if (adev->in_s0ix) 3698 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3699 3700 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3701 if (!adev->ip_blocks[i].status.valid) 3702 continue; 3703 /* displays are handled in phase1 */ 3704 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3705 continue; 3706 /* PSP lost connection when err_event_athub occurs */ 3707 if (amdgpu_ras_intr_triggered() && 3708 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3709 adev->ip_blocks[i].status.hw = false; 3710 continue; 3711 } 3712 3713 /* skip unnecessary suspend if we do not initialize them yet */ 3714 if (!amdgpu_ip_member_of_hwini( 3715 adev, adev->ip_blocks[i].version->type)) 3716 continue; 3717 3718 /* Since we skip suspend for S0i3, we need to cancel the delayed 3719 * idle work here as the suspend callback never gets called. 3720 */ 3721 if (adev->in_s0ix && 3722 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX && 3723 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 0, 0)) 3724 cancel_delayed_work_sync(&adev->gfx.idle_work); 3725 /* skip suspend of gfx/mes and psp for S0ix 3726 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3727 * like at runtime. PSP is also part of the always on hardware 3728 * so no need to suspend it. 3729 */ 3730 if (adev->in_s0ix && 3731 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3732 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3733 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3734 continue; 3735 3736 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3737 if (adev->in_s0ix && 3738 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3739 IP_VERSION(5, 0, 0)) && 3740 (adev->ip_blocks[i].version->type == 3741 AMD_IP_BLOCK_TYPE_SDMA)) 3742 continue; 3743 3744 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3745 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3746 * from this location and RLC Autoload automatically also gets loaded 3747 * from here based on PMFW -> PSP message during re-init sequence. 3748 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3749 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3750 */ 3751 if (amdgpu_in_reset(adev) && 3752 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3753 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3754 continue; 3755 3756 /* XXX handle errors */ 3757 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3758 adev->ip_blocks[i].status.hw = false; 3759 3760 /* handle putting the SMC in the appropriate state */ 3761 if (!amdgpu_sriov_vf(adev)) { 3762 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3763 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3764 if (r) { 3765 DRM_ERROR("SMC failed to set mp1 state %d, %d\n", 3766 adev->mp1_state, r); 3767 return r; 3768 } 3769 } 3770 } 3771 } 3772 3773 return 0; 3774 } 3775 3776 /** 3777 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3778 * 3779 * @adev: amdgpu_device pointer 3780 * 3781 * Main suspend function for hardware IPs. The list of all the hardware 3782 * IPs that make up the asic is walked, clockgating is disabled and the 3783 * suspend callbacks are run. suspend puts the hardware and software state 3784 * in each IP into a state suitable for suspend. 3785 * Returns 0 on success, negative error code on failure. 3786 */ 3787 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3788 { 3789 int r; 3790 3791 if (amdgpu_sriov_vf(adev)) { 3792 amdgpu_virt_fini_data_exchange(adev); 3793 amdgpu_virt_request_full_gpu(adev, false); 3794 } 3795 3796 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3797 3798 r = amdgpu_device_ip_suspend_phase1(adev); 3799 if (r) 3800 return r; 3801 r = amdgpu_device_ip_suspend_phase2(adev); 3802 3803 if (amdgpu_sriov_vf(adev)) 3804 amdgpu_virt_release_full_gpu(adev, false); 3805 3806 return r; 3807 } 3808 3809 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3810 { 3811 int i, r; 3812 3813 static enum amd_ip_block_type ip_order[] = { 3814 AMD_IP_BLOCK_TYPE_COMMON, 3815 AMD_IP_BLOCK_TYPE_GMC, 3816 AMD_IP_BLOCK_TYPE_PSP, 3817 AMD_IP_BLOCK_TYPE_IH, 3818 }; 3819 3820 for (i = 0; i < adev->num_ip_blocks; i++) { 3821 int j; 3822 struct amdgpu_ip_block *block; 3823 3824 block = &adev->ip_blocks[i]; 3825 block->status.hw = false; 3826 3827 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3828 3829 if (block->version->type != ip_order[j] || 3830 !block->status.valid) 3831 continue; 3832 3833 r = block->version->funcs->hw_init(&adev->ip_blocks[i]); 3834 if (r) { 3835 dev_err(adev->dev, "RE-INIT-early: %s failed\n", 3836 block->version->funcs->name); 3837 return r; 3838 } 3839 block->status.hw = true; 3840 } 3841 } 3842 3843 return 0; 3844 } 3845 3846 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3847 { 3848 struct amdgpu_ip_block *block; 3849 int i, r = 0; 3850 3851 static enum amd_ip_block_type ip_order[] = { 3852 AMD_IP_BLOCK_TYPE_SMC, 3853 AMD_IP_BLOCK_TYPE_DCE, 3854 AMD_IP_BLOCK_TYPE_GFX, 3855 AMD_IP_BLOCK_TYPE_SDMA, 3856 AMD_IP_BLOCK_TYPE_MES, 3857 AMD_IP_BLOCK_TYPE_UVD, 3858 AMD_IP_BLOCK_TYPE_VCE, 3859 AMD_IP_BLOCK_TYPE_VCN, 3860 AMD_IP_BLOCK_TYPE_JPEG 3861 }; 3862 3863 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3864 block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]); 3865 3866 if (!block) 3867 continue; 3868 3869 if (block->status.valid && !block->status.hw) { 3870 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) { 3871 r = amdgpu_ip_block_resume(block); 3872 } else { 3873 r = block->version->funcs->hw_init(block); 3874 } 3875 3876 if (r) { 3877 dev_err(adev->dev, "RE-INIT-late: %s failed\n", 3878 block->version->funcs->name); 3879 break; 3880 } 3881 
block->status.hw = true; 3882 } 3883 } 3884 3885 return r; 3886 } 3887 3888 /** 3889 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3890 * 3891 * @adev: amdgpu_device pointer 3892 * 3893 * First resume function for hardware IPs. The list of all the hardware 3894 * IPs that make up the asic is walked and the resume callbacks are run for 3895 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3896 * after a suspend and updates the software state as necessary. This 3897 * function is also used for restoring the GPU after a GPU reset. 3898 * Returns 0 on success, negative error code on failure. 3899 */ 3900 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3901 { 3902 int i, r; 3903 3904 for (i = 0; i < adev->num_ip_blocks; i++) { 3905 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3906 continue; 3907 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3908 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3909 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3910 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3911 3912 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3913 if (r) 3914 return r; 3915 } 3916 } 3917 3918 return 0; 3919 } 3920 3921 /** 3922 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3923 * 3924 * @adev: amdgpu_device pointer 3925 * 3926 * Second resume function for hardware IPs. The list of all the hardware 3927 * IPs that make up the asic is walked and the resume callbacks are run for 3928 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3929 * functional state after a suspend and updates the software state as 3930 * necessary. This function is also used for restoring the GPU after a GPU 3931 * reset. 3932 * Returns 0 on success, negative error code on failure. 3933 */ 3934 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 3935 { 3936 int i, r; 3937 3938 for (i = 0; i < adev->num_ip_blocks; i++) { 3939 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3940 continue; 3941 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3942 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3943 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3944 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE || 3945 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3946 continue; 3947 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3948 if (r) 3949 return r; 3950 } 3951 3952 return 0; 3953 } 3954 3955 /** 3956 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs 3957 * 3958 * @adev: amdgpu_device pointer 3959 * 3960 * Third resume function for hardware IPs. The list of all the hardware 3961 * IPs that make up the asic is walked and the resume callbacks are run for 3962 * all DCE. resume puts the hardware into a functional state after a suspend 3963 * and updates the software state as necessary. This function is also used 3964 * for restoring the GPU after a GPU reset. 3965 * 3966 * Returns 0 on success, negative error code on failure. 
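 *
 * Note that in the overall sequence of amdgpu_device_ip_resume() below, this
 * phase runs only after firmware loading, the buffer functions and the fence
 * driver have been brought back, so the display stack resumes last.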
3967 */ 3968 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev) 3969 { 3970 int i, r; 3971 3972 for (i = 0; i < adev->num_ip_blocks; i++) { 3973 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3974 continue; 3975 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) { 3976 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3977 if (r) 3978 return r; 3979 } 3980 } 3981 3982 return 0; 3983 } 3984 3985 /** 3986 * amdgpu_device_ip_resume - run resume for hardware IPs 3987 * 3988 * @adev: amdgpu_device pointer 3989 * 3990 * Main resume function for hardware IPs. The hardware IPs 3991 * are split into two resume functions because they are 3992 * also used in recovering from a GPU reset and some additional 3993 * steps need to be take between them. In this case (S3/S4) they are 3994 * run sequentially. 3995 * Returns 0 on success, negative error code on failure. 3996 */ 3997 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 3998 { 3999 int r; 4000 4001 r = amdgpu_device_ip_resume_phase1(adev); 4002 if (r) 4003 return r; 4004 4005 r = amdgpu_device_fw_loading(adev); 4006 if (r) 4007 return r; 4008 4009 r = amdgpu_device_ip_resume_phase2(adev); 4010 4011 if (adev->mman.buffer_funcs_ring->sched.ready) 4012 amdgpu_ttm_set_buffer_funcs_status(adev, true); 4013 4014 if (r) 4015 return r; 4016 4017 amdgpu_fence_driver_hw_init(adev); 4018 4019 r = amdgpu_device_ip_resume_phase3(adev); 4020 4021 return r; 4022 } 4023 4024 /** 4025 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 4026 * 4027 * @adev: amdgpu_device pointer 4028 * 4029 * Query the VBIOS data tables to determine if the board supports SR-IOV. 4030 */ 4031 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 4032 { 4033 if (amdgpu_sriov_vf(adev)) { 4034 if (adev->is_atom_fw) { 4035 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 4036 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 4037 } else { 4038 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 4039 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 4040 } 4041 4042 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 4043 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 4044 } 4045 } 4046 4047 /** 4048 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 4049 * 4050 * @asic_type: AMD asic type 4051 * 4052 * Check if there is DC (new modesetting infrastructre) support for an asic. 4053 * returns true if DC has support, false if not. 4054 */ 4055 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) 4056 { 4057 switch (asic_type) { 4058 #ifdef CONFIG_DRM_AMDGPU_SI 4059 case CHIP_HAINAN: 4060 #endif 4061 case CHIP_TOPAZ: 4062 /* chips with no display hardware */ 4063 return false; 4064 #if defined(CONFIG_DRM_AMD_DC) 4065 case CHIP_TAHITI: 4066 case CHIP_PITCAIRN: 4067 case CHIP_VERDE: 4068 case CHIP_OLAND: 4069 /* 4070 * We have systems in the wild with these ASICs that require 4071 * LVDS and VGA support which is not supported with DC. 4072 * 4073 * Fallback to the non-DC driver here by default so as not to 4074 * cause regressions. 4075 */ 4076 #if defined(CONFIG_DRM_AMD_DC_SI) 4077 return amdgpu_dc > 0; 4078 #else 4079 return false; 4080 #endif 4081 case CHIP_BONAIRE: 4082 case CHIP_KAVERI: 4083 case CHIP_KABINI: 4084 case CHIP_MULLINS: 4085 /* 4086 * We have systems in the wild with these ASICs that require 4087 * VGA support which is not supported with DC. 
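         *
         * DC can still be opted into explicitly by setting amdgpu_dc to a
         * positive value, which is why the check below uses '> 0' rather
         * than the '!= 0' of the default case.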
4088 * 4089 * Fallback to the non-DC driver here by default so as not to 4090 * cause regressions. 4091 */ 4092 return amdgpu_dc > 0; 4093 default: 4094 return amdgpu_dc != 0; 4095 #else 4096 default: 4097 if (amdgpu_dc > 0) 4098 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 4099 return false; 4100 #endif 4101 } 4102 } 4103 4104 /** 4105 * amdgpu_device_has_dc_support - check if dc is supported 4106 * 4107 * @adev: amdgpu_device pointer 4108 * 4109 * Returns true for supported, false for not supported 4110 */ 4111 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 4112 { 4113 if (adev->enable_virtual_display || 4114 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 4115 return false; 4116 4117 return amdgpu_device_asic_has_dc_support(adev->asic_type); 4118 } 4119 4120 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 4121 { 4122 struct amdgpu_device *adev = 4123 container_of(__work, struct amdgpu_device, xgmi_reset_work); 4124 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 4125 4126 /* It's a bug to not have a hive within this function */ 4127 if (WARN_ON(!hive)) 4128 return; 4129 4130 /* 4131 * Use task barrier to synchronize all xgmi reset works across the 4132 * hive. task_barrier_enter and task_barrier_exit will block 4133 * until all the threads running the xgmi reset works reach 4134 * those points. task_barrier_full will do both blocks. 4135 */ 4136 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 4137 4138 task_barrier_enter(&hive->tb); 4139 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); 4140 4141 if (adev->asic_reset_res) 4142 goto fail; 4143 4144 task_barrier_exit(&hive->tb); 4145 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); 4146 4147 if (adev->asic_reset_res) 4148 goto fail; 4149 4150 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 4151 } else { 4152 4153 task_barrier_full(&hive->tb); 4154 adev->asic_reset_res = amdgpu_asic_reset(adev); 4155 } 4156 4157 fail: 4158 if (adev->asic_reset_res) 4159 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", 4160 adev->asic_reset_res, adev_to_drm(adev)->unique); 4161 amdgpu_put_xgmi_hive(hive); 4162 } 4163 4164 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 4165 { 4166 char *input = amdgpu_lockup_timeout; 4167 char *timeout_setting = NULL; 4168 int index = 0; 4169 long timeout; 4170 int ret = 0; 4171 4172 /* 4173 * By default timeout for non compute jobs is 10000 4174 * and 60000 for compute jobs. 4175 * In SR-IOV or passthrough mode, timeout for compute 4176 * jobs are 60000 by default. 4177 */ 4178 adev->gfx_timeout = msecs_to_jiffies(10000); 4179 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4180 if (amdgpu_sriov_vf(adev)) 4181 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ? 
4182 msecs_to_jiffies(60000) : msecs_to_jiffies(10000); 4183 else 4184 adev->compute_timeout = msecs_to_jiffies(60000); 4185 4186 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4187 while ((timeout_setting = strsep(&input, ",")) && 4188 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4189 ret = kstrtol(timeout_setting, 0, &timeout); 4190 if (ret) 4191 return ret; 4192 4193 if (timeout == 0) { 4194 index++; 4195 continue; 4196 } else if (timeout < 0) { 4197 timeout = MAX_SCHEDULE_TIMEOUT; 4198 dev_warn(adev->dev, "lockup timeout disabled"); 4199 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 4200 } else { 4201 timeout = msecs_to_jiffies(timeout); 4202 } 4203 4204 switch (index++) { 4205 case 0: 4206 adev->gfx_timeout = timeout; 4207 break; 4208 case 1: 4209 adev->compute_timeout = timeout; 4210 break; 4211 case 2: 4212 adev->sdma_timeout = timeout; 4213 break; 4214 case 3: 4215 adev->video_timeout = timeout; 4216 break; 4217 default: 4218 break; 4219 } 4220 } 4221 /* 4222 * There is only one value specified and 4223 * it should apply to all non-compute jobs. 4224 */ 4225 if (index == 1) { 4226 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4227 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 4228 adev->compute_timeout = adev->gfx_timeout; 4229 } 4230 } 4231 4232 return ret; 4233 } 4234 4235 /** 4236 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 4237 * 4238 * @adev: amdgpu_device pointer 4239 * 4240 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 4241 */ 4242 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 4243 { 4244 struct iommu_domain *domain; 4245 4246 domain = iommu_get_domain_for_dev(adev->dev); 4247 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 4248 adev->ram_is_direct_mapped = true; 4249 } 4250 4251 #if defined(CONFIG_HSA_AMD_P2P) 4252 /** 4253 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled. 4254 * 4255 * @adev: amdgpu_device pointer 4256 * 4257 * return if IOMMU remapping bar address 4258 */ 4259 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev) 4260 { 4261 struct iommu_domain *domain; 4262 4263 domain = iommu_get_domain_for_dev(adev->dev); 4264 if (domain && (domain->type == IOMMU_DOMAIN_DMA || 4265 domain->type == IOMMU_DOMAIN_DMA_FQ)) 4266 return true; 4267 4268 return false; 4269 } 4270 #endif 4271 4272 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 4273 { 4274 if (amdgpu_mcbp == 1) 4275 adev->gfx.mcbp = true; 4276 else if (amdgpu_mcbp == 0) 4277 adev->gfx.mcbp = false; 4278 4279 if (amdgpu_sriov_vf(adev)) 4280 adev->gfx.mcbp = true; 4281 4282 if (adev->gfx.mcbp) 4283 DRM_INFO("MCBP is enabled\n"); 4284 } 4285 4286 /** 4287 * amdgpu_device_init - initialize the driver 4288 * 4289 * @adev: amdgpu_device pointer 4290 * @flags: driver flags 4291 * 4292 * Initializes the driver info and hw (all asics). 4293 * Returns 0 for success or an error on failure. 4294 * Called at driver startup. 
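 *
 * Rough flow: argument and timeout checks, MMIO mapping and early IP init,
 * an optional ASIC reset or vBIOS post, fence driver and full IP block init,
 * then late init plus sysfs, VGA/switcheroo and PM-notifier registration.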
4295 */ 4296 int amdgpu_device_init(struct amdgpu_device *adev, 4297 uint32_t flags) 4298 { 4299 struct drm_device *ddev = adev_to_drm(adev); 4300 struct pci_dev *pdev = adev->pdev; 4301 int r, i; 4302 bool px = false; 4303 u32 max_MBps; 4304 int tmp; 4305 4306 adev->shutdown = false; 4307 adev->flags = flags; 4308 4309 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 4310 adev->asic_type = amdgpu_force_asic_type; 4311 else 4312 adev->asic_type = flags & AMD_ASIC_MASK; 4313 4314 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 4315 if (amdgpu_emu_mode == 1) 4316 adev->usec_timeout *= 10; 4317 adev->gmc.gart_size = 512 * 1024 * 1024; 4318 adev->accel_working = false; 4319 adev->num_rings = 0; 4320 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 4321 adev->mman.buffer_funcs = NULL; 4322 adev->mman.buffer_funcs_ring = NULL; 4323 adev->vm_manager.vm_pte_funcs = NULL; 4324 adev->vm_manager.vm_pte_num_scheds = 0; 4325 adev->gmc.gmc_funcs = NULL; 4326 adev->harvest_ip_mask = 0x0; 4327 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 4328 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 4329 4330 adev->smc_rreg = &amdgpu_invalid_rreg; 4331 adev->smc_wreg = &amdgpu_invalid_wreg; 4332 adev->pcie_rreg = &amdgpu_invalid_rreg; 4333 adev->pcie_wreg = &amdgpu_invalid_wreg; 4334 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 4335 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 4336 adev->pciep_rreg = &amdgpu_invalid_rreg; 4337 adev->pciep_wreg = &amdgpu_invalid_wreg; 4338 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 4339 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 4340 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 4341 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 4342 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 4343 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 4344 adev->didt_rreg = &amdgpu_invalid_rreg; 4345 adev->didt_wreg = &amdgpu_invalid_wreg; 4346 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 4347 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 4348 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 4349 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 4350 4351 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 4352 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 4353 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 4354 4355 /* mutex initialization are all done here so we 4356 * can recall function without having locking issues 4357 */ 4358 mutex_init(&adev->firmware.mutex); 4359 mutex_init(&adev->pm.mutex); 4360 mutex_init(&adev->gfx.gpu_clock_mutex); 4361 mutex_init(&adev->srbm_mutex); 4362 mutex_init(&adev->gfx.pipe_reserve_mutex); 4363 mutex_init(&adev->gfx.gfx_off_mutex); 4364 mutex_init(&adev->gfx.partition_mutex); 4365 mutex_init(&adev->grbm_idx_mutex); 4366 mutex_init(&adev->mn_lock); 4367 mutex_init(&adev->virt.vf_errors.lock); 4368 hash_init(adev->mn_hash); 4369 mutex_init(&adev->psp.mutex); 4370 mutex_init(&adev->notifier_lock); 4371 mutex_init(&adev->pm.stable_pstate_ctx_lock); 4372 mutex_init(&adev->benchmark_mutex); 4373 mutex_init(&adev->gfx.reset_sem_mutex); 4374 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */ 4375 mutex_init(&adev->enforce_isolation_mutex); 4376 for (i = 0; i < MAX_XCP; ++i) { 4377 adev->isolation[i].spearhead = dma_fence_get_stub(); 4378 amdgpu_sync_create(&adev->isolation[i].active); 4379 amdgpu_sync_create(&adev->isolation[i].prev); 4380 } 4381 
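        /* Locks for user queue scheduling and workload profile switching */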
        mutex_init(&adev->gfx.userq_sch_mutex);
        mutex_init(&adev->gfx.workload_profile_mutex);
        mutex_init(&adev->vcn.workload_profile_mutex);
        mutex_init(&adev->userq_mutex);

        amdgpu_device_init_apu_flags(adev);

        r = amdgpu_device_check_arguments(adev);
        if (r)
                return r;

        spin_lock_init(&adev->mmio_idx_lock);
        spin_lock_init(&adev->smc_idx_lock);
        spin_lock_init(&adev->pcie_idx_lock);
        spin_lock_init(&adev->uvd_ctx_idx_lock);
        spin_lock_init(&adev->didt_idx_lock);
        spin_lock_init(&adev->gc_cac_idx_lock);
        spin_lock_init(&adev->se_cac_idx_lock);
        spin_lock_init(&adev->audio_endpt_idx_lock);
        spin_lock_init(&adev->mm_stats.lock);
        spin_lock_init(&adev->virt.rlcg_reg_lock);
        spin_lock_init(&adev->wb.lock);

        xa_init_flags(&adev->userq_xa, XA_FLAGS_LOCK_IRQ);

        INIT_LIST_HEAD(&adev->reset_list);

        INIT_LIST_HEAD(&adev->ras_list);

        INIT_LIST_HEAD(&adev->pm.od_kobj_list);

        INIT_LIST_HEAD(&adev->userq_mgr_list);

        INIT_DELAYED_WORK(&adev->delayed_init_work,
                          amdgpu_device_delayed_init_work_handler);
        INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
                          amdgpu_device_delay_enable_gfx_off);
        /*
         * Initialize the enforce_isolation work structures for each XCP
         * partition. This work handler is responsible for enforcing shader
         * isolation on AMD GPUs. It counts the number of emitted fences for
         * each GFX and compute ring. If there are any fences, it schedules
         * the `enforce_isolation_work` to be run after a delay. If there are
         * no fences, it signals the Kernel Fusion Driver (KFD) to resume the
         * runqueue.
         */
        for (i = 0; i < MAX_XCP; i++) {
                INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work,
                                  amdgpu_gfx_enforce_isolation_handler);
                adev->gfx.enforce_isolation[i].adev = adev;
                adev->gfx.enforce_isolation[i].xcp_id = i;
        }

        INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);

        adev->gfx.gfx_off_req_count = 1;
        adev->gfx.gfx_off_residency = 0;
        adev->gfx.gfx_off_entrycount = 0;
        adev->pm.ac_power = power_supply_is_system_supplied() > 0;

        atomic_set(&adev->throttling_logging_enabled, 1);
        /*
         * If throttling continues, logging will be performed every minute
         * to avoid log flooding. "-1" is subtracted since the thermal
         * throttling interrupt comes every second. Thus, the total logging
         * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
         * for the throttling interrupt) = 60 seconds.
         */
        ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);

        ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);

        /* Registers mapping */
        /* TODO: block userspace mapping of io register */
        if (adev->asic_type >= CHIP_BONAIRE) {
                adev->rmmio_base = pci_resource_start(adev->pdev, 5);
                adev->rmmio_size = pci_resource_len(adev->pdev, 5);
        } else {
                adev->rmmio_base = pci_resource_start(adev->pdev, 2);
                adev->rmmio_size = pci_resource_len(adev->pdev, 2);
        }

        for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
                atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);

        adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
        if (!adev->rmmio)
                return -ENOMEM;

        DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
        DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);

        /*
         * The reset domain needs to be present early, before any XGMI hive is
         * discovered and initialized, so that the reset sem and in_gpu reset
         * flag can be used early on during init and before calling RREG32.
         */
        adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
        if (!adev->reset_domain)
                return -ENOMEM;

        /* detect hw virtualization here */
        amdgpu_virt_init(adev);

        amdgpu_device_get_pcie_info(adev);

        r = amdgpu_device_get_job_timeout_settings(adev);
        if (r) {
                dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
                return r;
        }

        amdgpu_device_set_mcbp(adev);

        /*
         * By default, use the default level where all blocks are expected to
         * be initialized. At present, 'swinit' of the blocks has to complete
         * before the need for a different level can be detected.
         */
        amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT);
        /* early init functions */
        r = amdgpu_device_ip_early_init(adev);
        if (r)
                return r;

        /*
         * No need to remove conflicting FBs for non-display class devices.
         * This prevents the sysfb from being freed accidentally.
         */
        if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA ||
            (pdev->class >> 8) == PCI_CLASS_DISPLAY_OTHER) {
                /* Get rid of things like offb */
                r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name);
                if (r)
                        return r;
        }

        /* Enable TMZ based on IP_VERSION */
        amdgpu_gmc_tmz_set(adev);

        if (amdgpu_sriov_vf(adev) &&
            amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
                /* VF MMIO access (except the mailbox range) from the CPU
                 * will be blocked during SR-IOV runtime
                 */
                adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;

        amdgpu_gmc_noretry_set(adev);
        /* Need to get XGMI info early to decide the reset behavior */
        if (adev->gmc.xgmi.supported) {
                r = adev->gfxhub.funcs->get_xgmi_info(adev);
                if (r)
                        return r;
        }

        /* enable PCIE atomic ops */
        if (amdgpu_sriov_vf(adev)) {
                if (adev->virt.fw_reserve.p_pf2vf)
                        adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
                                adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
                                (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
        /* APUs with gfx9 and newer don't rely on PCIe atomics; an internal
         * path natively supports atomics, so set have_atomics_support to true.
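         * For dGPUs, the else branch below uses pci_enable_atomic_ops_to_root()
         * to check that the PCIe path supports 32- and 64-bit AtomicOp
         * completion and to enable the device as an atomics requester.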
4544 */ 4545 } else if ((adev->flags & AMD_IS_APU) && 4546 (amdgpu_ip_version(adev, GC_HWIP, 0) > 4547 IP_VERSION(9, 0, 0))) { 4548 adev->have_atomics_support = true; 4549 } else { 4550 adev->have_atomics_support = 4551 !pci_enable_atomic_ops_to_root(adev->pdev, 4552 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 4553 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4554 } 4555 4556 if (!adev->have_atomics_support) 4557 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 4558 4559 /* doorbell bar mapping and doorbell index init*/ 4560 amdgpu_doorbell_init(adev); 4561 4562 if (amdgpu_emu_mode == 1) { 4563 /* post the asic on emulation mode */ 4564 emu_soc_asic_init(adev); 4565 goto fence_driver_init; 4566 } 4567 4568 amdgpu_reset_init(adev); 4569 4570 /* detect if we are with an SRIOV vbios */ 4571 if (adev->bios) 4572 amdgpu_device_detect_sriov_bios(adev); 4573 4574 /* check if we need to reset the asic 4575 * E.g., driver was not cleanly unloaded previously, etc. 4576 */ 4577 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4578 if (adev->gmc.xgmi.num_physical_nodes) { 4579 dev_info(adev->dev, "Pending hive reset.\n"); 4580 amdgpu_set_init_level(adev, 4581 AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 4582 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && 4583 !amdgpu_device_has_display_hardware(adev)) { 4584 r = psp_gpu_reset(adev); 4585 } else { 4586 tmp = amdgpu_reset_method; 4587 /* It should do a default reset when loading or reloading the driver, 4588 * regardless of the module parameter reset_method. 4589 */ 4590 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4591 r = amdgpu_asic_reset(adev); 4592 amdgpu_reset_method = tmp; 4593 } 4594 4595 if (r) { 4596 dev_err(adev->dev, "asic reset on init failed\n"); 4597 goto failed; 4598 } 4599 } 4600 4601 /* Post card if necessary */ 4602 if (amdgpu_device_need_post(adev)) { 4603 if (!adev->bios) { 4604 dev_err(adev->dev, "no vBIOS found\n"); 4605 r = -EINVAL; 4606 goto failed; 4607 } 4608 DRM_INFO("GPU posting now...\n"); 4609 r = amdgpu_device_asic_init(adev); 4610 if (r) { 4611 dev_err(adev->dev, "gpu post error!\n"); 4612 goto failed; 4613 } 4614 } 4615 4616 if (adev->bios) { 4617 if (adev->is_atom_fw) { 4618 /* Initialize clocks */ 4619 r = amdgpu_atomfirmware_get_clock_info(adev); 4620 if (r) { 4621 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4622 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4623 goto failed; 4624 } 4625 } else { 4626 /* Initialize clocks */ 4627 r = amdgpu_atombios_get_clock_info(adev); 4628 if (r) { 4629 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4630 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4631 goto failed; 4632 } 4633 /* init i2c buses */ 4634 amdgpu_i2c_init(adev); 4635 } 4636 } 4637 4638 fence_driver_init: 4639 /* Fence driver */ 4640 r = amdgpu_fence_driver_sw_init(adev); 4641 if (r) { 4642 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4643 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4644 goto failed; 4645 } 4646 4647 /* init the mode config */ 4648 drm_mode_config_init(adev_to_drm(adev)); 4649 4650 r = amdgpu_device_ip_init(adev); 4651 if (r) { 4652 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4653 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4654 goto release_ras_con; 4655 } 4656 4657 amdgpu_fence_driver_hw_init(adev); 4658 4659 dev_info(adev->dev, 4660 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4661 
adev->gfx.config.max_shader_engines, 4662 adev->gfx.config.max_sh_per_se, 4663 adev->gfx.config.max_cu_per_sh, 4664 adev->gfx.cu_info.number); 4665 4666 adev->accel_working = true; 4667 4668 amdgpu_vm_check_compute_bug(adev); 4669 4670 /* Initialize the buffer migration limit. */ 4671 if (amdgpu_moverate >= 0) 4672 max_MBps = amdgpu_moverate; 4673 else 4674 max_MBps = 8; /* Allow 8 MB/s. */ 4675 /* Get a log2 for easy divisions. */ 4676 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4677 4678 /* 4679 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4680 * Otherwise the mgpu fan boost feature will be skipped due to the 4681 * gpu instance is counted less. 4682 */ 4683 amdgpu_register_gpu_instance(adev); 4684 4685 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4686 * explicit gating rather than handling it automatically. 4687 */ 4688 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 4689 r = amdgpu_device_ip_late_init(adev); 4690 if (r) { 4691 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4692 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4693 goto release_ras_con; 4694 } 4695 /* must succeed. */ 4696 amdgpu_ras_resume(adev); 4697 queue_delayed_work(system_wq, &adev->delayed_init_work, 4698 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4699 } 4700 4701 if (amdgpu_sriov_vf(adev)) { 4702 amdgpu_virt_release_full_gpu(adev, true); 4703 flush_delayed_work(&adev->delayed_init_work); 4704 } 4705 4706 /* 4707 * Place those sysfs registering after `late_init`. As some of those 4708 * operations performed in `late_init` might affect the sysfs 4709 * interfaces creating. 4710 */ 4711 r = amdgpu_atombios_sysfs_init(adev); 4712 if (r) 4713 drm_err(&adev->ddev, 4714 "registering atombios sysfs failed (%d).\n", r); 4715 4716 r = amdgpu_pm_sysfs_init(adev); 4717 if (r) 4718 DRM_ERROR("registering pm sysfs failed (%d).\n", r); 4719 4720 r = amdgpu_ucode_sysfs_init(adev); 4721 if (r) { 4722 adev->ucode_sysfs_en = false; 4723 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); 4724 } else 4725 adev->ucode_sysfs_en = true; 4726 4727 r = amdgpu_device_attr_sysfs_init(adev); 4728 if (r) 4729 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4730 4731 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4732 if (r) 4733 dev_err(adev->dev, 4734 "Could not create amdgpu board attributes\n"); 4735 4736 amdgpu_fru_sysfs_init(adev); 4737 amdgpu_reg_state_sysfs_init(adev); 4738 amdgpu_xcp_sysfs_init(adev); 4739 4740 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4741 r = amdgpu_pmu_init(adev); 4742 if (r) 4743 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4744 4745 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4746 if (amdgpu_device_cache_pci_state(adev->pdev)) 4747 pci_restore_state(pdev); 4748 4749 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4750 /* this will fail for cards that aren't VGA class devices, just 4751 * ignore it 4752 */ 4753 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4754 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4755 4756 px = amdgpu_device_supports_px(ddev); 4757 4758 if (px || (!dev_is_removable(&adev->pdev->dev) && 4759 apple_gmux_detect(NULL, NULL))) 4760 vga_switcheroo_register_client(adev->pdev, 4761 &amdgpu_switcheroo_ops, px); 4762 4763 if (px) 4764 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4765 4766 if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 4767 
amdgpu_xgmi_reset_on_init(adev); 4768 4769 amdgpu_device_check_iommu_direct_map(adev); 4770 4771 adev->pm_nb.notifier_call = amdgpu_device_pm_notifier; 4772 r = register_pm_notifier(&adev->pm_nb); 4773 if (r) 4774 goto failed; 4775 4776 return 0; 4777 4778 release_ras_con: 4779 if (amdgpu_sriov_vf(adev)) 4780 amdgpu_virt_release_full_gpu(adev, true); 4781 4782 /* failed in exclusive mode due to timeout */ 4783 if (amdgpu_sriov_vf(adev) && 4784 !amdgpu_sriov_runtime(adev) && 4785 amdgpu_virt_mmio_blocked(adev) && 4786 !amdgpu_virt_wait_reset(adev)) { 4787 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4788 /* Don't send request since VF is inactive. */ 4789 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4790 adev->virt.ops = NULL; 4791 r = -EAGAIN; 4792 } 4793 amdgpu_release_ras_context(adev); 4794 4795 failed: 4796 amdgpu_vf_error_trans_all(adev); 4797 4798 return r; 4799 } 4800 4801 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4802 { 4803 4804 /* Clear all CPU mappings pointing to this device */ 4805 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4806 4807 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4808 amdgpu_doorbell_fini(adev); 4809 4810 iounmap(adev->rmmio); 4811 adev->rmmio = NULL; 4812 if (adev->mman.aper_base_kaddr) 4813 iounmap(adev->mman.aper_base_kaddr); 4814 adev->mman.aper_base_kaddr = NULL; 4815 4816 /* Memory manager related */ 4817 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4818 arch_phys_wc_del(adev->gmc.vram_mtrr); 4819 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4820 } 4821 } 4822 4823 /** 4824 * amdgpu_device_fini_hw - tear down the driver 4825 * 4826 * @adev: amdgpu_device pointer 4827 * 4828 * Tear down the driver info (all asics). 4829 * Called at driver shutdown. 
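 *
 * This is the hardware-facing half of teardown; the remaining software state
 * is released afterwards in amdgpu_device_fini_sw().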
4830 */ 4831 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4832 { 4833 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4834 flush_delayed_work(&adev->delayed_init_work); 4835 4836 if (adev->mman.initialized) 4837 drain_workqueue(adev->mman.bdev.wq); 4838 adev->shutdown = true; 4839 4840 unregister_pm_notifier(&adev->pm_nb); 4841 4842 /* make sure IB test finished before entering exclusive mode 4843 * to avoid preemption on IB test 4844 */ 4845 if (amdgpu_sriov_vf(adev)) { 4846 amdgpu_virt_request_full_gpu(adev, false); 4847 amdgpu_virt_fini_data_exchange(adev); 4848 } 4849 4850 /* disable all interrupts */ 4851 amdgpu_irq_disable_all(adev); 4852 if (adev->mode_info.mode_config_initialized) { 4853 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4854 drm_helper_force_disable_all(adev_to_drm(adev)); 4855 else 4856 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4857 } 4858 amdgpu_fence_driver_hw_fini(adev); 4859 4860 if (adev->pm.sysfs_initialized) 4861 amdgpu_pm_sysfs_fini(adev); 4862 if (adev->ucode_sysfs_en) 4863 amdgpu_ucode_sysfs_fini(adev); 4864 amdgpu_device_attr_sysfs_fini(adev); 4865 amdgpu_fru_sysfs_fini(adev); 4866 4867 amdgpu_reg_state_sysfs_fini(adev); 4868 amdgpu_xcp_sysfs_fini(adev); 4869 4870 /* disable ras feature must before hw fini */ 4871 amdgpu_ras_pre_fini(adev); 4872 4873 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4874 4875 amdgpu_device_ip_fini_early(adev); 4876 4877 amdgpu_irq_fini_hw(adev); 4878 4879 if (adev->mman.initialized) 4880 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4881 4882 amdgpu_gart_dummy_page_fini(adev); 4883 4884 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4885 amdgpu_device_unmap_mmio(adev); 4886 4887 } 4888 4889 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4890 { 4891 int i, idx; 4892 bool px; 4893 4894 amdgpu_device_ip_fini(adev); 4895 amdgpu_fence_driver_sw_fini(adev); 4896 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4897 adev->accel_working = false; 4898 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4899 for (i = 0; i < MAX_XCP; ++i) { 4900 dma_fence_put(adev->isolation[i].spearhead); 4901 amdgpu_sync_free(&adev->isolation[i].active); 4902 amdgpu_sync_free(&adev->isolation[i].prev); 4903 } 4904 4905 amdgpu_reset_fini(adev); 4906 4907 /* free i2c buses */ 4908 amdgpu_i2c_fini(adev); 4909 4910 if (adev->bios) { 4911 if (amdgpu_emu_mode != 1) 4912 amdgpu_atombios_fini(adev); 4913 amdgpu_bios_release(adev); 4914 } 4915 4916 kfree(adev->fru_info); 4917 adev->fru_info = NULL; 4918 4919 kfree(adev->xcp_mgr); 4920 adev->xcp_mgr = NULL; 4921 4922 px = amdgpu_device_supports_px(adev_to_drm(adev)); 4923 4924 if (px || (!dev_is_removable(&adev->pdev->dev) && 4925 apple_gmux_detect(NULL, NULL))) 4926 vga_switcheroo_unregister_client(adev->pdev); 4927 4928 if (px) 4929 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4930 4931 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4932 vga_client_unregister(adev->pdev); 4933 4934 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 4935 4936 iounmap(adev->rmmio); 4937 adev->rmmio = NULL; 4938 drm_dev_exit(idx); 4939 } 4940 4941 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4942 amdgpu_pmu_fini(adev); 4943 if (adev->mman.discovery_bin) 4944 amdgpu_discovery_fini(adev); 4945 4946 amdgpu_reset_put_reset_domain(adev->reset_domain); 4947 adev->reset_domain = NULL; 4948 4949 kfree(adev->pci_state); 4950 4951 } 4952 4953 /** 4954 * amdgpu_device_evict_resources - evict device resources 4955 * @adev: amdgpu device object 4956 * 4957 * Evicts all ttm device resources(vram 
BOs, gart table) from the lru list 4958 * of the vram memory type. Mainly used for evicting device resources 4959 * at suspend time. 4960 * 4961 */ 4962 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 4963 { 4964 int ret; 4965 4966 /* No need to evict vram on APUs unless going to S4 */ 4967 if (!adev->in_s4 && (adev->flags & AMD_IS_APU)) 4968 return 0; 4969 4970 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 4971 if (ret) 4972 DRM_WARN("evicting device resources failed\n"); 4973 return ret; 4974 } 4975 4976 /* 4977 * Suspend & resume. 4978 */ 4979 /** 4980 * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events 4981 * @nb: notifier block 4982 * @mode: suspend mode 4983 * @data: data 4984 * 4985 * This function is called when the system is about to suspend or hibernate. 4986 * It is used to set the appropriate flags so that eviction can be optimized 4987 * in the pm prepare callback. 4988 */ 4989 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 4990 void *data) 4991 { 4992 struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb); 4993 4994 switch (mode) { 4995 case PM_HIBERNATION_PREPARE: 4996 adev->in_s4 = true; 4997 break; 4998 case PM_POST_HIBERNATION: 4999 adev->in_s4 = false; 5000 break; 5001 } 5002 5003 return NOTIFY_DONE; 5004 } 5005 5006 /** 5007 * amdgpu_device_prepare - prepare for device suspend 5008 * 5009 * @dev: drm dev pointer 5010 * 5011 * Prepare to put the hw in the suspend state (all asics). 5012 * Returns 0 for success or an error on failure. 5013 * Called at driver suspend. 5014 */ 5015 int amdgpu_device_prepare(struct drm_device *dev) 5016 { 5017 struct amdgpu_device *adev = drm_to_adev(dev); 5018 int i, r; 5019 5020 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5021 return 0; 5022 5023 /* Evict the majority of BOs before starting suspend sequence */ 5024 r = amdgpu_device_evict_resources(adev); 5025 if (r) 5026 return r; 5027 5028 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 5029 5030 for (i = 0; i < adev->num_ip_blocks; i++) { 5031 if (!adev->ip_blocks[i].status.valid) 5032 continue; 5033 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 5034 continue; 5035 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]); 5036 if (r) 5037 return r; 5038 } 5039 5040 return 0; 5041 } 5042 5043 /** 5044 * amdgpu_device_complete - complete power state transition 5045 * 5046 * @dev: drm dev pointer 5047 * 5048 * Undo the changes from amdgpu_device_prepare. This will be 5049 * called on all resume transitions, including those that failed. 5050 */ 5051 void amdgpu_device_complete(struct drm_device *dev) 5052 { 5053 struct amdgpu_device *adev = drm_to_adev(dev); 5054 int i; 5055 5056 for (i = 0; i < adev->num_ip_blocks; i++) { 5057 if (!adev->ip_blocks[i].status.valid) 5058 continue; 5059 if (!adev->ip_blocks[i].version->funcs->complete) 5060 continue; 5061 adev->ip_blocks[i].version->funcs->complete(&adev->ip_blocks[i]); 5062 } 5063 } 5064 5065 /** 5066 * amdgpu_device_suspend - initiate device suspend 5067 * 5068 * @dev: drm dev pointer 5069 * @notify_clients: notify in-kernel DRM clients 5070 * 5071 * Puts the hw in the suspend state (all asics). 5072 * Returns 0 for success or an error on failure. 5073 * Called at driver suspend. 
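 *
 * Most buffer objects are evicted beforehand in amdgpu_device_evict_resources()
 * via amdgpu_device_prepare(); this function evicts once more before the
 * phase-2 IP suspend.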
5074 */ 5075 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) 5076 { 5077 struct amdgpu_device *adev = drm_to_adev(dev); 5078 int r = 0; 5079 5080 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5081 return 0; 5082 5083 adev->in_suspend = true; 5084 5085 if (amdgpu_sriov_vf(adev)) { 5086 if (!adev->in_s0ix && !adev->in_runpm) 5087 amdgpu_amdkfd_suspend_process(adev); 5088 amdgpu_virt_fini_data_exchange(adev); 5089 r = amdgpu_virt_request_full_gpu(adev, false); 5090 if (r) 5091 return r; 5092 } 5093 5094 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) 5095 DRM_WARN("smart shift update failed\n"); 5096 5097 if (notify_clients) 5098 drm_client_dev_suspend(adev_to_drm(adev), false); 5099 5100 cancel_delayed_work_sync(&adev->delayed_init_work); 5101 5102 amdgpu_ras_suspend(adev); 5103 5104 amdgpu_device_ip_suspend_phase1(adev); 5105 5106 if (!adev->in_s0ix) { 5107 amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); 5108 amdgpu_userq_suspend(adev); 5109 } 5110 5111 r = amdgpu_device_evict_resources(adev); 5112 if (r) 5113 return r; 5114 5115 amdgpu_ttm_set_buffer_funcs_status(adev, false); 5116 5117 amdgpu_fence_driver_hw_fini(adev); 5118 5119 amdgpu_device_ip_suspend_phase2(adev); 5120 5121 if (amdgpu_sriov_vf(adev)) 5122 amdgpu_virt_release_full_gpu(adev, false); 5123 5124 r = amdgpu_dpm_notify_rlc_state(adev, false); 5125 if (r) 5126 return r; 5127 5128 return 0; 5129 } 5130 5131 static inline int amdgpu_virt_resume(struct amdgpu_device *adev) 5132 { 5133 int r; 5134 unsigned int prev_physical_node_id = adev->gmc.xgmi.physical_node_id; 5135 5136 /* During VM resume, QEMU programming of VF MSIX table (register GFXMSIX_VECT0_ADDR_LO) 5137 * may not work. The access could be blocked by nBIF protection as VF isn't in 5138 * exclusive access mode. Exclusive access is enabled now, disable/enable MSIX 5139 * so that QEMU reprograms MSIX table. 5140 */ 5141 amdgpu_restore_msix(adev); 5142 5143 r = adev->gfxhub.funcs->get_xgmi_info(adev); 5144 if (r) 5145 return r; 5146 5147 dev_info(adev->dev, "xgmi node, old id %d, new id %d\n", 5148 prev_physical_node_id, adev->gmc.xgmi.physical_node_id); 5149 5150 adev->vm_manager.vram_base_offset = adev->gfxhub.funcs->get_mc_fb_offset(adev); 5151 adev->vm_manager.vram_base_offset += 5152 adev->gmc.xgmi.physical_node_id * adev->gmc.xgmi.node_segment_size; 5153 5154 return 0; 5155 } 5156 5157 /** 5158 * amdgpu_device_resume - initiate device resume 5159 * 5160 * @dev: drm dev pointer 5161 * @notify_clients: notify in-kernel DRM clients 5162 * 5163 * Bring the hw back to operating state (all asics). 5164 * Returns 0 for success or an error on failure. 5165 * Called at driver resume. 
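 *
 * Roughly mirrors amdgpu_device_suspend() in reverse: regain full GPU access
 * under SR-IOV, re-post the card if needed, resume the IP blocks, then KFD
 * and user queues, run late init and finally notify DRM clients and RAS.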
5166 */ 5167 int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) 5168 { 5169 struct amdgpu_device *adev = drm_to_adev(dev); 5170 int r = 0; 5171 5172 if (amdgpu_sriov_vf(adev)) { 5173 r = amdgpu_virt_request_full_gpu(adev, true); 5174 if (r) 5175 return r; 5176 } 5177 5178 if (amdgpu_virt_xgmi_migrate_enabled(adev)) { 5179 r = amdgpu_virt_resume(adev); 5180 if (r) 5181 goto exit; 5182 } 5183 5184 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5185 return 0; 5186 5187 if (adev->in_s0ix) 5188 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 5189 5190 /* post card */ 5191 if (amdgpu_device_need_post(adev)) { 5192 r = amdgpu_device_asic_init(adev); 5193 if (r) 5194 dev_err(adev->dev, "amdgpu asic init failed\n"); 5195 } 5196 5197 r = amdgpu_device_ip_resume(adev); 5198 5199 if (r) { 5200 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 5201 goto exit; 5202 } 5203 5204 if (!adev->in_s0ix) { 5205 r = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); 5206 if (r) 5207 goto exit; 5208 5209 r = amdgpu_userq_resume(adev); 5210 if (r) 5211 goto exit; 5212 } 5213 5214 r = amdgpu_device_ip_late_init(adev); 5215 if (r) 5216 goto exit; 5217 5218 queue_delayed_work(system_wq, &adev->delayed_init_work, 5219 msecs_to_jiffies(AMDGPU_RESUME_MS)); 5220 exit: 5221 if (amdgpu_sriov_vf(adev)) { 5222 amdgpu_virt_init_data_exchange(adev); 5223 amdgpu_virt_release_full_gpu(adev, true); 5224 5225 if (!adev->in_s0ix && !r && !adev->in_runpm) 5226 r = amdgpu_amdkfd_resume_process(adev); 5227 } 5228 5229 if (r) 5230 return r; 5231 5232 /* Make sure IB tests flushed */ 5233 flush_delayed_work(&adev->delayed_init_work); 5234 5235 if (notify_clients) 5236 drm_client_dev_resume(adev_to_drm(adev), false); 5237 5238 amdgpu_ras_resume(adev); 5239 5240 if (adev->mode_info.num_crtc) { 5241 /* 5242 * Most of the connector probing functions try to acquire runtime pm 5243 * refs to ensure that the GPU is powered on when connector polling is 5244 * performed. Since we're calling this from a runtime PM callback, 5245 * trying to acquire rpm refs will cause us to deadlock. 5246 * 5247 * Since we're guaranteed to be holding the rpm lock, it's safe to 5248 * temporarily disable the rpm helpers so this doesn't deadlock us. 5249 */ 5250 #ifdef CONFIG_PM 5251 dev->dev->power.disable_depth++; 5252 #endif 5253 if (!adev->dc_enabled) 5254 drm_helper_hpd_irq_event(dev); 5255 else 5256 drm_kms_helper_hotplug_event(dev); 5257 #ifdef CONFIG_PM 5258 dev->dev->power.disable_depth--; 5259 #endif 5260 } 5261 adev->in_suspend = false; 5262 5263 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) 5264 DRM_WARN("smart shift update failed\n"); 5265 5266 return 0; 5267 } 5268 5269 /** 5270 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 5271 * 5272 * @adev: amdgpu_device pointer 5273 * 5274 * The list of all the hardware IPs that make up the asic is walked and 5275 * the check_soft_reset callbacks are run. check_soft_reset determines 5276 * if the asic is still hung or not. 5277 * Returns true if any of the IPs are still in a hung state, false if not. 
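 *
 * Under SR-IOV, or when the ASIC reports that it needs a full reset, this
 * returns true without consulting the per-IP check_soft_reset callbacks.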
5278 */ 5279 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 5280 { 5281 int i; 5282 bool asic_hang = false; 5283 5284 if (amdgpu_sriov_vf(adev)) 5285 return true; 5286 5287 if (amdgpu_asic_need_full_reset(adev)) 5288 return true; 5289 5290 for (i = 0; i < adev->num_ip_blocks; i++) { 5291 if (!adev->ip_blocks[i].status.valid) 5292 continue; 5293 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 5294 adev->ip_blocks[i].status.hang = 5295 adev->ip_blocks[i].version->funcs->check_soft_reset( 5296 &adev->ip_blocks[i]); 5297 if (adev->ip_blocks[i].status.hang) { 5298 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 5299 asic_hang = true; 5300 } 5301 } 5302 return asic_hang; 5303 } 5304 5305 /** 5306 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 5307 * 5308 * @adev: amdgpu_device pointer 5309 * 5310 * The list of all the hardware IPs that make up the asic is walked and the 5311 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 5312 * handles any IP specific hardware or software state changes that are 5313 * necessary for a soft reset to succeed. 5314 * Returns 0 on success, negative error code on failure. 5315 */ 5316 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 5317 { 5318 int i, r = 0; 5319 5320 for (i = 0; i < adev->num_ip_blocks; i++) { 5321 if (!adev->ip_blocks[i].status.valid) 5322 continue; 5323 if (adev->ip_blocks[i].status.hang && 5324 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 5325 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]); 5326 if (r) 5327 return r; 5328 } 5329 } 5330 5331 return 0; 5332 } 5333 5334 /** 5335 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 5336 * 5337 * @adev: amdgpu_device pointer 5338 * 5339 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 5340 * reset is necessary to recover. 5341 * Returns true if a full asic reset is required, false if not. 5342 */ 5343 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 5344 { 5345 int i; 5346 5347 if (amdgpu_asic_need_full_reset(adev)) 5348 return true; 5349 5350 for (i = 0; i < adev->num_ip_blocks; i++) { 5351 if (!adev->ip_blocks[i].status.valid) 5352 continue; 5353 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 5354 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 5355 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 5356 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 5357 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 5358 if (adev->ip_blocks[i].status.hang) { 5359 dev_info(adev->dev, "Some block need full reset!\n"); 5360 return true; 5361 } 5362 } 5363 } 5364 return false; 5365 } 5366 5367 /** 5368 * amdgpu_device_ip_soft_reset - do a soft reset 5369 * 5370 * @adev: amdgpu_device pointer 5371 * 5372 * The list of all the hardware IPs that make up the asic is walked and the 5373 * soft_reset callbacks are run if the block is hung. soft_reset handles any 5374 * IP specific hardware or software state changes that are necessary to soft 5375 * reset the IP. 5376 * Returns 0 on success, negative error code on failure. 
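 *
 * Only blocks whose status.hang flag was set by
 * amdgpu_device_ip_check_soft_reset() are soft reset here.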
5377 */ 5378 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 5379 { 5380 int i, r = 0; 5381 5382 for (i = 0; i < adev->num_ip_blocks; i++) { 5383 if (!adev->ip_blocks[i].status.valid) 5384 continue; 5385 if (adev->ip_blocks[i].status.hang && 5386 adev->ip_blocks[i].version->funcs->soft_reset) { 5387 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]); 5388 if (r) 5389 return r; 5390 } 5391 } 5392 5393 return 0; 5394 } 5395 5396 /** 5397 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 5398 * 5399 * @adev: amdgpu_device pointer 5400 * 5401 * The list of all the hardware IPs that make up the asic is walked and the 5402 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 5403 * handles any IP specific hardware or software state changes that are 5404 * necessary after the IP has been soft reset. 5405 * Returns 0 on success, negative error code on failure. 5406 */ 5407 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 5408 { 5409 int i, r = 0; 5410 5411 for (i = 0; i < adev->num_ip_blocks; i++) { 5412 if (!adev->ip_blocks[i].status.valid) 5413 continue; 5414 if (adev->ip_blocks[i].status.hang && 5415 adev->ip_blocks[i].version->funcs->post_soft_reset) 5416 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]); 5417 if (r) 5418 return r; 5419 } 5420 5421 return 0; 5422 } 5423 5424 /** 5425 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 5426 * 5427 * @adev: amdgpu_device pointer 5428 * @reset_context: amdgpu reset context pointer 5429 * 5430 * do VF FLR and reinitialize Asic 5431 * return 0 means succeeded otherwise failed 5432 */ 5433 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 5434 struct amdgpu_reset_context *reset_context) 5435 { 5436 int r; 5437 struct amdgpu_hive_info *hive = NULL; 5438 5439 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { 5440 if (!amdgpu_ras_get_fed_status(adev)) 5441 amdgpu_virt_ready_to_reset(adev); 5442 amdgpu_virt_wait_reset(adev); 5443 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5444 r = amdgpu_virt_request_full_gpu(adev, true); 5445 } else { 5446 r = amdgpu_virt_reset_gpu(adev); 5447 } 5448 if (r) 5449 return r; 5450 5451 amdgpu_ras_clear_err_state(adev); 5452 amdgpu_irq_gpu_reset_resume_helper(adev); 5453 5454 /* some sw clean up VF needs to do before recover */ 5455 amdgpu_virt_post_reset(adev); 5456 5457 /* Resume IP prior to SMC */ 5458 r = amdgpu_device_ip_reinit_early_sriov(adev); 5459 if (r) 5460 return r; 5461 5462 amdgpu_virt_init_data_exchange(adev); 5463 5464 r = amdgpu_device_fw_loading(adev); 5465 if (r) 5466 return r; 5467 5468 /* now we are okay to resume SMC/CP/SDMA */ 5469 r = amdgpu_device_ip_reinit_late_sriov(adev); 5470 if (r) 5471 return r; 5472 5473 hive = amdgpu_get_xgmi_hive(adev); 5474 /* Update PSP FW topology after reset */ 5475 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5476 r = amdgpu_xgmi_update_topology(hive, adev); 5477 if (hive) 5478 amdgpu_put_xgmi_hive(hive); 5479 if (r) 5480 return r; 5481 5482 r = amdgpu_ib_ring_tests(adev); 5483 if (r) 5484 return r; 5485 5486 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) 5487 amdgpu_inc_vram_lost(adev); 5488 5489 /* need to be called during full access so we can't do it later like 5490 * bare-metal does. 
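         * Full access is dropped right below via amdgpu_virt_release_full_gpu().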
5491 */ 5492 amdgpu_amdkfd_post_reset(adev); 5493 amdgpu_virt_release_full_gpu(adev, true); 5494 5495 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5496 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) || 5497 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 5498 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 5499 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) || 5500 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5501 amdgpu_ras_resume(adev); 5502 5503 amdgpu_virt_ras_telemetry_post_reset(adev); 5504 5505 return 0; 5506 } 5507 5508 /** 5509 * amdgpu_device_has_job_running - check if there is any unfinished job 5510 * 5511 * @adev: amdgpu_device pointer 5512 * 5513 * check if there is any job running on the device when guest driver receives 5514 * FLR notification from host driver. If there are still jobs running, then 5515 * the guest driver will not respond the FLR reset. Instead, let the job hit 5516 * the timeout and guest driver then issue the reset request. 5517 */ 5518 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5519 { 5520 int i; 5521 5522 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5523 struct amdgpu_ring *ring = adev->rings[i]; 5524 5525 if (!amdgpu_ring_sched_ready(ring)) 5526 continue; 5527 5528 if (amdgpu_fence_count_emitted(ring)) 5529 return true; 5530 } 5531 return false; 5532 } 5533 5534 /** 5535 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5536 * 5537 * @adev: amdgpu_device pointer 5538 * 5539 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5540 * a hung GPU. 5541 */ 5542 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5543 { 5544 5545 if (amdgpu_gpu_recovery == 0) 5546 goto disabled; 5547 5548 /* Skip soft reset check in fatal error mode */ 5549 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5550 return true; 5551 5552 if (amdgpu_sriov_vf(adev)) 5553 return true; 5554 5555 if (amdgpu_gpu_recovery == -1) { 5556 switch (adev->asic_type) { 5557 #ifdef CONFIG_DRM_AMDGPU_SI 5558 case CHIP_VERDE: 5559 case CHIP_TAHITI: 5560 case CHIP_PITCAIRN: 5561 case CHIP_OLAND: 5562 case CHIP_HAINAN: 5563 #endif 5564 #ifdef CONFIG_DRM_AMDGPU_CIK 5565 case CHIP_KAVERI: 5566 case CHIP_KABINI: 5567 case CHIP_MULLINS: 5568 #endif 5569 case CHIP_CARRIZO: 5570 case CHIP_STONEY: 5571 case CHIP_CYAN_SKILLFISH: 5572 goto disabled; 5573 default: 5574 break; 5575 } 5576 } 5577 5578 return true; 5579 5580 disabled: 5581 dev_info(adev->dev, "GPU recovery disabled.\n"); 5582 return false; 5583 } 5584 5585 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5586 { 5587 u32 i; 5588 int ret = 0; 5589 5590 if (adev->bios) 5591 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5592 5593 dev_info(adev->dev, "GPU mode1 reset\n"); 5594 5595 /* Cache the state before bus master disable. The saved config space 5596 * values are used in other cases like restore after mode-2 reset. 
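         * The cached state is restored with amdgpu_device_load_pci_state() once
         * the reset itself has completed.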
5597 */ 5598 amdgpu_device_cache_pci_state(adev->pdev); 5599 5600 /* disable BM */ 5601 pci_clear_master(adev->pdev); 5602 5603 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5604 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5605 ret = amdgpu_dpm_mode1_reset(adev); 5606 } else { 5607 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5608 ret = psp_gpu_reset(adev); 5609 } 5610 5611 if (ret) 5612 goto mode1_reset_failed; 5613 5614 amdgpu_device_load_pci_state(adev->pdev); 5615 ret = amdgpu_psp_wait_for_bootloader(adev); 5616 if (ret) 5617 goto mode1_reset_failed; 5618 5619 /* wait for asic to come out of reset */ 5620 for (i = 0; i < adev->usec_timeout; i++) { 5621 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5622 5623 if (memsize != 0xffffffff) 5624 break; 5625 udelay(1); 5626 } 5627 5628 if (i >= adev->usec_timeout) { 5629 ret = -ETIMEDOUT; 5630 goto mode1_reset_failed; 5631 } 5632 5633 if (adev->bios) 5634 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5635 5636 return 0; 5637 5638 mode1_reset_failed: 5639 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5640 return ret; 5641 } 5642 5643 int amdgpu_device_link_reset(struct amdgpu_device *adev) 5644 { 5645 int ret = 0; 5646 5647 dev_info(adev->dev, "GPU link reset\n"); 5648 5649 if (!adev->pcie_reset_ctx.occurs_dpc) 5650 ret = amdgpu_dpm_link_reset(adev); 5651 5652 if (ret) 5653 goto link_reset_failed; 5654 5655 ret = amdgpu_psp_wait_for_bootloader(adev); 5656 if (ret) 5657 goto link_reset_failed; 5658 5659 return 0; 5660 5661 link_reset_failed: 5662 dev_err(adev->dev, "GPU link reset failed\n"); 5663 return ret; 5664 } 5665 5666 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5667 struct amdgpu_reset_context *reset_context) 5668 { 5669 int i, r = 0; 5670 struct amdgpu_job *job = NULL; 5671 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; 5672 bool need_full_reset = 5673 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5674 5675 if (reset_context->reset_req_dev == adev) 5676 job = reset_context->job; 5677 5678 if (amdgpu_sriov_vf(adev)) 5679 amdgpu_virt_pre_reset(adev); 5680 5681 amdgpu_fence_driver_isr_toggle(adev, true); 5682 5683 /* block all schedulers and reset given job's ring */ 5684 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5685 struct amdgpu_ring *ring = adev->rings[i]; 5686 5687 if (!amdgpu_ring_sched_ready(ring)) 5688 continue; 5689 5690 /* Clear job fence from fence drv to avoid force_completion 5691 * leave NULL and vm flush fence in fence drv 5692 */ 5693 amdgpu_fence_driver_clear_job_fences(ring); 5694 5695 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5696 amdgpu_fence_driver_force_completion(ring); 5697 } 5698 5699 amdgpu_fence_driver_isr_toggle(adev, false); 5700 5701 if (job && job->vm) 5702 drm_sched_increase_karma(&job->base); 5703 5704 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5705 /* If reset handler not implemented, continue; otherwise return */ 5706 if (r == -EOPNOTSUPP) 5707 r = 0; 5708 else 5709 return r; 5710 5711 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5712 if (!amdgpu_sriov_vf(adev)) { 5713 5714 if (!need_full_reset) 5715 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5716 5717 if (!need_full_reset && amdgpu_gpu_recovery && 5718 amdgpu_device_ip_check_soft_reset(adev)) { 5719 amdgpu_device_ip_pre_soft_reset(adev); 5720 r = amdgpu_device_ip_soft_reset(adev); 5721 amdgpu_device_ip_post_soft_reset(adev); 5722 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5723 
dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5724 need_full_reset = true; 5725 } 5726 } 5727 5728 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { 5729 dev_info(tmp_adev->dev, "Dumping IP State\n"); 5730 /* Trigger ip dump before we reset the asic */ 5731 for (i = 0; i < tmp_adev->num_ip_blocks; i++) 5732 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) 5733 tmp_adev->ip_blocks[i].version->funcs 5734 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]); 5735 dev_info(tmp_adev->dev, "Dumping IP State Completed\n"); 5736 } 5737 5738 if (need_full_reset) 5739 r = amdgpu_device_ip_suspend(adev); 5740 if (need_full_reset) 5741 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5742 else 5743 clear_bit(AMDGPU_NEED_FULL_RESET, 5744 &reset_context->flags); 5745 } 5746 5747 return r; 5748 } 5749 5750 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) 5751 { 5752 struct list_head *device_list_handle; 5753 bool full_reset, vram_lost = false; 5754 struct amdgpu_device *tmp_adev; 5755 int r, init_level; 5756 5757 device_list_handle = reset_context->reset_device_list; 5758 5759 if (!device_list_handle) 5760 return -EINVAL; 5761 5762 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5763 5764 /** 5765 * If it's reset on init, it's default init level, otherwise keep level 5766 * as recovery level. 5767 */ 5768 if (reset_context->method == AMD_RESET_METHOD_ON_INIT) 5769 init_level = AMDGPU_INIT_LEVEL_DEFAULT; 5770 else 5771 init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY; 5772 5773 r = 0; 5774 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5775 amdgpu_set_init_level(tmp_adev, init_level); 5776 if (full_reset) { 5777 /* post card */ 5778 amdgpu_ras_clear_err_state(tmp_adev); 5779 r = amdgpu_device_asic_init(tmp_adev); 5780 if (r) { 5781 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5782 } else { 5783 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5784 5785 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5786 if (r) 5787 goto out; 5788 5789 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5790 5791 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 5792 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job); 5793 5794 if (vram_lost) { 5795 DRM_INFO("VRAM is lost due to GPU reset!\n"); 5796 amdgpu_inc_vram_lost(tmp_adev); 5797 } 5798 5799 r = amdgpu_device_fw_loading(tmp_adev); 5800 if (r) 5801 return r; 5802 5803 r = amdgpu_xcp_restore_partition_mode( 5804 tmp_adev->xcp_mgr); 5805 if (r) 5806 goto out; 5807 5808 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5809 if (r) 5810 goto out; 5811 5812 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 5813 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5814 5815 r = amdgpu_device_ip_resume_phase3(tmp_adev); 5816 if (r) 5817 goto out; 5818 5819 if (vram_lost) 5820 amdgpu_device_fill_reset_magic(tmp_adev); 5821 5822 /* 5823 * Add this ASIC as tracked as reset was already 5824 * complete successfully. 5825 */ 5826 amdgpu_register_gpu_instance(tmp_adev); 5827 5828 if (!reset_context->hive && 5829 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5830 amdgpu_xgmi_add_device(tmp_adev); 5831 5832 r = amdgpu_device_ip_late_init(tmp_adev); 5833 if (r) 5834 goto out; 5835 5836 drm_client_dev_resume(adev_to_drm(tmp_adev), false); 5837 5838 /* 5839 * The GPU enters bad state once faulty pages 5840 * by ECC has reached the threshold, and ras 5841 * recovery is scheduled next. 
So add one check 5842 * here to break recovery if it indeed exceeds 5843 * bad page threshold, and remind user to 5844 * retire this GPU or setting one bigger 5845 * bad_page_threshold value to fix this once 5846 * probing driver again. 5847 */ 5848 if (!amdgpu_ras_is_rma(tmp_adev)) { 5849 /* must succeed. */ 5850 amdgpu_ras_resume(tmp_adev); 5851 } else { 5852 r = -EINVAL; 5853 goto out; 5854 } 5855 5856 /* Update PSP FW topology after reset */ 5857 if (reset_context->hive && 5858 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5859 r = amdgpu_xgmi_update_topology( 5860 reset_context->hive, tmp_adev); 5861 } 5862 } 5863 5864 out: 5865 if (!r) { 5866 /* IP init is complete now, set level as default */ 5867 amdgpu_set_init_level(tmp_adev, 5868 AMDGPU_INIT_LEVEL_DEFAULT); 5869 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5870 r = amdgpu_ib_ring_tests(tmp_adev); 5871 if (r) { 5872 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5873 r = -EAGAIN; 5874 goto end; 5875 } 5876 } 5877 5878 if (r) 5879 tmp_adev->asic_reset_res = r; 5880 } 5881 5882 end: 5883 return r; 5884 } 5885 5886 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5887 struct amdgpu_reset_context *reset_context) 5888 { 5889 struct amdgpu_device *tmp_adev = NULL; 5890 bool need_full_reset, skip_hw_reset; 5891 int r = 0; 5892 5893 /* Try reset handler method first */ 5894 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5895 reset_list); 5896 5897 reset_context->reset_device_list = device_list_handle; 5898 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5899 /* If reset handler not implemented, continue; otherwise return */ 5900 if (r == -EOPNOTSUPP) 5901 r = 0; 5902 else 5903 return r; 5904 5905 /* Reset handler not implemented, use the default method */ 5906 need_full_reset = 5907 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5908 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5909 5910 /* 5911 * ASIC reset has to be done on all XGMI hive nodes ASAP 5912 * to allow proper links negotiation in FW (within 1 sec) 5913 */ 5914 if (!skip_hw_reset && need_full_reset) { 5915 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5916 /* For XGMI run all resets in parallel to speed up the process */ 5917 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5918 if (!queue_work(system_unbound_wq, 5919 &tmp_adev->xgmi_reset_work)) 5920 r = -EALREADY; 5921 } else 5922 r = amdgpu_asic_reset(tmp_adev); 5923 5924 if (r) { 5925 dev_err(tmp_adev->dev, 5926 "ASIC reset failed with error, %d for drm dev, %s", 5927 r, adev_to_drm(tmp_adev)->unique); 5928 goto out; 5929 } 5930 } 5931 5932 /* For XGMI wait for all resets to complete before proceed */ 5933 if (!r) { 5934 list_for_each_entry(tmp_adev, device_list_handle, 5935 reset_list) { 5936 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5937 flush_work(&tmp_adev->xgmi_reset_work); 5938 r = tmp_adev->asic_reset_res; 5939 if (r) 5940 break; 5941 } 5942 } 5943 } 5944 } 5945 5946 if (!r && amdgpu_ras_intr_triggered()) { 5947 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5948 amdgpu_ras_reset_error_count(tmp_adev, 5949 AMDGPU_RAS_BLOCK__MMHUB); 5950 } 5951 5952 amdgpu_ras_intr_cleared(); 5953 } 5954 5955 r = amdgpu_device_reinit_after_reset(reset_context); 5956 if (r == -EAGAIN) 5957 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5958 else 5959 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5960 5961 out: 5962 return r; 5963 } 5964 5965 static void 
amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 5966 { 5967 5968 switch (amdgpu_asic_reset_method(adev)) { 5969 case AMD_RESET_METHOD_MODE1: 5970 case AMD_RESET_METHOD_LINK: 5971 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 5972 break; 5973 case AMD_RESET_METHOD_MODE2: 5974 adev->mp1_state = PP_MP1_STATE_RESET; 5975 break; 5976 default: 5977 adev->mp1_state = PP_MP1_STATE_NONE; 5978 break; 5979 } 5980 } 5981 5982 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 5983 { 5984 amdgpu_vf_error_trans_all(adev); 5985 adev->mp1_state = PP_MP1_STATE_NONE; 5986 } 5987 5988 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 5989 { 5990 struct pci_dev *p = NULL; 5991 5992 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 5993 adev->pdev->bus->number, 1); 5994 if (p) { 5995 pm_runtime_enable(&(p->dev)); 5996 pm_runtime_resume(&(p->dev)); 5997 } 5998 5999 pci_dev_put(p); 6000 } 6001 6002 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 6003 { 6004 enum amd_reset_method reset_method; 6005 struct pci_dev *p = NULL; 6006 u64 expires; 6007 6008 /* 6009 * For now, only BACO and mode1 reset are confirmed 6010 * to suffer the audio issue without proper suspended. 6011 */ 6012 reset_method = amdgpu_asic_reset_method(adev); 6013 if ((reset_method != AMD_RESET_METHOD_BACO) && 6014 (reset_method != AMD_RESET_METHOD_MODE1)) 6015 return -EINVAL; 6016 6017 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 6018 adev->pdev->bus->number, 1); 6019 if (!p) 6020 return -ENODEV; 6021 6022 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 6023 if (!expires) 6024 /* 6025 * If we cannot get the audio device autosuspend delay, 6026 * a fixed 4S interval will be used. Considering 3S is 6027 * the audio controller default autosuspend delay setting. 6028 * 4S used here is guaranteed to cover that. 6029 */ 6030 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 6031 6032 while (!pm_runtime_status_suspended(&(p->dev))) { 6033 if (!pm_runtime_suspend(&(p->dev))) 6034 break; 6035 6036 if (expires < ktime_get_mono_fast_ns()) { 6037 dev_warn(adev->dev, "failed to suspend display audio\n"); 6038 pci_dev_put(p); 6039 /* TODO: abort the succeeding gpu reset? */ 6040 return -ETIMEDOUT; 6041 } 6042 } 6043 6044 pm_runtime_disable(&(p->dev)); 6045 6046 pci_dev_put(p); 6047 return 0; 6048 } 6049 6050 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 6051 { 6052 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 6053 6054 #if defined(CONFIG_DEBUG_FS) 6055 if (!amdgpu_sriov_vf(adev)) 6056 cancel_work(&adev->reset_work); 6057 #endif 6058 6059 if (adev->kfd.dev) 6060 cancel_work(&adev->kfd.reset_work); 6061 6062 if (amdgpu_sriov_vf(adev)) 6063 cancel_work(&adev->virt.flr_work); 6064 6065 if (con && adev->ras_enabled) 6066 cancel_work(&con->recovery_work); 6067 6068 } 6069 6070 static int amdgpu_device_health_check(struct list_head *device_list_handle) 6071 { 6072 struct amdgpu_device *tmp_adev; 6073 int ret = 0; 6074 6075 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6076 ret |= amdgpu_device_bus_status_check(tmp_adev); 6077 } 6078 6079 return ret; 6080 } 6081 6082 static int amdgpu_device_recovery_prepare(struct amdgpu_device *adev, 6083 struct list_head *device_list, 6084 struct amdgpu_hive_info *hive) 6085 { 6086 struct amdgpu_device *tmp_adev = NULL; 6087 int r; 6088 6089 /* 6090 * Build list of devices to reset. 
6091 * In case we are in XGMI hive mode, resort the device list 6092 * to put adev in the 1st position. 6093 */ 6094 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) { 6095 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 6096 list_add_tail(&tmp_adev->reset_list, device_list); 6097 if (adev->shutdown) 6098 tmp_adev->shutdown = true; 6099 if (adev->pcie_reset_ctx.occurs_dpc) 6100 tmp_adev->pcie_reset_ctx.in_link_reset = true; 6101 } 6102 if (!list_is_first(&adev->reset_list, device_list)) 6103 list_rotate_to_front(&adev->reset_list, device_list); 6104 } else { 6105 list_add_tail(&adev->reset_list, device_list); 6106 } 6107 6108 if (!amdgpu_sriov_vf(adev) && (!adev->pcie_reset_ctx.occurs_dpc)) { 6109 r = amdgpu_device_health_check(device_list); 6110 if (r) 6111 return r; 6112 } 6113 6114 return 0; 6115 } 6116 6117 static void amdgpu_device_recovery_get_reset_lock(struct amdgpu_device *adev, 6118 struct list_head *device_list) 6119 { 6120 struct amdgpu_device *tmp_adev = NULL; 6121 6122 if (list_empty(device_list)) 6123 return; 6124 tmp_adev = 6125 list_first_entry(device_list, struct amdgpu_device, reset_list); 6126 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 6127 } 6128 6129 static void amdgpu_device_recovery_put_reset_lock(struct amdgpu_device *adev, 6130 struct list_head *device_list) 6131 { 6132 struct amdgpu_device *tmp_adev = NULL; 6133 6134 if (list_empty(device_list)) 6135 return; 6136 tmp_adev = 6137 list_first_entry(device_list, struct amdgpu_device, reset_list); 6138 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 6139 } 6140 6141 static int amdgpu_device_halt_activities( 6142 struct amdgpu_device *adev, struct amdgpu_job *job, 6143 struct amdgpu_reset_context *reset_context, 6144 struct list_head *device_list, struct amdgpu_hive_info *hive, 6145 bool need_emergency_restart) 6146 { 6147 struct amdgpu_device *tmp_adev = NULL; 6148 int i, r = 0; 6149 6150 /* block all schedulers and reset given job's ring */ 6151 list_for_each_entry(tmp_adev, device_list, reset_list) { 6152 amdgpu_device_set_mp1_state(tmp_adev); 6153 6154 /* 6155 * Try to put the audio codec into suspend state 6156 * before gpu reset started. 6157 * 6158 * Due to the power domain of the graphics device 6159 * is shared with AZ power domain. Without this, 6160 * we may change the audio hardware from behind 6161 * the audio driver's back. That will trigger 6162 * some audio codec errors. 6163 */ 6164 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 6165 tmp_adev->pcie_reset_ctx.audio_suspended = true; 6166 6167 amdgpu_ras_set_error_query_ready(tmp_adev, false); 6168 6169 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 6170 6171 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context); 6172 6173 /* 6174 * Mark these ASICs to be reset as untracked first 6175 * And add them back after reset completed 6176 */ 6177 amdgpu_unregister_gpu_instance(tmp_adev); 6178 6179 drm_client_dev_suspend(adev_to_drm(tmp_adev), false); 6180 6181 /* disable ras on ALL IPs */ 6182 if (!need_emergency_restart && 6183 (!adev->pcie_reset_ctx.occurs_dpc) && 6184 amdgpu_device_ip_need_full_reset(tmp_adev)) 6185 amdgpu_ras_suspend(tmp_adev); 6186 6187 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6188 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6189 6190 if (!amdgpu_ring_sched_ready(ring)) 6191 continue; 6192 6193 drm_sched_stop(&ring->sched, job ? 
&job->base : NULL); 6194 6195 if (need_emergency_restart) 6196 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 6197 } 6198 atomic_inc(&tmp_adev->gpu_reset_counter); 6199 } 6200 6201 return r; 6202 } 6203 6204 static int amdgpu_device_asic_reset(struct amdgpu_device *adev, 6205 struct list_head *device_list, 6206 struct amdgpu_reset_context *reset_context) 6207 { 6208 struct amdgpu_device *tmp_adev = NULL; 6209 int retry_limit = AMDGPU_MAX_RETRY_LIMIT; 6210 int r = 0; 6211 6212 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 6213 list_for_each_entry(tmp_adev, device_list, reset_list) { 6214 if (adev->pcie_reset_ctx.occurs_dpc) 6215 tmp_adev->no_hw_access = true; 6216 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 6217 if (adev->pcie_reset_ctx.occurs_dpc) 6218 tmp_adev->no_hw_access = false; 6219 /*TODO Should we stop ?*/ 6220 if (r) { 6221 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 6222 r, adev_to_drm(tmp_adev)->unique); 6223 tmp_adev->asic_reset_res = r; 6224 } 6225 } 6226 6227 /* Actual ASIC resets if needed.*/ 6228 /* Host driver will handle XGMI hive reset for SRIOV */ 6229 if (amdgpu_sriov_vf(adev)) { 6230 6231 /* Bail out of reset early */ 6232 if (amdgpu_ras_is_rma(adev)) 6233 return -ENODEV; 6234 6235 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { 6236 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); 6237 amdgpu_ras_set_fed(adev, true); 6238 set_bit(AMDGPU_HOST_FLR, &reset_context->flags); 6239 } 6240 6241 r = amdgpu_device_reset_sriov(adev, reset_context); 6242 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) { 6243 amdgpu_virt_release_full_gpu(adev, true); 6244 goto retry; 6245 } 6246 if (r) 6247 adev->asic_reset_res = r; 6248 } else { 6249 r = amdgpu_do_asic_reset(device_list, reset_context); 6250 if (r && r == -EAGAIN) 6251 goto retry; 6252 } 6253 6254 list_for_each_entry(tmp_adev, device_list, reset_list) { 6255 /* 6256 * Drop any pending non scheduler resets queued before reset is done. 6257 * Any reset scheduled after this point would be valid. Scheduler resets 6258 * were already dropped during drm_sched_stop and no new ones can come 6259 * in before drm_sched_start. 6260 */ 6261 amdgpu_device_stop_pending_resets(tmp_adev); 6262 } 6263 6264 return r; 6265 } 6266 6267 static int amdgpu_device_sched_resume(struct list_head *device_list, 6268 struct amdgpu_reset_context *reset_context, 6269 bool job_signaled) 6270 { 6271 struct amdgpu_device *tmp_adev = NULL; 6272 int i, r = 0; 6273 6274 /* Post ASIC reset for all devs .*/ 6275 list_for_each_entry(tmp_adev, device_list, reset_list) { 6276 6277 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6278 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6279 6280 if (!amdgpu_ring_sched_ready(ring)) 6281 continue; 6282 6283 drm_sched_start(&ring->sched, 0); 6284 } 6285 6286 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 6287 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 6288 6289 if (tmp_adev->asic_reset_res) 6290 r = tmp_adev->asic_reset_res; 6291 6292 tmp_adev->asic_reset_res = 0; 6293 6294 if (r) { 6295 /* bad news, how to tell it to userspace ? 
6296 * for ras error, we should report GPU bad status instead of 6297 * reset failure 6298 */ 6299 if (reset_context->src != AMDGPU_RESET_SRC_RAS || 6300 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) 6301 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", 6302 atomic_read(&tmp_adev->gpu_reset_counter)); 6303 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 6304 } else { 6305 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 6306 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 6307 DRM_WARN("smart shift update failed\n"); 6308 } 6309 } 6310 6311 return r; 6312 } 6313 6314 static void amdgpu_device_gpu_resume(struct amdgpu_device *adev, 6315 struct list_head *device_list, 6316 bool need_emergency_restart) 6317 { 6318 struct amdgpu_device *tmp_adev = NULL; 6319 6320 list_for_each_entry(tmp_adev, device_list, reset_list) { 6321 /* unlock kfd: SRIOV would do it separately */ 6322 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 6323 amdgpu_amdkfd_post_reset(tmp_adev); 6324 6325 /* kfd_post_reset will do nothing if kfd device is not initialized, 6326 * need to bring up kfd here if it's not be initialized before 6327 */ 6328 if (!adev->kfd.init_complete) 6329 amdgpu_amdkfd_device_init(adev); 6330 6331 if (tmp_adev->pcie_reset_ctx.audio_suspended) 6332 amdgpu_device_resume_display_audio(tmp_adev); 6333 6334 amdgpu_device_unset_mp1_state(tmp_adev); 6335 6336 amdgpu_ras_set_error_query_ready(tmp_adev, true); 6337 6338 } 6339 } 6340 6341 6342 /** 6343 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 6344 * 6345 * @adev: amdgpu_device pointer 6346 * @job: which job trigger hang 6347 * @reset_context: amdgpu reset context pointer 6348 * 6349 * Attempt to reset the GPU if it has hung (all asics). 6350 * Attempt to do soft-reset or full-reset and reinitialize Asic 6351 * Returns 0 for success or an error on failure. 6352 */ 6353 6354 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 6355 struct amdgpu_job *job, 6356 struct amdgpu_reset_context *reset_context) 6357 { 6358 struct list_head device_list; 6359 bool job_signaled = false; 6360 struct amdgpu_hive_info *hive = NULL; 6361 int r = 0; 6362 bool need_emergency_restart = false; 6363 6364 /* 6365 * If it reaches here because of hang/timeout and a RAS error is 6366 * detected at the same time, let RAS recovery take care of it. 6367 */ 6368 if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) && 6369 !amdgpu_sriov_vf(adev) && 6370 reset_context->src != AMDGPU_RESET_SRC_RAS) { 6371 dev_dbg(adev->dev, 6372 "Gpu recovery from source: %d yielding to RAS error recovery handling", 6373 reset_context->src); 6374 return 0; 6375 } 6376 6377 /* 6378 * Special case: RAS triggered and full reset isn't supported 6379 */ 6380 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 6381 6382 /* 6383 * Flush RAM to disk so that after reboot 6384 * the user can read log and see why the system rebooted. 6385 */ 6386 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 6387 amdgpu_ras_get_context(adev)->reboot) { 6388 DRM_WARN("Emergency reboot."); 6389 6390 ksys_sync_helper(); 6391 emergency_restart(); 6392 } 6393 6394 dev_info(adev->dev, "GPU %s begin!\n", 6395 need_emergency_restart ? 
"jobs stop":"reset"); 6396 6397 if (!amdgpu_sriov_vf(adev)) 6398 hive = amdgpu_get_xgmi_hive(adev); 6399 if (hive) 6400 mutex_lock(&hive->hive_lock); 6401 6402 reset_context->job = job; 6403 reset_context->hive = hive; 6404 INIT_LIST_HEAD(&device_list); 6405 6406 if (amdgpu_device_recovery_prepare(adev, &device_list, hive)) 6407 goto end_reset; 6408 6409 /* We need to lock reset domain only once both for XGMI and single device */ 6410 amdgpu_device_recovery_get_reset_lock(adev, &device_list); 6411 6412 r = amdgpu_device_halt_activities(adev, job, reset_context, &device_list, 6413 hive, need_emergency_restart); 6414 if (r) 6415 goto reset_unlock; 6416 6417 if (need_emergency_restart) 6418 goto skip_sched_resume; 6419 /* 6420 * Must check guilty signal here since after this point all old 6421 * HW fences are force signaled. 6422 * 6423 * job->base holds a reference to parent fence 6424 */ 6425 if (job && dma_fence_is_signaled(&job->hw_fence.base)) { 6426 job_signaled = true; 6427 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 6428 goto skip_hw_reset; 6429 } 6430 6431 r = amdgpu_device_asic_reset(adev, &device_list, reset_context); 6432 if (r) 6433 goto reset_unlock; 6434 skip_hw_reset: 6435 r = amdgpu_device_sched_resume(&device_list, reset_context, job_signaled); 6436 if (r) 6437 goto reset_unlock; 6438 skip_sched_resume: 6439 amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart); 6440 reset_unlock: 6441 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 6442 end_reset: 6443 if (hive) { 6444 mutex_unlock(&hive->hive_lock); 6445 amdgpu_put_xgmi_hive(hive); 6446 } 6447 6448 if (r) 6449 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 6450 6451 atomic_set(&adev->reset_domain->reset_res, r); 6452 6453 if (!r) 6454 drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE); 6455 6456 return r; 6457 } 6458 6459 /** 6460 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner 6461 * 6462 * @adev: amdgpu_device pointer 6463 * @speed: pointer to the speed of the link 6464 * @width: pointer to the width of the link 6465 * 6466 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6467 * first physical partner to an AMD dGPU. 6468 * This will exclude any virtual switches and links. 6469 */ 6470 static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev, 6471 enum pci_bus_speed *speed, 6472 enum pcie_link_width *width) 6473 { 6474 struct pci_dev *parent = adev->pdev; 6475 6476 if (!speed || !width) 6477 return; 6478 6479 *speed = PCI_SPEED_UNKNOWN; 6480 *width = PCIE_LNK_WIDTH_UNKNOWN; 6481 6482 if (amdgpu_device_pcie_dynamic_switching_supported(adev)) { 6483 while ((parent = pci_upstream_bridge(parent))) { 6484 /* skip upstream/downstream switches internal to dGPU*/ 6485 if (parent->vendor == PCI_VENDOR_ID_ATI) 6486 continue; 6487 *speed = pcie_get_speed_cap(parent); 6488 *width = pcie_get_width_cap(parent); 6489 break; 6490 } 6491 } else { 6492 /* use the current speeds rather than max if switching is not supported */ 6493 pcie_bandwidth_available(adev->pdev, NULL, speed, width); 6494 } 6495 } 6496 6497 /** 6498 * amdgpu_device_gpu_bandwidth - find the bandwidth of the GPU 6499 * 6500 * @adev: amdgpu_device pointer 6501 * @speed: pointer to the speed of the link 6502 * @width: pointer to the width of the link 6503 * 6504 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6505 * AMD dGPU which may be a virtual upstream bridge. 
6506 */
6507 static void amdgpu_device_gpu_bandwidth(struct amdgpu_device *adev,
6508 enum pci_bus_speed *speed,
6509 enum pcie_link_width *width)
6510 {
6511 struct pci_dev *parent = adev->pdev;
6512
6513 if (!speed || !width)
6514 return;
6515
6516 parent = pci_upstream_bridge(parent);
6517 if (parent && parent->vendor == PCI_VENDOR_ID_ATI) {
6518 /* use the upstream/downstream switches internal to dGPU */
6519 *speed = pcie_get_speed_cap(parent);
6520 *width = pcie_get_width_cap(parent);
6521 while ((parent = pci_upstream_bridge(parent))) {
6522 if (parent->vendor == PCI_VENDOR_ID_ATI) {
6523 /* use the upstream/downstream switches internal to dGPU */
6524 *speed = pcie_get_speed_cap(parent);
6525 *width = pcie_get_width_cap(parent);
6526 }
6527 }
6528 } else {
6529 /* use the device itself */
6530 *speed = pcie_get_speed_cap(adev->pdev);
6531 *width = pcie_get_width_cap(adev->pdev);
6532 }
6533 }
6534
6535 /**
6536 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
6537 *
6538 * @adev: amdgpu_device pointer
6539 *
6540 * Fetches and stores in the driver the PCIE capabilities (gen speed
6541 * and lanes) of the slot the device is in. Handles APUs and
6542 * virtualized environments where PCIE config space may not be available.
6543 */
6544 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
6545 {
6546 enum pci_bus_speed speed_cap, platform_speed_cap;
6547 enum pcie_link_width platform_link_width, link_width;
6548
6549 if (amdgpu_pcie_gen_cap)
6550 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
6551
6552 if (amdgpu_pcie_lane_cap)
6553 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
6554
6555 /* covers APUs as well */
6556 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
6557 if (adev->pm.pcie_gen_mask == 0)
6558 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
6559 if (adev->pm.pcie_mlw_mask == 0)
6560 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
6561 return;
6562 }
6563
6564 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
6565 return;
6566
6567 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap,
6568 &platform_link_width);
6569 amdgpu_device_gpu_bandwidth(adev, &speed_cap, &link_width);
6570
6571 if (adev->pm.pcie_gen_mask == 0) {
6572 /* asic caps */
6573 if (speed_cap == PCI_SPEED_UNKNOWN) {
6574 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6575 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6576 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
6577 } else {
6578 if (speed_cap == PCIE_SPEED_32_0GT)
6579 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6580 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6581 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6582 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
6583 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
6584 else if (speed_cap == PCIE_SPEED_16_0GT)
6585 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6586 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6587 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6588 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
6589 else if (speed_cap == PCIE_SPEED_8_0GT)
6590 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6591 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6592 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
6593 else if (speed_cap == PCIE_SPEED_5_0GT)
6594 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6595 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
6596 else
6597 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
6598 }
6599 /* platform caps */
6600 if
(platform_speed_cap == PCI_SPEED_UNKNOWN) { 6601 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6602 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6603 } else { 6604 if (platform_speed_cap == PCIE_SPEED_32_0GT) 6605 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6606 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6607 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6608 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6609 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 6610 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 6611 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6612 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6613 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6614 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 6615 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 6616 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6617 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6618 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 6619 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 6620 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6621 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6622 else 6623 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 6624 6625 } 6626 } 6627 if (adev->pm.pcie_mlw_mask == 0) { 6628 /* asic caps */ 6629 if (link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6630 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_ASIC_PCIE_MLW_MASK; 6631 } else { 6632 switch (link_width) { 6633 case PCIE_LNK_X32: 6634 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X32 | 6635 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6636 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6637 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6638 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6639 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6640 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6641 break; 6642 case PCIE_LNK_X16: 6643 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6644 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6645 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6646 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6647 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6648 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6649 break; 6650 case PCIE_LNK_X12: 6651 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6652 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6653 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6654 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6655 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6656 break; 6657 case PCIE_LNK_X8: 6658 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6659 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6660 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6661 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6662 break; 6663 case PCIE_LNK_X4: 6664 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6665 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6666 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6667 break; 6668 case PCIE_LNK_X2: 6669 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6670 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6671 break; 6672 case PCIE_LNK_X1: 6673 adev->pm.pcie_mlw_mask |= CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1; 6674 break; 6675 default: 6676 break; 6677 } 6678 } 6679 /* platform caps */ 6680 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6681 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 6682 } else { 6683 switch (platform_link_width) { 6684 case PCIE_LNK_X32: 6685 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 6686 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6687 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6688 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6689 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6690 
CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6691 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6692 break; 6693 case PCIE_LNK_X16: 6694 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6695 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6696 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6697 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6698 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6699 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6700 break; 6701 case PCIE_LNK_X12: 6702 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6703 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6704 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6705 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6706 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6707 break; 6708 case PCIE_LNK_X8: 6709 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6710 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6711 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6712 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6713 break; 6714 case PCIE_LNK_X4: 6715 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6716 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6717 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6718 break; 6719 case PCIE_LNK_X2: 6720 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6721 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6722 break; 6723 case PCIE_LNK_X1: 6724 adev->pm.pcie_mlw_mask |= CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 6725 break; 6726 default: 6727 break; 6728 } 6729 } 6730 } 6731 } 6732 6733 /** 6734 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 6735 * 6736 * @adev: amdgpu_device pointer 6737 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 6738 * 6739 * Return true if @peer_adev can access (DMA) @adev through the PCIe 6740 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 6741 * @peer_adev. 6742 */ 6743 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 6744 struct amdgpu_device *peer_adev) 6745 { 6746 #ifdef CONFIG_HSA_AMD_P2P 6747 bool p2p_access = 6748 !adev->gmc.xgmi.connected_to_cpu && 6749 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 6750 if (!p2p_access) 6751 dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n", 6752 pci_name(peer_adev->pdev)); 6753 6754 bool is_large_bar = adev->gmc.visible_vram_size && 6755 adev->gmc.real_vram_size == adev->gmc.visible_vram_size; 6756 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev); 6757 6758 if (!p2p_addressable) { 6759 uint64_t address_mask = peer_adev->dev->dma_mask ? 
6760 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 6761 resource_size_t aper_limit = 6762 adev->gmc.aper_base + adev->gmc.aper_size - 1; 6763 6764 p2p_addressable = !(adev->gmc.aper_base & address_mask || 6765 aper_limit & address_mask); 6766 } 6767 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable; 6768 #else 6769 return false; 6770 #endif 6771 } 6772 6773 int amdgpu_device_baco_enter(struct drm_device *dev) 6774 { 6775 struct amdgpu_device *adev = drm_to_adev(dev); 6776 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6777 6778 if (!amdgpu_device_supports_baco(dev)) 6779 return -ENOTSUPP; 6780 6781 if (ras && adev->ras_enabled && 6782 adev->nbio.funcs->enable_doorbell_interrupt) 6783 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 6784 6785 return amdgpu_dpm_baco_enter(adev); 6786 } 6787 6788 int amdgpu_device_baco_exit(struct drm_device *dev) 6789 { 6790 struct amdgpu_device *adev = drm_to_adev(dev); 6791 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6792 int ret = 0; 6793 6794 if (!amdgpu_device_supports_baco(dev)) 6795 return -ENOTSUPP; 6796 6797 ret = amdgpu_dpm_baco_exit(adev); 6798 if (ret) 6799 return ret; 6800 6801 if (ras && adev->ras_enabled && 6802 adev->nbio.funcs->enable_doorbell_interrupt) 6803 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 6804 6805 if (amdgpu_passthrough(adev) && adev->nbio.funcs && 6806 adev->nbio.funcs->clear_doorbell_interrupt) 6807 adev->nbio.funcs->clear_doorbell_interrupt(adev); 6808 6809 return 0; 6810 } 6811 6812 /** 6813 * amdgpu_pci_error_detected - Called when a PCI error is detected. 6814 * @pdev: PCI device struct 6815 * @state: PCI channel state 6816 * 6817 * Description: Called when a PCI error is detected. 6818 * 6819 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
6820 */ 6821 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 6822 { 6823 struct drm_device *dev = pci_get_drvdata(pdev); 6824 struct amdgpu_device *adev = drm_to_adev(dev); 6825 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 6826 struct amdgpu_reset_context reset_context; 6827 struct list_head device_list; 6828 int r = 0; 6829 6830 dev_info(adev->dev, "PCI error: detected callback!!\n"); 6831 6832 if (!amdgpu_dpm_is_link_reset_supported(adev)) { 6833 dev_warn(adev->dev, "No support for XGMI hive yet...\n"); 6834 return PCI_ERS_RESULT_DISCONNECT; 6835 } 6836 6837 adev->pci_channel_state = state; 6838 6839 switch (state) { 6840 case pci_channel_io_normal: 6841 dev_info(adev->dev, "pci_channel_io_normal: state(%d)!!\n", state); 6842 return PCI_ERS_RESULT_CAN_RECOVER; 6843 case pci_channel_io_frozen: 6844 /* Fatal error, prepare for slot reset */ 6845 dev_info(adev->dev, "pci_channel_io_frozen: state(%d)!!\n", state); 6846 6847 if (hive) 6848 mutex_lock(&hive->hive_lock); 6849 adev->pcie_reset_ctx.occurs_dpc = true; 6850 memset(&reset_context, 0, sizeof(reset_context)); 6851 INIT_LIST_HEAD(&device_list); 6852 6853 amdgpu_device_recovery_prepare(adev, &device_list, hive); 6854 amdgpu_device_recovery_get_reset_lock(adev, &device_list); 6855 r = amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list, 6856 hive, false); 6857 if (hive) { 6858 mutex_unlock(&hive->hive_lock); 6859 amdgpu_put_xgmi_hive(hive); 6860 } 6861 if (r) 6862 return PCI_ERS_RESULT_DISCONNECT; 6863 return PCI_ERS_RESULT_NEED_RESET; 6864 case pci_channel_io_perm_failure: 6865 /* Permanent error, prepare for device removal */ 6866 dev_info(adev->dev, "pci_channel_io_perm_failure: state(%d)!!\n", state); 6867 return PCI_ERS_RESULT_DISCONNECT; 6868 } 6869 6870 return PCI_ERS_RESULT_NEED_RESET; 6871 } 6872 6873 /** 6874 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 6875 * @pdev: pointer to PCI device 6876 */ 6877 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 6878 { 6879 struct drm_device *dev = pci_get_drvdata(pdev); 6880 struct amdgpu_device *adev = drm_to_adev(dev); 6881 6882 dev_info(adev->dev, "PCI error: mmio enabled callback!!\n"); 6883 6884 /* TODO - dump whatever for debugging purposes */ 6885 6886 /* This called only if amdgpu_pci_error_detected returns 6887 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 6888 * works, no need to reset slot. 6889 */ 6890 6891 return PCI_ERS_RESULT_RECOVERED; 6892 } 6893 6894 /** 6895 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 6896 * @pdev: PCI device struct 6897 * 6898 * Description: This routine is called by the pci error recovery 6899 * code after the PCI slot has been reset, just before we 6900 * should resume normal operations. 
6901 */ 6902 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 6903 { 6904 struct drm_device *dev = pci_get_drvdata(pdev); 6905 struct amdgpu_device *adev = drm_to_adev(dev); 6906 struct amdgpu_reset_context reset_context; 6907 struct amdgpu_device *tmp_adev; 6908 struct amdgpu_hive_info *hive; 6909 struct list_head device_list; 6910 int r = 0, i; 6911 u32 memsize; 6912 6913 /* PCI error slot reset should be skipped During RAS recovery */ 6914 if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 6915 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && 6916 amdgpu_ras_in_recovery(adev)) 6917 return PCI_ERS_RESULT_RECOVERED; 6918 6919 dev_info(adev->dev, "PCI error: slot reset callback!!\n"); 6920 6921 memset(&reset_context, 0, sizeof(reset_context)); 6922 6923 /* wait for asic to come out of reset */ 6924 msleep(700); 6925 6926 /* Restore PCI confspace */ 6927 amdgpu_device_load_pci_state(pdev); 6928 6929 /* confirm ASIC came out of reset */ 6930 for (i = 0; i < adev->usec_timeout; i++) { 6931 memsize = amdgpu_asic_get_config_memsize(adev); 6932 6933 if (memsize != 0xffffffff) 6934 break; 6935 udelay(1); 6936 } 6937 if (memsize == 0xffffffff) { 6938 r = -ETIME; 6939 goto out; 6940 } 6941 6942 reset_context.method = AMD_RESET_METHOD_NONE; 6943 reset_context.reset_req_dev = adev; 6944 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 6945 set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags); 6946 INIT_LIST_HEAD(&device_list); 6947 6948 hive = amdgpu_get_xgmi_hive(adev); 6949 if (hive) { 6950 mutex_lock(&hive->hive_lock); 6951 reset_context.hive = hive; 6952 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 6953 tmp_adev->pcie_reset_ctx.in_link_reset = true; 6954 list_add_tail(&tmp_adev->reset_list, &device_list); 6955 } 6956 } else { 6957 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 6958 list_add_tail(&adev->reset_list, &device_list); 6959 } 6960 6961 r = amdgpu_device_asic_reset(adev, &device_list, &reset_context); 6962 out: 6963 if (!r) { 6964 if (amdgpu_device_cache_pci_state(adev->pdev)) 6965 pci_restore_state(adev->pdev); 6966 dev_info(adev->dev, "PCIe error recovery succeeded\n"); 6967 } else { 6968 dev_err(adev->dev, "PCIe error recovery failed, err:%d\n", r); 6969 if (hive) { 6970 list_for_each_entry(tmp_adev, &device_list, reset_list) 6971 amdgpu_device_unset_mp1_state(tmp_adev); 6972 } 6973 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 6974 } 6975 6976 if (hive) { 6977 mutex_unlock(&hive->hive_lock); 6978 amdgpu_put_xgmi_hive(hive); 6979 } 6980 6981 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 6982 } 6983 6984 /** 6985 * amdgpu_pci_resume() - resume normal ops after PCI reset 6986 * @pdev: pointer to PCI device 6987 * 6988 * Called when the error recovery driver tells us that its 6989 * OK to resume normal operation. 
6990 */ 6991 void amdgpu_pci_resume(struct pci_dev *pdev) 6992 { 6993 struct drm_device *dev = pci_get_drvdata(pdev); 6994 struct amdgpu_device *adev = drm_to_adev(dev); 6995 struct list_head device_list; 6996 struct amdgpu_hive_info *hive = NULL; 6997 struct amdgpu_device *tmp_adev = NULL; 6998 6999 dev_info(adev->dev, "PCI error: resume callback!!\n"); 7000 7001 /* Only continue execution for the case of pci_channel_io_frozen */ 7002 if (adev->pci_channel_state != pci_channel_io_frozen) 7003 return; 7004 7005 INIT_LIST_HEAD(&device_list); 7006 7007 hive = amdgpu_get_xgmi_hive(adev); 7008 if (hive) { 7009 mutex_lock(&hive->hive_lock); 7010 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 7011 tmp_adev->pcie_reset_ctx.in_link_reset = false; 7012 list_add_tail(&tmp_adev->reset_list, &device_list); 7013 } 7014 } else 7015 list_add_tail(&adev->reset_list, &device_list); 7016 7017 amdgpu_device_sched_resume(&device_list, NULL, NULL); 7018 amdgpu_device_gpu_resume(adev, &device_list, false); 7019 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 7020 adev->pcie_reset_ctx.occurs_dpc = false; 7021 7022 if (hive) { 7023 mutex_unlock(&hive->hive_lock); 7024 amdgpu_put_xgmi_hive(hive); 7025 } 7026 } 7027 7028 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 7029 { 7030 struct drm_device *dev = pci_get_drvdata(pdev); 7031 struct amdgpu_device *adev = drm_to_adev(dev); 7032 int r; 7033 7034 if (amdgpu_sriov_vf(adev)) 7035 return false; 7036 7037 r = pci_save_state(pdev); 7038 if (!r) { 7039 kfree(adev->pci_state); 7040 7041 adev->pci_state = pci_store_saved_state(pdev); 7042 7043 if (!adev->pci_state) { 7044 DRM_ERROR("Failed to store PCI saved state"); 7045 return false; 7046 } 7047 } else { 7048 DRM_WARN("Failed to save PCI state, err:%d\n", r); 7049 return false; 7050 } 7051 7052 return true; 7053 } 7054 7055 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 7056 { 7057 struct drm_device *dev = pci_get_drvdata(pdev); 7058 struct amdgpu_device *adev = drm_to_adev(dev); 7059 int r; 7060 7061 if (!adev->pci_state) 7062 return false; 7063 7064 r = pci_load_saved_state(pdev, adev->pci_state); 7065 7066 if (!r) { 7067 pci_restore_state(pdev); 7068 } else { 7069 DRM_WARN("Failed to load PCI state, err:%d\n", r); 7070 return false; 7071 } 7072 7073 return true; 7074 } 7075 7076 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 7077 struct amdgpu_ring *ring) 7078 { 7079 #ifdef CONFIG_X86_64 7080 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 7081 return; 7082 #endif 7083 if (adev->gmc.xgmi.connected_to_cpu) 7084 return; 7085 7086 if (ring && ring->funcs->emit_hdp_flush) 7087 amdgpu_ring_emit_hdp_flush(ring); 7088 else 7089 amdgpu_asic_flush_hdp(adev, ring); 7090 } 7091 7092 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 7093 struct amdgpu_ring *ring) 7094 { 7095 #ifdef CONFIG_X86_64 7096 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 7097 return; 7098 #endif 7099 if (adev->gmc.xgmi.connected_to_cpu) 7100 return; 7101 7102 amdgpu_asic_invalidate_hdp(adev, ring); 7103 } 7104 7105 int amdgpu_in_reset(struct amdgpu_device *adev) 7106 { 7107 return atomic_read(&adev->reset_domain->in_gpu_reset); 7108 } 7109 7110 /** 7111 * amdgpu_device_halt() - bring hardware to some kind of halt state 7112 * 7113 * @adev: amdgpu_device pointer 7114 * 7115 * Bring hardware to some kind of halt state so that no one can touch it 7116 * any more. It will help to maintain error context when error occurred. 
7117 * Compared to a simple hang, the system stays stable at least for SSH
7118 * access. Then it should be trivial to inspect the hardware state and
7119 * see what's going on. Implemented as follows:
7120 *
7121 * 1. drm_dev_unplug() makes device inaccessible to user space (IOCTLs, etc),
7122 * clears all CPU mappings to device, disallows remappings through page faults
7123 * 2. amdgpu_irq_disable_all() disables all interrupts
7124 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
7125 * 4. set adev->no_hw_access to avoid potential crashes after step 5
7126 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
7127 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
7128 * flush any in-flight DMA operations
7129 */
7130 void amdgpu_device_halt(struct amdgpu_device *adev)
7131 {
7132 struct pci_dev *pdev = adev->pdev;
7133 struct drm_device *ddev = adev_to_drm(adev);
7134
7135 amdgpu_xcp_dev_unplug(adev);
7136 drm_dev_unplug(ddev);
7137
7138 amdgpu_irq_disable_all(adev);
7139
7140 amdgpu_fence_driver_hw_fini(adev);
7141
7142 adev->no_hw_access = true;
7143
7144 amdgpu_device_unmap_mmio(adev);
7145
7146 pci_disable_device(pdev);
7147 pci_wait_for_pending_transaction(pdev);
7148 }
7149
7150 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
7151 u32 reg)
7152 {
7153 unsigned long flags, address, data;
7154 u32 r;
7155
7156 address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
7157 data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
7158
7159 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
7160 WREG32(address, reg * 4);
7161 (void)RREG32(address);
7162 r = RREG32(data);
7163 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
7164 return r;
7165 }
7166
7167 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
7168 u32 reg, u32 v)
7169 {
7170 unsigned long flags, address, data;
7171
7172 address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
7173 data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
7174
7175 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
7176 WREG32(address, reg * 4);
7177 (void)RREG32(address);
7178 WREG32(data, v);
7179 (void)RREG32(data);
7180 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
7181 }
7182
7183 /**
7184 * amdgpu_device_get_gang - return a reference to the current gang
7185 * @adev: amdgpu_device pointer
7186 *
7187 * Returns: A new reference to the current gang leader.
7188 */
7189 struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev)
7190 {
7191 struct dma_fence *fence;
7192
7193 rcu_read_lock();
7194 fence = dma_fence_get_rcu_safe(&adev->gang_submit);
7195 rcu_read_unlock();
7196 return fence;
7197 }
7198
7199 /**
7200 * amdgpu_device_switch_gang - switch to a new gang
7201 * @adev: amdgpu_device pointer
7202 * @gang: the gang to switch to
7203 *
7204 * Try to switch to a new gang.
7205 * Returns: NULL if we switched to the new gang or a reference to the current
7206 * gang leader.
7207 */ 7208 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev, 7209 struct dma_fence *gang) 7210 { 7211 struct dma_fence *old = NULL; 7212 7213 dma_fence_get(gang); 7214 do { 7215 dma_fence_put(old); 7216 old = amdgpu_device_get_gang(adev); 7217 if (old == gang) 7218 break; 7219 7220 if (!dma_fence_is_signaled(old)) { 7221 dma_fence_put(gang); 7222 return old; 7223 } 7224 7225 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit, 7226 old, gang) != old); 7227 7228 /* 7229 * Drop it once for the exchanged reference in adev and once for the 7230 * thread local reference acquired in amdgpu_device_get_gang(). 7231 */ 7232 dma_fence_put(old); 7233 dma_fence_put(old); 7234 return NULL; 7235 } 7236 7237 /** 7238 * amdgpu_device_enforce_isolation - enforce HW isolation 7239 * @adev: the amdgpu device pointer 7240 * @ring: the HW ring the job is supposed to run on 7241 * @job: the job which is about to be pushed to the HW ring 7242 * 7243 * Makes sure that only one client at a time can use the GFX block. 7244 * Returns: The dependency to wait on before the job can be pushed to the HW. 7245 * The function is called multiple times until NULL is returned. 7246 */ 7247 struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev, 7248 struct amdgpu_ring *ring, 7249 struct amdgpu_job *job) 7250 { 7251 struct amdgpu_isolation *isolation = &adev->isolation[ring->xcp_id]; 7252 struct drm_sched_fence *f = job->base.s_fence; 7253 struct dma_fence *dep; 7254 void *owner; 7255 int r; 7256 7257 /* 7258 * For now enforce isolation only for the GFX block since we only need 7259 * the cleaner shader on those rings. 7260 */ 7261 if (ring->funcs->type != AMDGPU_RING_TYPE_GFX && 7262 ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE) 7263 return NULL; 7264 7265 /* 7266 * All submissions where enforce isolation is false are handled as if 7267 * they come from a single client. Use ~0l as the owner to distinct it 7268 * from kernel submissions where the owner is NULL. 7269 */ 7270 owner = job->enforce_isolation ? f->owner : (void *)~0l; 7271 7272 mutex_lock(&adev->enforce_isolation_mutex); 7273 7274 /* 7275 * The "spearhead" submission is the first one which changes the 7276 * ownership to its client. We always need to wait for it to be 7277 * pushed to the HW before proceeding with anything. 7278 */ 7279 if (&f->scheduled != isolation->spearhead && 7280 !dma_fence_is_signaled(isolation->spearhead)) { 7281 dep = isolation->spearhead; 7282 goto out_grab_ref; 7283 } 7284 7285 if (isolation->owner != owner) { 7286 7287 /* 7288 * Wait for any gang to be assembled before switching to a 7289 * different owner or otherwise we could deadlock the 7290 * submissions. 7291 */ 7292 if (!job->gang_submit) { 7293 dep = amdgpu_device_get_gang(adev); 7294 if (!dma_fence_is_signaled(dep)) 7295 goto out_return_dep; 7296 dma_fence_put(dep); 7297 } 7298 7299 dma_fence_put(isolation->spearhead); 7300 isolation->spearhead = dma_fence_get(&f->scheduled); 7301 amdgpu_sync_move(&isolation->active, &isolation->prev); 7302 trace_amdgpu_isolation(isolation->owner, owner); 7303 isolation->owner = owner; 7304 } 7305 7306 /* 7307 * Specifying the ring here helps to pipeline submissions even when 7308 * isolation is enabled. If that is not desired for testing NULL can be 7309 * used instead of the ring to enforce a CPU round trip while switching 7310 * between clients. 
7311 */
7312 dep = amdgpu_sync_peek_fence(&isolation->prev, ring);
7313 r = amdgpu_sync_fence(&isolation->active, &f->finished, GFP_NOWAIT);
7314 if (r)
7315 DRM_WARN("OOM tracking isolation\n");
7316
7317 out_grab_ref:
7318 dma_fence_get(dep);
7319 out_return_dep:
7320 mutex_unlock(&adev->enforce_isolation_mutex);
7321 return dep;
7322 }
7323
7324 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
7325 {
7326 switch (adev->asic_type) {
7327 #ifdef CONFIG_DRM_AMDGPU_SI
7328 case CHIP_HAINAN:
7329 #endif
7330 case CHIP_TOPAZ:
7331 /* chips with no display hardware */
7332 return false;
7333 #ifdef CONFIG_DRM_AMDGPU_SI
7334 case CHIP_TAHITI:
7335 case CHIP_PITCAIRN:
7336 case CHIP_VERDE:
7337 case CHIP_OLAND:
7338 #endif
7339 #ifdef CONFIG_DRM_AMDGPU_CIK
7340 case CHIP_BONAIRE:
7341 case CHIP_HAWAII:
7342 case CHIP_KAVERI:
7343 case CHIP_KABINI:
7344 case CHIP_MULLINS:
7345 #endif
7346 case CHIP_TONGA:
7347 case CHIP_FIJI:
7348 case CHIP_POLARIS10:
7349 case CHIP_POLARIS11:
7350 case CHIP_POLARIS12:
7351 case CHIP_VEGAM:
7352 case CHIP_CARRIZO:
7353 case CHIP_STONEY:
7354 /* chips with display hardware */
7355 return true;
7356 default:
7357 /* IP discovery */
7358 if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
7359 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
7360 return false;
7361 return true;
7362 }
7363 }
7364
7365 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
7366 uint32_t inst, uint32_t reg_addr, char reg_name[],
7367 uint32_t expected_value, uint32_t mask)
7368 {
7369 uint32_t ret = 0;
7370 uint32_t old_ = 0;
7371 uint32_t tmp_ = RREG32(reg_addr);
7372 uint32_t loop = adev->usec_timeout;
7373
7374 while ((tmp_ & (mask)) != (expected_value)) {
7375 if (old_ != tmp_) {
7376 loop = adev->usec_timeout;
7377 old_ = tmp_;
7378 } else
7379 udelay(1);
7380 tmp_ = RREG32(reg_addr);
7381 loop--;
7382 if (!loop) {
7383 DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
7384 inst, reg_name, (uint32_t)expected_value,
7385 (uint32_t)(tmp_ & (mask)));
7386 ret = -ETIMEDOUT;
7387 break;
7388 }
7389 }
7390 return ret;
7391 }
7392
7393 ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
7394 {
7395 ssize_t size = 0;
7396
7397 if (!ring || !ring->adev)
7398 return size;
7399
7400 if (amdgpu_device_should_recover_gpu(ring->adev))
7401 size |= AMDGPU_RESET_TYPE_FULL;
7402
7403 if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
7404 !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery)
7405 size |= AMDGPU_RESET_TYPE_SOFT_RESET;
7406
7407 return size;
7408 }
7409
7410 ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
7411 {
7412 ssize_t size = 0;
7413
7414 if (supported_reset == 0) {
7415 size += sysfs_emit_at(buf, size, "unsupported");
7416 size += sysfs_emit_at(buf, size, "\n");
7417 return size;
7418
7419 }
7420
7421 if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
7422 size += sysfs_emit_at(buf, size, "soft ");
7423
7424 if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
7425 size += sysfs_emit_at(buf, size, "queue ");
7426
7427 if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)
7428 size += sysfs_emit_at(buf, size, "pipe ");
7429
7430 if (supported_reset & AMDGPU_RESET_TYPE_FULL)
7431 size += sysfs_emit_at(buf, size, "full ");
7432
7433 size += sysfs_emit_at(buf, size, "\n");
7434 return size;
7435 }
7436
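/*
 * Editorial note (illustrative only, not part of the driver): amdgpu_show_reset_mask()
 * above formats a bitmask of AMDGPU_RESET_TYPE_* flags into a space-separated,
 * newline-terminated string for a sysfs "show" callback. A minimal sketch of how a
 * caller might combine it with amdgpu_get_soft_full_reset_mask(), assuming a
 * hypothetical attribute and a hypothetical example_get_ring() helper:
 *
 *	static ssize_t example_reset_mask_show(struct device *dev,
 *					       struct device_attribute *attr,
 *					       char *buf)
 *	{
 *		struct amdgpu_ring *ring = example_get_ring(dev); // hypothetical helper
 *
 *		return amdgpu_show_reset_mask(buf,
 *					      amdgpu_get_soft_full_reset_mask(ring));
 *	}
 *
 * For supported_reset == (AMDGPU_RESET_TYPE_SOFT_RESET | AMDGPU_RESET_TYPE_FULL) the
 * emitted buffer reads "soft full \n"; for 0 it reads "unsupported\n".
 */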