/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */

#include <linux/aperture.h>
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/pci-p2pdma.h>
#include <linux/apple-gmux.h>

#include <drm/drm_atomic_helper.h>
#include <drm/drm_client_event.h>
#include <drm/drm_crtc_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/device.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"
#include "amdgpu_virt.h"
#include "amdgpu_dev_coredump.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

#if IS_ENABLED(CONFIG_X86)
#include <asm/intel-family.h>
#include <asm/cpu_device_id.h>
#endif

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
#define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
#define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
#define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)

#define AMDGPU_VBIOS_SKIP (1U << 0)
#define AMDGPU_VBIOS_OPTIONAL (1U << 1)

static const struct drm_driver amdgpu_kms_driver;

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"ALDEBARAN",
	"NAVI10",
	"CYAN_SKILLFISH",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"VANGOGH",
	"DIMGREY_CAVEFISH",
	"BEIGE_GOBY",
	"YELLOW_CARP",
	"IP DISCOVERY",
	"LAST",
};

#define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0)
/*
 * Default init level where all blocks are expected to be initialized. This is
 * the level of initialization expected by default and also after a full reset
 * of the device.
 */
struct amdgpu_init_level amdgpu_init_default = {
	.level = AMDGPU_INIT_LEVEL_DEFAULT,
	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

struct amdgpu_init_level amdgpu_init_recovery = {
	.level = AMDGPU_INIT_LEVEL_RESET_RECOVERY,
	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

/*
 * Minimal blocks needed to be initialized before a XGMI hive can be reset. This
 * is used for cases like reset on initialization where the entire hive needs to
 * be reset before first use.
 */
struct amdgpu_init_level amdgpu_init_minimal_xgmi = {
	.level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI,
	.hwini_ip_block_mask =
		BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) |
		BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) |
		BIT(AMD_IP_BLOCK_TYPE_PSP)
};

static void amdgpu_device_load_switch_state(struct amdgpu_device *adev);

static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev,
					     enum amd_ip_block_type block)
{
	return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0;
}

void amdgpu_set_init_level(struct amdgpu_device *adev,
			   enum amdgpu_init_lvl_id lvl)
{
	switch (lvl) {
	case AMDGPU_INIT_LEVEL_MINIMAL_XGMI:
		adev->init_lvl = &amdgpu_init_minimal_xgmi;
		break;
	case AMDGPU_INIT_LEVEL_RESET_RECOVERY:
		adev->init_lvl = &amdgpu_init_recovery;
		break;
	case AMDGPU_INIT_LEVEL_DEFAULT:
		fallthrough;
	default:
		adev->init_lvl = &amdgpu_init_default;
		break;
	}
}

static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
				     void *data);

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
 */
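
/*
 * Illustrative example only (not part of the driver): from user space the
 * attribute is read through sysfs; the card index used below is an
 * assumption and depends on the system, e.g.:
 *
 *	cat /sys/class/drm/card0/device/pcie_replay_count
 */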

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return sysfs_emit(buf, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, 0444,
		   amdgpu_device_get_pcie_replay_count, NULL);

static int amdgpu_device_attr_sysfs_init(struct amdgpu_device *adev)
{
	int ret = 0;

	if (amdgpu_nbio_is_replay_cnt_supported(adev))
		ret = sysfs_create_file(&adev->dev->kobj,
					&dev_attr_pcie_replay_count.attr);

	return ret;
}

static void amdgpu_device_attr_sysfs_fini(struct amdgpu_device *adev)
{
	if (amdgpu_nbio_is_replay_cnt_supported(adev))
		sysfs_remove_file(&adev->dev->kobj,
				  &dev_attr_pcie_replay_count.attr);
}

static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
					  const struct bin_attribute *attr, char *buf,
					  loff_t ppos, size_t count)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	ssize_t bytes_read;

	switch (ppos) {
	case AMDGPU_SYS_REG_STATE_XGMI:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_WAFL:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_PCIE:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR_1:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count);
		break;
	default:
		return -EINVAL;
	}

	return bytes_read;
}

static const BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL,
		      AMDGPU_SYS_REG_STATE_END);

int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev)
{
	int ret;

	if (!amdgpu_asic_get_reg_state_supported(adev))
		return 0;

	ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state);

	return ret;
}

void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
{
	if (!amdgpu_asic_get_reg_state_supported(adev))
		return;
	sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
}

int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block)
{
	int r;

	if (ip_block->version->funcs->suspend) {
		r = ip_block->version->funcs->suspend(ip_block);
		if (r) {
			dev_err(ip_block->adev->dev,
				"suspend of IP block <%s> failed %d\n",
				ip_block->version->funcs->name, r);
			return r;
		}
	}

	ip_block->status.hw = false;
	return 0;
}

int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block)
{
	int r;

	if (ip_block->version->funcs->resume) {
		r = ip_block->version->funcs->resume(ip_block);
		if (r) {
			dev_err(ip_block->adev->dev,
				"resume of IP block <%s> failed %d\n",
				ip_block->version->funcs->name, r);
			return r;
		}
	}

	ip_block->status.hw = true;
	return 0;
}
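
/*
 * Illustrative sketch only (not part of the driver): callers that need to
 * quiesce the hardware typically walk adev->ip_blocks in reverse order and
 * use the helper above, roughly like:
 *
 *	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
 *		if (!adev->ip_blocks[i].status.valid)
 *			continue;
 *		r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
 *		if (r)
 *			break;
 *	}
 */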

/**
 * DOC: board_info
 *
 * The amdgpu driver provides a sysfs API for reporting board-related
 * information. It provides the form factor information in the format
 *
 *   type : form factor
 *
 * Possible form factor values
 *
 * - "cem"     - PCIe CEM card
 * - "oam"     - Open Compute Accelerator Module
 * - "unknown" - Not known
 *
 */

static ssize_t amdgpu_device_get_board_info(struct device *dev,
					    struct device_attribute *attr,
					    char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
	const char *pkg;

	if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
		pkg_type = adev->smuio.funcs->get_pkg_type(adev);

	switch (pkg_type) {
	case AMDGPU_PKG_TYPE_CEM:
		pkg = "cem";
		break;
	case AMDGPU_PKG_TYPE_OAM:
		pkg = "oam";
		break;
	default:
		pkg = "unknown";
		break;
	}

	return sysfs_emit(buf, "%s : %s\n", "type", pkg);
}

static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);

static struct attribute *amdgpu_board_attrs[] = {
	&dev_attr_board_info.attr,
	NULL,
};

static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
					     struct attribute *attr, int n)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	if (adev->flags & AMD_IS_APU)
		return 0;

	return attr->mode;
}

static const struct attribute_group amdgpu_board_attrs_group = {
	.attrs = amdgpu_board_attrs,
	.is_visible = amdgpu_board_attrs_is_visible
};

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);

/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @adev: amdgpu device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_px(struct amdgpu_device *adev)
{
	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @adev: amdgpu device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct amdgpu_device *adev)
{
	if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE))
		return false;

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @adev: amdgpu device pointer
 *
 * Return:
 * 1 if the device supports BACO;
 * 3 if the device supports MACO (only works if BACO is supported)
 * otherwise return 0.
 */
int amdgpu_device_supports_baco(struct amdgpu_device *adev)
{
	return amdgpu_asic_supports_baco(adev);
}

void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
{
	int bamaco_support;

	adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
	bamaco_support = amdgpu_device_supports_baco(adev);

	switch (amdgpu_runtime_pm) {
	case 2:
		if (bamaco_support & MACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
			dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
		} else if (bamaco_support == BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Requested mode BAMACO not available, falling back to BACO\n");
		}
		break;
	case 1:
		if (bamaco_support & BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Forcing BACO for runtime pm\n");
		}
		break;
	case -1:
	case -2:
		if (amdgpu_device_supports_px(adev)) {
			/* enable PX as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
			dev_info(adev->dev, "Using ATPX for runtime pm\n");
		} else if (amdgpu_device_supports_boco(adev)) {
			/* enable boco as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
			dev_info(adev->dev, "Using BOCO for runtime pm\n");
		} else {
			if (!bamaco_support)
				goto no_runtime_pm;

			switch (adev->asic_type) {
			case CHIP_VEGA20:
			case CHIP_ARCTURUS:
				/* BACO is not supported on vega20 and arcturus */
				break;
			case CHIP_VEGA10:
				/* enable BACO as runpm mode if noretry=0 */
				if (!adev->gmc.noretry && !amdgpu_passthrough(adev))
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			default:
				/* enable BACO as runpm mode on CI+ */
				if (!amdgpu_passthrough(adev))
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			}

			if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
				if (bamaco_support & MACO_SUPPORT) {
					adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
					dev_info(adev->dev, "Using BAMACO for runtime pm\n");
				} else {
					dev_info(adev->dev, "Using BACO for runtime pm\n");
				}
			}
		}
		break;
	case 0:
		dev_info(adev->dev, "runtime pm is manually disabled\n");
		break;
	default:
		break;
	}

no_runtime_pm:
	if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
		dev_info(adev->dev, "Runtime PM not available\n");
}

/**
 * amdgpu_device_supports_smart_shift - Is the device dGPU with
 * smart shift support
 *
 * @adev: amdgpu device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct amdgpu_device *adev)
{
	return (amdgpu_device_supports_boco(adev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}

/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value means how many bytes have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size, the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try to use the VRAM aperture to access VRAM first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM_INDEX/MM_DATA to access the rest of VRAM */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}
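
/*
 * Illustrative sketch only (not part of the driver): reading a few dwords
 * from the start of VRAM into a local buffer with the helper above:
 *
 *	uint32_t vals[4];
 *
 *	amdgpu_device_vram_access(adev, 0, vals, sizeof(vals), false);
 */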

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}

/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}


/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
				uint32_t reg, uint32_t acc_flags,
				uint32_t xcc_id)
{
	uint32_t ret, rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	return ret;
}
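
/*
 * Illustrative sketch only (not part of the driver): most callers go through
 * the RREG32()/WREG32() style macros built on top of the helpers above. A
 * typical read-modify-write of a register looks roughly like:
 *
 *	tmp = RREG32(reg);
 *	tmp &= ~mask;
 *	tmp |= value & mask;
 *	WREG32(reg, tmp);
 */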

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to be written to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/**
 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 * @xcc_id: xcc accelerated compute core id
 *
 * This function is invoked only for debugfs register access.
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v,
			     uint32_t xcc_id)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_xcc_wreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t v,
			    uint32_t acc_flags, uint32_t xcc_id)
{
	uint32_t rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, true,
							 &rlcg_flag)) {
			amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u32 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
				    u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	if (unlikely(!adev->nbio.funcs)) {
		pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
		pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
	} else {
		pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
		pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	}

	if (reg_addr >> 32) {
		if (unlikely(!adev->nbio.funcs))
			pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
		else
			pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	} else {
		pcie_index_hi = 0;
	}

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
				      u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
			pcie_index_hi * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r |= ((u64)readl(pcie_data_offset) << 32);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
				     u64 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
				       u64 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				pcie_index_hi * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_rev_id - query device rev_id
 *
 * @adev: amdgpu_device pointer
 *
 * Return device rev_id
 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
{
	return adev->nbio.funcs->get_rev_id(adev);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	dev_err(adev->dev, "Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
{
	dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write register 0x%04X with 0x%08X\n", reg,
		v);
	BUG();
}

static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write register 0x%llX with 0x%08X\n", reg,
		v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	dev_err(adev->dev, "Invalid callback to read 64 bit register 0x%04X\n",
		reg);
	BUG();
	return 0;
}

static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
{
	dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		reg, v);
	BUG();
}

static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
		reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	dev_err(adev->dev,
		"Invalid callback to read register 0x%04X in block 0x%04X\n",
		reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	dev_err(adev->dev,
		"Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		reg, block, v);
	BUG();
}

static uint32_t amdgpu_device_get_vbios_flags(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
		return AMDGPU_VBIOS_SKIP;

	if (hweight32(adev->aid_mask) && amdgpu_passthrough(adev))
		return AMDGPU_VBIOS_OPTIONAL;

	return 0;
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	uint32_t flags;
	bool optional;
	int ret;

	amdgpu_asic_pre_asic_init(adev);
	flags = amdgpu_device_get_vbios_flags(adev);
	optional = !!(flags & (AMDGPU_VBIOS_OPTIONAL | AMDGPU_VBIOS_SKIP));

	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
		amdgpu_psp_wait_for_bootloader(adev);
		if (optional && !adev->bios)
			return 0;

		ret = amdgpu_atomfirmware_asic_init(adev, true);
		return ret;
	} else {
		if (optional && !adev->bios)
			return 0;

		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
	}

	return 0;
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}
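
/*
 * Illustrative sketch only (not part of the driver): golden register lists
 * are flat arrays of {offset, and_mask, or_mask} triplets. The register
 * names below are placeholders, not real register definitions:
 *
 *	static const u32 example_golden_settings[] = {
 *		mmEXAMPLE_REG_A, 0xffffffff, 0x00000100,
 *		mmEXAMPLE_REG_B, 0x0000ff00, 0x00001200,
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev, example_golden_settings,
 *						ARRAY_SIZE(example_golden_settings));
 */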

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long flags, offset;

	spin_lock_irqsave(&adev->wb.lock, flags);
	offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	unsigned long flags;

	wb >>= 3;
	spin_lock_irqsave(&adev->wb.lock, flags);
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
	spin_unlock_irqrestore(&adev->wb.lock, flags);
}
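
/*
 * Illustrative sketch only (not part of the driver): a writeback user would
 * reserve a slot, derive the CPU and GPU addresses from the returned dword
 * index, and release the slot when done:
 *
 *	u32 wb;
 *
 *	if (!amdgpu_device_wb_get(adev, &wb)) {
 *		volatile u32 *cpu_ptr = &adev->wb.wb[wb];
 *		u64 gpu_addr = adev->wb.gpu_addr + (wb * 4);
 *
 *		... point the engine at gpu_addr, poll *cpu_ptr ...
 *		amdgpu_device_wb_free(adev, wb);
 *	}
 */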

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned int i;
	u16 cmd;
	int r;

	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
		return 0;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	if (!amdgpu_rebar)
		return 0;

	/* resizing on Dell G5 SE platforms causes problems with runtime pm */
	if ((amdgpu_runtime_pm != 0) &&
	    adev->pdev->vendor == PCI_VENDOR_ID_ATI &&
	    adev->pdev->device == 0x731f &&
	    adev->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL)
		return 0;

	/* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
	if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
		dev_warn(
			adev->dev,
			"System can't access extended configuration space, please check!!\n");

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		dev_info(adev->dev,
			 "Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		dev_err(adev->dev, "Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

/*
 * GPU helper functions.
 */
/**
 * amdgpu_device_need_post - check if the hw needs post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if need or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg, flags;

	if (amdgpu_sriov_vf(adev))
		return false;

	flags = amdgpu_device_get_vbios_flags(adev);
	if (flags & AMDGPU_VBIOS_SKIP)
		return false;
	if ((flags & AMDGPU_VBIOS_OPTIONAL) && !adev->bios)
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In the whole-GPU pass-through virtualization case, after a VM
		 * reboot some old SMC firmware still needs the driver to do a vPost,
		 * otherwise the GPU hangs. SMC firmware versions above 22.15 don't have
		 * this flaw, so we force vPost for SMC versions below 22.15.
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			release_firmware(adev->pm.fw);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/*
 * Check whether seamless boot is supported.
 *
 * So far we only support seamless boot on DCE 3.0 or later.
 * If users report that it works on older ASICS as well, we may
 * loosen this.
 */
bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
{
	switch (amdgpu_seamless) {
	case -1:
		break;
	case 1:
		return true;
	case 0:
		return false;
	default:
		dev_err(adev->dev, "Invalid value for amdgpu.seamless: %d\n",
			amdgpu_seamless);
		return false;
	}

	if (!(adev->flags & AMD_IS_APU))
		return false;

	if (adev->mman.keep_stolen_vga_memory)
		return false;

	return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
}

/*
 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
 * don't support dynamic speed switching. Until we have confirmation from Intel
 * that a specific host supports it, it's safer that we keep it disabled for all.
 *
 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
 */
static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	/* eGPUs change speeds based on USB4 fabric conditions */
	if (dev_is_removable(adev->dev))
		return true;

	if (c->x86_vendor == X86_VENDOR_INTEL)
		return false;
#endif
	return true;
}

static bool amdgpu_device_aspm_support_quirk(struct amdgpu_device *adev)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	if (!(amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 0) ||
	      amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 1)))
		return false;

	if (c->x86 == 6 &&
	    adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5) {
		switch (c->x86_model) {
		case VFM_MODEL(INTEL_ALDERLAKE):
		case VFM_MODEL(INTEL_ALDERLAKE_L):
		case VFM_MODEL(INTEL_RAPTORLAKE):
		case VFM_MODEL(INTEL_RAPTORLAKE_P):
		case VFM_MODEL(INTEL_RAPTORLAKE_S):
			return true;
		default:
			return false;
		}
	} else {
		return false;
	}
#else
	return false;
#endif
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
	switch (amdgpu_aspm) {
	case -1:
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	if (adev->flags & AMD_IS_APU)
		return false;
	if (amdgpu_device_aspm_support_quirk(adev))
		return false;
	return pcie_aspm_enabled(adev->pdev);
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
		bool state)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));

	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines the number of bits in the page table versus the
 * page directory; a page is 4KB, so we have 12 bits of offset, a minimum of
 * 9 bits in the page table, and the remaining bits are in the page directory.
1972 */ 1973 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1974 { 1975 /* defines number of bits in page table versus page directory, 1976 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1977 * page table and the remaining bits are in the page directory 1978 */ 1979 if (amdgpu_vm_block_size == -1) 1980 return; 1981 1982 if (amdgpu_vm_block_size < 9) { 1983 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1984 amdgpu_vm_block_size); 1985 amdgpu_vm_block_size = -1; 1986 } 1987 } 1988 1989 /** 1990 * amdgpu_device_check_vm_size - validate the vm size 1991 * 1992 * @adev: amdgpu_device pointer 1993 * 1994 * Validates the vm size in GB specified via module parameter. 1995 * The VM size is the size of the GPU virtual memory space in GB. 1996 */ 1997 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1998 { 1999 /* no need to check the default value */ 2000 if (amdgpu_vm_size == -1) 2001 return; 2002 2003 if (amdgpu_vm_size < 1) { 2004 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 2005 amdgpu_vm_size); 2006 amdgpu_vm_size = -1; 2007 } 2008 } 2009 2010 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 2011 { 2012 struct sysinfo si; 2013 bool is_os_64 = (sizeof(void *) == 8); 2014 uint64_t total_memory; 2015 uint64_t dram_size_seven_GB = 0x1B8000000; 2016 uint64_t dram_size_three_GB = 0xB8000000; 2017 2018 if (amdgpu_smu_memory_pool_size == 0) 2019 return; 2020 2021 if (!is_os_64) { 2022 dev_warn(adev->dev, "Not 64-bit OS, feature not supported\n"); 2023 goto def_value; 2024 } 2025 si_meminfo(&si); 2026 total_memory = (uint64_t)si.totalram * si.mem_unit; 2027 2028 if ((amdgpu_smu_memory_pool_size == 1) || 2029 (amdgpu_smu_memory_pool_size == 2)) { 2030 if (total_memory < dram_size_three_GB) 2031 goto def_value1; 2032 } else if ((amdgpu_smu_memory_pool_size == 4) || 2033 (amdgpu_smu_memory_pool_size == 8)) { 2034 if (total_memory < dram_size_seven_GB) 2035 goto def_value1; 2036 } else { 2037 dev_warn(adev->dev, "Smu memory pool size not supported\n"); 2038 goto def_value; 2039 } 2040 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 2041 2042 return; 2043 2044 def_value1: 2045 dev_warn(adev->dev, "No enough system memory\n"); 2046 def_value: 2047 adev->pm.smu_prv_buffer_size = 0; 2048 } 2049 2050 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 2051 { 2052 if (!(adev->flags & AMD_IS_APU) || 2053 adev->asic_type < CHIP_RAVEN) 2054 return 0; 2055 2056 switch (adev->asic_type) { 2057 case CHIP_RAVEN: 2058 if (adev->pdev->device == 0x15dd) 2059 adev->apu_flags |= AMD_APU_IS_RAVEN; 2060 if (adev->pdev->device == 0x15d8) 2061 adev->apu_flags |= AMD_APU_IS_PICASSO; 2062 break; 2063 case CHIP_RENOIR: 2064 if ((adev->pdev->device == 0x1636) || 2065 (adev->pdev->device == 0x164c)) 2066 adev->apu_flags |= AMD_APU_IS_RENOIR; 2067 else 2068 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 2069 break; 2070 case CHIP_VANGOGH: 2071 adev->apu_flags |= AMD_APU_IS_VANGOGH; 2072 break; 2073 case CHIP_YELLOW_CARP: 2074 break; 2075 case CHIP_CYAN_SKILLFISH: 2076 if ((adev->pdev->device == 0x13FE) || 2077 (adev->pdev->device == 0x143F)) 2078 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 2079 break; 2080 default: 2081 break; 2082 } 2083 2084 return 0; 2085 } 2086 2087 /** 2088 * amdgpu_device_check_arguments - validate module params 2089 * 2090 * @adev: amdgpu_device pointer 2091 * 2092 * Validates certain module parameters and updates 2093 * the associated values used by the 
driver (all asics). 2094 */ 2095 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 2096 { 2097 int i; 2098 2099 if (amdgpu_sched_jobs < 4) { 2100 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 2101 amdgpu_sched_jobs); 2102 amdgpu_sched_jobs = 4; 2103 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 2104 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 2105 amdgpu_sched_jobs); 2106 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 2107 } 2108 2109 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 2110 /* gart size must be greater or equal to 32M */ 2111 dev_warn(adev->dev, "gart size (%d) too small\n", 2112 amdgpu_gart_size); 2113 amdgpu_gart_size = -1; 2114 } 2115 2116 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 2117 /* gtt size must be greater or equal to 32M */ 2118 dev_warn(adev->dev, "gtt size (%d) too small\n", 2119 amdgpu_gtt_size); 2120 amdgpu_gtt_size = -1; 2121 } 2122 2123 /* valid range is between 4 and 9 inclusive */ 2124 if (amdgpu_vm_fragment_size != -1 && 2125 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 2126 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 2127 amdgpu_vm_fragment_size = -1; 2128 } 2129 2130 if (amdgpu_sched_hw_submission < 2) { 2131 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 2132 amdgpu_sched_hw_submission); 2133 amdgpu_sched_hw_submission = 2; 2134 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 2135 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 2136 amdgpu_sched_hw_submission); 2137 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 2138 } 2139 2140 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 2141 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 2142 amdgpu_reset_method = -1; 2143 } 2144 2145 amdgpu_device_check_smu_prv_buffer_size(adev); 2146 2147 amdgpu_device_check_vm_size(adev); 2148 2149 amdgpu_device_check_block_size(adev); 2150 2151 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 2152 2153 for (i = 0; i < MAX_XCP; i++) { 2154 switch (amdgpu_enforce_isolation) { 2155 case -1: 2156 case 0: 2157 default: 2158 /* disable */ 2159 adev->enforce_isolation[i] = AMDGPU_ENFORCE_ISOLATION_DISABLE; 2160 break; 2161 case 1: 2162 /* enable */ 2163 adev->enforce_isolation[i] = 2164 AMDGPU_ENFORCE_ISOLATION_ENABLE; 2165 break; 2166 case 2: 2167 /* enable legacy mode */ 2168 adev->enforce_isolation[i] = 2169 AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY; 2170 break; 2171 case 3: 2172 /* enable only process isolation without submitting cleaner shader */ 2173 adev->enforce_isolation[i] = 2174 AMDGPU_ENFORCE_ISOLATION_NO_CLEANER_SHADER; 2175 break; 2176 } 2177 } 2178 2179 return 0; 2180 } 2181 2182 /** 2183 * amdgpu_switcheroo_set_state - set switcheroo state 2184 * 2185 * @pdev: pci dev pointer 2186 * @state: vga_switcheroo state 2187 * 2188 * Callback for the switcheroo driver. Suspends or resumes 2189 * the asics before or after it is powered up using ACPI methods. 
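 *
 * Note that on PX (hybrid graphics) platforms the dGPU power state is
 * managed through runtime PM instead, so a VGA_SWITCHEROO_OFF request is
 * ignored here (see the early return at the start of the function).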
2190 */ 2191 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 2192 enum vga_switcheroo_state state) 2193 { 2194 struct drm_device *dev = pci_get_drvdata(pdev); 2195 int r; 2196 2197 if (amdgpu_device_supports_px(drm_to_adev(dev)) && 2198 state == VGA_SWITCHEROO_OFF) 2199 return; 2200 2201 if (state == VGA_SWITCHEROO_ON) { 2202 pr_info("switched on\n"); 2203 /* don't suspend or resume card normally */ 2204 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2205 2206 pci_set_power_state(pdev, PCI_D0); 2207 amdgpu_device_load_pci_state(pdev); 2208 r = pci_enable_device(pdev); 2209 if (r) 2210 dev_warn(&pdev->dev, "pci_enable_device failed (%d)\n", 2211 r); 2212 amdgpu_device_resume(dev, true); 2213 2214 dev->switch_power_state = DRM_SWITCH_POWER_ON; 2215 } else { 2216 dev_info(&pdev->dev, "switched off\n"); 2217 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2218 amdgpu_device_prepare(dev); 2219 amdgpu_device_suspend(dev, true); 2220 amdgpu_device_cache_pci_state(pdev); 2221 /* Shut down the device */ 2222 pci_disable_device(pdev); 2223 pci_set_power_state(pdev, PCI_D3cold); 2224 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 2225 } 2226 } 2227 2228 /** 2229 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 2230 * 2231 * @pdev: pci dev pointer 2232 * 2233 * Callback for the switcheroo driver. Check of the switcheroo 2234 * state can be changed. 2235 * Returns true if the state can be changed, false if not. 2236 */ 2237 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 2238 { 2239 struct drm_device *dev = pci_get_drvdata(pdev); 2240 2241 /* 2242 * FIXME: open_count is protected by drm_global_mutex but that would lead to 2243 * locking inversion with the driver load path. And the access here is 2244 * completely racy anyway. So don't bother with locking for now. 2245 */ 2246 return atomic_read(&dev->open_count) == 0; 2247 } 2248 2249 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 2250 .set_gpu_state = amdgpu_switcheroo_set_state, 2251 .reprobe = NULL, 2252 .can_switch = amdgpu_switcheroo_can_switch, 2253 }; 2254 2255 /** 2256 * amdgpu_device_ip_set_clockgating_state - set the CG state 2257 * 2258 * @dev: amdgpu_device pointer 2259 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2260 * @state: clockgating state (gate or ungate) 2261 * 2262 * Sets the requested clockgating state for all instances of 2263 * the hardware IP specified. 2264 * Returns the error code from the last instance. 2265 */ 2266 int amdgpu_device_ip_set_clockgating_state(void *dev, 2267 enum amd_ip_block_type block_type, 2268 enum amd_clockgating_state state) 2269 { 2270 struct amdgpu_device *adev = dev; 2271 int i, r = 0; 2272 2273 for (i = 0; i < adev->num_ip_blocks; i++) { 2274 if (!adev->ip_blocks[i].status.valid) 2275 continue; 2276 if (adev->ip_blocks[i].version->type != block_type) 2277 continue; 2278 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 2279 continue; 2280 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 2281 &adev->ip_blocks[i], state); 2282 if (r) 2283 dev_err(adev->dev, 2284 "set_clockgating_state of IP block <%s> failed %d\n", 2285 adev->ip_blocks[i].version->funcs->name, r); 2286 } 2287 return r; 2288 } 2289 2290 /** 2291 * amdgpu_device_ip_set_powergating_state - set the PG state 2292 * 2293 * @dev: amdgpu_device pointer 2294 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 
2295 * @state: powergating state (gate or ungate) 2296 * 2297 * Sets the requested powergating state for all instances of 2298 * the hardware IP specified. 2299 * Returns the error code from the last instance. 2300 */ 2301 int amdgpu_device_ip_set_powergating_state(void *dev, 2302 enum amd_ip_block_type block_type, 2303 enum amd_powergating_state state) 2304 { 2305 struct amdgpu_device *adev = dev; 2306 int i, r = 0; 2307 2308 for (i = 0; i < adev->num_ip_blocks; i++) { 2309 if (!adev->ip_blocks[i].status.valid) 2310 continue; 2311 if (adev->ip_blocks[i].version->type != block_type) 2312 continue; 2313 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 2314 continue; 2315 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 2316 &adev->ip_blocks[i], state); 2317 if (r) 2318 dev_err(adev->dev, 2319 "set_powergating_state of IP block <%s> failed %d\n", 2320 adev->ip_blocks[i].version->funcs->name, r); 2321 } 2322 return r; 2323 } 2324 2325 /** 2326 * amdgpu_device_ip_get_clockgating_state - get the CG state 2327 * 2328 * @adev: amdgpu_device pointer 2329 * @flags: clockgating feature flags 2330 * 2331 * Walks the list of IPs on the device and updates the clockgating 2332 * flags for each IP. 2333 * Updates @flags with the feature flags for each hardware IP where 2334 * clockgating is enabled. 2335 */ 2336 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 2337 u64 *flags) 2338 { 2339 int i; 2340 2341 for (i = 0; i < adev->num_ip_blocks; i++) { 2342 if (!adev->ip_blocks[i].status.valid) 2343 continue; 2344 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 2345 adev->ip_blocks[i].version->funcs->get_clockgating_state( 2346 &adev->ip_blocks[i], flags); 2347 } 2348 } 2349 2350 /** 2351 * amdgpu_device_ip_wait_for_idle - wait for idle 2352 * 2353 * @adev: amdgpu_device pointer 2354 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2355 * 2356 * Waits for the requested hardware IP to be idle. 2357 * Returns 0 for success or a negative error code on failure. 2358 */ 2359 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 2360 enum amd_ip_block_type block_type) 2361 { 2362 int i, r; 2363 2364 for (i = 0; i < adev->num_ip_blocks; i++) { 2365 if (!adev->ip_blocks[i].status.valid) 2366 continue; 2367 if (adev->ip_blocks[i].version->type == block_type) { 2368 if (adev->ip_blocks[i].version->funcs->wait_for_idle) { 2369 r = adev->ip_blocks[i].version->funcs->wait_for_idle( 2370 &adev->ip_blocks[i]); 2371 if (r) 2372 return r; 2373 } 2374 break; 2375 } 2376 } 2377 return 0; 2378 2379 } 2380 2381 /** 2382 * amdgpu_device_ip_is_valid - is the hardware IP enabled 2383 * 2384 * @adev: amdgpu_device pointer 2385 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2386 * 2387 * Check if the hardware IP is enabled or not. 2388 * Returns true if the IP is enabled, false if not. 2389 */ 2390 bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev, 2391 enum amd_ip_block_type block_type) 2392 { 2393 int i; 2394 2395 for (i = 0; i < adev->num_ip_blocks; i++) { 2396 if (adev->ip_blocks[i].version->type == block_type) 2397 return adev->ip_blocks[i].status.valid; 2398 } 2399 return false; 2400 2401 } 2402 2403 /** 2404 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 2405 * 2406 * @adev: amdgpu_device pointer 2407 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 2408 * 2409 * Returns a pointer to the hardware IP block structure 2410 * if it exists for the asic, otherwise NULL.
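 *
 * A minimal usage sketch (illustrative only, not a caller in this file):
 *
 *   struct amdgpu_ip_block *ip;
 *
 *   ip = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *   if (ip && ip->version->major >= 10)
 *           dev_info(adev->dev, "GFX v%d.%d detected\n",
 *                    ip->version->major, ip->version->minor);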
2411 */ 2412 struct amdgpu_ip_block * 2413 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 2414 enum amd_ip_block_type type) 2415 { 2416 int i; 2417 2418 for (i = 0; i < adev->num_ip_blocks; i++) 2419 if (adev->ip_blocks[i].version->type == type) 2420 return &adev->ip_blocks[i]; 2421 2422 return NULL; 2423 } 2424 2425 /** 2426 * amdgpu_device_ip_block_version_cmp 2427 * 2428 * @adev: amdgpu_device pointer 2429 * @type: enum amd_ip_block_type 2430 * @major: major version 2431 * @minor: minor version 2432 * 2433 * return 0 if equal or greater 2434 * return 1 if smaller or the ip_block doesn't exist 2435 */ 2436 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 2437 enum amd_ip_block_type type, 2438 u32 major, u32 minor) 2439 { 2440 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 2441 2442 if (ip_block && ((ip_block->version->major > major) || 2443 ((ip_block->version->major == major) && 2444 (ip_block->version->minor >= minor)))) 2445 return 0; 2446 2447 return 1; 2448 } 2449 2450 static const char *ip_block_names[] = { 2451 [AMD_IP_BLOCK_TYPE_COMMON] = "common", 2452 [AMD_IP_BLOCK_TYPE_GMC] = "gmc", 2453 [AMD_IP_BLOCK_TYPE_IH] = "ih", 2454 [AMD_IP_BLOCK_TYPE_SMC] = "smu", 2455 [AMD_IP_BLOCK_TYPE_PSP] = "psp", 2456 [AMD_IP_BLOCK_TYPE_DCE] = "dce", 2457 [AMD_IP_BLOCK_TYPE_GFX] = "gfx", 2458 [AMD_IP_BLOCK_TYPE_SDMA] = "sdma", 2459 [AMD_IP_BLOCK_TYPE_UVD] = "uvd", 2460 [AMD_IP_BLOCK_TYPE_VCE] = "vce", 2461 [AMD_IP_BLOCK_TYPE_ACP] = "acp", 2462 [AMD_IP_BLOCK_TYPE_VCN] = "vcn", 2463 [AMD_IP_BLOCK_TYPE_MES] = "mes", 2464 [AMD_IP_BLOCK_TYPE_JPEG] = "jpeg", 2465 [AMD_IP_BLOCK_TYPE_VPE] = "vpe", 2466 [AMD_IP_BLOCK_TYPE_UMSCH_MM] = "umsch_mm", 2467 [AMD_IP_BLOCK_TYPE_ISP] = "isp", 2468 }; 2469 2470 static const char *ip_block_name(struct amdgpu_device *adev, enum amd_ip_block_type type) 2471 { 2472 int idx = (int)type; 2473 2474 return idx < ARRAY_SIZE(ip_block_names) ? ip_block_names[idx] : "unknown"; 2475 } 2476 2477 /** 2478 * amdgpu_device_ip_block_add 2479 * 2480 * @adev: amdgpu_device pointer 2481 * @ip_block_version: pointer to the IP to add 2482 * 2483 * Adds the IP block driver information to the collection of IPs 2484 * on the asic. 2485 */ 2486 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 2487 const struct amdgpu_ip_block_version *ip_block_version) 2488 { 2489 if (!ip_block_version) 2490 return -EINVAL; 2491 2492 switch (ip_block_version->type) { 2493 case AMD_IP_BLOCK_TYPE_VCN: 2494 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2495 return 0; 2496 break; 2497 case AMD_IP_BLOCK_TYPE_JPEG: 2498 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2499 return 0; 2500 break; 2501 default: 2502 break; 2503 } 2504 2505 dev_info(adev->dev, "detected ip block number %d <%s_v%d_%d_%d> (%s)\n", 2506 adev->num_ip_blocks, 2507 ip_block_name(adev, ip_block_version->type), 2508 ip_block_version->major, 2509 ip_block_version->minor, 2510 ip_block_version->rev, 2511 ip_block_version->funcs->name); 2512 2513 adev->ip_blocks[adev->num_ip_blocks].adev = adev; 2514 2515 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2516 2517 return 0; 2518 } 2519 2520 /** 2521 * amdgpu_device_enable_virtual_display - enable virtual display feature 2522 * 2523 * @adev: amdgpu_device pointer 2524 * 2525 * Enabled the virtual display feature if the user has enabled it via 2526 * the module parameter virtual_display. This feature provides a virtual 2527 * display hardware on headless boards or in virtualized environments. 
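 * The parameter is a semicolon-separated list of "<pci address>,<num crtcs>"
 * entries, where the address may also be the keyword "all"; an illustrative
 * (made-up) value would be amdgpu.virtual_display=0000:03:00.0,2.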
2528 * This function parses and validates the configuration string specified by 2529 * the user and configures the virtual display configuration (number of 2530 * virtual connectors, crtcs, etc.) specified. 2531 */ 2532 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2533 { 2534 adev->enable_virtual_display = false; 2535 2536 if (amdgpu_virtual_display) { 2537 const char *pci_address_name = pci_name(adev->pdev); 2538 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2539 2540 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2541 pciaddstr_tmp = pciaddstr; 2542 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2543 pciaddname = strsep(&pciaddname_tmp, ","); 2544 if (!strcmp("all", pciaddname) 2545 || !strcmp(pci_address_name, pciaddname)) { 2546 long num_crtc; 2547 int res = -1; 2548 2549 adev->enable_virtual_display = true; 2550 2551 if (pciaddname_tmp) 2552 res = kstrtol(pciaddname_tmp, 10, 2553 &num_crtc); 2554 2555 if (!res) { 2556 if (num_crtc < 1) 2557 num_crtc = 1; 2558 if (num_crtc > 6) 2559 num_crtc = 6; 2560 adev->mode_info.num_crtc = num_crtc; 2561 } else { 2562 adev->mode_info.num_crtc = 1; 2563 } 2564 break; 2565 } 2566 } 2567 2568 dev_info( 2569 adev->dev, 2570 "virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2571 amdgpu_virtual_display, pci_address_name, 2572 adev->enable_virtual_display, adev->mode_info.num_crtc); 2573 2574 kfree(pciaddstr); 2575 } 2576 } 2577 2578 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2579 { 2580 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2581 adev->mode_info.num_crtc = 1; 2582 adev->enable_virtual_display = true; 2583 dev_info(adev->dev, "virtual_display:%d, num_crtc:%d\n", 2584 adev->enable_virtual_display, 2585 adev->mode_info.num_crtc); 2586 } 2587 } 2588 2589 /** 2590 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2591 * 2592 * @adev: amdgpu_device pointer 2593 * 2594 * Parses the asic configuration parameters specified in the gpu info 2595 * firmware and makes them available to the driver for use in configuring 2596 * the asic. 2597 * Returns 0 on success, -EINVAL on failure. 
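 *
 * Only a handful of ASICs ship a gpu_info binary (Vega10/Vega12, the
 * Raven/Picasso/Raven2 APUs, Arcturus, and Navi12 when no IP discovery
 * table is present); all other ASICs return early here and get this data
 * from IP discovery instead.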
2598 */ 2599 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2600 { 2601 const char *chip_name; 2602 int err; 2603 const struct gpu_info_firmware_header_v1_0 *hdr; 2604 2605 adev->firmware.gpu_info_fw = NULL; 2606 2607 switch (adev->asic_type) { 2608 default: 2609 return 0; 2610 case CHIP_VEGA10: 2611 chip_name = "vega10"; 2612 break; 2613 case CHIP_VEGA12: 2614 chip_name = "vega12"; 2615 break; 2616 case CHIP_RAVEN: 2617 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2618 chip_name = "raven2"; 2619 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2620 chip_name = "picasso"; 2621 else 2622 chip_name = "raven"; 2623 break; 2624 case CHIP_ARCTURUS: 2625 chip_name = "arcturus"; 2626 break; 2627 case CHIP_NAVI12: 2628 if (adev->mman.discovery_bin) 2629 return 0; 2630 chip_name = "navi12"; 2631 break; 2632 } 2633 2634 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, 2635 AMDGPU_UCODE_OPTIONAL, 2636 "amdgpu/%s_gpu_info.bin", chip_name); 2637 if (err) { 2638 dev_err(adev->dev, 2639 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n", 2640 chip_name); 2641 goto out; 2642 } 2643 2644 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2645 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2646 2647 switch (hdr->version_major) { 2648 case 1: 2649 { 2650 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2651 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2652 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2653 2654 /* 2655 * Should be dropped when DAL no longer needs it. 2656 */ 2657 if (adev->asic_type == CHIP_NAVI12) 2658 goto parse_soc_bounding_box; 2659 2660 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2661 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2662 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2663 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2664 adev->gfx.config.max_texture_channel_caches = 2665 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2666 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2667 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2668 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2669 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2670 adev->gfx.config.double_offchip_lds_buf = 2671 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2672 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2673 adev->gfx.cu_info.max_waves_per_simd = 2674 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2675 adev->gfx.cu_info.max_scratch_slots_per_cu = 2676 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2677 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2678 if (hdr->version_minor >= 1) { 2679 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2680 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2681 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2682 adev->gfx.config.num_sc_per_sh = 2683 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2684 adev->gfx.config.num_packer_per_sc = 2685 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2686 } 2687 2688 parse_soc_bounding_box: 2689 /* 2690 * soc bounding box info is not integrated in disocovery table, 2691 * we always need to parse it from gpu info firmware if needed. 
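 * (For Navi12 the SoC bounding box is the only part of the gpu_info
 * firmware that is still consumed; the GC parameters above are skipped
 * for it.)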
2692 */ 2693 if (hdr->version_minor == 2) { 2694 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2695 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2696 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2697 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2698 } 2699 break; 2700 } 2701 default: 2702 dev_err(adev->dev, 2703 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2704 err = -EINVAL; 2705 goto out; 2706 } 2707 out: 2708 return err; 2709 } 2710 2711 static void amdgpu_uid_init(struct amdgpu_device *adev) 2712 { 2713 /* Initialize the UID for the device */ 2714 adev->uid_info = kzalloc(sizeof(struct amdgpu_uid), GFP_KERNEL); 2715 if (!adev->uid_info) { 2716 dev_warn(adev->dev, "Failed to allocate memory for UID\n"); 2717 return; 2718 } 2719 adev->uid_info->adev = adev; 2720 } 2721 2722 static void amdgpu_uid_fini(struct amdgpu_device *adev) 2723 { 2724 /* Free the UID memory */ 2725 kfree(adev->uid_info); 2726 adev->uid_info = NULL; 2727 } 2728 2729 /** 2730 * amdgpu_device_ip_early_init - run early init for hardware IPs 2731 * 2732 * @adev: amdgpu_device pointer 2733 * 2734 * Early initialization pass for hardware IPs. The hardware IPs that make 2735 * up each asic are discovered each IP's early_init callback is run. This 2736 * is the first stage in initializing the asic. 2737 * Returns 0 on success, negative error code on failure. 2738 */ 2739 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2740 { 2741 struct amdgpu_ip_block *ip_block; 2742 struct pci_dev *parent; 2743 bool total, skip_bios; 2744 uint32_t bios_flags; 2745 int i, r; 2746 2747 amdgpu_device_enable_virtual_display(adev); 2748 2749 if (amdgpu_sriov_vf(adev)) { 2750 r = amdgpu_virt_request_full_gpu(adev, true); 2751 if (r) 2752 return r; 2753 } 2754 2755 switch (adev->asic_type) { 2756 #ifdef CONFIG_DRM_AMDGPU_SI 2757 case CHIP_VERDE: 2758 case CHIP_TAHITI: 2759 case CHIP_PITCAIRN: 2760 case CHIP_OLAND: 2761 case CHIP_HAINAN: 2762 adev->family = AMDGPU_FAMILY_SI; 2763 r = si_set_ip_blocks(adev); 2764 if (r) 2765 return r; 2766 break; 2767 #endif 2768 #ifdef CONFIG_DRM_AMDGPU_CIK 2769 case CHIP_BONAIRE: 2770 case CHIP_HAWAII: 2771 case CHIP_KAVERI: 2772 case CHIP_KABINI: 2773 case CHIP_MULLINS: 2774 if (adev->flags & AMD_IS_APU) 2775 adev->family = AMDGPU_FAMILY_KV; 2776 else 2777 adev->family = AMDGPU_FAMILY_CI; 2778 2779 r = cik_set_ip_blocks(adev); 2780 if (r) 2781 return r; 2782 break; 2783 #endif 2784 case CHIP_TOPAZ: 2785 case CHIP_TONGA: 2786 case CHIP_FIJI: 2787 case CHIP_POLARIS10: 2788 case CHIP_POLARIS11: 2789 case CHIP_POLARIS12: 2790 case CHIP_VEGAM: 2791 case CHIP_CARRIZO: 2792 case CHIP_STONEY: 2793 if (adev->flags & AMD_IS_APU) 2794 adev->family = AMDGPU_FAMILY_CZ; 2795 else 2796 adev->family = AMDGPU_FAMILY_VI; 2797 2798 r = vi_set_ip_blocks(adev); 2799 if (r) 2800 return r; 2801 break; 2802 default: 2803 r = amdgpu_discovery_set_ip_blocks(adev); 2804 if (r) 2805 return r; 2806 break; 2807 } 2808 2809 /* Check for IP version 9.4.3 with A0 hardware */ 2810 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) && 2811 !amdgpu_device_get_rev_id(adev)) { 2812 dev_err(adev->dev, "Unsupported A0 hardware\n"); 2813 return -ENODEV; /* device unsupported - no device error */ 2814 } 2815 2816 if (amdgpu_has_atpx() && 2817 (amdgpu_is_atpx_hybrid() || 2818 amdgpu_has_atpx_dgpu_power_cntl()) && 2819 ((adev->flags & AMD_IS_APU) == 0) && 2820 !dev_is_removable(&adev->pdev->dev)) 2821 adev->flags |= AMD_IS_PX; 2822 2823 if 
(!(adev->flags & AMD_IS_APU)) { 2824 parent = pcie_find_root_port(adev->pdev); 2825 adev->has_pr3 = parent ? pci_pr3_present(parent) : false; 2826 } 2827 2828 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2829 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2830 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2831 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2832 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2833 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2834 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2835 2836 adev->virt.is_xgmi_node_migrate_enabled = false; 2837 if (amdgpu_sriov_vf(adev)) { 2838 adev->virt.is_xgmi_node_migrate_enabled = 2839 amdgpu_ip_version((adev), GC_HWIP, 0) == IP_VERSION(9, 4, 4); 2840 } 2841 2842 total = true; 2843 for (i = 0; i < adev->num_ip_blocks; i++) { 2844 ip_block = &adev->ip_blocks[i]; 2845 2846 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2847 dev_warn(adev->dev, "disabled ip block: %d <%s>\n", i, 2848 adev->ip_blocks[i].version->funcs->name); 2849 adev->ip_blocks[i].status.valid = false; 2850 } else if (ip_block->version->funcs->early_init) { 2851 r = ip_block->version->funcs->early_init(ip_block); 2852 if (r == -ENOENT) { 2853 adev->ip_blocks[i].status.valid = false; 2854 } else if (r) { 2855 dev_err(adev->dev, 2856 "early_init of IP block <%s> failed %d\n", 2857 adev->ip_blocks[i].version->funcs->name, 2858 r); 2859 total = false; 2860 } else { 2861 adev->ip_blocks[i].status.valid = true; 2862 } 2863 } else { 2864 adev->ip_blocks[i].status.valid = true; 2865 } 2866 /* get the vbios after the asic_funcs are set up */ 2867 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2868 r = amdgpu_device_parse_gpu_info_fw(adev); 2869 if (r) 2870 return r; 2871 2872 bios_flags = amdgpu_device_get_vbios_flags(adev); 2873 skip_bios = !!(bios_flags & AMDGPU_VBIOS_SKIP); 2874 /* Read BIOS */ 2875 if (!skip_bios) { 2876 bool optional = 2877 !!(bios_flags & AMDGPU_VBIOS_OPTIONAL); 2878 if (!amdgpu_get_bios(adev) && !optional) 2879 return -EINVAL; 2880 2881 if (optional && !adev->bios) 2882 dev_info( 2883 adev->dev, 2884 "VBIOS image optional, proceeding without VBIOS image"); 2885 2886 if (adev->bios) { 2887 r = amdgpu_atombios_init(adev); 2888 if (r) { 2889 dev_err(adev->dev, 2890 "amdgpu_atombios_init failed\n"); 2891 amdgpu_vf_error_put( 2892 adev, 2893 AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 2894 0, 0); 2895 return r; 2896 } 2897 } 2898 } 2899 2900 /*get pf2vf msg info at it's earliest time*/ 2901 if (amdgpu_sriov_vf(adev)) 2902 amdgpu_virt_init_data_exchange(adev); 2903 2904 } 2905 } 2906 if (!total) 2907 return -ENODEV; 2908 2909 if (adev->gmc.xgmi.supported) 2910 amdgpu_xgmi_early_init(adev); 2911 2912 if (amdgpu_is_multi_aid(adev)) 2913 amdgpu_uid_init(adev); 2914 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 2915 if (ip_block->status.valid != false) 2916 amdgpu_amdkfd_device_probe(adev); 2917 2918 adev->cg_flags &= amdgpu_cg_mask; 2919 adev->pg_flags &= amdgpu_pg_mask; 2920 2921 return 0; 2922 } 2923 2924 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2925 { 2926 int i, r; 2927 2928 for (i = 0; i < adev->num_ip_blocks; i++) { 2929 if (!adev->ip_blocks[i].status.sw) 2930 continue; 2931 if (adev->ip_blocks[i].status.hw) 2932 continue; 2933 if (!amdgpu_ip_member_of_hwini( 2934 adev, adev->ip_blocks[i].version->type)) 2935 continue; 2936 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2937 (amdgpu_sriov_vf(adev) && 
(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2938 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2939 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2940 if (r) { 2941 dev_err(adev->dev, 2942 "hw_init of IP block <%s> failed %d\n", 2943 adev->ip_blocks[i].version->funcs->name, 2944 r); 2945 return r; 2946 } 2947 adev->ip_blocks[i].status.hw = true; 2948 } 2949 } 2950 2951 return 0; 2952 } 2953 2954 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2955 { 2956 int i, r; 2957 2958 for (i = 0; i < adev->num_ip_blocks; i++) { 2959 if (!adev->ip_blocks[i].status.sw) 2960 continue; 2961 if (adev->ip_blocks[i].status.hw) 2962 continue; 2963 if (!amdgpu_ip_member_of_hwini( 2964 adev, adev->ip_blocks[i].version->type)) 2965 continue; 2966 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2967 if (r) { 2968 dev_err(adev->dev, 2969 "hw_init of IP block <%s> failed %d\n", 2970 adev->ip_blocks[i].version->funcs->name, r); 2971 return r; 2972 } 2973 adev->ip_blocks[i].status.hw = true; 2974 } 2975 2976 return 0; 2977 } 2978 2979 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2980 { 2981 int r = 0; 2982 int i; 2983 uint32_t smu_version; 2984 2985 if (adev->asic_type >= CHIP_VEGA10) { 2986 for (i = 0; i < adev->num_ip_blocks; i++) { 2987 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2988 continue; 2989 2990 if (!amdgpu_ip_member_of_hwini(adev, 2991 AMD_IP_BLOCK_TYPE_PSP)) 2992 break; 2993 2994 if (!adev->ip_blocks[i].status.sw) 2995 continue; 2996 2997 /* no need to do the fw loading again if already done*/ 2998 if (adev->ip_blocks[i].status.hw == true) 2999 break; 3000 3001 if (amdgpu_in_reset(adev) || adev->in_suspend) { 3002 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3003 if (r) 3004 return r; 3005 } else { 3006 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3007 if (r) { 3008 dev_err(adev->dev, 3009 "hw_init of IP block <%s> failed %d\n", 3010 adev->ip_blocks[i] 3011 .version->funcs->name, 3012 r); 3013 return r; 3014 } 3015 adev->ip_blocks[i].status.hw = true; 3016 } 3017 break; 3018 } 3019 } 3020 3021 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 3022 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 3023 3024 return r; 3025 } 3026 3027 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 3028 { 3029 struct drm_sched_init_args args = { 3030 .ops = &amdgpu_sched_ops, 3031 .num_rqs = DRM_SCHED_PRIORITY_COUNT, 3032 .timeout_wq = adev->reset_domain->wq, 3033 .dev = adev->dev, 3034 }; 3035 long timeout; 3036 int r, i; 3037 3038 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 3039 struct amdgpu_ring *ring = adev->rings[i]; 3040 3041 /* No need to setup the GPU scheduler for rings that don't need it */ 3042 if (!ring || ring->no_scheduler) 3043 continue; 3044 3045 switch (ring->funcs->type) { 3046 case AMDGPU_RING_TYPE_GFX: 3047 timeout = adev->gfx_timeout; 3048 break; 3049 case AMDGPU_RING_TYPE_COMPUTE: 3050 timeout = adev->compute_timeout; 3051 break; 3052 case AMDGPU_RING_TYPE_SDMA: 3053 timeout = adev->sdma_timeout; 3054 break; 3055 default: 3056 timeout = adev->video_timeout; 3057 break; 3058 } 3059 3060 args.timeout = timeout; 3061 args.credit_limit = ring->num_hw_submission; 3062 args.score = ring->sched_score; 3063 args.name = ring->name; 3064 3065 r = drm_sched_init(&ring->sched, &args); 3066 if (r) { 3067 dev_err(adev->dev, 3068 "Failed to create scheduler on ring %s.\n", 3069 ring->name); 3070 return r; 3071 } 3072 
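/* On top of the scheduler created above, attach the driver-level
 * entities used for UVD/VCE kernel submissions; these helpers only act
 * on the corresponding UVD/VCE rings and are effectively no-ops for
 * every other ring type.
 */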
r = amdgpu_uvd_entity_init(adev, ring); 3073 if (r) { 3074 dev_err(adev->dev, 3075 "Failed to create UVD scheduling entity on ring %s.\n", 3076 ring->name); 3077 return r; 3078 } 3079 r = amdgpu_vce_entity_init(adev, ring); 3080 if (r) { 3081 dev_err(adev->dev, 3082 "Failed to create VCE scheduling entity on ring %s.\n", 3083 ring->name); 3084 return r; 3085 } 3086 } 3087 3088 if (adev->xcp_mgr) 3089 amdgpu_xcp_update_partition_sched_list(adev); 3090 3091 return 0; 3092 } 3093 3094 3095 /** 3096 * amdgpu_device_ip_init - run init for hardware IPs 3097 * 3098 * @adev: amdgpu_device pointer 3099 * 3100 * Main initialization pass for hardware IPs. The list of all the hardware 3101 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 3102 * are run. sw_init initializes the software state associated with each IP 3103 * and hw_init initializes the hardware associated with each IP. 3104 * Returns 0 on success, negative error code on failure. 3105 */ 3106 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 3107 { 3108 bool init_badpage; 3109 int i, r; 3110 3111 r = amdgpu_ras_init(adev); 3112 if (r) 3113 return r; 3114 3115 for (i = 0; i < adev->num_ip_blocks; i++) { 3116 if (!adev->ip_blocks[i].status.valid) 3117 continue; 3118 if (adev->ip_blocks[i].version->funcs->sw_init) { 3119 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]); 3120 if (r) { 3121 dev_err(adev->dev, 3122 "sw_init of IP block <%s> failed %d\n", 3123 adev->ip_blocks[i].version->funcs->name, 3124 r); 3125 goto init_failed; 3126 } 3127 } 3128 adev->ip_blocks[i].status.sw = true; 3129 3130 if (!amdgpu_ip_member_of_hwini( 3131 adev, adev->ip_blocks[i].version->type)) 3132 continue; 3133 3134 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 3135 /* need to do common hw init early so everything is set up for gmc */ 3136 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3137 if (r) { 3138 dev_err(adev->dev, "hw_init %d failed %d\n", i, 3139 r); 3140 goto init_failed; 3141 } 3142 adev->ip_blocks[i].status.hw = true; 3143 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3144 /* need to do gmc hw init early so we can allocate gpu mem */ 3145 /* Try to reserve bad pages early */ 3146 if (amdgpu_sriov_vf(adev)) 3147 amdgpu_virt_exchange_data(adev); 3148 3149 r = amdgpu_device_mem_scratch_init(adev); 3150 if (r) { 3151 dev_err(adev->dev, 3152 "amdgpu_mem_scratch_init failed %d\n", 3153 r); 3154 goto init_failed; 3155 } 3156 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3157 if (r) { 3158 dev_err(adev->dev, "hw_init %d failed %d\n", i, 3159 r); 3160 goto init_failed; 3161 } 3162 r = amdgpu_device_wb_init(adev); 3163 if (r) { 3164 dev_err(adev->dev, 3165 "amdgpu_device_wb_init failed %d\n", r); 3166 goto init_failed; 3167 } 3168 adev->ip_blocks[i].status.hw = true; 3169 3170 /* right after GMC hw init, we create CSA */ 3171 if (adev->gfx.mcbp) { 3172 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 3173 AMDGPU_GEM_DOMAIN_VRAM | 3174 AMDGPU_GEM_DOMAIN_GTT, 3175 AMDGPU_CSA_SIZE); 3176 if (r) { 3177 dev_err(adev->dev, 3178 "allocate CSA failed %d\n", r); 3179 goto init_failed; 3180 } 3181 } 3182 3183 r = amdgpu_seq64_init(adev); 3184 if (r) { 3185 dev_err(adev->dev, "allocate seq64 failed %d\n", 3186 r); 3187 goto init_failed; 3188 } 3189 } 3190 } 3191 3192 if (amdgpu_sriov_vf(adev)) 3193 amdgpu_virt_init_data_exchange(adev); 3194 3195 r = amdgpu_ib_pool_init(adev); 3196 if (r) { 3197 dev_err(adev->dev, 
"IB initialization failed (%d).\n", r); 3198 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 3199 goto init_failed; 3200 } 3201 3202 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 3203 if (r) 3204 goto init_failed; 3205 3206 r = amdgpu_device_ip_hw_init_phase1(adev); 3207 if (r) 3208 goto init_failed; 3209 3210 r = amdgpu_device_fw_loading(adev); 3211 if (r) 3212 goto init_failed; 3213 3214 r = amdgpu_device_ip_hw_init_phase2(adev); 3215 if (r) 3216 goto init_failed; 3217 3218 /* 3219 * retired pages will be loaded from eeprom and reserved here, 3220 * it should be called after amdgpu_device_ip_hw_init_phase2 since 3221 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 3222 * for I2C communication which only true at this point. 3223 * 3224 * amdgpu_ras_recovery_init may fail, but the upper only cares the 3225 * failure from bad gpu situation and stop amdgpu init process 3226 * accordingly. For other failed cases, it will still release all 3227 * the resource and print error message, rather than returning one 3228 * negative value to upper level. 3229 * 3230 * Note: theoretically, this should be called before all vram allocations 3231 * to protect retired page from abusing 3232 */ 3233 init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 3234 r = amdgpu_ras_recovery_init(adev, init_badpage); 3235 if (r) 3236 goto init_failed; 3237 3238 /** 3239 * In case of XGMI grab extra reference for reset domain for this device 3240 */ 3241 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3242 if (amdgpu_xgmi_add_device(adev) == 0) { 3243 if (!amdgpu_sriov_vf(adev)) { 3244 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3245 3246 if (WARN_ON(!hive)) { 3247 r = -ENOENT; 3248 goto init_failed; 3249 } 3250 3251 if (!hive->reset_domain || 3252 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 3253 r = -ENOENT; 3254 amdgpu_put_xgmi_hive(hive); 3255 goto init_failed; 3256 } 3257 3258 /* Drop the early temporary reset domain we created for device */ 3259 amdgpu_reset_put_reset_domain(adev->reset_domain); 3260 adev->reset_domain = hive->reset_domain; 3261 amdgpu_put_xgmi_hive(hive); 3262 } 3263 } 3264 } 3265 3266 r = amdgpu_device_init_schedulers(adev); 3267 if (r) 3268 goto init_failed; 3269 3270 if (adev->mman.buffer_funcs_ring->sched.ready) 3271 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3272 3273 /* Don't init kfd if whole hive need to be reset during init */ 3274 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 3275 kgd2kfd_init_zone_device(adev); 3276 amdgpu_amdkfd_device_init(adev); 3277 } 3278 3279 amdgpu_fru_get_product_info(adev); 3280 3281 if (!amdgpu_sriov_vf(adev) || amdgpu_sriov_ras_cper_en(adev)) 3282 r = amdgpu_cper_init(adev); 3283 3284 init_failed: 3285 3286 return r; 3287 } 3288 3289 /** 3290 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 3291 * 3292 * @adev: amdgpu_device pointer 3293 * 3294 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 3295 * this function before a GPU reset. If the value is retained after a 3296 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 
3297 */ 3298 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 3299 { 3300 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 3301 } 3302 3303 /** 3304 * amdgpu_device_check_vram_lost - check if vram is valid 3305 * 3306 * @adev: amdgpu_device pointer 3307 * 3308 * Checks the reset magic value written to the gart pointer in VRAM. 3309 * The driver calls this after a GPU reset to see if the contents of 3310 * VRAM is lost or now. 3311 * returns true if vram is lost, false if not. 3312 */ 3313 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 3314 { 3315 if (memcmp(adev->gart.ptr, adev->reset_magic, 3316 AMDGPU_RESET_MAGIC_NUM)) 3317 return true; 3318 3319 if (!amdgpu_in_reset(adev)) 3320 return false; 3321 3322 /* 3323 * For all ASICs with baco/mode1 reset, the VRAM is 3324 * always assumed to be lost. 3325 */ 3326 switch (amdgpu_asic_reset_method(adev)) { 3327 case AMD_RESET_METHOD_LEGACY: 3328 case AMD_RESET_METHOD_LINK: 3329 case AMD_RESET_METHOD_BACO: 3330 case AMD_RESET_METHOD_MODE1: 3331 return true; 3332 default: 3333 return false; 3334 } 3335 } 3336 3337 /** 3338 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 3339 * 3340 * @adev: amdgpu_device pointer 3341 * @state: clockgating state (gate or ungate) 3342 * 3343 * The list of all the hardware IPs that make up the asic is walked and the 3344 * set_clockgating_state callbacks are run. 3345 * Late initialization pass enabling clockgating for hardware IPs. 3346 * Fini or suspend, pass disabling clockgating for hardware IPs. 3347 * Returns 0 on success, negative error code on failure. 3348 */ 3349 3350 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 3351 enum amd_clockgating_state state) 3352 { 3353 int i, j, r; 3354 3355 if (amdgpu_emu_mode == 1) 3356 return 0; 3357 3358 for (j = 0; j < adev->num_ip_blocks; j++) { 3359 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3360 if (!adev->ip_blocks[i].status.late_initialized) 3361 continue; 3362 /* skip CG for GFX, SDMA on S0ix */ 3363 if (adev->in_s0ix && 3364 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3365 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3366 continue; 3367 /* skip CG for VCE/UVD, it's handled specially */ 3368 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3369 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3370 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3371 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3372 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 3373 /* enable clockgating to save power */ 3374 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i], 3375 state); 3376 if (r) { 3377 dev_err(adev->dev, 3378 "set_clockgating_state(gate) of IP block <%s> failed %d\n", 3379 adev->ip_blocks[i].version->funcs->name, 3380 r); 3381 return r; 3382 } 3383 } 3384 } 3385 3386 return 0; 3387 } 3388 3389 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 3390 enum amd_powergating_state state) 3391 { 3392 int i, j, r; 3393 3394 if (amdgpu_emu_mode == 1) 3395 return 0; 3396 3397 for (j = 0; j < adev->num_ip_blocks; j++) { 3398 i = state == AMD_PG_STATE_GATE ? 
j : adev->num_ip_blocks - j - 1; 3399 if (!adev->ip_blocks[i].status.late_initialized) 3400 continue; 3401 /* skip PG for GFX, SDMA on S0ix */ 3402 if (adev->in_s0ix && 3403 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3404 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3405 continue; 3406 /* skip PG for VCE/UVD, it's handled specially */ 3407 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3408 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3409 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3410 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3411 adev->ip_blocks[i].version->funcs->set_powergating_state) { 3412 /* enable powergating to save power */ 3413 r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i], 3414 state); 3415 if (r) { 3416 dev_err(adev->dev, 3417 "set_powergating_state(gate) of IP block <%s> failed %d\n", 3418 adev->ip_blocks[i].version->funcs->name, 3419 r); 3420 return r; 3421 } 3422 } 3423 } 3424 return 0; 3425 } 3426 3427 static int amdgpu_device_enable_mgpu_fan_boost(void) 3428 { 3429 struct amdgpu_gpu_instance *gpu_ins; 3430 struct amdgpu_device *adev; 3431 int i, ret = 0; 3432 3433 mutex_lock(&mgpu_info.mutex); 3434 3435 /* 3436 * MGPU fan boost feature should be enabled 3437 * only when there are two or more dGPUs in 3438 * the system 3439 */ 3440 if (mgpu_info.num_dgpu < 2) 3441 goto out; 3442 3443 for (i = 0; i < mgpu_info.num_dgpu; i++) { 3444 gpu_ins = &(mgpu_info.gpu_ins[i]); 3445 adev = gpu_ins->adev; 3446 if (!(adev->flags & AMD_IS_APU || amdgpu_sriov_multi_vf_mode(adev)) && 3447 !gpu_ins->mgpu_fan_enabled) { 3448 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); 3449 if (ret) 3450 break; 3451 3452 gpu_ins->mgpu_fan_enabled = 1; 3453 } 3454 } 3455 3456 out: 3457 mutex_unlock(&mgpu_info.mutex); 3458 3459 return ret; 3460 } 3461 3462 /** 3463 * amdgpu_device_ip_late_init - run late init for hardware IPs 3464 * 3465 * @adev: amdgpu_device pointer 3466 * 3467 * Late initialization pass for hardware IPs. The list of all the hardware 3468 * IPs that make up the asic is walked and the late_init callbacks are run. 3469 * late_init covers any special initialization that an IP requires 3470 * after all of the IP blocks have been initialized or something that needs to happen 3471 * late in the init process. 3472 * Returns 0 on success, negative error code on failure.
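 *
 * Besides the per-IP late_init callbacks, this pass also performs RAS late
 * init, enables clockgating and powergating, records the reset magic, turns
 * on the MGPU fan boost where applicable and lowers the XGMI p-state once
 * all devices of a hive have come up (see the body below).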
3473 */ 3474 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) 3475 { 3476 struct amdgpu_gpu_instance *gpu_instance; 3477 int i = 0, r; 3478 3479 for (i = 0; i < adev->num_ip_blocks; i++) { 3480 if (!adev->ip_blocks[i].status.hw) 3481 continue; 3482 if (adev->ip_blocks[i].version->funcs->late_init) { 3483 r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]); 3484 if (r) { 3485 dev_err(adev->dev, 3486 "late_init of IP block <%s> failed %d\n", 3487 adev->ip_blocks[i].version->funcs->name, 3488 r); 3489 return r; 3490 } 3491 } 3492 adev->ip_blocks[i].status.late_initialized = true; 3493 } 3494 3495 r = amdgpu_ras_late_init(adev); 3496 if (r) { 3497 dev_err(adev->dev, "amdgpu_ras_late_init failed %d", r); 3498 return r; 3499 } 3500 3501 if (!amdgpu_reset_in_recovery(adev)) 3502 amdgpu_ras_set_error_query_ready(adev, true); 3503 3504 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); 3505 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); 3506 3507 amdgpu_device_fill_reset_magic(adev); 3508 3509 r = amdgpu_device_enable_mgpu_fan_boost(); 3510 if (r) 3511 dev_err(adev->dev, "enable mgpu fan boost failed (%d).\n", r); 3512 3513 /* For passthrough configuration on arcturus and aldebaran, enable special SBR handling */ 3514 if (amdgpu_passthrough(adev) && 3515 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || 3516 adev->asic_type == CHIP_ALDEBARAN)) 3517 amdgpu_dpm_handle_passthrough_sbr(adev, true); 3518 3519 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3520 mutex_lock(&mgpu_info.mutex); 3521 3522 /* 3523 * Reset device p-state to low as this was booted with high. 3524 * 3525 * This should be performed only after all devices from the same 3526 * hive get initialized. 3527 * 3528 * However, the number of devices in the hive is not known in advance, 3529 * as it is counted one by one while the devices are initialized. 3530 * 3531 * So we wait until all XGMI interlinked devices are initialized. 3532 * This may bring some delays as those devices may come from 3533 * different hives. But that should be OK.
3534 */ 3535 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3536 for (i = 0; i < mgpu_info.num_gpu; i++) { 3537 gpu_instance = &(mgpu_info.gpu_ins[i]); 3538 if (gpu_instance->adev->flags & AMD_IS_APU) 3539 continue; 3540 3541 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3542 AMDGPU_XGMI_PSTATE_MIN); 3543 if (r) { 3544 dev_err(adev->dev, 3545 "pstate setting failed (%d).\n", 3546 r); 3547 break; 3548 } 3549 } 3550 } 3551 3552 mutex_unlock(&mgpu_info.mutex); 3553 } 3554 3555 return 0; 3556 } 3557 3558 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block) 3559 { 3560 struct amdgpu_device *adev = ip_block->adev; 3561 int r; 3562 3563 if (!ip_block->version->funcs->hw_fini) { 3564 dev_err(adev->dev, "hw_fini of IP block <%s> not defined\n", 3565 ip_block->version->funcs->name); 3566 } else { 3567 r = ip_block->version->funcs->hw_fini(ip_block); 3568 /* XXX handle errors */ 3569 if (r) { 3570 dev_dbg(adev->dev, 3571 "hw_fini of IP block <%s> failed %d\n", 3572 ip_block->version->funcs->name, r); 3573 } 3574 } 3575 3576 ip_block->status.hw = false; 3577 } 3578 3579 /** 3580 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3581 * 3582 * @adev: amdgpu_device pointer 3583 * 3584 * For ASICs need to disable SMC first 3585 */ 3586 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3587 { 3588 int i; 3589 3590 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3591 return; 3592 3593 for (i = 0; i < adev->num_ip_blocks; i++) { 3594 if (!adev->ip_blocks[i].status.hw) 3595 continue; 3596 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3597 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3598 break; 3599 } 3600 } 3601 } 3602 3603 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3604 { 3605 int i, r; 3606 3607 for (i = 0; i < adev->num_ip_blocks; i++) { 3608 if (!adev->ip_blocks[i].version->funcs->early_fini) 3609 continue; 3610 3611 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]); 3612 if (r) { 3613 dev_dbg(adev->dev, 3614 "early_fini of IP block <%s> failed %d\n", 3615 adev->ip_blocks[i].version->funcs->name, r); 3616 } 3617 } 3618 3619 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3620 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3621 3622 amdgpu_amdkfd_suspend(adev, true); 3623 amdgpu_userq_suspend(adev); 3624 3625 /* Workaround for ASICs need to disable SMC first */ 3626 amdgpu_device_smu_fini_early(adev); 3627 3628 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3629 if (!adev->ip_blocks[i].status.hw) 3630 continue; 3631 3632 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3633 } 3634 3635 if (amdgpu_sriov_vf(adev)) { 3636 if (amdgpu_virt_release_full_gpu(adev, false)) 3637 dev_err(adev->dev, 3638 "failed to release exclusive mode on fini\n"); 3639 } 3640 3641 return 0; 3642 } 3643 3644 /** 3645 * amdgpu_device_ip_fini - run fini for hardware IPs 3646 * 3647 * @adev: amdgpu_device pointer 3648 * 3649 * Main teardown pass for hardware IPs. The list of all the hardware 3650 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3651 * are run. hw_fini tears down the hardware associated with each IP 3652 * and sw_fini tears down any software state associated with each IP. 3653 * Returns 0 on success, negative error code on failure. 
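 *
 * Teardown walks the IP list in reverse order of initialization; when the
 * GMC block is reached, the shared buffers created during init (ucode BO,
 * static CSA, writeback, memory scratch, IB pool, seq64 and doorbells) are
 * released as well.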
3654 */ 3655 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3656 { 3657 int i, r; 3658 3659 amdgpu_cper_fini(adev); 3660 3661 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3662 amdgpu_virt_release_ras_err_handler_data(adev); 3663 3664 if (adev->gmc.xgmi.num_physical_nodes > 1) 3665 amdgpu_xgmi_remove_device(adev); 3666 3667 amdgpu_amdkfd_device_fini_sw(adev); 3668 3669 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3670 if (!adev->ip_blocks[i].status.sw) 3671 continue; 3672 3673 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3674 amdgpu_ucode_free_bo(adev); 3675 amdgpu_free_static_csa(&adev->virt.csa_obj); 3676 amdgpu_device_wb_fini(adev); 3677 amdgpu_device_mem_scratch_fini(adev); 3678 amdgpu_ib_pool_fini(adev); 3679 amdgpu_seq64_fini(adev); 3680 amdgpu_doorbell_fini(adev); 3681 } 3682 if (adev->ip_blocks[i].version->funcs->sw_fini) { 3683 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]); 3684 /* XXX handle errors */ 3685 if (r) { 3686 dev_dbg(adev->dev, 3687 "sw_fini of IP block <%s> failed %d\n", 3688 adev->ip_blocks[i].version->funcs->name, 3689 r); 3690 } 3691 } 3692 adev->ip_blocks[i].status.sw = false; 3693 adev->ip_blocks[i].status.valid = false; 3694 } 3695 3696 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3697 if (!adev->ip_blocks[i].status.late_initialized) 3698 continue; 3699 if (adev->ip_blocks[i].version->funcs->late_fini) 3700 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]); 3701 adev->ip_blocks[i].status.late_initialized = false; 3702 } 3703 3704 amdgpu_ras_fini(adev); 3705 amdgpu_uid_fini(adev); 3706 3707 return 0; 3708 } 3709 3710 /** 3711 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3712 * 3713 * @work: work_struct. 3714 */ 3715 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3716 { 3717 struct amdgpu_device *adev = 3718 container_of(work, struct amdgpu_device, delayed_init_work.work); 3719 int r; 3720 3721 r = amdgpu_ib_ring_tests(adev); 3722 if (r) 3723 dev_err(adev->dev, "ib ring test failed (%d).\n", r); 3724 } 3725 3726 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3727 { 3728 struct amdgpu_device *adev = 3729 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3730 3731 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3732 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3733 3734 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true, 0)) 3735 adev->gfx.gfx_off_state = true; 3736 } 3737 3738 /** 3739 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3740 * 3741 * @adev: amdgpu_device pointer 3742 * 3743 * Main suspend function for hardware IPs. The list of all the hardware 3744 * IPs that make up the asic is walked, clockgating is disabled and the 3745 * suspend callbacks are run. suspend puts the hardware and software state 3746 * in each IP into a state suitable for suspend. 3747 * Returns 0 on success, negative error code on failure. 3748 */ 3749 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3750 { 3751 int i, r; 3752 3753 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3754 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3755 3756 /* 3757 * Per PMFW team's suggestion, driver needs to handle gfxoff 3758 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3759 * scenario. Add the missing df cstate disablement here. 
3760 */ 3761 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3762 dev_warn(adev->dev, "Failed to disallow df cstate"); 3763 3764 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3765 if (!adev->ip_blocks[i].status.valid) 3766 continue; 3767 3768 /* displays are handled separately */ 3769 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3770 continue; 3771 3772 /* XXX handle errors */ 3773 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3774 if (r) 3775 return r; 3776 } 3777 3778 return 0; 3779 } 3780 3781 /** 3782 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3783 * 3784 * @adev: amdgpu_device pointer 3785 * 3786 * Main suspend function for hardware IPs. The list of all the hardware 3787 * IPs that make up the asic is walked, clockgating is disabled and the 3788 * suspend callbacks are run. suspend puts the hardware and software state 3789 * in each IP into a state suitable for suspend. 3790 * Returns 0 on success, negative error code on failure. 3791 */ 3792 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3793 { 3794 int i, r; 3795 3796 if (adev->in_s0ix) 3797 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3798 3799 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3800 if (!adev->ip_blocks[i].status.valid) 3801 continue; 3802 /* displays are handled in phase1 */ 3803 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3804 continue; 3805 /* PSP lost connection when err_event_athub occurs */ 3806 if (amdgpu_ras_intr_triggered() && 3807 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3808 adev->ip_blocks[i].status.hw = false; 3809 continue; 3810 } 3811 3812 /* skip unnecessary suspend if we do not initialize them yet */ 3813 if (!amdgpu_ip_member_of_hwini( 3814 adev, adev->ip_blocks[i].version->type)) 3815 continue; 3816 3817 /* Since we skip suspend for S0i3, we need to cancel the delayed 3818 * idle work here as the suspend callback never gets called. 3819 */ 3820 if (adev->in_s0ix && 3821 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX && 3822 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 0, 0)) 3823 cancel_delayed_work_sync(&adev->gfx.idle_work); 3824 /* skip suspend of gfx/mes and psp for S0ix 3825 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3826 * like at runtime. PSP is also part of the always on hardware 3827 * so no need to suspend it. 3828 */ 3829 if (adev->in_s0ix && 3830 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3831 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3832 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3833 continue; 3834 3835 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3836 if (adev->in_s0ix && 3837 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3838 IP_VERSION(5, 0, 0)) && 3839 (adev->ip_blocks[i].version->type == 3840 AMD_IP_BLOCK_TYPE_SDMA)) 3841 continue; 3842 3843 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3844 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3845 * from this location and RLC Autoload automatically also gets loaded 3846 * from here based on PMFW -> PSP message during re-init sequence. 3847 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3848 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3849 */ 3850 if (amdgpu_in_reset(adev) && 3851 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3852 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3853 continue; 3854 3855 /* XXX handle errors */ 3856 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3857 adev->ip_blocks[i].status.hw = false; 3858 3859 /* handle putting the SMC in the appropriate state */ 3860 if (!amdgpu_sriov_vf(adev)) { 3861 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3862 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3863 if (r) { 3864 dev_err(adev->dev, 3865 "SMC failed to set mp1 state %d, %d\n", 3866 adev->mp1_state, r); 3867 return r; 3868 } 3869 } 3870 } 3871 } 3872 3873 return 0; 3874 } 3875 3876 /** 3877 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3878 * 3879 * @adev: amdgpu_device pointer 3880 * 3881 * Main suspend function for hardware IPs. The list of all the hardware 3882 * IPs that make up the asic is walked, clockgating is disabled and the 3883 * suspend callbacks are run. suspend puts the hardware and software state 3884 * in each IP into a state suitable for suspend. 3885 * Returns 0 on success, negative error code on failure. 3886 */ 3887 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3888 { 3889 int r; 3890 3891 if (amdgpu_sriov_vf(adev)) { 3892 amdgpu_virt_fini_data_exchange(adev); 3893 amdgpu_virt_request_full_gpu(adev, false); 3894 } 3895 3896 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3897 3898 r = amdgpu_device_ip_suspend_phase1(adev); 3899 if (r) 3900 return r; 3901 r = amdgpu_device_ip_suspend_phase2(adev); 3902 3903 if (amdgpu_sriov_vf(adev)) 3904 amdgpu_virt_release_full_gpu(adev, false); 3905 3906 return r; 3907 } 3908 3909 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3910 { 3911 int i, r; 3912 3913 static enum amd_ip_block_type ip_order[] = { 3914 AMD_IP_BLOCK_TYPE_COMMON, 3915 AMD_IP_BLOCK_TYPE_GMC, 3916 AMD_IP_BLOCK_TYPE_PSP, 3917 AMD_IP_BLOCK_TYPE_IH, 3918 }; 3919 3920 for (i = 0; i < adev->num_ip_blocks; i++) { 3921 int j; 3922 struct amdgpu_ip_block *block; 3923 3924 block = &adev->ip_blocks[i]; 3925 block->status.hw = false; 3926 3927 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3928 3929 if (block->version->type != ip_order[j] || 3930 !block->status.valid) 3931 continue; 3932 3933 r = block->version->funcs->hw_init(&adev->ip_blocks[i]); 3934 if (r) { 3935 dev_err(adev->dev, "RE-INIT-early: %s failed\n", 3936 block->version->funcs->name); 3937 return r; 3938 } 3939 block->status.hw = true; 3940 } 3941 } 3942 3943 return 0; 3944 } 3945 3946 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3947 { 3948 struct amdgpu_ip_block *block; 3949 int i, r = 0; 3950 3951 static enum amd_ip_block_type ip_order[] = { 3952 AMD_IP_BLOCK_TYPE_SMC, 3953 AMD_IP_BLOCK_TYPE_DCE, 3954 AMD_IP_BLOCK_TYPE_GFX, 3955 AMD_IP_BLOCK_TYPE_SDMA, 3956 AMD_IP_BLOCK_TYPE_MES, 3957 AMD_IP_BLOCK_TYPE_UVD, 3958 AMD_IP_BLOCK_TYPE_VCE, 3959 AMD_IP_BLOCK_TYPE_VCN, 3960 AMD_IP_BLOCK_TYPE_JPEG 3961 }; 3962 3963 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3964 block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]); 3965 3966 if (!block) 3967 continue; 3968 3969 if (block->status.valid && !block->status.hw) { 3970 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) { 3971 r = amdgpu_ip_block_resume(block); 3972 } else { 3973 r = block->version->funcs->hw_init(block); 3974 } 3975 3976 if (r) { 3977 dev_err(adev->dev, "RE-INIT-late: %s failed\n", 3978 block->version->funcs->name); 3979 break; 3980 } 
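/* the block came up successfully; mark it as hw-initialized for later stages */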
3981 block->status.hw = true; 3982 } 3983 } 3984 3985 return r; 3986 } 3987 3988 /** 3989 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3990 * 3991 * @adev: amdgpu_device pointer 3992 * 3993 * First resume function for hardware IPs. The list of all the hardware 3994 * IPs that make up the asic is walked and the resume callbacks are run for 3995 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3996 * after a suspend and updates the software state as necessary. This 3997 * function is also used for restoring the GPU after a GPU reset. 3998 * Returns 0 on success, negative error code on failure. 3999 */ 4000 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 4001 { 4002 int i, r; 4003 4004 for (i = 0; i < adev->num_ip_blocks; i++) { 4005 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 4006 continue; 4007 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 4008 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 4009 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 4010 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 4011 4012 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 4013 if (r) 4014 return r; 4015 } 4016 } 4017 4018 return 0; 4019 } 4020 4021 /** 4022 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 4023 * 4024 * @adev: amdgpu_device pointer 4025 * 4026 * Second resume function for hardware IPs. The list of all the hardware 4027 * IPs that make up the asic is walked and the resume callbacks are run for 4028 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 4029 * functional state after a suspend and updates the software state as 4030 * necessary. This function is also used for restoring the GPU after a GPU 4031 * reset. 4032 * Returns 0 on success, negative error code on failure. 4033 */ 4034 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 4035 { 4036 int i, r; 4037 4038 for (i = 0; i < adev->num_ip_blocks; i++) { 4039 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 4040 continue; 4041 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 4042 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 4043 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 4044 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE || 4045 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 4046 continue; 4047 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 4048 if (r) 4049 return r; 4050 } 4051 4052 return 0; 4053 } 4054 4055 /** 4056 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs 4057 * 4058 * @adev: amdgpu_device pointer 4059 * 4060 * Third resume function for hardware IPs. The list of all the hardware 4061 * IPs that make up the asic is walked and the resume callbacks are run for 4062 * all DCE. resume puts the hardware into a functional state after a suspend 4063 * and updates the software state as necessary. This function is also used 4064 * for restoring the GPU after a GPU reset. 4065 * 4066 * Returns 0 on success, negative error code on failure. 
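* Phase 3 currently resumes only the DCE block, so displays are brought back
* up after the rest of the ASIC (including fences and buffer functions) is
* functional again.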
4067 */ 4068 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev) 4069 { 4070 int i, r; 4071 4072 for (i = 0; i < adev->num_ip_blocks; i++) { 4073 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 4074 continue; 4075 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) { 4076 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 4077 if (r) 4078 return r; 4079 } 4080 } 4081 4082 return 0; 4083 } 4084 4085 /** 4086 * amdgpu_device_ip_resume - run resume for hardware IPs 4087 * 4088 * @adev: amdgpu_device pointer 4089 * 4090 * Main resume function for hardware IPs. The hardware IPs 4091 * are split into two resume functions because they are 4092 * also used in recovering from a GPU reset and some additional 4093 * steps need to be take between them. In this case (S3/S4) they are 4094 * run sequentially. 4095 * Returns 0 on success, negative error code on failure. 4096 */ 4097 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 4098 { 4099 int r; 4100 4101 r = amdgpu_device_ip_resume_phase1(adev); 4102 if (r) 4103 return r; 4104 4105 r = amdgpu_device_fw_loading(adev); 4106 if (r) 4107 return r; 4108 4109 r = amdgpu_device_ip_resume_phase2(adev); 4110 4111 if (adev->mman.buffer_funcs_ring->sched.ready) 4112 amdgpu_ttm_set_buffer_funcs_status(adev, true); 4113 4114 if (r) 4115 return r; 4116 4117 amdgpu_fence_driver_hw_init(adev); 4118 4119 r = amdgpu_device_ip_resume_phase3(adev); 4120 4121 return r; 4122 } 4123 4124 /** 4125 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 4126 * 4127 * @adev: amdgpu_device pointer 4128 * 4129 * Query the VBIOS data tables to determine if the board supports SR-IOV. 4130 */ 4131 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 4132 { 4133 if (amdgpu_sriov_vf(adev)) { 4134 if (adev->is_atom_fw) { 4135 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 4136 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 4137 } else { 4138 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 4139 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 4140 } 4141 4142 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 4143 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 4144 } 4145 } 4146 4147 /** 4148 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 4149 * 4150 * @pdev : pci device context 4151 * @asic_type: AMD asic type 4152 * 4153 * Check if there is DC (new modesetting infrastructre) support for an asic. 4154 * returns true if DC has support, false if not. 4155 */ 4156 bool amdgpu_device_asic_has_dc_support(struct pci_dev *pdev, 4157 enum amd_asic_type asic_type) 4158 { 4159 switch (asic_type) { 4160 #ifdef CONFIG_DRM_AMDGPU_SI 4161 case CHIP_HAINAN: 4162 #endif 4163 case CHIP_TOPAZ: 4164 /* chips with no display hardware */ 4165 return false; 4166 #if defined(CONFIG_DRM_AMD_DC) 4167 case CHIP_TAHITI: 4168 case CHIP_PITCAIRN: 4169 case CHIP_VERDE: 4170 case CHIP_OLAND: 4171 /* 4172 * We have systems in the wild with these ASICs that require 4173 * LVDS and VGA support which is not supported with DC. 4174 * 4175 * Fallback to the non-DC driver here by default so as not to 4176 * cause regressions. 4177 */ 4178 #if defined(CONFIG_DRM_AMD_DC_SI) 4179 return amdgpu_dc > 0; 4180 #else 4181 return false; 4182 #endif 4183 case CHIP_BONAIRE: 4184 case CHIP_KAVERI: 4185 case CHIP_KABINI: 4186 case CHIP_MULLINS: 4187 /* 4188 * We have systems in the wild with these ASICs that require 4189 * VGA support which is not supported with DC. 
4190 * 4191 * Fallback to the non-DC driver here by default so as not to 4192 * cause regressions. 4193 */ 4194 return amdgpu_dc > 0; 4195 default: 4196 return amdgpu_dc != 0; 4197 #else 4198 default: 4199 if (amdgpu_dc > 0) 4200 dev_info_once( 4201 &pdev->dev, 4202 "Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 4203 return false; 4204 #endif 4205 } 4206 } 4207 4208 /** 4209 * amdgpu_device_has_dc_support - check if dc is supported 4210 * 4211 * @adev: amdgpu_device pointer 4212 * 4213 * Returns true for supported, false for not supported 4214 */ 4215 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 4216 { 4217 if (adev->enable_virtual_display || 4218 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 4219 return false; 4220 4221 return amdgpu_device_asic_has_dc_support(adev->pdev, adev->asic_type); 4222 } 4223 4224 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 4225 { 4226 struct amdgpu_device *adev = 4227 container_of(__work, struct amdgpu_device, xgmi_reset_work); 4228 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 4229 4230 /* It's a bug to not have a hive within this function */ 4231 if (WARN_ON(!hive)) 4232 return; 4233 4234 /* 4235 * Use task barrier to synchronize all xgmi reset works across the 4236 * hive. task_barrier_enter and task_barrier_exit will block 4237 * until all the threads running the xgmi reset works reach 4238 * those points. task_barrier_full will do both blocks. 4239 */ 4240 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 4241 4242 task_barrier_enter(&hive->tb); 4243 adev->asic_reset_res = amdgpu_device_baco_enter(adev); 4244 4245 if (adev->asic_reset_res) 4246 goto fail; 4247 4248 task_barrier_exit(&hive->tb); 4249 adev->asic_reset_res = amdgpu_device_baco_exit(adev); 4250 4251 if (adev->asic_reset_res) 4252 goto fail; 4253 4254 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 4255 } else { 4256 4257 task_barrier_full(&hive->tb); 4258 adev->asic_reset_res = amdgpu_asic_reset(adev); 4259 } 4260 4261 fail: 4262 if (adev->asic_reset_res) 4263 dev_warn(adev->dev, 4264 "ASIC reset failed with error, %d for drm dev, %s", 4265 adev->asic_reset_res, adev_to_drm(adev)->unique); 4266 amdgpu_put_xgmi_hive(hive); 4267 } 4268 4269 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 4270 { 4271 char *input = amdgpu_lockup_timeout; 4272 char *timeout_setting = NULL; 4273 int index = 0; 4274 long timeout; 4275 int ret = 0; 4276 4277 /* 4278 * By default timeout for jobs is 10 sec 4279 */ 4280 adev->compute_timeout = adev->gfx_timeout = msecs_to_jiffies(10000); 4281 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4282 4283 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4284 while ((timeout_setting = strsep(&input, ",")) && 4285 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4286 ret = kstrtol(timeout_setting, 0, &timeout); 4287 if (ret) 4288 return ret; 4289 4290 if (timeout == 0) { 4291 index++; 4292 continue; 4293 } else if (timeout < 0) { 4294 timeout = MAX_SCHEDULE_TIMEOUT; 4295 dev_warn(adev->dev, "lockup timeout disabled"); 4296 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 4297 } else { 4298 timeout = msecs_to_jiffies(timeout); 4299 } 4300 4301 switch (index++) { 4302 case 0: 4303 adev->gfx_timeout = timeout; 4304 break; 4305 case 1: 4306 adev->compute_timeout = timeout; 4307 break; 4308 case 2: 4309 adev->sdma_timeout = timeout; 4310 break; 4311 case 3: 4312 adev->video_timeout = 
timeout; 4313 break; 4314 default: 4315 break; 4316 } 4317 } 4318 /* 4319 * There is only one value specified and 4320 * it should apply to all non-compute jobs. 4321 */ 4322 if (index == 1) { 4323 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4324 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 4325 adev->compute_timeout = adev->gfx_timeout; 4326 } 4327 } 4328 4329 return ret; 4330 } 4331 4332 /** 4333 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU 4334 * 4335 * @adev: amdgpu_device pointer 4336 * 4337 * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode 4338 */ 4339 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 4340 { 4341 struct iommu_domain *domain; 4342 4343 domain = iommu_get_domain_for_dev(adev->dev); 4344 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 4345 adev->ram_is_direct_mapped = true; 4346 } 4347 4348 #if defined(CONFIG_HSA_AMD_P2P) 4349 /** 4350 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled. 4351 * 4352 * @adev: amdgpu_device pointer 4353 * 4354 * return if IOMMU remapping bar address 4355 */ 4356 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev) 4357 { 4358 struct iommu_domain *domain; 4359 4360 domain = iommu_get_domain_for_dev(adev->dev); 4361 if (domain && (domain->type == IOMMU_DOMAIN_DMA || 4362 domain->type == IOMMU_DOMAIN_DMA_FQ)) 4363 return true; 4364 4365 return false; 4366 } 4367 #endif 4368 4369 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 4370 { 4371 if (amdgpu_mcbp == 1) 4372 adev->gfx.mcbp = true; 4373 else if (amdgpu_mcbp == 0) 4374 adev->gfx.mcbp = false; 4375 4376 if (amdgpu_sriov_vf(adev)) 4377 adev->gfx.mcbp = true; 4378 4379 if (adev->gfx.mcbp) 4380 dev_info(adev->dev, "MCBP is enabled\n"); 4381 } 4382 4383 /** 4384 * amdgpu_device_init - initialize the driver 4385 * 4386 * @adev: amdgpu_device pointer 4387 * @flags: driver flags 4388 * 4389 * Initializes the driver info and hw (all asics). 4390 * Returns 0 for success or an error on failure. 4391 * Called at driver startup. 
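* (Typically invoked from the PCI probe path, after the amdgpu_device has
* been allocated and adev->pdev set up, and before the DRM device is
* registered with userspace.)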
4392 */ 4393 int amdgpu_device_init(struct amdgpu_device *adev, 4394 uint32_t flags) 4395 { 4396 struct pci_dev *pdev = adev->pdev; 4397 int r, i; 4398 bool px = false; 4399 u32 max_MBps; 4400 int tmp; 4401 4402 adev->shutdown = false; 4403 adev->flags = flags; 4404 4405 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 4406 adev->asic_type = amdgpu_force_asic_type; 4407 else 4408 adev->asic_type = flags & AMD_ASIC_MASK; 4409 4410 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 4411 if (amdgpu_emu_mode == 1) 4412 adev->usec_timeout *= 10; 4413 adev->gmc.gart_size = 512 * 1024 * 1024; 4414 adev->accel_working = false; 4415 adev->num_rings = 0; 4416 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 4417 adev->mman.buffer_funcs = NULL; 4418 adev->mman.buffer_funcs_ring = NULL; 4419 adev->vm_manager.vm_pte_funcs = NULL; 4420 adev->vm_manager.vm_pte_num_scheds = 0; 4421 adev->gmc.gmc_funcs = NULL; 4422 adev->harvest_ip_mask = 0x0; 4423 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 4424 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 4425 4426 adev->smc_rreg = &amdgpu_invalid_rreg; 4427 adev->smc_wreg = &amdgpu_invalid_wreg; 4428 adev->pcie_rreg = &amdgpu_invalid_rreg; 4429 adev->pcie_wreg = &amdgpu_invalid_wreg; 4430 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 4431 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 4432 adev->pciep_rreg = &amdgpu_invalid_rreg; 4433 adev->pciep_wreg = &amdgpu_invalid_wreg; 4434 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 4435 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 4436 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 4437 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 4438 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 4439 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 4440 adev->didt_rreg = &amdgpu_invalid_rreg; 4441 adev->didt_wreg = &amdgpu_invalid_wreg; 4442 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 4443 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 4444 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 4445 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 4446 4447 dev_info( 4448 adev->dev, 4449 "initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 4450 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 4451 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 4452 4453 /* mutex initialization are all done here so we 4454 * can recall function without having locking issues 4455 */ 4456 mutex_init(&adev->firmware.mutex); 4457 mutex_init(&adev->pm.mutex); 4458 mutex_init(&adev->gfx.gpu_clock_mutex); 4459 mutex_init(&adev->srbm_mutex); 4460 mutex_init(&adev->gfx.pipe_reserve_mutex); 4461 mutex_init(&adev->gfx.gfx_off_mutex); 4462 mutex_init(&adev->gfx.partition_mutex); 4463 mutex_init(&adev->grbm_idx_mutex); 4464 mutex_init(&adev->mn_lock); 4465 mutex_init(&adev->virt.vf_errors.lock); 4466 hash_init(adev->mn_hash); 4467 mutex_init(&adev->psp.mutex); 4468 mutex_init(&adev->notifier_lock); 4469 mutex_init(&adev->pm.stable_pstate_ctx_lock); 4470 mutex_init(&adev->benchmark_mutex); 4471 mutex_init(&adev->gfx.reset_sem_mutex); 4472 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */ 4473 mutex_init(&adev->enforce_isolation_mutex); 4474 for (i = 0; i < MAX_XCP; ++i) { 4475 adev->isolation[i].spearhead = dma_fence_get_stub(); 4476 amdgpu_sync_create(&adev->isolation[i].active); 4477 amdgpu_sync_create(&adev->isolation[i].prev); 4478 } 4479 mutex_init(&adev->gfx.userq_sch_mutex); 4480 
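/* serialize gfx/vcn workload profile changes and user queue management */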
mutex_init(&adev->gfx.workload_profile_mutex); 4481 mutex_init(&adev->vcn.workload_profile_mutex); 4482 mutex_init(&adev->userq_mutex); 4483 4484 amdgpu_device_init_apu_flags(adev); 4485 4486 r = amdgpu_device_check_arguments(adev); 4487 if (r) 4488 return r; 4489 4490 spin_lock_init(&adev->mmio_idx_lock); 4491 spin_lock_init(&adev->smc_idx_lock); 4492 spin_lock_init(&adev->pcie_idx_lock); 4493 spin_lock_init(&adev->uvd_ctx_idx_lock); 4494 spin_lock_init(&adev->didt_idx_lock); 4495 spin_lock_init(&adev->gc_cac_idx_lock); 4496 spin_lock_init(&adev->se_cac_idx_lock); 4497 spin_lock_init(&adev->audio_endpt_idx_lock); 4498 spin_lock_init(&adev->mm_stats.lock); 4499 spin_lock_init(&adev->virt.rlcg_reg_lock); 4500 spin_lock_init(&adev->wb.lock); 4501 4502 xa_init_flags(&adev->userq_xa, XA_FLAGS_LOCK_IRQ); 4503 4504 INIT_LIST_HEAD(&adev->reset_list); 4505 4506 INIT_LIST_HEAD(&adev->ras_list); 4507 4508 INIT_LIST_HEAD(&adev->pm.od_kobj_list); 4509 4510 INIT_LIST_HEAD(&adev->userq_mgr_list); 4511 4512 INIT_DELAYED_WORK(&adev->delayed_init_work, 4513 amdgpu_device_delayed_init_work_handler); 4514 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 4515 amdgpu_device_delay_enable_gfx_off); 4516 /* 4517 * Initialize the enforce_isolation work structures for each XCP 4518 * partition. This work handler is responsible for enforcing shader 4519 * isolation on AMD GPUs. It counts the number of emitted fences for 4520 * each GFX and compute ring. If there are any fences, it schedules 4521 * the `enforce_isolation_work` to be run after a delay. If there are 4522 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the 4523 * runqueue. 4524 */ 4525 for (i = 0; i < MAX_XCP; i++) { 4526 INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work, 4527 amdgpu_gfx_enforce_isolation_handler); 4528 adev->gfx.enforce_isolation[i].adev = adev; 4529 adev->gfx.enforce_isolation[i].xcp_id = i; 4530 } 4531 4532 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 4533 4534 adev->gfx.gfx_off_req_count = 1; 4535 adev->gfx.gfx_off_residency = 0; 4536 adev->gfx.gfx_off_entrycount = 0; 4537 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 4538 4539 atomic_set(&adev->throttling_logging_enabled, 1); 4540 /* 4541 * If throttling continues, logging will be performed every minute 4542 * to avoid log flooding. "-1" is subtracted since the thermal 4543 * throttling interrupt comes every second. Thus, the total logging 4544 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting 4545 * for throttling interrupt) = 60 seconds.
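* For example, with HZ=1000 the ratelimit_state_init() call below arms a
* 59000-jiffy (59 s) window with a burst of one message.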
4546 */ 4547 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 4548 4549 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 4550 4551 /* Registers mapping */ 4552 /* TODO: block userspace mapping of io register */ 4553 if (adev->asic_type >= CHIP_BONAIRE) { 4554 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 4555 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 4556 } else { 4557 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 4558 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 4559 } 4560 4561 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 4562 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 4563 4564 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 4565 if (!adev->rmmio) 4566 return -ENOMEM; 4567 4568 dev_info(adev->dev, "register mmio base: 0x%08X\n", 4569 (uint32_t)adev->rmmio_base); 4570 dev_info(adev->dev, "register mmio size: %u\n", 4571 (unsigned int)adev->rmmio_size); 4572 4573 /* 4574 * The reset domain needs to be present early, before the XGMI hive (if any) 4575 * is discovered and initialized, so that the reset sem and in_gpu reset flag 4576 * can be used early on during init and before calling RREG32. 4577 */ 4578 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 4579 if (!adev->reset_domain) 4580 return -ENOMEM; 4581 4582 /* detect hw virtualization here */ 4583 amdgpu_virt_init(adev); 4584 4585 amdgpu_device_get_pcie_info(adev); 4586 4587 r = amdgpu_device_get_job_timeout_settings(adev); 4588 if (r) { 4589 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 4590 return r; 4591 } 4592 4593 amdgpu_device_set_mcbp(adev); 4594 4595 /* 4596 * By default, use the default level where all blocks are expected to be 4597 * initialized. At present a 'swinit' of blocks is required to be 4598 * completed before the need for a different level is detected. 4599 */ 4600 amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT); 4601 /* early init functions */ 4602 r = amdgpu_device_ip_early_init(adev); 4603 if (r) 4604 return r; 4605 4606 /* 4607 * No need to remove conflicting FBs for non-display class devices. 4608 * This prevents the sysfb from being freed accidentally. 4609 */ 4610 if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA || 4611 (pdev->class >> 8) == PCI_CLASS_DISPLAY_OTHER) { 4612 /* Get rid of things like offb */ 4613 r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name); 4614 if (r) 4615 return r; 4616 } 4617 4618 /* Enable TMZ based on IP_VERSION */ 4619 amdgpu_gmc_tmz_set(adev); 4620 4621 if (amdgpu_sriov_vf(adev) && 4622 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0)) 4623 /* VF MMIO access (except mailbox range) from CPU 4624 * will be blocked during sriov runtime 4625 */ 4626 adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT; 4627 4628 amdgpu_gmc_noretry_set(adev); 4629 /* Need to get xgmi info early to decide the reset behavior */ 4630 if (adev->gmc.xgmi.supported) { 4631 r = adev->gfxhub.funcs->get_xgmi_info(adev); 4632 if (r) 4633 return r; 4634 } 4635 4636 /* enable PCIE atomic ops */ 4637 if (amdgpu_sriov_vf(adev)) { 4638 if (adev->virt.fw_reserve.p_pf2vf) 4639 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 4640 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 4641 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4642 /* APUs with gfx9 onwards don't rely on PCIe atomics; their 4643 * internal path natively supports atomics, so set have_atomics_support to true.
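* Note the else-if chain below: SR-IOV guests trust the atomics flags the host
* publishes in the pf2vf structure, while bare-metal dGPUs query the root port
* via pci_enable_atomic_ops_to_root().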
4644 */ 4645 } else if ((adev->flags & AMD_IS_APU) && 4646 (amdgpu_ip_version(adev, GC_HWIP, 0) > 4647 IP_VERSION(9, 0, 0))) { 4648 adev->have_atomics_support = true; 4649 } else { 4650 adev->have_atomics_support = 4651 !pci_enable_atomic_ops_to_root(adev->pdev, 4652 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 4653 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4654 } 4655 4656 if (!adev->have_atomics_support) 4657 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 4658 4659 /* doorbell bar mapping and doorbell index init*/ 4660 amdgpu_doorbell_init(adev); 4661 4662 if (amdgpu_emu_mode == 1) { 4663 /* post the asic on emulation mode */ 4664 emu_soc_asic_init(adev); 4665 goto fence_driver_init; 4666 } 4667 4668 amdgpu_reset_init(adev); 4669 4670 /* detect if we are with an SRIOV vbios */ 4671 if (adev->bios) 4672 amdgpu_device_detect_sriov_bios(adev); 4673 4674 /* check if we need to reset the asic 4675 * E.g., driver was not cleanly unloaded previously, etc. 4676 */ 4677 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4678 if (adev->gmc.xgmi.num_physical_nodes) { 4679 dev_info(adev->dev, "Pending hive reset.\n"); 4680 amdgpu_set_init_level(adev, 4681 AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 4682 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && 4683 !amdgpu_device_has_display_hardware(adev)) { 4684 r = psp_gpu_reset(adev); 4685 } else { 4686 tmp = amdgpu_reset_method; 4687 /* It should do a default reset when loading or reloading the driver, 4688 * regardless of the module parameter reset_method. 4689 */ 4690 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4691 r = amdgpu_asic_reset(adev); 4692 amdgpu_reset_method = tmp; 4693 } 4694 4695 if (r) { 4696 dev_err(adev->dev, "asic reset on init failed\n"); 4697 goto failed; 4698 } 4699 } 4700 4701 /* Post card if necessary */ 4702 if (amdgpu_device_need_post(adev)) { 4703 if (!adev->bios) { 4704 dev_err(adev->dev, "no vBIOS found\n"); 4705 r = -EINVAL; 4706 goto failed; 4707 } 4708 dev_info(adev->dev, "GPU posting now...\n"); 4709 r = amdgpu_device_asic_init(adev); 4710 if (r) { 4711 dev_err(adev->dev, "gpu post error!\n"); 4712 goto failed; 4713 } 4714 } 4715 4716 if (adev->bios) { 4717 if (adev->is_atom_fw) { 4718 /* Initialize clocks */ 4719 r = amdgpu_atomfirmware_get_clock_info(adev); 4720 if (r) { 4721 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4722 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4723 goto failed; 4724 } 4725 } else { 4726 /* Initialize clocks */ 4727 r = amdgpu_atombios_get_clock_info(adev); 4728 if (r) { 4729 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4730 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4731 goto failed; 4732 } 4733 /* init i2c buses */ 4734 amdgpu_i2c_init(adev); 4735 } 4736 } 4737 4738 fence_driver_init: 4739 /* Fence driver */ 4740 r = amdgpu_fence_driver_sw_init(adev); 4741 if (r) { 4742 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4743 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4744 goto failed; 4745 } 4746 4747 /* init the mode config */ 4748 drm_mode_config_init(adev_to_drm(adev)); 4749 4750 r = amdgpu_device_ip_init(adev); 4751 if (r) { 4752 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4753 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4754 goto release_ras_con; 4755 } 4756 4757 amdgpu_fence_driver_hw_init(adev); 4758 4759 dev_info(adev->dev, 4760 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4761 
adev->gfx.config.max_shader_engines, 4762 adev->gfx.config.max_sh_per_se, 4763 adev->gfx.config.max_cu_per_sh, 4764 adev->gfx.cu_info.number); 4765 4766 adev->accel_working = true; 4767 4768 amdgpu_vm_check_compute_bug(adev); 4769 4770 /* Initialize the buffer migration limit. */ 4771 if (amdgpu_moverate >= 0) 4772 max_MBps = amdgpu_moverate; 4773 else 4774 max_MBps = 8; /* Allow 8 MB/s. */ 4775 /* Get a log2 for easy divisions. */ 4776 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4777 4778 /* 4779 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4780 * Otherwise the mgpu fan boost feature will be skipped because the 4781 * gpu instance count would be too low. 4782 */ 4783 amdgpu_register_gpu_instance(adev); 4784 4785 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4786 * explicit gating rather than handling it automatically. 4787 */ 4788 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 4789 r = amdgpu_device_ip_late_init(adev); 4790 if (r) { 4791 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4792 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4793 goto release_ras_con; 4794 } 4795 /* must succeed. */ 4796 amdgpu_ras_resume(adev); 4797 queue_delayed_work(system_wq, &adev->delayed_init_work, 4798 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4799 } 4800 4801 if (amdgpu_sriov_vf(adev)) { 4802 amdgpu_virt_release_full_gpu(adev, true); 4803 flush_delayed_work(&adev->delayed_init_work); 4804 } 4805 4806 /* 4807 * Register these sysfs interfaces after `late_init`, as some of the 4808 * operations performed in `late_init` may affect how the sysfs 4809 * interfaces are created. 4810 */ 4811 r = amdgpu_atombios_sysfs_init(adev); 4812 if (r) 4813 drm_err(&adev->ddev, 4814 "registering atombios sysfs failed (%d).\n", r); 4815 4816 r = amdgpu_pm_sysfs_init(adev); 4817 if (r) 4818 dev_err(adev->dev, "registering pm sysfs failed (%d).\n", r); 4819 4820 r = amdgpu_ucode_sysfs_init(adev); 4821 if (r) { 4822 adev->ucode_sysfs_en = false; 4823 dev_err(adev->dev, "Creating firmware sysfs failed (%d).\n", r); 4824 } else 4825 adev->ucode_sysfs_en = true; 4826 4827 r = amdgpu_device_attr_sysfs_init(adev); 4828 if (r) 4829 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4830 4831 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4832 if (r) 4833 dev_err(adev->dev, 4834 "Could not create amdgpu board attributes\n"); 4835 4836 amdgpu_fru_sysfs_init(adev); 4837 amdgpu_reg_state_sysfs_init(adev); 4838 amdgpu_xcp_sysfs_init(adev); 4839 4840 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4841 r = amdgpu_pmu_init(adev); 4842 if (r) 4843 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4844 4845 /* Keep the stored PCI config space at hand for restore after a sudden PCI error */ 4846 if (amdgpu_device_cache_pci_state(adev->pdev)) 4847 pci_restore_state(pdev); 4848 4849 /* if we have > 1 VGA card, then disable the amdgpu VGA resources */ 4850 /* this will fail for cards that aren't VGA class devices, just 4851 * ignore it 4852 */ 4853 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4854 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4855 4856 px = amdgpu_device_supports_px(adev); 4857 4858 if (px || (!dev_is_removable(&adev->pdev->dev) && 4859 apple_gmux_detect(NULL, NULL))) 4860 vga_switcheroo_register_client(adev->pdev, 4861 &amdgpu_switcheroo_ops, px); 4862 4863 if (px) 4864 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4865 4866 if (adev->init_lvl->level ==
AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 4867 amdgpu_xgmi_reset_on_init(adev); 4868 4869 amdgpu_device_check_iommu_direct_map(adev); 4870 4871 adev->pm_nb.notifier_call = amdgpu_device_pm_notifier; 4872 r = register_pm_notifier(&adev->pm_nb); 4873 if (r) 4874 goto failed; 4875 4876 return 0; 4877 4878 release_ras_con: 4879 if (amdgpu_sriov_vf(adev)) 4880 amdgpu_virt_release_full_gpu(adev, true); 4881 4882 /* failed in exclusive mode due to timeout */ 4883 if (amdgpu_sriov_vf(adev) && 4884 !amdgpu_sriov_runtime(adev) && 4885 amdgpu_virt_mmio_blocked(adev) && 4886 !amdgpu_virt_wait_reset(adev)) { 4887 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4888 /* Don't send request since VF is inactive. */ 4889 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4890 adev->virt.ops = NULL; 4891 r = -EAGAIN; 4892 } 4893 amdgpu_release_ras_context(adev); 4894 4895 failed: 4896 amdgpu_vf_error_trans_all(adev); 4897 4898 return r; 4899 } 4900 4901 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4902 { 4903 4904 /* Clear all CPU mappings pointing to this device */ 4905 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4906 4907 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4908 amdgpu_doorbell_fini(adev); 4909 4910 iounmap(adev->rmmio); 4911 adev->rmmio = NULL; 4912 if (adev->mman.aper_base_kaddr) 4913 iounmap(adev->mman.aper_base_kaddr); 4914 adev->mman.aper_base_kaddr = NULL; 4915 4916 /* Memory manager related */ 4917 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4918 arch_phys_wc_del(adev->gmc.vram_mtrr); 4919 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4920 } 4921 } 4922 4923 /** 4924 * amdgpu_device_fini_hw - tear down the driver 4925 * 4926 * @adev: amdgpu_device pointer 4927 * 4928 * Tear down the driver info (all asics). 4929 * Called at driver shutdown. 
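* The remaining software state is released later in amdgpu_device_fini_sw().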
4930 */ 4931 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4932 { 4933 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4934 flush_delayed_work(&adev->delayed_init_work); 4935 4936 if (adev->mman.initialized) 4937 drain_workqueue(adev->mman.bdev.wq); 4938 adev->shutdown = true; 4939 4940 unregister_pm_notifier(&adev->pm_nb); 4941 4942 /* make sure IB test finished before entering exclusive mode 4943 * to avoid preemption on IB test 4944 */ 4945 if (amdgpu_sriov_vf(adev)) { 4946 amdgpu_virt_request_full_gpu(adev, false); 4947 amdgpu_virt_fini_data_exchange(adev); 4948 } 4949 4950 /* disable all interrupts */ 4951 amdgpu_irq_disable_all(adev); 4952 if (adev->mode_info.mode_config_initialized) { 4953 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4954 drm_helper_force_disable_all(adev_to_drm(adev)); 4955 else 4956 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4957 } 4958 amdgpu_fence_driver_hw_fini(adev); 4959 4960 if (adev->pm.sysfs_initialized) 4961 amdgpu_pm_sysfs_fini(adev); 4962 if (adev->ucode_sysfs_en) 4963 amdgpu_ucode_sysfs_fini(adev); 4964 amdgpu_device_attr_sysfs_fini(adev); 4965 amdgpu_fru_sysfs_fini(adev); 4966 4967 amdgpu_reg_state_sysfs_fini(adev); 4968 amdgpu_xcp_sysfs_fini(adev); 4969 4970 /* disable ras feature must before hw fini */ 4971 amdgpu_ras_pre_fini(adev); 4972 4973 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4974 4975 amdgpu_device_ip_fini_early(adev); 4976 4977 amdgpu_irq_fini_hw(adev); 4978 4979 if (adev->mman.initialized) 4980 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4981 4982 amdgpu_gart_dummy_page_fini(adev); 4983 4984 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4985 amdgpu_device_unmap_mmio(adev); 4986 4987 } 4988 4989 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4990 { 4991 int i, idx; 4992 bool px; 4993 4994 amdgpu_device_ip_fini(adev); 4995 amdgpu_fence_driver_sw_fini(adev); 4996 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4997 adev->accel_working = false; 4998 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4999 for (i = 0; i < MAX_XCP; ++i) { 5000 dma_fence_put(adev->isolation[i].spearhead); 5001 amdgpu_sync_free(&adev->isolation[i].active); 5002 amdgpu_sync_free(&adev->isolation[i].prev); 5003 } 5004 5005 amdgpu_reset_fini(adev); 5006 5007 /* free i2c buses */ 5008 amdgpu_i2c_fini(adev); 5009 5010 if (adev->bios) { 5011 if (amdgpu_emu_mode != 1) 5012 amdgpu_atombios_fini(adev); 5013 amdgpu_bios_release(adev); 5014 } 5015 5016 kfree(adev->fru_info); 5017 adev->fru_info = NULL; 5018 5019 kfree(adev->xcp_mgr); 5020 adev->xcp_mgr = NULL; 5021 5022 px = amdgpu_device_supports_px(adev); 5023 5024 if (px || (!dev_is_removable(&adev->pdev->dev) && 5025 apple_gmux_detect(NULL, NULL))) 5026 vga_switcheroo_unregister_client(adev->pdev); 5027 5028 if (px) 5029 vga_switcheroo_fini_domain_pm_ops(adev->dev); 5030 5031 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 5032 vga_client_unregister(adev->pdev); 5033 5034 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 5035 5036 iounmap(adev->rmmio); 5037 adev->rmmio = NULL; 5038 drm_dev_exit(idx); 5039 } 5040 5041 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 5042 amdgpu_pmu_fini(adev); 5043 if (adev->mman.discovery_bin) 5044 amdgpu_discovery_fini(adev); 5045 5046 amdgpu_reset_put_reset_domain(adev->reset_domain); 5047 adev->reset_domain = NULL; 5048 5049 kfree(adev->pci_state); 5050 kfree(adev->pcie_reset_ctx.swds_pcistate); 5051 kfree(adev->pcie_reset_ctx.swus_pcistate); 5052 } 5053 5054 /** 5055 * amdgpu_device_evict_resources - evict device resources 5056 * 
@adev: amdgpu device object 5057 * 5058 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 5059 * of the vram memory type. Mainly used for evicting device resources 5060 * at suspend time. 5061 * 5062 */ 5063 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 5064 { 5065 int ret; 5066 5067 /* No need to evict vram on APUs unless going to S4 */ 5068 if (!adev->in_s4 && (adev->flags & AMD_IS_APU)) 5069 return 0; 5070 5071 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 5072 if (ret) { 5073 dev_warn(adev->dev, "evicting device resources failed\n"); 5074 return ret; 5075 } 5076 5077 if (adev->in_s4) { 5078 ret = ttm_device_prepare_hibernation(&adev->mman.bdev); 5079 if (ret) 5080 dev_err(adev->dev, "prepare hibernation failed, %d\n", ret); 5081 } 5082 return ret; 5083 } 5084 5085 /* 5086 * Suspend & resume. 5087 */ 5088 /** 5089 * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events 5090 * @nb: notifier block 5091 * @mode: suspend mode 5092 * @data: data 5093 * 5094 * This function is called when the system is about to suspend or hibernate. 5095 * It is used to set the appropriate flags so that eviction can be optimized 5096 * in the pm prepare callback. 5097 */ 5098 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 5099 void *data) 5100 { 5101 struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb); 5102 5103 switch (mode) { 5104 case PM_HIBERNATION_PREPARE: 5105 adev->in_s4 = true; 5106 break; 5107 case PM_POST_HIBERNATION: 5108 adev->in_s4 = false; 5109 break; 5110 } 5111 5112 return NOTIFY_DONE; 5113 } 5114 5115 /** 5116 * amdgpu_device_prepare - prepare for device suspend 5117 * 5118 * @dev: drm dev pointer 5119 * 5120 * Prepare to put the hw in the suspend state (all asics). 5121 * Returns 0 for success or an error on failure. 5122 * Called at driver suspend. 5123 */ 5124 int amdgpu_device_prepare(struct drm_device *dev) 5125 { 5126 struct amdgpu_device *adev = drm_to_adev(dev); 5127 int i, r; 5128 5129 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5130 return 0; 5131 5132 /* Evict the majority of BOs before starting suspend sequence */ 5133 r = amdgpu_device_evict_resources(adev); 5134 if (r) 5135 return r; 5136 5137 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 5138 5139 for (i = 0; i < adev->num_ip_blocks; i++) { 5140 if (!adev->ip_blocks[i].status.valid) 5141 continue; 5142 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 5143 continue; 5144 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]); 5145 if (r) 5146 return r; 5147 } 5148 5149 return 0; 5150 } 5151 5152 /** 5153 * amdgpu_device_complete - complete power state transition 5154 * 5155 * @dev: drm dev pointer 5156 * 5157 * Undo the changes from amdgpu_device_prepare. This will be 5158 * called on all resume transitions, including those that failed. 5159 */ 5160 void amdgpu_device_complete(struct drm_device *dev) 5161 { 5162 struct amdgpu_device *adev = drm_to_adev(dev); 5163 int i; 5164 5165 for (i = 0; i < adev->num_ip_blocks; i++) { 5166 if (!adev->ip_blocks[i].status.valid) 5167 continue; 5168 if (!adev->ip_blocks[i].version->funcs->complete) 5169 continue; 5170 adev->ip_blocks[i].version->funcs->complete(&adev->ip_blocks[i]); 5171 } 5172 } 5173 5174 /** 5175 * amdgpu_device_suspend - initiate device suspend 5176 * 5177 * @dev: drm dev pointer 5178 * @notify_clients: notify in-kernel DRM clients 5179 * 5180 * Puts the hw in the suspend state (all asics). 
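* Display IPs are suspended first (phase 1), followed by KFD and user queues,
* then the remaining IPs (phase 2).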
5181 * Returns 0 for success or an error on failure. 5182 * Called at driver suspend. 5183 */ 5184 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) 5185 { 5186 struct amdgpu_device *adev = drm_to_adev(dev); 5187 int r = 0; 5188 5189 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5190 return 0; 5191 5192 adev->in_suspend = true; 5193 5194 if (amdgpu_sriov_vf(adev)) { 5195 if (!adev->in_s0ix && !adev->in_runpm) 5196 amdgpu_amdkfd_suspend_process(adev); 5197 amdgpu_virt_fini_data_exchange(adev); 5198 r = amdgpu_virt_request_full_gpu(adev, false); 5199 if (r) 5200 return r; 5201 } 5202 5203 if (amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D3)) 5204 dev_warn(adev->dev, "smart shift update failed\n"); 5205 5206 if (notify_clients) 5207 drm_client_dev_suspend(adev_to_drm(adev), false); 5208 5209 cancel_delayed_work_sync(&adev->delayed_init_work); 5210 5211 amdgpu_ras_suspend(adev); 5212 5213 amdgpu_device_ip_suspend_phase1(adev); 5214 5215 if (!adev->in_s0ix) { 5216 amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); 5217 amdgpu_userq_suspend(adev); 5218 } 5219 5220 r = amdgpu_device_evict_resources(adev); 5221 if (r) 5222 return r; 5223 5224 amdgpu_ttm_set_buffer_funcs_status(adev, false); 5225 5226 amdgpu_fence_driver_hw_fini(adev); 5227 5228 amdgpu_device_ip_suspend_phase2(adev); 5229 5230 if (amdgpu_sriov_vf(adev)) 5231 amdgpu_virt_release_full_gpu(adev, false); 5232 5233 r = amdgpu_dpm_notify_rlc_state(adev, false); 5234 if (r) 5235 return r; 5236 5237 return 0; 5238 } 5239 5240 static inline int amdgpu_virt_resume(struct amdgpu_device *adev) 5241 { 5242 int r; 5243 unsigned int prev_physical_node_id = adev->gmc.xgmi.physical_node_id; 5244 5245 /* During VM resume, QEMU programming of VF MSIX table (register GFXMSIX_VECT0_ADDR_LO) 5246 * may not work. The access could be blocked by nBIF protection as VF isn't in 5247 * exclusive access mode. Exclusive access is enabled now, disable/enable MSIX 5248 * so that QEMU reprograms MSIX table. 5249 */ 5250 amdgpu_restore_msix(adev); 5251 5252 r = adev->gfxhub.funcs->get_xgmi_info(adev); 5253 if (r) 5254 return r; 5255 5256 dev_info(adev->dev, "xgmi node, old id %d, new id %d\n", 5257 prev_physical_node_id, adev->gmc.xgmi.physical_node_id); 5258 5259 adev->vm_manager.vram_base_offset = adev->gfxhub.funcs->get_mc_fb_offset(adev); 5260 adev->vm_manager.vram_base_offset += 5261 adev->gmc.xgmi.physical_node_id * adev->gmc.xgmi.node_segment_size; 5262 5263 return 0; 5264 } 5265 5266 /** 5267 * amdgpu_device_resume - initiate device resume 5268 * 5269 * @dev: drm dev pointer 5270 * @notify_clients: notify in-kernel DRM clients 5271 * 5272 * Bring the hw back to operating state (all asics). 5273 * Returns 0 for success or an error on failure. 5274 * Called at driver resume. 
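* When @notify_clients is false (e.g. on runtime resume) the in-kernel DRM
* clients are left untouched.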
5275 */ 5276 int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) 5277 { 5278 struct amdgpu_device *adev = drm_to_adev(dev); 5279 int r = 0; 5280 5281 if (amdgpu_sriov_vf(adev)) { 5282 r = amdgpu_virt_request_full_gpu(adev, true); 5283 if (r) 5284 return r; 5285 } 5286 5287 if (amdgpu_virt_xgmi_migrate_enabled(adev)) { 5288 r = amdgpu_virt_resume(adev); 5289 if (r) 5290 goto exit; 5291 } 5292 5293 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5294 return 0; 5295 5296 if (adev->in_s0ix) 5297 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 5298 5299 /* post card */ 5300 if (amdgpu_device_need_post(adev)) { 5301 r = amdgpu_device_asic_init(adev); 5302 if (r) 5303 dev_err(adev->dev, "amdgpu asic init failed\n"); 5304 } 5305 5306 r = amdgpu_device_ip_resume(adev); 5307 5308 if (r) { 5309 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 5310 goto exit; 5311 } 5312 5313 if (!adev->in_s0ix) { 5314 r = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); 5315 if (r) 5316 goto exit; 5317 5318 r = amdgpu_userq_resume(adev); 5319 if (r) 5320 goto exit; 5321 } 5322 5323 r = amdgpu_device_ip_late_init(adev); 5324 if (r) 5325 goto exit; 5326 5327 queue_delayed_work(system_wq, &adev->delayed_init_work, 5328 msecs_to_jiffies(AMDGPU_RESUME_MS)); 5329 exit: 5330 if (amdgpu_sriov_vf(adev)) { 5331 amdgpu_virt_init_data_exchange(adev); 5332 amdgpu_virt_release_full_gpu(adev, true); 5333 5334 if (!adev->in_s0ix && !r && !adev->in_runpm) 5335 r = amdgpu_amdkfd_resume_process(adev); 5336 } 5337 5338 if (r) 5339 return r; 5340 5341 /* Make sure IB tests flushed */ 5342 flush_delayed_work(&adev->delayed_init_work); 5343 5344 if (notify_clients) 5345 drm_client_dev_resume(adev_to_drm(adev), false); 5346 5347 amdgpu_ras_resume(adev); 5348 5349 if (adev->mode_info.num_crtc) { 5350 /* 5351 * Most of the connector probing functions try to acquire runtime pm 5352 * refs to ensure that the GPU is powered on when connector polling is 5353 * performed. Since we're calling this from a runtime PM callback, 5354 * trying to acquire rpm refs will cause us to deadlock. 5355 * 5356 * Since we're guaranteed to be holding the rpm lock, it's safe to 5357 * temporarily disable the rpm helpers so this doesn't deadlock us. 5358 */ 5359 #ifdef CONFIG_PM 5360 dev->dev->power.disable_depth++; 5361 #endif 5362 if (!adev->dc_enabled) 5363 drm_helper_hpd_irq_event(dev); 5364 else 5365 drm_kms_helper_hotplug_event(dev); 5366 #ifdef CONFIG_PM 5367 dev->dev->power.disable_depth--; 5368 #endif 5369 } 5370 adev->in_suspend = false; 5371 5372 if (amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0)) 5373 dev_warn(adev->dev, "smart shift update failed\n"); 5374 5375 return 0; 5376 } 5377 5378 /** 5379 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 5380 * 5381 * @adev: amdgpu_device pointer 5382 * 5383 * The list of all the hardware IPs that make up the asic is walked and 5384 * the check_soft_reset callbacks are run. check_soft_reset determines 5385 * if the asic is still hung or not. 5386 * Returns true if any of the IPs are still in a hung state, false if not. 
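* Under SR-IOV, or when the ASIC itself reports that a full reset is needed,
* this returns true unconditionally.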
5387 */ 5388 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 5389 { 5390 int i; 5391 bool asic_hang = false; 5392 5393 if (amdgpu_sriov_vf(adev)) 5394 return true; 5395 5396 if (amdgpu_asic_need_full_reset(adev)) 5397 return true; 5398 5399 for (i = 0; i < adev->num_ip_blocks; i++) { 5400 if (!adev->ip_blocks[i].status.valid) 5401 continue; 5402 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 5403 adev->ip_blocks[i].status.hang = 5404 adev->ip_blocks[i].version->funcs->check_soft_reset( 5405 &adev->ip_blocks[i]); 5406 if (adev->ip_blocks[i].status.hang) { 5407 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 5408 asic_hang = true; 5409 } 5410 } 5411 return asic_hang; 5412 } 5413 5414 /** 5415 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 5416 * 5417 * @adev: amdgpu_device pointer 5418 * 5419 * The list of all the hardware IPs that make up the asic is walked and the 5420 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 5421 * handles any IP specific hardware or software state changes that are 5422 * necessary for a soft reset to succeed. 5423 * Returns 0 on success, negative error code on failure. 5424 */ 5425 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 5426 { 5427 int i, r = 0; 5428 5429 for (i = 0; i < adev->num_ip_blocks; i++) { 5430 if (!adev->ip_blocks[i].status.valid) 5431 continue; 5432 if (adev->ip_blocks[i].status.hang && 5433 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 5434 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]); 5435 if (r) 5436 return r; 5437 } 5438 } 5439 5440 return 0; 5441 } 5442 5443 /** 5444 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 5445 * 5446 * @adev: amdgpu_device pointer 5447 * 5448 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 5449 * reset is necessary to recover. 5450 * Returns true if a full asic reset is required, false if not. 5451 */ 5452 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 5453 { 5454 int i; 5455 5456 if (amdgpu_asic_need_full_reset(adev)) 5457 return true; 5458 5459 for (i = 0; i < adev->num_ip_blocks; i++) { 5460 if (!adev->ip_blocks[i].status.valid) 5461 continue; 5462 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 5463 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 5464 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 5465 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 5466 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 5467 if (adev->ip_blocks[i].status.hang) { 5468 dev_info(adev->dev, "Some block need full reset!\n"); 5469 return true; 5470 } 5471 } 5472 } 5473 return false; 5474 } 5475 5476 /** 5477 * amdgpu_device_ip_soft_reset - do a soft reset 5478 * 5479 * @adev: amdgpu_device pointer 5480 * 5481 * The list of all the hardware IPs that make up the asic is walked and the 5482 * soft_reset callbacks are run if the block is hung. soft_reset handles any 5483 * IP specific hardware or software state changes that are necessary to soft 5484 * reset the IP. 5485 * Returns 0 on success, negative error code on failure. 
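* Only blocks whose status.hang flag was set by check_soft_reset are reset here.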
5486 */ 5487 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 5488 { 5489 int i, r = 0; 5490 5491 for (i = 0; i < adev->num_ip_blocks; i++) { 5492 if (!adev->ip_blocks[i].status.valid) 5493 continue; 5494 if (adev->ip_blocks[i].status.hang && 5495 adev->ip_blocks[i].version->funcs->soft_reset) { 5496 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]); 5497 if (r) 5498 return r; 5499 } 5500 } 5501 5502 return 0; 5503 } 5504 5505 /** 5506 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 5507 * 5508 * @adev: amdgpu_device pointer 5509 * 5510 * The list of all the hardware IPs that make up the asic is walked and the 5511 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 5512 * handles any IP specific hardware or software state changes that are 5513 * necessary after the IP has been soft reset. 5514 * Returns 0 on success, negative error code on failure. 5515 */ 5516 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 5517 { 5518 int i, r = 0; 5519 5520 for (i = 0; i < adev->num_ip_blocks; i++) { 5521 if (!adev->ip_blocks[i].status.valid) 5522 continue; 5523 if (adev->ip_blocks[i].status.hang && 5524 adev->ip_blocks[i].version->funcs->post_soft_reset) 5525 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]); 5526 if (r) 5527 return r; 5528 } 5529 5530 return 0; 5531 } 5532 5533 /** 5534 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 5535 * 5536 * @adev: amdgpu_device pointer 5537 * @reset_context: amdgpu reset context pointer 5538 * 5539 * do VF FLR and reinitialize Asic 5540 * return 0 means succeeded otherwise failed 5541 */ 5542 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 5543 struct amdgpu_reset_context *reset_context) 5544 { 5545 int r; 5546 struct amdgpu_hive_info *hive = NULL; 5547 5548 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { 5549 if (!amdgpu_ras_get_fed_status(adev)) 5550 amdgpu_virt_ready_to_reset(adev); 5551 amdgpu_virt_wait_reset(adev); 5552 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5553 r = amdgpu_virt_request_full_gpu(adev, true); 5554 } else { 5555 r = amdgpu_virt_reset_gpu(adev); 5556 } 5557 if (r) 5558 return r; 5559 5560 amdgpu_ras_clear_err_state(adev); 5561 amdgpu_irq_gpu_reset_resume_helper(adev); 5562 5563 /* some sw clean up VF needs to do before recover */ 5564 amdgpu_virt_post_reset(adev); 5565 5566 /* Resume IP prior to SMC */ 5567 r = amdgpu_device_ip_reinit_early_sriov(adev); 5568 if (r) 5569 return r; 5570 5571 amdgpu_virt_init_data_exchange(adev); 5572 5573 r = amdgpu_device_fw_loading(adev); 5574 if (r) 5575 return r; 5576 5577 /* now we are okay to resume SMC/CP/SDMA */ 5578 r = amdgpu_device_ip_reinit_late_sriov(adev); 5579 if (r) 5580 return r; 5581 5582 hive = amdgpu_get_xgmi_hive(adev); 5583 /* Update PSP FW topology after reset */ 5584 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5585 r = amdgpu_xgmi_update_topology(hive, adev); 5586 if (hive) 5587 amdgpu_put_xgmi_hive(hive); 5588 if (r) 5589 return r; 5590 5591 r = amdgpu_ib_ring_tests(adev); 5592 if (r) 5593 return r; 5594 5595 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) 5596 amdgpu_inc_vram_lost(adev); 5597 5598 /* need to be called during full access so we can't do it later like 5599 * bare-metal does. 
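* (Full GPU access is released immediately below via amdgpu_virt_release_full_gpu().)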
5600 */ 5601 amdgpu_amdkfd_post_reset(adev); 5602 amdgpu_virt_release_full_gpu(adev, true); 5603 5604 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5605 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) || 5606 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 5607 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 5608 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) || 5609 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5610 amdgpu_ras_resume(adev); 5611 5612 amdgpu_virt_ras_telemetry_post_reset(adev); 5613 5614 return 0; 5615 } 5616 5617 /** 5618 * amdgpu_device_has_job_running - check if there is any unfinished job 5619 * 5620 * @adev: amdgpu_device pointer 5621 * 5622 * check if there is any job running on the device when guest driver receives 5623 * FLR notification from host driver. If there are still jobs running, then 5624 * the guest driver will not respond the FLR reset. Instead, let the job hit 5625 * the timeout and guest driver then issue the reset request. 5626 */ 5627 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5628 { 5629 int i; 5630 5631 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5632 struct amdgpu_ring *ring = adev->rings[i]; 5633 5634 if (!amdgpu_ring_sched_ready(ring)) 5635 continue; 5636 5637 if (amdgpu_fence_count_emitted(ring)) 5638 return true; 5639 } 5640 return false; 5641 } 5642 5643 /** 5644 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5645 * 5646 * @adev: amdgpu_device pointer 5647 * 5648 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5649 * a hung GPU. 5650 */ 5651 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5652 { 5653 5654 if (amdgpu_gpu_recovery == 0) 5655 goto disabled; 5656 5657 /* Skip soft reset check in fatal error mode */ 5658 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5659 return true; 5660 5661 if (amdgpu_sriov_vf(adev)) 5662 return true; 5663 5664 if (amdgpu_gpu_recovery == -1) { 5665 switch (adev->asic_type) { 5666 #ifdef CONFIG_DRM_AMDGPU_SI 5667 case CHIP_VERDE: 5668 case CHIP_TAHITI: 5669 case CHIP_PITCAIRN: 5670 case CHIP_OLAND: 5671 case CHIP_HAINAN: 5672 #endif 5673 #ifdef CONFIG_DRM_AMDGPU_CIK 5674 case CHIP_KAVERI: 5675 case CHIP_KABINI: 5676 case CHIP_MULLINS: 5677 #endif 5678 case CHIP_CARRIZO: 5679 case CHIP_STONEY: 5680 case CHIP_CYAN_SKILLFISH: 5681 goto disabled; 5682 default: 5683 break; 5684 } 5685 } 5686 5687 return true; 5688 5689 disabled: 5690 dev_info(adev->dev, "GPU recovery disabled.\n"); 5691 return false; 5692 } 5693 5694 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5695 { 5696 u32 i; 5697 int ret = 0; 5698 5699 if (adev->bios) 5700 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5701 5702 dev_info(adev->dev, "GPU mode1 reset\n"); 5703 5704 /* Cache the state before bus master disable. The saved config space 5705 * values are used in other cases like restore after mode-2 reset. 
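* The cached state is restored further down via amdgpu_device_load_pci_state()
* once the reset has been issued.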
5706 */ 5707 amdgpu_device_cache_pci_state(adev->pdev); 5708 5709 /* disable BM */ 5710 pci_clear_master(adev->pdev); 5711 5712 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5713 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5714 ret = amdgpu_dpm_mode1_reset(adev); 5715 } else { 5716 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5717 ret = psp_gpu_reset(adev); 5718 } 5719 5720 if (ret) 5721 goto mode1_reset_failed; 5722 5723 amdgpu_device_load_pci_state(adev->pdev); 5724 ret = amdgpu_psp_wait_for_bootloader(adev); 5725 if (ret) 5726 goto mode1_reset_failed; 5727 5728 /* wait for asic to come out of reset */ 5729 for (i = 0; i < adev->usec_timeout; i++) { 5730 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5731 5732 if (memsize != 0xffffffff) 5733 break; 5734 udelay(1); 5735 } 5736 5737 if (i >= adev->usec_timeout) { 5738 ret = -ETIMEDOUT; 5739 goto mode1_reset_failed; 5740 } 5741 5742 if (adev->bios) 5743 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5744 5745 return 0; 5746 5747 mode1_reset_failed: 5748 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5749 return ret; 5750 } 5751 5752 int amdgpu_device_link_reset(struct amdgpu_device *adev) 5753 { 5754 int ret = 0; 5755 5756 dev_info(adev->dev, "GPU link reset\n"); 5757 5758 if (!amdgpu_reset_in_dpc(adev)) 5759 ret = amdgpu_dpm_link_reset(adev); 5760 5761 if (ret) 5762 goto link_reset_failed; 5763 5764 ret = amdgpu_psp_wait_for_bootloader(adev); 5765 if (ret) 5766 goto link_reset_failed; 5767 5768 return 0; 5769 5770 link_reset_failed: 5771 dev_err(adev->dev, "GPU link reset failed\n"); 5772 return ret; 5773 } 5774 5775 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5776 struct amdgpu_reset_context *reset_context) 5777 { 5778 int i, r = 0; 5779 struct amdgpu_job *job = NULL; 5780 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; 5781 bool need_full_reset = 5782 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5783 5784 if (reset_context->reset_req_dev == adev) 5785 job = reset_context->job; 5786 5787 if (amdgpu_sriov_vf(adev)) 5788 amdgpu_virt_pre_reset(adev); 5789 5790 amdgpu_fence_driver_isr_toggle(adev, true); 5791 5792 /* block all schedulers and reset given job's ring */ 5793 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5794 struct amdgpu_ring *ring = adev->rings[i]; 5795 5796 if (!amdgpu_ring_sched_ready(ring)) 5797 continue; 5798 5799 /* Clear job fence from fence drv to avoid force_completion 5800 * leave NULL and vm flush fence in fence drv 5801 */ 5802 amdgpu_fence_driver_clear_job_fences(ring); 5803 5804 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5805 amdgpu_fence_driver_force_completion(ring); 5806 } 5807 5808 amdgpu_fence_driver_isr_toggle(adev, false); 5809 5810 if (job && job->vm) 5811 drm_sched_increase_karma(&job->base); 5812 5813 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5814 /* If reset handler not implemented, continue; otherwise return */ 5815 if (r == -EOPNOTSUPP) 5816 r = 0; 5817 else 5818 return r; 5819 5820 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5821 if (!amdgpu_sriov_vf(adev)) { 5822 5823 if (!need_full_reset) 5824 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5825 5826 if (!need_full_reset && amdgpu_gpu_recovery && 5827 amdgpu_device_ip_check_soft_reset(adev)) { 5828 amdgpu_device_ip_pre_soft_reset(adev); 5829 r = amdgpu_device_ip_soft_reset(adev); 5830 amdgpu_device_ip_post_soft_reset(adev); 5831 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5832 
dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5833 need_full_reset = true; 5834 } 5835 } 5836 5837 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { 5838 dev_info(tmp_adev->dev, "Dumping IP State\n"); 5839 /* Trigger ip dump before we reset the asic */ 5840 for (i = 0; i < tmp_adev->num_ip_blocks; i++) 5841 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) 5842 tmp_adev->ip_blocks[i].version->funcs 5843 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]); 5844 dev_info(tmp_adev->dev, "Dumping IP State Completed\n"); 5845 } 5846 5847 if (need_full_reset) 5848 r = amdgpu_device_ip_suspend(adev); 5849 if (need_full_reset) 5850 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5851 else 5852 clear_bit(AMDGPU_NEED_FULL_RESET, 5853 &reset_context->flags); 5854 } 5855 5856 return r; 5857 } 5858 5859 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) 5860 { 5861 struct list_head *device_list_handle; 5862 bool full_reset, vram_lost = false; 5863 struct amdgpu_device *tmp_adev; 5864 int r, init_level; 5865 5866 device_list_handle = reset_context->reset_device_list; 5867 5868 if (!device_list_handle) 5869 return -EINVAL; 5870 5871 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5872 5873 /** 5874 * If it's reset on init, it's default init level, otherwise keep level 5875 * as recovery level. 5876 */ 5877 if (reset_context->method == AMD_RESET_METHOD_ON_INIT) 5878 init_level = AMDGPU_INIT_LEVEL_DEFAULT; 5879 else 5880 init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY; 5881 5882 r = 0; 5883 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5884 amdgpu_set_init_level(tmp_adev, init_level); 5885 if (full_reset) { 5886 /* post card */ 5887 amdgpu_reset_set_dpc_status(tmp_adev, false); 5888 amdgpu_ras_clear_err_state(tmp_adev); 5889 r = amdgpu_device_asic_init(tmp_adev); 5890 if (r) { 5891 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5892 } else { 5893 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5894 5895 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5896 if (r) 5897 goto out; 5898 5899 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5900 5901 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 5902 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job); 5903 5904 if (vram_lost) { 5905 dev_info( 5906 tmp_adev->dev, 5907 "VRAM is lost due to GPU reset!\n"); 5908 amdgpu_inc_vram_lost(tmp_adev); 5909 } 5910 5911 r = amdgpu_device_fw_loading(tmp_adev); 5912 if (r) 5913 return r; 5914 5915 r = amdgpu_xcp_restore_partition_mode( 5916 tmp_adev->xcp_mgr); 5917 if (r) 5918 goto out; 5919 5920 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5921 if (r) 5922 goto out; 5923 5924 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 5925 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5926 5927 r = amdgpu_device_ip_resume_phase3(tmp_adev); 5928 if (r) 5929 goto out; 5930 5931 if (vram_lost) 5932 amdgpu_device_fill_reset_magic(tmp_adev); 5933 5934 /* 5935 * Add this ASIC as tracked as reset was already 5936 * complete successfully. 
5937 */ 5938 amdgpu_register_gpu_instance(tmp_adev); 5939 5940 if (!reset_context->hive && 5941 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5942 amdgpu_xgmi_add_device(tmp_adev); 5943 5944 r = amdgpu_device_ip_late_init(tmp_adev); 5945 if (r) 5946 goto out; 5947 5948 drm_client_dev_resume(adev_to_drm(tmp_adev), false); 5949 5950 /* 5951 * The GPU enters bad state once faulty pages 5952 * by ECC has reached the threshold, and ras 5953 * recovery is scheduled next. So add one check 5954 * here to break recovery if it indeed exceeds 5955 * bad page threshold, and remind user to 5956 * retire this GPU or setting one bigger 5957 * bad_page_threshold value to fix this once 5958 * probing driver again. 5959 */ 5960 if (!amdgpu_ras_is_rma(tmp_adev)) { 5961 /* must succeed. */ 5962 amdgpu_ras_resume(tmp_adev); 5963 } else { 5964 r = -EINVAL; 5965 goto out; 5966 } 5967 5968 /* Update PSP FW topology after reset */ 5969 if (reset_context->hive && 5970 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5971 r = amdgpu_xgmi_update_topology( 5972 reset_context->hive, tmp_adev); 5973 } 5974 } 5975 5976 out: 5977 if (!r) { 5978 /* IP init is complete now, set level as default */ 5979 amdgpu_set_init_level(tmp_adev, 5980 AMDGPU_INIT_LEVEL_DEFAULT); 5981 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5982 r = amdgpu_ib_ring_tests(tmp_adev); 5983 if (r) { 5984 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5985 r = -EAGAIN; 5986 goto end; 5987 } 5988 } 5989 5990 if (r) 5991 tmp_adev->asic_reset_res = r; 5992 } 5993 5994 end: 5995 return r; 5996 } 5997 5998 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5999 struct amdgpu_reset_context *reset_context) 6000 { 6001 struct amdgpu_device *tmp_adev = NULL; 6002 bool need_full_reset, skip_hw_reset; 6003 int r = 0; 6004 6005 /* Try reset handler method first */ 6006 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 6007 reset_list); 6008 6009 reset_context->reset_device_list = device_list_handle; 6010 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 6011 /* If reset handler not implemented, continue; otherwise return */ 6012 if (r == -EOPNOTSUPP) 6013 r = 0; 6014 else 6015 return r; 6016 6017 /* Reset handler not implemented, use the default method */ 6018 need_full_reset = 6019 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 6020 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 6021 6022 /* 6023 * ASIC reset has to be done on all XGMI hive nodes ASAP 6024 * to allow proper links negotiation in FW (within 1 sec) 6025 */ 6026 if (!skip_hw_reset && need_full_reset) { 6027 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6028 /* For XGMI run all resets in parallel to speed up the process */ 6029 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 6030 if (!queue_work(system_unbound_wq, 6031 &tmp_adev->xgmi_reset_work)) 6032 r = -EALREADY; 6033 } else 6034 r = amdgpu_asic_reset(tmp_adev); 6035 6036 if (r) { 6037 dev_err(tmp_adev->dev, 6038 "ASIC reset failed with error, %d for drm dev, %s", 6039 r, adev_to_drm(tmp_adev)->unique); 6040 goto out; 6041 } 6042 } 6043 6044 /* For XGMI wait for all resets to complete before proceed */ 6045 if (!r) { 6046 list_for_each_entry(tmp_adev, device_list_handle, 6047 reset_list) { 6048 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 6049 flush_work(&tmp_adev->xgmi_reset_work); 6050 r = tmp_adev->asic_reset_res; 6051 if (r) 6052 break; 6053 } 6054 } 6055 } 6056 } 6057 6058 if (!r && amdgpu_ras_intr_triggered()) { 6059 list_for_each_entry(tmp_adev, 
device_list_handle, reset_list) { 6060 amdgpu_ras_reset_error_count(tmp_adev, 6061 AMDGPU_RAS_BLOCK__MMHUB); 6062 } 6063 6064 amdgpu_ras_intr_cleared(); 6065 } 6066 6067 r = amdgpu_device_reinit_after_reset(reset_context); 6068 if (r == -EAGAIN) 6069 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 6070 else 6071 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 6072 6073 out: 6074 return r; 6075 } 6076 6077 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) 6078 { 6079 6080 switch (amdgpu_asic_reset_method(adev)) { 6081 case AMD_RESET_METHOD_MODE1: 6082 case AMD_RESET_METHOD_LINK: 6083 adev->mp1_state = PP_MP1_STATE_SHUTDOWN; 6084 break; 6085 case AMD_RESET_METHOD_MODE2: 6086 adev->mp1_state = PP_MP1_STATE_RESET; 6087 break; 6088 default: 6089 adev->mp1_state = PP_MP1_STATE_NONE; 6090 break; 6091 } 6092 } 6093 6094 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev) 6095 { 6096 amdgpu_vf_error_trans_all(adev); 6097 adev->mp1_state = PP_MP1_STATE_NONE; 6098 } 6099 6100 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) 6101 { 6102 struct pci_dev *p = NULL; 6103 6104 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 6105 adev->pdev->bus->number, 1); 6106 if (p) { 6107 pm_runtime_enable(&(p->dev)); 6108 pm_runtime_resume(&(p->dev)); 6109 } 6110 6111 pci_dev_put(p); 6112 } 6113 6114 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) 6115 { 6116 enum amd_reset_method reset_method; 6117 struct pci_dev *p = NULL; 6118 u64 expires; 6119 6120 /* 6121 * For now, only BACO and mode1 reset are confirmed 6122 * to suffer the audio issue without proper suspended. 6123 */ 6124 reset_method = amdgpu_asic_reset_method(adev); 6125 if ((reset_method != AMD_RESET_METHOD_BACO) && 6126 (reset_method != AMD_RESET_METHOD_MODE1)) 6127 return -EINVAL; 6128 6129 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), 6130 adev->pdev->bus->number, 1); 6131 if (!p) 6132 return -ENODEV; 6133 6134 expires = pm_runtime_autosuspend_expiration(&(p->dev)); 6135 if (!expires) 6136 /* 6137 * If we cannot get the audio device autosuspend delay, 6138 * a fixed 4S interval will be used. Considering 3S is 6139 * the audio controller default autosuspend delay setting. 6140 * 4S used here is guaranteed to cover that. 6141 */ 6142 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; 6143 6144 while (!pm_runtime_status_suspended(&(p->dev))) { 6145 if (!pm_runtime_suspend(&(p->dev))) 6146 break; 6147 6148 if (expires < ktime_get_mono_fast_ns()) { 6149 dev_warn(adev->dev, "failed to suspend display audio\n"); 6150 pci_dev_put(p); 6151 /* TODO: abort the succeeding gpu reset? 
*/ 6152 return -ETIMEDOUT; 6153 } 6154 } 6155 6156 pm_runtime_disable(&(p->dev)); 6157 6158 pci_dev_put(p); 6159 return 0; 6160 } 6161 6162 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 6163 { 6164 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 6165 6166 #if defined(CONFIG_DEBUG_FS) 6167 if (!amdgpu_sriov_vf(adev)) 6168 cancel_work(&adev->reset_work); 6169 #endif 6170 6171 if (adev->kfd.dev) 6172 cancel_work(&adev->kfd.reset_work); 6173 6174 if (amdgpu_sriov_vf(adev)) 6175 cancel_work(&adev->virt.flr_work); 6176 6177 if (con && adev->ras_enabled) 6178 cancel_work(&con->recovery_work); 6179 6180 } 6181 6182 static int amdgpu_device_health_check(struct list_head *device_list_handle) 6183 { 6184 struct amdgpu_device *tmp_adev; 6185 int ret = 0; 6186 6187 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6188 ret |= amdgpu_device_bus_status_check(tmp_adev); 6189 } 6190 6191 return ret; 6192 } 6193 6194 static void amdgpu_device_recovery_prepare(struct amdgpu_device *adev, 6195 struct list_head *device_list, 6196 struct amdgpu_hive_info *hive) 6197 { 6198 struct amdgpu_device *tmp_adev = NULL; 6199 6200 /* 6201 * Build list of devices to reset. 6202 * In case we are in XGMI hive mode, resort the device list 6203 * to put adev in the 1st position. 6204 */ 6205 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) { 6206 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 6207 list_add_tail(&tmp_adev->reset_list, device_list); 6208 if (adev->shutdown) 6209 tmp_adev->shutdown = true; 6210 if (amdgpu_reset_in_dpc(adev)) 6211 tmp_adev->pcie_reset_ctx.in_link_reset = true; 6212 } 6213 if (!list_is_first(&adev->reset_list, device_list)) 6214 list_rotate_to_front(&adev->reset_list, device_list); 6215 } else { 6216 list_add_tail(&adev->reset_list, device_list); 6217 } 6218 } 6219 6220 static void amdgpu_device_recovery_get_reset_lock(struct amdgpu_device *adev, 6221 struct list_head *device_list) 6222 { 6223 struct amdgpu_device *tmp_adev = NULL; 6224 6225 if (list_empty(device_list)) 6226 return; 6227 tmp_adev = 6228 list_first_entry(device_list, struct amdgpu_device, reset_list); 6229 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 6230 } 6231 6232 static void amdgpu_device_recovery_put_reset_lock(struct amdgpu_device *adev, 6233 struct list_head *device_list) 6234 { 6235 struct amdgpu_device *tmp_adev = NULL; 6236 6237 if (list_empty(device_list)) 6238 return; 6239 tmp_adev = 6240 list_first_entry(device_list, struct amdgpu_device, reset_list); 6241 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 6242 } 6243 6244 static void amdgpu_device_halt_activities(struct amdgpu_device *adev, 6245 struct amdgpu_job *job, 6246 struct amdgpu_reset_context *reset_context, 6247 struct list_head *device_list, 6248 struct amdgpu_hive_info *hive, 6249 bool need_emergency_restart) 6250 { 6251 struct amdgpu_device *tmp_adev = NULL; 6252 int i; 6253 6254 /* block all schedulers and reset given job's ring */ 6255 list_for_each_entry(tmp_adev, device_list, reset_list) { 6256 amdgpu_device_set_mp1_state(tmp_adev); 6257 6258 /* 6259 * Try to put the audio codec into suspend state 6260 * before gpu reset started. 6261 * 6262 * Due to the power domain of the graphics device 6263 * is shared with AZ power domain. Without this, 6264 * we may change the audio hardware from behind 6265 * the audio driver's back. That will trigger 6266 * some audio codec errors. 
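 * The audio suspend below is best effort: if it fails,
 * pcie_reset_ctx.audio_suspended is left unset and the audio device is
 * neither disabled here nor resumed after the reset.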
6267 */ 6268 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 6269 tmp_adev->pcie_reset_ctx.audio_suspended = true; 6270 6271 amdgpu_ras_set_error_query_ready(tmp_adev, false); 6272 6273 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 6274 6275 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context); 6276 6277 /* 6278 * Mark these ASICs to be reset as untracked first 6279 * And add them back after reset completed 6280 */ 6281 amdgpu_unregister_gpu_instance(tmp_adev); 6282 6283 drm_client_dev_suspend(adev_to_drm(tmp_adev), false); 6284 6285 /* disable ras on ALL IPs */ 6286 if (!need_emergency_restart && !amdgpu_reset_in_dpc(adev) && 6287 amdgpu_device_ip_need_full_reset(tmp_adev)) 6288 amdgpu_ras_suspend(tmp_adev); 6289 6290 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6291 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6292 6293 if (!amdgpu_ring_sched_ready(ring)) 6294 continue; 6295 6296 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 6297 6298 if (need_emergency_restart) 6299 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 6300 } 6301 atomic_inc(&tmp_adev->gpu_reset_counter); 6302 } 6303 } 6304 6305 static int amdgpu_device_asic_reset(struct amdgpu_device *adev, 6306 struct list_head *device_list, 6307 struct amdgpu_reset_context *reset_context) 6308 { 6309 struct amdgpu_device *tmp_adev = NULL; 6310 int retry_limit = AMDGPU_MAX_RETRY_LIMIT; 6311 int r = 0; 6312 6313 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 6314 list_for_each_entry(tmp_adev, device_list, reset_list) { 6315 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 6316 /*TODO Should we stop ?*/ 6317 if (r) { 6318 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 6319 r, adev_to_drm(tmp_adev)->unique); 6320 tmp_adev->asic_reset_res = r; 6321 } 6322 } 6323 6324 /* Actual ASIC resets if needed.*/ 6325 /* Host driver will handle XGMI hive reset for SRIOV */ 6326 if (amdgpu_sriov_vf(adev)) { 6327 6328 /* Bail out of reset early */ 6329 if (amdgpu_ras_is_rma(adev)) 6330 return -ENODEV; 6331 6332 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { 6333 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); 6334 amdgpu_ras_set_fed(adev, true); 6335 set_bit(AMDGPU_HOST_FLR, &reset_context->flags); 6336 } 6337 6338 r = amdgpu_device_reset_sriov(adev, reset_context); 6339 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) { 6340 amdgpu_virt_release_full_gpu(adev, true); 6341 goto retry; 6342 } 6343 if (r) 6344 adev->asic_reset_res = r; 6345 } else { 6346 r = amdgpu_do_asic_reset(device_list, reset_context); 6347 if (r && r == -EAGAIN) 6348 goto retry; 6349 } 6350 6351 list_for_each_entry(tmp_adev, device_list, reset_list) { 6352 /* 6353 * Drop any pending non scheduler resets queued before reset is done. 6354 * Any reset scheduled after this point would be valid. Scheduler resets 6355 * were already dropped during drm_sched_stop and no new ones can come 6356 * in before drm_sched_start. 
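 * amdgpu_device_stop_pending_resets() cancels the debugfs-triggered
 * reset work, the KFD reset work, the SR-IOV FLR work and the RAS
 * recovery work, whichever of those apply to this device.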
6357 */ 6358 amdgpu_device_stop_pending_resets(tmp_adev); 6359 } 6360 6361 return r; 6362 } 6363 6364 static int amdgpu_device_sched_resume(struct list_head *device_list, 6365 struct amdgpu_reset_context *reset_context, 6366 bool job_signaled) 6367 { 6368 struct amdgpu_device *tmp_adev = NULL; 6369 int i, r = 0; 6370 6371 /* Post ASIC reset for all devs .*/ 6372 list_for_each_entry(tmp_adev, device_list, reset_list) { 6373 6374 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6375 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6376 6377 if (!amdgpu_ring_sched_ready(ring)) 6378 continue; 6379 6380 drm_sched_start(&ring->sched, 0); 6381 } 6382 6383 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 6384 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 6385 6386 if (tmp_adev->asic_reset_res) 6387 r = tmp_adev->asic_reset_res; 6388 6389 tmp_adev->asic_reset_res = 0; 6390 6391 if (r) { 6392 /* bad news, how to tell it to userspace ? 6393 * for ras error, we should report GPU bad status instead of 6394 * reset failure 6395 */ 6396 if (reset_context->src != AMDGPU_RESET_SRC_RAS || 6397 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) 6398 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", 6399 atomic_read(&tmp_adev->gpu_reset_counter)); 6400 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 6401 } else { 6402 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 6403 if (amdgpu_acpi_smart_shift_update(tmp_adev, 6404 AMDGPU_SS_DEV_D0)) 6405 dev_warn(tmp_adev->dev, 6406 "smart shift update failed\n"); 6407 } 6408 } 6409 6410 return r; 6411 } 6412 6413 static void amdgpu_device_gpu_resume(struct amdgpu_device *adev, 6414 struct list_head *device_list, 6415 bool need_emergency_restart) 6416 { 6417 struct amdgpu_device *tmp_adev = NULL; 6418 6419 list_for_each_entry(tmp_adev, device_list, reset_list) { 6420 /* unlock kfd: SRIOV would do it separately */ 6421 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 6422 amdgpu_amdkfd_post_reset(tmp_adev); 6423 6424 /* kfd_post_reset will do nothing if kfd device is not initialized, 6425 * need to bring up kfd here if it's not be initialized before 6426 */ 6427 if (!adev->kfd.init_complete) 6428 amdgpu_amdkfd_device_init(adev); 6429 6430 if (tmp_adev->pcie_reset_ctx.audio_suspended) 6431 amdgpu_device_resume_display_audio(tmp_adev); 6432 6433 amdgpu_device_unset_mp1_state(tmp_adev); 6434 6435 amdgpu_ras_set_error_query_ready(tmp_adev, true); 6436 6437 } 6438 } 6439 6440 6441 /** 6442 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 6443 * 6444 * @adev: amdgpu_device pointer 6445 * @job: which job trigger hang 6446 * @reset_context: amdgpu reset context pointer 6447 * 6448 * Attempt to reset the GPU if it has hung (all asics). 6449 * Attempt to do soft-reset or full-reset and reinitialize Asic 6450 * Returns 0 for success or an error on failure. 6451 */ 6452 6453 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 6454 struct amdgpu_job *job, 6455 struct amdgpu_reset_context *reset_context) 6456 { 6457 struct list_head device_list; 6458 bool job_signaled = false; 6459 struct amdgpu_hive_info *hive = NULL; 6460 int r = 0; 6461 bool need_emergency_restart = false; 6462 6463 /* 6464 * If it reaches here because of hang/timeout and a RAS error is 6465 * detected at the same time, let RAS recovery take care of it. 
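 * (This early return only applies on bare metal and only when the reset
 * source is not already AMDGPU_RESET_SRC_RAS; SR-IOV and RAS-sourced
 * requests continue below.)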
6466 */ 6467 if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) && 6468 !amdgpu_sriov_vf(adev) && 6469 reset_context->src != AMDGPU_RESET_SRC_RAS) { 6470 dev_dbg(adev->dev, 6471 "Gpu recovery from source: %d yielding to RAS error recovery handling", 6472 reset_context->src); 6473 return 0; 6474 } 6475 6476 /* 6477 * Special case: RAS triggered and full reset isn't supported 6478 */ 6479 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 6480 6481 /* 6482 * Flush RAM to disk so that after reboot 6483 * the user can read log and see why the system rebooted. 6484 */ 6485 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 6486 amdgpu_ras_get_context(adev)->reboot) { 6487 dev_warn(adev->dev, "Emergency reboot."); 6488 6489 ksys_sync_helper(); 6490 emergency_restart(); 6491 } 6492 6493 dev_info(adev->dev, "GPU %s begin!. Source: %d\n", 6494 need_emergency_restart ? "jobs stop" : "reset", 6495 reset_context->src); 6496 6497 if (!amdgpu_sriov_vf(adev)) 6498 hive = amdgpu_get_xgmi_hive(adev); 6499 if (hive) 6500 mutex_lock(&hive->hive_lock); 6501 6502 reset_context->job = job; 6503 reset_context->hive = hive; 6504 INIT_LIST_HEAD(&device_list); 6505 6506 amdgpu_device_recovery_prepare(adev, &device_list, hive); 6507 6508 if (!amdgpu_sriov_vf(adev)) { 6509 r = amdgpu_device_health_check(&device_list); 6510 if (r) 6511 goto end_reset; 6512 } 6513 6514 /* We need to lock reset domain only once both for XGMI and single device */ 6515 amdgpu_device_recovery_get_reset_lock(adev, &device_list); 6516 6517 amdgpu_device_halt_activities(adev, job, reset_context, &device_list, 6518 hive, need_emergency_restart); 6519 if (need_emergency_restart) 6520 goto skip_sched_resume; 6521 /* 6522 * Must check guilty signal here since after this point all old 6523 * HW fences are force signaled. 6524 * 6525 * job->base holds a reference to parent fence 6526 */ 6527 if (job && dma_fence_is_signaled(&job->hw_fence.base)) { 6528 job_signaled = true; 6529 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 6530 goto skip_hw_reset; 6531 } 6532 6533 r = amdgpu_device_asic_reset(adev, &device_list, reset_context); 6534 if (r) 6535 goto reset_unlock; 6536 skip_hw_reset: 6537 r = amdgpu_device_sched_resume(&device_list, reset_context, job_signaled); 6538 if (r) 6539 goto reset_unlock; 6540 skip_sched_resume: 6541 amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart); 6542 reset_unlock: 6543 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 6544 end_reset: 6545 if (hive) { 6546 mutex_unlock(&hive->hive_lock); 6547 amdgpu_put_xgmi_hive(hive); 6548 } 6549 6550 if (r) 6551 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 6552 6553 atomic_set(&adev->reset_domain->reset_res, r); 6554 6555 if (!r) { 6556 struct amdgpu_task_info *ti = NULL; 6557 6558 if (job) 6559 ti = amdgpu_vm_get_task_info_pasid(adev, job->pasid); 6560 6561 drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, 6562 ti ? &ti->task : NULL); 6563 6564 amdgpu_vm_put_task_info(ti); 6565 } 6566 6567 return r; 6568 } 6569 6570 /** 6571 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner 6572 * 6573 * @adev: amdgpu_device pointer 6574 * @speed: pointer to the speed of the link 6575 * @width: pointer to the width of the link 6576 * 6577 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6578 * first physical partner to an AMD dGPU. 6579 * This will exclude any virtual switches and links. 
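 *
 * For example, on a root port -> switch -> SWUS -> SWDS -> dGPU topology
 * the bridges reporting the ATI vendor ID are treated as internal to the
 * dGPU and skipped, so the reported capabilities come from the first
 * non-AMD upstream port. When dynamic link switching is not supported,
 * the currently negotiated speed and width are used instead of the
 * capability maximums.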
6580 */ 6581 static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev, 6582 enum pci_bus_speed *speed, 6583 enum pcie_link_width *width) 6584 { 6585 struct pci_dev *parent = adev->pdev; 6586 6587 if (!speed || !width) 6588 return; 6589 6590 *speed = PCI_SPEED_UNKNOWN; 6591 *width = PCIE_LNK_WIDTH_UNKNOWN; 6592 6593 if (amdgpu_device_pcie_dynamic_switching_supported(adev)) { 6594 while ((parent = pci_upstream_bridge(parent))) { 6595 /* skip upstream/downstream switches internal to dGPU*/ 6596 if (parent->vendor == PCI_VENDOR_ID_ATI) 6597 continue; 6598 *speed = pcie_get_speed_cap(parent); 6599 *width = pcie_get_width_cap(parent); 6600 break; 6601 } 6602 } else { 6603 /* use the current speeds rather than max if switching is not supported */ 6604 pcie_bandwidth_available(adev->pdev, NULL, speed, width); 6605 } 6606 } 6607 6608 /** 6609 * amdgpu_device_gpu_bandwidth - find the bandwidth of the GPU 6610 * 6611 * @adev: amdgpu_device pointer 6612 * @speed: pointer to the speed of the link 6613 * @width: pointer to the width of the link 6614 * 6615 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6616 * AMD dGPU which may be a virtual upstream bridge. 6617 */ 6618 static void amdgpu_device_gpu_bandwidth(struct amdgpu_device *adev, 6619 enum pci_bus_speed *speed, 6620 enum pcie_link_width *width) 6621 { 6622 struct pci_dev *parent = adev->pdev; 6623 6624 if (!speed || !width) 6625 return; 6626 6627 parent = pci_upstream_bridge(parent); 6628 if (parent && parent->vendor == PCI_VENDOR_ID_ATI) { 6629 /* use the upstream/downstream switches internal to dGPU */ 6630 *speed = pcie_get_speed_cap(parent); 6631 *width = pcie_get_width_cap(parent); 6632 while ((parent = pci_upstream_bridge(parent))) { 6633 if (parent->vendor == PCI_VENDOR_ID_ATI) { 6634 /* use the upstream/downstream switches internal to dGPU */ 6635 *speed = pcie_get_speed_cap(parent); 6636 *width = pcie_get_width_cap(parent); 6637 } 6638 } 6639 } else { 6640 /* use the device itself */ 6641 *speed = pcie_get_speed_cap(adev->pdev); 6642 *width = pcie_get_width_cap(adev->pdev); 6643 } 6644 } 6645 6646 /** 6647 * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot 6648 * 6649 * @adev: amdgpu_device pointer 6650 * 6651 * Fetches and stores in the driver the PCIE capabilities (gen speed 6652 * and lanes) of the slot the device is in. Handles APUs and 6653 * virtualized environments where PCIE config space may not be available. 
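 *
 * The results are stored as CAIL_* bitmasks in adev->pm.pcie_gen_mask and
 * adev->pm.pcie_mlw_mask. As a rough example, an ASIC capable of
 * PCIE_SPEED_16_0GT in a platform slot limited to PCIE_SPEED_8_0GT at x8
 * ends up with ASIC gen bits GEN1-GEN4, platform gen bits GEN1-GEN3 and
 * platform width bits X1-X8. The amdgpu_pcie_gen_cap and
 * amdgpu_pcie_lane_cap module parameters override the detected values.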
6654 */ 6655 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 6656 { 6657 enum pci_bus_speed speed_cap, platform_speed_cap; 6658 enum pcie_link_width platform_link_width, link_width; 6659 6660 if (amdgpu_pcie_gen_cap) 6661 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 6662 6663 if (amdgpu_pcie_lane_cap) 6664 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 6665 6666 /* covers APUs as well */ 6667 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 6668 if (adev->pm.pcie_gen_mask == 0) 6669 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 6670 if (adev->pm.pcie_mlw_mask == 0) 6671 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 6672 return; 6673 } 6674 6675 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 6676 return; 6677 6678 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap, 6679 &platform_link_width); 6680 amdgpu_device_gpu_bandwidth(adev, &speed_cap, &link_width); 6681 6682 if (adev->pm.pcie_gen_mask == 0) { 6683 /* asic caps */ 6684 if (speed_cap == PCI_SPEED_UNKNOWN) { 6685 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6686 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6687 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6688 } else { 6689 if (speed_cap == PCIE_SPEED_32_0GT) 6690 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6691 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6692 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6693 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6694 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 6695 else if (speed_cap == PCIE_SPEED_16_0GT) 6696 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6697 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6698 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6699 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 6700 else if (speed_cap == PCIE_SPEED_8_0GT) 6701 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6702 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6703 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6704 else if (speed_cap == PCIE_SPEED_5_0GT) 6705 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6706 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 6707 else 6708 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 6709 } 6710 /* platform caps */ 6711 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 6712 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6713 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6714 } else { 6715 if (platform_speed_cap == PCIE_SPEED_32_0GT) 6716 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6717 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6718 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6719 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6720 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 6721 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 6722 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6723 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6724 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6725 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 6726 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 6727 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6728 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6729 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 6730 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 6731 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6732 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6733 else 6734 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 6735 6736 } 6737 } 6738 if (adev->pm.pcie_mlw_mask == 0) { 6739 /* asic caps */ 6740 if (link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6741 
adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_ASIC_PCIE_MLW_MASK; 6742 } else { 6743 switch (link_width) { 6744 case PCIE_LNK_X32: 6745 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X32 | 6746 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6747 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6748 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6749 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6750 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6751 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6752 break; 6753 case PCIE_LNK_X16: 6754 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6755 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6756 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6757 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6758 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6759 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6760 break; 6761 case PCIE_LNK_X12: 6762 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6763 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6764 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6765 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6766 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6767 break; 6768 case PCIE_LNK_X8: 6769 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6770 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6771 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6772 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6773 break; 6774 case PCIE_LNK_X4: 6775 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6776 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6777 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6778 break; 6779 case PCIE_LNK_X2: 6780 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6781 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6782 break; 6783 case PCIE_LNK_X1: 6784 adev->pm.pcie_mlw_mask |= CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1; 6785 break; 6786 default: 6787 break; 6788 } 6789 } 6790 /* platform caps */ 6791 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6792 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 6793 } else { 6794 switch (platform_link_width) { 6795 case PCIE_LNK_X32: 6796 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 6797 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6798 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6799 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6800 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6801 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6802 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6803 break; 6804 case PCIE_LNK_X16: 6805 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6806 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6807 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6808 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6809 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6810 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6811 break; 6812 case PCIE_LNK_X12: 6813 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6814 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6815 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6816 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6817 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6818 break; 6819 case PCIE_LNK_X8: 6820 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6821 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6822 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6823 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6824 break; 6825 case PCIE_LNK_X4: 6826 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6827 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6828 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6829 break; 6830 case PCIE_LNK_X2: 6831 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6832 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6833 break; 6834 case PCIE_LNK_X1: 6835 adev->pm.pcie_mlw_mask |= CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 6836 break; 6837 
default: 6838 break; 6839 } 6840 } 6841 } 6842 } 6843 6844 /** 6845 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 6846 * 6847 * @adev: amdgpu_device pointer 6848 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 6849 * 6850 * Return true if @peer_adev can access (DMA) @adev through the PCIe 6851 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 6852 * @peer_adev. 6853 */ 6854 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 6855 struct amdgpu_device *peer_adev) 6856 { 6857 #ifdef CONFIG_HSA_AMD_P2P 6858 bool p2p_access = 6859 !adev->gmc.xgmi.connected_to_cpu && 6860 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 6861 if (!p2p_access) 6862 dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n", 6863 pci_name(peer_adev->pdev)); 6864 6865 bool is_large_bar = adev->gmc.visible_vram_size && 6866 adev->gmc.real_vram_size == adev->gmc.visible_vram_size; 6867 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev); 6868 6869 if (!p2p_addressable) { 6870 uint64_t address_mask = peer_adev->dev->dma_mask ? 6871 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 6872 resource_size_t aper_limit = 6873 adev->gmc.aper_base + adev->gmc.aper_size - 1; 6874 6875 p2p_addressable = !(adev->gmc.aper_base & address_mask || 6876 aper_limit & address_mask); 6877 } 6878 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable; 6879 #else 6880 return false; 6881 #endif 6882 } 6883 6884 int amdgpu_device_baco_enter(struct amdgpu_device *adev) 6885 { 6886 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6887 6888 if (!amdgpu_device_supports_baco(adev)) 6889 return -ENOTSUPP; 6890 6891 if (ras && adev->ras_enabled && 6892 adev->nbio.funcs->enable_doorbell_interrupt) 6893 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 6894 6895 return amdgpu_dpm_baco_enter(adev); 6896 } 6897 6898 int amdgpu_device_baco_exit(struct amdgpu_device *adev) 6899 { 6900 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6901 int ret = 0; 6902 6903 if (!amdgpu_device_supports_baco(adev)) 6904 return -ENOTSUPP; 6905 6906 ret = amdgpu_dpm_baco_exit(adev); 6907 if (ret) 6908 return ret; 6909 6910 if (ras && adev->ras_enabled && 6911 adev->nbio.funcs->enable_doorbell_interrupt) 6912 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 6913 6914 if (amdgpu_passthrough(adev) && adev->nbio.funcs && 6915 adev->nbio.funcs->clear_doorbell_interrupt) 6916 adev->nbio.funcs->clear_doorbell_interrupt(adev); 6917 6918 return 0; 6919 } 6920 6921 /** 6922 * amdgpu_pci_error_detected - Called when a PCI error is detected. 6923 * @pdev: PCI device struct 6924 * @state: PCI channel state 6925 * 6926 * Description: Called when a PCI error is detected. 6927 * 6928 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
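 * pci_channel_io_normal maps to PCI_ERS_RESULT_CAN_RECOVER.
 * pci_channel_io_frozen halts activity on the device (or on the whole
 * XGMI hive) and maps to PCI_ERS_RESULT_NEED_RESET, unless the device is
 * a hive member without FW link reset support, in which case it maps to
 * PCI_ERS_RESULT_DISCONNECT. pci_channel_io_perm_failure always maps to
 * PCI_ERS_RESULT_DISCONNECT.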
6929 */ 6930 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 6931 { 6932 struct drm_device *dev = pci_get_drvdata(pdev); 6933 struct amdgpu_device *adev = drm_to_adev(dev); 6934 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 6935 struct amdgpu_reset_context reset_context; 6936 struct list_head device_list; 6937 6938 dev_info(adev->dev, "PCI error: detected callback!!\n"); 6939 6940 adev->pci_channel_state = state; 6941 6942 switch (state) { 6943 case pci_channel_io_normal: 6944 dev_info(adev->dev, "pci_channel_io_normal: state(%d)!!\n", state); 6945 return PCI_ERS_RESULT_CAN_RECOVER; 6946 case pci_channel_io_frozen: 6947 /* Fatal error, prepare for slot reset */ 6948 dev_info(adev->dev, "pci_channel_io_frozen: state(%d)!!\n", state); 6949 if (hive) { 6950 /* Hive devices should be able to support FW based 6951 * link reset on other devices, if not return. 6952 */ 6953 if (!amdgpu_dpm_is_link_reset_supported(adev)) { 6954 dev_warn(adev->dev, 6955 "No support for XGMI hive yet...\n"); 6956 return PCI_ERS_RESULT_DISCONNECT; 6957 } 6958 /* Set dpc status only if device is part of hive 6959 * Non-hive devices should be able to recover after 6960 * link reset. 6961 */ 6962 amdgpu_reset_set_dpc_status(adev, true); 6963 6964 mutex_lock(&hive->hive_lock); 6965 } 6966 memset(&reset_context, 0, sizeof(reset_context)); 6967 INIT_LIST_HEAD(&device_list); 6968 6969 amdgpu_device_recovery_prepare(adev, &device_list, hive); 6970 amdgpu_device_recovery_get_reset_lock(adev, &device_list); 6971 amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list, 6972 hive, false); 6973 if (hive) { 6974 mutex_unlock(&hive->hive_lock); 6975 amdgpu_put_xgmi_hive(hive); 6976 } 6977 return PCI_ERS_RESULT_NEED_RESET; 6978 case pci_channel_io_perm_failure: 6979 /* Permanent error, prepare for device removal */ 6980 dev_info(adev->dev, "pci_channel_io_perm_failure: state(%d)!!\n", state); 6981 return PCI_ERS_RESULT_DISCONNECT; 6982 } 6983 6984 return PCI_ERS_RESULT_NEED_RESET; 6985 } 6986 6987 /** 6988 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 6989 * @pdev: pointer to PCI device 6990 */ 6991 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 6992 { 6993 struct drm_device *dev = pci_get_drvdata(pdev); 6994 struct amdgpu_device *adev = drm_to_adev(dev); 6995 6996 dev_info(adev->dev, "PCI error: mmio enabled callback!!\n"); 6997 6998 /* TODO - dump whatever for debugging purposes */ 6999 7000 /* This called only if amdgpu_pci_error_detected returns 7001 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 7002 * works, no need to reset slot. 7003 */ 7004 7005 return PCI_ERS_RESULT_RECOVERED; 7006 } 7007 7008 /** 7009 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 7010 * @pdev: PCI device struct 7011 * 7012 * Description: This routine is called by the pci error recovery 7013 * code after the PCI slot has been reset, just before we 7014 * should resume normal operations. 
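 *
 * The recovery below is roughly: poll the upstream switch (or the GPU
 * itself) until the PCI vendor ID reads back as ATI/AMD, restore the
 * cached switch and device config space, wait for the ASIC to report a
 * valid memory size and then run a full ASIC reset on the device or on
 * every node of its XGMI hive.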
7015 */ 7016 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 7017 { 7018 struct drm_device *dev = pci_get_drvdata(pdev); 7019 struct amdgpu_device *adev = drm_to_adev(dev); 7020 struct amdgpu_reset_context reset_context; 7021 struct amdgpu_device *tmp_adev; 7022 struct amdgpu_hive_info *hive; 7023 struct list_head device_list; 7024 struct pci_dev *link_dev; 7025 int r = 0, i, timeout; 7026 u32 memsize; 7027 u16 status; 7028 7029 dev_info(adev->dev, "PCI error: slot reset callback!!\n"); 7030 7031 memset(&reset_context, 0, sizeof(reset_context)); 7032 7033 if (adev->pcie_reset_ctx.swus) 7034 link_dev = adev->pcie_reset_ctx.swus; 7035 else 7036 link_dev = adev->pdev; 7037 /* wait for asic to come out of reset, timeout = 10s */ 7038 timeout = 10000; 7039 do { 7040 usleep_range(10000, 10500); 7041 r = pci_read_config_word(link_dev, PCI_VENDOR_ID, &status); 7042 timeout -= 10; 7043 } while (timeout > 0 && (status != PCI_VENDOR_ID_ATI) && 7044 (status != PCI_VENDOR_ID_AMD)); 7045 7046 if ((status != PCI_VENDOR_ID_ATI) && (status != PCI_VENDOR_ID_AMD)) { 7047 r = -ETIME; 7048 goto out; 7049 } 7050 7051 amdgpu_device_load_switch_state(adev); 7052 /* Restore PCI confspace */ 7053 amdgpu_device_load_pci_state(pdev); 7054 7055 /* confirm ASIC came out of reset */ 7056 for (i = 0; i < adev->usec_timeout; i++) { 7057 memsize = amdgpu_asic_get_config_memsize(adev); 7058 7059 if (memsize != 0xffffffff) 7060 break; 7061 udelay(1); 7062 } 7063 if (memsize == 0xffffffff) { 7064 r = -ETIME; 7065 goto out; 7066 } 7067 7068 reset_context.method = AMD_RESET_METHOD_NONE; 7069 reset_context.reset_req_dev = adev; 7070 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 7071 set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags); 7072 INIT_LIST_HEAD(&device_list); 7073 7074 hive = amdgpu_get_xgmi_hive(adev); 7075 if (hive) { 7076 mutex_lock(&hive->hive_lock); 7077 reset_context.hive = hive; 7078 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 7079 tmp_adev->pcie_reset_ctx.in_link_reset = true; 7080 list_add_tail(&tmp_adev->reset_list, &device_list); 7081 } 7082 } else { 7083 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 7084 list_add_tail(&adev->reset_list, &device_list); 7085 } 7086 7087 r = amdgpu_device_asic_reset(adev, &device_list, &reset_context); 7088 out: 7089 if (!r) { 7090 if (amdgpu_device_cache_pci_state(adev->pdev)) 7091 pci_restore_state(adev->pdev); 7092 dev_info(adev->dev, "PCIe error recovery succeeded\n"); 7093 } else { 7094 dev_err(adev->dev, "PCIe error recovery failed, err:%d\n", r); 7095 if (hive) { 7096 list_for_each_entry(tmp_adev, &device_list, reset_list) 7097 amdgpu_device_unset_mp1_state(tmp_adev); 7098 } 7099 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 7100 } 7101 7102 if (hive) { 7103 mutex_unlock(&hive->hive_lock); 7104 amdgpu_put_xgmi_hive(hive); 7105 } 7106 7107 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 7108 } 7109 7110 /** 7111 * amdgpu_pci_resume() - resume normal ops after PCI reset 7112 * @pdev: pointer to PCI device 7113 * 7114 * Called when the error recovery driver tells us that its 7115 * OK to resume normal operation. 
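 *
 * Only acted upon after a pci_channel_io_frozen error: the schedulers
 * stopped in amdgpu_device_halt_activities() are restarted, KFD and
 * display audio are resumed and the reset domain lock taken in the
 * detected callback is released.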
7116 */ 7117 void amdgpu_pci_resume(struct pci_dev *pdev) 7118 { 7119 struct drm_device *dev = pci_get_drvdata(pdev); 7120 struct amdgpu_device *adev = drm_to_adev(dev); 7121 struct list_head device_list; 7122 struct amdgpu_hive_info *hive = NULL; 7123 struct amdgpu_device *tmp_adev = NULL; 7124 7125 dev_info(adev->dev, "PCI error: resume callback!!\n"); 7126 7127 /* Only continue execution for the case of pci_channel_io_frozen */ 7128 if (adev->pci_channel_state != pci_channel_io_frozen) 7129 return; 7130 7131 INIT_LIST_HEAD(&device_list); 7132 7133 hive = amdgpu_get_xgmi_hive(adev); 7134 if (hive) { 7135 mutex_lock(&hive->hive_lock); 7136 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 7137 tmp_adev->pcie_reset_ctx.in_link_reset = false; 7138 list_add_tail(&tmp_adev->reset_list, &device_list); 7139 } 7140 } else 7141 list_add_tail(&adev->reset_list, &device_list); 7142 7143 amdgpu_device_sched_resume(&device_list, NULL, NULL); 7144 amdgpu_device_gpu_resume(adev, &device_list, false); 7145 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 7146 7147 if (hive) { 7148 mutex_unlock(&hive->hive_lock); 7149 amdgpu_put_xgmi_hive(hive); 7150 } 7151 } 7152 7153 static void amdgpu_device_cache_switch_state(struct amdgpu_device *adev) 7154 { 7155 struct pci_dev *parent = pci_upstream_bridge(adev->pdev); 7156 int r; 7157 7158 if (parent->vendor != PCI_VENDOR_ID_ATI) 7159 return; 7160 7161 /* If already saved, return */ 7162 if (adev->pcie_reset_ctx.swus) 7163 return; 7164 /* Upstream bridge is ATI, assume it's SWUS/DS architecture */ 7165 r = pci_save_state(parent); 7166 if (r) 7167 return; 7168 adev->pcie_reset_ctx.swds_pcistate = pci_store_saved_state(parent); 7169 7170 parent = pci_upstream_bridge(parent); 7171 r = pci_save_state(parent); 7172 if (r) 7173 return; 7174 adev->pcie_reset_ctx.swus_pcistate = pci_store_saved_state(parent); 7175 7176 adev->pcie_reset_ctx.swus = parent; 7177 } 7178 7179 static void amdgpu_device_load_switch_state(struct amdgpu_device *adev) 7180 { 7181 struct pci_dev *pdev; 7182 int r; 7183 7184 if (!adev->pcie_reset_ctx.swds_pcistate || 7185 !adev->pcie_reset_ctx.swus_pcistate) 7186 return; 7187 7188 pdev = adev->pcie_reset_ctx.swus; 7189 r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swus_pcistate); 7190 if (!r) { 7191 pci_restore_state(pdev); 7192 } else { 7193 dev_warn(adev->dev, "Failed to load SWUS state, err:%d\n", r); 7194 return; 7195 } 7196 7197 pdev = pci_upstream_bridge(adev->pdev); 7198 r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swds_pcistate); 7199 if (!r) 7200 pci_restore_state(pdev); 7201 else 7202 dev_warn(adev->dev, "Failed to load SWDS state, err:%d\n", r); 7203 } 7204 7205 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 7206 { 7207 struct drm_device *dev = pci_get_drvdata(pdev); 7208 struct amdgpu_device *adev = drm_to_adev(dev); 7209 int r; 7210 7211 if (amdgpu_sriov_vf(adev)) 7212 return false; 7213 7214 r = pci_save_state(pdev); 7215 if (!r) { 7216 kfree(adev->pci_state); 7217 7218 adev->pci_state = pci_store_saved_state(pdev); 7219 7220 if (!adev->pci_state) { 7221 dev_err(adev->dev, "Failed to store PCI saved state"); 7222 return false; 7223 } 7224 } else { 7225 dev_warn(adev->dev, "Failed to save PCI state, err:%d\n", r); 7226 return false; 7227 } 7228 7229 amdgpu_device_cache_switch_state(adev); 7230 7231 return true; 7232 } 7233 7234 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 7235 { 7236 struct drm_device *dev = pci_get_drvdata(pdev); 7237 struct amdgpu_device *adev = 
drm_to_adev(dev); 7238 int r; 7239 7240 if (!adev->pci_state) 7241 return false; 7242 7243 r = pci_load_saved_state(pdev, adev->pci_state); 7244 7245 if (!r) { 7246 pci_restore_state(pdev); 7247 } else { 7248 dev_warn(adev->dev, "Failed to load PCI state, err:%d\n", r); 7249 return false; 7250 } 7251 7252 return true; 7253 } 7254 7255 void amdgpu_device_flush_hdp(struct amdgpu_device *adev, 7256 struct amdgpu_ring *ring) 7257 { 7258 #ifdef CONFIG_X86_64 7259 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 7260 return; 7261 #endif 7262 if (adev->gmc.xgmi.connected_to_cpu) 7263 return; 7264 7265 if (ring && ring->funcs->emit_hdp_flush) 7266 amdgpu_ring_emit_hdp_flush(ring); 7267 else 7268 amdgpu_asic_flush_hdp(adev, ring); 7269 } 7270 7271 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, 7272 struct amdgpu_ring *ring) 7273 { 7274 #ifdef CONFIG_X86_64 7275 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev)) 7276 return; 7277 #endif 7278 if (adev->gmc.xgmi.connected_to_cpu) 7279 return; 7280 7281 amdgpu_asic_invalidate_hdp(adev, ring); 7282 } 7283 7284 int amdgpu_in_reset(struct amdgpu_device *adev) 7285 { 7286 return atomic_read(&adev->reset_domain->in_gpu_reset); 7287 } 7288 7289 /** 7290 * amdgpu_device_halt() - bring hardware to some kind of halt state 7291 * 7292 * @adev: amdgpu_device pointer 7293 * 7294 * Bring hardware to some kind of halt state so that no one can touch it 7295 * any more. It will help to maintain error context when error occurred. 7296 * Compare to a simple hang, the system will keep stable at least for SSH 7297 * access. Then it should be trivial to inspect the hardware state and 7298 * see what's going on. Implemented as following: 7299 * 7300 * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc), 7301 * clears all CPU mappings to device, disallows remappings through page faults 7302 * 2. amdgpu_irq_disable_all() disables all interrupts 7303 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences 7304 * 4. set adev->no_hw_access to avoid potential crashes after setp 5 7305 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings 7306 * 6. 
pci_disable_device() and pci_wait_for_pending_transaction() 7307 * flush any in flight DMA operations 7308 */ 7309 void amdgpu_device_halt(struct amdgpu_device *adev) 7310 { 7311 struct pci_dev *pdev = adev->pdev; 7312 struct drm_device *ddev = adev_to_drm(adev); 7313 7314 amdgpu_xcp_dev_unplug(adev); 7315 drm_dev_unplug(ddev); 7316 7317 amdgpu_irq_disable_all(adev); 7318 7319 amdgpu_fence_driver_hw_fini(adev); 7320 7321 adev->no_hw_access = true; 7322 7323 amdgpu_device_unmap_mmio(adev); 7324 7325 pci_disable_device(pdev); 7326 pci_wait_for_pending_transaction(pdev); 7327 } 7328 7329 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev, 7330 u32 reg) 7331 { 7332 unsigned long flags, address, data; 7333 u32 r; 7334 7335 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 7336 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 7337 7338 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 7339 WREG32(address, reg * 4); 7340 (void)RREG32(address); 7341 r = RREG32(data); 7342 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 7343 return r; 7344 } 7345 7346 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev, 7347 u32 reg, u32 v) 7348 { 7349 unsigned long flags, address, data; 7350 7351 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 7352 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 7353 7354 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 7355 WREG32(address, reg * 4); 7356 (void)RREG32(address); 7357 WREG32(data, v); 7358 (void)RREG32(data); 7359 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 7360 } 7361 7362 /** 7363 * amdgpu_device_get_gang - return a reference to the current gang 7364 * @adev: amdgpu_device pointer 7365 * 7366 * Returns: A new reference to the current gang leader. 7367 */ 7368 struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev) 7369 { 7370 struct dma_fence *fence; 7371 7372 rcu_read_lock(); 7373 fence = dma_fence_get_rcu_safe(&adev->gang_submit); 7374 rcu_read_unlock(); 7375 return fence; 7376 } 7377 7378 /** 7379 * amdgpu_device_switch_gang - switch to a new gang 7380 * @adev: amdgpu_device pointer 7381 * @gang: the gang to switch to 7382 * 7383 * Try to switch to a new gang. 7384 * Returns: NULL if we switched to the new gang or a reference to the current 7385 * gang leader. 7386 */ 7387 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev, 7388 struct dma_fence *gang) 7389 { 7390 struct dma_fence *old = NULL; 7391 7392 dma_fence_get(gang); 7393 do { 7394 dma_fence_put(old); 7395 old = amdgpu_device_get_gang(adev); 7396 if (old == gang) 7397 break; 7398 7399 if (!dma_fence_is_signaled(old)) { 7400 dma_fence_put(gang); 7401 return old; 7402 } 7403 7404 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit, 7405 old, gang) != old); 7406 7407 /* 7408 * Drop it once for the exchanged reference in adev and once for the 7409 * thread local reference acquired in amdgpu_device_get_gang(). 7410 */ 7411 dma_fence_put(old); 7412 dma_fence_put(old); 7413 return NULL; 7414 } 7415 7416 /** 7417 * amdgpu_device_enforce_isolation - enforce HW isolation 7418 * @adev: the amdgpu device pointer 7419 * @ring: the HW ring the job is supposed to run on 7420 * @job: the job which is about to be pushed to the HW ring 7421 * 7422 * Makes sure that only one client at a time can use the GFX block. 7423 * Returns: The dependency to wait on before the job can be pushed to the HW. 7424 * The function is called multiple times until NULL is returned. 
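 *
 * A hedged caller-side sketch (the concrete caller is assumed to be the
 * job dependency handling):
 *
 *	dep = amdgpu_device_enforce_isolation(adev, ring, job);
 *	if (dep)
 *		return dep;	// wait for it, then call again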
7425 */ 7426 struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev, 7427 struct amdgpu_ring *ring, 7428 struct amdgpu_job *job) 7429 { 7430 struct amdgpu_isolation *isolation = &adev->isolation[ring->xcp_id]; 7431 struct drm_sched_fence *f = job->base.s_fence; 7432 struct dma_fence *dep; 7433 void *owner; 7434 int r; 7435 7436 /* 7437 * For now enforce isolation only for the GFX block since we only need 7438 * the cleaner shader on those rings. 7439 */ 7440 if (ring->funcs->type != AMDGPU_RING_TYPE_GFX && 7441 ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE) 7442 return NULL; 7443 7444 /* 7445 * All submissions where enforce isolation is false are handled as if 7446 * they come from a single client. Use ~0l as the owner to distinct it 7447 * from kernel submissions where the owner is NULL. 7448 */ 7449 owner = job->enforce_isolation ? f->owner : (void *)~0l; 7450 7451 mutex_lock(&adev->enforce_isolation_mutex); 7452 7453 /* 7454 * The "spearhead" submission is the first one which changes the 7455 * ownership to its client. We always need to wait for it to be 7456 * pushed to the HW before proceeding with anything. 7457 */ 7458 if (&f->scheduled != isolation->spearhead && 7459 !dma_fence_is_signaled(isolation->spearhead)) { 7460 dep = isolation->spearhead; 7461 goto out_grab_ref; 7462 } 7463 7464 if (isolation->owner != owner) { 7465 7466 /* 7467 * Wait for any gang to be assembled before switching to a 7468 * different owner or otherwise we could deadlock the 7469 * submissions. 7470 */ 7471 if (!job->gang_submit) { 7472 dep = amdgpu_device_get_gang(adev); 7473 if (!dma_fence_is_signaled(dep)) 7474 goto out_return_dep; 7475 dma_fence_put(dep); 7476 } 7477 7478 dma_fence_put(isolation->spearhead); 7479 isolation->spearhead = dma_fence_get(&f->scheduled); 7480 amdgpu_sync_move(&isolation->active, &isolation->prev); 7481 trace_amdgpu_isolation(isolation->owner, owner); 7482 isolation->owner = owner; 7483 } 7484 7485 /* 7486 * Specifying the ring here helps to pipeline submissions even when 7487 * isolation is enabled. If that is not desired for testing NULL can be 7488 * used instead of the ring to enforce a CPU round trip while switching 7489 * between clients. 
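 * (i.e. for such testing the amdgpu_sync_peek_fence() call below would
 * be given NULL instead of ring)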
7490 */ 7491 dep = amdgpu_sync_peek_fence(&isolation->prev, ring); 7492 r = amdgpu_sync_fence(&isolation->active, &f->finished, GFP_NOWAIT); 7493 if (r) 7494 dev_warn(adev->dev, "OOM tracking isolation\n"); 7495 7496 out_grab_ref: 7497 dma_fence_get(dep); 7498 out_return_dep: 7499 mutex_unlock(&adev->enforce_isolation_mutex); 7500 return dep; 7501 } 7502 7503 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev) 7504 { 7505 switch (adev->asic_type) { 7506 #ifdef CONFIG_DRM_AMDGPU_SI 7507 case CHIP_HAINAN: 7508 #endif 7509 case CHIP_TOPAZ: 7510 /* chips with no display hardware */ 7511 return false; 7512 #ifdef CONFIG_DRM_AMDGPU_SI 7513 case CHIP_TAHITI: 7514 case CHIP_PITCAIRN: 7515 case CHIP_VERDE: 7516 case CHIP_OLAND: 7517 #endif 7518 #ifdef CONFIG_DRM_AMDGPU_CIK 7519 case CHIP_BONAIRE: 7520 case CHIP_HAWAII: 7521 case CHIP_KAVERI: 7522 case CHIP_KABINI: 7523 case CHIP_MULLINS: 7524 #endif 7525 case CHIP_TONGA: 7526 case CHIP_FIJI: 7527 case CHIP_POLARIS10: 7528 case CHIP_POLARIS11: 7529 case CHIP_POLARIS12: 7530 case CHIP_VEGAM: 7531 case CHIP_CARRIZO: 7532 case CHIP_STONEY: 7533 /* chips with display hardware */ 7534 return true; 7535 default: 7536 /* IP discovery */ 7537 if (!amdgpu_ip_version(adev, DCE_HWIP, 0) || 7538 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 7539 return false; 7540 return true; 7541 } 7542 } 7543 7544 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev, 7545 uint32_t inst, uint32_t reg_addr, char reg_name[], 7546 uint32_t expected_value, uint32_t mask) 7547 { 7548 uint32_t ret = 0; 7549 uint32_t old_ = 0; 7550 uint32_t tmp_ = RREG32(reg_addr); 7551 uint32_t loop = adev->usec_timeout; 7552 7553 while ((tmp_ & (mask)) != (expected_value)) { 7554 if (old_ != tmp_) { 7555 loop = adev->usec_timeout; 7556 old_ = tmp_; 7557 } else 7558 udelay(1); 7559 tmp_ = RREG32(reg_addr); 7560 loop--; 7561 if (!loop) { 7562 dev_warn( 7563 adev->dev, 7564 "Register(%d) [%s] failed to reach value 0x%08x != 0x%08xn", 7565 inst, reg_name, (uint32_t)expected_value, 7566 (uint32_t)(tmp_ & (mask))); 7567 ret = -ETIMEDOUT; 7568 break; 7569 } 7570 } 7571 return ret; 7572 } 7573 7574 ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring) 7575 { 7576 ssize_t size = 0; 7577 7578 if (!ring || !ring->adev) 7579 return size; 7580 7581 if (amdgpu_device_should_recover_gpu(ring->adev)) 7582 size |= AMDGPU_RESET_TYPE_FULL; 7583 7584 if (unlikely(!ring->adev->debug_disable_soft_recovery) && 7585 !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery) 7586 size |= AMDGPU_RESET_TYPE_SOFT_RESET; 7587 7588 return size; 7589 } 7590 7591 ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset) 7592 { 7593 ssize_t size = 0; 7594 7595 if (supported_reset == 0) { 7596 size += sysfs_emit_at(buf, size, "unsupported"); 7597 size += sysfs_emit_at(buf, size, "\n"); 7598 return size; 7599 7600 } 7601 7602 if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET) 7603 size += sysfs_emit_at(buf, size, "soft "); 7604 7605 if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE) 7606 size += sysfs_emit_at(buf, size, "queue "); 7607 7608 if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE) 7609 size += sysfs_emit_at(buf, size, "pipe "); 7610 7611 if (supported_reset & AMDGPU_RESET_TYPE_FULL) 7612 size += sysfs_emit_at(buf, size, "full "); 7613 7614 size += sysfs_emit_at(buf, size, "\n"); 7615 return size; 7616 } 7617 7618 void amdgpu_device_set_uid(struct amdgpu_uid *uid_info, 7619 enum amdgpu_uid_type type, uint8_t inst, 7620 uint64_t uid) 7621 { 7622 if (!uid_info) 7623 
return; 7624 7625 if (type >= AMDGPU_UID_TYPE_MAX) { 7626 dev_err_once(uid_info->adev->dev, "Invalid UID type %d\n", 7627 type); 7628 return; 7629 } 7630 7631 if (inst >= AMDGPU_UID_INST_MAX) { 7632 dev_err_once(uid_info->adev->dev, "Invalid UID instance %d\n", 7633 inst); 7634 return; 7635 } 7636 7637 if (uid_info->uid[type][inst] != 0) { 7638 dev_warn_once( 7639 uid_info->adev->dev, 7640 "Overwriting existing UID %llu for type %d instance %d\n", 7641 uid_info->uid[type][inst], type, inst); 7642 } 7643 7644 uid_info->uid[type][inst] = uid; 7645 } 7646 7647 u64 amdgpu_device_get_uid(struct amdgpu_uid *uid_info, 7648 enum amdgpu_uid_type type, uint8_t inst) 7649 { 7650 if (!uid_info) 7651 return 0; 7652 7653 if (type >= AMDGPU_UID_TYPE_MAX) { 7654 dev_err_once(uid_info->adev->dev, "Invalid UID type %d\n", 7655 type); 7656 return 0; 7657 } 7658 7659 if (inst >= AMDGPU_UID_INST_MAX) { 7660 dev_err_once(uid_info->adev->dev, "Invalid UID instance %d\n", 7661 inst); 7662 return 0; 7663 } 7664 7665 return uid_info->uid[type][inst]; 7666 } 7667
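/*
 * Usage sketch for the UID helpers above (illustrative only;
 * some_uid_type stands in for a real enum amdgpu_uid_type value):
 *
 *	amdgpu_device_set_uid(uid_info, some_uid_type, 0, uid);
 *	...
 *	uid = amdgpu_device_get_uid(uid_info, some_uid_type, 0);
 *
 * Out-of-range types or instances are rejected with a one-time error
 * message, and unset entries read back as 0.
 */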