/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */

#include <linux/aperture.h>
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
#include <linux/pci-p2pdma.h>
#include <linux/apple-gmux.h>

#include <drm/drm_atomic_helper.h>
#include <drm/drm_client_event.h>
#include <drm/drm_crtc_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/device.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"
#include "amdgpu_virt.h"
#include "amdgpu_dev_coredump.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

#include <drm/drm_drv.h>

#if IS_ENABLED(CONFIG_X86)
#include <asm/intel-family.h>
#include <asm/cpu_device_id.h>
#endif

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
#define AMDGPU_MAX_RETRY_LIMIT		2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
#define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
#define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
#define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)

#define AMDGPU_VBIOS_SKIP (1U << 0)
#define AMDGPU_VBIOS_OPTIONAL (1U << 1)

static const struct drm_driver amdgpu_kms_driver;

const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"ALDEBARAN",
	"NAVI10",
	"CYAN_SKILLFISH",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"VANGOGH",
	"DIMGREY_CAVEFISH",
	"BEIGE_GOBY",
	"YELLOW_CARP",
	"IP DISCOVERY",
	"LAST",
};

#define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMD_IP_BLOCK_TYPE_NUM - 1, 0)
/*
 * Default init level where all blocks are expected to be initialized. This is
 * the level of initialization expected by default and also after a full reset
 * of the device.
 */
struct amdgpu_init_level amdgpu_init_default = {
	.level = AMDGPU_INIT_LEVEL_DEFAULT,
	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

struct amdgpu_init_level amdgpu_init_recovery = {
	.level = AMDGPU_INIT_LEVEL_RESET_RECOVERY,
	.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};

/*
 * Minimal blocks needed to be initialized before an XGMI hive can be reset.
 * This is used for cases like reset on initialization where the entire hive
 * needs to be reset before first use.
 */
struct amdgpu_init_level amdgpu_init_minimal_xgmi = {
	.level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI,
	.hwini_ip_block_mask =
		BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) |
		BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) |
		BIT(AMD_IP_BLOCK_TYPE_PSP)
};

static void amdgpu_device_load_switch_state(struct amdgpu_device *adev);

static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev,
					     enum amd_ip_block_type block)
{
	return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0;
}

void amdgpu_set_init_level(struct amdgpu_device *adev,
			   enum amdgpu_init_lvl_id lvl)
{
	switch (lvl) {
	case AMDGPU_INIT_LEVEL_MINIMAL_XGMI:
		adev->init_lvl = &amdgpu_init_minimal_xgmi;
		break;
	case AMDGPU_INIT_LEVEL_RESET_RECOVERY:
		adev->init_lvl = &amdgpu_init_recovery;
		break;
	case AMDGPU_INIT_LEVEL_DEFAULT:
		fallthrough;
	default:
		adev->init_lvl = &amdgpu_init_default;
		break;
	}
}

static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
				     void *data);

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
 */
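
/*
 * Example (illustrative only, not part of the driver): reading the count
 * from user space. The sysfs path is an assumption about the usual
 * per-device location; the attribute is also reachable via
 * /sys/class/drm/cardN/device/pcie_replay_count.
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		unsigned long long replays;
 *		FILE *f = fopen("/sys/bus/pci/devices/0000:03:00.0/pcie_replay_count", "r");
 *
 *		if (!f)
 *			return 1;
 *		if (fscanf(f, "%llu", &replays) == 1)
 *			printf("PCIe replays: %llu\n", replays);
 *		fclose(f);
 *		return 0;
 *	}
 */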

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return sysfs_emit(buf, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, 0444,
		   amdgpu_device_get_pcie_replay_count, NULL);

static int amdgpu_device_attr_sysfs_init(struct amdgpu_device *adev)
{
	int ret = 0;

	if (amdgpu_nbio_is_replay_cnt_supported(adev))
		ret = sysfs_create_file(&adev->dev->kobj,
					&dev_attr_pcie_replay_count.attr);

	return ret;
}

static void amdgpu_device_attr_sysfs_fini(struct amdgpu_device *adev)
{
	if (amdgpu_nbio_is_replay_cnt_supported(adev))
		sysfs_remove_file(&adev->dev->kobj,
				  &dev_attr_pcie_replay_count.attr);
}

static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
					  const struct bin_attribute *attr, char *buf,
					  loff_t ppos, size_t count)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	ssize_t bytes_read;

	switch (ppos) {
	case AMDGPU_SYS_REG_STATE_XGMI:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_WAFL:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_PCIE:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR, buf, count);
		break;
	case AMDGPU_SYS_REG_STATE_USR_1:
		bytes_read = amdgpu_asic_get_reg_state(
			adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count);
		break;
	default:
		return -EINVAL;
	}

	return bytes_read;
}

static const BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL,
		      AMDGPU_SYS_REG_STATE_END);

int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev)
{
	int ret;

	if (!amdgpu_asic_get_reg_state_supported(adev))
		return 0;

	ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state);

	return ret;
}

void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
{
	if (!amdgpu_asic_get_reg_state_supported(adev))
		return;
	sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
}

int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block)
{
	int r;

	if (ip_block->version->funcs->suspend) {
		r = ip_block->version->funcs->suspend(ip_block);
		if (r) {
			dev_err(ip_block->adev->dev,
				"suspend of IP block <%s> failed %d\n",
				ip_block->version->funcs->name, r);
			return r;
		}
	}

	ip_block->status.hw = false;
	return 0;
}

int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block)
{
	int r;

	if (ip_block->version->funcs->resume) {
		r = ip_block->version->funcs->resume(ip_block);
		if (r) {
			dev_err(ip_block->adev->dev,
				"resume of IP block <%s> failed %d\n",
				ip_block->version->funcs->name, r);
			return r;
		}
	}

	ip_block->status.hw = true;
	return 0;
}

/**
 * DOC: board_info
 *
 * The amdgpu driver provides a sysfs API for giving board related information.
 * It provides the form factor information in the format
 *
 *   type : form factor
 *
 * Possible form factor values
 *
 * - "cem" - PCIE CEM card
 * - "oam" - Open Compute Accelerator Module
 * - "unknown" - Not known
 *
 */
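
/*
 * Example (illustrative only, not part of the driver): parsing the
 * attribute from user space. The sysfs path below is a placeholder for the
 * device's directory; the file contains a single "type : <form factor>"
 * line as emitted above.
 *
 *	char form_factor[16];
 *	FILE *f = fopen("/sys/bus/pci/devices/0000:03:00.0/board_info", "r");
 *
 *	if (f && fscanf(f, "type : %15s", form_factor) == 1)
 *		printf("form factor: %s\n", form_factor);
 *	if (f)
 *		fclose(f);
 */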

static ssize_t amdgpu_device_get_board_info(struct device *dev,
					    struct device_attribute *attr,
					    char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
	const char *pkg;

	if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
		pkg_type = adev->smuio.funcs->get_pkg_type(adev);

	switch (pkg_type) {
	case AMDGPU_PKG_TYPE_CEM:
		pkg = "cem";
		break;
	case AMDGPU_PKG_TYPE_OAM:
		pkg = "oam";
		break;
	default:
		pkg = "unknown";
		break;
	}

	return sysfs_emit(buf, "%s : %s\n", "type", pkg);
}

static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);

static struct attribute *amdgpu_board_attrs[] = {
	&dev_attr_board_info.attr,
	NULL,
};

static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
					     struct attribute *attr, int n)
{
	struct device *dev = kobj_to_dev(kobj);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	if (adev->flags & AMD_IS_APU)
		return 0;

	return attr->mode;
}

static const struct attribute_group amdgpu_board_attrs_group = {
	.attrs = amdgpu_board_attrs,
	.is_visible = amdgpu_board_attrs_is_visible
};

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);

/**
 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 *
 * @adev: amdgpu device pointer
 *
 * Returns true if the device is a dGPU with ATPX power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_px(struct amdgpu_device *adev)
{
	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
		return true;
	return false;
}

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 *
 * @adev: amdgpu device pointer
 *
 * Returns true if the device is a dGPU with ACPI power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct amdgpu_device *adev)
{
	if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE))
		return false;

	if (adev->has_pr3 ||
	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @adev: amdgpu device pointer
 *
 * Return:
 * 1 if the device supports BACO;
 * 3 if the device supports MACO (only works if BACO is supported)
 * otherwise return 0.
 */
int amdgpu_device_supports_baco(struct amdgpu_device *adev)
{
	return amdgpu_asic_supports_baco(adev);
}

void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
{
	int bamaco_support;

	adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
	bamaco_support = amdgpu_device_supports_baco(adev);

	switch (amdgpu_runtime_pm) {
	case 2:
		if (bamaco_support & MACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
			dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
		} else if (bamaco_support == BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Requested mode BAMACO not available, falling back to BACO\n");
		}
		break;
	case 1:
		if (bamaco_support & BACO_SUPPORT) {
			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
			dev_info(adev->dev, "Forcing BACO for runtime pm\n");
		}
		break;
	case -1:
	case -2:
		if (amdgpu_device_supports_px(adev)) {
			/* enable PX as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
			dev_info(adev->dev, "Using ATPX for runtime pm\n");
		} else if (amdgpu_device_supports_boco(adev)) {
			/* enable boco as runtime mode */
			adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
			dev_info(adev->dev, "Using BOCO for runtime pm\n");
		} else {
			if (!bamaco_support)
				goto no_runtime_pm;

			switch (adev->asic_type) {
			case CHIP_VEGA20:
			case CHIP_ARCTURUS:
				/* BACO is not supported on vega20 and arcturus */
				break;
			case CHIP_VEGA10:
				/* enable BACO as runpm mode if noretry=0 */
				if (!adev->gmc.noretry && !amdgpu_passthrough(adev))
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			default:
				/* enable BACO as runpm mode on CI+ */
				if (!amdgpu_passthrough(adev))
					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
				break;
			}

			if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
				if (bamaco_support & MACO_SUPPORT) {
					adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
					dev_info(adev->dev, "Using BAMACO for runtime pm\n");
				} else {
					dev_info(adev->dev, "Using BACO for runtime pm\n");
				}
			}
		}
		break;
	case 0:
		dev_info(adev->dev, "runtime pm is manually disabled\n");
		break;
	default:
		break;
	}

no_runtime_pm:
	if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
		dev_info(adev->dev, "Runtime PM not available\n");
}
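
/*
 * Summary of the amdgpu_runtime_pm module parameter as handled above
 * (descriptive note derived from the switch statement):
 *
 *	 2	- force BAMACO, fall back to BACO if MACO is not supported
 *	 1	- force BACO
 *	 0	- runtime pm disabled
 *	-1/-2	- automatic: prefer PX (ATPX), then BOCO, then BACO/BAMACO
 */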
/**
 * amdgpu_device_supports_smart_shift - Is the device dGPU with
 * smart shift support
 *
 * @adev: amdgpu device pointer
 *
 * Returns true if the device is a dGPU with Smart Shift support,
 * otherwise returns false.
 */
bool amdgpu_device_supports_smart_shift(struct amdgpu_device *adev)
{
	return (amdgpu_device_supports_boco(adev) &&
		amdgpu_acpi_is_power_shift_control_supported());
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
			     void *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0, tmp = 0;
	uint32_t *data = buf;
	uint64_t last;
	int idx;

	if (!drm_dev_enter(adev_to_drm(adev), &idx))
		return;

	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *data++);
		else
			*data++ = RREG32_NO_KIQ(mmMM_DATA);
	}

	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
	drm_dev_exit(idx);
}
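
/*
 * Typical usage (illustrative sketch): callers normally go through
 * amdgpu_device_vram_access() below, which tries the visible-VRAM aperture
 * first and falls back to the MM_INDEX/MM_DATA window above, e.g.
 *
 *	uint32_t data[4];
 *
 *	amdgpu_device_vram_access(adev, vram_offset, data, sizeof(data), false);
 *
 * where vram_offset is a placeholder for the dword-aligned VRAM offset; both
 * offset and size must be dword aligned when the MM path is used.
 */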
/**
 * amdgpu_device_aper_access - access vram by vram aperture
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * The return value means how many bytes have been transferred.
 */
size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
				 void *buf, size_t size, bool write)
{
#ifdef CONFIG_64BIT
	void __iomem *addr;
	size_t count = 0;
	uint64_t last;

	if (!adev->mman.aper_base_kaddr)
		return 0;

	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		addr = adev->mman.aper_base_kaddr + pos;
		count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			/* Make sure HDP write cache flush happens without any reordering
			 * after the system memory contents are sent over PCIe device
			 */
			mb();
			amdgpu_device_flush_hdp(adev, NULL);
		} else {
			amdgpu_device_invalidate_hdp(adev, NULL);
			/* Make sure HDP read cache is invalidated before issuing a read
			 * to the PCIe device
			 */
			mb();
			memcpy_fromio(buf, addr, count);
		}

	}

	return count;
#else
	return 0;
#endif
}

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size; the buffer at @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       void *buf, size_t size, bool write)
{
	size_t count;

	/* try using the vram aperture to access vram first */
	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
	size -= count;
	if (size) {
		/* use MM to access the rest of vram */
		pos += count;
		buf += count;
		amdgpu_device_mm_access(adev, pos, buf, size, write);
	}
}

/*
 * register access helper functions.
 */

/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
	if (adev->no_hw_access)
		return true;

#ifdef CONFIG_LOCKDEP
	/*
	 * This is a bit complicated to understand, so worth a comment. What we assert
	 * here is that the GPU reset is not running on another thread in parallel.
	 *
	 * For this we trylock the read side of the reset semaphore, if that succeeds
	 * we know that the reset is not running in parallel.
	 *
	 * If the trylock fails we assert that we are either already holding the read
	 * side of the lock or are the reset thread itself and hold the write side of
	 * the lock.
	 */
	if (in_task()) {
		if (down_read_trylock(&adev->reset_domain->sem))
			up_read(&adev->reset_domain->sem);
		else
			lockdep_assert_held(&adev->reset_domain->sem);
	}
#endif
	return false;
}
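
/*
 * The helpers below back the RREG32()/WREG32() style macros used throughout
 * the driver. A typical read-modify-write through them looks like the
 * following (illustrative only; reg, mask and value are placeholders):
 *
 *	u32 tmp;
 *
 *	tmp = RREG32(reg);
 *	tmp &= ~mask;
 *	tmp |= value;
 *	WREG32(reg, tmp);
 */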
/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}

/*
 * MMIO register read with bytes helper functions
 * @offset: byte offset from MMIO start
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}


/**
 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
				uint32_t reg, uint32_t acc_flags,
				uint32_t xcc_id)
{
	uint32_t ret, rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	return ret;
}

/*
 * MMIO register write with bytes helper functions
 * @offset: byte offset from MMIO start
 * @value: the value to write to the register
 */

/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, 0);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/**
 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
 *
 * @adev: amdgpu_device pointer
 * @reg: mmio/rlc register
 * @v: value to write
 * @xcc_id: xcc accelerated compute core id
 *
 * This function is invoked only for debugfs register access.
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v,
			     uint32_t xcc_id)
{
	if (amdgpu_device_skip_hw_access(adev))
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
	} else if ((reg * 4) >= adev->rmmio_size) {
		adev->pcie_wreg(adev, reg * 4, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 * @xcc_id: xcc accelerated compute core id
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_xcc_wreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t v,
			    uint32_t acc_flags, uint32_t xcc_id)
{
	uint32_t rlcg_flag;

	if (amdgpu_device_skip_hw_access(adev))
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (amdgpu_sriov_vf(adev) &&
		    !amdgpu_sriov_runtime(adev) &&
		    adev->gfx.rlc.rlcg_reg_access_supported &&
		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, true,
							 &rlcg_flag)) {
			amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id));
		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
			   amdgpu_sriov_runtime(adev) &&
			   down_read_trylock(&adev->reset_domain->sem)) {
			amdgpu_kiq_wreg(adev, reg, v, xcc_id);
			up_read(&adev->reset_domain->sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u32 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
				    u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	if (unlikely(!adev->nbio.funcs)) {
		pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
		pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
	} else {
		pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
		pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	}

	if (reg_addr >> 32) {
		if (unlikely(!adev->nbio.funcs))
			pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
		else
			pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	} else {
		pcie_index_hi = 0;
	}

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				       pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}
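
/*
 * Note on the indirect access pattern used above and below (descriptive,
 * derived from the code): the register address is written to the PCIE index
 * window and read back to flush the posted write before the PCIE data
 * window is accessed. The _ext variants additionally program the high index
 * window with the upper address bits and clear it again when done.
 */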
/**
 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
				      u64 reg_addr)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;
	u64 r;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				       pcie_index_hi * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	r |= ((u64)readl(pcie_data_offset) << 32);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
				     u64 reg_addr, u32 reg_data)
{
	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
	else
		pcie_index_hi = 0;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				       pcie_index_hi * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 *
 * @adev: amdgpu_device pointer
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 *
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
				       u64 reg_addr, u64 reg_data)
{
	unsigned long flags, pcie_index, pcie_data;
	unsigned long pcie_index_hi = 0;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_index_hi_offset;
	void __iomem *pcie_data_offset;

	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
	if (pcie_index_hi != 0)
		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
				       pcie_index_hi * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	if (pcie_index_hi != 0) {
		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);

	/* clear the high bits */
	if (pcie_index_hi != 0) {
		writel(0, pcie_index_hi_offset);
		readl(pcie_index_hi_offset);
	}

	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_get_rev_id - query device rev_id
 *
 * @adev: amdgpu_device pointer
 *
 * Return device rev_id
 */
u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
{
	return adev->nbio.funcs->get_rev_id(adev);
}

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	dev_err(adev->dev, "Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
{
	dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write register 0x%04X with 0x%08X\n", reg,
		v);
	BUG();
}

static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write register 0x%llX with 0x%08X\n", reg,
		v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	dev_err(adev->dev, "Invalid callback to read 64 bit register 0x%04X\n",
		reg);
	BUG();
	return 0;
}

static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
{
	dev_err(adev->dev, "Invalid callback to read register 0x%llX\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		reg, v);
	BUG();
}

static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
{
	dev_err(adev->dev,
		"Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
		reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function. Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	dev_err(adev->dev,
		"Invalid callback to read register 0x%04X in block 0x%04X\n",
		reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function. Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	dev_err(adev->dev,
		"Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		reg, block, v);
	BUG();
}

static uint32_t amdgpu_device_get_vbios_flags(struct amdgpu_device *adev)
{
	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
		return AMDGPU_VBIOS_SKIP;

	if (hweight32(adev->aid_mask) && amdgpu_passthrough(adev))
		return AMDGPU_VBIOS_OPTIONAL;

	return 0;
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	uint32_t flags;
	bool optional;
	int ret;

	amdgpu_asic_pre_asic_init(adev);
	flags = amdgpu_device_get_vbios_flags(adev);
	optional = !!(flags & (AMDGPU_VBIOS_OPTIONAL | AMDGPU_VBIOS_SKIP));

	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
		amdgpu_psp_wait_for_bootloader(adev);
		if (optional && !adev->bios)
			return 0;

		ret = amdgpu_atomfirmware_asic_init(adev, true);
		return ret;
	} else {
		if (optional && !adev->bios)
			return 0;

		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
	}

	return 0;
}

/**
 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
				       AMDGPU_GEM_DOMAIN_VRAM |
				       AMDGPU_GEM_DOMAIN_GTT,
				       &adev->mem_scratch.robj,
				       &adev->mem_scratch.gpu_addr,
				       (void **)&adev->mem_scratch.ptr);
}

/**
 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}
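
/*
 * Illustrative example only (register names and values are made up): each
 * entry of the array is a { register offset, AND mask, OR mask } triplet.
 * An AND mask of 0xffffffff writes the OR mask directly; any other AND mask
 * selects the bits to be replaced in a read-modify-write.
 *
 *	static const u32 example_golden_settings[] = {
 *		mmEXAMPLE_REG_A, 0xffffffff, 0x00001234,
 *		mmEXAMPLE_REG_B, 0x0000ff00, 0x00002100,
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev, example_golden_settings,
 *						ARRAY_SIZE(example_golden_settings));
 */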
/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/**
 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
 */
int amdgpu_device_pci_reset(struct amdgpu_device *adev)
{
	return pci_reset_function(adev->pdev);
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or an -error on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long flags, offset;

	spin_lock_irqsave(&adev->wb.lock, flags);
	offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		spin_unlock_irqrestore(&adev->wb.lock, flags);
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	unsigned long flags;

	wb >>= 3;
	spin_lock_irqsave(&adev->wb.lock, flags);
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
	spin_unlock_irqrestore(&adev->wb.lock, flags);
}
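
/*
 * Typical usage of a writeback slot (illustrative sketch; use_slot() stands
 * in for whatever consumes the slot): the index returned by
 * amdgpu_device_wb_get() is already a dword offset into the writeback page,
 * so it indexes adev->wb.wb directly and the GPU address of the slot is
 * adev->wb.gpu_addr + index * 4.
 *
 *	u32 wb;
 *
 *	if (!amdgpu_device_wb_get(adev, &wb)) {
 *		volatile u32 *cpu_addr = &adev->wb.wb[wb];
 *		u64 gpu_addr = adev->wb.gpu_addr + wb * 4;
 *
 *		use_slot(gpu_addr, cpu_addr);
 *		amdgpu_device_wb_free(adev, wb);
 *	}
 */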
/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
	struct pci_bus *root;
	struct resource *res;
	unsigned int i;
	u16 cmd;
	int r;

	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
		return 0;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	if (!amdgpu_rebar)
		return 0;

	/* resizing on Dell G5 SE platforms causes problems with runtime pm */
	if ((amdgpu_runtime_pm != 0) &&
	    adev->pdev->vendor == PCI_VENDOR_ID_ATI &&
	    adev->pdev->device == 0x731f &&
	    adev->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL)
		return 0;

	/* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
	if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
		dev_warn(
			adev->dev,
			"System can't access extended configuration space, please check!!\n");

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Limit the BAR size to what is available */
	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
			rbar_size);

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		dev_info(adev->dev,
			 "Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		dev_err(adev->dev, "Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

/*
 * GPU helper functions.
 */
/**
 * amdgpu_device_need_post - check if the hw needs post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or post is needed if hw reset is performed.
 * Returns true if need or false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg, flags;

	if (amdgpu_sriov_vf(adev))
		return false;

	flags = amdgpu_device_get_vbios_flags(adev);
	if (flags & AMDGPU_VBIOS_SKIP)
		return false;
	if ((flags & AMDGPU_VBIOS_OPTIONAL) && !adev->bios)
		return false;

	if (amdgpu_passthrough(adev)) {
		/* for FIJI: In whole-GPU pass-through virtualization, after a VM reboot
		 * some old SMC firmware still needs the driver to do a vPost, otherwise
		 * the GPU hangs. SMC firmware versions above 22.15 don't have this flaw,
		 * so force vPost for SMC versions below 22.15.
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			release_firmware(adev->pm.fw);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	/* Don't post if we need to reset whole hive on init */
	if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
		return false;

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/*
 * Check whether seamless boot is supported.
 *
 * So far we only support seamless boot on DCE 3.0 or later.
 * If users report that it works on older ASICS as well, we may
 * loosen this.
 */
bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
{
	switch (amdgpu_seamless) {
	case -1:
		break;
	case 1:
		return true;
	case 0:
		return false;
	default:
		dev_err(adev->dev, "Invalid value for amdgpu.seamless: %d\n",
			amdgpu_seamless);
		return false;
	}

	if (!(adev->flags & AMD_IS_APU))
		return false;

	if (adev->mman.keep_stolen_vga_memory)
		return false;

	return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
}

/*
 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
 * don't support dynamic speed switching. Until we have confirmation from Intel
 * that a specific host supports it, it's safer that we keep it disabled for all.
 *
 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
 */
static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	/* eGPUs change speed based on USB4 fabric conditions */
	if (dev_is_removable(adev->dev))
		return true;

	if (c->x86_vendor == X86_VENDOR_INTEL)
		return false;
#endif
	return true;
}

static bool amdgpu_device_aspm_support_quirk(struct amdgpu_device *adev)
{
#if IS_ENABLED(CONFIG_X86)
	struct cpuinfo_x86 *c = &cpu_data(0);

	if (!(amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 0) ||
	      amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 1)))
		return false;

	if (c->x86 == 6 &&
	    adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5) {
		switch (c->x86_model) {
		case VFM_MODEL(INTEL_ALDERLAKE):
		case VFM_MODEL(INTEL_ALDERLAKE_L):
		case VFM_MODEL(INTEL_RAPTORLAKE):
		case VFM_MODEL(INTEL_RAPTORLAKE_P):
		case VFM_MODEL(INTEL_RAPTORLAKE_S):
			return true;
		default:
			return false;
		}
	} else {
		return false;
	}
#else
	return false;
#endif
}

/**
 * amdgpu_device_should_use_aspm - check if the device should program ASPM
 *
 * @adev: amdgpu_device pointer
 *
 * Confirm whether the module parameter and pcie bridge agree that ASPM should
 * be set for this device.
 *
 * Returns true if it should be used or false if not.
 */
bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
{
	switch (amdgpu_aspm) {
	case -1:
		break;
	case 0:
		return false;
	case 1:
		return true;
	default:
		return false;
	}
	if (adev->flags & AMD_IS_APU)
		return false;
	if (amdgpu_device_aspm_support_quirk(adev))
		return false;
	return pcie_aspm_enabled(adev->pdev);
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @pdev: PCI device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
						 bool state)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));

	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}
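
/*
 * Worked example for the VM block size check below (illustrative): with a
 * 4 KiB page there is a 12 bit in-page offset, so amdgpu_vm_block_size = 9
 * means each last-level page table maps 2^9 pages = 2 MiB of address space,
 * and the remaining VA bits are handled by the page directory levels.
 */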
1972 */ 1973 static void amdgpu_device_check_block_size(struct amdgpu_device *adev) 1974 { 1975 /* defines number of bits in page table versus page directory, 1976 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the 1977 * page table and the remaining bits are in the page directory 1978 */ 1979 if (amdgpu_vm_block_size == -1) 1980 return; 1981 1982 if (amdgpu_vm_block_size < 9) { 1983 dev_warn(adev->dev, "VM page table size (%d) too small\n", 1984 amdgpu_vm_block_size); 1985 amdgpu_vm_block_size = -1; 1986 } 1987 } 1988 1989 /** 1990 * amdgpu_device_check_vm_size - validate the vm size 1991 * 1992 * @adev: amdgpu_device pointer 1993 * 1994 * Validates the vm size in GB specified via module parameter. 1995 * The VM size is the size of the GPU virtual memory space in GB. 1996 */ 1997 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev) 1998 { 1999 /* no need to check the default value */ 2000 if (amdgpu_vm_size == -1) 2001 return; 2002 2003 if (amdgpu_vm_size < 1) { 2004 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n", 2005 amdgpu_vm_size); 2006 amdgpu_vm_size = -1; 2007 } 2008 } 2009 2010 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) 2011 { 2012 struct sysinfo si; 2013 bool is_os_64 = (sizeof(void *) == 8); 2014 uint64_t total_memory; 2015 uint64_t dram_size_seven_GB = 0x1B8000000; 2016 uint64_t dram_size_three_GB = 0xB8000000; 2017 2018 if (amdgpu_smu_memory_pool_size == 0) 2019 return; 2020 2021 if (!is_os_64) { 2022 dev_warn(adev->dev, "Not 64-bit OS, feature not supported\n"); 2023 goto def_value; 2024 } 2025 si_meminfo(&si); 2026 total_memory = (uint64_t)si.totalram * si.mem_unit; 2027 2028 if ((amdgpu_smu_memory_pool_size == 1) || 2029 (amdgpu_smu_memory_pool_size == 2)) { 2030 if (total_memory < dram_size_three_GB) 2031 goto def_value1; 2032 } else if ((amdgpu_smu_memory_pool_size == 4) || 2033 (amdgpu_smu_memory_pool_size == 8)) { 2034 if (total_memory < dram_size_seven_GB) 2035 goto def_value1; 2036 } else { 2037 dev_warn(adev->dev, "Smu memory pool size not supported\n"); 2038 goto def_value; 2039 } 2040 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28; 2041 2042 return; 2043 2044 def_value1: 2045 dev_warn(adev->dev, "No enough system memory\n"); 2046 def_value: 2047 adev->pm.smu_prv_buffer_size = 0; 2048 } 2049 2050 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev) 2051 { 2052 if (!(adev->flags & AMD_IS_APU) || 2053 adev->asic_type < CHIP_RAVEN) 2054 return 0; 2055 2056 switch (adev->asic_type) { 2057 case CHIP_RAVEN: 2058 if (adev->pdev->device == 0x15dd) 2059 adev->apu_flags |= AMD_APU_IS_RAVEN; 2060 if (adev->pdev->device == 0x15d8) 2061 adev->apu_flags |= AMD_APU_IS_PICASSO; 2062 break; 2063 case CHIP_RENOIR: 2064 if ((adev->pdev->device == 0x1636) || 2065 (adev->pdev->device == 0x164c)) 2066 adev->apu_flags |= AMD_APU_IS_RENOIR; 2067 else 2068 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; 2069 break; 2070 case CHIP_VANGOGH: 2071 adev->apu_flags |= AMD_APU_IS_VANGOGH; 2072 break; 2073 case CHIP_YELLOW_CARP: 2074 break; 2075 case CHIP_CYAN_SKILLFISH: 2076 if ((adev->pdev->device == 0x13FE) || 2077 (adev->pdev->device == 0x143F)) 2078 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2; 2079 break; 2080 default: 2081 break; 2082 } 2083 2084 return 0; 2085 } 2086 2087 /** 2088 * amdgpu_device_check_arguments - validate module params 2089 * 2090 * @adev: amdgpu_device pointer 2091 * 2092 * Validates certain module parameters and updates 2093 * the associated values used by the 
driver (all asics). 2094 */ 2095 static int amdgpu_device_check_arguments(struct amdgpu_device *adev) 2096 { 2097 int i; 2098 2099 if (amdgpu_sched_jobs < 4) { 2100 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", 2101 amdgpu_sched_jobs); 2102 amdgpu_sched_jobs = 4; 2103 } else if (!is_power_of_2(amdgpu_sched_jobs)) { 2104 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", 2105 amdgpu_sched_jobs); 2106 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); 2107 } 2108 2109 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) { 2110 /* gart size must be greater or equal to 32M */ 2111 dev_warn(adev->dev, "gart size (%d) too small\n", 2112 amdgpu_gart_size); 2113 amdgpu_gart_size = -1; 2114 } 2115 2116 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) { 2117 /* gtt size must be greater or equal to 32M */ 2118 dev_warn(adev->dev, "gtt size (%d) too small\n", 2119 amdgpu_gtt_size); 2120 amdgpu_gtt_size = -1; 2121 } 2122 2123 /* valid range is between 4 and 9 inclusive */ 2124 if (amdgpu_vm_fragment_size != -1 && 2125 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) { 2126 dev_warn(adev->dev, "valid range is between 4 and 9\n"); 2127 amdgpu_vm_fragment_size = -1; 2128 } 2129 2130 if (amdgpu_sched_hw_submission < 2) { 2131 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", 2132 amdgpu_sched_hw_submission); 2133 amdgpu_sched_hw_submission = 2; 2134 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { 2135 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", 2136 amdgpu_sched_hw_submission); 2137 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); 2138 } 2139 2140 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) { 2141 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n"); 2142 amdgpu_reset_method = -1; 2143 } 2144 2145 amdgpu_device_check_smu_prv_buffer_size(adev); 2146 2147 amdgpu_device_check_vm_size(adev); 2148 2149 amdgpu_device_check_block_size(adev); 2150 2151 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); 2152 2153 for (i = 0; i < MAX_XCP; i++) { 2154 switch (amdgpu_enforce_isolation) { 2155 case -1: 2156 case 0: 2157 default: 2158 /* disable */ 2159 adev->enforce_isolation[i] = AMDGPU_ENFORCE_ISOLATION_DISABLE; 2160 break; 2161 case 1: 2162 /* enable */ 2163 adev->enforce_isolation[i] = 2164 AMDGPU_ENFORCE_ISOLATION_ENABLE; 2165 break; 2166 case 2: 2167 /* enable legacy mode */ 2168 adev->enforce_isolation[i] = 2169 AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY; 2170 break; 2171 case 3: 2172 /* enable only process isolation without submitting cleaner shader */ 2173 adev->enforce_isolation[i] = 2174 AMDGPU_ENFORCE_ISOLATION_NO_CLEANER_SHADER; 2175 break; 2176 } 2177 } 2178 2179 return 0; 2180 } 2181 2182 /** 2183 * amdgpu_switcheroo_set_state - set switcheroo state 2184 * 2185 * @pdev: pci dev pointer 2186 * @state: vga_switcheroo state 2187 * 2188 * Callback for the switcheroo driver. Suspends or resumes 2189 * the asics before or after it is powered up using ACPI methods. 
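 *
 * Illustrative registration sketch (the exact call site and arguments used
 * by the driver live elsewhere in this file and may differ): the callback is
 * driven by the vga_switcheroo framework once the ops table defined below
 * has been registered, roughly as:
 *
 *   vga_switcheroo_register_client(adev->pdev, &amdgpu_switcheroo_ops,
 *                                  amdgpu_device_supports_px(adev));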
2190 */ 2191 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, 2192 enum vga_switcheroo_state state) 2193 { 2194 struct drm_device *dev = pci_get_drvdata(pdev); 2195 int r; 2196 2197 if (amdgpu_device_supports_px(drm_to_adev(dev)) && 2198 state == VGA_SWITCHEROO_OFF) 2199 return; 2200 2201 if (state == VGA_SWITCHEROO_ON) { 2202 pr_info("switched on\n"); 2203 /* don't suspend or resume card normally */ 2204 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2205 2206 pci_set_power_state(pdev, PCI_D0); 2207 amdgpu_device_load_pci_state(pdev); 2208 r = pci_enable_device(pdev); 2209 if (r) 2210 dev_warn(&pdev->dev, "pci_enable_device failed (%d)\n", 2211 r); 2212 amdgpu_device_resume(dev, true); 2213 2214 dev->switch_power_state = DRM_SWITCH_POWER_ON; 2215 } else { 2216 dev_info(&pdev->dev, "switched off\n"); 2217 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; 2218 amdgpu_device_prepare(dev); 2219 amdgpu_device_suspend(dev, true); 2220 amdgpu_device_cache_pci_state(pdev); 2221 /* Shut down the device */ 2222 pci_disable_device(pdev); 2223 pci_set_power_state(pdev, PCI_D3cold); 2224 dev->switch_power_state = DRM_SWITCH_POWER_OFF; 2225 } 2226 } 2227 2228 /** 2229 * amdgpu_switcheroo_can_switch - see if switcheroo state can change 2230 * 2231 * @pdev: pci dev pointer 2232 * 2233 * Callback for the switcheroo driver. Check of the switcheroo 2234 * state can be changed. 2235 * Returns true if the state can be changed, false if not. 2236 */ 2237 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) 2238 { 2239 struct drm_device *dev = pci_get_drvdata(pdev); 2240 2241 /* 2242 * FIXME: open_count is protected by drm_global_mutex but that would lead to 2243 * locking inversion with the driver load path. And the access here is 2244 * completely racy anyway. So don't bother with locking for now. 2245 */ 2246 return atomic_read(&dev->open_count) == 0; 2247 } 2248 2249 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = { 2250 .set_gpu_state = amdgpu_switcheroo_set_state, 2251 .reprobe = NULL, 2252 .can_switch = amdgpu_switcheroo_can_switch, 2253 }; 2254 2255 /** 2256 * amdgpu_device_ip_set_clockgating_state - set the CG state 2257 * 2258 * @dev: amdgpu_device pointer 2259 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2260 * @state: clockgating state (gate or ungate) 2261 * 2262 * Sets the requested clockgating state for all instances of 2263 * the hardware IP specified. 2264 * Returns the error code from the last instance. 2265 */ 2266 int amdgpu_device_ip_set_clockgating_state(void *dev, 2267 enum amd_ip_block_type block_type, 2268 enum amd_clockgating_state state) 2269 { 2270 struct amdgpu_device *adev = dev; 2271 int i, r = 0; 2272 2273 for (i = 0; i < adev->num_ip_blocks; i++) { 2274 if (!adev->ip_blocks[i].status.valid) 2275 continue; 2276 if (adev->ip_blocks[i].version->type != block_type) 2277 continue; 2278 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state) 2279 continue; 2280 r = adev->ip_blocks[i].version->funcs->set_clockgating_state( 2281 &adev->ip_blocks[i], state); 2282 if (r) 2283 dev_err(adev->dev, 2284 "set_clockgating_state of IP block <%s> failed %d\n", 2285 adev->ip_blocks[i].version->funcs->name, r); 2286 } 2287 return r; 2288 } 2289 2290 /** 2291 * amdgpu_device_ip_set_powergating_state - set the PG state 2292 * 2293 * @dev: amdgpu_device pointer 2294 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 
2295 * @state: powergating state (gate or ungate) 2296 * 2297 * Sets the requested powergating state for all instances of 2298 * the hardware IP specified. 2299 * Returns the error code from the last instance. 2300 */ 2301 int amdgpu_device_ip_set_powergating_state(void *dev, 2302 enum amd_ip_block_type block_type, 2303 enum amd_powergating_state state) 2304 { 2305 struct amdgpu_device *adev = dev; 2306 int i, r = 0; 2307 2308 for (i = 0; i < adev->num_ip_blocks; i++) { 2309 if (!adev->ip_blocks[i].status.valid) 2310 continue; 2311 if (adev->ip_blocks[i].version->type != block_type) 2312 continue; 2313 if (!adev->ip_blocks[i].version->funcs->set_powergating_state) 2314 continue; 2315 r = adev->ip_blocks[i].version->funcs->set_powergating_state( 2316 &adev->ip_blocks[i], state); 2317 if (r) 2318 dev_err(adev->dev, 2319 "set_powergating_state of IP block <%s> failed %d\n", 2320 adev->ip_blocks[i].version->funcs->name, r); 2321 } 2322 return r; 2323 } 2324 2325 /** 2326 * amdgpu_device_ip_get_clockgating_state - get the CG state 2327 * 2328 * @adev: amdgpu_device pointer 2329 * @flags: clockgating feature flags 2330 * 2331 * Walks the list of IPs on the device and updates the clockgating 2332 * flags for each IP. 2333 * Updates @flags with the feature flags for each hardware IP where 2334 * clockgating is enabled. 2335 */ 2336 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev, 2337 u64 *flags) 2338 { 2339 int i; 2340 2341 for (i = 0; i < adev->num_ip_blocks; i++) { 2342 if (!adev->ip_blocks[i].status.valid) 2343 continue; 2344 if (adev->ip_blocks[i].version->funcs->get_clockgating_state) 2345 adev->ip_blocks[i].version->funcs->get_clockgating_state( 2346 &adev->ip_blocks[i], flags); 2347 } 2348 } 2349 2350 /** 2351 * amdgpu_device_ip_wait_for_idle - wait for idle 2352 * 2353 * @adev: amdgpu_device pointer 2354 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2355 * 2356 * Waits for the request hardware IP to be idle. 2357 * Returns 0 for success or a negative error code on failure. 2358 */ 2359 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev, 2360 enum amd_ip_block_type block_type) 2361 { 2362 int i, r; 2363 2364 for (i = 0; i < adev->num_ip_blocks; i++) { 2365 if (!adev->ip_blocks[i].status.valid) 2366 continue; 2367 if (adev->ip_blocks[i].version->type == block_type) { 2368 if (adev->ip_blocks[i].version->funcs->wait_for_idle) { 2369 r = adev->ip_blocks[i].version->funcs->wait_for_idle( 2370 &adev->ip_blocks[i]); 2371 if (r) 2372 return r; 2373 } 2374 break; 2375 } 2376 } 2377 return 0; 2378 2379 } 2380 2381 /** 2382 * amdgpu_device_ip_is_valid - is the hardware IP enabled 2383 * 2384 * @adev: amdgpu_device pointer 2385 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.) 2386 * 2387 * Check if the hardware IP is enable or not. 2388 * Returns true if it the IP is enable, false if not. 2389 */ 2390 bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev, 2391 enum amd_ip_block_type block_type) 2392 { 2393 int i; 2394 2395 for (i = 0; i < adev->num_ip_blocks; i++) { 2396 if (adev->ip_blocks[i].version->type == block_type) 2397 return adev->ip_blocks[i].status.valid; 2398 } 2399 return false; 2400 2401 } 2402 2403 /** 2404 * amdgpu_device_ip_get_ip_block - get a hw IP pointer 2405 * 2406 * @adev: amdgpu_device pointer 2407 * @type: Type of hardware IP (SMU, GFX, UVD, etc.) 2408 * 2409 * Returns a pointer to the hardware IP block structure 2410 * if it exists for the asic, otherwise NULL. 
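 *
 * Minimal usage sketch, mirroring how this helper is used later in this
 * file (with an explicit NULL check added for illustration):
 *
 *   struct amdgpu_ip_block *ip_block =
 *           amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *
 *   if (ip_block && ip_block->status.valid)
 *           amdgpu_amdkfd_device_probe(adev);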
2411 */ 2412 struct amdgpu_ip_block * 2413 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, 2414 enum amd_ip_block_type type) 2415 { 2416 int i; 2417 2418 for (i = 0; i < adev->num_ip_blocks; i++) 2419 if (adev->ip_blocks[i].version->type == type) 2420 return &adev->ip_blocks[i]; 2421 2422 return NULL; 2423 } 2424 2425 /** 2426 * amdgpu_device_ip_block_version_cmp 2427 * 2428 * @adev: amdgpu_device pointer 2429 * @type: enum amd_ip_block_type 2430 * @major: major version 2431 * @minor: minor version 2432 * 2433 * return 0 if equal or greater 2434 * return 1 if smaller or the ip_block doesn't exist 2435 */ 2436 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev, 2437 enum amd_ip_block_type type, 2438 u32 major, u32 minor) 2439 { 2440 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type); 2441 2442 if (ip_block && ((ip_block->version->major > major) || 2443 ((ip_block->version->major == major) && 2444 (ip_block->version->minor >= minor)))) 2445 return 0; 2446 2447 return 1; 2448 } 2449 2450 /** 2451 * amdgpu_device_ip_block_add 2452 * 2453 * @adev: amdgpu_device pointer 2454 * @ip_block_version: pointer to the IP to add 2455 * 2456 * Adds the IP block driver information to the collection of IPs 2457 * on the asic. 2458 */ 2459 int amdgpu_device_ip_block_add(struct amdgpu_device *adev, 2460 const struct amdgpu_ip_block_version *ip_block_version) 2461 { 2462 if (!ip_block_version) 2463 return -EINVAL; 2464 2465 switch (ip_block_version->type) { 2466 case AMD_IP_BLOCK_TYPE_VCN: 2467 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK) 2468 return 0; 2469 break; 2470 case AMD_IP_BLOCK_TYPE_JPEG: 2471 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK) 2472 return 0; 2473 break; 2474 default: 2475 break; 2476 } 2477 2478 dev_info(adev->dev, "detected ip block number %d <%s>\n", 2479 adev->num_ip_blocks, ip_block_version->funcs->name); 2480 2481 adev->ip_blocks[adev->num_ip_blocks].adev = adev; 2482 2483 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version; 2484 2485 return 0; 2486 } 2487 2488 /** 2489 * amdgpu_device_enable_virtual_display - enable virtual display feature 2490 * 2491 * @adev: amdgpu_device pointer 2492 * 2493 * Enabled the virtual display feature if the user has enabled it via 2494 * the module parameter virtual_display. This feature provides a virtual 2495 * display hardware on headless boards or in virtualized environments. 2496 * This function parses and validates the configuration string specified by 2497 * the user and configures the virtual display configuration (number of 2498 * virtual connectors, crtcs, etc.) specified. 
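 *
 * Illustrative configuration strings, following the parsing below (the PCI
 * address shown is only an example): entries are separated by ';', each
 * entry is a PCI address or "all", optionally followed by ",<num_crtc>"
 * where num_crtc is clamped to the range 1..6:
 *
 *   amdgpu.virtual_display=0000:03:00.0,2
 *   amdgpu.virtual_display=all,1
 *
 * When the crtc count is omitted or unparsable, one virtual crtc is used.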
2499 */ 2500 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) 2501 { 2502 adev->enable_virtual_display = false; 2503 2504 if (amdgpu_virtual_display) { 2505 const char *pci_address_name = pci_name(adev->pdev); 2506 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; 2507 2508 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL); 2509 pciaddstr_tmp = pciaddstr; 2510 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) { 2511 pciaddname = strsep(&pciaddname_tmp, ","); 2512 if (!strcmp("all", pciaddname) 2513 || !strcmp(pci_address_name, pciaddname)) { 2514 long num_crtc; 2515 int res = -1; 2516 2517 adev->enable_virtual_display = true; 2518 2519 if (pciaddname_tmp) 2520 res = kstrtol(pciaddname_tmp, 10, 2521 &num_crtc); 2522 2523 if (!res) { 2524 if (num_crtc < 1) 2525 num_crtc = 1; 2526 if (num_crtc > 6) 2527 num_crtc = 6; 2528 adev->mode_info.num_crtc = num_crtc; 2529 } else { 2530 adev->mode_info.num_crtc = 1; 2531 } 2532 break; 2533 } 2534 } 2535 2536 dev_info( 2537 adev->dev, 2538 "virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n", 2539 amdgpu_virtual_display, pci_address_name, 2540 adev->enable_virtual_display, adev->mode_info.num_crtc); 2541 2542 kfree(pciaddstr); 2543 } 2544 } 2545 2546 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev) 2547 { 2548 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) { 2549 adev->mode_info.num_crtc = 1; 2550 adev->enable_virtual_display = true; 2551 dev_info(adev->dev, "virtual_display:%d, num_crtc:%d\n", 2552 adev->enable_virtual_display, 2553 adev->mode_info.num_crtc); 2554 } 2555 } 2556 2557 /** 2558 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware 2559 * 2560 * @adev: amdgpu_device pointer 2561 * 2562 * Parses the asic configuration parameters specified in the gpu info 2563 * firmware and makes them available to the driver for use in configuring 2564 * the asic. 2565 * Returns 0 on success, -EINVAL on failure. 
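 *
 * For reference, the request below resolves to "amdgpu/<chip>_gpu_info.bin",
 * e.g. "amdgpu/vega10_gpu_info.bin" for VEGA10 or
 * "amdgpu/raven2_gpu_info.bin" for a Raven2 based APU.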
2566 */ 2567 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) 2568 { 2569 const char *chip_name; 2570 int err; 2571 const struct gpu_info_firmware_header_v1_0 *hdr; 2572 2573 adev->firmware.gpu_info_fw = NULL; 2574 2575 switch (adev->asic_type) { 2576 default: 2577 return 0; 2578 case CHIP_VEGA10: 2579 chip_name = "vega10"; 2580 break; 2581 case CHIP_VEGA12: 2582 chip_name = "vega12"; 2583 break; 2584 case CHIP_RAVEN: 2585 if (adev->apu_flags & AMD_APU_IS_RAVEN2) 2586 chip_name = "raven2"; 2587 else if (adev->apu_flags & AMD_APU_IS_PICASSO) 2588 chip_name = "picasso"; 2589 else 2590 chip_name = "raven"; 2591 break; 2592 case CHIP_ARCTURUS: 2593 chip_name = "arcturus"; 2594 break; 2595 case CHIP_NAVI12: 2596 if (adev->mman.discovery_bin) 2597 return 0; 2598 chip_name = "navi12"; 2599 break; 2600 } 2601 2602 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, 2603 AMDGPU_UCODE_OPTIONAL, 2604 "amdgpu/%s_gpu_info.bin", chip_name); 2605 if (err) { 2606 dev_err(adev->dev, 2607 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n", 2608 chip_name); 2609 goto out; 2610 } 2611 2612 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data; 2613 amdgpu_ucode_print_gpu_info_hdr(&hdr->header); 2614 2615 switch (hdr->version_major) { 2616 case 1: 2617 { 2618 const struct gpu_info_firmware_v1_0 *gpu_info_fw = 2619 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + 2620 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2621 2622 /* 2623 * Should be dropped when DAL no longer needs it. 2624 */ 2625 if (adev->asic_type == CHIP_NAVI12) 2626 goto parse_soc_bounding_box; 2627 2628 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); 2629 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); 2630 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); 2631 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se); 2632 adev->gfx.config.max_texture_channel_caches = 2633 le32_to_cpu(gpu_info_fw->gc_num_tccs); 2634 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs); 2635 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds); 2636 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth); 2637 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth); 2638 adev->gfx.config.double_offchip_lds_buf = 2639 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer); 2640 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size); 2641 adev->gfx.cu_info.max_waves_per_simd = 2642 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd); 2643 adev->gfx.cu_info.max_scratch_slots_per_cu = 2644 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu); 2645 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size); 2646 if (hdr->version_minor >= 1) { 2647 const struct gpu_info_firmware_v1_1 *gpu_info_fw = 2648 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data + 2649 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2650 adev->gfx.config.num_sc_per_sh = 2651 le32_to_cpu(gpu_info_fw->num_sc_per_sh); 2652 adev->gfx.config.num_packer_per_sc = 2653 le32_to_cpu(gpu_info_fw->num_packer_per_sc); 2654 } 2655 2656 parse_soc_bounding_box: 2657 /* 2658 * soc bounding box info is not integrated in disocovery table, 2659 * we always need to parse it from gpu info firmware if needed. 
2660 */ 2661 if (hdr->version_minor == 2) { 2662 const struct gpu_info_firmware_v1_2 *gpu_info_fw = 2663 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + 2664 le32_to_cpu(hdr->header.ucode_array_offset_bytes)); 2665 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box; 2666 } 2667 break; 2668 } 2669 default: 2670 dev_err(adev->dev, 2671 "Unsupported gpu_info table %d\n", hdr->header.ucode_version); 2672 err = -EINVAL; 2673 goto out; 2674 } 2675 out: 2676 return err; 2677 } 2678 2679 static void amdgpu_uid_init(struct amdgpu_device *adev) 2680 { 2681 /* Initialize the UID for the device */ 2682 adev->uid_info = kzalloc(sizeof(struct amdgpu_uid), GFP_KERNEL); 2683 if (!adev->uid_info) { 2684 dev_warn(adev->dev, "Failed to allocate memory for UID\n"); 2685 return; 2686 } 2687 adev->uid_info->adev = adev; 2688 } 2689 2690 static void amdgpu_uid_fini(struct amdgpu_device *adev) 2691 { 2692 /* Free the UID memory */ 2693 kfree(adev->uid_info); 2694 adev->uid_info = NULL; 2695 } 2696 2697 /** 2698 * amdgpu_device_ip_early_init - run early init for hardware IPs 2699 * 2700 * @adev: amdgpu_device pointer 2701 * 2702 * Early initialization pass for hardware IPs. The hardware IPs that make 2703 * up each asic are discovered each IP's early_init callback is run. This 2704 * is the first stage in initializing the asic. 2705 * Returns 0 on success, negative error code on failure. 2706 */ 2707 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) 2708 { 2709 struct amdgpu_ip_block *ip_block; 2710 struct pci_dev *parent; 2711 bool total, skip_bios; 2712 uint32_t bios_flags; 2713 int i, r; 2714 2715 amdgpu_device_enable_virtual_display(adev); 2716 2717 if (amdgpu_sriov_vf(adev)) { 2718 r = amdgpu_virt_request_full_gpu(adev, true); 2719 if (r) 2720 return r; 2721 } 2722 2723 switch (adev->asic_type) { 2724 #ifdef CONFIG_DRM_AMDGPU_SI 2725 case CHIP_VERDE: 2726 case CHIP_TAHITI: 2727 case CHIP_PITCAIRN: 2728 case CHIP_OLAND: 2729 case CHIP_HAINAN: 2730 adev->family = AMDGPU_FAMILY_SI; 2731 r = si_set_ip_blocks(adev); 2732 if (r) 2733 return r; 2734 break; 2735 #endif 2736 #ifdef CONFIG_DRM_AMDGPU_CIK 2737 case CHIP_BONAIRE: 2738 case CHIP_HAWAII: 2739 case CHIP_KAVERI: 2740 case CHIP_KABINI: 2741 case CHIP_MULLINS: 2742 if (adev->flags & AMD_IS_APU) 2743 adev->family = AMDGPU_FAMILY_KV; 2744 else 2745 adev->family = AMDGPU_FAMILY_CI; 2746 2747 r = cik_set_ip_blocks(adev); 2748 if (r) 2749 return r; 2750 break; 2751 #endif 2752 case CHIP_TOPAZ: 2753 case CHIP_TONGA: 2754 case CHIP_FIJI: 2755 case CHIP_POLARIS10: 2756 case CHIP_POLARIS11: 2757 case CHIP_POLARIS12: 2758 case CHIP_VEGAM: 2759 case CHIP_CARRIZO: 2760 case CHIP_STONEY: 2761 if (adev->flags & AMD_IS_APU) 2762 adev->family = AMDGPU_FAMILY_CZ; 2763 else 2764 adev->family = AMDGPU_FAMILY_VI; 2765 2766 r = vi_set_ip_blocks(adev); 2767 if (r) 2768 return r; 2769 break; 2770 default: 2771 r = amdgpu_discovery_set_ip_blocks(adev); 2772 if (r) 2773 return r; 2774 break; 2775 } 2776 2777 /* Check for IP version 9.4.3 with A0 hardware */ 2778 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) && 2779 !amdgpu_device_get_rev_id(adev)) { 2780 dev_err(adev->dev, "Unsupported A0 hardware\n"); 2781 return -ENODEV; /* device unsupported - no device error */ 2782 } 2783 2784 if (amdgpu_has_atpx() && 2785 (amdgpu_is_atpx_hybrid() || 2786 amdgpu_has_atpx_dgpu_power_cntl()) && 2787 ((adev->flags & AMD_IS_APU) == 0) && 2788 !dev_is_removable(&adev->pdev->dev)) 2789 adev->flags |= AMD_IS_PX; 2790 2791 if 
(!(adev->flags & AMD_IS_APU)) { 2792 parent = pcie_find_root_port(adev->pdev); 2793 adev->has_pr3 = parent ? pci_pr3_present(parent) : false; 2794 } 2795 2796 adev->pm.pp_feature = amdgpu_pp_feature_mask; 2797 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) 2798 adev->pm.pp_feature &= ~PP_GFXOFF_MASK; 2799 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) 2800 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; 2801 if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) 2802 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; 2803 2804 adev->virt.is_xgmi_node_migrate_enabled = false; 2805 if (amdgpu_sriov_vf(adev)) { 2806 adev->virt.is_xgmi_node_migrate_enabled = 2807 amdgpu_ip_version((adev), GC_HWIP, 0) == IP_VERSION(9, 4, 4); 2808 } 2809 2810 total = true; 2811 for (i = 0; i < adev->num_ip_blocks; i++) { 2812 ip_block = &adev->ip_blocks[i]; 2813 2814 if ((amdgpu_ip_block_mask & (1 << i)) == 0) { 2815 dev_warn(adev->dev, "disabled ip block: %d <%s>\n", i, 2816 adev->ip_blocks[i].version->funcs->name); 2817 adev->ip_blocks[i].status.valid = false; 2818 } else if (ip_block->version->funcs->early_init) { 2819 r = ip_block->version->funcs->early_init(ip_block); 2820 if (r == -ENOENT) { 2821 adev->ip_blocks[i].status.valid = false; 2822 } else if (r) { 2823 dev_err(adev->dev, 2824 "early_init of IP block <%s> failed %d\n", 2825 adev->ip_blocks[i].version->funcs->name, 2826 r); 2827 total = false; 2828 } else { 2829 adev->ip_blocks[i].status.valid = true; 2830 } 2831 } else { 2832 adev->ip_blocks[i].status.valid = true; 2833 } 2834 /* get the vbios after the asic_funcs are set up */ 2835 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 2836 r = amdgpu_device_parse_gpu_info_fw(adev); 2837 if (r) 2838 return r; 2839 2840 bios_flags = amdgpu_device_get_vbios_flags(adev); 2841 skip_bios = !!(bios_flags & AMDGPU_VBIOS_SKIP); 2842 /* Read BIOS */ 2843 if (!skip_bios) { 2844 bool optional = 2845 !!(bios_flags & AMDGPU_VBIOS_OPTIONAL); 2846 if (!amdgpu_get_bios(adev) && !optional) 2847 return -EINVAL; 2848 2849 if (optional && !adev->bios) 2850 dev_info( 2851 adev->dev, 2852 "VBIOS image optional, proceeding without VBIOS image"); 2853 2854 if (adev->bios) { 2855 r = amdgpu_atombios_init(adev); 2856 if (r) { 2857 dev_err(adev->dev, 2858 "amdgpu_atombios_init failed\n"); 2859 amdgpu_vf_error_put( 2860 adev, 2861 AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 2862 0, 0); 2863 return r; 2864 } 2865 } 2866 } 2867 2868 /*get pf2vf msg info at it's earliest time*/ 2869 if (amdgpu_sriov_vf(adev)) 2870 amdgpu_virt_init_data_exchange(adev); 2871 2872 } 2873 } 2874 if (!total) 2875 return -ENODEV; 2876 2877 if (adev->gmc.xgmi.supported) 2878 amdgpu_xgmi_early_init(adev); 2879 2880 if (amdgpu_is_multi_aid(adev)) 2881 amdgpu_uid_init(adev); 2882 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX); 2883 if (ip_block->status.valid != false) 2884 amdgpu_amdkfd_device_probe(adev); 2885 2886 adev->cg_flags &= amdgpu_cg_mask; 2887 adev->pg_flags &= amdgpu_pg_mask; 2888 2889 return 0; 2890 } 2891 2892 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev) 2893 { 2894 int i, r; 2895 2896 for (i = 0; i < adev->num_ip_blocks; i++) { 2897 if (!adev->ip_blocks[i].status.sw) 2898 continue; 2899 if (adev->ip_blocks[i].status.hw) 2900 continue; 2901 if (!amdgpu_ip_member_of_hwini( 2902 adev, adev->ip_blocks[i].version->type)) 2903 continue; 2904 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 2905 (amdgpu_sriov_vf(adev) && 
(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) || 2906 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { 2907 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2908 if (r) { 2909 dev_err(adev->dev, 2910 "hw_init of IP block <%s> failed %d\n", 2911 adev->ip_blocks[i].version->funcs->name, 2912 r); 2913 return r; 2914 } 2915 adev->ip_blocks[i].status.hw = true; 2916 } 2917 } 2918 2919 return 0; 2920 } 2921 2922 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev) 2923 { 2924 int i, r; 2925 2926 for (i = 0; i < adev->num_ip_blocks; i++) { 2927 if (!adev->ip_blocks[i].status.sw) 2928 continue; 2929 if (adev->ip_blocks[i].status.hw) 2930 continue; 2931 if (!amdgpu_ip_member_of_hwini( 2932 adev, adev->ip_blocks[i].version->type)) 2933 continue; 2934 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2935 if (r) { 2936 dev_err(adev->dev, 2937 "hw_init of IP block <%s> failed %d\n", 2938 adev->ip_blocks[i].version->funcs->name, r); 2939 return r; 2940 } 2941 adev->ip_blocks[i].status.hw = true; 2942 } 2943 2944 return 0; 2945 } 2946 2947 static int amdgpu_device_fw_loading(struct amdgpu_device *adev) 2948 { 2949 int r = 0; 2950 int i; 2951 uint32_t smu_version; 2952 2953 if (adev->asic_type >= CHIP_VEGA10) { 2954 for (i = 0; i < adev->num_ip_blocks; i++) { 2955 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP) 2956 continue; 2957 2958 if (!amdgpu_ip_member_of_hwini(adev, 2959 AMD_IP_BLOCK_TYPE_PSP)) 2960 break; 2961 2962 if (!adev->ip_blocks[i].status.sw) 2963 continue; 2964 2965 /* no need to do the fw loading again if already done*/ 2966 if (adev->ip_blocks[i].status.hw == true) 2967 break; 2968 2969 if (amdgpu_in_reset(adev) || adev->in_suspend) { 2970 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 2971 if (r) 2972 return r; 2973 } else { 2974 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 2975 if (r) { 2976 dev_err(adev->dev, 2977 "hw_init of IP block <%s> failed %d\n", 2978 adev->ip_blocks[i] 2979 .version->funcs->name, 2980 r); 2981 return r; 2982 } 2983 adev->ip_blocks[i].status.hw = true; 2984 } 2985 break; 2986 } 2987 } 2988 2989 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA) 2990 r = amdgpu_pm_load_smu_firmware(adev, &smu_version); 2991 2992 return r; 2993 } 2994 2995 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) 2996 { 2997 struct drm_sched_init_args args = { 2998 .ops = &amdgpu_sched_ops, 2999 .num_rqs = DRM_SCHED_PRIORITY_COUNT, 3000 .timeout_wq = adev->reset_domain->wq, 3001 .dev = adev->dev, 3002 }; 3003 long timeout; 3004 int r, i; 3005 3006 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 3007 struct amdgpu_ring *ring = adev->rings[i]; 3008 3009 /* No need to setup the GPU scheduler for rings that don't need it */ 3010 if (!ring || ring->no_scheduler) 3011 continue; 3012 3013 switch (ring->funcs->type) { 3014 case AMDGPU_RING_TYPE_GFX: 3015 timeout = adev->gfx_timeout; 3016 break; 3017 case AMDGPU_RING_TYPE_COMPUTE: 3018 timeout = adev->compute_timeout; 3019 break; 3020 case AMDGPU_RING_TYPE_SDMA: 3021 timeout = adev->sdma_timeout; 3022 break; 3023 default: 3024 timeout = adev->video_timeout; 3025 break; 3026 } 3027 3028 args.timeout = timeout; 3029 args.credit_limit = ring->num_hw_submission; 3030 args.score = ring->sched_score; 3031 args.name = ring->name; 3032 3033 r = drm_sched_init(&ring->sched, &args); 3034 if (r) { 3035 dev_err(adev->dev, 3036 "Failed to create scheduler on ring %s.\n", 3037 ring->name); 3038 return r; 3039 } 3040 
r = amdgpu_uvd_entity_init(adev, ring); 3041 if (r) { 3042 dev_err(adev->dev, 3043 "Failed to create UVD scheduling entity on ring %s.\n", 3044 ring->name); 3045 return r; 3046 } 3047 r = amdgpu_vce_entity_init(adev, ring); 3048 if (r) { 3049 dev_err(adev->dev, 3050 "Failed to create VCE scheduling entity on ring %s.\n", 3051 ring->name); 3052 return r; 3053 } 3054 } 3055 3056 if (adev->xcp_mgr) 3057 amdgpu_xcp_update_partition_sched_list(adev); 3058 3059 return 0; 3060 } 3061 3062 3063 /** 3064 * amdgpu_device_ip_init - run init for hardware IPs 3065 * 3066 * @adev: amdgpu_device pointer 3067 * 3068 * Main initialization pass for hardware IPs. The list of all the hardware 3069 * IPs that make up the asic is walked and the sw_init and hw_init callbacks 3070 * are run. sw_init initializes the software state associated with each IP 3071 * and hw_init initializes the hardware associated with each IP. 3072 * Returns 0 on success, negative error code on failure. 3073 */ 3074 static int amdgpu_device_ip_init(struct amdgpu_device *adev) 3075 { 3076 bool init_badpage; 3077 int i, r; 3078 3079 r = amdgpu_ras_init(adev); 3080 if (r) 3081 return r; 3082 3083 for (i = 0; i < adev->num_ip_blocks; i++) { 3084 if (!adev->ip_blocks[i].status.valid) 3085 continue; 3086 if (adev->ip_blocks[i].version->funcs->sw_init) { 3087 r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]); 3088 if (r) { 3089 dev_err(adev->dev, 3090 "sw_init of IP block <%s> failed %d\n", 3091 adev->ip_blocks[i].version->funcs->name, 3092 r); 3093 goto init_failed; 3094 } 3095 } 3096 adev->ip_blocks[i].status.sw = true; 3097 3098 if (!amdgpu_ip_member_of_hwini( 3099 adev, adev->ip_blocks[i].version->type)) 3100 continue; 3101 3102 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { 3103 /* need to do common hw init early so everything is set up for gmc */ 3104 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3105 if (r) { 3106 dev_err(adev->dev, "hw_init %d failed %d\n", i, 3107 r); 3108 goto init_failed; 3109 } 3110 adev->ip_blocks[i].status.hw = true; 3111 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3112 /* need to do gmc hw init early so we can allocate gpu mem */ 3113 /* Try to reserve bad pages early */ 3114 if (amdgpu_sriov_vf(adev)) 3115 amdgpu_virt_exchange_data(adev); 3116 3117 r = amdgpu_device_mem_scratch_init(adev); 3118 if (r) { 3119 dev_err(adev->dev, 3120 "amdgpu_mem_scratch_init failed %d\n", 3121 r); 3122 goto init_failed; 3123 } 3124 r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]); 3125 if (r) { 3126 dev_err(adev->dev, "hw_init %d failed %d\n", i, 3127 r); 3128 goto init_failed; 3129 } 3130 r = amdgpu_device_wb_init(adev); 3131 if (r) { 3132 dev_err(adev->dev, 3133 "amdgpu_device_wb_init failed %d\n", r); 3134 goto init_failed; 3135 } 3136 adev->ip_blocks[i].status.hw = true; 3137 3138 /* right after GMC hw init, we create CSA */ 3139 if (adev->gfx.mcbp) { 3140 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, 3141 AMDGPU_GEM_DOMAIN_VRAM | 3142 AMDGPU_GEM_DOMAIN_GTT, 3143 AMDGPU_CSA_SIZE); 3144 if (r) { 3145 dev_err(adev->dev, 3146 "allocate CSA failed %d\n", r); 3147 goto init_failed; 3148 } 3149 } 3150 3151 r = amdgpu_seq64_init(adev); 3152 if (r) { 3153 dev_err(adev->dev, "allocate seq64 failed %d\n", 3154 r); 3155 goto init_failed; 3156 } 3157 } 3158 } 3159 3160 if (amdgpu_sriov_vf(adev)) 3161 amdgpu_virt_init_data_exchange(adev); 3162 3163 r = amdgpu_ib_pool_init(adev); 3164 if (r) { 3165 dev_err(adev->dev, 
"IB initialization failed (%d).\n", r); 3166 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r); 3167 goto init_failed; 3168 } 3169 3170 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/ 3171 if (r) 3172 goto init_failed; 3173 3174 r = amdgpu_device_ip_hw_init_phase1(adev); 3175 if (r) 3176 goto init_failed; 3177 3178 r = amdgpu_device_fw_loading(adev); 3179 if (r) 3180 goto init_failed; 3181 3182 r = amdgpu_device_ip_hw_init_phase2(adev); 3183 if (r) 3184 goto init_failed; 3185 3186 /* 3187 * retired pages will be loaded from eeprom and reserved here, 3188 * it should be called after amdgpu_device_ip_hw_init_phase2 since 3189 * for some ASICs the RAS EEPROM code relies on SMU fully functioning 3190 * for I2C communication which only true at this point. 3191 * 3192 * amdgpu_ras_recovery_init may fail, but the upper only cares the 3193 * failure from bad gpu situation and stop amdgpu init process 3194 * accordingly. For other failed cases, it will still release all 3195 * the resource and print error message, rather than returning one 3196 * negative value to upper level. 3197 * 3198 * Note: theoretically, this should be called before all vram allocations 3199 * to protect retired page from abusing 3200 */ 3201 init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 3202 r = amdgpu_ras_recovery_init(adev, init_badpage); 3203 if (r) 3204 goto init_failed; 3205 3206 /** 3207 * In case of XGMI grab extra reference for reset domain for this device 3208 */ 3209 if (adev->gmc.xgmi.num_physical_nodes > 1) { 3210 if (amdgpu_xgmi_add_device(adev) == 0) { 3211 if (!amdgpu_sriov_vf(adev)) { 3212 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 3213 3214 if (WARN_ON(!hive)) { 3215 r = -ENOENT; 3216 goto init_failed; 3217 } 3218 3219 if (!hive->reset_domain || 3220 !amdgpu_reset_get_reset_domain(hive->reset_domain)) { 3221 r = -ENOENT; 3222 amdgpu_put_xgmi_hive(hive); 3223 goto init_failed; 3224 } 3225 3226 /* Drop the early temporary reset domain we created for device */ 3227 amdgpu_reset_put_reset_domain(adev->reset_domain); 3228 adev->reset_domain = hive->reset_domain; 3229 amdgpu_put_xgmi_hive(hive); 3230 } 3231 } 3232 } 3233 3234 r = amdgpu_device_init_schedulers(adev); 3235 if (r) 3236 goto init_failed; 3237 3238 if (adev->mman.buffer_funcs_ring->sched.ready) 3239 amdgpu_ttm_set_buffer_funcs_status(adev, true); 3240 3241 /* Don't init kfd if whole hive need to be reset during init */ 3242 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 3243 kgd2kfd_init_zone_device(adev); 3244 amdgpu_amdkfd_device_init(adev); 3245 } 3246 3247 amdgpu_fru_get_product_info(adev); 3248 3249 if (!amdgpu_sriov_vf(adev) || amdgpu_sriov_ras_cper_en(adev)) 3250 r = amdgpu_cper_init(adev); 3251 3252 init_failed: 3253 3254 return r; 3255 } 3256 3257 /** 3258 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer 3259 * 3260 * @adev: amdgpu_device pointer 3261 * 3262 * Writes a reset magic value to the gart pointer in VRAM. The driver calls 3263 * this function before a GPU reset. If the value is retained after a 3264 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents. 
3265 */ 3266 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) 3267 { 3268 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM); 3269 } 3270 3271 /** 3272 * amdgpu_device_check_vram_lost - check if vram is valid 3273 * 3274 * @adev: amdgpu_device pointer 3275 * 3276 * Checks the reset magic value written to the gart pointer in VRAM. 3277 * The driver calls this after a GPU reset to see if the contents of 3278 * VRAM is lost or now. 3279 * returns true if vram is lost, false if not. 3280 */ 3281 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) 3282 { 3283 if (memcmp(adev->gart.ptr, adev->reset_magic, 3284 AMDGPU_RESET_MAGIC_NUM)) 3285 return true; 3286 3287 if (!amdgpu_in_reset(adev)) 3288 return false; 3289 3290 /* 3291 * For all ASICs with baco/mode1 reset, the VRAM is 3292 * always assumed to be lost. 3293 */ 3294 switch (amdgpu_asic_reset_method(adev)) { 3295 case AMD_RESET_METHOD_LEGACY: 3296 case AMD_RESET_METHOD_LINK: 3297 case AMD_RESET_METHOD_BACO: 3298 case AMD_RESET_METHOD_MODE1: 3299 return true; 3300 default: 3301 return false; 3302 } 3303 } 3304 3305 /** 3306 * amdgpu_device_set_cg_state - set clockgating for amdgpu device 3307 * 3308 * @adev: amdgpu_device pointer 3309 * @state: clockgating state (gate or ungate) 3310 * 3311 * The list of all the hardware IPs that make up the asic is walked and the 3312 * set_clockgating_state callbacks are run. 3313 * Late initialization pass enabling clockgating for hardware IPs. 3314 * Fini or suspend, pass disabling clockgating for hardware IPs. 3315 * Returns 0 on success, negative error code on failure. 3316 */ 3317 3318 int amdgpu_device_set_cg_state(struct amdgpu_device *adev, 3319 enum amd_clockgating_state state) 3320 { 3321 int i, j, r; 3322 3323 if (amdgpu_emu_mode == 1) 3324 return 0; 3325 3326 for (j = 0; j < adev->num_ip_blocks; j++) { 3327 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1; 3328 if (!adev->ip_blocks[i].status.late_initialized) 3329 continue; 3330 /* skip CG for GFX, SDMA on S0ix */ 3331 if (adev->in_s0ix && 3332 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3333 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) 3334 continue; 3335 /* skip CG for VCE/UVD, it's handled specially */ 3336 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD && 3337 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE && 3338 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN && 3339 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG && 3340 adev->ip_blocks[i].version->funcs->set_clockgating_state) { 3341 /* enable clockgating to save power */ 3342 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(&adev->ip_blocks[i], 3343 state); 3344 if (r) { 3345 dev_err(adev->dev, 3346 "set_clockgating_state(gate) of IP block <%s> failed %d\n", 3347 adev->ip_blocks[i].version->funcs->name, 3348 r); 3349 return r; 3350 } 3351 } 3352 } 3353 3354 return 0; 3355 } 3356 3357 int amdgpu_device_set_pg_state(struct amdgpu_device *adev, 3358 enum amd_powergating_state state) 3359 { 3360 int i, j, r; 3361 3362 if (amdgpu_emu_mode == 1) 3363 return 0; 3364 3365 for (j = 0; j < adev->num_ip_blocks; j++) { 3366 i = state == AMD_PG_STATE_GATE ? 
			j : adev->num_ip_blocks - j - 1;
		if (!adev->ip_blocks[i].status.late_initialized)
			continue;
		/* skip PG for GFX, SDMA on S0ix */
		if (adev->in_s0ix &&
		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
			continue;
		/* skip PG for VCE/UVD, it's handled specially */
		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
		    adev->ip_blocks[i].version->funcs->set_powergating_state) {
			/* enable powergating to save power */
			r = adev->ip_blocks[i].version->funcs->set_powergating_state(&adev->ip_blocks[i],
										      state);
			if (r) {
				dev_err(adev->dev,
					"set_powergating_state(gate) of IP block <%s> failed %d\n",
					adev->ip_blocks[i].version->funcs->name,
					r);
				return r;
			}
		}
	}
	return 0;
}

static int amdgpu_device_enable_mgpu_fan_boost(void)
{
	struct amdgpu_gpu_instance *gpu_ins;
	struct amdgpu_device *adev;
	int i, ret = 0;

	mutex_lock(&mgpu_info.mutex);

	/*
	 * MGPU fan boost feature should be enabled
	 * only when there are two or more dGPUs in
	 * the system
	 */
	if (mgpu_info.num_dgpu < 2)
		goto out;

	for (i = 0; i < mgpu_info.num_dgpu; i++) {
		gpu_ins = &(mgpu_info.gpu_ins[i]);
		adev = gpu_ins->adev;
		if (!(adev->flags & AMD_IS_APU || amdgpu_sriov_multi_vf_mode(adev)) &&
		    !gpu_ins->mgpu_fan_enabled) {
			ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
			if (ret)
				break;

			gpu_ins->mgpu_fan_enabled = 1;
		}
	}

out:
	mutex_unlock(&mgpu_info.mutex);

	return ret;
}

/**
 * amdgpu_device_ip_late_init - run late init for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Late initialization pass for hardware IPs. The list of all the hardware
 * IPs that make up the asic is walked and the late_init callbacks are run.
 * late_init covers any special initialization that an IP requires
 * after all of the IPs have been initialized, or anything that needs to
 * happen late in the init process.
 * Returns 0 on success, negative error code on failure.
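 *
 * Ordering note: late init is where clockgating and powergating are gated,
 * and the early fini path below reverses it:
 *
 *   amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
 *   amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
 *     ...
 *   amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
 *   amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);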
 */
static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
{
	struct amdgpu_gpu_instance *gpu_instance;
	int i = 0, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.hw)
			continue;
		if (adev->ip_blocks[i].version->funcs->late_init) {
			r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]);
			if (r) {
				dev_err(adev->dev,
					"late_init of IP block <%s> failed %d\n",
					adev->ip_blocks[i].version->funcs->name,
					r);
				return r;
			}
		}
		adev->ip_blocks[i].status.late_initialized = true;
	}

	r = amdgpu_ras_late_init(adev);
	if (r) {
		dev_err(adev->dev, "amdgpu_ras_late_init failed %d", r);
		return r;
	}

	if (!amdgpu_reset_in_recovery(adev))
		amdgpu_ras_set_error_query_ready(adev, true);

	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);

	amdgpu_device_fill_reset_magic(adev);

	r = amdgpu_device_enable_mgpu_fan_boost();
	if (r)
		dev_err(adev->dev, "enable mgpu fan boost failed (%d).\n", r);

	/* For passthrough configuration on arcturus and aldebaran, enable special SBR handling */
	if (amdgpu_passthrough(adev) &&
	    ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
	     adev->asic_type == CHIP_ALDEBARAN))
		amdgpu_dpm_handle_passthrough_sbr(adev, true);

	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		mutex_lock(&mgpu_info.mutex);

		/*
		 * Reset the device p-state to low, as it was booted with high.
		 *
		 * This should be performed only after all devices from the same
		 * hive get initialized.
		 *
		 * However, the number of devices in the hive is not known in
		 * advance; it is counted one by one as each device initializes.
		 *
		 * So we wait until all XGMI interlinked devices are initialized.
		 * This may add some delay, as those devices may come from
		 * different hives, but that should be OK.
3502 */ 3503 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { 3504 for (i = 0; i < mgpu_info.num_gpu; i++) { 3505 gpu_instance = &(mgpu_info.gpu_ins[i]); 3506 if (gpu_instance->adev->flags & AMD_IS_APU) 3507 continue; 3508 3509 r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 3510 AMDGPU_XGMI_PSTATE_MIN); 3511 if (r) { 3512 dev_err(adev->dev, 3513 "pstate setting failed (%d).\n", 3514 r); 3515 break; 3516 } 3517 } 3518 } 3519 3520 mutex_unlock(&mgpu_info.mutex); 3521 } 3522 3523 return 0; 3524 } 3525 3526 static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block) 3527 { 3528 struct amdgpu_device *adev = ip_block->adev; 3529 int r; 3530 3531 if (!ip_block->version->funcs->hw_fini) { 3532 dev_err(adev->dev, "hw_fini of IP block <%s> not defined\n", 3533 ip_block->version->funcs->name); 3534 } else { 3535 r = ip_block->version->funcs->hw_fini(ip_block); 3536 /* XXX handle errors */ 3537 if (r) { 3538 dev_dbg(adev->dev, 3539 "hw_fini of IP block <%s> failed %d\n", 3540 ip_block->version->funcs->name, r); 3541 } 3542 } 3543 3544 ip_block->status.hw = false; 3545 } 3546 3547 /** 3548 * amdgpu_device_smu_fini_early - smu hw_fini wrapper 3549 * 3550 * @adev: amdgpu_device pointer 3551 * 3552 * For ASICs need to disable SMC first 3553 */ 3554 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) 3555 { 3556 int i; 3557 3558 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) 3559 return; 3560 3561 for (i = 0; i < adev->num_ip_blocks; i++) { 3562 if (!adev->ip_blocks[i].status.hw) 3563 continue; 3564 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3565 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3566 break; 3567 } 3568 } 3569 } 3570 3571 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) 3572 { 3573 int i, r; 3574 3575 for (i = 0; i < adev->num_ip_blocks; i++) { 3576 if (!adev->ip_blocks[i].version->funcs->early_fini) 3577 continue; 3578 3579 r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]); 3580 if (r) { 3581 dev_dbg(adev->dev, 3582 "early_fini of IP block <%s> failed %d\n", 3583 adev->ip_blocks[i].version->funcs->name, r); 3584 } 3585 } 3586 3587 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3588 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3589 3590 amdgpu_amdkfd_suspend(adev, true); 3591 amdgpu_userq_suspend(adev); 3592 3593 /* Workaround for ASICs need to disable SMC first */ 3594 amdgpu_device_smu_fini_early(adev); 3595 3596 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3597 if (!adev->ip_blocks[i].status.hw) 3598 continue; 3599 3600 amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]); 3601 } 3602 3603 if (amdgpu_sriov_vf(adev)) { 3604 if (amdgpu_virt_release_full_gpu(adev, false)) 3605 dev_err(adev->dev, 3606 "failed to release exclusive mode on fini\n"); 3607 } 3608 3609 return 0; 3610 } 3611 3612 /** 3613 * amdgpu_device_ip_fini - run fini for hardware IPs 3614 * 3615 * @adev: amdgpu_device pointer 3616 * 3617 * Main teardown pass for hardware IPs. The list of all the hardware 3618 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks 3619 * are run. hw_fini tears down the hardware associated with each IP 3620 * and sw_fini tears down any software state associated with each IP. 3621 * Returns 0 on success, negative error code on failure. 
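 *
 * Teardown order, condensed from this function and from
 * amdgpu_device_ip_fini_early() above:
 *
 *   early_fini     - per-IP early teardown hooks
 *   PG/CG ungate   - powergating and clockgating are disabled
 *   hw_fini        - reverse IP order; on some ASICs the SMC is disabled first
 *   sw_fini        - reverse IP order; GMC-owned buffers are freed alongside
 *   late_fini      - reverse IP order; final per-IP cleanup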
3622 */ 3623 static int amdgpu_device_ip_fini(struct amdgpu_device *adev) 3624 { 3625 int i, r; 3626 3627 amdgpu_cper_fini(adev); 3628 3629 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) 3630 amdgpu_virt_release_ras_err_handler_data(adev); 3631 3632 if (adev->gmc.xgmi.num_physical_nodes > 1) 3633 amdgpu_xgmi_remove_device(adev); 3634 3635 amdgpu_amdkfd_device_fini_sw(adev); 3636 3637 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3638 if (!adev->ip_blocks[i].status.sw) 3639 continue; 3640 3641 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { 3642 amdgpu_ucode_free_bo(adev); 3643 amdgpu_free_static_csa(&adev->virt.csa_obj); 3644 amdgpu_device_wb_fini(adev); 3645 amdgpu_device_mem_scratch_fini(adev); 3646 amdgpu_ib_pool_fini(adev); 3647 amdgpu_seq64_fini(adev); 3648 amdgpu_doorbell_fini(adev); 3649 } 3650 if (adev->ip_blocks[i].version->funcs->sw_fini) { 3651 r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]); 3652 /* XXX handle errors */ 3653 if (r) { 3654 dev_dbg(adev->dev, 3655 "sw_fini of IP block <%s> failed %d\n", 3656 adev->ip_blocks[i].version->funcs->name, 3657 r); 3658 } 3659 } 3660 adev->ip_blocks[i].status.sw = false; 3661 adev->ip_blocks[i].status.valid = false; 3662 } 3663 3664 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3665 if (!adev->ip_blocks[i].status.late_initialized) 3666 continue; 3667 if (adev->ip_blocks[i].version->funcs->late_fini) 3668 adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]); 3669 adev->ip_blocks[i].status.late_initialized = false; 3670 } 3671 3672 amdgpu_ras_fini(adev); 3673 amdgpu_uid_fini(adev); 3674 3675 return 0; 3676 } 3677 3678 /** 3679 * amdgpu_device_delayed_init_work_handler - work handler for IB tests 3680 * 3681 * @work: work_struct. 3682 */ 3683 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) 3684 { 3685 struct amdgpu_device *adev = 3686 container_of(work, struct amdgpu_device, delayed_init_work.work); 3687 int r; 3688 3689 r = amdgpu_ib_ring_tests(adev); 3690 if (r) 3691 dev_err(adev->dev, "ib ring test failed (%d).\n", r); 3692 } 3693 3694 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) 3695 { 3696 struct amdgpu_device *adev = 3697 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work); 3698 3699 WARN_ON_ONCE(adev->gfx.gfx_off_state); 3700 WARN_ON_ONCE(adev->gfx.gfx_off_req_count); 3701 3702 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true, 0)) 3703 adev->gfx.gfx_off_state = true; 3704 } 3705 3706 /** 3707 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1) 3708 * 3709 * @adev: amdgpu_device pointer 3710 * 3711 * Main suspend function for hardware IPs. The list of all the hardware 3712 * IPs that make up the asic is walked, clockgating is disabled and the 3713 * suspend callbacks are run. suspend puts the hardware and software state 3714 * in each IP into a state suitable for suspend. 3715 * Returns 0 on success, negative error code on failure. 3716 */ 3717 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) 3718 { 3719 int i, r; 3720 3721 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); 3722 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); 3723 3724 /* 3725 * Per PMFW team's suggestion, driver needs to handle gfxoff 3726 * and df cstate features disablement for gpu reset(e.g. Mode1Reset) 3727 * scenario. Add the missing df cstate disablement here. 
3728 */ 3729 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) 3730 dev_warn(adev->dev, "Failed to disallow df cstate"); 3731 3732 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3733 if (!adev->ip_blocks[i].status.valid) 3734 continue; 3735 3736 /* displays are handled separately */ 3737 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) 3738 continue; 3739 3740 /* XXX handle errors */ 3741 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3742 if (r) 3743 return r; 3744 } 3745 3746 return 0; 3747 } 3748 3749 /** 3750 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2) 3751 * 3752 * @adev: amdgpu_device pointer 3753 * 3754 * Main suspend function for hardware IPs. The list of all the hardware 3755 * IPs that make up the asic is walked, clockgating is disabled and the 3756 * suspend callbacks are run. suspend puts the hardware and software state 3757 * in each IP into a state suitable for suspend. 3758 * Returns 0 on success, negative error code on failure. 3759 */ 3760 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) 3761 { 3762 int i, r; 3763 3764 if (adev->in_s0ix) 3765 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry); 3766 3767 for (i = adev->num_ip_blocks - 1; i >= 0; i--) { 3768 if (!adev->ip_blocks[i].status.valid) 3769 continue; 3770 /* displays are handled in phase1 */ 3771 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) 3772 continue; 3773 /* PSP lost connection when err_event_athub occurs */ 3774 if (amdgpu_ras_intr_triggered() && 3775 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 3776 adev->ip_blocks[i].status.hw = false; 3777 continue; 3778 } 3779 3780 /* skip unnecessary suspend if we do not initialize them yet */ 3781 if (!amdgpu_ip_member_of_hwini( 3782 adev, adev->ip_blocks[i].version->type)) 3783 continue; 3784 3785 /* Since we skip suspend for S0i3, we need to cancel the delayed 3786 * idle work here as the suspend callback never gets called. 3787 */ 3788 if (adev->in_s0ix && 3789 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX && 3790 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 0, 0)) 3791 cancel_delayed_work_sync(&adev->gfx.idle_work); 3792 /* skip suspend of gfx/mes and psp for S0ix 3793 * gfx is in gfxoff state, so on resume it will exit gfxoff just 3794 * like at runtime. PSP is also part of the always on hardware 3795 * so no need to suspend it. 3796 */ 3797 if (adev->in_s0ix && 3798 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || 3799 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || 3800 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) 3801 continue; 3802 3803 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ 3804 if (adev->in_s0ix && 3805 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= 3806 IP_VERSION(5, 0, 0)) && 3807 (adev->ip_blocks[i].version->type == 3808 AMD_IP_BLOCK_TYPE_SDMA)) 3809 continue; 3810 3811 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. 3812 * These are in TMR, hence are expected to be reused by PSP-TOS to reload 3813 * from this location and RLC Autoload automatically also gets loaded 3814 * from here based on PMFW -> PSP message during re-init sequence. 3815 * Therefore, the psp suspend & resume should be skipped to avoid destroy 3816 * the TMR and reload FWs again for IMU enabled APU ASICs. 
3817 */ 3818 if (amdgpu_in_reset(adev) && 3819 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs && 3820 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 3821 continue; 3822 3823 /* XXX handle errors */ 3824 r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]); 3825 adev->ip_blocks[i].status.hw = false; 3826 3827 /* handle putting the SMC in the appropriate state */ 3828 if (!amdgpu_sriov_vf(adev)) { 3829 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { 3830 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); 3831 if (r) { 3832 dev_err(adev->dev, 3833 "SMC failed to set mp1 state %d, %d\n", 3834 adev->mp1_state, r); 3835 return r; 3836 } 3837 } 3838 } 3839 } 3840 3841 return 0; 3842 } 3843 3844 /** 3845 * amdgpu_device_ip_suspend - run suspend for hardware IPs 3846 * 3847 * @adev: amdgpu_device pointer 3848 * 3849 * Main suspend function for hardware IPs. The list of all the hardware 3850 * IPs that make up the asic is walked, clockgating is disabled and the 3851 * suspend callbacks are run. suspend puts the hardware and software state 3852 * in each IP into a state suitable for suspend. 3853 * Returns 0 on success, negative error code on failure. 3854 */ 3855 int amdgpu_device_ip_suspend(struct amdgpu_device *adev) 3856 { 3857 int r; 3858 3859 if (amdgpu_sriov_vf(adev)) { 3860 amdgpu_virt_fini_data_exchange(adev); 3861 amdgpu_virt_request_full_gpu(adev, false); 3862 } 3863 3864 amdgpu_ttm_set_buffer_funcs_status(adev, false); 3865 3866 r = amdgpu_device_ip_suspend_phase1(adev); 3867 if (r) 3868 return r; 3869 r = amdgpu_device_ip_suspend_phase2(adev); 3870 3871 if (amdgpu_sriov_vf(adev)) 3872 amdgpu_virt_release_full_gpu(adev, false); 3873 3874 return r; 3875 } 3876 3877 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) 3878 { 3879 int i, r; 3880 3881 static enum amd_ip_block_type ip_order[] = { 3882 AMD_IP_BLOCK_TYPE_COMMON, 3883 AMD_IP_BLOCK_TYPE_GMC, 3884 AMD_IP_BLOCK_TYPE_PSP, 3885 AMD_IP_BLOCK_TYPE_IH, 3886 }; 3887 3888 for (i = 0; i < adev->num_ip_blocks; i++) { 3889 int j; 3890 struct amdgpu_ip_block *block; 3891 3892 block = &adev->ip_blocks[i]; 3893 block->status.hw = false; 3894 3895 for (j = 0; j < ARRAY_SIZE(ip_order); j++) { 3896 3897 if (block->version->type != ip_order[j] || 3898 !block->status.valid) 3899 continue; 3900 3901 r = block->version->funcs->hw_init(&adev->ip_blocks[i]); 3902 if (r) { 3903 dev_err(adev->dev, "RE-INIT-early: %s failed\n", 3904 block->version->funcs->name); 3905 return r; 3906 } 3907 block->status.hw = true; 3908 } 3909 } 3910 3911 return 0; 3912 } 3913 3914 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) 3915 { 3916 struct amdgpu_ip_block *block; 3917 int i, r = 0; 3918 3919 static enum amd_ip_block_type ip_order[] = { 3920 AMD_IP_BLOCK_TYPE_SMC, 3921 AMD_IP_BLOCK_TYPE_DCE, 3922 AMD_IP_BLOCK_TYPE_GFX, 3923 AMD_IP_BLOCK_TYPE_SDMA, 3924 AMD_IP_BLOCK_TYPE_MES, 3925 AMD_IP_BLOCK_TYPE_UVD, 3926 AMD_IP_BLOCK_TYPE_VCE, 3927 AMD_IP_BLOCK_TYPE_VCN, 3928 AMD_IP_BLOCK_TYPE_JPEG 3929 }; 3930 3931 for (i = 0; i < ARRAY_SIZE(ip_order); i++) { 3932 block = amdgpu_device_ip_get_ip_block(adev, ip_order[i]); 3933 3934 if (!block) 3935 continue; 3936 3937 if (block->status.valid && !block->status.hw) { 3938 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) { 3939 r = amdgpu_ip_block_resume(block); 3940 } else { 3941 r = block->version->funcs->hw_init(block); 3942 } 3943 3944 if (r) { 3945 dev_err(adev->dev, "RE-INIT-late: %s failed\n", 3946 block->version->funcs->name); 3947 break; 3948 } 
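/* hw_init or resume succeeded; mark the block's hardware as initialized. */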
3949 block->status.hw = true; 3950 } 3951 } 3952 3953 return r; 3954 } 3955 3956 /** 3957 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs 3958 * 3959 * @adev: amdgpu_device pointer 3960 * 3961 * First resume function for hardware IPs. The list of all the hardware 3962 * IPs that make up the asic is walked and the resume callbacks are run for 3963 * COMMON, GMC, and IH. resume puts the hardware into a functional state 3964 * after a suspend and updates the software state as necessary. This 3965 * function is also used for restoring the GPU after a GPU reset. 3966 * Returns 0 on success, negative error code on failure. 3967 */ 3968 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) 3969 { 3970 int i, r; 3971 3972 for (i = 0; i < adev->num_ip_blocks; i++) { 3973 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 3974 continue; 3975 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 3976 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 3977 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 3978 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { 3979 3980 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 3981 if (r) 3982 return r; 3983 } 3984 } 3985 3986 return 0; 3987 } 3988 3989 /** 3990 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs 3991 * 3992 * @adev: amdgpu_device pointer 3993 * 3994 * Second resume function for hardware IPs. The list of all the hardware 3995 * IPs that make up the asic is walked and the resume callbacks are run for 3996 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a 3997 * functional state after a suspend and updates the software state as 3998 * necessary. This function is also used for restoring the GPU after a GPU 3999 * reset. 4000 * Returns 0 on success, negative error code on failure. 4001 */ 4002 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) 4003 { 4004 int i, r; 4005 4006 for (i = 0; i < adev->num_ip_blocks; i++) { 4007 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 4008 continue; 4009 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || 4010 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || 4011 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || 4012 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE || 4013 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) 4014 continue; 4015 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 4016 if (r) 4017 return r; 4018 } 4019 4020 return 0; 4021 } 4022 4023 /** 4024 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs 4025 * 4026 * @adev: amdgpu_device pointer 4027 * 4028 * Third resume function for hardware IPs. The list of all the hardware 4029 * IPs that make up the asic is walked and the resume callbacks are run for 4030 * all DCE. resume puts the hardware into a functional state after a suspend 4031 * and updates the software state as necessary. This function is also used 4032 * for restoring the GPU after a GPU reset. 4033 * 4034 * Returns 0 on success, negative error code on failure. 
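 *
 * For reference, amdgpu_device_ip_resume() below drives the phases in
 * the following order (a simplified sketch of the code that follows):
 *
 *   amdgpu_device_ip_resume_phase1(adev);  // COMMON, GMC, IH (+ PSP on SR-IOV)
 *   amdgpu_device_fw_loading(adev);
 *   amdgpu_device_ip_resume_phase2(adev);  // remaining blocks except DCE and PSP
 *   amdgpu_fence_driver_hw_init(adev);
 *   amdgpu_device_ip_resume_phase3(adev);  // DCE (display)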
4035 */ 4036 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev) 4037 { 4038 int i, r; 4039 4040 for (i = 0; i < adev->num_ip_blocks; i++) { 4041 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw) 4042 continue; 4043 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) { 4044 r = amdgpu_ip_block_resume(&adev->ip_blocks[i]); 4045 if (r) 4046 return r; 4047 } 4048 } 4049 4050 return 0; 4051 } 4052 4053 /** 4054 * amdgpu_device_ip_resume - run resume for hardware IPs 4055 * 4056 * @adev: amdgpu_device pointer 4057 * 4058 * Main resume function for hardware IPs. The hardware IPs 4059 * are split into multiple resume functions because they are 4060 * also used in recovering from a GPU reset and some additional 4061 * steps need to be taken between them. In this case (S3/S4) they are 4062 * run sequentially. 4063 * Returns 0 on success, negative error code on failure. 4064 */ 4065 static int amdgpu_device_ip_resume(struct amdgpu_device *adev) 4066 { 4067 int r; 4068 4069 r = amdgpu_device_ip_resume_phase1(adev); 4070 if (r) 4071 return r; 4072 4073 r = amdgpu_device_fw_loading(adev); 4074 if (r) 4075 return r; 4076 4077 r = amdgpu_device_ip_resume_phase2(adev); 4078 4079 if (adev->mman.buffer_funcs_ring->sched.ready) 4080 amdgpu_ttm_set_buffer_funcs_status(adev, true); 4081 4082 if (r) 4083 return r; 4084 4085 amdgpu_fence_driver_hw_init(adev); 4086 4087 r = amdgpu_device_ip_resume_phase3(adev); 4088 4089 return r; 4090 } 4091 4092 /** 4093 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV 4094 * 4095 * @adev: amdgpu_device pointer 4096 * 4097 * Query the VBIOS data tables to determine if the board supports SR-IOV. 4098 */ 4099 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) 4100 { 4101 if (amdgpu_sriov_vf(adev)) { 4102 if (adev->is_atom_fw) { 4103 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev)) 4104 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 4105 } else { 4106 if (amdgpu_atombios_has_gpu_virtualization_table(adev)) 4107 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; 4108 } 4109 4110 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) 4111 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); 4112 } 4113 } 4114 4115 /** 4116 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic 4117 * 4118 * @pdev: pci device context 4119 * @asic_type: AMD asic type 4120 * 4121 * Check if there is DC (new modesetting infrastructure) support for an asic. 4122 * Returns true if DC has support, false if not. 4123 */ 4124 bool amdgpu_device_asic_has_dc_support(struct pci_dev *pdev, 4125 enum amd_asic_type asic_type) 4126 { 4127 switch (asic_type) { 4128 #ifdef CONFIG_DRM_AMDGPU_SI 4129 case CHIP_HAINAN: 4130 #endif 4131 case CHIP_TOPAZ: 4132 /* chips with no display hardware */ 4133 return false; 4134 #if defined(CONFIG_DRM_AMD_DC) 4135 case CHIP_TAHITI: 4136 case CHIP_PITCAIRN: 4137 case CHIP_VERDE: 4138 case CHIP_OLAND: 4139 /* 4140 * We have systems in the wild with these ASICs that require 4141 * LVDS and VGA support which is not supported with DC. 4142 * 4143 * Fallback to the non-DC driver here by default so as not to 4144 * cause regressions. 4145 */ 4146 #if defined(CONFIG_DRM_AMD_DC_SI) 4147 return amdgpu_dc > 0; 4148 #else 4149 return false; 4150 #endif 4151 case CHIP_BONAIRE: 4152 case CHIP_KAVERI: 4153 case CHIP_KABINI: 4154 case CHIP_MULLINS: 4155 /* 4156 * We have systems in the wild with these ASICs that require 4157 * VGA support which is not supported with DC.
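 * DC is therefore only selected for these chips when it is explicitly
 * requested (amdgpu_dc > 0).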
4158 * 4159 * Fallback to the non-DC driver here by default so as not to 4160 * cause regressions. 4161 */ 4162 return amdgpu_dc > 0; 4163 default: 4164 return amdgpu_dc != 0; 4165 #else 4166 default: 4167 if (amdgpu_dc > 0) 4168 dev_info_once( 4169 &pdev->dev, 4170 "Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); 4171 return false; 4172 #endif 4173 } 4174 } 4175 4176 /** 4177 * amdgpu_device_has_dc_support - check if dc is supported 4178 * 4179 * @adev: amdgpu_device pointer 4180 * 4181 * Returns true for supported, false for not supported 4182 */ 4183 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) 4184 { 4185 if (adev->enable_virtual_display || 4186 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) 4187 return false; 4188 4189 return amdgpu_device_asic_has_dc_support(adev->pdev, adev->asic_type); 4190 } 4191 4192 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) 4193 { 4194 struct amdgpu_device *adev = 4195 container_of(__work, struct amdgpu_device, xgmi_reset_work); 4196 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 4197 4198 /* It's a bug to not have a hive within this function */ 4199 if (WARN_ON(!hive)) 4200 return; 4201 4202 /* 4203 * Use task barrier to synchronize all xgmi reset works across the 4204 * hive. task_barrier_enter and task_barrier_exit will block 4205 * until all the threads running the xgmi reset works reach 4206 * those points. task_barrier_full will do both blocks. 4207 */ 4208 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { 4209 4210 task_barrier_enter(&hive->tb); 4211 adev->asic_reset_res = amdgpu_device_baco_enter(adev); 4212 4213 if (adev->asic_reset_res) 4214 goto fail; 4215 4216 task_barrier_exit(&hive->tb); 4217 adev->asic_reset_res = amdgpu_device_baco_exit(adev); 4218 4219 if (adev->asic_reset_res) 4220 goto fail; 4221 4222 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); 4223 } else { 4224 4225 task_barrier_full(&hive->tb); 4226 adev->asic_reset_res = amdgpu_asic_reset(adev); 4227 } 4228 4229 fail: 4230 if (adev->asic_reset_res) 4231 dev_warn(adev->dev, 4232 "ASIC reset failed with error, %d for drm dev, %s", 4233 adev->asic_reset_res, adev_to_drm(adev)->unique); 4234 amdgpu_put_xgmi_hive(hive); 4235 } 4236 4237 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) 4238 { 4239 char *input = amdgpu_lockup_timeout; 4240 char *timeout_setting = NULL; 4241 int index = 0; 4242 long timeout; 4243 int ret = 0; 4244 4245 /* 4246 * By default timeout for jobs is 10 sec 4247 */ 4248 adev->compute_timeout = adev->gfx_timeout = msecs_to_jiffies(10000); 4249 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4250 4251 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4252 while ((timeout_setting = strsep(&input, ",")) && 4253 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { 4254 ret = kstrtol(timeout_setting, 0, &timeout); 4255 if (ret) 4256 return ret; 4257 4258 if (timeout == 0) { 4259 index++; 4260 continue; 4261 } else if (timeout < 0) { 4262 timeout = MAX_SCHEDULE_TIMEOUT; 4263 dev_warn(adev->dev, "lockup timeout disabled"); 4264 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); 4265 } else { 4266 timeout = msecs_to_jiffies(timeout); 4267 } 4268 4269 switch (index++) { 4270 case 0: 4271 adev->gfx_timeout = timeout; 4272 break; 4273 case 1: 4274 adev->compute_timeout = timeout; 4275 break; 4276 case 2: 4277 adev->sdma_timeout = timeout; 4278 break; 4279 case 3: 4280 adev->video_timeout = 
timeout; 4281 break; 4282 default: 4283 break; 4284 } 4285 } 4286 /* 4287 * There is only one value specified and 4288 * it should apply to all non-compute jobs. 4289 */ 4290 if (index == 1) { 4291 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; 4292 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) 4293 adev->compute_timeout = adev->gfx_timeout; 4294 } 4295 } 4296 4297 return ret; 4298 } 4299 4300 /** 4301 * amdgpu_device_check_iommu_direct_map - check if RAM is direct mapped to the GPU 4302 * 4303 * @adev: amdgpu_device pointer 4304 * 4305 * RAM is direct mapped to the GPU if the IOMMU is not enabled or is in passthrough mode 4306 */ 4307 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) 4308 { 4309 struct iommu_domain *domain; 4310 4311 domain = iommu_get_domain_for_dev(adev->dev); 4312 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) 4313 adev->ram_is_direct_mapped = true; 4314 } 4315 4316 #if defined(CONFIG_HSA_AMD_P2P) 4317 /** 4318 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled. 4319 * 4320 * @adev: amdgpu_device pointer 4321 * 4322 * Returns true if the IOMMU is remapping BAR addresses, false otherwise. 4323 */ 4324 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev) 4325 { 4326 struct iommu_domain *domain; 4327 4328 domain = iommu_get_domain_for_dev(adev->dev); 4329 if (domain && (domain->type == IOMMU_DOMAIN_DMA || 4330 domain->type == IOMMU_DOMAIN_DMA_FQ)) 4331 return true; 4332 4333 return false; 4334 } 4335 #endif 4336 4337 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) 4338 { 4339 if (amdgpu_mcbp == 1) 4340 adev->gfx.mcbp = true; 4341 else if (amdgpu_mcbp == 0) 4342 adev->gfx.mcbp = false; 4343 4344 if (amdgpu_sriov_vf(adev)) 4345 adev->gfx.mcbp = true; 4346 4347 if (adev->gfx.mcbp) 4348 dev_info(adev->dev, "MCBP is enabled\n"); 4349 } 4350 4351 /** 4352 * amdgpu_device_init - initialize the driver 4353 * 4354 * @adev: amdgpu_device pointer 4355 * @flags: driver flags 4356 * 4357 * Initializes the driver info and hw (all asics). 4358 * Returns 0 for success or an error on failure. 4359 * Called at driver startup.
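 * The sequence below sets up locks, work items and MMIO mappings, runs
 * early IP init, optionally resets or posts the card, and then performs
 * full IP init followed by late init and sysfs registration.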
4360 */ 4361 int amdgpu_device_init(struct amdgpu_device *adev, 4362 uint32_t flags) 4363 { 4364 struct pci_dev *pdev = adev->pdev; 4365 int r, i; 4366 bool px = false; 4367 u32 max_MBps; 4368 int tmp; 4369 4370 adev->shutdown = false; 4371 adev->flags = flags; 4372 4373 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) 4374 adev->asic_type = amdgpu_force_asic_type; 4375 else 4376 adev->asic_type = flags & AMD_ASIC_MASK; 4377 4378 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; 4379 if (amdgpu_emu_mode == 1) 4380 adev->usec_timeout *= 10; 4381 adev->gmc.gart_size = 512 * 1024 * 1024; 4382 adev->accel_working = false; 4383 adev->num_rings = 0; 4384 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); 4385 adev->mman.buffer_funcs = NULL; 4386 adev->mman.buffer_funcs_ring = NULL; 4387 adev->vm_manager.vm_pte_funcs = NULL; 4388 adev->vm_manager.vm_pte_num_scheds = 0; 4389 adev->gmc.gmc_funcs = NULL; 4390 adev->harvest_ip_mask = 0x0; 4391 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS); 4392 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES); 4393 4394 adev->smc_rreg = &amdgpu_invalid_rreg; 4395 adev->smc_wreg = &amdgpu_invalid_wreg; 4396 adev->pcie_rreg = &amdgpu_invalid_rreg; 4397 adev->pcie_wreg = &amdgpu_invalid_wreg; 4398 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; 4399 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; 4400 adev->pciep_rreg = &amdgpu_invalid_rreg; 4401 adev->pciep_wreg = &amdgpu_invalid_wreg; 4402 adev->pcie_rreg64 = &amdgpu_invalid_rreg64; 4403 adev->pcie_wreg64 = &amdgpu_invalid_wreg64; 4404 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; 4405 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; 4406 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; 4407 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; 4408 adev->didt_rreg = &amdgpu_invalid_rreg; 4409 adev->didt_wreg = &amdgpu_invalid_wreg; 4410 adev->gc_cac_rreg = &amdgpu_invalid_rreg; 4411 adev->gc_cac_wreg = &amdgpu_invalid_wreg; 4412 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg; 4413 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg; 4414 4415 dev_info( 4416 adev->dev, 4417 "initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n", 4418 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device, 4419 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); 4420 4421 /* mutex initialization are all done here so we 4422 * can recall function without having locking issues 4423 */ 4424 mutex_init(&adev->firmware.mutex); 4425 mutex_init(&adev->pm.mutex); 4426 mutex_init(&adev->gfx.gpu_clock_mutex); 4427 mutex_init(&adev->srbm_mutex); 4428 mutex_init(&adev->gfx.pipe_reserve_mutex); 4429 mutex_init(&adev->gfx.gfx_off_mutex); 4430 mutex_init(&adev->gfx.partition_mutex); 4431 mutex_init(&adev->grbm_idx_mutex); 4432 mutex_init(&adev->mn_lock); 4433 mutex_init(&adev->virt.vf_errors.lock); 4434 hash_init(adev->mn_hash); 4435 mutex_init(&adev->psp.mutex); 4436 mutex_init(&adev->notifier_lock); 4437 mutex_init(&adev->pm.stable_pstate_ctx_lock); 4438 mutex_init(&adev->benchmark_mutex); 4439 mutex_init(&adev->gfx.reset_sem_mutex); 4440 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */ 4441 mutex_init(&adev->enforce_isolation_mutex); 4442 for (i = 0; i < MAX_XCP; ++i) { 4443 adev->isolation[i].spearhead = dma_fence_get_stub(); 4444 amdgpu_sync_create(&adev->isolation[i].active); 4445 amdgpu_sync_create(&adev->isolation[i].prev); 4446 } 4447 mutex_init(&adev->gfx.userq_sch_mutex); 4448 
mutex_init(&adev->gfx.workload_profile_mutex); 4449 mutex_init(&adev->vcn.workload_profile_mutex); 4450 mutex_init(&adev->userq_mutex); 4451 4452 amdgpu_device_init_apu_flags(adev); 4453 4454 r = amdgpu_device_check_arguments(adev); 4455 if (r) 4456 return r; 4457 4458 spin_lock_init(&adev->mmio_idx_lock); 4459 spin_lock_init(&adev->smc_idx_lock); 4460 spin_lock_init(&adev->pcie_idx_lock); 4461 spin_lock_init(&adev->uvd_ctx_idx_lock); 4462 spin_lock_init(&adev->didt_idx_lock); 4463 spin_lock_init(&adev->gc_cac_idx_lock); 4464 spin_lock_init(&adev->se_cac_idx_lock); 4465 spin_lock_init(&adev->audio_endpt_idx_lock); 4466 spin_lock_init(&adev->mm_stats.lock); 4467 spin_lock_init(&adev->virt.rlcg_reg_lock); 4468 spin_lock_init(&adev->wb.lock); 4469 4470 xa_init_flags(&adev->userq_xa, XA_FLAGS_LOCK_IRQ); 4471 4472 INIT_LIST_HEAD(&adev->reset_list); 4473 4474 INIT_LIST_HEAD(&adev->ras_list); 4475 4476 INIT_LIST_HEAD(&adev->pm.od_kobj_list); 4477 4478 INIT_LIST_HEAD(&adev->userq_mgr_list); 4479 4480 INIT_DELAYED_WORK(&adev->delayed_init_work, 4481 amdgpu_device_delayed_init_work_handler); 4482 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 4483 amdgpu_device_delay_enable_gfx_off); 4484 /* 4485 * Initialize the enforce_isolation work structures for each XCP 4486 * partition. This work handler is responsible for enforcing shader 4487 * isolation on AMD GPUs. It counts the number of emitted fences for 4488 * each GFX and compute ring. If there are any fences, it schedules 4489 * the `enforce_isolation_work` to be run after a delay. If there are 4490 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the 4491 * runqueue. 4492 */ 4493 for (i = 0; i < MAX_XCP; i++) { 4494 INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work, 4495 amdgpu_gfx_enforce_isolation_handler); 4496 adev->gfx.enforce_isolation[i].adev = adev; 4497 adev->gfx.enforce_isolation[i].xcp_id = i; 4498 } 4499 4500 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 4501 4502 adev->gfx.gfx_off_req_count = 1; 4503 adev->gfx.gfx_off_residency = 0; 4504 adev->gfx.gfx_off_entrycount = 0; 4505 adev->pm.ac_power = power_supply_is_system_supplied() > 0; 4506 4507 atomic_set(&adev->throttling_logging_enabled, 1); 4508 /* 4509 * If throttling continues, logging will be performed every minute 4510 * to avoid log flooding. "-1" is subtracted since the thermal 4511 * throttling interrupt comes every second. Thus, the total logging 4512 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting 4513 * for throttling interrupt) = 60 seconds.
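 * The ratelimit state below is therefore initialized with an interval
 * of (60 - 1) * HZ and a burst of one message.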
4514 */ 4515 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 4516 4517 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 4518 4519 /* Registers mapping */ 4520 /* TODO: block userspace mapping of io register */ 4521 if (adev->asic_type >= CHIP_BONAIRE) { 4522 adev->rmmio_base = pci_resource_start(adev->pdev, 5); 4523 adev->rmmio_size = pci_resource_len(adev->pdev, 5); 4524 } else { 4525 adev->rmmio_base = pci_resource_start(adev->pdev, 2); 4526 adev->rmmio_size = pci_resource_len(adev->pdev, 2); 4527 } 4528 4529 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) 4530 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); 4531 4532 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); 4533 if (!adev->rmmio) 4534 return -ENOMEM; 4535 4536 dev_info(adev->dev, "register mmio base: 0x%08X\n", 4537 (uint32_t)adev->rmmio_base); 4538 dev_info(adev->dev, "register mmio size: %u\n", 4539 (unsigned int)adev->rmmio_size); 4540 4541 /* 4542 * Reset domain needs to be present early, before XGMI hive discovered 4543 * (if any) and initialized to use reset sem and in_gpu reset flag 4544 * early on during init and before calling to RREG32. 4545 */ 4546 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev"); 4547 if (!adev->reset_domain) 4548 return -ENOMEM; 4549 4550 /* detect hw virtualization here */ 4551 amdgpu_virt_init(adev); 4552 4553 amdgpu_device_get_pcie_info(adev); 4554 4555 r = amdgpu_device_get_job_timeout_settings(adev); 4556 if (r) { 4557 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); 4558 return r; 4559 } 4560 4561 amdgpu_device_set_mcbp(adev); 4562 4563 /* 4564 * By default, use default mode where all blocks are expected to be 4565 * initialized. At present a 'swinit' of blocks is required to be 4566 * completed before the need for a different level is detected. 4567 */ 4568 amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT); 4569 /* early init functions */ 4570 r = amdgpu_device_ip_early_init(adev); 4571 if (r) 4572 return r; 4573 4574 /* 4575 * No need to remove conflicting FBs for non-display class devices. 4576 * This prevents the sysfb from being freed accidently. 4577 */ 4578 if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA || 4579 (pdev->class >> 8) == PCI_CLASS_DISPLAY_OTHER) { 4580 /* Get rid of things like offb */ 4581 r = aperture_remove_conflicting_pci_devices(adev->pdev, amdgpu_kms_driver.name); 4582 if (r) 4583 return r; 4584 } 4585 4586 /* Enable TMZ based on IP_VERSION */ 4587 amdgpu_gmc_tmz_set(adev); 4588 4589 if (amdgpu_sriov_vf(adev) && 4590 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0)) 4591 /* VF MMIO access (except mailbox range) from CPU 4592 * will be blocked during sriov runtime 4593 */ 4594 adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT; 4595 4596 amdgpu_gmc_noretry_set(adev); 4597 /* Need to get xgmi info early to decide the reset behavior*/ 4598 if (adev->gmc.xgmi.supported) { 4599 r = adev->gfxhub.funcs->get_xgmi_info(adev); 4600 if (r) 4601 return r; 4602 } 4603 4604 /* enable PCIE atomic ops */ 4605 if (amdgpu_sriov_vf(adev)) { 4606 if (adev->virt.fw_reserve.p_pf2vf) 4607 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) 4608 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == 4609 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4610 /* APUs w/ gfx9 onwards doesn't reply on PCIe atomics, rather it is a 4611 * internal path natively support atomics, set have_atomics_support to true. 
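 * On bare-metal dGPUs the driver instead asks the PCIe root complex to
 * enable 32-bit and 64-bit atomic completion via
 * pci_enable_atomic_ops_to_root() below.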
4612 */ 4613 } else if ((adev->flags & AMD_IS_APU) && 4614 (amdgpu_ip_version(adev, GC_HWIP, 0) > 4615 IP_VERSION(9, 0, 0))) { 4616 adev->have_atomics_support = true; 4617 } else { 4618 adev->have_atomics_support = 4619 !pci_enable_atomic_ops_to_root(adev->pdev, 4620 PCI_EXP_DEVCAP2_ATOMIC_COMP32 | 4621 PCI_EXP_DEVCAP2_ATOMIC_COMP64); 4622 } 4623 4624 if (!adev->have_atomics_support) 4625 dev_info(adev->dev, "PCIE atomic ops is not supported\n"); 4626 4627 /* doorbell bar mapping and doorbell index init*/ 4628 amdgpu_doorbell_init(adev); 4629 4630 if (amdgpu_emu_mode == 1) { 4631 /* post the asic on emulation mode */ 4632 emu_soc_asic_init(adev); 4633 goto fence_driver_init; 4634 } 4635 4636 amdgpu_reset_init(adev); 4637 4638 /* detect if we are with an SRIOV vbios */ 4639 if (adev->bios) 4640 amdgpu_device_detect_sriov_bios(adev); 4641 4642 /* check if we need to reset the asic 4643 * E.g., driver was not cleanly unloaded previously, etc. 4644 */ 4645 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { 4646 if (adev->gmc.xgmi.num_physical_nodes) { 4647 dev_info(adev->dev, "Pending hive reset.\n"); 4648 amdgpu_set_init_level(adev, 4649 AMDGPU_INIT_LEVEL_MINIMAL_XGMI); 4650 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && 4651 !amdgpu_device_has_display_hardware(adev)) { 4652 r = psp_gpu_reset(adev); 4653 } else { 4654 tmp = amdgpu_reset_method; 4655 /* It should do a default reset when loading or reloading the driver, 4656 * regardless of the module parameter reset_method. 4657 */ 4658 amdgpu_reset_method = AMD_RESET_METHOD_NONE; 4659 r = amdgpu_asic_reset(adev); 4660 amdgpu_reset_method = tmp; 4661 } 4662 4663 if (r) { 4664 dev_err(adev->dev, "asic reset on init failed\n"); 4665 goto failed; 4666 } 4667 } 4668 4669 /* Post card if necessary */ 4670 if (amdgpu_device_need_post(adev)) { 4671 if (!adev->bios) { 4672 dev_err(adev->dev, "no vBIOS found\n"); 4673 r = -EINVAL; 4674 goto failed; 4675 } 4676 dev_info(adev->dev, "GPU posting now...\n"); 4677 r = amdgpu_device_asic_init(adev); 4678 if (r) { 4679 dev_err(adev->dev, "gpu post error!\n"); 4680 goto failed; 4681 } 4682 } 4683 4684 if (adev->bios) { 4685 if (adev->is_atom_fw) { 4686 /* Initialize clocks */ 4687 r = amdgpu_atomfirmware_get_clock_info(adev); 4688 if (r) { 4689 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); 4690 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4691 goto failed; 4692 } 4693 } else { 4694 /* Initialize clocks */ 4695 r = amdgpu_atombios_get_clock_info(adev); 4696 if (r) { 4697 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); 4698 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); 4699 goto failed; 4700 } 4701 /* init i2c buses */ 4702 amdgpu_i2c_init(adev); 4703 } 4704 } 4705 4706 fence_driver_init: 4707 /* Fence driver */ 4708 r = amdgpu_fence_driver_sw_init(adev); 4709 if (r) { 4710 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n"); 4711 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0); 4712 goto failed; 4713 } 4714 4715 /* init the mode config */ 4716 drm_mode_config_init(adev_to_drm(adev)); 4717 4718 r = amdgpu_device_ip_init(adev); 4719 if (r) { 4720 dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); 4721 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); 4722 goto release_ras_con; 4723 } 4724 4725 amdgpu_fence_driver_hw_init(adev); 4726 4727 dev_info(adev->dev, 4728 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", 4729 
adev->gfx.config.max_shader_engines, 4730 adev->gfx.config.max_sh_per_se, 4731 adev->gfx.config.max_cu_per_sh, 4732 adev->gfx.cu_info.number); 4733 4734 adev->accel_working = true; 4735 4736 amdgpu_vm_check_compute_bug(adev); 4737 4738 /* Initialize the buffer migration limit. */ 4739 if (amdgpu_moverate >= 0) 4740 max_MBps = amdgpu_moverate; 4741 else 4742 max_MBps = 8; /* Allow 8 MB/s. */ 4743 /* Get a log2 for easy divisions. */ 4744 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); 4745 4746 /* 4747 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 4748 * Otherwise the mgpu fan boost feature will be skipped due to the 4749 * gpu instance is counted less. 4750 */ 4751 amdgpu_register_gpu_instance(adev); 4752 4753 /* enable clockgating, etc. after ib tests, etc. since some blocks require 4754 * explicit gating rather than handling it automatically. 4755 */ 4756 if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) { 4757 r = amdgpu_device_ip_late_init(adev); 4758 if (r) { 4759 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); 4760 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r); 4761 goto release_ras_con; 4762 } 4763 /* must succeed. */ 4764 amdgpu_ras_resume(adev); 4765 queue_delayed_work(system_wq, &adev->delayed_init_work, 4766 msecs_to_jiffies(AMDGPU_RESUME_MS)); 4767 } 4768 4769 if (amdgpu_sriov_vf(adev)) { 4770 amdgpu_virt_release_full_gpu(adev, true); 4771 flush_delayed_work(&adev->delayed_init_work); 4772 } 4773 4774 /* 4775 * Place those sysfs registering after `late_init`. As some of those 4776 * operations performed in `late_init` might affect the sysfs 4777 * interfaces creating. 4778 */ 4779 r = amdgpu_atombios_sysfs_init(adev); 4780 if (r) 4781 drm_err(&adev->ddev, 4782 "registering atombios sysfs failed (%d).\n", r); 4783 4784 r = amdgpu_pm_sysfs_init(adev); 4785 if (r) 4786 dev_err(adev->dev, "registering pm sysfs failed (%d).\n", r); 4787 4788 r = amdgpu_ucode_sysfs_init(adev); 4789 if (r) { 4790 adev->ucode_sysfs_en = false; 4791 dev_err(adev->dev, "Creating firmware sysfs failed (%d).\n", r); 4792 } else 4793 adev->ucode_sysfs_en = true; 4794 4795 r = amdgpu_device_attr_sysfs_init(adev); 4796 if (r) 4797 dev_err(adev->dev, "Could not create amdgpu device attr\n"); 4798 4799 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); 4800 if (r) 4801 dev_err(adev->dev, 4802 "Could not create amdgpu board attributes\n"); 4803 4804 amdgpu_fru_sysfs_init(adev); 4805 amdgpu_reg_state_sysfs_init(adev); 4806 amdgpu_xcp_sysfs_init(adev); 4807 4808 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 4809 r = amdgpu_pmu_init(adev); 4810 if (r) 4811 dev_err(adev->dev, "amdgpu_pmu_init failed\n"); 4812 4813 /* Have stored pci confspace at hand for restore in sudden PCI error */ 4814 if (amdgpu_device_cache_pci_state(adev->pdev)) 4815 pci_restore_state(pdev); 4816 4817 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ 4818 /* this will fail for cards that aren't VGA class devices, just 4819 * ignore it 4820 */ 4821 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 4822 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); 4823 4824 px = amdgpu_device_supports_px(adev); 4825 4826 if (px || (!dev_is_removable(&adev->pdev->dev) && 4827 apple_gmux_detect(NULL, NULL))) 4828 vga_switcheroo_register_client(adev->pdev, 4829 &amdgpu_switcheroo_ops, px); 4830 4831 if (px) 4832 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); 4833 4834 if (adev->init_lvl->level == 
AMDGPU_INIT_LEVEL_MINIMAL_XGMI) 4835 amdgpu_xgmi_reset_on_init(adev); 4836 4837 amdgpu_device_check_iommu_direct_map(adev); 4838 4839 adev->pm_nb.notifier_call = amdgpu_device_pm_notifier; 4840 r = register_pm_notifier(&adev->pm_nb); 4841 if (r) 4842 goto failed; 4843 4844 return 0; 4845 4846 release_ras_con: 4847 if (amdgpu_sriov_vf(adev)) 4848 amdgpu_virt_release_full_gpu(adev, true); 4849 4850 /* failed in exclusive mode due to timeout */ 4851 if (amdgpu_sriov_vf(adev) && 4852 !amdgpu_sriov_runtime(adev) && 4853 amdgpu_virt_mmio_blocked(adev) && 4854 !amdgpu_virt_wait_reset(adev)) { 4855 dev_err(adev->dev, "VF exclusive mode timeout\n"); 4856 /* Don't send request since VF is inactive. */ 4857 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; 4858 adev->virt.ops = NULL; 4859 r = -EAGAIN; 4860 } 4861 amdgpu_release_ras_context(adev); 4862 4863 failed: 4864 amdgpu_vf_error_trans_all(adev); 4865 4866 return r; 4867 } 4868 4869 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) 4870 { 4871 4872 /* Clear all CPU mappings pointing to this device */ 4873 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); 4874 4875 /* Unmap all mapped bars - Doorbell, registers and VRAM */ 4876 amdgpu_doorbell_fini(adev); 4877 4878 iounmap(adev->rmmio); 4879 adev->rmmio = NULL; 4880 if (adev->mman.aper_base_kaddr) 4881 iounmap(adev->mman.aper_base_kaddr); 4882 adev->mman.aper_base_kaddr = NULL; 4883 4884 /* Memory manager related */ 4885 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { 4886 arch_phys_wc_del(adev->gmc.vram_mtrr); 4887 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); 4888 } 4889 } 4890 4891 /** 4892 * amdgpu_device_fini_hw - tear down the driver 4893 * 4894 * @adev: amdgpu_device pointer 4895 * 4896 * Tear down the driver info (all asics). 4897 * Called at driver shutdown. 
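 * Teardown is split in two: this function quiesces the hardware side
 * (interrupts, display, fences, MMIO unmapping on unplug), while
 * amdgpu_device_fini_sw() releases the remaining software state.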
4898 */ 4899 void amdgpu_device_fini_hw(struct amdgpu_device *adev) 4900 { 4901 dev_info(adev->dev, "amdgpu: finishing device.\n"); 4902 flush_delayed_work(&adev->delayed_init_work); 4903 4904 if (adev->mman.initialized) 4905 drain_workqueue(adev->mman.bdev.wq); 4906 adev->shutdown = true; 4907 4908 unregister_pm_notifier(&adev->pm_nb); 4909 4910 /* make sure IB test finished before entering exclusive mode 4911 * to avoid preemption on IB test 4912 */ 4913 if (amdgpu_sriov_vf(adev)) { 4914 amdgpu_virt_request_full_gpu(adev, false); 4915 amdgpu_virt_fini_data_exchange(adev); 4916 } 4917 4918 /* disable all interrupts */ 4919 amdgpu_irq_disable_all(adev); 4920 if (adev->mode_info.mode_config_initialized) { 4921 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) 4922 drm_helper_force_disable_all(adev_to_drm(adev)); 4923 else 4924 drm_atomic_helper_shutdown(adev_to_drm(adev)); 4925 } 4926 amdgpu_fence_driver_hw_fini(adev); 4927 4928 if (adev->pm.sysfs_initialized) 4929 amdgpu_pm_sysfs_fini(adev); 4930 if (adev->ucode_sysfs_en) 4931 amdgpu_ucode_sysfs_fini(adev); 4932 amdgpu_device_attr_sysfs_fini(adev); 4933 amdgpu_fru_sysfs_fini(adev); 4934 4935 amdgpu_reg_state_sysfs_fini(adev); 4936 amdgpu_xcp_sysfs_fini(adev); 4937 4938 /* disable ras feature must before hw fini */ 4939 amdgpu_ras_pre_fini(adev); 4940 4941 amdgpu_ttm_set_buffer_funcs_status(adev, false); 4942 4943 amdgpu_device_ip_fini_early(adev); 4944 4945 amdgpu_irq_fini_hw(adev); 4946 4947 if (adev->mman.initialized) 4948 ttm_device_clear_dma_mappings(&adev->mman.bdev); 4949 4950 amdgpu_gart_dummy_page_fini(adev); 4951 4952 if (drm_dev_is_unplugged(adev_to_drm(adev))) 4953 amdgpu_device_unmap_mmio(adev); 4954 4955 } 4956 4957 void amdgpu_device_fini_sw(struct amdgpu_device *adev) 4958 { 4959 int i, idx; 4960 bool px; 4961 4962 amdgpu_device_ip_fini(adev); 4963 amdgpu_fence_driver_sw_fini(adev); 4964 amdgpu_ucode_release(&adev->firmware.gpu_info_fw); 4965 adev->accel_working = false; 4966 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); 4967 for (i = 0; i < MAX_XCP; ++i) { 4968 dma_fence_put(adev->isolation[i].spearhead); 4969 amdgpu_sync_free(&adev->isolation[i].active); 4970 amdgpu_sync_free(&adev->isolation[i].prev); 4971 } 4972 4973 amdgpu_reset_fini(adev); 4974 4975 /* free i2c buses */ 4976 amdgpu_i2c_fini(adev); 4977 4978 if (adev->bios) { 4979 if (amdgpu_emu_mode != 1) 4980 amdgpu_atombios_fini(adev); 4981 amdgpu_bios_release(adev); 4982 } 4983 4984 kfree(adev->fru_info); 4985 adev->fru_info = NULL; 4986 4987 kfree(adev->xcp_mgr); 4988 adev->xcp_mgr = NULL; 4989 4990 px = amdgpu_device_supports_px(adev); 4991 4992 if (px || (!dev_is_removable(&adev->pdev->dev) && 4993 apple_gmux_detect(NULL, NULL))) 4994 vga_switcheroo_unregister_client(adev->pdev); 4995 4996 if (px) 4997 vga_switcheroo_fini_domain_pm_ops(adev->dev); 4998 4999 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) 5000 vga_client_unregister(adev->pdev); 5001 5002 if (drm_dev_enter(adev_to_drm(adev), &idx)) { 5003 5004 iounmap(adev->rmmio); 5005 adev->rmmio = NULL; 5006 drm_dev_exit(idx); 5007 } 5008 5009 if (IS_ENABLED(CONFIG_PERF_EVENTS)) 5010 amdgpu_pmu_fini(adev); 5011 if (adev->mman.discovery_bin) 5012 amdgpu_discovery_fini(adev); 5013 5014 amdgpu_reset_put_reset_domain(adev->reset_domain); 5015 adev->reset_domain = NULL; 5016 5017 kfree(adev->pci_state); 5018 kfree(adev->pcie_reset_ctx.swds_pcistate); 5019 kfree(adev->pcie_reset_ctx.swus_pcistate); 5020 } 5021 5022 /** 5023 * amdgpu_device_evict_resources - evict device resources 5024 * 
@adev: amdgpu device object 5025 * 5026 * Evicts all ttm device resources(vram BOs, gart table) from the lru list 5027 * of the vram memory type. Mainly used for evicting device resources 5028 * at suspend time. 5029 * 5030 */ 5031 static int amdgpu_device_evict_resources(struct amdgpu_device *adev) 5032 { 5033 int ret; 5034 5035 /* No need to evict vram on APUs unless going to S4 */ 5036 if (!adev->in_s4 && (adev->flags & AMD_IS_APU)) 5037 return 0; 5038 5039 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); 5040 if (ret) { 5041 dev_warn(adev->dev, "evicting device resources failed\n"); 5042 return ret; 5043 } 5044 5045 if (adev->in_s4) { 5046 ret = ttm_device_prepare_hibernation(&adev->mman.bdev); 5047 if (ret) 5048 dev_err(adev->dev, "prepare hibernation failed, %d\n", ret); 5049 } 5050 return ret; 5051 } 5052 5053 /* 5054 * Suspend & resume. 5055 */ 5056 /** 5057 * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events 5058 * @nb: notifier block 5059 * @mode: suspend mode 5060 * @data: data 5061 * 5062 * This function is called when the system is about to suspend or hibernate. 5063 * It is used to set the appropriate flags so that eviction can be optimized 5064 * in the pm prepare callback. 5065 */ 5066 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, 5067 void *data) 5068 { 5069 struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb); 5070 5071 switch (mode) { 5072 case PM_HIBERNATION_PREPARE: 5073 adev->in_s4 = true; 5074 break; 5075 case PM_POST_HIBERNATION: 5076 adev->in_s4 = false; 5077 break; 5078 } 5079 5080 return NOTIFY_DONE; 5081 } 5082 5083 /** 5084 * amdgpu_device_prepare - prepare for device suspend 5085 * 5086 * @dev: drm dev pointer 5087 * 5088 * Prepare to put the hw in the suspend state (all asics). 5089 * Returns 0 for success or an error on failure. 5090 * Called at driver suspend. 5091 */ 5092 int amdgpu_device_prepare(struct drm_device *dev) 5093 { 5094 struct amdgpu_device *adev = drm_to_adev(dev); 5095 int i, r; 5096 5097 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5098 return 0; 5099 5100 /* Evict the majority of BOs before starting suspend sequence */ 5101 r = amdgpu_device_evict_resources(adev); 5102 if (r) 5103 return r; 5104 5105 flush_delayed_work(&adev->gfx.gfx_off_delay_work); 5106 5107 for (i = 0; i < adev->num_ip_blocks; i++) { 5108 if (!adev->ip_blocks[i].status.valid) 5109 continue; 5110 if (!adev->ip_blocks[i].version->funcs->prepare_suspend) 5111 continue; 5112 r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]); 5113 if (r) 5114 return r; 5115 } 5116 5117 return 0; 5118 } 5119 5120 /** 5121 * amdgpu_device_complete - complete power state transition 5122 * 5123 * @dev: drm dev pointer 5124 * 5125 * Undo the changes from amdgpu_device_prepare. This will be 5126 * called on all resume transitions, including those that failed. 5127 */ 5128 void amdgpu_device_complete(struct drm_device *dev) 5129 { 5130 struct amdgpu_device *adev = drm_to_adev(dev); 5131 int i; 5132 5133 for (i = 0; i < adev->num_ip_blocks; i++) { 5134 if (!adev->ip_blocks[i].status.valid) 5135 continue; 5136 if (!adev->ip_blocks[i].version->funcs->complete) 5137 continue; 5138 adev->ip_blocks[i].version->funcs->complete(&adev->ip_blocks[i]); 5139 } 5140 } 5141 5142 /** 5143 * amdgpu_device_suspend - initiate device suspend 5144 * 5145 * @dev: drm dev pointer 5146 * @notify_clients: notify in-kernel DRM clients 5147 * 5148 * Puts the hw in the suspend state (all asics). 
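 * Display IPs are suspended first in phase 1; KFD and user queues are
 * then suspended (outside of S0ix), resources are evicted, the fence
 * drivers are torn down, and the remaining IPs are suspended in phase 2.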
5149 * Returns 0 for success or an error on failure. 5150 * Called at driver suspend. 5151 */ 5152 int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) 5153 { 5154 struct amdgpu_device *adev = drm_to_adev(dev); 5155 int r = 0; 5156 5157 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5158 return 0; 5159 5160 adev->in_suspend = true; 5161 5162 if (amdgpu_sriov_vf(adev)) { 5163 if (!adev->in_s0ix && !adev->in_runpm) 5164 amdgpu_amdkfd_suspend_process(adev); 5165 amdgpu_virt_fini_data_exchange(adev); 5166 r = amdgpu_virt_request_full_gpu(adev, false); 5167 if (r) 5168 return r; 5169 } 5170 5171 if (amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D3)) 5172 dev_warn(adev->dev, "smart shift update failed\n"); 5173 5174 if (notify_clients) 5175 drm_client_dev_suspend(adev_to_drm(adev), false); 5176 5177 cancel_delayed_work_sync(&adev->delayed_init_work); 5178 5179 amdgpu_ras_suspend(adev); 5180 5181 amdgpu_device_ip_suspend_phase1(adev); 5182 5183 if (!adev->in_s0ix) { 5184 amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); 5185 amdgpu_userq_suspend(adev); 5186 } 5187 5188 r = amdgpu_device_evict_resources(adev); 5189 if (r) 5190 return r; 5191 5192 amdgpu_ttm_set_buffer_funcs_status(adev, false); 5193 5194 amdgpu_fence_driver_hw_fini(adev); 5195 5196 amdgpu_device_ip_suspend_phase2(adev); 5197 5198 if (amdgpu_sriov_vf(adev)) 5199 amdgpu_virt_release_full_gpu(adev, false); 5200 5201 r = amdgpu_dpm_notify_rlc_state(adev, false); 5202 if (r) 5203 return r; 5204 5205 return 0; 5206 } 5207 5208 static inline int amdgpu_virt_resume(struct amdgpu_device *adev) 5209 { 5210 int r; 5211 unsigned int prev_physical_node_id = adev->gmc.xgmi.physical_node_id; 5212 5213 /* During VM resume, QEMU programming of VF MSIX table (register GFXMSIX_VECT0_ADDR_LO) 5214 * may not work. The access could be blocked by nBIF protection as VF isn't in 5215 * exclusive access mode. Exclusive access is enabled now, disable/enable MSIX 5216 * so that QEMU reprograms MSIX table. 5217 */ 5218 amdgpu_restore_msix(adev); 5219 5220 r = adev->gfxhub.funcs->get_xgmi_info(adev); 5221 if (r) 5222 return r; 5223 5224 dev_info(adev->dev, "xgmi node, old id %d, new id %d\n", 5225 prev_physical_node_id, adev->gmc.xgmi.physical_node_id); 5226 5227 adev->vm_manager.vram_base_offset = adev->gfxhub.funcs->get_mc_fb_offset(adev); 5228 adev->vm_manager.vram_base_offset += 5229 adev->gmc.xgmi.physical_node_id * adev->gmc.xgmi.node_segment_size; 5230 5231 return 0; 5232 } 5233 5234 /** 5235 * amdgpu_device_resume - initiate device resume 5236 * 5237 * @dev: drm dev pointer 5238 * @notify_clients: notify in-kernel DRM clients 5239 * 5240 * Bring the hw back to operating state (all asics). 5241 * Returns 0 for success or an error on failure. 5242 * Called at driver resume. 
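 * The card is posted again if required, the IP blocks are resumed, KFD
 * and user queues are restarted, and late init plus RAS resume complete
 * the transition.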
5243 */ 5244 int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) 5245 { 5246 struct amdgpu_device *adev = drm_to_adev(dev); 5247 int r = 0; 5248 5249 if (amdgpu_sriov_vf(adev)) { 5250 r = amdgpu_virt_request_full_gpu(adev, true); 5251 if (r) 5252 return r; 5253 } 5254 5255 if (amdgpu_virt_xgmi_migrate_enabled(adev)) { 5256 r = amdgpu_virt_resume(adev); 5257 if (r) 5258 goto exit; 5259 } 5260 5261 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) 5262 return 0; 5263 5264 if (adev->in_s0ix) 5265 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry); 5266 5267 /* post card */ 5268 if (amdgpu_device_need_post(adev)) { 5269 r = amdgpu_device_asic_init(adev); 5270 if (r) 5271 dev_err(adev->dev, "amdgpu asic init failed\n"); 5272 } 5273 5274 r = amdgpu_device_ip_resume(adev); 5275 5276 if (r) { 5277 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); 5278 goto exit; 5279 } 5280 5281 if (!adev->in_s0ix) { 5282 r = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm); 5283 if (r) 5284 goto exit; 5285 5286 r = amdgpu_userq_resume(adev); 5287 if (r) 5288 goto exit; 5289 } 5290 5291 r = amdgpu_device_ip_late_init(adev); 5292 if (r) 5293 goto exit; 5294 5295 queue_delayed_work(system_wq, &adev->delayed_init_work, 5296 msecs_to_jiffies(AMDGPU_RESUME_MS)); 5297 exit: 5298 if (amdgpu_sriov_vf(adev)) { 5299 amdgpu_virt_init_data_exchange(adev); 5300 amdgpu_virt_release_full_gpu(adev, true); 5301 5302 if (!adev->in_s0ix && !r && !adev->in_runpm) 5303 r = amdgpu_amdkfd_resume_process(adev); 5304 } 5305 5306 if (r) 5307 return r; 5308 5309 /* Make sure IB tests flushed */ 5310 flush_delayed_work(&adev->delayed_init_work); 5311 5312 if (notify_clients) 5313 drm_client_dev_resume(adev_to_drm(adev), false); 5314 5315 amdgpu_ras_resume(adev); 5316 5317 if (adev->mode_info.num_crtc) { 5318 /* 5319 * Most of the connector probing functions try to acquire runtime pm 5320 * refs to ensure that the GPU is powered on when connector polling is 5321 * performed. Since we're calling this from a runtime PM callback, 5322 * trying to acquire rpm refs will cause us to deadlock. 5323 * 5324 * Since we're guaranteed to be holding the rpm lock, it's safe to 5325 * temporarily disable the rpm helpers so this doesn't deadlock us. 5326 */ 5327 #ifdef CONFIG_PM 5328 dev->dev->power.disable_depth++; 5329 #endif 5330 if (!adev->dc_enabled) 5331 drm_helper_hpd_irq_event(dev); 5332 else 5333 drm_kms_helper_hotplug_event(dev); 5334 #ifdef CONFIG_PM 5335 dev->dev->power.disable_depth--; 5336 #endif 5337 } 5338 adev->in_suspend = false; 5339 5340 if (amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0)) 5341 dev_warn(adev->dev, "smart shift update failed\n"); 5342 5343 return 0; 5344 } 5345 5346 /** 5347 * amdgpu_device_ip_check_soft_reset - did soft reset succeed 5348 * 5349 * @adev: amdgpu_device pointer 5350 * 5351 * The list of all the hardware IPs that make up the asic is walked and 5352 * the check_soft_reset callbacks are run. check_soft_reset determines 5353 * if the asic is still hung or not. 5354 * Returns true if any of the IPs are still in a hung state, false if not. 
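 * SR-IOV VFs and ASICs that report the need for a full reset are
 * treated as hung unconditionally.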
5355 */ 5356 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) 5357 { 5358 int i; 5359 bool asic_hang = false; 5360 5361 if (amdgpu_sriov_vf(adev)) 5362 return true; 5363 5364 if (amdgpu_asic_need_full_reset(adev)) 5365 return true; 5366 5367 for (i = 0; i < adev->num_ip_blocks; i++) { 5368 if (!adev->ip_blocks[i].status.valid) 5369 continue; 5370 if (adev->ip_blocks[i].version->funcs->check_soft_reset) 5371 adev->ip_blocks[i].status.hang = 5372 adev->ip_blocks[i].version->funcs->check_soft_reset( 5373 &adev->ip_blocks[i]); 5374 if (adev->ip_blocks[i].status.hang) { 5375 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); 5376 asic_hang = true; 5377 } 5378 } 5379 return asic_hang; 5380 } 5381 5382 /** 5383 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset 5384 * 5385 * @adev: amdgpu_device pointer 5386 * 5387 * The list of all the hardware IPs that make up the asic is walked and the 5388 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset 5389 * handles any IP specific hardware or software state changes that are 5390 * necessary for a soft reset to succeed. 5391 * Returns 0 on success, negative error code on failure. 5392 */ 5393 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev) 5394 { 5395 int i, r = 0; 5396 5397 for (i = 0; i < adev->num_ip_blocks; i++) { 5398 if (!adev->ip_blocks[i].status.valid) 5399 continue; 5400 if (adev->ip_blocks[i].status.hang && 5401 adev->ip_blocks[i].version->funcs->pre_soft_reset) { 5402 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]); 5403 if (r) 5404 return r; 5405 } 5406 } 5407 5408 return 0; 5409 } 5410 5411 /** 5412 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed 5413 * 5414 * @adev: amdgpu_device pointer 5415 * 5416 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu 5417 * reset is necessary to recover. 5418 * Returns true if a full asic reset is required, false if not. 5419 */ 5420 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) 5421 { 5422 int i; 5423 5424 if (amdgpu_asic_need_full_reset(adev)) 5425 return true; 5426 5427 for (i = 0; i < adev->num_ip_blocks; i++) { 5428 if (!adev->ip_blocks[i].status.valid) 5429 continue; 5430 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) || 5431 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) || 5432 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) || 5433 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || 5434 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { 5435 if (adev->ip_blocks[i].status.hang) { 5436 dev_info(adev->dev, "Some block need full reset!\n"); 5437 return true; 5438 } 5439 } 5440 } 5441 return false; 5442 } 5443 5444 /** 5445 * amdgpu_device_ip_soft_reset - do a soft reset 5446 * 5447 * @adev: amdgpu_device pointer 5448 * 5449 * The list of all the hardware IPs that make up the asic is walked and the 5450 * soft_reset callbacks are run if the block is hung. soft_reset handles any 5451 * IP specific hardware or software state changes that are necessary to soft 5452 * reset the IP. 5453 * Returns 0 on success, negative error code on failure. 
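 * The full sequence driven from amdgpu_device_pre_asic_reset() is
 * pre_soft_reset, then soft_reset, then post_soft_reset, with a fall
 * back to a full reset if the ASIC is still hung afterwards.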
5454 */ 5455 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev) 5456 { 5457 int i, r = 0; 5458 5459 for (i = 0; i < adev->num_ip_blocks; i++) { 5460 if (!adev->ip_blocks[i].status.valid) 5461 continue; 5462 if (adev->ip_blocks[i].status.hang && 5463 adev->ip_blocks[i].version->funcs->soft_reset) { 5464 r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]); 5465 if (r) 5466 return r; 5467 } 5468 } 5469 5470 return 0; 5471 } 5472 5473 /** 5474 * amdgpu_device_ip_post_soft_reset - clean up from soft reset 5475 * 5476 * @adev: amdgpu_device pointer 5477 * 5478 * The list of all the hardware IPs that make up the asic is walked and the 5479 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset 5480 * handles any IP specific hardware or software state changes that are 5481 * necessary after the IP has been soft reset. 5482 * Returns 0 on success, negative error code on failure. 5483 */ 5484 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev) 5485 { 5486 int i, r = 0; 5487 5488 for (i = 0; i < adev->num_ip_blocks; i++) { 5489 if (!adev->ip_blocks[i].status.valid) 5490 continue; 5491 if (adev->ip_blocks[i].status.hang && 5492 adev->ip_blocks[i].version->funcs->post_soft_reset) 5493 r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]); 5494 if (r) 5495 return r; 5496 } 5497 5498 return 0; 5499 } 5500 5501 /** 5502 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf 5503 * 5504 * @adev: amdgpu_device pointer 5505 * @reset_context: amdgpu reset context pointer 5506 * 5507 * do VF FLR and reinitialize Asic 5508 * return 0 means succeeded otherwise failed 5509 */ 5510 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, 5511 struct amdgpu_reset_context *reset_context) 5512 { 5513 int r; 5514 struct amdgpu_hive_info *hive = NULL; 5515 5516 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { 5517 if (!amdgpu_ras_get_fed_status(adev)) 5518 amdgpu_virt_ready_to_reset(adev); 5519 amdgpu_virt_wait_reset(adev); 5520 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5521 r = amdgpu_virt_request_full_gpu(adev, true); 5522 } else { 5523 r = amdgpu_virt_reset_gpu(adev); 5524 } 5525 if (r) 5526 return r; 5527 5528 amdgpu_ras_clear_err_state(adev); 5529 amdgpu_irq_gpu_reset_resume_helper(adev); 5530 5531 /* some sw clean up VF needs to do before recover */ 5532 amdgpu_virt_post_reset(adev); 5533 5534 /* Resume IP prior to SMC */ 5535 r = amdgpu_device_ip_reinit_early_sriov(adev); 5536 if (r) 5537 return r; 5538 5539 amdgpu_virt_init_data_exchange(adev); 5540 5541 r = amdgpu_device_fw_loading(adev); 5542 if (r) 5543 return r; 5544 5545 /* now we are okay to resume SMC/CP/SDMA */ 5546 r = amdgpu_device_ip_reinit_late_sriov(adev); 5547 if (r) 5548 return r; 5549 5550 hive = amdgpu_get_xgmi_hive(adev); 5551 /* Update PSP FW topology after reset */ 5552 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 5553 r = amdgpu_xgmi_update_topology(hive, adev); 5554 if (hive) 5555 amdgpu_put_xgmi_hive(hive); 5556 if (r) 5557 return r; 5558 5559 r = amdgpu_ib_ring_tests(adev); 5560 if (r) 5561 return r; 5562 5563 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) 5564 amdgpu_inc_vram_lost(adev); 5565 5566 /* need to be called during full access so we can't do it later like 5567 * bare-metal does. 
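 * amdgpu_amdkfd_post_reset() therefore runs here, just before full GPU
 * access is released.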
5568 */ 5569 amdgpu_amdkfd_post_reset(adev); 5570 amdgpu_virt_release_full_gpu(adev, true); 5571 5572 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ 5573 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) || 5574 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 5575 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) || 5576 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 5, 0) || 5577 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) 5578 amdgpu_ras_resume(adev); 5579 5580 amdgpu_virt_ras_telemetry_post_reset(adev); 5581 5582 return 0; 5583 } 5584 5585 /** 5586 * amdgpu_device_has_job_running - check if there is any unfinished job 5587 * 5588 * @adev: amdgpu_device pointer 5589 * 5590 * check if there is any job running on the device when guest driver receives 5591 * FLR notification from host driver. If there are still jobs running, then 5592 * the guest driver will not respond the FLR reset. Instead, let the job hit 5593 * the timeout and guest driver then issue the reset request. 5594 */ 5595 bool amdgpu_device_has_job_running(struct amdgpu_device *adev) 5596 { 5597 int i; 5598 5599 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5600 struct amdgpu_ring *ring = adev->rings[i]; 5601 5602 if (!amdgpu_ring_sched_ready(ring)) 5603 continue; 5604 5605 if (amdgpu_fence_count_emitted(ring)) 5606 return true; 5607 } 5608 return false; 5609 } 5610 5611 /** 5612 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery 5613 * 5614 * @adev: amdgpu_device pointer 5615 * 5616 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover 5617 * a hung GPU. 5618 */ 5619 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) 5620 { 5621 5622 if (amdgpu_gpu_recovery == 0) 5623 goto disabled; 5624 5625 /* Skip soft reset check in fatal error mode */ 5626 if (!amdgpu_ras_is_poison_mode_supported(adev)) 5627 return true; 5628 5629 if (amdgpu_sriov_vf(adev)) 5630 return true; 5631 5632 if (amdgpu_gpu_recovery == -1) { 5633 switch (adev->asic_type) { 5634 #ifdef CONFIG_DRM_AMDGPU_SI 5635 case CHIP_VERDE: 5636 case CHIP_TAHITI: 5637 case CHIP_PITCAIRN: 5638 case CHIP_OLAND: 5639 case CHIP_HAINAN: 5640 #endif 5641 #ifdef CONFIG_DRM_AMDGPU_CIK 5642 case CHIP_KAVERI: 5643 case CHIP_KABINI: 5644 case CHIP_MULLINS: 5645 #endif 5646 case CHIP_CARRIZO: 5647 case CHIP_STONEY: 5648 case CHIP_CYAN_SKILLFISH: 5649 goto disabled; 5650 default: 5651 break; 5652 } 5653 } 5654 5655 return true; 5656 5657 disabled: 5658 dev_info(adev->dev, "GPU recovery disabled.\n"); 5659 return false; 5660 } 5661 5662 int amdgpu_device_mode1_reset(struct amdgpu_device *adev) 5663 { 5664 u32 i; 5665 int ret = 0; 5666 5667 if (adev->bios) 5668 amdgpu_atombios_scratch_regs_engine_hung(adev, true); 5669 5670 dev_info(adev->dev, "GPU mode1 reset\n"); 5671 5672 /* Cache the state before bus master disable. The saved config space 5673 * values are used in other cases like restore after mode-2 reset. 
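 * The cached state is restored with amdgpu_device_load_pci_state() once
 * the reset below completes.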
5674 */ 5675 amdgpu_device_cache_pci_state(adev->pdev); 5676 5677 /* disable BM */ 5678 pci_clear_master(adev->pdev); 5679 5680 if (amdgpu_dpm_is_mode1_reset_supported(adev)) { 5681 dev_info(adev->dev, "GPU smu mode1 reset\n"); 5682 ret = amdgpu_dpm_mode1_reset(adev); 5683 } else { 5684 dev_info(adev->dev, "GPU psp mode1 reset\n"); 5685 ret = psp_gpu_reset(adev); 5686 } 5687 5688 if (ret) 5689 goto mode1_reset_failed; 5690 5691 amdgpu_device_load_pci_state(adev->pdev); 5692 ret = amdgpu_psp_wait_for_bootloader(adev); 5693 if (ret) 5694 goto mode1_reset_failed; 5695 5696 /* wait for asic to come out of reset */ 5697 for (i = 0; i < adev->usec_timeout; i++) { 5698 u32 memsize = adev->nbio.funcs->get_memsize(adev); 5699 5700 if (memsize != 0xffffffff) 5701 break; 5702 udelay(1); 5703 } 5704 5705 if (i >= adev->usec_timeout) { 5706 ret = -ETIMEDOUT; 5707 goto mode1_reset_failed; 5708 } 5709 5710 if (adev->bios) 5711 amdgpu_atombios_scratch_regs_engine_hung(adev, false); 5712 5713 return 0; 5714 5715 mode1_reset_failed: 5716 dev_err(adev->dev, "GPU mode1 reset failed\n"); 5717 return ret; 5718 } 5719 5720 int amdgpu_device_link_reset(struct amdgpu_device *adev) 5721 { 5722 int ret = 0; 5723 5724 dev_info(adev->dev, "GPU link reset\n"); 5725 5726 if (!amdgpu_reset_in_dpc(adev)) 5727 ret = amdgpu_dpm_link_reset(adev); 5728 5729 if (ret) 5730 goto link_reset_failed; 5731 5732 ret = amdgpu_psp_wait_for_bootloader(adev); 5733 if (ret) 5734 goto link_reset_failed; 5735 5736 return 0; 5737 5738 link_reset_failed: 5739 dev_err(adev->dev, "GPU link reset failed\n"); 5740 return ret; 5741 } 5742 5743 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 5744 struct amdgpu_reset_context *reset_context) 5745 { 5746 int i, r = 0; 5747 struct amdgpu_job *job = NULL; 5748 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; 5749 bool need_full_reset = 5750 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5751 5752 if (reset_context->reset_req_dev == adev) 5753 job = reset_context->job; 5754 5755 if (amdgpu_sriov_vf(adev)) 5756 amdgpu_virt_pre_reset(adev); 5757 5758 amdgpu_fence_driver_isr_toggle(adev, true); 5759 5760 /* block all schedulers and reset given job's ring */ 5761 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5762 struct amdgpu_ring *ring = adev->rings[i]; 5763 5764 if (!amdgpu_ring_sched_ready(ring)) 5765 continue; 5766 5767 /* Clear job fence from fence drv to avoid force_completion 5768 * leave NULL and vm flush fence in fence drv 5769 */ 5770 amdgpu_fence_driver_clear_job_fences(ring); 5771 5772 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ 5773 amdgpu_fence_driver_force_completion(ring); 5774 } 5775 5776 amdgpu_fence_driver_isr_toggle(adev, false); 5777 5778 if (job && job->vm) 5779 drm_sched_increase_karma(&job->base); 5780 5781 r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 5782 /* If reset handler not implemented, continue; otherwise return */ 5783 if (r == -EOPNOTSUPP) 5784 r = 0; 5785 else 5786 return r; 5787 5788 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 5789 if (!amdgpu_sriov_vf(adev)) { 5790 5791 if (!need_full_reset) 5792 need_full_reset = amdgpu_device_ip_need_full_reset(adev); 5793 5794 if (!need_full_reset && amdgpu_gpu_recovery && 5795 amdgpu_device_ip_check_soft_reset(adev)) { 5796 amdgpu_device_ip_pre_soft_reset(adev); 5797 r = amdgpu_device_ip_soft_reset(adev); 5798 amdgpu_device_ip_post_soft_reset(adev); 5799 if (r || amdgpu_device_ip_check_soft_reset(adev)) { 5800 
dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); 5801 need_full_reset = true; 5802 } 5803 } 5804 5805 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { 5806 dev_info(tmp_adev->dev, "Dumping IP State\n"); 5807 /* Trigger ip dump before we reset the asic */ 5808 for (i = 0; i < tmp_adev->num_ip_blocks; i++) 5809 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) 5810 tmp_adev->ip_blocks[i].version->funcs 5811 ->dump_ip_state((void *)&tmp_adev->ip_blocks[i]); 5812 dev_info(tmp_adev->dev, "Dumping IP State Completed\n"); 5813 } 5814 5815 if (need_full_reset) 5816 r = amdgpu_device_ip_suspend(adev); 5817 if (need_full_reset) 5818 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5819 else 5820 clear_bit(AMDGPU_NEED_FULL_RESET, 5821 &reset_context->flags); 5822 } 5823 5824 return r; 5825 } 5826 5827 int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context) 5828 { 5829 struct list_head *device_list_handle; 5830 bool full_reset, vram_lost = false; 5831 struct amdgpu_device *tmp_adev; 5832 int r, init_level; 5833 5834 device_list_handle = reset_context->reset_device_list; 5835 5836 if (!device_list_handle) 5837 return -EINVAL; 5838 5839 full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5840 5841 /** 5842 * If it's reset on init, it's default init level, otherwise keep level 5843 * as recovery level. 5844 */ 5845 if (reset_context->method == AMD_RESET_METHOD_ON_INIT) 5846 init_level = AMDGPU_INIT_LEVEL_DEFAULT; 5847 else 5848 init_level = AMDGPU_INIT_LEVEL_RESET_RECOVERY; 5849 5850 r = 0; 5851 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5852 amdgpu_set_init_level(tmp_adev, init_level); 5853 if (full_reset) { 5854 /* post card */ 5855 amdgpu_reset_set_dpc_status(tmp_adev, false); 5856 amdgpu_ras_clear_err_state(tmp_adev); 5857 r = amdgpu_device_asic_init(tmp_adev); 5858 if (r) { 5859 dev_warn(tmp_adev->dev, "asic atom init failed!"); 5860 } else { 5861 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); 5862 5863 r = amdgpu_device_ip_resume_phase1(tmp_adev); 5864 if (r) 5865 goto out; 5866 5867 vram_lost = amdgpu_device_check_vram_lost(tmp_adev); 5868 5869 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) 5870 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job); 5871 5872 if (vram_lost) { 5873 dev_info( 5874 tmp_adev->dev, 5875 "VRAM is lost due to GPU reset!\n"); 5876 amdgpu_inc_vram_lost(tmp_adev); 5877 } 5878 5879 r = amdgpu_device_fw_loading(tmp_adev); 5880 if (r) 5881 return r; 5882 5883 r = amdgpu_xcp_restore_partition_mode( 5884 tmp_adev->xcp_mgr); 5885 if (r) 5886 goto out; 5887 5888 r = amdgpu_device_ip_resume_phase2(tmp_adev); 5889 if (r) 5890 goto out; 5891 5892 if (tmp_adev->mman.buffer_funcs_ring->sched.ready) 5893 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); 5894 5895 r = amdgpu_device_ip_resume_phase3(tmp_adev); 5896 if (r) 5897 goto out; 5898 5899 if (vram_lost) 5900 amdgpu_device_fill_reset_magic(tmp_adev); 5901 5902 /* 5903 * Add this ASIC as tracked as reset was already 5904 * complete successfully. 
5905 */ 5906 amdgpu_register_gpu_instance(tmp_adev); 5907 5908 if (!reset_context->hive && 5909 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5910 amdgpu_xgmi_add_device(tmp_adev); 5911 5912 r = amdgpu_device_ip_late_init(tmp_adev); 5913 if (r) 5914 goto out; 5915 5916 drm_client_dev_resume(adev_to_drm(tmp_adev), false); 5917 5918 /* 5919 * The GPU enters bad state once faulty pages 5920 * by ECC has reached the threshold, and ras 5921 * recovery is scheduled next. So add one check 5922 * here to break recovery if it indeed exceeds 5923 * bad page threshold, and remind user to 5924 * retire this GPU or setting one bigger 5925 * bad_page_threshold value to fix this once 5926 * probing driver again. 5927 */ 5928 if (!amdgpu_ras_is_rma(tmp_adev)) { 5929 /* must succeed. */ 5930 amdgpu_ras_resume(tmp_adev); 5931 } else { 5932 r = -EINVAL; 5933 goto out; 5934 } 5935 5936 /* Update PSP FW topology after reset */ 5937 if (reset_context->hive && 5938 tmp_adev->gmc.xgmi.num_physical_nodes > 1) 5939 r = amdgpu_xgmi_update_topology( 5940 reset_context->hive, tmp_adev); 5941 } 5942 } 5943 5944 out: 5945 if (!r) { 5946 /* IP init is complete now, set level as default */ 5947 amdgpu_set_init_level(tmp_adev, 5948 AMDGPU_INIT_LEVEL_DEFAULT); 5949 amdgpu_irq_gpu_reset_resume_helper(tmp_adev); 5950 r = amdgpu_ib_ring_tests(tmp_adev); 5951 if (r) { 5952 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r); 5953 r = -EAGAIN; 5954 goto end; 5955 } 5956 } 5957 5958 if (r) 5959 tmp_adev->asic_reset_res = r; 5960 } 5961 5962 end: 5963 return r; 5964 } 5965 5966 int amdgpu_do_asic_reset(struct list_head *device_list_handle, 5967 struct amdgpu_reset_context *reset_context) 5968 { 5969 struct amdgpu_device *tmp_adev = NULL; 5970 bool need_full_reset, skip_hw_reset; 5971 int r = 0; 5972 5973 /* Try reset handler method first */ 5974 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5975 reset_list); 5976 5977 reset_context->reset_device_list = device_list_handle; 5978 r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 5979 /* If reset handler not implemented, continue; otherwise return */ 5980 if (r == -EOPNOTSUPP) 5981 r = 0; 5982 else 5983 return r; 5984 5985 /* Reset handler not implemented, use the default method */ 5986 need_full_reset = 5987 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 5988 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 5989 5990 /* 5991 * ASIC reset has to be done on all XGMI hive nodes ASAP 5992 * to allow proper links negotiation in FW (within 1 sec) 5993 */ 5994 if (!skip_hw_reset && need_full_reset) { 5995 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5996 /* For XGMI run all resets in parallel to speed up the process */ 5997 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 5998 if (!queue_work(system_unbound_wq, 5999 &tmp_adev->xgmi_reset_work)) 6000 r = -EALREADY; 6001 } else 6002 r = amdgpu_asic_reset(tmp_adev); 6003 6004 if (r) { 6005 dev_err(tmp_adev->dev, 6006 "ASIC reset failed with error, %d for drm dev, %s", 6007 r, adev_to_drm(tmp_adev)->unique); 6008 goto out; 6009 } 6010 } 6011 6012 /* For XGMI wait for all resets to complete before proceed */ 6013 if (!r) { 6014 list_for_each_entry(tmp_adev, device_list_handle, 6015 reset_list) { 6016 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { 6017 flush_work(&tmp_adev->xgmi_reset_work); 6018 r = tmp_adev->asic_reset_res; 6019 if (r) 6020 break; 6021 } 6022 } 6023 } 6024 } 6025 6026 if (!r && amdgpu_ras_intr_triggered()) { 6027 list_for_each_entry(tmp_adev, 
				    device_list_handle, reset_list) {
			amdgpu_ras_reset_error_count(tmp_adev,
						     AMDGPU_RAS_BLOCK__MMHUB);
		}

		amdgpu_ras_intr_cleared();
	}

	r = amdgpu_device_reinit_after_reset(reset_context);
	if (r == -EAGAIN)
		set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
	else
		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);

out:
	return r;
}

static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
{

	switch (amdgpu_asic_reset_method(adev)) {
	case AMD_RESET_METHOD_MODE1:
	case AMD_RESET_METHOD_LINK:
		adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
		break;
	case AMD_RESET_METHOD_MODE2:
		adev->mp1_state = PP_MP1_STATE_RESET;
		break;
	default:
		adev->mp1_state = PP_MP1_STATE_NONE;
		break;
	}
}

static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
{
	amdgpu_vf_error_trans_all(adev);
	adev->mp1_state = PP_MP1_STATE_NONE;
}

static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
{
	struct pci_dev *p = NULL;

	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
			adev->pdev->bus->number, 1);
	if (p) {
		pm_runtime_enable(&(p->dev));
		pm_runtime_resume(&(p->dev));
	}

	pci_dev_put(p);
}

static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
{
	enum amd_reset_method reset_method;
	struct pci_dev *p = NULL;
	u64 expires;

	/*
	 * For now, only BACO and mode1 reset are confirmed to suffer
	 * from the audio issue if the codec is not properly suspended.
	 */
	reset_method = amdgpu_asic_reset_method(adev);
	if ((reset_method != AMD_RESET_METHOD_BACO) &&
	    (reset_method != AMD_RESET_METHOD_MODE1))
		return -EINVAL;

	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
			adev->pdev->bus->number, 1);
	if (!p)
		return -ENODEV;

	expires = pm_runtime_autosuspend_expiration(&(p->dev));
	if (!expires)
		/*
		 * If we cannot get the audio device autosuspend delay,
		 * use a fixed 4s timeout. The audio controller's default
		 * autosuspend delay is 3s, so 4s is guaranteed to cover it.
		 */
		expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;

	while (!pm_runtime_status_suspended(&(p->dev))) {
		if (!pm_runtime_suspend(&(p->dev)))
			break;

		if (expires < ktime_get_mono_fast_ns()) {
			dev_warn(adev->dev, "failed to suspend display audio\n");
			pci_dev_put(p);
			/* TODO: abort the succeeding gpu reset?
*/ 6120 return -ETIMEDOUT; 6121 } 6122 } 6123 6124 pm_runtime_disable(&(p->dev)); 6125 6126 pci_dev_put(p); 6127 return 0; 6128 } 6129 6130 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) 6131 { 6132 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 6133 6134 #if defined(CONFIG_DEBUG_FS) 6135 if (!amdgpu_sriov_vf(adev)) 6136 cancel_work(&adev->reset_work); 6137 #endif 6138 6139 if (adev->kfd.dev) 6140 cancel_work(&adev->kfd.reset_work); 6141 6142 if (amdgpu_sriov_vf(adev)) 6143 cancel_work(&adev->virt.flr_work); 6144 6145 if (con && adev->ras_enabled) 6146 cancel_work(&con->recovery_work); 6147 6148 } 6149 6150 static int amdgpu_device_health_check(struct list_head *device_list_handle) 6151 { 6152 struct amdgpu_device *tmp_adev; 6153 int ret = 0; 6154 6155 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6156 ret |= amdgpu_device_bus_status_check(tmp_adev); 6157 } 6158 6159 return ret; 6160 } 6161 6162 static void amdgpu_device_recovery_prepare(struct amdgpu_device *adev, 6163 struct list_head *device_list, 6164 struct amdgpu_hive_info *hive) 6165 { 6166 struct amdgpu_device *tmp_adev = NULL; 6167 6168 /* 6169 * Build list of devices to reset. 6170 * In case we are in XGMI hive mode, resort the device list 6171 * to put adev in the 1st position. 6172 */ 6173 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) { 6174 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 6175 list_add_tail(&tmp_adev->reset_list, device_list); 6176 if (adev->shutdown) 6177 tmp_adev->shutdown = true; 6178 if (amdgpu_reset_in_dpc(adev)) 6179 tmp_adev->pcie_reset_ctx.in_link_reset = true; 6180 } 6181 if (!list_is_first(&adev->reset_list, device_list)) 6182 list_rotate_to_front(&adev->reset_list, device_list); 6183 } else { 6184 list_add_tail(&adev->reset_list, device_list); 6185 } 6186 } 6187 6188 static void amdgpu_device_recovery_get_reset_lock(struct amdgpu_device *adev, 6189 struct list_head *device_list) 6190 { 6191 struct amdgpu_device *tmp_adev = NULL; 6192 6193 if (list_empty(device_list)) 6194 return; 6195 tmp_adev = 6196 list_first_entry(device_list, struct amdgpu_device, reset_list); 6197 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 6198 } 6199 6200 static void amdgpu_device_recovery_put_reset_lock(struct amdgpu_device *adev, 6201 struct list_head *device_list) 6202 { 6203 struct amdgpu_device *tmp_adev = NULL; 6204 6205 if (list_empty(device_list)) 6206 return; 6207 tmp_adev = 6208 list_first_entry(device_list, struct amdgpu_device, reset_list); 6209 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 6210 } 6211 6212 static void amdgpu_device_halt_activities(struct amdgpu_device *adev, 6213 struct amdgpu_job *job, 6214 struct amdgpu_reset_context *reset_context, 6215 struct list_head *device_list, 6216 struct amdgpu_hive_info *hive, 6217 bool need_emergency_restart) 6218 { 6219 struct amdgpu_device *tmp_adev = NULL; 6220 int i; 6221 6222 /* block all schedulers and reset given job's ring */ 6223 list_for_each_entry(tmp_adev, device_list, reset_list) { 6224 amdgpu_device_set_mp1_state(tmp_adev); 6225 6226 /* 6227 * Try to put the audio codec into suspend state 6228 * before gpu reset started. 6229 * 6230 * Due to the power domain of the graphics device 6231 * is shared with AZ power domain. Without this, 6232 * we may change the audio hardware from behind 6233 * the audio driver's back. That will trigger 6234 * some audio codec errors. 
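	 * The codec is resumed again from amdgpu_device_gpu_resume() once
	 * recovery has finished.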
6235 */ 6236 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 6237 tmp_adev->pcie_reset_ctx.audio_suspended = true; 6238 6239 amdgpu_ras_set_error_query_ready(tmp_adev, false); 6240 6241 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 6242 6243 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context); 6244 6245 /* 6246 * Mark these ASICs to be reset as untracked first 6247 * And add them back after reset completed 6248 */ 6249 amdgpu_unregister_gpu_instance(tmp_adev); 6250 6251 drm_client_dev_suspend(adev_to_drm(tmp_adev), false); 6252 6253 /* disable ras on ALL IPs */ 6254 if (!need_emergency_restart && !amdgpu_reset_in_dpc(adev) && 6255 amdgpu_device_ip_need_full_reset(tmp_adev)) 6256 amdgpu_ras_suspend(tmp_adev); 6257 6258 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 6259 struct amdgpu_ring *ring = tmp_adev->rings[i]; 6260 6261 if (!amdgpu_ring_sched_ready(ring)) 6262 continue; 6263 6264 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 6265 6266 if (need_emergency_restart) 6267 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 6268 } 6269 atomic_inc(&tmp_adev->gpu_reset_counter); 6270 } 6271 } 6272 6273 static int amdgpu_device_asic_reset(struct amdgpu_device *adev, 6274 struct list_head *device_list, 6275 struct amdgpu_reset_context *reset_context) 6276 { 6277 struct amdgpu_device *tmp_adev = NULL; 6278 int retry_limit = AMDGPU_MAX_RETRY_LIMIT; 6279 int r = 0; 6280 6281 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 6282 list_for_each_entry(tmp_adev, device_list, reset_list) { 6283 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 6284 /*TODO Should we stop ?*/ 6285 if (r) { 6286 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 6287 r, adev_to_drm(tmp_adev)->unique); 6288 tmp_adev->asic_reset_res = r; 6289 } 6290 } 6291 6292 /* Actual ASIC resets if needed.*/ 6293 /* Host driver will handle XGMI hive reset for SRIOV */ 6294 if (amdgpu_sriov_vf(adev)) { 6295 6296 /* Bail out of reset early */ 6297 if (amdgpu_ras_is_rma(adev)) 6298 return -ENODEV; 6299 6300 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { 6301 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); 6302 amdgpu_ras_set_fed(adev, true); 6303 set_bit(AMDGPU_HOST_FLR, &reset_context->flags); 6304 } 6305 6306 r = amdgpu_device_reset_sriov(adev, reset_context); 6307 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) { 6308 amdgpu_virt_release_full_gpu(adev, true); 6309 goto retry; 6310 } 6311 if (r) 6312 adev->asic_reset_res = r; 6313 } else { 6314 r = amdgpu_do_asic_reset(device_list, reset_context); 6315 if (r && r == -EAGAIN) 6316 goto retry; 6317 } 6318 6319 list_for_each_entry(tmp_adev, device_list, reset_list) { 6320 /* 6321 * Drop any pending non scheduler resets queued before reset is done. 6322 * Any reset scheduled after this point would be valid. Scheduler resets 6323 * were already dropped during drm_sched_stop and no new ones can come 6324 * in before drm_sched_start. 
	 */
		amdgpu_device_stop_pending_resets(tmp_adev);
	}

	return r;
}

static int amdgpu_device_sched_resume(struct list_head *device_list,
				      struct amdgpu_reset_context *reset_context,
				      bool job_signaled)
{
	struct amdgpu_device *tmp_adev = NULL;
	int i, r = 0;

	/* Post ASIC reset for all devs. */
	list_for_each_entry(tmp_adev, device_list, reset_list) {

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];

			if (!amdgpu_ring_sched_ready(ring))
				continue;

			drm_sched_start(&ring->sched, 0);
		}

		if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));

		if (tmp_adev->asic_reset_res)
			r = tmp_adev->asic_reset_res;

		tmp_adev->asic_reset_res = 0;

		if (r) {
			/* Bad news, how do we tell it to userspace?
			 * For a RAS error we should report the GPU bad status
			 * instead of the reset failure.
			 */
			if (reset_context->src != AMDGPU_RESET_SRC_RAS ||
			    !amdgpu_ras_eeprom_check_err_threshold(tmp_adev))
				dev_info(tmp_adev->dev, "GPU reset(%d) failed\n",
					 atomic_read(&tmp_adev->gpu_reset_counter));
			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
		} else {
			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
			if (amdgpu_acpi_smart_shift_update(tmp_adev,
							   AMDGPU_SS_DEV_D0))
				dev_warn(tmp_adev->dev,
					 "smart shift update failed\n");
		}
	}

	return r;
}

static void amdgpu_device_gpu_resume(struct amdgpu_device *adev,
				     struct list_head *device_list,
				     bool need_emergency_restart)
{
	struct amdgpu_device *tmp_adev = NULL;

	list_for_each_entry(tmp_adev, device_list, reset_list) {
		/* unlock kfd: SRIOV would do it separately */
		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
			amdgpu_amdkfd_post_reset(tmp_adev);

		/* kfd_post_reset will do nothing if the kfd device is not
		 * initialized, so bring up kfd here if it was not initialized
		 * before.
		 */
		if (!adev->kfd.init_complete)
			amdgpu_amdkfd_device_init(adev);

		if (tmp_adev->pcie_reset_ctx.audio_suspended)
			amdgpu_device_resume_display_audio(tmp_adev);

		amdgpu_device_unset_mp1_state(tmp_adev);

		amdgpu_ras_set_error_query_ready(tmp_adev, true);

	}
}


/**
 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
 *
 * @adev: amdgpu_device pointer
 * @job: the job which triggered the hang
 * @reset_context: amdgpu reset context pointer
 *
 * Attempt to reset the GPU if it has hung (all asics).
 * Attempt to do soft-reset or full-reset and reinitialize the ASIC.
 * Returns 0 for success or an error on failure.
 */

int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
			      struct amdgpu_job *job,
			      struct amdgpu_reset_context *reset_context)
{
	struct list_head device_list;
	bool job_signaled = false;
	struct amdgpu_hive_info *hive = NULL;
	int r = 0;
	bool need_emergency_restart = false;

	/*
	 * If it reaches here because of hang/timeout and a RAS error is
	 * detected at the same time, let RAS recovery take care of it.
6434 */ 6435 if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) && 6436 !amdgpu_sriov_vf(adev) && 6437 reset_context->src != AMDGPU_RESET_SRC_RAS) { 6438 dev_dbg(adev->dev, 6439 "Gpu recovery from source: %d yielding to RAS error recovery handling", 6440 reset_context->src); 6441 return 0; 6442 } 6443 6444 /* 6445 * Special case: RAS triggered and full reset isn't supported 6446 */ 6447 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 6448 6449 /* 6450 * Flush RAM to disk so that after reboot 6451 * the user can read log and see why the system rebooted. 6452 */ 6453 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 6454 amdgpu_ras_get_context(adev)->reboot) { 6455 dev_warn(adev->dev, "Emergency reboot."); 6456 6457 ksys_sync_helper(); 6458 emergency_restart(); 6459 } 6460 6461 dev_info(adev->dev, "GPU %s begin!. Source: %d\n", 6462 need_emergency_restart ? "jobs stop" : "reset", 6463 reset_context->src); 6464 6465 if (!amdgpu_sriov_vf(adev)) 6466 hive = amdgpu_get_xgmi_hive(adev); 6467 if (hive) 6468 mutex_lock(&hive->hive_lock); 6469 6470 reset_context->job = job; 6471 reset_context->hive = hive; 6472 INIT_LIST_HEAD(&device_list); 6473 6474 amdgpu_device_recovery_prepare(adev, &device_list, hive); 6475 6476 if (!amdgpu_sriov_vf(adev)) { 6477 r = amdgpu_device_health_check(&device_list); 6478 if (r) 6479 goto end_reset; 6480 } 6481 6482 /* We need to lock reset domain only once both for XGMI and single device */ 6483 amdgpu_device_recovery_get_reset_lock(adev, &device_list); 6484 6485 amdgpu_device_halt_activities(adev, job, reset_context, &device_list, 6486 hive, need_emergency_restart); 6487 if (need_emergency_restart) 6488 goto skip_sched_resume; 6489 /* 6490 * Must check guilty signal here since after this point all old 6491 * HW fences are force signaled. 6492 * 6493 * job->base holds a reference to parent fence 6494 */ 6495 if (job && dma_fence_is_signaled(&job->hw_fence.base)) { 6496 job_signaled = true; 6497 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 6498 goto skip_hw_reset; 6499 } 6500 6501 r = amdgpu_device_asic_reset(adev, &device_list, reset_context); 6502 if (r) 6503 goto reset_unlock; 6504 skip_hw_reset: 6505 r = amdgpu_device_sched_resume(&device_list, reset_context, job_signaled); 6506 if (r) 6507 goto reset_unlock; 6508 skip_sched_resume: 6509 amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart); 6510 reset_unlock: 6511 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 6512 end_reset: 6513 if (hive) { 6514 mutex_unlock(&hive->hive_lock); 6515 amdgpu_put_xgmi_hive(hive); 6516 } 6517 6518 if (r) 6519 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 6520 6521 atomic_set(&adev->reset_domain->reset_res, r); 6522 6523 if (!r) { 6524 struct amdgpu_task_info *ti = NULL; 6525 6526 if (job) 6527 ti = amdgpu_vm_get_task_info_pasid(adev, job->pasid); 6528 6529 drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, 6530 ti ? &ti->task : NULL); 6531 6532 amdgpu_vm_put_task_info(ti); 6533 } 6534 6535 return r; 6536 } 6537 6538 /** 6539 * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner 6540 * 6541 * @adev: amdgpu_device pointer 6542 * @speed: pointer to the speed of the link 6543 * @width: pointer to the width of the link 6544 * 6545 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the 6546 * first physical partner to an AMD dGPU. 6547 * This will exclude any virtual switches and links. 
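 * @speed and @width are initialized to PCI_SPEED_UNKNOWN and
 * PCIE_LNK_WIDTH_UNKNOWN before the hierarchy is evaluated.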
 */
static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
					    enum pci_bus_speed *speed,
					    enum pcie_link_width *width)
{
	struct pci_dev *parent = adev->pdev;

	if (!speed || !width)
		return;

	*speed = PCI_SPEED_UNKNOWN;
	*width = PCIE_LNK_WIDTH_UNKNOWN;

	if (amdgpu_device_pcie_dynamic_switching_supported(adev)) {
		while ((parent = pci_upstream_bridge(parent))) {
			/* skip upstream/downstream switches internal to dGPU */
			if (parent->vendor == PCI_VENDOR_ID_ATI)
				continue;
			*speed = pcie_get_speed_cap(parent);
			*width = pcie_get_width_cap(parent);
			break;
		}
	} else {
		/* use the current speeds rather than max if switching is not supported */
		pcie_bandwidth_available(adev->pdev, NULL, speed, width);
	}
}

/**
 * amdgpu_device_gpu_bandwidth - find the bandwidth of the GPU
 *
 * @adev: amdgpu_device pointer
 * @speed: pointer to the speed of the link
 * @width: pointer to the width of the link
 *
 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
 * AMD dGPU which may be a virtual upstream bridge.
 */
static void amdgpu_device_gpu_bandwidth(struct amdgpu_device *adev,
					enum pci_bus_speed *speed,
					enum pcie_link_width *width)
{
	struct pci_dev *parent = adev->pdev;

	if (!speed || !width)
		return;

	parent = pci_upstream_bridge(parent);
	if (parent && parent->vendor == PCI_VENDOR_ID_ATI) {
		/* use the upstream/downstream switches internal to dGPU */
		*speed = pcie_get_speed_cap(parent);
		*width = pcie_get_width_cap(parent);
		while ((parent = pci_upstream_bridge(parent))) {
			if (parent->vendor == PCI_VENDOR_ID_ATI) {
				/* use the upstream/downstream switches internal to dGPU */
				*speed = pcie_get_speed_cap(parent);
				*width = pcie_get_width_cap(parent);
			}
		}
	} else {
		/* use the device itself */
		*speed = pcie_get_speed_cap(adev->pdev);
		*width = pcie_get_width_cap(adev->pdev);
	}
}

/**
 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIe capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIe config space may not be available.
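 * The result is stored in adev->pm.pcie_gen_mask and adev->pm.pcie_mlw_mask;
 * the amdgpu_pcie_gen_cap and amdgpu_pcie_lane_cap overrides, when set, take
 * precedence over the detected values.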
6622 */ 6623 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) 6624 { 6625 enum pci_bus_speed speed_cap, platform_speed_cap; 6626 enum pcie_link_width platform_link_width, link_width; 6627 6628 if (amdgpu_pcie_gen_cap) 6629 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap; 6630 6631 if (amdgpu_pcie_lane_cap) 6632 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; 6633 6634 /* covers APUs as well */ 6635 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { 6636 if (adev->pm.pcie_gen_mask == 0) 6637 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; 6638 if (adev->pm.pcie_mlw_mask == 0) 6639 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK; 6640 return; 6641 } 6642 6643 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask) 6644 return; 6645 6646 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap, 6647 &platform_link_width); 6648 amdgpu_device_gpu_bandwidth(adev, &speed_cap, &link_width); 6649 6650 if (adev->pm.pcie_gen_mask == 0) { 6651 /* asic caps */ 6652 if (speed_cap == PCI_SPEED_UNKNOWN) { 6653 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6654 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6655 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6656 } else { 6657 if (speed_cap == PCIE_SPEED_32_0GT) 6658 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6659 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6660 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6661 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6662 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5); 6663 else if (speed_cap == PCIE_SPEED_16_0GT) 6664 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6665 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6666 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6667 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4); 6668 else if (speed_cap == PCIE_SPEED_8_0GT) 6669 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6670 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6671 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3); 6672 else if (speed_cap == PCIE_SPEED_5_0GT) 6673 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6674 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2); 6675 else 6676 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1; 6677 } 6678 /* platform caps */ 6679 if (platform_speed_cap == PCI_SPEED_UNKNOWN) { 6680 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6681 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6682 } else { 6683 if (platform_speed_cap == PCIE_SPEED_32_0GT) 6684 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6685 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6686 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6687 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 | 6688 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5); 6689 else if (platform_speed_cap == PCIE_SPEED_16_0GT) 6690 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6691 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6692 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 | 6693 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4); 6694 else if (platform_speed_cap == PCIE_SPEED_8_0GT) 6695 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6696 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 | 6697 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3); 6698 else if (platform_speed_cap == PCIE_SPEED_5_0GT) 6699 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 | 6700 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2); 6701 else 6702 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1; 6703 6704 } 6705 } 6706 if (adev->pm.pcie_mlw_mask == 0) { 6707 /* asic caps */ 6708 if (link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6709 
adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_ASIC_PCIE_MLW_MASK; 6710 } else { 6711 switch (link_width) { 6712 case PCIE_LNK_X32: 6713 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X32 | 6714 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6715 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6716 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6717 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6718 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6719 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6720 break; 6721 case PCIE_LNK_X16: 6722 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X16 | 6723 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6724 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6725 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6726 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6727 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6728 break; 6729 case PCIE_LNK_X12: 6730 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X12 | 6731 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6732 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6733 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6734 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6735 break; 6736 case PCIE_LNK_X8: 6737 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X8 | 6738 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6739 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6740 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6741 break; 6742 case PCIE_LNK_X4: 6743 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X4 | 6744 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6745 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6746 break; 6747 case PCIE_LNK_X2: 6748 adev->pm.pcie_mlw_mask |= (CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X2 | 6749 CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1); 6750 break; 6751 case PCIE_LNK_X1: 6752 adev->pm.pcie_mlw_mask |= CAIL_ASIC_PCIE_LINK_WIDTH_SUPPORT_X1; 6753 break; 6754 default: 6755 break; 6756 } 6757 } 6758 /* platform caps */ 6759 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) { 6760 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK; 6761 } else { 6762 switch (platform_link_width) { 6763 case PCIE_LNK_X32: 6764 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 | 6765 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6766 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6767 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6768 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6769 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6770 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6771 break; 6772 case PCIE_LNK_X16: 6773 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 | 6774 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6775 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6776 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6777 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6778 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6779 break; 6780 case PCIE_LNK_X12: 6781 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 | 6782 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6783 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6784 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6785 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6786 break; 6787 case PCIE_LNK_X8: 6788 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 | 6789 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6790 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6791 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6792 break; 6793 case PCIE_LNK_X4: 6794 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 | 6795 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6796 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6797 break; 6798 case PCIE_LNK_X2: 6799 adev->pm.pcie_mlw_mask |= (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 | 6800 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1); 6801 break; 6802 case PCIE_LNK_X1: 6803 adev->pm.pcie_mlw_mask |= CAIL_PCIE_LINK_WIDTH_SUPPORT_X1; 6804 break; 6805 
default: 6806 break; 6807 } 6808 } 6809 } 6810 } 6811 6812 /** 6813 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR 6814 * 6815 * @adev: amdgpu_device pointer 6816 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev 6817 * 6818 * Return true if @peer_adev can access (DMA) @adev through the PCIe 6819 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of 6820 * @peer_adev. 6821 */ 6822 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, 6823 struct amdgpu_device *peer_adev) 6824 { 6825 #ifdef CONFIG_HSA_AMD_P2P 6826 bool p2p_access = 6827 !adev->gmc.xgmi.connected_to_cpu && 6828 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); 6829 if (!p2p_access) 6830 dev_info(adev->dev, "PCIe P2P access from peer device %s is not supported by the chipset\n", 6831 pci_name(peer_adev->pdev)); 6832 6833 bool is_large_bar = adev->gmc.visible_vram_size && 6834 adev->gmc.real_vram_size == adev->gmc.visible_vram_size; 6835 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev); 6836 6837 if (!p2p_addressable) { 6838 uint64_t address_mask = peer_adev->dev->dma_mask ? 6839 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); 6840 resource_size_t aper_limit = 6841 adev->gmc.aper_base + adev->gmc.aper_size - 1; 6842 6843 p2p_addressable = !(adev->gmc.aper_base & address_mask || 6844 aper_limit & address_mask); 6845 } 6846 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable; 6847 #else 6848 return false; 6849 #endif 6850 } 6851 6852 int amdgpu_device_baco_enter(struct amdgpu_device *adev) 6853 { 6854 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6855 6856 if (!amdgpu_device_supports_baco(adev)) 6857 return -ENOTSUPP; 6858 6859 if (ras && adev->ras_enabled && 6860 adev->nbio.funcs->enable_doorbell_interrupt) 6861 adev->nbio.funcs->enable_doorbell_interrupt(adev, false); 6862 6863 return amdgpu_dpm_baco_enter(adev); 6864 } 6865 6866 int amdgpu_device_baco_exit(struct amdgpu_device *adev) 6867 { 6868 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 6869 int ret = 0; 6870 6871 if (!amdgpu_device_supports_baco(adev)) 6872 return -ENOTSUPP; 6873 6874 ret = amdgpu_dpm_baco_exit(adev); 6875 if (ret) 6876 return ret; 6877 6878 if (ras && adev->ras_enabled && 6879 adev->nbio.funcs->enable_doorbell_interrupt) 6880 adev->nbio.funcs->enable_doorbell_interrupt(adev, true); 6881 6882 if (amdgpu_passthrough(adev) && adev->nbio.funcs && 6883 adev->nbio.funcs->clear_doorbell_interrupt) 6884 adev->nbio.funcs->clear_doorbell_interrupt(adev); 6885 6886 return 0; 6887 } 6888 6889 /** 6890 * amdgpu_pci_error_detected - Called when a PCI error is detected. 6891 * @pdev: PCI device struct 6892 * @state: PCI channel state 6893 * 6894 * Description: Called when a PCI error is detected. 6895 * 6896 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
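 * PCI_ERS_RESULT_CAN_RECOVER is returned for pci_channel_io_normal.
 *
 * For pci_channel_io_frozen, device activities are halted and the reset
 * domain lock is taken here; it is released again in amdgpu_pci_slot_reset()
 * or amdgpu_pci_resume().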
6897 */ 6898 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) 6899 { 6900 struct drm_device *dev = pci_get_drvdata(pdev); 6901 struct amdgpu_device *adev = drm_to_adev(dev); 6902 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); 6903 struct amdgpu_reset_context reset_context; 6904 struct list_head device_list; 6905 6906 dev_info(adev->dev, "PCI error: detected callback!!\n"); 6907 6908 adev->pci_channel_state = state; 6909 6910 switch (state) { 6911 case pci_channel_io_normal: 6912 dev_info(adev->dev, "pci_channel_io_normal: state(%d)!!\n", state); 6913 return PCI_ERS_RESULT_CAN_RECOVER; 6914 case pci_channel_io_frozen: 6915 /* Fatal error, prepare for slot reset */ 6916 dev_info(adev->dev, "pci_channel_io_frozen: state(%d)!!\n", state); 6917 if (hive) { 6918 /* Hive devices should be able to support FW based 6919 * link reset on other devices, if not return. 6920 */ 6921 if (!amdgpu_dpm_is_link_reset_supported(adev)) { 6922 dev_warn(adev->dev, 6923 "No support for XGMI hive yet...\n"); 6924 return PCI_ERS_RESULT_DISCONNECT; 6925 } 6926 /* Set dpc status only if device is part of hive 6927 * Non-hive devices should be able to recover after 6928 * link reset. 6929 */ 6930 amdgpu_reset_set_dpc_status(adev, true); 6931 6932 mutex_lock(&hive->hive_lock); 6933 } 6934 memset(&reset_context, 0, sizeof(reset_context)); 6935 INIT_LIST_HEAD(&device_list); 6936 6937 amdgpu_device_recovery_prepare(adev, &device_list, hive); 6938 amdgpu_device_recovery_get_reset_lock(adev, &device_list); 6939 amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list, 6940 hive, false); 6941 if (hive) { 6942 mutex_unlock(&hive->hive_lock); 6943 amdgpu_put_xgmi_hive(hive); 6944 } 6945 return PCI_ERS_RESULT_NEED_RESET; 6946 case pci_channel_io_perm_failure: 6947 /* Permanent error, prepare for device removal */ 6948 dev_info(adev->dev, "pci_channel_io_perm_failure: state(%d)!!\n", state); 6949 return PCI_ERS_RESULT_DISCONNECT; 6950 } 6951 6952 return PCI_ERS_RESULT_NEED_RESET; 6953 } 6954 6955 /** 6956 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers 6957 * @pdev: pointer to PCI device 6958 */ 6959 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) 6960 { 6961 struct drm_device *dev = pci_get_drvdata(pdev); 6962 struct amdgpu_device *adev = drm_to_adev(dev); 6963 6964 dev_info(adev->dev, "PCI error: mmio enabled callback!!\n"); 6965 6966 /* TODO - dump whatever for debugging purposes */ 6967 6968 /* This called only if amdgpu_pci_error_detected returns 6969 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still 6970 * works, no need to reset slot. 6971 */ 6972 6973 return PCI_ERS_RESULT_RECOVERED; 6974 } 6975 6976 /** 6977 * amdgpu_pci_slot_reset - Called when PCI slot has been reset. 6978 * @pdev: PCI device struct 6979 * 6980 * Description: This routine is called by the pci error recovery 6981 * code after the PCI slot has been reset, just before we 6982 * should resume normal operations. 
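 *
 * Return: PCI_ERS_RESULT_RECOVERED on success, PCI_ERS_RESULT_DISCONNECT
 * otherwise.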
6983 */ 6984 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) 6985 { 6986 struct drm_device *dev = pci_get_drvdata(pdev); 6987 struct amdgpu_device *adev = drm_to_adev(dev); 6988 struct amdgpu_reset_context reset_context; 6989 struct amdgpu_device *tmp_adev; 6990 struct amdgpu_hive_info *hive; 6991 struct list_head device_list; 6992 struct pci_dev *link_dev; 6993 int r = 0, i, timeout; 6994 u32 memsize; 6995 u16 status; 6996 6997 dev_info(adev->dev, "PCI error: slot reset callback!!\n"); 6998 6999 memset(&reset_context, 0, sizeof(reset_context)); 7000 7001 if (adev->pcie_reset_ctx.swus) 7002 link_dev = adev->pcie_reset_ctx.swus; 7003 else 7004 link_dev = adev->pdev; 7005 /* wait for asic to come out of reset, timeout = 10s */ 7006 timeout = 10000; 7007 do { 7008 usleep_range(10000, 10500); 7009 r = pci_read_config_word(link_dev, PCI_VENDOR_ID, &status); 7010 timeout -= 10; 7011 } while (timeout > 0 && (status != PCI_VENDOR_ID_ATI) && 7012 (status != PCI_VENDOR_ID_AMD)); 7013 7014 if ((status != PCI_VENDOR_ID_ATI) && (status != PCI_VENDOR_ID_AMD)) { 7015 r = -ETIME; 7016 goto out; 7017 } 7018 7019 amdgpu_device_load_switch_state(adev); 7020 /* Restore PCI confspace */ 7021 amdgpu_device_load_pci_state(pdev); 7022 7023 /* confirm ASIC came out of reset */ 7024 for (i = 0; i < adev->usec_timeout; i++) { 7025 memsize = amdgpu_asic_get_config_memsize(adev); 7026 7027 if (memsize != 0xffffffff) 7028 break; 7029 udelay(1); 7030 } 7031 if (memsize == 0xffffffff) { 7032 r = -ETIME; 7033 goto out; 7034 } 7035 7036 reset_context.method = AMD_RESET_METHOD_NONE; 7037 reset_context.reset_req_dev = adev; 7038 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 7039 set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags); 7040 INIT_LIST_HEAD(&device_list); 7041 7042 hive = amdgpu_get_xgmi_hive(adev); 7043 if (hive) { 7044 mutex_lock(&hive->hive_lock); 7045 reset_context.hive = hive; 7046 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 7047 tmp_adev->pcie_reset_ctx.in_link_reset = true; 7048 list_add_tail(&tmp_adev->reset_list, &device_list); 7049 } 7050 } else { 7051 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 7052 list_add_tail(&adev->reset_list, &device_list); 7053 } 7054 7055 r = amdgpu_device_asic_reset(adev, &device_list, &reset_context); 7056 out: 7057 if (!r) { 7058 if (amdgpu_device_cache_pci_state(adev->pdev)) 7059 pci_restore_state(adev->pdev); 7060 dev_info(adev->dev, "PCIe error recovery succeeded\n"); 7061 } else { 7062 dev_err(adev->dev, "PCIe error recovery failed, err:%d\n", r); 7063 if (hive) { 7064 list_for_each_entry(tmp_adev, &device_list, reset_list) 7065 amdgpu_device_unset_mp1_state(tmp_adev); 7066 } 7067 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 7068 } 7069 7070 if (hive) { 7071 mutex_unlock(&hive->hive_lock); 7072 amdgpu_put_xgmi_hive(hive); 7073 } 7074 7075 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; 7076 } 7077 7078 /** 7079 * amdgpu_pci_resume() - resume normal ops after PCI reset 7080 * @pdev: pointer to PCI device 7081 * 7082 * Called when the error recovery driver tells us that its 7083 * OK to resume normal operation. 
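 * Only acts if the channel state recorded in amdgpu_pci_error_detected() was
 * pci_channel_io_frozen; the schedulers are restarted and the reset domain
 * lock taken there is dropped.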
7084 */ 7085 void amdgpu_pci_resume(struct pci_dev *pdev) 7086 { 7087 struct drm_device *dev = pci_get_drvdata(pdev); 7088 struct amdgpu_device *adev = drm_to_adev(dev); 7089 struct list_head device_list; 7090 struct amdgpu_hive_info *hive = NULL; 7091 struct amdgpu_device *tmp_adev = NULL; 7092 7093 dev_info(adev->dev, "PCI error: resume callback!!\n"); 7094 7095 /* Only continue execution for the case of pci_channel_io_frozen */ 7096 if (adev->pci_channel_state != pci_channel_io_frozen) 7097 return; 7098 7099 INIT_LIST_HEAD(&device_list); 7100 7101 hive = amdgpu_get_xgmi_hive(adev); 7102 if (hive) { 7103 mutex_lock(&hive->hive_lock); 7104 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 7105 tmp_adev->pcie_reset_ctx.in_link_reset = false; 7106 list_add_tail(&tmp_adev->reset_list, &device_list); 7107 } 7108 } else 7109 list_add_tail(&adev->reset_list, &device_list); 7110 7111 amdgpu_device_sched_resume(&device_list, NULL, NULL); 7112 amdgpu_device_gpu_resume(adev, &device_list, false); 7113 amdgpu_device_recovery_put_reset_lock(adev, &device_list); 7114 7115 if (hive) { 7116 mutex_unlock(&hive->hive_lock); 7117 amdgpu_put_xgmi_hive(hive); 7118 } 7119 } 7120 7121 static void amdgpu_device_cache_switch_state(struct amdgpu_device *adev) 7122 { 7123 struct pci_dev *parent = pci_upstream_bridge(adev->pdev); 7124 int r; 7125 7126 if (parent->vendor != PCI_VENDOR_ID_ATI) 7127 return; 7128 7129 /* If already saved, return */ 7130 if (adev->pcie_reset_ctx.swus) 7131 return; 7132 /* Upstream bridge is ATI, assume it's SWUS/DS architecture */ 7133 r = pci_save_state(parent); 7134 if (r) 7135 return; 7136 adev->pcie_reset_ctx.swds_pcistate = pci_store_saved_state(parent); 7137 7138 parent = pci_upstream_bridge(parent); 7139 r = pci_save_state(parent); 7140 if (r) 7141 return; 7142 adev->pcie_reset_ctx.swus_pcistate = pci_store_saved_state(parent); 7143 7144 adev->pcie_reset_ctx.swus = parent; 7145 } 7146 7147 static void amdgpu_device_load_switch_state(struct amdgpu_device *adev) 7148 { 7149 struct pci_dev *pdev; 7150 int r; 7151 7152 if (!adev->pcie_reset_ctx.swds_pcistate || 7153 !adev->pcie_reset_ctx.swus_pcistate) 7154 return; 7155 7156 pdev = adev->pcie_reset_ctx.swus; 7157 r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swus_pcistate); 7158 if (!r) { 7159 pci_restore_state(pdev); 7160 } else { 7161 dev_warn(adev->dev, "Failed to load SWUS state, err:%d\n", r); 7162 return; 7163 } 7164 7165 pdev = pci_upstream_bridge(adev->pdev); 7166 r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swds_pcistate); 7167 if (!r) 7168 pci_restore_state(pdev); 7169 else 7170 dev_warn(adev->dev, "Failed to load SWDS state, err:%d\n", r); 7171 } 7172 7173 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) 7174 { 7175 struct drm_device *dev = pci_get_drvdata(pdev); 7176 struct amdgpu_device *adev = drm_to_adev(dev); 7177 int r; 7178 7179 if (amdgpu_sriov_vf(adev)) 7180 return false; 7181 7182 r = pci_save_state(pdev); 7183 if (!r) { 7184 kfree(adev->pci_state); 7185 7186 adev->pci_state = pci_store_saved_state(pdev); 7187 7188 if (!adev->pci_state) { 7189 dev_err(adev->dev, "Failed to store PCI saved state"); 7190 return false; 7191 } 7192 } else { 7193 dev_warn(adev->dev, "Failed to save PCI state, err:%d\n", r); 7194 return false; 7195 } 7196 7197 amdgpu_device_cache_switch_state(adev); 7198 7199 return true; 7200 } 7201 7202 bool amdgpu_device_load_pci_state(struct pci_dev *pdev) 7203 { 7204 struct drm_device *dev = pci_get_drvdata(pdev); 7205 struct amdgpu_device *adev = 
						drm_to_adev(dev);
	int r;

	if (!adev->pci_state)
		return false;

	r = pci_load_saved_state(pdev, adev->pci_state);

	if (!r) {
		pci_restore_state(pdev);
	} else {
		dev_warn(adev->dev, "Failed to load PCI state, err:%d\n", r);
		return false;
	}

	return true;
}

void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	if (ring && ring->funcs->emit_hdp_flush)
		amdgpu_ring_emit_hdp_flush(ring);
	else
		amdgpu_asic_flush_hdp(adev, ring);
}

void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
		struct amdgpu_ring *ring)
{
#ifdef CONFIG_X86_64
	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
		return;
#endif
	if (adev->gmc.xgmi.connected_to_cpu)
		return;

	amdgpu_asic_invalidate_hdp(adev, ring);
}

int amdgpu_in_reset(struct amdgpu_device *adev)
{
	return atomic_read(&adev->reset_domain->in_gpu_reset);
}

/**
 * amdgpu_device_halt() - bring hardware to some kind of halt state
 *
 * @adev: amdgpu_device pointer
 *
 * Bring the hardware to some kind of halt state so that no one can touch it
 * any more. It helps to maintain the error context when an error occurs.
 * Compared to a simple hang, the system will remain stable at least for SSH
 * access. Then it should be trivial to inspect the hardware state and
 * see what's going on. Implemented as follows:
 *
 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc),
 * clears all CPU mappings to the device, disallows remappings through page faults
 * 2. amdgpu_irq_disable_all() disables all interrupts
 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
 * 4. set adev->no_hw_access to avoid potential crashes after step 5
 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
 * 6.
pci_disable_device() and pci_wait_for_pending_transaction() 7275 * flush any in flight DMA operations 7276 */ 7277 void amdgpu_device_halt(struct amdgpu_device *adev) 7278 { 7279 struct pci_dev *pdev = adev->pdev; 7280 struct drm_device *ddev = adev_to_drm(adev); 7281 7282 amdgpu_xcp_dev_unplug(adev); 7283 drm_dev_unplug(ddev); 7284 7285 amdgpu_irq_disable_all(adev); 7286 7287 amdgpu_fence_driver_hw_fini(adev); 7288 7289 adev->no_hw_access = true; 7290 7291 amdgpu_device_unmap_mmio(adev); 7292 7293 pci_disable_device(pdev); 7294 pci_wait_for_pending_transaction(pdev); 7295 } 7296 7297 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev, 7298 u32 reg) 7299 { 7300 unsigned long flags, address, data; 7301 u32 r; 7302 7303 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 7304 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 7305 7306 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 7307 WREG32(address, reg * 4); 7308 (void)RREG32(address); 7309 r = RREG32(data); 7310 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 7311 return r; 7312 } 7313 7314 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev, 7315 u32 reg, u32 v) 7316 { 7317 unsigned long flags, address, data; 7318 7319 address = adev->nbio.funcs->get_pcie_port_index_offset(adev); 7320 data = adev->nbio.funcs->get_pcie_port_data_offset(adev); 7321 7322 spin_lock_irqsave(&adev->pcie_idx_lock, flags); 7323 WREG32(address, reg * 4); 7324 (void)RREG32(address); 7325 WREG32(data, v); 7326 (void)RREG32(data); 7327 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); 7328 } 7329 7330 /** 7331 * amdgpu_device_get_gang - return a reference to the current gang 7332 * @adev: amdgpu_device pointer 7333 * 7334 * Returns: A new reference to the current gang leader. 7335 */ 7336 struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev) 7337 { 7338 struct dma_fence *fence; 7339 7340 rcu_read_lock(); 7341 fence = dma_fence_get_rcu_safe(&adev->gang_submit); 7342 rcu_read_unlock(); 7343 return fence; 7344 } 7345 7346 /** 7347 * amdgpu_device_switch_gang - switch to a new gang 7348 * @adev: amdgpu_device pointer 7349 * @gang: the gang to switch to 7350 * 7351 * Try to switch to a new gang. 7352 * Returns: NULL if we switched to the new gang or a reference to the current 7353 * gang leader. 7354 */ 7355 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev, 7356 struct dma_fence *gang) 7357 { 7358 struct dma_fence *old = NULL; 7359 7360 dma_fence_get(gang); 7361 do { 7362 dma_fence_put(old); 7363 old = amdgpu_device_get_gang(adev); 7364 if (old == gang) 7365 break; 7366 7367 if (!dma_fence_is_signaled(old)) { 7368 dma_fence_put(gang); 7369 return old; 7370 } 7371 7372 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit, 7373 old, gang) != old); 7374 7375 /* 7376 * Drop it once for the exchanged reference in adev and once for the 7377 * thread local reference acquired in amdgpu_device_get_gang(). 7378 */ 7379 dma_fence_put(old); 7380 dma_fence_put(old); 7381 return NULL; 7382 } 7383 7384 /** 7385 * amdgpu_device_enforce_isolation - enforce HW isolation 7386 * @adev: the amdgpu device pointer 7387 * @ring: the HW ring the job is supposed to run on 7388 * @job: the job which is about to be pushed to the HW ring 7389 * 7390 * Makes sure that only one client at a time can use the GFX block. 7391 * Returns: The dependency to wait on before the job can be pushed to the HW. 7392 * The function is called multiple times until NULL is returned. 
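 * Isolation is currently only enforced for GFX and compute rings; for all
 * other ring types NULL is returned immediately.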
7393 */ 7394 struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev, 7395 struct amdgpu_ring *ring, 7396 struct amdgpu_job *job) 7397 { 7398 struct amdgpu_isolation *isolation = &adev->isolation[ring->xcp_id]; 7399 struct drm_sched_fence *f = job->base.s_fence; 7400 struct dma_fence *dep; 7401 void *owner; 7402 int r; 7403 7404 /* 7405 * For now enforce isolation only for the GFX block since we only need 7406 * the cleaner shader on those rings. 7407 */ 7408 if (ring->funcs->type != AMDGPU_RING_TYPE_GFX && 7409 ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE) 7410 return NULL; 7411 7412 /* 7413 * All submissions where enforce isolation is false are handled as if 7414 * they come from a single client. Use ~0l as the owner to distinct it 7415 * from kernel submissions where the owner is NULL. 7416 */ 7417 owner = job->enforce_isolation ? f->owner : (void *)~0l; 7418 7419 mutex_lock(&adev->enforce_isolation_mutex); 7420 7421 /* 7422 * The "spearhead" submission is the first one which changes the 7423 * ownership to its client. We always need to wait for it to be 7424 * pushed to the HW before proceeding with anything. 7425 */ 7426 if (&f->scheduled != isolation->spearhead && 7427 !dma_fence_is_signaled(isolation->spearhead)) { 7428 dep = isolation->spearhead; 7429 goto out_grab_ref; 7430 } 7431 7432 if (isolation->owner != owner) { 7433 7434 /* 7435 * Wait for any gang to be assembled before switching to a 7436 * different owner or otherwise we could deadlock the 7437 * submissions. 7438 */ 7439 if (!job->gang_submit) { 7440 dep = amdgpu_device_get_gang(adev); 7441 if (!dma_fence_is_signaled(dep)) 7442 goto out_return_dep; 7443 dma_fence_put(dep); 7444 } 7445 7446 dma_fence_put(isolation->spearhead); 7447 isolation->spearhead = dma_fence_get(&f->scheduled); 7448 amdgpu_sync_move(&isolation->active, &isolation->prev); 7449 trace_amdgpu_isolation(isolation->owner, owner); 7450 isolation->owner = owner; 7451 } 7452 7453 /* 7454 * Specifying the ring here helps to pipeline submissions even when 7455 * isolation is enabled. If that is not desired for testing NULL can be 7456 * used instead of the ring to enforce a CPU round trip while switching 7457 * between clients. 
	 */
	dep = amdgpu_sync_peek_fence(&isolation->prev, ring);
	r = amdgpu_sync_fence(&isolation->active, &f->finished, GFP_NOWAIT);
	if (r)
		dev_warn(adev->dev, "OOM tracking isolation\n");

out_grab_ref:
	dma_fence_get(dep);
out_return_dep:
	mutex_unlock(&adev->enforce_isolation_mutex);
	return dep;
}

bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
{
	switch (adev->asic_type) {
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_HAINAN:
#endif
	case CHIP_TOPAZ:
		/* chips with no display hardware */
		return false;
#ifdef CONFIG_DRM_AMDGPU_SI
	case CHIP_TAHITI:
	case CHIP_PITCAIRN:
	case CHIP_VERDE:
	case CHIP_OLAND:
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
	case CHIP_BONAIRE:
	case CHIP_HAWAII:
	case CHIP_KAVERI:
	case CHIP_KABINI:
	case CHIP_MULLINS:
#endif
	case CHIP_TONGA:
	case CHIP_FIJI:
	case CHIP_POLARIS10:
	case CHIP_POLARIS11:
	case CHIP_POLARIS12:
	case CHIP_VEGAM:
	case CHIP_CARRIZO:
	case CHIP_STONEY:
		/* chips with display hardware */
		return true;
	default:
		/* IP discovery */
		if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
			return false;
		return true;
	}
}

uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
		uint32_t inst, uint32_t reg_addr, char reg_name[],
		uint32_t expected_value, uint32_t mask)
{
	uint32_t ret = 0;
	uint32_t old_ = 0;
	uint32_t tmp_ = RREG32(reg_addr);
	uint32_t loop = adev->usec_timeout;

	while ((tmp_ & (mask)) != (expected_value)) {
		if (old_ != tmp_) {
			loop = adev->usec_timeout;
			old_ = tmp_;
		} else
			udelay(1);
		tmp_ = RREG32(reg_addr);
		loop--;
		if (!loop) {
			dev_warn(
				adev->dev,
				"Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
				inst, reg_name, (uint32_t)expected_value,
				(uint32_t)(tmp_ & (mask)));
			ret = -ETIMEDOUT;
			break;
		}
	}
	return ret;
}

ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring)
{
	ssize_t size = 0;

	if (!ring || !ring->adev)
		return size;

	if (amdgpu_device_should_recover_gpu(ring->adev))
		size |= AMDGPU_RESET_TYPE_FULL;

	if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
	    !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery)
		size |= AMDGPU_RESET_TYPE_SOFT_RESET;

	return size;
}

ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
{
	ssize_t size = 0;

	if (supported_reset == 0) {
		size += sysfs_emit_at(buf, size, "unsupported");
		size += sysfs_emit_at(buf, size, "\n");
		return size;
	}

	if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET)
		size += sysfs_emit_at(buf, size, "soft ");

	if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)
		size += sysfs_emit_at(buf, size, "queue ");

	if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)
		size += sysfs_emit_at(buf, size, "pipe ");

	if (supported_reset & AMDGPU_RESET_TYPE_FULL)
		size += sysfs_emit_at(buf, size, "full ");

	size += sysfs_emit_at(buf, size, "\n");
	return size;
}

void amdgpu_device_set_uid(struct amdgpu_uid *uid_info,
			   enum amdgpu_uid_type type, uint8_t inst,
			   uint64_t uid)
{
	if (!uid_info)
return; 7592 7593 if (type >= AMDGPU_UID_TYPE_MAX) { 7594 dev_err_once(uid_info->adev->dev, "Invalid UID type %d\n", 7595 type); 7596 return; 7597 } 7598 7599 if (inst >= AMDGPU_UID_INST_MAX) { 7600 dev_err_once(uid_info->adev->dev, "Invalid UID instance %d\n", 7601 inst); 7602 return; 7603 } 7604 7605 if (uid_info->uid[type][inst] != 0) { 7606 dev_warn_once( 7607 uid_info->adev->dev, 7608 "Overwriting existing UID %llu for type %d instance %d\n", 7609 uid_info->uid[type][inst], type, inst); 7610 } 7611 7612 uid_info->uid[type][inst] = uid; 7613 } 7614 7615 u64 amdgpu_device_get_uid(struct amdgpu_uid *uid_info, 7616 enum amdgpu_uid_type type, uint8_t inst) 7617 { 7618 if (!uid_info) 7619 return 0; 7620 7621 if (type >= AMDGPU_UID_TYPE_MAX) { 7622 dev_err_once(uid_info->adev->dev, "Invalid UID type %d\n", 7623 type); 7624 return 0; 7625 } 7626 7627 if (inst >= AMDGPU_UID_INST_MAX) { 7628 dev_err_once(uid_info->adev->dev, "Invalid UID instance %d\n", 7629 inst); 7630 return 0; 7631 } 7632 7633 return uid_info->uid[type][inst]; 7634 } 7635